diff --git a/.gitignore b/.gitignore index 4bfe1e5..302b56c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ SOURCES/nagios-agents-metadata-105ab8a.tar.gz -SOURCES/pacemaker-744a30d.tar.gz +SOURCES/pacemaker-4b1f869.tar.gz diff --git a/.pacemaker.metadata b/.pacemaker.metadata index 31dac21..1c52241 100644 --- a/.pacemaker.metadata +++ b/.pacemaker.metadata @@ -1,2 +1,2 @@ ea6c0a27fd0ae8ce02f84a11f08a0d79377041c3 SOURCES/nagios-agents-metadata-105ab8a.tar.gz -98d783c49fa894c5bdc30f907f5355539030578d SOURCES/pacemaker-744a30d.tar.gz +dfd19e7ec7aa96520f4948fc37d48ea69835bbdb SOURCES/pacemaker-4b1f869.tar.gz diff --git a/SOURCES/001-status-deletion.patch b/SOURCES/001-status-deletion.patch new file mode 100644 index 0000000..ca35c21 --- /dev/null +++ b/SOURCES/001-status-deletion.patch @@ -0,0 +1,420 @@ +From 6c529bb624ad548f66ce6ef1fa80b77c688918f4 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 22 Nov 2019 16:39:54 -0600 +Subject: [PATCH 1/4] Refactor: controller: rename struct recurring_op_s to + active_op_t + +... because it holds both recurring and pending non-recurring actions, +and the name was confusing +--- + daemons/controld/controld_execd.c | 18 +++++++++--------- + daemons/controld/controld_execd_state.c | 4 ++-- + daemons/controld/controld_lrm.h | 8 ++++---- + 3 files changed, 15 insertions(+), 15 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 9e8dd36..48f35dd 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -403,7 +403,7 @@ lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state, + GHashTableIter gIter; + const char *key = NULL; + rsc_history_t *entry = NULL; +- struct recurring_op_s *pending = NULL; ++ active_op_t *pending = NULL; + + crm_debug("Checking for active resources before exit"); + +@@ -909,7 +909,7 @@ static gboolean + lrm_remove_deleted_op(gpointer key, gpointer value, gpointer user_data) + { + const char *rsc = user_data; +- struct recurring_op_s *pending = value; ++ active_op_t *pending = value; + + if (crm_str_eq(rsc, pending->rsc_id, TRUE)) { + crm_info("Removing op %s:%d for deleted resource %s", +@@ -1137,7 +1137,7 @@ cancel_op(lrm_state_t * lrm_state, const char *rsc_id, const char *key, int op, + { + int rc = pcmk_ok; + char *local_key = NULL; +- struct recurring_op_s *pending = NULL; ++ active_op_t *pending = NULL; + + CRM_CHECK(op != 0, return FALSE); + CRM_CHECK(rsc_id != NULL, return FALSE); +@@ -1203,7 +1203,7 @@ cancel_action_by_key(gpointer key, gpointer value, gpointer user_data) + { + gboolean remove = FALSE; + struct cancel_data *data = user_data; +- struct recurring_op_s *op = (struct recurring_op_s *)value; ++ active_op_t *op = value; + + if (crm_str_eq(op->op_key, data->key, TRUE)) { + data->done = TRUE; +@@ -2107,7 +2107,7 @@ stop_recurring_action_by_rsc(gpointer key, gpointer value, gpointer user_data) + { + gboolean remove = FALSE; + struct stop_recurring_action_s *event = user_data; +- struct recurring_op_s *op = (struct recurring_op_s *)value; ++ active_op_t *op = value; + + if ((op->interval_ms != 0) + && crm_str_eq(op->rsc_id, event->rsc->id, TRUE)) { +@@ -2124,7 +2124,7 @@ stop_recurring_actions(gpointer key, gpointer value, gpointer user_data) + { + gboolean remove = FALSE; + lrm_state_t *lrm_state = user_data; +- struct recurring_op_s *op = (struct recurring_op_s *)value; ++ active_op_t *op = value; + + if (op->interval_ms != 0) { + crm_info("Cancelling op %d for %s (%s)", op->call_id, 
op->rsc_id, +@@ -2297,9 +2297,9 @@ do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operat + * for them to complete during shutdown + */ + char *call_id_s = make_stop_id(rsc->id, call_id); +- struct recurring_op_s *pending = NULL; ++ active_op_t *pending = NULL; + +- pending = calloc(1, sizeof(struct recurring_op_s)); ++ pending = calloc(1, sizeof(active_op_t)); + crm_trace("Recording pending op: %d - %s %s", call_id, op_id, call_id_s); + + pending->call_id = call_id; +@@ -2517,7 +2517,7 @@ did_lrm_rsc_op_fail(lrm_state_t *lrm_state, const char * rsc_id, + + void + process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, +- struct recurring_op_s *pending, xmlNode *action_xml) ++ active_op_t *pending, xmlNode *action_xml) + { + char *op_id = NULL; + char *op_key = NULL; +diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c +index 0e21d18..473da97 100644 +--- a/daemons/controld/controld_execd_state.c ++++ b/daemons/controld/controld_execd_state.c +@@ -44,7 +44,7 @@ free_deletion_op(gpointer value) + static void + free_recurring_op(gpointer value) + { +- struct recurring_op_s *op = (struct recurring_op_s *)value; ++ active_op_t *op = value; + + free(op->user_data); + free(op->rsc_id); +@@ -61,7 +61,7 @@ fail_pending_op(gpointer key, gpointer value, gpointer user_data) + { + lrmd_event_data_t event = { 0, }; + lrm_state_t *lrm_state = user_data; +- struct recurring_op_s *op = (struct recurring_op_s *)value; ++ active_op_t *op = value; + + crm_trace("Pre-emptively failing " CRM_OP_FMT " on %s (call=%s, %s)", + op->rsc_id, op->op_type, op->interval_ms, +diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h +index 598682b..27df5d7 100644 +--- a/daemons/controld/controld_lrm.h ++++ b/daemons/controld/controld_lrm.h +@@ -33,8 +33,8 @@ typedef struct resource_history_s { + + void history_free(gpointer data); + +-/* TODO - Replace this with lrmd_event_data_t */ +-struct recurring_op_s { ++// In-flight action (recurring or pending) ++typedef struct active_op_s { + guint interval_ms; + int call_id; + gboolean remove; +@@ -45,7 +45,7 @@ struct recurring_op_s { + char *op_key; + char *user_data; + GHashTable *params; +-}; ++} active_op_t; + + typedef struct lrm_state_s { + const char *node_name; +@@ -164,4 +164,4 @@ void remote_ra_process_maintenance_nodes(xmlNode *xml); + gboolean remote_ra_controlling_guest(lrm_state_t * lrm_state); + + void process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, +- struct recurring_op_s *pending, xmlNode *action_xml); ++ active_op_t *pending, xmlNode *action_xml); +-- +1.8.3.1 + + +From 93a59f1df8fe11d365032d75f10cb4189ad2f1f8 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 22 Nov 2019 16:45:31 -0600 +Subject: [PATCH 2/4] Refactor: controller: convert active_op_t booleans to + bitmask + +--- + daemons/controld/controld_execd.c | 11 +++++------ + daemons/controld/controld_lrm.h | 8 ++++++-- + 2 files changed, 11 insertions(+), 8 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 48f35dd..2c9d9c0 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -1148,18 +1148,17 @@ cancel_op(lrm_state_t * lrm_state, const char *rsc_id, const char *key, int op, + pending = g_hash_table_lookup(lrm_state->pending_ops, key); + + if (pending) { +- if (remove && pending->remove == FALSE) { +- pending->remove = TRUE; ++ if (remove && is_not_set(pending->flags, active_op_remove)) { 
++ set_bit(pending->flags, active_op_remove); + crm_debug("Scheduling %s for removal", key); + } + +- if (pending->cancelled) { ++ if (is_set(pending->flags, active_op_cancelled)) { + crm_debug("Operation %s already cancelled", key); + free(local_key); + return FALSE; + } +- +- pending->cancelled = TRUE; ++ set_bit(pending->flags, active_op_cancelled); + + } else { + crm_info("No pending op found for %s", key); +@@ -2652,7 +2651,7 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, + crm_err("Recurring operation %s was cancelled without transition information", + op_key); + +- } else if (pending->remove) { ++ } else if (is_set(pending->flags, active_op_remove)) { + /* This recurring operation was cancelled (by us) and pending, and we + * have been waiting for it to finish. + */ +diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h +index 27df5d7..3ab7048 100644 +--- a/daemons/controld/controld_lrm.h ++++ b/daemons/controld/controld_lrm.h +@@ -33,12 +33,16 @@ typedef struct resource_history_s { + + void history_free(gpointer data); + ++enum active_op_e { ++ active_op_remove = (1 << 0), ++ active_op_cancelled = (1 << 1), ++}; ++ + // In-flight action (recurring or pending) + typedef struct active_op_s { + guint interval_ms; + int call_id; +- gboolean remove; +- gboolean cancelled; ++ uint32_t flags; // bitmask of active_op_e + time_t start_time; + char *rsc_id; + char *op_type; +-- +1.8.3.1 + + +From 4d087d021d325e26b41a9b36b5b190dc7b25334c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 22 Nov 2019 16:58:25 -0600 +Subject: [PATCH 3/4] Refactor: controller: remove unused argument + +--- + daemons/controld/controld_execd.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 2c9d9c0..46c1958 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -43,8 +43,8 @@ static int delete_rsc_status(lrm_state_t * lrm_state, const char *rsc_id, int ca + + static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op, + const char *rsc_id, const char *operation); +-static void do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operation, +- xmlNode * msg, xmlNode * request); ++static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, ++ const char *operation, xmlNode *msg); + + void send_direct_ack(const char *to_host, const char *to_sys, + lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *rsc_id); +@@ -1858,7 +1858,7 @@ do_lrm_invoke(long long action, + crm_rsc_delete, user_name); + + } else { +- do_lrm_rsc_op(lrm_state, rsc, operation, input->xml, input->msg); ++ do_lrm_rsc_op(lrm_state, rsc, operation, input->xml); + } + + lrmd_free_rsc_info(rsc); +@@ -2170,8 +2170,8 @@ record_pending_op(const char *node_name, lrmd_rsc_info_t *rsc, lrmd_event_data_t + } + + static void +-do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operation, xmlNode * msg, +- xmlNode * request) ++do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, ++ const char *operation, xmlNode *msg) + { + int call_id = 0; + char *op_id = NULL; +-- +1.8.3.1 + + +From 356b417274918b7da6cdd9c72c036c923160b318 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 6 Dec 2019 12:15:05 -0600 +Subject: [PATCH 4/4] Refactor: scheduler: combine two "if" statements + +... 
for readability, and ease of adding another block later +--- + lib/pacemaker/pcmk_sched_graph.c | 120 +++++++++++++++++++-------------------- + 1 file changed, 60 insertions(+), 60 deletions(-) + +diff --git a/lib/pacemaker/pcmk_sched_graph.c b/lib/pacemaker/pcmk_sched_graph.c +index e5a8a01..a6967fe 100644 +--- a/lib/pacemaker/pcmk_sched_graph.c ++++ b/lib/pacemaker/pcmk_sched_graph.c +@@ -1088,71 +1088,71 @@ action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set) + return action_xml; + } + +- /* List affected resource */ +- if (action->rsc) { +- if (is_set(action->flags, pe_action_pseudo) == FALSE) { +- int lpc = 0; +- +- xmlNode *rsc_xml = create_xml_node(action_xml, crm_element_name(action->rsc->xml)); +- +- const char *attr_list[] = { +- XML_AGENT_ATTR_CLASS, +- XML_AGENT_ATTR_PROVIDER, +- XML_ATTR_TYPE +- }; +- +- if (is_set(action->rsc->flags, pe_rsc_orphan) && action->rsc->clone_name) { +- /* Do not use the 'instance free' name here as that +- * might interfere with the instance we plan to keep. +- * Ie. if there are more than two named /anonymous/ +- * instances on a given node, we need to make sure the +- * command goes to the right one. +- * +- * Keep this block, even when everyone is using +- * 'instance free' anonymous clone names - it means +- * we'll do the right thing if anyone toggles the +- * unique flag to 'off' +- */ +- crm_debug("Using orphan clone name %s instead of %s", action->rsc->id, +- action->rsc->clone_name); +- crm_xml_add(rsc_xml, XML_ATTR_ID, action->rsc->clone_name); +- crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->id); ++ if (action->rsc && is_not_set(action->flags, pe_action_pseudo)) { ++ int lpc = 0; ++ xmlNode *rsc_xml = NULL; ++ const char *attr_list[] = { ++ XML_AGENT_ATTR_CLASS, ++ XML_AGENT_ATTR_PROVIDER, ++ XML_ATTR_TYPE ++ }; ++ ++ // List affected resource ++ ++ rsc_xml = create_xml_node(action_xml, ++ crm_element_name(action->rsc->xml)); ++ if (is_set(action->rsc->flags, pe_rsc_orphan) ++ && action->rsc->clone_name) { ++ /* Do not use the 'instance free' name here as that ++ * might interfere with the instance we plan to keep. ++ * Ie. if there are more than two named /anonymous/ ++ * instances on a given node, we need to make sure the ++ * command goes to the right one. ++ * ++ * Keep this block, even when everyone is using ++ * 'instance free' anonymous clone names - it means ++ * we'll do the right thing if anyone toggles the ++ * unique flag to 'off' ++ */ ++ crm_debug("Using orphan clone name %s instead of %s", action->rsc->id, ++ action->rsc->clone_name); ++ crm_xml_add(rsc_xml, XML_ATTR_ID, action->rsc->clone_name); ++ crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->id); + +- } else if (is_not_set(action->rsc->flags, pe_rsc_unique)) { +- const char *xml_id = ID(action->rsc->xml); +- +- crm_debug("Using anonymous clone name %s for %s (aka. %s)", xml_id, action->rsc->id, +- action->rsc->clone_name); +- +- /* ID is what we'd like client to use +- * ID_LONG is what they might know it as instead +- * +- * ID_LONG is only strictly needed /here/ during the +- * transition period until all nodes in the cluster +- * are running the new software /and/ have rebooted +- * once (meaning that they've only ever spoken to a DC +- * supporting this feature). 
+- * +- * If anyone toggles the unique flag to 'on', the +- * 'instance free' name will correspond to an orphan +- * and fall into the clause above instead +- */ +- crm_xml_add(rsc_xml, XML_ATTR_ID, xml_id); +- if (action->rsc->clone_name && safe_str_neq(xml_id, action->rsc->clone_name)) { +- crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->clone_name); +- } else { +- crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->id); +- } ++ } else if (is_not_set(action->rsc->flags, pe_rsc_unique)) { ++ const char *xml_id = ID(action->rsc->xml); ++ ++ crm_debug("Using anonymous clone name %s for %s (aka. %s)", xml_id, action->rsc->id, ++ action->rsc->clone_name); + ++ /* ID is what we'd like client to use ++ * ID_LONG is what they might know it as instead ++ * ++ * ID_LONG is only strictly needed /here/ during the ++ * transition period until all nodes in the cluster ++ * are running the new software /and/ have rebooted ++ * once (meaning that they've only ever spoken to a DC ++ * supporting this feature). ++ * ++ * If anyone toggles the unique flag to 'on', the ++ * 'instance free' name will correspond to an orphan ++ * and fall into the clause above instead ++ */ ++ crm_xml_add(rsc_xml, XML_ATTR_ID, xml_id); ++ if (action->rsc->clone_name && safe_str_neq(xml_id, action->rsc->clone_name)) { ++ crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->clone_name); + } else { +- CRM_ASSERT(action->rsc->clone_name == NULL); +- crm_xml_add(rsc_xml, XML_ATTR_ID, action->rsc->id); ++ crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->id); + } + +- for (lpc = 0; lpc < DIMOF(attr_list); lpc++) { +- crm_xml_add(rsc_xml, attr_list[lpc], +- g_hash_table_lookup(action->rsc->meta, attr_list[lpc])); +- } ++ } else { ++ CRM_ASSERT(action->rsc->clone_name == NULL); ++ crm_xml_add(rsc_xml, XML_ATTR_ID, action->rsc->id); ++ } ++ ++ for (lpc = 0; lpc < DIMOF(attr_list); lpc++) { ++ crm_xml_add(rsc_xml, attr_list[lpc], ++ g_hash_table_lookup(action->rsc->meta, attr_list[lpc])); + } + } + +-- +1.8.3.1 + diff --git a/SOURCES/001-xmldiffs.patch b/SOURCES/001-xmldiffs.patch deleted file mode 100644 index 3afcd94..0000000 --- a/SOURCES/001-xmldiffs.patch +++ /dev/null @@ -1,284 +0,0 @@ -From 66e5e4d83e90be3cecab7bf5f50d0e10fbaa7cea Mon Sep 17 00:00:00 2001 -From: "Gao,Yan" -Date: Fri, 26 Apr 2019 11:52:59 +0200 -Subject: [PATCH 1/3] Fix: libcrmcommon: correctly apply XML diffs with - multiple move/create changes - -Given a resource group: -``` - - - - - - - -``` - -, if we'd like to change it to: -``` - - - - - - - -``` - -, the generated XML diff would be like: -``` - - - - - - -``` - -Previously after applying the XML diff, the resulting XML would be a mess: -``` - - - - - - - -``` -It's because the positions of the already moved XML objects could be -affected by the later moved objects. - -This commit fixes it by temporarily putting "move" objects after the -last sibling and also delaying the adding of any "create" objects, then -placing them to the target positions in the right order. 
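The two-pass ordering idea in the commit message above is easy to see in isolation: collect the "create"/"move" changes instead of applying them immediately, sort them by target position, then apply them in order so earlier placements are not invalidated by later ones. Below is a minimal standalone sketch of that idea using GLib, as the patch does; the `change_t` type and the positions are invented for illustration, and Pacemaker's real code (`sort_change_obj_by_position()` in the patch) reads the position from the change's `XML_DIFF_POSITION` attribute instead.

```c
#include <glib.h>
#include <stdio.h>

/* Hypothetical stand-in for xml_change_obj_t: just enough state to
 * demonstrate the ordering; the real code keeps the change and matched
 * XML nodes and reads the position from an XML attribute.
 */
typedef struct {
    const char *op;   // "create" or "move"
    int position;     // target position among siblings
} change_t;

// Ascending sort by target position (cf. sort_change_obj_by_position())
static gint
sort_by_position(gconstpointer a, gconstpointer b)
{
    const change_t *ca = a;
    const change_t *cb = b;

    return (ca->position > cb->position) - (ca->position < cb->position);
}

int
main(void)
{
    // Pass 1: defer "create"/"move" changes instead of applying them now
    change_t changes[] = { { "move", 2 }, { "create", 0 }, { "move", 1 } };
    GList *deferred = NULL;

    for (guint i = 0; i < G_N_ELEMENTS(changes); i++) {
        deferred = g_list_append(deferred, &changes[i]);
    }

    // Pass 2: apply in position order, so earlier placements stay valid
    deferred = g_list_sort(deferred, sort_by_position);
    for (GList *iter = deferred; iter != NULL; iter = iter->next) {
        const change_t *c = iter->data;

        printf("apply %s at position %d\n", c->op, c->position);
    }
    g_list_free(deferred);
    return 0;
}
```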
---- - lib/common/xml.c | 126 ++++++++++++++++++++++++++++++++++++++++++------------- - 1 file changed, 97 insertions(+), 29 deletions(-) - -diff --git a/lib/common/xml.c b/lib/common/xml.c -index 66b5f66..d815a48 100644 ---- a/lib/common/xml.c -+++ b/lib/common/xml.c -@@ -1466,11 +1466,40 @@ __xml_find_path(xmlNode *top, const char *key, int target_position) - return target; - } - -+typedef struct xml_change_obj_s { -+ xmlNode *change; -+ xmlNode *match; -+} xml_change_obj_t; -+ -+static gint -+sort_change_obj_by_position(gconstpointer a, gconstpointer b) -+{ -+ const xml_change_obj_t *change_obj_a = a; -+ const xml_change_obj_t *change_obj_b = b; -+ int position_a = -1; -+ int position_b = -1; -+ -+ crm_element_value_int(change_obj_a->change, XML_DIFF_POSITION, &position_a); -+ crm_element_value_int(change_obj_b->change, XML_DIFF_POSITION, &position_b); -+ -+ if (position_a < position_b) { -+ return -1; -+ -+ } else if (position_a > position_b) { -+ return 1; -+ } -+ -+ return 0; -+} -+ - static int - xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset) - { - int rc = pcmk_ok; - xmlNode *change = NULL; -+ GListPtr change_objs = NULL; -+ GListPtr gIter = NULL; -+ - for (change = __xml_first_child(patchset); change != NULL; change = __xml_next(change)) { - xmlNode *match = NULL; - const char *op = crm_element_value(change, XML_DIFF_OP); -@@ -1482,6 +1511,7 @@ xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset) - continue; - } - -+ // "delete" changes for XML comments are generated with "position" - if(strcmp(op, "delete") == 0) { - crm_element_value_int(change, XML_DIFF_POSITION, &position); - } -@@ -1497,7 +1527,71 @@ xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset) - rc = -pcmk_err_diff_failed; - continue; - -- } else if(strcmp(op, "create") == 0) { -+ } else if (strcmp(op, "create") == 0 || strcmp(op, "move") == 0) { -+ // Delay the adding of a "create" object -+ xml_change_obj_t *change_obj = calloc(1, sizeof(xml_change_obj_t)); -+ -+ CRM_ASSERT(change_obj != NULL); -+ -+ change_obj->change = change; -+ change_obj->match = match; -+ -+ change_objs = g_list_append(change_objs, change_obj); -+ -+ if (strcmp(op, "move") == 0) { -+ // Temporarily put the "move" object after the last sibling -+ if (match->parent != NULL && match->parent->last != NULL) { -+ xmlAddNextSibling(match->parent->last, match); -+ } -+ } -+ -+ } else if(strcmp(op, "delete") == 0) { -+ free_xml(match); -+ -+ } else if(strcmp(op, "modify") == 0) { -+ xmlAttr *pIter = pcmk__first_xml_attr(match); -+ xmlNode *attrs = __xml_first_child(first_named_child(change, XML_DIFF_RESULT)); -+ -+ if(attrs == NULL) { -+ rc = -ENOMSG; -+ continue; -+ } -+ while(pIter != NULL) { -+ const char *name = (const char *)pIter->name; -+ -+ pIter = pIter->next; -+ xml_remove_prop(match, name); -+ } -+ -+ for (pIter = pcmk__first_xml_attr(attrs); pIter != NULL; pIter = pIter->next) { -+ const char *name = (const char *)pIter->name; -+ const char *value = crm_element_value(attrs, name); -+ -+ crm_xml_add(match, name, value); -+ } -+ -+ } else { -+ crm_err("Unknown operation: %s", op); -+ } -+ } -+ -+ // Changes should be generated in the right order. Double checking. 
-+ change_objs = g_list_sort(change_objs, sort_change_obj_by_position); -+ -+ for (gIter = change_objs; gIter; gIter = gIter->next) { -+ xml_change_obj_t *change_obj = gIter->data; -+ xmlNode *match = change_obj->match; -+ const char *op = NULL; -+ const char *xpath = NULL; -+ -+ change = change_obj->change; -+ -+ op = crm_element_value(change, XML_DIFF_OP); -+ xpath = crm_element_value(change, XML_DIFF_PATH); -+ -+ crm_trace("Continue performing %s on %s with %p", op, xpath, match); -+ -+ if(strcmp(op, "create") == 0) { - int position = 0; - xmlNode *child = NULL; - xmlNode *match_child = NULL; -@@ -1565,36 +1659,10 @@ xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset) - match->name, ID(match), __xml_offset(match), position, match->prev); - rc = -pcmk_err_diff_failed; - } -- -- } else if(strcmp(op, "delete") == 0) { -- free_xml(match); -- -- } else if(strcmp(op, "modify") == 0) { -- xmlAttr *pIter = pcmk__first_xml_attr(match); -- xmlNode *attrs = __xml_first_child(first_named_child(change, XML_DIFF_RESULT)); -- -- if(attrs == NULL) { -- rc = -ENOMSG; -- continue; -- } -- while(pIter != NULL) { -- const char *name = (const char *)pIter->name; -- -- pIter = pIter->next; -- xml_remove_prop(match, name); -- } -- -- for (pIter = pcmk__first_xml_attr(attrs); pIter != NULL; pIter = pIter->next) { -- const char *name = (const char *)pIter->name; -- const char *value = crm_element_value(attrs, name); -- -- crm_xml_add(match, name, value); -- } -- -- } else { -- crm_err("Unknown operation: %s", op); - } - } -+ -+ g_list_free_full(change_objs, free); - return rc; - } - --- -1.8.3.1 - - -From f8d008d8d3a29900ee0c6decbb71a243fa4c2d8c Mon Sep 17 00:00:00 2001 -From: "Gao,Yan" -Date: Tue, 30 Apr 2019 00:15:03 +0200 -Subject: [PATCH 2/3] Fix: libcrmcommon: avoid possible use-of-NULL when - applying XML diffs - ---- - lib/common/xml.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/lib/common/xml.c b/lib/common/xml.c -index d815a48..fe87de6 100644 ---- a/lib/common/xml.c -+++ b/lib/common/xml.c -@@ -1506,11 +1506,12 @@ xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset) - const char *xpath = crm_element_value(change, XML_DIFF_PATH); - int position = -1; - -- crm_trace("Processing %s %s", change->name, op); - if(op == NULL) { - continue; - } - -+ crm_trace("Processing %s %s", change->name, op); -+ - // "delete" changes for XML comments are generated with "position" - if(strcmp(op, "delete") == 0) { - crm_element_value_int(change, XML_DIFF_POSITION, &position); --- -1.8.3.1 - - -From e6b2bf0cf7e7ed839583d529b190a7a6cd1bd594 Mon Sep 17 00:00:00 2001 -From: "Gao,Yan" -Date: Tue, 30 Apr 2019 00:19:46 +0200 -Subject: [PATCH 3/3] Fix: libcrmcommon: return error when applying XML diffs - containing unknown operations - ---- - lib/common/xml.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/lib/common/xml.c b/lib/common/xml.c -index fe87de6..940c4b9 100644 ---- a/lib/common/xml.c -+++ b/lib/common/xml.c -@@ -1573,6 +1573,7 @@ xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset) - - } else { - crm_err("Unknown operation: %s", op); -+ rc = -pcmk_err_diff_failed; - } - } - --- -1.8.3.1 - diff --git a/SOURCES/002-failed-monitors.patch b/SOURCES/002-failed-monitors.patch deleted file mode 100644 index 1adf1a4..0000000 --- a/SOURCES/002-failed-monitors.patch +++ /dev/null @@ -1,273 +0,0 @@ -From 5470f1d9c776dbf753e015fa96153b6a63c17b83 Mon Sep 17 00:00:00 2001 -From: "Gao,Yan" -Date: Thu, 9 May 2019 13:24:35 +0200 -Subject: [PATCH] Fix: controller: confirm cancel of failed 
monitors - -Usually after a monitor has been cancelled from executor, controller -erases the corresponding lrm_rsc_op from the cib, and DC will confirm -the cancel action by process_op_deletion() according to the cib diff. - -But if a monitor has failed, the lrm_rsc_op will be recorded as -"last_failure". When cancelling it, the lrm_rsc_op won't get erased from -the cib given the logic on purpose in erase_lrm_history_by_op(). So that -the cancel action won't have a chance to get confirmed by DC with -process_op_deletion(). - -Previously cluster transition would get stuck waiting for the remaining -action timer to time out. - -This commit fixes the issue by directly acknowledging the cancel action -in this case and enabling DC to be able to confirm it. - -This also moves get_node_id() function into controld_utils.c for common -use. - -Producer: -``` - # Insert a 10s sleep in the monitor action of RA - # /usr/lib/ocf/resource.d/pacemaker/Stateful: - - stateful_monitor() { - + sleep 10 - stateful_check_state "master" - - # Add a promotable clone resource: - - crm configure primitive stateful ocf:pacemaker:Stateful \ - op monitor interval=5 role=Master \ - op monitor interval=10 role=Slave - crm configure clone p-clone stateful \ - meta promotable=true - - # Wait for the resource instance to be started, promoted to be master, - # and monitor for master role to complete. - - # Set is-managed=false for the promotable clone: - crm_resource --meta -p is-managed -v false -r p-clone - - # Change the status of the master instance to be slave and immediately - # enforce refresh of it: - echo slave > /var/run/Stateful-stateful.state; crm_resource --refresh -r stateful --force - - # Wait for probe to complete, and then monitor for slave role to be - # issued: - sleep 15 - - # While the monitor for slave role is still in progress, change the - # status to be master again: - echo master > /var/run/Stateful-stateful.state - - # The monitor for slave role returns error. Cluster issues monitor for - # master role instead and tries to cancel the failed one for slave role. - # But cluster transition gets stuck.
Depending on the monitor timeout - # configured for the slave role plus cluster-delay, only after that - # controller eventually says: - - pacemaker-controld[21205] error: Node opensuse150 did not send cancel result (via controller) within 20000ms (action timeout plus cluster-delay) - pacemaker-controld[21205] error: [Action 1]: In-flight rsc op stateful_monitor_10000 on opensuse150 (priority: 0, waiting: none) - pacemaker-controld[21205] notice: Transition 6 aborted: Action lost - -``` ---- - daemons/controld/controld_execd.c | 38 ++++++++++++++++++++++++++++++++ - daemons/controld/controld_te_callbacks.c | 21 ++---------------- - daemons/controld/controld_te_events.c | 32 +++++++++++++++++++++++++++ - daemons/controld/controld_transition.h | 1 + - daemons/controld/controld_utils.c | 13 +++++++++++ - daemons/controld/controld_utils.h | 2 ++ - 6 files changed, 88 insertions(+), 19 deletions(-) - -diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c -index 976fed1..8282fed 100644 ---- a/daemons/controld/controld_execd.c -+++ b/daemons/controld/controld_execd.c -@@ -2476,6 +2476,30 @@ unescape_newlines(const char *string) - return ret; - } - -+static bool -+did_lrm_rsc_op_fail(lrm_state_t *lrm_state, const char * rsc_id, -+ const char * op_type, guint interval_ms) -+{ -+ rsc_history_t *entry = NULL; -+ -+ CRM_CHECK(lrm_state != NULL, return FALSE); -+ CRM_CHECK(rsc_id != NULL, return FALSE); -+ CRM_CHECK(op_type != NULL, return FALSE); -+ -+ entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id); -+ if (entry == NULL || entry->failed == NULL) { -+ return FALSE; -+ } -+ -+ if (crm_str_eq(entry->failed->rsc_id, rsc_id, TRUE) -+ && safe_str_eq(entry->failed->op_type, op_type) -+ && entry->failed->interval_ms == interval_ms) { -+ return TRUE; -+ } -+ -+ return FALSE; -+} -+ - void - process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, - struct recurring_op_s *pending, xmlNode *action_xml) -@@ -2605,6 +2629,20 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, - erase_lrm_history_by_op(lrm_state, op); - } - -+ /* If the recurring operation had failed, the lrm_rsc_op is recorded as -+ * "last_failure" which won't get erased from the cib given the logic on -+ * purpose in erase_lrm_history_by_op(). So that the cancel action won't -+ * have a chance to get confirmed by DC with process_op_deletion(). -+ * Cluster transition would get stuck waiting for the remaining action -+ * timer to time out. -+ * -+ * Directly acknowledge the cancel operation in this case. 
-+ */ -+ if (did_lrm_rsc_op_fail(lrm_state, pending->rsc_id, -+ pending->op_type, pending->interval_ms)) { -+ need_direct_ack = TRUE; -+ } -+ - } else if (op->rsc_deleted) { - /* This recurring operation was cancelled (but not by us, and the - * executor does not have resource information, likely due to resource -diff --git a/daemons/controld/controld_te_callbacks.c b/daemons/controld/controld_te_callbacks.c -index 51d908e..22b5f4b 100644 ---- a/daemons/controld/controld_te_callbacks.c -+++ b/daemons/controld/controld_te_callbacks.c -@@ -32,19 +32,6 @@ static unsigned long int stonith_max_attempts = 10; - /* #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */ - #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']" - --static const char * --get_node_id(xmlNode * rsc_op) --{ -- xmlNode *node = rsc_op; -- -- while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) { -- node = node->parent; -- } -- -- CRM_CHECK(node != NULL, return NULL); -- return ID(node); --} -- - void - update_stonith_max_attempts(const char* value) - { -@@ -374,12 +361,8 @@ process_op_deletion(const char *xpath, xmlNode *change) - node_uuid = extract_node_uuid(xpath); - cancel = get_cancel_action(key, node_uuid); - if (cancel) { -- crm_info("Cancellation of %s on %s confirmed (%d)", -- key, node_uuid, cancel->id); -- stop_te_timer(cancel->timer); -- te_action_confirmed(cancel); -- update_graph(transition_graph, cancel); -- trigger_graph(); -+ confirm_cancel_action(cancel); -+ - } else { - abort_transition(INFINITY, tg_restart, "Resource operation removal", - change); -diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c -index c0d096f..b7b48a4 100644 ---- a/daemons/controld/controld_te_events.c -+++ b/daemons/controld/controld_te_events.c -@@ -355,6 +355,27 @@ get_cancel_action(const char *id, const char *node) - return NULL; - } - -+void -+confirm_cancel_action(crm_action_t *cancel) -+{ -+ const char *op_key = NULL; -+ const char *node_name = NULL; -+ -+ CRM_ASSERT(cancel != NULL); -+ -+ op_key = crm_element_value(cancel->xml, XML_LRM_ATTR_TASK_KEY); -+ node_name = crm_element_value(cancel->xml, XML_LRM_ATTR_TARGET); -+ -+ stop_te_timer(cancel->timer); -+ te_action_confirmed(cancel); -+ update_graph(transition_graph, cancel); -+ -+ crm_info("Cancellation of %s on %s confirmed (action %d)", -+ op_key, node_name, cancel->id); -+ -+ trigger_graph(); -+} -+ - /* downed nodes are listed like: ... */ - #define XPATH_DOWNED "//" XML_GRAPH_TAG_DOWNED \ - "/" XML_CIB_TAG_NODE "[@" XML_ATTR_UUID "='%s']" -@@ -471,6 +492,17 @@ process_graph_event(xmlNode *event, const char *event_node) - /* Recurring actions have the transition number they were first - * scheduled in. 
- */ -+ -+ if (status == PCMK_LRM_OP_CANCELLED) { -+ const char *node_id = get_node_id(event); -+ -+ action = get_cancel_action(id, node_id); -+ if (action) { -+ confirm_cancel_action(action); -+ } -+ goto bail; -+ } -+ - desc = "arrived after initial scheduling"; - abort_transition(INFINITY, tg_restart, "Change in recurring result", - event); -diff --git a/daemons/controld/controld_transition.h b/daemons/controld/controld_transition.h -index 0a33599..a162f99 100644 ---- a/daemons/controld/controld_transition.h -+++ b/daemons/controld/controld_transition.h -@@ -25,6 +25,7 @@ void execute_stonith_cleanup(void); - /* tengine */ - extern crm_action_t *match_down_event(const char *target); - extern crm_action_t *get_cancel_action(const char *id, const char *node); -+void confirm_cancel_action(crm_action_t *cancel); - - void controld_record_action_timeout(crm_action_t *action); - extern gboolean fail_incompletable_actions(crm_graph_t * graph, const char *down_node); -diff --git a/daemons/controld/controld_utils.c b/daemons/controld/controld_utils.c -index ca7e15d..35922f0 100644 ---- a/daemons/controld/controld_utils.c -+++ b/daemons/controld/controld_utils.c -@@ -1073,3 +1073,16 @@ feature_set_compatible(const char *dc_version, const char *join_version) - // DC's minor version must be the same or older - return dc_v <= join_v; - } -+ -+const char * -+get_node_id(xmlNode *lrm_rsc_op) -+{ -+ xmlNode *node = lrm_rsc_op; -+ -+ while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) { -+ node = node->parent; -+ } -+ -+ CRM_CHECK(node != NULL, return NULL); -+ return ID(node); -+} -diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h -index 2a92db5..68992f5 100644 ---- a/daemons/controld/controld_utils.h -+++ b/daemons/controld/controld_utils.h -@@ -95,6 +95,8 @@ unsigned int cib_op_timeout(void); - bool feature_set_compatible(const char *dc_version, const char *join_version); - bool controld_action_is_recordable(const char *action); - -+const char *get_node_id(xmlNode *lrm_rsc_op); -+ - /* Convenience macro for registering a CIB callback - * (assumes that data can be freed with free()) - */ --- -1.8.3.1 - diff --git a/SOURCES/002-status-deletion.patch b/SOURCES/002-status-deletion.patch new file mode 100644 index 0000000..1a31cdc --- /dev/null +++ b/SOURCES/002-status-deletion.patch @@ -0,0 +1,2064 @@ +From 9e4addbcb67ea8e36ba853f1e401d8a6cb6a0aa3 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 20 Dec 2019 11:34:06 -0600 +Subject: [PATCH 1/8] Refactor: scheduler: reduce code duplication when + displaying resources + +Refactor native_output_string() to use GString, for readability and +maintainability. Refactor common_print() to use it, to reduce duplication and +ensure displays are consistent. + +This makes a couple small changes in how things are shown: + +* If pe_print_dev is enabled (a debugging flag not actually used by anything), + the additional resource fields are shown with the resource flags rather than + their own parenthesized list. + +* The new output model is now consistent with the legacy print model in + displaying resource flags with commas (not spaces) between them. 
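The GString technique that replaces the old snprintf() offset juggling is easiest to see standalone. This is a minimal sketch of the `add_output_flag()` pattern from the patch below: the first flag opens the parenthesized list, later flags are comma-separated, and the caller closes the list only if anything was appended. The resource description string here is invented for illustration.

```c
#include <glib.h>
#include <stdbool.h>
#include <stdio.h>

// First flag opens the " (" list; subsequent flags are comma-separated
static bool
add_flag(GString *s, const char *flag, bool have_flags)
{
    g_string_append(s, (have_flags? ", " : " ("));
    g_string_append(s, flag);
    return true;
}

int
main(void)
{
    // Invented resource description; the patch builds this part too
    GString *outstr = g_string_new("dummy\t(ocf::pacemaker:Dummy):\tStopped");
    bool have_flags = false;

    have_flags = add_flag(outstr, "disabled", have_flags);
    have_flags = add_flag(outstr, "unmanaged", have_flags);
    if (have_flags) {   // close the list only if it was opened
        g_string_append(outstr, ")");
    }
    // Prints the description with " (disabled, unmanaged)" appended
    printf("%s\n", outstr->str);
    g_string_free(outstr, TRUE);
    return 0;
}
```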
+--- + include/crm/pengine/common.h | 24 +-- + lib/pengine/native.c | 410 +++++++++++++++++-------------------------- + 2 files changed, 168 insertions(+), 266 deletions(-) + +diff --git a/include/crm/pengine/common.h b/include/crm/pengine/common.h +index e497f9c..48c2b66 100644 +--- a/include/crm/pengine/common.h ++++ b/include/crm/pengine/common.h +@@ -1,22 +1,12 @@ +-/* +- * Copyright 2004-2018 the Pacemaker project contributors ++/* ++ * Copyright 2004-2019 the Pacemaker project contributors + * + * The version control history for this file may have further details. +- * +- * This program is free software; you can redistribute it and/or +- * modify it under the terms of the GNU Lesser General Public +- * License as published by the Free Software Foundation; either +- * version 2 of the License, or (at your option) any later version. +- * +- * This software is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- * General Public License for more details. +- * +- * You should have received a copy of the GNU Lesser General Public +- * License along with this library; if not, write to the Free Software +- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ * ++ * This source code is licensed under the GNU Lesser General Public License ++ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ ++ + #ifndef PE_COMMON__H + # define PE_COMMON__H + +@@ -104,7 +94,7 @@ enum pe_print_options { + pe_print_html = 0x0002, + pe_print_ncurses = 0x0004, + pe_print_printf = 0x0008, +- pe_print_dev = 0x0010, ++ pe_print_dev = 0x0010, // Debugging (@COMPAT probably not useful) + pe_print_details = 0x0020, + pe_print_max_details = 0x0040, + pe_print_rsconly = 0x0080, +diff --git a/lib/pengine/native.c b/lib/pengine/native.c +index fdb98e0..8fd98bc 100644 +--- a/lib/pengine/native.c ++++ b/lib/pengine/native.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -490,165 +490,172 @@ native_print_xml(resource_t * rsc, const char *pre_text, long options, void *pri + } + } + +-/* making this inline rather than a macro prevents a coverity "unreachable" +- * warning on the first usage +- */ +-static inline const char * +-comma_if(int i) ++// Append a flag to resource description string's flags list ++static bool ++add_output_flag(GString *s, const char *flag_desc, bool have_flags) + { +- return i? ", " : ""; ++ g_string_append(s, (have_flags? ", " : " (")); ++ g_string_append(s, flag_desc); ++ return true; + } + +-static char * +-flags_string(pe_resource_t *rsc, pe_node_t *node, long options, +- const char *target_role) ++// Append a node name to resource description string's node list ++static bool ++add_output_node(GString *s, const char *node, bool have_nodes) + { +- char *flags[6] = { NULL, }; +- char *result = NULL; +- int ndx = 0; ++ g_string_append(s, (have_nodes? " " : " [ ")); ++ g_string_append(s, node); ++ return true; ++} ++ ++/*! 
++ * \internal ++ * \brief Create a string description of a resource ++ * ++ * \param[in] rsc Resource to describe ++ * \param[in] name Desired identifier for the resource ++ * \param[in] node If not NULL, node that resource is "on" ++ * \param[in] options Bitmask of pe_print_* ++ * \param[in] target_role Resource's target role ++ * \param[in] show_nodes Whether to display nodes when multiply active ++ * ++ * \return Newly allocated string description of resource ++ * \note Caller must free the result with g_free(). ++ */ ++static gchar * ++native_output_string(pe_resource_t *rsc, const char *name, pe_node_t *node, ++ long options, const char *target_role, bool show_nodes) ++{ ++ const char *class = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); ++ const char *provider = NULL; ++ const char *kind = crm_element_value(rsc->xml, XML_ATTR_TYPE); ++ char *retval = NULL; ++ GString *outstr = NULL; ++ bool have_flags = false; ++ ++ CRM_CHECK(name != NULL, name = "unknown"); ++ CRM_CHECK(kind != NULL, kind = "unknown"); ++ CRM_CHECK(class != NULL, class = "unknown"); ++ ++ if (is_set(pcmk_get_ra_caps(class), pcmk_ra_cap_provider)) { ++ provider = crm_element_value(rsc->xml, XML_AGENT_ATTR_PROVIDER); ++ } + +- if (node && node->details->online == FALSE && node->details->unclean) { +- flags[ndx++] = strdup("UNCLEAN"); ++ if (is_set(options, pe_print_rsconly) ++ || pcmk__list_of_multiple(rsc->running_on)) { ++ node = NULL; + } + ++ // We need a string of at least this size ++ outstr = g_string_sized_new(strlen(name) + strlen(class) + strlen(kind) ++ + (provider? (strlen(provider) + 2) : 0) ++ + (node? strlen(node->details->uname) + 1 : 0) ++ + 11); ++ ++ // Resource name and agent ++ g_string_printf(outstr, "%s\t(%s%s%s:%s):\t", name, class, ++ /* @COMPAT This should be a single ':' (see CLBZ#5395) but ++ * to avoid breaking anything relying on it, we're keeping ++ * it like this until the next minor version bump. ++ */ ++ (provider? "::" : ""), (provider? provider : ""), kind); ++ ++ // State on node ++ if (is_set(rsc->flags, pe_rsc_orphan)) { ++ g_string_append(outstr, " ORPHANED"); ++ } ++ if (is_set(rsc->flags, pe_rsc_failed)) { ++ enum rsc_role_e role = native_displayable_role(rsc); ++ ++ if (role > RSC_ROLE_SLAVE) { ++ g_string_append_printf(outstr, " FAILED %s", role2text(role)); ++ } else { ++ g_string_append(outstr, " FAILED"); ++ } ++ } else { ++ g_string_append(outstr, native_displayable_state(rsc, options)); ++ } ++ if (node) { ++ g_string_append_printf(outstr, " %s", node->details->uname); ++ } ++ ++ // Flags, as: ( [...]) ++ if (node && !(node->details->online) && node->details->unclean) { ++ have_flags = add_output_flag(outstr, "UNCLEAN", have_flags); ++ } + if (is_set(options, pe_print_pending)) { + const char *pending_task = native_pending_task(rsc); + + if (pending_task) { +- flags[ndx++] = strdup(pending_task); ++ have_flags = add_output_flag(outstr, pending_task, have_flags); + } + } +- + if (target_role) { + enum rsc_role_e target_role_e = text2role(target_role); + +- /* Ignore target role Started, as it is the default anyways +- * (and would also allow a Master to be Master). +- * Show if target role limits our abilities. */ ++ /* Only show target role if it limits our abilities (i.e. ignore ++ * Started, as it is the default anyways, and doesn't prevent the ++ * resource from becoming Master). 
++ */ + if (target_role_e == RSC_ROLE_STOPPED) { +- flags[ndx++] = strdup("disabled"); ++ have_flags = add_output_flag(outstr, "disabled", have_flags); + + } else if (is_set(uber_parent(rsc)->flags, pe_rsc_promotable) + && target_role_e == RSC_ROLE_SLAVE) { +- flags[ndx++] = crm_strdup_printf("target-role:%s", target_role); ++ have_flags = add_output_flag(outstr, "target-role:", have_flags); ++ g_string_append(outstr, target_role); + } + } +- + if (is_set(rsc->flags, pe_rsc_block)) { +- flags[ndx++] = strdup("blocked"); +- ++ have_flags = add_output_flag(outstr, "blocked", have_flags); + } else if (is_not_set(rsc->flags, pe_rsc_managed)) { +- flags[ndx++] = strdup("unmanaged"); ++ have_flags = add_output_flag(outstr, "unmanaged", have_flags); + } +- + if (is_set(rsc->flags, pe_rsc_failure_ignored)) { +- flags[ndx++] = strdup("failure ignored"); ++ have_flags = add_output_flag(outstr, "failure ignored", have_flags); + } +- +- if (ndx > 0) { +- char *total = g_strjoinv(" ", flags); +- +- result = crm_strdup_printf(" (%s)", total); +- g_free(total); +- } +- +- while (--ndx >= 0) { +- free(flags[ndx]); +- } +- return result; +-} +- +-static char * +-native_output_string(resource_t *rsc, const char *name, node_t *node, long options, +- const char *target_role) { +- const char *desc = NULL; +- const char *class = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); +- const char *kind = crm_element_value(rsc->xml, XML_ATTR_TYPE); +- enum rsc_role_e role = native_displayable_role(rsc); +- +- char *retval = NULL; +- +- char *unames = NULL; +- char *provider = NULL; +- const char *orphan = NULL; +- char *role_s = NULL; +- char *node_s = NULL; +- char *print_dev_s = NULL; +- char *flags_s = NULL; +- +- CRM_ASSERT(kind != NULL); +- +- if (is_set(pcmk_get_ra_caps(class), pcmk_ra_cap_provider)) { +- provider = crm_strdup_printf("::%s", crm_element_value(rsc->xml, XML_AGENT_ATTR_PROVIDER)); ++ if (is_set(options, pe_print_dev)) { ++ if (is_set(options, pe_rsc_provisional)) { ++ have_flags = add_output_flag(outstr, "provisional", have_flags); ++ } ++ if (is_not_set(options, pe_rsc_runnable)) { ++ have_flags = add_output_flag(outstr, "non-startable", have_flags); ++ } ++ have_flags = add_output_flag(outstr, "variant:", have_flags); ++ g_string_append_printf(outstr, "%s priority:%f", ++ crm_element_name(rsc->xml), ++ (double) (rsc->priority)); + } +- +- if (is_set(rsc->flags, pe_rsc_orphan)) { +- orphan = " ORPHANED"; ++ if (have_flags) { ++ g_string_append(outstr, ")"); + } + +- if (role > RSC_ROLE_SLAVE && is_set(rsc->flags, pe_rsc_failed)) { +- role_s = crm_strdup_printf(" FAILED %s", role2text(role)); +- } else if (is_set(rsc->flags, pe_rsc_failed)) { +- role_s = crm_strdup_printf(" FAILED"); +- } else { +- role_s = crm_strdup_printf(" %s", native_displayable_state(rsc, options)); +- } ++ // User-supplied description ++ if (is_set(options, pe_print_rsconly) ++ || pcmk__list_of_multiple(rsc->running_on)) { ++ const char *desc = crm_element_value(rsc->xml, XML_ATTR_DESC); + +- if (node) { +- node_s = crm_strdup_printf(" %s", node->details->uname); ++ if (desc) { ++ g_string_append_printf(outstr, " %s", desc); ++ } + } + +- if (is_set(options, pe_print_rsconly) || g_list_length(rsc->running_on) > 1) { +- desc = crm_element_value(rsc->xml, XML_ATTR_DESC); +- } ++ if (show_nodes && is_not_set(options, pe_print_rsconly) ++ && pcmk__list_of_multiple(rsc->running_on)) { ++ bool have_nodes = false; + +- if (is_not_set(options, pe_print_rsconly) && g_list_length(rsc->running_on) > 1) { +- GListPtr gIter = 
rsc->running_on; +- gchar **arr = calloc(g_list_length(rsc->running_on)+1, sizeof(gchar *)); +- int i = 0; +- char *total = NULL; ++ for (GList *iter = rsc->running_on; iter != NULL; iter = iter->next) { ++ pe_node_t *n = (pe_node_t *) iter->data; + +- for (; gIter != NULL; gIter = gIter->next) { +- node_t *n = (node_t *) gIter->data; +- arr[i] = (gchar *) strdup(n->details->uname); +- i++; ++ have_nodes = add_output_node(outstr, n->details->uname, have_nodes); ++ } ++ if (have_nodes) { ++ g_string_append(outstr, " ]"); + } +- +- total = g_strjoinv(" ", arr); +- unames = crm_strdup_printf(" [ %s ]", total); +- +- g_free(total); +- g_strfreev(arr); + } + +- if (is_set(options, pe_print_dev)) { +- print_dev_s = crm_strdup_printf(" (%s%svariant=%s, priority=%f)", +- is_set(rsc->flags, pe_rsc_provisional) ? "provisional, " : "", +- is_set(rsc->flags, pe_rsc_runnable) ? "" : "non-startable, ", +- crm_element_name(rsc->xml), (double)rsc->priority); +- } +- +- flags_s = flags_string(rsc, node, options, target_role); +- +- retval = crm_strdup_printf("%s\t(%s%s:%s):\t%s%s%s%s%s%s%s%s", +- name, class, +- provider ? provider : "", +- kind, +- orphan ? orphan : "", +- role_s, +- node_s ? node_s : "", +- print_dev_s ? print_dev_s : "", +- flags_s ? flags_s : "", +- desc ? " " : "", desc ? desc : "", +- unames ? unames : ""); +- +- free(provider); +- free(role_s); +- free(node_s); +- free(unames); +- free(print_dev_s); +- free(flags_s); +- ++ retval = outstr->str; ++ g_string_free(outstr, FALSE); + return retval; + } + +@@ -656,7 +663,6 @@ void + pe__common_output_html(pcmk__output_t *out, resource_t * rsc, + const char *name, node_t *node, long options) + { +- char *s = NULL; + const char *kind = crm_element_value(rsc->xml, XML_ATTR_TYPE); + const char *target_role = NULL; + +@@ -675,10 +681,6 @@ pe__common_output_html(pcmk__output_t *out, resource_t * rsc, + target_role = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_TARGET_ROLE); + } + +- if ((options & pe_print_rsconly) || g_list_length(rsc->running_on) > 1) { +- node = NULL; +- } +- + if (is_not_set(rsc->flags, pe_rsc_managed)) { + cl = "rsc-managed"; + +@@ -698,10 +700,14 @@ pe__common_output_html(pcmk__output_t *out, resource_t * rsc, + cl = "rsc-ok"; + } + +- s = native_output_string(rsc, name, node, options, target_role); +- list_node = pcmk__output_create_html_node(out, "li", NULL, NULL, NULL); +- pcmk_create_html_node(list_node, "span", NULL, cl, s); +- free(s); ++ { ++ gchar *s = native_output_string(rsc, name, node, options, target_role, ++ true); ++ ++ list_node = pcmk__output_create_html_node(out, "li", NULL, NULL, NULL); ++ pcmk_create_html_node(list_node, "span", NULL, cl, s); ++ g_free(s); ++ } + + if (is_set(options, pe_print_details)) { + GHashTableIter iter; +@@ -744,7 +750,6 @@ void + pe__common_output_text(pcmk__output_t *out, resource_t * rsc, + const char *name, node_t *node, long options) + { +- char *s = NULL; + const char *target_role = NULL; + + CRM_ASSERT(rsc->variant == pe_native); +@@ -758,13 +763,13 @@ pe__common_output_text(pcmk__output_t *out, resource_t * rsc, + target_role = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_TARGET_ROLE); + } + +- if (is_set(options, pe_print_rsconly) || g_list_length(rsc->running_on) > 1) { +- node = NULL; +- } ++ { ++ gchar *s = native_output_string(rsc, name, node, options, target_role, ++ true); + +- s = native_output_string(rsc, name, node, options, target_role); +- out->list_item(out, NULL, "%s", s); +- free(s); ++ out->list_item(out, NULL, "%s", s); ++ g_free(s); ++ } + + if 
(is_set(options, pe_print_details)) { + GHashTableIter iter; +@@ -806,22 +811,14 @@ pe__common_output_text(pcmk__output_t *out, resource_t * rsc, + void + common_print(resource_t * rsc, const char *pre_text, const char *name, node_t *node, long options, void *print_data) + { +- const char *desc = NULL; +- const char *class = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); +- const char *kind = crm_element_value(rsc->xml, XML_ATTR_TYPE); + const char *target_role = NULL; +- enum rsc_role_e role = native_displayable_role(rsc); +- +- int offset = 0; +- int flagOffset = 0; +- char buffer[LINE_MAX]; +- char flagBuffer[LINE_MAX]; + + CRM_ASSERT(rsc->variant == pe_native); +- CRM_ASSERT(kind != NULL); + + if (rsc->meta) { +- const char *is_internal = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_INTERNAL_RSC); ++ const char *is_internal = g_hash_table_lookup(rsc->meta, ++ XML_RSC_ATTR_INTERNAL_RSC); ++ + if (crm_is_true(is_internal) && is_not_set(options, pe_print_implicit)) { + crm_trace("skipping print of internal resource %s", rsc->id); + return; +@@ -829,17 +826,13 @@ common_print(resource_t * rsc, const char *pre_text, const char *name, node_t *n + target_role = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_TARGET_ROLE); + } + +- if (pre_text == NULL && (options & pe_print_printf)) { +- pre_text = " "; +- } +- + if (options & pe_print_xml) { + native_print_xml(rsc, pre_text, options, print_data); + return; + } + +- if ((options & pe_print_rsconly) || g_list_length(rsc->running_on) > 1) { +- node = NULL; ++ if ((pre_text == NULL) && (options & pe_print_printf)) { ++ pre_text = " "; + } + + if (options & pe_print_html) { +@@ -849,10 +842,10 @@ common_print(resource_t * rsc, const char *pre_text, const char *name, node_t *n + } else if (is_set(rsc->flags, pe_rsc_failed)) { + status_print(""); + +- } else if (rsc->variant == pe_native && (rsc->running_on == NULL)) { ++ } else if (rsc->running_on == NULL) { + status_print(""); + +- } else if (g_list_length(rsc->running_on) > 1) { ++ } else if (pcmk__list_of_multiple(rsc->running_on)) { + status_print(""); + + } else if (is_set(rsc->flags, pe_rsc_failure_ignored)) { +@@ -863,106 +856,29 @@ common_print(resource_t * rsc, const char *pre_text, const char *name, node_t *n + } + } + +- if(pre_text) { +- offset += snprintf(buffer + offset, LINE_MAX - offset, "%s", pre_text); +- } +- offset += snprintf(buffer + offset, LINE_MAX - offset, "%s", name); +- offset += snprintf(buffer + offset, LINE_MAX - offset, "\t(%s", class); +- if (is_set(pcmk_get_ra_caps(class), pcmk_ra_cap_provider)) { +- const char *prov = crm_element_value(rsc->xml, XML_AGENT_ATTR_PROVIDER); +- offset += snprintf(buffer + offset, LINE_MAX - offset, "::%s", prov); +- } +- offset += snprintf(buffer + offset, LINE_MAX - offset, ":%s):\t", kind); +- if(is_set(rsc->flags, pe_rsc_orphan)) { +- offset += snprintf(buffer + offset, LINE_MAX - offset, " ORPHANED "); +- } +- if(role > RSC_ROLE_SLAVE && is_set(rsc->flags, pe_rsc_failed)) { +- offset += snprintf(buffer + offset, LINE_MAX - offset, "FAILED %s", role2text(role)); +- } else if(is_set(rsc->flags, pe_rsc_failed)) { +- offset += snprintf(buffer + offset, LINE_MAX - offset, "FAILED"); +- } else { +- const char *rsc_state = native_displayable_state(rsc, options); +- +- offset += snprintf(buffer + offset, LINE_MAX - offset, "%s", rsc_state); +- } +- +- if(node) { +- offset += snprintf(buffer + offset, LINE_MAX - offset, " %s", node->details->uname); +- +- if (node->details->online == FALSE && node->details->unclean) { +- flagOffset += 
snprintf(flagBuffer + flagOffset, LINE_MAX - flagOffset, +- "%sUNCLEAN", comma_if(flagOffset)); +- } +- } +- +- if (options & pe_print_pending) { +- const char *pending_task = native_pending_task(rsc); +- +- if (pending_task) { +- flagOffset += snprintf(flagBuffer + flagOffset, LINE_MAX - flagOffset, +- "%s%s", comma_if(flagOffset), pending_task); +- } +- } +- +- if (target_role) { +- enum rsc_role_e target_role_e = text2role(target_role); +- +- /* Ignore target role Started, as it is the default anyways +- * (and would also allow a Master to be Master). +- * Show if target role limits our abilities. */ +- if (target_role_e == RSC_ROLE_STOPPED) { +- flagOffset += snprintf(flagBuffer + flagOffset, LINE_MAX - flagOffset, +- "%sdisabled", comma_if(flagOffset)); +- +- } else if (is_set(uber_parent(rsc)->flags, pe_rsc_promotable) +- && target_role_e == RSC_ROLE_SLAVE) { +- flagOffset += snprintf(flagBuffer + flagOffset, LINE_MAX - flagOffset, +- "%starget-role:%s", comma_if(flagOffset), target_role); +- } +- } +- +- if (is_set(rsc->flags, pe_rsc_block)) { +- flagOffset += snprintf(flagBuffer + flagOffset, LINE_MAX - flagOffset, +- "%sblocked", comma_if(flagOffset)); +- +- } else if (is_not_set(rsc->flags, pe_rsc_managed)) { +- flagOffset += snprintf(flagBuffer + flagOffset, LINE_MAX - flagOffset, +- "%sunmanaged", comma_if(flagOffset)); +- } +- +- if(is_set(rsc->flags, pe_rsc_failure_ignored)) { +- flagOffset += snprintf(flagBuffer + flagOffset, LINE_MAX - flagOffset, +- "%sfailure ignored", comma_if(flagOffset)); +- } +- +- if ((options & pe_print_rsconly) || g_list_length(rsc->running_on) > 1) { +- desc = crm_element_value(rsc->xml, XML_ATTR_DESC); +- } +- +- CRM_LOG_ASSERT(offset > 0); +- if(flagOffset > 0) { +- status_print("%s (%s)%s%s", buffer, flagBuffer, desc?" ":"", desc?desc:""); +- } else { +- status_print("%s%s%s", buffer, desc?" ":"", desc?desc:""); ++ { ++ gchar *resource_s = native_output_string(rsc, name, node, options, ++ target_role, false); ++ status_print("%s%s", (pre_text? pre_text : ""), resource_s); ++ g_free(resource_s); + } + + #if CURSES_ENABLED +- if ((options & pe_print_rsconly) || g_list_length(rsc->running_on) > 1) { +- /* Done */ +- +- } else if (options & pe_print_ncurses) { ++ if (is_set(options, pe_print_ncurses) ++ && is_not_set(options, pe_print_rsconly) ++ && !pcmk__list_of_multiple(rsc->running_on)) { + /* coverity[negative_returns] False positive */ + move(-1, 0); + } + #endif + +- if (options & pe_print_html) { ++ if (is_set(options, pe_print_html)) { + status_print(" "); + } + +- if ((options & pe_print_rsconly)) { ++ if (is_not_set(options, pe_print_rsconly) ++ && pcmk__list_of_multiple(rsc->running_on)) { + +- } else if (g_list_length(rsc->running_on) > 1) { + GListPtr gIter = rsc->running_on; + int counter = 0; + +@@ -1025,10 +941,6 @@ common_print(resource_t * rsc, const char *pre_text, const char *name, node_t *n + GHashTableIter iter; + node_t *n = NULL; + +- status_print("%s\t(%s%svariant=%s, priority=%f)", pre_text, +- is_set(rsc->flags, pe_rsc_provisional) ? "provisional, " : "", +- is_set(rsc->flags, pe_rsc_runnable) ? 
"" : "non-startable, ", +- crm_element_name(rsc->xml), (double)rsc->priority); + status_print("%s\tAllowed Nodes", pre_text); + g_hash_table_iter_init(&iter, rsc->allowed_nodes); + while (g_hash_table_iter_next(&iter, NULL, (void **)&n)) { +-- +1.8.3.1 + + +From 41e911be8ea9151b3f0758c2c22c0e69b8b78d93 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 19 Dec 2019 17:18:41 -0600 +Subject: [PATCH 2/8] Log: scheduler: drop redundant trace messages + +We logged "applying placement constraints" three times. +--- + lib/pacemaker/pcmk_sched_allocate.c | 17 ++++------------- + 1 file changed, 4 insertions(+), 13 deletions(-) + +diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c +index ca43c71..dde8b69 100644 +--- a/lib/pacemaker/pcmk_sched_allocate.c ++++ b/lib/pacemaker/pcmk_sched_allocate.c +@@ -623,21 +623,15 @@ check_actions(pe_working_set_t * data_set) + } + } + +-static gboolean ++static void + apply_placement_constraints(pe_working_set_t * data_set) + { +- GListPtr gIter = NULL; +- +- crm_trace("Applying constraints..."); +- +- for (gIter = data_set->placement_constraints; gIter != NULL; gIter = gIter->next) { ++ for (GList *gIter = data_set->placement_constraints; ++ gIter != NULL; gIter = gIter->next) { + pe__location_t *cons = gIter->data; + + cons->rsc_lh->cmds->rsc_location(cons->rsc_lh, cons); + } +- +- return TRUE; +- + } + + static gboolean +@@ -994,10 +988,7 @@ stage2(pe_working_set_t * data_set) + { + GListPtr gIter = NULL; + +- crm_trace("Applying placement constraints"); +- +- gIter = data_set->nodes; +- for (; gIter != NULL; gIter = gIter->next) { ++ for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { + node_t *node = (node_t *) gIter->data; + + if (node == NULL) { +-- +1.8.3.1 + + +From 7fe136e19b5018d609beb8bad4e34234739572c9 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Sat, 7 Dec 2019 12:13:11 -0600 +Subject: [PATCH 3/8] Refactor: libcrmcommon: convenience functions for list + length comparisons + +... for efficiency and readability +--- + include/crm/common/internal.h | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h +index da2c7d7..484c836 100644 +--- a/include/crm/common/internal.h ++++ b/include/crm/common/internal.h +@@ -126,6 +126,20 @@ crm_getpid_s() + return crm_strdup_printf("%lu", (unsigned long) getpid()); + } + ++// More efficient than g_list_length(list) == 1 ++static inline bool ++pcmk__list_of_1(GList *list) ++{ ++ return list && (list->next == NULL); ++} ++ ++// More efficient than g_list_length(list) > 1 ++static inline bool ++pcmk__list_of_multiple(GList *list) ++{ ++ return list && (list->next != NULL); ++} ++ + /* convenience functions for failure-related node attributes */ + + #define CRM_FAIL_COUNT_PREFIX "fail-count" +-- +1.8.3.1 + + +From 9ff4f6bca540576f0a3333c959e8014ed168353f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 16 Dec 2019 14:13:30 -0600 +Subject: [PATCH 4/8] Refactor: libcrmcommon: add convenience macros for + plurals + +I've avoided making s_if_plural() an official API due to its hackiness, but +it really is the best solution for now. Promote it to pcmk__plural_s(), along +with a companion macro pcmk__plural_alt() for more complicated plurals. 
+--- + include/crm/common/internal.h | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h +index 484c836..ee560c9 100644 +--- a/include/crm/common/internal.h ++++ b/include/crm/common/internal.h +@@ -107,6 +107,29 @@ bool crm_compress_string(const char *data, int length, int max, char **result, + unsigned int *result_len); + gint crm_alpha_sort(gconstpointer a, gconstpointer b); + ++/* Correctly displaying singular or plural is complicated; consider "1 node has" ++ * vs. "2 nodes have". A flexible solution is to pluralize entire strings, e.g. ++ * ++ * if (a == 1) { ++ * crm_info("singular message"): ++ * } else { ++ * crm_info("plural message"); ++ * } ++ * ++ * though even that's not sufficient for all languages besides English (if we ++ * ever desire to do translations of output and log messages). But the following ++ * convenience macros are "good enough" and more concise for many cases. ++ */ ++ ++/* Example: ++ * crm_info("Found %d %s", nentries, ++ * pcmk__plural_alt(nentries, "entry", "entries")); ++ */ ++#define pcmk__plural_alt(i, s1, s2) (((i) == 1)? (s1) : (s2)) ++ ++// Example: crm_info("Found %d node%s", nnodes, pcmk__plural_s(nnodes)); ++#define pcmk__plural_s(i) pcmk__plural_alt(i, "", "s") ++ + static inline char * + crm_concat(const char *prefix, const char *suffix, char join) + { +-- +1.8.3.1 + + +From 0378db5030400202e59b2bae0dabd65d00a3e9c8 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 12 Dec 2019 20:50:50 -0600 +Subject: [PATCH 5/8] Log: controller: improve join messages + +--- + daemons/controld/controld_fsa.c | 81 ++++---- + daemons/controld/controld_join_dc.c | 383 +++++++++++++++++++++--------------- + 2 files changed, 268 insertions(+), 196 deletions(-) + +diff --git a/daemons/controld/controld_fsa.c b/daemons/controld/controld_fsa.c +index 6760224..b985fa9 100644 +--- a/daemons/controld/controld_fsa.c ++++ b/daemons/controld/controld_fsa.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -460,12 +460,53 @@ log_fsa_input(fsa_data_t * stored_msg) + } + } + ++static void ++check_join_counts(fsa_data_t *msg_data) ++{ ++ int count; ++ guint npeers; ++ ++ count = crmd_join_phase_count(crm_join_finalized); ++ if (count > 0) { ++ crm_err("%d cluster node%s failed to confirm join", ++ count, pcmk__plural_s(count)); ++ crmd_join_phase_log(LOG_NOTICE); ++ return; ++ } ++ ++ npeers = crm_active_peers(); ++ count = crmd_join_phase_count(crm_join_confirmed); ++ if (count == npeers) { ++ if (npeers == 1) { ++ crm_debug("Sole active cluster node is fully joined"); ++ } else { ++ crm_debug("All %d active cluster nodes are fully joined", count); ++ } ++ ++ } else if (count > npeers) { ++ crm_err("New election needed because more nodes confirmed join " ++ "than are in membership (%d > %u)", count, npeers); ++ register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); ++ ++ } else if (saved_ccm_membership_id != crm_peer_seq) { ++ crm_info("New join needed because membership changed (%llu -> %llu)", ++ saved_ccm_membership_id, crm_peer_seq); ++ register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); ++ ++ } else { ++ crm_warn("Only %d of %u active cluster nodes fully joined " ++ "(%d did not respond to offer)", ++ count, npeers, crmd_join_phase_count(crm_join_welcomed)); ++ } ++} ++ + long long + do_state_transition(long long actions, + enum crmd_fsa_state cur_state, + enum crmd_fsa_state next_state, fsa_data_t * msg_data) + { + int level = LOG_INFO; ++ int count = 0; + long long tmp = actions; + gboolean clear_recovery_bit = TRUE; + +@@ -563,13 +604,14 @@ do_state_transition(long long actions, + crm_warn("Progressed to state %s after %s", + fsa_state2string(next_state), fsa_cause2string(cause)); + } +- if (crmd_join_phase_count(crm_join_welcomed) > 0) { +- crm_warn("%u cluster nodes failed to respond" +- " to the join offer.", crmd_join_phase_count(crm_join_welcomed)); ++ count = crmd_join_phase_count(crm_join_welcomed); ++ if (count > 0) { ++ crm_warn("%d cluster node%s failed to respond to join offer", ++ count, pcmk__plural_s(count)); + crmd_join_phase_log(LOG_NOTICE); + + } else { +- crm_debug("All %d cluster nodes responded to the join offer.", ++ crm_debug("All cluster nodes (%d) responded to join offer", + crmd_join_phase_count(crm_join_integrated)); + } + break; +@@ -581,34 +623,7 @@ do_state_transition(long long actions, + crm_info("Progressed to state %s after %s", + fsa_state2string(next_state), fsa_cause2string(cause)); + } +- +- if (crmd_join_phase_count(crm_join_finalized) > 0) { +- crm_err("%u cluster nodes failed to confirm their join.", +- crmd_join_phase_count(crm_join_finalized)); +- crmd_join_phase_log(LOG_NOTICE); +- +- } else if (crmd_join_phase_count(crm_join_confirmed) +- == crm_active_peers()) { +- crm_debug("All %u cluster nodes are" +- " eligible to run resources.", crm_active_peers()); +- +- } else if (crmd_join_phase_count(crm_join_confirmed) > crm_active_peers()) { +- crm_err("We have more confirmed nodes than our membership does: %d vs. 
%d", +- crmd_join_phase_count(crm_join_confirmed), crm_active_peers()); +- register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); +- +- } else if (saved_ccm_membership_id != crm_peer_seq) { +- crm_info("Membership changed: %llu -> %llu - join restart", +- saved_ccm_membership_id, crm_peer_seq); +- register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); +- +- } else { +- crm_warn("Only %u of %u cluster " +- "nodes are eligible to run resources - continue %d", +- crmd_join_phase_count(crm_join_confirmed), +- crm_active_peers(), crmd_join_phase_count(crm_join_welcomed)); +- } +-/* initialize_join(FALSE); */ ++ check_join_counts(msg_data); + break; + + case S_STOPPING: +diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c +index 988aaa6..54324b2 100644 +--- a/daemons/controld/controld_join_dc.c ++++ b/daemons/controld/controld_join_dc.c +@@ -26,7 +26,11 @@ void finalize_join_for(gpointer key, gpointer value, gpointer user_data); + void finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data); + gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); + ++/* Numeric counter used to identify join rounds (an unsigned int would be ++ * appropriate, except we get and set it in XML as int) ++ */ + static int current_join_id = 0; ++ + unsigned long long saved_ccm_membership_id = 0; + + void +@@ -34,12 +38,7 @@ crm_update_peer_join(const char *source, crm_node_t * node, enum crm_join_phase + { + enum crm_join_phase last = 0; + +- if(node == NULL) { +- crm_err("Could not update join because node not specified" +- CRM_XS " join-%u source=%s phase=%s", +- current_join_id, source, crm_join_phase_str(phase)); +- return; +- } ++ CRM_CHECK(node != NULL, return); + + /* Remote nodes do not participate in joins */ + if (is_set(node->flags, crm_remote_node)) { +@@ -49,21 +48,23 @@ crm_update_peer_join(const char *source, crm_node_t * node, enum crm_join_phase + last = node->join; + + if(phase == last) { +- crm_trace("%s: Node %s[%u] - join-%u phase still %s", +- source, node->uname, node->id, current_join_id, +- crm_join_phase_str(last)); ++ crm_trace("Node %s join-%d phase is still %s " ++ CRM_XS " nodeid=%u source=%s", ++ node->uname, current_join_id, crm_join_phase_str(last), ++ node->id, source); + + } else if ((phase <= crm_join_none) || (phase == (last + 1))) { + node->join = phase; +- crm_info("%s: Node %s[%u] - join-%u phase %s -> %s", +- source, node->uname, node->id, current_join_id, +- crm_join_phase_str(last), crm_join_phase_str(phase)); ++ crm_trace("Node %s join-%d phase is now %s (was %s) " ++ CRM_XS " nodeid=%u source=%s", ++ node->uname, current_join_id, crm_join_phase_str(phase), ++ crm_join_phase_str(last), node->id, source); + + } else { +- crm_err("Could not update join for node %s because phase transition invalid " +- CRM_XS " join-%u source=%s node_id=%u last=%s new=%s", +- node->uname, current_join_id, source, node->id, +- crm_join_phase_str(last), crm_join_phase_str(phase)); ++ crm_warn("Rejecting join-%d phase update for node %s because " ++ "can't go from %s to %s " CRM_XS " nodeid=%u source=%s", ++ current_join_id, node->uname, crm_join_phase_str(last), ++ crm_join_phase_str(phase), node->id, source); + } + } + +@@ -73,9 +74,7 @@ initialize_join(gboolean before) + GHashTableIter iter; + crm_node_t *peer = NULL; + +- /* clear out/reset a bunch of stuff */ +- crm_debug("join-%d: Initializing join data (flag=%s)", +- current_join_id, before ? 
"true" : "false"); ++ crm_debug("Starting new join round join-%d", current_join_id); + + g_hash_table_iter_init(&iter, crm_peer_cache); + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) { +@@ -128,7 +127,9 @@ join_make_offer(gpointer key, gpointer value, gpointer user_data) + + CRM_ASSERT(member != NULL); + if (crm_is_peer_active(member) == FALSE) { +- crm_info("Not making an offer to %s: not active (%s)", member->uname, member->state); ++ crm_info("Not making join-%d offer to inactive node %s", ++ current_join_id, ++ (member->uname? member->uname : "with unknown name")); + if(member->expected == NULL && safe_str_eq(member->state, CRM_NODE_LOST)) { + /* You would think this unsafe, but in fact this plus an + * active resource is what causes it to be fenced. +@@ -145,17 +146,21 @@ join_make_offer(gpointer key, gpointer value, gpointer user_data) + } + + if (member->uname == NULL) { +- crm_info("No recipient for welcome message.(Node uuid:%s)", member->uuid); ++ crm_info("Not making join-%d offer to node uuid %s with unknown name", ++ current_join_id, member->uuid); + return; + } + + if (saved_ccm_membership_id != crm_peer_seq) { + saved_ccm_membership_id = crm_peer_seq; +- crm_info("Making join offers based on membership %llu", crm_peer_seq); ++ crm_info("Making join-%d offers based on membership event %llu", ++ current_join_id, crm_peer_seq); + } + + if(user_data && member->join > crm_join_none) { +- crm_info("Skipping %s: already known %d", member->uname, member->join); ++ crm_info("Not making join-%d offer to already known node %s (%s)", ++ current_join_id, member->uname, ++ crm_join_phase_str(member->join)); + return; + } + +@@ -166,14 +171,11 @@ join_make_offer(gpointer key, gpointer value, gpointer user_data) + // Advertise our feature set so the joining node can bail if not compatible + crm_xml_add(offer, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET); + +- /* send the welcome */ +- crm_info("join-%d: Sending offer to %s", current_join_id, member->uname); +- ++ crm_info("Sending join-%d offer to %s", current_join_id, member->uname); + send_cluster_message(member, crm_msg_crmd, offer, TRUE); + free_xml(offer); + + crm_update_peer_join(__FUNCTION__, member, crm_join_welcomed); +- /* crm_update_peer_expected(__FUNCTION__, member, CRMD_JOINSTATE_PENDING); */ + } + + /* A_DC_JOIN_OFFER_ALL */ +@@ -183,6 +185,8 @@ do_dc_join_offer_all(long long action, + enum crmd_fsa_state cur_state, + enum crmd_fsa_input current_input, fsa_data_t * msg_data) + { ++ int count; ++ + /* Reset everyone's status back to down or in_ccm in the CIB. + * Any nodes that are active in the CIB but not in the cluster membership + * will be seen as offline by the scheduler anyway. 
+@@ -197,9 +201,11 @@ do_dc_join_offer_all(long long action, + } + g_hash_table_foreach(crm_peer_cache, join_make_offer, NULL); + ++ count = crmd_join_phase_count(crm_join_welcomed); ++ crm_info("Waiting on join-%d requests from %d outstanding node%s", ++ current_join_id, count, pcmk__plural_s(count)); ++ + // Don't waste time by invoking the scheduler yet +- crm_info("join-%d: Waiting on %d outstanding join acks", +- current_join_id, crmd_join_phase_count(crm_join_welcomed)); + } + + /* A_DC_JOIN_OFFER_ONE */ +@@ -211,50 +217,40 @@ do_dc_join_offer_one(long long action, + { + crm_node_t *member; + ha_msg_input_t *welcome = NULL; +- +- const char *op = NULL; ++ int count; + const char *join_to = NULL; + +- if (msg_data->data) { +- welcome = fsa_typed_data(fsa_dt_ha_msg); +- +- } else { +- crm_info("An unknown node joined - (re-)offer to any unconfirmed nodes"); ++ if (msg_data->data == NULL) { ++ crm_info("Making join-%d offers to any unconfirmed nodes " ++ "because an unknown node joined", current_join_id); + g_hash_table_foreach(crm_peer_cache, join_make_offer, &member); + check_join_state(cur_state, __FUNCTION__); + return; + } + ++ welcome = fsa_typed_data(fsa_dt_ha_msg); + if (welcome == NULL) { +- crm_err("Attempt to send welcome message without a message to reply to!"); ++ // fsa_typed_data() already logged an error + return; + } + + join_to = crm_element_value(welcome->msg, F_CRM_HOST_FROM); + if (join_to == NULL) { +- crm_err("Attempt to send welcome message without a host to reply to!"); ++ crm_err("Can't make join-%d offer to unknown node", current_join_id); + return; + } +- + member = crm_get_peer(0, join_to); +- op = crm_element_value(welcome->msg, F_CRM_TASK); +- if (join_to != NULL && (cur_state == S_INTEGRATION || cur_state == S_FINALIZE_JOIN)) { +- /* note: it _is_ possible that a node will have been +- * sick or starting up when the original offer was made. +- * however, it will either re-announce itself in due course +- * _or_ we can re-store the original offer on the client. +- */ +- crm_trace("(Re-)offering membership to %s...", join_to); +- } + +- crm_info("join-%d: Processing %s request from %s in state %s", +- current_join_id, op, join_to, fsa_state2string(cur_state)); ++ /* It is possible that a node will have been sick or starting up when the ++ * original offer was made. However, it will either re-announce itself in ++ * due course, or we can re-store the original offer on the client. ++ */ + + crm_update_peer_join(__FUNCTION__, member, crm_join_none); + join_make_offer(NULL, member, NULL); + +- /* always offer to the DC (ourselves) +- * this ensures the correct value for max_generation_from ++ /* If the offer isn't to the local node, make an offer to the local node as ++ * well, to ensure the correct value for max_generation_from. 
+ */ + if (strcmp(join_to, fsa_our_uname) != 0) { + member = crm_get_peer(0, fsa_our_uname); +@@ -266,9 +262,11 @@ do_dc_join_offer_one(long long action, + */ + abort_transition(INFINITY, tg_restart, "Node join", NULL); + ++ count = crmd_join_phase_count(crm_join_welcomed); ++ crm_info("Waiting on join-%d requests from %d outstanding node%s", ++ current_join_id, count, pcmk__plural_s(count)); ++ + // Don't waste time by invoking the scheduler yet +- crm_debug("Waiting on %d outstanding join acks for join-%d", +- crmd_join_phase_count(crm_join_welcomed), current_join_id); + } + + static int +@@ -301,22 +299,31 @@ do_dc_join_filter_offer(long long action, + + int cmp = 0; + int join_id = -1; ++ int count = 0; + gboolean ack_nack_bool = TRUE; +- const char *ack_nack = CRMD_JOINSTATE_MEMBER; + ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); + + const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM); + const char *ref = crm_element_value(join_ack->msg, F_CRM_REFERENCE); + const char *join_version = crm_element_value(join_ack->msg, + XML_ATTR_CRM_VERSION); ++ crm_node_t *join_node = NULL; + +- crm_node_t *join_node = crm_get_peer(0, join_from); +- +- crm_debug("Processing req from %s", join_from); ++ if (join_from == NULL) { ++ crm_err("Ignoring invalid join request without node name"); ++ return; ++ } ++ join_node = crm_get_peer(0, join_from); + +- generation = join_ack->xml; + crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id); ++ if (join_id != current_join_id) { ++ crm_debug("Ignoring join-%d request from %s because we are on join-%d", ++ join_id, join_from, current_join_id); ++ check_join_state(cur_state, __FUNCTION__); ++ return; ++ } + ++ generation = join_ack->xml; + if (max_generation_xml != NULL && generation != NULL) { + int lpc = 0; + +@@ -331,68 +338,71 @@ do_dc_join_filter_offer(long long action, + } + } + +- if (join_id != current_join_id) { +- crm_debug("Invalid response from %s: join-%d vs. join-%d", +- join_from, join_id, current_join_id); +- check_join_state(cur_state, __FUNCTION__); +- return; ++ if (ref == NULL) { ++ ref = "none"; // for logging only ++ } + +- } else if (join_node == NULL || crm_is_peer_active(join_node) == FALSE) { +- crm_err("Node %s is not a member", join_from); ++ if (crm_is_peer_active(join_node) == FALSE) { ++ crm_err("Rejecting join-%d request from inactive node %s " ++ CRM_XS " ref=%s", join_id, join_from, ref); + ack_nack_bool = FALSE; + + } else if (generation == NULL) { +- crm_err("Generation was NULL"); ++ crm_err("Rejecting invalid join-%d request from node %s " ++ "missing CIB generation " CRM_XS " ref=%s", ++ join_id, join_from, ref); + ack_nack_bool = FALSE; + + } else if ((join_version == NULL) + || !feature_set_compatible(CRM_FEATURE_SET, join_version)) { +- crm_err("Node %s feature set (%s) is incompatible with ours (%s)", +- join_from, (join_version? join_version : "pre-3.1.0"), +- CRM_FEATURE_SET); ++ crm_err("Rejecting join-%d request from node %s because feature set %s" ++ " is incompatible with ours (%s) " CRM_XS " ref=%s", ++ join_id, join_from, (join_version? 
join_version : "pre-3.1.0"), ++ CRM_FEATURE_SET, ref); + ack_nack_bool = FALSE; + + } else if (max_generation_xml == NULL) { ++ crm_debug("Accepting join-%d request from %s " ++ "(with first CIB generation) " CRM_XS " ref=%s", ++ join_id, join_from, ref); + max_generation_xml = copy_xml(generation); + max_generation_from = strdup(join_from); + + } else if (cmp < 0 || (cmp == 0 && safe_str_eq(join_from, fsa_our_uname))) { +- crm_debug("%s has a better generation number than" +- " the current max %s", join_from, max_generation_from); +- if (max_generation_xml) { +- crm_log_xml_debug(max_generation_xml, "Max generation"); +- } +- crm_log_xml_debug(generation, "Their generation"); ++ crm_debug("Accepting join-%d request from %s (with better " ++ "CIB generation than current best from %s) " CRM_XS " ref=%s", ++ join_id, join_from, max_generation_from, ref); ++ crm_log_xml_debug(max_generation_xml, "Old max generation"); ++ crm_log_xml_debug(generation, "New max generation"); + + free(max_generation_from); + free_xml(max_generation_xml); + + max_generation_from = strdup(join_from); + max_generation_xml = copy_xml(join_ack->xml); ++ ++ } else { ++ crm_debug("Accepting join-%d request from %s " CRM_XS " ref=%s", ++ join_id, join_from, ref); + } + + if (ack_nack_bool == FALSE) { +- /* NACK this client */ +- ack_nack = CRMD_JOINSTATE_NACK; + crm_update_peer_join(__FUNCTION__, join_node, crm_join_nack); +- crm_err("Rejecting cluster join request from %s " CRM_XS +- " NACK join-%d ref=%s", join_from, join_id, ref); +- ++ crm_update_peer_expected(__FUNCTION__, join_node, CRMD_JOINSTATE_NACK); + } else { +- crm_debug("join-%d: Welcoming node %s (ref %s)", join_id, join_from, ref); + crm_update_peer_join(__FUNCTION__, join_node, crm_join_integrated); ++ crm_update_peer_expected(__FUNCTION__, join_node, CRMD_JOINSTATE_MEMBER); + } + +- crm_update_peer_expected(__FUNCTION__, join_node, ack_nack); +- +- crm_debug("%u nodes have been integrated into join-%d", +- crmd_join_phase_count(crm_join_integrated), join_id); +- ++ count = crmd_join_phase_count(crm_join_integrated); ++ crm_debug("%d node%s currently integrated in join-%d", ++ count, pcmk__plural_s(count), join_id); + + if (check_join_state(cur_state, __FUNCTION__) == FALSE) { + // Don't waste time by invoking the scheduler yet +- crm_debug("join-%d: Still waiting on %d outstanding offers", +- join_id, crmd_join_phase_count(crm_join_welcomed)); ++ count = crmd_join_phase_count(crm_join_welcomed); ++ crm_debug("Waiting on join-%d requests from %d outstanding node%s", ++ join_id, count, pcmk__plural_s(count)); + } + } + +@@ -405,21 +415,24 @@ do_dc_join_finalize(long long action, + { + char *sync_from = NULL; + int rc = pcmk_ok; ++ int count_welcomed = crmd_join_phase_count(crm_join_welcomed); ++ int count_integrated = crmd_join_phase_count(crm_join_integrated); + + /* This we can do straight away and avoid clients timing us out + * while we compute the latest CIB + */ +- crm_debug("Finalizing join-%d for %d clients", +- current_join_id, crmd_join_phase_count(crm_join_integrated)); +- +- crmd_join_phase_log(LOG_INFO); +- if (crmd_join_phase_count(crm_join_welcomed) != 0) { +- crm_info("Waiting for %d more nodes", crmd_join_phase_count(crm_join_welcomed)); ++ if (count_welcomed != 0) { ++ crm_debug("Waiting on join-%d requests from %d outstanding node%s " ++ "before finalizing join", current_join_id, count_welcomed, ++ pcmk__plural_s(count_welcomed)); ++ crmd_join_phase_log(LOG_DEBUG); + /* crmd_fsa_stall(FALSE); Needed? 
*/ + return; + +- } else if (crmd_join_phase_count(crm_join_integrated) == 0) { +- /* Nothing to do */ ++ } else if (count_integrated == 0) { ++ crm_debug("Finalization not needed for join-%d at the current time", ++ current_join_id); ++ crmd_join_phase_log(LOG_DEBUG); + check_join_state(fsa_state, __FUNCTION__); + return; + } +@@ -430,8 +443,9 @@ do_dc_join_finalize(long long action, + } + + if (is_set(fsa_input_register, R_IN_TRANSITION)) { +- crm_warn("Delaying response to cluster join offer while transition in progress " +- CRM_XS " join-%d", current_join_id); ++ crm_warn("Delaying join-%d finalization while transition in progress", ++ current_join_id); ++ crmd_join_phase_log(LOG_DEBUG); + crmd_fsa_stall(FALSE); + return; + } +@@ -440,18 +454,20 @@ do_dc_join_finalize(long long action, + /* ask for the agreed best CIB */ + sync_from = strdup(max_generation_from); + set_bit(fsa_input_register, R_CIB_ASKED); +- crm_notice("Syncing the Cluster Information Base from %s to rest of cluster " +- CRM_XS " join-%d", sync_from, current_join_id); +- crm_log_xml_notice(max_generation_xml, "Requested version"); ++ crm_notice("Finalizing join-%d for %d node%s (sync'ing CIB from %s)", ++ current_join_id, count_integrated, ++ pcmk__plural_s(count_integrated), sync_from); ++ crm_log_xml_notice(max_generation_xml, "Requested CIB version"); + + } else { + /* Send _our_ CIB out to everyone */ + sync_from = strdup(fsa_our_uname); +- crm_info("join-%d: Syncing our CIB to the rest of the cluster", +- current_join_id); +- crm_log_xml_debug(max_generation_xml, "Requested version"); ++ crm_debug("Finalizing join-%d for %d node%s (sync'ing from local CIB)", ++ current_join_id, count_integrated, ++ pcmk__plural_s(count_integrated)); ++ crm_log_xml_debug(max_generation_xml, "Requested CIB version"); + } +- ++ crmd_join_phase_log(LOG_DEBUG); + + rc = fsa_cib_conn->cmds->sync_from(fsa_cib_conn, sync_from, NULL, cib_quorum_override); + fsa_register_cib_callback(rc, FALSE, sync_from, finalize_sync_callback); +@@ -463,26 +479,33 @@ finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, voi + CRM_LOG_ASSERT(-EPERM != rc); + clear_bit(fsa_input_register, R_CIB_ASKED); + if (rc != pcmk_ok) { +- do_crm_log((rc == -pcmk_err_old_data ? LOG_WARNING : LOG_ERR), +- "Sync from %s failed: %s", (char *)user_data, pcmk_strerror(rc)); ++ do_crm_log(((rc == -pcmk_err_old_data)? 
LOG_WARNING : LOG_ERR), ++ "Could not sync CIB from %s in join-%d: %s", ++ (char *) user_data, current_join_id, pcmk_strerror(rc)); + + /* restart the whole join process */ + register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION_DC, NULL, NULL, __FUNCTION__); + +- } else if (AM_I_DC && fsa_state == S_FINALIZE_JOIN) { ++ } else if (!AM_I_DC) { ++ crm_debug("Sync'ed CIB for join-%d but no longer DC", current_join_id); ++ ++ } else if (fsa_state != S_FINALIZE_JOIN) { ++ crm_debug("Sync'ed CIB for join-%d but no longer in S_FINALIZE_JOIN (%s)", ++ current_join_id, fsa_state2string(fsa_state)); ++ ++ } else { + set_bit(fsa_input_register, R_HAVE_CIB); + clear_bit(fsa_input_register, R_CIB_ASKED); + + /* make sure dc_uuid is re-set to us */ + if (check_join_state(fsa_state, __FUNCTION__) == FALSE) { +- crm_debug("Notifying %d clients of join-%d results", +- crmd_join_phase_count(crm_join_integrated), current_join_id); ++ int count_integrated = crmd_join_phase_count(crm_join_integrated); ++ ++ crm_debug("Notifying %d node%s of join-%d results", ++ count_integrated, pcmk__plural_s(count_integrated), ++ current_join_id); + g_hash_table_foreach(crm_peer_cache, finalize_join_for, NULL); + } +- +- } else { +- crm_debug("No longer the DC in S_FINALIZE_JOIN: %s in %s", +- AM_I_DC ? "DC" : "controller", fsa_state2string(fsa_state)); + } + } + +@@ -492,11 +515,14 @@ join_update_complete_callback(xmlNode * msg, int call_id, int rc, xmlNode * outp + fsa_data_t *msg_data = NULL; + + if (rc == pcmk_ok) { +- crm_debug("Join update %d complete", call_id); ++ crm_debug("join-%d node history update (via CIB call %d) complete", ++ current_join_id, call_id); + check_join_state(fsa_state, __FUNCTION__); + + } else { +- crm_err("Join update %d failed", call_id); ++ crm_err("join-%d node history update (via CIB call %d) failed: %s " ++ "(next transition may determine resource status incorrectly)", ++ current_join_id, call_id, pcmk_strerror(rc)); + crm_log_xml_debug(msg, "failed"); + register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); + } +@@ -515,61 +541,75 @@ do_dc_join_ack(long long action, + + const char *op = crm_element_value(join_ack->msg, F_CRM_TASK); + const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM); +- crm_node_t *peer = crm_get_peer(0, join_from); ++ crm_node_t *peer = NULL; + +- if (safe_str_neq(op, CRM_OP_JOIN_CONFIRM) || peer == NULL) { +- crm_debug("Ignoring op=%s message from %s", op, join_from); ++ // Sanity checks ++ if (join_from == NULL) { ++ crm_warn("Ignoring message received without node identification"); ++ return; ++ } ++ if (op == NULL) { ++ crm_warn("Ignoring message received from %s without task", join_from); + return; + } + +- crm_trace("Processing ack from %s", join_from); +- crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id); ++ if (strcmp(op, CRM_OP_JOIN_CONFIRM)) { ++ crm_debug("Ignoring '%s' message from %s while waiting for '%s'", ++ op, join_from, CRM_OP_JOIN_CONFIRM); ++ return; ++ } + ++ if (crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id) != 0) { ++ crm_warn("Ignoring join confirmation from %s without valid join ID", ++ join_from); ++ return; ++ } ++ ++ peer = crm_get_peer(0, join_from); + if (peer->join != crm_join_finalized) { +- crm_info("Join not in progress: ignoring join-%d from %s (phase = %d)", +- join_id, join_from, peer->join); ++ crm_info("Ignoring out-of-sequence join-%d confirmation from %s " ++ "(currently %s not %s)", ++ join_id, join_from, crm_join_phase_str(peer->join), ++ crm_join_phase_str(crm_join_finalized)); + 
return; ++ } + +- } else if (join_id != current_join_id) { +- crm_err("Invalid response from %s: join-%d vs. join-%d", +- join_from, join_id, current_join_id); ++ if (join_id != current_join_id) { ++ crm_err("Rejecting join-%d confirmation from %s " ++ "because currently on join-%d", ++ join_id, join_from, current_join_id); + crm_update_peer_join(__FUNCTION__, peer, crm_join_nack); + return; + } + + crm_update_peer_join(__FUNCTION__, peer, crm_join_confirmed); + +- crm_info("join-%d: Updating node state to %s for %s", +- join_id, CRMD_JOINSTATE_MEMBER, join_from); +- +- /* update CIB with the current LRM status from the node +- * We don't need to notify the TE of these updates, a transition will +- * be started in due time ++ /* Update CIB with node's current executor state. A new transition will be ++ * triggered later, when the CIB notifies us of the change. + */ + erase_status_tag(join_from, XML_CIB_TAG_LRM, cib_scope_local); +- + if (safe_str_eq(join_from, fsa_our_uname)) { + xmlNode *now_dc_lrmd_state = do_lrm_query(TRUE, fsa_our_uname); + + if (now_dc_lrmd_state != NULL) { +- crm_debug("Local executor state updated from query"); + fsa_cib_update(XML_CIB_TAG_STATUS, now_dc_lrmd_state, + cib_scope_local | cib_quorum_override | cib_can_create, call_id, NULL); + free_xml(now_dc_lrmd_state); ++ crm_debug("Updating local node history for join-%d " ++ "from query result (via CIB call %d)", join_id, call_id); + } else { +- crm_warn("Local executor state updated from join acknowledgement because query failed"); + fsa_cib_update(XML_CIB_TAG_STATUS, join_ack->xml, + cib_scope_local | cib_quorum_override | cib_can_create, call_id, NULL); ++ crm_warn("Updating local node history from join-%d confirmation " ++ "because query failed (via CIB call %d)", join_id, call_id); + } + } else { +- crm_debug("Executor state for %s updated from join acknowledgement", +- join_from); + fsa_cib_update(XML_CIB_TAG_STATUS, join_ack->xml, + cib_scope_local | cib_quorum_override | cib_can_create, call_id, NULL); ++ crm_debug("Updating node history for %s from join-%d confirmation " ++ "(via CIB call %d)", join_from, join_id, call_id); + } +- + fsa_register_cib_callback(call_id, FALSE, NULL, join_update_complete_callback); +- crm_debug("join-%d: Registered callback for CIB status update %d", join_id, call_id); + } + + void +@@ -581,17 +621,16 @@ finalize_join_for(gpointer key, gpointer value, gpointer user_data) + const char *join_to = join_node->uname; + + if(join_node->join != crm_join_integrated) { +- crm_trace("Skipping %s in state %d", join_to, join_node->join); ++ crm_trace("Not updating non-integrated node %s (%s) for join-%d", ++ join_to, crm_join_phase_str(join_node->join), ++ current_join_id); + return; + } + +- /* make sure a node entry exists for the new node */ +- crm_trace("Creating node entry for %s", join_to); +- ++ crm_trace("Updating node state for %s", join_to); + tmp1 = create_xml_node(NULL, XML_CIB_TAG_NODE); + set_uuid(tmp1, XML_ATTR_UUID, join_node); + crm_xml_add(tmp1, XML_ATTR_UNAME, join_to); +- + fsa_cib_anon_update(XML_CIB_TAG_NODES, tmp1); + free_xml(tmp1); + +@@ -610,11 +649,10 @@ finalize_join_for(gpointer key, gpointer value, gpointer user_data) + return; + } + +- /* send the ack/nack to the node */ +- acknak = create_dc_message(CRM_OP_JOIN_ACKNAK, join_to); +- +- crm_debug("join-%d: ACK'ing join request from %s", ++ // Acknowledge node's join request ++ crm_debug("Acknowledging join-%d request from %s", + current_join_id, join_to); ++ acknak = create_dc_message(CRM_OP_JOIN_ACKNAK, 
join_to); + crm_xml_add(acknak, CRM_OP_JOIN_ACKNAK, XML_BOOLEAN_TRUE); + crm_update_peer_join(__FUNCTION__, join_node, crm_join_finalized); + crm_update_peer_expected(__FUNCTION__, join_node, CRMD_JOINSTATE_MEMBER); +@@ -629,11 +667,11 @@ check_join_state(enum crmd_fsa_state cur_state, const char *source) + { + static unsigned long long highest_seq = 0; + +- crm_debug("Invoked by %s in state: %s", source, fsa_state2string(cur_state)); +- + if (saved_ccm_membership_id != crm_peer_seq) { +- crm_debug("%s: Membership changed since join started: %llu -> %llu (%llu)", +- source, saved_ccm_membership_id, crm_peer_seq, highest_seq); ++ crm_debug("join-%d: Membership changed from %llu to %llu " ++ CRM_XS " highest=%llu state=%s for=%s", ++ current_join_id, saved_ccm_membership_id, crm_peer_seq, highest_seq, ++ fsa_state2string(cur_state), source); + if(highest_seq < crm_peer_seq) { + /* Don't spam the FSA with duplicates */ + highest_seq = crm_peer_seq; +@@ -642,34 +680,53 @@ check_join_state(enum crmd_fsa_state cur_state, const char *source) + + } else if (cur_state == S_INTEGRATION) { + if (crmd_join_phase_count(crm_join_welcomed) == 0) { +- crm_debug("join-%d: Integration of %d peers complete: %s", +- current_join_id, crmd_join_phase_count(crm_join_integrated), source); ++ int count = crmd_join_phase_count(crm_join_integrated); ++ ++ crm_debug("join-%d: Integration of %d peer%s complete " ++ CRM_XS " state=%s for=%s", ++ current_join_id, count, pcmk__plural_s(count), ++ fsa_state2string(cur_state), source); + register_fsa_input_before(C_FSA_INTERNAL, I_INTEGRATED, NULL); + return TRUE; + } + + } else if (cur_state == S_FINALIZE_JOIN) { + if (is_set(fsa_input_register, R_HAVE_CIB) == FALSE) { +- crm_debug("join-%d: Delaying I_FINALIZED until we have the CIB", current_join_id); ++ crm_debug("join-%d: Delaying finalization until we have CIB " ++ CRM_XS " state=%s for=%s", ++ current_join_id, fsa_state2string(cur_state), source); + return TRUE; + + } else if (crmd_join_phase_count(crm_join_welcomed) != 0) { +- crm_debug("join-%d: Still waiting on %d welcomed nodes", +- current_join_id, crmd_join_phase_count(crm_join_welcomed)); ++ int count = crmd_join_phase_count(crm_join_welcomed); ++ ++ crm_debug("join-%d: Still waiting on %d welcomed node%s " ++ CRM_XS " state=%s for=%s", ++ current_join_id, count, pcmk__plural_s(count), ++ fsa_state2string(cur_state), source); + crmd_join_phase_log(LOG_DEBUG); + + } else if (crmd_join_phase_count(crm_join_integrated) != 0) { +- crm_debug("join-%d: Still waiting on %d integrated nodes", +- current_join_id, crmd_join_phase_count(crm_join_integrated)); ++ int count = crmd_join_phase_count(crm_join_integrated); ++ ++ crm_debug("join-%d: Still waiting on %d integrated node%s " ++ CRM_XS " state=%s for=%s", ++ current_join_id, count, pcmk__plural_s(count), ++ fsa_state2string(cur_state), source); + crmd_join_phase_log(LOG_DEBUG); + + } else if (crmd_join_phase_count(crm_join_finalized) != 0) { +- crm_debug("join-%d: Still waiting on %d finalized nodes", +- current_join_id, crmd_join_phase_count(crm_join_finalized)); ++ int count = crmd_join_phase_count(crm_join_finalized); ++ ++ crm_debug("join-%d: Still waiting on %d finalized node%s " ++ CRM_XS " state=%s for=%s", ++ current_join_id, count, pcmk__plural_s(count), ++ fsa_state2string(cur_state), source); + crmd_join_phase_log(LOG_DEBUG); + + } else { +- crm_debug("join-%d complete: %s", current_join_id, source); ++ crm_debug("join-%d: Complete " CRM_XS " state=%s for=%s", ++ current_join_id, 
fsa_state2string(cur_state), source); + register_fsa_input_later(C_FSA_INTERNAL, I_FINALIZED, NULL); + return TRUE; + } +-- +1.8.3.1 + + +From 034b27734d05e8aeddb586f2daaede8314f9516f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 13 Dec 2019 10:39:34 -0600 +Subject: [PATCH 6/8] Log: controller: improve CIB status deletion messages + +--- + daemons/controld/controld_utils.c | 25 +++++++++++++++++-------- + 1 file changed, 17 insertions(+), 8 deletions(-) + +diff --git a/daemons/controld/controld_utils.c b/daemons/controld/controld_utils.c +index 3acd488..bb8ace9 100644 +--- a/daemons/controld/controld_utils.c ++++ b/daemons/controld/controld_utils.c +@@ -751,14 +751,18 @@ update_dc(xmlNode * msg) + return TRUE; + } + +-#define STATUS_PATH_MAX 512 + static void + erase_xpath_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) + { + char *xpath = user_data; + +- do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE, +- "Deletion of \"%s\": %s (rc=%d)", xpath, pcmk_strerror(rc), rc); ++ if (rc == 0) { ++ crm_debug("Deletion of '%s' from CIB (via CIB call %d) succeeded", ++ xpath, call_id); ++ } else { ++ crm_warn("Deletion of '%s' from CIB (via CIB call %d) failed: %s " ++ CRM_XS " rc=%d", xpath, call_id, pcmk_strerror(rc), rc); ++ } + } + + #define XPATH_STATUS_TAG "//node_state[@uname='%s']/%s" +@@ -766,14 +770,19 @@ erase_xpath_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void + void + erase_status_tag(const char *uname, const char *tag, int options) + { +- if (fsa_cib_conn && uname) { ++ CRM_CHECK(uname != NULL, return); ++ ++ if (fsa_cib_conn == NULL) { ++ crm_warn("Unable to delete CIB '%s' section for node %s: " ++ "no CIB connection", tag, uname); ++ } else { + int call_id; + char *xpath = crm_strdup_printf(XPATH_STATUS_TAG, uname, tag); + +- crm_info("Deleting %s status entries for %s " CRM_XS " xpath=%s", +- tag, uname, xpath); +- call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, xpath, NULL, +- cib_quorum_override | cib_xpath | options); ++ options |= cib_quorum_override|cib_xpath; ++ call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, xpath, NULL, options); ++ crm_info("Deleting CIB '%s' section for node %s (via CIB call %d) " ++ CRM_XS " xpath=%s", tag, uname, call_id, xpath); + fsa_register_cib_callback(call_id, FALSE, xpath, erase_xpath_callback); + // CIB library handles freeing xpath + } +-- +1.8.3.1 + + +From 73510818bc9905dcc130893198590b10c0067425 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 13 Dec 2019 10:36:56 -0600 +Subject: [PATCH 7/8] Refactor: controller: move erase_status_tag() to + controld_based.c + +--- + daemons/controld/controld_based.c | 38 ++++++++++++++++++++++++++++++++++++++ + daemons/controld/controld_utils.c | 37 ------------------------------------- + 2 files changed, 38 insertions(+), 37 deletions(-) + +diff --git a/daemons/controld/controld_based.c b/daemons/controld/controld_based.c +index e6a4612..1db5650 100644 +--- a/daemons/controld/controld_based.c ++++ b/daemons/controld/controld_based.c +@@ -168,3 +168,41 @@ controld_action_is_recordable(const char *action) + } + return TRUE; + } ++ ++static void ++erase_xpath_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, ++ void *user_data) ++{ ++ char *xpath = user_data; ++ ++ if (rc == 0) { ++ crm_debug("Deletion of '%s' from CIB (via CIB call %d) succeeded", ++ xpath, call_id); ++ } else { ++ crm_warn("Deletion of '%s' from CIB (via CIB call %d) failed: %s " ++ CRM_XS " rc=%d", xpath, call_id, pcmk_strerror(rc), rc); ++ } ++} 
++
++#define XPATH_STATUS_TAG "//node_state[@uname='%s']/%s"
++
++void
++erase_status_tag(const char *uname, const char *tag, int options)
++{
++    CRM_CHECK(uname != NULL, return);
++
++    if (fsa_cib_conn == NULL) {
++        crm_warn("Unable to delete CIB '%s' section for node %s: "
++                 "no CIB connection", tag, uname);
++    } else {
++        int call_id;
++        char *xpath = crm_strdup_printf(XPATH_STATUS_TAG, uname, tag);
++
++        options |= cib_quorum_override|cib_xpath;
++        call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, xpath, NULL, options);
++        crm_info("Deleting CIB '%s' section for node %s (via CIB call %d) "
++                 CRM_XS " xpath=%s", tag, uname, call_id, xpath);
++        fsa_register_cib_callback(call_id, FALSE, xpath, erase_xpath_callback);
++        // CIB library handles freeing xpath
++    }
++}
+diff --git a/daemons/controld/controld_utils.c b/daemons/controld/controld_utils.c
+index bb8ace9..4ed6aeb 100644
+--- a/daemons/controld/controld_utils.c
++++ b/daemons/controld/controld_utils.c
+@@ -751,43 +751,6 @@ update_dc(xmlNode * msg)
+     return TRUE;
+ }
+ 
+-static void
+-erase_xpath_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
+-{
+-    char *xpath = user_data;
+-
+-    if (rc == 0) {
+-        crm_debug("Deletion of '%s' from CIB (via CIB call %d) succeeded",
+-                  xpath, call_id);
+-    } else {
+-        crm_warn("Deletion of '%s' from CIB (via CIB call %d) failed: %s "
+-                 CRM_XS " rc=%d", xpath, call_id, pcmk_strerror(rc), rc);
+-    }
+-}
+-
+-#define XPATH_STATUS_TAG "//node_state[@uname='%s']/%s"
+-
+-void
+-erase_status_tag(const char *uname, const char *tag, int options)
+-{
+-    CRM_CHECK(uname != NULL, return);
+-
+-    if (fsa_cib_conn == NULL) {
+-        crm_warn("Unable to delete CIB '%s' section for node %s: "
+-                 "no CIB connection", tag, uname);
+-    } else {
+-        int call_id;
+-        char *xpath = crm_strdup_printf(XPATH_STATUS_TAG, uname, tag);
+-
+-        options |= cib_quorum_override|cib_xpath;
+-        call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, xpath, NULL, options);
+-        crm_info("Deleting CIB '%s' section for node %s (via CIB call %d) "
+-                 CRM_XS " xpath=%s", tag, uname, call_id, xpath);
+-        fsa_register_cib_callback(call_id, FALSE, xpath, erase_xpath_callback);
+-        // CIB library handles freeing xpath
+-    }
+-}
+-
+ void crmd_peer_down(crm_node_t *peer, bool full)
+ {
+     if(full && peer->state == NULL) {
+-- 
+1.8.3.1
+
+
+From c4cc759e733db894957d039f65572cc21704224f Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Fri, 13 Dec 2019 11:16:25 -0600
+Subject: [PATCH 8/8] Refactor: controller: improve efficiency when deleting
+ node state
+
+Rename erase_status_tag() to controld_delete_node_state() to follow current
+naming practice.
+
+Instead of passing it a node_state subsection name, pass a new enum value
+indicating what to erase (resource history, transient node attributes, or
+both). This allows us to improve the log messages further, as well as improving
+efficiency when both need to be cleared.
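+
+(Editorial illustration, not part of the upstream commit: callers pick a
+section to erase via the new enum, as the hunks below do in several daemons;
+"node->uname" stands in for whatever node name a given caller has at hand.)
+
+    /* Erase only the node's resource history */
+    controld_delete_node_state(node->uname, controld_section_lrm,
+                               cib_scope_local);
+
+    /* Erase everything under its node_state, e.g. after a successful fence */
+    controld_delete_node_state(node->uname, controld_section_all,
+                               cib_scope_local);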
+--- + daemons/controld/controld_based.c | 69 +++++++++++++++++++++++++++-------- + daemons/controld/controld_callbacks.c | 8 +++- + daemons/controld/controld_execd.c | 3 +- + daemons/controld/controld_fencing.c | 5 +-- + daemons/controld/controld_join_dc.c | 3 +- + daemons/controld/controld_remote_ra.c | 24 ++++++------ + daemons/controld/controld_utils.h | 11 +++++- + 7 files changed, 87 insertions(+), 36 deletions(-) + +diff --git a/daemons/controld/controld_based.c b/daemons/controld/controld_based.c +index 1db5650..008a02d 100644 +--- a/daemons/controld/controld_based.c ++++ b/daemons/controld/controld_based.c +@@ -170,39 +170,76 @@ controld_action_is_recordable(const char *action) + } + + static void +-erase_xpath_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, +- void *user_data) ++cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, ++ void *user_data) + { +- char *xpath = user_data; ++ char *desc = user_data; + + if (rc == 0) { +- crm_debug("Deletion of '%s' from CIB (via CIB call %d) succeeded", +- xpath, call_id); ++ crm_debug("Deletion of %s (via CIB call %d) succeeded", desc, call_id); + } else { +- crm_warn("Deletion of '%s' from CIB (via CIB call %d) failed: %s " +- CRM_XS " rc=%d", xpath, call_id, pcmk_strerror(rc), rc); ++ crm_warn("Deletion of %s (via CIB call %d) failed: %s " CRM_XS " rc=%d", ++ desc, call_id, pcmk_strerror(rc), rc); + } + } + +-#define XPATH_STATUS_TAG "//node_state[@uname='%s']/%s" ++// Searches for various portions of node_state to delete + ++// Match a particular node's node_state (takes node name 1x) ++#define XPATH_NODE_STATE "//" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']" ++ ++// Node's lrm section (name 1x) ++#define XPATH_NODE_LRM XPATH_NODE_STATE "/" XML_CIB_TAG_LRM ++ ++// Node's transient_attributes section (name 1x) ++#define XPATH_NODE_ATTRS XPATH_NODE_STATE "/" XML_TAG_TRANSIENT_NODEATTRS ++ ++// Everything under node_state (name 1x) ++#define XPATH_NODE_ALL XPATH_NODE_STATE "/*" ++ ++/*! 
++ * \internal ++ * \brief Delete subsection of a node's CIB node_state ++ * ++ * \param[in] uname Desired node ++ * \param[in] section Subsection of node_state to delete ++ * \param[in] options CIB call options to use ++ */ + void +-erase_status_tag(const char *uname, const char *tag, int options) ++controld_delete_node_state(const char *uname, enum controld_section_e section, ++ int options) + { ++ char *xpath = NULL; ++ char *desc = NULL; ++ + CRM_CHECK(uname != NULL, return); ++ switch (section) { ++ case controld_section_lrm: ++ xpath = crm_strdup_printf(XPATH_NODE_LRM, uname); ++ desc = crm_strdup_printf("resource history for node %s", uname); ++ break; ++ case controld_section_attrs: ++ xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname); ++ desc = crm_strdup_printf("transient attributes for node %s", uname); ++ break; ++ case controld_section_all: ++ xpath = crm_strdup_printf(XPATH_NODE_ALL, uname); ++ desc = crm_strdup_printf("all state for node %s", uname); ++ break; ++ } + + if (fsa_cib_conn == NULL) { +- crm_warn("Unable to delete CIB '%s' section for node %s: " +- "no CIB connection", tag, uname); ++ crm_warn("Unable to delete %s: no CIB connection", desc); ++ free(desc); + } else { + int call_id; +- char *xpath = crm_strdup_printf(XPATH_STATUS_TAG, uname, tag); + + options |= cib_quorum_override|cib_xpath; + call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, xpath, NULL, options); +- crm_info("Deleting CIB '%s' section for node %s (via CIB call %d) " +- CRM_XS " xpath=%s", tag, uname, call_id, xpath); +- fsa_register_cib_callback(call_id, FALSE, xpath, erase_xpath_callback); +- // CIB library handles freeing xpath ++ crm_info("Deleting %s (via CIB call %d) " CRM_XS " xpath=%s", ++ desc, call_id, xpath); ++ fsa_register_cib_callback(call_id, FALSE, desc, cib_delete_callback); ++ // CIB library handles freeing desc + } ++ free(xpath); + } +diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c +index 5cbd392..f7e3db2 100644 +--- a/daemons/controld/controld_callbacks.c ++++ b/daemons/controld/controld_callbacks.c +@@ -200,14 +200,18 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d + * transient attributes intact until it rejoins. 
+ */ + if (compare_version(fsa_our_dc_version, "3.0.9") > 0) { +- erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); ++ controld_delete_node_state(node->uname, ++ controld_section_attrs, ++ cib_scope_local); + } + + } else if(AM_I_DC) { + if (appeared) { + te_trigger_stonith_history_sync(FALSE); + } else { +- erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); ++ controld_delete_node_state(node->uname, ++ controld_section_attrs, ++ cib_scope_local); + } + } + break; +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 46c1958..b7deeae 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -1411,7 +1411,8 @@ force_reprobe(lrm_state_t *lrm_state, const char *from_sys, + } + + /* Now delete the copy in the CIB */ +- erase_status_tag(lrm_state->node_name, XML_CIB_TAG_LRM, cib_scope_local); ++ controld_delete_node_state(lrm_state->node_name, controld_section_lrm, ++ cib_scope_local); + + /* Finally, _delete_ the value in pacemaker-attrd -- setting it to FALSE + * would result in the scheduler sending us back here again +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index d9b1e1e..9897cf3 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -229,9 +229,8 @@ send_stonith_update(crm_action_t *action, const char *target, const char *uuid) + /* Make sure it sticks */ + /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */ + +- erase_status_tag(peer->uname, XML_CIB_TAG_LRM, cib_scope_local); +- erase_status_tag(peer->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); +- ++ controld_delete_node_state(peer->uname, controld_section_all, ++ cib_scope_local); + free_xml(node_state); + return; + } +diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c +index 54324b2..ac6b430 100644 +--- a/daemons/controld/controld_join_dc.c ++++ b/daemons/controld/controld_join_dc.c +@@ -587,7 +587,8 @@ do_dc_join_ack(long long action, + /* Update CIB with node's current executor state. A new transition will be + * triggered later, when the CIB notifies us of the change. + */ +- erase_status_tag(join_from, XML_CIB_TAG_LRM, cib_scope_local); ++ controld_delete_node_state(join_from, controld_section_lrm, ++ cib_scope_local); + if (safe_str_eq(join_from, fsa_our_uname)) { + xmlNode *now_dc_lrmd_state = do_lrm_query(TRUE, fsa_our_uname); + +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 4fbae45..2d3dfa7 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -181,13 +181,13 @@ remote_node_up(const char *node_name) + CRM_CHECK(node_name != NULL, return); + crm_info("Announcing pacemaker_remote node %s", node_name); + +- /* Clear node's operation history. The node's transient attributes should +- * and normally will be cleared when the node leaves, but since remote node +- * state has a number of corner cases, clear them here as well, to be sure. ++ /* Clear node's entire state (resource history and transient attributes). ++ * The transient attributes should and normally will be cleared when the ++ * node leaves, but since remote node state has a number of corner cases, ++ * clear them here as well, to be sure. 
+ */ + call_opt = crmd_cib_smart_opt(); +- erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt); +- erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt); ++ controld_delete_node_state(node_name, controld_section_all, call_opt); + + /* Clear node's probed attribute */ + update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE); +@@ -252,15 +252,15 @@ remote_node_down(const char *node_name, const enum down_opts opts) + /* Purge node from attrd's memory */ + update_attrd_remote_node_removed(node_name, NULL); + +- /* Purge node's transient attributes */ +- erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt); +- +- /* Normally, the LRM operation history should be kept until the node comes +- * back up. However, after a successful fence, we want to clear it, so we +- * don't think resources are still running on the node. ++ /* Normally, only node attributes should be erased, and the resource history ++ * should be kept until the node comes back up. However, after a successful ++ * fence, we want to clear the history as well, so we don't think resources ++ * are still running on the node. + */ + if (opts == DOWN_ERASE_LRM) { +- erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt); ++ controld_delete_node_state(node_name, controld_section_all, call_opt); ++ } else { ++ controld_delete_node_state(node_name, controld_section_attrs, call_opt); + } + + /* Ensure node is in the remote peer cache with lost state */ +diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h +index cf04f13..f902361 100644 +--- a/daemons/controld/controld_utils.h ++++ b/daemons/controld/controld_utils.h +@@ -70,7 +70,6 @@ xmlNode *create_node_state_update(crm_node_t *node, int flags, + xmlNode *parent, const char *source); + void populate_cib_nodes(enum node_update_flags flags, const char *source); + void crm_update_quorum(gboolean quorum, gboolean force_update); +-void erase_status_tag(const char *uname, const char *tag, int options); + void controld_close_attrd_ipc(void); + void update_attrd(const char *host, const char *name, const char *value, const char *user_name, gboolean is_remote_node); + void update_attrd_remote_node_removed(const char *host, const char *user_name); +@@ -87,6 +86,16 @@ unsigned int cib_op_timeout(void); + bool feature_set_compatible(const char *dc_version, const char *join_version); + bool controld_action_is_recordable(const char *action); + ++// Subsections of node_state ++enum controld_section_e { ++ controld_section_lrm, ++ controld_section_attrs, ++ controld_section_all, ++}; ++ ++void controld_delete_node_state(const char *uname, ++ enum controld_section_e section, int options); ++ + const char *get_node_id(xmlNode *lrm_rsc_op); + + /* Convenience macro for registering a CIB callback +-- +1.8.3.1 + diff --git a/SOURCES/003-fencer-logs.patch b/SOURCES/003-fencer-logs.patch deleted file mode 100644 index 072b4cd..0000000 --- a/SOURCES/003-fencer-logs.patch +++ /dev/null @@ -1,652 +0,0 @@ -From 0a884f325e1049febc28bf0419ab307dd0bce5af Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 16 May 2019 20:04:57 -0500 -Subject: [PATCH] Log: various: improve fencer connection messages - -Previously, log messages around fencer connections were inconsistent. - -This attempts to make them more consistent by: having stonith_api_signon() log -only at debug level, letting the callers log at a level appropriate to the -situation using the return code; functionizing retrying a connection; and -using similar wording across clients. 
- -This also does a bit of refactoring for better error checking and improved -efficiency. ---- - daemons/controld/controld_control.c | 7 +- - daemons/controld/controld_te_utils.c | 59 ++++++----- - daemons/execd/pacemaker-execd.c | 28 ++--- - daemons/fenced/cts-fence-helper.c | 38 +++---- - include/crm/stonith-ng.h | 4 + - lib/fencing/st_client.c | 195 ++++++++++++++++++++--------------- - tools/crm_mon.c | 1 - - tools/stonith_admin.c | 29 +----- - 8 files changed, 181 insertions(+), 180 deletions(-) - -diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c -index 89b5b5d..6d9f335 100644 ---- a/daemons/controld/controld_control.c -+++ b/daemons/controld/controld_control.c -@@ -628,10 +628,11 @@ do_started(long long action, - register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); - } - -+ // Try connecting to fencer (retrying later in mainloop if failed) - if (stonith_reconnect == NULL) { -- int dummy; -- -- stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, &dummy); -+ stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, -+ te_connect_stonith, -+ GINT_TO_POINTER(TRUE)); - } - set_bit(fsa_input_register, R_ST_REQUIRED); - mainloop_set_trigger(stonith_reconnect); -diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c -index 5606ed6..22f83ad 100644 ---- a/daemons/controld/controld_te_utils.c -+++ b/daemons/controld/controld_te_utils.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2018 Andrew Beekhof -+ * Copyright 2004-2019 the Pacemaker project contributors - * - * This source code is licensed under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. -@@ -385,10 +385,18 @@ te_trigger_stonith_history_sync(void) - mainloop_timer_start(stonith_history_sync_timer); - } - -+/*! -+ * \brief Connect to fencer -+ * -+ * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop -+ * -+ * \return TRUE -+ * \note If user_data is NULL, this will wait 2s between attempts, for up to -+ * 30 attempts, meaning the controller could be blocked as long as 58s. 
-+ */ - gboolean - te_connect_stonith(gpointer user_data) - { -- int lpc = 0; - int rc = pcmk_ok; - - if (stonith_api == NULL) { -@@ -396,42 +404,41 @@ te_connect_stonith(gpointer user_data) - } - - if (stonith_api->state != stonith_disconnected) { -- crm_trace("Still connected"); -+ crm_trace("Already connected to fencer, no need to retry"); - return TRUE; - } - -- for (lpc = 0; lpc < 30; lpc++) { -- crm_debug("Attempting connection to fencing daemon..."); -- -- sleep(1); -- rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); -- -- if (rc == pcmk_ok) { -- break; -+ if (user_data == NULL) { -+ // Blocking (retry failures now until successful) -+ rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30); -+ if (rc != pcmk_ok) { -+ crm_err("Could not connect to fencer in 30 attempts: %s " -+ CRM_XS " rc=%d", pcmk_strerror(rc), rc); - } -- -- if (user_data != NULL) { -+ } else { -+ // Non-blocking (retry failures later in main loop) -+ rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); -+ if (rc != pcmk_ok) { - if (is_set(fsa_input_register, R_ST_REQUIRED)) { -- crm_err("Sign-in failed: triggered a retry"); -+ crm_err("Fencer connection failed (will retry): %s " -+ CRM_XS " rc=%d", pcmk_strerror(rc), rc); - mainloop_set_trigger(stonith_reconnect); - } else { -- crm_info("Sign-in failed, but no longer required"); -+ crm_info("Fencer connection failed (ignoring because no longer required): %s " -+ CRM_XS " rc=%d", pcmk_strerror(rc), rc); - } - return TRUE; - } -- -- crm_err("Sign-in failed: pausing and trying again in 2s..."); -- sleep(1); - } - -- CRM_CHECK(rc == pcmk_ok, return TRUE); /* If not, we failed 30 times... just get out */ -- stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT, -- tengine_stonith_connection_destroy); -- -- stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE, -- tengine_stonith_notify); -- -- crm_trace("Connected"); -+ if (rc == pcmk_ok) { -+ stonith_api->cmds->register_notification(stonith_api, -+ T_STONITH_NOTIFY_DISCONNECT, -+ tengine_stonith_connection_destroy); -+ stonith_api->cmds->register_notification(stonith_api, -+ T_STONITH_NOTIFY_FENCE, -+ tengine_stonith_notify); -+ } - return TRUE; - } - -diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c -index 21bb0ed..e2fdfca 100644 ---- a/daemons/execd/pacemaker-execd.c -+++ b/daemons/execd/pacemaker-execd.c -@@ -65,28 +65,20 @@ get_stonith_connection(void) - stonith_api = NULL; - } - -- if (!stonith_api) { -- int rc = 0; -- int tries = 10; -+ if (stonith_api == NULL) { -+ int rc = pcmk_ok; - - stonith_api = stonith_api_new(); -- do { -- rc = stonith_api->cmds->connect(stonith_api, "pacemaker-execd", NULL); -- if (rc == pcmk_ok) { -- stonith_api->cmds->register_notification(stonith_api, -- T_STONITH_NOTIFY_DISCONNECT, -- stonith_connection_destroy_cb); -- break; -- } -- sleep(1); -- tries--; -- } while (tries); -- -- if (rc) { -- crm_err("Unable to connect to stonith daemon to execute command. 
error: %s", -- pcmk_strerror(rc)); -+ rc = stonith_api_connect_retry(stonith_api, crm_system_name, 10); -+ if (rc != pcmk_ok) { -+ crm_err("Could not connect to fencer in 10 attempts: %s " -+ CRM_XS " rc=%d", pcmk_strerror(rc), rc); - stonith_api_delete(stonith_api); - stonith_api = NULL; -+ } else { -+ stonith_api->cmds->register_notification(stonith_api, -+ T_STONITH_NOTIFY_DISCONNECT, -+ stonith_connection_destroy_cb); - } - } - return stonith_api; -diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c -index c5ce1ab..4552fc1 100644 ---- a/daemons/fenced/cts-fence-helper.c -+++ b/daemons/fenced/cts-fence-helper.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2009-2018 Andrew Beekhof -+ * Copyright 2009-2019 the Pacemaker project contributors - * - * This source code is licensed under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. -@@ -124,8 +124,10 @@ passive_test(void) - int rc = 0; - - rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); -- crm_debug("Connect: %d", rc); -- -+ if (rc != pcmk_ok) { -+ stonith_api_delete(st); -+ crm_exit(CRM_EX_DISCONNECT); -+ } - st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, st_callback); - st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, st_callback); - st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback); -@@ -271,8 +273,10 @@ sanity_tests(void) - int rc = 0; - - rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); -- crm_debug("Connect: %d", rc); -- -+ if (rc != pcmk_ok) { -+ stonith_api_delete(st); -+ crm_exit(CRM_EX_DISCONNECT); -+ } - st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, st_callback); - st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, st_callback); - st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback); -@@ -295,7 +299,10 @@ standard_dev_test(void) - stonith_key_value_t *params = NULL; - - rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); -- crm_debug("Connect: %d", rc); -+ if (rc != pcmk_ok) { -+ stonith_api_delete(st); -+ crm_exit(CRM_EX_DISCONNECT); -+ } - - params = stonith_key_value_add(params, "pcmk_host_map", "some-host=pcmk-7 true_1_node1=3,4"); - -@@ -502,23 +509,12 @@ test_register_async_devices(int check_event) - static void - try_mainloop_connect(int check_event) - { -- int tries = 10; -- int i = 0; -- int rc = 0; -+ int rc = stonith_api_connect_retry(st, crm_system_name, 10); - -- for (i = 0; i < tries; i++) { -- rc = st->cmds->connect(st, crm_system_name, NULL); -- -- if (!rc) { -- crm_info("stonith client connection established"); -- mainloop_test_done(TRUE); -- return; -- } else { -- crm_info("stonith client connection failed"); -- } -- sleep(1); -+ if (rc == pcmk_ok) { -+ mainloop_test_done(TRUE); -+ return; - } -- - crm_err("API CONNECTION FAILURE"); - mainloop_test_done(FALSE); - } -diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h -index b7365a9..b640732 100644 ---- a/include/crm/stonith-ng.h -+++ b/include/crm/stonith-ng.h -@@ -430,6 +430,10 @@ void stonith_key_value_freeall(stonith_key_value_t * kvp, int keys, int values); - - void stonith_history_free(stonith_history_t *history); - -+// Convenience functions -+int stonith_api_connect_retry(stonith_t *st, const char *name, -+ int max_attempts); -+ - /* Basic helpers that allows nodes to be fenced and the history to be - * queried without mainloop or the caller understanding the full API - * -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 270ef8d..ceee944 100644 
---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2018 Andrew Beekhof -+ * Copyright 2004-2019 the Pacemaker project contributors - * - * This source code is licensed under the GNU Lesser General Public License - * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. -@@ -1415,14 +1415,21 @@ static int - stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) - { - int rc = pcmk_ok; -- stonith_private_t *native = stonith->st_private; -+ stonith_private_t *native = NULL; -+ const char *display_name = name? name : "client"; - - static struct ipc_client_callbacks st_callbacks = { - .dispatch = stonith_dispatch_internal, - .destroy = stonith_connection_destroy - }; - -- crm_trace("Connecting command channel"); -+ CRM_CHECK(stonith != NULL, return -EINVAL); -+ -+ native = stonith->st_private; -+ CRM_ASSERT(native != NULL); -+ -+ crm_debug("Attempting fencer connection by %s with%s mainloop", -+ display_name, (stonith_fd? "out" : "")); - - stonith->state = stonith_connected_command; - if (stonith_fd) { -@@ -1432,8 +1439,9 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) - if (native->ipc && crm_ipc_connect(native->ipc)) { - *stonith_fd = crm_ipc_get_fd(native->ipc); - } else if (native->ipc) { -- crm_perror(LOG_ERR, "Connection to fencer failed"); -- rc = -ENOTCONN; -+ crm_ipc_close(native->ipc); -+ crm_ipc_destroy(native->ipc); -+ native->ipc = NULL; - } - - } else { -@@ -1444,11 +1452,8 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) - } - - if (native->ipc == NULL) { -- crm_debug("Could not connect to the Stonith API"); - rc = -ENOTCONN; -- } -- -- if (rc == pcmk_ok) { -+ } else { - xmlNode *reply = NULL; - xmlNode *hello = create_xml_node(NULL, "stonith_command"); - -@@ -1458,11 +1463,12 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) - rc = crm_ipc_send(native->ipc, hello, crm_ipc_client_response, -1, &reply); - - if (rc < 0) { -- crm_perror(LOG_DEBUG, "Couldn't complete registration with the fencing API: %d", rc); -+ crm_debug("Couldn't register with the fencer: %s " -+ CRM_XS " rc=%d", pcmk_strerror(rc), rc); - rc = -ECOMM; - - } else if (reply == NULL) { -- crm_err("Did not receive registration reply"); -+ crm_debug("Couldn't register with the fencer: no reply"); - rc = -EPROTO; - - } else { -@@ -1470,18 +1476,23 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) - const char *tmp_ticket = crm_element_value(reply, F_STONITH_CLIENTID); - - if (safe_str_neq(msg_type, CRM_OP_REGISTER)) { -- crm_err("Invalid registration message: %s", msg_type); -- crm_log_xml_err(reply, "Bad reply"); -+ crm_debug("Couldn't register with the fencer: invalid reply type '%s'", -+ (msg_type? 
msg_type : "(missing)")); -+ crm_log_xml_debug(reply, "Invalid fencer reply"); - rc = -EPROTO; - - } else if (tmp_ticket == NULL) { -- crm_err("No registration token provided"); -- crm_log_xml_err(reply, "Bad reply"); -+ crm_debug("Couldn't register with the fencer: no token in reply"); -+ crm_log_xml_debug(reply, "Invalid fencer reply"); - rc = -EPROTO; - - } else { -- crm_trace("Obtained registration token: %s", tmp_ticket); - native->token = strdup(tmp_ticket); -+#if HAVE_MSGFROMIPC_TIMEOUT -+ stonith->call_timeout = MAX_IPC_DELAY; -+#endif -+ crm_debug("Connection to fencer by %s succeeded (registration token: %s)", -+ display_name, native->token); - rc = pcmk_ok; - } - } -@@ -1490,16 +1501,11 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) - free_xml(hello); - } - -- if (rc == pcmk_ok) { --#if HAVE_MSGFROMIPC_TIMEOUT -- stonith->call_timeout = MAX_IPC_DELAY; --#endif -- crm_debug("Connection to fencer successful"); -- return pcmk_ok; -+ if (rc != pcmk_ok) { -+ crm_debug("Connection attempt to fencer by %s failed: %s " -+ CRM_XS " rc=%d", display_name, pcmk_strerror(rc), rc); -+ stonith->cmds->disconnect(stonith); - } -- -- crm_debug("Connection to fencer failed: %s", pcmk_strerror(rc)); -- stonith->cmds->disconnect(stonith); - return rc; - } - -@@ -2071,6 +2077,36 @@ stonith_api_new(void) - return new_stonith; - } - -+/*! -+ * \brief Make a blocking connection attempt to the fencer -+ * -+ * \param[in,out] st Fencer API object -+ * \param[in] name Client name to use with fencer -+ * \param[in] max_attempts Return error if this many attempts fail -+ * -+ * \return pcmk_ok on success, result of last attempt otherwise -+ */ -+int -+stonith_api_connect_retry(stonith_t *st, const char *name, int max_attempts) -+{ -+ int rc = -EINVAL; // if max_attempts is not positive -+ -+ for (int attempt = 1; attempt <= max_attempts; attempt++) { -+ rc = st->cmds->connect(st, name, NULL); -+ if (rc == pcmk_ok) { -+ return pcmk_ok; -+ } else if (attempt < max_attempts) { -+ crm_notice("Fencer connection attempt %d of %d failed (retrying in 2s): %s " -+ CRM_XS " rc=%d", -+ attempt, max_attempts, pcmk_strerror(rc), rc); -+ sleep(2); -+ } -+ } -+ crm_notice("Could not connect to fencer: %s " CRM_XS " rc=%d", -+ pcmk_strerror(rc), rc); -+ return rc; -+} -+ - stonith_key_value_t * - stonith_key_value_add(stonith_key_value_t * head, const char *key, const char *value) - { -@@ -2122,85 +2158,78 @@ stonith_key_value_freeall(stonith_key_value_t * head, int keys, int values) - int - stonith_api_kick(uint32_t nodeid, const char *uname, int timeout, bool off) - { -- char *name = NULL; -- const char *action = "reboot"; -- -- int rc = -EPROTO; -- stonith_t *st = NULL; -- enum stonith_call_options opts = st_opt_sync_call | st_opt_allow_suicide; -+ int rc = pcmk_ok; -+ stonith_t *st = stonith_api_new(); -+ const char *action = off? 
"off" : "reboot"; - - api_log_open(); -- st = stonith_api_new(); -- if (st) { -- rc = st->cmds->connect(st, "stonith-api", NULL); -- if(rc != pcmk_ok) { -- api_log(LOG_ERR, "Connection failed, could not kick (%s) node %u/%s : %s (%d)", action, nodeid, uname, pcmk_strerror(rc), rc); -- } -+ if (st == NULL) { -+ api_log(LOG_ERR, "API initialization failed, could not kick (%s) node %u/%s", -+ action, nodeid, uname); -+ return -EPROTO; - } - -- if (uname != NULL) { -- name = strdup(uname); -- -- } else if (nodeid > 0) { -- opts |= st_opt_cs_nodeid; -- name = crm_itoa(nodeid); -- } -- -- if (off) { -- action = "off"; -- } -- -- if (rc == pcmk_ok) { -+ rc = st->cmds->connect(st, "stonith-api", NULL); -+ if (rc != pcmk_ok) { -+ api_log(LOG_ERR, "Connection failed, could not kick (%s) node %u/%s : %s (%d)", -+ action, nodeid, uname, pcmk_strerror(rc), rc); -+ } else { -+ char *name = NULL; -+ enum stonith_call_options opts = st_opt_sync_call | st_opt_allow_suicide; -+ -+ if (uname != NULL) { -+ name = strdup(uname); -+ } else if (nodeid > 0) { -+ opts |= st_opt_cs_nodeid; -+ name = crm_itoa(nodeid); -+ } - rc = st->cmds->fence(st, opts, name, action, timeout, 0); -- if(rc != pcmk_ok) { -- api_log(LOG_ERR, "Could not kick (%s) node %u/%s : %s (%d)", action, nodeid, uname, pcmk_strerror(rc), rc); -+ free(name); -+ -+ if (rc != pcmk_ok) { -+ api_log(LOG_ERR, "Could not kick (%s) node %u/%s : %s (%d)", -+ action, nodeid, uname, pcmk_strerror(rc), rc); - } else { -- api_log(LOG_NOTICE, "Node %u/%s kicked: %s ", nodeid, uname, action); -+ api_log(LOG_NOTICE, "Node %u/%s kicked: %s", nodeid, uname, action); - } - } - -- if (st) { -- st->cmds->disconnect(st); -- stonith_api_delete(st); -- } -- -- free(name); -+ stonith_api_delete(st); - return rc; - } - - time_t - stonith_api_time(uint32_t nodeid, const char *uname, bool in_progress) - { -- int rc = 0; -- char *name = NULL; -- -+ int rc = pcmk_ok; - time_t when = 0; -- stonith_t *st = NULL; -+ stonith_t *st = stonith_api_new(); - stonith_history_t *history = NULL, *hp = NULL; -- enum stonith_call_options opts = st_opt_sync_call; -- -- st = stonith_api_new(); -- if (st) { -- rc = st->cmds->connect(st, "stonith-api", NULL); -- if(rc != pcmk_ok) { -- api_log(LOG_NOTICE, "Connection failed: %s (%d)", pcmk_strerror(rc), rc); -- } -- } -- -- if (uname != NULL) { -- name = strdup(uname); - -- } else if (nodeid > 0) { -- opts |= st_opt_cs_nodeid; -- name = crm_itoa(nodeid); -+ if (st == NULL) { -+ api_log(LOG_ERR, "Could not retrieve fence history for %u/%s: " -+ "API initialization failed", nodeid, uname); -+ return when; - } - -- if (st && rc == pcmk_ok) { -+ rc = st->cmds->connect(st, "stonith-api", NULL); -+ if (rc != pcmk_ok) { -+ api_log(LOG_NOTICE, "Connection failed: %s (%d)", pcmk_strerror(rc), rc); -+ } else { - int entries = 0; - int progress = 0; - int completed = 0; -- -+ char *name = NULL; -+ enum stonith_call_options opts = st_opt_sync_call; -+ -+ if (uname != NULL) { -+ name = strdup(uname); -+ } else if (nodeid > 0) { -+ opts |= st_opt_cs_nodeid; -+ name = crm_itoa(nodeid); -+ } - rc = st->cmds->history(st, opts, name, &history, 120); -+ free(name); - - for (hp = history; hp; hp = hp->next) { - entries++; -@@ -2227,15 +2256,11 @@ stonith_api_time(uint32_t nodeid, const char *uname, bool in_progress) - } - } - -- if (st) { -- st->cmds->disconnect(st); -- stonith_api_delete(st); -- } -+ stonith_api_delete(st); - - if(when) { - api_log(LOG_INFO, "Node %u/%s last kicked at: %ld", nodeid, uname, (long int)when); - } -- free(name); - return when; - } - 
-diff --git a/tools/crm_mon.c b/tools/crm_mon.c -index e101b62..bed0796 100644 ---- a/tools/crm_mon.c -+++ b/tools/crm_mon.c -@@ -298,7 +298,6 @@ cib_connect(gboolean full) - } - - if ((fence_connect) && (st->state == stonith_disconnected)) { -- crm_trace("Connecting to stonith"); - rc = st->cmds->connect(st, crm_system_name, NULL); - if (rc == pcmk_ok) { - crm_trace("Setting up stonith callbacks"); -diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c -index d960fb1..6be66c6 100644 ---- a/tools/stonith_admin.c -+++ b/tools/stonith_admin.c -@@ -198,31 +198,6 @@ struct { - int rc; - } async_fence_data; - --static int --try_mainloop_connect(void) --{ -- stonith_t *st = async_fence_data.st; -- int tries = 10; -- int i = 0; -- int rc = 0; -- -- for (i = 0; i < tries; i++) { -- crm_debug("Connecting as %s", async_fence_data.name); -- rc = st->cmds->connect(st, async_fence_data.name, NULL); -- -- if (!rc) { -- crm_debug("stonith client connection established"); -- return 0; -- } else { -- crm_debug("stonith client connection failed"); -- } -- sleep(1); -- } -- -- crm_err("Could not connect to the fencer"); -- return -1; --} -- - static void - notify_callback(stonith_t * st, stonith_event_t * e) - { -@@ -251,8 +226,10 @@ async_fence_helper(gpointer user_data) - { - stonith_t *st = async_fence_data.st; - int call_id = 0; -+ int rc = stonith_api_connect_retry(st, async_fence_data.name, 10); - -- if (try_mainloop_connect()) { -+ if (rc != pcmk_ok) { -+ fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc)); - g_main_loop_quit(mainloop); - return TRUE; - } --- -1.8.3.1 - diff --git a/SOURCES/003-return-codes.patch b/SOURCES/003-return-codes.patch new file mode 100644 index 0000000..e4448af --- /dev/null +++ b/SOURCES/003-return-codes.patch @@ -0,0 +1,908 @@ +From 55ebd895ba2c64713c3db2590ffe22c15b8563e3 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 13 Dec 2019 16:05:05 -0600 +Subject: [PATCH] Refactor: libcrmcommon: introduce new set of return codes + +Since we plan to introduce a high-level public API, it's a good time to +introduce some best practices. + +Most Pacemaker API functions currently return an integer return code, such that +its absolute value is either a system error number or a custom pcmk_err_* +number. This is less than ideal because system error numbers are constrained +only to the positive int range, so there's the possibility (though not noticed +in the wild) that system errors and custom errors could collide. + +The new method being introduced here still uses an integer return code, +but negative values are from a new enumeration, and positive values are +system error numbers. 0 still represents success. + +It is expected that the new method will be used with new functions, and +existing internal functions will be gradually refactored to use it as well. +Existing public API functions can be addressed at the next backward +compatibility break (2.1.0). +--- + include/crm/common/results.h | 59 ++++- + lib/common/results.c | 536 ++++++++++++++++++++++++++++++------------- + tools/crm_error.c | 100 +++++--- + 3 files changed, 493 insertions(+), 202 deletions(-) + +diff --git a/include/crm/common/results.h b/include/crm/common/results.h +index 7a32110..b29a016 100644 +--- a/include/crm/common/results.h ++++ b/include/crm/common/results.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2012-2019 the Pacemaker project contributors ++ * Copyright 2012-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ *
+@@ -49,11 +49,21 @@ extern "C" {
+ /*
+ * Function return codes
+ *
++ * Most Pacemaker API functions return an integer return code. There are two
++ * alternative interpretations. The legacy interpretation is that the absolute
++ * value of the return code is either a system error number or a custom
++ * pcmk_err_* number. This is less than ideal because system error numbers are
++ * constrained only to the positive int range, so there's the possibility
++ * (though not noticed in the wild) that system errors and custom errors could
++ * collide. The new interpretation is that negative values are from the pcmk_rc_e
++ * enum, and positive values are system error numbers. Both use 0 for success.
++ *
+ * For system error codes, see:
+ * - /usr/include/asm-generic/errno.h
+ * - /usr/include/asm-generic/errno-base.h
+ */
+ 
++// Legacy custom return codes for Pacemaker API functions (deprecated)
+ # define pcmk_ok 0
+ # define PCMK_ERROR_OFFSET 190 /* Replacements on non-linux systems, see include/portability.h */
+ # define PCMK_CUSTOM_OFFSET 200 /* Purely custom codes */
+@@ -75,6 +85,48 @@ extern "C" {
+ # define pcmk_err_bad_nvpair 216
+ # define pcmk_err_unknown_format 217
+ 
++/*!
++ * \enum pcmk_rc_e
++ * \brief Return codes for Pacemaker API functions
++ *
++ * Any Pacemaker API function documented as returning a "standard Pacemaker
++ * return code" will return pcmk_rc_ok (0) on success, and one of this
++ * enumeration's other (negative) values or a (positive) system error number
++ * otherwise. The custom codes are at -1001 and lower, so that the caller may
++ * use -1 through -1000 for their own custom values if desired. While generally
++ * referred to as "errors", nonzero values simply indicate a result, which might
++ * or might not be an error depending on the calling context.
++ */
++enum pcmk_rc_e {
++ /* When adding new values, use consecutively lower numbers, update the array
++ * in lib/common/results.c, and test with crm_error.
++ */
++ pcmk_rc_no_quorum = -1017,
++ pcmk_rc_schema_validation = -1016,
++ pcmk_rc_schema_unchanged = -1015,
++ pcmk_rc_transform_failed = -1014,
++ pcmk_rc_old_data = -1013,
++ pcmk_rc_diff_failed = -1012,
++ pcmk_rc_diff_resync = -1011,
++ pcmk_rc_cib_modified = -1010,
++ pcmk_rc_cib_backup = -1009,
++ pcmk_rc_cib_save = -1008,
++ pcmk_rc_cib_corrupt = -1007,
++ pcmk_rc_multiple = -1006,
++ pcmk_rc_node_unknown = -1005,
++ pcmk_rc_already = -1004,
++ pcmk_rc_bad_nvpair = -1003,
++ pcmk_rc_unknown_format = -1002,
++ // Developers: Use a more specific code than pcmk_rc_error whenever possible
++ pcmk_rc_error = -1001,
++
++ // Values -1 through -1000 reserved for caller use
++
++ pcmk_rc_ok = 0
++
++ // Positive values reserved for system error numbers
++};
++
+ /*
+ * Exit status codes
+ *
+@@ -150,6 +202,11 @@ typedef enum crm_exit_e {
+ CRM_EX_MAX = 255, // ensure crm_exit_t can hold this
+ } crm_exit_t;
+ 
++const char *pcmk_rc_name(int rc);
++const char *pcmk_rc_str(int rc);
++crm_exit_t pcmk_rc2exitc(int rc);
++int pcmk_rc2legacy(int rc);
++int pcmk_legacy2rc(int legacy_rc);
+ const char *pcmk_strerror(int rc);
+ const char *pcmk_errorname(int rc);
+ const char *bz2_strerror(int rc);
+diff --git a/lib/common/results.c b/lib/common/results.c
+index b80191c..189648f 100644
+--- a/lib/common/results.c
++++ b/lib/common/results.c
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright 2004-2019 the Pacemaker project contributors
++ * Copyright 2004-2020 the Pacemaker project contributors
+ *
+ * The version control history for this file may have further details.
+ * +@@ -22,148 +22,14 @@ + #include + #include + ++// @COMPAT Legacy function return codes ++ ++//! \deprecated Use standard return codes and pcmk_rc_name() instead + const char * + pcmk_errorname(int rc) + { +- int error = abs(rc); +- +- switch (error) { +- case E2BIG: return "E2BIG"; +- case EACCES: return "EACCES"; +- case EADDRINUSE: return "EADDRINUSE"; +- case EADDRNOTAVAIL: return "EADDRNOTAVAIL"; +- case EAFNOSUPPORT: return "EAFNOSUPPORT"; +- case EAGAIN: return "EAGAIN"; +- case EALREADY: return "EALREADY"; +- case EBADF: return "EBADF"; +- case EBADMSG: return "EBADMSG"; +- case EBUSY: return "EBUSY"; +- case ECANCELED: return "ECANCELED"; +- case ECHILD: return "ECHILD"; +- case ECOMM: return "ECOMM"; +- case ECONNABORTED: return "ECONNABORTED"; +- case ECONNREFUSED: return "ECONNREFUSED"; +- case ECONNRESET: return "ECONNRESET"; +- /* case EDEADLK: return "EDEADLK"; */ +- case EDESTADDRREQ: return "EDESTADDRREQ"; +- case EDOM: return "EDOM"; +- case EDQUOT: return "EDQUOT"; +- case EEXIST: return "EEXIST"; +- case EFAULT: return "EFAULT"; +- case EFBIG: return "EFBIG"; +- case EHOSTDOWN: return "EHOSTDOWN"; +- case EHOSTUNREACH: return "EHOSTUNREACH"; +- case EIDRM: return "EIDRM"; +- case EILSEQ: return "EILSEQ"; +- case EINPROGRESS: return "EINPROGRESS"; +- case EINTR: return "EINTR"; +- case EINVAL: return "EINVAL"; +- case EIO: return "EIO"; +- case EISCONN: return "EISCONN"; +- case EISDIR: return "EISDIR"; +- case ELIBACC: return "ELIBACC"; +- case ELOOP: return "ELOOP"; +- case EMFILE: return "EMFILE"; +- case EMLINK: return "EMLINK"; +- case EMSGSIZE: return "EMSGSIZE"; +-#ifdef EMULTIHOP // Not available on OpenBSD +- case EMULTIHOP: return "EMULTIHOP"; +-#endif +- case ENAMETOOLONG: return "ENAMETOOLONG"; +- case ENETDOWN: return "ENETDOWN"; +- case ENETRESET: return "ENETRESET"; +- case ENETUNREACH: return "ENETUNREACH"; +- case ENFILE: return "ENFILE"; +- case ENOBUFS: return "ENOBUFS"; +- case ENODATA: return "ENODATA"; +- case ENODEV: return "ENODEV"; +- case ENOENT: return "ENOENT"; +- case ENOEXEC: return "ENOEXEC"; +- case ENOKEY: return "ENOKEY"; +- case ENOLCK: return "ENOLCK"; +-#ifdef ENOLINK // Not available on OpenBSD +- case ENOLINK: return "ENOLINK"; +-#endif +- case ENOMEM: return "ENOMEM"; +- case ENOMSG: return "ENOMSG"; +- case ENOPROTOOPT: return "ENOPROTOOPT"; +- case ENOSPC: return "ENOSPC"; +- case ENOSR: return "ENOSR"; +- case ENOSTR: return "ENOSTR"; +- case ENOSYS: return "ENOSYS"; +- case ENOTBLK: return "ENOTBLK"; +- case ENOTCONN: return "ENOTCONN"; +- case ENOTDIR: return "ENOTDIR"; +- case ENOTEMPTY: return "ENOTEMPTY"; +- case ENOTSOCK: return "ENOTSOCK"; +- /* case ENOTSUP: return "ENOTSUP"; */ +- case ENOTTY: return "ENOTTY"; +- case ENOTUNIQ: return "ENOTUNIQ"; +- case ENXIO: return "ENXIO"; +- case EOPNOTSUPP: return "EOPNOTSUPP"; +- case EOVERFLOW: return "EOVERFLOW"; +- case EPERM: return "EPERM"; +- case EPFNOSUPPORT: return "EPFNOSUPPORT"; +- case EPIPE: return "EPIPE"; +- case EPROTO: return "EPROTO"; +- case EPROTONOSUPPORT: return "EPROTONOSUPPORT"; +- case EPROTOTYPE: return "EPROTOTYPE"; +- case ERANGE: return "ERANGE"; +- case EREMOTE: return "EREMOTE"; +- case EREMOTEIO: return "EREMOTEIO"; +- +- case EROFS: return "EROFS"; +- case ESHUTDOWN: return "ESHUTDOWN"; +- case ESPIPE: return "ESPIPE"; +- case ESOCKTNOSUPPORT: return "ESOCKTNOSUPPORT"; +- case ESRCH: return "ESRCH"; +- case ESTALE: return "ESTALE"; +- case ETIME: return "ETIME"; +- case ETIMEDOUT: return "ETIMEDOUT"; +- case ETXTBSY: return "ETXTBSY"; +- case 
EUNATCH: return "EUNATCH"; +- case EUSERS: return "EUSERS"; +- /* case EWOULDBLOCK: return "EWOULDBLOCK"; */ +- case EXDEV: return "EXDEV"; +- +-#ifdef EBADE +- /* Not available on OSX */ +- case EBADE: return "EBADE"; +- case EBADFD: return "EBADFD"; +- case EBADSLT: return "EBADSLT"; +- case EDEADLOCK: return "EDEADLOCK"; +- case EBADR: return "EBADR"; +- case EBADRQC: return "EBADRQC"; +- case ECHRNG: return "ECHRNG"; +-#ifdef EISNAM /* Not available on Illumos/Solaris */ +- case EISNAM: return "EISNAM"; +- case EKEYEXPIRED: return "EKEYEXPIRED"; +- case EKEYREJECTED: return "EKEYREJECTED"; +- case EKEYREVOKED: return "EKEYREVOKED"; +-#endif +- case EL2HLT: return "EL2HLT"; +- case EL2NSYNC: return "EL2NSYNC"; +- case EL3HLT: return "EL3HLT"; +- case EL3RST: return "EL3RST"; +- case ELIBBAD: return "ELIBBAD"; +- case ELIBMAX: return "ELIBMAX"; +- case ELIBSCN: return "ELIBSCN"; +- case ELIBEXEC: return "ELIBEXEC"; +-#ifdef ENOMEDIUM /* Not available on Illumos/Solaris */ +- case ENOMEDIUM: return "ENOMEDIUM"; +- case EMEDIUMTYPE: return "EMEDIUMTYPE"; +-#endif +- case ENONET: return "ENONET"; +- case ENOPKG: return "ENOPKG"; +- case EREMCHG: return "EREMCHG"; +- case ERESTART: return "ERESTART"; +- case ESTRPIPE: return "ESTRPIPE"; +-#ifdef EUCLEAN /* Not available on Illumos/Solaris */ +- case EUCLEAN: return "EUCLEAN"; +-#endif +- case EXFULL: return "EXFULL"; +-#endif +- ++ rc = abs(rc); ++ switch (rc) { + case pcmk_err_generic: return "pcmk_err_generic"; + case pcmk_err_no_quorum: return "pcmk_err_no_quorum"; + case pcmk_err_schema_validation: return "pcmk_err_schema_validation"; +@@ -180,24 +46,26 @@ pcmk_errorname(int rc) + case pcmk_err_already: return "pcmk_err_already"; + case pcmk_err_bad_nvpair: return "pcmk_err_bad_nvpair"; + case pcmk_err_unknown_format: return "pcmk_err_unknown_format"; ++ default: return pcmk_rc_name(rc); // system errno + } +- return "Unknown"; + } + ++//! \deprecated Use standard return codes and pcmk_rc_str() instead + const char * + pcmk_strerror(int rc) + { +- int error = abs(rc); +- +- if (error == 0) { ++ if (rc == 0) { + return "OK"; ++ } + +- // Of course error > 0 ... unless someone passed INT_MIN as rc +- } else if ((error > 0) && (error < PCMK_ERROR_OFFSET)) { +- return strerror(error); ++ rc = abs(rc); ++ ++ // Of course rc > 0 ... unless someone passed INT_MIN as rc ++ if ((rc > 0) && (rc < PCMK_ERROR_OFFSET)) { ++ return strerror(rc); + } + +- switch (error) { ++ switch (rc) { + case pcmk_err_generic: + return "Generic Pacemaker error"; + case pcmk_err_no_quorum: +@@ -253,11 +121,313 @@ pcmk_strerror(int rc) + case ENOKEY: + return "Required key not available"; + } +- + crm_err("Unknown error code: %d", rc); + return "Unknown error"; + } + ++// Standard Pacemaker API return codes ++ ++/* This array is used only for nonzero values of pcmk_rc_e. Its values must be ++ * kept in the exact reverse order of the enum value numbering (i.e. add new ++ * values to the end of the array). 
++ */ ++static struct pcmk__rc_info { ++ const char *name; ++ const char *desc; ++ int legacy_rc; ++} pcmk__rcs[] = { ++ { "pcmk_rc_error", ++ "Error", ++ -pcmk_err_generic, ++ }, ++ { "pcmk_rc_unknown_format", ++ "Unknown output format", ++ -pcmk_err_unknown_format, ++ }, ++ { "pcmk_rc_bad_nvpair", ++ "Bad name/value pair given", ++ -pcmk_err_bad_nvpair, ++ }, ++ { "pcmk_rc_already", ++ "Already in requested state", ++ -pcmk_err_already, ++ }, ++ { "pcmk_rc_node_unknown", ++ "Node not found", ++ -pcmk_err_node_unknown, ++ }, ++ { "pcmk_rc_multiple", ++ "Resource active on multiple nodes", ++ -pcmk_err_multiple, ++ }, ++ { "pcmk_rc_cib_corrupt", ++ "Could not parse on-disk configuration", ++ -pcmk_err_cib_corrupt, ++ }, ++ { "pcmk_rc_cib_save", ++ "Could not save new configuration to disk", ++ -pcmk_err_cib_save, ++ }, ++ { "pcmk_rc_cib_backup", ++ "Could not archive previous configuration", ++ -pcmk_err_cib_backup, ++ }, ++ { "pcmk_rc_cib_modified", ++ "On-disk configuration was manually modified", ++ -pcmk_err_cib_modified, ++ }, ++ { "pcmk_rc_diff_resync", ++ "Application of update diff failed, requesting full refresh", ++ -pcmk_err_diff_resync, ++ }, ++ { "pcmk_rc_diff_failed", ++ "Application of update diff failed", ++ -pcmk_err_diff_failed, ++ }, ++ { "pcmk_rc_old_data", ++ "Update was older than existing configuration", ++ -pcmk_err_old_data, ++ }, ++ { "pcmk_rc_transform_failed", ++ "Schema transform failed", ++ -pcmk_err_transform_failed, ++ }, ++ { "pcmk_rc_schema_unchanged", ++ "Schema is already the latest available", ++ -pcmk_err_schema_unchanged, ++ }, ++ { "pcmk_rc_schema_validation", ++ "Update does not conform to the configured schema", ++ -pcmk_err_schema_validation, ++ }, ++ { "pcmk_rc_no_quorum", ++ "Operation requires quorum", ++ -pcmk_err_no_quorum, ++ }, ++}; ++ ++#define PCMK__N_RC (sizeof(pcmk__rcs) / sizeof(struct pcmk__rc_info)) ++ ++/*! 
++ * \brief Get a return code constant name as a string ++ * ++ * \param[in] rc Integer return code to convert ++ * ++ * \return String of constant name corresponding to rc ++ */ ++const char * ++pcmk_rc_name(int rc) ++{ ++ if ((rc <= pcmk_rc_error) && ((pcmk_rc_error - rc) < PCMK__N_RC)) { ++ return pcmk__rcs[pcmk_rc_error - rc].name; ++ } ++ switch (rc) { ++ case pcmk_rc_ok: return "pcmk_rc_ok"; ++ case E2BIG: return "E2BIG"; ++ case EACCES: return "EACCES"; ++ case EADDRINUSE: return "EADDRINUSE"; ++ case EADDRNOTAVAIL: return "EADDRNOTAVAIL"; ++ case EAFNOSUPPORT: return "EAFNOSUPPORT"; ++ case EAGAIN: return "EAGAIN"; ++ case EALREADY: return "EALREADY"; ++ case EBADF: return "EBADF"; ++ case EBADMSG: return "EBADMSG"; ++ case EBUSY: return "EBUSY"; ++ case ECANCELED: return "ECANCELED"; ++ case ECHILD: return "ECHILD"; ++ case ECOMM: return "ECOMM"; ++ case ECONNABORTED: return "ECONNABORTED"; ++ case ECONNREFUSED: return "ECONNREFUSED"; ++ case ECONNRESET: return "ECONNRESET"; ++ /* case EDEADLK: return "EDEADLK"; */ ++ case EDESTADDRREQ: return "EDESTADDRREQ"; ++ case EDOM: return "EDOM"; ++ case EDQUOT: return "EDQUOT"; ++ case EEXIST: return "EEXIST"; ++ case EFAULT: return "EFAULT"; ++ case EFBIG: return "EFBIG"; ++ case EHOSTDOWN: return "EHOSTDOWN"; ++ case EHOSTUNREACH: return "EHOSTUNREACH"; ++ case EIDRM: return "EIDRM"; ++ case EILSEQ: return "EILSEQ"; ++ case EINPROGRESS: return "EINPROGRESS"; ++ case EINTR: return "EINTR"; ++ case EINVAL: return "EINVAL"; ++ case EIO: return "EIO"; ++ case EISCONN: return "EISCONN"; ++ case EISDIR: return "EISDIR"; ++ case ELIBACC: return "ELIBACC"; ++ case ELOOP: return "ELOOP"; ++ case EMFILE: return "EMFILE"; ++ case EMLINK: return "EMLINK"; ++ case EMSGSIZE: return "EMSGSIZE"; ++#ifdef EMULTIHOP // Not available on OpenBSD ++ case EMULTIHOP: return "EMULTIHOP"; ++#endif ++ case ENAMETOOLONG: return "ENAMETOOLONG"; ++ case ENETDOWN: return "ENETDOWN"; ++ case ENETRESET: return "ENETRESET"; ++ case ENETUNREACH: return "ENETUNREACH"; ++ case ENFILE: return "ENFILE"; ++ case ENOBUFS: return "ENOBUFS"; ++ case ENODATA: return "ENODATA"; ++ case ENODEV: return "ENODEV"; ++ case ENOENT: return "ENOENT"; ++ case ENOEXEC: return "ENOEXEC"; ++ case ENOKEY: return "ENOKEY"; ++ case ENOLCK: return "ENOLCK"; ++#ifdef ENOLINK // Not available on OpenBSD ++ case ENOLINK: return "ENOLINK"; ++#endif ++ case ENOMEM: return "ENOMEM"; ++ case ENOMSG: return "ENOMSG"; ++ case ENOPROTOOPT: return "ENOPROTOOPT"; ++ case ENOSPC: return "ENOSPC"; ++ case ENOSR: return "ENOSR"; ++ case ENOSTR: return "ENOSTR"; ++ case ENOSYS: return "ENOSYS"; ++ case ENOTBLK: return "ENOTBLK"; ++ case ENOTCONN: return "ENOTCONN"; ++ case ENOTDIR: return "ENOTDIR"; ++ case ENOTEMPTY: return "ENOTEMPTY"; ++ case ENOTSOCK: return "ENOTSOCK"; ++#if ENOTSUP != EOPNOTSUPP ++ case ENOTSUP: return "ENOTSUP"; ++#endif ++ case ENOTTY: return "ENOTTY"; ++ case ENOTUNIQ: return "ENOTUNIQ"; ++ case ENXIO: return "ENXIO"; ++ case EOPNOTSUPP: return "EOPNOTSUPP"; ++ case EOVERFLOW: return "EOVERFLOW"; ++ case EPERM: return "EPERM"; ++ case EPFNOSUPPORT: return "EPFNOSUPPORT"; ++ case EPIPE: return "EPIPE"; ++ case EPROTO: return "EPROTO"; ++ case EPROTONOSUPPORT: return "EPROTONOSUPPORT"; ++ case EPROTOTYPE: return "EPROTOTYPE"; ++ case ERANGE: return "ERANGE"; ++ case EREMOTE: return "EREMOTE"; ++ case EREMOTEIO: return "EREMOTEIO"; ++ case EROFS: return "EROFS"; ++ case ESHUTDOWN: return "ESHUTDOWN"; ++ case ESPIPE: return "ESPIPE"; ++ case ESOCKTNOSUPPORT: return "ESOCKTNOSUPPORT"; ++ 
case ESRCH: return "ESRCH"; ++ case ESTALE: return "ESTALE"; ++ case ETIME: return "ETIME"; ++ case ETIMEDOUT: return "ETIMEDOUT"; ++ case ETXTBSY: return "ETXTBSY"; ++ case EUNATCH: return "EUNATCH"; ++ case EUSERS: return "EUSERS"; ++ /* case EWOULDBLOCK: return "EWOULDBLOCK"; */ ++ case EXDEV: return "EXDEV"; ++ ++#ifdef EBADE // Not available on OS X ++ case EBADE: return "EBADE"; ++ case EBADFD: return "EBADFD"; ++ case EBADSLT: return "EBADSLT"; ++ case EDEADLOCK: return "EDEADLOCK"; ++ case EBADR: return "EBADR"; ++ case EBADRQC: return "EBADRQC"; ++ case ECHRNG: return "ECHRNG"; ++#ifdef EISNAM // Not available on OS X, Illumos, Solaris ++ case EISNAM: return "EISNAM"; ++ case EKEYEXPIRED: return "EKEYEXPIRED"; ++ case EKEYREJECTED: return "EKEYREJECTED"; ++ case EKEYREVOKED: return "EKEYREVOKED"; ++#endif ++ case EL2HLT: return "EL2HLT"; ++ case EL2NSYNC: return "EL2NSYNC"; ++ case EL3HLT: return "EL3HLT"; ++ case EL3RST: return "EL3RST"; ++ case ELIBBAD: return "ELIBBAD"; ++ case ELIBMAX: return "ELIBMAX"; ++ case ELIBSCN: return "ELIBSCN"; ++ case ELIBEXEC: return "ELIBEXEC"; ++#ifdef ENOMEDIUM // Not available on OS X, Illumos, Solaris ++ case ENOMEDIUM: return "ENOMEDIUM"; ++ case EMEDIUMTYPE: return "EMEDIUMTYPE"; ++#endif ++ case ENONET: return "ENONET"; ++ case ENOPKG: return "ENOPKG"; ++ case EREMCHG: return "EREMCHG"; ++ case ERESTART: return "ERESTART"; ++ case ESTRPIPE: return "ESTRPIPE"; ++#ifdef EUCLEAN // Not available on OS X, Illumos, Solaris ++ case EUCLEAN: return "EUCLEAN"; ++#endif ++ case EXFULL: return "EXFULL"; ++#endif // EBADE ++ default: return "Unknown"; ++ } ++} ++ ++/*! ++ * \brief Get a user-friendly description of a return code ++ * ++ * \param[in] rc Integer return code to convert ++ * ++ * \return String description of rc ++ */ ++const char * ++pcmk_rc_str(int rc) ++{ ++ if (rc == pcmk_rc_ok) { ++ return "OK"; ++ } ++ if ((rc <= pcmk_rc_error) && ((pcmk_rc_error - rc) < PCMK__N_RC)) { ++ return pcmk__rcs[pcmk_rc_error - rc].desc; ++ } ++ if (rc < 0) { ++ return "Unknown error"; ++ } ++ return strerror(rc); ++} ++ ++// This returns negative values for errors ++//! \deprecated Use standard return codes instead ++int ++pcmk_rc2legacy(int rc) ++{ ++ if (rc >= 0) { ++ return -rc; // OK or system errno ++ } ++ if ((rc <= pcmk_rc_error) && ((pcmk_rc_error - rc) < PCMK__N_RC)) { ++ return pcmk__rcs[pcmk_rc_error - rc].legacy_rc; ++ } ++ return -pcmk_err_generic; ++} ++ ++//! 
\deprecated Use standard return codes instead ++int ++pcmk_legacy2rc(int legacy_rc) ++{ ++ legacy_rc = abs(legacy_rc); ++ switch (legacy_rc) { ++ case pcmk_err_no_quorum: return pcmk_rc_no_quorum; ++ case pcmk_err_schema_validation: return pcmk_rc_schema_validation; ++ case pcmk_err_schema_unchanged: return pcmk_rc_schema_unchanged; ++ case pcmk_err_transform_failed: return pcmk_rc_transform_failed; ++ case pcmk_err_old_data: return pcmk_rc_old_data; ++ case pcmk_err_diff_failed: return pcmk_rc_diff_failed; ++ case pcmk_err_diff_resync: return pcmk_rc_diff_resync; ++ case pcmk_err_cib_modified: return pcmk_rc_cib_modified; ++ case pcmk_err_cib_backup: return pcmk_rc_cib_backup; ++ case pcmk_err_cib_save: return pcmk_rc_cib_save; ++ case pcmk_err_cib_corrupt: return pcmk_rc_cib_corrupt; ++ case pcmk_err_multiple: return pcmk_rc_multiple; ++ case pcmk_err_node_unknown: return pcmk_rc_node_unknown; ++ case pcmk_err_already: return pcmk_rc_already; ++ case pcmk_err_bad_nvpair: return pcmk_rc_bad_nvpair; ++ case pcmk_err_unknown_format: return pcmk_rc_unknown_format; ++ case pcmk_err_generic: return pcmk_rc_error; ++ case pcmk_ok: return pcmk_rc_ok; ++ default: return legacy_rc; // system errno ++ } ++} ++ ++// Exit status codes ++ + const char * + crm_exit_name(crm_exit_t exit_code) + { +@@ -347,26 +517,17 @@ crm_exit_str(crm_exit_t exit_code) + case CRM_EX_TIMEOUT: return "Timeout occurred"; + case CRM_EX_MAX: return "Error occurred"; + } +- if (exit_code > 128) { ++ if ((exit_code > 128) && (exit_code < CRM_EX_MAX)) { + return "Interrupted by signal"; + } + return "Unknown exit status"; + } + +-/*! +- * \brief Map an errno to a similar exit status +- * +- * \param[in] errno Error number to map +- * +- * \return Exit status corresponding to errno +- */ ++//! \deprecated Use standard return codes and pcmk_rc2exitc() instead + crm_exit_t + crm_errno2exit(int rc) + { + rc = abs(rc); // Convenience for functions that return -errno +- if (rc == EOPNOTSUPP) { +- rc = ENOTSUP; // Values are same on Linux, can't use both in case +- } + switch (rc) { + case pcmk_ok: + return CRM_EX_OK; +@@ -384,6 +545,48 @@ crm_errno2exit(int rc) + case pcmk_err_bad_nvpair: + return CRM_EX_INVALID_PARAM; + ++ case pcmk_err_already: ++ return CRM_EX_EXISTS; ++ ++ case pcmk_err_multiple: ++ return CRM_EX_MULTIPLE; ++ ++ case pcmk_err_node_unknown: ++ case pcmk_err_unknown_format: ++ return CRM_EX_NOSUCH; ++ ++ default: ++ return pcmk_rc2exitc(rc); // system errno ++ } ++} ++ ++/*! 
++ * \brief Map a function return code to the most similar exit code ++ * ++ * \param[in] rc Function return code ++ * ++ * \return Most similar exit code ++ */ ++crm_exit_t ++pcmk_rc2exitc(int rc) ++{ ++ switch (rc) { ++ case pcmk_rc_ok: ++ return CRM_EX_OK; ++ ++ case pcmk_rc_no_quorum: ++ return CRM_EX_QUORUM; ++ ++ case pcmk_rc_old_data: ++ return CRM_EX_OLD; ++ ++ case pcmk_rc_schema_validation: ++ case pcmk_rc_transform_failed: ++ return CRM_EX_CONFIG; ++ ++ case pcmk_rc_bad_nvpair: ++ return CRM_EX_INVALID_PARAM; ++ + case EACCES: + return CRM_EX_INSUFFICIENT_PRIV; + +@@ -414,22 +617,25 @@ crm_errno2exit(int rc) + return CRM_EX_DISCONNECT; + + case EEXIST: +- case pcmk_err_already: ++ case pcmk_rc_already: + return CRM_EX_EXISTS; + + case EIO: + return CRM_EX_IOERR; + + case ENOTSUP: ++#if EOPNOTSUPP != ENOTSUP ++ case EOPNOTSUPP: ++#endif + return CRM_EX_UNIMPLEMENT_FEATURE; + + case ENOTUNIQ: +- case pcmk_err_multiple: ++ case pcmk_rc_multiple: + return CRM_EX_MULTIPLE; + + case ENXIO: +- case pcmk_err_node_unknown: +- case pcmk_err_unknown_format: ++ case pcmk_rc_node_unknown: ++ case pcmk_rc_unknown_format: + return CRM_EX_NOSUCH; + + case ETIME: +@@ -441,6 +647,8 @@ crm_errno2exit(int rc) + } + } + ++// Other functions ++ + const char * + bz2_strerror(int rc) + { +diff --git a/tools/crm_error.c b/tools/crm_error.c +index f6dc73c..0dcae05 100644 +--- a/tools/crm_error.c ++++ b/tools/crm_error.c +@@ -1,21 +1,10 @@ +-/* +- * Copyright 2012-2018 the Pacemaker project contributors ++/* ++ * Copyright 2012-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. +- * +- * This program is free software; you can redistribute it and/or +- * modify it under the terms of the GNU General Public +- * License as published by the Free Software Foundation; either +- * version 2 of the License, or (at your option) any later version. +- * +- * This software is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- * General Public License for more details. +- * +- * You should have received a copy of the GNU General Public +- * License along with this library; if not, write to the Free Software +- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ * ++ * This source code is licensed under the GNU General Public License version 2 ++ * or later (GPLv2+) WITHOUT ANY WARRANTY. 
+ */ + + #include +@@ -33,12 +22,31 @@ static struct crm_option long_options[] = { + "\n\t\t\tUseful for looking for sources of the error in source code"}, + + {"list", 0, 0, 'l', "\tShow all known errors."}, +- {"exit", 0, 0, 'X', "\tInterpret as exit code rather than function return value"}, ++ {"exit", 0, 0, 'X', "\tInterpret as exit code rather than legacy function return value"}, ++ {"rc", 0, 0, 'r', "\tInterpret as return code rather than legacy function return value"}, + + {0, 0, 0, 0} + }; + /* *INDENT-ON* */ + ++static bool as_exit_code = false; ++static bool as_rc = false; ++ ++static void ++get_strings(int rc, const char **name, const char **str) ++{ ++ if (as_exit_code) { ++ *str = crm_exit_str((crm_exit_t) rc); ++ *name = crm_exit_name(rc); ++ } else if (as_rc) { ++ *str = pcmk_rc_str(rc); ++ *name = pcmk_rc_name(rc); ++ } else { ++ *str = pcmk_strerror(rc); ++ *name = pcmk_errorname(rc); ++ } ++} ++ + int + main(int argc, char **argv) + { +@@ -49,10 +57,12 @@ main(int argc, char **argv) + + bool do_list = FALSE; + bool with_name = FALSE; +- bool as_exit_code = FALSE; ++ ++ const char *name = NULL; ++ const char *desc = NULL; + + crm_log_cli_init("crm_error"); +- crm_set_options(NULL, "[options] -- rc", long_options, ++ crm_set_options(NULL, "[options] -- [...]", long_options, + "Tool for displaying the textual name or description of a reported error code"); + + while (flag >= 0) { +@@ -73,6 +83,9 @@ main(int argc, char **argv) + case 'l': + do_list = TRUE; + break; ++ case 'r': ++ as_rc = true; ++ break; + case 'X': + as_exit_code = TRUE; + break; +@@ -83,30 +96,43 @@ main(int argc, char **argv) + } + + if(do_list) { +- for (rc = 0; rc < 256; rc++) { +- const char *name = as_exit_code? crm_exit_name(rc) : pcmk_errorname(rc); +- const char *desc = as_exit_code? crm_exit_str(rc) : pcmk_strerror(rc); ++ int start, end, width; ++ ++ // 256 is a hacky magic number that "should" be enough ++ if (as_rc) { ++ start = pcmk_rc_error - 256; ++ end = PCMK_CUSTOM_OFFSET; ++ width = 4; ++ } else { ++ start = 0; ++ end = 256; ++ width = 3; ++ } ++ ++ for (rc = start; rc < end; rc++) { ++ if (rc == (pcmk_rc_error + 1)) { ++ // Values in between are reserved for callers, no use iterating ++ rc = pcmk_rc_ok; ++ } ++ get_strings(rc, &name, &desc); + if (!name || !strcmp(name, "Unknown") || !strcmp(name, "CRM_EX_UNKNOWN")) { +- /* Unknown */ ++ // Undefined + } else if(with_name) { +- printf("%.3d: %-26s %s\n", rc, name, desc); ++ printf("% .*d: %-26s %s\n", width, rc, name, desc); + } else { +- printf("%.3d: %s\n", rc, desc); ++ printf("% .*d: %s\n", width, rc, desc); + } + } +- return CRM_EX_OK; +- } + +- for (lpc = optind; lpc < argc; lpc++) { +- const char *str, *name; +- +- rc = crm_atoi(argv[lpc], NULL); +- str = as_exit_code? crm_exit_str(rc) : pcmk_strerror(rc); +- if(with_name) { +- name = as_exit_code? 
crm_exit_name(rc) : pcmk_errorname(rc); +- printf("%s - %s\n", name, str); +- } else { +- printf("%s\n", str); ++ } else { ++ for (lpc = optind; lpc < argc; lpc++) { ++ rc = crm_atoi(argv[lpc], NULL); ++ get_strings(rc, &name, &desc); ++ if (with_name) { ++ printf("%s - %s\n", name, desc); ++ } else { ++ printf("%s\n", desc); ++ } + } + } + return CRM_EX_OK; +-- +1.8.3.1 + diff --git a/SOURCES/004-concurrent-fencing.patch b/SOURCES/004-concurrent-fencing.patch deleted file mode 100644 index 4bab3e6..0000000 --- a/SOURCES/004-concurrent-fencing.patch +++ /dev/null @@ -1,49 +0,0 @@ -From 463eb8e36e2d2bf10a0e37938e0924ea6699f041 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 30 May 2019 08:37:52 -0500 -Subject: [PATCH] Low: libpe_status: offer compile-time option to change - concurrent-fencing default - -We most likely want to make concurrent-fencing default to true at some point. -For now, offer that possibility via a compile-time constant, for experimenting. ---- - lib/pengine/common.c | 8 +++++++- - lib/pengine/status.c | 3 +++ - 2 files changed, 10 insertions(+), 1 deletion(-) - -diff --git a/lib/pengine/common.c b/lib/pengine/common.c -index 9513633..3a283b4 100644 ---- a/lib/pengine/common.c -+++ b/lib/pengine/common.c -@@ -95,7 +95,13 @@ static pe_cluster_option pe_opts[] = { - "How long to wait for the STONITH action (reboot,on,off) to complete", NULL }, - { XML_ATTR_HAVE_WATCHDOG, NULL, "boolean", NULL, "false", &check_boolean, - "Enable watchdog integration", "Set automatically by the cluster if SBD is detected. User configured values are ignored." }, -- { "concurrent-fencing", NULL, "boolean", NULL, "false", &check_boolean, -+ { "concurrent-fencing", NULL, "boolean", NULL, -+#ifdef DEFAULT_CONCURRENT_FENCING_TRUE -+ "true", -+#else -+ "false", -+#endif -+ &check_boolean, - "Allow performing fencing operations in parallel", NULL }, - { "startup-fencing", NULL, "boolean", NULL, "true", &check_boolean, - "STONITH unseen nodes", "Advanced Use Only! Not using the default is very unsafe!" }, -diff --git a/lib/pengine/status.c b/lib/pengine/status.c -index 3ccfac4..a8b0947 100644 ---- a/lib/pengine/status.c -+++ b/lib/pengine/status.c -@@ -354,6 +354,9 @@ set_working_set_defaults(pe_working_set_t * data_set) - set_bit(data_set->flags, pe_flag_stop_rsc_orphans); - set_bit(data_set->flags, pe_flag_symmetric_cluster); - set_bit(data_set->flags, pe_flag_stop_action_orphans); -+#ifdef DEFAULT_CONCURRENT_FENCING_TRUE -+ set_bit(data_set->flags, pe_flag_concurrent_fencing); -+#endif - } - - resource_t * --- -1.8.3.1 - diff --git a/SOURCES/004-unused.patch b/SOURCES/004-unused.patch new file mode 100644 index 0000000..e732b42 --- /dev/null +++ b/SOURCES/004-unused.patch @@ -0,0 +1,159 @@ +From 6df10102c02f93890c1994136b3ce6a60b33a05e Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 6 Jan 2020 11:01:38 -0600 +Subject: [PATCH] Refactor: controller: remove unused function arguments + +... 
and rename affected functions +--- + daemons/controld/controld_execd.c | 2 +- + daemons/controld/controld_fsa.c | 1 - + daemons/controld/controld_fsa.h | 4 ++-- + daemons/controld/controld_join_client.c | 4 ++-- + daemons/controld/controld_join_dc.c | 32 ++++++++++++++------------------ + 5 files changed, 19 insertions(+), 24 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index f068413..16751b9 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -839,7 +839,7 @@ do_lrm_query_internal(lrm_state_t *lrm_state, int update_flags) + } + + xmlNode * +-do_lrm_query(gboolean is_replace, const char *node_name) ++controld_query_executor_state(const char *node_name) + { + lrm_state_t *lrm_state = lrm_state_find(node_name); + +diff --git a/daemons/controld/controld_fsa.c b/daemons/controld/controld_fsa.c +index bd732bc..db2b3f3 100644 +--- a/daemons/controld/controld_fsa.c ++++ b/daemons/controld/controld_fsa.c +@@ -41,7 +41,6 @@ enum crmd_fsa_state fsa_state = S_STARTING; + + extern uint highest_born_on; + extern uint num_join_invites; +-extern void initialize_join(gboolean before); + + #define DOT_PREFIX "actions:trace: " + #define do_dot_log(fmt, args...) crm_trace( fmt, ##args) +diff --git a/daemons/controld/controld_fsa.h b/daemons/controld/controld_fsa.h +index 06794cb..8aaaadf 100644 +--- a/daemons/controld/controld_fsa.h ++++ b/daemons/controld/controld_fsa.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -484,7 +484,7 @@ extern gboolean ever_had_quorum; + // These should be moved elsewhere + void do_update_cib_nodes(gboolean overwrite, const char *caller); + int crmd_cib_smart_opt(void); +-xmlNode *do_lrm_query(gboolean, const char *node_name); ++xmlNode *controld_query_executor_state(const char *node_name); + + const char *fsa_input2string(enum crmd_fsa_input input); + const char *fsa_state2string(enum crmd_fsa_state state); +diff --git a/daemons/controld/controld_join_client.c b/daemons/controld/controld_join_client.c +index 4ac0d2a..383ee29 100644 +--- a/daemons/controld/controld_join_client.c ++++ b/daemons/controld/controld_join_client.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -264,7 +264,7 @@ do_cl_join_finalize_respond(long long action, + update_dc_expected(input->msg); + + /* send our status section to the DC */ +- tmp1 = do_lrm_query(TRUE, fsa_our_uname); ++ tmp1 = controld_query_executor_state(fsa_our_uname); + if (tmp1 != NULL) { + xmlNode *reply = create_request(CRM_OP_JOIN_CONFIRM, tmp1, fsa_our_dc, + CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL); +diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c +index ac6b430..885b2a9 100644 +--- a/daemons/controld/controld_join_dc.c ++++ b/daemons/controld/controld_join_dc.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -21,7 +21,6 @@ char *max_epoch = NULL; + char *max_generation_from = NULL; + xmlNode *max_generation_xml = NULL; + +-void initialize_join(gboolean before); + void finalize_join_for(gpointer key, gpointer value, gpointer user_data); + void finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data); + gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); +@@ -68,8 +67,8 @@ crm_update_peer_join(const char *source, crm_node_t * node, enum crm_join_phase + } + } + +-void +-initialize_join(gboolean before) ++static void ++start_join_round() + { + GHashTableIter iter; + crm_node_t *peer = NULL; +@@ -80,19 +79,16 @@ initialize_join(gboolean before) + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) { + crm_update_peer_join(__FUNCTION__, peer, crm_join_none); + } +- +- if (before) { +- if (max_generation_from != NULL) { +- free(max_generation_from); +- max_generation_from = NULL; +- } +- if (max_generation_xml != NULL) { +- free_xml(max_generation_xml); +- max_generation_xml = NULL; +- } +- clear_bit(fsa_input_register, R_HAVE_CIB); +- clear_bit(fsa_input_register, R_CIB_ASKED); ++ if (max_generation_from != NULL) { ++ free(max_generation_from); ++ max_generation_from = NULL; ++ } ++ if (max_generation_xml != NULL) { ++ free_xml(max_generation_xml); ++ max_generation_xml = NULL; + } ++ clear_bit(fsa_input_register, R_HAVE_CIB); ++ clear_bit(fsa_input_register, R_CIB_ASKED); + } + + /*! +@@ -192,7 +188,7 @@ do_dc_join_offer_all(long long action, + * will be seen as offline by the scheduler anyway. + */ + current_join_id++; +- initialize_join(TRUE); ++ start_join_round(); + /* do_update_cib_nodes(TRUE, __FUNCTION__); */ + + update_dc(NULL); +@@ -590,7 +586,7 @@ do_dc_join_ack(long long action, + controld_delete_node_state(join_from, controld_section_lrm, + cib_scope_local); + if (safe_str_eq(join_from, fsa_our_uname)) { +- xmlNode *now_dc_lrmd_state = do_lrm_query(TRUE, fsa_our_uname); ++ xmlNode *now_dc_lrmd_state = controld_query_executor_state(fsa_our_uname); + + if (now_dc_lrmd_state != NULL) { + fsa_cib_update(XML_CIB_TAG_STATUS, now_dc_lrmd_state, +-- +1.8.3.1 + diff --git a/SOURCES/005-glib-priorities.patch b/SOURCES/005-glib-priorities.patch deleted file mode 100644 index 3106932..0000000 --- a/SOURCES/005-glib-priorities.patch +++ /dev/null @@ -1,211 +0,0 @@ -From 65170ffd5fa10cbda176b3f88e817d534b6331d6 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= -Date: Wed, 29 Aug 2018 15:49:58 +0200 -Subject: [PATCH 1/2] Low: mainloop: make it possible to specify server's - priority in mainloop - ---- - include/crm/common/mainloop.h | 24 +++++++++++++ - lib/common/mainloop.c | 82 +++++++++++++++++++++++++++++++++++++++++-- - 2 files changed, 103 insertions(+), 3 deletions(-) - -diff --git a/include/crm/common/mainloop.h b/include/crm/common/mainloop.h -index 85da1cd..2cfb63e 100644 ---- a/include/crm/common/mainloop.h -+++ b/include/crm/common/mainloop.h -@@ -79,6 +79,30 @@ struct ipc_client_callbacks { - qb_ipcs_service_t *mainloop_add_ipc_server(const char *name, enum qb_ipc_type type, - struct qb_ipcs_service_handlers *callbacks); - -+/*! 
-+ * \brief Start server-side API end-point, hooked into the internal event loop -+ * -+ * \param[in] name name of the IPC end-point ("address" for the client) -+ * \param[in] type selects libqb's IPC back-end (or use #QB_IPC_NATIVE) -+ * \param[in] callbacks defines libqb's IPC service-level handlers -+ * \param[in] priority priority relative to other events handled in the -+ * abstract handling loop, use #QB_LOOP_MED when unsure -+ * -+ * \return libqb's opaque handle to the created service abstraction -+ * -+ * \note For portability concerns, do not use this function if you keep -+ * \p priority as #QB_LOOP_MED, stick with #mainloop_add_ipc_server -+ * (with exactly such semantics) instead (once you link with this new -+ * symbol employed, you can't downgrade the library freely anymore). -+ * -+ * \note The intended effect will only get fully reflected when run-time -+ * linked to patched libqb: https://github.com/ClusterLabs/libqb/pull/352 -+ */ -+qb_ipcs_service_t *mainloop_add_ipc_server_with_prio(const char *name, -+ enum qb_ipc_type type, -+ struct qb_ipcs_service_handlers *callbacks, -+ enum qb_loop_priority prio); -+ - void mainloop_del_ipc_server(qb_ipcs_service_t * server); - - mainloop_io_t *mainloop_add_ipc_client(const char *name, int priority, size_t max_size, -diff --git a/lib/common/mainloop.c b/lib/common/mainloop.c -index 18f7014..17e69f0 100644 ---- a/lib/common/mainloop.c -+++ b/lib/common/mainloop.c -@@ -509,6 +509,65 @@ gio_poll_destroy(gpointer data) - } - } - -+/*! -+ * \internal -+ * \brief Convert libqb's poll priority into GLib's one -+ * -+ * \param[in] prio libqb's poll priority (#QB_LOOP_MED assumed as fallback) -+ * -+ * \return best matching GLib's priority -+ */ -+static gint -+conv_prio_libqb2glib(enum qb_loop_priority prio) -+{ -+ gint ret = G_PRIORITY_DEFAULT; -+ switch (prio) { -+ case QB_LOOP_LOW: -+ ret = G_PRIORITY_LOW; -+ break; -+ case QB_LOOP_HIGH: -+ ret = G_PRIORITY_HIGH; -+ break; -+ default: -+ crm_trace("Invalid libqb's loop priority %d, assuming QB_LOOP_MED", -+ prio); -+ /* fall-through */ -+ case QB_LOOP_MED: -+ break; -+ } -+ return ret; -+} -+ -+/*! -+ * \internal -+ * \brief Convert libqb's poll priority to rate limiting spec -+ * -+ * \param[in] prio libqb's poll priority (#QB_LOOP_MED assumed as fallback) -+ * -+ * \return best matching rate limiting spec -+ */ -+static enum qb_ipcs_rate_limit -+conv_libqb_prio2ratelimit(enum qb_loop_priority prio) -+{ -+ /* this is an inversion of what libqb's qb_ipcs_request_rate_limit does */ -+ enum qb_ipcs_rate_limit ret = QB_IPCS_RATE_NORMAL; -+ switch (prio) { -+ case QB_LOOP_LOW: -+ ret = QB_IPCS_RATE_SLOW; -+ break; -+ case QB_LOOP_HIGH: -+ ret = QB_IPCS_RATE_FAST; -+ break; -+ default: -+ crm_trace("Invalid libqb's loop priority %d, assuming QB_LOOP_MED", -+ prio); -+ /* fall-through */ -+ case QB_LOOP_MED: -+ break; -+ } -+ return ret; -+} -+ - static int32_t - gio_poll_dispatch_update(enum qb_loop_priority p, int32_t fd, int32_t evts, - void *data, qb_ipcs_dispatch_fn_t fn, int32_t add) -@@ -555,8 +614,8 @@ gio_poll_dispatch_update(enum qb_loop_priority p, int32_t fd, int32_t evts, - adaptor->p = p; - adaptor->is_used++; - adaptor->source = -- g_io_add_watch_full(channel, G_PRIORITY_DEFAULT, evts, gio_read_socket, adaptor, -- gio_poll_destroy); -+ g_io_add_watch_full(channel, conv_prio_libqb2glib(p), evts, -+ gio_read_socket, adaptor, gio_poll_destroy); - - /* Now that mainloop now holds a reference to channel, - * thanks to g_io_add_watch_full(), drop ours from g_io_channel_unix_new(). 
-@@ -640,7 +699,15 @@ pick_ipc_type(enum qb_ipc_type requested) - - qb_ipcs_service_t * - mainloop_add_ipc_server(const char *name, enum qb_ipc_type type, -- struct qb_ipcs_service_handlers * callbacks) -+ struct qb_ipcs_service_handlers *callbacks) -+{ -+ return mainloop_add_ipc_server_with_prio(name, type, callbacks, QB_LOOP_MED); -+} -+ -+qb_ipcs_service_t * -+mainloop_add_ipc_server_with_prio(const char *name, enum qb_ipc_type type, -+ struct qb_ipcs_service_handlers *callbacks, -+ enum qb_loop_priority prio) - { - int rc = 0; - qb_ipcs_service_t *server = NULL; -@@ -652,6 +719,15 @@ mainloop_add_ipc_server(const char *name, enum qb_ipc_type type, - crm_client_init(); - server = qb_ipcs_create(name, 0, pick_ipc_type(type), callbacks); - -+ if (server == NULL) { -+ crm_err("Could not create %s IPC server: %s (%d)", name, pcmk_strerror(rc), rc); -+ return NULL; -+ } -+ -+ if (prio != QB_LOOP_MED) { -+ qb_ipcs_request_rate_limit(server, conv_libqb_prio2ratelimit(prio)); -+ } -+ - #ifdef HAVE_IPCS_GET_BUFFER_SIZE - /* All clients should use at least ipc_buffer_max as their buffer size */ - qb_ipcs_enforce_buffer_size(server, crm_ipc_default_buffer_size()); --- -1.8.3.1 - - -From 3401f25994e8cc059898550082f9b75f2d07f103 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= -Date: Wed, 29 Aug 2018 15:50:57 +0200 -Subject: [PATCH 2/2] High: stonith-ng's function cannot be blocked with CIB - updates forever - -In the high-load (or high-rate-config-change) scenarios, -pacemaker-fenced would be unable to provide service when basically DoS'd -with CIB update notifications. Try to reconcile that with elevated -priority of the server's proper listening interface in the mainloop, at -worst, it will try to fence with slightly outdated config, but appears -to be less bad than not carrying the execution at all, for instance. -Other daemons might be considered as well. 
- -Prerequisites: -- https://github.com/ClusterLabs/libqb/pull/352 - (libqb used to contain a bug due to which one particular step in the - initial-client-connection-accepting-at-the-server procedure that would - be carried out with hard-coded (and hence possibly lower than competing - events') priority, which backfires exactly in this case (once the - pacemaker part is fixed -- by the means of elevating priority for - the API end-point of fenced so that it won't get consistently - overridden with a non-socket-based event source/trigger) - -How to verify: -- mocked/based -N (see commit adding that module to mocked based daemon) ---- - lib/common/utils.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/lib/common/utils.c b/lib/common/utils.c -index 758eb1b..d1c3e26 100644 ---- a/lib/common/utils.c -+++ b/lib/common/utils.c -@@ -1031,7 +1031,8 @@ attrd_ipc_server_init(qb_ipcs_service_t **ipcs, struct qb_ipcs_service_handlers - void - stonith_ipc_server_init(qb_ipcs_service_t **ipcs, struct qb_ipcs_service_handlers *cb) - { -- *ipcs = mainloop_add_ipc_server("stonith-ng", QB_IPC_NATIVE, cb); -+ *ipcs = mainloop_add_ipc_server_with_prio("stonith-ng", QB_IPC_NATIVE, cb, -+ QB_LOOP_HIGH); - - if (*ipcs == NULL) { - crm_err("Failed to create fencer: exiting and inhibiting respawn."); --- -1.8.3.1 - diff --git a/SOURCES/005-shutdown-lock.patch b/SOURCES/005-shutdown-lock.patch new file mode 100644 index 0000000..9a4fe46 --- /dev/null +++ b/SOURCES/005-shutdown-lock.patch @@ -0,0 +1,207 @@ +From 4bdda97ff76d0e682f4f58bc632cd2cbd417c423 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Jan 2020 12:52:21 -0600 +Subject: [PATCH 01/18] Log: controller: improve messages when deleting CIB + resource history + +This also moves delete_rsc_status() to controld_based.c and renames it. +--- + daemons/controld/controld_based.c | 71 +++++++++++++++++++++++++++++++++++++++ + daemons/controld/controld_execd.c | 47 +++++--------------------- + daemons/controld/controld_utils.h | 4 ++- + 3 files changed, 83 insertions(+), 39 deletions(-) + +diff --git a/daemons/controld/controld_based.c b/daemons/controld/controld_based.c +index 42e321f..f3a7c4f 100644 +--- a/daemons/controld/controld_based.c ++++ b/daemons/controld/controld_based.c +@@ -243,3 +243,74 @@ controld_delete_node_state(const char *uname, enum controld_section_e section, + } + free(xpath); + } ++ ++// Takes node name and resource ID ++#define XPATH_RESOURCE_HISTORY "//" XML_CIB_TAG_STATE \ ++ "[@" XML_ATTR_UNAME "='%s'] /" \ ++ XML_CIB_TAG_LRM "/" XML_LRM_TAG_RESOURCES \ ++ "/" XML_LRM_TAG_RESOURCE \ ++ "[@" XML_ATTR_ID "='%s']" ++// @TODO could add "and @XML_CONFIG_ATTR_SHUTDOWN_LOCK" to limit to locks ++ ++/*! 
++ * \internal ++ * \brief Clear resource history from CIB for a given resource and node ++ * ++ * \param[in] rsc_id ID of resource to be cleared ++ * \param[in] node Node whose resource history should be cleared ++ * \param[in] user_name ACL user name to use ++ * \param[in] call_options CIB call options ++ * ++ * \return Standard Pacemaker return code ++ */ ++int ++controld_delete_resource_history(const char *rsc_id, const char *node, ++ const char *user_name, int call_options) ++{ ++ char *desc = NULL; ++ char *xpath = NULL; ++ int rc = pcmk_rc_ok; ++ ++ CRM_CHECK((rsc_id != NULL) && (node != NULL), return EINVAL); ++ ++ desc = crm_strdup_printf("resource history for %s on %s", rsc_id, node); ++ if (fsa_cib_conn == NULL) { ++ crm_err("Unable to clear %s: no CIB connection", desc); ++ free(desc); ++ return ENOTCONN; ++ } ++ ++ // Ask CIB to delete the entry ++ xpath = crm_strdup_printf(XPATH_RESOURCE_HISTORY, node, rsc_id); ++ rc = cib_internal_op(fsa_cib_conn, CIB_OP_DELETE, NULL, xpath, NULL, ++ NULL, call_options|cib_xpath, user_name); ++ ++ if (rc < 0) { ++ rc = pcmk_legacy2rc(rc); ++ crm_err("Could not delete resource status of %s on %s%s%s: %s " ++ CRM_XS " rc=%d", rsc_id, node, ++ (user_name? " for user " : ""), (user_name? user_name : ""), ++ pcmk_rc_str(rc), rc); ++ free(desc); ++ free(xpath); ++ return rc; ++ } ++ ++ if (is_set(call_options, cib_sync_call)) { ++ if (is_set(call_options, cib_dryrun)) { ++ crm_debug("Deletion of %s would succeed", desc); ++ } else { ++ crm_debug("Deletion of %s succeeded", desc); ++ } ++ free(desc); ++ ++ } else { ++ crm_info("Clearing %s (via CIB call %d) " CRM_XS " xpath=%s", ++ desc, rc, xpath); ++ fsa_register_cib_callback(rc, FALSE, desc, cib_delete_callback); ++ // CIB library handles freeing desc ++ } ++ ++ free(xpath); ++ return pcmk_rc_ok; ++} +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 16751b9..212739e 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -36,8 +36,6 @@ struct delete_event_s { + static gboolean is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id); + static gboolean build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list); + static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer user_data); +-static int delete_rsc_status(lrm_state_t * lrm_state, const char *rsc_id, int call_options, +- const char *user_name); + + static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op, + const char *rsc_id, const char *operation); +@@ -169,7 +167,8 @@ update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_ + + if (op->rsc_deleted) { + crm_debug("Purged history for '%s' after %s", op->rsc_id, op->op_type); +- delete_rsc_status(lrm_state, op->rsc_id, cib_quorum_override, NULL); ++ controld_delete_resource_history(op->rsc_id, lrm_state->node_name, ++ NULL, crmd_cib_smart_opt()); + return; + } + +@@ -917,31 +916,6 @@ lrm_remove_deleted_op(gpointer key, gpointer value, gpointer user_data) + return FALSE; + } + +-/* +- * Remove the rsc from the CIB +- * +- * Avoids refreshing the entire LRM section of this host +- */ +-#define RSC_TEMPLATE "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']" +- +-static int +-delete_rsc_status(lrm_state_t * lrm_state, const char *rsc_id, int call_options, +- const char *user_name) +-{ +- char *rsc_xpath = NULL; +- int rc = pcmk_ok; +- +- CRM_CHECK(rsc_id != NULL, return -ENXIO); +- +- rsc_xpath = 
crm_strdup_printf(RSC_TEMPLATE, lrm_state->node_name, rsc_id); +- +- rc = cib_internal_op(fsa_cib_conn, CIB_OP_DELETE, NULL, rsc_xpath, +- NULL, NULL, call_options | cib_xpath, user_name); +- +- free(rsc_xpath); +- return rc; +-} +- + static void + delete_rsc_entry(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_id, + GHashTableIter * rsc_gIter, int rc, const char *user_name) +@@ -958,7 +932,8 @@ delete_rsc_entry(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rs + else + g_hash_table_remove(lrm_state->resource_history, rsc_id_copy); + crm_debug("sync: Sending delete op for %s", rsc_id_copy); +- delete_rsc_status(lrm_state, rsc_id_copy, cib_quorum_override, user_name); ++ controld_delete_resource_history(rsc_id_copy, lrm_state->node_name, ++ user_name, crmd_cib_smart_opt()); + + g_hash_table_foreach_remove(lrm_state->pending_ops, lrm_remove_deleted_op, rsc_id_copy); + free(rsc_id_copy); +@@ -1694,21 +1669,17 @@ do_lrm_delete(ha_msg_input_t *input, lrm_state_t *lrm_state, + gboolean unregister = TRUE; + + #if ENABLE_ACL +- int cib_rc = delete_rsc_status(lrm_state, rsc->id, +- cib_dryrun|cib_sync_call, user_name); ++ int cib_rc = controld_delete_resource_history(rsc->id, lrm_state->node_name, ++ user_name, ++ cib_dryrun|cib_sync_call); + +- if (cib_rc != pcmk_ok) { ++ if (cib_rc != pcmk_rc_ok) { + lrmd_event_data_t *op = NULL; + +- crm_err("Could not delete resource status of %s for %s (user %s) on %s: %s" +- CRM_XS " rc=%d", +- rsc->id, from_sys, (user_name? user_name : "unknown"), +- from_host, pcmk_strerror(cib_rc), cib_rc); +- + op = construct_op(lrm_state, input->xml, rsc->id, CRMD_ACTION_DELETE); + op->op_status = PCMK_LRM_OP_ERROR; + +- if (cib_rc == -EACCES) { ++ if (cib_rc == EACCES) { + op->rc = PCMK_OCF_INSUFFICIENT_PRIV; + } else { + op->rc = PCMK_OCF_UNKNOWN_ERROR; +diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h +index f902361..ca8cddb 100644 +--- a/daemons/controld/controld_utils.h ++++ b/daemons/controld/controld_utils.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -95,6 +95,8 @@ enum controld_section_e { + + void controld_delete_node_state(const char *uname, + enum controld_section_e section, int options); ++int controld_delete_resource_history(const char *rsc_id, const char *node, ++ const char *user_name, int call_options); + + const char *get_node_id(xmlNode *lrm_rsc_op); + +-- +1.8.3.1 + diff --git a/SOURCES/006-bundle-fixes.patch b/SOURCES/006-bundle-fixes.patch deleted file mode 100644 index 1c3ea40..0000000 --- a/SOURCES/006-bundle-fixes.patch +++ /dev/null @@ -1,233 +0,0 @@ -From 169d424cf88594f15e7e66baa705df6b727aa807 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 4 Jun 2019 16:24:16 -0500 -Subject: [PATCH 1/4] Log: pacemaker-remoted: use different default log if pid - 1 - -When pacemaker-remoted runs as pid 1 inside a container, there may not be a -/var/log/pacemaker directory. To get around this, use a default log of -/var/log/pcmk-init.log when running as pid 1. 
- -This was chosen over alternatives (creating the /var/log/pacemaker directory, -or passing the log location as an environment variable when creating the -implicit container resource) because it both avoids forcing a restart of -active bundles due to configuration change (as well as preserving regression -test output) and allows users to configure an explicit log location via the -container image or the bundle's extra arguments. ---- - daemons/execd/pacemaker-execd.c | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c -index e2fdfca..cfa5500 100644 ---- a/daemons/execd/pacemaker-execd.c -+++ b/daemons/execd/pacemaker-execd.c -@@ -429,6 +429,14 @@ static void spawn_pidone(int argc, char **argv, char **envp) - return; - } - -+ /* Containers can be expected to have /var/log, but they may not have -+ * /var/log/pacemaker, so use a different default if no value has been -+ * explicitly configured in the container's environment. -+ */ -+ if (daemon_option("logfile") == NULL) { -+ set_daemon_option("logfile", "/var/log/pcmk-init.log"); -+ } -+ - sigfillset(&set); - sigprocmask(SIG_BLOCK, &set, 0); - --- -1.8.3.1 - - -From 7e362387a092b5617b36a69961115f7703e4d801 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 17 May 2019 12:39:43 -0500 -Subject: [PATCH 2/4] Refactor: libpe_status: add enum for bundle mount flags - -More readable than 0 or 1 ---- - lib/pengine/bundle.c | 17 +++++++++-------- - lib/pengine/variant.h | 9 ++++++++- - 2 files changed, 17 insertions(+), 9 deletions(-) - -diff --git a/lib/pengine/bundle.c b/lib/pengine/bundle.c -index 3b32f04..b223f03 100644 ---- a/lib/pengine/bundle.c -+++ b/lib/pengine/bundle.c -@@ -228,7 +228,7 @@ create_docker_resource(pe_resource_t *parent, pe__bundle_variant_data_t *data, - for(GListPtr pIter = data->mounts; pIter != NULL; pIter = pIter->next) { - pe__bundle_mount_t *mount = pIter->data; - -- if(mount->flags) { -+ if (is_set(mount->flags, pe__bundle_mount_subdir)) { - char *source = crm_strdup_printf( - "%s/%s-%d", mount->source, data->prefix, replica->offset); - -@@ -396,7 +396,7 @@ create_podman_resource(pe_resource_t *parent, pe__bundle_variant_data_t *data, - for(GListPtr pIter = data->mounts; pIter != NULL; pIter = pIter->next) { - pe__bundle_mount_t *mount = pIter->data; - -- if(mount->flags) { -+ if (is_set(mount->flags, pe__bundle_mount_subdir)) { - char *source = crm_strdup_printf( - "%s/%s-%d", mount->source, data->prefix, replica->offset); - -@@ -562,7 +562,7 @@ create_rkt_resource(pe_resource_t *parent, pe__bundle_variant_data_t *data, - for(GListPtr pIter = data->mounts; pIter != NULL; pIter = pIter->next) { - pe__bundle_mount_t *mount = pIter->data; - -- if(mount->flags) { -+ if (is_set(mount->flags, pe__bundle_mount_subdir)) { - char *source = crm_strdup_printf( - "%s/%s-%d", mount->source, data->prefix, replica->offset); - -@@ -894,7 +894,7 @@ create_container(pe_resource_t *parent, pe__bundle_variant_data_t *data, - - static void - mount_add(pe__bundle_variant_data_t *bundle_data, const char *source, -- const char *target, const char *options, int flags) -+ const char *target, const char *options, uint32_t flags) - { - pe__bundle_mount_t *mount = calloc(1, sizeof(pe__bundle_mount_t)); - -@@ -1142,11 +1142,11 @@ pe__unpack_bundle(pe_resource_t *rsc, pe_working_set_t *data_set) - const char *source = crm_element_value(xml_child, "source-dir"); - const char *target = crm_element_value(xml_child, "target-dir"); - const char *options = 
crm_element_value(xml_child, "options"); -- int flags = 0; -+ int flags = pe__bundle_mount_none; - - if (source == NULL) { - source = crm_element_value(xml_child, "source-dir-root"); -- flags = 1; -+ set_bit(flags, pe__bundle_mount_subdir); - } - - if (source && target) { -@@ -1251,9 +1251,10 @@ pe__unpack_bundle(pe_resource_t *rsc, pe_working_set_t *data_set) - * reasonable. - */ - mount_add(bundle_data, DEFAULT_REMOTE_KEY_LOCATION, -- DEFAULT_REMOTE_KEY_LOCATION, NULL, 0); -+ DEFAULT_REMOTE_KEY_LOCATION, NULL, pe__bundle_mount_none); - -- mount_add(bundle_data, CRM_BUNDLE_DIR, "/var/log", NULL, 1); -+ mount_add(bundle_data, CRM_BUNDLE_DIR, "/var/log", NULL, -+ pe__bundle_mount_subdir); - - port = calloc(1, sizeof(pe__bundle_port_t)); - if(bundle_data->control_port) { -diff --git a/lib/pengine/variant.h b/lib/pengine/variant.h -index f46aa11..7f77eef 100644 ---- a/lib/pengine/variant.h -+++ b/lib/pengine/variant.h -@@ -51,11 +51,18 @@ typedef struct { - pe_resource_t *remote; - } pe__bundle_replica_t; - -+enum pe__bundle_mount_flags { -+ pe__bundle_mount_none = 0x00, -+ -+ // mount instance-specific subdirectory rather than source directly -+ pe__bundle_mount_subdir = 0x01 -+}; -+ - typedef struct { - char *source; - char *target; - char *options; -- int flags; -+ uint32_t flags; // bitmask of pe__bundle_mount_flags - } pe__bundle_mount_t; - - typedef struct { --- -1.8.3.1 - - -From 87eac95868930ffda4d964c2b6bd9960b6893cc9 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 17 May 2019 14:13:54 -0500 -Subject: [PATCH 3/4] Fix: controller: don't check join status after remote - node appears - -Only cluster nodes have join state ---- - daemons/controld/controld_callbacks.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c -index 06ffb9d..3ce7470 100644 ---- a/daemons/controld/controld_callbacks.c -+++ b/daemons/controld/controld_callbacks.c -@@ -228,7 +228,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d - crm_trace("Alive=%d, appeared=%d, down=%d", - alive, appeared, (down? down->id : -1)); - -- if (appeared && (alive > 0)) { -+ if (appeared && (alive > 0) && !is_remote) { - register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); - } - --- -1.8.3.1 - - -From 5755b63850a17cd91bca28e83c39119378fe1887 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Sat, 18 May 2019 21:59:00 -0500 -Subject: [PATCH 4/4] Doc: Pacemaker Explained: document effect of SELinux on - bundle storage - ---- - doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt | 15 ++++++++++++--- - 1 file changed, 12 insertions(+), 3 deletions(-) - -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt -index e431626..4a181df 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt -@@ -999,11 +999,11 @@ association with Docker, Inc. is implied.] 
- -+ options="rw,Z"/> - -+ options="rw,Z"/> - - - -@@ -1293,7 +1293,8 @@ indexterm:[bundle,storage,storage-mapping] - - |options - | --|File system mount options to use when mapping the storage -+|A comma-separated list of file system mount options to use when mapping the - storage - indexterm:[options,storage-mapping] - indexterm:[storage-mapping,Property,options] - -@@ -1322,6 +1323,14 @@ The +PCMK_authkey_location+ environment variable must not be set to anything - other than the default of `/etc/pacemaker/authkey` on any node in the cluster. - ==== - -+[IMPORTANT] -+==== -+If SELinux is used in enforcing mode on the host, you must ensure the container -+is allowed to use any storage you mount into it. For Docker and podman bundles, -+adding "Z" to the mount options will create a container-specific label for the -+mount that allows the container access. -+==== -+ - === Bundle Primitive === - - A bundle may optionally contain one +<primitive>+ resource --- -1.8.3.1 - diff --git a/SOURCES/006-shutdown-lock.patch b/SOURCES/006-shutdown-lock.patch new file mode 100644 index 0000000..357a2e8 --- /dev/null +++ b/SOURCES/006-shutdown-lock.patch @@ -0,0 +1,252 @@ +From 3d8a7dc405e98cd8fe637d3e283bc0468d50bc71 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Wed, 15 Jan 2020 17:56:44 -0600 +Subject: [PATCH 02/18] Refactor: controller: functionize parts of resource + deletion notification + +... for future reuse +--- + daemons/controld/controld_execd.c | 116 +++++++++++++++++++++++++------------- + daemons/controld/controld_lrm.h | 11 +++- + 2 files changed, 88 insertions(+), 39 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 212739e..82f2bf1 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -42,9 +42,6 @@ static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op + static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, + const char *operation, xmlNode *msg); + +-void send_direct_ack(const char *to_host, const char *to_sys, +- lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *rsc_id); +- + static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state, + int log_level); + static int do_update_resource(const char *node_name, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op); +@@ -278,7 +275,7 @@ send_task_ok_ack(lrm_state_t *lrm_state, ha_msg_input_t *input, + + op->rc = PCMK_OCF_OK; + op->op_status = PCMK_LRM_OP_DONE; +- send_direct_ack(ack_host, ack_sys, rsc, op, rsc_id); ++ controld_ack_event_directly(ack_host, ack_sys, rsc, op, rsc_id); + lrmd_free_event(op); + } + +@@ -850,6 +847,57 @@ controld_query_executor_state(const char *node_name) + node_update_cluster|node_update_peer); + } + ++/*! ++ * \internal ++ * \brief Map standard Pacemaker return code to operation status and OCF code ++ * ++ * \param[out] event Executor event whose status and return code should be set ++ * \param[in] rc Standard Pacemaker return code ++ */ ++void ++controld_rc2event(lrmd_event_data_t *event, int rc) ++{ ++ switch (rc) { ++ case pcmk_rc_ok: ++ event->rc = PCMK_OCF_OK; ++ event->op_status = PCMK_LRM_OP_DONE; ++ break; ++ case EACCES: ++ event->rc = PCMK_OCF_INSUFFICIENT_PRIV; ++ event->op_status = PCMK_LRM_OP_ERROR; ++ break; ++ default: ++ event->rc = PCMK_OCF_UNKNOWN_ERROR; ++ event->op_status = PCMK_LRM_OP_ERROR; ++ break; ++ } ++} ++ ++/*! 
++ * \internal ++ * \brief Trigger a new transition after CIB status was deleted ++ * ++ * If a CIB status delete was not expected (as part of the transition graph), ++ * trigger a new transition by updating the (arbitrary) "last-lrm-refresh" ++ * cluster property. ++ * ++ * \param[in] from_sys IPC name that requested the delete ++ * \param[in] rsc_id Resource whose status was deleted (for logging only) ++ */ ++void ++controld_trigger_delete_refresh(const char *from_sys, const char *rsc_id) ++{ ++ if (safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { ++ char *now_s = crm_strdup_printf("%lld", (long long) time(NULL)); ++ ++ crm_debug("Triggering a refresh after %s cleaned %s", from_sys, rsc_id); ++ update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, ++ NULL, NULL, NULL, NULL, "last-lrm-refresh", now_s, ++ FALSE, NULL, NULL); ++ free(now_s); ++ } ++} ++ + static void + notify_deleted(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_id, int rc) + { +@@ -860,33 +908,11 @@ notify_deleted(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_ + crm_info("Notifying %s on %s that %s was%s deleted", + from_sys, (from_host? from_host : "localhost"), rsc_id, + ((rc == pcmk_ok)? "" : " not")); +- + op = construct_op(lrm_state, input->xml, rsc_id, CRMD_ACTION_DELETE); +- +- if (rc == pcmk_ok) { +- op->op_status = PCMK_LRM_OP_DONE; +- op->rc = PCMK_OCF_OK; +- } else { +- op->op_status = PCMK_LRM_OP_ERROR; +- op->rc = PCMK_OCF_UNKNOWN_ERROR; +- } +- +- send_direct_ack(from_host, from_sys, NULL, op, rsc_id); ++ controld_rc2event(op, pcmk_legacy2rc(rc)); ++ controld_ack_event_directly(from_host, from_sys, NULL, op, rsc_id); + lrmd_free_event(op); +- +- if (safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { +- /* this isn't expected - trigger a new transition */ +- time_t now = time(NULL); +- char *now_s = crm_itoa(now); +- +- crm_debug("Triggering a refresh after %s deleted %s from the executor", +- from_sys, rsc_id); +- +- update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, +- "last-lrm-refresh", now_s, FALSE, NULL, NULL); +- +- free(now_s); +- } ++ controld_trigger_delete_refresh(from_sys, rsc_id); + } + + static gboolean +@@ -1495,7 +1521,7 @@ fail_lrm_resource(xmlNode *xml, lrm_state_t *lrm_state, const char *user_name, + #if ENABLE_ACL + if (user_name && is_privileged(user_name) == FALSE) { + crm_err("%s does not have permission to fail %s", user_name, ID(xml_rsc)); +- send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc)); ++ controld_ack_event_directly(from_host, from_sys, NULL, op, ID(xml_rsc)); + lrmd_free_event(op); + return; + } +@@ -1514,7 +1540,7 @@ fail_lrm_resource(xmlNode *xml, lrm_state_t *lrm_state, const char *user_name, + crm_log_xml_warn(xml, "bad input"); + } + +- send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc)); ++ controld_ack_event_directly(from_host, from_sys, NULL, op, ID(xml_rsc)); + lrmd_free_event(op); + } + +@@ -1684,7 +1710,7 @@ do_lrm_delete(ha_msg_input_t *input, lrm_state_t *lrm_state, + } else { + op->rc = PCMK_OCF_UNKNOWN_ERROR; + } +- send_direct_ack(from_host, from_sys, NULL, op, rsc->id); ++ controld_ack_event_directly(from_host, from_sys, NULL, op, rsc->id); + lrmd_free_event(op); + return; + } +@@ -2000,9 +2026,23 @@ construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op, const char *rsc_id, cons + return op; + } + ++/*! 
++ * \internal ++ * \brief Send a (synthesized) event result ++ * ++ * Reply with a synthesized event result directly, as opposed to going through ++ * the executor. ++ * ++ * \param[in] to_host Host to send result to ++ * \param[in] to_sys IPC name to send result to (NULL for transition engine) ++ * \param[in] rsc Type information about resource the result is for ++ * \param[in] op Event with result to send ++ * \param[in] rsc_id ID of resource the result is for ++ */ + void +-send_direct_ack(const char *to_host, const char *to_sys, +- lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *rsc_id) ++controld_ack_event_directly(const char *to_host, const char *to_sys, ++ lrmd_rsc_info_t *rsc, lrmd_event_data_t *op, ++ const char *rsc_id) + { + xmlNode *reply = NULL; + xmlNode *update, *iter; +@@ -2221,7 +2261,7 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, + + op->rc = PCMK_OCF_UNKNOWN_ERROR; + op->op_status = PCMK_LRM_OP_INVALID; +- send_direct_ack(NULL, NULL, rsc, op, rsc->id); ++ controld_ack_event_directly(NULL, NULL, rsc, op, rsc->id); + lrmd_free_event(op); + free(op_id); + return; +@@ -2288,7 +2328,7 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, + decode_transition_key(op->user_data, NULL, NULL, NULL, &target_rc); + op->rc = target_rc; + op->op_status = PCMK_LRM_OP_DONE; +- send_direct_ack(NULL, NULL, rsc, op, rsc->id); ++ controld_ack_event_directly(NULL, NULL, rsc, op, rsc->id); + } + + pending->params = op->params; +@@ -2388,7 +2428,7 @@ do_update_resource(const char *node_name, lrmd_rsc_info_t * rsc, lrmd_event_data + + } else { + crm_warn("Resource %s no longer exists in the executor", op->rsc_id); +- send_direct_ack(NULL, NULL, rsc, op, op->rsc_id); ++ controld_ack_event_directly(NULL, NULL, rsc, op, op->rsc_id); + goto cleanup; + } + +@@ -2660,7 +2700,7 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, + } + + if (need_direct_ack) { +- send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); ++ controld_ack_event_directly(NULL, NULL, NULL, op, op->rsc_id); + } + + if(remove == FALSE) { +diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h +index 3ab7048..7acac2a 100644 +--- a/daemons/controld/controld_lrm.h ++++ b/daemons/controld/controld_lrm.h +@@ -1,11 +1,13 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
+ */ ++#ifndef CONTROLD_LRM__H ++# define CONTROLD_LRM__H + + #include + #include +@@ -169,3 +171,10 @@ gboolean remote_ra_controlling_guest(lrm_state_t * lrm_state); + + void process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, + active_op_t *pending, xmlNode *action_xml); ++void controld_ack_event_directly(const char *to_host, const char *to_sys, ++ lrmd_rsc_info_t *rsc, lrmd_event_data_t *op, ++ const char *rsc_id); ++void controld_rc2event(lrmd_event_data_t *event, int rc); ++void controld_trigger_delete_refresh(const char *from_sys, const char *rsc_id); ++ ++#endif +-- +1.8.3.1 + diff --git a/SOURCES/007-fork-controld_fencing.patch b/SOURCES/007-fork-controld_fencing.patch deleted file mode 100644 index 903f01a..0000000 --- a/SOURCES/007-fork-controld_fencing.patch +++ /dev/null @@ -1,2158 +0,0 @@ -From edd133ade2bd9b003d3437280271a9c9dbab3ed6 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 23 May 2019 16:36:12 -0500 -Subject: [PATCH] Refactor: controller: separate fencing-related functionality - into own source file - -Before: - 748 daemons/controld/controld_te_actions.c - 942 daemons/controld/controld_te_callbacks.c - 725 daemons/controld/controld_te_utils.c - 84 daemons/controld/controld_transition.h - 110 daemons/controld/controld_utils.h - -After: - 838 daemons/controld/controld_fencing.c - 37 daemons/controld/controld_fencing.h - 631 daemons/controld/controld_te_actions.c - 701 daemons/controld/controld_te_callbacks.c - 298 daemons/controld/controld_te_utils.c - 65 daemons/controld/controld_transition.h - 106 daemons/controld/controld_utils.h ---- - daemons/controld/Makefile.am | 5 +- - daemons/controld/controld_callbacks.c | 3 +- - daemons/controld/controld_control.c | 2 +- - daemons/controld/controld_election.c | 3 +- - daemons/controld/controld_fencing.c | 838 +++++++++++++++++++++++++++++++ - daemons/controld/controld_fencing.h | 37 ++ - daemons/controld/controld_fsa.c | 1 + - daemons/controld/controld_messages.c | 1 + - daemons/controld/controld_te_actions.c | 121 +---- - daemons/controld/controld_te_callbacks.c | 243 +-------- - daemons/controld/controld_te_utils.c | 429 +--------------- - daemons/controld/controld_transition.c | 1 - - daemons/controld/controld_transition.h | 21 +- - daemons/controld/controld_utils.h | 4 - - 14 files changed, 891 insertions(+), 818 deletions(-) - create mode 100644 daemons/controld/controld_fencing.c - create mode 100644 daemons/controld/controld_fencing.h - -diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am -index 17c3342..858e1bb 100644 ---- a/daemons/controld/Makefile.am -+++ b/daemons/controld/Makefile.am -@@ -1,5 +1,7 @@ - # --# Copyright 2004-2018 Andrew Beekhof -+# Copyright 2018-2019 the Pacemaker project contributors -+# -+# The version control history for this file may have further details. - # - # This source code is licensed under the GNU General Public License version 2 - # or later (GPLv2+) WITHOUT ANY WARRANTY. 
-@@ -46,6 +48,7 @@ pacemaker_controld_SOURCES = pacemaker-controld.c \ - controld_election.c \ - controld_execd.c \ - controld_execd_state.c \ -+ controld_fencing.c \ - controld_fsa.c \ - controld_join_client.c \ - controld_join_dc.c \ -diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c -index a188263..06ffb9d 100644 ---- a/daemons/controld/controld_callbacks.c -+++ b/daemons/controld/controld_callbacks.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2018 Andrew Beekhof -+ * Copyright 2004-2019 the Pacemaker project contributors - * - * This source code is licensed under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. -@@ -22,6 +22,7 @@ - #include - #include - #include -+#include - #include - #include - -diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c -index 6d9f335..7f918c0 100644 ---- a/daemons/controld/controld_control.c -+++ b/daemons/controld/controld_control.c -@@ -25,6 +25,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -147,7 +148,6 @@ extern char *max_generation_from; - extern xmlNode *max_generation_xml; - extern GHashTable *resource_history; - extern GHashTable *voted; --extern char *te_client_id; - - void - crmd_fast_exit(crm_exit_t exit_code) -diff --git a/daemons/controld/controld_election.c b/daemons/controld/controld_election.c -index 5d6858c..9e49c7b 100644 ---- a/daemons/controld/controld_election.c -+++ b/daemons/controld/controld_election.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2019 Andrew Beekhof -+ * Copyright 2004-2019 the Pacemaker project contributors - * - * This source code is licensed under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. -@@ -18,6 +18,7 @@ - #include - #include - #include -+#include - #include - #include - #include -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -new file mode 100644 -index 0000000..cde57b5 ---- /dev/null -+++ b/daemons/controld/controld_fencing.c -@@ -0,0 +1,838 @@ -+/* -+ * Copyright 2004-2019 the Pacemaker project contributors -+ * -+ * This source code is licensed under the GNU General Public License version 2 -+ * or later (GPLv2+) WITHOUT ANY WARRANTY. -+ */ -+ -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+ -+#ifdef HAVE_SYS_REBOOT_H -+# include -+# include -+#endif -+ -+/* -+ * stonith failure counting -+ * -+ * We don't want to get stuck in a permanent fencing loop. Keep track of the -+ * number of fencing failures for each target node, and the most we'll restart a -+ * transition for. 
-+ */ -+ -+struct st_fail_rec { -+ int count; -+}; -+ -+static unsigned long int stonith_max_attempts = 10; -+static GHashTable *stonith_failures = NULL; -+ -+void -+update_stonith_max_attempts(const char *value) -+{ -+ if (safe_str_eq(value, CRM_INFINITY_S)) { -+ stonith_max_attempts = CRM_SCORE_INFINITY; -+ } else { -+ stonith_max_attempts = crm_int_helper(value, NULL); -+ } -+} -+ -+static gboolean -+too_many_st_failures(const char *target) -+{ -+ GHashTableIter iter; -+ const char *key = NULL; -+ struct st_fail_rec *value = NULL; -+ -+ if (stonith_failures == NULL) { -+ return FALSE; -+ } -+ -+ if (target == NULL) { -+ g_hash_table_iter_init(&iter, stonith_failures); -+ while (g_hash_table_iter_next(&iter, (gpointer *) &key, -+ (gpointer *) &value)) { -+ -+ if (value->count >= stonith_max_attempts) { -+ target = (const char*)key; -+ goto too_many; -+ } -+ } -+ } else { -+ value = g_hash_table_lookup(stonith_failures, target); -+ if ((value != NULL) && (value->count >= stonith_max_attempts)) { -+ goto too_many; -+ } -+ } -+ return FALSE; -+ -+too_many: -+ crm_warn("Too many failures (%d) to fence %s, giving up", -+ value->count, target); -+ return TRUE; -+} -+ -+/*! -+ * \internal -+ * \brief Reset a stonith fail count -+ * -+ * \param[in] target Name of node to reset, or NULL for all -+ */ -+void -+st_fail_count_reset(const char *target) -+{ -+ if (stonith_failures == NULL) { -+ return; -+ } -+ -+ if (target) { -+ struct st_fail_rec *rec = NULL; -+ -+ rec = g_hash_table_lookup(stonith_failures, target); -+ if (rec) { -+ rec->count = 0; -+ } -+ } else { -+ GHashTableIter iter; -+ const char *key = NULL; -+ struct st_fail_rec *rec = NULL; -+ -+ g_hash_table_iter_init(&iter, stonith_failures); -+ while (g_hash_table_iter_next(&iter, (gpointer *) &key, -+ (gpointer *) &rec)) { -+ rec->count = 0; -+ } -+ } -+} -+ -+static void -+st_fail_count_increment(const char *target) -+{ -+ struct st_fail_rec *rec = NULL; -+ -+ if (stonith_failures == NULL) { -+ stonith_failures = crm_str_table_new(); -+ } -+ -+ rec = g_hash_table_lookup(stonith_failures, target); -+ if (rec) { -+ rec->count++; -+ } else { -+ rec = malloc(sizeof(struct st_fail_rec)); -+ if(rec == NULL) { -+ return; -+ } -+ -+ rec->count = 1; -+ g_hash_table_insert(stonith_failures, strdup(target), rec); -+ } -+} -+ -+/* end stonith fail count functions */ -+ -+ -+static void -+cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, -+ void *user_data) -+{ -+ if (rc < pcmk_ok) { -+ crm_err("Fencing update %d for %s: failed - %s (%d)", -+ call_id, (char *)user_data, pcmk_strerror(rc), rc); -+ crm_log_xml_warn(msg, "Failed update"); -+ abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL); -+ -+ } else { -+ crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); -+ } -+} -+ -+static void -+send_stonith_update(crm_action_t *action, const char *target, const char *uuid) -+{ -+ int rc = pcmk_ok; -+ crm_node_t *peer = NULL; -+ -+ /* We (usually) rely on the membership layer to do node_update_cluster, -+ * and the peer status callback to do node_update_peer, because the node -+ * might have already rejoined before we get the stonith result here. 
-+ */ -+ int flags = node_update_join | node_update_expected; -+ -+ /* zero out the node-status & remove all LRM status info */ -+ xmlNode *node_state = NULL; -+ -+ CRM_CHECK(target != NULL, return); -+ CRM_CHECK(uuid != NULL, return); -+ -+ /* Make sure the membership and join caches are accurate */ -+ peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY); -+ -+ CRM_CHECK(peer != NULL, return); -+ -+ if (peer->state == NULL) { -+ /* Usually, we rely on the membership layer to update the cluster state -+ * in the CIB. However, if the node has never been seen, do it here, so -+ * the node is not considered unclean. -+ */ -+ flags |= node_update_cluster; -+ } -+ -+ if (peer->uuid == NULL) { -+ crm_info("Recording uuid '%s' for node '%s'", uuid, target); -+ peer->uuid = strdup(uuid); -+ } -+ -+ crmd_peer_down(peer, TRUE); -+ -+ /* Generate a node state update for the CIB */ -+ node_state = create_node_state_update(peer, flags, NULL, __FUNCTION__); -+ -+ /* we have to mark whether or not remote nodes have already been fenced */ -+ if (peer->flags & crm_remote_node) { -+ time_t now = time(NULL); -+ char *now_s = crm_itoa(now); -+ crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s); -+ free(now_s); -+ } -+ -+ /* Force our known ID */ -+ crm_xml_add(node_state, XML_ATTR_UUID, uuid); -+ -+ rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state, -+ cib_quorum_override | cib_scope_local | cib_can_create); -+ -+ /* Delay processing the trigger until the update completes */ -+ crm_debug("Sending fencing update %d for %s", rc, target); -+ fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated); -+ -+ /* Make sure it sticks */ -+ /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */ -+ -+ erase_status_tag(peer->uname, XML_CIB_TAG_LRM, cib_scope_local); -+ erase_status_tag(peer->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); -+ -+ free_xml(node_state); -+ return; -+} -+ -+/*! -+ * \internal -+ * \brief Abort transition due to stonith failure -+ * -+ * \param[in] abort_action Whether to restart or stop transition -+ * \param[in] target Don't restart if this (NULL for any) has too many failures -+ * \param[in] reason Log this stonith action XML as abort reason (or NULL) -+ */ -+static void -+abort_for_stonith_failure(enum transition_action abort_action, -+ const char *target, xmlNode *reason) -+{ -+ /* If stonith repeatedly fails, we eventually give up on starting a new -+ * transition for that reason. -+ */ -+ if ((abort_action != tg_stop) && too_many_st_failures(target)) { -+ abort_action = tg_stop; -+ } -+ abort_transition(INFINITY, abort_action, "Stonith failed", reason); -+} -+ -+ -+/* -+ * stonith cleanup list -+ * -+ * If the DC is shot, proper notifications might not go out. -+ * The stonith cleanup list allows the cluster to (re-)send -+ * notifications once a new DC is elected. -+ */ -+ -+static GListPtr stonith_cleanup_list = NULL; -+ -+/*! -+ * \internal -+ * \brief Add a node to the stonith cleanup list -+ * -+ * \param[in] target Name of node to add -+ */ -+void -+add_stonith_cleanup(const char *target) { -+ stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target)); -+} -+ -+/*! 
-+ * \internal -+ * \brief Remove a node from the stonith cleanup list -+ * -+ * \param[in] Name of node to remove -+ */ -+void -+remove_stonith_cleanup(const char *target) -+{ -+ GListPtr iter = stonith_cleanup_list; -+ -+ while (iter != NULL) { -+ GListPtr tmp = iter; -+ char *iter_name = tmp->data; -+ -+ iter = iter->next; -+ if (safe_str_eq(target, iter_name)) { -+ crm_trace("Removing %s from the cleanup list", iter_name); -+ stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp); -+ free(iter_name); -+ } -+ } -+} -+ -+/*! -+ * \internal -+ * \brief Purge all entries from the stonith cleanup list -+ */ -+void -+purge_stonith_cleanup() -+{ -+ if (stonith_cleanup_list) { -+ GListPtr iter = NULL; -+ -+ for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { -+ char *target = iter->data; -+ -+ crm_info("Purging %s from stonith cleanup list", target); -+ free(target); -+ } -+ g_list_free(stonith_cleanup_list); -+ stonith_cleanup_list = NULL; -+ } -+} -+ -+/*! -+ * \internal -+ * \brief Send stonith updates for all entries in cleanup list, then purge it -+ */ -+void -+execute_stonith_cleanup() -+{ -+ GListPtr iter; -+ -+ for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { -+ char *target = iter->data; -+ crm_node_t *target_node = crm_get_peer(0, target); -+ const char *uuid = crm_peer_uuid(target_node); -+ -+ crm_notice("Marking %s, target of a previous stonith action, as clean", target); -+ send_stonith_update(NULL, target, uuid); -+ free(target); -+ } -+ g_list_free(stonith_cleanup_list); -+ stonith_cleanup_list = NULL; -+} -+ -+/* end stonith cleanup list functions */ -+ -+ -+/* stonith API client -+ * -+ * Functions that need to interact directly with the fencer via its API -+ */ -+ -+stonith_t *stonith_api = NULL; -+crm_trigger_t *stonith_reconnect = NULL; -+char *te_client_id = NULL; -+ -+static gboolean -+fail_incompletable_stonith(crm_graph_t *graph) -+{ -+ GListPtr lpc = NULL; -+ const char *task = NULL; -+ xmlNode *last_action = NULL; -+ -+ if (graph == NULL) { -+ return FALSE; -+ } -+ -+ for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { -+ GListPtr lpc2 = NULL; -+ synapse_t *synapse = (synapse_t *) lpc->data; -+ -+ if (synapse->confirmed) { -+ continue; -+ } -+ -+ for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) { -+ crm_action_t *action = (crm_action_t *) lpc2->data; -+ -+ if (action->type != action_type_crm || action->confirmed) { -+ continue; -+ } -+ -+ task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); -+ if (task && safe_str_eq(task, CRM_OP_FENCE)) { -+ action->failed = TRUE; -+ last_action = action->xml; -+ update_graph(graph, action); -+ crm_notice("Failing action %d (%s): fencer terminated", -+ action->id, ID(action->xml)); -+ } -+ } -+ } -+ -+ if (last_action != NULL) { -+ crm_warn("Fencer failure resulted in unrunnable actions"); -+ abort_for_stonith_failure(tg_restart, NULL, last_action); -+ return TRUE; -+ } -+ -+ return FALSE; -+} -+ -+static void -+tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) -+{ -+ if (is_set(fsa_input_register, R_ST_REQUIRED)) { -+ crm_crit("Fencing daemon connection failed"); -+ mainloop_set_trigger(stonith_reconnect); -+ -+ } else { -+ crm_info("Fencing daemon disconnected"); -+ } -+ -+ if (stonith_api) { -+ stonith_api->state = stonith_disconnected; -+ } -+ -+ if (AM_I_DC) { -+ fail_incompletable_stonith(transition_graph); -+ trigger_graph(); -+ } -+} -+ -+static void -+tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) -+{ -+ if 
(te_client_id == NULL) { -+ te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, -+ (unsigned long) getpid()); -+ } -+ -+ if (st_event == NULL) { -+ crm_err("Notify data not found"); -+ return; -+ } -+ -+ crmd_alert_fencing_op(st_event); -+ -+ if ((st_event->result == pcmk_ok) && safe_str_eq("on", st_event->action)) { -+ crm_notice("%s was successfully unfenced by %s (at the request of %s)", -+ st_event->target, -+ st_event->executioner? st_event->executioner : "", -+ st_event->origin); -+ /* TODO: Hook up st_event->device */ -+ return; -+ -+ } else if (safe_str_eq("on", st_event->action)) { -+ crm_err("Unfencing of %s by %s failed: %s (%d)", -+ st_event->target, -+ st_event->executioner? st_event->executioner : "", -+ pcmk_strerror(st_event->result), st_event->result); -+ return; -+ -+ } else if ((st_event->result == pcmk_ok) -+ && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) { -+ -+ crm_crit("We were allegedly just fenced by %s for %s!", -+ st_event->executioner? st_event->executioner : "", -+ st_event->origin); /* Dumps blackbox if enabled */ -+ -+ qb_log_fini(); /* Try to get the above log message to disk - somehow */ -+ -+ /* Get out ASAP and do not come back up. -+ * -+ * Triggering a reboot is also not the worst idea either since -+ * the rest of the cluster thinks we're safely down -+ */ -+ -+#ifdef RB_HALT_SYSTEM -+ reboot(RB_HALT_SYSTEM); -+#endif -+ -+ /* -+ * If reboot() fails or is not supported, coming back up will -+ * probably lead to a situation where the other nodes set our -+ * status to 'lost' because of the fencing callback and will -+ * discard subsequent election votes with: -+ * -+ * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster) -+ * -+ * So just stay dead, something is seriously messed up anyway. -+ * -+ */ -+ exit(CRM_EX_FATAL); // None of our wrappers since we already called qb_log_fini() -+ return; -+ } -+ -+ /* Update the count of stonith failures for this target, in case we become -+ * DC later. The current DC has already updated its fail count in -+ * tengine_stonith_callback(). -+ */ -+ if (!AM_I_DC && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) { -+ if (st_event->result == pcmk_ok) { -+ st_fail_count_reset(st_event->target); -+ } else { -+ st_fail_count_increment(st_event->target); -+ } -+ } -+ -+ crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " -+ CRM_XS " initiator=%s ref=%s", -+ st_event->target, st_event->result == pcmk_ok ? "" : " not", -+ st_event->action, -+ st_event->executioner ? st_event->executioner : "", -+ (st_event->client_origin? st_event->client_origin : ""), -+ pcmk_strerror(st_event->result), -+ st_event->origin, st_event->id); -+ -+ if (st_event->result == pcmk_ok) { -+ crm_node_t *peer = crm_find_known_peer_full(0, st_event->target, CRM_GET_PEER_ANY); -+ const char *uuid = NULL; -+ gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname); -+ -+ if (peer == NULL) { -+ return; -+ } -+ -+ uuid = crm_peer_uuid(peer); -+ -+ crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); -+ if(AM_I_DC) { -+ /* The DC always sends updates */ -+ send_stonith_update(NULL, st_event->target, uuid); -+ -+ /* @TODO Ideally, at this point, we'd check whether the fenced node -+ * hosted any guest nodes, and call remote_node_down() for them. -+ * Unfortunately, the controller doesn't have a simple, reliable way -+ * to map hosts to guests. 
It might be possible to track this in the -+ * peer cache via crm_remote_peer_cache_refresh(). For now, we rely -+ * on the PE creating fence pseudo-events for the guests. -+ */ -+ -+ if (st_event->client_origin -+ && safe_str_neq(st_event->client_origin, te_client_id)) { -+ -+ /* Abort the current transition graph if it wasn't us -+ * that invoked stonith to fence someone -+ */ -+ crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); -+ abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); -+ } -+ -+ /* Assume it was our leader if we don't currently have one */ -+ } else if (((fsa_our_dc == NULL) || safe_str_eq(fsa_our_dc, st_event->target)) -+ && is_not_set(peer->flags, crm_remote_node)) { -+ -+ crm_notice("Target %s our leader %s (recorded: %s)", -+ fsa_our_dc ? "was" : "may have been", st_event->target, -+ fsa_our_dc ? fsa_our_dc : ""); -+ -+ /* Given the CIB resyncing that occurs around elections, -+ * have one node update the CIB now and, if the new DC is different, -+ * have them do so too after the election -+ */ -+ if (we_are_executioner) { -+ send_stonith_update(NULL, st_event->target, uuid); -+ } -+ add_stonith_cleanup(st_event->target); -+ } -+ -+ /* If the target is a remote node, and we host its connection, -+ * immediately fail all monitors so it can be recovered quickly. -+ * The connection won't necessarily drop when a remote node is fenced, -+ * so the failure might not otherwise be detected until the next poke. -+ */ -+ if (is_set(peer->flags, crm_remote_node)) { -+ remote_ra_fail(st_event->target); -+ } -+ -+ crmd_peer_down(peer, TRUE); -+ } -+} -+ -+/*! -+ * \brief Connect to fencer -+ * -+ * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop -+ * -+ * \return TRUE -+ * \note If user_data is NULL, this will wait 2s between attempts, for up to -+ * 30 attempts, meaning the controller could be blocked as long as 58s. 
-+ */ -+gboolean -+te_connect_stonith(gpointer user_data) -+{ -+ int rc = pcmk_ok; -+ -+ if (stonith_api == NULL) { -+ stonith_api = stonith_api_new(); -+ } -+ -+ if (stonith_api->state != stonith_disconnected) { -+ crm_trace("Already connected to fencer, no need to retry"); -+ return TRUE; -+ } -+ -+ if (user_data == NULL) { -+ // Blocking (retry failures now until successful) -+ rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30); -+ if (rc != pcmk_ok) { -+ crm_err("Could not connect to fencer in 30 attempts: %s " -+ CRM_XS " rc=%d", pcmk_strerror(rc), rc); -+ } -+ } else { -+ // Non-blocking (retry failures later in main loop) -+ rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); -+ if (rc != pcmk_ok) { -+ if (is_set(fsa_input_register, R_ST_REQUIRED)) { -+ crm_err("Fencer connection failed (will retry): %s " -+ CRM_XS " rc=%d", pcmk_strerror(rc), rc); -+ mainloop_set_trigger(stonith_reconnect); -+ } else { -+ crm_info("Fencer connection failed (ignoring because no longer required): %s " -+ CRM_XS " rc=%d", pcmk_strerror(rc), rc); -+ } -+ return TRUE; -+ } -+ } -+ -+ if (rc == pcmk_ok) { -+ stonith_api->cmds->register_notification(stonith_api, -+ T_STONITH_NOTIFY_DISCONNECT, -+ tengine_stonith_connection_destroy); -+ stonith_api->cmds->register_notification(stonith_api, -+ T_STONITH_NOTIFY_FENCE, -+ tengine_stonith_notify); -+ } -+ return TRUE; -+} -+ -+static gboolean -+do_stonith_history_sync(gpointer user_data) -+{ -+ if (stonith_api && (stonith_api->state != stonith_disconnected)) { -+ stonith_history_t *history = NULL; -+ -+ stonith_api->cmds->history(stonith_api, -+ st_opt_sync_call | st_opt_broadcast, -+ NULL, &history, 5); -+ stonith_history_free(history); -+ return TRUE; -+ } else { -+ crm_info("Skip triggering stonith history-sync as stonith is disconnected"); -+ return FALSE; -+ } -+} -+ -+static void -+tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) -+{ -+ char *uuid = NULL; -+ int stonith_id = -1; -+ int transition_id = -1; -+ crm_action_t *action = NULL; -+ int call_id = data->call_id; -+ int rc = data->rc; -+ char *userdata = data->userdata; -+ -+ CRM_CHECK(userdata != NULL, return); -+ crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, -+ pcmk_strerror(rc), rc); -+ -+ if (AM_I_DC == FALSE) { -+ return; -+ } -+ -+ /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ -+ /* op->call_id, op->optype, op->node_name, op->op_result, */ -+ /* (char *)op->node_list, op->private_data); */ -+ -+ /* filter out old STONITH actions */ -+ CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL), -+ goto bail); -+ -+ if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid) -+ || transition_graph->id != transition_id) { -+ crm_info("Ignoring STONITH action initiated outside of the current transition"); -+ goto bail; -+ } -+ -+ action = get_action(stonith_id, FALSE); -+ if (action == NULL) { -+ crm_err("Stonith action not matched"); -+ goto bail; -+ } -+ -+ stop_te_timer(action->timer); -+ if (rc == pcmk_ok) { -+ const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); -+ const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); -+ const char *op = crm_meta_value(action->params, "stonith_action"); -+ -+ crm_info("Stonith operation %d for %s passed", call_id, target); -+ if (action->confirmed == FALSE) { -+ te_action_confirmed(action); -+ if (safe_str_eq("on", op)) { -+ const char *value = 
NULL; -+ char *now = crm_itoa(time(NULL)); -+ -+ update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, FALSE); -+ free(now); -+ -+ value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL); -+ update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, FALSE); -+ -+ value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE); -+ update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, FALSE); -+ -+ } else if (action->sent_update == FALSE) { -+ send_stonith_update(action, target, uuid); -+ action->sent_update = TRUE; -+ } -+ } -+ st_fail_count_reset(target); -+ -+ } else { -+ const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); -+ enum transition_action abort_action = tg_restart; -+ -+ action->failed = TRUE; -+ crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", -+ call_id, target, pcmk_strerror(rc)); -+ -+ /* If no fence devices were available, there's no use in immediately -+ * checking again, so don't start a new transition in that case. -+ */ -+ if (rc == -ENODEV) { -+ crm_warn("No devices found in cluster to fence %s, giving up", -+ target); -+ abort_action = tg_stop; -+ } -+ -+ /* Increment the fail count now, so abort_for_stonith_failure() can -+ * check it. Non-DC nodes will increment it in tengine_stonith_notify(). -+ */ -+ st_fail_count_increment(target); -+ abort_for_stonith_failure(abort_action, target, NULL); -+ } -+ -+ update_graph(transition_graph, action); -+ trigger_graph(); -+ -+ bail: -+ free(userdata); -+ free(uuid); -+ return; -+} -+ -+gboolean -+te_fence_node(crm_graph_t *graph, crm_action_t *action) -+{ -+ int rc = 0; -+ const char *id = NULL; -+ const char *uuid = NULL; -+ const char *target = NULL; -+ const char *type = NULL; -+ gboolean invalid_action = FALSE; -+ enum stonith_call_options options = st_opt_none; -+ -+ id = ID(action->xml); -+ target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); -+ uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); -+ type = crm_meta_value(action->params, "stonith_action"); -+ -+ CRM_CHECK(id != NULL, invalid_action = TRUE); -+ CRM_CHECK(uuid != NULL, invalid_action = TRUE); -+ CRM_CHECK(type != NULL, invalid_action = TRUE); -+ CRM_CHECK(target != NULL, invalid_action = TRUE); -+ -+ if (invalid_action) { -+ crm_log_xml_warn(action->xml, "BadAction"); -+ return FALSE; -+ } -+ -+ crm_notice("Requesting fencing (%s) of node %s " -+ CRM_XS " action=%s timeout=%d", -+ type, target, id, transition_graph->stonith_timeout); -+ -+ /* Passing NULL means block until we can connect... */ -+ te_connect_stonith(NULL); -+ -+ if (crmd_join_phase_count(crm_join_confirmed) == 1) { -+ options |= st_opt_allow_suicide; -+ } -+ -+ rc = stonith_api->cmds->fence(stonith_api, options, target, type, -+ transition_graph->stonith_timeout / 1000, 0); -+ -+ stonith_api->cmds->register_callback(stonith_api, rc, transition_graph->stonith_timeout / 1000, -+ st_opt_timeout_updates, -+ generate_transition_key(transition_graph->id, action->id, -+ 0, te_uuid), -+ "tengine_stonith_callback", tengine_stonith_callback); -+ -+ return TRUE; -+} -+ -+/* end stonith API client functions */ -+ -+ -+/* -+ * stonith history synchronization -+ * -+ * Each node's fencer keeps track of a cluster-wide fencing history. When a node -+ * joins or leaves, we need to synchronize the history across all nodes. 
-+ */ -+ -+static crm_trigger_t *stonith_history_sync_trigger = NULL; -+static mainloop_timer_t *stonith_history_sync_timer = NULL; -+ -+static gboolean -+stonith_history_sync_set_trigger(gpointer user_data) -+{ -+ mainloop_set_trigger(stonith_history_sync_trigger); -+ return FALSE; -+} -+ -+void -+te_trigger_stonith_history_sync(void) -+{ -+ /* trigger a sync in 5s to give more nodes the -+ * chance to show up so that we don't create -+ * unnecessary stonith-history-sync traffic -+ */ -+ -+ /* as we are finally checking the stonith-connection -+ * in do_stonith_history_sync we should be fine -+ * leaving stonith_history_sync_time & stonith_history_sync_trigger -+ * around -+ */ -+ if (stonith_history_sync_trigger == NULL) { -+ stonith_history_sync_trigger = -+ mainloop_add_trigger(G_PRIORITY_LOW, -+ do_stonith_history_sync, NULL); -+ } -+ -+ if(stonith_history_sync_timer == NULL) { -+ stonith_history_sync_timer = -+ mainloop_timer_add("history_sync", 5000, -+ FALSE, stonith_history_sync_set_trigger, -+ NULL); -+ } -+ crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); -+ mainloop_timer_start(stonith_history_sync_timer); -+} -+ -+/* end stonith history synchronization functions */ -diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h -new file mode 100644 -index 0000000..b80a6c9 ---- /dev/null -+++ b/daemons/controld/controld_fencing.h -@@ -0,0 +1,37 @@ -+/* -+ * Copyright 2004-2019 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU Lesser General Public License -+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. -+ */ -+ -+#ifndef CONTROLD_FENCING__H -+# define CONTROLD_FENCING__H -+ -+#include // bool -+#include // crm_graph_t, crm_action_t -+ -+extern crm_trigger_t *stonith_reconnect; -+extern char *te_client_id; -+extern stonith_t *stonith_api; -+ -+// stonith fail counts -+void st_fail_count_reset(const char * target); -+void update_stonith_max_attempts(const char* value); -+ -+// stonith API client -+gboolean te_connect_stonith(gpointer user_data); -+gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action); -+ -+// stonith cleanup list -+void add_stonith_cleanup(const char *target); -+void remove_stonith_cleanup(const char *target); -+void purge_stonith_cleanup(void); -+void execute_stonith_cleanup(void); -+ -+// stonith history synchronization -+void te_trigger_stonith_history_sync(void); -+ -+#endif -diff --git a/daemons/controld/controld_fsa.c b/daemons/controld/controld_fsa.c -index 9eca530..dc1937f 100644 ---- a/daemons/controld/controld_fsa.c -+++ b/daemons/controld/controld_fsa.c -@@ -26,6 +26,7 @@ - #include - #include - #include -+#include - #include - #include - -diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c -index 2ebc203..8f37cbf 100644 ---- a/daemons/controld/controld_messages.c -+++ b/daemons/controld/controld_messages.c -@@ -25,6 +25,7 @@ - #include - #include - #include -+#include - #include - #include - -diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c -index c95c6c7..2f61556 100644 ---- a/daemons/controld/controld_te_actions.c -+++ b/daemons/controld/controld_te_actions.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2018 Andrew Beekhof -+ * Copyright 2004-2019 the Pacemaker project contributors - * - * This source code is licensed under the GNU General Public License version 2 - * or later 
(GPLv2+) WITHOUT ANY WARRANTY. -@@ -17,6 +17,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -76,124 +77,6 @@ te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo) - return TRUE; - } - --void --send_stonith_update(crm_action_t * action, const char *target, const char *uuid) --{ -- int rc = pcmk_ok; -- crm_node_t *peer = NULL; -- -- /* We (usually) rely on the membership layer to do node_update_cluster, -- * and the peer status callback to do node_update_peer, because the node -- * might have already rejoined before we get the stonith result here. -- */ -- int flags = node_update_join | node_update_expected; -- -- /* zero out the node-status & remove all LRM status info */ -- xmlNode *node_state = NULL; -- -- CRM_CHECK(target != NULL, return); -- CRM_CHECK(uuid != NULL, return); -- -- /* Make sure the membership and join caches are accurate */ -- peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY); -- -- CRM_CHECK(peer != NULL, return); -- -- if (peer->state == NULL) { -- /* Usually, we rely on the membership layer to update the cluster state -- * in the CIB. However, if the node has never been seen, do it here, so -- * the node is not considered unclean. -- */ -- flags |= node_update_cluster; -- } -- -- if (peer->uuid == NULL) { -- crm_info("Recording uuid '%s' for node '%s'", uuid, target); -- peer->uuid = strdup(uuid); -- } -- -- crmd_peer_down(peer, TRUE); -- -- /* Generate a node state update for the CIB */ -- node_state = create_node_state_update(peer, flags, NULL, __FUNCTION__); -- -- /* we have to mark whether or not remote nodes have already been fenced */ -- if (peer->flags & crm_remote_node) { -- time_t now = time(NULL); -- char *now_s = crm_itoa(now); -- crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s); -- free(now_s); -- } -- -- /* Force our known ID */ -- crm_xml_add(node_state, XML_ATTR_UUID, uuid); -- -- rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state, -- cib_quorum_override | cib_scope_local | cib_can_create); -- -- /* Delay processing the trigger until the update completes */ -- crm_debug("Sending fencing update %d for %s", rc, target); -- fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated); -- -- /* Make sure it sticks */ -- /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */ -- -- erase_status_tag(peer->uname, XML_CIB_TAG_LRM, cib_scope_local); -- erase_status_tag(peer->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); -- -- free_xml(node_state); -- return; --} -- --static gboolean --te_fence_node(crm_graph_t * graph, crm_action_t * action) --{ -- int rc = 0; -- const char *id = NULL; -- const char *uuid = NULL; -- const char *target = NULL; -- const char *type = NULL; -- gboolean invalid_action = FALSE; -- enum stonith_call_options options = st_opt_none; -- -- id = ID(action->xml); -- target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); -- uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); -- type = crm_meta_value(action->params, "stonith_action"); -- -- CRM_CHECK(id != NULL, invalid_action = TRUE); -- CRM_CHECK(uuid != NULL, invalid_action = TRUE); -- CRM_CHECK(type != NULL, invalid_action = TRUE); -- CRM_CHECK(target != NULL, invalid_action = TRUE); -- -- if (invalid_action) { -- crm_log_xml_warn(action->xml, "BadAction"); -- return FALSE; -- } -- -- crm_notice("Requesting fencing (%s) of node %s " -- CRM_XS " action=%s timeout=%d", -- type, target, id, transition_graph->stonith_timeout); -- -- /* Passing NULL means 
block until we can connect... */ -- te_connect_stonith(NULL); -- -- if (crmd_join_phase_count(crm_join_confirmed) == 1) { -- options |= st_opt_allow_suicide; -- } -- -- rc = stonith_api->cmds->fence(stonith_api, options, target, type, -- transition_graph->stonith_timeout / 1000, 0); -- -- stonith_api->cmds->register_callback(stonith_api, rc, transition_graph->stonith_timeout / 1000, -- st_opt_timeout_updates, -- generate_transition_key(transition_graph->id, action->id, -- 0, te_uuid), -- "tengine_stonith_callback", tengine_stonith_callback); -- -- return TRUE; --} -- - static int - get_target_rc(crm_action_t * action) - { -diff --git a/daemons/controld/controld_te_callbacks.c b/daemons/controld/controld_te_callbacks.c -index 22b5f4b..1ab703f 100644 ---- a/daemons/controld/controld_te_callbacks.c -+++ b/daemons/controld/controld_te_callbacks.c -@@ -17,6 +17,7 @@ - - #include - #include -+#include - - #include /* For ONLINESTATUS etc */ - -@@ -27,21 +28,9 @@ gboolean shuttingdown = FALSE; - crm_graph_t *transition_graph; - crm_trigger_t *transition_trigger = NULL; - --static unsigned long int stonith_max_attempts = 10; -- - /* #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */ - #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']" - --void --update_stonith_max_attempts(const char* value) --{ -- if (safe_str_eq(value, CRM_INFINITY_S)) { -- stonith_max_attempts = CRM_SCORE_INFINITY; -- } -- else { -- stonith_max_attempts = crm_int_helper(value, NULL); -- } --} - static void - te_update_diff_v1(const char *event, xmlNode *diff) - { -@@ -646,236 +635,6 @@ process_te_message(xmlNode * msg, xmlNode * xml_data) - return TRUE; - } - --GHashTable *stonith_failures = NULL; --struct st_fail_rec { -- int count; --}; -- --static gboolean --too_many_st_failures(const char *target) --{ -- GHashTableIter iter; -- const char *key = NULL; -- struct st_fail_rec *value = NULL; -- -- if (stonith_failures == NULL) { -- return FALSE; -- } -- -- if (target == NULL) { -- g_hash_table_iter_init(&iter, stonith_failures); -- while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) { -- if (value->count >= stonith_max_attempts) { -- target = (const char*)key; -- goto too_many; -- } -- } -- } else { -- value = g_hash_table_lookup(stonith_failures, target); -- if ((value != NULL) && (value->count >= stonith_max_attempts)) { -- goto too_many; -- } -- } -- return FALSE; -- --too_many: -- crm_warn("Too many failures (%d) to fence %s, giving up", -- value->count, target); -- return TRUE; --} -- --/*! 
-- * \internal -- * \brief Reset a stonith fail count -- * -- * \param[in] target Name of node to reset, or NULL for all -- */ --void --st_fail_count_reset(const char *target) --{ -- if (stonith_failures == NULL) { -- return; -- } -- -- if (target) { -- struct st_fail_rec *rec = NULL; -- -- rec = g_hash_table_lookup(stonith_failures, target); -- if (rec) { -- rec->count = 0; -- } -- } else { -- GHashTableIter iter; -- const char *key = NULL; -- struct st_fail_rec *rec = NULL; -- -- g_hash_table_iter_init(&iter, stonith_failures); -- while (g_hash_table_iter_next(&iter, (gpointer *) &key, -- (gpointer *) &rec)) { -- rec->count = 0; -- } -- } --} -- --void --st_fail_count_increment(const char *target) --{ -- struct st_fail_rec *rec = NULL; -- -- if (stonith_failures == NULL) { -- stonith_failures = crm_str_table_new(); -- } -- -- rec = g_hash_table_lookup(stonith_failures, target); -- if (rec) { -- rec->count++; -- } else { -- rec = malloc(sizeof(struct st_fail_rec)); -- if(rec == NULL) { -- return; -- } -- -- rec->count = 1; -- g_hash_table_insert(stonith_failures, strdup(target), rec); -- } --} -- --/*! -- * \internal -- * \brief Abort transition due to stonith failure -- * -- * \param[in] abort_action Whether to restart or stop transition -- * \param[in] target Don't restart if this (NULL for any) has too many failures -- * \param[in] reason Log this stonith action XML as abort reason (or NULL) -- */ --void --abort_for_stonith_failure(enum transition_action abort_action, -- const char *target, xmlNode *reason) --{ -- /* If stonith repeatedly fails, we eventually give up on starting a new -- * transition for that reason. -- */ -- if ((abort_action != tg_stop) && too_many_st_failures(target)) { -- abort_action = tg_stop; -- } -- abort_transition(INFINITY, abort_action, "Stonith failed", reason); --} -- --void --tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data) --{ -- char *uuid = NULL; -- int stonith_id = -1; -- int transition_id = -1; -- crm_action_t *action = NULL; -- int call_id = data->call_id; -- int rc = data->rc; -- char *userdata = data->userdata; -- -- CRM_CHECK(userdata != NULL, return); -- crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, -- pcmk_strerror(rc), rc); -- -- if (AM_I_DC == FALSE) { -- return; -- } -- -- /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ -- /* op->call_id, op->optype, op->node_name, op->op_result, */ -- /* (char *)op->node_list, op->private_data); */ -- -- /* filter out old STONITH actions */ -- CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL), -- goto bail); -- -- if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid) -- || transition_graph->id != transition_id) { -- crm_info("Ignoring STONITH action initiated outside of the current transition"); -- goto bail; -- } -- -- action = get_action(stonith_id, FALSE); -- if (action == NULL) { -- crm_err("Stonith action not matched"); -- goto bail; -- } -- -- stop_te_timer(action->timer); -- if (rc == pcmk_ok) { -- const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); -- const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); -- const char *op = crm_meta_value(action->params, "stonith_action"); -- -- crm_info("Stonith operation %d for %s passed", call_id, target); -- if (action->confirmed == FALSE) { -- te_action_confirmed(action); -- if (safe_str_eq("on", op)) { -- const char *value = NULL; -- char *now = 
crm_itoa(time(NULL)); -- -- update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, FALSE); -- free(now); -- -- value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL); -- update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, FALSE); -- -- value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE); -- update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, FALSE); -- -- } else if (action->sent_update == FALSE) { -- send_stonith_update(action, target, uuid); -- action->sent_update = TRUE; -- } -- } -- st_fail_count_reset(target); -- -- } else { -- const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); -- enum transition_action abort_action = tg_restart; -- -- action->failed = TRUE; -- crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", -- call_id, target, pcmk_strerror(rc)); -- -- /* If no fence devices were available, there's no use in immediately -- * checking again, so don't start a new transition in that case. -- */ -- if (rc == -ENODEV) { -- crm_warn("No devices found in cluster to fence %s, giving up", -- target); -- abort_action = tg_stop; -- } -- -- /* Increment the fail count now, so abort_for_stonith_failure() can -- * check it. Non-DC nodes will increment it in tengine_stonith_notify(). -- */ -- st_fail_count_increment(target); -- abort_for_stonith_failure(abort_action, target, NULL); -- } -- -- update_graph(transition_graph, action); -- trigger_graph(); -- -- bail: -- free(userdata); -- free(uuid); -- return; --} -- --void --cib_fencing_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) --{ -- if (rc < pcmk_ok) { -- crm_err("Fencing update %d for %s: failed - %s (%d)", -- call_id, (char *)user_data, pcmk_strerror(rc), rc); -- crm_log_xml_warn(msg, "Failed update"); -- abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL); -- -- } else { -- crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); -- } --} -- - void - cib_action_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) - { -diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c -index 22f83ad..1496244 100644 ---- a/daemons/controld/controld_te_utils.c -+++ b/daemons/controld/controld_te_utils.c -@@ -6,441 +6,14 @@ - */ - - #include -- --#include - #include -- - #include -- - #include -+ - #include - #include --#include - #include - #include --#include -- --crm_trigger_t *stonith_reconnect = NULL; --static crm_trigger_t *stonith_history_sync_trigger = NULL; --static mainloop_timer_t *stonith_history_sync_timer = NULL; -- --/* -- * stonith cleanup list -- * -- * If the DC is shot, proper notifications might not go out. -- * The stonith cleanup list allows the cluster to (re-)send -- * notifications once a new DC is elected. -- */ -- --static GListPtr stonith_cleanup_list = NULL; -- --/*! -- * \internal -- * \brief Add a node to the stonith cleanup list -- * -- * \param[in] target Name of node to add -- */ --void --add_stonith_cleanup(const char *target) { -- stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target)); --} -- --/*! 
-- * \internal -- * \brief Remove a node from the stonith cleanup list -- * -- * \param[in] Name of node to remove -- */ --void --remove_stonith_cleanup(const char *target) --{ -- GListPtr iter = stonith_cleanup_list; -- -- while (iter != NULL) { -- GListPtr tmp = iter; -- char *iter_name = tmp->data; -- -- iter = iter->next; -- if (safe_str_eq(target, iter_name)) { -- crm_trace("Removing %s from the cleanup list", iter_name); -- stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp); -- free(iter_name); -- } -- } --} -- --/*! -- * \internal -- * \brief Purge all entries from the stonith cleanup list -- */ --void --purge_stonith_cleanup() --{ -- if (stonith_cleanup_list) { -- GListPtr iter = NULL; -- -- for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { -- char *target = iter->data; -- -- crm_info("Purging %s from stonith cleanup list", target); -- free(target); -- } -- g_list_free(stonith_cleanup_list); -- stonith_cleanup_list = NULL; -- } --} -- --/*! -- * \internal -- * \brief Send stonith updates for all entries in cleanup list, then purge it -- */ --void --execute_stonith_cleanup() --{ -- GListPtr iter; -- -- for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { -- char *target = iter->data; -- crm_node_t *target_node = crm_get_peer(0, target); -- const char *uuid = crm_peer_uuid(target_node); -- -- crm_notice("Marking %s, target of a previous stonith action, as clean", target); -- send_stonith_update(NULL, target, uuid); -- free(target); -- } -- g_list_free(stonith_cleanup_list); -- stonith_cleanup_list = NULL; --} -- --/* end stonith cleanup list functions */ -- --static gboolean --fail_incompletable_stonith(crm_graph_t * graph) --{ -- GListPtr lpc = NULL; -- const char *task = NULL; -- xmlNode *last_action = NULL; -- -- if (graph == NULL) { -- return FALSE; -- } -- -- for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { -- GListPtr lpc2 = NULL; -- synapse_t *synapse = (synapse_t *) lpc->data; -- -- if (synapse->confirmed) { -- continue; -- } -- -- for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) { -- crm_action_t *action = (crm_action_t *) lpc2->data; -- -- if (action->type != action_type_crm || action->confirmed) { -- continue; -- } -- -- task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); -- if (task && safe_str_eq(task, CRM_OP_FENCE)) { -- action->failed = TRUE; -- last_action = action->xml; -- update_graph(graph, action); -- crm_notice("Failing action %d (%s): fencer terminated", -- action->id, ID(action->xml)); -- } -- } -- } -- -- if (last_action != NULL) { -- crm_warn("Fencer failure resulted in unrunnable actions"); -- abort_for_stonith_failure(tg_restart, NULL, last_action); -- return TRUE; -- } -- -- return FALSE; --} -- --static void --tengine_stonith_connection_destroy(stonith_t * st, stonith_event_t * e) --{ -- if (is_set(fsa_input_register, R_ST_REQUIRED)) { -- crm_crit("Fencing daemon connection failed"); -- mainloop_set_trigger(stonith_reconnect); -- -- } else { -- crm_info("Fencing daemon disconnected"); -- } -- -- /* cbchan will be garbage at this point, arrange for it to be reset */ -- if(stonith_api) { -- stonith_api->state = stonith_disconnected; -- } -- -- if (AM_I_DC) { -- fail_incompletable_stonith(transition_graph); -- trigger_graph(); -- } --} -- --char *te_client_id = NULL; -- --#ifdef HAVE_SYS_REBOOT_H --# include --# include --#endif -- --static void --tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event) --{ -- if(te_client_id == NULL) { -- te_client_id = 
crm_strdup_printf("%s.%lu", crm_system_name, -- (unsigned long) getpid()); -- } -- -- if (st_event == NULL) { -- crm_err("Notify data not found"); -- return; -- } -- -- crmd_alert_fencing_op(st_event); -- -- if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) { -- crm_notice("%s was successfully unfenced by %s (at the request of %s)", -- st_event->target, st_event->executioner ? st_event->executioner : "", st_event->origin); -- /* TODO: Hook up st_event->device */ -- return; -- -- } else if (safe_str_eq("on", st_event->action)) { -- crm_err("Unfencing of %s by %s failed: %s (%d)", -- st_event->target, st_event->executioner ? st_event->executioner : "", -- pcmk_strerror(st_event->result), st_event->result); -- return; -- -- } else if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) { -- crm_crit("We were allegedly just fenced by %s for %s!", -- st_event->executioner ? st_event->executioner : "", st_event->origin); /* Dumps blackbox if enabled */ -- -- qb_log_fini(); /* Try to get the above log message to disk - somehow */ -- -- /* Get out ASAP and do not come back up. -- * -- * Triggering a reboot is also not the worst idea either since -- * the rest of the cluster thinks we're safely down -- */ -- --#ifdef RB_HALT_SYSTEM -- reboot(RB_HALT_SYSTEM); --#endif -- -- /* -- * If reboot() fails or is not supported, coming back up will -- * probably lead to a situation where the other nodes set our -- * status to 'lost' because of the fencing callback and will -- * discard subsequent election votes with: -- * -- * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster) -- * -- * So just stay dead, something is seriously messed up anyway. -- * -- */ -- exit(CRM_EX_FATAL); // None of our wrappers since we already called qb_log_fini() -- return; -- } -- -- /* Update the count of stonith failures for this target, in case we become -- * DC later. The current DC has already updated its fail count in -- * tengine_stonith_callback(). -- */ -- if (!AM_I_DC && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) { -- if (st_event->result == pcmk_ok) { -- st_fail_count_reset(st_event->target); -- } else { -- st_fail_count_increment(st_event->target); -- } -- } -- -- crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " -- CRM_XS " initiator=%s ref=%s", -- st_event->target, st_event->result == pcmk_ok ? "" : " not", -- st_event->action, -- st_event->executioner ? st_event->executioner : "", -- (st_event->client_origin? st_event->client_origin : ""), -- pcmk_strerror(st_event->result), -- st_event->origin, st_event->id); -- -- if (st_event->result == pcmk_ok) { -- crm_node_t *peer = crm_find_known_peer_full(0, st_event->target, CRM_GET_PEER_ANY); -- const char *uuid = NULL; -- gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname); -- -- if (peer == NULL) { -- return; -- } -- -- uuid = crm_peer_uuid(peer); -- -- crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); -- if(AM_I_DC) { -- /* The DC always sends updates */ -- send_stonith_update(NULL, st_event->target, uuid); -- -- /* @TODO Ideally, at this point, we'd check whether the fenced node -- * hosted any guest nodes, and call remote_node_down() for them. -- * Unfortunately, the controller doesn't have a simple, reliable way -- * to map hosts to guests. It might be possible to track this in the -- * peer cache via crm_remote_peer_cache_refresh(). 
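The st_fail_count_* helpers removed earlier in this patch keep one integer counter per fence target in a GHashTable: an entry is created on the first failure, bumped on each subsequent one, and zeroed on success. A self-contained GLib rendering of that bookkeeping pattern follows; it is a sketch only (the controller's actual code uses crm_str_table_new() and bare malloc, as the deleted hunk shows), and all names here are hypothetical:

    #include <glib.h>

    static GHashTable *fail_counts = NULL;

    /* Increment the failure counter for a target, creating it on first use */
    static void
    fail_count_increment(const char *target)
    {
        int *count = NULL;

        if (fail_counts == NULL) {
            /* table owns both the strdup'd keys and the counter values */
            fail_counts = g_hash_table_new_full(g_str_hash, g_str_equal,
                                                g_free, g_free);
        }
        count = g_hash_table_lookup(fail_counts, target);
        if (count != NULL) {
            (*count)++;
        } else {
            count = g_new0(int, 1);
            *count = 1;
            g_hash_table_insert(fail_counts, g_strdup(target), count);
        }
    }

    /* Reset one counter, or all counters when target is NULL */
    static void
    fail_count_reset(const char *target)
    {
        if (fail_counts == NULL) {
            return;
        }
        if (target != NULL) {
            int *count = g_hash_table_lookup(fail_counts, target);

            if (count != NULL) {
                *count = 0;
            }
        } else {
            GHashTableIter iter;
            gpointer key = NULL;
            gpointer value = NULL;

            g_hash_table_iter_init(&iter, fail_counts);
            while (g_hash_table_iter_next(&iter, &key, &value)) {
                *(int *) value = 0;
            }
        }
    }

Letting the table own both key and value (g_free destructors) means a single g_hash_table_destroy() reclaims everything, which the hand-rolled malloc variant in the deleted code has to manage explicitly.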
For now, we rely -- * on the PE creating fence pseudo-events for the guests. -- */ -- -- if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) { -- -- /* Abort the current transition graph if it wasn't us -- * that invoked stonith to fence someone -- */ -- crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); -- abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); -- } -- -- /* Assume it was our leader if we don't currently have one */ -- } else if (((fsa_our_dc == NULL) || safe_str_eq(fsa_our_dc, st_event->target)) -- && !is_set(peer->flags, crm_remote_node)) { -- -- crm_notice("Target %s our leader %s (recorded: %s)", -- fsa_our_dc ? "was" : "may have been", st_event->target, -- fsa_our_dc ? fsa_our_dc : ""); -- -- /* Given the CIB resyncing that occurs around elections, -- * have one node update the CIB now and, if the new DC is different, -- * have them do so too after the election -- */ -- if (we_are_executioner) { -- send_stonith_update(NULL, st_event->target, uuid); -- } -- add_stonith_cleanup(st_event->target); -- } -- -- /* If the target is a remote node, and we host its connection, -- * immediately fail all monitors so it can be recovered quickly. -- * The connection won't necessarily drop when a remote node is fenced, -- * so the failure might not otherwise be detected until the next poke. -- */ -- if (is_set(peer->flags, crm_remote_node)) { -- remote_ra_fail(st_event->target); -- } -- -- crmd_peer_down(peer, TRUE); -- } --} -- --static gboolean --do_stonith_history_sync(gpointer user_data) --{ -- if (stonith_api && (stonith_api->state != stonith_disconnected)) { -- stonith_history_t *history = NULL; -- -- stonith_api->cmds->history(stonith_api, -- st_opt_sync_call | st_opt_broadcast, -- NULL, &history, 5); -- stonith_history_free(history); -- return TRUE; -- } else { -- crm_info("Skip triggering stonith history-sync as stonith is disconnected"); -- return FALSE; -- } --} -- --static gboolean --stonith_history_sync_set_trigger(gpointer user_data) --{ -- mainloop_set_trigger(stonith_history_sync_trigger); -- return FALSE; --} -- --void --te_trigger_stonith_history_sync(void) --{ -- /* trigger a sync in 5s to give more nodes the -- * chance to show up so that we don't create -- * unnecessary stonith-history-sync traffic -- */ -- -- /* as we are finally checking the stonith-connection -- * in do_stonith_history_sync we should be fine -- * leaving stonith_history_sync_time & stonith_history_sync_trigger -- * around -- */ -- if (stonith_history_sync_trigger == NULL) { -- stonith_history_sync_trigger = -- mainloop_add_trigger(G_PRIORITY_LOW, -- do_stonith_history_sync, NULL); -- } -- -- if(stonith_history_sync_timer == NULL) { -- stonith_history_sync_timer = -- mainloop_timer_add("history_sync", 5000, -- FALSE, stonith_history_sync_set_trigger, -- NULL); -- } -- crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); -- mainloop_timer_start(stonith_history_sync_timer); --} -- --/*! -- * \brief Connect to fencer -- * -- * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop -- * -- * \return TRUE -- * \note If user_data is NULL, this will wait 2s between attempts, for up to -- * 30 attempts, meaning the controller could be blocked as long as 58s. 
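The 58-second figure in the comment above follows directly from the stated retry parameters: 30 connection attempts with a 2-second wait between consecutive tries gives at most 29 * 2 = 58 seconds spent sleeping. A bounded-retry helper with those semantics might look like the following; this is a hypothetical sketch of the pattern, not the libstonithd implementation:

    #include <unistd.h>

    /* Try to connect up to max_attempts times, pausing 2s between tries,
     * so at most (max_attempts - 1) * 2 seconds are spent sleeping.
     * try_connect is assumed to return 0 on success. */
    static int
    connect_retry(int (*try_connect)(void), int max_attempts)
    {
        int rc = -1;

        for (int attempt = 1; attempt <= max_attempts; attempt++) {
            rc = try_connect();
            if (rc == 0) {
                break;          /* connected */
            }
            if (attempt < max_attempts) {
                sleep(2);       /* wait before the next attempt */
            }
        }
        return rc;
    }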
-- */ --gboolean --te_connect_stonith(gpointer user_data) --{ -- int rc = pcmk_ok; -- -- if (stonith_api == NULL) { -- stonith_api = stonith_api_new(); -- } -- -- if (stonith_api->state != stonith_disconnected) { -- crm_trace("Already connected to fencer, no need to retry"); -- return TRUE; -- } -- -- if (user_data == NULL) { -- // Blocking (retry failures now until successful) -- rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30); -- if (rc != pcmk_ok) { -- crm_err("Could not connect to fencer in 30 attempts: %s " -- CRM_XS " rc=%d", pcmk_strerror(rc), rc); -- } -- } else { -- // Non-blocking (retry failures later in main loop) -- rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); -- if (rc != pcmk_ok) { -- if (is_set(fsa_input_register, R_ST_REQUIRED)) { -- crm_err("Fencer connection failed (will retry): %s " -- CRM_XS " rc=%d", pcmk_strerror(rc), rc); -- mainloop_set_trigger(stonith_reconnect); -- } else { -- crm_info("Fencer connection failed (ignoring because no longer required): %s " -- CRM_XS " rc=%d", pcmk_strerror(rc), rc); -- } -- return TRUE; -- } -- } -- -- if (rc == pcmk_ok) { -- stonith_api->cmds->register_notification(stonith_api, -- T_STONITH_NOTIFY_DISCONNECT, -- tengine_stonith_connection_destroy); -- stonith_api->cmds->register_notification(stonith_api, -- T_STONITH_NOTIFY_FENCE, -- tengine_stonith_notify); -- } -- return TRUE; --} - - gboolean - stop_te_timer(crm_action_timer_t * timer) -diff --git a/daemons/controld/controld_transition.c b/daemons/controld/controld_transition.c -index 5f164ab..b942ab4 100644 ---- a/daemons/controld/controld_transition.c -+++ b/daemons/controld/controld_transition.c -@@ -18,7 +18,6 @@ - - - extern crm_graph_functions_t te_graph_fns; --stonith_t *stonith_api = NULL; - - static void - global_cib_callback(const xmlNode * msg, int callid, int rc, xmlNode * output) -diff --git a/daemons/controld/controld_transition.h b/daemons/controld/controld_transition.h -index a162f99..f31ac2d 100644 ---- a/daemons/controld/controld_transition.h -+++ b/daemons/controld/controld_transition.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2018 Andrew Beekhof -+ * Copyright 2004-2019 the Pacemaker project contributors - * - * This source code is licensed under the GNU Lesser General Public License - * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
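te_connect_stonith(), whose body appears above, has two modes: called with NULL user_data it retries synchronously via stonith_api_connect_retry(), otherwise it makes a single non-blocking attempt and re-arms the stonith_reconnect trigger on failure. Either way, a successful connection is followed by registering callbacks for the two fencer events the controller cares about. A condensed connect-then-subscribe sketch, using only API calls visible in the patch itself (the include lines and empty callback bodies are assumptions):

    #include <crm/crm.h>            /* assumed: pcmk_ok */
    #include <crm/stonith-ng.h>     /* stonith_t, stonith_api_new(), events */

    static void on_disconnect(stonith_t *st, stonith_event_t *e) { /* ... */ }
    static void on_fence(stonith_t *st, stonith_event_t *e) { /* ... */ }

    static stonith_t *fencer = NULL;

    static int
    fencer_connect(const char *client_name)
    {
        int rc;

        if (fencer == NULL) {
            fencer = stonith_api_new();
        }
        /* one non-blocking attempt; NULL means no fd passed back */
        rc = fencer->cmds->connect(fencer, client_name, NULL);
        if (rc == pcmk_ok) {
            fencer->cmds->register_notification(fencer,
                                                T_STONITH_NOTIFY_DISCONNECT,
                                                on_disconnect);
            fencer->cmds->register_notification(fencer,
                                                T_STONITH_NOTIFY_FENCE,
                                                on_fence);
        }
        return rc;
    }

Registering the disconnect notification is what lets the controller fail incompletable fence actions and trigger a reconnect when the fencer goes away, as the surrounding code shows.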
-@@ -12,15 +12,6 @@ - # include - # include - # include --extern stonith_t *stonith_api; --extern void send_stonith_update(crm_action_t * stonith_action, const char *target, -- const char *uuid); -- --/* stonith cleanup list */ --void add_stonith_cleanup(const char *target); --void remove_stonith_cleanup(const char *target); --void purge_stonith_cleanup(void); --void execute_stonith_cleanup(void); - - /* tengine */ - extern crm_action_t *match_down_event(const char *target); -@@ -46,16 +37,11 @@ extern char *te_uuid; - - extern void notify_crmd(crm_graph_t * graph); - --void cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, -- void *user_data); - void cib_action_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, - void *user_data); - gboolean action_timer_callback(gpointer data); - gboolean te_graph_trigger(gpointer user_data); - void te_update_diff(const char *event, xmlNode *msg); --void tengine_stonith_callback(stonith_t *stonith, -- stonith_callback_data_t *data); --void update_stonith_max_attempts(const char* value); - - extern void trigger_graph_processing(const char *fn, int line); - void abort_after_delay(int abort_priority, enum transition_action abort_action, -@@ -68,12 +54,7 @@ extern void abort_transition_graph(int abort_priority, enum transition_action ab - # define abort_transition(pri, action, text, reason) \ - abort_transition_graph(pri, action, text, reason,__FUNCTION__,__LINE__); - --extern gboolean te_connect_stonith(gpointer user_data); -- --extern void te_trigger_stonith_history_sync(void); -- - extern crm_trigger_t *transition_trigger; --extern crm_trigger_t *stonith_reconnect; - - extern char *failed_stop_offset; - extern char *failed_start_offset; -diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h -index 68992f5..8b80e3c 100644 ---- a/daemons/controld/controld_utils.h -+++ b/daemons/controld/controld_utils.h -@@ -85,10 +85,6 @@ int crmd_join_phase_count(enum crm_join_phase phase); - void crmd_join_phase_log(int level); - - const char *get_timer_desc(fsa_timer_t * timer); --void st_fail_count_reset(const char * target); --void st_fail_count_increment(const char *target); --void abort_for_stonith_failure(enum transition_action abort_action, -- const char *target, xmlNode *reason); - void crmd_peer_down(crm_node_t *peer, bool full); - unsigned int cib_op_timeout(void); - --- -1.8.3.1 - -From 3002e485651e1ad18da6d44e7672dbe4f0380d3b Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 23 May 2019 18:18:06 -0500 -Subject: [PATCH] Refactor: controller: isolate stonith API handling - -can now make more variables and functions static ---- - daemons/controld/controld_control.c | 28 +++------------------ - daemons/controld/controld_fencing.c | 49 ++++++++++++++++++++++++++++++++++--- - daemons/controld/controld_fencing.h | 7 ++---- - 3 files changed, 50 insertions(+), 34 deletions(-) - -diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c -index 7f918c0..e99d605 100644 ---- a/daemons/controld/controld_control.c -+++ b/daemons/controld/controld_control.c -@@ -113,14 +113,7 @@ do_shutdown(long long action, - { - /* just in case */ - set_bit(fsa_input_register, R_SHUTDOWN); -- -- if (stonith_api) { -- /* Prevent it from coming up again */ -- clear_bit(fsa_input_register, R_ST_REQUIRED); -- -- crm_info("Disconnecting from fencer"); -- stonith_api->cmds->disconnect(stonith_api); -- } -+ controld_disconnect_fencer(FALSE); - } - - /* A_SHUTDOWN_REQ */ -@@ -201,12 +194,7 @@ 
crmd_exit(crm_exit_t exit_code) - - controld_close_attrd_ipc(); - pe_subsystem_free(); -- -- if(stonith_api) { -- crm_trace("Disconnecting fencing API"); -- clear_bit(fsa_input_register, R_ST_REQUIRED); -- stonith_api->cmds->free(stonith_api); stonith_api = NULL; -- } -+ controld_disconnect_fencer(TRUE); - - if ((exit_code == CRM_EX_OK) && (crmd_mainloop == NULL)) { - crm_debug("No mainloop detected"); -@@ -258,7 +246,6 @@ crmd_exit(crm_exit_t exit_code) - mainloop_destroy_trigger(fsa_source); fsa_source = NULL; - - mainloop_destroy_trigger(config_read); config_read = NULL; -- mainloop_destroy_trigger(stonith_reconnect); stonith_reconnect = NULL; - mainloop_destroy_trigger(transition_trigger); transition_trigger = NULL; - - crm_client_cleanup(); -@@ -288,7 +275,6 @@ crmd_exit(crm_exit_t exit_code) - free(fsa_cluster_name); fsa_cluster_name = NULL; - - free(te_uuid); te_uuid = NULL; -- free(te_client_id); te_client_id = NULL; - free(fsa_pe_ref); fsa_pe_ref = NULL; - free(failed_stop_offset); failed_stop_offset = NULL; - free(failed_start_offset); failed_start_offset = NULL; -@@ -627,15 +613,7 @@ do_started(long long action, - crm_err("Failed to create IPC server: shutting down and inhibiting respawn"); - register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); - } -- -- // Try connecting to fencer (retrying later in mainloop if failed) -- if (stonith_reconnect == NULL) { -- stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, -- te_connect_stonith, -- GINT_TO_POINTER(TRUE)); -- } -- set_bit(fsa_input_register, R_ST_REQUIRED); -- mainloop_set_trigger(stonith_reconnect); -+ controld_trigger_fencer_connect(); - - crm_notice("Pacemaker controller successfully started and accepting connections"); - clear_bit(fsa_input_register, R_STARTING); -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index cde57b5..92336e9 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -341,9 +341,9 @@ execute_stonith_cleanup() - * Functions that need to interact directly with the fencer via its API - */ - --stonith_t *stonith_api = NULL; --crm_trigger_t *stonith_reconnect = NULL; --char *te_client_id = NULL; -+static stonith_t *stonith_api = NULL; -+static crm_trigger_t *stonith_reconnect = NULL; -+static char *te_client_id = NULL; - - static gboolean - fail_incompletable_stonith(crm_graph_t *graph) -@@ -571,7 +571,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) - * \note If user_data is NULL, this will wait 2s between attempts, for up to - * 30 attempts, meaning the controller could be blocked as long as 58s. - */ --gboolean -+static gboolean - te_connect_stonith(gpointer user_data) - { - int rc = pcmk_ok; -@@ -619,6 +619,47 @@ te_connect_stonith(gpointer user_data) - return TRUE; - } - -+/*! 
-+ \internal -+ \brief Schedule fencer connection attempt in main loop -+*/ -+void -+controld_trigger_fencer_connect() -+{ -+ if (stonith_reconnect == NULL) { -+ stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, -+ te_connect_stonith, -+ GINT_TO_POINTER(TRUE)); -+ } -+ set_bit(fsa_input_register, R_ST_REQUIRED); -+ mainloop_set_trigger(stonith_reconnect); -+} -+ -+void -+controld_disconnect_fencer(bool destroy) -+{ -+ if (stonith_api) { -+ // Prevent fencer connection from coming up again -+ clear_bit(fsa_input_register, R_ST_REQUIRED); -+ -+ stonith_api->cmds->disconnect(stonith_api); -+ } -+ if (destroy) { -+ if (stonith_api) { -+ stonith_api->cmds->free(stonith_api); -+ stonith_api = NULL; -+ } -+ if (stonith_reconnect) { -+ mainloop_destroy_trigger(stonith_reconnect); -+ stonith_reconnect = NULL; -+ } -+ if (te_client_id) { -+ free(te_client_id); -+ te_client_id = NULL; -+ } -+ } -+} -+ - static gboolean - do_stonith_history_sync(gpointer user_data) - { -diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h -index b80a6c9..3ef537f 100644 ---- a/daemons/controld/controld_fencing.h -+++ b/daemons/controld/controld_fencing.h -@@ -13,16 +13,13 @@ - #include // bool - #include // crm_graph_t, crm_action_t - --extern crm_trigger_t *stonith_reconnect; --extern char *te_client_id; --extern stonith_t *stonith_api; -- - // stonith fail counts - void st_fail_count_reset(const char * target); - void update_stonith_max_attempts(const char* value); - - // stonith API client --gboolean te_connect_stonith(gpointer user_data); -+void controld_trigger_fencer_connect(void); -+void controld_disconnect_fencer(bool destroy); - gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action); - - // stonith cleanup list --- -1.8.3.1 - diff --git a/SOURCES/007-shutdown-lock.patch b/SOURCES/007-shutdown-lock.patch new file mode 100644 index 0000000..17e7588 --- /dev/null +++ b/SOURCES/007-shutdown-lock.patch @@ -0,0 +1,60 @@ +From f17c99492c7ab9e639b940a34d2a48b55937b605 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Jan 2020 16:00:36 -0600 +Subject: [PATCH 03/18] Low: tools: improve crm_resource "why" messages + +--- + tools/crm_resource_runtime.c | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index 9ae24b6..61ceee7 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -878,7 +878,7 @@ cli_cleanup_all(crm_ipc_t *crmd_channel, const char *node_name, + void + cli_resource_check(cib_t * cib_conn, resource_t *rsc) + { +- int need_nl = 0; ++ bool printed = false; + char *role_s = NULL; + char *managed = NULL; + resource_t *parent = uber_parent(rsc); +@@ -897,23 +897,26 @@ cli_resource_check(cib_t * cib_conn, resource_t *rsc) + // Treated as if unset + + } else if(role == RSC_ROLE_STOPPED) { +- printf("\n * The configuration specifies that '%s' should remain stopped\n", parent->id); +- need_nl++; ++ printf("\n * Configuration specifies '%s' should remain stopped\n", ++ parent->id); ++ printed = true; + + } else if (is_set(parent->flags, pe_rsc_promotable) + && (role == RSC_ROLE_SLAVE)) { +- printf("\n * The configuration specifies that '%s' should not be promoted\n", parent->id); +- need_nl++; ++ printf("\n * Configuration specifies '%s' should not be promoted\n", ++ parent->id); ++ printed = true; + } + } + +- if(managed && crm_is_true(managed) == FALSE) { +- printf("%s * The configuration prevents the cluster from stopping or 
starting '%s' (unmanaged)\n", need_nl == 0?"\n":"", parent->id); +- need_nl++; ++ if (managed && !crm_is_true(managed)) { ++ printf("%s * Configuration prevents cluster from stopping or starting unmanaged '%s'\n", ++ (printed? "" : "\n"), parent->id); ++ printed = true; + } + free(managed); + +- if(need_nl) { ++ if (printed) { + printf("\n"); + } + } +-- +1.8.3.1 + diff --git a/SOURCES/008-shutdown-lock.patch b/SOURCES/008-shutdown-lock.patch new file mode 100644 index 0000000..0592013 --- /dev/null +++ b/SOURCES/008-shutdown-lock.patch @@ -0,0 +1,122 @@ +From 736f255c18d4c99f1956fbb5ad4ac5bfc15bb841 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Jan 2020 16:23:25 -0600 +Subject: [PATCH 04/18] Low: tools: improve error checking for crm_resource + cleanup/fail commands + +Bail earlier for misconfigured resources, and return error (rather than hang) +for unknown or offline node. Also add timeout directly to controller request +rather than rely on the controller using the interval as default timeout. +--- + tools/crm_resource_runtime.c | 54 +++++++++++++++++++++++++++----------------- + 1 file changed, 33 insertions(+), 21 deletions(-) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index 61ceee7..2ea8bb3 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -468,8 +468,9 @@ send_lrm_rsc_op(crm_ipc_t * crmd_channel, const char *op, + int rc = -ECOMM; + xmlNode *cmd = NULL; + xmlNode *xml_rsc = NULL; +- const char *value = NULL; + const char *router_node = host_uname; ++ const char *rsc_class = NULL; ++ const char *rsc_type = NULL; + xmlNode *params = NULL; + xmlNode *msg_data = NULL; + resource_t *rsc = pe_find_resource(data_set->resources, rsc_id); +@@ -481,27 +482,49 @@ send_lrm_rsc_op(crm_ipc_t * crmd_channel, const char *op, + } else if (rsc->variant != pe_native) { + CMD_ERR("We can only process primitive resources, not %s", rsc_id); + return -EINVAL; ++ } + +- } else if (host_uname == NULL) { ++ rsc_class = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); ++ rsc_type = crm_element_value(rsc->xml, XML_ATTR_TYPE); ++ if ((rsc_class == NULL) || (rsc_type == NULL)) { ++ CMD_ERR("Resource %s does not have a class and type", rsc_id); ++ return -EINVAL; ++ } ++ ++ if (host_uname == NULL) { + CMD_ERR("Please specify a node name"); + return -EINVAL; ++ + } else { +- node_t *node = pe_find_node(data_set->nodes, host_uname); ++ pe_node_t *node = pe_find_node(data_set->nodes, host_uname); + ++ if (node == NULL) { ++ CMD_ERR("Node %s not found", host_uname); ++ return -pcmk_err_node_unknown; ++ } ++ ++ if (!(node->details->online)) { ++ CMD_ERR("Node %s is not online", host_uname); ++ return -ENOTCONN; ++ } + if (pe__is_guest_or_remote_node(node)) { + node = pe__current_node(node->details->remote_rsc); + if (node == NULL) { + CMD_ERR("No cluster connection to Pacemaker Remote node %s detected", + host_uname); +- return -ENXIO; ++ return -ENOTCONN; + } + router_node = node->details->uname; + } + } + +- key = generate_transition_key(0, getpid(), 0, "xxxxxxxx-xrsc-opxx-xcrm-resourcexxxx"); +- + msg_data = create_xml_node(NULL, XML_GRAPH_TAG_RSC_OP); ++ ++ /* The controller logs the transition key from requests, so we need to have ++ * *something* for it. 
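The hunks above make send_lrm_rsc_op() validate its inputs before building a request: a resource without class and type, an unknown node name, or an offline node now yields an immediate error instead of a request the controller can never answer. The node checks reduce to a small ladder; here is a sketch of just that part (pe_find_node() and the return codes are taken from the patch, while the wrapper function and its includes are assumptions):

    #include <errno.h>                  /* ENOTCONN */
    #include <crm/crm.h>                /* assumed: pcmk_ok, pcmk_err_node_unknown */
    #include <crm/pengine/status.h>     /* assumed: pe_working_set_t, pe_node_t */

    /* Return pcmk_ok if host_uname can accept a controller request,
     * or a negative errno-style code explaining why it cannot. */
    static int
    check_target_node(pe_working_set_t *data_set, const char *host_uname)
    {
        pe_node_t *node = pe_find_node(data_set->nodes, host_uname);

        if (node == NULL) {
            return -pcmk_err_node_unknown;  /* name not known to the cluster */
        }
        if (!node->details->online) {
            return -ENOTCONN;               /* known, but not reachable */
        }
        return pcmk_ok;
    }

Returning distinct codes for "unknown" and "offline" is what lets crm_resource report a precise reason instead of hanging while waiting for a reply that will never come.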
++ */ ++ key = generate_transition_key(0, getpid(), 0, ++ "xxxxxxxx-xrsc-opxx-xcrm-resourcexxxx"); + crm_xml_add(msg_data, XML_ATTR_TRANSITION_KEY, key); + free(key); + +@@ -519,31 +542,20 @@ send_lrm_rsc_op(crm_ipc_t * crmd_channel, const char *op, + crm_xml_add(xml_rsc, XML_ATTR_ID, rsc->id); + } + +- value = crm_copy_xml_element(rsc->xml, xml_rsc, XML_ATTR_TYPE); +- if (value == NULL) { +- CMD_ERR("%s has no type! Aborting...", rsc_id); +- return -ENXIO; +- } +- +- value = crm_copy_xml_element(rsc->xml, xml_rsc, XML_AGENT_ATTR_CLASS); +- if (value == NULL) { +- CMD_ERR("%s has no class! Aborting...", rsc_id); +- return -ENXIO; +- } +- ++ crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, rsc_class); + crm_copy_xml_element(rsc->xml, xml_rsc, XML_AGENT_ATTR_PROVIDER); ++ crm_xml_add(xml_rsc, XML_ATTR_TYPE, rsc_type); + + params = create_xml_node(msg_data, XML_TAG_ATTRS); + crm_xml_add(params, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET); + +- key = crm_meta_name(XML_LRM_ATTR_INTERVAL_MS); ++ // The controller parses the timeout from the request ++ key = crm_meta_name(XML_ATTR_TIMEOUT); + crm_xml_add(params, key, "60000"); /* 1 minute */ + free(key); + + our_pid = crm_getpid_s(); + cmd = create_request(op, msg_data, router_node, CRM_SYSTEM_CRMD, crm_system_name, our_pid); +- +-/* crm_log_xml_warn(cmd, "send_lrm_rsc_op"); */ + free_xml(msg_data); + + if (crm_ipc_send(crmd_channel, cmd, 0, 0, NULL) > 0) { +-- +1.8.3.1 + diff --git a/SOURCES/008-stonith_admin-header-refactoring.patch b/SOURCES/008-stonith_admin-header-refactoring.patch deleted file mode 100644 index 97a8b8d..0000000 --- a/SOURCES/008-stonith_admin-header-refactoring.patch +++ /dev/null @@ -1,1715 +0,0 @@ -From 756a3e522aa444b456e21128a52317226b346005 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 May 2019 15:26:20 -0500 -Subject: [PATCH 01/11] Doc: libpacemaker: correct doxygen block for shutdown - op creator - -copy/paste error ---- - lib/pacemaker/pcmk_sched_utils.c | 5 +---- - 1 file changed, 1 insertion(+), 4 deletions(-) - -diff --git a/lib/pacemaker/pcmk_sched_utils.c b/lib/pacemaker/pcmk_sched_utils.c -index 5342e51..7b5cb7d 100644 ---- a/lib/pacemaker/pcmk_sched_utils.c -+++ b/lib/pacemaker/pcmk_sched_utils.c -@@ -446,10 +446,7 @@ pe_cancel_op(pe_resource_t *rsc, const char *task, guint interval_ms, - * \internal - * \brief Create a shutdown op for a scheduler transition - * -- * \param[in] rsc Resource of action to cancel -- * \param[in] task Name of action to cancel -- * \param[in] interval_ms Interval of action to cancel -- * \param[in] node Node of action to cancel -+ * \param[in] node Node being shut down - * \param[in] data_set Working set of cluster - * - * \return Created op --- -1.8.3.1 - - -From 5249dd9295307c0e22e223ea7d6f5f24a0a3fe25 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 24 May 2019 10:17:15 -0500 -Subject: [PATCH 02/11] Refactor: libpe_status: rename target rc function - -... 
in line with current naming standards, to avoid confusion with -controller function of the same name ---- - include/crm/pengine/internal.h | 2 +- - lib/pengine/failcounts.c | 4 ++-- - lib/pengine/unpack.c | 4 ++-- - 3 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h -index 6d22db7..fd55bb9 100644 ---- a/include/crm/pengine/internal.h -+++ b/include/crm/pengine/internal.h -@@ -288,7 +288,7 @@ pe_base_name_eq(resource_t *rsc, const char *id) - return FALSE; - } - --int get_target_rc(xmlNode * xml_op); -+int pe__target_rc_from_xml(xmlNode *xml_op); - - gint sort_node_uname(gconstpointer a, gconstpointer b); - bool is_set_recursive(resource_t * rsc, long long flag, bool any); -diff --git a/lib/pengine/failcounts.c b/lib/pengine/failcounts.c -index 8f01c07..0c8ca5d 100644 ---- a/lib/pengine/failcounts.c -+++ b/lib/pengine/failcounts.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2008-2018 Andrew Beekhof -+ * Copyright 2008-2019 the Pacemaker project contributors - * - * This source code is licensed under the GNU Lesser General Public License - * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. -@@ -62,7 +62,7 @@ is_matched_failure(const char *rsc_id, xmlNode *conf_op_xml, - - if (safe_str_eq(expected_op_key, lrm_op_id)) { - int rc = 0; -- int target_rc = get_target_rc(lrm_op_xml); -+ int target_rc = pe__target_rc_from_xml(lrm_op_xml); - - crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc); - if (rc != target_rc) { -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 02cef2c..0e8177b 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3013,7 +3013,7 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod - return expired; - } - --int get_target_rc(xmlNode *xml_op) -+int pe__target_rc_from_xml(xmlNode *xml_op) - { - int target_rc = 0; - const char *key = crm_element_value(xml_op, XML_ATTR_TRANSITION_KEY); -@@ -3141,7 +3141,7 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last - - int rc = 0; - int status = PCMK_LRM_OP_UNKNOWN; -- int target_rc = get_target_rc(xml_op); -+ int target_rc = pe__target_rc_from_xml(xml_op); - guint interval_ms = 0; - - gboolean expired = FALSE; --- -1.8.3.1 - - -From 2ccbefc2b623a2671f14824c6aea87c87fc338a0 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 24 May 2019 09:59:35 -0500 -Subject: [PATCH 03/11] Refactor: libpacemaker: make transition.h internal - -transition.h has always been installed even though it was purely internal. -Since libtransitioner is now merged into libpacemaker, move transition.h -to include/pcmki/pcmki_transition.h and make it internal. - -Also, get rid of pcmki_sched_transition.h since it no longer contains -anything transition-related, and move its two function declarations to -pcmki_sched_utils.h. 
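Patch 03/11 relocates declarations without changing behavior: the graph engine's entry points stay the same, they simply stop being installed as public headers. For orientation, a minimal consumer of the API declared in the relocated header (reproduced below) could look like this; it is an illustrative sketch rather than daemon code, and the include line assumes the internal umbrella header shown in the patch:

    #include <pacemaker-internal.h>   /* pulls in pcmki/pcmki_transition.h */

    static void
    run_example_transition(xmlNode *xml_graph)
    {
        crm_graph_t *graph = unpack_graph(xml_graph, "example-ref");
        int status;

        if (graph == NULL) {
            return;                     /* graph XML could not be parsed */
        }
        set_default_graph_functions();  /* stock pseudo/rsc/crmd/stonith handlers */
        status = run_graph(graph);      /* fire all currently runnable actions */
        crm_info("Graph %d finished in state %s",
                 graph->id, transition_status(status));
        destroy_graph(graph);
    }

In the daemons, run_graph() is re-entered from the main loop as action results arrive; a single call, as here, only dispatches whatever is runnable at that moment.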
---- - daemons/controld/controld_execd_state.c | 4 +- - daemons/controld/controld_fencing.h | 2 +- - daemons/controld/controld_join_dc.c | 4 +- - daemons/controld/controld_transition.h | 2 +- - daemons/controld/controld_utils.h | 7 +- - include/crm/Makefile.am | 5 +- - include/crm/transition.h | 153 -------------------------------- - include/pacemaker-internal.h | 2 +- - include/pcmki/Makefile.am | 12 +-- - include/pcmki/pcmki_sched_transition.h | 22 ----- - include/pcmki/pcmki_sched_utils.h | 16 ++++ - include/pcmki/pcmki_transition.h | 143 +++++++++++++++++++++++++++++ - lib/pacemaker/pcmk_sched_transition.c | 3 +- - lib/pacemaker/pcmk_trans_graph.c | 27 ++---- - lib/pacemaker/pcmk_trans_unpack.c | 23 ++--- - lib/pacemaker/pcmk_trans_utils.c | 27 ++---- - tools/crm_simulate.c | 1 - - 17 files changed, 201 insertions(+), 252 deletions(-) - delete mode 100644 include/crm/transition.h - delete mode 100644 include/pcmki/pcmki_sched_transition.h - create mode 100644 include/pcmki/pcmki_transition.h - -diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c -index 8a1a7f3..4e9f096 100644 ---- a/daemons/controld/controld_execd_state.c -+++ b/daemons/controld/controld_execd_state.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2012-2018 David Vossel -+ * Copyright 2012-2019 the Pacemaker project contributors - * - * This source code is licensed under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. -@@ -10,6 +10,7 @@ - #include - #include - -+#include - #include - #include - #include -@@ -18,7 +19,6 @@ - #include - #include - #include --#include - #include - - GHashTable *lrm_state_table = NULL; -diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h -index 3ef537f..8f7f19b 100644 ---- a/daemons/controld/controld_fencing.h -+++ b/daemons/controld/controld_fencing.h -@@ -11,7 +11,7 @@ - # define CONTROLD_FENCING__H - - #include // bool --#include // crm_graph_t, crm_action_t -+#include // crm_graph_t, crm_action_t - - // stonith fail counts - void st_fail_count_reset(const char * target); -diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c -index ddee895..d790d9a 100644 ---- a/daemons/controld/controld_join_dc.c -+++ b/daemons/controld/controld_join_dc.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2018 Andrew Beekhof -+ * Copyright 2004-2019 the Pacemaker project contributors - * - * This source code is licensed under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. 
-@@ -15,7 +15,7 @@ - - #include - #include --#include "controld_transition.h" -+#include - - char *max_epoch = NULL; - char *max_generation_from = NULL; -diff --git a/daemons/controld/controld_transition.h b/daemons/controld/controld_transition.h -index f31ac2d..192a9e8 100644 ---- a/daemons/controld/controld_transition.h -+++ b/daemons/controld/controld_transition.h -@@ -8,10 +8,10 @@ - #ifndef TENGINE__H - # define TENGINE__H - --# include - # include - # include - # include -+# include - - /* tengine */ - extern crm_action_t *match_down_event(const char *target); -diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h -index 8b80e3c..1946a82 100644 ---- a/daemons/controld/controld_utils.h -+++ b/daemons/controld/controld_utils.h -@@ -11,11 +11,10 @@ - # define CRMD_UTILS__H - - # include --# include - # include --# include /* For CIB_OP_MODIFY */ --# include "controld_fsa.h" // For fsa_cib_conn --# include "controld_alerts.h" -+# include // CIB_OP_MODIFY -+# include // fsa_cib_conn -+# include - - # define FAKE_TE_ID "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" - -diff --git a/include/crm/Makefile.am b/include/crm/Makefile.am -index 3f5f4bf..bc939eb 100644 ---- a/include/crm/Makefile.am -+++ b/include/crm/Makefile.am -@@ -1,5 +1,5 @@ - # --# Copyright 2004-2018 the Pacemaker project contributors -+# Copyright 2004-2019 the Pacemaker project contributors - # - # The version control history for this file may have further details. - # -@@ -22,7 +22,6 @@ MAINTAINERCLEANFILES = Makefile.in - headerdir=$(pkgincludedir)/crm - - header_HEADERS = attrd.h cib.h cluster.h compatibility.h crm.h \ -- lrmd.h msg_xml.h services.h stonith-ng.h \ -- transition.h -+ lrmd.h msg_xml.h services.h stonith-ng.h - - SUBDIRS = common pengine cib fencing cluster -diff --git a/include/crm/transition.h b/include/crm/transition.h -deleted file mode 100644 -index 6e9a875..0000000 ---- a/include/crm/transition.h -+++ /dev/null -@@ -1,153 +0,0 @@ --/* -- * Copyright 2004-2018 the Pacemaker project contributors -- * -- * The version control history for this file may have further details. -- * -- * This program is free software; you can redistribute it and/or -- * modify it under the terms of the GNU Lesser General Public -- * License as published by the Free Software Foundation; either -- * version 2 of the License, or (at your option) any later version. -- * -- * This software is distributed in the hope that it will be useful, -- * but WITHOUT ANY WARRANTY; without even the implied warranty of -- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- * General Public License for more details. 
-- * -- * You should have received a copy of the GNU Lesser General Public -- * License along with this library; if not, write to the Free Software -- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -- */ --#ifndef CRM_TRANSITION__H --# define CRM_TRANSITION__H -- --#ifdef __cplusplus --extern "C" { --#endif -- --#include --#include --#include -- --typedef enum { -- action_type_pseudo, -- action_type_rsc, -- action_type_crm --} action_type_e; -- --typedef struct te_timer_s crm_action_timer_t; --typedef struct crm_graph_s crm_graph_t; -- --typedef struct synapse_s { -- int id; -- int priority; -- -- gboolean ready; -- gboolean failed; -- gboolean executed; -- gboolean confirmed; -- -- GListPtr actions; /* crm_action_t* */ -- GListPtr inputs; /* crm_action_t* */ --} synapse_t; -- --typedef struct crm_action_s { -- int id; -- int timeout; -- guint interval_ms; -- GHashTable *params; -- action_type_e type; -- -- crm_action_timer_t *timer; -- synapse_t *synapse; -- -- gboolean sent_update; /* sent to the CIB */ -- gboolean executed; /* sent to the CRM */ -- gboolean confirmed; -- -- gboolean failed; -- gboolean can_fail; -- -- xmlNode *xml; -- --} crm_action_t; -- --struct te_timer_s { -- int source_id; -- int timeout; -- crm_action_t *action; --}; -- --/* order matters here */ --enum transition_action { -- tg_done, -- tg_stop, -- tg_restart, -- tg_shutdown, --}; -- --struct crm_graph_s { -- int id; -- char *source; -- int abort_priority; -- -- gboolean complete; -- const char *abort_reason; -- enum transition_action completion_action; -- -- int num_actions; -- int num_synapses; -- -- int batch_limit; -- int network_delay; -- int stonith_timeout; -- int transition_timeout; -- -- int fired; -- int pending; -- int skipped; -- int completed; -- int incomplete; -- -- GListPtr synapses; /* synapse_t* */ -- -- int migration_limit; --}; -- --typedef struct crm_graph_functions_s { -- gboolean(*pseudo) (crm_graph_t * graph, crm_action_t * action); -- gboolean(*rsc) (crm_graph_t * graph, crm_action_t * action); -- gboolean(*crmd) (crm_graph_t * graph, crm_action_t * action); -- gboolean(*stonith) (crm_graph_t * graph, crm_action_t * action); -- gboolean(*allowed) (crm_graph_t * graph, crm_action_t * action); --} crm_graph_functions_t; -- --enum transition_status { -- transition_active, -- transition_pending, /* active but no actions performed this time */ -- transition_complete, -- transition_stopped, -- transition_terminated, -- transition_action_failed, -- transition_failed, --}; -- --void set_default_graph_functions(void); --void set_graph_functions(crm_graph_functions_t * fns); --crm_graph_t *unpack_graph(xmlNode * xml_graph, const char *reference); --int run_graph(crm_graph_t * graph); --gboolean update_graph(crm_graph_t * graph, crm_action_t * action); --void destroy_graph(crm_graph_t * graph); --const char *transition_status(enum transition_status state); --void print_graph(unsigned int log_level, crm_graph_t * graph); --void print_action(int log_level, const char *prefix, crm_action_t * action); --bool update_abort_priority(crm_graph_t * graph, int priority, -- enum transition_action action, const char *abort_reason); --const char *actiontype2text(action_type_e type); --lrmd_event_data_t *convert_graph_action(xmlNode * resource, crm_action_t * action, int status, -- int rc); -- --#ifdef __cplusplus --} --#endif -- --#endif -diff --git a/include/pacemaker-internal.h b/include/pacemaker-internal.h -index 3627ba5..51d7225 100644 ---- a/include/pacemaker-internal.h -+++ 
b/include/pacemaker-internal.h -@@ -13,8 +13,8 @@ - # include - # include - # include --# include - # include - # include -+# include - - #endif -diff --git a/include/pcmki/Makefile.am b/include/pcmki/Makefile.am -index b163e89..4cf1cf2 100644 ---- a/include/pcmki/Makefile.am -+++ b/include/pcmki/Makefile.am -@@ -9,11 +9,11 @@ - - MAINTAINERCLEANFILES = Makefile.in - --noinst_HEADERS = pcmki_error.h \ -- pcmki_sched_allocate.h \ -- pcmki_sched_notif.h \ -- pcmki_sched_transition.h \ -- pcmki_sched_utils.h \ -- pcmki_scheduler.h -+noinst_HEADERS = pcmki_error.h \ -+ pcmki_sched_allocate.h \ -+ pcmki_sched_notif.h \ -+ pcmki_sched_utils.h \ -+ pcmki_scheduler.h \ -+ pcmki_transition.h - - .PHONY: $(ARCHIVE_VERSION) -diff --git a/include/pcmki/pcmki_sched_transition.h b/include/pcmki/pcmki_sched_transition.h -deleted file mode 100644 -index 41f5d61..0000000 ---- a/include/pcmki/pcmki_sched_transition.h -+++ /dev/null -@@ -1,22 +0,0 @@ --/* -- * Copyright 2014-2019 the Pacemaker project contributors -- * -- * The version control history for this file may have further details. -- * -- * This source code is licensed under the GNU Lesser General Public License -- * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. -- */ --#ifndef SCHED_TRANSITION__H --# define SCHED_TRANSITION__H -- --#include -- --void modify_configuration( -- pe_working_set_t * data_set, cib_t *cib, -- const char *quorum, const char *watchdog, GListPtr node_up, GListPtr node_down, GListPtr node_fail, -- GListPtr op_inject, GListPtr ticket_grant, GListPtr ticket_revoke, -- GListPtr ticket_standby, GListPtr ticket_activate); -- --int run_simulation(pe_working_set_t * data_set, cib_t *cib, GListPtr op_fail_list, bool quiet); -- --#endif -diff --git a/include/pcmki/pcmki_sched_utils.h b/include/pcmki/pcmki_sched_utils.h -index b47a2bb..4361235 100644 ---- a/include/pcmki/pcmki_sched_utils.h -+++ b/include/pcmki/pcmki_sched_utils.h -@@ -10,6 +10,14 @@ - #ifndef PENGINE_AUTILS__H - # define PENGINE_AUTILS__H - -+#include // bool -+#include // GList, GHashTable, gboolean, guint -+#include // GListPtr -+#include // cib_t -+#include -+#include -+#include -+ - /* Constraint helper functions */ - extern rsc_colocation_t *invert_constraint(rsc_colocation_t * constraint); - -@@ -75,4 +83,12 @@ pe_action_t *sched_shutdown_op(pe_node_t *node, pe_working_set_t *data_set); - - # define LOAD_STOPPED "load_stopped" - -+void modify_configuration( -+ pe_working_set_t * data_set, cib_t *cib, -+ const char *quorum, const char *watchdog, GListPtr node_up, GListPtr node_down, GListPtr node_fail, -+ GListPtr op_inject, GListPtr ticket_grant, GListPtr ticket_revoke, -+ GListPtr ticket_standby, GListPtr ticket_activate); -+ -+int run_simulation(pe_working_set_t * data_set, cib_t *cib, GListPtr op_fail_list, bool quiet); -+ - #endif -diff --git a/include/pcmki/pcmki_transition.h b/include/pcmki/pcmki_transition.h -new file mode 100644 -index 0000000..d9a0ff6 ---- /dev/null -+++ b/include/pcmki/pcmki_transition.h -@@ -0,0 +1,143 @@ -+/* -+ * Copyright 2004-2019 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU Lesser General Public License -+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
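The header being added below models a transition as a list of synapses, each carrying its prerequisite inputs and its actions; controller code such as fail_incompletable_stonith() earlier in this series walks that structure with nested GList loops. A skeletal traversal in the same style, assuming only the types declared just below:

    /* Count actions that have not yet been confirmed complete. */
    static int
    count_unconfirmed_actions(crm_graph_t *graph)
    {
        int unconfirmed = 0;

        for (GListPtr lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
            synapse_t *synapse = (synapse_t *) lpc->data;

            if (synapse->confirmed) {
                continue;   /* every action in this synapse is done */
            }
            for (GListPtr lpc2 = synapse->actions; lpc2 != NULL;
                 lpc2 = lpc2->next) {
                crm_action_t *action = (crm_action_t *) lpc2->data;

                if (!action->confirmed) {
                    unconfirmed++;
                }
            }
        }
        return unconfirmed;
    }

The synapse's own confirmed flag acts as a summary bit, so traversals can skip whole synapses without inspecting each action, which is exactly how the controller code above short-circuits.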
-+ */ -+ -+#ifndef CRM_TRANSITION__H -+# define CRM_TRANSITION__H -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+#include -+#include -+#include -+ -+typedef enum { -+ action_type_pseudo, -+ action_type_rsc, -+ action_type_crm -+} action_type_e; -+ -+typedef struct te_timer_s crm_action_timer_t; -+typedef struct crm_graph_s crm_graph_t; -+ -+typedef struct synapse_s { -+ int id; -+ int priority; -+ -+ gboolean ready; -+ gboolean failed; -+ gboolean executed; -+ gboolean confirmed; -+ -+ GListPtr actions; /* crm_action_t* */ -+ GListPtr inputs; /* crm_action_t* */ -+} synapse_t; -+ -+typedef struct crm_action_s { -+ int id; -+ int timeout; -+ guint interval_ms; -+ GHashTable *params; -+ action_type_e type; -+ -+ crm_action_timer_t *timer; -+ synapse_t *synapse; -+ -+ gboolean sent_update; /* sent to the CIB */ -+ gboolean executed; /* sent to the CRM */ -+ gboolean confirmed; -+ -+ gboolean failed; -+ gboolean can_fail; -+ -+ xmlNode *xml; -+ -+} crm_action_t; -+ -+struct te_timer_s { -+ int source_id; -+ int timeout; -+ crm_action_t *action; -+}; -+ -+/* order matters here */ -+enum transition_action { -+ tg_done, -+ tg_stop, -+ tg_restart, -+ tg_shutdown, -+}; -+ -+struct crm_graph_s { -+ int id; -+ char *source; -+ int abort_priority; -+ -+ gboolean complete; -+ const char *abort_reason; -+ enum transition_action completion_action; -+ -+ int num_actions; -+ int num_synapses; -+ -+ int batch_limit; -+ int network_delay; -+ int stonith_timeout; -+ int transition_timeout; -+ -+ int fired; -+ int pending; -+ int skipped; -+ int completed; -+ int incomplete; -+ -+ GListPtr synapses; /* synapse_t* */ -+ -+ int migration_limit; -+}; -+ -+typedef struct crm_graph_functions_s { -+ gboolean(*pseudo) (crm_graph_t * graph, crm_action_t * action); -+ gboolean(*rsc) (crm_graph_t * graph, crm_action_t * action); -+ gboolean(*crmd) (crm_graph_t * graph, crm_action_t * action); -+ gboolean(*stonith) (crm_graph_t * graph, crm_action_t * action); -+ gboolean(*allowed) (crm_graph_t * graph, crm_action_t * action); -+} crm_graph_functions_t; -+ -+enum transition_status { -+ transition_active, -+ transition_pending, /* active but no actions performed this time */ -+ transition_complete, -+ transition_stopped, -+ transition_terminated, -+ transition_action_failed, -+ transition_failed, -+}; -+ -+void set_default_graph_functions(void); -+void set_graph_functions(crm_graph_functions_t * fns); -+crm_graph_t *unpack_graph(xmlNode * xml_graph, const char *reference); -+int run_graph(crm_graph_t * graph); -+gboolean update_graph(crm_graph_t * graph, crm_action_t * action); -+void destroy_graph(crm_graph_t * graph); -+const char *transition_status(enum transition_status state); -+void print_graph(unsigned int log_level, crm_graph_t * graph); -+void print_action(int log_level, const char *prefix, crm_action_t * action); -+bool update_abort_priority(crm_graph_t * graph, int priority, -+ enum transition_action action, const char *abort_reason); -+const char *actiontype2text(action_type_e type); -+lrmd_event_data_t *convert_graph_action(xmlNode * resource, crm_action_t * action, int status, -+ int rc); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif -diff --git a/lib/pacemaker/pcmk_sched_transition.c b/lib/pacemaker/pcmk_sched_transition.c -index 0fa5709..8ab8d82 100644 ---- a/lib/pacemaker/pcmk_sched_transition.c -+++ b/lib/pacemaker/pcmk_sched_transition.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2009-2018 Andrew Beekhof -+ * Copyright 2009-2019 the Pacemaker project contributors - * - * This source code is licensed 
under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. -@@ -19,7 +19,6 @@ - #include - #include - #include --#include - #include - #include - #include -diff --git a/lib/pacemaker/pcmk_trans_graph.c b/lib/pacemaker/pcmk_trans_graph.c -index 71568dd..77980e5 100644 ---- a/lib/pacemaker/pcmk_trans_graph.c -+++ b/lib/pacemaker/pcmk_trans_graph.c -@@ -1,19 +1,10 @@ --/* -- * Copyright (C) 2004 Andrew Beekhof -- * -- * This library is free software; you can redistribute it and/or -- * modify it under the terms of the GNU Lesser General Public -- * License as published by the Free Software Foundation; either -- * version 2.1 of the License, or (at your option) any later version. -- * -- * This library is distributed in the hope that it will be useful, -- * but WITHOUT ANY WARRANTY; without even the implied warranty of -- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- * Lesser General Public License for more details. -- * -- * You should have received a copy of the GNU Lesser General Public -- * License along with this library; if not, write to the Free Software -- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+/* -+ * Copyright 2004-2019 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU Lesser General Public License -+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. - */ - - #include -@@ -21,9 +12,7 @@ - #include - #include - #include --#include --/* #include */ --/* */ -+#include - - crm_graph_functions_t *graph_fns = NULL; - -diff --git a/lib/pacemaker/pcmk_trans_unpack.c b/lib/pacemaker/pcmk_trans_unpack.c -index 31e39cb..b8147a9 100644 ---- a/lib/pacemaker/pcmk_trans_unpack.c -+++ b/lib/pacemaker/pcmk_trans_unpack.c -@@ -1,30 +1,21 @@ - /* -- * Copyright (C) 2004 Andrew Beekhof -+ * Copyright 2004-2019 the Pacemaker project contributors - * -- * This library is free software; you can redistribute it and/or -- * modify it under the terms of the GNU Lesser General Public -- * License as published by the Free Software Foundation; either -- * version 2.1 of the License, or (at your option) any later version. -+ * The version control history for this file may have further details. - * -- * This library is distributed in the hope that it will be useful, -- * but WITHOUT ANY WARRANTY; without even the implied warranty of -- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- * Lesser General Public License for more details. -- * -- * You should have received a copy of the GNU Lesser General Public -- * License along with this library; if not, write to the Free Software -- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ * This source code is licensed under the GNU Lesser General Public License -+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
- */ - - #include - - #include -+#include -+ - #include - #include -- - #include --#include --#include -+#include - - CRM_TRACE_INIT_DATA(transitioner); - -diff --git a/lib/pacemaker/pcmk_trans_utils.c b/lib/pacemaker/pcmk_trans_utils.c -index d3199d9..69da7f4 100644 ---- a/lib/pacemaker/pcmk_trans_utils.c -+++ b/lib/pacemaker/pcmk_trans_utils.c -@@ -1,19 +1,10 @@ --/* -- * Copyright (C) 2004 Andrew Beekhof -- * -- * This library is free software; you can redistribute it and/or -- * modify it under the terms of the GNU Lesser General Public -- * License as published by the Free Software Foundation; either -- * version 2.1 of the License, or (at your option) any later version. -- * -- * This library is distributed in the hope that it will be useful, -- * but WITHOUT ANY WARRANTY; without even the implied warranty of -- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- * Lesser General Public License for more details. -- * -- * You should have received a copy of the GNU Lesser General Public -- * License along with this library; if not, write to the Free Software -- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+/* -+ * Copyright 2004-2019 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU Lesser General Public License -+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. - */ - - #include -@@ -21,9 +12,7 @@ - #include - #include - #include --#include --/* #include */ --/* */ -+#include - - extern crm_graph_functions_t *graph_fns; - -diff --git a/tools/crm_simulate.c b/tools/crm_simulate.c -index 1921ee4..d4ab6a3 100644 ---- a/tools/crm_simulate.c -+++ b/tools/crm_simulate.c -@@ -21,7 +21,6 @@ - #include - #include - #include --#include - #include - #include - #include --- -1.8.3.1 - - -From 8a0b29d8ed21c97075b4c059fa4b0f5c0d985a73 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jun 2019 14:18:37 -0500 -Subject: [PATCH 04/11] Test: scheduler: explicitly set concurrent-fencing in - relevant regression tests - -... 
since concurrent-fencing's default is likely to eventually change, -which would otherwise affect the results of these tests ---- - cts/scheduler/rec-node-14.xml | 1 + - cts/scheduler/remote-connection-unrecoverable.xml | 1 + - cts/scheduler/remote-recover-all.xml | 1 + - cts/scheduler/remote-recover-no-resources.xml | 1 + - cts/scheduler/remote-recover-unknown.xml | 1 + - cts/scheduler/stonith-4.xml | 1 + - cts/scheduler/suicide-needed-inquorate.xml | 1 + - cts/scheduler/ticket-clone-21.xml | 1 + - cts/scheduler/ticket-clone-9.xml | 1 + - 9 files changed, 9 insertions(+) - -diff --git a/cts/scheduler/rec-node-14.xml b/cts/scheduler/rec-node-14.xml -index 60307ba..aefa410 100644 ---- a/cts/scheduler/rec-node-14.xml -+++ b/cts/scheduler/rec-node-14.xml -@@ -4,6 +4,7 @@ - - - -+ - - - -diff --git a/cts/scheduler/remote-connection-unrecoverable.xml b/cts/scheduler/remote-connection-unrecoverable.xml -index df9fee2..efec646 100644 ---- a/cts/scheduler/remote-connection-unrecoverable.xml -+++ b/cts/scheduler/remote-connection-unrecoverable.xml -@@ -7,6 +7,7 @@ - - - -+ - - - -diff --git a/cts/scheduler/remote-recover-all.xml b/cts/scheduler/remote-recover-all.xml -index 0ade7cd..1680166 100644 ---- a/cts/scheduler/remote-recover-all.xml -+++ b/cts/scheduler/remote-recover-all.xml -@@ -10,6 +10,7 @@ - - - -+ - - - -diff --git a/cts/scheduler/remote-recover-no-resources.xml b/cts/scheduler/remote-recover-no-resources.xml -index 37708bb..602ed2b 100644 ---- a/cts/scheduler/remote-recover-no-resources.xml -+++ b/cts/scheduler/remote-recover-no-resources.xml -@@ -10,6 +10,7 @@ - - - -+ - - - -diff --git a/cts/scheduler/remote-recover-unknown.xml b/cts/scheduler/remote-recover-unknown.xml -index f070f11..f47a841 100644 ---- a/cts/scheduler/remote-recover-unknown.xml -+++ b/cts/scheduler/remote-recover-unknown.xml -@@ -10,6 +10,7 @@ - - - -+ - - - -diff --git a/cts/scheduler/stonith-4.xml b/cts/scheduler/stonith-4.xml -index 7979462..dd7af8d 100644 ---- a/cts/scheduler/stonith-4.xml -+++ b/cts/scheduler/stonith-4.xml -@@ -4,6 +4,7 @@ - - - -+ - - - -diff --git a/cts/scheduler/suicide-needed-inquorate.xml b/cts/scheduler/suicide-needed-inquorate.xml -index e626ea6..f87422b 100644 ---- a/cts/scheduler/suicide-needed-inquorate.xml -+++ b/cts/scheduler/suicide-needed-inquorate.xml -@@ -6,6 +6,7 @@ - - - -+ - - - -diff --git a/cts/scheduler/ticket-clone-21.xml b/cts/scheduler/ticket-clone-21.xml -index bb1f044..efd5294 100644 ---- a/cts/scheduler/ticket-clone-21.xml -+++ b/cts/scheduler/ticket-clone-21.xml -@@ -4,6 +4,7 @@ - - - -+ - - - -diff --git a/cts/scheduler/ticket-clone-9.xml b/cts/scheduler/ticket-clone-9.xml -index e77210d..c6d5809 100644 ---- a/cts/scheduler/ticket-clone-9.xml -+++ b/cts/scheduler/ticket-clone-9.xml -@@ -4,6 +4,7 @@ - - - -+ - - - --- -1.8.3.1 - - -From 359f0e6089ef618361acc2437d779ecad1edb8d3 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jun 2019 11:22:58 -0500 -Subject: [PATCH 05/11] Build: doc: define variable properly - -broke when moved from GNUmakefile to Makefile.am ---- - doc/Makefile.am | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/doc/Makefile.am b/doc/Makefile.am -index f3b79bb..18747cf 100644 ---- a/doc/Makefile.am -+++ b/doc/Makefile.am -@@ -25,6 +25,9 @@ RSYNC_DEST ?= root@www.clusterlabs.org:/var/www/html - # don't cross filesystems, sparse, show progress - RSYNC_OPTS = -rlptvzxS --progress - -+LAST_RELEASE ?= Pacemaker-$(VERSION) -+TAG ?= $(shell git log --pretty=format:%H -n 1 HEAD) -+ - publican_docs = - generated_docs 
= - generated_mans = -@@ -364,9 +367,6 @@ doxygen-upload: doxygen - - # ABI compatibility report as HTML - --LAST_RELEASE ?= Pacemaker-$(VERSION) --TAG ?= $(git log --pretty=format:%H -n 1 HEAD) -- - abi: abi-check - ./abi-check $(PACKAGE) $(LAST_RELEASE) $(TAG) - --- -1.8.3.1 - - -From c0e1ff4cc8578b78b085b98effff11747f81a397 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 10 Jun 2019 11:38:51 -0500 -Subject: [PATCH 06/11] Doc: doxygen: avoid full paths in output graphics - -broke when doxygen target moved from toplevel to doc subdirectory ---- - doc/Doxyfile.in | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in -index 81615fb..bc449df 100644 ---- a/doc/Doxyfile.in -+++ b/doc/Doxyfile.in -@@ -142,7 +142,7 @@ FULL_PATH_NAMES = YES - # will be relative from the directory where doxygen is started. - # This tag requires that the tag FULL_PATH_NAMES is set to YES. - --STRIP_FROM_PATH = -+STRIP_FROM_PATH = .. - - # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the - # path mentioned in the documentation of a class, which tells the reader which -@@ -151,7 +151,7 @@ STRIP_FROM_PATH = - # specify the list of include paths that are normally passed to the compiler - # using the -I flag. - --STRIP_FROM_INC_PATH = -+STRIP_FROM_INC_PATH = .. - - # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but - # less readable) file names. This can be useful is your file systems doesn't --- -1.8.3.1 - - -From a0ab603c5c416148132a91f5bf22d55e65f8ba4e Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 11 Jun 2019 14:26:28 -0500 -Subject: [PATCH 07/11] Low: xml: add API schema for list_item() output - ---- - xml/Makefile.am | 9 +++++---- - xml/api/item-1.1.rng | 19 +++++++++++++++++++ - 2 files changed, 24 insertions(+), 4 deletions(-) - create mode 100644 xml/api/item-1.1.rng - -diff --git a/xml/Makefile.am b/xml/Makefile.am -index 88edbda..49542a3 100644 ---- a/xml/Makefile.am -+++ b/xml/Makefile.am -@@ -1,5 +1,5 @@ - # --# Copyright 2004-2018 the Pacemaker project contributors -+# Copyright 2004-2019 the Pacemaker project contributors - # - # The version control history for this file may have further details. 
- # -@@ -63,8 +63,9 @@ RNG_max ?= $(lastword $(RNG_numeric_versions)) - - # A sorted list of all API and RNG versions (numeric and "next") - API_versions = next $(API_numeric_versions) --API_base = command-output stonith_admin --API_files = $(foreach base,$(API_base),$(wildcard api/$(base)*.rng)) -+API_request_base = command-output stonith_admin -+API_base = $(API_request_base) item -+API_files = $(foreach base,$(API_base),$(wildcard api/$(base)*.rng)) - - RNG_versions = next $(RNG_numeric_versions) - RNG_version_pairs = $(call version_pairs,${RNG_numeric_versions}) -@@ -139,7 +140,7 @@ api/api-result-%.rng: $(API_files) Makefile.am - echo ' ' >> $@ - echo ' ' >> $@ - echo ' ' >> $@ -- for rng in $(API_base); do $(top_srcdir)/xml/best-match.sh api/$$rng $(*) $(@) " " || :; done -+ for rng in $(API_request_base); do $(top_srcdir)/xml/best-match.sh api/$$rng $(*) $(@) " " || :; done - echo ' ' >> $@ - echo ' ' >> $@ - echo ' ' >> $@ -diff --git a/xml/api/item-1.1.rng b/xml/api/item-1.1.rng -new file mode 100644 -index 0000000..1a065ca ---- /dev/null -+++ b/xml/api/item-1.1.rng -@@ -0,0 +1,19 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -1.8.3.1 - - -From 311d8629241d227dded598225d8f413c9ebb4a9b Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 11 Jun 2019 18:47:10 -0500 -Subject: [PATCH 08/11] Refactor: fencing: expose function for parsing targets - from string - -... as internal API, so it can be reused elsewhere. Also, refactor it for -simplicity and versatility. ---- - daemons/fenced/fenced_commands.c | 91 +--------------------------- - include/crm/fencing/internal.h | 3 + - lib/fencing/st_client.c | 128 +++++++++++++++++++++++++++++++++++++++ - 3 files changed, 133 insertions(+), 89 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 1be2508..54575f9 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -617,93 +617,6 @@ build_port_aliases(const char *hostmap, GListPtr * targets) - return aliases; - } - --static void --parse_host_line(const char *line, int max, GListPtr * output) --{ -- int lpc = 0; -- int last = 0; -- -- if (max <= 0) { -- return; -- } -- -- /* Check for any complaints about additional parameters that the device doesn't understand */ -- if (strstr(line, "invalid") || strstr(line, "variable")) { -- crm_debug("Skipping: %s", line); -- return; -- } -- -- crm_trace("Processing %d bytes: [%s]", max, line); -- /* Skip initial whitespace */ -- for (lpc = 0; lpc <= max && isspace(line[lpc]); lpc++) { -- last = lpc + 1; -- } -- -- /* Now the actual content */ -- for (lpc = 0; lpc <= max; lpc++) { -- gboolean a_space = isspace(line[lpc]); -- -- if (a_space && lpc < max && isspace(line[lpc + 1])) { -- /* fast-forward to the end of the spaces */ -- -- } else if (a_space || line[lpc] == ',' || line[lpc] == ';' || line[lpc] == 0) { -- int rc = 1; -- char *entry = NULL; -- -- if (lpc != last) { -- entry = calloc(1, 1 + lpc - last); -- rc = sscanf(line + last, "%[a-zA-Z0-9_-.]", entry); -- } -- -- if (entry == NULL) { -- /* Skip */ -- } else if (rc != 1) { -- crm_warn("Could not parse (%d %d): %s", last, lpc, line + last); -- } else if (safe_str_neq(entry, "on") && safe_str_neq(entry, "off")) { -- crm_trace("Adding '%s'", entry); -- *output = g_list_append(*output, entry); -- entry = NULL; -- } -- -- free(entry); -- last = lpc + 1; -- } -- } --} -- --static GListPtr --parse_host_list(const char *hosts) --{ -- int lpc = 0; -- int max = 0; -- int last = 0; -- GListPtr output = 
NULL; -- -- if (hosts == NULL) { -- return output; -- } -- -- max = strlen(hosts); -- for (lpc = 0; lpc <= max; lpc++) { -- if (hosts[lpc] == '\n' || hosts[lpc] == 0) { -- int len = lpc - last; -- -- if(len > 1) { -- char *line = strndup(hosts + last, len); -- -- line[len] = 0; /* Because it might be '\n' */ -- parse_host_line(line, len, &output); -- free(line); -- } -- -- last = lpc + 1; -- } -- } -- -- crm_trace("Parsed %d entries from '%s'", g_list_length(output), hosts); -- return output; --} -- - GHashTable *metadata_cache = NULL; - - void -@@ -937,7 +850,7 @@ build_device_from_xml(xmlNode * msg) - - value = g_hash_table_lookup(device->params, STONITH_ATTR_HOSTLIST); - if (value) { -- device->targets = parse_host_list(value); -+ device->targets = stonith__parse_targets(value); - } - - value = g_hash_table_lookup(device->params, STONITH_ATTR_HOSTMAP); -@@ -1108,7 +1021,7 @@ dynamic_list_search_cb(GPid pid, int rc, const char *output, gpointer user_data) - } else if (!rc) { - crm_info("Refreshing port list for %s", dev->id); - g_list_free_full(dev->targets, free); -- dev->targets = parse_host_list(output); -+ dev->targets = stonith__parse_targets(output); - dev->targets_age = time(NULL); - } - -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index 0c0ac70..f3d38a7 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -10,6 +10,7 @@ - #ifndef STONITH_NG_INTERNAL__H - # define STONITH_NG_INTERNAL__H - -+# include - # include - # include - # include -@@ -49,6 +50,8 @@ xmlNode *create_device_registration_xml(const char *id, - - void stonith__register_messages(pcmk__output_t *out); - -+GList *stonith__parse_targets(const char *hosts); -+ - # define ST_LEVEL_MAX 10 - - # define F_STONITH_CLIENTID "st_clientid" -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index d949fe1..629887a 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -9,6 +9,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -2304,3 +2305,130 @@ stonith_action_str(const char *action) - return action; - } - } -+ -+/*! -+ * \internal -+ * \brief Parse a target name from one line of a target list string -+ * -+ * \param[in] line One line of a target list string -+ * \param[in] len String length of line -+ * \param[in,out] output List to add newly allocated target name to -+ */ -+static void -+parse_list_line(const char *line, int len, GList **output) -+{ -+ size_t i = 0; -+ size_t entry_start = 0; -+ -+ /* Skip complaints about additional parameters device doesn't understand -+ * -+ * @TODO Document or eliminate the implied restriction of target names -+ */ -+ if (strstr(line, "invalid") || strstr(line, "variable")) { -+ crm_debug("Skipping list output line: %s", line); -+ return; -+ } -+ -+ // Process line content, character by character -+ for (i = 0; i <= len; i++) { -+ -+ if (isspace(line[i]) || (line[i] == ',') || (line[i] == ';') -+ || (line[i] == '\0')) { -+ // We've found a separator (i.e. 
the end of an entry) -+ -+ int rc = 0; -+ char *entry = NULL; -+ -+ if (i == entry_start) { -+ // Skip leading and sequential separators -+ entry_start = i + 1; -+ continue; -+ } -+ -+ entry = calloc(i - entry_start + 1, sizeof(char)); -+ CRM_ASSERT(entry != NULL); -+ -+ /* Read entry, stopping at first separator -+ * -+ * @TODO Document or eliminate these character restrictions -+ */ -+ rc = sscanf(line + entry_start, "%[a-zA-Z0-9_-.]", entry); -+ if (rc != 1) { -+ crm_warn("Could not parse list output entry: %s " -+ CRM_XS " entry_start=%d position=%d", -+ line + entry_start, entry_start, i); -+ free(entry); -+ -+ } else if (safe_str_eq(entry, "on") || safe_str_eq(entry, "off")) { -+ /* Some agents print the target status in the list output, -+ * though none are known now (the separate list-status command -+ * is used for this, but it can also print "UNKNOWN"). To handle -+ * this possibility, skip such entries. -+ * -+ * @TODO Document or eliminate the implied restriction of target -+ * names. -+ */ -+ free(entry); -+ -+ } else { -+ // We have a valid entry -+ *output = g_list_append(*output, entry); -+ } -+ entry_start = i + 1; -+ } -+ } -+} -+ -+/*! -+ * \internal -+ * \brief Parse a list of targets from a string -+ * -+ * \param[in] target_spec Target list as a string -+ * -+ * \return List of target names -+ * \note The target list string format is flexible, to allow for user-specified -+ * lists such as pcmk_host_list and the output of an agent's list action -+ * (whether direct or via the API, which escapes newlines). There may be -+ * multiple lines, separated by either a newline or an escaped newline -+ * (backslash n). Each line may have one or more target names, separated -+ * by any combination of whitespace, commas, and semi-colons. Lines -+ * containing "invalid" or "variable" will be ignored entirely. Target -+ * names "on" or "off" (case-insensitive) will be ignored. Target names -+ * may contain only alphanumeric characters, underbars (_), dashes (-), -+ * and dots (.) (if any other character occurs in the name, it and all -+ * subsequent characters in the name will be ignored). -+ * \note The caller is responsible for freeing the result with -+ * g_list_free_full(result, free). -+ */ -+GList * -+stonith__parse_targets(const char *target_spec) -+{ -+ GList *targets = NULL; -+ -+ if (target_spec != NULL) { -+ size_t out_len = strlen(target_spec); -+ size_t line_start = 0; // Starting index of line being processed -+ -+ for (size_t i = 0; i <= out_len; ++i) { -+ if ((target_spec[i] == '\n') || (target_spec[i] == '\0') -+ || ((target_spec[i] == '\\') && (target_spec[i + 1] == 'n'))) { -+ // We've reached the end of one line of output -+ -+ int len = i - line_start; -+ -+ if (len > 0) { -+ char *line = strndup(target_spec + line_start, len); -+ -+ line[len] = '\0'; // Because it might be a newline -+ parse_list_line(line, len, &targets); -+ free(line); -+ } -+ if (target_spec[i] == '\\') { -+ ++i; // backslash-n takes up two positions -+ } -+ line_start = i + 1; -+ } -+ } -+ } -+ return targets; -+} --- -1.8.3.1 - - -From 60ad7730fbf34c1f67700bace39a083c0e3d1c31 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 10 Jun 2019 16:13:29 -0500 -Subject: [PATCH 09/11] Fix: tools: stonith_admin --list-targets should show - what fencer would use - -This fixes both a regression in 2.0.2 and a pre-existing issue before that. 
- -Before 2.0.2's 52f614aa, stonith_admin --list-targets would print each line of -output from the fence agent's list action, with any commas and semi-colons -removed. After that, it would only print lines that had three values separated -by spaces. - -In practice, fence agents have some variety in list action output, which is not -yet standardized by the fence agent API. Only fence_xvm is known to use the -three space-separated values. Most agents output the target name and an alias -separated by a comma. - -The earlier behavior would actually be awkward in the comma-separated case, -since the target name and alias would be run together with no separator. - -Neither behavior matched what was actually used by the fencer. - -This commit refactors to use the new stonith__parse_targets() function, to show -the same list the fencer would use. It also fixes a few related issues: - -* Memory was not properly freed - -* No list wrapper would be printed if the list were empty - -* stonith_admin's XML output did not match its schema (the tool would output a - element, while the schema had ). Now, we abandon the - custom element and use the generic schema instead. While technically - this could be considered backward-incompatible, it's not really, because the - schema didn't match to begin with. Also, the API XML schema is still - considered experimental. - -* Not really a problem, but since we now have the generic schema, - stonith_admin uses this in place of its former dedicated schema. - The only difference is the former allows arbitrary strings while the - latter required NCName, but the reuse is more useful than type validation. ---- - lib/fencing/st_output.c | 28 ---------------- - tools/stonith_admin.c | 52 +++++------------------------- - xml/api/stonith_admin-1.1.rng | 75 +++++++++++++++++++++++++++++++++++++++++++ - 3 files changed, 83 insertions(+), 72 deletions(-) - create mode 100644 xml/api/stonith_admin-1.1.rng - -diff --git a/lib/fencing/st_output.c b/lib/fencing/st_output.c -index a8d0a60..0ceb699 100644 ---- a/lib/fencing/st_output.c -+++ b/lib/fencing/st_output.c -@@ -17,32 +17,6 @@ - #include - - static int --fence_target_text(pcmk__output_t *out, va_list args) { -- const char *hostname = va_arg(args, const char *); -- const char *uuid = va_arg(args, const char *); -- const char *status = va_arg(args, const char *); -- -- pcmk__indented_printf(out, "%s\t%s\t%s\n", hostname, uuid, status); -- return 0; --} -- --static int --fence_target_xml(pcmk__output_t *out, va_list args) { -- xmlNodePtr node = NULL; -- const char *hostname = va_arg(args, const char *); -- const char *uuid = va_arg(args, const char *); -- const char *status = va_arg(args, const char *); -- -- node = xmlNewNode(NULL, (pcmkXmlStr) "target"); -- xmlSetProp(node, (pcmkXmlStr) "hostname", (pcmkXmlStr) hostname); -- xmlSetProp(node, (pcmkXmlStr) "uuid", (pcmkXmlStr) uuid); -- xmlSetProp(node, (pcmkXmlStr) "status", (pcmkXmlStr) status); -- -- pcmk__xml_add_node(out, node); -- return 0; --} -- --static int - last_fenced_text(pcmk__output_t *out, va_list args) { - const char *target = va_arg(args, const char *); - time_t when = va_arg(args, time_t); -@@ -216,8 +190,6 @@ validate_agent_xml(pcmk__output_t *out, va_list args) { - } - - static pcmk__message_entry_t fmt_functions[] = { -- { "fence-target", "text", fence_target_text }, -- { "fence-target", "xml", fence_target_xml }, - { "last-fenced", "text", last_fenced_text }, - { "last-fenced", "xml", last_fenced_xml }, - { "stonith-event", "text", stonith_event_text }, 
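
The stonith_admin.c hunk below replaces all of that hand-rolled splitting with the parser introduced in patch 08. As a minimal usage sketch (C; assumes glib and the internal fencing header are available, and follows the ownership rule in the \note on stonith__parse_targets above):

    /* "\\n" in the C literal is backslash-n, the escaped-newline separator the
     * parser accepts alongside newlines, whitespace, commas, and semi-colons */
    GList *targets = stonith__parse_targets("node1 node2,node3;node4\\nnode5");

    for (GList *iter = targets; iter != NULL; iter = iter->next) {
        crm_info("Fence target: %s", (const char *) iter->data);
    }
    g_list_free_full(targets, free); // caller owns the list and its entries
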
-diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c -index 6be66c6..a7551fd 100644 ---- a/tools/stonith_admin.c -+++ b/tools/stonith_admin.c -@@ -635,53 +635,17 @@ main(int argc, char **argv) - break; - case 's': - rc = st->cmds->list(st, st_opts, device, &lists, timeout); -- if (rc == 0 && lists) { -- char *head = lists; -- char *eol = NULL; -+ if (rc == 0) { -+ GList *targets = stonith__parse_targets(lists); - - out->begin_list(out, "Fence targets", "fence target", "fence targets"); -- -- do { -- char *line = NULL; -- char *elem = NULL; -- -- char *hostname = NULL; -- char *uuid = NULL; -- char *status = NULL; -- -- eol = strstr(head, "\\n"); -- line = strndup(head, eol-head); -- -- while ((elem = strsep(&line, " ")) != NULL) { -- if (strcmp(elem, "") == 0) { -- continue; -- } -- -- if (hostname == NULL) { -- hostname = elem; -- } else if (uuid == NULL) { -- uuid = elem; -- } else if (status == NULL) { -- char *end = NULL; -- status = elem; -- -- end = strchr(status, '\n'); -- if (end != NULL) { -- *end = '\0'; -- } -- } -- } -- -- if (hostname != NULL && uuid != NULL && status != NULL) { -- out->message(out, "fence-target", hostname, uuid, status); -- } -- -- free(line); -- -- head = eol+2; -- } while (eol != NULL); -- -+ while (targets != NULL) { -+ out->list_item(out, NULL, (const char *) targets->data); -+ targets = targets->next; -+ } - out->end_list(out); -+ free(lists); -+ - } else if (rc != 0) { - fprintf(stderr, "List command returned error. rc : %d\n", rc); - } -diff --git a/xml/api/stonith_admin-1.1.rng b/xml/api/stonith_admin-1.1.rng -new file mode 100644 -index 0000000..997670f ---- /dev/null -+++ b/xml/api/stonith_admin-1.1.rng -@@ -0,0 +1,75 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ failed -+ success -+ pending -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -1.8.3.1 - - -From 8cc030c045e0a0b43a2a0dcfec5541e5e07faed3 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 10 Jun 2019 17:41:32 -0500 -Subject: [PATCH 10/11] Fix: libcrmcommon: add stderr source correctly when - outputting XML - ---- - lib/common/output_xml.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/lib/common/output_xml.c b/lib/common/output_xml.c -index c93d66c..994af43 100644 ---- a/lib/common/output_xml.c -+++ b/lib/common/output_xml.c -@@ -130,7 +130,7 @@ xml_subprocess_output(pcmk__output_t *out, int exit_status, - if (proc_stderr != NULL) { - child_node = xmlNewTextChild(node, NULL, (pcmkXmlStr) "output", - (pcmkXmlStr) proc_stderr); -- xmlSetProp(node, (pcmkXmlStr) "source", (pcmkXmlStr) "stderr"); -+ xmlSetProp(child_node, (pcmkXmlStr) "source", (pcmkXmlStr) "stderr"); - } - - pcmk__xml_add_node(out, node); --- -1.8.3.1 - - -From 4ce8272fde2605099b9d4bb1e211bc66c4f79f90 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 12 Jun 2019 19:56:09 -0500 -Subject: [PATCH 11/11] Test: CTS: update pattern for changed log message - -changed in 0a884f32 ---- - cts/CM_common.py | 4 ++-- - cts/patterns.py | 2 +- - 2 files changed, 3 insertions(+), 3 deletions(-) - -diff --git a/cts/CM_common.py b/cts/CM_common.py -index 0112fec..b7ff223 100755 ---- a/cts/CM_common.py -+++ b/cts/CM_common.py -@@ -12,7 +12,7 @@ from __future__ import print_function, unicode_literals, absolute_import, divisi - __copyright__ = """Original Author: Huang Zhen - Copyright 2004 International Business Machines - --with later changes copyright 2004-2018 the Pacemaker 
project contributors. -+with later changes copyright 2004-2019 the Pacemaker project contributors. - The version control history for this file may have further details. - """ - __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" -@@ -318,7 +318,7 @@ class crm_common(ClusterManager): - - stonith_ignore = [ - r"Updating failcount for child_DoFencing", -- r"(ERROR|error).*: Sign-in failed: triggered a retry", -+ r"error.*: Fencer connection failed \(will retry\)", - "pacemaker-execd.*(ERROR|error): stonithd_receive_ops_result failed.", - ] - -diff --git a/cts/patterns.py b/cts/patterns.py -index 1bdace0..1b86ee7 100644 ---- a/cts/patterns.py -+++ b/cts/patterns.py -@@ -308,7 +308,7 @@ class crm_corosync(BasePatterns): - self.components["pacemaker-fenced-ignore"] = [ - r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", - r"crit:.*Fencing daemon connection failed", -- r"error:.*Sign-in failed: triggered a retry", -+ r"error:.*Fencer connection failed \(will retry\)", - r"Connection to (fencer|stonith-ng) failed, finalizing .* pending operations", - r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error", - ] --- -1.8.3.1 - diff --git a/SOURCES/009-improve-pacemaker_remote-handling.patch b/SOURCES/009-improve-pacemaker_remote-handling.patch deleted file mode 100644 index 0a31c27..0000000 --- a/SOURCES/009-improve-pacemaker_remote-handling.patch +++ /dev/null @@ -1,1466 +0,0 @@ -From 28566d6832274c59f27bb7b2f1f54420a3f3d822 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 9 May 2019 20:26:08 -0500 -Subject: [PATCH 01/13] Refactor: libpe_status: functionize unfencing digest - code more - -... for readability, reusability, and avoiding unnecessary function calls or -memory allocation. ---- - lib/pengine/utils.c | 159 ++++++++++++++++++++++++++++++++++++++-------------- - 1 file changed, 118 insertions(+), 41 deletions(-) - -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index 2f4dc1e..f80f8d4 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -2080,57 +2080,134 @@ rsc_action_digest_cmp(resource_t * rsc, xmlNode * xml_op, node_t * node, - return data; - } - -+/*! -+ * \internal -+ * \brief Create an unfencing summary for use in special node attribute -+ * -+ * Create a string combining a fence device's resource ID, agent type, and -+ * parameter digest (whether for all parameters or just non-private parameters). -+ * This can be stored in a special node attribute, allowing us to detect changes -+ * in either the agent type or parameters, to know whether unfencing must be -+ * redone or can be safely skipped when the device's history is cleaned. -+ * -+ * \param[in] rsc_id Fence device resource ID -+ * \param[in] agent_type Fence device agent -+ * \param[in] param_digest Fence device parameter digest -+ * -+ * \return Newly allocated string with unfencing digest -+ * \note The caller is responsible for freeing the result. -+ */ -+static inline char * -+create_unfencing_summary(const char *rsc_id, const char *agent_type, -+ const char *param_digest) -+{ -+ return crm_strdup_printf("%s:%s:%s", rsc_id, agent_type, param_digest); -+} -+ -+/*! -+ * \internal -+ * \brief Check whether a node can skip unfencing -+ * -+ * Check whether a fence device's current definition matches a node's -+ * stored summary of when it was last unfenced by the device. 
-+ * -+ * \param[in] rsc_id Fence device's resource ID -+ * \param[in] agent Fence device's agent type -+ * \param[in] digest_calc Fence device's current parameter digest -+ * \param[in] node_summary Value of node's special unfencing node attribute -+ * (a comma-separated list of unfencing summaries for -+ * all devices that have unfenced this node) -+ * -+ * \return TRUE if digest matches, FALSE otherwise -+ */ -+static bool -+unfencing_digest_matches(const char *rsc_id, const char *agent, -+ const char *digest_calc, const char *node_summary) -+{ -+ bool matches = FALSE; -+ -+ if (rsc_id && agent && digest_calc && node_summary) { -+ char *search_secure = create_unfencing_summary(rsc_id, agent, -+ digest_calc); -+ -+ /* The digest was calculated including the device ID and agent, -+ * so there is no risk of collision using strstr(). -+ */ -+ matches = (strstr(node_summary, search_secure) != NULL); -+ crm_trace("Calculated unfencing digest '%s' %sfound in '%s'", -+ search_secure, matches? "" : "not ", node_summary); -+ free(search_secure); -+ } -+ return matches; -+} -+ -+/* Magic string to use as action name for digest cache entries used for -+ * unfencing checks. This is not a real action name (i.e. "on"), so -+ * check_action_definition() won't confuse these entries with real actions. -+ */ - #define STONITH_DIGEST_TASK "stonith-on" - -+/*! -+ * \internal -+ * \brief Calculate fence device digests and digest comparison result -+ * -+ * \param[in] rsc Fence device resource -+ * \param[in] agent Fence device's agent type -+ * \param[in] node Node with digest cache to use -+ * \param[in] data_set Cluster working set -+ * -+ * \return Node's digest cache entry -+ */ - static op_digest_cache_t * --fencing_action_digest_cmp(resource_t * rsc, node_t * node, pe_working_set_t * data_set) -+fencing_action_digest_cmp(pe_resource_t *rsc, const char *agent, -+ pe_node_t *node, pe_working_set_t *data_set) - { -- char *key = generate_op_key(rsc->id, STONITH_DIGEST_TASK, 0); -- op_digest_cache_t *data = rsc_action_digest(rsc, STONITH_DIGEST_TASK, key, node, NULL, data_set); -+ const char *node_summary = NULL; - -- const char *digest_all = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_ALL); -- const char *digest_secure = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_SECURE); -+ // Calculate device's current parameter digests -+ char *key = generate_op_key(rsc->id, STONITH_DIGEST_TASK, 0); -+ op_digest_cache_t *data = rsc_action_digest(rsc, STONITH_DIGEST_TASK, key, -+ node, NULL, data_set); - -- /* No 'reloads' for fencing device changes -- * -- * We use the resource id + agent + digest so that we can detect -- * changes to the agent and/or the parameters used -- */ -- char *search_all = crm_strdup_printf("%s:%s:%s", rsc->id, (const char*)g_hash_table_lookup(rsc->meta, XML_ATTR_TYPE), data->digest_all_calc); -- char *search_secure = crm_strdup_printf("%s:%s:%s", rsc->id, (const char*)g_hash_table_lookup(rsc->meta, XML_ATTR_TYPE), data->digest_secure_calc); -+ free(key); - -- data->rc = RSC_DIGEST_ALL; -- if (digest_all == NULL) { -- /* it is unknown what the previous op digest was */ -+ // Check whether node has special unfencing summary node attribute -+ node_summary = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_ALL); -+ if (node_summary == NULL) { - data->rc = RSC_DIGEST_UNKNOWN; -+ return data; -+ } - -- } else if (strstr(digest_all, search_all)) { -+ // Check whether full parameter digest matches -+ if (unfencing_digest_matches(rsc->id, agent, data->digest_all_calc, -+ node_summary)) { - data->rc = 
RSC_DIGEST_MATCH; -+ return data; -+ } - -- } else if(digest_secure && data->digest_secure_calc) { -- if(strstr(digest_secure, search_secure)) { -- if (is_set(data_set->flags, pe_flag_stdout)) { -- printf("Only 'private' parameters to %s for unfencing %s changed\n", -- rsc->id, node->details->uname); -- } -- data->rc = RSC_DIGEST_MATCH; -+ // Check whether secure parameter digest matches -+ node_summary = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_SECURE); -+ if (unfencing_digest_matches(rsc->id, agent, data->digest_secure_calc, -+ node_summary)) { -+ data->rc = RSC_DIGEST_MATCH; -+ if (is_set(data_set->flags, pe_flag_stdout)) { -+ printf("Only 'private' parameters to %s for unfencing %s changed\n", -+ rsc->id, node->details->uname); - } -+ return data; - } - -- if (is_set(data_set->flags, pe_flag_sanitized) -- && is_set(data_set->flags, pe_flag_stdout) -- && (data->rc == RSC_DIGEST_ALL) -+ // Parameters don't match -+ data->rc = RSC_DIGEST_ALL; -+ if (is_set(data_set->flags, (pe_flag_sanitized|pe_flag_stdout)) - && data->digest_secure_calc) { -- printf("Parameters to %s for unfencing %s changed, try '%s:%s:%s'\n", -- rsc->id, node->details->uname, rsc->id, -- (const char *) g_hash_table_lookup(rsc->meta, XML_ATTR_TYPE), -- data->digest_secure_calc); -- } -- -- free(key); -- free(search_all); -- free(search_secure); -+ char *digest = create_unfencing_summary(rsc->id, agent, -+ data->digest_secure_calc); - -+ printf("Parameters to %s for unfencing %s changed, try '%s'\n", -+ rsc->id, node->details->uname, digest); -+ free(digest); -+ } - return data; - } - -@@ -2218,9 +2295,6 @@ pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe - * - * We may do this for all nodes in the future, but for now - * the check_action_definition() based stuff works fine. -- * -- * Use "stonith-on" to avoid creating cache entries for -- * operations check_action_definition() would look for. 
- */ - long max = 1024; - long digests_all_offset = 0; -@@ -2232,8 +2306,11 @@ pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe - - for (GListPtr gIter = matches; gIter != NULL; gIter = gIter->next) { - resource_t *match = gIter->data; -- op_digest_cache_t *data = fencing_action_digest_cmp(match, node, data_set); -+ const char *agent = g_hash_table_lookup(match->meta, -+ XML_ATTR_TYPE); -+ op_digest_cache_t *data = NULL; - -+ data = fencing_action_digest_cmp(match, agent, node, data_set); - if(data->rc == RSC_DIGEST_ALL) { - optional = FALSE; - crm_notice("Unfencing %s (remote): because the definition of %s changed", node->details->uname, match->id); -@@ -2244,11 +2321,11 @@ pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe - - digests_all_offset += snprintf( - digests_all+digests_all_offset, max-digests_all_offset, -- "%s:%s:%s,", match->id, (const char*)g_hash_table_lookup(match->meta, XML_ATTR_TYPE), data->digest_all_calc); -+ "%s:%s:%s,", match->id, agent, data->digest_all_calc); - - digests_secure_offset += snprintf( - digests_secure+digests_secure_offset, max-digests_secure_offset, -- "%s:%s:%s,", match->id, (const char*)g_hash_table_lookup(match->meta, XML_ATTR_TYPE), data->digest_secure_calc); -+ "%s:%s:%s,", match->id, agent, data->digest_secure_calc); - } - g_hash_table_insert(stonith_op->meta, - strdup(XML_OP_ATTR_DIGESTS_ALL), --- -1.8.3.1 - - -From fd6e06ff419c95f4423202163d2d4dca3f03a4c5 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 10 May 2019 11:57:31 -0500 -Subject: [PATCH 02/13] Fix: libpe_status: calculate secure digests for - unfencing ops - -The calculation of digests for detection of when unfencing is needed reused -rsc_action_digest(). However that would only add secure digests when the -pe_flag_sanitized flag was set, which is only set by crm_simulate, so secure -digests would never be added in normal cluster operation. This led to -node attributes like name="#digests-secure" -value="stonith-fence_compute-fence-nova:fence_compute:(null),". - -Now, rsc_action_digest() takes a new argument to select whether secure digests -are added, which is always set to TRUE when calculating unfencing digests. ---- - lib/pengine/utils.c | 27 ++++++++++++++++++++++----- - 1 file changed, 22 insertions(+), 5 deletions(-) - -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index f80f8d4..5b893f7 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -1936,9 +1936,24 @@ append_versioned_params(xmlNode *versioned_params, const char *ra_version, xmlNo - } - #endif - -+/*! 
-+ * \internal -+ * \brief Calculate action digests and store in node's digest cache -+ * -+ * \param[in] rsc Resource that action was for -+ * \param[in] task Name of action performed -+ * \param[in] key Action's task key -+ * \param[in] node Node action was performed on -+ * \param[in] xml_op XML of operation in CIB status (if available) -+ * \param[in] calc_secure Whether to calculate secure digest -+ * \param[in] data_set Cluster working set -+ * -+ * \return Pointer to node's digest cache entry -+ */ - static op_digest_cache_t * --rsc_action_digest(resource_t * rsc, const char *task, const char *key, -- node_t * node, xmlNode * xml_op, pe_working_set_t * data_set) -+rsc_action_digest(pe_resource_t *rsc, const char *task, const char *key, -+ pe_node_t *node, xmlNode *xml_op, bool calc_secure, -+ pe_working_set_t *data_set) - { - op_digest_cache_t *data = NULL; - -@@ -2007,7 +2022,7 @@ rsc_action_digest(resource_t * rsc, const char *task, const char *key, - - data->digest_all_calc = calculate_operation_digest(data->params_all, op_version); - -- if (is_set(data_set->flags, pe_flag_sanitized)) { -+ if (calc_secure) { - data->params_secure = copy_xml(data->params_all); - if(secure_list) { - filter_parameters(data->params_secure, secure_list, FALSE); -@@ -2053,7 +2068,9 @@ rsc_action_digest_cmp(resource_t * rsc, xmlNode * xml_op, node_t * node, - - interval_ms = crm_parse_ms(interval_ms_s); - key = generate_op_key(rsc->id, task, interval_ms); -- data = rsc_action_digest(rsc, task, key, node, xml_op, data_set); -+ data = rsc_action_digest(rsc, task, key, node, xml_op, -+ is_set(data_set->flags, pe_flag_sanitized), -+ data_set); - - data->rc = RSC_DIGEST_MATCH; - if (digest_restart && data->digest_restart_calc && strcmp(data->digest_restart_calc, digest_restart) != 0) { -@@ -2167,7 +2184,7 @@ fencing_action_digest_cmp(pe_resource_t *rsc, const char *agent, - // Calculate device's current parameter digests - char *key = generate_op_key(rsc->id, STONITH_DIGEST_TASK, 0); - op_digest_cache_t *data = rsc_action_digest(rsc, STONITH_DIGEST_TASK, key, -- node, NULL, data_set); -+ node, NULL, TRUE, data_set); - - free(key); - --- -1.8.3.1 - - -From 7886c8ec4dd209078cdc76274ed9d2804ea09b6a Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 5 Jun 2019 12:54:34 -0500 -Subject: [PATCH 03/13] Refactor: controller: pass desired op status when - synthesizing failure - -so we can use new status codes later ---- - daemons/controld/controld_execd.c | 27 +++++++++++++++++++++------ - 1 file changed, 21 insertions(+), 6 deletions(-) - -diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c -index 8e89216..fed9419 100644 ---- a/daemons/controld/controld_execd.c -+++ b/daemons/controld/controld_execd.c -@@ -1424,8 +1424,22 @@ force_reprobe(lrm_state_t *lrm_state, const char *from_sys, - update_attrd(lrm_state->node_name, CRM_OP_PROBED, NULL, user_name, is_remote_node); - } - -+/*! -+ * \internal -+ * \brief Fail a requested action without actually executing it -+ * -+ * For an action that can't be executed, process it similarly to an actual -+ * execution result, with specified error status (except for notify actions, -+ * which will always be treated as successful). 
-+ * -+ * \param[in] lrm_state Executor connection that action is for -+ * \param[in] action Action XML from request -+ * \param[in] rc Desired return code to use -+ * \param[in] op_status Desired operation status to use -+ */ - static void --synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action, int rc) -+synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action, -+ int op_status, enum ocf_exitcode rc) - { - lrmd_event_data_t *op = NULL; - const char *operation = crm_element_value(action, XML_LRM_ATTR_TASK); -@@ -1451,7 +1465,7 @@ synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action, int rc) - if (safe_str_eq(operation, RSC_NOTIFY)) { // Notifications can't fail - fake_op_status(lrm_state, op, PCMK_LRM_OP_DONE, PCMK_OCF_OK); - } else { -- fake_op_status(lrm_state, op, PCMK_LRM_OP_ERROR, rc); -+ fake_op_status(lrm_state, op, op_status, rc); - } - - crm_info("Faking " CRM_OP_FMT " result (%d) on %s", -@@ -1744,7 +1758,8 @@ do_lrm_invoke(long long action, - if ((lrm_state == NULL) && is_remote_node) { - crm_err("Failing action because local node has never had connection to remote node %s", - target_node); -- synthesize_lrmd_failure(NULL, input->xml, PCMK_OCF_CONNECTION_DIED); -+ synthesize_lrmd_failure(NULL, input->xml, PCMK_LRM_OP_ERROR, -+ PCMK_OCF_CONNECTION_DIED); - return; - } - CRM_ASSERT(lrm_state != NULL); -@@ -1800,7 +1815,7 @@ do_lrm_invoke(long long action, - - rc = get_lrm_resource(lrm_state, xml_rsc, create_rsc, &rsc); - if (rc == -ENOTCONN) { -- synthesize_lrmd_failure(lrm_state, input->xml, -+ synthesize_lrmd_failure(lrm_state, input->xml, PCMK_LRM_OP_ERROR, - PCMK_OCF_CONNECTION_DIED); - return; - -@@ -1822,7 +1837,7 @@ do_lrm_invoke(long long action, - // Resource operation on malformed resource - crm_err("Invalid resource definition for %s", ID(xml_rsc)); - crm_log_xml_warn(input->msg, "invalid resource"); -- synthesize_lrmd_failure(lrm_state, input->xml, -+ synthesize_lrmd_failure(lrm_state, input->xml, PCMK_LRM_OP_ERROR, - PCMK_OCF_NOT_CONFIGURED); // fatal error - return; - -@@ -1832,7 +1847,7 @@ do_lrm_invoke(long long action, - CRM_XS " rc=%d", - ID(xml_rsc), pcmk_strerror(rc), rc); - crm_log_xml_warn(input->msg, "failed registration"); -- synthesize_lrmd_failure(lrm_state, input->xml, -+ synthesize_lrmd_failure(lrm_state, input->xml, PCMK_LRM_OP_ERROR, - PCMK_OCF_INVALID_PARAM); // hard error - return; - } --- -1.8.3.1 - - -From ddc3942d7131db9c9874031ca4b3b4a531221573 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 5 Jun 2019 13:08:15 -0500 -Subject: [PATCH 04/13] Fix: controller: use op status, not rc, for executor - disconnection - -Previously, if an action were requested for an executor (local or remote) that -the controller does not have a connection to, the action's rc would be set to -PCMK_OCF_CONNECTION_DIED and its op status to PCMK_LRM_OP_ERROR. - -This was undesirable for a couple reasons: PCMK_OCF_CONNECTION_DIED is a -nonstandard extension to the OCF return codes, which can confuse users -trying to look up the meaning or interpret cluster status output; and it really -is an operation execution status and not an operation result. - -This changes the result to PCMK_OCF_UNKNOWN_ERROR with a new op status -PCMK_LRM_OP_NOT_CONNECTED. The new codes are mapped to the old ones for older -DCs that don't understand them. 
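
In code form, the compatibility mapping amounts to the following (a sketch mirroring the process_lrm_event() hunk below; fsa_our_dc_version holds the DC's advertised feature set, and 3.2.0 is the feature set that introduces the new codes):

    // Remap new status codes for DCs that predate feature set 3.2.0
    if (compare_version(fsa_our_dc_version, "3.2.0") < 0) {
        switch (op->op_status) {
            case PCMK_LRM_OP_NOT_CONNECTED:
                op->op_status = PCMK_LRM_OP_ERROR;   // old-style execution status
                op->rc = PCMK_OCF_CONNECTION_DIED;   // deprecated rc older DCs expect
                break;
            default:
                break;
        }
    }
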
---- - cts/CTStests.py | 2 +- - daemons/controld/controld_execd.c | 21 +++++++++++++++++---- - daemons/controld/controld_execd_state.c | 6 ++++-- - include/crm/services.h | 4 +++- - lib/common/operations.c | 1 + - lib/pengine/unpack.c | 3 ++- - 6 files changed, 28 insertions(+), 9 deletions(-) - -diff --git a/cts/CTStests.py b/cts/CTStests.py -index 32945cb..be7fd7f 100644 ---- a/cts/CTStests.py -+++ b/cts/CTStests.py -@@ -3068,7 +3068,7 @@ class RemoteStonithd(RemoteDriver): - r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)", - r"Calculated [Tt]ransition .*pe-error", - r"error.*: Resource .*ocf::.* is active on 2 nodes attempting recovery", -- r"error: Result of monitor operation for .* on remote-.*: Error", -+ r"error: Result of monitor operation for .* on remote-.*: No executor connection", - ] - - ignore_pats.extend(RemoteDriver.errorstoignore(self)) -diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c -index fed9419..ac215b6 100644 ---- a/daemons/controld/controld_execd.c -+++ b/daemons/controld/controld_execd.c -@@ -1758,8 +1758,8 @@ do_lrm_invoke(long long action, - if ((lrm_state == NULL) && is_remote_node) { - crm_err("Failing action because local node has never had connection to remote node %s", - target_node); -- synthesize_lrmd_failure(NULL, input->xml, PCMK_LRM_OP_ERROR, -- PCMK_OCF_CONNECTION_DIED); -+ synthesize_lrmd_failure(NULL, input->xml, PCMK_LRM_OP_NOT_CONNECTED, -+ PCMK_OCF_UNKNOWN_ERROR); - return; - } - CRM_ASSERT(lrm_state != NULL); -@@ -1815,8 +1815,9 @@ do_lrm_invoke(long long action, - - rc = get_lrm_resource(lrm_state, xml_rsc, create_rsc, &rsc); - if (rc == -ENOTCONN) { -- synthesize_lrmd_failure(lrm_state, input->xml, PCMK_LRM_OP_ERROR, -- PCMK_OCF_CONNECTION_DIED); -+ synthesize_lrmd_failure(lrm_state, input->xml, -+ PCMK_LRM_OP_NOT_CONNECTED, -+ PCMK_OCF_UNKNOWN_ERROR); - return; - - } else if ((rc < 0) && !create_rsc) { -@@ -2532,6 +2533,18 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, - CRM_CHECK(op != NULL, return); - CRM_CHECK(op->rsc_id != NULL, return); - -+ // Remap new status codes for older DCs -+ if (compare_version(fsa_our_dc_version, "3.2.0") < 0) { -+ switch (op->op_status) { -+ case PCMK_LRM_OP_NOT_CONNECTED: -+ op->op_status = PCMK_LRM_OP_ERROR; -+ op->rc = PCMK_OCF_CONNECTION_DIED; -+ break; -+ default: -+ break; -+ } -+ } -+ - op_id = make_stop_id(op->rsc_id, op->call_id); - op_key = generate_op_key(op->rsc_id, op->op_type, op->interval_ms); - -diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c -index 4e9f096..63e6b33 100644 ---- a/daemons/controld/controld_execd_state.c -+++ b/daemons/controld/controld_execd_state.c -@@ -1,6 +1,8 @@ - /* - * Copyright 2012-2019 the Pacemaker project contributors - * -+ * The version control history for this file may have further details. -+ * - * This source code is licensed under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. 
- */ -@@ -76,8 +78,8 @@ fail_pending_op(gpointer key, gpointer value, gpointer user_data) - event.user_data = op->user_data; - event.timeout = 0; - event.interval_ms = op->interval_ms; -- event.rc = PCMK_OCF_CONNECTION_DIED; -- event.op_status = PCMK_LRM_OP_ERROR; -+ event.rc = PCMK_OCF_UNKNOWN_ERROR; -+ event.op_status = PCMK_LRM_OP_NOT_CONNECTED; - event.t_run = op->start_time; - event.t_rcchange = op->start_time; - -diff --git a/include/crm/services.h b/include/crm/services.h -index 4bdd21a..ca9470b 100644 ---- a/include/crm/services.h -+++ b/include/crm/services.h -@@ -100,7 +100,7 @@ enum ocf_exitcode { - - - /* 150-199 reserved for application use */ -- PCMK_OCF_CONNECTION_DIED = 189, /* Operation failure implied by disconnection of the LRM API to a local or remote node */ -+ PCMK_OCF_CONNECTION_DIED = 189, // Deprecated (see PCMK_LRM_OP_NOT_CONNECTED) - - PCMK_OCF_DEGRADED = 190, /* Active resource that is no longer 100% functional */ - PCMK_OCF_DEGRADED_MASTER = 191, /* Promoted resource that is no longer 100% functional */ -@@ -126,6 +126,7 @@ enum op_status { - PCMK_LRM_OP_ERROR_HARD, - PCMK_LRM_OP_ERROR_FATAL, - PCMK_LRM_OP_NOT_INSTALLED, -+ PCMK_LRM_OP_NOT_CONNECTED, - }; - - enum nagios_exitcode { -@@ -337,6 +338,7 @@ gboolean services_alert_async(svc_action_t *action, - case PCMK_LRM_OP_NOTSUPPORTED:return "NOT SUPPORTED"; - case PCMK_LRM_OP_ERROR:return "Error"; - case PCMK_LRM_OP_NOT_INSTALLED:return "Not installed"; -+ case PCMK_LRM_OP_NOT_CONNECTED:return "No executor connection"; - default:return "UNKNOWN!"; - } - } -diff --git a/lib/common/operations.c b/lib/common/operations.c -index 2144cc6..c6b16cb 100644 ---- a/lib/common/operations.c -+++ b/lib/common/operations.c -@@ -395,6 +395,7 @@ did_rsc_op_fail(lrmd_event_data_t * op, int target_rc) - case PCMK_LRM_OP_NOTSUPPORTED: - case PCMK_LRM_OP_TIMEOUT: - case PCMK_LRM_OP_ERROR: -+ case PCMK_LRM_OP_NOT_CONNECTED: - return TRUE; - break; - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 0e8177b..671f0c4 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3163,7 +3163,7 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last - crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); - - CRM_CHECK(task != NULL, return FALSE); -- CRM_CHECK(status <= PCMK_LRM_OP_NOT_INSTALLED, return FALSE); -+ CRM_CHECK(status <= PCMK_LRM_OP_NOT_CONNECTED, return FALSE); - CRM_CHECK(status >= PCMK_LRM_OP_PENDING, return FALSE); - - if (safe_str_eq(task, CRMD_ACTION_NOTIFY) || -@@ -3304,6 +3304,7 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last - case PCMK_LRM_OP_ERROR_FATAL: - case PCMK_LRM_OP_TIMEOUT: - case PCMK_LRM_OP_NOTSUPPORTED: -+ case PCMK_LRM_OP_NOT_CONNECTED: - - failure_strategy = get_action_on_fail(rsc, task_key, task, data_set); - if ((failure_strategy == action_fail_ignore) --- -1.8.3.1 - - -From fc135cb441fb7c66a44fbffe74dcae26c112be3f Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 5 Jun 2019 13:43:08 -0500 -Subject: [PATCH 05/13] Fix: controller: use op status, not rc, for execution - in invalid state - -Previously, if an action were requested while the controller cannot execute actions -(i.e. shutdown), the action's rc would be set to CRM_DIRECT_NACK_RC and its op -status to PCMK_LRM_OP_ERROR. - -This was undesirable for a couple reasons: rc should only be OCF return codes, -and it really is an operation execution status and not an operation result. 
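
Concretely, the old encoding conflated the two code spaces (a sketch using the constants involved; CRM_DIRECT_NACK_RC is the controller-internal value 99 retired below):

    /* Old behavior: refusal-to-execute was reported as if it were an OCF result */
    op->rc = CRM_DIRECT_NACK_RC;        // 99 -- not a valid OCF exit code
    op->op_status = PCMK_LRM_OP_ERROR;  // implies the action actually ran and failed
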
- -This changes the result to PCMK_OCF_UNKNOWN_ERROR with a new op status -PCMK_LRM_OP_INVALID. The new codes are mapped to the old ones for older -DCs that don't understand them. ---- - daemons/controld/controld_execd.c | 8 ++++++-- - daemons/controld/controld_fsa.h | 6 +----- - daemons/controld/controld_te_events.c | 13 ++++++------- - include/crm/services.h | 2 ++ - lib/common/operations.c | 1 + - lib/pengine/unpack.c | 3 ++- - 6 files changed, 18 insertions(+), 15 deletions(-) - -diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c -index ac215b6..a20f96a 100644 ---- a/daemons/controld/controld_execd.c -+++ b/daemons/controld/controld_execd.c -@@ -2254,8 +2254,8 @@ do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operat - operation, rsc->id, fsa_state2string(fsa_state), - is_set(fsa_input_register, R_SHUTDOWN)?"true":"false"); - -- op->rc = CRM_DIRECT_NACK_RC; -- op->op_status = PCMK_LRM_OP_ERROR; -+ op->rc = PCMK_OCF_UNKNOWN_ERROR; -+ op->op_status = PCMK_LRM_OP_INVALID; - send_direct_ack(NULL, NULL, rsc, op, rsc->id); - lrmd_free_event(op); - free(op_id); -@@ -2540,6 +2540,10 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, - op->op_status = PCMK_LRM_OP_ERROR; - op->rc = PCMK_OCF_CONNECTION_DIED; - break; -+ case PCMK_LRM_OP_INVALID: -+ op->op_status = PCMK_LRM_OP_ERROR; -+ op->rc = CRM_DIRECT_NACK_RC; -+ break; - default: - break; - } -diff --git a/daemons/controld/controld_fsa.h b/daemons/controld/controld_fsa.h -index 397a9cd..7527ed9 100644 ---- a/daemons/controld/controld_fsa.h -+++ b/daemons/controld/controld_fsa.h -@@ -426,11 +426,7 @@ enum crmd_fsa_input { - - # define R_IN_RECOVERY 0x80000000ULL - --/* -- * Magic RC used within the controller to indicate direct nacks -- * (operation is invalid in current state) -- */ --#define CRM_DIRECT_NACK_RC (99) -+#define CRM_DIRECT_NACK_RC (99) // Deprecated (see PCMK_LRM_OP_INVALID) - - enum crmd_fsa_cause { - C_UNKNOWN = 0, -diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c -index b7b48a4..d297241 100644 ---- a/daemons/controld/controld_te_events.c -+++ b/daemons/controld/controld_te_events.c -@@ -123,10 +123,8 @@ update_failcount(xmlNode * event, const char *event_node_uuid, int rc, - const char *on_uname = crm_peer_uname(event_node_uuid); - const char *origin = crm_element_value(event, XML_ATTR_ORIGIN); - -- /* Nothing needs to be done for success, lrm status refresh, -- * or direct nack (internal code for "busy, try again") -- */ -- if ((rc == CRM_DIRECT_NACK_RC) || (rc == target_rc)) { -+ // Nothing needs to be done for success or status refresh -+ if (rc == target_rc) { - return FALSE; - } else if (safe_str_eq(origin, "build_active_RAs")) { - crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh", -@@ -225,7 +223,7 @@ status_from_rc(crm_action_t * action, int orig_status, int rc, int target_rc) - return PCMK_LRM_OP_DONE; - } - -- if (rc != CRM_DIRECT_NACK_RC) { -+ if (orig_status != PCMK_LRM_OP_INVALID) { - const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); - const char *uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); - -@@ -541,8 +539,9 @@ process_graph_event(xmlNode *event, const char *event_node) - if (action && (rc == target_rc)) { - crm_trace("Processed update to %s: %s", id, magic); - } else { -- if (update_failcount(event, event_node, rc, target_rc, -- (transition_num == -1), ignore_failures)) { -+ if ((status != PCMK_LRM_OP_INVALID) -+ && 
update_failcount(event, event_node, rc, target_rc, -+ (transition_num == -1), ignore_failures)) { - desc = "failed"; - } - crm_info("Detected action (%d.%d) %s.%d=%s: %s", transition_num, -diff --git a/include/crm/services.h b/include/crm/services.h -index ca9470b..0771241 100644 ---- a/include/crm/services.h -+++ b/include/crm/services.h -@@ -127,6 +127,7 @@ enum op_status { - PCMK_LRM_OP_ERROR_FATAL, - PCMK_LRM_OP_NOT_INSTALLED, - PCMK_LRM_OP_NOT_CONNECTED, -+ PCMK_LRM_OP_INVALID, - }; - - enum nagios_exitcode { -@@ -339,6 +340,7 @@ gboolean services_alert_async(svc_action_t *action, - case PCMK_LRM_OP_ERROR:return "Error"; - case PCMK_LRM_OP_NOT_INSTALLED:return "Not installed"; - case PCMK_LRM_OP_NOT_CONNECTED:return "No executor connection"; -+ case PCMK_LRM_OP_INVALID:return "Cannot execute now"; - default:return "UNKNOWN!"; - } - } -diff --git a/lib/common/operations.c b/lib/common/operations.c -index c6b16cb..480bddc 100644 ---- a/lib/common/operations.c -+++ b/lib/common/operations.c -@@ -396,6 +396,7 @@ did_rsc_op_fail(lrmd_event_data_t * op, int target_rc) - case PCMK_LRM_OP_TIMEOUT: - case PCMK_LRM_OP_ERROR: - case PCMK_LRM_OP_NOT_CONNECTED: -+ case PCMK_LRM_OP_INVALID: - return TRUE; - break; - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 671f0c4..fb1ab60 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3163,7 +3163,7 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last - crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); - - CRM_CHECK(task != NULL, return FALSE); -- CRM_CHECK(status <= PCMK_LRM_OP_NOT_CONNECTED, return FALSE); -+ CRM_CHECK(status <= PCMK_LRM_OP_INVALID, return FALSE); - CRM_CHECK(status >= PCMK_LRM_OP_PENDING, return FALSE); - - if (safe_str_eq(task, CRMD_ACTION_NOTIFY) || -@@ -3305,6 +3305,7 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last - case PCMK_LRM_OP_TIMEOUT: - case PCMK_LRM_OP_NOTSUPPORTED: - case PCMK_LRM_OP_NOT_CONNECTED: -+ case PCMK_LRM_OP_INVALID: - - failure_strategy = get_action_on_fail(rsc, task_key, task, data_set); - if ((failure_strategy == action_fail_ignore) --- -1.8.3.1 - - -From f5ea526b211e95ece16acb0f72bfbbbda60ec437 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 12 Jun 2019 20:48:59 -0500 -Subject: [PATCH 06/13] Doc: libcrmcommon: document CRM_FEATURE_SET in API docs - ---- - include/crm/crm.h | 23 ++++++++++++++++++++++- - 1 file changed, 22 insertions(+), 1 deletion(-) - -diff --git a/include/crm/crm.h b/include/crm/crm.h -index 5f323e8..56a2048 100644 ---- a/include/crm/crm.h -+++ b/include/crm/crm.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2018 the Pacemaker project contributors -+ * Copyright 2004-2019 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -29,6 +29,27 @@ extern "C" { - - # include - -+/*! -+ * The CRM feature set assists with compatibility in mixed-version clusters. -+ * The major version number increases when nodes with different versions -+ * would not work (rolling upgrades are not allowed). The minor version -+ * number increases when mixed-version clusters are allowed only during -+ * rolling upgrades (a node with the oldest feature set will be elected DC). The -+ * minor-minor version number is ignored, but allows resource agents to detect -+ * cluster support for various features. -+ * -+ * The feature set also affects the processing of old saved CIBs (such as for -+ * many scheduler regression tests). 
-+ * -+ * Particular feature points currently used by pacemaker: -+ * -+ * >2.1: Operation updates include timing data -+ * >=3.0.5: XML v2 digests are created -+ * >=3.0.8: Peers do not need acks for cancellations -+ * >=3.0.9: DC will send its own shutdown request to all peers -+ * XML v2 patchsets are created by default -+ * >=3.0.13: Fail counts include operation name and interval -+ */ - # define CRM_FEATURE_SET "3.1.0" - - # define EOS '\0' --- -1.8.3.1 - - -From 1ff54a448b1178a34f2dd4f615221087e08468de Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 12 Jun 2019 20:51:21 -0500 -Subject: [PATCH 07/13] Feature: libcrmcommon: bump CRM feature set - -... for the new LRM op status codes ---- - include/crm/crm.h | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/include/crm/crm.h b/include/crm/crm.h -index 56a2048..cbf72d3 100644 ---- a/include/crm/crm.h -+++ b/include/crm/crm.h -@@ -49,8 +49,9 @@ extern "C" { - * >=3.0.9: DC will send its own shutdown request to all peers - * XML v2 patchsets are created by default - * >=3.0.13: Fail counts include operation name and interval -+ * >=3.2.0: DC supports PCMK_LRM_OP_INVALID and PCMK_LRM_OP_NOT_CONNECTED - */ --# define CRM_FEATURE_SET "3.1.0" -+# define CRM_FEATURE_SET "3.2.0" - - # define EOS '\0' - # define DIMOF(a) ((int) (sizeof(a)/sizeof(a[0])) ) --- -1.8.3.1 - - -From efc639cc835fba27fa5af4a0539e995d95660520 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 5 Jun 2019 15:12:20 -0500 -Subject: [PATCH 08/13] Low: libpe_status: fail connection resource if remote - action gets "not connected" - ---- - lib/pengine/unpack.c | 15 ++++++++++++++- - 1 file changed, 14 insertions(+), 1 deletion(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index fb1ab60..081df07 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3299,12 +3299,25 @@ unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last - unpack_rsc_op_failure(rsc, node, rc, xml_op, last_failure, on_fail, data_set); - break; - -+ case PCMK_LRM_OP_NOT_CONNECTED: -+ if (pe__is_guest_or_remote_node(node) -+ && is_set(node->details->remote_rsc->flags, pe_rsc_managed)) { -+ /* We should never get into a situation where a managed remote -+ * connection resource is considered OK but a resource action -+ * behind the connection gets a "not connected" status. But as a -+ * fail-safe in case a bug or unusual circumstances do lead to -+ * that, ensure the remote connection is considered failed. -+ */ -+ set_bit(node->details->remote_rsc->flags, pe_rsc_failed); -+ } -+ -+ // fall through -+ - case PCMK_LRM_OP_ERROR: - case PCMK_LRM_OP_ERROR_HARD: - case PCMK_LRM_OP_ERROR_FATAL: - case PCMK_LRM_OP_TIMEOUT: - case PCMK_LRM_OP_NOTSUPPORTED: -- case PCMK_LRM_OP_NOT_CONNECTED: - case PCMK_LRM_OP_INVALID: - - failure_strategy = get_action_on_fail(rsc, task_key, task, data_set); --- -1.8.3.1 - - -From dad337a96dfeca4dbde7bbd97f99f24956440fc2 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Sat, 8 Jun 2019 16:25:04 -0500 -Subject: [PATCH 09/13] Refactor: libpe_status: add function for checking - shutdown attribute - -... 
to reduce code duplication and allow further reuse ---- - include/crm/pengine/internal.h | 2 ++ - lib/pengine/unpack.c | 8 ++------ - lib/pengine/utils.c | 20 ++++++++++++++++++++ - 3 files changed, 24 insertions(+), 6 deletions(-) - -diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h -index fd55bb9..a2a3d52 100644 ---- a/include/crm/pengine/internal.h -+++ b/include/crm/pengine/internal.h -@@ -359,4 +359,6 @@ void pe__foreach_param_check(pe_working_set_t *data_set, - enum pe_check_parameters, - pe_working_set_t*)); - void pe__free_param_checks(pe_working_set_t *data_set); -+ -+bool pe__shutdown_requested(pe_node_t *node); - #endif -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 081df07..9d13a57 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -909,7 +909,6 @@ unpack_handle_remote_attrs(node_t *this_node, xmlNode *state, pe_working_set_t * - const char *resource_discovery_enabled = NULL; - xmlNode *attrs = NULL; - resource_t *rsc = NULL; -- const char *shutdown = NULL; - - if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) { - return; -@@ -931,8 +930,7 @@ unpack_handle_remote_attrs(node_t *this_node, xmlNode *state, pe_working_set_t * - attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE); - add_node_attrs(attrs, this_node, TRUE, data_set); - -- shutdown = pe_node_attribute_raw(this_node, XML_CIB_ATTR_SHUTDOWN); -- if (shutdown != NULL && safe_str_neq("0", shutdown)) { -+ if (pe__shutdown_requested(this_node)) { - crm_info("Node %s is shutting down", this_node->details->uname); - this_node->details->shutdown = TRUE; - if (rsc) { -@@ -1392,7 +1390,6 @@ gboolean - determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set_t * data_set) - { - gboolean online = FALSE; -- const char *shutdown = NULL; - const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); - - if (this_node == NULL) { -@@ -1402,9 +1399,8 @@ determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set - - this_node->details->shutdown = FALSE; - this_node->details->expected_up = FALSE; -- shutdown = pe_node_attribute_raw(this_node, XML_CIB_ATTR_SHUTDOWN); - -- if (shutdown != NULL && safe_str_neq("0", shutdown)) { -+ if (pe__shutdown_requested(this_node)) { - this_node->details->shutdown = TRUE; - - } else if (safe_str_eq(exp_state, CRMD_JOINSTATE_MEMBER)) { -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index 5b893f7..c5fd0f7 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -2510,3 +2510,23 @@ void pe_action_set_reason(pe_action_t *action, const char *reason, bool overwrit - } - } - } -+ -+/*! -+ * \internal -+ * \brief Check whether shutdown has been requested for a node -+ * -+ * \param[in] node Node to check -+ * -+ * \return TRUE if node has shutdown attribute set and nonzero, FALSE otherwise -+ * \note This differs from simply using node->details->shutdown in that it can -+ * be used before that has been determined (and in fact to determine it), -+ * and it can also be used to distinguish requested shutdown from implicit -+ * shutdown of remote nodes by virtue of their connection stopping. 
-+ */ -+bool -+pe__shutdown_requested(pe_node_t *node) -+{ -+ const char *shutdown = pe_node_attribute_raw(node, XML_CIB_ATTR_SHUTDOWN); -+ -+ return shutdown && strcmp(shutdown, "0"); -+} --- -1.8.3.1 - - -From 1e9903326a59f58d9dd2f2618d709f8aa61e41e9 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 5 Jun 2019 16:37:26 -0500 -Subject: [PATCH 10/13] Fix: scheduler: remote state is failed if node is - shutting down with connection failure - -When determining remote state, if the connection resource is failed and not -being started again, we consider the state to be unknown if the connection has -a reconnect interval, because we won't know whether the connection can be -recovered until the interval expires and we re-attempt connection. - -However, if the node is shutting down at the time, we won't re-attempt -connection, so consider the state failed in that case. (Note that we check the -actual shutdown node attribute, rather than node->details->shutdown, since that -is set for remote nodes whenever the connection is stopping.) - -This avoids a situation where actions that cannot succeed can be scheduled on a -remote node that's shutting down. ---- - lib/pacemaker/pcmk_sched_allocate.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c -index 3363a72..b7d1b48 100644 ---- a/lib/pacemaker/pcmk_sched_allocate.c -+++ b/lib/pacemaker/pcmk_sched_allocate.c -@@ -1972,7 +1972,8 @@ get_remote_node_state(pe_node_t *node) - - if ((remote_rsc->next_role == RSC_ROLE_STOPPED) - && remote_rsc->remote_reconnect_ms -- && node->details->remote_was_fenced) { -+ && node->details->remote_was_fenced -+ && !pe__shutdown_requested(node)) { - - /* We won't know whether the connection is recoverable until the - * reconnect interval expires and we reattempt connection. --- -1.8.3.1 - - -From ea70750d04219618b5feeda04443b27616e441a0 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 5 Jun 2019 16:43:19 -0500 -Subject: [PATCH 11/13] Fix: libpe_status: don't order implied stops relative - to a remote connection - -Actions behind a remote connection are ordered relative to any start or stop of -the remote connection. However, if the action is a stop implied due to fencing, -it does not require the remote connection, and the ordering should not be done. - -This avoids a delay in the remote connection recovery if it is failed, e.g. -previously the ordering would look like: - - fence remote node -> implied stop of resource on remote -> stop connection - -Now, the connection stop can proceed simultaneously with the remote node -fencing. ---- - lib/pacemaker/pcmk_sched_allocate.c | 11 +++++------ - 1 file changed, 5 insertions(+), 6 deletions(-) - -diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c -index b7d1b48..9f82c00 100644 ---- a/lib/pacemaker/pcmk_sched_allocate.c -+++ b/lib/pacemaker/pcmk_sched_allocate.c -@@ -2065,14 +2065,13 @@ apply_remote_ordering(action_t *action, pe_working_set_t *data_set) - pe_order_implies_first, data_set); - - } else if(state == remote_state_failed) { -- /* We would only be here if the resource is -- * running on the remote node. Since we have no -- * way to stop it, it is necessary to fence the -- * node. -+ /* The resource is active on the node, but since we don't have a -+ * valid connection, the only way to stop the resource is by -+ * fencing the node. 
There is no need to order the stop relative -+ * to the remote connection, since the stop will become implied -+ * by the fencing. - */ - pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable"); -- order_action_then_stop(action, remote_rsc, -- pe_order_implies_first, data_set); - - } else if(remote_rsc->next_role == RSC_ROLE_STOPPED) { - /* State must be remote_state_unknown or remote_state_stopped. --- -1.8.3.1 - - -From 091c367369b892d26fe0de99d35cf521b6249d10 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Sat, 8 Jun 2019 16:51:20 -0500 -Subject: [PATCH 12/13] Test: cts-scheduler: update regression tests for remote - connection ordering change - -Remote connection stops no longer have to wait for implied stops of resources -behind the connection. - -Unchanged from before, if the remote connection stops are implied themselves, -they can be confirmed immediately without waiting for their host's fencing, -because remote connections have "requires" set to "quorum" rather than -"fencing". ---- - cts/scheduler/order-expired-failure.dot | 1 - - cts/scheduler/order-expired-failure.exp | 6 +----- - cts/scheduler/order-expired-failure.summary | 2 +- - cts/scheduler/remote-connection-unrecoverable.dot | 2 -- - cts/scheduler/remote-connection-unrecoverable.exp | 9 +-------- - cts/scheduler/remote-connection-unrecoverable.summary | 2 +- - cts/scheduler/remote-fence-before-reconnect.dot | 1 - - cts/scheduler/remote-fence-before-reconnect.exp | 6 +----- - cts/scheduler/remote-fence-before-reconnect.summary | 2 +- - cts/scheduler/remote-recover-all.dot | 2 -- - cts/scheduler/remote-recover-all.exp | 12 ++---------- - cts/scheduler/remote-recover-all.summary | 4 ++-- - cts/scheduler/remote-recover-no-resources.dot | 1 - - cts/scheduler/remote-recover-no-resources.exp | 6 +----- - cts/scheduler/remote-recover-no-resources.summary | 2 +- - cts/scheduler/remote-recover-unknown.dot | 1 - - cts/scheduler/remote-recover-unknown.exp | 6 +----- - cts/scheduler/remote-recover-unknown.summary | 2 +- - 18 files changed, 14 insertions(+), 53 deletions(-) - -diff --git a/cts/scheduler/order-expired-failure.dot b/cts/scheduler/order-expired-failure.dot -index 2e9963b..5c21d5d 100644 ---- a/cts/scheduler/order-expired-failure.dot -+++ b/cts/scheduler/order-expired-failure.dot -@@ -4,7 +4,6 @@ digraph "g" { - "compute-unfence-trigger-clone_stop_0" [ style=bold color="green" fontcolor="orange"] - "compute-unfence-trigger-clone_stopped_0" [ style=bold color="green" fontcolor="orange"] - "compute-unfence-trigger_stop_0 overcloud-novacompute-1" -> "compute-unfence-trigger-clone_stopped_0" [ style = bold] --"compute-unfence-trigger_stop_0 overcloud-novacompute-1" -> "overcloud-novacompute-1_stop_0 controller-1" [ style = bold] - "compute-unfence-trigger_stop_0 overcloud-novacompute-1" [ style=bold color="green" fontcolor="orange"] - "ip-10.0.0.110_monitor_10000 controller-1" [ style=bold color="green" fontcolor="black"] - "ip-10.0.0.110_start_0 controller-1" -> "ip-10.0.0.110_monitor_10000 controller-1" [ style = bold] -diff --git a/cts/scheduler/order-expired-failure.exp b/cts/scheduler/order-expired-failure.exp -index c476bc2..4a50493 100644 ---- a/cts/scheduler/order-expired-failure.exp -+++ b/cts/scheduler/order-expired-failure.exp -@@ -9,11 +9,7 @@ - - - -- -- -- -- -- -+ - - - -diff --git a/cts/scheduler/order-expired-failure.summary b/cts/scheduler/order-expired-failure.summary -index c86bb91..2cf43ed 100644 ---- a/cts/scheduler/order-expired-failure.summary -+++ 
b/cts/scheduler/order-expired-failure.summary -@@ -52,6 +52,7 @@ Transition Summary: - * Stop compute-unfence-trigger:1 ( overcloud-novacompute-1 ) due to node availability - - Executing cluster transition: -+ * Resource action: overcloud-novacompute-1 stop on controller-1 - * Resource action: stonith-fence_compute-fence-nova stop on controller-2 - * Fencing overcloud-novacompute-1 (reboot) - * Cluster action: clear_failcount for overcloud-novacompute-1 on controller-1 -@@ -62,7 +63,6 @@ Executing cluster transition: - * Resource action: ip-10.0.0.110 monitor=10000 on controller-1 - * Pseudo action: compute-unfence-trigger_stop_0 - * Pseudo action: compute-unfence-trigger-clone_stopped_0 -- * Resource action: overcloud-novacompute-1 stop on controller-1 - Using the original execution date of: 2018-04-09 07:55:35Z - - Revised cluster status: -diff --git a/cts/scheduler/remote-connection-unrecoverable.dot b/cts/scheduler/remote-connection-unrecoverable.dot -index 7728425..1017d2b 100644 ---- a/cts/scheduler/remote-connection-unrecoverable.dot -+++ b/cts/scheduler/remote-connection-unrecoverable.dot -@@ -7,14 +7,12 @@ digraph "g" { - "remote1_stop_0 node1" [ style=bold color="green" fontcolor="orange"] - "rsc1_delete_0 remote1" -> "rsc1_start_0 node2" [ style = dashed] - "rsc1_delete_0 remote1" [ style=dashed color="red" fontcolor="black"] --"rsc1_monitor_0 node2" -> "remote1_stop_0 node1" [ style = bold] - "rsc1_monitor_0 node2" -> "rsc1_start_0 node2" [ style = bold] - "rsc1_monitor_0 node2" -> "rsc2-master_demote_0" [ style = bold] - "rsc1_monitor_0 node2" [ style=bold color="green" fontcolor="black"] - "rsc1_monitor_10000 node2" [ style=bold color="green" fontcolor="black"] - "rsc1_start_0 node2" -> "rsc1_monitor_10000 node2" [ style = bold] - "rsc1_start_0 node2" [ style=bold color="green" fontcolor="black"] --"rsc1_stop_0 remote1" -> "remote1_stop_0 node1" [ style = bold] - "rsc1_stop_0 remote1" -> "rsc1_delete_0 remote1" [ style = dashed] - "rsc1_stop_0 remote1" -> "rsc1_start_0 node2" [ style = bold] - "rsc1_stop_0 remote1" -> "rsc2-master_demote_0" [ style = bold] -diff --git a/cts/scheduler/remote-connection-unrecoverable.exp b/cts/scheduler/remote-connection-unrecoverable.exp -index 2c9357b..d57c106 100644 ---- a/cts/scheduler/remote-connection-unrecoverable.exp -+++ b/cts/scheduler/remote-connection-unrecoverable.exp -@@ -5,14 +5,7 @@ - - - -- -- -- -- -- -- -- -- -+ - - - -diff --git a/cts/scheduler/remote-connection-unrecoverable.summary b/cts/scheduler/remote-connection-unrecoverable.summary -index 23fa9ca..caff564 100644 ---- a/cts/scheduler/remote-connection-unrecoverable.summary -+++ b/cts/scheduler/remote-connection-unrecoverable.summary -@@ -21,6 +21,7 @@ Transition Summary: - * Stop rsc2:0 ( Master node1 ) due to node availability - - Executing cluster transition: -+ * Pseudo action: remote1_stop_0 - * Resource action: killer stop on node2 - * Resource action: rsc1 monitor on node2 - * Fencing node1 (reboot) -@@ -29,7 +30,6 @@ Executing cluster transition: - * Resource action: killer monitor=60000 on node2 - * Pseudo action: rsc1_stop_0 - * Pseudo action: rsc2-master_demote_0 -- * Pseudo action: remote1_stop_0 - * Resource action: rsc1 start on node2 - * Pseudo action: rsc2_demote_0 - * Pseudo action: rsc2-master_demoted_0 -diff --git a/cts/scheduler/remote-fence-before-reconnect.dot b/cts/scheduler/remote-fence-before-reconnect.dot -index 4ced43e..5812b7f 100644 ---- a/cts/scheduler/remote-fence-before-reconnect.dot -+++ b/cts/scheduler/remote-fence-before-reconnect.dot 
-@@ -3,7 +3,6 @@ - "fake2_monitor_10000 c7auto1" [ style=bold color="green" fontcolor="black"] - "fake2_start_0 c7auto1" -> "fake2_monitor_10000 c7auto1" [ style = bold] - "fake2_start_0 c7auto1" [ style=bold color="green" fontcolor="black"] --"fake2_stop_0 c7auto4" -> "c7auto4_stop_0 c7auto1" [ style = bold] - "fake2_stop_0 c7auto4" -> "fake2_start_0 c7auto1" [ style = bold] - "fake2_stop_0 c7auto4" [ style=bold color="green" fontcolor="orange"] - "stonith 'reboot' c7auto4" -> "fake2_start_0 c7auto1" [ style = bold] -diff --git a/cts/scheduler/remote-fence-before-reconnect.exp b/cts/scheduler/remote-fence-before-reconnect.exp -index f99d9ef..f506f85 100644 ---- a/cts/scheduler/remote-fence-before-reconnect.exp -+++ b/cts/scheduler/remote-fence-before-reconnect.exp -@@ -9,11 +9,7 @@ - - - -- -- -- -- -- -+ - - - -diff --git a/cts/scheduler/remote-fence-before-reconnect.summary b/cts/scheduler/remote-fence-before-reconnect.summary -index f61e18b..03eac20 100644 ---- a/cts/scheduler/remote-fence-before-reconnect.summary -+++ b/cts/scheduler/remote-fence-before-reconnect.summary -@@ -17,9 +17,9 @@ Transition Summary: - * Move fake2 ( c7auto4 -> c7auto1 ) - - Executing cluster transition: -+ * Resource action: c7auto4 stop on c7auto1 - * Fencing c7auto4 (reboot) - * Pseudo action: fake2_stop_0 -- * Resource action: c7auto4 stop on c7auto1 - * Resource action: fake2 start on c7auto1 - * Resource action: fake2 monitor=10000 on c7auto1 - -diff --git a/cts/scheduler/remote-recover-all.dot b/cts/scheduler/remote-recover-all.dot -index deed802..4128b10 100644 ---- a/cts/scheduler/remote-recover-all.dot -+++ b/cts/scheduler/remote-recover-all.dot -@@ -19,7 +19,6 @@ digraph "g" { - "galera_demote_0 galera-2" -> "galera_stop_0 galera-2" [ style = bold] - "galera_demote_0 galera-2" [ style=bold color="green" fontcolor="orange"] - "galera_monitor_10000 galera-0" [ style=bold color="green" fontcolor="black"] --"galera_stop_0 galera-2" -> "galera-2_stop_0 controller-1" [ style = bold] - "galera_stop_0 galera-2" -> "galera-master_stopped_0" [ style = bold] - "galera_stop_0 galera-2" [ style=bold color="green" fontcolor="orange"] - "haproxy-clone_stop_0" -> "haproxy-clone_stopped_0" [ style = bold] -@@ -60,7 +59,6 @@ digraph "g" { - "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-0" [ style = bold] - "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-2" [ style = bold] - "rabbitmq_post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"] --"rabbitmq_stop_0 messaging-1" -> "messaging-1_stop_0 controller-1" [ style = bold] - "rabbitmq_stop_0 messaging-1" -> "rabbitmq-clone_stopped_0" [ style = bold] - "rabbitmq_stop_0 messaging-1" [ style=bold color="green" fontcolor="orange"] - "redis-master_confirmed-post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"] -diff --git a/cts/scheduler/remote-recover-all.exp b/cts/scheduler/remote-recover-all.exp -index 8137ffb..0cb51f6 100644 ---- a/cts/scheduler/remote-recover-all.exp -+++ b/cts/scheduler/remote-recover-all.exp -@@ -5,11 +5,7 @@ - - - -- -- -- -- -- -+ - - - -@@ -57,11 +53,7 @@ - - - -- -- -- -- -- -+ - - - -diff --git a/cts/scheduler/remote-recover-all.summary b/cts/scheduler/remote-recover-all.summary -index 2ac0c6a..d095fdd 100644 ---- a/cts/scheduler/remote-recover-all.summary -+++ b/cts/scheduler/remote-recover-all.summary -@@ -56,7 +56,9 @@ Transition Summary: - * Move stonith-fence_ipmilan-5254005bdbb5 ( controller-1 -> controller-2 ) - - Executing cluster transition: -+ 
* Pseudo action: messaging-1_stop_0 - * Pseudo action: galera-0_stop_0 -+ * Pseudo action: galera-2_stop_0 - * Pseudo action: galera-master_demote_0 - * Pseudo action: redis-master_pre_notify_stop_0 - * Resource action: stonith-fence_ipmilan-525400bbf613 stop on controller-0 -@@ -94,7 +96,6 @@ Executing cluster transition: - * Resource action: stonith-fence_ipmilan-525400b4f6bd monitor=60000 on controller-0 - * Resource action: stonith-fence_ipmilan-5254005bdbb5 start on controller-2 - * Resource action: galera-0 monitor=20000 on controller-2 -- * Pseudo action: galera-2_stop_0 - * Resource action: rabbitmq notify on messaging-2 - * Resource action: rabbitmq notify on messaging-0 - * Pseudo action: rabbitmq_notified_0 -@@ -107,7 +108,6 @@ Executing cluster transition: - * Resource action: ip-172.17.1.17 start on controller-2 - * Resource action: ip-172.17.4.11 start on controller-2 - * Resource action: stonith-fence_ipmilan-5254005bdbb5 monitor=60000 on controller-2 -- * Pseudo action: messaging-1_stop_0 - * Pseudo action: redis_notified_0 - * Resource action: ip-172.17.1.14 monitor=10000 on controller-2 - * Resource action: ip-172.17.1.17 monitor=10000 on controller-2 -diff --git a/cts/scheduler/remote-recover-no-resources.dot b/cts/scheduler/remote-recover-no-resources.dot -index ef78aa6..a2f8ce0 100644 ---- a/cts/scheduler/remote-recover-no-resources.dot -+++ b/cts/scheduler/remote-recover-no-resources.dot -@@ -45,7 +45,6 @@ digraph "g" { - "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-0" [ style = bold] - "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-2" [ style = bold] - "rabbitmq_post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"] --"rabbitmq_stop_0 messaging-1" -> "messaging-1_stop_0 controller-1" [ style = bold] - "rabbitmq_stop_0 messaging-1" -> "rabbitmq-clone_stopped_0" [ style = bold] - "rabbitmq_stop_0 messaging-1" [ style=bold color="green" fontcolor="orange"] - "redis-master_confirmed-post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"] -diff --git a/cts/scheduler/remote-recover-no-resources.exp b/cts/scheduler/remote-recover-no-resources.exp -index 8a67c11..90470fb 100644 ---- a/cts/scheduler/remote-recover-no-resources.exp -+++ b/cts/scheduler/remote-recover-no-resources.exp -@@ -5,11 +5,7 @@ - - - -- -- -- -- -- -+ - - - -diff --git a/cts/scheduler/remote-recover-no-resources.summary b/cts/scheduler/remote-recover-no-resources.summary -index 89da784..18a989b 100644 ---- a/cts/scheduler/remote-recover-no-resources.summary -+++ b/cts/scheduler/remote-recover-no-resources.summary -@@ -54,6 +54,7 @@ Transition Summary: - * Move stonith-fence_ipmilan-5254005bdbb5 ( controller-1 -> controller-2 ) - - Executing cluster transition: -+ * Pseudo action: messaging-1_stop_0 - * Pseudo action: galera-0_stop_0 - * Pseudo action: galera-2_stop_0 - * Pseudo action: redis-master_pre_notify_stop_0 -@@ -92,7 +93,6 @@ Executing cluster transition: - * Pseudo action: ip-172.17.1.17_stop_0 - * Pseudo action: ip-172.17.4.11_stop_0 - * Resource action: stonith-fence_ipmilan-5254005bdbb5 monitor=60000 on controller-2 -- * Pseudo action: messaging-1_stop_0 - * Resource action: redis notify on controller-0 - * Resource action: redis notify on controller-2 - * Pseudo action: redis-master_confirmed-post_notify_stopped_0 -diff --git a/cts/scheduler/remote-recover-unknown.dot b/cts/scheduler/remote-recover-unknown.dot -index 5cd760b..29ab59f 100644 ---- a/cts/scheduler/remote-recover-unknown.dot -+++ 
b/cts/scheduler/remote-recover-unknown.dot -@@ -46,7 +46,6 @@ digraph "g" { - "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-0" [ style = bold] - "rabbitmq_post_notify_stonith_0" -> "rabbitmq_post_notify_stonith_0 messaging-2" [ style = bold] - "rabbitmq_post_notify_stonith_0" [ style=bold color="green" fontcolor="orange"] --"rabbitmq_stop_0 messaging-1" -> "messaging-1_stop_0 controller-1" [ style = bold] - "rabbitmq_stop_0 messaging-1" -> "rabbitmq-clone_stopped_0" [ style = bold] - "rabbitmq_stop_0 messaging-1" [ style=bold color="green" fontcolor="orange"] - "redis-master_confirmed-post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"] -diff --git a/cts/scheduler/remote-recover-unknown.exp b/cts/scheduler/remote-recover-unknown.exp -index ac6f004..82cb65f7 100644 ---- a/cts/scheduler/remote-recover-unknown.exp -+++ b/cts/scheduler/remote-recover-unknown.exp -@@ -5,11 +5,7 @@ - - - -- -- -- -- -- -+ - - - -diff --git a/cts/scheduler/remote-recover-unknown.summary b/cts/scheduler/remote-recover-unknown.summary -index 2c60713..4d7a411 100644 ---- a/cts/scheduler/remote-recover-unknown.summary -+++ b/cts/scheduler/remote-recover-unknown.summary -@@ -55,6 +55,7 @@ Transition Summary: - * Move stonith-fence_ipmilan-5254005bdbb5 ( controller-1 -> controller-2 ) - - Executing cluster transition: -+ * Pseudo action: messaging-1_stop_0 - * Pseudo action: galera-0_stop_0 - * Pseudo action: galera-2_stop_0 - * Pseudo action: redis-master_pre_notify_stop_0 -@@ -94,7 +95,6 @@ Executing cluster transition: - * Pseudo action: ip-172.17.1.17_stop_0 - * Pseudo action: ip-172.17.4.11_stop_0 - * Resource action: stonith-fence_ipmilan-5254005bdbb5 monitor=60000 on controller-2 -- * Pseudo action: messaging-1_stop_0 - * Resource action: redis notify on controller-0 - * Resource action: redis notify on controller-2 - * Pseudo action: redis-master_confirmed-post_notify_stopped_0 --- -1.8.3.1 - - -From 9a5f7952c921f7f8eea3c7b0af711df2995a4e60 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 7 Jun 2019 17:11:27 -0500 -Subject: [PATCH 13/13] Low: libpe_status: don't add /var/log mount to bundles - if user did - ---- - lib/pengine/bundle.c | 10 ++++++++-- - 1 file changed, 8 insertions(+), 2 deletions(-) - -diff --git a/lib/pengine/bundle.c b/lib/pengine/bundle.c -index b223f03..060e73a 100644 ---- a/lib/pengine/bundle.c -+++ b/lib/pengine/bundle.c -@@ -1027,6 +1027,7 @@ pe__unpack_bundle(pe_resource_t *rsc, pe_working_set_t *data_set) - xmlNode *xml_obj = NULL; - xmlNode *xml_resource = NULL; - pe__bundle_variant_data_t *bundle_data = NULL; -+ bool need_log_mount = TRUE; - - CRM_ASSERT(rsc != NULL); - pe_rsc_trace(rsc, "Processing resource %s...", rsc->id); -@@ -1151,6 +1152,9 @@ pe__unpack_bundle(pe_resource_t *rsc, pe_working_set_t *data_set) - - if (source && target) { - mount_add(bundle_data, source, target, options, flags); -+ if (strcmp(target, "/var/log") == 0) { -+ need_log_mount = FALSE; -+ } - } else { - pe_err("Invalid mount directive %s", ID(xml_child)); - } -@@ -1253,8 +1257,10 @@ pe__unpack_bundle(pe_resource_t *rsc, pe_working_set_t *data_set) - mount_add(bundle_data, DEFAULT_REMOTE_KEY_LOCATION, - DEFAULT_REMOTE_KEY_LOCATION, NULL, pe__bundle_mount_none); - -- mount_add(bundle_data, CRM_BUNDLE_DIR, "/var/log", NULL, -- pe__bundle_mount_subdir); -+ if (need_log_mount) { -+ mount_add(bundle_data, CRM_BUNDLE_DIR, "/var/log", NULL, -+ pe__bundle_mount_subdir); -+ } - - port = calloc(1, sizeof(pe__bundle_port_t)); - if(bundle_data->control_port) { 
--- -1.8.3.1 - diff --git a/SOURCES/009-shutdown-lock.patch b/SOURCES/009-shutdown-lock.patch new file mode 100644 index 0000000..ff73598 --- /dev/null +++ b/SOURCES/009-shutdown-lock.patch @@ -0,0 +1,139 @@ +From 8a0e19a7702f61622d06b1c473fb3d9a5924c8f4 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Jan 2020 18:07:18 -0600 +Subject: [PATCH 05/18] Refactor: liblrmd: new convenience function for + allocating lrmd_event_data_t + +--- + daemons/controld/controld_execd.c | 7 +------ + include/crm/lrmd.h | 2 ++ + lib/lrmd/lrmd_client.c | 34 +++++++++++++++++++++++++++++++++- + lib/pacemaker/pcmk_sched_transition.c | 7 +------ + lib/pacemaker/pcmk_trans_unpack.c | 9 +++------ + 5 files changed, 40 insertions(+), 19 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 82f2bf1..17cc8d6 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -1878,15 +1878,10 @@ construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op, const char *rsc_id, cons + + CRM_ASSERT(rsc_id && operation); + +- op = calloc(1, sizeof(lrmd_event_data_t)); +- CRM_ASSERT(op != NULL); +- ++ op = lrmd_new_event(rsc_id, operation, 0); + op->type = lrmd_event_exec_complete; +- op->op_type = strdup(operation); + op->op_status = PCMK_LRM_OP_PENDING; + op->rc = -1; +- op->rsc_id = strdup(rsc_id); +- op->interval_ms = 0; + op->timeout = 0; + op->start_delay = 0; + +diff --git a/include/crm/lrmd.h b/include/crm/lrmd.h +index cfa2925..3ad1f05 100644 +--- a/include/crm/lrmd.h ++++ b/include/crm/lrmd.h +@@ -248,6 +248,8 @@ typedef struct lrmd_event_data_s { + const char *exit_reason; + } lrmd_event_data_t; + ++lrmd_event_data_t *lrmd_new_event(const char *rsc_id, const char *task, ++ guint interval_ms); + lrmd_event_data_t *lrmd_copy_event(lrmd_event_data_t * event); + void lrmd_free_event(lrmd_event_data_t * event); + +diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c +index 2469c52..d16743d 100644 +--- a/lib/lrmd/lrmd_client.c ++++ b/lib/lrmd/lrmd_client.c +@@ -1,5 +1,7 @@ + /* +- * Copyright 2012-2018 David Vossel ++ * Copyright 2012-2020 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. +@@ -175,6 +177,36 @@ lrmd_key_value_freeall(lrmd_key_value_t * head) + } + } + ++/*! ++ * Create a new lrmd_event_data_t object ++ * ++ * \param[in] rsc_id ID of resource involved in event ++ * \param[in] task Action name ++ * \param[in] interval_ms Action interval ++ * ++ * \return Newly allocated and initialized lrmd_event_data_t ++ * \note This functions asserts on memory errors, so the return value is ++ * guaranteed to be non-NULL. The caller is responsible for freeing the ++ * result with lrmd_free_event(). 
++ */ ++lrmd_event_data_t * ++lrmd_new_event(const char *rsc_id, const char *task, guint interval_ms) ++{ ++ lrmd_event_data_t *event = calloc(1, sizeof(lrmd_event_data_t)); ++ ++ CRM_ASSERT(event != NULL); ++ if (rsc_id != NULL) { ++ event->rsc_id = strdup(rsc_id); ++ CRM_ASSERT(event->rsc_id != NULL); ++ } ++ if (task != NULL) { ++ event->op_type = strdup(task); ++ CRM_ASSERT(event->op_type != NULL); ++ } ++ event->interval_ms = interval_ms; ++ return event; ++} ++ + lrmd_event_data_t * + lrmd_copy_event(lrmd_event_data_t * event) + { +diff --git a/lib/pacemaker/pcmk_sched_transition.c b/lib/pacemaker/pcmk_sched_transition.c +index c415b75..1698c85 100644 +--- a/lib/pacemaker/pcmk_sched_transition.c ++++ b/lib/pacemaker/pcmk_sched_transition.c +@@ -131,12 +131,7 @@ create_op(xmlNode *cib_resource, const char *task, guint interval_ms, + lrmd_event_data_t *op = NULL; + xmlNode *xop = NULL; + +- op = calloc(1, sizeof(lrmd_event_data_t)); +- +- op->rsc_id = strdup(ID(cib_resource)); +- op->interval_ms = interval_ms; +- op->op_type = strdup(task); +- ++ op = lrmd_new_event(ID(cib_resource), task, interval_ms); + op->rc = outcome; + op->op_status = 0; + op->params = NULL; /* TODO: Fill me in */ +diff --git a/lib/pacemaker/pcmk_trans_unpack.c b/lib/pacemaker/pcmk_trans_unpack.c +index e57f386..3e53289 100644 +--- a/lib/pacemaker/pcmk_trans_unpack.c ++++ b/lib/pacemaker/pcmk_trans_unpack.c +@@ -298,12 +298,9 @@ convert_graph_action(xmlNode * resource, crm_action_t * action, int status, int + CRM_CHECK(action_resource != NULL, crm_log_xml_warn(action->xml, "Bad"); + return NULL); + +- op = calloc(1, sizeof(lrmd_event_data_t)); +- +- op->rsc_id = strdup(ID(action_resource)); +- op->interval_ms = action->interval_ms; +- op->op_type = strdup(crm_element_value(action->xml, XML_LRM_ATTR_TASK)); +- ++ op = lrmd_new_event(ID(action_resource), ++ crm_element_value(action->xml, XML_LRM_ATTR_TASK), ++ action->interval_ms); + op->rc = rc; + op->op_status = status; + op->t_run = time(NULL); +-- +1.8.3.1 + diff --git a/SOURCES/010-fix-history-handing-on-fenced-restart.patch b/SOURCES/010-fix-history-handing-on-fenced-restart.patch deleted file mode 100644 index eeaab70..0000000 --- a/SOURCES/010-fix-history-handing-on-fenced-restart.patch +++ /dev/null @@ -1,606 +0,0 @@ -From 14bb468ab404228cae34809420ef0763d3d54482 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Thu, 13 Jun 2019 15:31:24 +0200 -Subject: [PATCH] Fix: fence-history: fail leftover pending-actions after - fenced-restart - ---- - daemons/fenced/fenced_history.c | 15 +++++++++++++++ - daemons/fenced/fenced_remote.c | 6 +++--- - daemons/fenced/pacemaker-fenced.h | 8 ++++++++ - 3 files changed, 26 insertions(+), 3 deletions(-) - -diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c -index 7c129cc..b65b64c 100644 ---- a/daemons/fenced/fenced_history.c -+++ b/daemons/fenced/fenced_history.c -@@ -347,6 +347,21 @@ stonith_merge_in_history_list(GHashTable *history) - - updated = TRUE; - g_hash_table_iter_steal(&iter); -+ -+ if ((op->state != st_failed) && -+ (op->state != st_done) && -+ safe_str_eq(op->originator, stonith_our_uname)) { -+ crm_warn("received pending action we are supposed to be the " -+ "owner but it's not in our records -> fail it"); -+ op->state = st_failed; -+ op->completed = time(NULL); -+ /* use -EHOSTUNREACH to not introduce a new return-code that might -+ trigger unexpected results at other places and to prevent -+ remote_op_done from setting the delegate if not present -+ */ -+ 
stonith_bcast_result_to_peers(op, -EHOSTUNREACH); -+ } -+ - g_hash_table_insert(stonith_remote_op_list, op->id, op); - /* we could trim the history here but if we bail - * out after trim we might miss more recent entries -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 7d61249..5b86f0f 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -369,8 +369,8 @@ create_op_done_notify(remote_fencing_op_t * op, int rc) - return notify_data; - } - --static void --bcast_result_to_peers(remote_fencing_op_t * op, int rc) -+void -+stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc) - { - static int count = 0; - xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); -@@ -509,7 +509,7 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) - subt = crm_element_value(data, F_SUBTYPE); - if (dup == FALSE && safe_str_neq(subt, "broadcast")) { - /* Defer notification until the bcast message arrives */ -- bcast_result_to_peers(op, rc); -+ stonith_bcast_result_to_peers(op, rc); - goto remote_op_done_cleanup; - } - -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index 3a2edbb..a8531a6 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -149,6 +149,14 @@ typedef struct remote_fencing_op_s { - - } remote_fencing_op_t; - -+/*! -+ * \internal -+ * \brief Broadcast the result of an operation to the peers. -+ * \param op, Operation whose result should be broadcast -+ * \param rc, Result of the operation -+ */ -+void stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc); -+ - enum st_callback_flags { - st_callback_unknown = 0x0000, - st_callback_notify_fence = 0x0001, --- -1.8.3.1 - -From a0bc0d3ab5aed64e37b1caae746f5c421696df1b Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Fri, 14 Jun 2019 13:41:43 +0200 -Subject: [PATCH] Fix: controld-fencing: remove-notifications upon - connection-destroy - ---- - daemons/controld/controld_fencing.c | 9 ++++++++- - 1 file changed, 8 insertions(+), 1 deletion(-) - -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index 92336e9..b925bc5 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -403,7 +403,14 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) - } - - if (stonith_api) { -- stonith_api->state = stonith_disconnected; -+ /* the client API won't properly reconnect notifications -+ * if they are still in the table - so remove them -+ */ -+ stonith_api->cmds->remove_notification(st, T_STONITH_NOTIFY_DISCONNECT); -+ stonith_api->cmds->remove_notification(st, T_STONITH_NOTIFY_FENCE); -+ if (stonith_api->state != stonith_disconnected) { -+ stonith_api->cmds->disconnect(st); -+ } - } - - if (AM_I_DC) { --- -1.8.3.1 - -From 487cdd9e3ec6ab47fde5074acbb2ff564047d59c Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Tue, 18 Jun 2019 14:09:20 +0200 -Subject: [PATCH] Feature: fence-history: add notification upon history-synced - ---- - daemons/fenced/fenced_history.c | 5 +++++ - daemons/fenced/pacemaker-fenced.c | 3 +++ - daemons/fenced/pacemaker-fenced.h | 11 ++++++----- - include/crm/stonith-ng.h | 1 + - 4 files changed, 15 insertions(+), 5 deletions(-) - -diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c -index b65b64c..cd08d74 100644 ---- a/daemons/fenced/fenced_history.c -+++ b/daemons/fenced/fenced_history.c -@@ -420,6 +420,11 @@ 
stonith_fence_history(xmlNode *msg, xmlNode **output, - stonith_fence_history_cleanup(target, - crm_element_value(msg, F_STONITH_CALLID) != NULL); - } else if (options & st_opt_broadcast) { -+ /* there is no clear sign atm for when a history sync -+ is done so send a notification for anything -+ that smells like history-sync -+ */ -+ do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY_SYNCED, 0, NULL); - if (crm_element_value(msg, F_STONITH_CALLID)) { - /* this is coming from the stonith-API - * -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index 7e9bb07..7a87f93 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -279,6 +279,9 @@ get_stonith_flag(const char *name) - } else if (safe_str_eq(name, T_STONITH_NOTIFY_HISTORY)) { - return st_callback_notify_history; - -+ } else if (safe_str_eq(name, T_STONITH_NOTIFY_HISTORY_SYNCED)) { -+ return st_callback_notify_history_synced; -+ - } - return st_callback_unknown; - } -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index a8531a6..583cb47 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -158,11 +158,12 @@ typedef struct remote_fencing_op_s { - void stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc); - - enum st_callback_flags { -- st_callback_unknown = 0x0000, -- st_callback_notify_fence = 0x0001, -- st_callback_device_add = 0x0004, -- st_callback_device_del = 0x0010, -- st_callback_notify_history = 0x0020 -+ st_callback_unknown = 0x0000, -+ st_callback_notify_fence = 0x0001, -+ st_callback_device_add = 0x0004, -+ st_callback_device_del = 0x0010, -+ st_callback_notify_history = 0x0020, -+ st_callback_notify_history_synced = 0x0040 - }; - - /* -diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h -index b640732..418a03c 100644 ---- a/include/crm/stonith-ng.h -+++ b/include/crm/stonith-ng.h -@@ -29,6 +29,7 @@ extern "C" { - # define T_STONITH_NOTIFY_DISCONNECT "st_notify_disconnect" - # define T_STONITH_NOTIFY_FENCE "st_notify_fence" - # define T_STONITH_NOTIFY_HISTORY "st_notify_history" -+# define T_STONITH_NOTIFY_HISTORY_SYNCED "st_notify_history_synced" - - /* *INDENT-OFF* */ - enum stonith_state { --- -1.8.3.1 - -From 03c4455fced74f093deb782198b1ba3076e52015 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Tue, 18 Jun 2019 14:12:27 +0200 -Subject: [PATCH] Fix: fence-history: resync fence-history after fenced crash - -Setting up a 30s fallback timer to trigger history-sync if the -sync via DC doesn't happen ---- - daemons/controld/controld_callbacks.c | 2 +- - daemons/controld/controld_control.c | 2 + - daemons/controld/controld_fencing.c | 86 ++++++++++++++++++++++++++++++----- - daemons/controld/controld_fencing.h | 3 +- - 4 files changed, 79 insertions(+), 14 deletions(-) - -diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c -index 3ce7470..48225ac 100644 ---- a/daemons/controld/controld_callbacks.c -+++ b/daemons/controld/controld_callbacks.c -@@ -211,7 +211,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d - - } else if(AM_I_DC) { - if (appeared) { -- te_trigger_stonith_history_sync(); -+ te_trigger_stonith_history_sync(FALSE); - } else { - erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); - } -diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c -index e99d605..f3bb20f 100644 ---- a/daemons/controld/controld_control.c 
-+++ b/daemons/controld/controld_control.c -@@ -259,6 +259,8 @@ crmd_exit(crm_exit_t exit_code) - crm_timer_stop(wait_timer); - crm_timer_stop(recheck_timer); - -+ te_cleanup_stonith_history_sync(NULL, TRUE); -+ - free(transition_timer); transition_timer = NULL; - free(integration_timer); integration_timer = NULL; - free(finalization_timer); finalization_timer = NULL; -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index b925bc5..22fa727 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -20,6 +20,9 @@ - # include - #endif - -+static void -+tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event); -+ - /* - * stonith failure counting - * -@@ -394,6 +397,8 @@ fail_incompletable_stonith(crm_graph_t *graph) - static void - tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) - { -+ te_cleanup_stonith_history_sync(st, FALSE); -+ - if (is_set(fsa_input_register, R_ST_REQUIRED)) { - crm_crit("Fencing daemon connection failed"); - mainloop_set_trigger(stonith_reconnect); -@@ -406,11 +411,12 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) - /* the client API won't properly reconnect notifications - * if they are still in the table - so remove them - */ -- stonith_api->cmds->remove_notification(st, T_STONITH_NOTIFY_DISCONNECT); -- stonith_api->cmds->remove_notification(st, T_STONITH_NOTIFY_FENCE); - if (stonith_api->state != stonith_disconnected) { - stonith_api->cmds->disconnect(st); - } -+ stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT); -+ stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE); -+ stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED); - } - - if (AM_I_DC) { -@@ -622,7 +628,12 @@ te_connect_stonith(gpointer user_data) - stonith_api->cmds->register_notification(stonith_api, - T_STONITH_NOTIFY_FENCE, - tengine_stonith_notify); -+ stonith_api->cmds->register_notification(stonith_api, -+ T_STONITH_NOTIFY_HISTORY_SYNCED, -+ tengine_stonith_history_synced); -+ te_trigger_stonith_history_sync(TRUE); - } -+ - return TRUE; - } - -@@ -649,7 +660,12 @@ controld_disconnect_fencer(bool destroy) - // Prevent fencer connection from coming up again - clear_bit(fsa_input_register, R_ST_REQUIRED); - -- stonith_api->cmds->disconnect(stonith_api); -+ if (stonith_api->state != stonith_disconnected) { -+ stonith_api->cmds->disconnect(stonith_api); -+ } -+ stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT); -+ stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE); -+ stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED); - } - if (destroy) { - if (stonith_api) { -@@ -673,6 +689,7 @@ do_stonith_history_sync(gpointer user_data) - if (stonith_api && (stonith_api->state != stonith_disconnected)) { - stonith_history_t *history = NULL; - -+ te_cleanup_stonith_history_sync(stonith_api, FALSE); - stonith_api->cmds->history(stonith_api, - st_opt_sync_call | st_opt_broadcast, - NULL, &history, 5); -@@ -845,7 +862,33 @@ te_fence_node(crm_graph_t *graph, crm_action_t *action) - */ - - static crm_trigger_t *stonith_history_sync_trigger = NULL; --static mainloop_timer_t *stonith_history_sync_timer = NULL; -+static mainloop_timer_t *stonith_history_sync_timer_short = NULL; -+static mainloop_timer_t *stonith_history_sync_timer_long = NULL; -+ -+void -+te_cleanup_stonith_history_sync(stonith_t *st, bool 
free_timers) -+{ -+ if (free_timers) { -+ mainloop_timer_del(stonith_history_sync_timer_short); -+ stonith_history_sync_timer_short = NULL; -+ mainloop_timer_del(stonith_history_sync_timer_long); -+ stonith_history_sync_timer_long = NULL; -+ } else { -+ mainloop_timer_stop(stonith_history_sync_timer_short); -+ mainloop_timer_stop(stonith_history_sync_timer_long); -+ } -+ -+ if (st) { -+ st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED); -+ } -+} -+ -+static void -+tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event) -+{ -+ te_cleanup_stonith_history_sync(st, FALSE); -+ crm_debug("Fence-history synced - cancel all timers"); -+} - - static gboolean - stonith_history_sync_set_trigger(gpointer user_data) -@@ -855,11 +898,18 @@ stonith_history_sync_set_trigger(gpointer user_data) - } - - void --te_trigger_stonith_history_sync(void) -+te_trigger_stonith_history_sync(bool long_timeout) - { - /* trigger a sync in 5s to give more nodes the - * chance to show up so that we don't create - * unnecessary stonith-history-sync traffic -+ * -+ * the long timeout of 30s is there as a fallback -+ * so that after a successful connection to fenced -+ * we will wait for 30s for the DC to trigger a -+ * history-sync -+ * if this doesn't happen we trigger a sync locally -+ * (e.g. fenced segfaults and is restarted by pacemakerd) - */ - - /* as we are finally checking the stonith-connection -@@ -873,14 +923,26 @@ te_trigger_stonith_history_sync(void) - do_stonith_history_sync, NULL); - } - -- if(stonith_history_sync_timer == NULL) { -- stonith_history_sync_timer = -- mainloop_timer_add("history_sync", 5000, -- FALSE, stonith_history_sync_set_trigger, -- NULL); -+ if (long_timeout) { -+ if(stonith_history_sync_timer_long == NULL) { -+ stonith_history_sync_timer_long = -+ mainloop_timer_add("history_sync_long", 30000, -+ FALSE, stonith_history_sync_set_trigger, -+ NULL); -+ } -+ crm_info("Fence history will be synchronized cluster-wide within 30 seconds"); -+ mainloop_timer_start(stonith_history_sync_timer_long); -+ } else { -+ if(stonith_history_sync_timer_short == NULL) { -+ stonith_history_sync_timer_short = -+ mainloop_timer_add("history_sync_short", 5000, -+ FALSE, stonith_history_sync_set_trigger, -+ NULL); -+ } -+ crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); -+ mainloop_timer_start(stonith_history_sync_timer_short); - } -- crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); -- mainloop_timer_start(stonith_history_sync_timer); -+ - } - - /* end stonith history synchronization functions */ -diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h -index 8f7f19b..2fe6d88 100644 ---- a/daemons/controld/controld_fencing.h -+++ b/daemons/controld/controld_fencing.h -@@ -29,6 +29,7 @@ void purge_stonith_cleanup(void); - void execute_stonith_cleanup(void); - - // stonith history synchronization --void te_trigger_stonith_history_sync(void); -+void te_trigger_stonith_history_sync(bool long_timeout); -+void te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers); - - #endif --- -1.8.3.1 - -From 2b038831edf6dd345c3f39f0fc27cfbf9503f512 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Tue, 18 Jun 2019 21:54:49 +0200 -Subject: [PATCH] Fix: st_client: make safe to remove notifications from - notifications - -While cycling over the notification-list just mark for deletion -and delete afterwards. 
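The pattern here is generic: while anything is iterating the list, removal only marks the entry, and the last iterator to finish sweeps the marked entries. A minimal standalone sketch of the same mark-and-sweep idea using GLib (hypothetical names, not the actual st_client.c code):

#include <glib.h>

typedef struct {
    int id;
    gboolean delete;    /* marked for removal while the list is in use */
} entry_t;

static GList *entries = NULL;
static int iter_refcnt = 0;         /* >0 while someone is walking the list */
static gboolean have_deletes = FALSE;

/* Remove an entry: free it immediately if nobody is iterating,
 * otherwise just mark it so the active iteration skips it
 */
static void
entry_remove(entry_t *e)
{
    if (iter_refcnt > 0) {
        e->delete = TRUE;
        have_deletes = TRUE;
    } else {
        entries = g_list_remove(entries, e);
        g_free(e);
    }
}

/* Walk the list, then sweep anything that was marked during the walk */
static void
entries_foreach(GFunc fn, gpointer user_data)
{
    iter_refcnt++;
    g_list_foreach(entries, fn, user_data);
    iter_refcnt--;

    if ((iter_refcnt == 0) && have_deletes) {
        GList *item = entries;

        have_deletes = FALSE;
        while (item != NULL) {
            GList *next = g_list_next(item);
            entry_t *e = item->data;

            if (e->delete) {
                g_free(e);
                entries = g_list_delete_link(entries, item);
            }
            item = next;
        }
    }
}

This is also why lookups must treat marked entries as not findable: an entry marked for deletion is logically gone even though it is still linked into the list.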
---- - lib/fencing/st_client.c | 58 +++++++++++++++++++++++++++++++++++++++++++++---- - 1 file changed, 54 insertions(+), 4 deletions(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 629887a..ba23ac5 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -67,6 +67,8 @@ typedef struct stonith_private_s { - mainloop_io_t *source; - GHashTable *stonith_op_callback_table; - GList *notify_list; -+ int notify_refcnt; -+ bool notify_deletes; - - void (*op_callback) (stonith_t * st, stonith_callback_data_t * data); - -@@ -77,6 +79,7 @@ typedef struct stonith_notify_client_s { - const char *obj_id; /* implement one day */ - const char *obj_type; /* implement one day */ - void (*notify) (stonith_t * st, stonith_event_t * e); -+ bool delete; - - } stonith_notify_client_t; - -@@ -211,6 +214,38 @@ log_action(stonith_action_t *action, pid_t pid) - } - } - -+/* when cycling through the list we don't want to delete items -+ so just mark them and when we know nobody is using the list -+ loop over it to remove the marked items -+ */ -+static void -+foreach_notify_entry (stonith_private_t *private, -+ GFunc func, -+ gpointer user_data) -+{ -+ private->notify_refcnt++; -+ g_list_foreach(private->notify_list, func, user_data); -+ private->notify_refcnt--; -+ if ((private->notify_refcnt == 0) && -+ private->notify_deletes) { -+ GList *list_item = private->notify_list; -+ -+ private->notify_deletes = FALSE; -+ while (list_item != NULL) -+ { -+ stonith_notify_client_t *list_client = list_item->data; -+ GList *next = g_list_next(list_item); -+ -+ if (list_client->delete) { -+ free(list_client); -+ private->notify_list = -+ g_list_delete_link(private->notify_list, list_item); -+ } -+ list_item = next; -+ } -+ } -+} -+ - static void - stonith_connection_destroy(gpointer user_data) - { -@@ -230,7 +265,7 @@ stonith_connection_destroy(gpointer user_data) - crm_xml_add(blob.xml, F_TYPE, T_STONITH_NOTIFY); - crm_xml_add(blob.xml, F_SUBTYPE, T_STONITH_NOTIFY_DISCONNECT); - -- g_list_foreach(native->notify_list, stonith_send_notification, &blob); -+ foreach_notify_entry(native, stonith_send_notification, &blob); - free_xml(blob.xml); - } - -@@ -1140,6 +1175,10 @@ stonithlib_GCompareFunc(gconstpointer a, gconstpointer b) - const stonith_notify_client_t *a_client = a; - const stonith_notify_client_t *b_client = b; - -+ if (a_client->delete || b_client->delete) { -+ /* make entries marked for deletion not findable */ -+ return -1; -+ } - CRM_CHECK(a_client->event != NULL && b_client->event != NULL, return 0); - rc = strcmp(a_client->event, b_client->event); - if (rc == 0) { -@@ -1394,7 +1433,7 @@ stonith_dispatch_internal(const char *buffer, ssize_t length, gpointer userdata) - stonith_perform_callback(st, blob.xml, 0, 0); - - } else if (safe_str_eq(type, T_STONITH_NOTIFY)) { -- g_list_foreach(private->notify_list, stonith_send_notification, &blob); -+ foreach_notify_entry(private, stonith_send_notification, &blob); - } else if (safe_str_eq(type, T_STONITH_TIMEOUT_VALUE)) { - int call_id = 0; - int timeout = 0; -@@ -1592,8 +1631,13 @@ stonith_api_del_notification(stonith_t * stonith, const char *event) - if (list_item != NULL) { - stonith_notify_client_t *list_client = list_item->data; - -- private->notify_list = g_list_remove(private->notify_list, list_client); -- free(list_client); -+ if (private->notify_refcnt) { -+ list_client->delete = TRUE; -+ private->notify_deletes = TRUE; -+ } else { -+ private->notify_list = g_list_remove(private->notify_list, list_client); -+ 
free(list_client); -+ } - - crm_trace("Removed callback"); - -@@ -1754,6 +1798,10 @@ stonith_send_notification(gpointer data, gpointer user_data) - crm_warn("Skipping callback - NULL callback client"); - return; - -+ } else if (entry->delete) { -+ crm_trace("Skipping callback - marked for deletion"); -+ return; -+ - } else if (entry->notify == NULL) { - crm_warn("Skipping callback - NULL callback"); - return; -@@ -2037,6 +2085,8 @@ stonith_api_new(void) - private->stonith_op_callback_table = g_hash_table_new_full(g_direct_hash, g_direct_equal, - NULL, stonith_destroy_op_callback); - private->notify_list = NULL; -+ private->notify_refcnt = 0; -+ private->notify_deletes = FALSE; - - new_stonith->call_id = 1; - new_stonith->state = stonith_disconnected; --- -1.8.3.1 - -From 03765b7803f935f0db149843a0b90aa9c872d922 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Fri, 21 Jun 2019 14:13:10 +0200 -Subject: [PATCH] Test: CTS: new pattern to identify fenced reconnected - -Now that we are removing notifications upon disconnect a duplicate -notification can't be used as sign for reconnection any more. ---- - cts/patterns.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cts/patterns.py b/cts/patterns.py -index 1b86ee7..8de67b1 100644 ---- a/cts/patterns.py -+++ b/cts/patterns.py -@@ -303,7 +303,7 @@ class crm_corosync(BasePatterns): - self.components["pacemaker-fenced"] = [ - r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", - r"Fencing daemon connection failed", -- r"pacemaker-controld.*:\s*warn.*:\s*Callback already present", -+ r"pacemaker-controld.*Fencer successfully connected", - ] - self.components["pacemaker-fenced-ignore"] = [ - r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", --- -1.8.3.1 - -From c45c98cd77cb3e0913bcdb18fd6b116c3a25285d Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Fri, 21 Jun 2019 16:40:47 +0200 -Subject: [PATCH] Fix: controld-fencing: add notice-log for successful - fencer-connect - ---- - daemons/controld/controld_fencing.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index 22fa727..2428168 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -632,6 +632,7 @@ te_connect_stonith(gpointer user_data) - T_STONITH_NOTIFY_HISTORY_SYNCED, - tengine_stonith_history_synced); - te_trigger_stonith_history_sync(TRUE); -+ crm_notice("Fencer successfully connected"); - } - - return TRUE; --- -1.8.3.1 - diff --git a/SOURCES/010-shutdown-lock.patch b/SOURCES/010-shutdown-lock.patch new file mode 100644 index 0000000..6304246 --- /dev/null +++ b/SOURCES/010-shutdown-lock.patch @@ -0,0 +1,129 @@ +From 50b0944c8add3f16b8190e75a6d06c3473c12a8f Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 21 Nov 2019 14:48:02 -0600 +Subject: [PATCH 06/18] Feature: scheduler: add shutdown lock cluster options + +This commit adds shutdown-lock and shutdown-lock-limit options (just the +options, not the feature itself). + +shutdown-lock defaults to false, which preserves current behavior. The intended +purpose of setting it to true is to *prevent* recovery of a node's resources +elsewhere when the node is cleanly shut down, until the node rejoins. If +shutdown-lock-limit is set to a nonzero time duration, the cluster will +be allowed to recover the resources if the node has not rejoined within this +time. 
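For illustration, a cluster that wants resources held through a 30-minute maintenance window might set (hypothetical CIB snippet; the values are examples, only the option names come from this patch):

<crm_config>
  <cluster_property_set id="cib-bootstrap-options">
    <nvpair id="opt-shutdown-lock"
            name="shutdown-lock" value="true"/>
    <nvpair id="opt-shutdown-lock-limit"
            name="shutdown-lock-limit" value="30min"/>
  </cluster_property_set>
</crm_config>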
+ +The use case is when rebooting a node (such as for software updates) is done by +cluster-unaware system administrators during scheduled maintenance windows, +resources prefer specific nodes, and resource recovery time is high. +--- + include/crm/msg_xml.h | 4 +++- + include/crm/pengine/pe_types.h | 2 ++ + lib/pengine/common.c | 24 +++++++++++++++++++++++- + lib/pengine/unpack.c | 10 ++++++++++ + 4 files changed, 38 insertions(+), 2 deletions(-) + +diff --git a/include/crm/msg_xml.h b/include/crm/msg_xml.h +index d56e40c..d0cdf6c 100644 +--- a/include/crm/msg_xml.h ++++ b/include/crm/msg_xml.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -346,6 +346,8 @@ extern "C" { + # define XML_CONFIG_ATTR_FORCE_QUIT "shutdown-escalation" + # define XML_CONFIG_ATTR_RECHECK "cluster-recheck-interval" + # define XML_CONFIG_ATTR_FENCE_REACTION "fence-reaction" ++# define XML_CONFIG_ATTR_SHUTDOWN_LOCK "shutdown-lock" ++# define XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT "shutdown-lock-limit" + + # define XML_ALERT_ATTR_PATH "path" + # define XML_ALERT_ATTR_TIMEOUT "timeout" +diff --git a/include/crm/pengine/pe_types.h b/include/crm/pengine/pe_types.h +index 23e1c46..8a735a3 100644 +--- a/include/crm/pengine/pe_types.h ++++ b/include/crm/pengine/pe_types.h +@@ -102,6 +102,7 @@ enum pe_find { + # define pe_flag_start_failure_fatal 0x00001000ULL + # define pe_flag_remove_after_stop 0x00002000ULL + # define pe_flag_startup_fencing 0x00004000ULL ++# define pe_flag_shutdown_lock 0x00008000ULL + + # define pe_flag_startup_probes 0x00010000ULL + # define pe_flag_have_status 0x00020000ULL +@@ -167,6 +168,7 @@ struct pe_working_set_s { + GList *stop_needed; // Containers that need stop actions + time_t recheck_by; // Hint to controller to re-run scheduler by this time + int ninstances; // Total number of resource instances ++ guint shutdown_lock;// How long (seconds) to lock resources to shutdown node + }; + + enum pe_check_parameters { +diff --git a/lib/pengine/common.c b/lib/pengine/common.c +index da39c99..e72a033 100644 +--- a/lib/pengine/common.c ++++ b/lib/pengine/common.c +@@ -1,5 +1,7 @@ + /* +- * Copyright 2004-2018 Andrew Beekhof ++ * Copyright 2004-2020 the Pacemaker project contributors ++ * ++ * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. +@@ -85,6 +87,26 @@ static pe_cluster_option pe_opts[] = { + "When set to TRUE, the cluster will immediately ban a resource from a node if it fails to start there. When FALSE, the cluster will instead check the resource's fail count against its migration-threshold." }, + { "enable-startup-probes", NULL, "boolean", NULL, "true", &check_boolean, + "Should the cluster check for active resources during startup", NULL }, ++ { ++ XML_CONFIG_ATTR_SHUTDOWN_LOCK, ++ NULL, "boolean", NULL, "false", &check_boolean, ++ "Whether to lock resources to a cleanly shut down node", ++ "When true, resources active on a node when it is cleanly shut down " ++ "are kept \"locked\" to that node (not allowed to run elsewhere) " ++ "until they start again on that node after it rejoins (or for at " ++ "most shutdown-lock-limit, if set). Stonith resources and " ++ "Pacemaker Remote connections are never locked. 
Clone and bundle " ++ "instances and the master role of promotable clones are currently " ++ "never locked, though support could be added in a future release." ++ }, ++ { ++ XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT, ++ NULL, "time", NULL, "0", &check_timer, ++ "Do not lock resources to a cleanly shut down node longer than this", ++ "If shutdown-lock is true and this is set to a nonzero time duration, " ++ "shutdown locks will expire after this much time has passed since " ++ "the shutdown was initiated, even if the node has not rejoined." ++ }, + + /* Stonith Options */ + { "stonith-enabled", NULL, "boolean", NULL, "true", &check_boolean, +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index c9fc672..8c0d72a 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -319,6 +319,16 @@ unpack_config(xmlNode * config, pe_working_set_t * data_set) + data_set->placement_strategy = pe_pref(data_set->config_hash, "placement-strategy"); + crm_trace("Placement strategy: %s", data_set->placement_strategy); + ++ set_config_flag(data_set, "shutdown-lock", pe_flag_shutdown_lock); ++ crm_trace("Resources will%s be locked to cleanly shut down nodes", ++ (is_set(data_set->flags, pe_flag_shutdown_lock)? "" : " not")); ++ if (is_set(data_set->flags, pe_flag_shutdown_lock)) { ++ value = pe_pref(data_set->config_hash, ++ XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT); ++ data_set->shutdown_lock = crm_parse_interval_spec(value) / 1000; ++ crm_trace("Shutdown locks expire after %us", data_set->shutdown_lock); ++ } ++ + return TRUE; + } + +-- +1.8.3.1 + diff --git a/SOURCES/011-crm_report.patch b/SOURCES/011-crm_report.patch deleted file mode 100644 index b5b99d8..0000000 --- a/SOURCES/011-crm_report.patch +++ /dev/null @@ -1,200 +0,0 @@ -From 13809f57913cc5797d2a9d1ad19eb561a5113845 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 23 Aug 2019 17:28:49 -0500 -Subject: [PATCH 1/5] Fix: tools: correct crm_report argument parsing - -There were a few instances where crm_report's option names passed to getopt, -option names listed in help, and option names checked for did not match. - -Where getopt and checks matched, I went with that, so that anything that -worked before continues to work. 
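The underlying hazard is that the option table passed to getopt, the help text, and the option handlers are three hand-maintained copies of the same names. A minimal C sketch of the single-source-of-truth alternative (hypothetical options; crm_report itself is shell and uses getopt(1), so this shows the idea, not the fix):

#include <getopt.h>
#include <stdio.h>

/* One table drives both parsing and help, so a renamed option
 * cannot silently diverge between the two
 */
static const struct {
    struct option opt;
    const char *help;
} options[] = {
    { { "cts",       required_argument, NULL, 'T' }, "CTS test numbers" },
    { { "logfile",   required_argument, NULL, 'l' }, "additional log file to collect" },
    { { "max-depth", required_argument, NULL, 'D' }, "search depth when locating files" },
    { { NULL, 0, NULL, 0 }, NULL }
};

static void
print_help(void)
{
    for (int i = 0; options[i].opt.name != NULL; i++) {
        printf("  -%c, --%s\t%s\n",
               options[i].opt.val, options[i].opt.name, options[i].help);
    }
}

int
main(int argc, char **argv)
{
    struct option longopts[sizeof(options) / sizeof(options[0])];
    int c;

    for (size_t i = 0; i < sizeof(options) / sizeof(options[0]); i++) {
        longopts[i] = options[i].opt;
    }
    while ((c = getopt_long(argc, argv, "T:l:D:h", longopts, NULL)) != -1) {
        if (c == 'h' || c == '?') {
            print_help();
            return (c == 'h')? 0 : 1;
        }
        /* ... handle 'T', 'l', 'D' here ... */
    }
    return 0;
}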
---- - tools/crm_report.in | 10 ++++++---- - 1 file changed, 6 insertions(+), 4 deletions(-) - -diff --git a/tools/crm_report.in b/tools/crm_report.in -index d1bc425..0ccec56 100644 ---- a/tools/crm_report.in -+++ b/tools/crm_report.in -@@ -10,7 +10,7 @@ - - TEMP=`@GETOPT_PATH@ \ - -o hv?xl:f:t:n:T:L:p:c:dSCu:D:MVse: \ -- --long help,cts:,cts-log:,dest:,node:,nodes:,from:,to:,sos-mode,logfile:,as-directory,single-node,cluster:,user:,max-depth:,version,features,rsh: \ -+ --long help,corosync,cts:,cts-log:,dest:,node:,nodes:,from:,to:,sos-mode,logfile:,as-directory,single-node,cluster:,user:,max-depth:,version,features,rsh: \ - -n 'crm_report' -- "$@"` - # The quotes around $TEMP are essential - eval set -- "$TEMP" -@@ -44,6 +44,7 @@ Required option: - - Options: - -V increase verbosity (may be specified multiple times) -+ -h, --help display this message - -v, --version display software version - --features display software features - -t, --to TIME time at which all problems were resolved -@@ -65,9 +66,10 @@ Options: - -C, --corosync force the cluster type to be corosync - -u, --user USER username to use when collecting data from other nodes - (default root) -- -D, --depth search depth to use when attempting to locate files -+ -D, --max-depth search depth to use when attempting to locate files - -e, --rsh command to use to run commands on other nodes - (default ssh -T) -+ -d, --as-directory leave result as a directory tree instead of archiving - --sos-mode use defaults suitable for being called by sosreport tool - (behavior subject to change and not useful to end users) - DEST, --dest DEST custom destination directory or file name -@@ -107,13 +109,13 @@ while true; do - case "$1" in - -x) set -x; shift;; - -V) verbose=`expr $verbose + 1`; shift;; -- -T|--cts-test) tests="$tests $2"; shift; shift;; -+ -T|--cts) tests="$tests $2"; shift; shift;; - --cts-log) ctslog="$2"; shift; shift;; - -f|--from) start_time=`get_time "$2"`; shift; shift;; - -t|--to) end_time=`get_time "$2"`; shift; shift;; - -n|--node|--nodes) nodes="$nodes $2"; shift; shift;; - -S|--single-node) nodes="$host"; shift;; -- -E|-l|--logfile) extra_logs="$extra_logs $2"; shift; shift;; -+ -l|--logfile) extra_logs="$extra_logs $2"; shift; shift;; - -p) sanitize_patterns="$sanitize_patterns $2"; shift; shift;; - -L) log_patterns="$log_patterns `echo $2 | sed 's/ /\\\W/g'`"; shift; shift;; - -d|--as-directory) compress=0; shift;; --- -1.8.3.1 - - -From 24f0cbb4423a98b41e629c915b79778b39b5ae22 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 23 Aug 2019 17:39:45 -0500 -Subject: [PATCH 3/5] Fix: tools: don't ignore log if unrelated file is too - large - -This fixes a regression in 1.1.12: since cb420a04, findln_by_time() would skip -a log if any file in the current working directory (rather than the log itself) -was larger than 1GB. ---- - tools/report.common.in | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/report.common.in b/tools/report.common.in -index 7dd00b3..4fed6bb 100644 ---- a/tools/report.common.in -+++ b/tools/report.common.in -@@ -538,7 +538,7 @@ findln_by_time() { - # Some logs can be massive (over 1,500,000,000 lines have been seen in the wild) - # Even just 'wc -l' on these files can take 10+ minutes - -- local fileSize=`ls -lh | awk '{ print $5 }' | grep -ie G` -+ local fileSize=`ls -lh "$logf" | awk '{ print $5 }' | grep -ie G` - if [ x$fileSize != x ]; then - warning "$logf is ${fileSize} in size and could take many hours to process. Skipping." 
- return --- -1.8.3.1 - - -From 885d9acdb8132a437b48d4d9e8121131cbedb3da Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 23 Aug 2019 22:38:51 -0500 -Subject: [PATCH 4/5] Fix: tools: check for tar in crm_report - -crm_report requires tar, so check for its existence up front. ---- - tools/crm_report.in | 4 ++++ - tools/report.collector.in | 2 ++ - tools/report.common.in | 10 ++++++++++ - 3 files changed, 16 insertions(+) - -diff --git a/tools/crm_report.in b/tools/crm_report.in -index 0ccec56..1818879 100644 ---- a/tools/crm_report.in -+++ b/tools/crm_report.in -@@ -419,6 +419,10 @@ getnodes() { - # TODO: Look for something like crm_update_peer - } - -+if [ $compress -eq 1 ]; then -+ require_tar -+fi -+ - if [ "x$tests" != "x" ]; then - do_cts - -diff --git a/tools/report.collector.in b/tools/report.collector.in -index 9419f17..315b785 100644 ---- a/tools/report.collector.in -+++ b/tools/report.collector.in -@@ -747,6 +747,8 @@ collect_logs() { - trap "" 0 - } - -+require_tar -+ - debug "Initializing $REPORT_TARGET subdir" - if [ "$REPORT_MASTER" != "$REPORT_TARGET" ]; then - if [ -e $REPORT_HOME/$REPORT_TARGET ]; then -diff --git a/tools/report.common.in b/tools/report.common.in -index 4fed6bb..73ec0dc 100644 ---- a/tools/report.common.in -+++ b/tools/report.common.in -@@ -114,6 +114,13 @@ fatal() { - exit 1 - } - -+require_tar() { -+ which tar >/dev/null 2>&1 -+ if [ $? -ne 0 ]; then -+ fatal "Required program 'tar' not found, please install and re-run" -+ fi -+} -+ - # check if process of given substring in its name does exist; - # only look for processes originated by user 0 (by UID), "@CRM_DAEMON_USER@" - # or effective user running this script, and/or group 0 (by GID), -@@ -525,6 +532,9 @@ shrink() { - - cd $dir >/dev/null 2>&1 - tar $tar_options $target $base >/dev/null 2>&1 -+ if [ $? -ne 0 ]; then -+ fatal "Could not archive $base, please investigate and collect manually" -+ fi - cd $olddir >/dev/null 2>&1 - - echo $target --- -1.8.3.1 - - -From 5dcdb1eef727912fe33d7c8d9d2a4076fee7eb70 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 23 Aug 2019 22:15:50 -0500 -Subject: [PATCH 5/5] Build: rpm: add soft dependency on tar and bzip2 - -... which are needed by crm_report. Minimal OS installations are increasingly -popular, and the existence of tar can't be assumed. These are soft dependencies -because they are only needed for crm_report, not cluster functioning, and a -soft dependency allows users to keep a smaller footprint if desired while -providing full functionality to the typical user. 
---- - pacemaker.spec.in | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/pacemaker.spec.in b/pacemaker.spec.in -index 0143b63..0f1638b 100644 ---- a/pacemaker.spec.in -+++ b/pacemaker.spec.in -@@ -279,6 +279,9 @@ Group: System Environment/Daemons - Requires: %{name}-libs%{?_isa} = %{version}-%{release} - %if 0%{?fedora} > 22 || 0%{?rhel} > 7 - Recommends: pcmk-cluster-manager = %{version}-%{release} -+# For crm_report -+Recommends: tar -+Recommends: bzip2 - %endif - Requires: perl-TimeDate - Requires: procps-ng --- -1.8.3.1 - diff --git a/SOURCES/011-shutdown-lock.patch b/SOURCES/011-shutdown-lock.patch new file mode 100644 index 0000000..e9f1f5c --- /dev/null +++ b/SOURCES/011-shutdown-lock.patch @@ -0,0 +1,144 @@ +From f5d88938955f63935058b7cc2d706a12e6ea1121 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 6 Dec 2019 11:57:59 -0600 +Subject: [PATCH 07/18] Low: scheduler: respect shutdown locks when placing + active resources + +Use new pe_resource_t members to indicate that a resource is locked to a +particular node. + +For active resources (i.e. in the transition where the node is scheduled for +shutdown), these are connected by checking each lockable resource for whether +it is running on a single clean node that is shutting down. + +When applying constraints, place -INFINITY location constraints for locked +resources on all nodes other than the lock node. + +(Inactive resources -- i.e. in later transitions after the node is shut down -- +are not yet locked.) +--- + include/crm/pengine/pe_types.h | 2 + + lib/pacemaker/pcmk_sched_allocate.c | 87 +++++++++++++++++++++++++++++++++++++ + 2 files changed, 89 insertions(+) + +diff --git a/include/crm/pengine/pe_types.h b/include/crm/pengine/pe_types.h +index 8a735a3..123d8ef 100644 +--- a/include/crm/pengine/pe_types.h ++++ b/include/crm/pengine/pe_types.h +@@ -354,6 +354,8 @@ struct pe_resource_s { + GListPtr fillers; + + pe_node_t *pending_node; // Node on which pending_task is happening ++ pe_node_t *lock_node; // Resource is shutdown-locked to this node ++ time_t lock_time; // When shutdown lock started + + #if ENABLE_VERSIONED_ATTRS + xmlNode *versioned_parameters; +diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c +index fc2f4cf..0314f1b 100644 +--- a/lib/pacemaker/pcmk_sched_allocate.c ++++ b/lib/pacemaker/pcmk_sched_allocate.c +@@ -977,6 +977,87 @@ rsc_discover_filter(resource_t *rsc, node_t *node) + } + } + ++static time_t ++shutdown_time(pe_node_t *node, pe_working_set_t *data_set) ++{ ++ const char *shutdown = pe_node_attribute_raw(node, XML_CIB_ATTR_SHUTDOWN); ++ time_t result = 0; ++ ++ if (shutdown) { ++ errno = 0; ++ result = (time_t) crm_int_helper(shutdown, NULL); ++ if (errno != 0) { ++ result = 0; ++ } ++ } ++ return result? 
result : get_effective_time(data_set); ++} ++ ++static void ++apply_shutdown_lock(pe_resource_t *rsc, pe_working_set_t *data_set) ++{ ++ const char *class; ++ ++ // Only primitives and (uncloned) groups may be locked ++ if (rsc->variant == pe_group) { ++ for (GList *item = rsc->children; item != NULL; ++ item = item->next) { ++ apply_shutdown_lock((pe_resource_t *) item->data, data_set); ++ } ++ } else if (rsc->variant != pe_native) { ++ return; ++ } ++ ++ // Fence devices and remote connections can't be locked ++ class = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); ++ if ((class == NULL) || !strcmp(class, PCMK_RESOURCE_CLASS_STONITH) ++ || pe__resource_is_remote_conn(rsc, data_set)) { ++ return; ++ } ++ ++ // Only a resource active on exactly one node can be locked ++ if (pcmk__list_of_1(rsc->running_on)) { ++ pe_node_t *node = rsc->running_on->data; ++ ++ if (node->details->shutdown) { ++ if (node->details->unclean) { ++ pe_rsc_debug(rsc, "Not locking %s to unclean %s for shutdown", ++ rsc->id, node->details->uname); ++ } else { ++ rsc->lock_node = node; ++ rsc->lock_time = shutdown_time(node, data_set); ++ } ++ } ++ } ++ ++ if (rsc->lock_node == NULL) { ++ // No lock needed ++ return; ++ } ++ ++ if (data_set->shutdown_lock > 0) { ++ time_t lock_expiration = rsc->lock_time + data_set->shutdown_lock; ++ ++ pe_rsc_info(rsc, "Locking %s to %s due to shutdown (expires @%lld)", ++ rsc->id, rsc->lock_node->details->uname, ++ (long long) lock_expiration); ++ pe__update_recheck_time(++lock_expiration, data_set); ++ } else { ++ pe_rsc_info(rsc, "Locking %s to %s due to shutdown", ++ rsc->id, rsc->lock_node->details->uname); ++ } ++ ++ // If resource is locked to one node, ban it from all other nodes ++ for (GList *item = data_set->nodes; item != NULL; item = item->next) { ++ pe_node_t *node = item->data; ++ ++ if (strcmp(node->details->uname, rsc->lock_node->details->uname)) { ++ resource_location(rsc, node, -CRM_SCORE_INFINITY, ++ XML_CONFIG_ATTR_SHUTDOWN_LOCK, data_set); ++ } ++ } ++} ++ + /* + * Count how many valid nodes we have (so we know the maximum number of + * colors we can resolve). +@@ -988,6 +1069,12 @@ stage2(pe_working_set_t * data_set) + { + GListPtr gIter = NULL; + ++ if (is_set(data_set->flags, pe_flag_shutdown_lock)) { ++ for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { ++ apply_shutdown_lock((pe_resource_t *) gIter->data, data_set); ++ } ++ } ++ + for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { + node_t *node = (node_t *) gIter->data; + +-- +1.8.3.1 + diff --git a/SOURCES/012-fork-close.patch b/SOURCES/012-fork-close.patch deleted file mode 100644 index 4da5be6..0000000 --- a/SOURCES/012-fork-close.patch +++ /dev/null @@ -1,63 +0,0 @@ -From 5a73027642b826793658774df8a0536975120b19 Mon Sep 17 00:00:00 2001 -From: John Eckersberg -Date: Fri, 11 Oct 2019 08:59:41 -0400 -Subject: [PATCH] Fix: libcrmservice: try not to spam close() file descriptors - -With large file descriptor limits, action_launch_child can close -millions of non-existent file descriptors. - -Instead, try to read open file descriptors from /proc or /dev/fd and -close only those which are open. 
- -See rhbz#1762025 ---- - lib/services/services_linux.c | 26 ++++++++++++++++++++++++-- - 1 file changed, 24 insertions(+), 2 deletions(-) - -diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c -index 90c1f44..464fc5b 100644 ---- a/lib/services/services_linux.c -+++ b/lib/services/services_linux.c -@@ -445,6 +445,7 @@ static void - action_launch_child(svc_action_t *op) - { - int lpc; -+ DIR *dir; - - /* SIGPIPE is ignored (which is different from signal blocking) by the gnutls library. - * Depending on the libqb version in use, libqb may set SIGPIPE to be ignored as well. -@@ -476,8 +477,29 @@ action_launch_child(svc_action_t *op) - setpgid(0, 0); - - // Close all file descriptors except stdin/stdout/stderr -- for (lpc = getdtablesize() - 1; lpc > STDERR_FILENO; lpc--) { -- close(lpc); -+#if SUPPORT_PROCFS -+ dir = opendir("/proc/self/fd"); -+#else -+ dir = opendir("/dev/fd"); -+#endif -+ if (dir == NULL) { /* /proc or /dev/fd not available */ -+ /* Iterate over all possible fds, might be slow */ -+ for (lpc = getdtablesize() - 1; lpc > STDERR_FILENO; lpc--) { -+ close(lpc); -+ } -+ } else { -+ /* Iterate over fds obtained from /proc or /dev/fd */ -+ struct dirent *entry; -+ int dir_fd = dirfd(dir); -+ -+ while ((entry = readdir(dir)) != NULL) { -+ lpc = atoi(entry->d_name); -+ if (lpc > STDERR_FILENO && lpc != dir_fd) { -+ close(lpc); -+ } -+ } -+ -+ closedir(dir); - } - - #if SUPPORT_CIBSECRETS --- -1.8.3.1 - diff --git a/SOURCES/012-shutdown-lock.patch b/SOURCES/012-shutdown-lock.patch new file mode 100644 index 0000000..c700d96 --- /dev/null +++ b/SOURCES/012-shutdown-lock.patch @@ -0,0 +1,202 @@ +From 16f57bb79de4f88c2def174e3bb7d8ef312674cd Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 6 Dec 2019 12:17:03 -0600 +Subject: [PATCH 08/18] Low: scheduler: respect shutdown locks when placing + inactive resources + +When shutdown-lock is enabled, and we're either scheduling a resource stop +on a node that's cleanly shutting down or scheduling any action for a +previously locked resource, add "shutdown-lock=" to the +graph action. The controller will be able to use this to know when to preserve +the lock (by adding the lock time to the resource state entry). + +When the scheduler unpacks a resource state entry with a lock, it will remember +the lock node and lock time, which will trigger existing code for applying +shutdown locks. +--- + lib/pacemaker/pcmk_sched_allocate.c | 17 ++++++++++++- + lib/pacemaker/pcmk_sched_graph.c | 30 ++++++++++++++++++++++- + lib/pengine/unpack.c | 49 +++++++++++++++++++++++++++++++++---- + 3 files changed, 89 insertions(+), 7 deletions(-) + +diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c +index 0314f1b..884e1bd 100644 +--- a/lib/pacemaker/pcmk_sched_allocate.c ++++ b/lib/pacemaker/pcmk_sched_allocate.c +@@ -1015,8 +1015,23 @@ apply_shutdown_lock(pe_resource_t *rsc, pe_working_set_t *data_set) + return; + } + ++ if (rsc->lock_node != NULL) { ++ // The lock was obtained from resource history ++ ++ if (rsc->running_on != NULL) { ++ /* The resource was started elsewhere even though it is now ++ * considered locked. This shouldn't be possible, but as a ++ * failsafe, we don't want to disturb the resource now. 
++ */ ++ pe_rsc_info(rsc, ++ "Cancelling shutdown lock because %s is already active", ++ rsc->id); ++ rsc->lock_node = NULL; ++ rsc->lock_time = 0; ++ } ++ + // Only a resource active on exactly one node can be locked +- if (pcmk__list_of_1(rsc->running_on)) { ++ } else if (pcmk__list_of_1(rsc->running_on)) { + pe_node_t *node = rsc->running_on->data; + + if (node->details->shutdown) { +diff --git a/lib/pacemaker/pcmk_sched_graph.c b/lib/pacemaker/pcmk_sched_graph.c +index a6967fe..2861f3d 100644 +--- a/lib/pacemaker/pcmk_sched_graph.c ++++ b/lib/pacemaker/pcmk_sched_graph.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -988,6 +988,26 @@ add_downed_nodes(xmlNode *xml, const action_t *action, + } + } + ++static bool ++should_lock_action(pe_action_t *action) ++{ ++ // Only actions taking place on resource's lock node are locked ++ if ((action->rsc->lock_node == NULL) || (action->node == NULL) ++ || (action->node->details != action->rsc->lock_node->details)) { ++ return false; ++ } ++ ++ /* During shutdown, only stops are locked (otherwise, another action such as ++ * a demote would cause the controller to clear the lock) ++ */ ++ if (action->node->details->shutdown && action->task ++ && strcmp(action->task, RSC_STOP)) { ++ return false; ++ } ++ ++ return true; ++} ++ + static xmlNode * + action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set) + { +@@ -1097,6 +1117,14 @@ action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set) + XML_ATTR_TYPE + }; + ++ /* If a resource is locked to a node via shutdown-lock, mark its actions ++ * so the controller can preserve the lock when the action completes. ++ */ ++ if (should_lock_action(action)) { ++ crm_xml_add_ll(action_xml, XML_CONFIG_ATTR_SHUTDOWN_LOCK, ++ (long long) action->rsc->lock_time); ++ } ++ + // List affected resource + + rsc_xml = create_xml_node(action_xml, +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 8c0d72a..5139e60 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1059,7 +1060,8 @@ unpack_node_loop(xmlNode * status, bool fence, pe_working_set_t * data_set) + crm_trace("Checking node %s/%s/%s status %d/%d/%d", id, rsc->id, rsc->container->id, fence, rsc->role, RSC_ROLE_STARTED); + + } else if (!pe__is_guest_node(this_node) +- && rsc->role == RSC_ROLE_STARTED) { ++ && ((rsc->role == RSC_ROLE_STARTED) ++ || is_set(data_set->flags, pe_flag_shutdown_lock))) { + check = TRUE; + crm_trace("Checking node %s/%s status %d/%d/%d", id, rsc->id, fence, rsc->role, RSC_ROLE_STARTED); + } +@@ -1075,6 +1077,9 @@ unpack_node_loop(xmlNode * status, bool fence, pe_working_set_t * data_set) + + } else if (fence) { + process = TRUE; ++ ++ } else if (is_set(data_set->flags, pe_flag_shutdown_lock)) { ++ process = TRUE; + } + + if(process) { +@@ -2198,6 +2203,28 @@ calculate_active_ops(GListPtr sorted_op_list, int *start_index, int *stop_index) + } + } + ++// If resource history entry has shutdown lock, remember lock node and time ++static void ++unpack_shutdown_lock(xmlNode *rsc_entry, pe_resource_t *rsc, pe_node_t *node, ++ pe_working_set_t *data_set) ++{ ++ time_t lock_time = 0; // When lock started (i.e. 
node shutdown time) ++ ++ if ((crm_element_value_epoch(rsc_entry, XML_CONFIG_ATTR_SHUTDOWN_LOCK, ++ &lock_time) == pcmk_ok) && (lock_time != 0)) { ++ ++ if ((data_set->shutdown_lock > 0) ++ && (get_effective_time(data_set) ++ > (lock_time + data_set->shutdown_lock))) { ++ pe_rsc_info(rsc, "Shutdown lock for %s on %s expired", ++ rsc->id, node->details->uname); ++ } else { ++ rsc->lock_node = node; ++ rsc->lock_time = lock_time; ++ } ++ } ++} ++ + static resource_t * + unpack_lrm_rsc_state(node_t * node, xmlNode * rsc_entry, pe_working_set_t * data_set) + { +@@ -2234,18 +2261,30 @@ unpack_lrm_rsc_state(node_t * node, xmlNode * rsc_entry, pe_working_set_t * data + } + } + +- if (op_list == NULL) { +- /* if there are no operations, there is nothing to do */ +- return NULL; ++ if (is_not_set(data_set->flags, pe_flag_shutdown_lock)) { ++ if (op_list == NULL) { ++ // If there are no operations, there is nothing to do ++ return NULL; ++ } + } + + /* find the resource */ + rsc = unpack_find_resource(data_set, node, rsc_id, rsc_entry); + if (rsc == NULL) { +- rsc = process_orphan_resource(rsc_entry, node, data_set); ++ if (op_list == NULL) { ++ // If there are no operations, there is nothing to do ++ return NULL; ++ } else { ++ rsc = process_orphan_resource(rsc_entry, node, data_set); ++ } + } + CRM_ASSERT(rsc != NULL); + ++ // Check whether the resource is "shutdown-locked" to this node ++ if (is_set(data_set->flags, pe_flag_shutdown_lock)) { ++ unpack_shutdown_lock(rsc_entry, rsc, node, data_set); ++ } ++ + /* process operations */ + saved_role = rsc->role; + on_fail = action_fail_ignore; +-- +1.8.3.1 + diff --git a/SOURCES/013-shutdown-lock.patch b/SOURCES/013-shutdown-lock.patch new file mode 100644 index 0000000..4b9c91f --- /dev/null +++ b/SOURCES/013-shutdown-lock.patch @@ -0,0 +1,281 @@ +From 223ab7251adcb8c6f6b96def138be58b1478c42b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 22 Nov 2019 17:03:20 -0600 +Subject: [PATCH 09/18] Low: controller: mark shutdown-locked resources in + resource history + +When a graph action indicates that the resource should be shutdown-locked +to its node, remember the shutdown lock time in active_op_t so we can remember +that when the result comes back. When the result does come back, add +"shutdown-lock" to its lrm_resource entry in the CIB status section -- as +the timestamp if it's a successful stop or a probe finding the resource +inactive, or as 0 to clear the lock for any other operation. +--- + daemons/controld/controld_control.c | 9 ++++- + daemons/controld/controld_execd.c | 44 +++++++++++++++++++-- + daemons/controld/controld_lrm.h | 1 + + daemons/controld/controld_te_callbacks.c | 65 ++++++++++++++++++++++---------- + daemons/controld/controld_utils.h | 1 + + 5 files changed, 95 insertions(+), 25 deletions(-) + +diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c +index 6c7f97c..c918a1e 100644 +--- a/daemons/controld/controld_control.c ++++ b/daemons/controld/controld_control.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -35,6 +35,7 @@ gboolean fsa_has_quorum = FALSE; + crm_trigger_t *fsa_source = NULL; + crm_trigger_t *config_read = NULL; + bool no_quorum_suicide_escalation = FALSE; ++bool controld_shutdown_lock_enabled = false; + + /* A_HA_CONNECT */ + void +@@ -587,7 +588,10 @@ static pe_cluster_option crmd_opts[] = { + { "stonith-max-attempts",NULL,"integer",NULL,"10",&check_positive_number, + "How many times stonith can fail before it will no longer be attempted on a target" + }, ++ ++ // Already documented in libpe_status (other values must be kept identical) + { "no-quorum-policy", NULL, "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, NULL, NULL }, ++ { XML_CONFIG_ATTR_SHUTDOWN_LOCK, NULL, "boolean", NULL, "false", &check_boolean, NULL, NULL }, + }; + /* *INDENT-ON* */ + +@@ -698,6 +702,9 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void + value = crmd_pref(config_hash, "join-finalization-timeout"); + finalization_timer->period_ms = crm_parse_interval_spec(value); + ++ value = crmd_pref(config_hash, XML_CONFIG_ATTR_SHUTDOWN_LOCK); ++ controld_shutdown_lock_enabled = crm_is_true(value); ++ + free(fsa_cluster_name); + fsa_cluster_name = NULL; + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 17cc8d6..c0436a2 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -44,7 +44,8 @@ static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, + + static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state, + int log_level); +-static int do_update_resource(const char *node_name, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op); ++static int do_update_resource(const char *node_name, lrmd_rsc_info_t *rsc, ++ lrmd_event_data_t *op, time_t lock_time); + + static void + lrm_connection_destroy(void) +@@ -2171,7 +2172,7 @@ record_pending_op(const char *node_name, lrmd_rsc_info_t *rsc, lrmd_event_data_t + crm_debug("Recording pending op " CRM_OP_FMT " on %s in the CIB", + op->rsc_id, op->op_type, op->interval_ms, node_name); + +- do_update_resource(node_name, rsc, op); ++ do_update_resource(node_name, rsc, op, 0); + } + + static void +@@ -2313,6 +2314,10 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, + pending->rsc_id = strdup(rsc->id); + pending->start_time = time(NULL); + pending->user_data = op->user_data? strdup(op->user_data) : NULL; ++ if (crm_element_value_epoch(msg, XML_CONFIG_ATTR_SHUTDOWN_LOCK, ++ &(pending->lock_time)) != pcmk_ok) { ++ pending->lock_time = 0; ++ } + g_hash_table_replace(lrm_state->pending_ops, call_id_s, pending); + + if ((op->interval_ms > 0) +@@ -2356,8 +2361,28 @@ cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *use + } + } + ++/* Only successful stops, and probes that found the resource inactive, get locks ++ * recorded in the history. This ensures the resource stays locked to the node ++ * until it is active there again after the node comes back up. 
++ */ ++static bool ++should_preserve_lock(lrmd_event_data_t *op) ++{ ++ if (!controld_shutdown_lock_enabled) { ++ return false; ++ } ++ if (!strcmp(op->op_type, RSC_STOP) && (op->rc == PCMK_OCF_OK)) { ++ return true; ++ } ++ if (!strcmp(op->op_type, RSC_STATUS) && (op->rc == PCMK_OCF_NOT_RUNNING)) { ++ return true; ++ } ++ return false; ++} ++ + static int +-do_update_resource(const char *node_name, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op) ++do_update_resource(const char *node_name, lrmd_rsc_info_t *rsc, ++ lrmd_event_data_t *op, time_t lock_time) + { + /* + +@@ -2412,6 +2437,16 @@ do_update_resource(const char *node_name, lrmd_rsc_info_t * rsc, lrmd_event_data + crm_xml_add(iter, XML_ATTR_TYPE, rsc->type); + crm_xml_add(iter, XML_AGENT_ATTR_CLASS, rsc->standard); + crm_xml_add(iter, XML_AGENT_ATTR_PROVIDER, rsc->provider); ++ if (lock_time != 0) { ++ /* Actions on a locked resource should either preserve the lock by ++ * recording it with the action result, or clear it. ++ */ ++ if (!should_preserve_lock(op)) { ++ lock_time = 0; ++ } ++ crm_xml_add_ll(iter, XML_CONFIG_ATTR_SHUTDOWN_LOCK, ++ (long long) lock_time); ++ } + + if (op->params) { + container = g_hash_table_lookup(op->params, CRM_META"_"XML_RSC_ATTR_CONTAINER); +@@ -2616,7 +2651,8 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, + if (controld_action_is_recordable(op->op_type)) { + if (node_name && rsc) { + // We should record the result, and happily, we can +- update_id = do_update_resource(node_name, rsc, op); ++ update_id = do_update_resource(node_name, rsc, op, ++ pending? pending->lock_time : 0); + need_direct_ack = FALSE; + + } else if (op->rsc_deleted) { +diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h +index 7acac2a..da0582c 100644 +--- a/daemons/controld/controld_lrm.h ++++ b/daemons/controld/controld_lrm.h +@@ -46,6 +46,7 @@ typedef struct active_op_s { + int call_id; + uint32_t flags; // bitmask of active_op_e + time_t start_time; ++ time_t lock_time; + char *rsc_id; + char *op_type; + char *op_key; +diff --git a/daemons/controld/controld_te_callbacks.c b/daemons/controld/controld_te_callbacks.c +index 25f0ab2..8506f26 100644 +--- a/daemons/controld/controld_te_callbacks.c ++++ b/daemons/controld/controld_te_callbacks.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -28,6 +28,17 @@ crm_trigger_t *transition_trigger = NULL; + /* #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */ + #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']" + ++// An explicit shutdown-lock of 0 means the lock has been cleared ++static bool ++shutdown_lock_cleared(xmlNode *lrm_resource) ++{ ++ time_t shutdown_lock = 0; ++ ++ return (crm_element_value_epoch(lrm_resource, XML_CONFIG_ATTR_SHUTDOWN_LOCK, ++ &shutdown_lock) == pcmk_ok) ++ && (shutdown_lock == 0); ++} ++ + static void + te_update_diff_v1(const char *event, xmlNode *diff) + { +@@ -106,33 +117,42 @@ te_update_diff_v1(const char *event, xmlNode *diff) + } + freeXpathObject(xpathObj); + ++ // Check for lrm_resource entries ++ xpathObj = xpath_search(diff, ++ "//" F_CIB_UPDATE_RESULT ++ "//" XML_TAG_DIFF_ADDED ++ "//" XML_LRM_TAG_RESOURCE); ++ max = numXpathResults(xpathObj); ++ + /* +- * Updates by, or in response to, TE actions will never contain updates +- * for more than one resource at a time, so such updates indicate an +- * LRM refresh. +- * +- * In that case, start a new transition rather than check each result +- * individually, which can result in _huge_ speedups in large clusters. ++ * Updates by, or in response to, graph actions will never affect more than ++ * one resource at a time, so such updates indicate an LRM refresh. In that ++ * case, start a new transition rather than check each result individually, ++ * which can result in _huge_ speedups in large clusters. + * + * Unfortunately, we can only do so when there are no pending actions. + * Otherwise, we could mistakenly throw away those results here, and + * the cluster will stall waiting for them and time out the operation. 
+ */ +- if (transition_graph->pending == 0) { +- xpathObj = xpath_search(diff, +- "//" F_CIB_UPDATE_RESULT +- "//" XML_TAG_DIFF_ADDED +- "//" XML_LRM_TAG_RESOURCE); +- max = numXpathResults(xpathObj); +- if (max > 1) { +- crm_debug("Ignoring resource operation updates due to history refresh of %d resources", +- max); +- crm_log_xml_trace(diff, "lrm-refresh"); +- abort_transition(INFINITY, tg_restart, "History refresh", NULL); +- goto bail; ++ if ((transition_graph->pending == 0) && (max > 1)) { ++ crm_debug("Ignoring resource operation updates due to history refresh of %d resources", ++ max); ++ crm_log_xml_trace(diff, "lrm-refresh"); ++ abort_transition(INFINITY, tg_restart, "History refresh", NULL); ++ goto bail; ++ } ++ ++ if (max == 1) { ++ xmlNode *lrm_resource = getXpathResult(xpathObj, 0); ++ ++ if (shutdown_lock_cleared(lrm_resource)) { ++ // @TODO would be more efficient to abort once after transition done ++ abort_transition(INFINITY, tg_restart, "Shutdown lock cleared", ++ lrm_resource); ++ // Still process results, so we stop timers and update failcounts + } +- freeXpathObject(xpathObj); + } ++ freeXpathObject(xpathObj); + + /* Process operation updates */ + xpathObj = +@@ -205,6 +225,11 @@ process_lrm_resource_diff(xmlNode *lrm_resource, const char *node) + rsc_op = __xml_next(rsc_op)) { + process_graph_event(rsc_op, node); + } ++ if (shutdown_lock_cleared(lrm_resource)) { ++ // @TODO would be more efficient to abort once after transition done ++ abort_transition(INFINITY, tg_restart, "Shutdown lock cleared", ++ lrm_resource); ++ } + } + + static void +diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h +index ca8cddb..8e31007 100644 +--- a/daemons/controld/controld_utils.h ++++ b/daemons/controld/controld_utils.h +@@ -41,6 +41,7 @@ fsa_cib_anon_update(const char *section, xmlNode *data) { + } + + extern gboolean fsa_has_quorum; ++extern bool controld_shutdown_lock_enabled; + extern int last_peer_update; + extern int last_resource_update; + +-- +1.8.3.1 + diff --git a/SOURCES/013-soft-limit.patch b/SOURCES/013-soft-limit.patch deleted file mode 100644 index 4a3c626..0000000 --- a/SOURCES/013-soft-limit.patch +++ /dev/null @@ -1,216 +0,0 @@ -From 68d6a69a8bc2f25e935608344d5b7e2b52cde85f Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 21 Oct 2019 19:02:47 -0500 -Subject: [PATCH] Low: libcrmservice: don't close descriptors above current - limit - -This is irrelevant in normal use. However, valgrind can open high-numbered -file descriptors for its own use above the soft limit of the process being run -under valgrind. If that process forks a child that tries to close all open file -descriptors (e.g. the executor running an agent), the close fails because the -file descriptors are invalid, and (ironically) valgrind warns about that. - -This allows 5a73027 to work under valgrind. Additionally, we extend the -efficient close method from that commit to pacemakerd's spawning of children. 
---- - daemons/pacemakerd/pacemakerd.c | 10 ++---- - include/crm/common/internal.h | 3 ++ - lib/common/io.c | 72 +++++++++++++++++++++++++++++++++++++++++ - lib/services/services_linux.c | 29 +---------------- - 4 files changed, 79 insertions(+), 35 deletions(-) - -diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c -index fdc1d9f..d8ff53d 100644 ---- a/daemons/pacemakerd/pacemakerd.c -+++ b/daemons/pacemakerd/pacemakerd.c -@@ -13,6 +13,8 @@ - #include - #include - #include -+#include -+#include - #include - #include - #include -@@ -290,10 +292,8 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL }; - static gboolean - start_child(pcmk_child_t * child) - { -- int lpc = 0; - uid_t uid = 0; - gid_t gid = 0; -- struct rlimit oflimits; - gboolean use_valgrind = FALSE; - gboolean use_callgrind = FALSE; - const char *devnull = "/dev/null"; -@@ -396,11 +396,7 @@ start_child(pcmk_child_t * child) - crm_perror(LOG_ERR, "Could not set user to %d (%s)", uid, child->uid); - } - -- /* Close all open file descriptors */ -- getrlimit(RLIMIT_NOFILE, &oflimits); -- for (lpc = 0; lpc < oflimits.rlim_cur; lpc++) { -- close(lpc); -- } -+ pcmk__close_fds_in_child(true); - - (void)open(devnull, O_RDONLY); /* Stdin: fd 0 */ - (void)open(devnull, O_WRONLY); /* Stdout: fd 1 */ -diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h -index b2eec00..da2c7d7 100644 ---- a/include/crm/common/internal.h -+++ b/include/crm/common/internal.h -@@ -13,6 +13,7 @@ - #include /* for gboolean */ - #include /* for struct dirent */ - #include /* for getpid() */ -+#include /* for bool */ - #include /* for uid_t and gid_t */ - - #include -@@ -33,6 +34,8 @@ int crm_write_sync(int fd, const char *contents); - int crm_set_nonblocking(int fd); - const char *crm_get_tmpdir(void); - -+void pcmk__close_fds_in_child(bool); -+ - - /* internal procfs utilities (from procfs.c) */ - -diff --git a/lib/common/io.c b/lib/common/io.c -index fa438dd..6cbab0a 100644 ---- a/lib/common/io.c -+++ b/lib/common/io.c -@@ -16,6 +16,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -501,3 +502,74 @@ crm_get_tmpdir() - - return (dir && (*dir == '/'))? dir : "/tmp"; - } -+ -+/*! -+ * \internal -+ * \brief Close open file descriptors -+ * -+ * Close all file descriptors (except optionally stdin, stdout, and stderr), -+ * which is a best practice for a new child process forked for the purpose of -+ * executing an external program. -+ * -+ * \param[in] bool If true, close stdin, stdout, and stderr as well -+ */ -+void -+pcmk__close_fds_in_child(bool all) -+{ -+ DIR *dir; -+ struct rlimit rlim; -+ rlim_t max_fd; -+ int min_fd = (all? 0 : (STDERR_FILENO + 1)); -+ -+ /* Find the current process's (soft) limit for open files. getrlimit() -+ * should always work, but have a fallback just in case. -+ */ -+ if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) { -+ max_fd = rlim.rlim_cur - 1; -+ } else { -+ long conf_max = sysconf(_SC_OPEN_MAX); -+ -+ max_fd = (conf_max > 0)? conf_max : 1024; -+ } -+ -+ /* /proc/self/fd (on Linux) or /dev/fd (on most OSes) contains symlinks to -+ * all open files for the current process, named as the file descriptor. -+ * Use this if available, because it's more efficient than a shotgun -+ * approach to closing descriptors. 
-+ */ -+#if SUPPORT_PROCFS -+ dir = opendir("/proc/self/fd"); -+ if (dir == NULL) { -+ dir = opendir("/dev/fd"); -+ } -+#else -+ dir = opendir("/dev/fd"); -+#endif -+ if (dir != NULL) { -+ struct dirent *entry; -+ int dir_fd = dirfd(dir); -+ -+ while ((entry = readdir(dir)) != NULL) { -+ int lpc = atoi(entry->d_name); -+ -+ /* How could one of these entries be higher than max_fd, you ask? -+ * It isn't possible in normal operation, but when run under -+ * valgrind, valgrind can open high-numbered file descriptors for -+ * its own use that are higher than the process's soft limit. -+ * These will show up in the fd directory but aren't closable. -+ */ -+ if ((lpc >= min_fd) && (lpc <= max_fd) && (lpc != dir_fd)) { -+ close(lpc); -+ } -+ } -+ closedir(dir); -+ return; -+ } -+ -+ /* If no fd directory is available, iterate over all possible descriptors. -+ * This is less efficient due to the overhead of many system calls. -+ */ -+ for (int lpc = max_fd; lpc >= min_fd; lpc--) { -+ close(lpc); -+ } -+} -diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c -index 464fc5b..6870273 100644 ---- a/lib/services/services_linux.c -+++ b/lib/services/services_linux.c -@@ -444,9 +444,6 @@ services_handle_exec_error(svc_action_t * op, int error) - static void - action_launch_child(svc_action_t *op) - { -- int lpc; -- DIR *dir; -- - /* SIGPIPE is ignored (which is different from signal blocking) by the gnutls library. - * Depending on the libqb version in use, libqb may set SIGPIPE to be ignored as well. - * We do not want this to be inherited by the child process. By resetting this the signal -@@ -476,31 +473,7 @@ action_launch_child(svc_action_t *op) - */ - setpgid(0, 0); - -- // Close all file descriptors except stdin/stdout/stderr --#if SUPPORT_PROCFS -- dir = opendir("/proc/self/fd"); --#else -- dir = opendir("/dev/fd"); --#endif -- if (dir == NULL) { /* /proc or /dev/fd not available */ -- /* Iterate over all possible fds, might be slow */ -- for (lpc = getdtablesize() - 1; lpc > STDERR_FILENO; lpc--) { -- close(lpc); -- } -- } else { -- /* Iterate over fds obtained from /proc or /dev/fd */ -- struct dirent *entry; -- int dir_fd = dirfd(dir); -- -- while ((entry = readdir(dir)) != NULL) { -- lpc = atoi(entry->d_name); -- if (lpc > STDERR_FILENO && lpc != dir_fd) { -- close(lpc); -- } -- } -- -- closedir(dir); -- } -+ pcmk__close_fds_in_child(false); - - #if SUPPORT_CIBSECRETS - if (replace_secret_params(op->rsc, op->params) < 0) { --- -1.8.3.1 - diff --git a/SOURCES/014-shutdown-lock.patch b/SOURCES/014-shutdown-lock.patch new file mode 100644 index 0000000..b464947 --- /dev/null +++ b/SOURCES/014-shutdown-lock.patch @@ -0,0 +1,158 @@ +From 8270e8aed46f6e672b94f00fe0bde07cd2b6ddd7 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 13 Dec 2019 11:38:49 -0600 +Subject: [PATCH 10/18] Low: controller: don't clear shutdown locks when node + rejoins + +Add new controld_delete_node_state() values for clearing resource history +while preserving shutdown locks. This is accomplished by deleting all +unlocked lrm_resource entries and all lrm_rsc_op entries, instead of the entire +lrm subsection. 
+--- + daemons/controld/controld_based.c | 22 +++++++++++++++++++++- + daemons/controld/controld_join_dc.c | 7 +++++-- + daemons/controld/controld_remote_ra.c | 16 ++++++++++------ + daemons/controld/controld_utils.h | 2 ++ + 4 files changed, 38 insertions(+), 9 deletions(-) + +diff --git a/daemons/controld/controld_based.c b/daemons/controld/controld_based.c +index f3a7c4f..0ffc1e8 100644 +--- a/daemons/controld/controld_based.c ++++ b/daemons/controld/controld_based.c +@@ -191,12 +191,21 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, + // Node's lrm section (name 1x) + #define XPATH_NODE_LRM XPATH_NODE_STATE "/" XML_CIB_TAG_LRM + ++// Node's lrm_rsc_op entries and lrm_resource entries without lock (name 2x) ++#define XPATH_NODE_LRM_UNLOCKED XPATH_NODE_STATE "//" XML_LRM_TAG_RSC_OP \ ++ "|" XPATH_NODE_STATE \ ++ "//" XML_LRM_TAG_RESOURCE \ ++ "[not(@" XML_CONFIG_ATTR_SHUTDOWN_LOCK ")]" ++ + // Node's transient_attributes section (name 1x) + #define XPATH_NODE_ATTRS XPATH_NODE_STATE "/" XML_TAG_TRANSIENT_NODEATTRS + + // Everything under node_state (name 1x) + #define XPATH_NODE_ALL XPATH_NODE_STATE "/*" + ++// Unlocked history + transient attributes (name 3x) ++#define XPATH_NODE_ALL_UNLOCKED XPATH_NODE_LRM_UNLOCKED "|" XPATH_NODE_ATTRS ++ + /*! + * \internal + * \brief Delete subsection of a node's CIB node_state +@@ -218,6 +227,11 @@ controld_delete_node_state(const char *uname, enum controld_section_e section, + xpath = crm_strdup_printf(XPATH_NODE_LRM, uname); + desc = crm_strdup_printf("resource history for node %s", uname); + break; ++ case controld_section_lrm_unlocked: ++ xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED, uname, uname); ++ desc = crm_strdup_printf("resource history (other than shutdown " ++ "locks) for node %s", uname); ++ break; + case controld_section_attrs: + xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname); + desc = crm_strdup_printf("transient attributes for node %s", uname); +@@ -226,6 +240,12 @@ controld_delete_node_state(const char *uname, enum controld_section_e section, + xpath = crm_strdup_printf(XPATH_NODE_ALL, uname); + desc = crm_strdup_printf("all state for node %s", uname); + break; ++ case controld_section_all_unlocked: ++ xpath = crm_strdup_printf(XPATH_NODE_ALL_UNLOCKED, ++ uname, uname, uname); ++ desc = crm_strdup_printf("all state (other than shutdown locks) " ++ "for node %s", uname); ++ break; + } + + if (fsa_cib_conn == NULL) { +@@ -234,7 +254,7 @@ controld_delete_node_state(const char *uname, enum controld_section_e section, + } else { + int call_id; + +- options |= cib_quorum_override|cib_xpath; ++ options |= cib_quorum_override|cib_xpath|cib_multiple; + call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, xpath, NULL, options); + crm_info("Deleting %s (via CIB call %d) " CRM_XS " xpath=%s", + desc, call_id, xpath); +diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c +index 885b2a9..f0eb2a2 100644 +--- a/daemons/controld/controld_join_dc.c ++++ b/daemons/controld/controld_join_dc.c +@@ -534,6 +534,7 @@ do_dc_join_ack(long long action, + int join_id = -1; + int call_id = 0; + ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); ++ enum controld_section_e section = controld_section_lrm; + + const char *op = crm_element_value(join_ack->msg, F_CRM_TASK); + const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM); +@@ -583,8 +584,10 @@ do_dc_join_ack(long long action, + /* Update CIB with node's current executor state. 
A new transition will be + * triggered later, when the CIB notifies us of the change. + */ +- controld_delete_node_state(join_from, controld_section_lrm, +- cib_scope_local); ++ if (controld_shutdown_lock_enabled) { ++ section = controld_section_lrm_unlocked; ++ } ++ controld_delete_node_state(join_from, section, cib_scope_local); + if (safe_str_eq(join_from, fsa_our_uname)) { + xmlNode *now_dc_lrmd_state = controld_query_executor_state(fsa_our_uname); + +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 2d3dfa7..a81c354 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2013-2019 the Pacemaker project contributors ++ * Copyright 2013-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -177,17 +177,21 @@ remote_node_up(const char *node_name) + int call_opt, call_id = 0; + xmlNode *update, *state; + crm_node_t *node; ++ enum controld_section_e section = controld_section_all; + + CRM_CHECK(node_name != NULL, return); + crm_info("Announcing pacemaker_remote node %s", node_name); + +- /* Clear node's entire state (resource history and transient attributes). +- * The transient attributes should and normally will be cleared when the +- * node leaves, but since remote node state has a number of corner cases, +- * clear them here as well, to be sure. ++ /* Clear node's entire state (resource history and transient attributes) ++ * other than shutdown locks. The transient attributes should and normally ++ * will be cleared when the node leaves, but since remote node state has a ++ * number of corner cases, clear them here as well, to be sure. + */ + call_opt = crmd_cib_smart_opt(); +- controld_delete_node_state(node_name, controld_section_all, call_opt); ++ if (controld_shutdown_lock_enabled) { ++ section = controld_section_all_unlocked; ++ } ++ controld_delete_node_state(node_name, section, call_opt); + + /* Clear node's probed attribute */ + update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE); +diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h +index 8e31007..5549636 100644 +--- a/daemons/controld/controld_utils.h ++++ b/daemons/controld/controld_utils.h +@@ -90,8 +90,10 @@ bool controld_action_is_recordable(const char *action); + // Subsections of node_state + enum controld_section_e { + controld_section_lrm, ++ controld_section_lrm_unlocked, + controld_section_attrs, + controld_section_all, ++ controld_section_all_unlocked + }; + + void controld_delete_node_state(const char *uname, +-- +1.8.3.1 + diff --git a/SOURCES/015-shutdown-lock.patch b/SOURCES/015-shutdown-lock.patch new file mode 100644 index 0000000..364b2aa --- /dev/null +++ b/SOURCES/015-shutdown-lock.patch @@ -0,0 +1,38 @@ +From d70d90367c898bcb62fd6c7dd8d641ca56be04ae Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 20 Dec 2019 11:46:37 -0600 +Subject: [PATCH 11/18] Low: scheduler: display when a resource is + shutdown-locked to a node + +... 
so it shows up in logs and cluster status displays +--- + lib/pengine/native.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/lib/pengine/native.c b/lib/pengine/native.c +index b064115..5a6fd80 100644 +--- a/lib/pengine/native.c ++++ b/lib/pengine/native.c +@@ -541,6 +541,9 @@ native_output_string(pe_resource_t *rsc, const char *name, pe_node_t *node, + provider = crm_element_value(rsc->xml, XML_AGENT_ATTR_PROVIDER); + } + ++ if ((node == NULL) && (rsc->lock_node != NULL)) { ++ node = rsc->lock_node; ++ } + if (is_set(options, pe_print_rsconly) + || pcmk__list_of_multiple(rsc->running_on)) { + node = NULL; +@@ -583,6 +586,9 @@ native_output_string(pe_resource_t *rsc, const char *name, pe_node_t *node, + if (node && !(node->details->online) && node->details->unclean) { + have_flags = add_output_flag(outstr, "UNCLEAN", have_flags); + } ++ if (node && (node == rsc->lock_node)) { ++ have_flags = add_output_flag(outstr, "LOCKED", have_flags); ++ } + if (is_set(options, pe_print_pending)) { + const char *pending_task = native_pending_task(rsc); + +-- +1.8.3.1 + diff --git a/SOURCES/016-shutdown-lock.patch b/SOURCES/016-shutdown-lock.patch new file mode 100644 index 0000000..b8f8e5d --- /dev/null +++ b/SOURCES/016-shutdown-lock.patch @@ -0,0 +1,29 @@ +From bc9c07951cb9c411324056b4d5322016153fee20 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Jan 2020 16:01:16 -0600 +Subject: [PATCH 12/18] Low: tools: crm_resource resource checks should show + shutdown locks + +--- + tools/crm_resource_runtime.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index 2ea8bb3..ed5fb03 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -928,6 +928,11 @@ cli_resource_check(cib_t * cib_conn, resource_t *rsc) + } + free(managed); + ++ if (rsc->lock_node) { ++ printf("%s * '%s' is locked to node %s due to shutdown\n", ++ (printed? "" : "\n"), parent->id, rsc->lock_node->details->uname); ++ } ++ + if (printed) { + printf("\n"); + } +-- +1.8.3.1 + diff --git a/SOURCES/017-shutdown-lock.patch b/SOURCES/017-shutdown-lock.patch new file mode 100644 index 0000000..8dc7dd9 --- /dev/null +++ b/SOURCES/017-shutdown-lock.patch @@ -0,0 +1,191 @@ +From 45a6f0b051743c266c13f3ffd365baf3a9d730f6 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Jan 2020 12:53:39 -0600 +Subject: [PATCH 13/18] Low: controller: allow CRM_OP_LRM_DELETE to clear CIB + only + +Normally, CRM_OP_LRM_DELETE is relayed to the affected node's controller, which +clears the resource from the executor and CIB as well the its own bookkeeping. + +Now, we want to be able to use it to clear shutdown locks for nodes that are +down. Let it take a new "mode" attribute, and if it is "cib", clear the +resource from the CIB locally without relaying the operation or doing anything +else. 
+--- + daemons/controld/controld_execd.c | 4 +- + daemons/controld/controld_messages.c | 97 ++++++++++++++++++++++++++++++++-- + daemons/controld/controld_te_actions.c | 7 +++ + include/crm_internal.h | 2 + + 4 files changed, 106 insertions(+), 4 deletions(-) + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index c0436a2..8d25fb8 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -1769,7 +1769,9 @@ do_lrm_invoke(long long action, + crm_trace("Executor %s command from %s", crm_op, from_sys); + + if (safe_str_eq(crm_op, CRM_OP_LRM_DELETE)) { +- crm_rsc_delete = TRUE; // Only crm_resource uses this op ++ if (safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { ++ crm_rsc_delete = TRUE; // from crm_resource ++ } + operation = CRMD_ACTION_DELETE; + + } else if (safe_str_eq(crm_op, CRM_OP_LRM_FAIL)) { +diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c +index 466c64c..689e4a0 100644 +--- a/daemons/controld/controld_messages.c ++++ b/daemons/controld/controld_messages.c +@@ -410,6 +410,14 @@ relay_message(xmlNode * msg, gboolean originated_locally) + + } else if (safe_str_eq(fsa_our_uname, host_to)) { + is_local = 1; ++ } else if (is_for_crm && safe_str_eq(task, CRM_OP_LRM_DELETE)) { ++ xmlNode *msg_data = get_message_xml(msg, F_CRM_DATA); ++ const char *mode = crm_element_value(msg_data, PCMK__XA_MODE); ++ ++ if (safe_str_eq(mode, XML_TAG_CIB)) { ++ // Local delete of an offline node's resource history ++ is_local = 1; ++ } + } + + if (is_for_dc || is_for_dcib || is_for_te) { +@@ -654,6 +662,86 @@ handle_failcount_op(xmlNode * stored_msg) + return I_NULL; + } + ++static enum crmd_fsa_input ++handle_lrm_delete(xmlNode *stored_msg) ++{ ++ const char *mode = NULL; ++ xmlNode *msg_data = get_message_xml(stored_msg, F_CRM_DATA); ++ ++ CRM_CHECK(msg_data != NULL, return I_NULL); ++ ++ /* CRM_OP_LRM_DELETE has two distinct modes. The default behavior is to ++ * relay the operation to the affected node, which will unregister the ++ * resource from the local executor, clear the resource's history from the ++ * CIB, and do some bookkeeping in the controller. ++ * ++ * However, if the affected node is offline, the client will specify ++ * mode="cib" which means the controller receiving the operation should ++ * clear the resource's history from the CIB and nothing else. This is used ++ * to clear shutdown locks. ++ */ ++ mode = crm_element_value(msg_data, PCMK__XA_MODE); ++ if ((mode == NULL) || strcmp(mode, XML_TAG_CIB)) { ++ // Relay to affected node ++ crm_xml_add(stored_msg, F_CRM_SYS_TO, CRM_SYSTEM_LRMD); ++ return I_ROUTER; ++ ++ } else { ++ // Delete CIB history locally (compare with do_lrm_delete()) ++ const char *from_sys = NULL; ++ const char *user_name = NULL; ++ const char *rsc_id = NULL; ++ const char *node = NULL; ++ xmlNode *rsc_xml = NULL; ++ int rc = pcmk_rc_ok; ++ ++ rsc_xml = first_named_child(msg_data, XML_CIB_TAG_RESOURCE); ++ CRM_CHECK(rsc_xml != NULL, return I_NULL); ++ ++ rsc_id = ID(rsc_xml); ++ from_sys = crm_element_value(stored_msg, F_CRM_SYS_FROM); ++ node = crm_element_value(msg_data, XML_LRM_ATTR_TARGET); ++#if ENABLE_ACL ++ user_name = crm_acl_get_set_user(stored_msg, F_CRM_USER, NULL); ++#endif ++ crm_debug("Handling " CRM_OP_LRM_DELETE " for %s on %s locally%s%s " ++ "(clearing CIB resource history only)", rsc_id, node, ++ (user_name? " for user " : ""), (user_name? 
user_name : "")); ++#if ENABLE_ACL ++ rc = controld_delete_resource_history(rsc_id, node, user_name, ++ cib_dryrun|cib_sync_call); ++#endif ++ if (rc == pcmk_rc_ok) { ++ rc = controld_delete_resource_history(rsc_id, node, user_name, ++ crmd_cib_smart_opt()); ++ } ++ ++ // Notify client if not from graph (compare with notify_deleted()) ++ if (from_sys && strcmp(from_sys, CRM_SYSTEM_TENGINE)) { ++ lrmd_event_data_t *op = NULL; ++ const char *from_host = crm_element_value(stored_msg, ++ F_CRM_HOST_FROM); ++ const char *transition = crm_element_value(msg_data, ++ XML_ATTR_TRANSITION_KEY); ++ ++ crm_info("Notifying %s on %s that %s was%s deleted", ++ from_sys, (from_host? from_host : "local node"), rsc_id, ++ ((rc == pcmk_rc_ok)? "" : " not")); ++ op = lrmd_new_event(rsc_id, CRMD_ACTION_DELETE, 0); ++ op->type = lrmd_event_exec_complete; ++ op->user_data = strdup(transition? transition : FAKE_TE_ID); ++ op->params = crm_str_table_new(); ++ g_hash_table_insert(op->params, strdup(XML_ATTR_CRM_VERSION), ++ strdup(CRM_FEATURE_SET)); ++ controld_rc2event(op, rc); ++ controld_ack_event_directly(from_host, from_sys, NULL, op, rsc_id); ++ lrmd_free_event(op); ++ controld_trigger_delete_refresh(from_sys, rsc_id); ++ } ++ return I_NULL; ++ } ++} ++ + /*! + * \brief Handle a CRM_OP_REMOTE_STATE message by updating remote peer cache + * +@@ -913,9 +1001,12 @@ handle_request(xmlNode * stored_msg, enum crmd_fsa_cause cause) + crm_debug("Raising I_JOIN_RESULT: join-%s", crm_element_value(stored_msg, F_CRM_JOIN_ID)); + return I_JOIN_RESULT; + +- } else if (strcmp(op, CRM_OP_LRM_DELETE) == 0 +- || strcmp(op, CRM_OP_LRM_FAIL) == 0 +- || strcmp(op, CRM_OP_LRM_REFRESH) == 0 || strcmp(op, CRM_OP_REPROBE) == 0) { ++ } else if (strcmp(op, CRM_OP_LRM_DELETE) == 0) { ++ return handle_lrm_delete(stored_msg); ++ ++ } else if ((strcmp(op, CRM_OP_LRM_FAIL) == 0) ++ || (strcmp(op, CRM_OP_LRM_REFRESH) == 0) ++ || (strcmp(op, CRM_OP_REPROBE) == 0)) { + + crm_xml_add(stored_msg, F_CRM_SYS_TO, CRM_SYSTEM_LRMD); + return I_ROUTER; +diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c +index 948bd64..59e0b5a 100644 +--- a/daemons/controld/controld_te_actions.c ++++ b/daemons/controld/controld_te_actions.c +@@ -107,6 +107,13 @@ te_crm_command(crm_graph_t * graph, crm_action_t * action) + + if (!router_node) { + router_node = on_node; ++ if (safe_str_eq(task, CRM_OP_LRM_DELETE)) { ++ const char *mode = crm_element_value(action->xml, PCMK__XA_MODE); ++ ++ if (safe_str_eq(mode, XML_TAG_CIB)) { ++ router_node = fsa_our_uname; ++ } ++ } + } + + CRM_CHECK(on_node != NULL && strlen(on_node) != 0, +diff --git a/include/crm_internal.h b/include/crm_internal.h +index 1f25686..2fa53dd 100644 +--- a/include/crm_internal.h ++++ b/include/crm_internal.h +@@ -216,6 +216,8 @@ pid_t pcmk_locate_sbd(void); + # define ATTRD_OP_SYNC_RESPONSE "sync-response" + # define ATTRD_OP_CLEAR_FAILURE "clear-failure" + ++# define PCMK__XA_MODE "mode" ++ + # define PCMK_ENV_PHYSICAL_HOST "physical_host" + + +-- +1.8.3.1 + diff --git a/SOURCES/018-shutdown-lock.patch b/SOURCES/018-shutdown-lock.patch new file mode 100644 index 0000000..99ad90e --- /dev/null +++ b/SOURCES/018-shutdown-lock.patch @@ -0,0 +1,56 @@ +From 457e231256feb0bdcf206209e03f0875f50d03b3 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 14 Jan 2020 16:24:08 -0600 +Subject: [PATCH 14/18] Low: tools: for down nodes, crm_resource --refresh + should clear CIB only + +This provides a mechanism to manually clear shutdown locks. 
+--- + tools/crm_resource_runtime.c | 16 +++++++++++++--- + 1 file changed, 13 insertions(+), 3 deletions(-) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index ed5fb03..e89b572 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -473,6 +473,7 @@ send_lrm_rsc_op(crm_ipc_t * crmd_channel, const char *op, + const char *rsc_type = NULL; + xmlNode *params = NULL; + xmlNode *msg_data = NULL; ++ bool cib_only = false; + resource_t *rsc = pe_find_resource(data_set->resources, rsc_id); + + if (rsc == NULL) { +@@ -504,10 +505,14 @@ send_lrm_rsc_op(crm_ipc_t * crmd_channel, const char *op, + } + + if (!(node->details->online)) { +- CMD_ERR("Node %s is not online", host_uname); +- return -ENOTCONN; ++ if (strcmp(op, CRM_OP_LRM_DELETE) == 0) { ++ cib_only = true; ++ } else { ++ CMD_ERR("Node %s is not online", host_uname); ++ return -ENOTCONN; ++ } + } +- if (pe__is_guest_or_remote_node(node)) { ++ if (!cib_only && pe__is_guest_or_remote_node(node)) { + node = pe__current_node(node->details->remote_rsc); + if (node == NULL) { + CMD_ERR("No cluster connection to Pacemaker Remote node %s detected", +@@ -533,6 +538,11 @@ send_lrm_rsc_op(crm_ipc_t * crmd_channel, const char *op, + crm_xml_add(msg_data, XML_LRM_ATTR_ROUTER_NODE, router_node); + } + ++ if (cib_only) { ++ // Indicate that only the CIB needs to be cleaned ++ crm_xml_add(msg_data, PCMK__XA_MODE, XML_TAG_CIB); ++ } ++ + xml_rsc = create_xml_node(msg_data, XML_CIB_TAG_RESOURCE); + if (rsc->clone_name) { + crm_xml_add(xml_rsc, XML_ATTR_ID, rsc->clone_name); +-- +1.8.3.1 + diff --git a/SOURCES/019-shutdown-lock.patch b/SOURCES/019-shutdown-lock.patch new file mode 100644 index 0000000..f94dc58 --- /dev/null +++ b/SOURCES/019-shutdown-lock.patch @@ -0,0 +1,221 @@ +From cf1e90ffe764f3639799206db9444ae32821386b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Fri, 10 Jan 2020 18:18:07 -0600 +Subject: [PATCH 15/18] Low: scheduler: clear resource history when appropriate + +Tell the controller to clear resource history from the CIB when a resource has +a shutdown lock that expired or was cancelled because the resource is already +active elsewhere. +--- + include/crm/pengine/internal.h | 4 +++- + include/crm/pengine/pe_types.h | 4 +++- + lib/pacemaker/pcmk_sched_allocate.c | 1 + + lib/pacemaker/pcmk_sched_graph.c | 16 ++++++++++++++-- + lib/pacemaker/pcmk_sched_native.c | 6 ++++++ + lib/pengine/unpack.c | 1 + + lib/pengine/utils.c | 34 ++++++++++++++++++++++++++++++++-- + 7 files changed, 60 insertions(+), 6 deletions(-) + +diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h +index 119624d..bc2c70e 100644 +--- a/include/crm/pengine/internal.h ++++ b/include/crm/pengine/internal.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -435,5 +435,7 @@ void pe__unpack_dataset_nvpairs(xmlNode *xml_obj, const char *set_name, + pe_working_set_t *data_set); + + bool pe__resource_is_disabled(pe_resource_t *rsc); ++pe_action_t *pe__clear_resource_history(pe_resource_t *rsc, pe_node_t *node, ++ pe_working_set_t *data_set); + + #endif +diff --git a/include/crm/pengine/pe_types.h b/include/crm/pengine/pe_types.h +index 123d8ef..572787b 100644 +--- a/include/crm/pengine/pe_types.h ++++ b/include/crm/pengine/pe_types.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -287,6 +287,8 @@ enum pe_action_flags { + pe_action_reschedule = 0x02000, + pe_action_tracking = 0x04000, + pe_action_dedup = 0x08000, //! Internal state tracking when creating graph ++ ++ pe_action_dc = 0x10000, //! Action may run on DC instead of target + }; + /* *INDENT-ON* */ + +diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c +index 884e1bd..195d055 100644 +--- a/lib/pacemaker/pcmk_sched_allocate.c ++++ b/lib/pacemaker/pcmk_sched_allocate.c +@@ -1026,6 +1026,7 @@ apply_shutdown_lock(pe_resource_t *rsc, pe_working_set_t *data_set) + pe_rsc_info(rsc, + "Cancelling shutdown lock because %s is already active", + rsc->id); ++ pe__clear_resource_history(rsc, rsc->lock_node, data_set); + rsc->lock_node = NULL; + rsc->lock_time = 0; + } +diff --git a/lib/pacemaker/pcmk_sched_graph.c b/lib/pacemaker/pcmk_sched_graph.c +index 2861f3d..355ffca 100644 +--- a/lib/pacemaker/pcmk_sched_graph.c ++++ b/lib/pacemaker/pcmk_sched_graph.c +@@ -586,10 +586,11 @@ update_action(pe_action_t *then, pe_working_set_t *data_set) + + /* 'then' is required, so we must abandon 'first' + * (e.g. a required stop cancels any reload). +- * Only used with reload actions as 'first'. 
+ */ + set_bit(other->action->flags, pe_action_optional); +- clear_bit(first->rsc->flags, pe_rsc_reload); ++ if (!strcmp(first->task, CRMD_ACTION_RELOAD)) { ++ clear_bit(first->rsc->flags, pe_rsc_reload); ++ } + } + + if (first->rsc && then->rsc && (first->rsc != then->rsc) +@@ -1039,6 +1040,11 @@ action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set) + } else if (safe_str_eq(action->task, CRM_OP_LRM_REFRESH)) { + action_xml = create_xml_node(NULL, XML_GRAPH_TAG_CRM_EVENT); + ++ } else if (safe_str_eq(action->task, CRM_OP_LRM_DELETE)) { ++ // CIB-only clean-up for shutdown locks ++ action_xml = create_xml_node(NULL, XML_GRAPH_TAG_CRM_EVENT); ++ crm_xml_add(action_xml, PCMK__XA_MODE, XML_TAG_CIB); ++ + /* } else if(safe_str_eq(action->task, RSC_PROBED)) { */ + /* action_xml = create_xml_node(NULL, XML_GRAPH_TAG_CRM_EVENT); */ + +@@ -1051,6 +1057,7 @@ action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set) + + } else { + action_xml = create_xml_node(NULL, XML_GRAPH_TAG_RSC_OP); ++ + #if ENABLE_VERSIONED_ATTRS + rsc_details = pe_rsc_action_details(action); + #endif +@@ -1392,6 +1399,11 @@ should_dump_action(pe_action_t *action) + log_action(LOG_DEBUG, "Unallocated action", action, false); + return false; + ++ } else if (is_set(action->flags, pe_action_dc)) { ++ crm_trace("Action %s (%d) should be dumped: " ++ "can run on DC instead of %s", ++ action->uuid, action->id, action->node->details->uname); ++ + } else if (pe__is_guest_node(action->node) + && !action->node->details->remote_requires_reset) { + crm_trace("Action %s (%d) should be dumped: " +diff --git a/lib/pacemaker/pcmk_sched_native.c b/lib/pacemaker/pcmk_sched_native.c +index 9ebdd35..714a7a0 100644 +--- a/lib/pacemaker/pcmk_sched_native.c ++++ b/lib/pacemaker/pcmk_sched_native.c +@@ -1403,6 +1403,12 @@ native_internal_constraints(resource_t * rsc, pe_working_set_t * data_set) + pe_order_runnable_left, data_set); + } + ++ // Don't clear resource history if probing on same node ++ custom_action_order(rsc, generate_op_key(rsc->id, CRM_OP_LRM_DELETE, 0), ++ NULL, rsc, generate_op_key(rsc->id, RSC_STATUS, 0), ++ NULL, pe_order_same_node|pe_order_then_cancels_first, ++ data_set); ++ + // Certain checks need allowed nodes + if (check_unfencing || check_utilization || rsc->container) { + allowed_nodes = allowed_nodes_as_list(rsc, data_set); +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 5139e60..87edc83 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -2218,6 +2218,7 @@ unpack_shutdown_lock(xmlNode *rsc_entry, pe_resource_t *rsc, pe_node_t *node, + > (lock_time + data_set->shutdown_lock))) { + pe_rsc_info(rsc, "Shutdown lock for %s on %s expired", + rsc->id, node->details->uname); ++ pe__clear_resource_history(rsc, node, data_set); + } else { + rsc->lock_node = node; + rsc->lock_time = lock_time; +diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c +index 586d92c..b61455d 100644 +--- a/lib/pengine/utils.c ++++ b/lib/pengine/utils.c +@@ -520,6 +520,11 @@ custom_action(resource_t * rsc, char *key, const char *task, + } + action->uuid = strdup(key); + ++ if (safe_str_eq(task, CRM_OP_LRM_DELETE)) { ++ // Resource history deletion for a node can be done on the DC ++ pe_set_action_bit(action, pe_action_dc); ++ } ++ + pe_set_action_bit(action, pe_action_runnable); + if (optional) { + pe_set_action_bit(action, pe_action_optional); +@@ -588,7 +593,8 @@ custom_action(resource_t * rsc, char *key, const char *task, + pe_set_action_bit(action, pe_action_optional); + /* 
action->runnable = FALSE; */ + +- } else if (action->node->details->online == FALSE ++ } else if (is_not_set(action->flags, pe_action_dc) ++ && !(action->node->details->online) + && (!pe__is_guest_node(action->node) + || action->node->details->remote_requires_reset)) { + pe_clear_action_bit(action, pe_action_runnable); +@@ -600,7 +606,8 @@ custom_action(resource_t * rsc, char *key, const char *task, + pe_fence_node(data_set, action->node, "resource actions are unrunnable"); + } + +- } else if (action->node->details->pending) { ++ } else if (is_not_set(action->flags, pe_action_dc) ++ && action->node->details->pending) { + pe_clear_action_bit(action, pe_action_runnable); + do_crm_log(warn_level, "Action %s on %s is unrunnable (pending)", + action->uuid, action->node->details->uname); +@@ -714,6 +721,8 @@ unpack_operation_on_fail(action_t * action) + + value = on_fail; + } ++ } else if (safe_str_eq(action->task, CRM_OP_LRM_DELETE)) { ++ value = "ignore"; + } + + return value; +@@ -2595,3 +2604,24 @@ pe__resource_is_disabled(pe_resource_t *rsc) + } + return false; + } ++ ++/*! ++ * \internal ++ * \brief Create an action to clear a resource's history from CIB ++ * ++ * \param[in] rsc Resource to clear ++ * \param[in] node Node to clear history on ++ * ++ * \return New action to clear resource history ++ */ ++pe_action_t * ++pe__clear_resource_history(pe_resource_t *rsc, pe_node_t *node, ++ pe_working_set_t *data_set) ++{ ++ char *key = NULL; ++ ++ CRM_ASSERT(rsc && node); ++ key = generate_op_key(rsc->id, CRM_OP_LRM_DELETE, 0); ++ return custom_action(rsc, key, CRM_OP_LRM_DELETE, node, FALSE, TRUE, ++ data_set); ++} +-- +1.8.3.1 + diff --git a/SOURCES/020-shutdown-lock.patch b/SOURCES/020-shutdown-lock.patch new file mode 100644 index 0000000..f650b81 --- /dev/null +++ b/SOURCES/020-shutdown-lock.patch @@ -0,0 +1,32 @@ +From 16bcad136dc004b7c7bb9f5044c7ef488c441701 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 21 Nov 2019 15:39:42 -0600 +Subject: [PATCH 16/18] Feature: controller: bump feature set for shutdown-lock + +--- + include/crm/crm.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/include/crm/crm.h b/include/crm/crm.h +index cbf72d3..d2ffb61 100644 +--- a/include/crm/crm.h ++++ b/include/crm/crm.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. 
+ * +@@ -51,7 +51,7 @@ extern "C" { + * >=3.0.13: Fail counts include operation name and interval + * >=3.2.0: DC supports PCMK_LRM_OP_INVALID and PCMK_LRM_OP_NOT_CONNECTED + */ +-# define CRM_FEATURE_SET "3.2.0" ++# define CRM_FEATURE_SET "3.3.0" + + # define EOS '\0' + # define DIMOF(a) ((int) (sizeof(a)/sizeof(a[0])) ) +-- +1.8.3.1 + diff --git a/SOURCES/021-shutdown-lock.patch b/SOURCES/021-shutdown-lock.patch new file mode 100644 index 0000000..cdd9dba --- /dev/null +++ b/SOURCES/021-shutdown-lock.patch @@ -0,0 +1,738 @@ +From a9fdae8b3acd9a271d04f98f9c4e230bfa74efd3 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 6 Jan 2020 16:19:12 -0600 +Subject: [PATCH 17/18] Test: scheduler: add regression tests for shutdown + locks + +--- + cts/cts-scheduler.in | 4 +- + cts/scheduler/shutdown-lock-expiration.dot | 11 ++ + cts/scheduler/shutdown-lock-expiration.exp | 68 +++++++++ + cts/scheduler/shutdown-lock-expiration.scores | 17 +++ + cts/scheduler/shutdown-lock-expiration.summary | 31 ++++ + cts/scheduler/shutdown-lock-expiration.xml | 187 +++++++++++++++++++++++++ + cts/scheduler/shutdown-lock.dot | 11 ++ + cts/scheduler/shutdown-lock.exp | 64 +++++++++ + cts/scheduler/shutdown-lock.scores | 17 +++ + cts/scheduler/shutdown-lock.summary | 31 ++++ + cts/scheduler/shutdown-lock.xml | 186 ++++++++++++++++++++++++ + 11 files changed, 626 insertions(+), 1 deletion(-) + create mode 100644 cts/scheduler/shutdown-lock-expiration.dot + create mode 100644 cts/scheduler/shutdown-lock-expiration.exp + create mode 100644 cts/scheduler/shutdown-lock-expiration.scores + create mode 100644 cts/scheduler/shutdown-lock-expiration.summary + create mode 100644 cts/scheduler/shutdown-lock-expiration.xml + create mode 100644 cts/scheduler/shutdown-lock.dot + create mode 100644 cts/scheduler/shutdown-lock.exp + create mode 100644 cts/scheduler/shutdown-lock.scores + create mode 100644 cts/scheduler/shutdown-lock.summary + create mode 100644 cts/scheduler/shutdown-lock.xml + +diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in +index 8fa16fb..f2957ba 100644 +--- a/cts/cts-scheduler.in ++++ b/cts/cts-scheduler.in +@@ -5,7 +5,7 @@ + # Pacemaker targets compatibility with Python 2.7 and 3.2+ + from __future__ import print_function, unicode_literals, absolute_import, division + +-__copyright__ = "Copyright 2004-2019 the Pacemaker project contributors" ++__copyright__ = "Copyright 2004-2020 the Pacemaker project contributors" + __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + + import io +@@ -956,6 +956,8 @@ TESTS = [ + [ + [ "resource-discovery", "Exercises resource-discovery location constraint option" ], + [ "rsc-discovery-per-node", "Disable resource discovery per node" ], ++ [ "shutdown-lock", "Ensure shutdown lock works properly" ], ++ [ "shutdown-lock-expiration", "Ensure shutdown lock expiration works properly" ], + ], + + # @TODO: If pacemaker implements versioned attributes, uncomment these tests +diff --git a/cts/scheduler/shutdown-lock-expiration.dot b/cts/scheduler/shutdown-lock-expiration.dot +new file mode 100644 +index 0000000..ee99079 +--- /dev/null ++++ b/cts/scheduler/shutdown-lock-expiration.dot +@@ -0,0 +1,11 @@ ++ digraph "g" { ++"Fencing_monitor_120000 node3" [ style=bold color="green" fontcolor="black"] ++"Fencing_start_0 node3" -> "Fencing_monitor_120000 node3" [ style = bold] ++"Fencing_start_0 node3" [ style=bold color="green" fontcolor="black"] ++"Fencing_stop_0 node3" -> "Fencing_start_0 node3" [ style = bold] ++"Fencing_stop_0 node3" 
[ style=bold color="green" fontcolor="black"]
++"rsc2_lrm_delete_0 node2" [ style=bold color="green" fontcolor="black"]
++"rsc2_monitor_10000 node4" [ style=bold color="green" fontcolor="black"]
++"rsc2_start_0 node4" -> "rsc2_monitor_10000 node4" [ style = bold]
++"rsc2_start_0 node4" [ style=bold color="green" fontcolor="black"]
++}
+diff --git a/cts/scheduler/shutdown-lock-expiration.exp b/cts/scheduler/shutdown-lock-expiration.exp
+new file mode 100644
+index 0000000..465f12b
+--- /dev/null
++++ b/cts/scheduler/shutdown-lock-expiration.exp
+@@ -0,0 +1,68 @@
[... 68 added lines of transition-graph XML: markup lost in extraction, not reconstructable here ...]
+diff --git a/cts/scheduler/shutdown-lock-expiration.scores b/cts/scheduler/shutdown-lock-expiration.scores
+new file mode 100644
+index 0000000..e5d435d
+--- /dev/null
++++ b/cts/scheduler/shutdown-lock-expiration.scores
+@@ -0,0 +1,17 @@
++Allocation scores:
++Using the original execution date of: 2020-01-06 22:11:40Z
++native_color: Fencing allocation score on node1: 0
++native_color: Fencing allocation score on node2: 0
++native_color: Fencing allocation score on node3: 0
++native_color: Fencing allocation score on node4: 0
++native_color: Fencing allocation score on node5: 0
++native_color: rsc1 allocation score on node1: INFINITY
++native_color: rsc1 allocation score on node2: -INFINITY
++native_color: rsc1 allocation score on node3: -INFINITY
++native_color: rsc1 allocation score on node4: -INFINITY
++native_color: rsc1 allocation score on node5: -INFINITY
++native_color: rsc2 allocation score on node1: 0
++native_color: rsc2 allocation score on node2: INFINITY
++native_color: rsc2 allocation score on node3: 0
++native_color: rsc2 allocation score on node4: 0
++native_color: rsc2 allocation score on node5: 0
+diff --git a/cts/scheduler/shutdown-lock-expiration.summary b/cts/scheduler/shutdown-lock-expiration.summary
+new file mode 100644
+index 0000000..08c93aa
+--- /dev/null
++++ b/cts/scheduler/shutdown-lock-expiration.summary
+@@ -0,0 +1,31 @@
++Using the original execution date of: 2020-01-06 22:11:40Z
++
++Current cluster status:
++Online: [ node3 node4 node5 ]
++OFFLINE: [ node1 node2 ]
++
++ Fencing (stonith:fence_xvm): Started node3
++ rsc1 (ocf::pacemaker:Dummy): Stopped node1 (LOCKED)
++ rsc2 (ocf::pacemaker:Dummy): Stopped
++
++Transition Summary:
++ * Restart Fencing ( node3 ) due to resource definition change
++ * Start rsc2 ( node4 )
++
++Executing cluster transition:
++ * Resource action: Fencing stop on node3
++ * Resource action: Fencing start on node3
++ * Resource action: Fencing monitor=120000 on node3
++ * Resource action: rsc2 start on node4
++ * Cluster action: lrm_delete for rsc2 on node2
++ * Resource action: rsc2 monitor=10000 on node4
++Using the original execution date of: 2020-01-06 22:11:40Z
++
++Revised cluster status:
++Online: [ node3 node4 node5 ]
++OFFLINE: [ node1 node2 ]
++
++ Fencing (stonith:fence_xvm): Started node3
++ rsc1 (ocf::pacemaker:Dummy): Stopped node1 (LOCKED)
++ rsc2 (ocf::pacemaker:Dummy): Started node4
++
+diff --git a/cts/scheduler/shutdown-lock-expiration.xml b/cts/scheduler/shutdown-lock-expiration.xml
+new file mode 100644
+index 0000000..26f720e
+--- /dev/null
++++ b/cts/scheduler/shutdown-lock-expiration.xml
+@@ -0,0 +1,187 @@
[... 187 added lines of CIB XML: markup lost in extraction, not reconstructable here ...]
+diff --git a/cts/scheduler/shutdown-lock.dot b/cts/scheduler/shutdown-lock.dot
+new file mode 100644
+index 0000000..0a7d8c3
+--- /dev/null
++++ b/cts/scheduler/shutdown-lock.dot
+@@ -0,0 +1,11 @@
++ digraph "g" {
++"Fencing_monitor_120000 node3" [ style=bold color="green" fontcolor="black"]
++"Fencing_start_0 node3" -> "Fencing_monitor_120000 node3" [ style = bold]
++"Fencing_start_0 node3" [ style=bold color="green" fontcolor="black"]
++"Fencing_stop_0 node1" -> "Fencing_start_0 node3" [ style = bold]
++"Fencing_stop_0 node1" -> "do_shutdown node1" [ style = bold]
++"Fencing_stop_0 node1" [ style=bold color="green" fontcolor="black"]
++"do_shutdown node1" [ style=bold color="green" fontcolor="black"]
++"rsc1_stop_0 node1" -> "do_shutdown node1" [ style = bold]
++"rsc1_stop_0 node1" [ style=bold color="green" fontcolor="black"]
++}
+diff --git a/cts/scheduler/shutdown-lock.exp b/cts/scheduler/shutdown-lock.exp
+new file mode 100644
+index 0000000..e8bf9d8
+--- /dev/null
++++ b/cts/scheduler/shutdown-lock.exp
+@@ -0,0 +1,64 @@
[... 64 added lines of transition-graph XML: markup lost in extraction, not reconstructable here ...]
+diff --git a/cts/scheduler/shutdown-lock.scores b/cts/scheduler/shutdown-lock.scores
+new file mode 100644
+index 0000000..e09ebfb
+--- /dev/null
++++ b/cts/scheduler/shutdown-lock.scores
+@@ -0,0 +1,17 @@
++Allocation scores:
++Using the original execution date of: 2020-01-06 21:59:11Z
++native_color: Fencing allocation score on node1: 0
++native_color: Fencing allocation score on node2: 0
++native_color: Fencing allocation score on node3: 0
++native_color: Fencing allocation score on node4: 0
++native_color: Fencing allocation score on node5: 0
++native_color: rsc1 allocation score on node1: INFINITY
++native_color: rsc1 allocation score on node2: -INFINITY
++native_color: rsc1 allocation score on node3: -INFINITY
++native_color: rsc1 allocation score on node4: -INFINITY
++native_color: rsc1 allocation score on node5: -INFINITY
++native_color: rsc2 allocation score on node1: -INFINITY
++native_color: rsc2 allocation score on node2: INFINITY
++native_color: rsc2 allocation score on node3: -INFINITY
++native_color: rsc2 allocation score on node4: -INFINITY
++native_color: rsc2 allocation score on node5: -INFINITY
+diff --git a/cts/scheduler/shutdown-lock.summary b/cts/scheduler/shutdown-lock.summary
+new file mode 100644
+index 0000000..6ed56d1
+--- /dev/null
++++ b/cts/scheduler/shutdown-lock.summary
+@@ -0,0 +1,31 @@
++Using the original execution date of: 2020-01-06 21:59:11Z
++
++Current cluster status:
++Online: [ node1 node3 node4 node5 ]
++OFFLINE: [ node2 ]
++
++ Fencing (stonith:fence_xvm): Started node1
++ rsc1 (ocf::pacemaker:Dummy): Started node1
++ rsc2 (ocf::pacemaker:Dummy): Stopped node2 (LOCKED)
++
++Transition Summary:
++ * Shutdown node1
++ * Move Fencing ( node1 -> node3 )
++ * Stop rsc1 ( node1 ) due to node availability
++
++Executing cluster transition:
++ * Resource action: Fencing stop on node1
++ * Resource action: rsc1 stop on node1
++ * Cluster action: do_shutdown on node1
++ * Resource action: Fencing start on node3
++ * Resource action: Fencing monitor=120000 on node3
++Using the original execution date of: 2020-01-06 21:59:11Z
++
++Revised cluster status:
++Online: [ node1 node3 node4 node5 ]
++OFFLINE: [ node2 ]
++
++ Fencing (stonith:fence_xvm): Started node3
++ rsc1 (ocf::pacemaker:Dummy): Stopped
++ rsc2 (ocf::pacemaker:Dummy): Stopped node2 (LOCKED)
++
+diff --git a/cts/scheduler/shutdown-lock.xml b/cts/scheduler/shutdown-lock.xml
+new file mode 100644
+index 0000000..ec6db30
+--- /dev/null
++++ b/cts/scheduler/shutdown-lock.xml
+@@ -0,0 +1,186 @@
[... 186 added lines of CIB XML: markup lost in extraction, not reconstructable here ...]
+--
+1.8.3.1
+
diff --git a/SOURCES/022-shutdown-lock.patch b/SOURCES/022-shutdown-lock.patch
new file mode 100644
index 0000000..cfcef11
--- /dev/null
+++ b/SOURCES/022-shutdown-lock.patch
@@ -0,0 +1,51 @@
+From 5656b7d486569702ea6f3fe695c2fba366c970ac Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Thu, 12 Dec 2019 09:26:00 -0600
+Subject: [PATCH 18/18] Doc: Pacemaker Explained: document shutdown lock
+ options
+
+---
+ doc/Pacemaker_Explained/en-US/Ch-Options.txt | 27 +++++++++++++++++++++++++++
+ 1 file changed, 27 insertions(+)
+
+diff --git a/doc/Pacemaker_Explained/en-US/Ch-Options.txt b/doc/Pacemaker_Explained/en-US/Ch-Options.txt
+index f864987..35856aa 100644
+--- a/doc/Pacemaker_Explained/en-US/Ch-Options.txt
++++ b/doc/Pacemaker_Explained/en-US/Ch-Options.txt
+@@ -389,6 +389,33 @@ rules with +date_spec+ are only guaranteed to be checked this often, and it
+ also serves as a fail-safe for certain classes of scheduler bugs. A value of 0
+ disables this polling; positive values are a time interval.
+ 
++| shutdown-lock | false |
++The default of false allows active resources to be recovered elsewhere when
++their node is cleanly shut down, which is what the vast majority of users will
++want. However, some users prefer to make resources highly available only for
++failures, with no recovery for clean shutdowns. If this option is true,
++resources active on a node when it is cleanly shut down are kept "locked" to
++that node (not allowed to run elsewhere) until they start again on that node
++after it rejoins (or for at most shutdown-lock-limit, if set). Stonith
++resources and Pacemaker Remote connections are never locked. Clone and bundle
++instances and the master role of promotable clones are currently never locked,
++though support could be added in a future release. Locks may be manually
++cleared using the `--refresh` option of `crm_resource` (both the resource and
++node must be specified; this works with remote nodes if their connection
++resource's target-role is set to Stopped, but not if Pacemaker Remote is
++stopped on the remote node without disabling the connection resource).
++indexterm:[shutdown-lock,Cluster Option] ++indexterm:[Cluster,Option,shutdown-lock] ++ ++| shutdown-lock-limit | 0 | ++If shutdown-lock is true, and this is set to a nonzero time duration, locked ++resources will be allowed to start after this much time has passed since the ++node shutdown was initiated, even if the node has not rejoined. (This works ++with remote nodes only if their connection resource's target-role is set to ++Stopped.) ++indexterm:[shutdown-lock-limit,Cluster Option] ++indexterm:[Cluster,Option,shutdown-lock-limit] ++ + | remove-after-stop | FALSE | + indexterm:[remove-after-stop,Cluster Option] + indexterm:[Cluster,Option,remove-after-stop] +-- +1.8.3.1 + diff --git a/SOURCES/023-curses.patch b/SOURCES/023-curses.patch new file mode 100644 index 0000000..c1d9a91 --- /dev/null +++ b/SOURCES/023-curses.patch @@ -0,0 +1,27 @@ +From 426f06cc088d11d6db0c45b434e5ce6b69da78b4 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 2 Jan 2020 15:08:58 -0500 +Subject: [PATCH] Fix: tools: Fix definition of curses_indented_printf. + +The placeholder version that is built if curses is not enabled does not +have a type that matches the header file. Correct that. +--- + tools/crm_mon_curses.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/crm_mon_curses.c b/tools/crm_mon_curses.c +index c0dbedb..ecd0584 100644 +--- a/tools/crm_mon_curses.c ++++ b/tools/crm_mon_curses.c +@@ -368,7 +368,7 @@ curses_indented_vprintf(pcmk__output_t *out, const char *format, va_list args) { + + G_GNUC_PRINTF(2, 3) + void +-curses_indented_printf(pcmk__output_t *out, const char *format, va_list args) { ++curses_indented_printf(pcmk__output_t *out, const char *format, ...) { + return; + } + +-- +1.8.3.1 + diff --git a/SOURCES/024-crm_mon-cgi.patch b/SOURCES/024-crm_mon-cgi.patch new file mode 100644 index 0000000..c6743eb --- /dev/null +++ b/SOURCES/024-crm_mon-cgi.patch @@ -0,0 +1,33 @@ +From 5b98dd71cef867a115a1b07fca2351ba430baf08 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 10 Jan 2020 09:54:59 -0500 +Subject: [PATCH] Fix: tools: Re-enable CGI output from crm_mon. + +The CGI header was not being written out because "false" was being +passed to the finish function. That was being passed because we didn't +want the HTML to be printed out without the refresh header. The fix is +just to s/false/true, and change the order so the extra header is added +first. 
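
For context, a hedged sketch of the mode this repairs; the wrapper path and
the --web-cgi option name are assumptions based on crm_mon's long-standing
command line, not something this patch adds:

    #!/bin/sh
    # Hypothetical CGI wrapper: with this fix, crm_mon once again emits the
    # CGI Content-Type header before dumping the HTML document.
    exec /usr/sbin/crm_mon --web-cgi
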
+--- + tools/crm_mon.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/tools/crm_mon.c b/tools/crm_mon.c +index c1dcf29..4b28bef 100644 +--- a/tools/crm_mon.c ++++ b/tools/crm_mon.c +@@ -1854,10 +1854,9 @@ static void + handle_html_output(crm_exit_t exit_code) { + xmlNodePtr html = NULL; + +- out->finish(out, exit_code, false, (void **) &html); + pcmk__html_add_header(html, "meta", "http-equiv", "refresh", "content", + crm_itoa(options.reconnect_msec/1000), NULL); +- htmlDocDump(out->dest, html->doc); ++ out->finish(out, exit_code, true, (void **) &html); + } + + /* +-- +1.8.3.1 + diff --git a/SOURCES/025-clear-attrs.patch b/SOURCES/025-clear-attrs.patch new file mode 100644 index 0000000..842656c --- /dev/null +++ b/SOURCES/025-clear-attrs.patch @@ -0,0 +1,37 @@ +From 01b463bd715d48dde5bf76ca3a2e78e31f0ffaa1 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Tue, 21 Jan 2020 17:25:57 -0600 +Subject: [PATCH] Fix: controller: clear leaving node's transient attributes + even if there is no DC + +--- + daemons/controld/controld_callbacks.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c +index f7e3db2..21f831a 100644 +--- a/daemons/controld/controld_callbacks.c ++++ b/daemons/controld/controld_callbacks.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2019 the Pacemaker project contributors ++ * Copyright 2004-2020 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -205,7 +205,11 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d + cib_scope_local); + } + +- } else if(AM_I_DC) { ++ } else if (AM_I_DC || (fsa_our_dc == NULL)) { ++ /* This only needs to be done once, so normally the DC should do ++ * it. However if there is no DC, every node must do it, since ++ * there is no other way to ensure some one node does it. ++ */ + if (appeared) { + te_trigger_stonith_history_sync(FALSE); + } else { +-- +1.8.3.1 + diff --git a/SPECS/pacemaker.spec b/SPECS/pacemaker.spec index dc73608..dab302c 100644 --- a/SPECS/pacemaker.spec +++ b/SPECS/pacemaker.spec @@ -5,7 +5,15 @@ %global gname haclient ## Where to install Pacemaker documentation +%if 0%{?suse_version} > 0 +%global pcmk_docdir %{_docdir}/%{name}-%{version} +%else +%if 0%{?rhel} > 7 %global pcmk_docdir %{_docdir}/%{name}-doc +%else +%global pcmk_docdir %{_docdir}/%{name} +%endif +%endif ## GitHub entity that distributes source (for ease of using a fork) %global github_owner ClusterLabs @@ -13,12 +21,12 @@ ## Upstream pacemaker version, and its package version (specversion ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) -%global pcmkversion 2.0.2 -%global specversion 3 +%global pcmkversion 2.0.3 +%global specversion 5 ## Upstream commit (or git tag, such as "Pacemaker-" plus the ## {pcmkversion} macro for an official release) to use for this package -%global commit 744a30d655c9fbd66ad6e103697db0283bb90779 +%global commit 4b1f869f0f64ef0d248b6aa4781d38ecccf83318 ## Since git v2.11, the extent of abbreviation is autoscaled by default ## (used to be constant of 7), so we need to convey it for non-tags, too. 
%global commit_abbrev 7 @@ -80,6 +88,43 @@ %define gnutls_priorities %{?pcmk_gnutls_priorities}%{!?pcmk_gnutls_priorities:@SYSTEM} %endif +%if !%{defined _rundir} +%if 0%{?fedora} >= 15 || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1200 +%define _rundir /run +%else +%define _rundir /var/run +%endif +%endif + +%if 0%{?fedora} > 22 || 0%{?rhel} > 7 +%global supports_recommends 1 +%endif + +## Different distros name certain packages differently +## (note: corosync libraries also differ, but all provide corosync-devel) +%if 0%{?suse_version} > 0 +%global pkgname_bzip2_devel libbz2-devel +%global pkgname_docbook_xsl docbook-xsl-stylesheets +%global pkgname_gnutls_devel libgnutls-devel +%global pkgname_shadow_utils shadow +%global pkgname_procps procps +%global pkgname_glue_libs libglue +%global pkgname_pcmk_libs lib%{name}3 +%global hacluster_id 90 +%else +%global pkgname_libtool_devel libtool-ltdl-devel +%global pkgname_libtool_devel_arch libtool-ltdl-devel%{?_isa} +%global pkgname_bzip2_devel bzip2-devel +%global pkgname_docbook_xsl docbook-style-xsl +%global pkgname_gnutls_devel gnutls-devel +%global pkgname_shadow_utils shadow-utils +%global pkgname_procps procps-ng +%global pkgname_publican publican +%global pkgname_glue_libs cluster-glue-libs +%global pkgname_pcmk_libs %{name}-libs +%global hacluster_id 189 +%endif + # Python-related definitions ## Use Python 3 on certain platforms if major version not specified @@ -195,7 +240,7 @@ Name: pacemaker Summary: Scalable High-Availability cluster resource manager Version: %{pcmkversion} -Release: %{pcmk_release}%{?dist}.2 +Release: %{pcmk_release}%{?dist} %if %{defined _unitdir} License: GPLv2+ and LGPLv2+ %else @@ -211,31 +256,46 @@ Source0: https://github.com/%{github_owner}/%{name}/archive/%{commit}/%{na Source1: nagios-agents-metadata-%{nagios_hash}.tar.gz # upstream commits -Patch1: 001-xmldiffs.patch -Patch2: 002-failed-monitors.patch -Patch3: 003-fencer-logs.patch -Patch4: 004-concurrent-fencing.patch -Patch5: 005-glib-priorities.patch -Patch6: 006-bundle-fixes.patch -Patch7: 007-fork-controld_fencing.patch -Patch8: 008-stonith_admin-header-refactoring.patch -Patch9: 009-improve-pacemaker_remote-handling.patch -Patch10: 010-fix-history-handing-on-fenced-restart.patch -Patch11: 011-crm_report.patch -Patch12: 012-fork-close.patch -Patch13: 013-soft-limit.patch +Patch1: 001-status-deletion.patch +Patch2: 002-status-deletion.patch +Patch3: 003-return-codes.patch +Patch4: 004-unused.patch +Patch5: 005-shutdown-lock.patch +Patch6: 006-shutdown-lock.patch +Patch7: 007-shutdown-lock.patch +Patch8: 008-shutdown-lock.patch +Patch9: 009-shutdown-lock.patch +Patch10: 010-shutdown-lock.patch +Patch11: 011-shutdown-lock.patch +Patch12: 012-shutdown-lock.patch +Patch13: 013-shutdown-lock.patch +Patch14: 014-shutdown-lock.patch +Patch15: 015-shutdown-lock.patch +Patch16: 016-shutdown-lock.patch +Patch17: 017-shutdown-lock.patch +Patch18: 018-shutdown-lock.patch +Patch19: 019-shutdown-lock.patch +Patch20: 020-shutdown-lock.patch +Patch21: 021-shutdown-lock.patch +Patch22: 022-shutdown-lock.patch +Patch23: 023-curses.patch +Patch24: 024-crm_mon-cgi.patch +Patch25: 025-clear-attrs.patch + +# downstream-only commits +#Patch100: xxx.patch Requires: resource-agents -Requires: %{name}-libs%{?_isa} = %{version}-%{release} +Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} Requires: %{name}-cluster-libs%{?_isa} = %{version}-%{release} Requires: %{name}-cli = %{version}-%{release} %if !%{defined _unitdir} -Requires: procps-ng +Requires: %{pkgname_procps} 
Requires: psmisc %endif %{?systemd_requires} -ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 %{arm} +ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 Requires: %{python_path} BuildRequires: %{python_name}-devel @@ -248,14 +308,14 @@ BuildRequires: libqb-devel >= 0.17.0 BuildRequires: coreutils findutils grep sed # Required for core functionality -BuildRequires: automake autoconf gcc libtool pkgconfig libtool-ltdl-devel +BuildRequires: automake autoconf gcc libtool pkgconfig %{?pkgname_libtool_devel} BuildRequires: pkgconfig(glib-2.0) >= 2.16 BuildRequires: libxml2-devel libxslt-devel libuuid-devel -BuildRequires: bzip2-devel +BuildRequires: %{pkgname_bzip2_devel} # Enables optional functionality -BuildRequires: ncurses-devel docbook-style-xsl -BuildRequires: help2man gnutls-devel pam-devel pkgconfig(dbus-1) +BuildRequires: ncurses-devel %{pkgname_docbook_xsl} +BuildRequires: help2man %{pkgname_gnutls_devel} pam-devel pkgconfig(dbus-1) %if %{systemd_native} BuildRequires: pkgconfig(systemd) @@ -265,16 +325,16 @@ BuildRequires: pkgconfig(systemd) BuildRequires: git Requires: corosync >= 2.0.0 -BuildRequires: corosynclib-devel >= 2.0.0 +BuildRequires: corosync-devel >= 2.0.0 %if %{with stonithd} -BuildRequires: cluster-glue-libs-devel +BuildRequires: %{pkgname_glue_libs}-devel %endif ## (note no avoiding effect when building through non-customized mock) %if !%{bleeding} %if %{with doc} -BuildRequires: inkscape asciidoc publican +BuildRequires: inkscape asciidoc %{?pkgname_publican} %endif %endif @@ -305,14 +365,15 @@ Available rpmbuild rebuild options: License: GPLv2+ and LGPLv2+ Summary: Command line tools for controlling Pacemaker clusters Group: System Environment/Daemons -Requires: %{name}-libs%{?_isa} = %{version}-%{release} -%if 0%{?fedora} > 22 || 0%{?rhel} > 7 +Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} +%if 0%{?supports_recommends} #Recommends: pcmk-cluster-manager = %{version}-%{release} +# For crm_report Requires: tar Requires: bzip2 %endif Requires: perl-TimeDate -Requires: procps-ng +Requires: %{pkgname_procps} Requires: psmisc Requires(post):coreutils @@ -324,27 +385,27 @@ The %{name}-cli package contains command line tools that can be used to query and control the cluster from machines that may, or may not, be part of the cluster. -%package libs +%package -n %{pkgname_pcmk_libs} License: GPLv2+ and LGPLv2+ Summary: Core Pacemaker libraries Group: System Environment/Daemons -Requires(pre): shadow-utils +Requires(pre): %{pkgname_shadow_utils} Requires: %{name}-schemas = %{version}-%{release} # sbd 1.4.0+ supports the libpe_status API for pe_working_set_t Conflicts: sbd < 1.4.0 -%description libs +%description -n %{pkgname_pcmk_libs} Pacemaker is an advanced, scalable High-Availability cluster resource manager. -The %{name}-libs package contains shared libraries needed for cluster +The %{pkgname_pcmk_libs} package contains shared libraries needed for cluster nodes and those just running the CLI tools. 
%package cluster-libs License: GPLv2+ and LGPLv2+ Summary: Cluster Libraries used by Pacemaker Group: System Environment/Daemons -Requires: %{name}-libs%{?_isa} = %{version}-%{release} +Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} %description cluster-libs Pacemaker is an advanced, scalable High-Availability cluster resource @@ -362,11 +423,11 @@ License: GPLv2+ and LGPLv2+ and BSD %endif Summary: Pacemaker remote daemon for non-cluster nodes Group: System Environment/Daemons -Requires: %{name}-libs%{?_isa} = %{version}-%{release} +Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} Requires: %{name}-cli = %{version}-%{release} Requires: resource-agents %if !%{defined _unitdir} -Requires: procps-ng +Requires: %{pkgname_procps} %endif # -remote can be fully independent of systemd %{?systemd_ordering}%{!?systemd_ordering:%{?systemd_requires}} @@ -381,23 +442,23 @@ The %{name}-remote package contains the Pacemaker Remote daemon which is capable of extending pacemaker functionality to remote nodes not running the full corosync/cluster stack. -%package libs-devel +%package -n %{pkgname_pcmk_libs}-devel License: GPLv2+ and LGPLv2+ Summary: Pacemaker development package Group: Development/Libraries -Requires: %{name}-libs%{?_isa} = %{version}-%{release} +Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} Requires: %{name}-cluster-libs%{?_isa} = %{version}-%{release} -Requires: libuuid-devel%{?_isa} libtool-ltdl-devel%{?_isa} +Requires: libuuid-devel%{?_isa} %{?pkgname_libtool_devel_arch} Requires: libxml2-devel%{?_isa} libxslt-devel%{?_isa} -Requires: bzip2-devel%{?_isa} glib2-devel%{?_isa} +Requires: %{pkgname_bzip2_devel}%{?_isa} glib2-devel%{?_isa} Requires: libqb-devel%{?_isa} -Requires: corosynclib-devel%{?_isa} >= 2.0.0 +Requires: corosync-devel >= 2.0.0 -%description libs-devel +%description -n %{pkgname_pcmk_libs}-devel Pacemaker is an advanced, scalable High-Availability cluster resource manager. -The %{name}-libs-devel package contains headers and shared libraries +The %{pkgname_pcmk_libs}-devel package contains headers and shared libraries for developing tools for Pacemaker. %package cts @@ -405,8 +466,9 @@ License: GPLv2+ and LGPLv2+ Summary: Test framework for cluster-related technologies like Pacemaker Group: System Environment/Daemons Requires: %{python_path} -Requires: %{name}-libs = %{version}-%{release} -Requires: procps-ng +Requires: %{pkgname_pcmk_libs} = %{version}-%{release} +Requires: %{name}-cli = %{version}-%{release} +Requires: %{pkgname_procps} Requires: psmisc BuildArch: noarch @@ -478,7 +540,7 @@ monitor resources. # Early versions of autotools (e.g. 
RHEL <= 5) do not support --docdir export docdir=%{pcmk_docdir} -export systemdunitdir=%{?_unitdir}%{!?_unitdir:no} +export systemdsystemunitdir=%{?_unitdir}%{!?_unitdir:no} # RHEL changes pacemaker's concurrent-fencing default to true export CPPFLAGS="-DDEFAULT_CONCURRENT_FENCING_TRUE" @@ -506,8 +568,9 @@ export LDFLAGS_HARDENED_LIB="%{?_hardening_ldflags}" %{!?with_doc: --with-brand=} \ %{?gnutls_priorities: --with-gnutls-priorities="%{gnutls_priorities}"} \ --with-initdir=%{_initrddir} \ + --with-runstatedir=%{_rundir} \ --localstatedir=%{_var} \ - --with-bug-url=https://bugzilla.redhat.com/ \ + --with-bug-url=https://bugzilla.redhat.com/ \ --with-nagios \ --with-nagios-metadata-dir=%{_datadir}/pacemaker/nagios/plugins-metadata/ \ --with-nagios-plugin-dir=%{_libdir}/nagios/plugins/ \ @@ -519,7 +582,7 @@ sed -i 's|^hardcode_libdir_flag_spec=.*|hardcode_libdir_flag_spec=""|g' libtool sed -i 's|^runpath_var=LD_RUN_PATH|runpath_var=DIE_RPATH_DIE|g' libtool %endif -make %{_smp_mflags} V=1 all +make %{_smp_mflags} V=1 %check { cts/cts-scheduler --run load-stopped-loop \ @@ -695,17 +758,17 @@ fi %systemd_postun_with_restart crm_mon.service %endif -%pre libs -getent group %{gname} >/dev/null || groupadd -r %{gname} -g 189 -getent passwd %{uname} >/dev/null || useradd -r -g %{gname} -u 189 -s /sbin/nologin -c "cluster user" %{uname} +%pre -n %{pkgname_pcmk_libs} +getent group %{gname} >/dev/null || groupadd -r %{gname} -g %{hacluster_id} +getent passwd %{uname} >/dev/null || useradd -r -g %{gname} -u %{hacluster_id} -s /sbin/nologin -c "cluster user" %{uname} exit 0 %if %{defined ldconfig_scriptlets} %ldconfig_scriptlets libs %ldconfig_scriptlets cluster-libs %else -%post libs -p /sbin/ldconfig -%postun libs -p /sbin/ldconfig +%post -n %{pkgname_pcmk_libs} -p /sbin/ldconfig +%postun -n %{pkgname_pcmk_libs} -p /sbin/ldconfig %post cluster-libs -p /sbin/ldconfig %postun cluster-libs -p /sbin/ldconfig @@ -825,7 +888,7 @@ exit 0 %dir %attr (770, %{uname}, %{gname}) %{_var}/log/pacemaker %dir %attr (770, %{uname}, %{gname}) %{_var}/log/pacemaker/bundles -%files libs +%files -n %{pkgname_pcmk_libs} %{_libdir}/libcib.so.* %{_libdir}/liblrmd.so.* %{_libdir}/libcrmservice.so.* @@ -880,7 +943,7 @@ exit 0 %doc COPYING %doc ChangeLog -%files libs-devel +%files -n %{pkgname_pcmk_libs}-devel %{_includedir}/pacemaker %{_libdir}/*.so %if %{with coverage} @@ -897,6 +960,7 @@ exit 0 %{_datadir}/pacemaker/*.rng %{_datadir}/pacemaker/*.xsl %{_datadir}/pacemaker/api +%{_datadir}/pkgconfig/pacemaker-schemas.pc %files nagios-plugins-metadata %dir %{_datadir}/pacemaker/nagios/plugins-metadata @@ -904,13 +968,37 @@ exit 0 %license %{nagios_name}-%{nagios_hash}/COPYING %changelog -* Mon Oct 28 2019 Ken Gaillot - 2.0.2-3.2 -- Correct gating test syntax and add z-stream tag to build -- Resolves: rhbz#1764181 - -* Mon Oct 28 2019 Ken Gaillot - 2.0.2-3.1 -- Improve efficiency when closing file descriptors after a fork -- Resolves: rhbz#1764181 +* Mon Jan 27 2020 Ken Gaillot - 2.0.3-5 +- Clear leaving node's attributes if there is no DC +- Resolves: rhbz1791841 + +* Thu Jan 16 2020 Ken Gaillot - 2.0.3-4 +- Implement shutdown-lock feature +- Resolves: rhbz1712584 + +* Wed Nov 27 2019 Ken Gaillot - 2.0.3-3 +- Rebase on Pacemaker-2.0.3 final release +- Resolves: rhbz1752538 + +* Wed Nov 13 2019 Ken Gaillot - 2.0.3-2 +- Rebase on Pacemaker-2.0.3-rc3 +- Resolves: rhbz1752538 + +* Thu Oct 31 2019 Ken Gaillot - 2.0.3-1 +- Rebase on Pacemaker-2.0.3-rc2 +- Parse crm_mon --fence-history option correctly +- Put timeout on controller 
waiting for scheduler response +- Offer Pacemaker Remote option for bind address +- Calculate cluster recheck interval dynamically +- Clarify crm_resource help text +- Reduce system calls after forking a child process +- Resolves: rhbz1699978 +- Resolves: rhbz1725236 +- Resolves: rhbz1743377 +- Resolves: rhbz1747553 +- Resolves: rhbz1748805 +- Resolves: rhbz1752538 +- Resolves: rhbz1762025 * Mon Aug 26 2019 Ken Gaillot - 2.0.2-3 - Make pacemaker-cli require tar and bzip2