Blob Blame History Raw
From 11e8a3b9c8e35301b197724658ec2d243aec3336 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 22 Nov 2019 16:39:54 -0600
Subject: [PATCH 01/11] Refactor: controller: rename struct recurring_op_s to
 active_op_t

... because it holds both recurring and pending non-recurring actions,
and the name was confusing
---
 crmd/crmd_lrm.h  |  8 ++++----
 crmd/lrm.c       | 18 +++++++++---------
 crmd/lrm_state.c |  4 ++--
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h
index 0870817..7c6a3c4 100644
--- a/crmd/crmd_lrm.h
+++ b/crmd/crmd_lrm.h
@@ -42,8 +42,8 @@ typedef struct resource_history_s {
 
 void history_free(gpointer data);
 
-/* TODO - Replace this with lrmd_event_data_t */
-struct recurring_op_s {
+// In-flight action (recurring or pending)
+typedef struct active_op_s {
     int call_id;
     int interval;
     gboolean remove;
@@ -54,7 +54,7 @@ struct recurring_op_s {
     char *op_key;
     char *user_data;
     GHashTable *params;
-};
+} active_op_t;
 
 typedef struct lrm_state_s {
     const char *node_name;
@@ -171,4 +171,4 @@ void remote_ra_process_maintenance_nodes(xmlNode *xml);
 gboolean remote_ra_controlling_guest(lrm_state_t * lrm_state);
 
 void process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
-                       struct recurring_op_s *pending, xmlNode *action_xml);
+                       active_op_t *pending, xmlNode *action_xml);
diff --git a/crmd/lrm.c b/crmd/lrm.c
index 437840f..e459465 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -400,7 +400,7 @@ lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state,
     GHashTableIter gIter;
     const char *key = NULL;
     rsc_history_t *entry = NULL;
-    struct recurring_op_s *pending = NULL;
+    active_op_t *pending = NULL;
 
     crm_debug("Checking for active resources before exit");
 
@@ -917,7 +917,7 @@ static gboolean
 lrm_remove_deleted_op(gpointer key, gpointer value, gpointer user_data)
 {
     const char *rsc = user_data;
-    struct recurring_op_s *pending = value;
+    active_op_t *pending = value;
 
     if (crm_str_eq(rsc, pending->rsc_id, TRUE)) {
         crm_info("Removing op %s:%d for deleted resource %s",
@@ -1148,7 +1148,7 @@ cancel_op(lrm_state_t * lrm_state, const char *rsc_id, const char *key, int op,
 {
     int rc = pcmk_ok;
     char *local_key = NULL;
-    struct recurring_op_s *pending = NULL;
+    active_op_t *pending = NULL;
 
     CRM_CHECK(op != 0, return FALSE);
     CRM_CHECK(rsc_id != NULL, return FALSE);
@@ -1213,7 +1213,7 @@ cancel_action_by_key(gpointer key, gpointer value, gpointer user_data)
 {
     gboolean remove = FALSE;
     struct cancel_data *data = user_data;
-    struct recurring_op_s *op = (struct recurring_op_s *)value;
+    active_op_t *op = value;
 
     if (crm_str_eq(op->op_key, data->key, TRUE)) {
         data->done = TRUE;
@@ -2104,7 +2104,7 @@ stop_recurring_action_by_rsc(gpointer key, gpointer value, gpointer user_data)
 {
     gboolean remove = FALSE;
     struct stop_recurring_action_s *event = user_data;
-    struct recurring_op_s *op = (struct recurring_op_s *)value;
+    active_op_t *op = value;
 
     if (op->interval != 0 && crm_str_eq(op->rsc_id, event->rsc->id, TRUE)) {
         crm_debug("Cancelling op %d for %s (%s)", op->call_id, op->rsc_id, (char*)key);
@@ -2119,7 +2119,7 @@ stop_recurring_actions(gpointer key, gpointer value, gpointer user_data)
 {
     gboolean remove = FALSE;
     lrm_state_t *lrm_state = user_data;
-    struct recurring_op_s *op = (struct recurring_op_s *)value;
+    active_op_t *op = value;
 
     if (op->interval != 0) {
         crm_info("Cancelling op %d for %s (%s)", op->call_id, op->rsc_id, key);
@@ -2294,9 +2294,9 @@ do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operat
          * for them to complete during shutdown
          */
         char *call_id_s = make_stop_id(rsc->id, call_id);
-        struct recurring_op_s *pending = NULL;
+        active_op_t *pending = NULL;
 
-        pending = calloc(1, sizeof(struct recurring_op_s));
+        pending = calloc(1, sizeof(active_op_t));
         crm_trace("Recording pending op: %d - %s %s", call_id, op_id, call_id_s);
 
         pending->call_id = call_id;
@@ -2517,7 +2517,7 @@ did_lrm_rsc_op_fail(lrm_state_t *lrm_state, const char * rsc_id,
 
 void
 process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
-                  struct recurring_op_s *pending, xmlNode *action_xml)
+                  active_op_t *pending, xmlNode *action_xml)
 {
     char *op_id = NULL;
     char *op_key = NULL;
diff --git a/crmd/lrm_state.c b/crmd/lrm_state.c
index 8d07ef5..0f39c3d 100644
--- a/crmd/lrm_state.c
+++ b/crmd/lrm_state.c
@@ -58,7 +58,7 @@ free_deletion_op(gpointer value)
 static void
 free_recurring_op(gpointer value)
 {
-    struct recurring_op_s *op = (struct recurring_op_s *)value;
+    active_op_t *op = value;
 
     free(op->user_data);
     free(op->rsc_id);
@@ -75,7 +75,7 @@ fail_pending_op(gpointer key, gpointer value, gpointer user_data)
 {
     lrmd_event_data_t event = { 0, };
     lrm_state_t *lrm_state = user_data;
-    struct recurring_op_s *op = (struct recurring_op_s *)value;
+    active_op_t *op = value;
 
     crm_trace("Pre-emptively failing %s_%s_%d on %s (call=%s, %s)",
               op->rsc_id, op->op_type, op->interval,
-- 
1.8.3.1


From 9795f5401957563de2307f94c393dc83ce41d3d1 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 22 Nov 2019 16:45:31 -0600
Subject: [PATCH 02/11] Refactor: controller: convert active_op_t booleans to
 bitmask

---
 crmd/crmd_lrm.h |  8 ++++++--
 crmd/lrm.c      | 11 +++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h
index 7c6a3c4..eb27d84 100644
--- a/crmd/crmd_lrm.h
+++ b/crmd/crmd_lrm.h
@@ -42,12 +42,16 @@ typedef struct resource_history_s {
 
 void history_free(gpointer data);
 
+enum active_op_e {
+    active_op_remove    = (1 << 0),
+    active_op_cancelled = (1 << 1),
+};
+
 // In-flight action (recurring or pending)
 typedef struct active_op_s {
     int call_id;
     int interval;
-    gboolean remove;
-    gboolean cancelled;
+    uint32_t flags; // bitmask of active_op_e
     unsigned int start_time;
     char *rsc_id;
     char *op_type;
diff --git a/crmd/lrm.c b/crmd/lrm.c
index e459465..2ae0d85 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -1159,18 +1159,17 @@ cancel_op(lrm_state_t * lrm_state, const char *rsc_id, const char *key, int op,
     pending = g_hash_table_lookup(lrm_state->pending_ops, key);
 
     if (pending) {
-        if (remove && pending->remove == FALSE) {
-            pending->remove = TRUE;
+        if (remove && is_not_set(pending->flags, active_op_remove)) {
+            set_bit(pending->flags, active_op_remove);
             crm_debug("Scheduling %s for removal", key);
         }
 
-        if (pending->cancelled) {
+        if (is_set(pending->flags, active_op_cancelled)) {
             crm_debug("Operation %s already cancelled", key);
             free(local_key);
             return FALSE;
         }
-
-        pending->cancelled = TRUE;
+        set_bit(pending->flags, active_op_cancelled);
 
     } else {
         crm_info("No pending op found for %s", key);
@@ -2636,7 +2635,7 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
         crm_err("Recurring operation %s was cancelled without transition information",
                 op_key);
 
-    } else if (pending->remove) {
+    } else if (is_set(pending->flags, active_op_remove)) {
         /* This recurring operation was cancelled (by us) and pending, and we
          * have been waiting for it to finish.
          */
-- 
1.8.3.1


From 620ff6cb923e4fe6c3d1a9e3345a26ff5180d56d Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 22 Nov 2019 16:58:25 -0600
Subject: [PATCH 03/11] Refactor: controller: remove unused argument

---
 crmd/lrm.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/crmd/lrm.c b/crmd/lrm.c
index 2ae0d85..80fcd69 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -44,8 +44,8 @@ static int delete_rsc_status(lrm_state_t * lrm_state, const char *rsc_id, int ca
 
 static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op,
                                        const char *rsc_id, const char *operation);
-static void do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operation,
-                          xmlNode * msg, xmlNode * request);
+static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
+                          const char *operation, xmlNode *msg);
 
 void send_direct_ack(const char *to_host, const char *to_sys,
                      lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *rsc_id);
@@ -1851,7 +1851,7 @@ do_lrm_invoke(long long action,
                           crm_rsc_delete, user_name);
 
         } else {
-            do_lrm_rsc_op(lrm_state, rsc, operation, input->xml, input->msg);
+            do_lrm_rsc_op(lrm_state, rsc, operation, input->xml);
         }
 
         lrmd_free_rsc_info(rsc);
@@ -2167,8 +2167,8 @@ record_pending_op(const char *node_name, lrmd_rsc_info_t *rsc, lrmd_event_data_t
 }
 
 static void
-do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operation, xmlNode * msg,
-              xmlNode * request)
+do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
+              const char *operation, xmlNode *msg)
 {
     int call_id = 0;
     char *op_id = NULL;
-- 
1.8.3.1


From 2716f2f4c334b927f9e253979dce27b56bbff46a Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 6 Dec 2019 12:15:05 -0600
Subject: [PATCH 04/11] Refactor: scheduler: combine two "if" statements

... for readability, and ease of adding another block later
---
 pengine/graph.c | 120 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 60 insertions(+), 60 deletions(-)

diff --git a/pengine/graph.c b/pengine/graph.c
index 9edd1a1..cba30d0 100644
--- a/pengine/graph.c
+++ b/pengine/graph.c
@@ -1095,71 +1095,71 @@ action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set)
         return action_xml;
     }
 
-    /* List affected resource */
-    if (action->rsc) {
-        if (is_set(action->flags, pe_action_pseudo) == FALSE) {
-            int lpc = 0;
-
-            xmlNode *rsc_xml = create_xml_node(action_xml, crm_element_name(action->rsc->xml));
-
-            const char *attr_list[] = {
-                XML_AGENT_ATTR_CLASS,
-                XML_AGENT_ATTR_PROVIDER,
-                XML_ATTR_TYPE
-            };
-
-            if (is_set(action->rsc->flags, pe_rsc_orphan) && action->rsc->clone_name) {
-                /* Do not use the 'instance free' name here as that
-                 * might interfere with the instance we plan to keep.
-                 * Ie. if there are more than two named /anonymous/
-                 * instances on a given node, we need to make sure the
-                 * command goes to the right one.
-                 *
-                 * Keep this block, even when everyone is using
-                 * 'instance free' anonymous clone names - it means
-                 * we'll do the right thing if anyone toggles the
-                 * unique flag to 'off'
-                 */
-                crm_debug("Using orphan clone name %s instead of %s", action->rsc->id,
-                          action->rsc->clone_name);
-                crm_xml_add(rsc_xml, XML_ATTR_ID, action->rsc->clone_name);
-                crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->id);
+    if (action->rsc && is_not_set(action->flags, pe_action_pseudo)) {
+        int lpc = 0;
+        xmlNode *rsc_xml = NULL;
+        const char *attr_list[] = {
+            XML_AGENT_ATTR_CLASS,
+            XML_AGENT_ATTR_PROVIDER,
+            XML_ATTR_TYPE
+        };
+
+        // List affected resource
+
+        rsc_xml = create_xml_node(action_xml,
+                                  crm_element_name(action->rsc->xml));
+        if (is_set(action->rsc->flags, pe_rsc_orphan)
+            && action->rsc->clone_name) {
+            /* Do not use the 'instance free' name here as that
+             * might interfere with the instance we plan to keep.
+             * Ie. if there are more than two named /anonymous/
+             * instances on a given node, we need to make sure the
+             * command goes to the right one.
+             *
+             * Keep this block, even when everyone is using
+             * 'instance free' anonymous clone names - it means
+             * we'll do the right thing if anyone toggles the
+             * unique flag to 'off'
+             */
+            crm_debug("Using orphan clone name %s instead of %s", action->rsc->id,
+                      action->rsc->clone_name);
+            crm_xml_add(rsc_xml, XML_ATTR_ID, action->rsc->clone_name);
+            crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->id);
 
-            } else if (is_not_set(action->rsc->flags, pe_rsc_unique)) {
-                const char *xml_id = ID(action->rsc->xml);
-
-                crm_debug("Using anonymous clone name %s for %s (aka. %s)", xml_id, action->rsc->id,
-                          action->rsc->clone_name);
-
-                /* ID is what we'd like client to use
-                 * ID_LONG is what they might know it as instead
-                 *
-                 * ID_LONG is only strictly needed /here/ during the
-                 * transition period until all nodes in the cluster
-                 * are running the new software /and/ have rebooted
-                 * once (meaning that they've only ever spoken to a DC
-                 * supporting this feature).
-                 *
-                 * If anyone toggles the unique flag to 'on', the
-                 * 'instance free' name will correspond to an orphan
-                 * and fall into the clause above instead
-                 */
-                crm_xml_add(rsc_xml, XML_ATTR_ID, xml_id);
-                if (action->rsc->clone_name && safe_str_neq(xml_id, action->rsc->clone_name)) {
-                    crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->clone_name);
-                } else {
-                    crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->id);
-                }
+        } else if (is_not_set(action->rsc->flags, pe_rsc_unique)) {
+            const char *xml_id = ID(action->rsc->xml);
+
+            crm_debug("Using anonymous clone name %s for %s (aka. %s)", xml_id, action->rsc->id,
+                      action->rsc->clone_name);
 
+            /* ID is what we'd like client to use
+             * ID_LONG is what they might know it as instead
+             *
+             * ID_LONG is only strictly needed /here/ during the
+             * transition period until all nodes in the cluster
+             * are running the new software /and/ have rebooted
+             * once (meaning that they've only ever spoken to a DC
+             * supporting this feature).
+             *
+             * If anyone toggles the unique flag to 'on', the
+             * 'instance free' name will correspond to an orphan
+             * and fall into the clause above instead
+             */
+            crm_xml_add(rsc_xml, XML_ATTR_ID, xml_id);
+            if (action->rsc->clone_name && safe_str_neq(xml_id, action->rsc->clone_name)) {
+                crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->clone_name);
             } else {
-                CRM_ASSERT(action->rsc->clone_name == NULL);
-                crm_xml_add(rsc_xml, XML_ATTR_ID, action->rsc->id);
+                crm_xml_add(rsc_xml, XML_ATTR_ID_LONG, action->rsc->id);
             }
 
-            for (lpc = 0; lpc < DIMOF(attr_list); lpc++) {
-                crm_xml_add(rsc_xml, attr_list[lpc],
-                            g_hash_table_lookup(action->rsc->meta, attr_list[lpc]));
-            }
+        } else {
+            CRM_ASSERT(action->rsc->clone_name == NULL);
+            crm_xml_add(rsc_xml, XML_ATTR_ID, action->rsc->id);
+        }
+
+        for (lpc = 0; lpc < DIMOF(attr_list); lpc++) {
+            crm_xml_add(rsc_xml, attr_list[lpc],
+                        g_hash_table_lookup(action->rsc->meta, attr_list[lpc]));
         }
     }
 
-- 
1.8.3.1


From dc73907eeae769d70b04e4a8feae208c12018d83 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 19 Dec 2019 17:18:41 -0600
Subject: [PATCH 05/11] Log: scheduler: drop redundant trace messages

We logged "applying placement constraints" three times.
---
 pengine/allocate.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/pengine/allocate.c b/pengine/allocate.c
index b819af3..30d29e1 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -652,21 +652,15 @@ check_actions(pe_working_set_t * data_set)
     }
 }
 
-static gboolean
+static void
 apply_placement_constraints(pe_working_set_t * data_set)
 {
-    GListPtr gIter = NULL;
-
-    crm_trace("Applying constraints...");
-
-    for (gIter = data_set->placement_constraints; gIter != NULL; gIter = gIter->next) {
+    for (GList *gIter = data_set->placement_constraints;
+         gIter != NULL; gIter = gIter->next) {
         pe__location_t *cons = gIter->data;
 
         cons->rsc_lh->cmds->rsc_location(cons->rsc_lh, cons);
     }
-
-    return TRUE;
-
 }
 
 static gboolean
@@ -1026,10 +1020,7 @@ stage2(pe_working_set_t * data_set)
 {
     GListPtr gIter = NULL;
 
-    crm_trace("Applying placement constraints");
-
-    gIter = data_set->nodes;
-    for (; gIter != NULL; gIter = gIter->next) {
+    for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
         node_t *node = (node_t *) gIter->data;
 
         if (node == NULL) {
-- 
1.8.3.1


From ae404a4e04b50e6177942256c5b8ff57a36af309 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Sat, 7 Dec 2019 12:13:11 -0600
Subject: [PATCH 06/11] Refactor: libcrmcommon: convenience functions for list
 length comparisons

... for efficiency and readability
---
 include/crm/common/internal.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h
index 0d225f5..0f7158c 100644
--- a/include/crm/common/internal.h
+++ b/include/crm/common/internal.h
@@ -84,6 +84,20 @@ crm_getpid_s()
     return crm_strdup_printf("%lu", (unsigned long) getpid());
 }
 
+// More efficient than g_list_length(list) == 1
+static inline bool
+pcmk__list_of_1(GList *list)
+{
+    return list && (list->next == NULL);
+}
+
+// More efficient than g_list_length(list) > 1
+static inline bool
+pcmk__list_of_multiple(GList *list)
+{
+    return list && (list->next != NULL);
+}
+
 /* convenience functions for failure-related node attributes */
 
 #define CRM_FAIL_COUNT_PREFIX   "fail-count"
-- 
1.8.3.1


From 78a1d57ae4028e3068b13d3526840f24c7798140 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 16 Dec 2019 14:13:30 -0600
Subject: [PATCH 07/11] Refactor: libcrmcommon: add convenience macros for
 plurals

I've avoided making s_if_plural() an official API due to its hackiness, but
it really is the best solution for now. Promote it to pcmk__plural_s(), along
with a companion macro pcmk__plural_alt() for more complicated plurals.
---
 include/crm/common/internal.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h
index 0f7158c..5bbf01b 100644
--- a/include/crm/common/internal.h
+++ b/include/crm/common/internal.h
@@ -72,6 +72,29 @@ bool crm_compress_string(const char *data, int length, int max, char **result,
                          unsigned int *result_len);
 gint crm_alpha_sort(gconstpointer a, gconstpointer b);
 
+/* Correctly displaying singular or plural is complicated; consider "1 node has"
+ * vs. "2 nodes have". A flexible solution is to pluralize entire strings, e.g.
+ *
+ * if (a == 1) {
+ *     crm_info("singular message"):
+ * } else {
+ *     crm_info("plural message");
+ * }
+ *
+ * though even that's not sufficient for all languages besides English (if we
+ * ever desire to do translations of output and log messages). But the following
+ * convenience macros are "good enough" and more concise for many cases.
+ */
+
+/* Example:
+ * crm_info("Found %d %s", nentries,
+ *          pcmk__plural_alt(nentries, "entry", "entries"));
+ */
+#define pcmk__plural_alt(i, s1, s2) (((i) == 1)? (s1) : (s2))
+
+// Example: crm_info("Found %d node%s", nnodes, pcmk__plural_s(nnodes));
+#define pcmk__plural_s(i) pcmk__plural_alt(i, "", "s")
+
 static inline int
 crm_strlen_zero(const char *s)
 {
-- 
1.8.3.1


From e33b5e297c24182c15ace7142efef1add7664643 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 12 Dec 2019 20:50:50 -0600
Subject: [PATCH 08/11] Log: controller: improve join messages

---
 crmd/fsa.c     |  79 +++++++-----
 crmd/join_dc.c | 375 +++++++++++++++++++++++++++++++++------------------------
 2 files changed, 263 insertions(+), 191 deletions(-)

diff --git a/crmd/fsa.c b/crmd/fsa.c
index 9b1189a..7f6a3ac 100644
--- a/crmd/fsa.c
+++ b/crmd/fsa.c
@@ -469,12 +469,53 @@ log_fsa_input(fsa_data_t * stored_msg)
     }
 }
 
+static void
+check_join_counts(fsa_data_t *msg_data)
+{
+    int count;
+    guint npeers;
+
+    count = crmd_join_phase_count(crm_join_finalized);
+    if (count > 0) {
+        crm_err("%d cluster node%s failed to confirm join",
+                count, pcmk__plural_s(count));
+        crmd_join_phase_log(LOG_NOTICE);
+        return;
+    }
+
+    npeers = crm_active_peers();
+    count = crmd_join_phase_count(crm_join_confirmed);
+    if (count == npeers) {
+        if (npeers == 1) {
+            crm_debug("Sole active cluster node is fully joined");
+        } else {
+            crm_debug("All %d active cluster nodes are fully joined", count);
+        }
+
+    } else if (count > npeers) {
+        crm_err("New election needed because more nodes confirmed join "
+                "than are in membership (%d > %u)", count, npeers);
+        register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
+
+    } else if (saved_ccm_membership_id != crm_peer_seq) {
+        crm_info("New join needed because membership changed (%llu -> %llu)",
+                 saved_ccm_membership_id, crm_peer_seq);
+        register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
+
+    } else {
+        crm_warn("Only %d of %u active cluster nodes fully joined "
+                 "(%d did not respond to offer)",
+                 count, npeers, crmd_join_phase_count(crm_join_welcomed));
+    }
+}
+
 long long
 do_state_transition(long long actions,
                     enum crmd_fsa_state cur_state,
                     enum crmd_fsa_state next_state, fsa_data_t * msg_data)
 {
     int level = LOG_INFO;
+    int count = 0;
     long long tmp = actions;
     gboolean clear_recovery_bit = TRUE;
 
@@ -572,13 +613,14 @@ do_state_transition(long long actions,
                 crm_warn("Progressed to state %s after %s",
                          fsa_state2string(next_state), fsa_cause2string(cause));
             }
-            if (crmd_join_phase_count(crm_join_welcomed) > 0) {
-                crm_warn("%u cluster nodes failed to respond"
-                         " to the join offer.", crmd_join_phase_count(crm_join_welcomed));
+            count = crmd_join_phase_count(crm_join_welcomed);
+            if (count > 0) {
+                crm_warn("%d cluster node%s failed to respond to join offer",
+                         count, pcmk__plural_s(count));
                 crmd_join_phase_log(LOG_NOTICE);
 
             } else {
-                crm_debug("All %d cluster nodes responded to the join offer.",
+                crm_debug("All cluster nodes (%d) responded to join offer",
                           crmd_join_phase_count(crm_join_integrated));
             }
             break;
@@ -590,34 +632,7 @@ do_state_transition(long long actions,
                 crm_info("Progressed to state %s after %s",
                          fsa_state2string(next_state), fsa_cause2string(cause));
             }
-
-            if (crmd_join_phase_count(crm_join_finalized) > 0) {
-                crm_err("%u cluster nodes failed to confirm their join.",
-                        crmd_join_phase_count(crm_join_finalized));
-                crmd_join_phase_log(LOG_NOTICE);
-
-            } else if (crmd_join_phase_count(crm_join_confirmed)
-                       == crm_active_peers()) {
-                crm_debug("All %u cluster nodes are"
-                          " eligible to run resources.", crm_active_peers());
-
-            } else if (crmd_join_phase_count(crm_join_confirmed) > crm_active_peers()) {
-                crm_err("We have more confirmed nodes than our membership does: %d vs. %d",
-                        crmd_join_phase_count(crm_join_confirmed), crm_active_peers());
-                register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
-
-            } else if (saved_ccm_membership_id != crm_peer_seq) {
-                crm_info("Membership changed: %llu -> %llu - join restart",
-                         saved_ccm_membership_id, crm_peer_seq);
-                register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
-
-            } else {
-                crm_warn("Only %u of %u cluster "
-                         "nodes are eligible to run resources - continue %d",
-                         crmd_join_phase_count(crm_join_confirmed),
-                         crm_active_peers(), crmd_join_phase_count(crm_join_welcomed));
-            }
-/* 			initialize_join(FALSE); */
+            check_join_counts(msg_data);
             break;
 
         case S_STOPPING:
diff --git a/crmd/join_dc.c b/crmd/join_dc.c
index 857e760..cdb3f77 100644
--- a/crmd/join_dc.c
+++ b/crmd/join_dc.c
@@ -36,7 +36,11 @@ void finalize_join_for(gpointer key, gpointer value, gpointer user_data);
 void finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data);
 gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source);
 
+/* Numeric counter used to identify join rounds (an unsigned int would be
+ * appropriate, except we get and set it in XML as int)
+ */
 static int current_join_id = 0;
+
 unsigned long long saved_ccm_membership_id = 0;
 
 void
@@ -44,12 +48,7 @@ crm_update_peer_join(const char *source, crm_node_t * node, enum crm_join_phase
 {
     enum crm_join_phase last = 0;
 
-    if(node == NULL) {
-        crm_err("Could not update join because node not specified"
-                CRM_XS " join-%u source=%s phase=%s",
-                current_join_id, source, crm_join_phase_str(phase));
-        return;
-    }
+    CRM_CHECK(node != NULL, return);
 
     /* Remote nodes do not participate in joins */
     if (is_set(node->flags, crm_remote_node)) {
@@ -59,21 +58,23 @@ crm_update_peer_join(const char *source, crm_node_t * node, enum crm_join_phase
     last = node->join;
 
     if(phase == last) {
-        crm_trace("%s: Node %s[%u] - join-%u phase still %s",
-                  source, node->uname, node->id, current_join_id,
-                  crm_join_phase_str(last));
+        crm_trace("Node %s join-%d phase is still %s "
+                  CRM_XS " nodeid=%u source=%s",
+                  node->uname, current_join_id, crm_join_phase_str(last),
+                  node->id, source);
 
     } else if ((phase <= crm_join_none) || (phase == (last + 1))) {
         node->join = phase;
-        crm_info("%s: Node %s[%u] - join-%u phase %s -> %s",
-                 source, node->uname, node->id, current_join_id,
-                 crm_join_phase_str(last), crm_join_phase_str(phase));
+        crm_trace("Node %s join-%d phase is now %s (was %s) "
+                  CRM_XS " nodeid=%u source=%s",
+                 node->uname, current_join_id, crm_join_phase_str(phase),
+                 crm_join_phase_str(last), node->id, source);
 
     } else {
-        crm_err("Could not update join for node %s because phase transition invalid "
-                CRM_XS " join-%u source=%s node_id=%u last=%s new=%s",
-                node->uname, current_join_id, source, node->id,
-                crm_join_phase_str(last), crm_join_phase_str(phase));
+        crm_warn("Rejecting join-%d phase update for node %s because "
+                 "can't go from %s to %s " CRM_XS " nodeid=%u source=%s",
+                 current_join_id, node->uname, crm_join_phase_str(last),
+                 crm_join_phase_str(phase), node->id, source);
     }
 }
 
@@ -83,9 +84,7 @@ initialize_join(gboolean before)
     GHashTableIter iter;
     crm_node_t *peer = NULL;
 
-    /* clear out/reset a bunch of stuff */
-    crm_debug("join-%d: Initializing join data (flag=%s)",
-              current_join_id, before ? "true" : "false");
+    crm_debug("Starting new join round join-%d", current_join_id);
 
     g_hash_table_iter_init(&iter, crm_peer_cache);
     while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) {
@@ -138,7 +137,9 @@ join_make_offer(gpointer key, gpointer value, gpointer user_data)
 
     CRM_ASSERT(member != NULL);
     if (crm_is_peer_active(member) == FALSE) {
-        crm_info("Not making an offer to %s: not active (%s)", member->uname, member->state);
+        crm_info("Not making join-%d offer to inactive node %s",
+                 current_join_id,
+                 (member->uname? member->uname : "with unknown name"));
         if(member->expected == NULL && safe_str_eq(member->state, CRM_NODE_LOST)) {
             /* You would think this unsafe, but in fact this plus an
              * active resource is what causes it to be fenced.
@@ -155,17 +156,21 @@ join_make_offer(gpointer key, gpointer value, gpointer user_data)
     }
 
     if (member->uname == NULL) {
-        crm_info("No recipient for welcome message.(Node uuid:%s)", member->uuid);
+        crm_info("Not making join-%d offer to node uuid %s with unknown name",
+                 current_join_id, member->uuid);
         return;
     }
 
     if (saved_ccm_membership_id != crm_peer_seq) {
         saved_ccm_membership_id = crm_peer_seq;
-        crm_info("Making join offers based on membership %llu", crm_peer_seq);
+        crm_info("Making join-%d offers based on membership event %llu",
+                 current_join_id, crm_peer_seq);
     }
 
     if(user_data && member->join > crm_join_none) {
-        crm_info("Skipping %s: already known %d", member->uname, member->join);
+        crm_info("Not making join-%d offer to already known node %s (%s)",
+                 current_join_id, member->uname,
+                 crm_join_phase_str(member->join));
         return;
     }
 
@@ -173,14 +178,11 @@ join_make_offer(gpointer key, gpointer value, gpointer user_data)
 
     offer = create_dc_message(CRM_OP_JOIN_OFFER, member->uname);
 
-    /* send the welcome */
-    crm_info("join-%d: Sending offer to %s", current_join_id, member->uname);
-
+    crm_info("Sending join-%d offer to %s", current_join_id, member->uname);
     send_cluster_message(member, crm_msg_crmd, offer, TRUE);
     free_xml(offer);
 
     crm_update_peer_join(__FUNCTION__, member, crm_join_welcomed);
-    /* crm_update_peer_expected(__FUNCTION__, member, CRMD_JOINSTATE_PENDING); */
 }
 
 /*	 A_DC_JOIN_OFFER_ALL	*/
@@ -190,6 +192,8 @@ do_dc_join_offer_all(long long action,
                      enum crmd_fsa_state cur_state,
                      enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 {
+    int count;
+
     /* reset everyone's status back to down or in_ccm in the CIB
      *
      * any nodes that are active in the CIB but not in the CCM list
@@ -205,9 +209,11 @@ do_dc_join_offer_all(long long action,
     }
     g_hash_table_foreach(crm_peer_cache, join_make_offer, NULL);
 
+    count = crmd_join_phase_count(crm_join_welcomed);
+    crm_info("Waiting on join-%d requests from %d outstanding node%s",
+             current_join_id, count, pcmk__plural_s(count));
+
     /* don't waste time by invoking the PE yet; */
-    crm_info("join-%d: Waiting on %d outstanding join acks",
-             current_join_id, crmd_join_phase_count(crm_join_welcomed));
 }
 
 /*	 A_DC_JOIN_OFFER_ONE	*/
@@ -219,50 +225,40 @@ do_dc_join_offer_one(long long action,
 {
     crm_node_t *member;
     ha_msg_input_t *welcome = NULL;
-
-    const char *op = NULL;
+    int count;
     const char *join_to = NULL;
 
-    if (msg_data->data) {
-        welcome = fsa_typed_data(fsa_dt_ha_msg);
-
-    } else {
-        crm_info("An unknown node joined - (re-)offer to any unconfirmed nodes");
+    if (msg_data->data == NULL) {
+        crm_info("Making join-%d offers to any unconfirmed nodes "
+                 "because an unknown node joined", current_join_id);
         g_hash_table_foreach(crm_peer_cache, join_make_offer, &member);
         check_join_state(cur_state, __FUNCTION__);
         return;
     }
 
+    welcome = fsa_typed_data(fsa_dt_ha_msg);
     if (welcome == NULL) {
-        crm_err("Attempt to send welcome message without a message to reply to!");
+        // fsa_typed_data() already logged an error
         return;
     }
 
     join_to = crm_element_value(welcome->msg, F_CRM_HOST_FROM);
     if (join_to == NULL) {
-        crm_err("Attempt to send welcome message without a host to reply to!");
+        crm_err("Can't make join-%d offer to unknown node", current_join_id);
         return;
     }
-
     member = crm_get_peer(0, join_to);
-    op = crm_element_value(welcome->msg, F_CRM_TASK);
-    if (join_to != NULL && (cur_state == S_INTEGRATION || cur_state == S_FINALIZE_JOIN)) {
-        /* note: it _is_ possible that a node will have been
-         *  sick or starting up when the original offer was made.
-         *  however, it will either re-announce itself in due course
-         *  _or_ we can re-store the original offer on the client.
-         */
-        crm_trace("(Re-)offering membership to %s...", join_to);
-    }
 
-    crm_info("join-%d: Processing %s request from %s in state %s",
-             current_join_id, op, join_to, fsa_state2string(cur_state));
+    /* It is possible that a node will have been sick or starting up when the
+     * original offer was made. However, it will either re-announce itself in
+     * due course, or we can re-store the original offer on the client.
+     */
 
     crm_update_peer_join(__FUNCTION__, member, crm_join_none);
     join_make_offer(NULL, member, NULL);
 
-    /* always offer to the DC (ourselves)
-     * this ensures the correct value for max_generation_from
+    /* If the offer isn't to the local node, make an offer to the local node as
+     * well, to ensure the correct value for max_generation_from.
      */
     if (strcmp(join_to, fsa_our_uname) != 0) {
         member = crm_get_peer(0, fsa_our_uname);
@@ -274,9 +270,11 @@ do_dc_join_offer_one(long long action,
      */
     abort_transition(INFINITY, tg_restart, "Node join", NULL);
 
+    count = crmd_join_phase_count(crm_join_welcomed);
+    crm_info("Waiting on join-%d requests from %d outstanding node%s",
+             current_join_id, count, pcmk__plural_s(count));
+
     /* don't waste time by invoking the PE yet; */
-    crm_debug("Waiting on %d outstanding join acks for join-%d",
-              crmd_join_phase_count(crm_join_welcomed), current_join_id);
 }
 
 static int
@@ -309,20 +307,29 @@ do_dc_join_filter_offer(long long action,
 
     int cmp = 0;
     int join_id = -1;
+    int count = 0;
     gboolean ack_nack_bool = TRUE;
-    const char *ack_nack = CRMD_JOINSTATE_MEMBER;
     ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);
 
     const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM);
     const char *ref = crm_element_value(join_ack->msg, F_CRM_REFERENCE);
+    crm_node_t *join_node = NULL;
 
-    crm_node_t *join_node = crm_get_peer(0, join_from);
-
-    crm_debug("Processing req from %s", join_from);
+    if (join_from == NULL) {
+        crm_err("Ignoring invalid join request without node name");
+        return;
+    }
+    join_node = crm_get_peer(0, join_from);
 
-    generation = join_ack->xml;
     crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id);
+    if (join_id != current_join_id) {
+        crm_debug("Ignoring join-%d request from %s because we are on join-%d",
+                  join_id, join_from, current_join_id);
+        check_join_state(cur_state, __FUNCTION__);
+        return;
+    }
 
+    generation = join_ack->xml;
     if (max_generation_xml != NULL && generation != NULL) {
         int lpc = 0;
 
@@ -337,61 +344,63 @@ do_dc_join_filter_offer(long long action,
         }
     }
 
-    if (join_id != current_join_id) {
-        crm_debug("Invalid response from %s: join-%d vs. join-%d",
-                  join_from, join_id, current_join_id);
-        check_join_state(cur_state, __FUNCTION__);
-        return;
+    if (ref == NULL) {
+        ref = "none"; // for logging only
+    }
 
-    } else if (join_node == NULL || crm_is_peer_active(join_node) == FALSE) {
-        crm_err("Node %s is not a member", join_from);
+    if (crm_is_peer_active(join_node) == FALSE) {
+        crm_err("Rejecting join-%d request from inactive node %s "
+                CRM_XS " ref=%s", join_id, join_from, ref);
         ack_nack_bool = FALSE;
 
     } else if (generation == NULL) {
-        crm_err("Generation was NULL");
+        crm_err("Rejecting invalid join-%d request from node %s "
+                "missing CIB generation " CRM_XS " ref=%s",
+                join_id, join_from, ref);
         ack_nack_bool = FALSE;
 
     } else if (max_generation_xml == NULL) {
+        crm_debug("Accepting join-%d request from %s "
+                  "(with first CIB generation) " CRM_XS " ref=%s",
+                  join_id, join_from, ref);
         max_generation_xml = copy_xml(generation);
         max_generation_from = strdup(join_from);
 
     } else if (cmp < 0 || (cmp == 0 && safe_str_eq(join_from, fsa_our_uname))) {
-        crm_debug("%s has a better generation number than"
-                  " the current max %s", join_from, max_generation_from);
-        if (max_generation_xml) {
-            crm_log_xml_debug(max_generation_xml, "Max generation");
-        }
-        crm_log_xml_debug(generation, "Their generation");
+        crm_debug("Accepting join-%d request from %s (with better "
+                  "CIB generation than current best from %s) " CRM_XS " ref=%s",
+                  join_id, join_from, max_generation_from, ref);
+        crm_log_xml_debug(max_generation_xml, "Old max generation");
+        crm_log_xml_debug(generation, "New max generation");
 
         free(max_generation_from);
         free_xml(max_generation_xml);
 
         max_generation_from = strdup(join_from);
         max_generation_xml = copy_xml(join_ack->xml);
+
+    } else {
+        crm_debug("Accepting join-%d request from %s " CRM_XS " ref=%s",
+                  join_id, join_from, ref);
     }
 
     if (ack_nack_bool == FALSE) {
-        /* NACK this client */
-        ack_nack = CRMD_JOINSTATE_NACK;
         crm_update_peer_join(__FUNCTION__, join_node, crm_join_nack);
-        crm_err("Rejecting cluster join request from %s " CRM_XS
-                " NACK join-%d ref=%s", join_from, join_id, ref);
-
+        crm_update_peer_expected(__FUNCTION__, join_node, CRMD_JOINSTATE_NACK);
     } else {
-        crm_debug("join-%d: Welcoming node %s (ref %s)", join_id, join_from, ref);
         crm_update_peer_join(__FUNCTION__, join_node, crm_join_integrated);
+        crm_update_peer_expected(__FUNCTION__, join_node, CRMD_JOINSTATE_MEMBER);
     }
 
-    crm_update_peer_expected(__FUNCTION__, join_node, ack_nack);
-
-    crm_debug("%u nodes have been integrated into join-%d",
-              crmd_join_phase_count(crm_join_integrated), join_id);
-
+    count = crmd_join_phase_count(crm_join_integrated);
+    crm_debug("%d node%s currently integrated in join-%d",
+              count, pcmk__plural_s(count), join_id);
 
     if (check_join_state(cur_state, __FUNCTION__) == FALSE) {
         /* don't waste time by invoking the PE yet; */
-        crm_debug("join-%d: Still waiting on %d outstanding offers",
-                  join_id, crmd_join_phase_count(crm_join_welcomed));
+        count = crmd_join_phase_count(crm_join_welcomed);
+        crm_debug("Waiting on join-%d requests from %d outstanding node%s",
+                  join_id, count, pcmk__plural_s(count));
     }
 }
 
@@ -404,21 +413,24 @@ do_dc_join_finalize(long long action,
 {
     char *sync_from = NULL;
     int rc = pcmk_ok;
+    int count_welcomed = crmd_join_phase_count(crm_join_welcomed);
+    int count_integrated = crmd_join_phase_count(crm_join_integrated);
 
     /* This we can do straight away and avoid clients timing us out
      *  while we compute the latest CIB
      */
-    crm_debug("Finalizing join-%d for %d clients",
-              current_join_id, crmd_join_phase_count(crm_join_integrated));
-
-    crmd_join_phase_log(LOG_INFO);
-    if (crmd_join_phase_count(crm_join_welcomed) != 0) {
-        crm_info("Waiting for %d more nodes", crmd_join_phase_count(crm_join_welcomed));
+    if (count_welcomed != 0) {
+        crm_debug("Waiting on join-%d requests from %d outstanding node%s "
+                  "before finalizing join", current_join_id, count_welcomed,
+                  pcmk__plural_s(count_welcomed));
+        crmd_join_phase_log(LOG_DEBUG);
         /* crmd_fsa_stall(FALSE); Needed? */
         return;
 
-    } else if (crmd_join_phase_count(crm_join_integrated) == 0) {
-        /* Nothing to do */
+    } else if (count_integrated == 0) {
+        crm_debug("Finalization not needed for join-%d at the current time",
+                  current_join_id);
+        crmd_join_phase_log(LOG_DEBUG);
         check_join_state(fsa_state, __FUNCTION__);
         return;
     }
@@ -429,8 +441,9 @@ do_dc_join_finalize(long long action,
     }
 
     if (is_set(fsa_input_register, R_IN_TRANSITION)) {
-        crm_warn("Delaying response to cluster join offer while transition in progress "
-                 CRM_XS " join-%d", current_join_id);
+        crm_warn("Delaying join-%d finalization while transition in progress",
+                 current_join_id);
+        crmd_join_phase_log(LOG_DEBUG);
         crmd_fsa_stall(FALSE);
         return;
     }
@@ -439,18 +452,20 @@ do_dc_join_finalize(long long action,
         /* ask for the agreed best CIB */
         sync_from = strdup(max_generation_from);
         set_bit(fsa_input_register, R_CIB_ASKED);
-        crm_notice("Syncing the Cluster Information Base from %s to rest of cluster "
-                   CRM_XS " join-%d", sync_from, current_join_id);
-        crm_log_xml_notice(max_generation_xml, "Requested version");
+        crm_notice("Finalizing join-%d for %d node%s (sync'ing CIB from %s)",
+                   current_join_id, count_integrated,
+                   pcmk__plural_s(count_integrated), sync_from);
+        crm_log_xml_notice(max_generation_xml, "Requested CIB version");
 
     } else {
         /* Send _our_ CIB out to everyone */
         sync_from = strdup(fsa_our_uname);
-        crm_info("join-%d: Syncing our CIB to the rest of the cluster",
-                 current_join_id);
-        crm_log_xml_debug(max_generation_xml, "Requested version");
+        crm_debug("Finalizing join-%d for %d node%s (sync'ing from local CIB)",
+                  current_join_id, count_integrated,
+                  pcmk__plural_s(count_integrated));
+        crm_log_xml_debug(max_generation_xml, "Requested CIB version");
     }
-
+    crmd_join_phase_log(LOG_DEBUG);
 
     rc = fsa_cib_conn->cmds->sync_from(fsa_cib_conn, sync_from, NULL, cib_quorum_override);
     fsa_register_cib_callback(rc, FALSE, sync_from, finalize_sync_callback);
@@ -462,26 +477,33 @@ finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, voi
     CRM_LOG_ASSERT(-EPERM != rc);
     clear_bit(fsa_input_register, R_CIB_ASKED);
     if (rc != pcmk_ok) {
-        do_crm_log((rc == -pcmk_err_old_data ? LOG_WARNING : LOG_ERR),
-                   "Sync from %s failed: %s", (char *)user_data, pcmk_strerror(rc));
+        do_crm_log(((rc == -pcmk_err_old_data)? LOG_WARNING : LOG_ERR),
+                   "Could not sync CIB from %s in join-%d: %s",
+                   (char *) user_data, current_join_id, pcmk_strerror(rc));
 
         /* restart the whole join process */
         register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION_DC, NULL, NULL, __FUNCTION__);
 
-    } else if (AM_I_DC && fsa_state == S_FINALIZE_JOIN) {
+    } else if (!AM_I_DC) {
+        crm_debug("Sync'ed CIB for join-%d but no longer DC", current_join_id);
+
+    } else if (fsa_state != S_FINALIZE_JOIN) {
+        crm_debug("Sync'ed CIB for join-%d but no longer in S_FINALIZE_JOIN (%s)",
+                  current_join_id, fsa_state2string(fsa_state));
+
+    } else {
         set_bit(fsa_input_register, R_HAVE_CIB);
         clear_bit(fsa_input_register, R_CIB_ASKED);
 
         /* make sure dc_uuid is re-set to us */
         if (check_join_state(fsa_state, __FUNCTION__) == FALSE) {
-            crm_debug("Notifying %d clients of join-%d results",
-                      crmd_join_phase_count(crm_join_integrated), current_join_id);
+            int count_integrated = crmd_join_phase_count(crm_join_integrated);
+
+            crm_debug("Notifying %d node%s of join-%d results",
+                      count_integrated, pcmk__plural_s(count_integrated),
+                      current_join_id);
             g_hash_table_foreach(crm_peer_cache, finalize_join_for, NULL);
         }
-
-    } else {
-        crm_debug("No longer the DC in S_FINALIZE_JOIN: %s/%s",
-                  AM_I_DC ? "DC" : "CRMd", fsa_state2string(fsa_state));
     }
 }
 
@@ -491,11 +513,14 @@ join_update_complete_callback(xmlNode * msg, int call_id, int rc, xmlNode * outp
     fsa_data_t *msg_data = NULL;
 
     if (rc == pcmk_ok) {
-        crm_debug("Join update %d complete", call_id);
+        crm_debug("join-%d node history update (via CIB call %d) complete",
+                  current_join_id, call_id);
         check_join_state(fsa_state, __FUNCTION__);
 
     } else {
-        crm_err("Join update %d failed", call_id);
+        crm_err("join-%d node history update (via CIB call %d) failed: %s "
+                "(next transition may determine resource status incorrectly)",
+                current_join_id, call_id, pcmk_strerror(rc));
         crm_log_xml_debug(msg, "failed");
         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
     }
@@ -514,60 +539,75 @@ do_dc_join_ack(long long action,
 
     const char *op = crm_element_value(join_ack->msg, F_CRM_TASK);
     const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM);
-    crm_node_t *peer = crm_get_peer(0, join_from);
+    crm_node_t *peer = NULL;
 
-    if (safe_str_neq(op, CRM_OP_JOIN_CONFIRM) || peer == NULL) {
-        crm_debug("Ignoring op=%s message from %s", op, join_from);
+    // Sanity checks
+    if (join_from == NULL) {
+        crm_warn("Ignoring message received without node identification");
+        return;
+    }
+    if (op == NULL) {
+        crm_warn("Ignoring message received from %s without task", join_from);
         return;
     }
 
-    crm_trace("Processing ack from %s", join_from);
-    crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id);
+    if (strcmp(op, CRM_OP_JOIN_CONFIRM)) {
+        crm_debug("Ignoring '%s' message from %s while waiting for '%s'",
+                  op, join_from, CRM_OP_JOIN_CONFIRM);
+        return;
+    }
 
+    if (crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id) != 0) {
+        crm_warn("Ignoring join confirmation from %s without valid join ID",
+                 join_from);
+        return;
+    }
+
+    peer = crm_get_peer(0, join_from);
     if (peer->join != crm_join_finalized) {
-        crm_info("Join not in progress: ignoring join-%d from %s (phase = %d)",
-                 join_id, join_from, peer->join);
+        crm_info("Ignoring out-of-sequence join-%d confirmation from %s "
+                 "(currently %s not %s)",
+                 join_id, join_from, crm_join_phase_str(peer->join),
+                 crm_join_phase_str(crm_join_finalized));
         return;
+    }
 
-    } else if (join_id != current_join_id) {
-        crm_err("Invalid response from %s: join-%d vs. join-%d",
-                join_from, join_id, current_join_id);
+    if (join_id != current_join_id) {
+        crm_err("Rejecting join-%d confirmation from %s "
+                "because currently on join-%d",
+                join_id, join_from, current_join_id);
         crm_update_peer_join(__FUNCTION__, peer, crm_join_nack);
         return;
     }
 
     crm_update_peer_join(__FUNCTION__, peer, crm_join_confirmed);
 
-    crm_info("join-%d: Updating node state to %s for %s",
-             join_id, CRMD_JOINSTATE_MEMBER, join_from);
-
-    /* update CIB with the current LRM status from the node
-     * We don't need to notify the TE of these updates, a transition will
-     *   be started in due time
+    /* Update CIB with node's current LRM state. A new transition will be
+     * triggered later, when the CIB notifies us of the change.
      */
     erase_status_tag(join_from, XML_CIB_TAG_LRM, cib_scope_local);
-
     if (safe_str_eq(join_from, fsa_our_uname)) {
         xmlNode *now_dc_lrmd_state = do_lrm_query(TRUE, fsa_our_uname);
 
         if (now_dc_lrmd_state != NULL) {
-            crm_debug("LRM state is updated from do_lrm_query.(%s)", join_from);
             fsa_cib_update(XML_CIB_TAG_STATUS, now_dc_lrmd_state,
                 cib_scope_local | cib_quorum_override | cib_can_create, call_id, NULL);
             free_xml(now_dc_lrmd_state);
+            crm_debug("Updating local node history for join-%d "
+                      "from query result (via CIB call %d)", join_id, call_id);
         } else {
-            crm_warn("Could not get our LRM state. LRM state is updated from join_ack->xml.(%s)", join_from);
             fsa_cib_update(XML_CIB_TAG_STATUS, join_ack->xml,
                 cib_scope_local | cib_quorum_override | cib_can_create, call_id, NULL);
+            crm_warn("Updating local node history from join-%d confirmation "
+                     "because query failed (via CIB call %d)", join_id, call_id);
         }
     } else {
-        crm_debug("LRM state is updated from join_ack->xml.(%s)", join_from);
         fsa_cib_update(XML_CIB_TAG_STATUS, join_ack->xml,
            cib_scope_local | cib_quorum_override | cib_can_create, call_id, NULL);
+        crm_debug("Updating node history for %s from join-%d confirmation "
+                  "(via CIB call %d)", join_from, join_id, call_id);
     }
-
     fsa_register_cib_callback(call_id, FALSE, NULL, join_update_complete_callback);
-    crm_debug("join-%d: Registered callback for LRM update %d", join_id, call_id);
 }
 
 void
@@ -579,17 +619,16 @@ finalize_join_for(gpointer key, gpointer value, gpointer user_data)
     const char *join_to = join_node->uname;
 
     if(join_node->join != crm_join_integrated) {
-        crm_trace("Skipping %s in state %d", join_to, join_node->join);
+        crm_trace("Not updating non-integrated node %s (%s) for join-%d",
+                  join_to, crm_join_phase_str(join_node->join),
+                  current_join_id);
         return;
     }
 
-    /* make sure a node entry exists for the new node */
-    crm_trace("Creating node entry for %s", join_to);
-
+    crm_trace("Updating node state for %s", join_to);
     tmp1 = create_xml_node(NULL, XML_CIB_TAG_NODE);
     set_uuid(tmp1, XML_ATTR_UUID, join_node);
     crm_xml_add(tmp1, XML_ATTR_UNAME, join_to);
-
     fsa_cib_anon_update(XML_CIB_TAG_NODES, tmp1);
     free_xml(tmp1);
 
@@ -608,11 +647,10 @@ finalize_join_for(gpointer key, gpointer value, gpointer user_data)
         return;
     }
 
-    /* send the ack/nack to the node */
-    acknak = create_dc_message(CRM_OP_JOIN_ACKNAK, join_to);
-
-    crm_debug("join-%d: ACK'ing join request from %s",
+    // Acknowledge node's join request
+    crm_debug("Acknowledging join-%d request from %s",
               current_join_id, join_to);
+    acknak = create_dc_message(CRM_OP_JOIN_ACKNAK, join_to);
     crm_xml_add(acknak, CRM_OP_JOIN_ACKNAK, XML_BOOLEAN_TRUE);
     crm_update_peer_join(__FUNCTION__, join_node, crm_join_finalized);
     crm_update_peer_expected(__FUNCTION__, join_node, CRMD_JOINSTATE_MEMBER);
@@ -629,11 +667,11 @@ check_join_state(enum crmd_fsa_state cur_state, const char *source)
 {
     static unsigned long long highest_seq = 0;
 
-    crm_debug("Invoked by %s in state: %s", source, fsa_state2string(cur_state));
-
     if (saved_ccm_membership_id != crm_peer_seq) {
-        crm_debug("%s: Membership changed since join started: %llu -> %llu (%llu)",
-                  source, saved_ccm_membership_id, crm_peer_seq, highest_seq);
+        crm_debug("join-%d: Membership changed from %llu to %llu "
+                  CRM_XS " highest=%llu state=%s for=%s",
+                  current_join_id, saved_ccm_membership_id, crm_peer_seq, highest_seq,
+                  fsa_state2string(cur_state), source);
         if(highest_seq < crm_peer_seq) {
             /* Don't spam the FSA with duplicates */
             highest_seq = crm_peer_seq;
@@ -642,34 +680,53 @@ check_join_state(enum crmd_fsa_state cur_state, const char *source)
 
     } else if (cur_state == S_INTEGRATION) {
         if (crmd_join_phase_count(crm_join_welcomed) == 0) {
-            crm_debug("join-%d: Integration of %d peers complete: %s",
-                      current_join_id, crmd_join_phase_count(crm_join_integrated), source);
+            int count = crmd_join_phase_count(crm_join_integrated);
+
+            crm_debug("join-%d: Integration of %d peer%s complete "
+                      CRM_XS " state=%s for=%s",
+                      current_join_id, count, pcmk__plural_s(count),
+                      fsa_state2string(cur_state), source);
             register_fsa_input_before(C_FSA_INTERNAL, I_INTEGRATED, NULL);
             return TRUE;
         }
 
     } else if (cur_state == S_FINALIZE_JOIN) {
         if (is_set(fsa_input_register, R_HAVE_CIB) == FALSE) {
-            crm_debug("join-%d: Delaying I_FINALIZED until we have the CIB", current_join_id);
+            crm_debug("join-%d: Delaying finalization until we have CIB "
+                      CRM_XS " state=%s for=%s",
+                      current_join_id, fsa_state2string(cur_state), source);
             return TRUE;
 
         } else if (crmd_join_phase_count(crm_join_welcomed) != 0) {
-            crm_debug("join-%d: Still waiting on %d welcomed nodes",
-                      current_join_id, crmd_join_phase_count(crm_join_welcomed));
+            int count = crmd_join_phase_count(crm_join_welcomed);
+
+            crm_debug("join-%d: Still waiting on %d welcomed node%s "
+                      CRM_XS " state=%s for=%s",
+                      current_join_id, count, pcmk__plural_s(count),
+                      fsa_state2string(cur_state), source);
             crmd_join_phase_log(LOG_DEBUG);
 
         } else if (crmd_join_phase_count(crm_join_integrated) != 0) {
-            crm_debug("join-%d: Still waiting on %d integrated nodes",
-                      current_join_id, crmd_join_phase_count(crm_join_integrated));
+            int count = crmd_join_phase_count(crm_join_integrated);
+
+            crm_debug("join-%d: Still waiting on %d integrated node%s "
+                      CRM_XS " state=%s for=%s",
+                      current_join_id, count, pcmk__plural_s(count),
+                      fsa_state2string(cur_state), source);
             crmd_join_phase_log(LOG_DEBUG);
 
         } else if (crmd_join_phase_count(crm_join_finalized) != 0) {
-            crm_debug("join-%d: Still waiting on %d finalized nodes",
-                      current_join_id, crmd_join_phase_count(crm_join_finalized));
+            int count = crmd_join_phase_count(crm_join_finalized);
+
+            crm_debug("join-%d: Still waiting on %d finalized node%s "
+                      CRM_XS " state=%s for=%s",
+                      current_join_id, count, pcmk__plural_s(count),
+                      fsa_state2string(cur_state), source);
             crmd_join_phase_log(LOG_DEBUG);
 
         } else {
-            crm_debug("join-%d complete: %s", current_join_id, source);
+            crm_debug("join-%d: Complete " CRM_XS " state=%s for=%s",
+                      current_join_id, fsa_state2string(cur_state), source);
             register_fsa_input_later(C_FSA_INTERNAL, I_FINALIZED, NULL);
             return TRUE;
         }
-- 
1.8.3.1


From 770cdaa5c477bfc75a6a38afc1d34f0d4f521ee1 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 13 Dec 2019 10:39:34 -0600
Subject: [PATCH 09/11] Log: controller: improve CIB status deletion messages

---
 crmd/utils.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/crmd/utils.c b/crmd/utils.c
index 761f5a7..47aa6f0 100644
--- a/crmd/utils.c
+++ b/crmd/utils.c
@@ -983,14 +983,18 @@ update_dc(xmlNode * msg)
     return TRUE;
 }
 
-#define STATUS_PATH_MAX 512
 static void
 erase_xpath_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
 {
     char *xpath = user_data;
 
-    do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE,
-                        "Deletion of \"%s\": %s (rc=%d)", xpath, pcmk_strerror(rc), rc);
+    if (rc == 0) {
+        crm_debug("Deletion of '%s' from CIB (via CIB call %d) succeeded",
+                  xpath, call_id);
+    } else {
+        crm_warn("Deletion of '%s' from CIB (via CIB call %d) failed: %s "
+                 CRM_XS " rc=%d", xpath, call_id, pcmk_strerror(rc), rc);
+    }
 }
 
 #define XPATH_STATUS_TAG "//node_state[@uname='%s']/%s"
@@ -998,14 +1002,19 @@ erase_xpath_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
 void
 erase_status_tag(const char *uname, const char *tag, int options)
 {
-    if (fsa_cib_conn && uname) {
+    CRM_CHECK(uname != NULL, return);
+
+    if (fsa_cib_conn == NULL) {
+        crm_warn("Unable to delete CIB '%s' section for node %s: "
+                 "no CIB connection", tag, uname);
+    } else {
         int call_id;
         char *xpath = crm_strdup_printf(XPATH_STATUS_TAG, uname, tag);
 
-        crm_info("Deleting %s status entries for %s " CRM_XS " xpath=%s",
-                 tag, uname, xpath);
-        call_id = fsa_cib_conn->cmds->delete(fsa_cib_conn, xpath, NULL,
-                                             cib_quorum_override | cib_xpath | options);
+        options |= cib_quorum_override|cib_xpath;
+        call_id = fsa_cib_conn->cmds->delete(fsa_cib_conn, xpath, NULL, options);
+        crm_info("Deleting CIB '%s' section for node %s (via CIB call %d) "
+                 CRM_XS " xpath=%s", tag, uname, call_id, xpath);
         fsa_register_cib_callback(call_id, FALSE, xpath, erase_xpath_callback);
         // CIB library handles freeing xpath
     }
-- 
1.8.3.1


From 19d74b26b9b30157499c2fc4bd2da55c408fc638 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 13 Dec 2019 10:36:56 -0600
Subject: [PATCH 10/11] Refactor: controller: move erase_status_tag() to
 controld_based.c

---
 crmd/cib.c   | 38 ++++++++++++++++++++++++++++++++++++++
 crmd/utils.c | 37 -------------------------------------
 2 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/crmd/cib.c b/crmd/cib.c
index a8a097a..ff489b4 100644
--- a/crmd/cib.c
+++ b/crmd/cib.c
@@ -224,3 +224,41 @@ controld_action_is_recordable(const char *action)
     }
     return TRUE;
 }
+
+static void
+erase_xpath_callback(xmlNode *msg, int call_id, int rc, xmlNode *output,
+                     void *user_data)
+{
+    char *xpath = user_data;
+
+    if (rc == 0) {
+        crm_debug("Deletion of '%s' from CIB (via CIB call %d) succeeded",
+                  xpath, call_id);
+    } else {
+        crm_warn("Deletion of '%s' from CIB (via CIB call %d) failed: %s "
+                 CRM_XS " rc=%d", xpath, call_id, pcmk_strerror(rc), rc);
+    }
+}
+
+#define XPATH_STATUS_TAG "//node_state[@uname='%s']/%s"
+
+void
+erase_status_tag(const char *uname, const char *tag, int options)
+{
+    CRM_CHECK(uname != NULL, return);
+
+    if (fsa_cib_conn == NULL) {
+        crm_warn("Unable to delete CIB '%s' section for node %s: "
+                 "no CIB connection", tag, uname);
+    } else {
+        int call_id;
+        char *xpath = crm_strdup_printf(XPATH_STATUS_TAG, uname, tag);
+
+        options |= cib_quorum_override|cib_xpath;
+        call_id = fsa_cib_conn->cmds->delete(fsa_cib_conn, xpath, NULL, options);
+        crm_info("Deleting CIB '%s' section for node %s (via CIB call %d) "
+                 CRM_XS " xpath=%s", tag, uname, call_id, xpath);
+        fsa_register_cib_callback(call_id, FALSE, xpath, erase_xpath_callback);
+        // CIB library handles freeing xpath
+    }
+}
diff --git a/crmd/utils.c b/crmd/utils.c
index 47aa6f0..b2a0b7d 100644
--- a/crmd/utils.c
+++ b/crmd/utils.c
@@ -983,43 +983,6 @@ update_dc(xmlNode * msg)
     return TRUE;
 }
 
-static void
-erase_xpath_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
-{
-    char *xpath = user_data;
-
-    if (rc == 0) {
-        crm_debug("Deletion of '%s' from CIB (via CIB call %d) succeeded",
-                  xpath, call_id);
-    } else {
-        crm_warn("Deletion of '%s' from CIB (via CIB call %d) failed: %s "
-                 CRM_XS " rc=%d", xpath, call_id, pcmk_strerror(rc), rc);
-    }
-}
-
-#define XPATH_STATUS_TAG "//node_state[@uname='%s']/%s"
-
-void
-erase_status_tag(const char *uname, const char *tag, int options)
-{
-    CRM_CHECK(uname != NULL, return);
-
-    if (fsa_cib_conn == NULL) {
-        crm_warn("Unable to delete CIB '%s' section for node %s: "
-                 "no CIB connection", tag, uname);
-    } else {
-        int call_id;
-        char *xpath = crm_strdup_printf(XPATH_STATUS_TAG, uname, tag);
-
-        options |= cib_quorum_override|cib_xpath;
-        call_id = fsa_cib_conn->cmds->delete(fsa_cib_conn, xpath, NULL, options);
-        crm_info("Deleting CIB '%s' section for node %s (via CIB call %d) "
-                 CRM_XS " xpath=%s", tag, uname, call_id, xpath);
-        fsa_register_cib_callback(call_id, FALSE, xpath, erase_xpath_callback);
-        // CIB library handles freeing xpath
-    }
-}
-
 void crmd_peer_down(crm_node_t *peer, bool full) 
 {
     if(full && peer->state == NULL) {
-- 
1.8.3.1


From 8ba2bfa5aca514dcd2ad6c8a4f88ffedd028d206 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 13 Dec 2019 11:16:25 -0600
Subject: [PATCH 11/11] Refactor: controller: improve efficiency when deleting
 node state

Rename erase_status_xpath() to controld_delete_node_state() to follow current
naming practice.

Instead of passing it a node_state subsection name, pass a new enum value
indicating what to erase (resource history, transient node attributes, or
both). This allows us to improve the log messages further, as well as improving
efficiency when both need to be cleared.
---
 crmd/callbacks.c      | 15 +++++------
 crmd/cib.c            | 69 +++++++++++++++++++++++++++++++++++++++------------
 crmd/crmd_utils.h     | 11 +++++++-
 crmd/join_client.c    |  3 ++-
 crmd/join_dc.c        |  3 ++-
 crmd/lrm.c            |  3 ++-
 crmd/remote_lrmd_ra.c | 24 +++++++++---------
 crmd/te_actions.c     |  5 ++--
 8 files changed, 91 insertions(+), 42 deletions(-)

diff --git a/crmd/callbacks.c b/crmd/callbacks.c
index 7560470..419b154 100644
--- a/crmd/callbacks.c
+++ b/crmd/callbacks.c
@@ -202,17 +202,18 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
                  * transient attributes intact until it rejoins.
                  */
                 if (compare_version(fsa_our_dc_version, "3.0.9") > 0) {
-                    erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
+                    controld_delete_node_state(node->uname,
+                                               controld_section_attrs,
+                                               cib_scope_local);
                 }
 
             } else if(AM_I_DC) {
-                if (appeared == FALSE) {
-                    crm_info("Peer %s left us", node->uname);
-                    erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
-                } else {
-                    crm_info("New peer %s we want to sync fence history with",
-                             node->uname);
+                if (appeared) {
                     te_trigger_stonith_history_sync(FALSE);
+                } else {
+                    controld_delete_node_state(node->uname,
+                                               controld_section_attrs,
+                                               cib_scope_local);
                 }
             }
             break;
diff --git a/crmd/cib.c b/crmd/cib.c
index ff489b4..c602130 100644
--- a/crmd/cib.c
+++ b/crmd/cib.c
@@ -226,39 +226,76 @@ controld_action_is_recordable(const char *action)
 }
 
 static void
-erase_xpath_callback(xmlNode *msg, int call_id, int rc, xmlNode *output,
-                     void *user_data)
+cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output,
+                    void *user_data)
 {
-    char *xpath = user_data;
+    char *desc = user_data;
 
     if (rc == 0) {
-        crm_debug("Deletion of '%s' from CIB (via CIB call %d) succeeded",
-                  xpath, call_id);
+        crm_debug("Deletion of %s (via CIB call %d) succeeded", desc, call_id);
     } else {
-        crm_warn("Deletion of '%s' from CIB (via CIB call %d) failed: %s "
-                 CRM_XS " rc=%d", xpath, call_id, pcmk_strerror(rc), rc);
+        crm_warn("Deletion of %s (via CIB call %d) failed: %s " CRM_XS " rc=%d",
+                 desc, call_id, pcmk_strerror(rc), rc);
     }
 }
 
-#define XPATH_STATUS_TAG "//node_state[@uname='%s']/%s"
+// Searches for various portions of node_state to delete
 
+// Match a particular node's node_state (takes node name 1x)
+#define XPATH_NODE_STATE        "//" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']"
+
+// Node's lrm section (name 1x)
+#define XPATH_NODE_LRM          XPATH_NODE_STATE "/" XML_CIB_TAG_LRM
+
+// Node's transient_attributes section (name 1x)
+#define XPATH_NODE_ATTRS        XPATH_NODE_STATE "/" XML_TAG_TRANSIENT_NODEATTRS
+
+// Everything under node_state (name 1x)
+#define XPATH_NODE_ALL          XPATH_NODE_STATE "/*"
+
+/*!
+ * \internal
+ * \brief Delete subsection of a node's CIB node_state
+ *
+ * \param[in] uname    Desired node
+ * \param[in] section  Subsection of node_state to delete
+ * \param[in] options  CIB call options to use
+ */
 void
-erase_status_tag(const char *uname, const char *tag, int options)
+controld_delete_node_state(const char *uname, enum controld_section_e section,
+                           int options)
 {
+    char *xpath = NULL;
+    char *desc = NULL;
+
     CRM_CHECK(uname != NULL, return);
+    switch (section) {
+        case controld_section_lrm:
+            xpath = crm_strdup_printf(XPATH_NODE_LRM, uname);
+            desc = crm_strdup_printf("resource history for node %s", uname);
+            break;
+        case controld_section_attrs:
+            xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname);
+            desc = crm_strdup_printf("transient attributes for node %s", uname);
+            break;
+        case controld_section_all:
+            xpath = crm_strdup_printf(XPATH_NODE_ALL, uname);
+            desc = crm_strdup_printf("all state for node %s", uname);
+            break;
+    }
 
     if (fsa_cib_conn == NULL) {
-        crm_warn("Unable to delete CIB '%s' section for node %s: "
-                 "no CIB connection", tag, uname);
+        crm_warn("Unable to delete %s: no CIB connection", desc);
+        free(desc);
     } else {
         int call_id;
-        char *xpath = crm_strdup_printf(XPATH_STATUS_TAG, uname, tag);
 
         options |= cib_quorum_override|cib_xpath;
         call_id = fsa_cib_conn->cmds->delete(fsa_cib_conn, xpath, NULL, options);
-        crm_info("Deleting CIB '%s' section for node %s (via CIB call %d) "
-                 CRM_XS " xpath=%s", tag, uname, call_id, xpath);
-        fsa_register_cib_callback(call_id, FALSE, xpath, erase_xpath_callback);
-        // CIB library handles freeing xpath
+        crm_info("Deleting %s (via CIB call %d) " CRM_XS " xpath=%s",
+                 desc, call_id, xpath);
+        fsa_register_cib_callback(call_id, FALSE, desc, cib_delete_callback);
+        // CIB library handles freeing desc
     }
+    free(xpath);
 }
diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h
index 955d859..9afa2ca 100644
--- a/crmd/crmd_utils.h
+++ b/crmd/crmd_utils.h
@@ -96,7 +96,6 @@ xmlNode *create_node_state_update(crm_node_t *node, int flags,
                                   xmlNode *parent, const char *source);
 void populate_cib_nodes(enum node_update_flags flags, const char *source);
 void crm_update_quorum(gboolean quorum, gboolean force_update);
-void erase_status_tag(const char *uname, const char *tag, int options);
 void update_attrd(const char *host, const char *name, const char *value, const char *user_name, gboolean is_remote_node);
 void update_attrd_remote_node_removed(const char *host, const char *user_name);
 void update_attrd_clear_failures(const char *host, const char *rsc,
@@ -115,6 +114,16 @@ void crmd_peer_down(crm_node_t *peer, bool full);
 unsigned int cib_op_timeout(void);
 bool controld_action_is_recordable(const char *action);
 
+// Subsections of node_state
+enum controld_section_e {
+    controld_section_lrm,
+    controld_section_attrs,
+    controld_section_all,
+};
+
+void controld_delete_node_state(const char *uname,
+                                enum controld_section_e section, int options);
+
 const char *get_node_id(xmlNode *lrm_rsc_op);
 
 /* Convenience macro for registering a CIB callback
diff --git a/crmd/join_client.c b/crmd/join_client.c
index 2142d21..9f572ad 100644
--- a/crmd/join_client.c
+++ b/crmd/join_client.c
@@ -298,7 +298,8 @@ do_cl_join_finalize_respond(long long action,
              * present for legacy attrd, but given legacy attrd's imminent
              * demise, this is preferable to making intrusive changes to it.
              */
-            erase_status_tag(fsa_our_uname, XML_TAG_TRANSIENT_NODEATTRS, 0);
+            controld_delete_node_state(fsa_our_uname, controld_section_attrs,
+                                       cib_scope_local);
             update_attrd(fsa_our_uname, "terminate", NULL, NULL, FALSE);
             update_attrd(fsa_our_uname, XML_CIB_ATTR_SHUTDOWN, "0", NULL, FALSE);
 #endif
diff --git a/crmd/join_dc.c b/crmd/join_dc.c
index cdb3f77..6705022 100644
--- a/crmd/join_dc.c
+++ b/crmd/join_dc.c
@@ -585,7 +585,8 @@ do_dc_join_ack(long long action,
     /* Update CIB with node's current LRM state. A new transition will be
      * triggered later, when the CIB notifies us of the change.
      */
-    erase_status_tag(join_from, XML_CIB_TAG_LRM, cib_scope_local);
+    controld_delete_node_state(join_from, controld_section_lrm,
+                               cib_scope_local);
     if (safe_str_eq(join_from, fsa_our_uname)) {
         xmlNode *now_dc_lrmd_state = do_lrm_query(TRUE, fsa_our_uname);
 
diff --git a/crmd/lrm.c b/crmd/lrm.c
index 80fcd69..2c9e475 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -1421,7 +1421,8 @@ force_reprobe(lrm_state_t *lrm_state, const char *from_sys,
     }
 
     /* Now delete the copy in the CIB */
-    erase_status_tag(lrm_state->node_name, XML_CIB_TAG_LRM, cib_scope_local);
+    controld_delete_node_state(lrm_state->node_name, controld_section_lrm,
+                               cib_scope_local);
 
     /* And finally, _delete_ the value in attrd
      * Setting it to FALSE results in the PE sending us back here again
diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c
index 1214814..c4f58d6 100644
--- a/crmd/remote_lrmd_ra.c
+++ b/crmd/remote_lrmd_ra.c
@@ -195,13 +195,13 @@ remote_node_up(const char *node_name)
     CRM_CHECK(node_name != NULL, return);
     crm_info("Announcing pacemaker_remote node %s", node_name);
 
-    /* Clear node's operation history. The node's transient attributes should
-     * and normally will be cleared when the node leaves, but since remote node
-     * state has a number of corner cases, clear them here as well, to be sure.
+    /* Clear node's entire state (resource history and transient attributes).
+     * The transient attributes should and normally will be cleared when the
+     * node leaves, but since remote node state has a number of corner cases,
+     * clear them here as well, to be sure.
      */
     call_opt = crmd_cib_smart_opt();
-    erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
-    erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);
+    controld_delete_node_state(node_name, controld_section_all, call_opt);
 
     /* Clear node's probed attribute */
     update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);
@@ -266,15 +266,15 @@ remote_node_down(const char *node_name, const enum down_opts opts)
     /* Purge node from attrd's memory */
     update_attrd_remote_node_removed(node_name, NULL);
 
-    /* Purge node's transient attributes */
-    erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);
-
-    /* Normally, the LRM operation history should be kept until the node comes
-     * back up. However, after a successful fence, we want to clear it, so we
-     * don't think resources are still running on the node.
+    /* Normally, only node attributes should be erased, and the resource history
+     * should be kept until the node comes back up. However, after a successful
+     * fence, we want to clear the history as well, so we don't think resources
+     * are still running on the node.
      */
     if (opts == DOWN_ERASE_LRM) {
-        erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
+        controld_delete_node_state(node_name, controld_section_all, call_opt);
+    } else {
+        controld_delete_node_state(node_name, controld_section_attrs, call_opt);
     }
 
     /* Ensure node is in the remote peer cache with lost state */
diff --git a/crmd/te_actions.c b/crmd/te_actions.c
index 14097ab..19bb199 100644
--- a/crmd/te_actions.c
+++ b/crmd/te_actions.c
@@ -150,9 +150,8 @@ send_stonith_update(crm_action_t * action, const char *target, const char *uuid)
     /* Make sure it sticks */
     /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */
 
-    erase_status_tag(peer->uname, XML_CIB_TAG_LRM, cib_scope_local);
-    erase_status_tag(peer->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
-
+    controld_delete_node_state(peer->uname, controld_section_all,
+                               cib_scope_local);
     free_xml(node_state);
     return;
 }
-- 
1.8.3.1