Blob Blame History Raw
From 83811e2115f5516a7faec2e653b1be3d58b35a79 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 12 Apr 2019 09:46:51 -0500
Subject: [PATCH 1/2] Log: libcrmcluster: improve CPG membership messages

Show CPG event reason when provided by corosync, make messages more readable,
upgrade duplicate pid messages to warnings (and log only one message in those
cases).

This also fixes a typo in 4d6f6e01 that led to using an index with the wrong
array, potentially leading to use of an uninitialized value or invalid memory
access.
---
 lib/cluster/cpg.c | 95 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 58 insertions(+), 37 deletions(-)

diff --git a/lib/cluster/cpg.c b/lib/cluster/cpg.c
index c5ecc67..85476be 100644
--- a/lib/cluster/cpg.c
+++ b/lib/cluster/cpg.c
@@ -399,6 +399,32 @@ static int cmp_member_list_nodeid(const void *first,
     return 0;
 }
 
+static const char *
+cpgreason2str(cpg_reason_t reason)
+{
+    switch (reason) {
+        case CPG_REASON_JOIN:       return " via cpg_join";
+        case CPG_REASON_LEAVE:      return " via cpg_leave";
+        case CPG_REASON_NODEDOWN:   return " via cluster exit";
+        case CPG_REASON_NODEUP:     return " via cluster join";
+        case CPG_REASON_PROCDOWN:   return " for unknown reason";
+        default:                    break;
+    }
+    return "";
+}
+
+static inline const char *
+peer_name(crm_node_t *peer)
+{
+    if (peer == NULL) {
+        return "unknown node";
+    } else if (peer->uname == NULL) {
+        return "peer node";
+    } else {
+        return peer->uname;
+    }
+}
+
 void
 pcmk_cpg_membership(cpg_handle_t handle,
                     const struct cpg_name *groupName,
@@ -410,7 +436,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
     gboolean found = FALSE;
     static int counter = 0;
     uint32_t local_nodeid = get_local_nodeid(handle);
-    const struct cpg_address *key, **rival, **sorted;
+    const struct cpg_address *key, **sorted;
 
     sorted = malloc(member_list_entries * sizeof(const struct cpg_address *));
     CRM_ASSERT(sorted != NULL);
@@ -424,11 +450,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
 
     for (i = 0; i < left_list_entries; i++) {
         crm_node_t *peer = crm_find_peer(left_list[i].nodeid, NULL);
-
-        crm_info("Node %u left group %s (peer=%s:%llu, counter=%d.%d)",
-                 left_list[i].nodeid, groupName->value,
-                 (peer? peer->uname : "<none>"),
-                 (unsigned long long) left_list[i].pid, counter, i);
+        const struct cpg_address **rival = NULL;
 
         /* in CPG world, NODE:PROCESS-IN-MEMBERSHIP-OF-G is an 1:N relation
            and not playing by this rule may go wild in case of multiple
@@ -442,7 +464,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
            also API end-point carriers, and that's what matters locally
            (who's the winner);
            remotely, we will just compare leave_list and member_list and if
-           the left process has it's node retained in member_list (under some
+           the left process has its node retained in member_list (under some
            other PID, anyway) we will just ignore it as well
            XXX: long-term fix is to establish in-out PID-aware tracking? */
         if (peer) {
@@ -450,51 +472,51 @@ pcmk_cpg_membership(cpg_handle_t handle,
             rival = bsearch(&key, sorted, member_list_entries,
                             sizeof(const struct cpg_address *),
                             cmp_member_list_nodeid);
-            if (rival == NULL) {
+        }
+
+        if (rival == NULL) {
+            crm_info("Group %s event %d: %s (node %u pid %u) left%s",
+                     groupName->value, counter, peer_name(peer),
+                     left_list[i].nodeid, left_list[i].pid,
+                     cpgreason2str(left_list[i].reason));
+            if (peer) {
                 crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg,
                                      OFFLINESTATUS);
-            } else if (left_list[i].nodeid == local_nodeid) {
-                crm_info("Ignoring the above event %s.%d, comes from a local"
-                         " rival process (presumably not us): %llu",
-                         groupName->value, counter,
-                         (unsigned long long) left_list[i].pid);
-            } else {
-                crm_info("Ignoring the above event %s.%d, comes from"
-                         " a rival-rich node: %llu (e.g. %llu process"
-                         " carries on)",
-                         groupName->value, counter,
-                         (unsigned long long) left_list[i].pid,
-                         (unsigned long long) (*rival)->pid);
             }
+        } else if (left_list[i].nodeid == local_nodeid) {
+            crm_warn("Group %s event %d: duplicate local pid %u left%s",
+                     groupName->value, counter,
+                     left_list[i].pid, cpgreason2str(left_list[i].reason));
+        } else {
+            crm_warn("Group %s event %d: "
+                     "%s (node %u) duplicate pid %u left%s (%u remains)",
+                     groupName->value, counter, peer_name(peer),
+                     left_list[i].nodeid, left_list[i].pid,
+                     cpgreason2str(left_list[i].reason), (*rival)->pid);
         }
     }
     free(sorted);
     sorted = NULL;
 
     for (i = 0; i < joined_list_entries; i++) {
-        crm_info("Node %u joined group %s (counter=%d.%d, pid=%llu,"
-                 " unchecked for rivals)",
-                 joined_list[i].nodeid, groupName->value, counter, i,
-                 (unsigned long long) left_list[i].pid);
+        crm_info("Group %s event %d: node %u pid %u joined%s",
+                 groupName->value, counter, joined_list[i].nodeid,
+                 joined_list[i].pid, cpgreason2str(joined_list[i].reason));
     }
 
     for (i = 0; i < member_list_entries; i++) {
         crm_node_t *peer = crm_get_peer(member_list[i].nodeid, NULL);
 
-        crm_info("Node %u still member of group %s (peer=%s:%llu,"
-                 " counter=%d.%d, at least once)",
-                 member_list[i].nodeid, groupName->value,
-                 (peer? peer->uname : "<none>"), member_list[i].pid,
-                 counter, i);
-
         if (member_list[i].nodeid == local_nodeid
                 && member_list[i].pid != getpid()) {
             /* see the note above */
-            crm_info("Ignoring the above event %s.%d, comes from a local rival"
-                     " process: %llu", groupName->value, counter,
-                     (unsigned long long) member_list[i].pid);
+            crm_warn("Group %s event %d: detected duplicate local pid %u",
+                     groupName->value, counter, member_list[i].pid);
             continue;
         }
+        crm_info("Group %s event %d: %s (node %u pid %u) is member",
+                 groupName->value, counter, peer_name(peer),
+                 member_list[i].nodeid, member_list[i].pid);
 
         /* Anyone that is sending us CPG messages must also be a _CPG_ member.
          * But it's _not_ safe to assume it's in the quorum membership.
@@ -514,9 +536,8 @@ pcmk_cpg_membership(cpg_handle_t handle,
                  *
                  * Set the threshold to 1 minute
                  */
-                crm_err("Node %s[%u] appears to be online even though we think"
-                        " it is dead (unchecked for rivals)",
-                        peer->uname, peer->id);
+                crm_warn("Node %u is member of group %s but was believed offline",
+                         member_list[i].nodeid, groupName->value);
                 if (crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_MEMBER, 0)) {
                     peer->votes = 0;
                 }
@@ -529,7 +550,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
     }
 
     if (!found) {
-        crm_err("We're not part of CPG group '%s' anymore!", groupName->value);
+        crm_err("Local node was evicted from group %s", groupName->value);
         cpg_evicted = TRUE;
     }
 
-- 
1.8.3.1


From 87769895ebccc1033a876ef98a21577d6f4d1c0e Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 18 Apr 2019 22:18:27 -0500
Subject: [PATCH 2/2] Fix: libcrmcluster,pacemakerd: restore compatibility with
 corosync 1

Pacemaker 1.1 supports older versions of corosync that don't supply
cs_strerror() or CMAP. This simply drops usage cs_strerror() (in favor of just
the raw error code, as before 07a82c5c) and properly conditionalizes CMAP
usage.
---
 lib/cluster/cpg.c | 12 ++++--------
 mcp/corosync.c    | 13 +++++++------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/lib/cluster/cpg.c b/lib/cluster/cpg.c
index 85476be..e4783e5 100644
--- a/lib/cluster/cpg.c
+++ b/lib/cluster/cpg.c
@@ -91,15 +91,13 @@ uint32_t get_local_nodeid(cpg_handle_t handle)
         crm_trace("Creating connection");
         cs_repeat(retries, 5, rc = cpg_initialize(&local_handle, &cb));
         if (rc != CS_OK) {
-            crm_err("Could not connect to the CPG API: %s (%d)",
-                    cs_strerror(rc), rc);
+            crm_err("Could not connect to the CPG API (rc=%d)", rc);
             return 0;
         }
 
         rc = cpg_fd_get(local_handle, &fd);
         if (rc != CS_OK) {
-            crm_err("Could not obtain the CPG API connection: %s (%d)",
-                    cs_strerror(rc), rc);
+            crm_err("Could not obtain the CPG API connection (rc=%d)", rc);
             goto bail;
         }
 
@@ -594,15 +592,13 @@ cluster_connect_cpg(crm_cluster_t *cluster)
 
     cs_repeat(retries, 30, rc = cpg_initialize(&handle, &cpg_callbacks));
     if (rc != CS_OK) {
-        crm_err("Could not connect to the CPG API: %s (%d)",
-                cs_strerror(rc), rc);
+        crm_err("Could not connect to the CPG API (rc=%d)", rc);
         goto bail;
     }
 
     rc = cpg_fd_get(handle, &fd);
     if (rc != CS_OK) {
-        crm_err("Could not obtain the CPG API connection: %s (%d)",
-                cs_strerror(rc), rc);
+        crm_err("Could not obtain the CPG API connection (rc=%d)", rc);
         goto bail;
     }
 
diff --git a/mcp/corosync.c b/mcp/corosync.c
index 407a63f..40be727 100644
--- a/mcp/corosync.c
+++ b/mcp/corosync.c
@@ -118,13 +118,13 @@ cluster_connect_cfg(uint32_t * nodeid)
     cs_repeat(retries, 30, rc = corosync_cfg_initialize(&cfg_handle, &cfg_callbacks));
 
     if (rc != CS_OK) {
-        crm_err("corosync cfg init: %s (%d)", cs_strerror(rc), rc);
+        crm_err("corosync cfg init error %d", rc);
         return FALSE;
     }
 
     rc = corosync_cfg_fd_get(cfg_handle, &fd);
     if (rc != CS_OK) {
-        crm_err("corosync cfg fd_get: %s (%d)", cs_strerror(rc), rc);
+        crm_err("corosync cfg fd_get error %d", rc);
         goto bail;
     }
 
@@ -314,8 +314,8 @@ mcp_read_config(void)
         rc = cmap_initialize(&local_handle);
         if (rc != CS_OK) {
             retries++;
-            printf("cmap connection setup failed: %s.  Retrying in %ds\n", cs_strerror(rc), retries);
-            crm_info("cmap connection setup failed: %s.  Retrying in %ds", cs_strerror(rc), retries);
+            printf("cmap connection setup failed: error %d.  Retrying in %ds\n", rc, retries);
+            crm_info("cmap connection setup failed: error %d.  Retrying in %ds", rc, retries);
             sleep(retries);
 
         } else {
@@ -331,10 +331,10 @@ mcp_read_config(void)
         return FALSE;
     }
 
+#if HAVE_CMAP
     rc = cmap_fd_get(local_handle, &fd);
     if (rc != CS_OK) {
-        crm_err("Could not obtain the CMAP API connection: %s (%d)",
-                cs_strerror(rc), rc);
+        crm_err("Could not obtain the CMAP API connection: error %d", rc);
         cmap_finalize(local_handle);
         return FALSE;
     }
@@ -354,6 +354,7 @@ mcp_read_config(void)
         cmap_finalize(local_handle);
         return FALSE;
     }
+#endif
 
     stack = get_cluster_type();
     crm_info("Reading configure for stack: %s", name_for_cluster_type(stack));
-- 
1.8.3.1