From bccf845261c6e69fc4e6bdb8cf4e630a4a4ec7a8 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 12 Apr 2019 09:46:51 -0500
Subject: [PATCH] Log: libcrmcluster: improve CPG membership messages
Show CPG event reason when provided by corosync, make messages more readable,
upgrade duplicate pid messages to warnings (and log only one message in those
cases).
---
lib/cluster/cpg.c | 91 ++++++++++++++++++++++++++++++++++---------------------
1 file changed, 56 insertions(+), 35 deletions(-)
diff --git a/lib/cluster/cpg.c b/lib/cluster/cpg.c
index 2898c51..ef6fa36 100644
--- a/lib/cluster/cpg.c
+++ b/lib/cluster/cpg.c
@@ -360,8 +360,6 @@ pcmk_message_common_cs(cpg_handle_t handle, uint32_t nodeid, uint32_t pid, void
return NULL;
}
-#define PEER_NAME(peer) ((peer)? ((peer)->uname? (peer)->uname : "<unknown>") : "<none>")
-
static int cmp_member_list_nodeid(const void *first,
const void *second)
{
@@ -376,6 +374,32 @@ static int cmp_member_list_nodeid(const void *first,
return 0;
}
+static const char *
+cpgreason2str(cpg_reason_t reason)
+{
+ switch (reason) {
+ case CPG_REASON_JOIN: return " via cpg_join";
+ case CPG_REASON_LEAVE: return " via cpg_leave";
+ case CPG_REASON_NODEDOWN: return " via cluster exit";
+ case CPG_REASON_NODEUP: return " via cluster join";
+ case CPG_REASON_PROCDOWN: return " for unknown reason";
+ default: break;
+ }
+ return "";
+}
+
+static inline const char *
+peer_name(crm_node_t *peer)
+{
+ if (peer == NULL) {
+ return "unknown node";
+ } else if (peer->uname == NULL) {
+ return "peer node";
+ } else {
+ return peer->uname;
+ }
+}
+
void
pcmk_cpg_membership(cpg_handle_t handle,
const struct cpg_name *groupName,
@@ -387,7 +411,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
gboolean found = FALSE;
static int counter = 0;
uint32_t local_nodeid = get_local_nodeid(handle);
- const struct cpg_address *key, **rival, **sorted;
+ const struct cpg_address *key, **sorted;
sorted = malloc(member_list_entries * sizeof(const struct cpg_address *));
CRM_ASSERT(sorted != NULL);
@@ -401,10 +425,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
for (i = 0; i < left_list_entries; i++) {
crm_node_t *peer = crm_find_peer(left_list[i].nodeid, NULL);
-
- crm_info("Group event %s.%d: node %u (%s) left: %llu",
- groupName->value, counter, left_list[i].nodeid,
- PEER_NAME(peer), (unsigned long long) left_list[i].pid);
+ const struct cpg_address **rival = NULL;
/* in CPG world, NODE:PROCESS-IN-MEMBERSHIP-OF-G is an 1:N relation
and not playing by this rule may go wild in case of multiple
@@ -418,7 +439,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
also API end-point carriers, and that's what matters locally
(who's the winner);
remotely, we will just compare leave_list and member_list and if
- the left process has it's node retained in member_list (under some
+ the left process has its node retained in member_list (under some
other PID, anyway) we will just ignore it as well
XXX: long-term fix is to establish in-out PID-aware tracking? */
if (peer) {
@@ -426,50 +447,51 @@ pcmk_cpg_membership(cpg_handle_t handle,
rival = bsearch(&key, sorted, member_list_entries,
sizeof(const struct cpg_address *),
cmp_member_list_nodeid);
- if (rival == NULL) {
+ }
+
+ if (rival == NULL) {
+ crm_info("Group %s event %d: %s (node %u pid %u) left%s",
+ groupName->value, counter, peer_name(peer),
+ left_list[i].nodeid, left_list[i].pid,
+ cpgreason2str(left_list[i].reason));
+ if (peer) {
crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg,
OFFLINESTATUS);
- } else if (left_list[i].nodeid == local_nodeid) {
- crm_info("Ignoring the above event %s.%d, comes from a local"
- " rival process (presumably not us): %llu",
- groupName->value, counter,
- (unsigned long long) left_list[i].pid);
- } else {
- crm_info("Ignoring the above event %s.%d, comes from"
- " a rival-rich node: %llu (e.g. %llu process"
- " carries on)",
- groupName->value, counter,
- (unsigned long long) left_list[i].pid,
- (unsigned long long) (*rival)->pid);
}
+ } else if (left_list[i].nodeid == local_nodeid) {
+ crm_warn("Group %s event %d: duplicate local pid %u left%s",
+ groupName->value, counter,
+ left_list[i].pid, cpgreason2str(left_list[i].reason));
+ } else {
+ crm_warn("Group %s event %d: "
+ "%s (node %u) duplicate pid %u left%s (%u remains)",
+ groupName->value, counter, peer_name(peer),
+ left_list[i].nodeid, left_list[i].pid,
+ cpgreason2str(left_list[i].reason), (*rival)->pid);
}
}
free(sorted);
sorted = NULL;
for (i = 0; i < joined_list_entries; i++) {
- crm_info("Group event %s.%d: node %u joined: %llu"
- " (unchecked for rivals)",
+ crm_info("Group %s event %d: node %u pid %u joined%s",
groupName->value, counter, joined_list[i].nodeid,
- (unsigned long long) joined_list[i].pid);
+ joined_list[i].pid, cpgreason2str(joined_list[i].reason));
}
for (i = 0; i < member_list_entries; i++) {
crm_node_t *peer = crm_get_peer(member_list[i].nodeid, NULL);
- crm_info("Group event %s.%d: node %u (%s) is member: %llu"
- " (at least once)",
- groupName->value, counter, member_list[i].nodeid,
- PEER_NAME(peer), member_list[i].pid);
-
if (member_list[i].nodeid == local_nodeid
&& member_list[i].pid != getpid()) {
/* see the note above */
- crm_info("Ignoring the above event %s.%d, comes from a local rival"
- " process: %llu", groupName->value, counter,
- (unsigned long long) member_list[i].pid);
+ crm_warn("Group %s event %d: detected duplicate local pid %u",
+ groupName->value, counter, member_list[i].pid);
continue;
}
+ crm_info("Group %s event %d: %s (node %u pid %u) is member",
+ groupName->value, counter, peer_name(peer),
+ member_list[i].nodeid, member_list[i].pid);
/* If the caller left auto-reaping enabled, this will also update the
* state to member.
@@ -492,8 +514,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
} else if (now > (peer->when_lost + 60)) {
// If it persists for more than a minute, update the state
- crm_warn("Node %u member of group %s but believed offline"
- " (unchecked for rivals)",
+ crm_warn("Node %u is member of group %s but was believed offline",
member_list[i].nodeid, groupName->value);
crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_MEMBER, 0);
}
@@ -505,7 +526,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
}
if (!found) {
- crm_err("We're not part of CPG group '%s' anymore!", groupName->value);
+ crm_err("Local node was evicted from group %s", groupName->value);
cpg_evicted = TRUE;
}
--
1.8.3.1