|
|
a125f5 |
From bccf845261c6e69fc4e6bdb8cf4e630a4a4ec7a8 Mon Sep 17 00:00:00 2001
|
|
|
a125f5 |
From: Ken Gaillot <kgaillot@redhat.com>
|
|
|
a125f5 |
Date: Fri, 12 Apr 2019 09:46:51 -0500
|
|
|
a125f5 |
Subject: [PATCH] Log: libcrmcluster: improve CPG membership messages
|
|
|
a125f5 |
|
|
|
a125f5 |
Show CPG event reason when provided by corosync, make messages more readable,
|
|
|
a125f5 |
upgrade duplicate pid messages to warnings (and log only one message in those
|
|
|
a125f5 |
cases).
|
|
|
a125f5 |
---
|
|
|
a125f5 |
lib/cluster/cpg.c | 91 ++++++++++++++++++++++++++++++++++---------------------
|
|
|
a125f5 |
1 file changed, 56 insertions(+), 35 deletions(-)
|
|
|
a125f5 |
|
|
|
a125f5 |
diff --git a/lib/cluster/cpg.c b/lib/cluster/cpg.c
|
|
|
a125f5 |
index 2898c51..ef6fa36 100644
|
|
|
a125f5 |
--- a/lib/cluster/cpg.c
|
|
|
a125f5 |
+++ b/lib/cluster/cpg.c
|
|
|
a125f5 |
@@ -360,8 +360,6 @@ pcmk_message_common_cs(cpg_handle_t handle, uint32_t nodeid, uint32_t pid, void
|
|
|
a125f5 |
return NULL;
|
|
|
a125f5 |
}
|
|
|
a125f5 |
|
|
|
a125f5 |
-#define PEER_NAME(peer) ((peer)? ((peer)->uname? (peer)->uname : "<unknown>") : "<none>")
|
|
|
a125f5 |
-
|
|
|
a125f5 |
static int cmp_member_list_nodeid(const void *first,
|
|
|
a125f5 |
const void *second)
|
|
|
a125f5 |
{
|
|
|
a125f5 |
@@ -376,6 +374,32 @@ static int cmp_member_list_nodeid(const void *first,
|
|
|
a125f5 |
return 0;
|
|
|
a125f5 |
}
|
|
|
a125f5 |
|
|
|
a125f5 |
+static const char *
|
|
|
a125f5 |
+cpgreason2str(cpg_reason_t reason)
|
|
|
a125f5 |
+{
|
|
|
a125f5 |
+ switch (reason) {
|
|
|
a125f5 |
+ case CPG_REASON_JOIN: return " via cpg_join";
|
|
|
a125f5 |
+ case CPG_REASON_LEAVE: return " via cpg_leave";
|
|
|
a125f5 |
+ case CPG_REASON_NODEDOWN: return " via cluster exit";
|
|
|
a125f5 |
+ case CPG_REASON_NODEUP: return " via cluster join";
|
|
|
a125f5 |
+ case CPG_REASON_PROCDOWN: return " for unknown reason";
|
|
|
a125f5 |
+ default: break;
|
|
|
a125f5 |
+ }
|
|
|
a125f5 |
+ return "";
|
|
|
a125f5 |
+}
|
|
|
a125f5 |
+
|
|
|
a125f5 |
+static inline const char *
|
|
|
a125f5 |
+peer_name(crm_node_t *peer)
|
|
|
a125f5 |
+{
|
|
|
a125f5 |
+ if (peer == NULL) {
|
|
|
a125f5 |
+ return "unknown node";
|
|
|
a125f5 |
+ } else if (peer->uname == NULL) {
|
|
|
a125f5 |
+ return "peer node";
|
|
|
a125f5 |
+ } else {
|
|
|
a125f5 |
+ return peer->uname;
|
|
|
a125f5 |
+ }
|
|
|
a125f5 |
+}
|
|
|
a125f5 |
+
|
|
|
a125f5 |
void
|
|
|
a125f5 |
pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
a125f5 |
const struct cpg_name *groupName,
|
|
|
a125f5 |
@@ -387,7 +411,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
a125f5 |
gboolean found = FALSE;
|
|
|
a125f5 |
static int counter = 0;
|
|
|
a125f5 |
uint32_t local_nodeid = get_local_nodeid(handle);
|
|
|
a125f5 |
- const struct cpg_address *key, **rival, **sorted;
|
|
|
a125f5 |
+ const struct cpg_address *key, **sorted;
|
|
|
a125f5 |
|
|
|
a125f5 |
sorted = malloc(member_list_entries * sizeof(const struct cpg_address *));
|
|
|
a125f5 |
CRM_ASSERT(sorted != NULL);
|
|
|
a125f5 |
@@ -401,10 +425,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
a125f5 |
|
|
|
a125f5 |
for (i = 0; i < left_list_entries; i++) {
|
|
|
a125f5 |
crm_node_t *peer = crm_find_peer(left_list[i].nodeid, NULL);
|
|
|
a125f5 |
-
|
|
|
a125f5 |
- crm_info("Group event %s.%d: node %u (%s) left: %llu",
|
|
|
a125f5 |
- groupName->value, counter, left_list[i].nodeid,
|
|
|
a125f5 |
- PEER_NAME(peer), (unsigned long long) left_list[i].pid);
|
|
|
a125f5 |
+ const struct cpg_address **rival = NULL;
|
|
|
a125f5 |
|
|
|
a125f5 |
/* in CPG world, NODE:PROCESS-IN-MEMBERSHIP-OF-G is an 1:N relation
|
|
|
a125f5 |
and not playing by this rule may go wild in case of multiple
|
|
|
a125f5 |
@@ -418,7 +439,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
a125f5 |
also API end-point carriers, and that's what matters locally
|
|
|
a125f5 |
(who's the winner);
|
|
|
a125f5 |
remotely, we will just compare leave_list and member_list and if
|
|
|
a125f5 |
- the left process has it's node retained in member_list (under some
|
|
|
a125f5 |
+ the left process has its node retained in member_list (under some
|
|
|
a125f5 |
other PID, anyway) we will just ignore it as well
|
|
|
a125f5 |
XXX: long-term fix is to establish in-out PID-aware tracking? */
|
|
|
a125f5 |
if (peer) {
|
|
|
a125f5 |
@@ -426,50 +447,51 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
a125f5 |
rival = bsearch(&key, sorted, member_list_entries,
|
|
|
a125f5 |
sizeof(const struct cpg_address *),
|
|
|
a125f5 |
cmp_member_list_nodeid);
|
|
|
a125f5 |
- if (rival == NULL) {
|
|
|
a125f5 |
+ }
|
|
|
a125f5 |
+
|
|
|
a125f5 |
+ if (rival == NULL) {
|
|
|
a125f5 |
+ crm_info("Group %s event %d: %s (node %u pid %u) left%s",
|
|
|
a125f5 |
+ groupName->value, counter, peer_name(peer),
|
|
|
a125f5 |
+ left_list[i].nodeid, left_list[i].pid,
|
|
|
a125f5 |
+ cpgreason2str(left_list[i].reason));
|
|
|
a125f5 |
+ if (peer) {
|
|
|
a125f5 |
crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg,
|
|
|
a125f5 |
OFFLINESTATUS);
|
|
|
a125f5 |
- } else if (left_list[i].nodeid == local_nodeid) {
|
|
|
a125f5 |
- crm_info("Ignoring the above event %s.%d, comes from a local"
|
|
|
a125f5 |
- " rival process (presumably not us): %llu",
|
|
|
a125f5 |
- groupName->value, counter,
|
|
|
a125f5 |
- (unsigned long long) left_list[i].pid);
|
|
|
a125f5 |
- } else {
|
|
|
a125f5 |
- crm_info("Ignoring the above event %s.%d, comes from"
|
|
|
a125f5 |
- " a rival-rich node: %llu (e.g. %llu process"
|
|
|
a125f5 |
- " carries on)",
|
|
|
a125f5 |
- groupName->value, counter,
|
|
|
a125f5 |
- (unsigned long long) left_list[i].pid,
|
|
|
a125f5 |
- (unsigned long long) (*rival)->pid);
|
|
|
a125f5 |
}
|
|
|
a125f5 |
+ } else if (left_list[i].nodeid == local_nodeid) {
|
|
|
a125f5 |
+ crm_warn("Group %s event %d: duplicate local pid %u left%s",
|
|
|
a125f5 |
+ groupName->value, counter,
|
|
|
a125f5 |
+ left_list[i].pid, cpgreason2str(left_list[i].reason));
|
|
|
a125f5 |
+ } else {
|
|
|
a125f5 |
+ crm_warn("Group %s event %d: "
|
|
|
a125f5 |
+ "%s (node %u) duplicate pid %u left%s (%u remains)",
|
|
|
a125f5 |
+ groupName->value, counter, peer_name(peer),
|
|
|
a125f5 |
+ left_list[i].nodeid, left_list[i].pid,
|
|
|
a125f5 |
+ cpgreason2str(left_list[i].reason), (*rival)->pid);
|
|
|
a125f5 |
}
|
|
|
a125f5 |
}
|
|
|
a125f5 |
free(sorted);
|
|
|
a125f5 |
sorted = NULL;
|
|
|
a125f5 |
|
|
|
a125f5 |
for (i = 0; i < joined_list_entries; i++) {
|
|
|
a125f5 |
- crm_info("Group event %s.%d: node %u joined: %llu"
|
|
|
a125f5 |
- " (unchecked for rivals)",
|
|
|
a125f5 |
+ crm_info("Group %s event %d: node %u pid %u joined%s",
|
|
|
a125f5 |
groupName->value, counter, joined_list[i].nodeid,
|
|
|
a125f5 |
- (unsigned long long) joined_list[i].pid);
|
|
|
a125f5 |
+ joined_list[i].pid, cpgreason2str(joined_list[i].reason));
|
|
|
a125f5 |
}
|
|
|
a125f5 |
|
|
|
a125f5 |
for (i = 0; i < member_list_entries; i++) {
|
|
|
a125f5 |
crm_node_t *peer = crm_get_peer(member_list[i].nodeid, NULL);
|
|
|
a125f5 |
|
|
|
a125f5 |
- crm_info("Group event %s.%d: node %u (%s) is member: %llu"
|
|
|
a125f5 |
- " (at least once)",
|
|
|
a125f5 |
- groupName->value, counter, member_list[i].nodeid,
|
|
|
a125f5 |
- PEER_NAME(peer), member_list[i].pid);
|
|
|
a125f5 |
-
|
|
|
a125f5 |
if (member_list[i].nodeid == local_nodeid
|
|
|
a125f5 |
&& member_list[i].pid != getpid()) {
|
|
|
a125f5 |
/* see the note above */
|
|
|
a125f5 |
- crm_info("Ignoring the above event %s.%d, comes from a local rival"
|
|
|
a125f5 |
- " process: %llu", groupName->value, counter,
|
|
|
a125f5 |
- (unsigned long long) member_list[i].pid);
|
|
|
a125f5 |
+ crm_warn("Group %s event %d: detected duplicate local pid %u",
|
|
|
a125f5 |
+ groupName->value, counter, member_list[i].pid);
|
|
|
a125f5 |
continue;
|
|
|
a125f5 |
}
|
|
|
a125f5 |
+ crm_info("Group %s event %d: %s (node %u pid %u) is member",
|
|
|
a125f5 |
+ groupName->value, counter, peer_name(peer),
|
|
|
a125f5 |
+ member_list[i].nodeid, member_list[i].pid);
|
|
|
a125f5 |
|
|
|
a125f5 |
/* If the caller left auto-reaping enabled, this will also update the
|
|
|
a125f5 |
* state to member.
|
|
|
a125f5 |
@@ -492,8 +514,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
a125f5 |
|
|
|
a125f5 |
} else if (now > (peer->when_lost + 60)) {
|
|
|
a125f5 |
// If it persists for more than a minute, update the state
|
|
|
a125f5 |
- crm_warn("Node %u member of group %s but believed offline"
|
|
|
a125f5 |
- " (unchecked for rivals)",
|
|
|
a125f5 |
+ crm_warn("Node %u is member of group %s but was believed offline",
|
|
|
a125f5 |
member_list[i].nodeid, groupName->value);
|
|
|
a125f5 |
crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_MEMBER, 0);
|
|
|
a125f5 |
}
|
|
|
a125f5 |
@@ -505,7 +526,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
a125f5 |
}
|
|
|
a125f5 |
|
|
|
a125f5 |
if (!found) {
|
|
|
a125f5 |
- crm_err("We're not part of CPG group '%s' anymore!", groupName->value);
|
|
|
a125f5 |
+ crm_err("Local node was evicted from group %s", groupName->value);
|
|
|
a125f5 |
cpg_evicted = TRUE;
|
|
|
a125f5 |
}
|
|
|
a125f5 |
|
|
|
a125f5 |
--
|
|
|
a125f5 |
1.8.3.1
|
|
|
a125f5 |
|