|
|
c54a00 |
From 83811e2115f5516a7faec2e653b1be3d58b35a79 Mon Sep 17 00:00:00 2001
|
|
|
c54a00 |
From: Ken Gaillot <kgaillot@redhat.com>
|
|
|
c54a00 |
Date: Fri, 12 Apr 2019 09:46:51 -0500
|
|
|
c54a00 |
Subject: [PATCH 1/2] Log: libcrmcluster: improve CPG membership messages
|
|
|
c54a00 |
|
|
|
c54a00 |
Show CPG event reason when provided by corosync, make messages more readable,
|
|
|
c54a00 |
upgrade duplicate pid messages to warnings (and log only one message in those
|
|
|
c54a00 |
cases).
|
|
|
c54a00 |
|
|
|
c54a00 |
This also fixes a typo in 4d6f6e01 that led to using an index with the wrong
|
|
|
c54a00 |
array, potentially leading to use of an uninitialized value or invalid memory
|
|
|
c54a00 |
access.
|
|
|
c54a00 |
---
|
|
|
c54a00 |
lib/cluster/cpg.c | 95 +++++++++++++++++++++++++++++++++----------------------
|
|
|
c54a00 |
1 file changed, 58 insertions(+), 37 deletions(-)
|
|
|
c54a00 |
|
|
|
c54a00 |
diff --git a/lib/cluster/cpg.c b/lib/cluster/cpg.c
|
|
|
c54a00 |
index c5ecc67..85476be 100644
|
|
|
c54a00 |
--- a/lib/cluster/cpg.c
|
|
|
c54a00 |
+++ b/lib/cluster/cpg.c
|
|
|
c54a00 |
@@ -399,6 +399,32 @@ static int cmp_member_list_nodeid(const void *first,
|
|
|
c54a00 |
return 0;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
+static const char *
|
|
|
c54a00 |
+cpgreason2str(cpg_reason_t reason)
|
|
|
c54a00 |
+{
|
|
|
c54a00 |
+ switch (reason) {
|
|
|
c54a00 |
+ case CPG_REASON_JOIN: return " via cpg_join";
|
|
|
c54a00 |
+ case CPG_REASON_LEAVE: return " via cpg_leave";
|
|
|
c54a00 |
+ case CPG_REASON_NODEDOWN: return " via cluster exit";
|
|
|
c54a00 |
+ case CPG_REASON_NODEUP: return " via cluster join";
|
|
|
c54a00 |
+ case CPG_REASON_PROCDOWN: return " for unknown reason";
|
|
|
c54a00 |
+ default: break;
|
|
|
c54a00 |
+ }
|
|
|
c54a00 |
+ return "";
|
|
|
c54a00 |
+}
|
|
|
c54a00 |
+
|
|
|
c54a00 |
+static inline const char *
|
|
|
c54a00 |
+peer_name(crm_node_t *peer)
|
|
|
c54a00 |
+{
|
|
|
c54a00 |
+ if (peer == NULL) {
|
|
|
c54a00 |
+ return "unknown node";
|
|
|
c54a00 |
+ } else if (peer->uname == NULL) {
|
|
|
c54a00 |
+ return "peer node";
|
|
|
c54a00 |
+ } else {
|
|
|
c54a00 |
+ return peer->uname;
|
|
|
c54a00 |
+ }
|
|
|
c54a00 |
+}
|
|
|
c54a00 |
+
|
|
|
c54a00 |
void
|
|
|
c54a00 |
pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
c54a00 |
const struct cpg_name *groupName,
|
|
|
c54a00 |
@@ -410,7 +436,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
c54a00 |
gboolean found = FALSE;
|
|
|
c54a00 |
static int counter = 0;
|
|
|
c54a00 |
uint32_t local_nodeid = get_local_nodeid(handle);
|
|
|
c54a00 |
- const struct cpg_address *key, **rival, **sorted;
|
|
|
c54a00 |
+ const struct cpg_address *key, **sorted;
|
|
|
c54a00 |
|
|
|
c54a00 |
sorted = malloc(member_list_entries * sizeof(const struct cpg_address *));
|
|
|
c54a00 |
CRM_ASSERT(sorted != NULL);
|
|
|
c54a00 |
@@ -424,11 +450,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
c54a00 |
|
|
|
c54a00 |
for (i = 0; i < left_list_entries; i++) {
|
|
|
c54a00 |
crm_node_t *peer = crm_find_peer(left_list[i].nodeid, NULL);
|
|
|
c54a00 |
-
|
|
|
c54a00 |
- crm_info("Node %u left group %s (peer=%s:%llu, counter=%d.%d)",
|
|
|
c54a00 |
- left_list[i].nodeid, groupName->value,
|
|
|
c54a00 |
- (peer? peer->uname : "<none>"),
|
|
|
c54a00 |
- (unsigned long long) left_list[i].pid, counter, i);
|
|
|
c54a00 |
+ const struct cpg_address **rival = NULL;
|
|
|
c54a00 |
|
|
|
c54a00 |
/* in CPG world, NODE:PROCESS-IN-MEMBERSHIP-OF-G is an 1:N relation
|
|
|
c54a00 |
and not playing by this rule may go wild in case of multiple
|
|
|
c54a00 |
@@ -442,7 +464,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
c54a00 |
also API end-point carriers, and that's what matters locally
|
|
|
c54a00 |
(who's the winner);
|
|
|
c54a00 |
remotely, we will just compare leave_list and member_list and if
|
|
|
c54a00 |
- the left process has it's node retained in member_list (under some
|
|
|
c54a00 |
+ the left process has its node retained in member_list (under some
|
|
|
c54a00 |
other PID, anyway) we will just ignore it as well
|
|
|
c54a00 |
XXX: long-term fix is to establish in-out PID-aware tracking? */
|
|
|
c54a00 |
if (peer) {
|
|
|
c54a00 |
@@ -450,51 +472,51 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
c54a00 |
rival = bsearch(&key, sorted, member_list_entries,
|
|
|
c54a00 |
sizeof(const struct cpg_address *),
|
|
|
c54a00 |
cmp_member_list_nodeid);
|
|
|
c54a00 |
- if (rival == NULL) {
|
|
|
c54a00 |
+ }
|
|
|
c54a00 |
+
|
|
|
c54a00 |
+ if (rival == NULL) {
|
|
|
c54a00 |
+ crm_info("Group %s event %d: %s (node %u pid %u) left%s",
|
|
|
c54a00 |
+ groupName->value, counter, peer_name(peer),
|
|
|
c54a00 |
+ left_list[i].nodeid, left_list[i].pid,
|
|
|
c54a00 |
+ cpgreason2str(left_list[i].reason));
|
|
|
c54a00 |
+ if (peer) {
|
|
|
c54a00 |
crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg,
|
|
|
c54a00 |
OFFLINESTATUS);
|
|
|
c54a00 |
- } else if (left_list[i].nodeid == local_nodeid) {
|
|
|
c54a00 |
- crm_info("Ignoring the above event %s.%d, comes from a local"
|
|
|
c54a00 |
- " rival process (presumably not us): %llu",
|
|
|
c54a00 |
- groupName->value, counter,
|
|
|
c54a00 |
- (unsigned long long) left_list[i].pid);
|
|
|
c54a00 |
- } else {
|
|
|
c54a00 |
- crm_info("Ignoring the above event %s.%d, comes from"
|
|
|
c54a00 |
- " a rival-rich node: %llu (e.g. %llu process"
|
|
|
c54a00 |
- " carries on)",
|
|
|
c54a00 |
- groupName->value, counter,
|
|
|
c54a00 |
- (unsigned long long) left_list[i].pid,
|
|
|
c54a00 |
- (unsigned long long) (*rival)->pid);
|
|
|
c54a00 |
}
|
|
|
c54a00 |
+ } else if (left_list[i].nodeid == local_nodeid) {
|
|
|
c54a00 |
+ crm_warn("Group %s event %d: duplicate local pid %u left%s",
|
|
|
c54a00 |
+ groupName->value, counter,
|
|
|
c54a00 |
+ left_list[i].pid, cpgreason2str(left_list[i].reason));
|
|
|
c54a00 |
+ } else {
|
|
|
c54a00 |
+ crm_warn("Group %s event %d: "
|
|
|
c54a00 |
+ "%s (node %u) duplicate pid %u left%s (%u remains)",
|
|
|
c54a00 |
+ groupName->value, counter, peer_name(peer),
|
|
|
c54a00 |
+ left_list[i].nodeid, left_list[i].pid,
|
|
|
c54a00 |
+ cpgreason2str(left_list[i].reason), (*rival)->pid);
|
|
|
c54a00 |
}
|
|
|
c54a00 |
}
|
|
|
c54a00 |
free(sorted);
|
|
|
c54a00 |
sorted = NULL;
|
|
|
c54a00 |
|
|
|
c54a00 |
for (i = 0; i < joined_list_entries; i++) {
|
|
|
c54a00 |
- crm_info("Node %u joined group %s (counter=%d.%d, pid=%llu,"
|
|
|
c54a00 |
- " unchecked for rivals)",
|
|
|
c54a00 |
- joined_list[i].nodeid, groupName->value, counter, i,
|
|
|
c54a00 |
- (unsigned long long) left_list[i].pid);
|
|
|
c54a00 |
+ crm_info("Group %s event %d: node %u pid %u joined%s",
|
|
|
c54a00 |
+ groupName->value, counter, joined_list[i].nodeid,
|
|
|
c54a00 |
+ joined_list[i].pid, cpgreason2str(joined_list[i].reason));
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
for (i = 0; i < member_list_entries; i++) {
|
|
|
c54a00 |
crm_node_t *peer = crm_get_peer(member_list[i].nodeid, NULL);
|
|
|
c54a00 |
|
|
|
c54a00 |
- crm_info("Node %u still member of group %s (peer=%s:%llu,"
|
|
|
c54a00 |
- " counter=%d.%d, at least once)",
|
|
|
c54a00 |
- member_list[i].nodeid, groupName->value,
|
|
|
c54a00 |
- (peer? peer->uname : "<none>"), member_list[i].pid,
|
|
|
c54a00 |
- counter, i);
|
|
|
c54a00 |
-
|
|
|
c54a00 |
if (member_list[i].nodeid == local_nodeid
|
|
|
c54a00 |
&& member_list[i].pid != getpid()) {
|
|
|
c54a00 |
/* see the note above */
|
|
|
c54a00 |
- crm_info("Ignoring the above event %s.%d, comes from a local rival"
|
|
|
c54a00 |
- " process: %llu", groupName->value, counter,
|
|
|
c54a00 |
- (unsigned long long) member_list[i].pid);
|
|
|
c54a00 |
+ crm_warn("Group %s event %d: detected duplicate local pid %u",
|
|
|
c54a00 |
+ groupName->value, counter, member_list[i].pid);
|
|
|
c54a00 |
continue;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
+ crm_info("Group %s event %d: %s (node %u pid %u) is member",
|
|
|
c54a00 |
+ groupName->value, counter, peer_name(peer),
|
|
|
c54a00 |
+ member_list[i].nodeid, member_list[i].pid);
|
|
|
c54a00 |
|
|
|
c54a00 |
/* Anyone that is sending us CPG messages must also be a _CPG_ member.
|
|
|
c54a00 |
* But it's _not_ safe to assume it's in the quorum membership.
|
|
|
c54a00 |
@@ -514,9 +536,8 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
c54a00 |
*
|
|
|
c54a00 |
* Set the threshold to 1 minute
|
|
|
c54a00 |
*/
|
|
|
c54a00 |
- crm_err("Node %s[%u] appears to be online even though we think"
|
|
|
c54a00 |
- " it is dead (unchecked for rivals)",
|
|
|
c54a00 |
- peer->uname, peer->id);
|
|
|
c54a00 |
+ crm_warn("Node %u is member of group %s but was believed offline",
|
|
|
c54a00 |
+ member_list[i].nodeid, groupName->value);
|
|
|
c54a00 |
if (crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_MEMBER, 0)) {
|
|
|
c54a00 |
peer->votes = 0;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
@@ -529,7 +550,7 @@ pcmk_cpg_membership(cpg_handle_t handle,
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
if (!found) {
|
|
|
c54a00 |
- crm_err("We're not part of CPG group '%s' anymore!", groupName->value);
|
|
|
c54a00 |
+ crm_err("Local node was evicted from group %s", groupName->value);
|
|
|
c54a00 |
cpg_evicted = TRUE;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
--
|
|
|
c54a00 |
1.8.3.1
|
|
|
c54a00 |
|
|
|
c54a00 |
|
|
|
c54a00 |
From 87769895ebccc1033a876ef98a21577d6f4d1c0e Mon Sep 17 00:00:00 2001
|
|
|
c54a00 |
From: Ken Gaillot <kgaillot@redhat.com>
|
|
|
c54a00 |
Date: Thu, 18 Apr 2019 22:18:27 -0500
|
|
|
c54a00 |
Subject: [PATCH 2/2] Fix: libcrmcluster,pacemakerd: restore compatibility with
|
|
|
c54a00 |
corosync 1
|
|
|
c54a00 |
|
|
|
c54a00 |
Pacemaker 1.1 supports older versions of corosync that don't supply
|
|
|
c54a00 |
cs_strerror() or CMAP. This simply drops usage of cs_strerror() (in favor of just
|
|
|
c54a00 |
the raw error code, as before 07a82c5c) and properly conditionalizes CMAP
|
|
|
c54a00 |
usage.
|
|
|
c54a00 |
---
|
|
|
c54a00 |
lib/cluster/cpg.c | 12 ++++--------
|
|
|
c54a00 |
mcp/corosync.c | 13 +++++++------
|
|
|
c54a00 |
2 files changed, 11 insertions(+), 14 deletions(-)
|
|
|
c54a00 |
|
|
|
c54a00 |
diff --git a/lib/cluster/cpg.c b/lib/cluster/cpg.c
|
|
|
c54a00 |
index 85476be..e4783e5 100644
|
|
|
c54a00 |
--- a/lib/cluster/cpg.c
|
|
|
c54a00 |
+++ b/lib/cluster/cpg.c
|
|
|
c54a00 |
@@ -91,15 +91,13 @@ uint32_t get_local_nodeid(cpg_handle_t handle)
|
|
|
c54a00 |
crm_trace("Creating connection");
|
|
|
c54a00 |
cs_repeat(retries, 5, rc = cpg_initialize(&local_handle, &cb));
|
|
|
c54a00 |
if (rc != CS_OK) {
|
|
|
c54a00 |
- crm_err("Could not connect to the CPG API: %s (%d)",
|
|
|
c54a00 |
- cs_strerror(rc), rc);
|
|
|
c54a00 |
+ crm_err("Could not connect to the CPG API (rc=%d)", rc);
|
|
|
c54a00 |
return 0;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
rc = cpg_fd_get(local_handle, &fd);
|
|
|
c54a00 |
if (rc != CS_OK) {
|
|
|
c54a00 |
- crm_err("Could not obtain the CPG API connection: %s (%d)",
|
|
|
c54a00 |
- cs_strerror(rc), rc);
|
|
|
c54a00 |
+ crm_err("Could not obtain the CPG API connection (rc=%d)", rc);
|
|
|
c54a00 |
goto bail;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
@@ -594,15 +592,13 @@ cluster_connect_cpg(crm_cluster_t *cluster)
|
|
|
c54a00 |
|
|
|
c54a00 |
cs_repeat(retries, 30, rc = cpg_initialize(&handle, &cpg_callbacks));
|
|
|
c54a00 |
if (rc != CS_OK) {
|
|
|
c54a00 |
- crm_err("Could not connect to the CPG API: %s (%d)",
|
|
|
c54a00 |
- cs_strerror(rc), rc);
|
|
|
c54a00 |
+ crm_err("Could not connect to the CPG API (rc=%d)", rc);
|
|
|
c54a00 |
goto bail;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
rc = cpg_fd_get(handle, &fd);
|
|
|
c54a00 |
if (rc != CS_OK) {
|
|
|
c54a00 |
- crm_err("Could not obtain the CPG API connection: %s (%d)",
|
|
|
c54a00 |
- cs_strerror(rc), rc);
|
|
|
c54a00 |
+ crm_err("Could not obtain the CPG API connection (rc=%d)", rc);
|
|
|
c54a00 |
goto bail;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
diff --git a/mcp/corosync.c b/mcp/corosync.c
|
|
|
c54a00 |
index 407a63f..40be727 100644
|
|
|
c54a00 |
--- a/mcp/corosync.c
|
|
|
c54a00 |
+++ b/mcp/corosync.c
|
|
|
c54a00 |
@@ -118,13 +118,13 @@ cluster_connect_cfg(uint32_t * nodeid)
|
|
|
c54a00 |
cs_repeat(retries, 30, rc = corosync_cfg_initialize(&cfg_handle, &cfg_callbacks));
|
|
|
c54a00 |
|
|
|
c54a00 |
if (rc != CS_OK) {
|
|
|
c54a00 |
- crm_err("corosync cfg init: %s (%d)", cs_strerror(rc), rc);
|
|
|
c54a00 |
+ crm_err("corosync cfg init error %d", rc);
|
|
|
c54a00 |
return FALSE;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
rc = corosync_cfg_fd_get(cfg_handle, &fd);
|
|
|
c54a00 |
if (rc != CS_OK) {
|
|
|
c54a00 |
- crm_err("corosync cfg fd_get: %s (%d)", cs_strerror(rc), rc);
|
|
|
c54a00 |
+ crm_err("corosync cfg fd_get error %d", rc);
|
|
|
c54a00 |
goto bail;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
@@ -314,8 +314,8 @@ mcp_read_config(void)
|
|
|
c54a00 |
rc = cmap_initialize(&local_handle);
|
|
|
c54a00 |
if (rc != CS_OK) {
|
|
|
c54a00 |
retries++;
|
|
|
c54a00 |
- printf("cmap connection setup failed: %s. Retrying in %ds\n", cs_strerror(rc), retries);
|
|
|
c54a00 |
- crm_info("cmap connection setup failed: %s. Retrying in %ds", cs_strerror(rc), retries);
|
|
|
c54a00 |
+ printf("cmap connection setup failed: error %d. Retrying in %ds\n", rc, retries);
|
|
|
c54a00 |
+ crm_info("cmap connection setup failed: error %d. Retrying in %ds", rc, retries);
|
|
|
c54a00 |
sleep(retries);
|
|
|
c54a00 |
|
|
|
c54a00 |
} else {
|
|
|
c54a00 |
@@ -331,10 +331,10 @@ mcp_read_config(void)
|
|
|
c54a00 |
return FALSE;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
|
|
|
c54a00 |
+#if HAVE_CMAP
|
|
|
c54a00 |
rc = cmap_fd_get(local_handle, &fd);
|
|
|
c54a00 |
if (rc != CS_OK) {
|
|
|
c54a00 |
- crm_err("Could not obtain the CMAP API connection: %s (%d)",
|
|
|
c54a00 |
- cs_strerror(rc), rc);
|
|
|
c54a00 |
+ crm_err("Could not obtain the CMAP API connection: error %d", rc);
|
|
|
c54a00 |
cmap_finalize(local_handle);
|
|
|
c54a00 |
return FALSE;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
@@ -354,6 +354,7 @@ mcp_read_config(void)
|
|
|
c54a00 |
cmap_finalize(local_handle);
|
|
|
c54a00 |
return FALSE;
|
|
|
c54a00 |
}
|
|
|
c54a00 |
+#endif
|
|
|
c54a00 |
|
|
|
c54a00 |
stack = get_cluster_type();
|
|
|
c54a00 |
crm_info("Reading configure for stack: %s", name_for_cluster_type(stack));
|
|
|
c54a00 |
--
|
|
|
c54a00 |
1.8.3.1
|
|
|
c54a00 |
|