|
|
bcdf71 |
From 028bd6aab181104fe68166c8ec9c0485e12f9376 Mon Sep 17 00:00:00 2001
|
|
|
bcdf71 |
From: Damien Ciabrini <dciabrin@redhat.com>
|
|
|
bcdf71 |
Date: Fri, 18 Sep 2020 18:34:22 +0200
|
|
|
bcdf71 |
Subject: [PATCH] galera: recover from joining a non existing cluster
|
|
|
bcdf71 |
|
|
|
bcdf71 |
galera being a M/S resource, the resource agent decides
|
|
|
bcdf71 |
when and how to promote a resource based on the current
|
|
|
bcdf71 |
state of the galera cluster. If there's no cluster,
|
|
|
bcdf71 |
a resource is promoted as the bootstrap node. Otherwise
|
|
|
bcdf71 |
it is promoted as a joiner node.
|
|
|
bcdf71 |
|
|
|
bcdf71 |
There can be some time between the moment when a node is
|
|
|
bcdf71 |
promoted and when the promote operation effectively
|
|
|
bcdf71 |
takes place. So if a node is promoted for joining a cluster,
|
|
|
bcdf71 |
and all the running galera nodes are stopped before the promote
|
|
|
bcdf71 |
operation starts, the joining node won't be able to join the
|
|
|
bcdf71 |
cluster, and it can't bootstrap a new one either because it
|
|
|
bcdf71 |
doesn't have the most recent copy of the DB.
|
|
|
bcdf71 |
|
|
|
bcdf71 |
In that case, do not make the promotion fail, and force
|
|
|
bcdf71 |
a demotion instead. This ensures that a normal bootstrap
|
|
|
bcdf71 |
election will take place eventually, without blocking
|
|
|
bcdf71 |
the joining node due to a failed promotion.
|
|
|
bcdf71 |
---
|
|
|
bcdf71 |
heartbeat/galera | 13 ++++++++++---
|
|
|
bcdf71 |
1 file changed, 10 insertions(+), 3 deletions(-)
|
|
|
bcdf71 |
|
|
|
bcdf71 |
diff --git a/heartbeat/galera b/heartbeat/galera
|
|
|
bcdf71 |
index 74f11d8c5..d2f4faa86 100755
|
|
|
bcdf71 |
--- a/heartbeat/galera
|
|
|
bcdf71 |
+++ b/heartbeat/galera
|
|
|
bcdf71 |
@@ -727,9 +727,16 @@ galera_promote()
|
|
|
bcdf71 |
ocf_log info "Node <${NODENAME}> is bootstrapping the cluster"
|
|
|
bcdf71 |
extra_opts="--wsrep-cluster-address=gcomm://"
|
|
|
bcdf71 |
else
|
|
|
bcdf71 |
- ocf_exit_reason "Failure, Attempted to promote Master instance of $OCF_RESOURCE_INSTANCE before bootstrap node has been detected."
|
|
|
bcdf71 |
- clear_last_commit
|
|
|
bcdf71 |
- return $OCF_ERR_GENERIC
|
|
|
bcdf71 |
+ # We are being promoted without having the bootstrap
|
|
|
bcdf71 |
+ # attribute in the CIB, which means we are supposed to
|
|
|
bcdf71 |
+ # join a cluster; however if we end up here, there is no
|
|
|
bcdf71 |
+ # Master remaining right now, which means there is no
|
|
|
bcdf71 |
+ # cluster to join anymore. So force a demotion, and
|
|
|
bcdf71 |
+ # let the RA decide later which node should be the next
|
|
|
bcdf71 |
+ # bootstrap node.
|
|
|
bcdf71 |
+ ocf_log warn "There is no running cluster to join, demoting ourself"
|
|
|
bcdf71 |
+ clear_master_score
|
|
|
bcdf71 |
+ return $OCF_SUCCESS
|
|
|
bcdf71 |
fi
|
|
|
bcdf71 |
fi
|
|
|
bcdf71 |
|