|
|
4531df |
--- a/heartbeat/galera 2020-10-28 16:28:48.125700714 +0100
|
|
|
4531df |
+++ b/heartbeat/galera 2020-10-28 16:31:14.932820752 +0100
|
|
|
4531df |
@@ -81,6 +81,11 @@
|
|
|
4531df |
. /etc/default/clustercheck
|
|
|
4531df |
fi
|
|
|
4531df |
|
|
|
4531df |
+# Parameter defaults
|
|
|
4531df |
+
|
|
|
4531df |
+OCF_RESKEY_two_node_mode_default="false"
|
|
|
4531df |
+: ${OCF_RESKEY_two_node_mode=${OCF_RESKEY_two_node_mode_default}}
|
|
|
4531df |
+
|
|
|
4531df |
#######################################################################
|
|
|
4531df |
|
|
|
4531df |
usage() {
|
|
|
4531df |
@@ -249,6 +254,16 @@
|
|
|
4531df |
<content type="string" default="" />
|
|
|
4531df |
</parameter>
|
|
|
4531df |
|
|
|
4531df |
+<parameter name="two_node_mode" unique="0" required="0">
|
|
|
4531df |
+<longdesc lang="en">
|
|
|
4531df |
+If running in a 2-node pacemaker cluster, rely on pacemaker quorum
|
|
|
4531df |
+to allow automatic recovery even when the other node is unreachable.
|
|
|
4531df |
+Use it with caution! (and fencing)
|
|
|
4531df |
+</longdesc>
|
|
|
4531df |
+<shortdesc lang="en">Special recovery when running on a 2-node cluster</shortdesc>
|
|
|
4531df |
+<content type="boolean" default="${OCF_RESKEY_two_node_mode_default}"/>
|
|
|
4531df |
+</parameter>
|
|
|
4531df |
+
|
|
|
4531df |
</parameters>
|
|
|
4531df |
|
|
|
4531df |
<actions>
|
|
|
4531df |
@@ -400,6 +415,27 @@
|
|
|
4531df |
return 0
|
|
|
4531df |
}
|
|
|
4531df |
|
|
|
4531df |
+is_two_node_mode_active()
|
|
|
4531df |
+{
|
|
|
4531df |
+ # crm_node or corosync-quorumtool cannot access various corosync
|
|
|
4531df |
+ # flags when running inside a bundle, so only count the cluster
|
|
|
4531df |
+ # members
|
|
|
4531df |
+ ocf_is_true "$OCF_RESKEY_two_node_mode" && ${HA_SBIN_DIR}/crm_mon -1X | xmllint --xpath "count(//nodes/node[@type='member'])" - | grep -q -w 2
|
|
|
4531df |
+}
|
|
|
4531df |
+
|
|
|
4531df |
+is_last_node_in_quorate_partition()
|
|
|
4531df |
+{
|
|
|
4531df |
+ # when a network split occurs in a 2-node cluster, pacemaker
|
|
|
4531df |
+ # fences the other node and tries to retain quorum. So until
|
|
|
4531df |
+ # the fencing is resolved (and the status of the peer node
|
|
|
4531df |
+ # is clean), we shouldn't consider ourselves quorate.
|
|
|
4531df |
+ local partition_members=$(${HA_SBIN_DIR}/crm_node -p | wc -w)
|
|
|
4531df |
+ local quorate=$(${HA_SBIN_DIR}/crm_node -q)
|
|
|
4531df |
+ local clean_members=$(${HA_SBIN_DIR}/crm_mon -1X | xmllint --xpath 'count(//nodes/node[@type="member" and @unclean="false"])' -)
|
|
|
4531df |
+
|
|
|
4531df |
+ [ "$partition_members" = 1 ] && [ "$quorate" = 1 ] && [ "$clean_members" = 2 ]
|
|
|
4531df |
+}
|
|
|
4531df |
+
|
|
|
4531df |
master_exists()
|
|
|
4531df |
{
|
|
|
4531df |
if [ "$__OCF_ACTION" = "demote" ]; then
|
|
|
4531df |
@@ -518,8 +554,20 @@
|
|
|
4531df |
done
|
|
|
4531df |
|
|
|
4531df |
for node in $nodes_recovered $nodes; do
|
|
|
4531df |
+ # On clean shutdown, galera sets the last stopped node as 'safe to bootstrap',
|
|
|
4531df |
+ # so use this hint when we can
|
|
|
4531df |
safe_to_bootstrap=$(get_safe_to_bootstrap $node)
|
|
|
4531df |
|
|
|
4531df |
+ # Special case for 2-node clusters: during a network split, rely on
|
|
|
4531df |
+ # pacemaker's quorum to check whether we can restart galera
|
|
|
4531df |
+ if [ "$safe_to_bootstrap" != "1" ] && [ "$node" = "$NODENAME" ] && is_two_node_mode_active; then
|
|
|
4531df |
+ is_last_node_in_quorate_partition
|
|
|
4531df |
+ if [ $? -eq 0 ]; then
|
|
|
4531df |
+ ocf_log warn "Survived a split in a 2-node cluster, considering ourselves safe to bootstrap"
|
|
|
4531df |
+ safe_to_bootstrap=1
|
|
|
4531df |
+ fi
|
|
|
4531df |
+ fi
|
|
|
4531df |
+
|
|
|
4531df |
if [ "$safe_to_bootstrap" = "1" ]; then
|
|
|
4531df |
# Galera marked the node as safe to boostrap during shutdown. Let's just
|
|
|
4531df |
# pick it as our bootstrap node.
|