SOURCES/bz1891855-galera-recover-2-node-cluster.patch

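Adds an optional two_node_mode parameter to the galera resource agent: when a
network split occurs in a 2-node pacemaker cluster, the surviving node uses
pacemaker's quorum state (rather than galera's safe-to-bootstrap hint alone)
to decide whether it may bootstrap a new galera cluster.
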
--- a/heartbeat/galera	2020-10-28 16:28:48.125700714 +0100
+++ b/heartbeat/galera	2020-10-28 16:31:14.932820752 +0100
@@ -81,6 +81,11 @@
     . /etc/default/clustercheck
 fi
 
+# Parameter defaults
+
+OCF_RESKEY_two_node_mode_default="false"
+: ${OCF_RESKEY_two_node_mode=${OCF_RESKEY_two_node_mode_default}}
+
 #######################################################################
 
 usage() {
@@ -249,6 +254,16 @@
 <content type="string" default="" />
 </parameter>
 
+<parameter name="two_node_mode" unique="0" required="0">
+<longdesc lang="en">
+If running in a 2-node pacemaker cluster, rely on pacemaker quorum
+to allow automatic recovery even when the other node is unreachable.
+Use it with caution, and only with fencing enabled!
+</longdesc>
+<shortdesc lang="en">Special recovery when running on a 2-node cluster</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_two_node_mode_default}"/>
+</parameter>
+
 </parameters>
 
 <actions>
@@ -400,6 +415,27 @@
     return 0
 }
 
+is_two_node_mode_active()
+{
+    # crm_node or corosync-quorumtool cannot access various corosync
+    # flags when running inside a bundle, so only count the cluster
+    # members
+    ocf_is_true "$OCF_RESKEY_two_node_mode" && ${HA_SBIN_DIR}/crm_mon -1X | xmllint --xpath "count(//nodes/node[@type='member'])" - | grep -q -w 2
+}
+
+is_last_node_in_quorate_partition()
+{
+    # when a network split occurs in a 2-node cluster, pacemaker
+    # fences the other node and tries to retain quorum. So until
+    # the fencing is resolved (and the status of the peer node
+    # is clean), we shouldn't consider ourselves quorate.
+    local partition_members=$(${HA_SBIN_DIR}/crm_node -p | wc -w)
+    local quorate=$(${HA_SBIN_DIR}/crm_node -q)
+    local clean_members=$(${HA_SBIN_DIR}/crm_mon -1X | xmllint --xpath 'count(//nodes/node[@type="member" and @unclean="false"])' -)
+
+    [ "$partition_members" = 1 ] && [ "$quorate" = 1 ] && [ "$clean_members" = 2 ]
+}
+
 master_exists()
 {
     if [ "$__OCF_ACTION" = "demote" ]; then
@@ -518,8 +554,20 @@
     done
 
     for node in $nodes_recovered $nodes; do
+        # On clean shutdown, galera sets the last stopped node as 'safe to bootstrap',
+        # so use this hint when we can
         safe_to_bootstrap=$(get_safe_to_bootstrap $node)
 
+        # Special case for 2-node clusters: during a network split, rely on
+        # pacemaker's quorum to check whether we can restart galera
+        if [ "$safe_to_bootstrap" != "1" ] && [ "$node" = "$NODENAME" ] && is_two_node_mode_active; then
+            is_last_node_in_quorate_partition
+            if [ $? -eq 0 ]; then
+                ocf_log warn "Survived a split in a 2-node cluster, considering ourselves safe to bootstrap"
+                safe_to_bootstrap=1
+            fi
+        fi
+
         if [ "$safe_to_bootstrap" = "1" ]; then
             # Galera marked the node as safe to bootstrap during shutdown. Let's just
             # pick it as our bootstrap node.
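
The two helper checks above can be exercised by hand before turning the
option on. The following is a minimal diagnostic sketch, not part of the
patch: it replays the exact commands used by is_two_node_mode_active() and
is_last_node_in_quorate_partition(), but assumes crm_node, crm_mon and
xmllint are on PATH rather than resolved via ${HA_SBIN_DIR}.

    #!/bin/sh
    # is_two_node_mode_active() counts member nodes in the CIB status
    # instead of querying corosync flags, so it also works in a bundle.
    members=$(crm_mon -1X | xmllint --xpath "count(//nodes/node[@type='member'])" -)

    # is_last_node_in_quorate_partition(): this node must be alone in its
    # partition, still quorate, and both nodes must be reported clean,
    # i.e. the fencing triggered by the split has completed.
    partition_members=$(crm_node -p | wc -w)
    quorate=$(crm_node -q)
    clean_members=$(crm_mon -1X | xmllint --xpath 'count(//nodes/node[@type="member" and @unclean="false"])' -)

    echo "members=$members partition=$partition_members quorate=$quorate clean=$clean_members"
    if [ "$members" = "2" ] && [ "$partition_members" = 1 ] && \
       [ "$quorate" = 1 ] && [ "$clean_members" = 2 ]; then
        echo "this node would be considered safe to bootstrap"
    else
        echo "this node would NOT bootstrap on its own"
    fi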
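
To opt in, the parameter has to be set on the galera resource explicitly,
since it defaults to "false". Assuming a pcs-managed cluster with a galera
resource simply named "galera" (both assumptions, not part of the patch),
that would look something like:

    pcs resource update galera two_node_mode=true

As the parameter description warns, this trades safety for availability: it
is only sane with working fencing, because a node that survives a split will
bootstrap a new galera cluster entirely on its own.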