--- a/heartbeat/galera	2020-10-28 16:28:48.125700714 +0100
+++ b/heartbeat/galera	2020-10-28 16:31:14.932820752 +0100
@@ -81,6 +81,11 @@
     . /etc/default/clustercheck
 fi
 
+# Parameter defaults
+
+OCF_RESKEY_two_node_mode_default="false"
+: ${OCF_RESKEY_two_node_mode=${OCF_RESKEY_two_node_mode_default}}
+
 #######################################################################
 
 usage() {
@@ -249,6 +254,16 @@
+
+
+If running in a 2-node pacemaker cluster, rely on pacemaker quorum
+to allow automatic recovery even when the other node is unreachable.
+Use it with caution! (and fencing)
+
+Special recovery when running on a 2-node cluster
+
+
+
@@ -400,6 +415,37 @@
     return 0
 }
 
+# Succeed when two_node_mode is enabled and crm_mon reports exactly
+# two cluster nodes of type 'member'.
+is_two_node_mode_active()
+{
+    # crm_node or corosync-quorumtool cannot access various corosync
+    # flags when running inside a bundle, so only count the cluster
+    # members
+    ocf_is_true "$OCF_RESKEY_two_node_mode" && ${HA_SBIN_DIR}/crm_mon -1X | xmllint --xpath "count(//nodes/node[@type='member'])" - | grep -q -w 2
+}
+
+# Succeed only when crm_node lists a single partition member, crm_node -q
+# prints 1 and crm_mon shows two member nodes with unclean="false".
+is_last_node_in_quorate_partition()
+{
+    # when a network split occurs in a 2-node cluster, pacemaker
+    # fences the other node and tries to retain quorum. So until
+    # the fencing is resolved (and the status of the peer node
+    # is clean), we shouldn't consider ourselves quorate.
+    local partition_members quorate clean_members
+
+    # Declared apart from the assignments: some sh implementations
+    # word-split the value in 'local v=$(cmd)'.
+    partition_members=$(${HA_SBIN_DIR}/crm_node -p | wc -w)
+    quorate=$(${HA_SBIN_DIR}/crm_node -q)
+    clean_members=$(${HA_SBIN_DIR}/crm_mon -1X | xmllint --xpath 'count(//nodes/node[@type="member" and @unclean="false"])' -)
+
+    # wc always prints an integer but may pad it with blanks, so
+    # compare the member count numerically
+    [ "$partition_members" -eq 1 ] && [ "$quorate" = 1 ] && [ "$clean_members" = 2 ]
+}
+
 master_exists()
 {
     if [ "$__OCF_ACTION" = "demote" ]; then
@@ -518,8 +564,19 @@
     done
 
     for node in $nodes_recovered $nodes; do
+        # On clean shutdown, galera sets the last stopped node as 'safe to bootstrap',
+        # so use this hint when we can
         safe_to_bootstrap=$(get_safe_to_bootstrap $node)
 
+        # Special case for 2-node clusters: during a network split, rely on
+        # pacemaker's quorum to check whether we can restart galera
+        if [ "$safe_to_bootstrap" != "1" ] && [ "$node" = "$NODENAME" ] && is_two_node_mode_active; then
+            if is_last_node_in_quorate_partition; then
+                ocf_log warn "Survived a split in a 2-node cluster, considering ourselves safe to bootstrap"
+                safe_to_bootstrap=1
+            fi
+        fi
+
         if [ "$safe_to_bootstrap" = "1" ]; then
             # Galera marked the node as safe to boostrap during shutdown. Let's just
             # pick it as our bootstrap node.