Blob Blame History Raw
From 4d98bbcdadda60166faf7ccc512b9095b439e2bd Mon Sep 17 00:00:00 2001
From: Damien Ciabrini <dciabrin@redhat.com>
Date: Tue, 2 Feb 2016 16:29:10 +0100
Subject: [PATCH] galera: prevent recovered nodes from bootstrapping cluster
 when possible

---
 heartbeat/README.galera | 19 ++++++++++++++++++-
 heartbeat/galera        | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/heartbeat/galera b/heartbeat/galera
index ca94c21..84c92fd 100755
--- a/heartbeat/galera
+++ b/heartbeat/galera
@@ -276,6 +276,22 @@ is_bootstrap()
 
 }
 
+set_heuristic_recovered()
+{
+    ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -v "true"
+}
+
+clear_heuristic_recovered()
+{
+    ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -D
+}
+
+is_heuristic_recovered()
+{
+    local node=$1
+    ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -Q 2>/dev/null
+}
+
 clear_last_commit()
 {
     ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -D
@@ -398,8 +414,19 @@ detect_first_master()
     local best_node="$NODENAME"
     local last_commit=0
     local missing_nodes=0
+    local nodes=""
+    local nodes_recovered=""
 
+    # avoid selecting a recovered node as bootstrap if possible
     for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do
+        if is_heuristic_recovered $node; then
+            nodes_recovered="$nodes_recovered $node"
+        else
+            nodes="$nodes $node"
+        fi
+    done
+
+    for node in $nodes_recovered $nodes; do
         last_commit=$(get_last_commit $node)
 
         if [ -z "$last_commit" ]; then
@@ -466,6 +493,12 @@ detect_last_commit()
                                      --tc-heuristic-recover=rollback > $tmp 2>/dev/null
 
                 last_commit="$(cat $tmp | sed -n $recovered_position_regex)"
+                if [ ! -z "$last_commit" ]; then
+                    ocf_log warn "State recovered. force SST at next restart for full resynchronization"
+                    rm -f ${OCF_RESKEY_datadir}/grastate.dat
+                    # try not to use this node if bootstrap is needed
+                    set_heuristic_recovered
+                fi
             fi
         fi
         rm -f $tmp $tmperr
@@ -549,11 +582,17 @@ galera_promote()
     if ocf_is_true $bootstrap; then
         promote_everyone
         clear_bootstrap_node
+        # clear attribute heuristic-recovered. if last shutdown was
+        # not clean, we cannot be extra-cautious by requesting a SST
+        # since this is the bootstrap node
+        clear_heuristic_recovered
         ocf_log info "Bootstrap complete, promoting the rest of the galera instances."
     else
         # if this is not the bootstrap node, make sure this instance
         # syncs with the rest of the cluster before promotion returns.
         wait_for_sync
+        # sync is done, clear info about last recovery
+        clear_heuristic_recovered
     fi
 
     ocf_log info "Galera started"