Blame SOURCES/bz1318744-galera-crash-recovery.patch

7a920e
From d9833b68498e306d181be11adf9eee14b646a899 Mon Sep 17 00:00:00 2001
7a920e
From: Damien Ciabrini <dciabrin@redhat.com>
7a920e
Date: Tue, 2 Feb 2016 14:34:36 +0100
7a920e
Subject: [PATCH] galera: force crash recovery if needed during last commit
7a920e
 detection
7a920e
7a920e
---
7a920e
 heartbeat/galera | 90 +++++++++++++++++++++++++++++++++++++-------------------
7a920e
 1 file changed, 60 insertions(+), 30 deletions(-)
7a920e
7a920e
diff --git a/heartbeat/galera b/heartbeat/galera
7a920e
index 7be2b00..ca94c21 100755
7a920e
--- a/heartbeat/galera
7a920e
+++ b/heartbeat/galera
7a920e
@@ -525,6 +525,58 @@ detect_first_master()
7a920e
     set_bootstrap_node $best_node
7a920e
 }
7a920e
 
7a920e
+detect_last_commit()
7a920e
+{
7a920e
+    # Detect the last committed seqno (grastate.dat, else --wsrep-recover) and
7a920e
+    # record it via set_last_commit. Returns OCF_SUCCESS or OCF_ERR_GENERIC.
7a920e
+    local last_commit
7a920e
+    local recover_args="--defaults-file=$OCF_RESKEY_config \
7a920e
+                        --pid-file=$OCF_RESKEY_pid \
7a920e
+                        --socket=$OCF_RESKEY_socket \
7a920e
+                        --datadir=$OCF_RESKEY_datadir \
7a920e
+                        --user=$OCF_RESKEY_user"
7a920e
+    local recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p'
7a920e
+
7a920e
+    ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat"
7a920e
+    last_commit="$(sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p' "${OCF_RESKEY_datadir}/grastate.dat")"
7a920e
+    if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then
7a920e
+        local tmp="$(mktemp)"
7a920e
+        local tmperr="$(mktemp)"
7a920e
+
7a920e
+        ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'"
7a920e
+        # $recover_args stays unquoted on purpose: it must split into separate options
7a920e
+        ${OCF_RESKEY_binary} $recover_args --wsrep-recover > "$tmp" 2> "$tmperr"
7a920e
+
7a920e
+        last_commit="$(sed -n "$recovered_position_regex" "$tmp")"
7a920e
+        if [ -z "$last_commit" ]; then
7a920e
+            # Galera uses InnoDB's 2pc transactions internally. If the
7a920e
+            # server was stopped in the middle of a replication, the
7a920e
+            # recovery may find a "prepared" XA transaction in the
7a920e
+            # redo log, and mysql won't recover automatically
7a920e
+            if grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' "$tmperr" 2>/dev/null; then
7a920e
+                # we can only rollback the transaction, but that's OK
7a920e
+                # since the DB will get resynchronized anyway
7a920e
+                ocf_log warn "local node <${NODENAME}> was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover"
7a920e
+                ${OCF_RESKEY_binary} $recover_args --wsrep-recover \
7a920e
+                                     --tc-heuristic-recover=rollback > "$tmp" 2>/dev/null
7a920e
+
7a920e
+                last_commit="$(sed -n "$recovered_position_regex" "$tmp")"
7a920e
+            fi
7a920e
+        fi
7a920e
+        rm -f "$tmp" "$tmperr"
7a920e
+    fi
7a920e
+
7a920e
+    if [ -n "$last_commit" ]; then
7a920e
+        ocf_log info "Last commit version found:  $last_commit"
7a920e
+        set_last_commit $last_commit
7a920e
+        return $OCF_SUCCESS
7a920e
+    else
7a920e
+        ocf_exit_reason "Unable to detect last known write sequence number"
7a920e
+        clear_last_commit
7a920e
+        return $OCF_ERR_GENERIC
7a920e
+    fi
7a920e
+}
7a920e
+
7a920e
 # For galera, promote is really start
7a920e
 galera_promote()
7a920e
 {
7a920e
@@ -569,13 +620,15 @@ galera_demote()
7a920e
     clear_bootstrap_node
7a920e
     clear_last_commit
7a920e
 
7a920e
-    # record last commit by "starting" galera. start is just detection of the last sequence number
7a920e
-    galera_start
7a920e
+    # record last commit for next promotion
7a920e
+    detect_last_commit
7a920e
+    rc=$?
7a920e
+    return $rc
7a920e
 }
7a920e
 
7a920e
 galera_start()
7a920e
 {
7a920e
-    local last_commit
7a920e
+    local rc
7a920e
 
7a920e
     echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME
7a920e
     if [ $? -ne 0 ]; then
7a920e
@@ -591,34 +644,11 @@ galera_start()
7a920e
 
7a920e
     mysql_common_prepare_dirs
7a920e
 
7a920e
-    ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat"
7a920e
-    last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')"
7a920e
-    if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then
7a920e
-        ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'"
7a920e
-        local tmp=$(mktemp)
7a920e
-        ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \
7a920e
-            --pid-file=$OCF_RESKEY_pid \
7a920e
-            --socket=$OCF_RESKEY_socket \
7a920e
-            --datadir=$OCF_RESKEY_datadir \
7a920e
-            --user=$OCF_RESKEY_user \
7a920e
-            --wsrep-recover > $tmp 2>&1
7a920e
-
7a920e
-        last_commit="$(cat $tmp | sed -n 's/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p')"
7a920e
-        rm -f $tmp
7a920e
-
7a920e
-        if [ "$last_commit" = "-1" ]; then
7a920e
-            last_commit="0"
7a920e
-        fi
7a920e
-    fi
7a920e
-
7a920e
-    if [ -z "$last_commit" ]; then
7a920e
-        ocf_exit_reason "Unable to detect last known write sequence number"
7a920e
-        clear_last_commit
7a920e
-        return $OCF_ERR_GENERIC
7a920e
+    detect_last_commit
7a920e
+    rc=$?
7a920e
+    if [ $rc -ne $OCF_SUCCESS ]; then
7a920e
+        return $rc
7a920e
     fi
7a920e
-    ocf_log info "Last commit version found:  $last_commit"
7a920e
-
7a920e
-    set_last_commit $last_commit
7a920e
 
7a920e
     master_exists
7a920e
     if [ $? -eq 0 ]; then