From 7a920e120c63407b55e77d78b05874e545cc5051 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Apr 14 2016 21:44:20 +0000 Subject: import resource-agents-3.9.5-54.el7_2.9 --- diff --git a/SOURCES/bz1318744-galera-crash-recovery.patch b/SOURCES/bz1318744-galera-crash-recovery.patch new file mode 100644 index 0000000..3e51ad0 --- /dev/null +++ b/SOURCES/bz1318744-galera-crash-recovery.patch @@ -0,0 +1,131 @@ +From d9833b68498e306d181be11adf9eee14b646a899 Mon Sep 17 00:00:00 2001 +From: Damien Ciabrini +Date: Tue, 2 Feb 2016 14:34:36 +0100 +Subject: [PATCH] galera: force crash recovery if needed during last commit + detection + +--- + heartbeat/galera | 90 +++++++++++++++++++++++++++++++++++++------------------- + 1 file changed, 60 insertions(+), 30 deletions(-) + +diff --git a/heartbeat/galera b/heartbeat/galera +index 7be2b00..ca94c21 100755 +--- a/heartbeat/galera ++++ b/heartbeat/galera +@@ -525,6 +525,58 @@ detect_first_master() + set_bootstrap_node $best_node + } + ++detect_last_commit() ++{ ++ local last_commit ++ local recover_args="--defaults-file=$OCF_RESKEY_config \ ++ --pid-file=$OCF_RESKEY_pid \ ++ --socket=$OCF_RESKEY_socket \ ++ --datadir=$OCF_RESKEY_datadir \ ++ --user=$OCF_RESKEY_user" ++ local recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p' ++ ++ ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat" ++ last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')" ++ if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then ++ local tmp=$(mktemp) ++ local tmperr=$(mktemp) ++ ++ ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" ++ ++ ${OCF_RESKEY_binary} $recover_args --wsrep-recover > $tmp 2> $tmperr ++ ++ last_commit="$(cat $tmp | sed -n $recovered_position_regex)" ++ if [ -z "$last_commit" ]; then ++ # Galera uses InnoDB's 2pc transactions internally. 
If ++ # server was stopped in the middle of a replication, the ++ # recovery may find a "prepared" XA transaction in the ++ # redo log, and mysql won't recover automatically ++ ++ cat $tmperr | grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' 2>/dev/null ++ if [ $? -eq 0 ]; then ++ # we can only rollback the transaction, but that's OK ++ # since the DB will get resynchronized anyway ++ ocf_log warn "local node <${NODENAME}> was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover" ++ ${OCF_RESKEY_binary} $recover_args --wsrep-recover \ ++ --tc-heuristic-recover=rollback > $tmp 2>/dev/null ++ ++ last_commit="$(cat $tmp | sed -n $recovered_position_regex)" ++ fi ++ fi ++ rm -f $tmp $tmperr ++ fi ++ ++ if [ ! -z "$last_commit" ]; then ++ ocf_log info "Last commit version found: $last_commit" ++ set_last_commit $last_commit ++ return $OCF_SUCCESS ++ else ++ ocf_exit_reason "Unable to detect last known write sequence number" ++ clear_last_commit ++ return $OCF_ERR_GENERIC ++ fi ++} ++ + # For galera, promote is really start + galera_promote() + { +@@ -569,13 +620,15 @@ galera_demote() + clear_bootstrap_node + clear_last_commit + +- # record last commit by "starting" galera. start is just detection of the last sequence number +- galera_start ++ # record last commit for next promotion ++ detect_last_commit ++ rc=$? ++ return $rc + } + + galera_start() + { +- local last_commit ++ local rc + + echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME + if [ $? 
-ne 0 ]; then +@@ -591,34 +644,11 @@ galera_start() + + mysql_common_prepare_dirs + +- ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat" +- last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')" +- if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then +- ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" +- local tmp=$(mktemp) +- ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \ +- --pid-file=$OCF_RESKEY_pid \ +- --socket=$OCF_RESKEY_socket \ +- --datadir=$OCF_RESKEY_datadir \ +- --user=$OCF_RESKEY_user \ +- --wsrep-recover > $tmp 2>&1 +- +- last_commit="$(cat $tmp | sed -n 's/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p')" +- rm -f $tmp +- +- if [ "$last_commit" = "-1" ]; then +- last_commit="0" +- fi +- fi +- +- if [ -z "$last_commit" ]; then +- ocf_exit_reason "Unable to detect last known write sequence number" +- clear_last_commit +- return $OCF_ERR_GENERIC ++ detect_last_commit ++ rc=$? ++ if [ $rc -ne $OCF_SUCCESS ]; then ++ return $rc + fi +- ocf_log info "Last commit version found: $last_commit" +- +- set_last_commit $last_commit + + master_exists + if [ $? 
-eq 0 ]; then diff --git a/SOURCES/bz1318744-galera-heuristic-recovered.patch b/SOURCES/bz1318744-galera-heuristic-recovered.patch new file mode 100644 index 0000000..589fc11 --- /dev/null +++ b/SOURCES/bz1318744-galera-heuristic-recovered.patch @@ -0,0 +1,89 @@ +From 4d98bbcdadda60166faf7ccc512b9095b439e2bd Mon Sep 17 00:00:00 2001 +From: Damien Ciabrini +Date: Tue, 2 Feb 2016 16:29:10 +0100 +Subject: [PATCH] galera: prevent recovered nodes from bootstrapping cluster + when possible + +--- + heartbeat/README.galera | 19 ++++++++++++++++++- + heartbeat/galera | 41 +++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 59 insertions(+), 1 deletion(-) + +diff --git a/heartbeat/galera b/heartbeat/galera +index ca94c21..84c92fd 100755 +--- a/heartbeat/galera ++++ b/heartbeat/galera +@@ -276,6 +276,22 @@ is_bootstrap() + + } + ++set_heuristic_recovered() ++{ ++ ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -v "true" ++} ++ ++clear_heuristic_recovered() ++{ ++ ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -D ++} ++ ++is_heuristic_recovered() ++{ ++ local node=$1 ++ ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -Q 2>/dev/null ++} ++ + clear_last_commit() + { + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -D +@@ -398,8 +414,19 @@ detect_first_master() + local best_node="$NODENAME" + local last_commit=0 + local missing_nodes=0 ++ local nodes="" ++ local nodes_recovered="" + ++ # avoid selecting a recovered node as bootstrap if possible + for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do ++ if is_heuristic_recovered $node; then ++ nodes_recovered="$nodes_recovered $node" ++ else ++ nodes="$nodes $node" ++ fi ++ done ++ ++ for node in $nodes_recovered $nodes; do + last_commit=$(get_last_commit 
$node) + + if [ -z "$last_commit" ]; then +@@ -466,6 +493,12 @@ detect_last_commit() + --tc-heuristic-recover=rollback > $tmp 2>/dev/null + + last_commit="$(cat $tmp | sed -n $recovered_position_regex)" ++ if [ ! -z "$last_commit" ]; then ++ ocf_log warn "State recovered. force SST at next restart for full resynchronization" ++ rm -f ${OCF_RESKEY_datadir}/grastate.dat ++ # try not to use this node if bootstrap is needed ++ set_heuristic_recovered ++ fi + fi + fi + rm -f $tmp $tmperr +@@ -549,11 +582,17 @@ galera_promote() + if ocf_is_true $bootstrap; then + promote_everyone + clear_bootstrap_node ++ # clear attribute heuristic-recovered. if last shutdown was ++ # not clean, we cannot be extra-cautious by requesting a SST ++ # since this is the bootstrap node ++ clear_heuristic_recovered + ocf_log info "Bootstrap complete, promoting the rest of the galera instances." + else + # if this is not the bootstrap node, make sure this instance + # syncs with the rest of the cluster before promotion returns. 
+ wait_for_sync ++ # sync is done, clear info about last recovery ++ clear_heuristic_recovered + fi + + ocf_log info "Galera started" diff --git a/SOURCES/bz1318744-galera-no-grastate.patch b/SOURCES/bz1318744-galera-no-grastate.patch new file mode 100644 index 0000000..8f2ca23 --- /dev/null +++ b/SOURCES/bz1318744-galera-no-grastate.patch @@ -0,0 +1,113 @@ +From 422ef6a2018ebf9d6765e1f2965778f42c6a9d9c Mon Sep 17 00:00:00 2001 +From: Damien Ciabrini +Date: Tue, 15 Mar 2016 18:45:13 +0100 +Subject: [PATCH] galera: don't bootstrap from a node with no grastate.dat when + possible + +--- + heartbeat/README.galera | 9 ++++----- + heartbeat/galera | 36 ++++++++++++++++++++++-------------- + 2 files changed, 26 insertions(+), 19 deletions(-) + +diff --git a/heartbeat/galera b/heartbeat/galera +index 72add3c..e4495be 100755 +--- a/heartbeat/galera ++++ b/heartbeat/galera +@@ -276,20 +276,20 @@ is_bootstrap() + + } + +-set_heuristic_recovered() ++set_no_grastate() + { +- ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -v "true" ++ ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" -v "true" + } + +-clear_heuristic_recovered() ++clear_no_grastate() + { +- ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -D ++ ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" -D + } + +-is_heuristic_recovered() ++is_no_grastate() + { + local node=$1 +- ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -Q 2>/dev/null ++ ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" -Q 2>/dev/null + } + + clear_last_commit() +@@ -419,7 +419,7 @@ detect_first_master() + + # avoid selecting a recovered node as bootstrap if possible + for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | 
tr -s ',' ' '); do +- if is_heuristic_recovered $node; then ++ if is_no_grastate $node; then + nodes_recovered="$nodes_recovered $node" + else + nodes="$nodes $node" +@@ -473,6 +473,12 @@ detect_last_commit() + local tmp=$(mktemp) + local tmperr=$(mktemp) + ++ # if we pass here because grastate.dat doesn't exist, ++ # try not to bootstrap from this node if possible ++ if [ ! -f ${OCF_RESKEY_datadir}/grastate.dat ]; then ++ set_no_grastate ++ fi ++ + ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" + + ${OCF_RESKEY_binary} $recover_args --wsrep-recover > $tmp 2> $tmperr +@@ -496,8 +502,8 @@ detect_last_commit() + if [ ! -z "$last_commit" ]; then + ocf_log warn "State recovered. force SST at next restart for full resynchronization" + rm -f ${OCF_RESKEY_datadir}/grastate.dat +- # try not to use this node if bootstrap is needed +- set_heuristic_recovered ++ # try not to bootstrap from this node if possible ++ set_no_grastate + fi + fi + fi +@@ -582,17 +588,17 @@ galera_promote() + if ocf_is_true $bootstrap; then + promote_everyone + clear_bootstrap_node +- # clear attribute heuristic-recovered. if last shutdown was ++ # clear attribute no-grastate. if last shutdown was + # not clean, we cannot be extra-cautious by requesting a SST + # since this is the bootstrap node +- clear_heuristic_recovered ++ clear_no_grastate + ocf_log info "Bootstrap complete, promoting the rest of the galera instances." + else + # if this is not the bootstrap node, make sure this instance + # syncs with the rest of the cluster before promotion returns. + wait_for_sync +- # sync is done, clear info about last recovery +- clear_heuristic_recovered ++ # sync is done, clear info about last startup ++ clear_no_grastate + fi + + ocf_log info "Galera started" +@@ -611,6 +617,7 @@ galera_demote() + # if this node was previously a bootstrap node, that is no longer the case. 
+ clear_bootstrap_node + clear_last_commit ++ clear_no_grastate + + # record last commit for next promotion + detect_last_commit +@@ -722,6 +729,7 @@ galera_stop() + clear_last_commit + clear_master_score + clear_bootstrap_node ++ clear_no_grastate + return $rc + } + diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index d5d9477..e4bf573 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -32,7 +32,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 3.9.5 -Release: 54%{?dist}.8 +Release: 54%{?dist}.9 License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} @@ -133,6 +133,9 @@ Patch88: bz1304711-galera-custom-host-port.patch Patch89: bz1304811-replace-novacompute-with-nova-compute-wait.patch Patch90: bz1311180-rabbitmq-cluster-forget-stopped-cluster-nodes.patch Patch91: bz1316633-backup-and-restore-rabbitmq-users-during-resource-re.patch +Patch92: bz1318744-galera-crash-recovery.patch +Patch93: bz1318744-galera-heuristic-recovered.patch +Patch94: bz1318744-galera-no-grastate.patch Obsoletes: heartbeat-resources <= %{version} Provides: heartbeat-resources = %{version} @@ -329,6 +332,9 @@ exit 1 %patch89 -p1 %patch90 -p1 %patch91 -p1 +%patch92 -p1 +%patch93 -p1 +%patch94 -p1 %build if [ ! -f configure ]; then @@ -581,6 +587,11 @@ ccs_update_schema > /dev/null 2>&1 ||: %endif %changelog +* Thu Mar 17 2016 Damien Ciabrini - 3.9.5-54.9 +- galera: recover blocked nodes with --tc-heuristic-recover + + Resolves: rhbz#1318744 + * Fri Mar 11 2016 Oyvind Albrigtsen - 3.9.5-54.8 - rabbitmq-cluster: keep users during resource reload