Blame SOURCES/bz1229194-2-votequorum-Fix-auto_tie_breaker-behaviour-in-odd-siz.patch

850067
From b9f5c290b7dedd0a677cdfc25db7dd111245a745 Mon Sep 17 00:00:00 2001
850067
From: Christine Caulfield <ccaulfie@redhat.com>
850067
Date: Thu, 18 Jun 2015 09:57:59 +0100
850067
Subject: [PATCH] votequorum: Fix auto_tie_breaker behaviour in odd-sized clusters
850067
850067
auto_tie_breaker can behave incorrectly in the case of a cluster
850067
with an odd number of nodes. It's possible for a partition to
850067
have quorum while the other side has the ATB node, and both will
850067
continue working. (Of course, in a properly configured cluster one side
850067
will be fenced, but that becomes an indeterminate race -- just what ATB
850067
is supposed to avoid.)
850067
850067
This patch prevents ATB from running in a partition if the 'other'
850067
partition might have quorum, and also mandates the use of wait_for_all
850067
in clusters with an odd number of nodes so that a quorate partition
850067
cannot start services or fence an existing partition with the tie
850067
breaker node.
850067
850067
Signed-Off-By: Christine Caulfield <ccaulfie@redhat.com>
850067
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
850067
---
850067
 exec/votequorum.c |   31 +++++++++++++++++++++++++++++++
850067
 1 files changed, 31 insertions(+), 0 deletions(-)
850067
850067
diff --git a/exec/votequorum.c b/exec/votequorum.c
850067
index f6faa25..62c8cf3 100644
850067
--- a/exec/votequorum.c
850067
+++ b/exec/votequorum.c
850067
@@ -1011,7 +1011,10 @@ static void are_we_quorate(unsigned int total_votes)
850067
 	}
850067
 
850067
 	if ((auto_tie_breaker != ATB_NONE) &&
850067
+	    /* Must be a half (or half-1) split */
850067
 	    (total_votes == (us->expected_votes / 2)) &&
850067
+	    /* If the 'other' partition in a split might have quorum then we can't run ATB */
850067
+	    (previous_quorum_members_entries - quorum_members_entries < quorum) &&
850067
 	    (check_auto_tie_breaker() == 1)) {
850067
 		quorate = 1;
850067
 	}
850067
@@ -1331,6 +1334,34 @@ static char *votequorum_readconfig(int runtime)
850067
 		log_printf(LOGSYS_LEVEL_CRIT, "two_node has been disabled, please fix your corosync.conf");
850067
 		two_node = 0;
850067
 	}
850067
+
850067
+	/* If ATB is set and the cluster has an odd number of nodes then wait_for_all needs
850067
+	 * to be set so that an isolated half+1 without the tie breaker node
850067
+	 * does not have quorum on reboot.
850067
+	 */
850067
+	if ((auto_tie_breaker != ATB_NONE) && (node_expected_votes % 2) &&
850067
+	    (!wait_for_all)) {
850067
+		if (last_man_standing) {
850067
+			/* if LMS is set too, it's a fatal configuration error. We can't dictate to the user what
850067
+			 *  they might want so we'll just quit.
850067
+			 */
850067
+			log_printf(LOGSYS_LEVEL_CRIT, "auto_tie_breaker is set, the cluster has an odd number of nodes\n");
850067
+			log_printf(LOGSYS_LEVEL_CRIT, "and last_man_standing is also set. With this situation a better\n");
850067
+			log_printf(LOGSYS_LEVEL_CRIT, "solution would be to disable LMS, leave ATB enabled, and also\n");
850067
+			log_printf(LOGSYS_LEVEL_CRIT, "enable wait_for_all (mandatory for ATB in odd-numbered clusters).\n");
850067
+			log_printf(LOGSYS_LEVEL_CRIT, "Due to this ambiguity, corosync will fail to start. Please fix your corosync.conf\n");
850067
+			error = (char *)"configuration error: auto_tie_breaker & last_man_standing not available in odd sized cluster";
850067
+			goto out;
850067
+		}
850067
+		else {
850067
+			log_printf(LOGSYS_LEVEL_CRIT, "auto_tie_breaker is set and the cluster has an odd number of nodes.\n");
850067
+			log_printf(LOGSYS_LEVEL_CRIT, "wait_for_all needs to be set for this configuration but it is missing\n");
850067
+			log_printf(LOGSYS_LEVEL_CRIT, "Therefore auto_tie_breaker has been disabled. Please fix your corosync.conf\n");
850067
+			auto_tie_breaker = ATB_NONE;
850067
+			icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker);
850067
+		}
850067
+	}
850067
+
850067
 	/*
850067
 	 * quorum device is not compatible with last_man_standing and auto_tie_breaker
850067
 	 * neither lms or atb can be set at runtime, so there is no need to check for
850067
-- 
850067
1.7.1
850067