From 200559d8ca0b834f90d4f2ba70e8f7ce403b9726 Mon Sep 17 00:00:00 2001
From: Tomas Jelinek <tojeline@redhat.com>
Date: Fri, 16 Jan 2015 16:31:49 +0100
Subject: [PATCH] Warn if nodes stop will cause a loss of the quorum
---
pcs/cluster.py | 68 +++++++++++
pcs/utils.py | 77 ++++++++++--
pcsd/public/js/pcsd.js | 41 ++++++-
pcsd/remote.rb | 35 ++++--
5 files changed, 513 insertions(+), 17 deletions(-)
diff --git a/pcs/cluster.py b/pcs/cluster.py
index 05dc91a..e4829f1 100644
--- a/pcs/cluster.py
+++ b/pcs/cluster.py
@@ -676,6 +676,55 @@ def stop_cluster_all():
stop_cluster_nodes(utils.getNodesFromCorosyncConf())
def stop_cluster_nodes(nodes):
+ all_nodes = utils.getNodesFromCorosyncConf()
+ unknown_nodes = set(nodes) - set(all_nodes)
+ if unknown_nodes:
+ utils.err(
+ "nodes '%s' do not appear to exist in configuration"
+ % "', '".join(unknown_nodes)
+ )
+
+ stopping_all = set(nodes) >= set(all_nodes)
+ if (
+ not "--force" in utils.pcs_options
+ and
+ not stopping_all
+ and
+ not utils.is_rhel6()
+ ):
+ # we are sure we are not on cman cluster because only nodes from
+ # a local cluster can be stopped (see nodes validation above)
+ error_list = []
+ for node in nodes:
+ retval, data = utils.get_remote_quorumtool_output(node)
+ if retval != 0:
+ error_list.append(node + ": " + data)
+ continue
+ quorum_info = utils.parse_quorumtool_output(data)
+ if quorum_info:
+ if not quorum_info["quorate"]:
+ continue
+ if utils.is_node_stop_cause_quorum_loss(
+ quorum_info, local=False, node_list=nodes
+ ):
+ utils.err(
+ "Stopping the node(s) will cause a loss of the quorum"
+ + ", use --force to override"
+ )
+ else:
+ # We have the info, no need to print errors
+ error_list = []
+ break
+ if data.strip() != "Cannot initialize CMAP service":
+ error_list.append("Unable to get quorum status")
+ # else the node seems to be stopped already
+ if error_list:
+ utils.err(
+ "Unable to determine whether stopping the nodes will cause "
+ + "a loss of the quorum, use --force to override\n"
+ + "\n".join(error_list)
+ )
+
threads = dict()
for node in nodes:
threads[node] = NodeStopPacemakerThread(node)
@@ -764,6 +813,25 @@ def stop_cluster(argv):
stop_cluster_nodes(argv)
return
+ if not "--force" in utils.pcs_options and not utils.is_rhel6():
+ output, retval = utils.run(["corosync-quorumtool", "-p", "-s"])
+ # retval is 0 on success if node is not in partition with quorum
+ # retval is 1 on error OR on success if node has quorum
+ quorum_info = utils.parse_quorumtool_output(output)
+ if quorum_info:
+ if utils.is_node_stop_cause_quorum_loss(quorum_info, local=True):
+ utils.err(
+ "Stopping the node will cause a loss of the quorum"
+ + ", use --force to override"
+ )
+ else:
+ if output.strip() != "Cannot initialize CMAP service":
+ utils.err(
+ "Unable to determine whether stopping the node will cause "
+ + "a loss of the quorum, use --force to override"
+ )
+ # else the node seems to be stopped already, proceed to be sure
+
stop_all = (
"--pacemaker" not in utils.pcs_options
and
diff --git a/pcs/utils.py b/pcs/utils.py
index 82b71c6..2f718e7 100644
--- a/pcs/utils.py
+++ b/pcs/utils.py
@@ -211,15 +211,15 @@ def setCorosyncConfig(node,config):
def startCluster(node, quiet=False):
return sendHTTPRequest(node, 'remote/cluster_start', None, False, not quiet)
-def stopCluster(node, quiet=False, pacemaker=True, corosync=True):
- if (pacemaker and corosync) or (not pacemaker and not corosync):
- data = None
- elif pacemaker:
- data = {"component": "pacemaker"}
- elif corosync:
- data = {"component": "corosync"}
- if data:
- data = urllib.urlencode(data)
+# Ask pcsd on the given node to stop cluster daemons by sending a request to
+# 'remote/cluster_stop'.
+# pacemaker, corosync - when exactly one of them is true, only that component
+#   is named in the "component" parameter; otherwise no component is sent
+# force - when true, "force" = 1 is sent as well; pcsd's cluster_stop handler
+#   then appends --force to the "pcs cluster stop" it runs on the node.
+#   Note that the default is True.
+# quiet - its negation is passed as the last argument of sendHTTPRequest
+# Returns whatever sendHTTPRequest returns.
+def stopCluster(node, quiet=False, pacemaker=True, corosync=True, force=True):
+ data = dict()
+ if pacemaker and not corosync:
+ data["component"] = "pacemaker"
+ elif corosync and not pacemaker:
+ data["component"] = "corosync"
+ if force:
+ data["force"] = 1
+ data = urllib.urlencode(data)
return sendHTTPRequest(node, 'remote/cluster_stop', data, False, not quiet)
def enableCluster(node):
@@ -2286,3 +2286,62 @@ def get_operations_from_transitions(transitions_dom):
op_list = [op[1] for op in operation_list]
return op_list
+# Fetch the quorum status text of a remote node through pcsd's
+# 'remote/get_quorum_info' handler.
+# Returns the result of sendHTTPRequest unchanged; the caller in
+# pcs/cluster.py unpacks it as (retval, data).
+def get_remote_quorumtool_output(node):
+    request_url = "remote/get_quorum_info"
+    return sendHTTPRequest(node, request_url, None, False, False)
+
+# Parse the text printed by "corosync-quorumtool -p -s".
+# Returns a dict with keys:
+#   "quorate" - bool, True when the "Quorate" field reads "yes" (any case)
+#   "quorum" - int, the leading number of the "Quorum" field
+#   "node_list" - list of {"name", "votes", "local"} dicts, one for each row
+#       following the "Membership information" heading
+# Returns None when any of those keys is missing or the text cannot be parsed
+# (non-numeric votes or quorum, a node row with too few columns).
+def parse_quorumtool_output(quorumtool_output):
+    parsed = {}
+    in_node_list = False
+    try:
+        for line in quorumtool_output.split("\n"):
+            line = line.strip()
+            if not line:
+                continue
+            if in_node_list:
+                if line.startswith(("-", "Nodeid")):
+                    # skip headers
+                    continue
+                # whitespace separated row: votes in column 1, node name in
+                # column 3, optional "(local)" marker in column 4
+                parts = line.split()
+                parsed["node_list"].append({
+                    "name": parts[3],
+                    "votes": int(parts[1]),
+                    "local": len(parts) > 4 and parts[4] == "(local)",
+                })
+            else:
+                if line == "Membership information":
+                    # everything below this heading is the node table
+                    in_node_list = True
+                    parsed["node_list"] = []
+                    continue
+                if ":" not in line:
+                    continue
+                # a real list is needed here: map() returns a non-indexable
+                # iterator on Python 3
+                parts = [part.strip() for part in line.split(":", 1)]
+                if parts[0] == "Quorate":
+                    parsed["quorate"] = parts[1].lower() == "yes"
+                elif parts[0] == "Quorum":
+                    # raw string keeps \d a regex escape, not a string escape
+                    match = re.match(r"(\d+).*", parts[1])
+                    if match:
+                        parsed["quorum"] = int(match.group(1))
+                    else:
+                        return None
+    except (ValueError, IndexError):
+        return None
+    for required in ("quorum", "quorate", "node_list"):
+        if required not in parsed:
+            return None
+    return parsed
+
+# Tell whether the nodes left running would hold fewer votes than the quorum.
+# quorum_info - dict with "quorate", "quorum" and optionally "node_list"
+#   (entries carrying "name", "votes" and "local")
+# node_list - nodes to stop
+# local - local node is going to be stopped
+# Returns False right away when quorum_info says the node is not quorate.
+def is_node_stop_cause_quorum_loss(quorum_info, local=True, node_list=None):
+    if not quorum_info["quorate"]:
+        return False
+    # only nodes that are not going to be stopped contribute their votes
+    remaining_votes = sum(
+        member["votes"]
+        for member in quorum_info.get("node_list", [])
+        if not (local and member["local"])
+        and not (node_list and member["name"] in node_list)
+    )
+    return remaining_votes < quorum_info["quorum"]
+
diff --git a/pcsd/public/js/pcsd.js b/pcsd/public/js/pcsd.js
index cbfc621..30473c9 100644
--- a/pcsd/public/js/pcsd.js
+++ b/pcsd/public/js/pcsd.js
@@ -481,7 +481,7 @@ function setup_node_links() {
$("#node_stop").click(function() {
node = $("#node_info_header_title_name").text();
fade_in_out("#node_stop");
- $.post('/remote/cluster_stop', {"name": $.trim(node)});
+ node_stop($.trim(node), false);
});
$("#node_restart").click(function() {
node = $("#node_info_header_title_name").text();
@@ -500,6 +500,45 @@ function setup_node_links() {
});
}
+/*
+  Ask pcsd to stop the cluster on the given node.
+  node  - name of the node to stop
+  force - when truthy, it is sent along as the "force" parameter
+  A failure whose message mentions --force is offered to the user for
+  confirmation; once confirmed, the request is repeated with force set.
+*/
+function node_stop(node, force) {
+  var request_data = {"name": node};
+  if (force) {
+    request_data["force"] = force;
+  }
+  $.ajax({
+    type: 'POST',
+    url: '/remote/cluster_stop',
+    data: request_data,
+    timeout: pcs_timeout,
+    success: function() {
+    },
+    error: function(xhr, status, error) {
+      var reason = $.trim(error);
+      if (status == "timeout" || reason == "timeout") {
+        /*
+         Timeouts are deliberately ignored because:
+         - it can take minutes to stop a node (resources running on it have
+           to be stopped/moved and we do not need to wait for that)
+         - if pcs is not able to stop a node it returns a (forceable) error
+           immediately
+        */
+        return;
+      }
+      var message = "Unable to stop node '" + node + "' (" + reason + ")"
+        + "\n" + xhr.responseText;
+      if (message.indexOf('--force') == -1) {
+        // not a forceable error, just report it
+        alert(message);
+        return;
+      }
+      message = message.replace(', use --force to override', '');
+      if (confirm(message + "\n\nDo you want to force the operation?")) {
+        node_stop(node, true);
+      }
+    }
+  });
+}
+
function setup_resource_links(link_type) {
Ember.debug("Setup resource links");
$("#resource_delete_link").click(function () {
diff --git a/pcsd/remote.rb b/pcsd/remote.rb
index e818288..cbbcc12 100644
--- a/pcsd/remote.rb
+++ b/pcsd/remote.rb
@@ -21,6 +21,8 @@ def remote(params,request)
return setup_cluster(params)
when "create_cluster"
return create_cluster(params)
+ when "get_quorum_info"
+ return get_quorum_info(params)
when "get_cib"
return get_cib(params)
when "get_corosync_conf"
@@ -156,18 +158,22 @@ def cluster_stop(params)
params[:name], 'cluster_stop', true, params_without_name
)
else
- options = ""
+ options = []
if params.has_key?("component")
if params["component"].downcase == "pacemaker"
- options = "--pacemaker"
+ options << "--pacemaker"
elsif params["component"].downcase == "corosync"
- options = "--corosync"
+ options << "--corosync"
end
end
- $logger.info "Stopping Daemons #{options}"
- output = `#{PCS} cluster stop #{options}`
- $logger.debug output
- return output
+ options << "--force" if params["force"]
+ $logger.info "Stopping Daemons"
+ stdout, stderr, retval = run_cmd(PCS, "cluster", "stop", *options)
+ if retval != 0
+ return [400, stderr.join]
+ else
+ return stdout.join
+ end
end
end
@@ -274,6 +280,21 @@ def cluster_disable(params)
end
end
+# Return the text printed by "corosync-quorumtool -p -s" on this node.
+# When ISRHEL6 is set, an empty string is returned and nothing is run.
+# If the tool wrote anything to stderr, that text is returned instead of
+# its stdout.
+def get_quorum_info(params)
+  return '' if ISRHEL6
+
+  # retval is 0 on success if node is not in partition with quorum
+  # retval is 1 on error OR on success if node has quorum
+  # hence the exit code is not used to choose what to return
+  stdout, stderr, _retval = run_cmd("corosync-quorumtool", "-p", "-s")
+  stderr.empty? ? stdout.join : stderr.join
+end
+
def get_cib(params)
cib, stderr, retval = run_cmd(CIBADMIN, "-Ql")
if retval != 0
--
1.9.1