|
|
337c54 |
From 200559d8ca0b834f90d4f2ba70e8f7ce403b9726 Mon Sep 17 00:00:00 2001
|
|
|
337c54 |
From: Tomas Jelinek <tojeline@redhat.com>
|
|
|
337c54 |
Date: Fri, 16 Jan 2015 16:31:49 +0100
|
|
|
337c54 |
Subject: [PATCH] Warn if nodes stop will cause a loss of the quorum
|
|
|
337c54 |
|
|
|
337c54 |
---
|
|
|
337c54 |
pcs/cluster.py | 68 +++++++++++
|
|
|
337c54 |
pcs/utils.py | 77 ++++++++++--
|
|
|
337c54 |
pcsd/public/js/pcsd.js | 41 ++++++-
|
|
|
337c54 |
pcsd/remote.rb | 35 ++++--
|
|
|
337c54 |
5 files changed, 513 insertions(+), 17 deletions(-)
|
|
|
337c54 |
|
|
|
337c54 |
diff --git a/pcs/cluster.py b/pcs/cluster.py
|
|
|
337c54 |
index 05dc91a..e4829f1 100644
|
|
|
337c54 |
--- a/pcs/cluster.py
|
|
|
337c54 |
+++ b/pcs/cluster.py
|
|
|
337c54 |
@@ -676,6 +676,55 @@ def stop_cluster_all():
|
|
|
337c54 |
stop_cluster_nodes(utils.getNodesFromCorosyncConf())
|
|
|
337c54 |
|
|
|
337c54 |
def stop_cluster_nodes(nodes):
|
|
|
337c54 |
+ all_nodes = utils.getNodesFromCorosyncConf()
|
|
|
337c54 |
+ unknown_nodes = set(nodes) - set(all_nodes)
|
|
|
337c54 |
+ if unknown_nodes:
|
|
|
337c54 |
+ utils.err(
|
|
|
337c54 |
+ "nodes '%s' do not appear to exist in configuration"
|
|
|
337c54 |
+ % "', '".join(unknown_nodes)
|
|
|
337c54 |
+ )
|
|
|
337c54 |
+
|
|
|
337c54 |
+ stopping_all = set(nodes) >= set(all_nodes)
|
|
|
337c54 |
+ if (
|
|
|
337c54 |
+ not "--force" in utils.pcs_options
|
|
|
337c54 |
+ and
|
|
|
337c54 |
+ not stopping_all
|
|
|
337c54 |
+ and
|
|
|
337c54 |
+ not utils.is_rhel6()
|
|
|
337c54 |
+ ):
|
|
|
337c54 |
+ # we are sure we are not on cman cluster because only nodes from
|
|
|
337c54 |
+ # a local cluster can be stopped (see nodes validation above)
|
|
|
337c54 |
+ error_list = []
|
|
|
337c54 |
+ for node in nodes:
|
|
|
337c54 |
+ retval, data = utils.get_remote_quorumtool_output(node)
|
|
|
337c54 |
+ if retval != 0:
|
|
|
337c54 |
+ error_list.append(node + ": " + data)
|
|
|
337c54 |
+ continue
|
|
|
337c54 |
+ quorum_info = utils.parse_quorumtool_output(data)
|
|
|
337c54 |
+ if quorum_info:
|
|
|
337c54 |
+ if not quorum_info["quorate"]:
|
|
|
337c54 |
+ continue
|
|
|
337c54 |
+ if utils.is_node_stop_cause_quorum_loss(
|
|
|
337c54 |
+ quorum_info, local=False, node_list=nodes
|
|
|
337c54 |
+ ):
|
|
|
337c54 |
+ utils.err(
|
|
|
337c54 |
+ "Stopping the node(s) will cause a loss of the quorum"
|
|
|
337c54 |
+ + ", use --force to override"
|
|
|
337c54 |
+ )
|
|
|
337c54 |
+ else:
|
|
|
337c54 |
+ # We have the info, no need to print errors
|
|
|
337c54 |
+ error_list = []
|
|
|
337c54 |
+ break
|
|
|
337c54 |
+ if data.strip() != "Cannot initialize CMAP service":
|
|
|
337c54 |
+ error_list.append("Unable to get quorum status")
|
|
|
337c54 |
+ # else the node seems to be stopped already
|
|
|
337c54 |
+ if error_list:
|
|
|
337c54 |
+ utils.err(
|
|
|
337c54 |
+ "Unable to determine whether stopping the nodes will cause "
|
|
|
337c54 |
+ + "a loss of the quorum, use --force to override\n"
|
|
|
337c54 |
+ + "\n".join(error_list)
|
|
|
337c54 |
+ )
|
|
|
337c54 |
+
|
|
|
337c54 |
threads = dict()
|
|
|
337c54 |
for node in nodes:
|
|
|
337c54 |
threads[node] = NodeStopPacemakerThread(node)
|
|
|
337c54 |
@@ -764,6 +813,25 @@ def stop_cluster(argv):
|
|
|
337c54 |
stop_cluster_nodes(argv)
|
|
|
337c54 |
return
|
|
|
337c54 |
|
|
|
337c54 |
+ if not "--force" in utils.pcs_options and not utils.is_rhel6():
|
|
|
337c54 |
+ output, retval = utils.run(["corosync-quorumtool", "-p", "-s"])
|
|
|
337c54 |
+ # retval is 0 on success if node is not in partition with quorum
|
|
|
337c54 |
+ # retval is 1 on error OR on success if node has quorum
|
|
|
337c54 |
+ quorum_info = utils.parse_quorumtool_output(output)
|
|
|
337c54 |
+ if quorum_info:
|
|
|
337c54 |
+ if utils.is_node_stop_cause_quorum_loss(quorum_info, local=True):
|
|
|
337c54 |
+ utils.err(
|
|
|
337c54 |
+ "Stopping the node will cause a loss of the quorum"
|
|
|
337c54 |
+ + ", use --force to override"
|
|
|
337c54 |
+ )
|
|
|
337c54 |
+ else:
|
|
|
337c54 |
+ if output.strip() != "Cannot initialize CMAP service":
|
|
|
337c54 |
+ utils.err(
|
|
|
337c54 |
+ "Unable to determine whether stopping the node will cause "
|
|
|
337c54 |
+ + "a loss of the quorum, use --force to override"
|
|
|
337c54 |
+ )
|
|
|
337c54 |
+ # else the node seems to be stopped already, proceed to be sure
|
|
|
337c54 |
+
|
|
|
337c54 |
stop_all = (
|
|
|
337c54 |
"--pacemaker" not in utils.pcs_options
|
|
|
337c54 |
and
|
|
|
337c54 |
diff --git a/pcs/utils.py b/pcs/utils.py
|
|
|
337c54 |
index 82b71c6..2f718e7 100644
|
|
|
337c54 |
--- a/pcs/utils.py
|
|
|
337c54 |
+++ b/pcs/utils.py
|
|
|
337c54 |
@@ -211,15 +211,15 @@ def setCorosyncConfig(node,config):
|
|
|
337c54 |
def startCluster(node, quiet=False):
|
|
|
337c54 |
return sendHTTPRequest(node, 'remote/cluster_start', None, False, not quiet)
|
|
|
337c54 |
|
|
|
337c54 |
-def stopCluster(node, quiet=False, pacemaker=True, corosync=True):
|
|
|
337c54 |
- if (pacemaker and corosync) or (not pacemaker and not corosync):
|
|
|
337c54 |
- data = None
|
|
|
337c54 |
- elif pacemaker:
|
|
|
337c54 |
- data = {"component": "pacemaker"}
|
|
|
337c54 |
- elif corosync:
|
|
|
337c54 |
- data = {"component": "corosync"}
|
|
|
337c54 |
- if data:
|
|
|
337c54 |
- data = urllib.urlencode(data)
|
|
|
337c54 |
+def stopCluster(node, quiet=False, pacemaker=True, corosync=True, force=True):
|
|
|
337c54 |
+ data = dict()
|
|
|
337c54 |
+ if pacemaker and not corosync:
|
|
|
337c54 |
+ data["component"] = "pacemaker"
|
|
|
337c54 |
+ elif corosync and not pacemaker:
|
|
|
337c54 |
+ data["component"] = "corosync"
|
|
|
337c54 |
+ if force:
|
|
|
337c54 |
+ data["force"] = 1
|
|
|
337c54 |
+ data = urllib.urlencode(data)
|
|
|
337c54 |
return sendHTTPRequest(node, 'remote/cluster_stop', data, False, not quiet)
|
|
|
337c54 |
|
|
|
337c54 |
def enableCluster(node):
|
|
|
337c54 |
@@ -2286,3 +2286,62 @@ def get_operations_from_transitions(transitions_dom):
|
|
|
337c54 |
op_list = [op[1] for op in operation_list]
|
|
|
337c54 |
return op_list
|
|
|
337c54 |
|
|
|
337c54 |
+def get_remote_quorumtool_output(node):
|
|
|
337c54 |
+ return sendHTTPRequest(node, "remote/get_quorum_info", None, False, False)
|
|
|
337c54 |
+
|
|
|
337c54 |
+def parse_quorumtool_output(quorumtool_output):
|
|
|
337c54 |
+ parsed = {}
|
|
|
337c54 |
+ in_node_list = False
|
|
|
337c54 |
+ try:
|
|
|
337c54 |
+ for line in quorumtool_output.split("\n"):
|
|
|
337c54 |
+ line = line.strip()
|
|
|
337c54 |
+ if not line:
|
|
|
337c54 |
+ continue
|
|
|
337c54 |
+ if in_node_list:
|
|
|
337c54 |
+ if line.startswith("-") or line.startswith("Nodeid"):
|
|
|
337c54 |
+ # skip headers
|
|
|
337c54 |
+ continue
|
|
|
337c54 |
+ parts = line.split()
|
|
|
337c54 |
+ parsed["node_list"].append({
|
|
|
337c54 |
+ "name": parts[3],
|
|
|
337c54 |
+ "votes": int(parts[1]),
|
|
|
337c54 |
+ "local": len(parts) > 4 and parts[4] == "(local)"
|
|
|
337c54 |
+ })
|
|
|
337c54 |
+ else:
|
|
|
337c54 |
+ if line == "Membership information":
|
|
|
337c54 |
+ in_node_list = True
|
|
|
337c54 |
+ parsed["node_list"] = []
|
|
|
337c54 |
+ continue
|
|
|
337c54 |
+ if not ":" in line:
|
|
|
337c54 |
+ continue
|
|
|
337c54 |
+ parts = map(lambda x: x.strip(), line.split(":", 1))
|
|
|
337c54 |
+ if parts[0] == "Quorate":
|
|
|
337c54 |
+ parsed["quorate"] = parts[1].lower() == "yes"
|
|
|
337c54 |
+ elif parts[0] == "Quorum":
|
|
|
337c54 |
+ match = re.match("(\d+).*", parts[1])
|
|
|
337c54 |
+ if match:
|
|
|
337c54 |
+ parsed["quorum"] = int(match.group(1))
|
|
|
337c54 |
+ else:
|
|
|
337c54 |
+ return None
|
|
|
337c54 |
+ except (ValueError, IndexError):
|
|
|
337c54 |
+ return None
|
|
|
337c54 |
+ for required in ("quorum", "quorate", "node_list"):
|
|
|
337c54 |
+ if required not in parsed:
|
|
|
337c54 |
+ return None
|
|
|
337c54 |
+ return parsed
|
|
|
337c54 |
+
|
|
|
337c54 |
+# node_list - nodes to stop
|
|
|
337c54 |
+# local - local node is going to be stopped
|
|
|
337c54 |
+def is_node_stop_cause_quorum_loss(quorum_info, local=True, node_list=None):
|
|
|
337c54 |
+ if not quorum_info["quorate"]:
|
|
|
337c54 |
+ return False
|
|
|
337c54 |
+ # sum the votes of nodes that are not going to be stopped
|
|
|
337c54 |
+ votes_after_stop = 0
|
|
|
337c54 |
+ for node_info in quorum_info.get("node_list", []):
|
|
|
337c54 |
+ if local and node_info["local"]:
|
|
|
337c54 |
+ continue
|
|
|
337c54 |
+ if node_list and node_info["name"] in node_list:
|
|
|
337c54 |
+ continue
|
|
|
337c54 |
+ votes_after_stop += node_info["votes"]
|
|
|
337c54 |
+ return votes_after_stop < quorum_info["quorum"]
|
|
|
337c54 |
+
|
|
|
337c54 |
diff --git a/pcsd/public/js/pcsd.js b/pcsd/public/js/pcsd.js
|
|
|
337c54 |
index cbfc621..30473c9 100644
|
|
|
337c54 |
--- a/pcsd/public/js/pcsd.js
|
|
|
337c54 |
+++ b/pcsd/public/js/pcsd.js
|
|
|
337c54 |
@@ -481,7 +481,7 @@ function setup_node_links() {
|
|
|
337c54 |
$("#node_stop").click(function() {
|
|
|
337c54 |
node = $("#node_info_header_title_name").text();
|
|
|
337c54 |
fade_in_out("#node_stop");
|
|
|
337c54 |
- $.post('/remote/cluster_stop', {"name": $.trim(node)});
|
|
|
337c54 |
+ node_stop($.trim(node), false);
|
|
|
337c54 |
});
|
|
|
337c54 |
$("#node_restart").click(function() {
|
|
|
337c54 |
node = $("#node_info_header_title_name").text();
|
|
|
337c54 |
@@ -500,6 +500,45 @@ function setup_node_links() {
|
|
|
337c54 |
});
|
|
|
337c54 |
}
|
|
|
337c54 |
|
|
|
337c54 |
+function node_stop(node, force) {
|
|
|
337c54 |
+ var data = {};
|
|
|
337c54 |
+ data["name"] = node;
|
|
|
337c54 |
+ if (force) {
|
|
|
337c54 |
+ data["force"] = force;
|
|
|
337c54 |
+ }
|
|
|
337c54 |
+ $.ajax({
|
|
|
337c54 |
+ type: 'POST',
|
|
|
337c54 |
+ url: '/remote/cluster_stop',
|
|
|
337c54 |
+ data: data,
|
|
|
337c54 |
+ timeout: pcs_timeout,
|
|
|
337c54 |
+ success: function() {
|
|
|
337c54 |
+ },
|
|
|
337c54 |
+ error: function(xhr, status, error) {
|
|
|
337c54 |
+ if ((status == "timeout") || ($.trim(error) == "timeout")) {
|
|
|
337c54 |
+ /*
|
|
|
337c54 |
+ We are not interested in timeout because:
|
|
|
337c54 |
+ - it can take minutes to stop a node (resources running on it have
|
|
|
337c54 |
+ to be stopped/moved and we do not need to wait for that)
|
|
|
337c54 |
+ - if pcs is not able to stop a node it returns a (forceable) error
|
|
|
337c54 |
+ immediately
|
|
|
337c54 |
+ */
|
|
|
337c54 |
+ return;
|
|
|
337c54 |
+ }
|
|
|
337c54 |
+ var message = "Unable to stop node '" + node + "' (" + $.trim(error) + ")";
|
|
|
337c54 |
+ message += "\n" + xhr.responseText;
|
|
|
337c54 |
+ if (message.indexOf('--force') == -1) {
|
|
|
337c54 |
+ alert(message);
|
|
|
337c54 |
+ }
|
|
|
337c54 |
+ else {
|
|
|
337c54 |
+ message = message.replace(', use --force to override', '');
|
|
|
337c54 |
+ if (confirm(message + "\n\nDo you want to force the operation?")) {
|
|
|
337c54 |
+ node_stop(node, true);
|
|
|
337c54 |
+ }
|
|
|
337c54 |
+ }
|
|
|
337c54 |
+ }
|
|
|
337c54 |
+ });
|
|
|
337c54 |
+}
|
|
|
337c54 |
+
|
|
|
337c54 |
function setup_resource_links(link_type) {
|
|
|
337c54 |
Ember.debug("Setup resource links");
|
|
|
337c54 |
$("#resource_delete_link").click(function () {
|
|
|
337c54 |
diff --git a/pcsd/remote.rb b/pcsd/remote.rb
|
|
|
337c54 |
index e818288..cbbcc12 100644
|
|
|
337c54 |
--- a/pcsd/remote.rb
|
|
|
337c54 |
+++ b/pcsd/remote.rb
|
|
|
337c54 |
@@ -21,6 +21,8 @@ def remote(params,request)
|
|
|
337c54 |
return setup_cluster(params)
|
|
|
337c54 |
when "create_cluster"
|
|
|
337c54 |
return create_cluster(params)
|
|
|
337c54 |
+ when "get_quorum_info"
|
|
|
337c54 |
+ return get_quorum_info(params)
|
|
|
337c54 |
when "get_cib"
|
|
|
337c54 |
return get_cib(params)
|
|
|
337c54 |
when "get_corosync_conf"
|
|
|
337c54 |
@@ -156,18 +158,22 @@ def cluster_stop(params)
|
|
|
337c54 |
params[:name], 'cluster_stop', true, params_without_name
|
|
|
337c54 |
)
|
|
|
337c54 |
else
|
|
|
337c54 |
- options = ""
|
|
|
337c54 |
+ options = []
|
|
|
337c54 |
if params.has_key?("component")
|
|
|
337c54 |
if params["component"].downcase == "pacemaker"
|
|
|
337c54 |
- options = "--pacemaker"
|
|
|
337c54 |
+ options << "--pacemaker"
|
|
|
337c54 |
elsif params["component"].downcase == "corosync"
|
|
|
337c54 |
- options = "--corosync"
|
|
|
337c54 |
+ options << "--corosync"
|
|
|
337c54 |
end
|
|
|
337c54 |
end
|
|
|
337c54 |
- $logger.info "Stopping Daemons #{options}"
|
|
|
337c54 |
- output = `#{PCS} cluster stop #{options}`
|
|
|
337c54 |
- $logger.debug output
|
|
|
337c54 |
- return output
|
|
|
337c54 |
+ options << "--force" if params["force"]
|
|
|
337c54 |
+ $logger.info "Stopping Daemons"
|
|
|
337c54 |
+ stdout, stderr, retval = run_cmd(PCS, "cluster", "stop", *options)
|
|
|
337c54 |
+ if retval != 0
|
|
|
337c54 |
+ return [400, stderr.join]
|
|
|
337c54 |
+ else
|
|
|
337c54 |
+ return stdout.join
|
|
|
337c54 |
+ end
|
|
|
337c54 |
end
|
|
|
337c54 |
end
|
|
|
337c54 |
|
|
|
337c54 |
@@ -274,6 +280,21 @@ def cluster_disable(params)
|
|
|
337c54 |
end
|
|
|
337c54 |
end
|
|
|
337c54 |
|
|
|
337c54 |
+def get_quorum_info(params)
|
|
|
337c54 |
+ if ISRHEL6
|
|
|
337c54 |
+ return ''
|
|
|
337c54 |
+ else
|
|
|
337c54 |
+ stdout, stderr, retval = run_cmd("corosync-quorumtool", "-p", "-s")
|
|
|
337c54 |
+ # retval is 0 on success if node is not in partition with quorum
|
|
|
337c54 |
+ # retval is 1 on error OR on success if node has quorum
|
|
|
337c54 |
+ if stderr.length > 0
|
|
|
337c54 |
+ return stderr.join
|
|
|
337c54 |
+ else
|
|
|
337c54 |
+ return stdout.join
|
|
|
337c54 |
+ end
|
|
|
337c54 |
+ end
|
|
|
337c54 |
+end
|
|
|
337c54 |
+
|
|
|
337c54 |
def get_cib(params)
|
|
|
337c54 |
cib, stderr, retval = run_cmd(CIBADMIN, "-Ql")
|
|
|
337c54 |
if retval != 0
|
|
|
337c54 |
--
|
|
|
337c54 |
1.9.1
|
|
|
337c54 |
|