Blame SOURCES/bz1180506-Warn-if-nodes-stop-will-cause-a-loss-of-the-quorum.patch

337c54
From 200559d8ca0b834f90d4f2ba70e8f7ce403b9726 Mon Sep 17 00:00:00 2001
337c54
From: Tomas Jelinek <tojeline@redhat.com>
337c54
Date: Fri, 16 Jan 2015 16:31:49 +0100
337c54
Subject: [PATCH] Warn if nodes stop will cause a loss of the quorum
337c54
337c54
---
337c54
 pcs/cluster.py         |  68 +++++++++++
337c54
 pcs/utils.py           |  77 ++++++++++--
337c54
 pcsd/public/js/pcsd.js |  41 ++++++-
337c54
 pcsd/remote.rb         |  35 ++++--
337c54
 5 files changed, 513 insertions(+), 17 deletions(-)
337c54
337c54
diff --git a/pcs/cluster.py b/pcs/cluster.py
337c54
index 05dc91a..e4829f1 100644
337c54
--- a/pcs/cluster.py
337c54
+++ b/pcs/cluster.py
337c54
@@ -676,6 +676,55 @@ def stop_cluster_all():
337c54
     stop_cluster_nodes(utils.getNodesFromCorosyncConf())
337c54
 
337c54
 def stop_cluster_nodes(nodes):
337c54
+    all_nodes = utils.getNodesFromCorosyncConf()
337c54
+    unknown_nodes = set(nodes) - set(all_nodes)
337c54
+    if unknown_nodes:
337c54
+        utils.err(
337c54
+            "nodes '%s' do not appear to exist in configuration"
337c54
+            % "', '".join(unknown_nodes)
337c54
+        )
337c54
+
337c54
+    stopping_all = set(nodes) >= set(all_nodes)
337c54
+    if (
337c54
+        not "--force" in utils.pcs_options
337c54
+        and
337c54
+        not stopping_all
337c54
+        and
337c54
+        not utils.is_rhel6()
337c54
+    ):
337c54
+        # we are sure we are not on cman cluster because only nodes from
337c54
+        # a local cluster can be stopped (see nodes validation above)
337c54
+        error_list = []
337c54
+        for node in nodes:
337c54
+            retval, data = utils.get_remote_quorumtool_output(node)
337c54
+            if retval != 0:
337c54
+                error_list.append(node + ": " + data)
337c54
+                continue
337c54
+            quorum_info = utils.parse_quorumtool_output(data)
337c54
+            if quorum_info:
337c54
+                if not quorum_info["quorate"]:
337c54
+                    continue
337c54
+                if utils.is_node_stop_cause_quorum_loss(
337c54
+                    quorum_info, local=False, node_list=nodes
337c54
+                ):
337c54
+                    utils.err(
337c54
+                        "Stopping the node(s) will cause a loss of the quorum"
337c54
+                        + ", use --force to override"
337c54
+                    )
337c54
+                else:
337c54
+                    # We have the info, no need to print errors
337c54
+                    error_list = []
337c54
+                    break
337c54
+            if data.strip() != "Cannot initialize CMAP service":
337c54
+                error_list.append("Unable to get quorum status")
337c54
+            # else the node seems to be stopped already
337c54
+        if error_list:
337c54
+            utils.err(
337c54
+                "Unable to determine whether stopping the nodes will cause "
337c54
+                + "a loss of the quorum, use --force to override\n"
337c54
+                + "\n".join(error_list)
337c54
+            )
337c54
+
337c54
     threads = dict()
337c54
     for node in nodes:
337c54
         threads[node] = NodeStopPacemakerThread(node)
337c54
@@ -764,6 +813,25 @@ def stop_cluster(argv):
337c54
         stop_cluster_nodes(argv)
337c54
         return
337c54
 
337c54
+    if not "--force" in utils.pcs_options and not utils.is_rhel6():
337c54
+        output, retval = utils.run(["corosync-quorumtool", "-p", "-s"])
337c54
+        # retval is 0 on success if node is not in partition with quorum
337c54
+        # retval is 1 on error OR on success if node has quorum
337c54
+        quorum_info = utils.parse_quorumtool_output(output)
337c54
+        if quorum_info:
337c54
+            if utils.is_node_stop_cause_quorum_loss(quorum_info, local=True):
337c54
+                utils.err(
337c54
+                    "Stopping the node will cause a loss of the quorum"
337c54
+                    + ", use --force to override"
337c54
+                )
337c54
+        else:
337c54
+            if output.strip() != "Cannot initialize CMAP service":
337c54
+                utils.err(
337c54
+                    "Unable to determine whether stopping the node will cause "
337c54
+                    + "a loss of the quorum, use --force to override"
337c54
+                )
337c54
+            # else the node seems to be stopped already, proceed to be sure
337c54
+
337c54
     stop_all = (
337c54
         "--pacemaker" not in utils.pcs_options
337c54
         and
337c54
diff --git a/pcs/utils.py b/pcs/utils.py
337c54
index 82b71c6..2f718e7 100644
337c54
--- a/pcs/utils.py
337c54
+++ b/pcs/utils.py
337c54
@@ -211,15 +211,15 @@ def setCorosyncConfig(node,config):
337c54
 def startCluster(node, quiet=False):
337c54
     return sendHTTPRequest(node, 'remote/cluster_start', None, False, not quiet)
337c54
 
337c54
-def stopCluster(node, quiet=False, pacemaker=True, corosync=True):
337c54
-    if (pacemaker and corosync) or (not pacemaker and not corosync):
337c54
-        data = None
337c54
-    elif pacemaker:
337c54
-        data = {"component": "pacemaker"}
337c54
-    elif corosync:
337c54
-        data = {"component": "corosync"}
337c54
-    if data:
337c54
-        data = urllib.urlencode(data)
337c54
+def stopCluster(node, quiet=False, pacemaker=True, corosync=True, force=True):
337c54
+    data = dict()
337c54
+    if pacemaker and not corosync:
337c54
+        data["component"] = "pacemaker"
337c54
+    elif corosync and not pacemaker:
337c54
+        data["component"] = "corosync"
337c54
+    if force:
337c54
+        data["force"] = 1
337c54
+    data = urllib.urlencode(data)
337c54
     return sendHTTPRequest(node, 'remote/cluster_stop', data, False, not quiet)
337c54
 
337c54
 def enableCluster(node):
337c54
@@ -2286,3 +2286,62 @@ def get_operations_from_transitions(transitions_dom):
337c54
     op_list = [op[1] for op in operation_list]
337c54
     return op_list
337c54
 
337c54
+def get_remote_quorumtool_output(node):
337c54
+    return sendHTTPRequest(node, "remote/get_quorum_info", None, False, False)
337c54
+
337c54
+def parse_quorumtool_output(quorumtool_output):
337c54
+    parsed = {}
337c54
+    in_node_list = False
337c54
+    try:
337c54
+        for line in quorumtool_output.split("\n"):
337c54
+            line = line.strip()
337c54
+            if not line:
337c54
+                continue
337c54
+            if in_node_list:
337c54
+                if line.startswith("-") or line.startswith("Nodeid"):
337c54
+                    # skip headers
337c54
+                    continue
337c54
+                parts = line.split()
337c54
+                parsed["node_list"].append({
337c54
+                    "name": parts[3],
337c54
+                    "votes": int(parts[1]),
337c54
+                    "local": len(parts) > 4 and parts[4] == "(local)"
337c54
+                })
337c54
+            else:
337c54
+                if line == "Membership information":
337c54
+                    in_node_list = True
337c54
+                    parsed["node_list"] = []
337c54
+                    continue
337c54
+                if not ":" in line:
337c54
+                    continue
337c54
+                parts = map(lambda x: x.strip(), line.split(":", 1))
337c54
+                if parts[0] == "Quorate":
337c54
+                    parsed["quorate"] = parts[1].lower() == "yes"
337c54
+                elif parts[0] == "Quorum":
337c54
+                    match = re.match("(\d+).*", parts[1])
337c54
+                    if match:
337c54
+                        parsed["quorum"] = int(match.group(1))
337c54
+                    else:
337c54
+                        return None
337c54
+    except (ValueError, IndexError):
337c54
+        return None
337c54
+    for required in ("quorum", "quorate", "node_list"):
337c54
+        if required not in parsed:
337c54
+            return None
337c54
+    return parsed
337c54
+
337c54
+# node_list - nodes to stop
337c54
+# local - local node is going to be stopped
337c54
+def is_node_stop_cause_quorum_loss(quorum_info, local=True, node_list=None):
337c54
+    if not quorum_info["quorate"]:
337c54
+        return False
337c54
+    # sum the votes of nodes that are not going to be stopped
337c54
+    votes_after_stop = 0
337c54
+    for node_info in quorum_info.get("node_list", []):
337c54
+        if local and node_info["local"]:
337c54
+            continue
337c54
+        if node_list and node_info["name"] in node_list:
337c54
+            continue
337c54
+        votes_after_stop += node_info["votes"]
337c54
+    return votes_after_stop < quorum_info["quorum"]
337c54
+
337c54
diff --git a/pcsd/public/js/pcsd.js b/pcsd/public/js/pcsd.js
337c54
index cbfc621..30473c9 100644
337c54
--- a/pcsd/public/js/pcsd.js
337c54
+++ b/pcsd/public/js/pcsd.js
337c54
@@ -481,7 +481,7 @@ function setup_node_links() {
337c54
   $("#node_stop").click(function() {
337c54
     node = $("#node_info_header_title_name").text();
337c54
     fade_in_out("#node_stop");
337c54
-    $.post('/remote/cluster_stop', {"name": $.trim(node)});
337c54
+    node_stop($.trim(node), false);
337c54
   });
337c54
   $("#node_restart").click(function() {
337c54
     node = $("#node_info_header_title_name").text();
337c54
@@ -500,6 +500,45 @@ function setup_node_links() {
337c54
   });
337c54
 }
337c54
 
337c54
+function node_stop(node, force) {
337c54
+  var data = {};
337c54
+  data["name"] = node;
337c54
+  if (force) {
337c54
+    data["force"] = force;
337c54
+  }
337c54
+  $.ajax({
337c54
+    type: 'POST',
337c54
+    url: '/remote/cluster_stop',
337c54
+    data: data,
337c54
+    timeout: pcs_timeout,
337c54
+    success: function() {
337c54
+    },
337c54
+    error: function(xhr, status, error) {
337c54
+      if ((status == "timeout") || ($.trim(error) == "timeout")) {
337c54
+        /*
337c54
+         We are not interested in timeout because:
337c54
+         - it can take minutes to stop a node (resources running on it have
337c54
+           to be stopped/moved and we do not need to wait for that)
337c54
+         - if pcs is not able to stop a node it returns a (forceable) error
337c54
+           immediately
337c54
+        */
337c54
+        return;
337c54
+      }
337c54
+      var message = "Unable to stop node '" + node + "' (" + $.trim(error) + ")";
337c54
+      message += "\n" + xhr.responseText;
337c54
+      if (message.indexOf('--force') == -1) {
337c54
+        alert(message);
337c54
+      }
337c54
+      else {
337c54
+        message = message.replace(', use --force to override', '');
337c54
+        if (confirm(message + "\n\nDo you want to force the operation?")) {
337c54
+          node_stop(node, true);
337c54
+        }
337c54
+      }
337c54
+    }
337c54
+  });
337c54
+}
337c54
+
337c54
 function setup_resource_links(link_type) {
337c54
   Ember.debug("Setup resource links");
337c54
   $("#resource_delete_link").click(function () {
337c54
diff --git a/pcsd/remote.rb b/pcsd/remote.rb
337c54
index e818288..cbbcc12 100644
337c54
--- a/pcsd/remote.rb
337c54
+++ b/pcsd/remote.rb
337c54
@@ -21,6 +21,8 @@ def remote(params,request)
337c54
     return setup_cluster(params)
337c54
   when "create_cluster"
337c54
     return create_cluster(params)
337c54
+  when "get_quorum_info"
337c54
+    return get_quorum_info(params)
337c54
   when "get_cib"
337c54
     return get_cib(params)
337c54
   when "get_corosync_conf"
337c54
@@ -156,18 +158,22 @@ def cluster_stop(params)
337c54
       params[:name], 'cluster_stop', true, params_without_name
337c54
     )
337c54
   else
337c54
-    options = ""
337c54
+    options = []
337c54
     if params.has_key?("component")
337c54
       if params["component"].downcase == "pacemaker"
337c54
-        options = "--pacemaker"
337c54
+        options << "--pacemaker"
337c54
       elsif params["component"].downcase == "corosync"
337c54
-        options = "--corosync"
337c54
+        options << "--corosync"
337c54
       end
337c54
     end
337c54
-    $logger.info "Stopping Daemons #{options}"
337c54
-    output =  `#{PCS} cluster stop #{options}`
337c54
-    $logger.debug output
337c54
-    return output
337c54
+    options << "--force" if params["force"]
337c54
+    $logger.info "Stopping Daemons"
337c54
+    stdout, stderr, retval = run_cmd(PCS, "cluster", "stop", *options)
337c54
+    if retval != 0
337c54
+      return [400, stderr.join]
337c54
+    else
337c54
+      return stdout.join
337c54
+    end
337c54
   end
337c54
 end
337c54
 
337c54
@@ -274,6 +280,21 @@ def cluster_disable(params)
337c54
   end
337c54
 end
337c54
 
337c54
+def get_quorum_info(params)
337c54
+  if ISRHEL6
337c54
+    return ''
337c54
+  else
337c54
+    stdout, stderr, retval = run_cmd("corosync-quorumtool", "-p", "-s")
337c54
+    # retval is 0 on success if node is not in partition with quorum
337c54
+    # retval is 1 on error OR on success if node has quorum
337c54
+    if stderr.length > 0
337c54
+      return stderr.join
337c54
+    else
337c54
+      return stdout.join
337c54
+    end
337c54
+  end
337c54
+end
337c54
+
337c54
 def get_cib(params)
337c54
   cib, stderr, retval = run_cmd(CIBADMIN, "-Ql")
337c54
   if retval != 0
337c54
-- 
337c54
1.9.1
337c54