Blob Blame History Raw
From 9ed24231c194985f16ba14633d4f215f48608ee2 Mon Sep 17 00:00:00 2001
From: Tomas Jelinek <tojeline@redhat.com>
Date: Thu, 4 Dec 2014 17:10:11 +0100
Subject: [PATCH] Fix waiting for resource operations

* fixed waiting for globally-unique clone resources
* added --wait support to 'pcs resource update' command
* do not exit with an error when a resource is not running, print a warning
  instead
---
 pcs/pcs.8       |   4 +-
 pcs/resource.py | 175 +++++++++++++++++++++++++++++++++++++++-----------------
 pcs/usage.py    |   8 ++-
 pcs/utils.py    |  46 ++++++++++++++-
 4 files changed, 174 insertions(+), 59 deletions(-)

diff --git a/pcs/pcs.8 b/pcs/pcs.8
index 2020f99..67f85d5 100644
--- a/pcs/pcs.8
+++ b/pcs/pcs.8
@@ -92,8 +92,8 @@ List available OCF resource agent providers
 agents [standard[:provider]]
 List available agents optionally filtered by standard and provider
 .TP
-update <resource id> [resource options] [op [<operation action> <operation options>]...] [meta <meta operations>...]
-Add/Change options to specified resource, clone or multi\-state resource.  If an operation (op) is specified it will update the first found operation with the same action on the specified resource, if no operation with that action exists then a new operation will be created (WARNING: all current options on the update op will be reset if not specified). If you want to create multiple monitor operations you should use the add_operation & remove_operation commands.
+update <resource id> [resource options] [op [<operation action> <operation options>]...] [meta <meta operations>...] [\fB\-\-wait\fR[=n]]
+Add/Change options to specified resource, clone or multi\-state resource.  If an operation (op) is specified it will update the first found operation with the same action on the specified resource, if no operation with that action exists then a new operation will be created (WARNING: all current options on the update op will be reset if not specified). If you want to create multiple monitor operations you should use the add_operation & remove_operation commands.  If \fB\-\-wait\fR is specified, pcs will wait up to 'n' seconds for the changes to take effect and then return 0 if the changes have been processed or 1 otherwise.  If 'n' is not specified, default resource timeout will be used.
 .TP
 op add <resource id> <operation action> [operation properties]
 Add operation for specified resource
diff --git a/pcs/resource.py b/pcs/resource.py
index 48d894d..75cd8bb 100644
--- a/pcs/resource.py
+++ b/pcs/resource.py
@@ -575,8 +575,10 @@ def resource_move(argv,clear=False,ban=False):
         if utils.usefile:
             utils.err("Cannot use '-f' together with '--wait'")
         if not utils.is_resource_started(resource_id, 0)[0]:
-            utils.err("Cannot use '--wait' on non-running resources")
-        wait = True
+            print "Warning: Cannot use '--wait' on non-running resources"
+        else:
+            wait = True
+    if wait:
         timeout = utils.pcs_options["--wait"]
         if timeout is None:
             timeout = (
@@ -737,6 +739,12 @@ def resource_update(res_id,args):
             else:
                 ra_values.append(arg)
 
+    wait = False
+    if "--wait" in utils.pcs_options:
+        if utils.usefile:
+            utils.err("Cannot use '-f' together with '--wait'")
+        wait = True
+
     resource = None
     for r in dom.getElementsByTagName("primitive"):
         if r.getAttribute("id") == res_id:
@@ -753,10 +761,8 @@ def resource_update(res_id,args):
         if clone:
             for a in c.childNodes:
                 if a.localName == "primitive" or a.localName == "group":
-                    return utils.replace_cib_configuration(
-                        resource_clone_create(
-                            dom, [a.getAttribute("id")] + args, True
-                          )
+                    return resource_update_clone_master(
+                        dom, clone, "clone", a.getAttribute("id"), args, wait
                     )
 
         master = None
@@ -766,12 +772,18 @@ def resource_update(res_id,args):
                 break
 
         if master:
-            return utils.replace_cib_configuration(
-                resource_master_create(dom, [res_id] + args, True)
+            return resource_update_clone_master(
+                dom, master, "master", res_id, args, wait
             )
 
         utils.err ("Unable to find resource: %s" % res_id)
 
+    if wait:
+        node_count = len(utils.getNodesFromPacemaker())
+        status_old = utils.get_resource_status_for_wait(
+            dom, resource, node_count
+        )
+
     instance_attributes = resource.getElementsByTagName("instance_attributes")
     if len(instance_attributes) == 0:
         instance_attributes = dom.createElement("instance_attributes")
@@ -919,8 +931,85 @@ def resource_update(res_id,args):
     if len(instance_attributes.getElementsByTagName("nvpair")) == 0:
         instance_attributes.parentNode.removeChild(instance_attributes)
 
+    if wait:
+        status_new = utils.get_resource_status_for_wait(
+            dom, resource, node_count
+        )
+        wait_for_start, wait_for_stop = utils.get_resource_wait_decision(
+            status_old, status_new
+        )
+        if wait_for_start or wait_for_stop:
+            timeout = utils.pcs_options["--wait"]
+            if timeout is None:
+                timeout = utils.get_resource_op_timeout(
+                    dom, res_id, "start" if wait_for_start else "stop"
+                )
+            elif not timeout.isdigit():
+                utils.err("You must specify the number of seconds to wait")
+        else:
+            timeout = 0
+
     utils.replace_cib_configuration(dom)
 
+    if wait:
+        if wait_for_start or wait_for_stop:
+            success, message = utils.is_resource_started(
+                res_id, int(timeout), wait_for_stop,
+                count=status_new["instances"]
+            )
+            if success:
+                print message
+            else:
+                utils.err("Unable to start '%s'\n%s" % (res_id, message))
+        else:
+            print utils.resource_running_on(res_id)["message"]
+
+def resource_update_clone_master(dom, clone, clone_type, res_id, args, wait):
+    if wait:
+        node_count = len(utils.getNodesFromPacemaker())
+        status_old = utils.get_resource_status_for_wait(dom, clone, node_count)
+
+    if clone_type == "clone":
+        dom = resource_clone_create(dom, [res_id] + args, True)
+    elif clone_type == "master":
+        dom = resource_master_create(dom, [res_id] + args, True)
+
+    if wait:
+        status_new = utils.get_resource_status_for_wait(dom, clone, node_count)
+        wait_for_start, wait_for_stop = utils.get_resource_wait_decision(
+            status_old, status_new
+        )
+        if wait_for_start or wait_for_stop:
+            timeout = utils.pcs_options["--wait"]
+            if timeout is None:
+                timeout = utils.get_resource_op_timeout(
+                    dom, res_id, "start" if wait_for_start else "stop"
+                )
+            elif not timeout.isdigit():
+                utils.err("You must specify the number of seconds to wait")
+        else:
+            timeout = 0
+
+    dom = utils.replace_cib_configuration(dom)
+
+    if wait:
+        if wait_for_start or wait_for_stop:
+            success, message = utils.is_resource_started(
+                clone.getAttribute("id"), int(timeout), wait_for_stop,
+                count=status_new["instances"]
+            )
+            if success:
+                print message
+            else:
+                utils.err(
+                    "Unable to start '%s'\n%s"
+                    % (clone.getAttribute("id"), message)
+                )
+        else:
+            print utils.resource_running_on(clone.getAttribute("id"))["message"]
+
+    return dom
+
 # Removes all OCF_CHECK_LEVEL nvpairs
 def remove_ocf_check_levels(dom):
     for np in dom.getElementsByTagName("nvpair")[:]:
@@ -1092,15 +1181,7 @@ def resource_meta(res_id, argv):
             utils.err("Cannot use '-f' together with '--wait'")
         wait = True
         node_count = len(utils.getNodesFromPacemaker())
-        clone_ms_parent = utils.dom_get_resource_clone_ms_parent(dom, res_id)
-        old_status_running = utils.is_resource_started(res_id, 0)[0]
-        old_role = utils.dom_get_meta_attr_value(
-            meta_attributes.parentNode, "target-role"
-        )
-        old_status_enabled = not old_role or old_role.lower() != "stopped"
-        old_status_instances = utils.count_expected_resource_instances(
-            clone_ms_parent if clone_ms_parent else elem, node_count
-        )
+        status_old = utils.get_resource_status_for_wait(dom, elem, node_count)
 
     update_meta_attributes(
         meta_attributes,
@@ -1109,29 +1190,10 @@ def resource_meta(res_id, argv):
     )
 
     if wait:
-        new_role = utils.dom_get_meta_attr_value(
-            meta_attributes.parentNode, "target-role"
+        status_new = utils.get_resource_status_for_wait(dom, elem, node_count)
+        wait_for_start, wait_for_stop = utils.get_resource_wait_decision(
+            status_old, status_new
         )
-        new_status_enabled = not new_role or new_role.lower() != "stopped"
-        new_status_instances = utils.count_expected_resource_instances(
-            clone_ms_parent if clone_ms_parent else elem, node_count
-        )
-        wait_for_start = False
-        wait_for_stop = False
-        if old_status_running and not new_status_enabled:
-            wait_for_stop = True
-        elif (
-            not old_status_running
-            and
-            (not old_status_enabled and new_status_enabled)
-        ):
-            wait_for_start = True
-        elif (
-            old_status_running
-            and
-            old_status_instances != new_status_instances
-        ):
-            wait_for_start = True
         if wait_for_start or wait_for_stop:
             timeout = utils.pcs_options["--wait"]
             if timeout is None:
@@ -1145,14 +1207,17 @@ def resource_meta(res_id, argv):
 
     utils.replace_cib_configuration(dom)
 
-    if wait and (wait_for_start or wait_for_stop):
-        success, message = utils.is_resource_started(
-            res_id, int(timeout), wait_for_stop, count=new_status_instances
-        )
-        if success:
-            print message
+    if wait:
+        if wait_for_start or wait_for_stop:
+            success, message = utils.is_resource_started(
+                res_id, int(timeout), wait_for_stop, count=status_new["instances"]
+            )
+            if success:
+                print message
+            else:
+                utils.err("Unable to start '%s'\n%s" % (res_id, message))
         else:
-            utils.err("Unable to start '%s'\n%s" % (res_id, message))
+            print utils.resource_running_on(res_id)["message"]
 
 def update_meta_attributes(meta_attributes, meta_attrs, id_prefix):
     dom = meta_attributes.ownerDocument
@@ -1377,8 +1442,10 @@ def resource_clone(argv):
         if utils.usefile:
             utils.err("Cannot use '-f' together with '--wait'")
         if not utils.is_resource_started(res, 0)[0]:
-            utils.err("Cannot use '--wait' on non-running resources")
-        wait = True
+            print "Warning: Cannot use '--wait' on non-running resources"
+        else:
+            wait = True
+    if wait:
         wait_op = "start"
         for arg in argv:
             if arg.lower() == "target-role=stopped":
@@ -1486,8 +1553,10 @@ def resource_clone_master_remove(argv):
         if utils.usefile:
             utils.err("Cannot use '-f' together with '--wait'")
         if not utils.is_resource_started(resource_id, 0)[0]:
-            utils.err("Cannot use '--wait' on non-running resources")
-        wait = True
+            print "Warning: Cannot use '--wait' on non-running resources"
+        else:
+            wait = True
+    if wait:
         timeout = utils.pcs_options["--wait"]
         if timeout is None:
             timeout = utils.get_resource_op_timeout(dom, resource_id, "stop")
@@ -1534,8 +1603,10 @@ def resource_master(argv):
         if utils.usefile:
             utils.err("Cannot use '-f' together with '--wait'")
         if not utils.is_resource_started(res_id, 0)[0]:
-            utils.err("Cannot use '--wait' on non-running resources")
-        wait = True
+            print "Warning: Cannot use '--wait' on non-running resources"
+        else:
+            wait = True
+    if wait:
         wait_op = "promote"
         for arg in argv:
             if arg.lower() == "target-role=stopped":
diff --git a/pcs/usage.py b/pcs/usage.py
index ed99148..a66b90e 100644
--- a/pcs/usage.py
+++ b/pcs/usage.py
@@ -333,14 +333,18 @@ Commands:
         List available agents optionally filtered by standard and provider
 
     update <resource id> [resource options] [op [<operation action>
-           <operation options>]...] [meta <meta operations>...]
+           <operation options>]...] [meta <meta operations>...] [--wait[=n]]
         Add/Change options to specified resource, clone or multi-state
         resource.  If an operation (op) is specified it will update the first
         found operation with the same action on the specified resource, if no
         operation with that action exists then a new operation will be created.
         (WARNING: all current options on the update op will be reset if not
         specified) If you want to create multiple monitor operations you should
-        use the add_operation & remove_operation commands.
+        use the add_operation & remove_operation commands.  If --wait is
+        specified, pcs will wait up to 'n' seconds for the changes to take
+        effect and then return 0 if the changes have been processed or 1
+        otherwise.  If 'n' is not specified, default resource timeout will
+        be used.
 
     op add <resource id> <operation action> [operation properties]
         Add operation for specified resource
diff --git a/pcs/utils.py b/pcs/utils.py
index 0e6c70c..76fe57f 100644
--- a/pcs/utils.py
+++ b/pcs/utils.py
@@ -1038,11 +1038,12 @@ def is_resource_started(
         for res in resources:
             # If resource is a clone it can have an id of '<resource name>:N'
             if res.getAttribute("id") == resource or res.getAttribute("id").startswith(resource+":"):
-                set_running_on = set(
+                list_running_on = (
                     running_on["nodes_started"] + running_on["nodes_master"]
                 )
                 if slave_as_started:
-                    set_running_on.update(running_on["nodes_slave"])
+                    list_running_on.extend(running_on["nodes_slave"])
+                set_running_on = set(list_running_on)
                 if stopped:
                     if (
                         res.getAttribute("role") != "Stopped"
@@ -1071,7 +1072,7 @@ def is_resource_started(
                         and
                         res.getAttribute("failed") != "true"
                         and
-                        (count is None or len(set_running_on) == count)
+                        (count is None or len(list_running_on) == count)
                         and
                         (
                             not banned_nodes
@@ -1180,6 +1181,45 @@ def wait_for_primitive_ops_to_process(op_list, timeout=None):
                 % (op[1], op[0], op[2], message)
             )
 
+def get_resource_status_for_wait(dom, resource_el, node_count):
+    res_id = resource_el.getAttribute("id")
+    clone_ms_parent = dom_get_resource_clone_ms_parent(dom, res_id)
+    meta_resource_el = clone_ms_parent if clone_ms_parent else resource_el
+    status_running = is_resource_started(res_id, 0)[0]
+    status_enabled = True
+    for meta in meta_resource_el.getElementsByTagName("meta_attributes"):
+        for nvpair in meta.getElementsByTagName("nvpair"):
+            if nvpair.getAttribute("name") == "target-role":
+                if nvpair.getAttribute("value").lower() == "stopped":
+                    status_enabled = False
+    status_instances = count_expected_resource_instances(
+        meta_resource_el, node_count
+    )
+    return {
+        "running": status_running,
+        "enabled": status_enabled,
+        "instances": status_instances,
+    }
+
+def get_resource_wait_decision(old_status, new_status):
+    wait_for_start = False
+    wait_for_stop = False
+    if old_status["running"] and not new_status["enabled"]:
+        wait_for_stop = True
+    elif (
+        not old_status["running"]
+        and
+        (not old_status["enabled"] and new_status["enabled"])
+    ):
+        wait_for_start = True
+    elif (
+        old_status["running"]
+        and
+        old_status["instances"] != new_status["instances"]
+    ):
+        wait_for_start = True
+    return wait_for_start, wait_for_stop
+
 def get_lrm_rsc_op(cib, resource, op_list=None, last_call_id=None):
     lrm_rsc_op_list = []
     for lrm_resource in cib.getElementsByTagName("lrm_resource"):
-- 
1.9.1