109fe2
From 7a813755269f00d7b815e819636841af991762c0 Mon Sep 17 00:00:00 2001
109fe2
From: Ken Gaillot <kgaillot@redhat.com>
109fe2
Date: Mon, 11 Dec 2017 12:23:06 -0600
109fe2
Subject: [PATCH] Fix: tools: crm_resource --cleanup
109fe2
109fe2
The new "failures only" mode of crm_resource --cleanup had multiple issues,
109fe2
including not working without --resource specified, comparing a
109fe2
user-provided interval string against a milliseconds interval, and
109fe2
treating an unspecified interval as "all intervals" rather than 0, but
109fe2
only when clearing LRM history entries.
109fe2
---
109fe2
 tools/crm_resource.c         |  35 +++---
109fe2
 tools/crm_resource.h         |   9 +-
109fe2
 tools/crm_resource_runtime.c | 258 ++++++++++++++++++++++++++++++-------------
109fe2
 3 files changed, 202 insertions(+), 100 deletions(-)
109fe2
109fe2
diff --git a/tools/crm_resource.c b/tools/crm_resource.c
109fe2
index 4ddcef4..5152004 100644
109fe2
--- a/tools/crm_resource.c
109fe2
+++ b/tools/crm_resource.c
109fe2
@@ -1092,14 +1092,20 @@ main(int argc, char **argv)
109fe2
         rc = cli_resource_delete_attribute(rsc, rsc_id, prop_set, prop_id,
109fe2
                                            prop_name, cib_conn, &data_set);
109fe2
 
109fe2
-    } else if (rsc_cmd == 'C' && just_errors) {
109fe2
+    } else if ((rsc_cmd == 'C') && rsc) {
109fe2
+        if (do_force == FALSE) {
109fe2
+            rsc = uber_parent(rsc);
109fe2
+        }
109fe2
         crmd_replies_needed = 0;
109fe2
 
109fe2
-        rc = cli_resource_delete_failures(crmd_channel, host_uname, rsc, operation,
109fe2
-                                          interval, &data_set);
109fe2
+        crm_debug("%s of %s (%s requested) on %s",
109fe2
+                  (just_errors? "Clearing failures" : "Re-checking the state"),
109fe2
+                  rsc->id, rsc_id, (host_uname? host_uname : "all hosts"));
109fe2
+        rc = cli_resource_delete(crmd_channel, host_uname, rsc, operation,
109fe2
+                                 interval, just_errors, &data_set);
109fe2
 
109fe2
-        if(rsc && (rc == pcmk_ok) && (BE_QUIET == FALSE)) {
109fe2
-            /* Now check XML_RSC_ATTR_TARGET_ROLE and XML_RSC_ATTR_MANAGED */
109fe2
+        if ((rc == pcmk_ok) && !BE_QUIET) {
109fe2
+            // Show any reasons why resource might stay stopped
109fe2
             cli_resource_check(cib_conn, rsc);
109fe2
         }
109fe2
 
109fe2
@@ -1107,22 +1113,9 @@ main(int argc, char **argv)
109fe2
             start_mainloop();
109fe2
         }
109fe2
 
109fe2
-    } else if ((rsc_cmd == 'C') && rsc) {
109fe2
-        if(do_force == FALSE) {
109fe2
-            rsc = uber_parent(rsc);
109fe2
-        }
109fe2
-
109fe2
-        crm_debug("Re-checking the state of %s (%s requested) on %s",
109fe2
-                  rsc->id, rsc_id, host_uname);
109fe2
-        crmd_replies_needed = 0;
109fe2
-        rc = cli_resource_delete(crmd_channel, host_uname, rsc, operation,
109fe2
-                                 interval, &data_set);
109fe2
-
109fe2
-        if(rc == pcmk_ok && BE_QUIET == FALSE) {
109fe2
-            /* Now check XML_RSC_ATTR_TARGET_ROLE and XML_RSC_ATTR_MANAGED */
109fe2
-            cli_resource_check(cib_conn, rsc);
109fe2
-        }
109fe2
-
109fe2
+    } else if (rsc_cmd == 'C' && just_errors) {
109fe2
+        rc = cli_cleanup_all(crmd_channel, host_uname, operation, interval,
109fe2
+                             &data_set);
109fe2
         if (rc == pcmk_ok) {
109fe2
             start_mainloop();
109fe2
         }
109fe2
diff --git a/tools/crm_resource.h b/tools/crm_resource.h
109fe2
index e28c9ef..0ac51f2 100644
109fe2
--- a/tools/crm_resource.h
109fe2
+++ b/tools/crm_resource.h
109fe2
@@ -75,10 +75,11 @@ int cli_resource_search(resource_t *rsc, const char *requested_name,
109fe2
                         pe_working_set_t *data_set);
109fe2
 int cli_resource_delete(crm_ipc_t *crmd_channel, const char *host_uname,
109fe2
                         resource_t *rsc, const char *operation,
109fe2
-                        const char *interval, pe_working_set_t *data_set);
109fe2
-int cli_resource_delete_failures(crm_ipc_t *crmd_channel, const char *host_uname,
109fe2
-                                 resource_t *rsc, const char *operation,
109fe2
-                                 const char *interval, pe_working_set_t *data_set);
109fe2
+                        const char *interval, bool just_failures,
109fe2
+                        pe_working_set_t *data_set);
109fe2
+int cli_cleanup_all(crm_ipc_t *crmd_channel, const char *node_name,
109fe2
+                    const char *operation, const char *interval,
109fe2
+                    pe_working_set_t *data_set);
109fe2
 int cli_resource_restart(resource_t * rsc, const char *host, int timeout_ms, cib_t * cib);
109fe2
 int cli_resource_move(resource_t *rsc, const char *rsc_id,
109fe2
                       const char *host_name, cib_t *cib,
109fe2
diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
109fe2
index 1048636..bdebb0b 100644
109fe2
--- a/tools/crm_resource_runtime.c
109fe2
+++ b/tools/crm_resource_runtime.c
109fe2
@@ -532,15 +532,129 @@ rsc_fail_name(resource_t *rsc)
109fe2
     return is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name);
109fe2
 }
109fe2
 
109fe2
+static int
109fe2
+clear_rsc_history(crm_ipc_t *crmd_channel, const char *host_uname,
109fe2
+                  const char *rsc_id, pe_working_set_t *data_set)
109fe2
+{
109fe2
+    int rc = pcmk_ok;
109fe2
+
109fe2
+    /* Erase the resource's entire LRM history in the CIB, even if we're only
109fe2
+     * clearing a single operation's fail count. If we erased only entries for a
109fe2
+     * single operation, we might wind up with a wrong idea of the current
109fe2
+     * resource state, and we might not re-probe the resource.
109fe2
+     */
109fe2
+    rc = send_lrm_rsc_op(crmd_channel, CRM_OP_LRM_DELETE, host_uname, rsc_id,
109fe2
+                         TRUE, data_set);
109fe2
+    if (rc != pcmk_ok) {
109fe2
+        return rc;
109fe2
+    }
109fe2
+    crmd_replies_needed++;
109fe2
+
109fe2
+    crm_trace("Processing %d mainloop inputs", crmd_replies_needed);
109fe2
+    while (g_main_context_iteration(NULL, FALSE)) {
109fe2
+        crm_trace("Processed mainloop input, %d still remaining",
109fe2
+                  crmd_replies_needed);
109fe2
+    }
109fe2
+
109fe2
+    if (crmd_replies_needed < 0) {
109fe2
+        crmd_replies_needed = 0;
109fe2
+    }
109fe2
+    return rc;
109fe2
+}
109fe2
+
109fe2
+static int
109fe2
+clear_rsc_failures(crm_ipc_t *crmd_channel, const char *node_name,
109fe2
+                   const char *rsc_id, const char *operation,
109fe2
+                   const char *interval, pe_working_set_t *data_set)
109fe2
+{
109fe2
+    int rc = pcmk_ok;
109fe2
+    const char *failed_value = NULL;
109fe2
+    const char *interval_ms_str = NULL;
109fe2
+    GHashTable *rscs = NULL;
109fe2
+    GHashTableIter iter;
109fe2
+
109fe2
+    /* Create a hash table to use as a set of resources to clean. This lets us
109fe2
+     * clean each resource only once (per node) regardless of how many failed
109fe2
+     * operations it has.
109fe2
+     */
109fe2
+    rscs = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, NULL);
109fe2
+
109fe2
+    // Normalize interval to milliseconds for comparison to history entry
109fe2
+    if (operation) {
109fe2
+        interval_ms_str = crm_strdup_printf("%llu", crm_get_interval(interval));
109fe2
+    }
109fe2
+
109fe2
+    for (xmlNode *xml_op = __xml_first_child(data_set->failed); xml_op != NULL;
109fe2
+         xml_op = __xml_next(xml_op)) {
109fe2
+
109fe2
+        // No resource specified means all resources match
109fe2
+        failed_value = crm_element_value(xml_op, XML_LRM_ATTR_RSCID);
109fe2
+        if (rsc_id == NULL) {
109fe2
+            rsc_id = failed_value;
109fe2
+        } else if (safe_str_neq(rsc_id, failed_value)) {
109fe2
+            continue;
109fe2
+        }
109fe2
+
109fe2
+        // Host name should always have been provided by this point
109fe2
+        failed_value = crm_element_value(xml_op, XML_ATTR_UNAME);
109fe2
+        if (safe_str_neq(node_name, failed_value)) {
109fe2
+            continue;
109fe2
+        }
109fe2
+
109fe2
+        // No operation specified means all operations match
109fe2
+        if (operation) {
109fe2
+            failed_value = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
109fe2
+            if (safe_str_neq(operation, failed_value)) {
109fe2
+                continue;
109fe2
+            }
109fe2
+
109fe2
+            // Interval (if operation was specified) defaults to 0 (not all)
109fe2
+            failed_value = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL);
109fe2
+            if (safe_str_neq(interval_ms_str, failed_value)) {
109fe2
+                continue;
109fe2
+            }
109fe2
+        }
109fe2
+
109fe2
+        g_hash_table_add(rscs, (gpointer) rsc_id);
109fe2
+    }
109fe2
+
109fe2
+    g_hash_table_iter_init(&iter, rscs);
109fe2
+    while (g_hash_table_iter_next(&iter, (gpointer *) &rsc_id, NULL)) {
109fe2
+        crm_debug("Erasing failures of %s on %s", rsc_id, node_name);
109fe2
+        rc = clear_rsc_history(crmd_channel, node_name, rsc_id, data_set);
109fe2
+        if (rc != pcmk_ok) {
109fe2
+            return rc;
109fe2
+        }
109fe2
+    }
109fe2
+    g_hash_table_destroy(rscs);
109fe2
+    return rc;
109fe2
+}
109fe2
+
109fe2
+static int
109fe2
+clear_rsc_fail_attrs(resource_t *rsc, const char *operation,
109fe2
+                     const char *interval, node_t *node)
109fe2
+{
109fe2
+    int rc = pcmk_ok;
109fe2
+    int attr_options = attrd_opt_none;
109fe2
+    char *rsc_name = rsc_fail_name(rsc);
109fe2
+
109fe2
+    if (is_remote_node(node)) {
109fe2
+        attr_options |= attrd_opt_remote;
109fe2
+    }
109fe2
+    rc = attrd_clear_delegate(NULL, node->details->uname, rsc_name, operation,
109fe2
+                              interval, NULL, attr_options);
109fe2
+    free(rsc_name);
109fe2
+    return rc;
109fe2
+}
109fe2
+
109fe2
 int
109fe2
 cli_resource_delete(crm_ipc_t *crmd_channel, const char *host_uname,
109fe2
                     resource_t *rsc, const char *operation,
109fe2
-                    const char *interval, pe_working_set_t *data_set)
109fe2
+                    const char *interval, bool just_failures,
109fe2
+                    pe_working_set_t *data_set)
109fe2
 {
109fe2
     int rc = pcmk_ok;
109fe2
     node_t *node = NULL;
109fe2
-    char *rsc_name = NULL;
109fe2
-    int attr_options = attrd_opt_none;
109fe2
 
109fe2
     if (rsc == NULL) {
109fe2
         return -ENXIO;
109fe2
@@ -552,8 +666,8 @@ cli_resource_delete(crm_ipc_t *crmd_channel, const char *host_uname,
109fe2
             resource_t *child = (resource_t *) lpc->data;
109fe2
 
109fe2
             rc = cli_resource_delete(crmd_channel, host_uname, child, operation,
109fe2
-                                     interval, data_set);
109fe2
-            if(rc != pcmk_ok) {
109fe2
+                                     interval, just_failures, data_set);
109fe2
+            if (rc != pcmk_ok) {
109fe2
                 return rc;
109fe2
             }
109fe2
         }
109fe2
@@ -585,8 +699,13 @@ cli_resource_delete(crm_ipc_t *crmd_channel, const char *host_uname,
109fe2
             node = (node_t *) lpc->data;
109fe2
 
109fe2
             if (node->details->online) {
109fe2
-                cli_resource_delete(crmd_channel, node->details->uname, rsc,
109fe2
-                                    operation, interval, data_set);
109fe2
+                rc = cli_resource_delete(crmd_channel, node->details->uname,
109fe2
+                                         rsc, operation, interval,
109fe2
+                                         just_failures, data_set);
109fe2
+            }
109fe2
+            if (rc != pcmk_ok) {
109fe2
+                g_list_free(nodes);
109fe2
+                return rc;
109fe2
             }
109fe2
         }
109fe2
 
109fe2
@@ -611,102 +730,91 @@ cli_resource_delete(crm_ipc_t *crmd_channel, const char *host_uname,
109fe2
     if (crmd_channel == NULL) {
109fe2
         printf("Dry run: skipping clean-up of %s on %s due to CIB_file\n",
109fe2
                rsc->id, host_uname);
109fe2
-        return rc;
109fe2
-     }
109fe2
+        return pcmk_ok;
109fe2
+    }
109fe2
 
109fe2
-    /* Erase the resource's entire LRM history in the CIB, even if we're only
109fe2
-     * clearing a single operation's fail count. If we erased only entries for a
109fe2
-     * single operation, we might wind up with a wrong idea of the current
109fe2
-     * resource state, and we might not re-probe the resource.
109fe2
-     */
109fe2
-    rc = send_lrm_rsc_op(crmd_channel, CRM_OP_LRM_DELETE, host_uname, rsc->id,
109fe2
-                         TRUE, data_set);
109fe2
+    rc = clear_rsc_fail_attrs(rsc, operation, interval, node);
109fe2
     if (rc != pcmk_ok) {
109fe2
-        printf("Unable to clean up %s history on %s: %s\n",
109fe2
-               rsc->id, host_uname, pcmk_strerror(rc));
109fe2
+        printf("Unable to clean up %s failures on %s: %s\n",
109fe2
+                rsc->id, host_uname, pcmk_strerror(rc));
109fe2
         return rc;
109fe2
     }
109fe2
-    crmd_replies_needed++;
109fe2
 
109fe2
-    crm_trace("Processing %d mainloop inputs", crmd_replies_needed);
109fe2
-    while(g_main_context_iteration(NULL, FALSE)) {
109fe2
-        crm_trace("Processed mainloop input, %d still remaining",
109fe2
-                  crmd_replies_needed);
109fe2
-    }
109fe2
-
109fe2
-    if(crmd_replies_needed < 0) {
109fe2
-        crmd_replies_needed = 0;
109fe2
-    }
109fe2
-
109fe2
-    rsc_name = rsc_fail_name(rsc);
109fe2
-    if (is_remote_node(node)) {
109fe2
-        attr_options |= attrd_opt_remote;
109fe2
+    if (just_failures) {
109fe2
+        rc = clear_rsc_failures(crmd_channel, host_uname, rsc->id, operation,
109fe2
+                                interval, data_set);
109fe2
+    } else {
109fe2
+        rc = clear_rsc_history(crmd_channel, host_uname, rsc->id, data_set);
109fe2
     }
109fe2
-    rc = attrd_clear_delegate(NULL, host_uname, rsc_name, operation, interval,
109fe2
-                              NULL, attr_options);
109fe2
     if (rc != pcmk_ok) {
109fe2
-        printf("Cleaned %s history on %s, but unable to clear failures: %s\n",
109fe2
+        printf("Cleaned %s failures on %s, but unable to clean history: %s\n",
109fe2
                rsc->id, host_uname, pcmk_strerror(rc));
109fe2
     } else {
109fe2
         printf("Cleaned up %s on %s\n", rsc->id, host_uname);
109fe2
     }
109fe2
-    free(rsc_name);
109fe2
-
109fe2
     return rc;
109fe2
 }
109fe2
 
109fe2
 int
109fe2
-cli_resource_delete_failures(crm_ipc_t *crmd_channel, const char *host_uname,
109fe2
-                    resource_t *rsc, const char *operation,
109fe2
-                    const char *interval, pe_working_set_t *data_set)
109fe2
+cli_cleanup_all(crm_ipc_t *crmd_channel, const char *node_name,
109fe2
+                const char *operation, const char *interval,
109fe2
+                pe_working_set_t *data_set)
109fe2
 {
109fe2
+    int attr_options = attrd_opt_none;
109fe2
     int rc = pcmk_ok;
109fe2
+    const char *display_name = node_name? node_name : "all nodes";
109fe2
 
109fe2
-    if (rsc == NULL) {
109fe2
-        return -ENXIO;
109fe2
-
109fe2
-    } else if (rsc->children) {
109fe2
-        GListPtr lpc = NULL;
109fe2
+    if (crmd_channel == NULL) {
109fe2
+        printf("Dry run: skipping clean-up of %s due to CIB_file\n",
109fe2
+               display_name);
109fe2
+        return pcmk_ok;
109fe2
+    }
109fe2
+    crmd_replies_needed = 0;
109fe2
 
109fe2
-        for (lpc = rsc->children; lpc != NULL; lpc = lpc->next) {
109fe2
-            resource_t *child = (resource_t *) lpc->data;
109fe2
+    if (node_name) {
109fe2
+        node_t *node = pe_find_node(data_set->nodes, node_name);
109fe2
 
109fe2
-            rc = cli_resource_delete_failures(crmd_channel, host_uname, child, operation,
109fe2
-                                              interval, data_set);
109fe2
-            if(rc != pcmk_ok) {
109fe2
-                return rc;
109fe2
-            }
109fe2
+        if (node == NULL) {
109fe2
+            CMD_ERR("Unknown node: %s", node_name);
109fe2
+            return -ENXIO;
109fe2
+        }
109fe2
+        if (is_remote_node(node)) {
109fe2
+            attr_options |= attrd_opt_remote;
109fe2
         }
109fe2
-        return pcmk_ok;
109fe2
     }
109fe2
 
109fe2
-    for (xmlNode *xml_op = __xml_first_child(data_set->failed); xml_op != NULL;
109fe2
-         xml_op = __xml_next(xml_op)) {
109fe2
-
109fe2
-        const char *node = crm_element_value(xml_op, XML_ATTR_UNAME);
109fe2
-        const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
109fe2
-        const char *task_interval = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL);
109fe2
-        const char *resource_name = crm_element_value(xml_op, XML_LRM_ATTR_RSCID);
109fe2
+    rc = attrd_clear_delegate(NULL, node_name, NULL, operation, interval,
109fe2
+                              NULL, attr_options);
109fe2
+    if (rc != pcmk_ok) {
109fe2
+        printf("Unable to clean up all failures on %s: %s\n",
109fe2
+                display_name, pcmk_strerror(rc));
109fe2
+        return rc;
109fe2
+    }
109fe2
 
109fe2
-        if(resource_name == NULL) {
109fe2
-            continue;
109fe2
-        } else if(host_uname && safe_str_neq(host_uname, node)) {
109fe2
-            continue;
109fe2
-        } else if(rsc->id && safe_str_neq(rsc->id, resource_name)) {
109fe2
-            continue;
109fe2
-        } else if(operation && safe_str_neq(operation, task)) {
109fe2
-            continue;
109fe2
-        } else if(interval && safe_str_neq(interval, task_interval)) {
109fe2
-            continue;
109fe2
+    if (node_name) {
109fe2
+        rc = clear_rsc_failures(crmd_channel, node_name, NULL,
109fe2
+                                operation, interval, data_set);
109fe2
+        if (rc != pcmk_ok) {
109fe2
+            printf("Cleaned all resource failures on %s, but unable to clean history: %s\n",
109fe2
+                   node_name, pcmk_strerror(rc));
109fe2
+            return rc;
109fe2
         }
109fe2
+    } else {
109fe2
+        for (GList *iter = data_set->nodes; iter; iter = iter->next) {
109fe2
+            pe_node_t *node = (pe_node_t *) iter->data;
109fe2
 
109fe2
-        crm_debug("Erasing %s failure for %s (%s detected) on %s",
109fe2
-                  task, rsc->id, resource_name, node);
109fe2
-        rc = cli_resource_delete(crmd_channel, node, rsc, task,
109fe2
-                                 task_interval, data_set);
109fe2
+            rc = clear_rsc_failures(crmd_channel, node->details->uname, NULL,
109fe2
+                                    operation, interval, data_set);
109fe2
+            if (rc != pcmk_ok) {
109fe2
+                printf("Cleaned all resource failures on all nodes, but unable to clean history on %s: %s\n",
109fe2
+                       node->details->uname, pcmk_strerror(rc));
109fe2
+                return rc;
109fe2
+            }
109fe2
+        }
109fe2
     }
109fe2
 
109fe2
-    return rc;
109fe2
+    printf("Cleaned up all resources on %s\n", display_name);
109fe2
+    return pcmk_ok;
109fe2
 }
109fe2
 
109fe2
 void
109fe2
-- 
109fe2
1.8.3.1
109fe2