Blame SOURCES/022-inflight-systemd.patch

ab5585
From 47f957c747f76d54b7289d9651aa057efb5be4c7 Mon Sep 17 00:00:00 2001
ab5585
From: Ken Gaillot <kgaillot@redhat.com>
ab5585
Date: Mon, 27 Feb 2017 14:10:02 -0600
ab5585
Subject: [PATCH 1/7] Fix: libcrmcommon: assert if can't generate operation key
ab5585
ab5585
This is a refactor of generate_op_key() to use the existing
ab5585
crm_strdup_printf() function instead of its own allocation,
ab5585
and use CRM_ASSERT() instead of CRM_CHECK() for argument validation.
ab5585
ab5585
This simplifies the code, avoids allocating more memory than needed,
ab5585
and results in assertion failures instead of returning NULL on errors,
ab5585
which is preferable because existing callers don't check the
ab5585
return value for NULL before using it.
ab5585
---
ab5585
 lib/common/utils.c | 27 +++++++++++++++------------
ab5585
 1 file changed, 15 insertions(+), 12 deletions(-)
ab5585
ab5585
diff --git a/lib/common/utils.c b/lib/common/utils.c
ab5585
index 3e3abd3..dbf84c0 100644
ab5585
--- a/lib/common/utils.c
ab5585
+++ b/lib/common/utils.c
ab5585
@@ -629,21 +629,24 @@ crm_get_msec(const char *input)
ab5585
     return msec;
ab5585
 }
ab5585
 
ab5585
+/*!
ab5585
+ * \brief Generate an operation key
ab5585
+ *
ab5585
+ * \param[in] rsc_id    ID of resource being operated on
ab5585
+ * \param[in] op_type   Operation name
ab5585
+ * \param[in] interval  Operation interval
ab5585
+ *
ab5585
+ * \return Newly allocated memory containing operation key as string
ab5585
+ *
ab5585
+ * \note It is the caller's responsibility to free() the result.
ab5585
+ */
ab5585
 char *
ab5585
 generate_op_key(const char *rsc_id, const char *op_type, int interval)
ab5585
 {
ab5585
-    int len = 35;
ab5585
-    char *op_id = NULL;
ab5585
-
ab5585
-    CRM_CHECK(rsc_id != NULL, return NULL);
ab5585
-    CRM_CHECK(op_type != NULL, return NULL);
ab5585
-
ab5585
-    len += strlen(op_type);
ab5585
-    len += strlen(rsc_id);
ab5585
-    op_id = malloc(len);
ab5585
-    CRM_CHECK(op_id != NULL, return NULL);
ab5585
-    sprintf(op_id, "%s_%s_%d", rsc_id, op_type, interval);
ab5585
-    return op_id;
ab5585
+    CRM_ASSERT(rsc_id != NULL);
ab5585
+    CRM_ASSERT(op_type != NULL);
ab5585
+    CRM_ASSERT(interval >= 0);
ab5585
+    return crm_strdup_printf("%s_%s_%d", rsc_id, op_type, interval);
ab5585
 }
ab5585
 
ab5585
 gboolean
ab5585
-- 
ab5585
1.8.3.1
ab5585
ab5585
ab5585
From 94a4c58f675d163085a055f59fd6c3a2c9f57c43 Mon Sep 17 00:00:00 2001
ab5585
From: Ken Gaillot <kgaillot@redhat.com>
ab5585
Date: Mon, 27 Feb 2017 14:17:01 -0600
ab5585
Subject: [PATCH 2/7] Fix: libservices: ensure completed ops aren't on blocked
ab5585
 ops list
ab5585
ab5585
This refactors removal of a completed operation from the in-flight ops list
ab5585
into a new internal function services_untrack_op(), which allows the
ab5585
in-flight ops list and handle_blocked_ops() to be static to services.c.
ab5585
ab5585
The new function also ensures the op isn't on the blocked ops list,
ab5585
which could otherwise result in a use-after-free bug.
ab5585
---
ab5585
 lib/services/services.c         | 23 +++++++++++++++++++++--
ab5585
 lib/services/services_linux.c   |  7 +------
ab5585
 lib/services/services_private.h |  2 +-
ab5585
 3 files changed, 23 insertions(+), 9 deletions(-)
ab5585
ab5585
diff --git a/lib/services/services.c b/lib/services/services.c
ab5585
index 0b535e6..90a2181 100644
ab5585
--- a/lib/services/services.c
ab5585
+++ b/lib/services/services.c
ab5585
@@ -44,7 +44,9 @@ static GHashTable *recurring_actions = NULL;
ab5585
 static GList *blocked_ops = NULL;
ab5585
 
ab5585
 /* ops currently active (in-flight) */
ab5585
-GList *inflight_ops = NULL;
ab5585
+static GList *inflight_ops = NULL;
ab5585
+
ab5585
+static void handle_blocked_ops(void);
ab5585
 
ab5585
 svc_action_t *
ab5585
 services_action_create(const char *name, const char *action, int interval, int timeout)
ab5585
@@ -605,6 +607,23 @@ services_add_inflight_op(svc_action_t * op)
ab5585
     }
ab5585
 }
ab5585
 
ab5585
+/*!
ab5585
+ * \internal
ab5585
+ * \brief Stop tracking an operation that completed
ab5585
+ *
ab5585
+ * \param[in] op  Operation to stop tracking
ab5585
+ */
ab5585
+void
ab5585
+services_untrack_op(svc_action_t *op)
ab5585
+{
ab5585
+    /* Op is no longer in-flight or blocked */
ab5585
+    inflight_ops = g_list_remove(inflight_ops, op);
ab5585
+    blocked_ops = g_list_remove(blocked_ops, op);
ab5585
+
ab5585
+    /* Op is no longer blocking other ops, so check if any need to run */
ab5585
+    handle_blocked_ops();
ab5585
+}
ab5585
+
ab5585
 gboolean
ab5585
 services_action_async(svc_action_t * op, void (*action_callback) (svc_action_t *))
ab5585
 {
ab5585
@@ -649,7 +668,7 @@ is_op_blocked(const char *rsc)
ab5585
     return FALSE;
ab5585
 }
ab5585
 
ab5585
-void
ab5585
+static void
ab5585
 handle_blocked_ops(void)
ab5585
 {
ab5585
     GList *executed_ops = NULL;
ab5585
diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c
ab5585
index adfefaa..1fe3ec1 100644
ab5585
--- a/lib/services/services_linux.c
ab5585
+++ b/lib/services/services_linux.c
ab5585
@@ -36,9 +36,6 @@
ab5585
 #  include "crm/common/cib_secrets.h"
ab5585
 #endif
ab5585
 
ab5585
-/* ops currently active (in-flight) */
ab5585
-extern GList *inflight_ops;
ab5585
-
ab5585
 static inline void
ab5585
 set_fd_opts(int fd, int opts)
ab5585
 {
ab5585
@@ -248,9 +245,7 @@ operation_finalize(svc_action_t * op)
ab5585
 
ab5585
     op->pid = 0;
ab5585
 
ab5585
-    inflight_ops = g_list_remove(inflight_ops, op);
ab5585
-
ab5585
-    handle_blocked_ops();
ab5585
+    services_untrack_op(op);
ab5585
 
ab5585
     if (!recurring && op->synchronous == FALSE) {
ab5585
         /*
ab5585
diff --git a/lib/services/services_private.h b/lib/services/services_private.h
ab5585
index ec9e1c4..a949592 100644
ab5585
--- a/lib/services/services_private.h
ab5585
+++ b/lib/services/services_private.h
ab5585
@@ -76,7 +76,7 @@ G_GNUC_INTERNAL
ab5585
 void services_add_inflight_op(svc_action_t *op);
ab5585
 
ab5585
 G_GNUC_INTERNAL
ab5585
-void handle_blocked_ops(void);
ab5585
+void services_untrack_op(svc_action_t *op);
ab5585
 
ab5585
 G_GNUC_INTERNAL
ab5585
 gboolean is_op_blocked(const char *rsc);
ab5585
-- 
ab5585
1.8.3.1
ab5585
ab5585
ab5585
From b938c47740a456d0057438e4f5e6029d0e084587 Mon Sep 17 00:00:00 2001
ab5585
From: Ken Gaillot <kgaillot@redhat.com>
ab5585
Date: Mon, 27 Feb 2017 14:20:05 -0600
ab5585
Subject: [PATCH 3/7] Refactor: libservices: dynamically allocate operation key
ab5585
ab5585
reduces code duplication, and avoids memory over-allocation
ab5585
---
ab5585
 lib/services/services.c | 18 ++++++------------
ab5585
 1 file changed, 6 insertions(+), 12 deletions(-)
ab5585
ab5585
diff --git a/lib/services/services.c b/lib/services/services.c
ab5585
index 90a2181..4020b7d 100644
ab5585
--- a/lib/services/services.c
ab5585
+++ b/lib/services/services.c
ab5585
@@ -149,10 +149,7 @@ resources_action_create(const char *name, const char *standard, const char *prov
ab5585
     op->agent = strdup(agent);
ab5585
     op->sequence = ++operations;
ab5585
     op->flags = flags;
ab5585
-
ab5585
-    if (asprintf(&op->id, "%s_%s_%d", name, action, interval) == -1) {
ab5585
-        goto return_error;
ab5585
-    }
ab5585
+    op->id = generate_op_key(name, action, interval);
ab5585
 
ab5585
     if (strcasecmp(op->standard, "service") == 0) {
ab5585
         const char *expanded = resources_find_service_class(op->agent);
ab5585
@@ -467,11 +464,10 @@ gboolean
ab5585
 services_action_cancel(const char *name, const char *action, int interval)
ab5585
 {
ab5585
     svc_action_t *op = NULL;
ab5585
-    char id[512];
ab5585
-
ab5585
-    snprintf(id, sizeof(id), "%s_%s_%d", name, action, interval);
ab5585
+    char *id = generate_op_key(name, action, interval);
ab5585
 
ab5585
     if (!(op = g_hash_table_lookup(recurring_actions, id))) {
ab5585
+        free(id);
ab5585
         return FALSE;
ab5585
     }
ab5585
 
ab5585
@@ -494,10 +490,12 @@ services_action_cancel(const char *name, const char *action, int interval)
ab5585
             /* even though the early termination failed,
ab5585
              * the op will be marked as cancelled once it completes. */
ab5585
             crm_err("Termination of %s (pid=%d) failed", id, op->pid);
ab5585
+            free(id);
ab5585
             return FALSE;
ab5585
         }
ab5585
     }
ab5585
 
ab5585
+    free(id);
ab5585
     return TRUE;
ab5585
 }
ab5585
 
ab5585
@@ -505,11 +503,7 @@ gboolean
ab5585
 services_action_kick(const char *name, const char *action, int interval /* ms */)
ab5585
 {
ab5585
     svc_action_t * op = NULL;
ab5585
-    char *id = NULL;
ab5585
-
ab5585
-    if (asprintf(&id, "%s_%s_%d", name, action, interval) == -1) {
ab5585
-        return FALSE;
ab5585
-    }
ab5585
+    char *id = generate_op_key(name, action, interval);
ab5585
 
ab5585
     op = g_hash_table_lookup(recurring_actions, id);
ab5585
     free(id);
ab5585
-- 
ab5585
1.8.3.1
ab5585
ab5585
ab5585
From 933d46ef20591757301784773a37e06b78906584 Mon Sep 17 00:00:00 2001
ab5585
From: Ken Gaillot <kgaillot@redhat.com>
ab5585
Date: Mon, 27 Feb 2017 14:28:36 -0600
ab5585
Subject: [PATCH 4/7] Fix: libservices: prevent use-after-free when freeing an
ab5585
 operation
ab5585
ab5585
---
ab5585
 lib/services/services.c | 10 ++++++++++
ab5585
 1 file changed, 10 insertions(+)
ab5585
ab5585
diff --git a/lib/services/services.c b/lib/services/services.c
ab5585
index 4020b7d..78a0781 100644
ab5585
--- a/lib/services/services.c
ab5585
+++ b/lib/services/services.c
ab5585
@@ -410,6 +410,16 @@ services_action_free(svc_action_t * op)
ab5585
         return;
ab5585
     }
ab5585
 
ab5585
+    /* The operation should be removed from all tracking lists by this point.
ab5585
+     * If it's not, we have a bug somewhere, so bail. That may lead to a
ab5585
+     * memory leak, but it's better than a use-after-free segmentation fault.
ab5585
+     */
ab5585
+    CRM_CHECK(g_list_find(inflight_ops, op) == NULL, return);
ab5585
+    CRM_CHECK(g_list_find(blocked_ops, op) == NULL, return);
ab5585
+    CRM_CHECK((recurring_actions == NULL)
ab5585
+              || (g_hash_table_lookup(recurring_actions, op->id) == NULL),
ab5585
+              return);
ab5585
+
ab5585
     services_action_cleanup(op);
ab5585
 
ab5585
     if (op->opaque->repeat_timer) {
ab5585
-- 
ab5585
1.8.3.1
ab5585
ab5585
ab5585
From dc36d4375c049024a6f9e4d2277a3e6444fad05b Mon Sep 17 00:00:00 2001
ab5585
From: Ken Gaillot <kgaillot@redhat.com>
ab5585
Date: Tue, 28 Feb 2017 15:41:10 -0600
ab5585
Subject: [PATCH 5/7] Refactor: libservices: handle in-flight case first when
ab5585
 cancelling an operation
ab5585
ab5585
will make next bugfix easier
ab5585
---
ab5585
 lib/services/services.c | 65 ++++++++++++++++++++++++++++++++-----------------
ab5585
 1 file changed, 42 insertions(+), 23 deletions(-)
ab5585
ab5585
diff --git a/lib/services/services.c b/lib/services/services.c
ab5585
index 78a0781..dba4e66 100644
ab5585
--- a/lib/services/services.c
ab5585
+++ b/lib/services/services.c
ab5585
@@ -470,43 +470,62 @@ cancel_recurring_action(svc_action_t * op)
ab5585
     return TRUE;
ab5585
 }
ab5585
 
ab5585
+/*!
ab5585
+ * \brief Cancel a recurring action
ab5585
+ *
ab5585
+ * \param[in] name      Name of resource that operation is for
ab5585
+ * \param[in] action    Name of operation to cancel
ab5585
+ * \param[in] interval  Interval of operation to cancel
ab5585
+ *
ab5585
+ * \return TRUE if action was successfully cancelled, FALSE otherwise
ab5585
+ */
ab5585
 gboolean
ab5585
 services_action_cancel(const char *name, const char *action, int interval)
ab5585
 {
ab5585
-    svc_action_t *op = NULL;
ab5585
+    gboolean cancelled = FALSE;
ab5585
     char *id = generate_op_key(name, action, interval);
ab5585
+    svc_action_t *op = g_hash_table_lookup(recurring_actions, id);
ab5585
 
ab5585
-    if (!(op = g_hash_table_lookup(recurring_actions, id))) {
ab5585
-        free(id);
ab5585
-        return FALSE;
ab5585
+    /* We can only cancel a recurring action */
ab5585
+    if (op == NULL) {
ab5585
+        goto done;
ab5585
     }
ab5585
 
ab5585
-    /* Always kill the recurring timer */
ab5585
+    /* Tell operation_finalize() not to reschedule the operation */
ab5585
+    op->cancel = TRUE;
ab5585
+
ab5585
+    /* Stop tracking it as a recurring operation, and stop its timer */
ab5585
     cancel_recurring_action(op);
ab5585
 
ab5585
-    if (op->pid == 0) {
ab5585
-        op->status = PCMK_LRM_OP_CANCELLED;
ab5585
-        if (op->opaque->callback) {
ab5585
-            op->opaque->callback(op);
ab5585
+    /* If the op has a PID, it's an in-flight child process, so kill it.
ab5585
+     *
ab5585
+     * Whether the kill succeeds or fails, the main loop will send the op to
ab5585
+     * operation_finished() (and thus operation_finalize()) when the process
ab5585
+     * goes away.
ab5585
+     */
ab5585
+    if (op->pid != 0) {
ab5585
+        crm_info("Terminating in-flight op %s (pid %d) early because it was cancelled",
ab5585
+                 id, op->pid);
ab5585
+        cancelled = mainloop_child_kill(op->pid);
ab5585
+        if (cancelled == FALSE) {
ab5585
+            crm_err("Termination of %s (pid %d) failed", id, op->pid);
ab5585
         }
ab5585
+        goto done;
ab5585
+    }
ab5585
 
ab5585
-        blocked_ops = g_list_remove(blocked_ops, op);
ab5585
-        services_action_free(op);
ab5585
-
ab5585
-    } else {
ab5585
-        crm_info("Cancelling in-flight op: performing early termination of %s (pid=%d)", id, op->pid);
ab5585
-        op->cancel = 1;
ab5585
-        if (mainloop_child_kill(op->pid) == FALSE) {
ab5585
-            /* even though the early termination failed,
ab5585
-             * the op will be marked as cancelled once it completes. */
ab5585
-            crm_err("Termination of %s (pid=%d) failed", id, op->pid);
ab5585
-            free(id);
ab5585
-            return FALSE;
ab5585
-        }
ab5585
+    /* Otherwise, operation is not in-flight, just report as cancelled */
ab5585
+    op->status = PCMK_LRM_OP_CANCELLED;
ab5585
+    if (op->opaque->callback) {
ab5585
+        op->opaque->callback(op);
ab5585
     }
ab5585
 
ab5585
+    blocked_ops = g_list_remove(blocked_ops, op);
ab5585
+    services_action_free(op);
ab5585
+    cancelled = TRUE;
ab5585
+
ab5585
+done:
ab5585
     free(id);
ab5585
-    return TRUE;
ab5585
+    return cancelled;
ab5585
 }
ab5585
 
ab5585
 gboolean
ab5585
-- 
ab5585
1.8.3.1
ab5585
ab5585
ab5585
From deabcc5a6aa93dadf0b20364715b559a5b9848ac Mon Sep 17 00:00:00 2001
ab5585
From: Ken Gaillot <kgaillot@redhat.com>
ab5585
Date: Tue, 28 Feb 2017 15:48:11 -0600
ab5585
Subject: [PATCH 6/7] Fix: libservices: properly cancel in-flight
ab5585
 systemd/upstart op
ab5585
ab5585
---
ab5585
 lib/services/services.c | 27 +++++++++++++++++++++++++++
ab5585
 1 file changed, 27 insertions(+)
ab5585
ab5585
diff --git a/lib/services/services.c b/lib/services/services.c
ab5585
index dba4e66..4aa4c01 100644
ab5585
--- a/lib/services/services.c
ab5585
+++ b/lib/services/services.c
ab5585
@@ -89,6 +89,21 @@ resources_find_service_class(const char *agent)
ab5585
     return NULL;
ab5585
 }
ab5585
 
ab5585
+/*!
ab5585
+ * \internal
ab5585
+ * \brief Check whether op is in-flight systemd or upstart op
ab5585
+ *
ab5585
+ * \param[in] op  Operation to check
ab5585
+ *
ab5585
+ * \return TRUE if op is in-flight systemd or upstart op
ab5585
+ */
ab5585
+static inline gboolean
ab5585
+inflight_systemd_or_upstart(svc_action_t *op)
ab5585
+{
ab5585
+    return (safe_str_eq(op->standard, "systemd")
ab5585
+            || safe_str_eq(op->standard, "upstart"))
ab5585
+            && (g_list_find(inflight_ops, op) != NULL);
ab5585
+}
ab5585
 
ab5585
 svc_action_t *
ab5585
 resources_action_create(const char *name, const char *standard, const char *provider,
ab5585
@@ -513,6 +528,18 @@ services_action_cancel(const char *name, const char *action, int interval)
ab5585
         goto done;
ab5585
     }
ab5585
 
ab5585
+    /* In-flight systemd and upstart ops don't have a pid. The relevant handlers
ab5585
+     * will call operation_finalize() when the operation completes.
ab5585
+     * @TODO: Can we request early termination, maybe using
ab5585
+     * dbus_pending_call_cancel()?
ab5585
+     */
ab5585
+    if (inflight_systemd_or_upstart(op)) {
ab5585
+        crm_info("Will cancel %s op %s when in-flight instance completes",
ab5585
+                 op->standard, op->id);
ab5585
+        cancelled = FALSE;
ab5585
+        goto done;
ab5585
+    }
ab5585
+
ab5585
     /* Otherwise, operation is not in-flight, just report as cancelled */
ab5585
     op->status = PCMK_LRM_OP_CANCELLED;
ab5585
     if (op->opaque->callback) {
ab5585
-- 
ab5585
1.8.3.1
ab5585
ab5585
ab5585
From b85037b75255061a41d0ec3fd9b64f271351b43e Mon Sep 17 00:00:00 2001
ab5585
From: Ken Gaillot <kgaillot@redhat.com>
ab5585
Date: Tue, 28 Feb 2017 16:43:52 -0600
ab5585
Subject: [PATCH 7/7] Fix: libservices: properly detect in-flight
ab5585
 systemd/upstart ops when kicking
ab5585
ab5585
---
ab5585
 lib/services/services.c | 3 ++-
ab5585
 1 file changed, 2 insertions(+), 1 deletion(-)
ab5585
ab5585
diff --git a/lib/services/services.c b/lib/services/services.c
ab5585
index 4aa4c01..b41517d 100644
ab5585
--- a/lib/services/services.c
ab5585
+++ b/lib/services/services.c
ab5585
@@ -568,7 +568,8 @@ services_action_kick(const char *name, const char *action, int interval /* ms */
ab5585
         return FALSE;
ab5585
     }
ab5585
 
ab5585
-    if (op->pid) {
ab5585
+
ab5585
+    if (op->pid || inflight_systemd_or_upstart(op)) {
ab5585
         return TRUE;
ab5585
     } else {
ab5585
         if (op->opaque->repeat_timer) {
ab5585
-- 
ab5585
1.8.3.1
ab5585