Blob Blame History Raw
From edd133ade2bd9b003d3437280271a9c9dbab3ed6 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 23 May 2019 16:36:12 -0500
Subject: [PATCH] Refactor: controller: separate fencing-related functionality
 into own source file

Before:
   748 daemons/controld/controld_te_actions.c
   942 daemons/controld/controld_te_callbacks.c
   725 daemons/controld/controld_te_utils.c
    84 daemons/controld/controld_transition.h
   110 daemons/controld/controld_utils.h

After:
   838 daemons/controld/controld_fencing.c
    37 daemons/controld/controld_fencing.h
   631 daemons/controld/controld_te_actions.c
   701 daemons/controld/controld_te_callbacks.c
   298 daemons/controld/controld_te_utils.c
    65 daemons/controld/controld_transition.h
   106 daemons/controld/controld_utils.h
---
 daemons/controld/Makefile.am             |   5 +-
 daemons/controld/controld_callbacks.c    |   3 +-
 daemons/controld/controld_control.c      |   2 +-
 daemons/controld/controld_election.c     |   3 +-
 daemons/controld/controld_fencing.c      | 838 +++++++++++++++++++++++++++++++
 daemons/controld/controld_fencing.h      |  37 ++
 daemons/controld/controld_fsa.c          |   1 +
 daemons/controld/controld_messages.c     |   1 +
 daemons/controld/controld_te_actions.c   | 121 +----
 daemons/controld/controld_te_callbacks.c | 243 +--------
 daemons/controld/controld_te_utils.c     | 429 +---------------
 daemons/controld/controld_transition.c   |   1 -
 daemons/controld/controld_transition.h   |  21 +-
 daemons/controld/controld_utils.h        |   4 -
 14 files changed, 891 insertions(+), 818 deletions(-)
 create mode 100644 daemons/controld/controld_fencing.c
 create mode 100644 daemons/controld/controld_fencing.h

diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am
index 17c3342..858e1bb 100644
--- a/daemons/controld/Makefile.am
+++ b/daemons/controld/Makefile.am
@@ -1,5 +1,7 @@
 #
-# Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
+# Copyright 2018-2019 the Pacemaker project contributors
+#
+# The version control history for this file may have further details.
 #
 # This source code is licensed under the GNU General Public License version 2
 # or later (GPLv2+) WITHOUT ANY WARRANTY.
@@ -46,6 +48,7 @@ pacemaker_controld_SOURCES = pacemaker-controld.c	\
 			     controld_election.c	\
 			     controld_execd.c		\
 			     controld_execd_state.c	\
+			     controld_fencing.c		\
 			     controld_fsa.c		\
 			     controld_join_client.c	\
 			     controld_join_dc.c		\
diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c
index a188263..06ffb9d 100644
--- a/daemons/controld/controld_callbacks.c
+++ b/daemons/controld/controld_callbacks.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
+ * Copyright 2004-2019 the Pacemaker project contributors
  *
  * This source code is licensed under the GNU General Public License version 2
  * or later (GPLv2+) WITHOUT ANY WARRANTY.
@@ -22,6 +22,7 @@
 #include <controld_messages.h>
 #include <controld_callbacks.h>
 #include <controld_lrm.h>
+#include <controld_fencing.h>
 #include <controld_transition.h>
 #include <controld_membership.h>
 
diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c
index 6d9f335..7f918c0 100644
--- a/daemons/controld/controld_control.c
+++ b/daemons/controld/controld_control.c
@@ -25,6 +25,7 @@
 #include <controld_messages.h>
 #include <controld_callbacks.h>
 #include <controld_lrm.h>
+#include <controld_fencing.h>
 #include <controld_alerts.h>
 #include <controld_metadata.h>
 #include <controld_transition.h>
@@ -147,7 +148,6 @@ extern char *max_generation_from;
 extern xmlNode *max_generation_xml;
 extern GHashTable *resource_history;
 extern GHashTable *voted;
-extern char *te_client_id;
 
 void
 crmd_fast_exit(crm_exit_t exit_code)
diff --git a/daemons/controld/controld_election.c b/daemons/controld/controld_election.c
index 5d6858c..9e49c7b 100644
--- a/daemons/controld/controld_election.c
+++ b/daemons/controld/controld_election.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004-2019 Andrew Beekhof <andrew@beekhof.net>
+ * Copyright 2004-2019 the Pacemaker project contributors
  *
  * This source code is licensed under the GNU General Public License version 2
  * or later (GPLv2+) WITHOUT ANY WARRANTY.
@@ -18,6 +18,7 @@
 #include <crm/crm.h>
 #include <pacemaker-controld.h>
 #include <controld_fsa.h>
+#include <controld_fencing.h>
 #include <controld_messages.h>
 #include <controld_callbacks.h>
 #include <controld_transition.h>
diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
new file mode 100644
index 0000000..cde57b5
--- /dev/null
+++ b/daemons/controld/controld_fencing.c
@@ -0,0 +1,838 @@
+/*
+ * Copyright 2004-2019 the Pacemaker project contributors
+ *
+ * This source code is licensed under the GNU General Public License version 2
+ * or later (GPLv2+) WITHOUT ANY WARRANTY.
+ */
+
+#include <crm_internal.h>
+#include <crm/crm.h>
+#include <crm/msg_xml.h>
+#include <crm/common/xml.h>
+
+#include <controld_transition.h>
+#include <controld_fsa.h>
+#include <controld_lrm.h>
+#include <controld_fencing.h>
+
+#ifdef HAVE_SYS_REBOOT_H
+#  include <unistd.h>
+#  include <sys/reboot.h>
+#endif
+
+/*
+ * stonith failure counting
+ *
+ * We don't want to get stuck in a permanent fencing loop. Keep track of the
+ * number of fencing failures for each target node, and of how many failures
+ * we allow before we stop restarting the transition.
+ */
+
+struct st_fail_rec {
+    int count;
+};
+
+static unsigned long int stonith_max_attempts = 10;
+static GHashTable *stonith_failures = NULL;
+
+// Set the fencing failure limit from a string (CRM_INFINITY_S or an integer)
+void
+update_stonith_max_attempts(const char *value)
+{
+    gboolean unlimited = safe_str_eq(value, CRM_INFINITY_S);
+
+    stonith_max_attempts = unlimited? CRM_SCORE_INFINITY
+                                    : crm_int_helper(value, NULL);
+}
+
+// Return TRUE (and log) if target, or any node if NULL, has failed too often
+static gboolean
+too_many_st_failures(const char *target)
+{
+    GHashTableIter iter;
+    const char *name = NULL;
+    struct st_fail_rec *rec = NULL;
+    gboolean exceeded = FALSE;
+
+    if (stonith_failures == NULL) {
+        return FALSE;
+    }
+
+    if (target != NULL) {
+        rec = g_hash_table_lookup(stonith_failures, target);
+        exceeded = (rec != NULL) && (rec->count >= stonith_max_attempts);
+    } else {
+        // Stop at the first node found at or over the limit
+        g_hash_table_iter_init(&iter, stonith_failures);
+        while (!exceeded && g_hash_table_iter_next(&iter, (gpointer *) &name,
+                                                   (gpointer *) &rec)) {
+            exceeded = (rec->count >= stonith_max_attempts);
+        }
+        target = name;
+    }
+
+    if (!exceeded) {
+        return FALSE;
+    }
+
+    crm_warn("Too many failures (%d) to fence %s, giving up",
+             rec->count, target);
+    return TRUE;
+}
+
+/*!
+ * \internal
+ * \brief Reset a stonith fail count
+ *
+ * \param[in] target  Name of node to reset, or NULL for all
+ */
+void
+st_fail_count_reset(const char *target)
+{
+    if (stonith_failures == NULL) {
+        return;
+    }
+
+    if (target) {
+        struct st_fail_rec *rec = NULL;
+
+        rec = g_hash_table_lookup(stonith_failures, target);
+        if (rec) {
+            rec->count = 0;
+        }
+    } else {
+        GHashTableIter iter;
+        const char *key = NULL;
+        struct st_fail_rec *rec = NULL;
+
+        g_hash_table_iter_init(&iter, stonith_failures);
+        while (g_hash_table_iter_next(&iter, (gpointer *) &key,
+                                      (gpointer *) &rec)) {
+            rec->count = 0;
+        }
+    }
+}
+
+// Add one to the fencing fail count of a target, creating its record if needed
+static void
+st_fail_count_increment(const char *target)
+{
+    struct st_fail_rec *entry = NULL;
+
+    if (stonith_failures == NULL) {
+        stonith_failures = crm_str_table_new();
+    }
+
+    entry = g_hash_table_lookup(stonith_failures, target);
+    if (entry != NULL) {
+        entry->count++;
+        return;
+    }
+
+    entry = malloc(sizeof(*entry));
+    if (entry != NULL) {  // on allocation failure, the failure goes uncounted
+        entry->count = 1;
+        g_hash_table_insert(stonith_failures, strdup(target), entry);
+    }
+}
+
+/* end stonith fail count functions */
+
+
+static void
+cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
+                    void *user_data)
+{
+    if (rc < pcmk_ok) {
+        crm_err("Fencing update %d for %s: failed - %s (%d)",
+                call_id, (char *)user_data, pcmk_strerror(rc), rc);
+        crm_log_xml_warn(msg, "Failed update");
+        abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL);
+
+    } else {
+        crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
+    }
+}
+
+static void
+send_stonith_update(crm_action_t *action, const char *target, const char *uuid)
+{
+    int rc = pcmk_ok;
+    crm_node_t *peer = NULL;
+
+    /* We (usually) rely on the membership layer to do node_update_cluster,
+     * and the peer status callback to do node_update_peer, because the node
+     * might have already rejoined before we get the stonith result here.
+     */
+    int flags = node_update_join | node_update_expected;
+
+    /* zero out the node-status & remove all LRM status info */
+    xmlNode *node_state = NULL;
+
+    CRM_CHECK(target != NULL, return);
+    CRM_CHECK(uuid != NULL, return);
+
+    /* Make sure the membership and join caches are accurate */
+    peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY);
+
+    CRM_CHECK(peer != NULL, return);
+
+    if (peer->state == NULL) {
+        /* Usually, we rely on the membership layer to update the cluster state
+         * in the CIB. However, if the node has never been seen, do it here, so
+         * the node is not considered unclean.
+         */
+        flags |= node_update_cluster;
+    }
+
+    if (peer->uuid == NULL) {
+        crm_info("Recording uuid '%s' for node '%s'", uuid, target);
+        peer->uuid = strdup(uuid);
+    }
+
+    crmd_peer_down(peer, TRUE);
+
+    /* Generate a node state update for the CIB */
+    node_state = create_node_state_update(peer, flags, NULL, __FUNCTION__);
+
+    /* we have to mark whether or not remote nodes have already been fenced */
+    if (peer->flags & crm_remote_node) {
+        time_t now = time(NULL);
+        char *now_s = crm_itoa(now);
+        crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
+        free(now_s);
+    }
+
+    /* Force our known ID */
+    crm_xml_add(node_state, XML_ATTR_UUID, uuid);
+
+    rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
+                                    cib_quorum_override | cib_scope_local | cib_can_create);
+
+    /* Delay processing the trigger until the update completes */
+    crm_debug("Sending fencing update %d for %s", rc, target);
+    fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);
+
+    /* Make sure it sticks */
+    /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */
+
+    erase_status_tag(peer->uname, XML_CIB_TAG_LRM, cib_scope_local);
+    erase_status_tag(peer->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
+
+    free_xml(node_state);
+    return;
+}
+
+/*!
+ * \internal
+ * \brief Abort transition due to stonith failure
+ *
+ * \param[in] abort_action  Whether to restart or stop transition
+ * \param[in] target  Don't restart if this (NULL for any) has too many failures
+ * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
+ */
+static void
+abort_for_stonith_failure(enum transition_action abort_action,
+                          const char *target, xmlNode *reason)
+{
+    enum transition_action effective = abort_action;
+
+    // Once fencing has failed too often, stop rather than start a new transition
+    if ((effective != tg_stop) && too_many_st_failures(target)) {
+        effective = tg_stop;
+    }
+    abort_transition(INFINITY, effective, "Stonith failed", reason);
+}
+
+
+/*
+ * stonith cleanup list
+ *
+ * If the DC is shot, proper notifications might not go out.
+ * The stonith cleanup list allows the cluster to (re-)send
+ * notifications once a new DC is elected.
+ */
+
+static GListPtr stonith_cleanup_list = NULL;
+
+/*!
+ * \internal
+ * \brief Add a node to the stonith cleanup list
+ *
+ * \param[in] target  Name of node to add
+ */
+void
+add_stonith_cleanup(const char *target) {
+    stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
+}
+
+/*!
+ * \internal
+ * \brief Remove a node from the stonith cleanup list
+ *
+ * \param[in] target  Name of node to remove
+ */
+void
+remove_stonith_cleanup(const char *target)
+{
+    GListPtr iter = stonith_cleanup_list;
+
+    while (iter != NULL) {
+        GListPtr tmp = iter;
+        char *iter_name = tmp->data;
+
+        iter = iter->next;
+        if (safe_str_eq(target, iter_name)) {
+            crm_trace("Removing %s from the cleanup list", iter_name);
+            stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
+            free(iter_name);
+        }
+    }
+}
+
+/*!
+ * \internal
+ * \brief Purge all entries from the stonith cleanup list
+ */
+void
+purge_stonith_cleanup(void)
+{
+    if (stonith_cleanup_list) {
+        GListPtr iter = NULL;
+
+        // Each entry is a heap-allocated node name, freed here with the list
+        for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
+            char *target = iter->data;
+            crm_info("Purging %s from stonith cleanup list", target);
+            free(target);
+        }
+        g_list_free(stonith_cleanup_list);
+        stonith_cleanup_list = NULL;
+    }
+}
+
+/*!
+ * \internal
+ * \brief Send stonith updates for all entries in cleanup list, then purge it
+ */
+void
+execute_stonith_cleanup(void)
+{
+    GListPtr iter;
+
+    for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
+        char *target = iter->data;
+        crm_node_t *target_node = crm_get_peer(0, target);
+        const char *uuid = crm_peer_uuid(target_node);
+        // Send the update, then release the list's copy of the node name
+        crm_notice("Marking %s, target of a previous stonith action, as clean", target);
+        send_stonith_update(NULL, target, uuid);
+        free(target);
+    }
+    g_list_free(stonith_cleanup_list);
+    stonith_cleanup_list = NULL;
+}
+
+/* end stonith cleanup list functions */
+
+
+/* stonith API client
+ *
+ * Functions that need to interact directly with the fencer via its API
+ */
+
+stonith_t *stonith_api = NULL;
+crm_trigger_t *stonith_reconnect = NULL;
+char *te_client_id = NULL;
+
+static gboolean
+fail_incompletable_stonith(crm_graph_t *graph)
+{
+    GListPtr lpc = NULL;
+    const char *task = NULL;
+    xmlNode *last_action = NULL;
+
+    if (graph == NULL) {
+        return FALSE;
+    }
+
+    for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
+        GListPtr lpc2 = NULL;
+        synapse_t *synapse = (synapse_t *) lpc->data;
+
+        if (synapse->confirmed) {
+            continue;
+        }
+
+        for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
+            crm_action_t *action = (crm_action_t *) lpc2->data;
+
+            if (action->type != action_type_crm || action->confirmed) {
+                continue;
+            }
+
+            task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
+            if (task && safe_str_eq(task, CRM_OP_FENCE)) {
+                action->failed = TRUE;
+                last_action = action->xml;
+                update_graph(graph, action);
+                crm_notice("Failing action %d (%s): fencer terminated",
+                           action->id, ID(action->xml));
+            }
+        }
+    }
+
+    if (last_action != NULL) {
+        crm_warn("Fencer failure resulted in unrunnable actions");
+        abort_for_stonith_failure(tg_restart, NULL, last_action);
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
+static void
+tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
+{
+    if (is_set(fsa_input_register, R_ST_REQUIRED)) {
+        crm_crit("Fencing daemon connection failed");
+        mainloop_set_trigger(stonith_reconnect);
+
+    } else {
+        crm_info("Fencing daemon disconnected");
+    }
+
+    if (stonith_api) {
+        stonith_api->state = stonith_disconnected;
+    }
+
+    if (AM_I_DC) {
+        fail_incompletable_stonith(transition_graph);
+        trigger_graph();
+    }
+}
+
+static void
+tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
+{
+    if (te_client_id == NULL) {
+        te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
+                                         (unsigned long) getpid());
+    }
+
+    if (st_event == NULL) {
+        crm_err("Notify data not found");
+        return;
+    }
+
+    crmd_alert_fencing_op(st_event);
+
+    if ((st_event->result == pcmk_ok) && safe_str_eq("on", st_event->action)) {
+        crm_notice("%s was successfully unfenced by %s (at the request of %s)",
+                   st_event->target,
+                   st_event->executioner? st_event->executioner : "<anyone>",
+                   st_event->origin);
+                /* TODO: Hook up st_event->device */
+        return;
+
+    } else if (safe_str_eq("on", st_event->action)) {
+        crm_err("Unfencing of %s by %s failed: %s (%d)",
+                st_event->target,
+                st_event->executioner? st_event->executioner : "<anyone>",
+                pcmk_strerror(st_event->result), st_event->result);
+        return;
+
+    } else if ((st_event->result == pcmk_ok)
+               && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) {
+
+        crm_crit("We were allegedly just fenced by %s for %s!",
+                 st_event->executioner? st_event->executioner : "<anyone>",
+                 st_event->origin); /* Dumps blackbox if enabled */
+
+        qb_log_fini(); /* Try to get the above log message to disk - somehow */
+
+        /* Get out ASAP and do not come back up.
+         *
+         * Triggering a reboot is also not the worst idea either since
+         * the rest of the cluster thinks we're safely down
+         */
+
+#ifdef RB_HALT_SYSTEM
+        reboot(RB_HALT_SYSTEM);
+#endif
+
+        /*
+         * If reboot() fails or is not supported, coming back up will
+         * probably lead to a situation where the other nodes set our
+         * status to 'lost' because of the fencing callback and will
+         * discard subsequent election votes with:
+         *
+         * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster)
+         *
+         * So just stay dead, something is seriously messed up anyway.
+         *
+         */
+        exit(CRM_EX_FATAL); // None of our wrappers since we already called qb_log_fini()
+        return;
+    }
+
+    /* Update the count of stonith failures for this target, in case we become
+     * DC later. The current DC has already updated its fail count in
+     * tengine_stonith_callback().
+     */
+    if (!AM_I_DC && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
+        if (st_event->result == pcmk_ok) {
+            st_fail_count_reset(st_event->target);
+        } else {
+            st_fail_count_increment(st_event->target);
+        }
+    }
+
+    crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s "
+               CRM_XS " initiator=%s ref=%s",
+               st_event->target, st_event->result == pcmk_ok ? "" : " not",
+               st_event->action,
+               st_event->executioner ? st_event->executioner : "<anyone>",
+               (st_event->client_origin? st_event->client_origin : "<unknown>"),
+               pcmk_strerror(st_event->result),
+               st_event->origin, st_event->id);
+
+    if (st_event->result == pcmk_ok) {
+        crm_node_t *peer = crm_find_known_peer_full(0, st_event->target, CRM_GET_PEER_ANY);
+        const char *uuid = NULL;
+        gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname);
+
+        if (peer == NULL) {
+            return;
+        }
+
+        uuid = crm_peer_uuid(peer);
+
+        crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
+        if(AM_I_DC) {
+            /* The DC always sends updates */
+            send_stonith_update(NULL, st_event->target, uuid);
+
+            /* @TODO Ideally, at this point, we'd check whether the fenced node
+             * hosted any guest nodes, and call remote_node_down() for them.
+             * Unfortunately, the controller doesn't have a simple, reliable way
+             * to map hosts to guests. It might be possible to track this in the
+             * peer cache via crm_remote_peer_cache_refresh(). For now, we rely
+             * on the PE creating fence pseudo-events for the guests.
+             */
+
+            if (st_event->client_origin
+                && safe_str_neq(st_event->client_origin, te_client_id)) {
+
+                /* Abort the current transition graph if it wasn't us
+                 * that invoked stonith to fence someone
+                 */
+                crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target);
+                abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
+            }
+
+            /* Assume it was our leader if we don't currently have one */
+        } else if (((fsa_our_dc == NULL) || safe_str_eq(fsa_our_dc, st_event->target))
+                   && is_not_set(peer->flags, crm_remote_node)) {
+
+            crm_notice("Target %s our leader %s (recorded: %s)",
+                       fsa_our_dc ? "was" : "may have been", st_event->target,
+                       fsa_our_dc ? fsa_our_dc : "<unset>");
+
+            /* Given the CIB resyncing that occurs around elections,
+             * have one node update the CIB now and, if the new DC is different,
+             * have them do so too after the election
+             */
+            if (we_are_executioner) {
+                send_stonith_update(NULL, st_event->target, uuid);
+            }
+            add_stonith_cleanup(st_event->target);
+        }
+
+        /* If the target is a remote node, and we host its connection,
+         * immediately fail all monitors so it can be recovered quickly.
+         * The connection won't necessarily drop when a remote node is fenced,
+         * so the failure might not otherwise be detected until the next poke.
+         */
+        if (is_set(peer->flags, crm_remote_node)) {
+            remote_ra_fail(st_event->target);
+        }
+
+        crmd_peer_down(peer, TRUE);
+     }
+}
+
+/*!
+ * \brief Connect to fencer
+ *
+ * \param[in] user_data  If NULL, retry failures now, otherwise retry in main loop
+ *
+ * \return TRUE
+ * \note If user_data is NULL, this will wait 2s between attempts, for up to
+ *       30 attempts, meaning the controller could be blocked as long as 58s.
+ */
+gboolean
+te_connect_stonith(gpointer user_data)
+{
+    int rc = pcmk_ok;
+
+    if (stonith_api == NULL) {
+        stonith_api = stonith_api_new();
+    }
+
+    if (stonith_api->state != stonith_disconnected) {
+        crm_trace("Already connected to fencer, no need to retry");
+        return TRUE;
+    }
+
+    if (user_data == NULL) {
+        // Blocking (retry failures now until successful)
+        rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
+        if (rc != pcmk_ok) {
+            crm_err("Could not connect to fencer in 30 attempts: %s "
+                    CRM_XS " rc=%d", pcmk_strerror(rc), rc);
+        }
+    } else {
+        // Non-blocking (retry failures later in main loop)
+        rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
+        if (rc != pcmk_ok) {
+            if (is_set(fsa_input_register, R_ST_REQUIRED)) {
+                crm_err("Fencer connection failed (will retry): %s "
+                        CRM_XS " rc=%d", pcmk_strerror(rc), rc);
+                mainloop_set_trigger(stonith_reconnect);
+            } else {
+                crm_info("Fencer connection failed (ignoring because no longer required): %s "
+                         CRM_XS " rc=%d", pcmk_strerror(rc), rc);
+            }
+            return TRUE;
+        }
+    }
+
+    if (rc == pcmk_ok) {
+        stonith_api->cmds->register_notification(stonith_api,
+                                                 T_STONITH_NOTIFY_DISCONNECT,
+                                                 tengine_stonith_connection_destroy);
+        stonith_api->cmds->register_notification(stonith_api,
+                                                 T_STONITH_NOTIFY_FENCE,
+                                                 tengine_stonith_notify);
+    }
+    return TRUE;
+}
+
+// Trigger callback: request fencer history with the broadcast option set
+static gboolean
+do_stonith_history_sync(gpointer user_data)
+{
+    stonith_history_t *history = NULL;
+
+    if ((stonith_api == NULL) || (stonith_api->state == stonith_disconnected)) {
+        crm_info("Skip triggering stonith history-sync as stonith is disconnected");
+        return FALSE;
+    }
+    // The returned history is freed without being examined
+    stonith_api->cmds->history(stonith_api, st_opt_sync_call | st_opt_broadcast,
+                               NULL, &history, 5);
+    stonith_history_free(history);
+    return TRUE;
+}
+
+/* Handle the result of a fencing operation requested by te_fence_node().
+ * The callback user data is the transition key registered with that request;
+ * once it is known to be non-NULL, it is freed here on every exit path.
+ */
+static void
+tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
+{
+    char *uuid = NULL;
+    int stonith_id = -1;
+    int transition_id = -1;
+    crm_action_t *action = NULL;
+    int call_id = data->call_id;
+    int rc = data->rc;
+    char *userdata = data->userdata;
+
+    CRM_CHECK(userdata != NULL, return);
+    crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata,
+               pcmk_strerror(rc), rc);
+
+    if (AM_I_DC == FALSE) {
+        goto bail; // still release the transition key, or it would be leaked
+    }
+
+    /* filter out old STONITH actions */
+    CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL),
+              goto bail);
+
+    if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid)
+        || transition_graph->id != transition_id) {
+        crm_info("Ignoring STONITH action initiated outside of the current transition");
+        goto bail;
+    }
+
+    action = get_action(stonith_id, FALSE);
+    if (action == NULL) {
+        crm_err("Stonith action not matched");
+        goto bail;
+    }
+
+    stop_te_timer(action->timer);
+    if (rc == pcmk_ok) {
+        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
+        const char *target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
+        const char *op = crm_meta_value(action->params, "stonith_action");
+
+        crm_info("Stonith operation %d for %s passed", call_id, target);
+        if (action->confirmed == FALSE) {
+            te_action_confirmed(action);
+            if (safe_str_eq("on", op)) {
+                const char *value = NULL;
+                char *now = crm_itoa(time(NULL));
+
+                update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, FALSE);
+                free(now);
+
+                value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
+                update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, FALSE);
+
+                value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
+                update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, FALSE);
+
+            } else if (action->sent_update == FALSE) {
+                send_stonith_update(action, target, target_uuid);
+                action->sent_update = TRUE;
+            }
+        }
+        st_fail_count_reset(target);
+
+    } else {
+        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
+        enum transition_action abort_action = tg_restart;
+
+        action->failed = TRUE;
+        crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
+                   call_id, target, pcmk_strerror(rc));
+
+        /* If no fence devices were available, there's no use in immediately
+         * checking again, so don't start a new transition in that case.
+         */
+        if (rc == -ENODEV) {
+            crm_warn("No devices found in cluster to fence %s, giving up",
+                     target);
+            abort_action = tg_stop;
+        }
+
+        /* Increment the fail count now, so abort_for_stonith_failure() can
+         * check it. Non-DC nodes will increment it in tengine_stonith_notify().
+         */
+        st_fail_count_increment(target);
+        abort_for_stonith_failure(abort_action, target, NULL);
+    }
+
+    update_graph(transition_graph, action);
+    trigger_graph();
+
+  bail:
+    free(userdata);
+    free(uuid);
+    return;
+}
+
+gboolean
+te_fence_node(crm_graph_t *graph, crm_action_t *action)
+{
+    int rc = 0;
+    const char *id = NULL;
+    const char *uuid = NULL;
+    const char *target = NULL;
+    const char *type = NULL;
+    gboolean invalid_action = FALSE;
+    enum stonith_call_options options = st_opt_none;
+
+    id = ID(action->xml);
+    target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
+    uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
+    type = crm_meta_value(action->params, "stonith_action");
+
+    CRM_CHECK(id != NULL, invalid_action = TRUE);
+    CRM_CHECK(uuid != NULL, invalid_action = TRUE);
+    CRM_CHECK(type != NULL, invalid_action = TRUE);
+    CRM_CHECK(target != NULL, invalid_action = TRUE);
+
+    if (invalid_action) {
+        crm_log_xml_warn(action->xml, "BadAction");
+        return FALSE;
+    }
+
+    crm_notice("Requesting fencing (%s) of node %s "
+               CRM_XS " action=%s timeout=%d",
+               type, target, id, transition_graph->stonith_timeout);
+
+    /* Passing NULL means block until we can connect... */
+    te_connect_stonith(NULL);
+
+    if (crmd_join_phase_count(crm_join_confirmed) == 1) {
+        options |= st_opt_allow_suicide;
+    }
+
+    rc = stonith_api->cmds->fence(stonith_api, options, target, type,
+                                  transition_graph->stonith_timeout / 1000, 0);
+
+    stonith_api->cmds->register_callback(stonith_api, rc, transition_graph->stonith_timeout / 1000,
+                                         st_opt_timeout_updates,
+                                         generate_transition_key(transition_graph->id, action->id,
+                                                                 0, te_uuid),
+                                         "tengine_stonith_callback", tengine_stonith_callback);
+
+    return TRUE;
+}
+
+/* end stonith API client functions */
+
+
+/*
+ * stonith history synchronization
+ *
+ * Each node's fencer keeps track of a cluster-wide fencing history. When a node
+ * joins or leaves, we need to synchronize the history across all nodes.
+ */
+
+static crm_trigger_t *stonith_history_sync_trigger = NULL;
+static mainloop_timer_t *stonith_history_sync_timer = NULL;
+
+static gboolean
+stonith_history_sync_set_trigger(gpointer user_data)
+{
+    mainloop_set_trigger(stonith_history_sync_trigger);
+    return FALSE;
+}
+
+void
+te_trigger_stonith_history_sync(void)
+{
+    /* trigger a sync in 5s to give more nodes the
+     * chance to show up so that we don't create
+     * unnecessary stonith-history-sync traffic
+     */
+
+    /* as we are finally checking the stonith-connection
+     * in do_stonith_history_sync we should be fine
+     * leaving stonith_history_sync_timer & stonith_history_sync_trigger
+     * around
+     */
+    if (stonith_history_sync_trigger == NULL) {
+        stonith_history_sync_trigger =
+            mainloop_add_trigger(G_PRIORITY_LOW,
+                                 do_stonith_history_sync, NULL);
+    }
+
+    if(stonith_history_sync_timer == NULL) {
+        stonith_history_sync_timer =
+            mainloop_timer_add("history_sync", 5000,
+                               FALSE, stonith_history_sync_set_trigger,
+                               NULL);
+    }
+    crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
+    mainloop_timer_start(stonith_history_sync_timer);
+}
+
+/* end stonith history synchronization functions */
diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h
new file mode 100644
index 0000000..b80a6c9
--- /dev/null
+++ b/daemons/controld/controld_fencing.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2004-2019 the Pacemaker project contributors
+ *
+ * The version control history for this file may have further details.
+ *
+ * This source code is licensed under the GNU Lesser General Public License
+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
+ */
+
+#ifndef CONTROLD_FENCING__H
+#  define CONTROLD_FENCING__H
+
+#include <stdbool.h>                // bool
+#include <crm/transition.h>         // crm_graph_t, crm_action_t
+
+extern crm_trigger_t *stonith_reconnect;    // set to retry fencer connection
+extern char *te_client_id;          // compared to fence event client_origin
+extern stonith_t *stonith_api;      // created on demand by te_connect_stonith
+
+// stonith fail counts
+void st_fail_count_reset(const char * target);          // NULL resets all
+void update_stonith_max_attempts(const char* value);    // may be infinity
+
+// stonith API client
+gboolean te_connect_stonith(gpointer user_data); // NULL = blocking retries
+gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action);
+
+// stonith cleanup list
+void add_stonith_cleanup(const char *target);
+void remove_stonith_cleanup(const char *target);
+void purge_stonith_cleanup(void);
+void execute_stonith_cleanup(void);
+
+// stonith history synchronization
+void te_trigger_stonith_history_sync(void);     // schedules sync via 5s timer
+
+#endif // CONTROLD_FENCING__H
diff --git a/daemons/controld/controld_fsa.c b/daemons/controld/controld_fsa.c
index 9eca530..dc1937f 100644
--- a/daemons/controld/controld_fsa.c
+++ b/daemons/controld/controld_fsa.c
@@ -26,6 +26,7 @@
 #include <pacemaker-controld.h>
 #include <controld_messages.h>
 #include <controld_fsa.h>
+#include <controld_fencing.h>
 #include <controld_transition.h>
 #include <controld_matrix.h>
 
diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c
index 2ebc203..8f37cbf 100644
--- a/daemons/controld/controld_messages.c
+++ b/daemons/controld/controld_messages.c
@@ -25,6 +25,7 @@
 #include <pacemaker-controld.h>
 #include <controld_messages.h>
 #include <controld_lrm.h>
+#include <controld_fencing.h>
 #include <controld_transition.h>
 #include <controld_throttle.h>
 
diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c
index c95c6c7..2f61556 100644
--- a/daemons/controld/controld_te_actions.c
+++ b/daemons/controld/controld_te_actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
+ * Copyright 2004-2019 the Pacemaker project contributors
  *
  * This source code is licensed under the GNU General Public License version 2
  * or later (GPLv2+) WITHOUT ANY WARRANTY.
@@ -17,6 +17,7 @@
 
 #include <controld_fsa.h>
 #include <controld_lrm.h>
+#include <controld_fencing.h>
 #include <controld_messages.h>
 #include <crm/cluster.h>
 #include <controld_throttle.h>
@@ -76,124 +77,6 @@ te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo)
     return TRUE;
 }
 
-void
-send_stonith_update(crm_action_t * action, const char *target, const char *uuid)
-{
-    int rc = pcmk_ok;
-    crm_node_t *peer = NULL;
-
-    /* We (usually) rely on the membership layer to do node_update_cluster,
-     * and the peer status callback to do node_update_peer, because the node
-     * might have already rejoined before we get the stonith result here.
-     */
-    int flags = node_update_join | node_update_expected;
-
-    /* zero out the node-status & remove all LRM status info */
-    xmlNode *node_state = NULL;
-
-    CRM_CHECK(target != NULL, return);
-    CRM_CHECK(uuid != NULL, return);
-
-    /* Make sure the membership and join caches are accurate */
-    peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY);
-
-    CRM_CHECK(peer != NULL, return);
-
-    if (peer->state == NULL) {
-        /* Usually, we rely on the membership layer to update the cluster state
-         * in the CIB. However, if the node has never been seen, do it here, so
-         * the node is not considered unclean.
-         */
-        flags |= node_update_cluster;
-    }
-
-    if (peer->uuid == NULL) {
-        crm_info("Recording uuid '%s' for node '%s'", uuid, target);
-        peer->uuid = strdup(uuid);
-    }
-
-    crmd_peer_down(peer, TRUE);
-
-    /* Generate a node state update for the CIB */
-    node_state = create_node_state_update(peer, flags, NULL, __FUNCTION__);
-
-    /* we have to mark whether or not remote nodes have already been fenced */
-    if (peer->flags & crm_remote_node) {
-        time_t now = time(NULL);
-        char *now_s = crm_itoa(now);
-        crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
-        free(now_s);
-    }
-
-    /* Force our known ID */
-    crm_xml_add(node_state, XML_ATTR_UUID, uuid);
-
-    rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
-                                    cib_quorum_override | cib_scope_local | cib_can_create);
-
-    /* Delay processing the trigger until the update completes */
-    crm_debug("Sending fencing update %d for %s", rc, target);
-    fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);
-
-    /* Make sure it sticks */
-    /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */
-
-    erase_status_tag(peer->uname, XML_CIB_TAG_LRM, cib_scope_local);
-    erase_status_tag(peer->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
-
-    free_xml(node_state);
-    return;
-}
-
-static gboolean
-te_fence_node(crm_graph_t * graph, crm_action_t * action)
-{
-    int rc = 0;
-    const char *id = NULL;
-    const char *uuid = NULL;
-    const char *target = NULL;
-    const char *type = NULL;
-    gboolean invalid_action = FALSE;
-    enum stonith_call_options options = st_opt_none;
-
-    id = ID(action->xml);
-    target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
-    uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
-    type = crm_meta_value(action->params, "stonith_action");
-
-    CRM_CHECK(id != NULL, invalid_action = TRUE);
-    CRM_CHECK(uuid != NULL, invalid_action = TRUE);
-    CRM_CHECK(type != NULL, invalid_action = TRUE);
-    CRM_CHECK(target != NULL, invalid_action = TRUE);
-
-    if (invalid_action) {
-        crm_log_xml_warn(action->xml, "BadAction");
-        return FALSE;
-    }
-
-    crm_notice("Requesting fencing (%s) of node %s "
-               CRM_XS " action=%s timeout=%d",
-               type, target, id, transition_graph->stonith_timeout);
-
-    /* Passing NULL means block until we can connect... */
-    te_connect_stonith(NULL);
-
-    if (crmd_join_phase_count(crm_join_confirmed) == 1) {
-        options |= st_opt_allow_suicide;
-    }
-
-    rc = stonith_api->cmds->fence(stonith_api, options, target, type,
-                                  transition_graph->stonith_timeout / 1000, 0);
-
-    stonith_api->cmds->register_callback(stonith_api, rc, transition_graph->stonith_timeout / 1000,
-                                         st_opt_timeout_updates,
-                                         generate_transition_key(transition_graph->id, action->id,
-                                                                 0, te_uuid),
-                                         "tengine_stonith_callback", tengine_stonith_callback);
-
-    return TRUE;
-}
-
 static int
 get_target_rc(crm_action_t * action)
 {
diff --git a/daemons/controld/controld_te_callbacks.c b/daemons/controld/controld_te_callbacks.c
index 22b5f4b..1ab703f 100644
--- a/daemons/controld/controld_te_callbacks.c
+++ b/daemons/controld/controld_te_callbacks.c
@@ -17,6 +17,7 @@
 
 #include <controld_transition.h>
 #include <controld_fsa.h>
+#include <controld_fencing.h>
 
 #include <crm/cluster.h>        /* For ONLINESTATUS etc */
 
@@ -27,21 +28,9 @@ gboolean shuttingdown = FALSE;
 crm_graph_t *transition_graph;
 crm_trigger_t *transition_trigger = NULL;
 
-static unsigned long int stonith_max_attempts = 10;
-
 /* #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */
 #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']"
 
-void
-update_stonith_max_attempts(const char* value)
-{
-    if (safe_str_eq(value, CRM_INFINITY_S)) {
-       stonith_max_attempts = CRM_SCORE_INFINITY;
-    }
-    else {
-       stonith_max_attempts = crm_int_helper(value, NULL);
-    }
-}
 static void
 te_update_diff_v1(const char *event, xmlNode *diff)
 {
@@ -646,236 +635,6 @@ process_te_message(xmlNode * msg, xmlNode * xml_data)
     return TRUE;
 }
 
-GHashTable *stonith_failures = NULL;
-struct st_fail_rec {
-    int count;
-};
-
-static gboolean
-too_many_st_failures(const char *target)
-{
-    GHashTableIter iter;
-    const char *key = NULL;
-    struct st_fail_rec *value = NULL;
-
-    if (stonith_failures == NULL) {
-        return FALSE;
-    }
-
-    if (target == NULL) {
-        g_hash_table_iter_init(&iter, stonith_failures);
-        while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
-            if (value->count >= stonith_max_attempts) {
-                target = (const char*)key;
-                goto too_many;
-            }
-        }
-    } else {
-        value = g_hash_table_lookup(stonith_failures, target);
-        if ((value != NULL) && (value->count >= stonith_max_attempts)) {
-            goto too_many;
-        }
-    }
-    return FALSE;
-
-too_many:
-    crm_warn("Too many failures (%d) to fence %s, giving up",
-             value->count, target);
-    return TRUE;
-}
-
-/*!
- * \internal
- * \brief Reset a stonith fail count
- *
- * \param[in] target  Name of node to reset, or NULL for all
- */
-void
-st_fail_count_reset(const char *target)
-{
-    if (stonith_failures == NULL) {
-        return;
-    }
-
-    if (target) {
-        struct st_fail_rec *rec = NULL;
-
-        rec = g_hash_table_lookup(stonith_failures, target);
-        if (rec) {
-            rec->count = 0;
-        }
-    } else {
-        GHashTableIter iter;
-        const char *key = NULL;
-        struct st_fail_rec *rec = NULL;
-
-        g_hash_table_iter_init(&iter, stonith_failures);
-        while (g_hash_table_iter_next(&iter, (gpointer *) &key,
-                                      (gpointer *) &rec)) {
-            rec->count = 0;
-        }
-    }
-}
-
-void
-st_fail_count_increment(const char *target)
-{
-    struct st_fail_rec *rec = NULL;
-
-    if (stonith_failures == NULL) {
-        stonith_failures = crm_str_table_new();
-    }
-
-    rec = g_hash_table_lookup(stonith_failures, target);
-    if (rec) {
-        rec->count++;
-    } else {
-        rec = malloc(sizeof(struct st_fail_rec));
-        if(rec == NULL) {
-            return;
-        }
-
-        rec->count = 1;
-        g_hash_table_insert(stonith_failures, strdup(target), rec);
-    }
-}
-
-/*!
- * \internal
- * \brief Abort transition due to stonith failure
- *
- * \param[in] abort_action  Whether to restart or stop transition
- * \param[in] target  Don't restart if this (NULL for any) has too many failures
- * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
- */
-void
-abort_for_stonith_failure(enum transition_action abort_action,
-                          const char *target, xmlNode *reason)
-{
-    /* If stonith repeatedly fails, we eventually give up on starting a new
-     * transition for that reason.
-     */
-    if ((abort_action != tg_stop) && too_many_st_failures(target)) {
-        abort_action = tg_stop;
-    }
-    abort_transition(INFINITY, abort_action, "Stonith failed", reason);
-}
-
-void
-tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
-{
-    char *uuid = NULL;
-    int stonith_id = -1;
-    int transition_id = -1;
-    crm_action_t *action = NULL;
-    int call_id = data->call_id;
-    int rc = data->rc;
-    char *userdata = data->userdata;
-
-    CRM_CHECK(userdata != NULL, return);
-    crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata,
-               pcmk_strerror(rc), rc);
-
-    if (AM_I_DC == FALSE) {
-        return;
-    }
-
-    /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */
-    /*       op->call_id, op->optype, op->node_name, op->op_result, */
-    /*       (char *)op->node_list, op->private_data); */
-
-    /* filter out old STONITH actions */
-    CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL),
-              goto bail);
-
-    if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid)
-        || transition_graph->id != transition_id) {
-        crm_info("Ignoring STONITH action initiated outside of the current transition");
-        goto bail;
-    }
-
-    action = get_action(stonith_id, FALSE);
-    if (action == NULL) {
-        crm_err("Stonith action not matched");
-        goto bail;
-    }
-
-    stop_te_timer(action->timer);
-    if (rc == pcmk_ok) {
-        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
-        const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
-        const char *op = crm_meta_value(action->params, "stonith_action"); 
-
-        crm_info("Stonith operation %d for %s passed", call_id, target);
-        if (action->confirmed == FALSE) {
-            te_action_confirmed(action);
-            if (safe_str_eq("on", op)) {
-                const char *value = NULL;
-                char *now = crm_itoa(time(NULL));
-
-                update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, FALSE);
-                free(now);
-
-                value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
-                update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, FALSE);
-
-                value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
-                update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, FALSE);
-
-            } else if (action->sent_update == FALSE) {
-                send_stonith_update(action, target, uuid);
-                action->sent_update = TRUE;
-            }
-        }
-        st_fail_count_reset(target);
-
-    } else {
-        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
-        enum transition_action abort_action = tg_restart;
-
-        action->failed = TRUE;
-        crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
-                   call_id, target, pcmk_strerror(rc));
-
-        /* If no fence devices were available, there's no use in immediately
-         * checking again, so don't start a new transition in that case.
-         */
-        if (rc == -ENODEV) {
-            crm_warn("No devices found in cluster to fence %s, giving up",
-                     target);
-            abort_action = tg_stop;
-        }
-
-        /* Increment the fail count now, so abort_for_stonith_failure() can
-         * check it. Non-DC nodes will increment it in tengine_stonith_notify().
-         */
-        st_fail_count_increment(target);
-        abort_for_stonith_failure(abort_action, target, NULL);
-    }
-
-    update_graph(transition_graph, action);
-    trigger_graph();
-
-  bail:
-    free(userdata);
-    free(uuid);
-    return;
-}
-
-void
-cib_fencing_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
-{
-    if (rc < pcmk_ok) {
-        crm_err("Fencing update %d for %s: failed - %s (%d)",
-                call_id, (char *)user_data, pcmk_strerror(rc), rc);
-        crm_log_xml_warn(msg, "Failed update");
-        abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL);
-
-    } else {
-        crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
-    }
-}
-
 void
 cib_action_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
 {
diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c
index 22f83ad..1496244 100644
--- a/daemons/controld/controld_te_utils.c
+++ b/daemons/controld/controld_te_utils.c
@@ -6,441 +6,14 @@
  */
 
 #include <crm_internal.h>
-
-#include <sys/param.h>
 #include <crm/crm.h>
-
 #include <crm/msg_xml.h>
-
 #include <crm/common/xml.h>
+
 #include <controld_transition.h>
 #include <controld_fsa.h>
-#include <controld_lrm.h>
 #include <controld_messages.h>
 #include <controld_throttle.h>
-#include <crm/fencing/internal.h>
-
-crm_trigger_t *stonith_reconnect = NULL;
-static crm_trigger_t *stonith_history_sync_trigger = NULL;
-static mainloop_timer_t *stonith_history_sync_timer = NULL;
-
-/*
- * stonith cleanup list
- *
- * If the DC is shot, proper notifications might not go out.
- * The stonith cleanup list allows the cluster to (re-)send
- * notifications once a new DC is elected.
- */
-
-static GListPtr stonith_cleanup_list = NULL;
-
-/*!
- * \internal
- * \brief Add a node to the stonith cleanup list
- *
- * \param[in] target  Name of node to add
- */
-void
-add_stonith_cleanup(const char *target) {
-    stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
-}
-
-/*!
- * \internal
- * \brief Remove a node from the stonith cleanup list
- *
- * \param[in] Name of node to remove
- */
-void
-remove_stonith_cleanup(const char *target)
-{
-    GListPtr iter = stonith_cleanup_list;
-
-    while (iter != NULL) {
-        GListPtr tmp = iter;
-        char *iter_name = tmp->data;
-
-        iter = iter->next;
-        if (safe_str_eq(target, iter_name)) {
-            crm_trace("Removing %s from the cleanup list", iter_name);
-            stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
-            free(iter_name);
-        }
-    }
-}
-
-/*!
- * \internal
- * \brief Purge all entries from the stonith cleanup list
- */
-void
-purge_stonith_cleanup()
-{
-    if (stonith_cleanup_list) {
-        GListPtr iter = NULL;
-
-        for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
-            char *target = iter->data;
-
-            crm_info("Purging %s from stonith cleanup list", target);
-            free(target);
-        }
-        g_list_free(stonith_cleanup_list);
-        stonith_cleanup_list = NULL;
-    }
-}
-
-/*!
- * \internal
- * \brief Send stonith updates for all entries in cleanup list, then purge it
- */
-void
-execute_stonith_cleanup()
-{
-    GListPtr iter;
-
-    for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
-        char *target = iter->data;
-        crm_node_t *target_node = crm_get_peer(0, target);
-        const char *uuid = crm_peer_uuid(target_node);
-
-        crm_notice("Marking %s, target of a previous stonith action, as clean", target);
-        send_stonith_update(NULL, target, uuid);
-        free(target);
-    }
-    g_list_free(stonith_cleanup_list);
-    stonith_cleanup_list = NULL;
-}
-
-/* end stonith cleanup list functions */
-
-static gboolean
-fail_incompletable_stonith(crm_graph_t * graph)
-{
-    GListPtr lpc = NULL;
-    const char *task = NULL;
-    xmlNode *last_action = NULL;
-
-    if (graph == NULL) {
-        return FALSE;
-    }
-
-    for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
-        GListPtr lpc2 = NULL;
-        synapse_t *synapse = (synapse_t *) lpc->data;
-
-        if (synapse->confirmed) {
-            continue;
-        }
-
-        for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
-            crm_action_t *action = (crm_action_t *) lpc2->data;
-
-            if (action->type != action_type_crm || action->confirmed) {
-                continue;
-            }
-
-            task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
-            if (task && safe_str_eq(task, CRM_OP_FENCE)) {
-                action->failed = TRUE;
-                last_action = action->xml;
-                update_graph(graph, action);
-                crm_notice("Failing action %d (%s): fencer terminated",
-                           action->id, ID(action->xml));
-            }
-        }
-    }
-
-    if (last_action != NULL) {
-        crm_warn("Fencer failure resulted in unrunnable actions");
-        abort_for_stonith_failure(tg_restart, NULL, last_action);
-        return TRUE;
-    }
-
-    return FALSE;
-}
-
-static void
-tengine_stonith_connection_destroy(stonith_t * st, stonith_event_t * e)
-{
-    if (is_set(fsa_input_register, R_ST_REQUIRED)) {
-        crm_crit("Fencing daemon connection failed");
-        mainloop_set_trigger(stonith_reconnect);
-
-    } else {
-        crm_info("Fencing daemon disconnected");
-    }
-
-    /* cbchan will be garbage at this point, arrange for it to be reset */
-    if(stonith_api) {
-        stonith_api->state = stonith_disconnected;
-    }
-
-    if (AM_I_DC) {
-        fail_incompletable_stonith(transition_graph);
-        trigger_graph();
-    }
-}
-
-char *te_client_id = NULL;
-
-#ifdef HAVE_SYS_REBOOT_H
-#  include <unistd.h>
-#  include <sys/reboot.h>
-#endif
-
-static void
-tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
-{
-    if(te_client_id == NULL) {
-        te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
-                                         (unsigned long) getpid());
-    }
-
-    if (st_event == NULL) {
-        crm_err("Notify data not found");
-        return;
-    }
-
-    crmd_alert_fencing_op(st_event);
-
-    if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) {
-        crm_notice("%s was successfully unfenced by %s (at the request of %s)",
-                   st_event->target, st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin);
-                /* TODO: Hook up st_event->device */
-        return;
-
-    } else if (safe_str_eq("on", st_event->action)) {
-        crm_err("Unfencing of %s by %s failed: %s (%d)",
-                st_event->target, st_event->executioner ? st_event->executioner : "<anyone>",
-                pcmk_strerror(st_event->result), st_event->result);
-        return;
-
-    } else if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) {
-        crm_crit("We were allegedly just fenced by %s for %s!",
-                 st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin); /* Dumps blackbox if enabled */
-
-        qb_log_fini(); /* Try to get the above log message to disk - somehow */
-
-        /* Get out ASAP and do not come back up.
-         *
-         * Triggering a reboot is also not the worst idea either since
-         * the rest of the cluster thinks we're safely down
-         */
-
-#ifdef RB_HALT_SYSTEM
-        reboot(RB_HALT_SYSTEM);
-#endif
-
-        /*
-         * If reboot() fails or is not supported, coming back up will
-         * probably lead to a situation where the other nodes set our
-         * status to 'lost' because of the fencing callback and will
-         * discard subsequent election votes with:
-         *
-         * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster)
-         *
-         * So just stay dead, something is seriously messed up anyway.
-         *
-         */
-        exit(CRM_EX_FATAL); // None of our wrappers since we already called qb_log_fini()
-        return;
-    }
-
-    /* Update the count of stonith failures for this target, in case we become
-     * DC later. The current DC has already updated its fail count in
-     * tengine_stonith_callback().
-     */
-    if (!AM_I_DC && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
-        if (st_event->result == pcmk_ok) {
-            st_fail_count_reset(st_event->target);
-        } else {
-            st_fail_count_increment(st_event->target);
-        }
-    }
-
-    crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s "
-               CRM_XS " initiator=%s ref=%s",
-               st_event->target, st_event->result == pcmk_ok ? "" : " not",
-               st_event->action,
-               st_event->executioner ? st_event->executioner : "<anyone>",
-               (st_event->client_origin? st_event->client_origin : "<unknown>"),
-               pcmk_strerror(st_event->result),
-               st_event->origin, st_event->id);
-
-    if (st_event->result == pcmk_ok) {
-        crm_node_t *peer = crm_find_known_peer_full(0, st_event->target, CRM_GET_PEER_ANY);
-        const char *uuid = NULL;
-        gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname);
-
-        if (peer == NULL) {
-            return;
-        }
-
-        uuid = crm_peer_uuid(peer);
-
-        crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
-        if(AM_I_DC) {
-            /* The DC always sends updates */
-            send_stonith_update(NULL, st_event->target, uuid);
-
-            /* @TODO Ideally, at this point, we'd check whether the fenced node
-             * hosted any guest nodes, and call remote_node_down() for them.
-             * Unfortunately, the controller doesn't have a simple, reliable way
-             * to map hosts to guests. It might be possible to track this in the
-             * peer cache via crm_remote_peer_cache_refresh(). For now, we rely
-             * on the PE creating fence pseudo-events for the guests.
-             */
-
-            if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) {
-
-                /* Abort the current transition graph if it wasn't us
-                 * that invoked stonith to fence someone
-                 */
-                crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target);
-                abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
-            }
-
-            /* Assume it was our leader if we don't currently have one */
-        } else if (((fsa_our_dc == NULL) || safe_str_eq(fsa_our_dc, st_event->target))
-            && !is_set(peer->flags, crm_remote_node)) {
-
-            crm_notice("Target %s our leader %s (recorded: %s)",
-                       fsa_our_dc ? "was" : "may have been", st_event->target,
-                       fsa_our_dc ? fsa_our_dc : "<unset>");
-
-            /* Given the CIB resyncing that occurs around elections,
-             * have one node update the CIB now and, if the new DC is different,
-             * have them do so too after the election
-             */
-            if (we_are_executioner) {
-                send_stonith_update(NULL, st_event->target, uuid);
-            }
-            add_stonith_cleanup(st_event->target);
-        }
-
-        /* If the target is a remote node, and we host its connection,
-         * immediately fail all monitors so it can be recovered quickly.
-         * The connection won't necessarily drop when a remote node is fenced,
-         * so the failure might not otherwise be detected until the next poke.
-         */
-        if (is_set(peer->flags, crm_remote_node)) {
-            remote_ra_fail(st_event->target);
-        }
-
-        crmd_peer_down(peer, TRUE);
-     }
-}
-
-static gboolean
-do_stonith_history_sync(gpointer user_data)
-{
-    if (stonith_api && (stonith_api->state != stonith_disconnected)) {
-        stonith_history_t *history = NULL;
-
-        stonith_api->cmds->history(stonith_api,
-                                   st_opt_sync_call | st_opt_broadcast,
-                                   NULL, &history, 5);
-        stonith_history_free(history);
-        return TRUE;
-    } else {
-        crm_info("Skip triggering stonith history-sync as stonith is disconnected");
-        return FALSE;
-    }
-}
-
-static gboolean
-stonith_history_sync_set_trigger(gpointer user_data)
-{
-    mainloop_set_trigger(stonith_history_sync_trigger);
-    return FALSE;
-}
-
-void
-te_trigger_stonith_history_sync(void)
-{
-    /* trigger a sync in 5s to give more nodes the
-     * chance to show up so that we don't create
-     * unnecessary stonith-history-sync traffic
-     */
-
-    /* as we are finally checking the stonith-connection
-     * in do_stonith_history_sync we should be fine
-     * leaving stonith_history_sync_time & stonith_history_sync_trigger
-     * around
-     */
-    if (stonith_history_sync_trigger == NULL) {
-        stonith_history_sync_trigger =
-            mainloop_add_trigger(G_PRIORITY_LOW,
-                                 do_stonith_history_sync, NULL);
-    }
-
-    if(stonith_history_sync_timer == NULL) {
-        stonith_history_sync_timer =
-            mainloop_timer_add("history_sync", 5000,
-                               FALSE, stonith_history_sync_set_trigger,
-                               NULL);
-    }
-    crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
-    mainloop_timer_start(stonith_history_sync_timer);
-}
-
-/*!
- * \brief Connect to fencer
- *
- * \param[in] user_data  If NULL, retry failures now, otherwise retry in main loop
- *
- * \return TRUE
- * \note If user_data is NULL, this will wait 2s between attempts, for up to
- *       30 attempts, meaning the controller could be blocked as long as 58s.
- */
-gboolean
-te_connect_stonith(gpointer user_data)
-{
-    int rc = pcmk_ok;
-
-    if (stonith_api == NULL) {
-        stonith_api = stonith_api_new();
-    }
-
-    if (stonith_api->state != stonith_disconnected) {
-        crm_trace("Already connected to fencer, no need to retry");
-        return TRUE;
-    }
-
-    if (user_data == NULL) {
-        // Blocking (retry failures now until successful)
-        rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
-        if (rc != pcmk_ok) {
-            crm_err("Could not connect to fencer in 30 attempts: %s "
-                    CRM_XS " rc=%d", pcmk_strerror(rc), rc);
-        }
-    } else {
-        // Non-blocking (retry failures later in main loop)
-        rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
-        if (rc != pcmk_ok) {
-            if (is_set(fsa_input_register, R_ST_REQUIRED)) {
-                crm_err("Fencer connection failed (will retry): %s "
-                        CRM_XS " rc=%d", pcmk_strerror(rc), rc);
-                mainloop_set_trigger(stonith_reconnect);
-            } else {
-                crm_info("Fencer connection failed (ignoring because no longer required): %s "
-                         CRM_XS " rc=%d", pcmk_strerror(rc), rc);
-            }
-            return TRUE;
-        }
-    }
-
-    if (rc == pcmk_ok) {
-        stonith_api->cmds->register_notification(stonith_api,
-                                                 T_STONITH_NOTIFY_DISCONNECT,
-                                                 tengine_stonith_connection_destroy);
-        stonith_api->cmds->register_notification(stonith_api,
-                                                 T_STONITH_NOTIFY_FENCE,
-                                                 tengine_stonith_notify);
-    }
-    return TRUE;
-}
 
 gboolean
 stop_te_timer(crm_action_timer_t * timer)
diff --git a/daemons/controld/controld_transition.c b/daemons/controld/controld_transition.c
index 5f164ab..b942ab4 100644
--- a/daemons/controld/controld_transition.c
+++ b/daemons/controld/controld_transition.c
@@ -18,7 +18,6 @@
 
 
 extern crm_graph_functions_t te_graph_fns;
-stonith_t *stonith_api = NULL;
 
 static void
 global_cib_callback(const xmlNode * msg, int callid, int rc, xmlNode * output)
diff --git a/daemons/controld/controld_transition.h b/daemons/controld/controld_transition.h
index a162f99..f31ac2d 100644
--- a/daemons/controld/controld_transition.h
+++ b/daemons/controld/controld_transition.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
+ * Copyright 2004-2019 the Pacemaker project contributors
  *
  * This source code is licensed under the GNU Lesser General Public License
  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
@@ -12,15 +12,6 @@
 #  include <crm/common/mainloop.h>
 #  include <crm/stonith-ng.h>
 #  include <crm/services.h>
-extern stonith_t *stonith_api;
-extern void send_stonith_update(crm_action_t * stonith_action, const char *target,
-                                const char *uuid);
-
-/* stonith cleanup list */
-void add_stonith_cleanup(const char *target);
-void remove_stonith_cleanup(const char *target);
-void purge_stonith_cleanup(void);
-void execute_stonith_cleanup(void);
 
 /* tengine */
 extern crm_action_t *match_down_event(const char *target);
@@ -46,16 +37,11 @@ extern char *te_uuid;
 
 extern void notify_crmd(crm_graph_t * graph);
 
-void cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
-                         void *user_data);
 void cib_action_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
                         void *user_data);
 gboolean action_timer_callback(gpointer data);
 gboolean te_graph_trigger(gpointer user_data);
 void te_update_diff(const char *event, xmlNode *msg);
-void tengine_stonith_callback(stonith_t *stonith,
-                              stonith_callback_data_t *data);
-void update_stonith_max_attempts(const char* value);
 
 extern void trigger_graph_processing(const char *fn, int line);
 void abort_after_delay(int abort_priority, enum transition_action abort_action,
@@ -68,12 +54,7 @@ extern void abort_transition_graph(int abort_priority, enum transition_action ab
 #  define abort_transition(pri, action, text, reason)			\
 	abort_transition_graph(pri, action, text, reason,__FUNCTION__,__LINE__);
 
-extern gboolean te_connect_stonith(gpointer user_data);
-
-extern void te_trigger_stonith_history_sync(void);
-
 extern crm_trigger_t *transition_trigger;
-extern crm_trigger_t *stonith_reconnect;
 
 extern char *failed_stop_offset;
 extern char *failed_start_offset;
diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h
index 68992f5..8b80e3c 100644
--- a/daemons/controld/controld_utils.h
+++ b/daemons/controld/controld_utils.h
@@ -85,10 +85,6 @@ int crmd_join_phase_count(enum crm_join_phase phase);
 void crmd_join_phase_log(int level);
 
 const char *get_timer_desc(fsa_timer_t * timer);
-void st_fail_count_reset(const char * target);
-void st_fail_count_increment(const char *target);
-void abort_for_stonith_failure(enum transition_action abort_action,
-                               const char *target, xmlNode *reason);
 void crmd_peer_down(crm_node_t *peer, bool full);
 unsigned int cib_op_timeout(void);
 
-- 
1.8.3.1

From 3002e485651e1ad18da6d44e7672dbe4f0380d3b Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 23 May 2019 18:18:06 -0500
Subject: [PATCH] Refactor: controller: isolate stonith API handling

This allows more variables and functions to be made static.
---
 daemons/controld/controld_control.c | 28 +++------------------
 daemons/controld/controld_fencing.c | 49 ++++++++++++++++++++++++++++++++++---
 daemons/controld/controld_fencing.h |  7 ++----
 3 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c
index 7f918c0..e99d605 100644
--- a/daemons/controld/controld_control.c
+++ b/daemons/controld/controld_control.c
@@ -113,14 +113,7 @@ do_shutdown(long long action,
 {
     /* just in case */
     set_bit(fsa_input_register, R_SHUTDOWN);
-
-    if (stonith_api) {
-        /* Prevent it from coming up again */
-        clear_bit(fsa_input_register, R_ST_REQUIRED);
-
-        crm_info("Disconnecting from fencer");
-        stonith_api->cmds->disconnect(stonith_api);
-    }
+    controld_disconnect_fencer(FALSE);
 }
 
 /*	 A_SHUTDOWN_REQ	*/
@@ -201,12 +194,7 @@ crmd_exit(crm_exit_t exit_code)
 
     controld_close_attrd_ipc();
     pe_subsystem_free();
-
-    if(stonith_api) {
-        crm_trace("Disconnecting fencing API");
-        clear_bit(fsa_input_register, R_ST_REQUIRED);
-        stonith_api->cmds->free(stonith_api); stonith_api = NULL;
-    }
+    controld_disconnect_fencer(TRUE);
 
     if ((exit_code == CRM_EX_OK) && (crmd_mainloop == NULL)) {
         crm_debug("No mainloop detected");
@@ -258,7 +246,6 @@ crmd_exit(crm_exit_t exit_code)
     mainloop_destroy_trigger(fsa_source); fsa_source = NULL;
 
     mainloop_destroy_trigger(config_read); config_read = NULL;
-    mainloop_destroy_trigger(stonith_reconnect); stonith_reconnect = NULL;
     mainloop_destroy_trigger(transition_trigger); transition_trigger = NULL;
 
     crm_client_cleanup();
@@ -288,7 +275,6 @@ crmd_exit(crm_exit_t exit_code)
     free(fsa_cluster_name); fsa_cluster_name = NULL;
 
     free(te_uuid); te_uuid = NULL;
-    free(te_client_id); te_client_id = NULL;
     free(fsa_pe_ref); fsa_pe_ref = NULL;
     free(failed_stop_offset); failed_stop_offset = NULL;
     free(failed_start_offset); failed_start_offset = NULL;
@@ -627,15 +613,7 @@ do_started(long long action,
         crm_err("Failed to create IPC server: shutting down and inhibiting respawn");
         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
     }
-
-    // Try connecting to fencer (retrying later in mainloop if failed)
-    if (stonith_reconnect == NULL) {
-        stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
-                                                 te_connect_stonith,
-                                                 GINT_TO_POINTER(TRUE));
-    }
-    set_bit(fsa_input_register, R_ST_REQUIRED);
-    mainloop_set_trigger(stonith_reconnect);
+    controld_trigger_fencer_connect();
 
     crm_notice("Pacemaker controller successfully started and accepting connections");
     clear_bit(fsa_input_register, R_STARTING);
diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
index cde57b5..92336e9 100644
--- a/daemons/controld/controld_fencing.c
+++ b/daemons/controld/controld_fencing.c
@@ -341,9 +341,9 @@ execute_stonith_cleanup()
  * Functions that need to interact directly with the fencer via its API
  */
 
-stonith_t *stonith_api = NULL;
-crm_trigger_t *stonith_reconnect = NULL;
-char *te_client_id = NULL;
+static stonith_t *stonith_api = NULL;
+static crm_trigger_t *stonith_reconnect = NULL;
+static char *te_client_id = NULL;
 
 static gboolean
 fail_incompletable_stonith(crm_graph_t *graph)
@@ -571,7 +571,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
  * \note If user_data is NULL, this will wait 2s between attempts, for up to
  *       30 attempts, meaning the controller could be blocked as long as 58s.
  */
-gboolean
+static gboolean
 te_connect_stonith(gpointer user_data)
 {
     int rc = pcmk_ok;
@@ -619,6 +619,47 @@ te_connect_stonith(gpointer user_data)
     return TRUE;
 }
 
+/*!
+ * \internal
+ * \brief Mark fencer connection as required and schedule an attempt in main loop
+ */
+void
+controld_trigger_fencer_connect()
+{
+    if (stonith_reconnect == NULL) { // first call: create the trigger once
+        stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
+                                                 te_connect_stonith,
+                                                 GINT_TO_POINTER(TRUE));
+    }
+    set_bit(fsa_input_register, R_ST_REQUIRED); // cleared by controld_disconnect_fencer()
+    mainloop_set_trigger(stonith_reconnect); // te_connect_stonith is the trigger callback
+}
+
+// Disconnect fencer API (if allocated); with destroy, also free it, the trigger and client ID
+void
+controld_disconnect_fencer(bool destroy)
+{
+    if (stonith_api) {
+        // Prevent fencer connection from coming up again
+        clear_bit(fsa_input_register, R_ST_REQUIRED);
+        stonith_api->cmds->disconnect(stonith_api);
+    }
+    if (destroy) { // full teardown of this file's static fencer state
+        if (stonith_api) {
+            stonith_api->cmds->free(stonith_api);
+            stonith_api = NULL;
+        }
+        if (stonith_reconnect) {
+            mainloop_destroy_trigger(stonith_reconnect);
+            stonith_reconnect = NULL; // a later trigger request would create a new one
+        }
+        if (te_client_id) {
+            free(te_client_id);
+            te_client_id = NULL;
+        }
+    }
+}
+
 static gboolean
 do_stonith_history_sync(gpointer user_data)
 {
diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h
index b80a6c9..3ef537f 100644
--- a/daemons/controld/controld_fencing.h
+++ b/daemons/controld/controld_fencing.h
@@ -13,16 +13,13 @@
 #include <stdbool.h>                // bool
 #include <crm/transition.h>         // crm_graph_t, crm_action_t
 
-extern crm_trigger_t *stonith_reconnect;
-extern char *te_client_id;
-extern stonith_t *stonith_api;
-
 // stonith fail counts
 void st_fail_count_reset(const char * target);
 void update_stonith_max_attempts(const char* value);
 
 // stonith API client
-gboolean te_connect_stonith(gpointer user_data);
+void controld_trigger_fencer_connect(void);
+void controld_disconnect_fencer(bool destroy);
 gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action);
 
 // stonith cleanup list
-- 
1.8.3.1