Blob Blame History Raw
From bc91cc5d8b4257627d09103cf676cd83656bda8c Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 12 Jan 2021 10:45:53 -0500
Subject: [PATCH 01/11] Refactor: tools: Split up connection teardown in
 crm_mon.

We don't always want to tear down the fencing and CIB connections at
the same time.  The new helper functions can then immediately be used in
mon_refresh_display and do_mon_cib_connection_destroy.
---
 tools/crm_mon.c | 57 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index 8ec97bb..fc20e4c 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -122,7 +122,8 @@ struct {
     .mon_ops = mon_op_default
 };
 
-static void clean_up_connections(void);
+static void clean_up_cib_connection(void);
+static void clean_up_fencing_connection(void);
 static crm_exit_t clean_up(crm_exit_t exit_code);
 static void crm_diff_update(const char *event, xmlNode * msg);
 static int mon_refresh_display(gpointer user_data);
@@ -712,12 +713,7 @@ do_mon_cib_connection_destroy(gpointer user_data, bool is_error)
         /* the client API won't properly reconnect notifications
          * if they are still in the table - so remove them
          */
-        st->cmds->remove_notification(st, T_STONITH_NOTIFY_DISCONNECT);
-        st->cmds->remove_notification(st, T_STONITH_NOTIFY_FENCE);
-        st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY);
-        if (st->state != stonith_disconnected) {
-            st->cmds->disconnect(st);
-        }
+        clean_up_fencing_connection();
     }
     if (cib) {
         cib->cmds->signoff(cib);
@@ -851,7 +847,8 @@ cib_connect(gboolean full)
 
         if (rc != pcmk_ok) {
             out->err(out, "Notification setup failed, could not monitor CIB actions");
-            clean_up_connections();
+            clean_up_cib_connection();
+            clean_up_fencing_connection();
         }
     }
     return rc;
@@ -1866,9 +1863,7 @@ mon_refresh_display(gpointer user_data)
     last_refresh = time(NULL);
 
     if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) {
-        if (cib) {
-            cib->cmds->signoff(cib);
-        }
+        clean_up_cib_connection();
         out->err(out, "Upgrade failed: %s", pcmk_strerror(-pcmk_err_schema_validation));
         clean_up(CRM_EX_CONFIG);
         return 0;
@@ -2040,24 +2035,33 @@ mon_st_callback_display(stonith_t * st, stonith_event_t * e)
 }
 
 static void
-clean_up_connections(void)
+clean_up_cib_connection(void)
 {
-    if (cib != NULL) {
-        cib->cmds->signoff(cib);
-        cib_delete(cib);
-        cib = NULL;
+    if (cib == NULL) {
+        return;
     }
 
-    if (st != NULL) {
-        if (st->state != stonith_disconnected) {
-            st->cmds->remove_notification(st, T_STONITH_NOTIFY_DISCONNECT);
-            st->cmds->remove_notification(st, T_STONITH_NOTIFY_FENCE);
-            st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY);
-            st->cmds->disconnect(st);
-        }
-        stonith_api_delete(st);
-        st = NULL;
+    cib->cmds->signoff(cib);
+    cib_delete(cib);
+    cib = NULL;
+}
+
+static void
+clean_up_fencing_connection(void)
+{
+    if (st == NULL) {
+        return;
     }
+
+    if (st->state != stonith_disconnected) {
+        st->cmds->remove_notification(st, T_STONITH_NOTIFY_DISCONNECT);
+        st->cmds->remove_notification(st, T_STONITH_NOTIFY_FENCE);
+        st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY);
+        st->cmds->disconnect(st);
+    }
+
+    stonith_api_delete(st);
+    st = NULL;
 }
 
 /*
@@ -2074,7 +2078,8 @@ clean_up(crm_exit_t exit_code)
     /* Quitting crm_mon is much more complicated than it ought to be. */
 
     /* (1) Close connections, free things, etc. */
-    clean_up_connections();
+    clean_up_cib_connection();
+    clean_up_fencing_connection();
     free(options.pid_file);
     free(options.neg_location_prefix);
     g_slist_free_full(options.includes_excludes, free);
-- 
1.8.3.1


From 28d646ce67c6a933eaa76aca51f9973a65d0ee3c Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Thu, 7 Jan 2021 17:18:13 -0500
Subject: [PATCH 02/11] Refactor: tools: Split up connection establishment in
 crm_mon.

We don't always want to connect to the CIB and fencing in the same
action.  Note that bringing up the fencing connection needs to
happen first, because mon_refresh_display is called from cib_connect and
it will want a fencing connection.
---
 tools/crm_mon.c | 66 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 28 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index fc20e4c..301a222 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -128,6 +128,7 @@ static crm_exit_t clean_up(crm_exit_t exit_code);
 static void crm_diff_update(const char *event, xmlNode * msg);
 static int mon_refresh_display(gpointer user_data);
 static int cib_connect(gboolean full);
+static int fencing_connect(void);
 static void mon_st_callback_event(stonith_t * st, stonith_event_t * e);
 static void mon_st_callback_display(stonith_t * st, stonith_event_t * e);
 static void kick_refresh(gboolean data_updated);
@@ -668,8 +669,6 @@ static GOptionEntry deprecated_entries[] = {
 static gboolean
 mon_timer_popped(gpointer data)
 {
-    int rc = pcmk_ok;
-
 #if CURSES_ENABLED
     if (output_format == mon_output_console) {
         clear();
@@ -683,9 +682,7 @@ mon_timer_popped(gpointer data)
     }
 
     print_as(output_format, "Reconnecting...\n");
-    rc = cib_connect(TRUE);
-
-    if (rc != pcmk_ok) {
+    if (fencing_connect() == pcmk_ok && cib_connect(TRUE) == pcmk_ok) {
         timer_id = g_timeout_add(options.reconnect_msec, mon_timer_popped, NULL);
     }
     return FALSE;
@@ -767,39 +764,48 @@ mon_winresize(int nsig)
 #endif
 
 static int
-cib_connect(gboolean full)
+fencing_connect(void)
 {
     int rc = pcmk_ok;
-    static gboolean need_pass = TRUE;
-
-    CRM_CHECK(cib != NULL, return -EINVAL);
-
-    if (getenv("CIB_passwd") != NULL) {
-        need_pass = FALSE;
-    }
 
     if (pcmk_is_set(options.mon_ops, mon_op_fence_connect) && (st == NULL)) {
         st = stonith_api_new();
     }
 
-    if (pcmk_is_set(options.mon_ops, mon_op_fence_connect)
-        && (st != NULL) && (st->state == stonith_disconnected)) {
+    if (!pcmk_is_set(options.mon_ops, mon_op_fence_connect) ||
+        st == NULL || st->state != stonith_disconnected) {
+        return rc;
+    }
 
-        rc = st->cmds->connect(st, crm_system_name, NULL);
-        if (rc == pcmk_ok) {
-            crm_trace("Setting up stonith callbacks");
-            if (pcmk_is_set(options.mon_ops, mon_op_watch_fencing)) {
-                st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT,
-                                                mon_st_callback_event);
-                st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, mon_st_callback_event);
-            } else {
-                st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT,
-                                                mon_st_callback_display);
-                st->cmds->register_notification(st, T_STONITH_NOTIFY_HISTORY, mon_st_callback_display);
-            }
+    rc = st->cmds->connect(st, crm_system_name, NULL);
+    if (rc == pcmk_ok) {
+        crm_trace("Setting up stonith callbacks");
+        if (pcmk_is_set(options.mon_ops, mon_op_watch_fencing)) {
+            st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT,
+                                            mon_st_callback_event);
+            st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, mon_st_callback_event);
+        } else {
+            st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT,
+                                            mon_st_callback_display);
+            st->cmds->register_notification(st, T_STONITH_NOTIFY_HISTORY, mon_st_callback_display);
         }
     }
 
+    return rc;
+}
+
+static int
+cib_connect(gboolean full)
+{
+    int rc = pcmk_ok;
+    static gboolean need_pass = TRUE;
+
+    CRM_CHECK(cib != NULL, return -EINVAL);
+
+    if (getenv("CIB_passwd") != NULL) {
+        need_pass = FALSE;
+    }
+
     if (cib->state == cib_connected_query || cib->state == cib_connected_command) {
         return rc;
     }
@@ -1373,7 +1379,11 @@ main(int argc, char **argv)
         if (!pcmk_is_set(options.mon_ops, mon_op_one_shot)) {
             print_as(output_format ,"Waiting until cluster is available on this node ...\n");
         }
-        rc = cib_connect(!pcmk_is_set(options.mon_ops, mon_op_one_shot));
+
+        rc = fencing_connect();
+        if (rc == pcmk_ok) {
+            rc = cib_connect(!pcmk_is_set(options.mon_ops, mon_op_one_shot));
+        }
 
         if (pcmk_is_set(options.mon_ops, mon_op_one_shot)) {
             break;
-- 
1.8.3.1


From e12508ffba06b1c5652e7f49a449aae6d89ec420 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 12 Jan 2021 17:01:53 -0500
Subject: [PATCH 03/11] Refactor: tools: Split one shot mode out into its own
 function.

The connection error handling can also be split out into its own
function, which allows it to be reused in both the one-shot and loop
cases.
---
 tools/crm_mon.c | 69 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index 301a222..b33598b 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -1162,6 +1162,41 @@ reconcile_output_format(pcmk__common_args_t *args) {
     }
 }
 
+static void
+handle_connection_failures(int rc)
+{
+    if (rc == pcmk_ok) {
+        return;
+    }
+
+    if (output_format == mon_output_monitor) {
+        g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "CLUSTER CRIT: Connection to cluster failed: %s",
+                    pcmk_strerror(rc));
+        rc = MON_STATUS_CRIT;
+    } else if (rc == -ENOTCONN) {
+        g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "Error: cluster is not available on this node");
+        rc = crm_errno2exit(rc);
+    } else {
+        g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "Connection to cluster failed: %s", pcmk_strerror(rc));
+        rc = crm_errno2exit(rc);
+    }
+
+    clean_up(rc);
+}
+
+static void
+one_shot()
+{
+    int rc = fencing_connect();
+
+    if (rc == pcmk_rc_ok) {
+        rc = cib_connect(FALSE);
+        handle_connection_failures(rc);
+    }
+
+    clean_up(CRM_EX_OK);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -1375,20 +1410,19 @@ main(int argc, char **argv)
 
     crm_info("Starting %s", crm_system_name);
 
+    if (pcmk_is_set(options.mon_ops, mon_op_one_shot)) {
+        one_shot();
+    }
+
     do {
-        if (!pcmk_is_set(options.mon_ops, mon_op_one_shot)) {
-            print_as(output_format ,"Waiting until cluster is available on this node ...\n");
-        }
+        print_as(output_format ,"Waiting until cluster is available on this node ...\n");
 
         rc = fencing_connect();
         if (rc == pcmk_ok) {
-            rc = cib_connect(!pcmk_is_set(options.mon_ops, mon_op_one_shot));
+            rc = cib_connect(TRUE);
         }
 
-        if (pcmk_is_set(options.mon_ops, mon_op_one_shot)) {
-            break;
-
-        } else if (rc != pcmk_ok) {
+        if (rc != pcmk_ok) {
             sleep(options.reconnect_msec / 1000);
 #if CURSES_ENABLED
             if (output_format == mon_output_console) {
@@ -1402,24 +1436,7 @@ main(int argc, char **argv)
 
     } while (rc == -ENOTCONN);
 
-    if (rc != pcmk_ok) {
-        if (output_format == mon_output_monitor) {
-            g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "CLUSTER CRIT: Connection to cluster failed: %s",
-                        pcmk_strerror(rc));
-            return clean_up(MON_STATUS_CRIT);
-        } else {
-            if (rc == -ENOTCONN) {
-                g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "Error: cluster is not available on this node");
-            } else {
-                g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "Connection to cluster failed: %s", pcmk_strerror(rc));
-            }
-        }
-        return clean_up(crm_errno2exit(rc));
-    }
-
-    if (pcmk_is_set(options.mon_ops, mon_op_one_shot)) {
-        return clean_up(CRM_EX_OK);
-    }
+    handle_connection_failures(rc);
 
     mainloop = g_main_loop_new(NULL, FALSE);
 
-- 
1.8.3.1


From 0eb307a19d57d4a59a4b51a64a3b62dcd0b7cc9a Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 13 Jan 2021 12:47:41 -0500
Subject: [PATCH 04/11] Refactor: tools: Don't call mon_refresh_display from
 cib_connect.

---
 tools/crm_mon.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index b33598b..b0daf76 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -683,6 +683,7 @@ mon_timer_popped(gpointer data)
 
     print_as(output_format, "Reconnecting...\n");
     if (fencing_connect() == pcmk_ok && cib_connect(TRUE) == pcmk_ok) {
+        mon_refresh_display(NULL);
         timer_id = g_timeout_add(options.reconnect_msec, mon_timer_popped, NULL);
     }
     return FALSE;
@@ -831,9 +832,6 @@ cib_connect(gboolean full)
     }
 
     rc = cib->cmds->query(cib, NULL, &current_cib, cib_scope_local | cib_sync_call);
-    if (rc == pcmk_ok) {
-        mon_refresh_display(NULL);
-    }
 
     if (rc == pcmk_ok && full) {
         rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy_regular);
@@ -1192,6 +1190,7 @@ one_shot()
     if (rc == pcmk_rc_ok) {
         rc = cib_connect(FALSE);
         handle_connection_failures(rc);
+        mon_refresh_display(NULL);
     }
 
     clean_up(CRM_EX_OK);
@@ -1437,6 +1436,7 @@ main(int argc, char **argv)
     } while (rc == -ENOTCONN);
 
     handle_connection_failures(rc);
+    mon_refresh_display(NULL);
 
     mainloop = g_main_loop_new(NULL, FALSE);
 
-- 
1.8.3.1


From 46696d3135e699c58918e41c93c357d951146d5c Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 13 Jan 2021 13:52:49 -0500
Subject: [PATCH 05/11] Fix: tools: Report if getting fencing history failed in
 crm_mon.

This just takes history_rc into account in the text and html formatters.
It was already used by the XML formatter.  If we can't get fencing
history, add a message to the output indicating that happened.
---
 tools/crm_mon.c       | 13 +++++----
 tools/crm_mon.h       | 12 ++++-----
 tools/crm_mon_print.c | 74 ++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 70 insertions(+), 29 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index b0daf76..1a68555 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -1943,7 +1943,8 @@ mon_refresh_display(gpointer user_data)
     switch (output_format) {
         case mon_output_html:
         case mon_output_cgi:
-            if (print_html_status(out, mon_data_set, stonith_history, options.mon_ops,
+            if (print_html_status(out, mon_data_set, crm_errno2exit(history_rc),
+                                  stonith_history, options.mon_ops,
                                   show, options.neg_location_prefix,
                                   options.only_node, options.only_rsc) != 0) {
                 g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_CANTCREAT, "Critical: Unable to output html file");
@@ -1974,15 +1975,17 @@ mon_refresh_display(gpointer user_data)
              */
 #if CURSES_ENABLED
             blank_screen();
-            print_status(out, mon_data_set, stonith_history, options.mon_ops, show,
-                         options.neg_location_prefix, options.only_node, options.only_rsc);
+            print_status(out, mon_data_set, crm_errno2exit(history_rc), stonith_history,
+                         options.mon_ops, show, options.neg_location_prefix,
+                         options.only_node, options.only_rsc);
             refresh();
             break;
 #endif
 
         case mon_output_plain:
-            print_status(out, mon_data_set, stonith_history, options.mon_ops, show,
-                         options.neg_location_prefix, options.only_node, options.only_rsc);
+            print_status(out, mon_data_set, crm_errno2exit(history_rc), stonith_history,
+                         options.mon_ops, show, options.neg_location_prefix,
+                         options.only_node, options.only_rsc);
             break;
 
         case mon_output_unset:
diff --git a/tools/crm_mon.h b/tools/crm_mon.h
index f746507..73c926d 100644
--- a/tools/crm_mon.h
+++ b/tools/crm_mon.h
@@ -95,17 +95,17 @@ typedef enum mon_output_format_e {
 #define mon_op_default              (mon_op_print_pending | mon_op_fence_history | mon_op_fence_connect)
 
 void print_status(pcmk__output_t *out, pe_working_set_t *data_set,
-                  stonith_history_t *stonith_history, unsigned int mon_ops,
-                  unsigned int show, char *prefix, char *only_node,
-                  char *only_rsc);
+                  crm_exit_t history_rc, stonith_history_t *stonith_history,
+                  unsigned int mon_ops, unsigned int show, char *prefix,
+                  char *only_node, char *only_rsc);
 void print_xml_status(pcmk__output_t *out, pe_working_set_t *data_set,
                       crm_exit_t history_rc, stonith_history_t *stonith_history,
                       unsigned int mon_ops, unsigned int show, char *prefix,
                       char *only_node, char *only_rsc);
 int print_html_status(pcmk__output_t *out, pe_working_set_t *data_set,
-                      stonith_history_t *stonith_history, unsigned int mon_ops,
-                      unsigned int show, char *prefix, char *only_node,
-                      char *only_rsc);
+                      crm_exit_t history_rc, stonith_history_t *stonith_history,
+                      unsigned int mon_ops, unsigned int show, char *prefix,
+                      char *only_node, char *only_rsc);
 
 GList *append_attr_list(GList *attr_list, char *name);
 void blank_screen(void);
diff --git a/tools/crm_mon_print.c b/tools/crm_mon_print.c
index 8ae11bf..73406bd 100644
--- a/tools/crm_mon_print.c
+++ b/tools/crm_mon_print.c
@@ -656,6 +656,7 @@ print_failed_actions(pcmk__output_t *out, pe_working_set_t *data_set,
  *
  * \param[in] out             The output functions structure.
  * \param[in] data_set        Cluster state to display.
+ * \param[in] history_rc      Result of getting stonith history
  * \param[in] stonith_history List of stonith actions.
  * \param[in] mon_ops         Bitmask of mon_op_*.
  * \param[in] show            Bitmask of mon_show_*.
@@ -663,14 +664,16 @@ print_failed_actions(pcmk__output_t *out, pe_working_set_t *data_set,
  */
 void
 print_status(pcmk__output_t *out, pe_working_set_t *data_set,
-             stonith_history_t *stonith_history, unsigned int mon_ops,
-             unsigned int show, char *prefix, char *only_node, char *only_rsc)
+             crm_exit_t history_rc, stonith_history_t *stonith_history,
+             unsigned int mon_ops, unsigned int show, char *prefix,
+             char *only_node, char *only_rsc)
 {
     GListPtr unames = NULL;
     GListPtr resources = NULL;
 
     unsigned int print_opts = get_resource_display_options(mon_ops);
     int rc = pcmk_rc_no_output;
+    bool already_printed_failure = false;
 
     CHECK_RC(rc, out->message(out, "cluster-summary", data_set,
                               pcmk_is_set(mon_ops, mon_op_print_clone_detail),
@@ -731,13 +734,23 @@ print_status(pcmk__output_t *out, pe_working_set_t *data_set,
     if (pcmk_is_set(show, mon_show_fence_failed)
         && pcmk_is_set(mon_ops, mon_op_fence_history)) {
 
-        stonith_history_t *hp = stonith__first_matching_event(stonith_history, stonith__event_state_eq,
-                                                              GINT_TO_POINTER(st_failed));
+        if (history_rc == 0) {
+            stonith_history_t *hp = stonith__first_matching_event(stonith_history, stonith__event_state_eq,
+                                                                  GINT_TO_POINTER(st_failed));
+
+            if (hp) {
+                CHECK_RC(rc, out->message(out, "failed-fencing-list", stonith_history, unames,
+                                          pcmk_is_set(mon_ops, mon_op_fence_full_history),
+                                          rc == pcmk_rc_ok));
+            }
+        } else {
+            PCMK__OUTPUT_SPACER_IF(out, rc == pcmk_rc_ok);
+            out->begin_list(out, NULL, NULL, "Failed Fencing Actions");
+            out->list_item(out, NULL, "Failed to get fencing history: %s",
+                           crm_exit_str(history_rc));
+            out->end_list(out);
 
-        if (hp) {
-            CHECK_RC(rc, out->message(out, "failed-fencing-list", stonith_history, unames,
-                                      pcmk_is_set(mon_ops, mon_op_fence_full_history),
-                                      rc == pcmk_rc_ok));
+            already_printed_failure = true;
         }
     }
 
@@ -754,7 +767,15 @@ print_status(pcmk__output_t *out, pe_working_set_t *data_set,
 
     /* Print stonith history */
     if (pcmk_is_set(mon_ops, mon_op_fence_history)) {
-        if (pcmk_is_set(show, mon_show_fence_worked)) {
+        if (history_rc != 0) {
+            if (!already_printed_failure) {
+                PCMK__OUTPUT_SPACER_IF(out, rc == pcmk_rc_ok);
+                out->begin_list(out, NULL, NULL, "Failed Fencing Actions");
+                out->list_item(out, NULL, "Failed to get fencing history: %s",
+                               crm_exit_str(history_rc));
+                out->end_list(out);
+            }
+        } else if (pcmk_is_set(show, mon_show_fence_worked)) {
             stonith_history_t *hp = stonith__first_matching_event(stonith_history, stonith__event_state_neq,
                                                                   GINT_TO_POINTER(st_failed));
 
@@ -783,6 +804,7 @@ print_status(pcmk__output_t *out, pe_working_set_t *data_set,
  *
  * \param[in] out             The output functions structure.
  * \param[in] data_set        Cluster state to display.
+ * \param[in] history_rc      Result of getting stonith history
  * \param[in] stonith_history List of stonith actions.
  * \param[in] mon_ops         Bitmask of mon_op_*.
  * \param[in] show            Bitmask of mon_show_*.
@@ -878,6 +900,7 @@ print_xml_status(pcmk__output_t *out, pe_working_set_t *data_set,
  *
  * \param[in] out             The output functions structure.
  * \param[in] data_set        Cluster state to display.
+ * \param[in] history_rc      Result of getting stonith history
  * \param[in] stonith_history List of stonith actions.
  * \param[in] mon_ops         Bitmask of mon_op_*.
  * \param[in] show            Bitmask of mon_show_*.
@@ -885,14 +908,15 @@ print_xml_status(pcmk__output_t *out, pe_working_set_t *data_set,
  */
 int
 print_html_status(pcmk__output_t *out, pe_working_set_t *data_set,
-                  stonith_history_t *stonith_history, unsigned int mon_ops,
-                  unsigned int show, char *prefix, char *only_node,
-                  char *only_rsc)
+                  crm_exit_t history_rc, stonith_history_t *stonith_history,
+                  unsigned int mon_ops, unsigned int show, char *prefix,
+                  char *only_node, char *only_rsc)
 {
     GListPtr unames = NULL;
     GListPtr resources = NULL;
 
     unsigned int print_opts = get_resource_display_options(mon_ops);
+    bool already_printed_failure = false;
 
     out->message(out, "cluster-summary", data_set,
                  pcmk_is_set(mon_ops, mon_op_print_clone_detail),
@@ -950,18 +974,32 @@ print_html_status(pcmk__output_t *out, pe_working_set_t *data_set,
     if (pcmk_is_set(show, mon_show_fence_failed)
         && pcmk_is_set(mon_ops, mon_op_fence_history)) {
 
-        stonith_history_t *hp = stonith__first_matching_event(stonith_history, stonith__event_state_eq,
-                                                              GINT_TO_POINTER(st_failed));
+        if (history_rc == 0) {
+            stonith_history_t *hp = stonith__first_matching_event(stonith_history, stonith__event_state_eq,
+                                                                  GINT_TO_POINTER(st_failed));
 
-        if (hp) {
-            out->message(out, "failed-fencing-list", stonith_history, unames,
-                         pcmk_is_set(mon_ops, mon_op_fence_full_history), FALSE);
+            if (hp) {
+                out->message(out, "failed-fencing-list", stonith_history, unames,
+                             pcmk_is_set(mon_ops, mon_op_fence_full_history), FALSE);
+            }
+        } else {
+            out->begin_list(out, NULL, NULL, "Failed Fencing Actions");
+            out->list_item(out, NULL, "Failed to get fencing history: %s",
+                           crm_exit_str(history_rc));
+            out->end_list(out);
         }
     }
 
     /* Print stonith history */
     if (pcmk_is_set(mon_ops, mon_op_fence_history)) {
-        if (pcmk_is_set(show, mon_show_fence_worked)) {
+        if (history_rc != 0) {
+            if (!already_printed_failure) {
+                out->begin_list(out, NULL, NULL, "Failed Fencing Actions");
+                out->list_item(out, NULL, "Failed to get fencing history: %s",
+                               crm_exit_str(history_rc));
+                out->end_list(out);
+            }
+        } else if (pcmk_is_set(show, mon_show_fence_worked)) {
             stonith_history_t *hp = stonith__first_matching_event(stonith_history, stonith__event_state_neq,
                                                                   GINT_TO_POINTER(st_failed));
 
-- 
1.8.3.1


From 2e391be6fdbbbccd6aef49b3f109e5c342eb5dcc Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Mon, 11 Jan 2021 12:54:40 -0500
Subject: [PATCH 06/11] Fix: tools: A lack of stonith history is not fatal in
 crm_mon.

Instead, print out all the rest of the typical output.  This should also
include an error message in the fencing section, if that section was
requested.

See: rhbz#1880426
---
 tools/crm_mon.c | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index 1a68555..17b8ee9 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -682,7 +682,8 @@ mon_timer_popped(gpointer data)
     }
 
     print_as(output_format, "Reconnecting...\n");
-    if (fencing_connect() == pcmk_ok && cib_connect(TRUE) == pcmk_ok) {
+    fencing_connect();
+    if (cib_connect(TRUE) == pcmk_ok) {
         mon_refresh_display(NULL);
         timer_id = g_timeout_add(options.reconnect_msec, mon_timer_popped, NULL);
     }
@@ -726,12 +727,6 @@ mon_cib_connection_destroy_regular(gpointer user_data)
     do_mon_cib_connection_destroy(user_data, false);
 }
 
-static void
-mon_cib_connection_destroy_error(gpointer user_data)
-{
-    do_mon_cib_connection_destroy(user_data, true);
-}
-
 /*
  * Mainloop signal handler.
  */
@@ -790,6 +785,8 @@ fencing_connect(void)
                                             mon_st_callback_display);
             st->cmds->register_notification(st, T_STONITH_NOTIFY_HISTORY, mon_st_callback_display);
         }
+    } else {
+        st = NULL;
     }
 
     return rc;
@@ -1185,12 +1182,15 @@ handle_connection_failures(int rc)
 static void
 one_shot()
 {
-    int rc = fencing_connect();
+    int rc;
+
+    fencing_connect();
 
+    rc = cib_connect(FALSE);
     if (rc == pcmk_rc_ok) {
-        rc = cib_connect(FALSE);
-        handle_connection_failures(rc);
         mon_refresh_display(NULL);
+    } else {
+        handle_connection_failures(rc);
     }
 
     clean_up(CRM_EX_OK);
@@ -1416,10 +1416,8 @@ main(int argc, char **argv)
     do {
         print_as(output_format ,"Waiting until cluster is available on this node ...\n");
 
-        rc = fencing_connect();
-        if (rc == pcmk_ok) {
-            rc = cib_connect(TRUE);
-        }
+        fencing_connect();
+        rc = cib_connect(TRUE);
 
         if (rc != pcmk_ok) {
             sleep(options.reconnect_msec / 1000);
@@ -1896,16 +1894,12 @@ mon_refresh_display(gpointer user_data)
         return 0;
     }
 
-    /* get the stonith-history if there is evidence we need it
-     */
+    /* get the stonith-history if there is evidence we need it */
     while (pcmk_is_set(options.mon_ops, mon_op_fence_history)) {
         if (st != NULL) {
             history_rc = st->cmds->history(st, st_opt_sync_call, NULL, &stonith_history, 120);
 
-            if (history_rc != 0) {
-                out->err(out, "Critical: Unable to get stonith-history");
-                mon_cib_connection_destroy_error(NULL);
-            } else {
+            if (history_rc == 0) {
                 stonith_history = stonith__sort_history(stonith_history);
                 if (!pcmk_is_set(options.mon_ops, mon_op_fence_full_history)
                     && (output_format != mon_output_xml)) {
@@ -1915,11 +1909,9 @@ mon_refresh_display(gpointer user_data)
                 break; /* all other cases are errors */
             }
         } else {
-            out->err(out, "Critical: No stonith-API");
+            history_rc = ENOTCONN;
+            break;
         }
-        free_xml(cib_copy);
-        out->err(out, "Reading stonith-history failed");
-        return 0;
     }
 
     if (mon_data_set == NULL) {
-- 
1.8.3.1


From 8abcb2bf0c5c90004a687e27aa86fd6ad1b62eb3 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Thu, 14 Jan 2021 14:31:25 -0500
Subject: [PATCH 07/11] Refactor: Split the fencing history code into its own
 function.

---
 tools/crm_mon.c | 46 ++++++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index 17b8ee9..1baba5f 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -1879,6 +1879,33 @@ crm_diff_update(const char *event, xmlNode * msg)
 }
 
 static int
+get_fencing_history(stonith_history_t **stonith_history)
+{
+    int rc = 0;
+
+    while (pcmk_is_set(options.mon_ops, mon_op_fence_history)) {
+        if (st != NULL) {
+            rc = st->cmds->history(st, st_opt_sync_call, NULL, stonith_history, 120);
+
+            if (rc == 0) {
+                *stonith_history = stonith__sort_history(*stonith_history);
+                if (!pcmk_is_set(options.mon_ops, mon_op_fence_full_history)
+                    && (output_format != mon_output_xml)) {
+
+                    *stonith_history = pcmk__reduce_fence_history(*stonith_history);
+                }
+                break; /* all other cases are errors */
+            }
+        } else {
+            rc = ENOTCONN;
+            break;
+        }
+    }
+
+    return rc;
+}
+
+static int
 mon_refresh_display(gpointer user_data)
 {
     xmlNode *cib_copy = copy_xml(current_cib);
@@ -1895,24 +1922,7 @@ mon_refresh_display(gpointer user_data)
     }
 
     /* get the stonith-history if there is evidence we need it */
-    while (pcmk_is_set(options.mon_ops, mon_op_fence_history)) {
-        if (st != NULL) {
-            history_rc = st->cmds->history(st, st_opt_sync_call, NULL, &stonith_history, 120);
-
-            if (history_rc == 0) {
-                stonith_history = stonith__sort_history(stonith_history);
-                if (!pcmk_is_set(options.mon_ops, mon_op_fence_full_history)
-                    && (output_format != mon_output_xml)) {
-
-                    stonith_history = pcmk__reduce_fence_history(stonith_history);
-                }
-                break; /* all other cases are errors */
-            }
-        } else {
-            history_rc = ENOTCONN;
-            break;
-        }
-    }
+    history_rc = get_fencing_history(&stonith_history);
 
     if (mon_data_set == NULL) {
         mon_data_set = pe_new_working_set();
-- 
1.8.3.1


From fa75e884e3c3822e1010ad1d67958e4f1cc5400b Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Thu, 14 Jan 2021 14:49:09 -0500
Subject: [PATCH 08/11] Refactor: tools: Get rid of
 mon_cib_connection_destroy_regular.

With the _error version removed in a previous commit, there's no need
for this wrapper to exist anymore.  We can just call
mon_cib_connection_destroy directly.
---
 tools/crm_mon.c | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index 1baba5f..a0764a5 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -691,13 +691,9 @@ mon_timer_popped(gpointer data)
 }
 
 static void
-do_mon_cib_connection_destroy(gpointer user_data, bool is_error)
+mon_cib_connection_destroy(gpointer user_data)
 {
-    if (is_error) {
-        out->err(out, "Connection to the cluster-daemons terminated");
-    } else {
-        out->info(out, "Connection to the cluster-daemons terminated");
-    }
+    out->info(out, "Connection to the cluster-daemons terminated");
 
     if (refresh_timer != NULL) {
         /* we'll trigger a refresh after reconnect */
@@ -721,12 +717,6 @@ do_mon_cib_connection_destroy(gpointer user_data, bool is_error)
     return;
 }
 
-static void
-mon_cib_connection_destroy_regular(gpointer user_data)
-{
-    do_mon_cib_connection_destroy(user_data, false);
-}
-
 /*
  * Mainloop signal handler.
  */
@@ -831,7 +821,7 @@ cib_connect(gboolean full)
     rc = cib->cmds->query(cib, NULL, &current_cib, cib_scope_local | cib_sync_call);
 
     if (rc == pcmk_ok && full) {
-        rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy_regular);
+        rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy);
         if (rc == -EPROTONOSUPPORT) {
             print_as
                 (output_format, "Notification setup not supported, won't be able to reconnect after failure");
@@ -890,7 +880,7 @@ detect_user_input(GIOChannel *channel, GIOCondition condition, gpointer user_dat
                     options.mon_ops |= mon_op_fence_history;
                     options.mon_ops |= mon_op_fence_connect;
                     if (st == NULL) {
-                        mon_cib_connection_destroy_regular(NULL);
+                        mon_cib_connection_destroy(NULL);
                     }
                 }
 
@@ -2010,7 +2000,7 @@ mon_st_callback_event(stonith_t * st, stonith_event_t * e)
 {
     if (st->state == stonith_disconnected) {
         /* disconnect cib as well and have everything reconnect */
-        mon_cib_connection_destroy_regular(NULL);
+        mon_cib_connection_destroy(NULL);
     } else if (options.external_agent) {
         char *desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)",
                                     e->operation, e->origin, e->target, pcmk_strerror(e->result),
@@ -2059,7 +2049,7 @@ mon_st_callback_display(stonith_t * st, stonith_event_t * e)
 {
     if (st->state == stonith_disconnected) {
         /* disconnect cib as well and have everything reconnect */
-        mon_cib_connection_destroy_regular(NULL);
+        mon_cib_connection_destroy(NULL);
     } else {
         print_dot(output_format);
         kick_refresh(TRUE);
-- 
1.8.3.1


From 009f3aa0caa6d138d4da418297f12c4a1210cf6b Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Thu, 14 Jan 2021 16:25:37 -0500
Subject: [PATCH 09/11] Refactor: Add comments to connection functions in
 crm_mon.c.

There are an awful lot of these functions, and trying to make sense of
them can be confusing when there are no comments explaining when they
happen.  Hopefully this helps a little.
---
 tools/crm_mon.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 8 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index a0764a5..54a7958 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -666,6 +666,10 @@ static GOptionEntry deprecated_entries[] = {
 };
 /* *INDENT-ON* */
 
+/* Reconnect to the CIB and fencing agent after reconnect_msec has passed.  This sounds
+ * like it would be more broadly useful, but only ever happens after a disconnect via
+ * mon_cib_connection_destroy.
+ */
 static gboolean
 mon_timer_popped(gpointer data)
 {
@@ -684,12 +688,17 @@ mon_timer_popped(gpointer data)
     print_as(output_format, "Reconnecting...\n");
     fencing_connect();
     if (cib_connect(TRUE) == pcmk_ok) {
+        /* Redraw the screen and reinstall ourselves to get called after another reconnect_msec. */
         mon_refresh_display(NULL);
         timer_id = g_timeout_add(options.reconnect_msec, mon_timer_popped, NULL);
     }
     return FALSE;
 }
 
+/* Called from various places when we are disconnected from the CIB or from the
+ * fencing agent.  If the CIB connection is still valid, this function will also
+ * attempt to sign off and reconnect.
+ */
 static void
 mon_cib_connection_destroy(gpointer user_data)
 {
@@ -717,9 +726,7 @@ mon_cib_connection_destroy(gpointer user_data)
     return;
 }
 
-/*
- * Mainloop signal handler.
- */
+/* Signal handler installed into the mainloop for normal program shutdown */
 static void
 mon_shutdown(int nsig)
 {
@@ -729,6 +736,10 @@ mon_shutdown(int nsig)
 #if CURSES_ENABLED
 static sighandler_t ncurses_winch_handler;
 
+/* Signal handler installed the regular way (not into the main loop) for when
+ * the screen is resized.  Commonly, this happens when running in an xterm and
+ * the user changes its size.
+ */
 static void
 mon_winresize(int nsig)
 {
@@ -743,6 +754,9 @@ mon_winresize(int nsig)
             (*ncurses_winch_handler) (SIGWINCH);
         getmaxyx(stdscr, lines, cols);
         resizeterm(lines, cols);
+        /* Alert the mainloop code we'd like the refresh_trigger to run next
+         * time the mainloop gets around to checking.
+         */
         mainloop_set_trigger(refresh_trigger);
     }
     not_done--;
@@ -863,6 +877,12 @@ get_option_desc(char c)
 #define print_option_help(output_format, option, condition) \
     out->info(out, "%c %c: \t%s", ((condition)? '*': ' '), option, get_option_desc(option));
 
+/* This function is called from the main loop when there is something to be read
+ * on stdin, like an interactive user's keystroke.  All it does is read the keystroke,
+ * set flags (or show the page showing which keystrokes are valid), and redraw the
+ * screen.  It does not do anything with connections to the CIB or fencing
+ * agent that would happen in mon_refresh_display.
+ */
 static gboolean
 detect_user_input(GIOChannel *channel, GIOCondition condition, gpointer user_data)
 {
@@ -951,6 +971,7 @@ detect_user_input(GIOChannel *channel, GIOCondition condition, gpointer user_dat
                 config_mode = TRUE;
                 break;
             default:
+                /* All other keys just redraw the screen. */
                 goto refresh;
         }
 
@@ -1441,6 +1462,10 @@ main(int argc, char **argv)
         g_io_add_watch(io_channel, G_IO_IN, detect_user_input, NULL);
     }
 #endif
+
+    /* When refresh_trigger->trigger is set to TRUE, call mon_refresh_display.  In
+     * this file, that is anywhere mainloop_set_trigger is called.
+     */
     refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_display, NULL);
 
     g_main_loop_run(mainloop);
@@ -1677,6 +1702,10 @@ handle_rsc_op(xmlNode * xml, const char *node_id)
     free(task);
 }
 
+/* This function is just a wrapper around mainloop_set_trigger so that it can be
+ * called from a mainloop directly.  It's simply another way of ensuring the screen
+ * gets redrawn.
+ */
 static gboolean
 mon_trigger_refresh(gpointer user_data)
 {
@@ -1995,6 +2024,9 @@ mon_refresh_display(gpointer user_data)
     return 1;
 }
 
+/* This function is called for fencing events (see fencing_connect for which ones) when
+ * --watch-fencing is used on the command line.
+ */
 static void
 mon_st_callback_event(stonith_t * st, stonith_event_t * e)
 {
@@ -2010,6 +2042,16 @@ mon_st_callback_event(stonith_t * st, stonith_event_t * e)
     }
 }
 
+/* Cause the screen to be redrawn (via mainloop_set_trigger) when various conditions are met:
+ *
+ * - If the last update occurred more than reconnect_msec ago (defaults to 5s, but can be
+ *   changed via the -i command line option), or
+ * - After every 10 CIB updates, or
+ * - If it's been 2s since the last update
+ *
+ * This function sounds like it would be more broadly useful, but it is only called when a
+ * fencing event is received or a CIB diff occurrs.
+ */
 static void
 kick_refresh(gboolean data_updated)
 {
@@ -2024,11 +2066,6 @@ kick_refresh(gboolean data_updated)
         refresh_timer = mainloop_timer_add("refresh", 2000, FALSE, mon_trigger_refresh, NULL);
     }
 
-    /* Refresh
-     * - immediately if the last update was more than 5s ago
-     * - every 10 cib-updates
-     * - at most 2s after the last update
-     */
     if ((now - last_refresh) > (options.reconnect_msec / 1000)) {
         mainloop_set_trigger(refresh_trigger);
         mainloop_timer_stop(refresh_timer);
@@ -2044,6 +2081,9 @@ kick_refresh(gboolean data_updated)
     }
 }
 
+/* This function is called for fencing events (see fencing_connect for which ones) when
+ * --watch-fencing is NOT used on the command line.
+ */
 static void
 mon_st_callback_display(stonith_t * st, stonith_event_t * e)
 {
-- 
1.8.3.1


From aa328f0788ef0057874aeeeae7261dfb450b9b9e Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Thu, 14 Jan 2021 16:44:45 -0500
Subject: [PATCH 10/11] Refactor: tools: Rename some connection-related symbols
 in crm_mon.

---
 tools/crm_mon.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index 54a7958..89d7ae2 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -66,7 +66,7 @@ static mon_output_format_t output_format = mon_output_unset;
 /* other globals */
 static GIOChannel *io_channel = NULL;
 static GMainLoop *mainloop = NULL;
-static guint timer_id = 0;
+static guint reconnect_timer = 0;
 static mainloop_timer_t *refresh_timer = NULL;
 static pe_working_set_t *mon_data_set = NULL;
 
@@ -131,7 +131,7 @@ static int cib_connect(gboolean full);
 static int fencing_connect(void);
 static void mon_st_callback_event(stonith_t * st, stonith_event_t * e);
 static void mon_st_callback_display(stonith_t * st, stonith_event_t * e);
-static void kick_refresh(gboolean data_updated);
+static void refresh_after_event(gboolean data_updated);
 
 static unsigned int
 all_includes(mon_output_format_t fmt) {
@@ -671,7 +671,7 @@ static GOptionEntry deprecated_entries[] = {
  * mon_cib_connection_destroy.
  */
 static gboolean
-mon_timer_popped(gpointer data)
+reconnect_after_timeout(gpointer data)
 {
 #if CURSES_ENABLED
     if (output_format == mon_output_console) {
@@ -680,9 +680,9 @@ mon_timer_popped(gpointer data)
     }
 #endif
 
-    if (timer_id > 0) {
-        g_source_remove(timer_id);
-        timer_id = 0;
+    if (reconnect_timer > 0) {
+        g_source_remove(reconnect_timer);
+        reconnect_timer = 0;
     }
 
     print_as(output_format, "Reconnecting...\n");
@@ -690,7 +690,7 @@ mon_timer_popped(gpointer data)
     if (cib_connect(TRUE) == pcmk_ok) {
         /* Redraw the screen and reinstall ourselves to get called after another reconnect_msec. */
         mon_refresh_display(NULL);
-        timer_id = g_timeout_add(options.reconnect_msec, mon_timer_popped, NULL);
+        reconnect_timer = g_timeout_add(options.reconnect_msec, reconnect_after_timeout, NULL);
     }
     return FALSE;
 }
@@ -708,10 +708,10 @@ mon_cib_connection_destroy(gpointer user_data)
         /* we'll trigger a refresh after reconnect */
         mainloop_timer_stop(refresh_timer);
     }
-    if (timer_id) {
+    if (reconnect_timer) {
         /* we'll trigger a new reconnect-timeout at the end */
-        g_source_remove(timer_id);
-        timer_id = 0;
+        g_source_remove(reconnect_timer);
+        reconnect_timer = 0;
     }
     if (st) {
         /* the client API won't properly reconnect notifications
@@ -721,7 +721,7 @@ mon_cib_connection_destroy(gpointer user_data)
     }
     if (cib) {
         cib->cmds->signoff(cib);
-        timer_id = g_timeout_add(options.reconnect_msec, mon_timer_popped, NULL);
+        reconnect_timer = g_timeout_add(options.reconnect_msec, reconnect_after_timeout, NULL);
     }
     return;
 }
@@ -1894,7 +1894,7 @@ crm_diff_update(const char *event, xmlNode * msg)
     }
 
     stale = FALSE;
-    kick_refresh(cib_updated);
+    refresh_after_event(cib_updated);
 }
 
 static int
@@ -2053,7 +2053,7 @@ mon_st_callback_event(stonith_t * st, stonith_event_t * e)
  * fencing event is received or a CIB diff occurrs.
  */
 static void
-kick_refresh(gboolean data_updated)
+refresh_after_event(gboolean data_updated)
 {
     static int updates = 0;
     time_t now = time(NULL);
@@ -2092,7 +2092,7 @@ mon_st_callback_display(stonith_t * st, stonith_event_t * e)
         mon_cib_connection_destroy(NULL);
     } else {
         print_dot(output_format);
-        kick_refresh(TRUE);
+        refresh_after_event(TRUE);
     }
 }
 
-- 
1.8.3.1


From 8c51b4980f349e8773681f7ed2882ca639e0e63a Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Mon, 18 Jan 2021 14:03:39 -0500
Subject: [PATCH 11/11] Fix: tools: Attempt to reestablish connections in
 crm_mon.

If the fencing or CIB connections go away between screen refreshes,
attempt to re-establish those connections.  The functions that do this
should be safe to be called repeatedly.

See: rhbz#1880426, rhbz#1466875
---
 tools/crm_mon.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index 89d7ae2..083b7ae 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -126,6 +126,7 @@ static void clean_up_cib_connection(void);
 static void clean_up_fencing_connection(void);
 static crm_exit_t clean_up(crm_exit_t exit_code);
 static void crm_diff_update(const char *event, xmlNode * msg);
+static void handle_connection_failures(int rc);
 static int mon_refresh_display(gpointer user_data);
 static int cib_connect(gboolean full);
 static int fencing_connect(void);
@@ -690,9 +691,11 @@ reconnect_after_timeout(gpointer data)
     if (cib_connect(TRUE) == pcmk_ok) {
         /* Redraw the screen and reinstall ourselves to get called after another reconnect_msec. */
         mon_refresh_display(NULL);
-        reconnect_timer = g_timeout_add(options.reconnect_msec, reconnect_after_timeout, NULL);
+        return FALSE;
     }
-    return FALSE;
+
+    reconnect_timer = g_timeout_add(options.reconnect_msec, reconnect_after_timeout, NULL);
+    return TRUE;
 }
 
 /* Called from various places when we are disconnected from the CIB or from the
@@ -887,6 +890,7 @@ static gboolean
 detect_user_input(GIOChannel *channel, GIOCondition condition, gpointer user_data)
 {
     int c;
+    int rc;
     gboolean config_mode = FALSE;
 
     while (1) {
@@ -1001,7 +1005,14 @@ detect_user_input(GIOChannel *channel, GIOCondition condition, gpointer user_dat
     }
 
 refresh:
-    mon_refresh_display(NULL);
+    fencing_connect();
+    rc = cib_connect(FALSE);
+    if (rc == pcmk_rc_ok) {
+        mon_refresh_display(NULL);
+    } else {
+        handle_connection_failures(rc);
+    }
+
     return TRUE;
 }
 #endif
-- 
1.8.3.1