Blame SOURCES/018-failure-messages.patch

d4e586
From 08c3420f2c857e7b27cd960f355d787af534da7d Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Tue, 18 Jan 2022 16:04:49 -0600
d4e586
Subject: [PATCH 01/12] Log: libcrmcommon: improve description for "not
d4e586
 connected" status
d4e586
d4e586
PCMK_EXEC_NOT_CONNECTED was originally added to represent "No executor
d4e586
connection", but it can also now mean no fencer connection, so change it to
d4e586
"Internal communication failure" which is probably less mysterious to end users
d4e586
anyway (especially since it should be accompanied by a more descriptive exit
d4e586
reason).
d4e586
---
d4e586
 include/crm/common/results.h | 2 +-
d4e586
 1 file changed, 1 insertion(+), 1 deletion(-)
d4e586
d4e586
diff --git a/include/crm/common/results.h b/include/crm/common/results.h
d4e586
index 873faf5c43..3d322a7ce6 100644
d4e586
--- a/include/crm/common/results.h
d4e586
+++ b/include/crm/common/results.h
d4e586
@@ -349,7 +349,7 @@ pcmk_exec_status_str(enum pcmk_exec_status status)
d4e586
         case PCMK_EXEC_ERROR_HARD:      return "Hard error";
d4e586
         case PCMK_EXEC_ERROR_FATAL:     return "Fatal error";
d4e586
         case PCMK_EXEC_NOT_INSTALLED:   return "Not installed";
d4e586
-        case PCMK_EXEC_NOT_CONNECTED:   return "No executor connection";
d4e586
+        case PCMK_EXEC_NOT_CONNECTED:   return "Internal communication failure";
d4e586
         case PCMK_EXEC_INVALID:         return "Cannot execute now";
d4e586
         case PCMK_EXEC_NO_FENCE_DEVICE: return "No fence device";
d4e586
         case PCMK_EXEC_NO_SECRETS:      return "CIB secrets unavailable";
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From 7c345cf8cf0cb054f5634206880df035bfef7311 Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Mon, 20 Dec 2021 15:12:36 -0600
d4e586
Subject: [PATCH 02/12] Refactor: libcrmcommon: drop unnecessary system error
d4e586
 redefinitions
d4e586
d4e586
portability.h defines some system error codes that might not be present on
d4e586
non-Linux systems.
d4e586
d4e586
This was a bad idea, since there's no way to ensure the defined values don't
d4e586
conflict with existing system codes. However, we use a number of them, so it's
d4e586
probably best to keep them, at least until we can make a backward compatibility
d4e586
break.
d4e586
d4e586
However, we don't use EUNATCH, ENOSR, or ENOSTR, so we can delete those.
d4e586
---
d4e586
 include/portability.h | 12 ------------
d4e586
 lib/common/results.c  |  9 ++++++---
d4e586
 2 files changed, 6 insertions(+), 15 deletions(-)
d4e586
d4e586
diff --git a/include/portability.h b/include/portability.h
d4e586
index 9a60c583a7..ee065a376d 100644
d4e586
--- a/include/portability.h
d4e586
+++ b/include/portability.h
d4e586
@@ -131,10 +131,6 @@ typedef union
d4e586
 #    define EREMOTEIO 193
d4e586
 #  endif
d4e586
 
d4e586
-#  ifndef EUNATCH
d4e586
-#    define EUNATCH   194
d4e586
-#  endif
d4e586
-
d4e586
 #  ifndef ENOKEY
d4e586
 #    define ENOKEY    195
d4e586
 #  endif
d4e586
@@ -147,14 +143,6 @@ typedef union
d4e586
 #    define ETIME     197
d4e586
 #  endif
d4e586
 
d4e586
-#  ifndef ENOSR
d4e586
-#    define ENOSR     198
d4e586
-#  endif
d4e586
-
d4e586
-#  ifndef ENOSTR
d4e586
-#    define ENOSTR    199
d4e586
-#  endif
d4e586
-
d4e586
 #  ifndef EKEYREJECTED
d4e586
 #    define EKEYREJECTED 200
d4e586
 #  endif
d4e586
diff --git a/lib/common/results.c b/lib/common/results.c
d4e586
index 6d120694cd..96cd4e5659 100644
d4e586
--- a/lib/common/results.c
d4e586
+++ b/lib/common/results.c
d4e586
@@ -118,9 +118,6 @@ pcmk_strerror(int rc)
d4e586
         case EREMOTEIO:
d4e586
             return "Remote I/O error";
d4e586
             /* coverity[dead_error_condition] False positive on non-Linux */
d4e586
-        case EUNATCH:
d4e586
-            return "Protocol driver not attached";
d4e586
-            /* coverity[dead_error_condition] False positive on non-Linux */
d4e586
         case ENOKEY:
d4e586
             return "Required key not available";
d4e586
     }
d4e586
@@ -342,8 +339,12 @@ pcmk_rc_name(int rc)
d4e586
         case ENOMSG:            return "ENOMSG";
d4e586
         case ENOPROTOOPT:       return "ENOPROTOOPT";
d4e586
         case ENOSPC:            return "ENOSPC";
d4e586
+#ifdef ENOSR
d4e586
         case ENOSR:             return "ENOSR";
d4e586
+#endif
d4e586
+#ifdef ENOSTR
d4e586
         case ENOSTR:            return "ENOSTR";
d4e586
+#endif
d4e586
         case ENOSYS:            return "ENOSYS";
d4e586
         case ENOTBLK:           return "ENOTBLK";
d4e586
         case ENOTCONN:          return "ENOTCONN";
d4e586
@@ -376,7 +377,9 @@ pcmk_rc_name(int rc)
d4e586
         case ETIME:             return "ETIME";
d4e586
         case ETIMEDOUT:         return "ETIMEDOUT";
d4e586
         case ETXTBSY:           return "ETXTBSY";
d4e586
+#ifdef EUNATCH
d4e586
         case EUNATCH:           return "EUNATCH";
d4e586
+#endif
d4e586
         case EUSERS:            return "EUSERS";
d4e586
         /* case EWOULDBLOCK:    return "EWOULDBLOCK"; */
d4e586
         case EXDEV:             return "EXDEV";
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From eac8d1ca51eac3f437e18584f7e013d976ecee2c Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Mon, 20 Dec 2021 15:33:12 -0600
d4e586
Subject: [PATCH 03/12] Log: libcrmcommon: improve handling of portability.h
d4e586
 error codes
d4e586
d4e586
portability.h defines some system error codes that might not be present on
d4e586
non-Linux systems.
d4e586
d4e586
Define a constant for each one (for example, PCMK__ECOMM for ECOMM) when
d4e586
the system doesn't have the value, so we can detect that when relevant.
d4e586
d4e586
Also, make sure pcmk_rc_name() and pcmk_rc_str() handle all of these values.
d4e586
---
d4e586
 include/portability.h |  8 ++++++++
d4e586
 lib/common/results.c  | 32 ++++++++++++++++++++++++++++++--
d4e586
 2 files changed, 38 insertions(+), 2 deletions(-)
d4e586
d4e586
diff --git a/include/portability.h b/include/portability.h
d4e586
index ee065a376d..5d5fbf21cb 100644
d4e586
--- a/include/portability.h
d4e586
+++ b/include/portability.h
d4e586
@@ -116,34 +116,42 @@ typedef union
d4e586
 #  include <errno.h>
d4e586
 
d4e586
 #  ifndef ENOTUNIQ
d4e586
+#    define PCMK__ENOTUNIQ
d4e586
 #    define ENOTUNIQ  190
d4e586
 #  endif
d4e586
 
d4e586
 #  ifndef ECOMM
d4e586
+#    define PCMK__ECOMM
d4e586
 #    define ECOMM     191
d4e586
 #  endif
d4e586
 
d4e586
 #  ifndef ELIBACC
d4e586
+#    define PCMK__ELIBACC
d4e586
 #    define ELIBACC   192
d4e586
 #  endif
d4e586
 
d4e586
 #  ifndef EREMOTEIO
d4e586
+#    define PCMK__EREMOTEIO
d4e586
 #    define EREMOTEIO 193
d4e586
 #  endif
d4e586
 
d4e586
 #  ifndef ENOKEY
d4e586
+#    define PCMK__ENOKEY
d4e586
 #    define ENOKEY    195
d4e586
 #  endif
d4e586
 
d4e586
 #  ifndef ENODATA
d4e586
+#    define PCMK__ENODATA
d4e586
 #    define ENODATA   196
d4e586
 #  endif
d4e586
 
d4e586
 #  ifndef ETIME
d4e586
+#    define PCMK__ETIME
d4e586
 #    define ETIME     197
d4e586
 #  endif
d4e586
 
d4e586
 #  ifndef EKEYREJECTED
d4e586
+#    define PCMK__EKEYREJECTED
d4e586
 #    define EKEYREJECTED 200
d4e586
 #  endif
d4e586
 
d4e586
diff --git a/lib/common/results.c b/lib/common/results.c
d4e586
index 96cd4e5659..bcf289d0d6 100644
d4e586
--- a/lib/common/results.c
d4e586
+++ b/lib/common/results.c
d4e586
@@ -395,9 +395,9 @@ pcmk_rc_name(int rc)
d4e586
 #ifdef EISNAM // Not available on OS X, Illumos, Solaris
d4e586
         case EISNAM:            return "EISNAM";
d4e586
         case EKEYEXPIRED:       return "EKEYEXPIRED";
d4e586
-        case EKEYREJECTED:      return "EKEYREJECTED";
d4e586
         case EKEYREVOKED:       return "EKEYREVOKED";
d4e586
 #endif
d4e586
+        case EKEYREJECTED:      return "EKEYREJECTED";
d4e586
         case EL2HLT:            return "EL2HLT";
d4e586
         case EL2NSYNC:          return "EL2NSYNC";
d4e586
         case EL3HLT:            return "EL3HLT";
d4e586
@@ -443,7 +443,35 @@ pcmk_rc_str(int rc)
d4e586
     if (rc < 0) {
d4e586
         return "Unknown error";
d4e586
     }
d4e586
-    return strerror(rc);
d4e586
+
d4e586
+    // Handle values that could be defined by system or by portability.h
d4e586
+    switch (rc) {
d4e586
+#ifdef PCMK__ENOTUNIQ
d4e586
+        case ENOTUNIQ:      return "Name not unique on network";
d4e586
+#endif
d4e586
+#ifdef PCMK__ECOMM
d4e586
+        case ECOMM:         return "Communication error on send";
d4e586
+#endif
d4e586
+#ifdef PCMK__ELIBACC
d4e586
+        case ELIBACC:       return "Can not access a needed shared library";
d4e586
+#endif
d4e586
+#ifdef PCMK__EREMOTEIO
d4e586
+        case EREMOTEIO:     return "Remote I/O error";
d4e586
+#endif
d4e586
+#ifdef PCMK__ENOKEY
d4e586
+        case ENOKEY:        return "Required key not available";
d4e586
+#endif
d4e586
+#ifdef PCMK__ENODATA
d4e586
+        case ENODATA:       return "No data available";
d4e586
+#endif
d4e586
+#ifdef PCMK__ETIME
d4e586
+        case ETIME:         return "Timer expired";
d4e586
+#endif
d4e586
+#ifdef PCMK__EKEYREJECTED
d4e586
+        case EKEYREJECTED:  return "Key was rejected by service";
d4e586
+#endif
d4e586
+        default:            return strerror(rc);
d4e586
+    }
d4e586
 }
d4e586
 
d4e586
 // This returns negative values for errors
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From 32a38ac6374f85c43e7f4051f5e519822cc481e6 Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Mon, 20 Dec 2021 15:39:19 -0600
d4e586
Subject: [PATCH 04/12] Log: libcrmcommon: redefine pcmk_strerror() in terms of
d4e586
 pcmk_rc_str()
d4e586
d4e586
... to reduce code duplication. This causes minor differences in the string for
d4e586
a few values.
d4e586
---
d4e586
 lib/common/results.c | 67 +-------------------------------------------
d4e586
 1 file changed, 1 insertion(+), 66 deletions(-)
d4e586
d4e586
diff --git a/lib/common/results.c b/lib/common/results.c
d4e586
index bcf289d0d6..b2c6e8d553 100644
d4e586
--- a/lib/common/results.c
d4e586
+++ b/lib/common/results.c
d4e586
@@ -57,72 +57,7 @@ pcmk_errorname(int rc)
d4e586
 const char *
d4e586
 pcmk_strerror(int rc)
d4e586
 {
d4e586
-    if (rc == 0) {
d4e586
-        return "OK";
d4e586
-    }
d4e586
-
d4e586
-    rc = abs(rc);
d4e586
-
d4e586
-    // Of course rc > 0 ... unless someone passed INT_MIN as rc
d4e586
-    if ((rc > 0) && (rc < PCMK_ERROR_OFFSET)) {
d4e586
-        return strerror(rc);
d4e586
-    }
d4e586
-
d4e586
-    switch (rc) {
d4e586
-        case pcmk_err_generic:
d4e586
-            return "Generic Pacemaker error";
d4e586
-        case pcmk_err_no_quorum:
d4e586
-            return "Operation requires quorum";
d4e586
-        case pcmk_err_schema_validation:
d4e586
-            return "Update does not conform to the configured schema";
d4e586
-        case pcmk_err_transform_failed:
d4e586
-            return "Schema transform failed";
d4e586
-        case pcmk_err_old_data:
d4e586
-            return "Update was older than existing configuration";
d4e586
-        case pcmk_err_diff_failed:
d4e586
-            return "Application of an update diff failed";
d4e586
-        case pcmk_err_diff_resync:
d4e586
-            return "Application of an update diff failed, requesting a full refresh";
d4e586
-        case pcmk_err_cib_modified:
d4e586
-            return "The on-disk configuration was manually modified";
d4e586
-        case pcmk_err_cib_backup:
d4e586
-            return "Could not archive the previous configuration";
d4e586
-        case pcmk_err_cib_save:
d4e586
-            return "Could not save the new configuration to disk";
d4e586
-        case pcmk_err_cib_corrupt:
d4e586
-            return "Could not parse on-disk configuration";
d4e586
-        case pcmk_err_multiple:
d4e586
-            return "Resource active on multiple nodes";
d4e586
-        case pcmk_err_node_unknown:
d4e586
-            return "Node not found";
d4e586
-        case pcmk_err_already:
d4e586
-            return "Situation already as requested";
d4e586
-        case pcmk_err_bad_nvpair:
d4e586
-            return "Bad name/value pair given";
d4e586
-        case pcmk_err_schema_unchanged:
d4e586
-            return "Schema is already the latest available";
d4e586
-        case pcmk_err_unknown_format:
d4e586
-            return "Unknown output format";
d4e586
-
d4e586
-            /* The following cases will only be hit on systems for which they are non-standard */
d4e586
-            /* coverity[dead_error_condition] False positive on non-Linux */
d4e586
-        case ENOTUNIQ:
d4e586
-            return "Name not unique on network";
d4e586
-            /* coverity[dead_error_condition] False positive on non-Linux */
d4e586
-        case ECOMM:
d4e586
-            return "Communication error on send";
d4e586
-            /* coverity[dead_error_condition] False positive on non-Linux */
d4e586
-        case ELIBACC:
d4e586
-            return "Can not access a needed shared library";
d4e586
-            /* coverity[dead_error_condition] False positive on non-Linux */
d4e586
-        case EREMOTEIO:
d4e586
-            return "Remote I/O error";
d4e586
-            /* coverity[dead_error_condition] False positive on non-Linux */
d4e586
-        case ENOKEY:
d4e586
-            return "Required key not available";
d4e586
-    }
d4e586
-    crm_err("Unknown error code: %d", rc);
d4e586
-    return "Unknown error";
d4e586
+    return pcmk_rc_str(pcmk_legacy2rc(rc));
d4e586
 }
d4e586
 
d4e586
 // Standard Pacemaker API return codes
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From 7c331d7e2275ffebbfd5e2f6432a6137a66ee5db Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Mon, 20 Dec 2021 15:41:24 -0600
d4e586
Subject: [PATCH 05/12] Log: libcrmcommon: don't say "Unknown error"
d4e586
d4e586
... which is unhelpful and annoying to users
d4e586
---
d4e586
 lib/common/results.c | 4 ++--
d4e586
 1 file changed, 2 insertions(+), 2 deletions(-)
d4e586
d4e586
diff --git a/lib/common/results.c b/lib/common/results.c
d4e586
index b2c6e8d553..5ffac76549 100644
d4e586
--- a/lib/common/results.c
d4e586
+++ b/lib/common/results.c
d4e586
@@ -376,7 +376,7 @@ pcmk_rc_str(int rc)
d4e586
         return pcmk__rcs[pcmk_rc_error - rc].desc;
d4e586
     }
d4e586
     if (rc < 0) {
d4e586
-        return "Unknown error";
d4e586
+        return "Error";
d4e586
     }
d4e586
 
d4e586
     // Handle values that could be defined by system or by portability.h
d4e586
@@ -768,7 +768,7 @@ bz2_strerror(int rc)
d4e586
         case BZ_OUTBUFF_FULL:
d4e586
             return "output data will not fit into the buffer provided";
d4e586
     }
d4e586
-    return "Unknown error";
d4e586
+    return "Data compression error";
d4e586
 }
d4e586
 
d4e586
 crm_exit_t
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From 26883b4edda7d81bfcb79bd7b33bb3210beff110 Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Mon, 20 Dec 2021 16:01:39 -0600
d4e586
Subject: [PATCH 06/12] Log: fencing: don't warn if cluster has no watchdog
d4e586
 device
d4e586
d4e586
---
d4e586
 lib/fencing/st_client.c | 7 ++++++-
d4e586
 1 file changed, 6 insertions(+), 1 deletion(-)
d4e586
d4e586
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
d4e586
index b1de912b2a..a0f3119f3b 100644
d4e586
--- a/lib/fencing/st_client.c
d4e586
+++ b/lib/fencing/st_client.c
d4e586
@@ -187,7 +187,12 @@ stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node)
d4e586
                  * we drop in here - so as not to make remote nodes
d4e586
                  * panic on that answer
d4e586
                  */
d4e586
-                crm_warn("watchdog-fencing-query failed");
d4e586
+                if (rc == -ENODEV) {
d4e586
+                    crm_notice("Cluster does not have watchdog fencing device");
d4e586
+                } else {
d4e586
+                    crm_warn("Could not check for watchdog fencing device: %s",
d4e586
+                             pcmk_strerror(rc));
d4e586
+                }
d4e586
             } else if (list[0] == '\0') {
d4e586
                 rv = TRUE;
d4e586
             } else {
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From 72b3c42232deaca64ffba9582598c59331203761 Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Mon, 20 Dec 2021 16:22:49 -0600
d4e586
Subject: [PATCH 07/12] Test: libcrmcommon: update pcmk_rc_str() unit test for
d4e586
 recent change
d4e586
d4e586
---
d4e586
 lib/common/tests/results/pcmk__results_test.c | 2 +-
d4e586
 1 file changed, 1 insertion(+), 1 deletion(-)
d4e586
d4e586
diff --git a/lib/common/tests/results/pcmk__results_test.c b/lib/common/tests/results/pcmk__results_test.c
d4e586
index 57a520c501..e08d4b6261 100644
d4e586
--- a/lib/common/tests/results/pcmk__results_test.c
d4e586
+++ b/lib/common/tests/results/pcmk__results_test.c
d4e586
@@ -30,7 +30,7 @@ static void
d4e586
 test_for_pcmk_rc_str(void **state) {
d4e586
     assert_string_equal(pcmk_rc_str(pcmk_rc_error-1), "Unknown output format");
d4e586
     assert_string_equal(pcmk_rc_str(pcmk_rc_ok), "OK");
d4e586
-    assert_string_equal(pcmk_rc_str(-1), "Unknown error");
d4e586
+    assert_string_equal(pcmk_rc_str(-1), "Error");
d4e586
 }
d4e586
 
d4e586
 static void
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From c1ad3d6640f695321a83183c95fae2f105adc429 Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Tue, 21 Dec 2021 10:20:38 -0600
d4e586
Subject: [PATCH 08/12] Test: cts-lab: update expected patterns for recent
d4e586
 changes
d4e586
d4e586
---
d4e586
 cts/lab/CTStests.py | 2 +-
d4e586
 1 file changed, 1 insertion(+), 1 deletion(-)
d4e586
d4e586
diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py
d4e586
index 62c832eb45..f4be998cfb 100644
d4e586
--- a/cts/lab/CTStests.py
d4e586
+++ b/cts/lab/CTStests.py
d4e586
@@ -3055,7 +3055,7 @@ class RemoteStonithd(RemoteDriver):
d4e586
             r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
d4e586
             r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
d4e586
             r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)",
d4e586
-            r"error: Result of monitor operation for .* on remote-.*: No executor connection",
d4e586
+            r"error: Result of monitor operation for .* on remote-.*: Internal communication failure",
d4e586
         ]
d4e586
 
d4e586
         ignore_pats.extend(RemoteDriver.errorstoignore(self))
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From f272e2f526633c707e894b39c7c7bce3c14de898 Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Tue, 21 Dec 2021 15:40:49 -0600
d4e586
Subject: [PATCH 09/12] Log: controller,libpacemaker: make history XML creation
d4e586
 less chatty
d4e586
d4e586
Other messages with the same info will already be logged at higher severity
d4e586
---
d4e586
 daemons/controld/controld_execd.c      |  3 +--
d4e586
 daemons/controld/controld_te_actions.c |  7 ++-----
d4e586
 include/pcmki/pcmki_sched_utils.h      |  3 +--
d4e586
 lib/pacemaker/pcmk_sched_transition.c  |  3 +--
d4e586
 lib/pacemaker/pcmk_sched_utils.c       | 12 +++++-------
d4e586
 5 files changed, 10 insertions(+), 18 deletions(-)
d4e586
d4e586
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
d4e586
index 15784e7687..52157fa5d4 100644
d4e586
--- a/daemons/controld/controld_execd.c
d4e586
+++ b/daemons/controld/controld_execd.c
d4e586
@@ -693,9 +693,8 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_
d4e586
         caller_version = CRM_FEATURE_SET;
d4e586
     }
d4e586
 
d4e586
-    crm_trace("Building %s operation update with originator version: %s", op->rsc_id, caller_version);
d4e586
     xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc,
d4e586
-                                      fsa_our_uname, src, LOG_DEBUG);
d4e586
+                                      fsa_our_uname, src);
d4e586
     if (xml_op == NULL) {
d4e586
         return TRUE;
d4e586
     }
d4e586
diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c
d4e586
index 63b7c72359..b0bcb8b2e4 100644
d4e586
--- a/daemons/controld/controld_te_actions.c
d4e586
+++ b/daemons/controld/controld_te_actions.c
d4e586
@@ -181,7 +181,6 @@ controld_record_action_timeout(crm_action_t *action)
d4e586
     lrmd_event_data_t *op = NULL;
d4e586
     xmlNode *state = NULL;
d4e586
     xmlNode *rsc = NULL;
d4e586
-    xmlNode *xml_op = NULL;
d4e586
     xmlNode *action_rsc = NULL;
d4e586
 
d4e586
     int rc = pcmk_ok;
d4e586
@@ -245,12 +244,10 @@ controld_record_action_timeout(crm_action_t *action)
d4e586
     op->user_data = pcmk__transition_key(transition_graph->id, action->id,
d4e586
                                          target_rc, te_uuid);
d4e586
 
d4e586
-    xml_op = pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc,
d4e586
-                                      target, __func__, LOG_INFO);
d4e586
+    pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target,
d4e586
+                             __func__);
d4e586
     lrmd_free_event(op);
d4e586
 
d4e586
-    crm_log_xml_trace(xml_op, "Action timeout");
d4e586
-
d4e586
     rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options);
d4e586
     fsa_register_cib_callback(rc, FALSE, NULL, cib_action_updated);
d4e586
     free_xml(state);
d4e586
diff --git a/include/pcmki/pcmki_sched_utils.h b/include/pcmki/pcmki_sched_utils.h
d4e586
index 68d60fc7db..144424a609 100644
d4e586
--- a/include/pcmki/pcmki_sched_utils.h
d4e586
+++ b/include/pcmki/pcmki_sched_utils.h
d4e586
@@ -52,8 +52,7 @@ extern void process_utilization(pe_resource_t * rsc, pe_node_t ** prefer, pe_wor
d4e586
 
d4e586
 xmlNode *pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *event,
d4e586
                                  const char *caller_version, int target_rc,
d4e586
-                                 const char *node, const char *origin,
d4e586
-                                 int level);
d4e586
+                                 const char *node, const char *origin);
d4e586
 
d4e586
 #  define LOAD_STOPPED "load_stopped"
d4e586
 
d4e586
diff --git a/lib/pacemaker/pcmk_sched_transition.c b/lib/pacemaker/pcmk_sched_transition.c
d4e586
index 678c3f5dd2..1aa90a5a0b 100644
d4e586
--- a/lib/pacemaker/pcmk_sched_transition.c
d4e586
+++ b/lib/pacemaker/pcmk_sched_transition.c
d4e586
@@ -201,8 +201,7 @@ inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc)
d4e586
 inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc)
d4e586
 {
d4e586
     return pcmk__create_history_xml(cib_resource, op, CRM_FEATURE_SET,
d4e586
-                                    target_rc, NULL, crm_system_name,
d4e586
-                                    LOG_TRACE);
d4e586
+                                    target_rc, NULL, crm_system_name);
d4e586
 }
d4e586
 
d4e586
 static xmlNode *
d4e586
diff --git a/lib/pacemaker/pcmk_sched_utils.c b/lib/pacemaker/pcmk_sched_utils.c
d4e586
index f8200b0efc..4f63d3374d 100644
d4e586
--- a/lib/pacemaker/pcmk_sched_utils.c
d4e586
+++ b/lib/pacemaker/pcmk_sched_utils.c
d4e586
@@ -892,14 +892,13 @@ add_op_digest_to_xml(lrmd_event_data_t *op, xmlNode *update)
d4e586
  * \param[in]     target_rc       Expected result of operation
d4e586
  * \param[in]     node            Name of node on which operation was performed
d4e586
  * \param[in]     origin          Arbitrary description of update source
d4e586
- * \param[in]     level           A log message will be logged at this level
d4e586
  *
d4e586
  * \return Newly created XML node for history update
d4e586
  */
d4e586
 xmlNode *
d4e586
 pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op,
d4e586
                          const char *caller_version, int target_rc,
d4e586
-                         const char *node, const char *origin, int level)
d4e586
+                         const char *node, const char *origin)
d4e586
 {
d4e586
     char *key = NULL;
d4e586
     char *magic = NULL;
d4e586
@@ -912,11 +911,10 @@ pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op,
d4e586
     const char *task = NULL;
d4e586
 
d4e586
     CRM_CHECK(op != NULL, return NULL);
d4e586
-    do_crm_log(level, "%s: Updating resource %s after %s op %s (interval=%u)",
d4e586
-               origin, op->rsc_id, op->op_type,
d4e586
-               pcmk_exec_status_str(op->op_status), op->interval_ms);
d4e586
-
d4e586
-    crm_trace("DC version: %s", caller_version);
d4e586
+    crm_trace("Creating history XML for %s-interval %s action for %s on %s "
d4e586
+              "(DC version: %s, origin: %s)",
d4e586
+              pcmk__readable_interval(op->interval_ms), op->op_type, op->rsc_id,
d4e586
+              ((node == NULL)? "no node" : node), caller_version, origin);
d4e586
 
d4e586
     task = op->op_type;
d4e586
 
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From 06b1da9e5345e0d1571042c11646fd7157961279 Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Tue, 21 Dec 2021 17:09:44 -0600
d4e586
Subject: [PATCH 10/12] Feature: controller: improve exit reason for internal
d4e586
 timeouts
d4e586
d4e586
Functionize the part of controld_record_action_timeout() that creates a fake
d4e586
executor event, into a new function synthesize_timeout_event(), and have it set
d4e586
a more detailed exit reason describing what timed out.
d4e586
---
d4e586
 daemons/controld/controld_te_actions.c | 61 ++++++++++++++++++++------
d4e586
 1 file changed, 48 insertions(+), 13 deletions(-)
d4e586
d4e586
diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c
d4e586
index b0bcb8b2e4..de2fbb82bf 100644
d4e586
--- a/daemons/controld/controld_te_actions.c
d4e586
+++ b/daemons/controld/controld_te_actions.c
d4e586
@@ -175,6 +175,53 @@ te_crm_command(crm_graph_t * graph, crm_action_t * action)
d4e586
     return TRUE;
d4e586
 }
d4e586
 
d4e586
+/*!
d4e586
+ * \internal
d4e586
+ * \brief Synthesize an executor event for a resource action timeout
d4e586
+ *
d4e586
+ * \param[in] action     Resource action that timed out
d4e586
+ * \param[in] target_rc  Expected result of action that timed out
d4e586
+ *
d4e586
+ * Synthesize an executor event for a resource action timeout. (If the executor
d4e586
+ * gets a timeout while waiting for a resource action to complete, that will be
d4e586
+ * reported via the usual callback. This timeout means we didn't hear from the
d4e586
+ * executor itself or the controller that relayed the action to the executor.)
d4e586
+ *
d4e586
+ * \return Newly created executor event for result of \p action
d4e586
+ * \note The caller is responsible for freeing the return value using
d4e586
+ *       lrmd_free_event().
d4e586
+ */
d4e586
+static lrmd_event_data_t *
d4e586
+synthesize_timeout_event(crm_action_t *action, int target_rc)
d4e586
+{
d4e586
+    lrmd_event_data_t *op = NULL;
d4e586
+    const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
d4e586
+    const char *reason = NULL;
d4e586
+    char *dynamic_reason = NULL;
d4e586
+
d4e586
+    if (pcmk__str_eq(target, get_local_node_name(), pcmk__str_casei)) {
d4e586
+        reason = "Local executor did not return result in time";
d4e586
+    } else {
d4e586
+        const char *router_node = NULL;
d4e586
+
d4e586
+        router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
d4e586
+        if (router_node == NULL) {
d4e586
+            router_node = target;
d4e586
+        }
d4e586
+        dynamic_reason = crm_strdup_printf("Controller on %s did not return "
d4e586
+                                           "result in time", router_node);
d4e586
+        reason = dynamic_reason;
d4e586
+    }
d4e586
+
d4e586
+    op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT,
d4e586
+                                       PCMK_OCF_UNKNOWN_ERROR, reason);
d4e586
+    op->call_id = -1;
d4e586
+    op->user_data = pcmk__transition_key(transition_graph->id, action->id,
d4e586
+                                         target_rc, te_uuid);
d4e586
+    free(dynamic_reason);
d4e586
+    return op;
d4e586
+}
d4e586
+
d4e586
 void
d4e586
 controld_record_action_timeout(crm_action_t *action)
d4e586
 {
d4e586
@@ -231,19 +278,7 @@ controld_record_action_timeout(crm_action_t *action)
d4e586
     crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_CLASS);
d4e586
     crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_PROVIDER);
d4e586
 
d4e586
-    /* If the executor gets a timeout while waiting for the action to complete,
d4e586
-     * that will be reported via the usual callback. This timeout means that we
d4e586
-     * didn't hear from the executor or the controller that relayed the action
d4e586
-     * to the executor.
d4e586
-     */
d4e586
-    op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT,
d4e586
-                                       PCMK_OCF_UNKNOWN_ERROR,
d4e586
-                                       "Cluster communication timeout "
d4e586
-                                       "(no response from executor)");
d4e586
-    op->call_id = -1;
d4e586
-    op->user_data = pcmk__transition_key(transition_graph->id, action->id,
d4e586
-                                         target_rc, te_uuid);
d4e586
-
d4e586
+    op = synthesize_timeout_event(action, target_rc);
d4e586
     pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target,
d4e586
                              __func__);
d4e586
     lrmd_free_event(op);
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From be620d206faefab967d4c8567d6554d10c9e72ba Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Wed, 22 Dec 2021 16:35:06 -0600
d4e586
Subject: [PATCH 11/12] Feature: fencing: improve exit reason for fencing
d4e586
 timeouts
d4e586
d4e586
Troubleshooting timeouts is one of the more difficult aspects of cluster
d4e586
maintenance. We want to give as much of a hint as possible, but for fencing in
d4e586
particular it is difficult because an operation might involve multiple retries
d4e586
of multiple devices.
d4e586
d4e586
Barring another major project to track exactly which devices, retries, etc.,
d4e586
were used in a given operation, these changes in wording are probably the best
d4e586
we can do.
d4e586
---
d4e586
 daemons/fenced/fenced_remote.c | 8 +++++---
d4e586
 lib/fencing/st_client.c        | 2 +-
d4e586
 2 files changed, 6 insertions(+), 4 deletions(-)
d4e586
d4e586
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
d4e586
index 1e237150c5..6eebb7381e 100644
d4e586
--- a/daemons/fenced/fenced_remote.c
d4e586
+++ b/daemons/fenced/fenced_remote.c
d4e586
@@ -1,5 +1,5 @@
d4e586
 /*
d4e586
- * Copyright 2009-2021 the Pacemaker project contributors
d4e586
+ * Copyright 2009-2022 the Pacemaker project contributors
d4e586
  *
d4e586
  * The version control history for this file may have further details.
d4e586
  *
d4e586
@@ -715,8 +715,10 @@ remote_op_timeout(gpointer userdata)
d4e586
                   CRM_XS " id=%.8s",
d4e586
                   op->action, op->target, op->client_name, op->id);
d4e586
     } else {
d4e586
-        finalize_timed_out_op(userdata, "Fencing could not be completed "
d4e586
-                                        "within overall timeout");
d4e586
+        finalize_timed_out_op(userdata, "Fencing did not complete within a "
d4e586
+                                        "total timeout based on the "
d4e586
+                                        "configured timeout and retries for "
d4e586
+                                        "any devices attempted");
d4e586
     }
d4e586
     return G_SOURCE_REMOVE;
d4e586
 }
d4e586
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
d4e586
index a0f3119f3b..718739b321 100644
d4e586
--- a/lib/fencing/st_client.c
d4e586
+++ b/lib/fencing/st_client.c
d4e586
@@ -906,7 +906,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
d4e586
     if (msg == NULL) {
d4e586
         // Fencer didn't reply in time
d4e586
         pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
d4e586
-                         "Timeout waiting for reply from fencer");
d4e586
+                         "Fencer accepted request but did not reply in time");
d4e586
         CRM_LOG_ASSERT(call_id > 0);
d4e586
 
d4e586
     } else {
d4e586
-- 
d4e586
2.27.0
d4e586
d4e586
d4e586
From 0fe8ede2f8e838e335fe42846bdf147111ce9955 Mon Sep 17 00:00:00 2001
d4e586
From: Ken Gaillot <kgaillot@redhat.com>
d4e586
Date: Wed, 22 Dec 2021 17:09:09 -0600
d4e586
Subject: [PATCH 12/12] Feature: libcrmservice: improve exit reason for
d4e586
 timeouts
d4e586
d4e586
The services library doesn't have enough information about an action to say
d4e586
(for example) what configuration parameters might be relevant, but we can at
d4e586
least distinguish what kind of agent timed out.
d4e586
---
d4e586
 lib/services/services_linux.c | 12 +++++++++++-
d4e586
 lib/services/systemd.c        |  2 +-
d4e586
 2 files changed, 12 insertions(+), 2 deletions(-)
d4e586
d4e586
diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c
d4e586
index f15eee860e..d6aafcfe46 100644
d4e586
--- a/lib/services/services_linux.c
d4e586
+++ b/lib/services/services_linux.c
d4e586
@@ -677,9 +677,19 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo,
d4e586
         parse_exit_reason_from_stderr(op);
d4e586
 
d4e586
     } else if (mainloop_child_timeout(p)) {
d4e586
+        const char *reason = NULL;
d4e586
+
d4e586
+        if (op->rsc != NULL) {
d4e586
+            reason = "Resource agent did not complete in time";
d4e586
+        } else if (pcmk__str_eq(op->standard, PCMK_RESOURCE_CLASS_STONITH,
d4e586
+                                pcmk__str_none)) {
d4e586
+            reason = "Fence agent did not complete in time";
d4e586
+        } else {
d4e586
+            reason = "Process did not complete in time";
d4e586
+        }
d4e586
         crm_info("%s[%d] timed out after %dms", op->id, op->pid, op->timeout);
d4e586
         services__set_result(op, services__generic_error(op), PCMK_EXEC_TIMEOUT,
d4e586
-                             "Process did not exit within specified timeout");
d4e586
+                             reason);
d4e586
 
d4e586
     } else if (op->cancel) {
d4e586
         /* If an in-flight recurring operation was killed because it was
d4e586
diff --git a/lib/services/systemd.c b/lib/services/systemd.c
d4e586
index 27a3b376db..d87b287424 100644
d4e586
--- a/lib/services/systemd.c
d4e586
+++ b/lib/services/systemd.c
d4e586
@@ -995,7 +995,7 @@ systemd_timeout_callback(gpointer p)
d4e586
     crm_info("%s action for systemd unit %s named '%s' timed out",
d4e586
              op->action, op->agent, op->rsc);
d4e586
     services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
d4e586
-                         "Systemd action did not complete within specified timeout");
d4e586
+                         "Systemd unit action did not complete in time");
d4e586
     services__finalize_async_op(op);
d4e586
     return FALSE;
d4e586
 }
d4e586
-- 
d4e586
2.27.0
d4e586