From b64c30af56e7eabd63ce1db25bc5ed9b953485af Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Fri, 23 Nov 2018 14:09:22 +0100
Subject: [PATCH] Feature: make timeout-action executed by sbd configurable
---
man/sbd.8.pod | 19 +++++++++++++++++++
src/sbd-common.c | 22 ++++++++++++++++------
src/sbd-inquisitor.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++---
src/sbd-md.c | 2 +-
src/sbd.h | 3 +++
src/sbd.sysconfig | 18 ++++++++++++++++++
6 files changed, 107 insertions(+), 10 deletions(-)
diff --git a/man/sbd.8.pod b/man/sbd.8.pod
index ffd01c2..dbb3855 100644
--- a/man/sbd.8.pod
+++ b/man/sbd.8.pod
@@ -333,6 +333,23 @@ prevent a successful crashdump from ever being written.
Defaults to 240 seconds. Set to zero to disable.
+=item B<-r> I<action>
+
+Actions to be executed when the watchers fail to report to the sbd
+master process in time, or when one of the watchers detects that the
+master process has died.
+
+Set timeout-action to a comma-separated combination of
+noflush|flush plus reboot|crashdump|off.
+If only one of the two is given, the other stays at its default.
+
+This doesn't affect actions like off, crashdump or reboot that are
+explicitly triggered via message slots.
+Nor does it configure the action a watchdog would
+trigger should it expire (there is no generic interface).
+
+Defaults to flush,reboot.
+
=back
=head2 allocate
@@ -552,6 +569,8 @@ options to pass to the daemon:
C<sbd> will fail to start if no C<SBD_DEVICE> is specified. See the
installed template for more options that can be configured here.
+In general, configuration done via command-line parameters takes
+precedence over the configuration from the configuration file.
=head2 Testing the sbd installation
diff --git a/src/sbd-common.c b/src/sbd-common.c
index cc84cd0..0e8be65 100644
--- a/src/sbd-common.c
+++ b/src/sbd-common.c
@@ -98,6 +98,8 @@ usage(void)
" (default is 1, set to 0 to disable)\n"
"-P Check Pacemaker quorum and node health (optional, watch only)\n"
"-Z Enable trace mode. WARNING: UNSAFE FOR PRODUCTION!\n"
+"-r Set timeout-action to comma-separated combination of\n"
+" noflush|flush plus reboot|crashdump|off (default is flush,reboot)\n"
"Commands:\n"
#if SUPPORT_SHARED_DISK
"create initialize N slots on <dev> - OVERWRITES DEVICE!\n"
@@ -769,7 +771,7 @@ sysrq_trigger(char t)
static void
-do_exit(char kind)
+do_exit(char kind, bool do_flush)
{
/* TODO: Turn debug_mode into a bit field? Delay + kdump for example */
const char *reason = NULL;
@@ -814,7 +816,9 @@ do_exit(char kind)
}
cl_log(LOG_EMERG, "Rebooting system: %s", reason);
- sync();
+ if (do_flush) {
+ sync();
+ }
if(kind == 'c') {
watchdog_close(true);
@@ -834,19 +838,25 @@ do_exit(char kind)
void
do_crashdump(void)
{
- do_exit('c');
+ do_exit('c', true);
}
void
do_reset(void)
{
- do_exit('b');
+ do_exit('b', true);
}
void
do_off(void)
{
- do_exit('o');
+ do_exit('o', true);
+}
+
+void
+do_timeout_action(void)
+{
+ do_exit(timeout_sysrq_char, do_flush);
}
/*
@@ -980,7 +990,7 @@ notify_parent(void)
/* Our parent died unexpectedly. Triggering
* self-fence. */
cl_log(LOG_WARNING, "Our parent is dead.");
- do_reset();
+ do_timeout_action();
}
switch (servant_health) {
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
index 9b193d4..8e0bc87 100644
--- a/src/sbd-inquisitor.c
+++ b/src/sbd-inquisitor.c
@@ -31,6 +31,8 @@ int servant_restart_interval = 5;
int servant_restart_count = 1;
int start_mode = 0;
char* pidfile = NULL;
+bool do_flush = true;
+char timeout_sysrq_char = 'b';
int parse_device_line(const char *line);
@@ -655,7 +657,7 @@ void inquisitor_child(void)
/* At level 2 or above, we do nothing, but expect
* things to eventually return to
* normal. */
- do_reset();
+ do_timeout_action();
} else {
cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!");
}
@@ -668,7 +670,7 @@ void inquisitor_child(void)
if (debug_mode && watchdog_use) {
/* In debug mode, trigger a reset before the watchdog can panic the machine */
- do_reset();
+ do_timeout_action();
}
}
@@ -833,6 +835,7 @@ int main(int argc, char **argv, char **envp)
int qb_facility;
const char *value = NULL;
int start_delay = 0;
+ char *timeout_action = NULL;
if ((cmdname = strrchr(argv[0], '/')) == NULL) {
cmdname = argv[0];
@@ -928,7 +931,12 @@ int main(int argc, char **argv, char **envp)
}
cl_log(LOG_DEBUG, "Start delay: %d (%s)", (int)start_delay, value?value:"default");
- while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:")) != -1) {
+ value = getenv("SBD_TIMEOUT_ACTION");
+ if(value) {
+ timeout_action = strdup(value);
+ }
+
+ while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) {
switch (c) {
case 'D':
break;
@@ -1043,6 +1051,12 @@ int main(int argc, char **argv, char **envp)
cl_log(LOG_INFO, "Servant restart count set to %d",
(int)servant_restart_count);
break;
+ case 'r':
+ if (timeout_action) {
+ free(timeout_action);
+ }
+ timeout_action = strdup(optarg);
+ break;
case 'h':
usage();
return (0);
@@ -1101,6 +1115,39 @@ int main(int argc, char **argv, char **envp)
goto out;
}
+ if (timeout_action) {
+ char *p[2];
+ int i;
+ char c;
+ int nrflags = sscanf(timeout_action, "%m[a-z],%m[a-z]%c", &p[0], &p[1], &c);
+ bool parse_error = (nrflags < 1) || (nrflags > 2);
+
+ for (i = 0; (i < nrflags) && (i < 2); i++) {
+ if (!strcmp(p[i], "reboot")) {
+ timeout_sysrq_char = 'b';
+ } else if (!strcmp(p[i], "crashdump")) {
+ timeout_sysrq_char = 'c';
+ } else if (!strcmp(p[i], "off")) {
+ timeout_sysrq_char = 'o';
+ } else if (!strcmp(p[i], "flush")) {
+ do_flush = true;
+ } else if (!strcmp(p[i], "noflush")) {
+ do_flush = false;
+ } else {
+ parse_error = true;
+ }
+ free(p[i]);
+ }
+ if (parse_error) {
+ fprintf(stderr, "Failed to parse timeout-action \"%s\".\n",
+ timeout_action);
+ exit_status = -1;
+ goto out;
+ }
+ }
+ cl_log(LOG_NOTICE, "%s flush + writing \'%c\' to sysrq on timeout",
+ do_flush?"Doing":"Skipping", timeout_sysrq_char);
+
#if SUPPORT_SHARED_DISK
if (strcmp(argv[optind], "create") == 0) {
exit_status = init_devices(servants_leader);
diff --git a/src/sbd-md.c b/src/sbd-md.c
index a736118..579d273 100644
--- a/src/sbd-md.c
+++ b/src/sbd-md.c
@@ -1149,7 +1149,7 @@ int servant(const char *diskname, int mode, const void* argp)
if (ppid == 1) {
/* Our parent died unexpectedly. Triggering
* self-fence. */
- do_reset();
+ do_timeout_action();
}
/* These attempts are, by definition, somewhat racy. If
diff --git a/src/sbd.h b/src/sbd.h
index 0f8847a..386c85c 100644
--- a/src/sbd.h
+++ b/src/sbd.h
@@ -130,6 +130,7 @@ void sysrq_trigger(char t);
void do_crashdump(void);
void do_reset(void);
void do_off(void);
+void do_timeout_action(void);
pid_t make_daemon(void);
void maximize_priority(void);
void sbd_get_uname(void);
@@ -153,6 +154,8 @@ extern int debug_mode;
extern char *watchdogdev;
extern bool watchdogdev_is_default;
extern char* local_uname;
+extern bool do_flush;
+extern char timeout_sysrq_char;
/* Global, non-tunable variables: */
extern int sector_size;
diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig
index c6d7c07..8f38426 100644
--- a/src/sbd.sysconfig
+++ b/src/sbd.sysconfig
@@ -71,6 +71,24 @@ SBD_WATCHDOG_DEV=/dev/watchdog
SBD_WATCHDOG_TIMEOUT=5
## Type: string
+## Default: "flush,reboot"
+#
+# Actions to be executed when the watchers fail to report to the sbd
+# master process in time, or when one of the watchers detects that the
+# master process has died.
+#
+# Set timeout-action to a comma-separated combination of
+# noflush|flush plus reboot|crashdump|off.
+# If only one of the two is given, the other stays at its default.
+#
+# This doesn't affect actions like off, crashdump or reboot that are
+# explicitly triggered via message slots.
+# Nor does it configure the action a watchdog would
+# trigger should it expire (there is no generic interface).
+#
+SBD_TIMEOUT_ACTION=flush,reboot
+
+## Type: string
## Default: ""
#
# Additional options for starting sbd
--
1.8.3.1