From eaeed6cca46a0223617ead834aaa576dd5ad07ff Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Fri, 31 May 2019 16:11:16 +0200
Subject: [PATCH] Fix: sbd-common: query rt-budget > 0 otherwise try moving to
root-slice
---
src/sbd-common.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++
src/sbd-inquisitor.c | 15 +++++++
src/sbd.h | 2 +
src/sbd.sysconfig | 14 +++++++
4 files changed, 141 insertions(+)
diff --git a/src/sbd-common.c b/src/sbd-common.c
index 873a76e..ebfdaa3 100644
--- a/src/sbd-common.c
+++ b/src/sbd-common.c
@@ -662,6 +662,112 @@ static void sbd_memlock(int stackgrowK, int heapgrowK)
#endif
}
+/* Return cpu.rt_runtime_us of our cgroup-v1 cpu-cgroup, -1 if unknown */
+static int get_realtime_budget(void)
+{
+	FILE *f;
+	char fname[PATH_MAX];
+	int res = -1, lnum = 0, num;
+	char *cgroup = NULL, *namespecs = NULL;
+
+	snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", (intmax_t)getpid());
+	f = fopen(fname, "rt");
+	if (f == NULL) {
+		cl_log(LOG_WARNING, "Can't open cgroup file for pid=%jd",
+			(intmax_t)getpid());
+		goto exit_res;
+	}
+	while ((num = fscanf(f, "%d:%m[^:]:%m[^\n]", &lnum,
+			     &namespecs, &cgroup)) != EOF) {
+		if (namespecs && strstr(namespecs, "cpuacct")) {
+			break;
+		}
+		free(cgroup);
+		cgroup = NULL;
+		free(namespecs);
+		namespecs = NULL;
+		/* a line not matching <id>:<controllers>:<path> (e.g. cgroup-v2
+		 * "0::/...") is left unread by fscanf - skip it or loop forever */
+		if ((num < 3) && (fscanf(f, "%*[^\n]") == EOF)) {
+			break;
+		}
+	}
+	free(namespecs);
+	fclose(f);
+	if (cgroup == NULL) {
+		cl_log(LOG_WARNING, "Failed getting cgroup for pid=%jd",
+			(intmax_t)getpid());
+		goto exit_res;
+	}
+	snprintf(fname, PATH_MAX, "/sys/fs/cgroup/cpu%s/cpu.rt_runtime_us",
+		cgroup);
+	f = fopen(fname, "rt");
+	if (f == NULL) {
+		cl_log(LOG_WARNING, "cpu.rt_runtime_us existed for root-slice but "
+			"doesn't for '%s'", cgroup);
+		goto exit_res;
+	}
+	if (fscanf(f, "%d", &res) != 1) {
+		cl_log(LOG_WARNING, "failed reading rt-budget from %s", fname);
+	} else {
+		cl_log(LOG_INFO, "slice='%s' has rt-budget=%d", cgroup, res);
+	}
+	fclose(f);
+exit_res:
+	free(cgroup);
+	return res;
+}
+
+/* Move this process to the root cpu-cgroup (stolen from corosync) unless
+ * enforce_root_cgroup is false and the current cgroup already has rt-budget.
+ * Returns 0 if moved or if there was nothing to do, -1 on failure. */
+static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) {
+	FILE *f;
+	int res = -1;
+
+	/*
+	 * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now
+	 * using systemd and systemd uses hardcoded path of cgroup mount point.
+	 *
+	 * This feature is expected to be removed as soon as systemd gets support
+	 * for managing RT configuration.
+	 */
+	f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt");
+	if (f == NULL) {
+		cl_log(LOG_DEBUG, "cpu.rt_runtime_us doesn't exist -> "
+			"system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
+		res = 0;
+		goto exit_res;
+	}
+	fclose(f);
+
+	if ((!enforce_root_cgroup) && (get_realtime_budget() > 0)) {
+		cl_log(LOG_DEBUG, "looks as if we have rt-budget in the slice we are "
+			"-> skip moving to root-slice");
+		res = 0;
+		goto exit_res;
+	}
+
+	f = fopen("/sys/fs/cgroup/cpu/tasks", "w");
+	if (f == NULL) {
+		cl_log(LOG_WARNING, "Can't open cgroups tasks file for writing");
+		goto exit_res;
+	}
+	if (fprintf(f, "%jd\n", (intmax_t)getpid()) <= 0) {
+		cl_log(LOG_WARNING, "Can't write sbd pid into cgroups tasks file");
+	} else {
+		res = 0;
+	}
+	/* stdio buffers the pid - a rejected write may only show up here */
+	if (fclose(f) != 0) {
+		cl_log(LOG_WARNING, "Can't close cgroups tasks file");
+		res = -1;
+	}
+
+exit_res:
+	return (res);
+}
+
void
sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
{
@@ -670,6 +776,10 @@ sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
}
#ifdef SCHED_RR
+ if (move_to_root_cgroup) {
+ sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup);
+ }
+
{
int pcurrent = 0;
int pmin = sched_get_priority_min(SCHED_RR);
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
index abde4e5..cef5cc7 100644
--- a/src/sbd-inquisitor.c
+++ b/src/sbd-inquisitor.c
@@ -33,6 +33,8 @@ int start_mode = 0;
char* pidfile = NULL;
bool do_flush = true;
char timeout_sysrq_char = 'b';
+bool move_to_root_cgroup = true;
+bool enforce_moving_to_root_cgroup = false;
int parse_device_line(const char *line);
@@ -965,6 +967,19 @@ int main(int argc, char **argv, char **envp)
timeout_action = strdup(value);
}
+ value = getenv("SBD_MOVE_TO_ROOT_CGROUP");
+ if(value) {
+ move_to_root_cgroup = crm_is_true(value);
+
+ if (move_to_root_cgroup) {
+ enforce_moving_to_root_cgroup = true;
+ } else {
+ if (strcmp(value, "auto") == 0) {
+ move_to_root_cgroup = true;
+ }
+ }
+ }
+
while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) {
switch (c) {
case 'D':
diff --git a/src/sbd.h b/src/sbd.h
index 3b05a11..ac30ec7 100644
--- a/src/sbd.h
+++ b/src/sbd.h
@@ -159,6 +159,8 @@ extern bool watchdogdev_is_default;
extern char* local_uname;
extern bool do_flush;
extern char timeout_sysrq_char;
+extern bool move_to_root_cgroup;
+extern bool enforce_moving_to_root_cgroup;
/* Global, non-tunable variables: */
extern int sector_size;
diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig
index f163f21..e1a60ed 100644
--- a/src/sbd.sysconfig
+++ b/src/sbd.sysconfig
@@ -91,6 +91,20 @@ SBD_WATCHDOG_TIMEOUT=5
#
SBD_TIMEOUT_ACTION=flush,reboot
+## Type: yesno / auto
+## Default: auto
+#
+# If CPUAccounting is enabled, the default is not to assign any RT-budget
+# to the system.slice, which prevents sbd from running RR-scheduled.
+#
+# One way to escape that issue is to move the sbd-processes from the
+# slice they were originally started in to the root-slice.
+# Of course, starting sbd in a certain slice might be intentional.
+# Thus, in auto-mode sbd will check if the slice has RT-budget assigned.
+# If that is the case, sbd will stay in that slice; otherwise it will
+# be moved to the root-slice.
+SBD_MOVE_TO_ROOT_CGROUP=auto
+
## Type: string
## Default: ""
#
--
1.8.3.1