Blame SOURCES/0012-Fix-sbd-common-query-rt-budget-0-otherwise-try-movin.patch

1e6485
From eaeed6cca46a0223617ead834aaa576dd5ad07ff Mon Sep 17 00:00:00 2001
1e6485
From: Klaus Wenninger <klaus.wenninger@aon.at>
1e6485
Date: Fri, 31 May 2019 16:11:16 +0200
1e6485
Subject: [PATCH] Fix: sbd-common: query rt-budget > 0 otherwise try moving to
1e6485
 root-slice
1e6485
1e6485
---
1e6485
 src/sbd-common.c     | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++
1e6485
 src/sbd-inquisitor.c |  15 +++++++
1e6485
 src/sbd.h            |   2 +
1e6485
 src/sbd.sysconfig    |  14 +++++++
1e6485
 4 files changed, 141 insertions(+)
1e6485
1e6485
diff --git a/src/sbd-common.c b/src/sbd-common.c
1e6485
index 873a76e..ebfdaa3 100644
1e6485
--- a/src/sbd-common.c
1e6485
+++ b/src/sbd-common.c
1e6485
@@ -662,6 +662,112 @@ static void sbd_memlock(int stackgrowK, int heapgrowK)
1e6485
 #endif
1e6485
 }
1e6485
 
1e6485
+/* Return cpu.rt_runtime_us of the cgroup /proc/<pid>/cgroup lists for us
1e6485
+ * with the cpuacct-controller; -1 if it can't be determined or read. */
1e6485
+static int get_realtime_budget(void)
1e6485
+{
1e6485
+    FILE *f;
1e6485
+    char fname[PATH_MAX];
1e6485
+    int res = -1;
1e6485
+    size_t cap = 0;
1e6485
+    intmax_t pid = (intmax_t)getpid();
1e6485
+    char *line = NULL, *cgroup = NULL, *namespecs = NULL;
1e6485
+
1e6485
+    snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", pid);
1e6485
+    f = fopen(fname, "rt");
1e6485
+    if (f == NULL) {
1e6485
+        cl_log(LOG_WARNING, "Can't open cgroup file for pid=%jd", pid);
1e6485
+        goto exit_res;
1e6485
+    }
1e6485
+    /* Read line-wise: an entry not shaped "id:controllers:path" (e.g. the
1e6485
+     * "0::/..." one of cgroup v2) is simply skipped, while scanning the
1e6485
+     * stream directly never consumes it and thus loops forever. */
1e6485
+    while (getline(&line, &cap, f) != -1) {
1e6485
+        if ((sscanf(line, "%*d:%m[^:]:%m[^\n]", &namespecs, &cgroup) == 2) &&
1e6485
+            strstr(namespecs, "cpuacct")) {
1e6485
+            break;
1e6485
+        }
1e6485
+        free(cgroup);
1e6485
+        free(namespecs);
1e6485
+        cgroup = namespecs = NULL;
1e6485
+    }
1e6485
+    free(namespecs);
1e6485
+    free(line);
1e6485
+    fclose(f);
1e6485
+    if (cgroup == NULL) {
1e6485
+        cl_log(LOG_WARNING, "Failed getting cgroup for pid=%jd", pid);
1e6485
+        goto exit_res;
1e6485
+    }
1e6485
+    snprintf(fname, PATH_MAX, "/sys/fs/cgroup/cpu%s/cpu.rt_runtime_us",
1e6485
+                              cgroup);
1e6485
+    f = fopen(fname, "rt");
1e6485
+    if (f == NULL) {
1e6485
+        cl_log(LOG_WARNING, "cpu.rt_runtime_us existed for root-slice but "
1e6485
+            "doesn't for '%s'", cgroup);
1e6485
+        goto exit_res;
1e6485
+    }
1e6485
+    if (fscanf(f, "%d", &res) != 1) {
1e6485
+        cl_log(LOG_WARNING, "failed reading rt-budget from %s", fname);
1e6485
+    } else {
1e6485
+        cl_log(LOG_INFO, "slice='%s' has rt-budget=%d", cgroup, res);
1e6485
+    }
1e6485
+    fclose(f);
1e6485
+
1e6485
+exit_res:
1e6485
+    free(cgroup);
1e6485
+    return res;
1e6485
+}
1e6485
+
1e6485
+/*
1e6485
+ * Move us to the root cpu-cgroup (stolen from corosync) unless our slice
1e6485
+ * has rt-budget and !enforce_root_cgroup. Returns 0 if ok, -1 on failure.
1e6485
+ *
1e6485
+ * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now
1e6485
+ * using systemd and systemd uses hardcoded path of cgroup mount point.
1e6485
+ *
1e6485
+ * This feature is expected to be removed as soon as systemd gets support
1e6485
+ * for managing RT configuration.
1e6485
+ */
1e6485
+static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) {
1e6485
+    FILE *f;
1e6485
+    int res = -1;
1e6485
+
1e6485
+    f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt");
1e6485
+    if (f == NULL) {
1e6485
+        cl_log(LOG_DEBUG, "cpu.rt_runtime_us doesn't exist -> "
1e6485
+            "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
1e6485
+        res = 0;
1e6485
+        goto exit_res;
1e6485
+    }
1e6485
+    fclose(f);
1e6485
+
1e6485
+    if ((!enforce_root_cgroup) && (get_realtime_budget() > 0)) {
1e6485
+        cl_log(LOG_DEBUG, "looks as if we have rt-budget in the slice we are "
1e6485
+                          "-> skip moving to root-slice");
1e6485
+        res = 0;
1e6485
+        goto exit_res;
1e6485
+    }
1e6485
+
1e6485
+    f = fopen("/sys/fs/cgroup/cpu/tasks", "w");
1e6485
+    if (f == NULL) {
1e6485
+        cl_log(LOG_WARNING, "Can't open cgroups tasks file for writing");
1e6485
+        goto exit_res;
1e6485
+    }
1e6485
+    if (fprintf(f, "%jd\n", (intmax_t)getpid()) <= 0) {
1e6485
+        cl_log(LOG_WARNING, "Can't write sbd pid into cgroups tasks file");
1e6485
+    } else {
1e6485
+        res = 0;
1e6485
+    }
1e6485
+    /* fclose() flushes the buffered pid, so a failed write may show up here */
1e6485
+    if (fclose(f) != 0) {
1e6485
+        cl_log(LOG_WARNING, "Can't close cgroups tasks file");
1e6485
+        res = -1;
1e6485
+    }
1e6485
+
1e6485
+exit_res:
1e6485
+    return (res);
1e6485
+}
1e6485
+
1e6485
 void
1e6485
 sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
1e6485
 {
1e6485
@@ -670,6 +776,10 @@ sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
1e6485
     }
1e6485
 
1e6485
 #ifdef SCHED_RR
1e6485
+    if (move_to_root_cgroup) {
1e6485
+        sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup);
1e6485
+    }
1e6485
+
1e6485
     {
1e6485
         int pcurrent = 0;
1e6485
         int pmin = sched_get_priority_min(SCHED_RR);
1e6485
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
1e6485
index abde4e5..cef5cc7 100644
1e6485
--- a/src/sbd-inquisitor.c
1e6485
+++ b/src/sbd-inquisitor.c
1e6485
@@ -33,6 +33,8 @@ int	start_mode = 0;
1e6485
 char*	pidfile = NULL;
1e6485
 bool do_flush = true;
1e6485
 char timeout_sysrq_char = 'b';
1e6485
+bool move_to_root_cgroup = true;
1e6485
+bool enforce_moving_to_root_cgroup = false;
1e6485
 
1e6485
 int parse_device_line(const char *line);
1e6485
 
1e6485
@@ -965,6 +967,19 @@ int main(int argc, char **argv, char **envp)
1e6485
             timeout_action = strdup(value);
1e6485
         }
1e6485
 
1e6485
+        value = getenv("SBD_MOVE_TO_ROOT_CGROUP");
1e6485
+        if(value) {
1e6485
+            move_to_root_cgroup = crm_is_true(value);
1e6485
+
1e6485
+            if (move_to_root_cgroup) {
1e6485
+               enforce_moving_to_root_cgroup = true;
1e6485
+            } else {
1e6485
+                if (strcmp(value, "auto") == 0) {
1e6485
+                    move_to_root_cgroup = true;
1e6485
+                }
1e6485
+            }
1e6485
+        }
1e6485
+
1e6485
 	while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) {
1e6485
 		switch (c) {
1e6485
 		case 'D':
1e6485
diff --git a/src/sbd.h b/src/sbd.h
1e6485
index 3b05a11..ac30ec7 100644
1e6485
--- a/src/sbd.h
1e6485
+++ b/src/sbd.h
1e6485
@@ -159,6 +159,8 @@ extern bool watchdogdev_is_default;
1e6485
 extern char*  local_uname;
1e6485
 extern bool do_flush;
1e6485
 extern char timeout_sysrq_char;
1e6485
+extern bool move_to_root_cgroup;
1e6485
+extern bool enforce_moving_to_root_cgroup;
1e6485
 
1e6485
 /* Global, non-tunable variables: */
1e6485
 extern int  sector_size;
1e6485
diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig
1e6485
index f163f21..e1a60ed 100644
1e6485
--- a/src/sbd.sysconfig
1e6485
+++ b/src/sbd.sysconfig
1e6485
@@ -91,6 +91,20 @@ SBD_WATCHDOG_TIMEOUT=5
1e6485
 #
1e6485
 SBD_TIMEOUT_ACTION=flush,reboot
1e6485
 
1e6485
+## Type: yesno / auto
1e6485
+## Default: auto
1e6485
+#
1e6485
+# If CPUAccounting is enabled, the default is not to assign any RT-budget
1e6485
+# to the system.slice, which prevents sbd from running RR-scheduled.
1e6485
+#
1e6485
+# One way to escape that issue is to move the sbd-processes from the
1e6485
+# slice they were originally started in to the root-slice.
1e6485
+# Of course, starting sbd in a certain slice might be intentional.
1e6485
+# Thus, in auto-mode sbd checks whether its slice has RT-budget assigned.
1e6485
+# If that is the case, sbd stays in that slice; otherwise it is
1e6485
+# moved to the root-slice.
1e6485
+SBD_MOVE_TO_ROOT_CGROUP=auto
1e6485
+
1e6485
 ## Type: string
1e6485
 ## Default: ""
1e6485
 #
1e6485
-- 
1e6485
1.8.3.1
1e6485