Blame SOURCES/0012-Fix-sbd-common-query-rt-budget-0-otherwise-try-movin.patch

789c7f
From eaeed6cca46a0223617ead834aaa576dd5ad07ff Mon Sep 17 00:00:00 2001
789c7f
From: Klaus Wenninger <klaus.wenninger@aon.at>
789c7f
Date: Fri, 31 May 2019 16:11:16 +0200
789c7f
Subject: [PATCH] Fix: sbd-common: query rt-budget > 0 otherwise try moving to
789c7f
 root-slice
789c7f
789c7f
---
789c7f
 src/sbd-common.c     | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++
789c7f
 src/sbd-inquisitor.c |  15 +++++++
789c7f
 src/sbd.h            |   2 +
789c7f
 src/sbd.sysconfig    |  14 +++++++
789c7f
 4 files changed, 141 insertions(+)
789c7f
789c7f
diff --git a/src/sbd-common.c b/src/sbd-common.c
789c7f
index 873a76e..ebfdaa3 100644
789c7f
--- a/src/sbd-common.c
789c7f
+++ b/src/sbd-common.c
789c7f
@@ -662,6 +662,112 @@ static void sbd_memlock(int stackgrowK, int heapgrowK)
789c7f
 #endif
789c7f
 }
789c7f
 
789c7f
+/* Return cpu.rt_runtime_us of the cgroup that /proc/<pid>/cgroup lists for
789c7f
+ * "cpuacct", or -1 if it cannot be determined. */
789c7f
+static int get_realtime_budget(void)
789c7f
+{
789c7f
+    FILE *f;
789c7f
+    char fname[PATH_MAX];
789c7f
+    int res = -1, lnum = 0, num;
789c7f
+    char *cgroup = NULL, *namespecs = NULL;
789c7f
+
789c7f
+    snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", (intmax_t)getpid());
789c7f
+    f = fopen(fname, "rt");
789c7f
+    if (f == NULL) {
789c7f
+        cl_log(LOG_WARNING, "Can't open cgroup file for pid=%jd",
789c7f
+                            (intmax_t)getpid());
789c7f
+        goto exit_res;
789c7f
+    }
789c7f
+    while ((num = fscanf(f, "%d:%m[^:]:%m[^\n]", &lnum,
789c7f
+                         &namespecs, &cgroup)) != EOF) {
789c7f
+        if (namespecs && strstr(namespecs, "cpuacct")) {
789c7f
+            free(namespecs);
789c7f
+            break; /* cgroup keeps the path of the matching line */
789c7f
+        }
789c7f
+        free(cgroup);
789c7f
+        cgroup = NULL;
789c7f
+        free(namespecs);
789c7f
+        namespecs = NULL;
789c7f
+        /* drop the rest of an unparsable line or fscanf never advances */
789c7f
+        if (num < 3 && fscanf(f, "%*[^\n]") == EOF) {
789c7f
+            break;
789c7f
+        }
789c7f
+    }
789c7f
+    fclose(f);
789c7f
+    if (cgroup == NULL) {
789c7f
+        cl_log(LOG_WARNING, "Failed getting cgroup for pid=%jd",
789c7f
+                            (intmax_t)getpid());
789c7f
+        goto exit_res;
789c7f
+    }
789c7f
+    snprintf(fname, PATH_MAX, "/sys/fs/cgroup/cpu%s/cpu.rt_runtime_us", cgroup);
789c7f
+    f = fopen(fname, "rt");
789c7f
+    if (f == NULL) {
789c7f
+        cl_log(LOG_WARNING, "cpu.rt_runtime_us existed for root-slice but "
789c7f
+            "doesn't for '%s'", cgroup);
789c7f
+        goto exit_res;
789c7f
+    }
789c7f
+    if (fscanf(f, "%d", &res) != 1) {
789c7f
+        cl_log(LOG_WARNING, "failed reading rt-budget from %s", fname);
789c7f
+    } else {
789c7f
+        cl_log(LOG_INFO, "slice='%s' has rt-budget=%d", cgroup, res);
789c7f
+    }
789c7f
+    fclose(f);
789c7f
+
789c7f
+exit_res:
789c7f
+    free(cgroup);
789c7f
+    return res;
789c7f
+}
789c7f
+
789c7f
+/* Move this process to the root cpu cgroup (approach taken from corosync)
789c7f
+ * unless, with enforce_root_cgroup unset, its cgroup has rt-budget > 0.
789c7f
+ * Returns 0 on success or if there was nothing to do, -1 on failure. */
789c7f
+static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) {
789c7f
+    FILE *f;
789c7f
+    int res = -1;
789c7f
+
789c7f
+    /*
789c7f
+     * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now
789c7f
+     * using systemd and systemd uses hardcoded path of cgroup mount point.
789c7f
+     *
789c7f
+     * This feature is expected to be removed as soon as systemd gets support
789c7f
+     * for managing RT configuration.
789c7f
+     */
789c7f
+    f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt");
789c7f
+    if (f == NULL) {
789c7f
+        cl_log(LOG_DEBUG, "cpu.rt_runtime_us doesn't exist -> "
789c7f
+            "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
789c7f
+        res = 0;
789c7f
+        goto exit_res;
789c7f
+    }
789c7f
+    fclose(f);
789c7f
+
789c7f
+    if ((!enforce_root_cgroup) && (get_realtime_budget() > 0)) {
789c7f
+        cl_log(LOG_DEBUG, "looks as if we have rt-budget in the slice we are "
789c7f
+                          "-> skip moving to root-slice");
789c7f
+        res = 0;
789c7f
+        goto exit_res;
789c7f
+    }
789c7f
+
789c7f
+    f = fopen("/sys/fs/cgroup/cpu/tasks", "w");
789c7f
+    if (f == NULL) {
789c7f
+        cl_log(LOG_WARNING, "Can't open cgroups tasks file for writing");
789c7f
+        goto exit_res;
789c7f
+    }
789c7f
+
789c7f
+    /* stdio buffers the pid; only a clean fclose proves it was written */
789c7f
+    res = (fprintf(f, "%jd\n", (intmax_t)getpid()) > 0) ? 0 : -1;
789c7f
+    if (res != 0) {
789c7f
+        cl_log(LOG_WARNING, "Can't write sbd pid into cgroups tasks file");
789c7f
+    }
789c7f
+    if (fclose(f) != 0) {
789c7f
+        cl_log(LOG_WARNING, "Can't close cgroups tasks file");
789c7f
+        res = -1;
789c7f
+    }
789c7f
+
789c7f
+exit_res:
789c7f
+    return (res);
789c7f
+}
789c7f
+
789c7f
 void
789c7f
 sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
789c7f
 {
789c7f
@@ -670,6 +776,10 @@ sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
789c7f
     }
789c7f
 
789c7f
 #ifdef SCHED_RR
789c7f
+    if (move_to_root_cgroup) {
789c7f
+        sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup);
789c7f
+    }
789c7f
+
789c7f
     {
789c7f
         int pcurrent = 0;
789c7f
         int pmin = sched_get_priority_min(SCHED_RR);
789c7f
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
789c7f
index abde4e5..cef5cc7 100644
789c7f
--- a/src/sbd-inquisitor.c
789c7f
+++ b/src/sbd-inquisitor.c
789c7f
@@ -33,6 +33,8 @@ int	start_mode = 0;
789c7f
 char*	pidfile = NULL;
789c7f
 bool do_flush = true;
789c7f
 char timeout_sysrq_char = 'b';
789c7f
+bool move_to_root_cgroup = true;
789c7f
+bool enforce_moving_to_root_cgroup = false;
789c7f
 
789c7f
 int parse_device_line(const char *line);
789c7f
 
789c7f
@@ -965,6 +967,19 @@ int main(int argc, char **argv, char **envp)
789c7f
             timeout_action = strdup(value);
789c7f
         }
789c7f
 
789c7f
+        value = getenv("SBD_MOVE_TO_ROOT_CGROUP");
789c7f
+        if(value) {
789c7f
+            move_to_root_cgroup = crm_is_true(value);
789c7f
+
789c7f
+            if (move_to_root_cgroup) {
789c7f
+               enforce_moving_to_root_cgroup = true;
789c7f
+            } else {
789c7f
+                if (strcmp(value, "auto") == 0) {
789c7f
+                    move_to_root_cgroup = true;
789c7f
+                }
789c7f
+            }
789c7f
+        }
789c7f
+
789c7f
 	while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) {
789c7f
 		switch (c) {
789c7f
 		case 'D':
789c7f
diff --git a/src/sbd.h b/src/sbd.h
789c7f
index 3b05a11..ac30ec7 100644
789c7f
--- a/src/sbd.h
789c7f
+++ b/src/sbd.h
789c7f
@@ -159,6 +159,8 @@ extern bool watchdogdev_is_default;
789c7f
 extern char*  local_uname;
789c7f
 extern bool do_flush;
789c7f
 extern char timeout_sysrq_char;
789c7f
+extern bool move_to_root_cgroup;
789c7f
+extern bool enforce_moving_to_root_cgroup;
789c7f
 
789c7f
 /* Global, non-tunable variables: */
789c7f
 extern int  sector_size;
789c7f
diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig
789c7f
index f163f21..e1a60ed 100644
789c7f
--- a/src/sbd.sysconfig
789c7f
+++ b/src/sbd.sysconfig
789c7f
@@ -91,6 +91,20 @@ SBD_WATCHDOG_TIMEOUT=5
789c7f
 #
789c7f
 SBD_TIMEOUT_ACTION=flush,reboot
789c7f
 
789c7f
+## Type: yesno / auto
789c7f
+## Default: auto
789c7f
+#
789c7f
+# If CPUAccounting is enabled, the default is not to assign any RT-budget
789c7f
+# to the system.slice, which prevents sbd from running RR-scheduled.
789c7f
+#
789c7f
+# One way to escape that issue is to move sbd-processes from the
789c7f
+# slice they were originally started in to the root-slice.
789c7f
+# Of course starting sbd in a certain slice might be intentional.
789c7f
+# Thus in auto-mode sbd will check if the slice has RT-budget assigned.
789c7f
+# If that is the case sbd will stay in that slice while it will
789c7f
+# be moved to root-slice otherwise.
789c7f
+SBD_MOVE_TO_ROOT_CGROUP=auto
789c7f
+
789c7f
 ## Type: string
789c7f
 ## Default: ""
789c7f
 #
789c7f
-- 
789c7f
1.8.3.1
789c7f