a19bc6
From 7ec6e537898e139cc33017e03465ef40a86dd433 Mon Sep 17 00:00:00 2001
a19bc6
From: Lennart Poettering <lennart@poettering.net>
a19bc6
Date: Tue, 19 Jul 2016 15:58:49 +0200
a19bc6
Subject: [PATCH] core: support percentage specifications on TasksMax=
a19bc6
a19bc6
This adds support for a TasksMax=40% syntax for specifying values relative to
a19bc6
the system's configured maximum number of processes. This is useful in order to
a19bc6
neatly subdivide the available room for tasks within containers.
a19bc6
a19bc6
Cherry-picked from: 83f8e80857090f63cf6a02c54d381dad3c0fad55
a19bc6
Related: #1337244
a19bc6
---
a19bc6
 man/systemd.resource-control.xml | 15 +++----
a19bc6
 src/core/dbus-cgroup.c           | 22 +++++++++++
a19bc6
 src/core/load-fragment.c         | 15 +++++--
a19bc6
 src/libsystemd/sd-bus/bus-util.c | 19 +++++++++
a19bc6
 src/shared/util.c                | 84 ++++++++++++++++++++++++++++++++++++++++
a19bc6
 src/shared/util.h                |  5 +++
a19bc6
 src/test/test-util.c             | 39 +++++++++++++++++++
a19bc6
 7 files changed, 187 insertions(+), 12 deletions(-)
a19bc6
a19bc6
diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml
a19bc6
index 217105e..f507c67 100644
a19bc6
--- a/man/systemd.resource-control.xml
a19bc6
+++ b/man/systemd.resource-control.xml
a19bc6
@@ -221,15 +221,12 @@
a19bc6
         <term><varname>TasksMax=<replaceable>N</replaceable></varname></term>
a19bc6
 
a19bc6
         <listitem>
a19bc6
-          <para>Specify the maximum number of tasks that may be
a19bc6
-          created in the unit. This ensures that the number of tasks
a19bc6
-          accounted for the unit (see above) stays below a specific
a19bc6
-          limit. If assigned the special value
a19bc6
-          <literal>infinity</literal> no tasks limit is applied. This
a19bc6
-          controls the <literal>pids.max</literal> control group
a19bc6
-          attribute. For details about this control group attribute,
a19bc6
-          see <ulink
a19bc6
-          url="https://www.kernel.org/doc/Documentation/cgroups/pids.txt">pids.txt</ulink>.</para>
a19bc6
+          <para>Specify the maximum number of tasks that may be created in the unit. This ensures that the number of
a19bc6
+          tasks accounted for the unit (see above) stays below a specific limit. This either takes an absolute number
a19bc6
+          of tasks or a percentage value that is taken relative to the configured maximum number of tasks on the
a19bc6
+          system.  If assigned the special value <literal>infinity</literal>, no tasks limit is applied. This controls
a19bc6
+          the <literal>pids.max</literal> control group attribute. For details about this control group attribute, see
a19bc6
+          <ulink url="https://www.kernel.org/doc/Documentation/cgroup-v1/pids.txt">pids.txt</ulink>.</para>
a19bc6
 
a19bc6
           <para>Implies <literal>TasksAccounting=true</literal>. The
a19bc6
           system default for this setting may be controlled with
a19bc6
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
a19bc6
index a4465dc..fa76c60 100644
a19bc6
--- a/src/core/dbus-cgroup.c
a19bc6
+++ b/src/core/dbus-cgroup.c
a19bc6
@@ -694,6 +694,8 @@ int bus_cgroup_set_property(
a19bc6
                 r = sd_bus_message_read(message, "t", &limit);
a19bc6
                 if (r < 0)
a19bc6
                         return r;
a19bc6
+                if (limit <= 0)
a19bc6
+                        return sd_bus_error_set_errnof(error, EINVAL, "%s= is too small", name);
a19bc6
 
a19bc6
                 if (mode != UNIT_CHECK) {
a19bc6
                         c->tasks_max = limit;
a19bc6
@@ -706,6 +708,26 @@ int bus_cgroup_set_property(
a19bc6
                 }
a19bc6
 
a19bc6
                 return 1;
a19bc6
+        } else if (streq(name, "TasksMaxScale")) {
a19bc6
+                uint64_t limit;
a19bc6
+                uint32_t raw;
a19bc6
+
a19bc6
+                r = sd_bus_message_read(message, "u", &raw);
a19bc6
+                if (r < 0)
a19bc6
+                        return r;
a19bc6
+
a19bc6
+                limit = system_tasks_max_scale(raw, UINT32_MAX);
a19bc6
+                if (limit <= 0 || limit >= UINT64_MAX)
a19bc6
+                        return sd_bus_error_set_errnof(error, EINVAL, "%s= is out of range", name);
a19bc6
+
a19bc6
+                if (mode != UNIT_CHECK) {
a19bc6
+                        c->tasks_max = limit;
a19bc6
+                        u->cgroup_realized_mask &= ~CGROUP_PIDS;
a19bc6
+                        unit_write_drop_in_private_format(u, mode, name, "TasksMax=%" PRIu32 "%%",
a19bc6
+                                                          (uint32_t) (DIV_ROUND_UP((uint64_t) raw * 100U, (uint64_t) UINT32_MAX)));
a19bc6
+                }
a19bc6
+
a19bc6
+                return 1;
a19bc6
         }
a19bc6
 
a19bc6
         if (u->transient && u->load_state == UNIT_STUB) {
a19bc6
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
a19bc6
index c1ffee2..4114750 100644
a19bc6
--- a/src/core/load-fragment.c
a19bc6
+++ b/src/core/load-fragment.c
a19bc6
@@ -3089,9 +3089,18 @@ int config_parse_tasks_max(
a19bc6
                 return 0;
a19bc6
         }
a19bc6
 
a19bc6
-        r = safe_atou64(rvalue, &u);
a19bc6
-        if (r < 0 || u < 1) {
a19bc6
-                log_syntax(unit, LOG_ERR, filename, line, EINVAL, "Maximum tasks value '%s' invalid. Ignoring.", rvalue);
a19bc6
+        r = parse_percent(rvalue);
a19bc6
+        if (r < 0) {
a19bc6
+                r = safe_atou64(rvalue, &u);
a19bc6
+                if (r < 0) {
a19bc6
+                        log_syntax(unit, LOG_ERR, filename, line, r, "Maximum tasks value '%s' invalid. Ignoring.", rvalue);
a19bc6
+                        return 0;
a19bc6
+                }
a19bc6
+        } else
a19bc6
+                u = system_tasks_max_scale(r, 100U);
a19bc6
+
a19bc6
+        if (u <= 0 || u >= UINT64_MAX) {
a19bc6
+                log_syntax(unit, LOG_ERR, filename, line, 0, "Maximum tasks value '%s' out of range. Ignoring.", rvalue);
a19bc6
                 return 0;
a19bc6
         }
a19bc6
 
a19bc6
diff --git a/src/libsystemd/sd-bus/bus-util.c b/src/libsystemd/sd-bus/bus-util.c
a19bc6
index ed0849b..f46fa2b 100644
a19bc6
--- a/src/libsystemd/sd-bus/bus-util.c
a19bc6
+++ b/src/libsystemd/sd-bus/bus-util.c
a19bc6
@@ -1409,7 +1409,26 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
a19bc6
                 }
a19bc6
 
a19bc6
                 r = sd_bus_message_append(m, "v", "t", (uint64_t) bytes);
a19bc6
+        } else if (streq(field, "TasksMax")) {
a19bc6
+                uint64_t t;
a19bc6
+
a19bc6
+                if (isempty(eq) || streq(eq, "infinity"))
a19bc6
+                        t = (uint64_t) -1;
a19bc6
+                else {
a19bc6
+                        r = parse_percent(eq);
a19bc6
+                        if (r >= 0) {
a19bc6
+                                r = sd_bus_message_append(m, "sv", "TasksMaxScale", "u", (uint32_t) (((uint64_t) UINT32_MAX * r) / 100U));
a19bc6
+                                if (r < 0)
a19bc6
+                                        return bus_log_create_error(r);
a19bc6
+                        } else {
a19bc6
+                                r = safe_atou64(eq, &t);
a19bc6
+                                if (r < 0)
a19bc6
+                                        return log_error_errno(r, "Failed to parse maximum tasks specification %s", assignment);
a19bc6
+                        }
a19bc6
+
a19bc6
+                }
a19bc6
 
a19bc6
+                r = sd_bus_message_append(m, "sv", "TasksMax", "t", t);
a19bc6
         } else if (STR_IN_SET(field, "CPUShares", "BlockIOWeight")) {
a19bc6
                 uint64_t u;
a19bc6
 
a19bc6
diff --git a/src/shared/util.c b/src/shared/util.c
a19bc6
index cadadde..bbb4577 100644
a19bc6
--- a/src/shared/util.c
a19bc6
+++ b/src/shared/util.c
a19bc6
@@ -94,6 +94,7 @@
a19bc6
 #include "def.h"
a19bc6
 #include "sparse-endian.h"
a19bc6
 #include "conf-parser.h"
a19bc6
+#include "cgroup-util.h"
a19bc6
 
a19bc6
 int saved_argc = 0;
a19bc6
 char **saved_argv = NULL;
a19bc6
@@ -8707,3 +8708,86 @@ int extract_many_words(const char **p, const char *separators, ExtractFlags flag
a19bc6
 
a19bc6
         return c;
a19bc6
 }
a19bc6
+
a19bc6
+int parse_percent_unbounded(const char *p) {
a19bc6
+        const char *pc, *n;
a19bc6
+        unsigned v;
a19bc6
+        int r;
a19bc6
+
a19bc6
+        pc = endswith(p, "%");
a19bc6
+        if (!pc)
a19bc6
+                return -EINVAL;
a19bc6
+
a19bc6
+        n = strndupa(p, pc - p);
a19bc6
+        r = safe_atou(n, &v);
a19bc6
+        if (r < 0)
a19bc6
+                return r;
a19bc6
+
a19bc6
+        return (int) v;
a19bc6
+}
a19bc6
+
a19bc6
+int parse_percent(const char *p) {
a19bc6
+        int v;
a19bc6
+
a19bc6
+        v = parse_percent_unbounded(p);
a19bc6
+        if (v > 100)
a19bc6
+                return -ERANGE;
a19bc6
+
a19bc6
+        return v;
a19bc6
+}
a19bc6
+
a19bc6
+uint64_t system_tasks_max(void) {
a19bc6
+
a19bc6
+#if SIZEOF_PID_T == 4
a19bc6
+#define TASKS_MAX ((uint64_t) (INT32_MAX-1))
a19bc6
+#elif SIZEOF_PID_T == 2
a19bc6
+#define TASKS_MAX ((uint64_t) (INT16_MAX-1))
a19bc6
+#else
a19bc6
+#error "Unknown pid_t size"
a19bc6
+#endif
a19bc6
+
a19bc6
+        _cleanup_free_ char *value = NULL, *root = NULL;
a19bc6
+        uint64_t a = TASKS_MAX, b = TASKS_MAX;
a19bc6
+
a19bc6
+        /* Determine the maximum number of tasks that may run on this system. We check three sources to determine this
a19bc6
+         * limit:
a19bc6
+         *
a19bc6
+         * a) the maximum value for the pid_t type
a19bc6
+         * b) the cgroups pids_max attribute for the system
a19bc6
+         * c) the kernel's configured maximum PID value
a19bc6
+         *
a19bc6
+         * And then pick the smallest of the three */
a19bc6
+
a19bc6
+        if (read_one_line_file("/proc/sys/kernel/pid_max", &value) >= 0)
a19bc6
+                (void) safe_atou64(value, &a);
a19bc6
+
a19bc6
+        if (cg_get_root_path(&root) >= 0) {
a19bc6
+                free(value);
a19bc6
+                value = NULL;
a19bc6
+
a19bc6
+                if (cg_get_attribute("pids", root, "pids.max", &value) >= 0)
a19bc6
+                        (void) safe_atou64(value, &b);
a19bc6
+        }
a19bc6
+
a19bc6
+        return MIN3(TASKS_MAX,
a19bc6
+                    a <= 0 ? TASKS_MAX : a,
a19bc6
+                    b <= 0 ? TASKS_MAX : b);
a19bc6
+}
a19bc6
+
a19bc6
+uint64_t system_tasks_max_scale(uint64_t v, uint64_t max) {
a19bc6
+        uint64_t t, m;
a19bc6
+
a19bc6
+        assert(max > 0);
a19bc6
+
a19bc6
+        /* Multiply the system's task value by the fraction v/max. Hence, if max==100 this calculates percentages
a19bc6
+         * relative to the system's maximum number of tasks. Returns UINT64_MAX on overflow. */
a19bc6
+
a19bc6
+        t = system_tasks_max();
a19bc6
+        assert(t > 0);
a19bc6
+
a19bc6
+        m = t * v;
a19bc6
+        if (m / t != v) /* overflow? */
a19bc6
+                return UINT64_MAX;
a19bc6
+
a19bc6
+        return m / max;
a19bc6
+}
a19bc6
diff --git a/src/shared/util.h b/src/shared/util.h
a19bc6
index 12afcc3..f1b6c34 100644
a19bc6
--- a/src/shared/util.h
a19bc6
+++ b/src/shared/util.h
a19bc6
@@ -1098,3 +1098,8 @@ typedef enum ExtractFlags {
a19bc6
 int extract_first_word(const char **p, char **ret, const char *separators, ExtractFlags flags);
a19bc6
 int extract_first_word_and_warn(const char **p, char **ret, const char *separators, ExtractFlags flags, const char *unit, const char *filename, unsigned line, const char *rvalue);
a19bc6
 int extract_many_words(const char **p, const char *separators, ExtractFlags flags, ...) _sentinel_;
a19bc6
+int parse_percent_unbounded(const char *p);
a19bc6
+int parse_percent(const char *p);
a19bc6
+
a19bc6
+uint64_t system_tasks_max(void);
a19bc6
+uint64_t system_tasks_max_scale(uint64_t v, uint64_t max);
a19bc6
diff --git a/src/test/test-util.c b/src/test/test-util.c
a19bc6
index 9ae347b..971f97d 100644
a19bc6
--- a/src/test/test-util.c
a19bc6
+++ b/src/test/test-util.c
a19bc6
@@ -1530,6 +1530,43 @@ static void test_shell_maybe_quote(void) {
a19bc6
         test_shell_maybe_quote_one("foo$bar", "\"foo\\$bar\"");
a19bc6
 }
a19bc6
 
a19bc6
+static void test_system_tasks_max(void) {
a19bc6
+        uint64_t t;
a19bc6
+
a19bc6
+        t = system_tasks_max();
a19bc6
+        assert_se(t > 0);
a19bc6
+        assert_se(t < UINT64_MAX);
a19bc6
+
a19bc6
+        log_info("Max tasks: %" PRIu64, t);
a19bc6
+}
a19bc6
+
a19bc6
+static void test_system_tasks_max_scale(void) {
a19bc6
+        uint64_t t;
a19bc6
+
a19bc6
+        t = system_tasks_max();
a19bc6
+
a19bc6
+        assert_se(system_tasks_max_scale(0, 100) == 0);
a19bc6
+        assert_se(system_tasks_max_scale(100, 100) == t);
a19bc6
+
a19bc6
+        assert_se(system_tasks_max_scale(0, 1) == 0);
a19bc6
+        assert_se(system_tasks_max_scale(1, 1) == t);
a19bc6
+        assert_se(system_tasks_max_scale(2, 1) == 2*t);
a19bc6
+
a19bc6
+        assert_se(system_tasks_max_scale(0, 2) == 0);
a19bc6
+        assert_se(system_tasks_max_scale(1, 2) == t/2);
a19bc6
+        assert_se(system_tasks_max_scale(2, 2) == t);
a19bc6
+        assert_se(system_tasks_max_scale(3, 2) == (3*t)/2);
a19bc6
+        assert_se(system_tasks_max_scale(4, 2) == t*2);
a19bc6
+
a19bc6
+        assert_se(system_tasks_max_scale(0, UINT32_MAX) == 0);
a19bc6
+        assert_se(system_tasks_max_scale((UINT32_MAX-1)/2, UINT32_MAX-1) == t/2);
a19bc6
+        assert_se(system_tasks_max_scale(UINT32_MAX, UINT32_MAX) == t);
a19bc6
+
a19bc6
+        /* overflow */
a19bc6
+
a19bc6
+        assert_se(system_tasks_max_scale(UINT64_MAX/4, UINT64_MAX) == UINT64_MAX);
a19bc6
+}
a19bc6
+
a19bc6
 int main(int argc, char *argv[]) {
a19bc6
         log_parse_environment();
a19bc6
         log_open();
a19bc6
@@ -1608,6 +1645,8 @@ int main(int argc, char *argv[]) {
a19bc6
         test_uid_ptr();
a19bc6
         test_sparse_write();
a19bc6
         test_shell_maybe_quote();
a19bc6
+        test_system_tasks_max();
a19bc6
+        test_system_tasks_max_scale();
a19bc6
 
a19bc6
         return 0;
a19bc6
 }