8be66a
From 1a822dbe19ab6634ffb2c0d3ce92b27b503e1612 Mon Sep 17 00:00:00 2001
8be66a
From: =?UTF-8?q?Michal=20Sekleta=CC=81r?= <msekleta@redhat.com>
8be66a
Date: Mon, 17 Feb 2020 13:50:31 +0100
8be66a
Subject: [PATCH] core: add support for setting CPUAffinity= to special "numa"
8be66a
 value
8be66a
8be66a
systemd will automatically derive CPU affinity mask from NUMA node
8be66a
mask.
8be66a
8be66a
Fixes #13248
8be66a
8be66a
(cherry picked from commit e2b2fb7f566d13a3de61952b5356cd4d2eaee917)
8be66a
8be66a
Resolves: #1740657
8be66a
---
8be66a
 man/systemd.exec.xml                 |  9 +++---
8be66a
 src/basic/cpu-set-util.c             | 43 ++++++++++++++++++++++++--
8be66a
 src/basic/cpu-set-util.h             |  1 +
8be66a
 src/core/dbus-execute.c              | 30 +++++++++++++++++-
8be66a
 src/core/execute.c                   | 46 ++++++++++++++++++++++++++--
8be66a
 src/core/execute.h                   |  3 ++
8be66a
 src/core/load-fragment.c             | 14 ++++++++-
8be66a
 src/shared/bus-unit-util.c           |  9 ++++++
8be66a
 src/test/test-cpu-set-util.c         |  6 ++--
8be66a
 test/TEST-36-NUMAPOLICY/testsuite.sh | 18 +++++++++++
8be66a
 10 files changed, 166 insertions(+), 13 deletions(-)
8be66a
8be66a
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
8be66a
index e2a5ede968..696438c4ef 100644
8be66a
--- a/man/systemd.exec.xml
8be66a
+++ b/man/systemd.exec.xml
8be66a
@@ -706,10 +706,11 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
8be66a
         <term><varname>CPUAffinity=</varname></term>
8be66a
 
8be66a
         <listitem><para>Controls the CPU affinity of the executed processes. Takes a list of CPU indices or ranges
8be66a
-        separated by either whitespace or commas. CPU ranges are specified by the lower and upper CPU indices separated
8be66a
-        by a dash. This option may be specified more than once, in which case the specified CPU affinity masks are
8be66a
-        merged. If the empty string is assigned, the mask is reset, all assignments prior to this will have no
8be66a
-        effect. See
8be66a
+        separated by either whitespace or commas. Alternatively, takes the special value "numa", in which case systemd
8be66a
+        automatically derives the allowed CPU range from the value of the <varname>NUMAMask=</varname> option. CPU ranges
8be66a
+        are specified by the lower and upper CPU indices separated by a dash. This option may be specified more than
8be66a
+        once, in which case the specified CPU affinity masks are merged. If the empty string is assigned, the mask
8be66a
+        is reset, all assignments prior to this will have no effect. See
8be66a
         <citerefentry><refentrytitle>sched_setaffinity</refentrytitle><manvolnum>2</manvolnum></citerefentry> for
8be66a
         details.</para></listitem>
8be66a
       </varlistentry>
8be66a
diff --git a/src/basic/cpu-set-util.c b/src/basic/cpu-set-util.c
8be66a
index 51752ad1a6..1922c95864 100644
8be66a
--- a/src/basic/cpu-set-util.c
8be66a
+++ b/src/basic/cpu-set-util.c
8be66a
@@ -12,12 +12,14 @@
8be66a
 #include "cpu-set-util.h"
8be66a
 #include "dirent-util.h"
8be66a
 #include "extract-word.h"
8be66a
+#include "fileio.h"
8be66a
 #include "fd-util.h"
8be66a
 #include "log.h"
8be66a
 #include "macro.h"
8be66a
 #include "missing.h"
8be66a
 #include "parse-util.h"
8be66a
 #include "stat-util.h"
8be66a
+#include "stdio-util.h"
8be66a
 #include "string-util.h"
8be66a
 #include "string-table.h"
8be66a
 #include "strv.h"
8be66a
@@ -179,7 +181,7 @@ int cpu_set_add_all(CPUSet *a, const CPUSet *b) {
8be66a
                                 return r;
8be66a
                 }
8be66a
 
8be66a
-        return 0;
8be66a
+        return 1;
8be66a
 }
8be66a
 
8be66a
 int parse_cpu_set_full(
8be66a
@@ -264,7 +266,7 @@ int parse_cpu_set_extend(
8be66a
         if (!old->set) {
8be66a
                 *old = cpuset;
8be66a
                 cpuset = (CPUSet) {};
8be66a
-                return 0;
8be66a
+                return 1;
8be66a
         }
8be66a
 
8be66a
         return cpu_set_add_all(old, &cpuset);
8be66a
@@ -417,6 +419,43 @@ int apply_numa_policy(const NUMAPolicy *policy) {
8be66a
         return 0;
8be66a
 }
8be66a
 
8be66a
+int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret) {
8be66a
+        int r;
8be66a
+        size_t i;
8be66a
+        _cleanup_(cpu_set_reset) CPUSet s = {};
8be66a
+
8be66a
+        assert(policy);
8be66a
+        assert(ret);
8be66a
+
8be66a
+        for (i = 0; i < policy->nodes.allocated * 8; i++) {
8be66a
+                _cleanup_free_ char *l = NULL;
8be66a
+                char p[STRLEN("/sys/devices/system/node/node//cpulist") + DECIMAL_STR_MAX(size_t) + 1];
8be66a
+                _cleanup_(cpu_set_reset) CPUSet part = {};
8be66a
+
8be66a
+                if (!CPU_ISSET_S(i, policy->nodes.allocated, policy->nodes.set))
8be66a
+                        continue;
8be66a
+
8be66a
+                xsprintf(p, "/sys/devices/system/node/node%zu/cpulist", i);
8be66a
+
8be66a
+                r = read_one_line_file(p, &l);
8be66a
+                if (r < 0)
8be66a
+                        return r;
8be66a
+
8be66a
+                r = parse_cpu_set(l, &part);
8be66a
+                if (r < 0)
8be66a
+                        return r;
8be66a
+
8be66a
+                r = cpu_set_add_all(&s, &part);
8be66a
+                if (r < 0)
8be66a
+                        return r;
8be66a
+        }
8be66a
+
8be66a
+        *ret = s;
8be66a
+        s = (CPUSet) {};
8be66a
+
8be66a
+        return 0;
8be66a
+}
8be66a
+
8be66a
 static const char* const mpol_table[] = {
8be66a
         [MPOL_DEFAULT]    = "default",
8be66a
         [MPOL_PREFERRED]  = "preferred",
8be66a
diff --git a/src/basic/cpu-set-util.h b/src/basic/cpu-set-util.h
8be66a
index 8519a9b6c8..795be807af 100644
8be66a
--- a/src/basic/cpu-set-util.h
8be66a
+++ b/src/basic/cpu-set-util.h
8be66a
@@ -78,6 +78,7 @@ static inline void numa_policy_reset(NUMAPolicy *p) {
8be66a
 }
8be66a
 
8be66a
 int apply_numa_policy(const NUMAPolicy *policy);
8be66a
+int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret);
8be66a
 
8be66a
 const char* mpol_to_string(int i) _const_;
8be66a
 int mpol_from_string(const char *s) _pure_;
8be66a
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
8be66a
index d5acca384f..0fe4c14e48 100644
8be66a
--- a/src/core/dbus-execute.c
8be66a
+++ b/src/core/dbus-execute.c
8be66a
@@ -58,6 +58,8 @@ static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext,
8be66a
 static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
8be66a
 static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
8be66a
 static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
8be66a
+static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
8be66a
+
8be66a
 
8be66a
 static int property_get_environment_files(
8be66a
                 sd_bus *bus,
8be66a
@@ -215,6 +217,7 @@ static int property_get_cpu_affinity(
8be66a
                 sd_bus_error *error) {
8be66a
 
8be66a
         ExecContext *c = userdata;
8be66a
+        _cleanup_(cpu_set_reset) CPUSet s = {};
8be66a
         _cleanup_free_ uint8_t *array = NULL;
8be66a
         size_t allocated;
8be66a
 
8be66a
@@ -222,7 +225,16 @@ static int property_get_cpu_affinity(
8be66a
         assert(reply);
8be66a
         assert(c);
8be66a
 
8be66a
-        (void) cpu_set_to_dbus(&c->cpu_set, &array, &allocated);
8be66a
+        if (c->cpu_affinity_from_numa) {
8be66a
+                int r;
8be66a
+
8be66a
+                r = numa_to_cpu_set(&c->numa_policy, &s);
8be66a
+                if (r < 0)
8be66a
+                        return r;
8be66a
+        }
8be66a
+
8be66a
+        (void) cpu_set_to_dbus(c->cpu_affinity_from_numa ? &s : &c->cpu_set,  &array, &allocated);
8be66a
+
8be66a
         return sd_bus_message_append_array(reply, 'y', array, allocated);
8be66a
 }
8be66a
 
8be66a
@@ -743,6 +755,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
8be66a
         SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
8be66a
         SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST),
8be66a
         SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST),
8be66a
+        SD_BUS_PROPERTY("CPUAffinityFromNUMA", "b", property_get_cpu_affinity_from_numa, 0, SD_BUS_VTABLE_PROPERTY_CONST),
8be66a
         SD_BUS_PROPERTY("NUMAPolicy", "i", property_get_numa_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
8be66a
         SD_BUS_PROPERTY("NUMAMask", "ay", property_get_numa_mask, 0, SD_BUS_VTABLE_PROPERTY_CONST),
8be66a
         SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
8be66a
@@ -1639,6 +1652,20 @@ int bus_exec_context_set_transient_property(
8be66a
 
8be66a
                 return 1;
8be66a
 
8be66a
+        } else if (streq(name, "CPUAffinityFromNUMA")) {
8be66a
+                int q;
8be66a
+
8be66a
+                r = sd_bus_message_read_basic(message, 'b', &q);
8be66a
+                if (r < 0)
8be66a
+                        return r;
8be66a
+
8be66a
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
8be66a
+                        c->cpu_affinity_from_numa = q;
8be66a
+                        unit_write_settingf(u, flags, name, "%s=%s", "CPUAffinity", "numa");
8be66a
+                }
8be66a
+
8be66a
+                return 1;
8be66a
+
8be66a
         } else if (streq(name, "NUMAPolicy")) {
8be66a
                 int32_t type;
8be66a
 
8be66a
@@ -1653,6 +1680,7 @@ int bus_exec_context_set_transient_property(
8be66a
                         c->numa_policy.type = type;
8be66a
 
8be66a
                 return 1;
8be66a
+
8be66a
         } else if (streq(name, "IOSchedulingClass")) {
8be66a
                 int32_t q;
8be66a
 
8be66a
diff --git a/src/core/execute.c b/src/core/execute.c
8be66a
index 3c54ac1110..d528d08830 100644
8be66a
--- a/src/core/execute.c
8be66a
+++ b/src/core/execute.c
8be66a
@@ -2750,6 +2750,33 @@ static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p
8be66a
 
8be66a
 static char *exec_command_line(char **argv);
8be66a
 
8be66a
+static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
8be66a
+        _cleanup_(cpu_set_reset) CPUSet s = {};
8be66a
+        int r;
8be66a
+
8be66a
+        assert(c);
8be66a
+        assert(ret);
8be66a
+
8be66a
+        if (!c->numa_policy.nodes.set) {
8be66a
+                log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
8be66a
+                return 0;
8be66a
+        }
8be66a
+
8be66a
+        r = numa_to_cpu_set(&c->numa_policy, &s);
8be66a
+        if (r < 0)
8be66a
+                return r;
8be66a
+
8be66a
+        cpu_set_reset(ret);
8be66a
+
8be66a
+        return cpu_set_add_all(ret, &s);
8be66a
+}
8be66a
+
8be66a
+bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
8be66a
+        assert(c);
8be66a
+
8be66a
+        return c->cpu_affinity_from_numa;
8be66a
+}
8be66a
+
8be66a
 static int exec_child(
8be66a
                 Unit *unit,
8be66a
                 const ExecCommand *command,
8be66a
@@ -3012,11 +3039,26 @@ static int exec_child(
8be66a
                 }
8be66a
         }
8be66a
 
8be66a
-        if (context->cpu_set.set)
8be66a
-                if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
8be66a
+        if (context->cpu_affinity_from_numa || context->cpu_set.set) {
8be66a
+                _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
8be66a
+                const CPUSet *cpu_set;
8be66a
+
8be66a
+                if (context->cpu_affinity_from_numa) {
8be66a
+                        r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
8be66a
+                        if (r < 0) {
8be66a
+                                *exit_status = EXIT_CPUAFFINITY;
8be66a
+                                return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
8be66a
+                        }
8be66a
+
8be66a
+                        cpu_set = &converted_cpu_set;
8be66a
+                } else
8be66a
+                        cpu_set = &context->cpu_set;
8be66a
+
8be66a
+                if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
8be66a
                         *exit_status = EXIT_CPUAFFINITY;
8be66a
                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
8be66a
                 }
8be66a
+        }
8be66a
 
8be66a
         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
8be66a
                 r = apply_numa_policy(&context->numa_policy);
8be66a
diff --git a/src/core/execute.h b/src/core/execute.h
8be66a
index 86c1cee84c..62c6229621 100644
8be66a
--- a/src/core/execute.h
8be66a
+++ b/src/core/execute.h
8be66a
@@ -152,6 +152,7 @@ struct ExecContext {
8be66a
 
8be66a
         CPUSet cpu_set;
8be66a
         NUMAPolicy numa_policy;
8be66a
+        bool cpu_affinity_from_numa;
8be66a
 
8be66a
         ExecInput std_input;
8be66a
         ExecOutput std_output;
8be66a
@@ -375,6 +376,8 @@ int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value,
8be66a
 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds);
8be66a
 void exec_runtime_vacuum(Manager *m);
8be66a
 
8be66a
+bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c);
8be66a
+
8be66a
 const char* exec_output_to_string(ExecOutput i) _const_;
8be66a
 ExecOutput exec_output_from_string(const char *s) _pure_;
8be66a
 
8be66a
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
8be66a
index 33fdb82754..740401a582 100644
8be66a
--- a/src/core/load-fragment.c
8be66a
+++ b/src/core/load-fragment.c
8be66a
@@ -1251,13 +1251,25 @@ int config_parse_exec_cpu_affinity(const char *unit,
8be66a
                                    void *userdata) {
8be66a
 
8be66a
         ExecContext *c = data;
8be66a
+        int r;
8be66a
 
8be66a
         assert(filename);
8be66a
         assert(lvalue);
8be66a
         assert(rvalue);
8be66a
         assert(data);
8be66a
 
8be66a
-        return parse_cpu_set_extend(rvalue, &c->cpu_set, true, unit, filename, line, lvalue);
8be66a
+        if (streq(rvalue, "numa")) {
8be66a
+                c->cpu_affinity_from_numa = true;
8be66a
+                cpu_set_reset(&c->cpu_set);
8be66a
+
8be66a
+                return 0;
8be66a
+        }
8be66a
+
8be66a
+        r = parse_cpu_set_extend(rvalue, &c->cpu_set, true, unit, filename, line, lvalue);
8be66a
+        if (r >= 0)
8be66a
+                c->cpu_affinity_from_numa = false;
8be66a
+
8be66a
+        return r;
8be66a
 }
8be66a
 
8be66a
 int config_parse_capability_set(
8be66a
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
8be66a
index 7029aa5615..daa2c2dce5 100644
8be66a
--- a/src/shared/bus-unit-util.c
8be66a
+++ b/src/shared/bus-unit-util.c
8be66a
@@ -26,6 +26,8 @@
8be66a
 #include "securebits-util.h"
8be66a
 #include "signal-util.h"
8be66a
 #include "socket-protocol-list.h"
8be66a
+#include "socket-util.h"
8be66a
+#include "stdio-util.h"
8be66a
 #include "string-util.h"
8be66a
 #include "syslog-util.h"
8be66a
 #include "terminal-util.h"
8be66a
@@ -997,6 +999,13 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
8be66a
                 _cleanup_free_ uint8_t *array = NULL;
8be66a
                 size_t allocated;
8be66a
 
8be66a
+                if (eq && streq(eq, "numa")) {
8be66a
+                        r = sd_bus_message_append(m, "(sv)", "CPUAffinityFromNUMA", "b", true);
8be66a
+                        if (r < 0)
8be66a
+                                return bus_log_create_error(r);
8be66a
+                        return r;
8be66a
+                }
8be66a
+
8be66a
                 r = parse_cpu_set(eq, &cpuset);
8be66a
                 if (r < 0)
8be66a
                         return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
8be66a
diff --git a/src/test/test-cpu-set-util.c b/src/test/test-cpu-set-util.c
8be66a
index 136eaca82d..1b7be5df4e 100644
8be66a
--- a/src/test/test-cpu-set-util.c
8be66a
+++ b/src/test/test-cpu-set-util.c
8be66a
@@ -218,12 +218,12 @@ static void test_parse_cpu_set_extend(void) {
8be66a
 
8be66a
         log_info("/* %s */", __func__);
8be66a
 
8be66a
-        assert_se(parse_cpu_set_extend("1 3", &c, true, NULL, "fake", 1, "CPUAffinity") == 0);
8be66a
+        assert_se(parse_cpu_set_extend("1 3", &c, true, NULL, "fake", 1, "CPUAffinity") == 1);
8be66a
         assert_se(CPU_COUNT_S(c.allocated, c.set) == 2);
8be66a
         assert_se(s1 = cpu_set_to_string(&c));
8be66a
         log_info("cpu_set_to_string: %s", s1);
8be66a
 
8be66a
-        assert_se(parse_cpu_set_extend("4", &c, true, NULL, "fake", 1, "CPUAffinity") == 0);
8be66a
+        assert_se(parse_cpu_set_extend("4", &c, true, NULL, "fake", 1, "CPUAffinity") == 1);
8be66a
         assert_se(CPU_COUNT_S(c.allocated, c.set) == 3);
8be66a
         assert_se(s2 = cpu_set_to_string(&c));
8be66a
         log_info("cpu_set_to_string: %s", s2);
8be66a
@@ -240,7 +240,7 @@ static void test_cpu_set_to_from_dbus(void) {
8be66a
 
8be66a
         log_info("/* %s */", __func__);
8be66a
 
8be66a
-        assert_se(parse_cpu_set_extend("1 3 8 100-200", &c, true, NULL, "fake", 1, "CPUAffinity") == 0);
8be66a
+        assert_se(parse_cpu_set_extend("1 3 8 100-200", &c, true, NULL, "fake", 1, "CPUAffinity") == 1);
8be66a
         assert_se(s = cpu_set_to_string(&c));
8be66a
         log_info("cpu_set_to_string: %s", s);
8be66a
         assert_se(CPU_COUNT_S(c.allocated, c.set) == 104);
8be66a
diff --git a/test/TEST-36-NUMAPOLICY/testsuite.sh b/test/TEST-36-NUMAPOLICY/testsuite.sh
8be66a
index bffac4ffe6..7ccaa5b412 100755
8be66a
--- a/test/TEST-36-NUMAPOLICY/testsuite.sh
8be66a
+++ b/test/TEST-36-NUMAPOLICY/testsuite.sh
8be66a
@@ -279,6 +279,18 @@ else
8be66a
     # Maks must be ignored
8be66a
     grep -E "set_mempolicy\((MPOL_LOCAL|0x4 [^,]*), NULL" $straceLog
8be66a
 
8be66a
+    echo "Unit file CPUAffinity=NUMA support"
8be66a
+    writeTestUnitNUMAPolicy "bind" "0"
8be66a
+    echo "CPUAffinity=numa" >> $testUnitNUMAConf
8be66a
+    systemctl daemon-reload
8be66a
+    systemctl start $testUnit
8be66a
+    systemctlCheckNUMAProperties $testUnit "bind" "0"
8be66a
+    pid=$(systemctl show --value -p MainPID $testUnit)
8be66a
+    cpulist=$(cat /sys/devices/system/node/node0/cpulist)
8be66a
+    affinity_systemd=$(systemctl show --value -p CPUAffinity $testUnit)
8be66a
+    [ $cpulist = $affinity_systemd ]
8be66a
+    pid1StopUnit $testUnit
8be66a
+
8be66a
     echo "systemd-run NUMAPolicy support"
8be66a
     runUnit='numa-systemd-run-test.service'
8be66a
 
8be66a
@@ -309,6 +321,12 @@ else
8be66a
     systemd-run -p NUMAPolicy=local -p NUMAMask=0 --unit $runUnit sleep 1000
8be66a
     systemctlCheckNUMAProperties $runUnit "local" ""
8be66a
     pid1StopUnit $runUnit
8be66a
+
8be66a
+    systemd-run -p NUMAPolicy=local -p NUMAMask=0 -p CPUAffinity=numa --unit $runUnit sleep 1000
8be66a
+    systemctlCheckNUMAProperties $runUnit "local" ""
8be66a
+    systemctl cat $runUnit | grep -q 'CPUAffinity=numa'
8be66a
+    pid1StopUnit $runUnit
8be66a
+
8be66a
 fi
8be66a
 
8be66a
 # Cleanup