From 1a822dbe19ab6634ffb2c0d3ce92b27b503e1612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Sekleta=CC=81r?= Date: Mon, 17 Feb 2020 13:50:31 +0100 Subject: [PATCH] core: add support for setting CPUAffinity= to special "numa" value systemd will automatically derive CPU affinity mask from NUMA node mask. Fixes #13248 (cherry picked from commit e2b2fb7f566d13a3de61952b5356cd4d2eaee917) Resolves: #1740657 --- man/systemd.exec.xml | 9 +++--- src/basic/cpu-set-util.c | 43 ++++++++++++++++++++++++-- src/basic/cpu-set-util.h | 1 + src/core/dbus-execute.c | 30 +++++++++++++++++- src/core/execute.c | 46 ++++++++++++++++++++++++++-- src/core/execute.h | 3 ++ src/core/load-fragment.c | 14 ++++++++- src/shared/bus-unit-util.c | 9 ++++++ src/test/test-cpu-set-util.c | 6 ++-- test/TEST-36-NUMAPOLICY/testsuite.sh | 18 +++++++++++ 10 files changed, 166 insertions(+), 13 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index e2a5ede968..696438c4ef 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -706,10 +706,11 @@ CapabilityBoundingSet=~CAP_B CAP_C CPUAffinity= Controls the CPU affinity of the executed processes. Takes a list of CPU indices or ranges - separated by either whitespace or commas. CPU ranges are specified by the lower and upper CPU indices separated - by a dash. This option may be specified more than once, in which case the specified CPU affinity masks are - merged. If the empty string is assigned, the mask is reset, all assignments prior to this will have no - effect. See + separated by either whitespace or commas. Alternatively, takes a special "numa" value in which case systemd + automatically derives allowed CPU range based on the value of NUMAMask= option. CPU ranges + are specified by the lower and upper CPU indices separated by a dash. This option may be specified more than + once, in which case the specified CPU affinity masks are merged. If the empty string is assigned, the mask + is reset, all assignments prior to this will have no effect. See sched_setaffinity2 for details. diff --git a/src/basic/cpu-set-util.c b/src/basic/cpu-set-util.c index 51752ad1a6..1922c95864 100644 --- a/src/basic/cpu-set-util.c +++ b/src/basic/cpu-set-util.c @@ -12,12 +12,14 @@ #include "cpu-set-util.h" #include "dirent-util.h" #include "extract-word.h" +#include "fileio.h" #include "fd-util.h" #include "log.h" #include "macro.h" #include "missing.h" #include "parse-util.h" #include "stat-util.h" +#include "stdio-util.h" #include "string-util.h" #include "string-table.h" #include "strv.h" @@ -179,7 +181,7 @@ int cpu_set_add_all(CPUSet *a, const CPUSet *b) { return r; } - return 0; + return 1; } int parse_cpu_set_full( @@ -264,7 +266,7 @@ int parse_cpu_set_extend( if (!old->set) { *old = cpuset; cpuset = (CPUSet) {}; - return 0; + return 1; } return cpu_set_add_all(old, &cpuset); @@ -417,6 +419,43 @@ int apply_numa_policy(const NUMAPolicy *policy) { return 0; } +int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret) { + int r; + size_t i; + _cleanup_(cpu_set_reset) CPUSet s = {}; + + assert(policy); + assert(ret); + + for (i = 0; i < policy->nodes.allocated * 8; i++) { + _cleanup_free_ char *l = NULL; + char p[STRLEN("/sys/devices/system/node/node//cpulist") + DECIMAL_STR_MAX(size_t) + 1]; + _cleanup_(cpu_set_reset) CPUSet part = {}; + + if (!CPU_ISSET_S(i, policy->nodes.allocated, policy->nodes.set)) + continue; + + xsprintf(p, "/sys/devices/system/node/node%zu/cpulist", i); + + r = read_one_line_file(p, &l); + if (r < 0) + return r; + + r = parse_cpu_set(l, &part); + if (r < 0) + return r; + + r = cpu_set_add_all(&s, &part); + if (r < 0) + return r; + } + + *ret = s; + s = (CPUSet) {}; + + return 0; +} + static const char* const mpol_table[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "preferred", diff --git a/src/basic/cpu-set-util.h b/src/basic/cpu-set-util.h index 8519a9b6c8..795be807af 100644 --- a/src/basic/cpu-set-util.h +++ b/src/basic/cpu-set-util.h @@ -78,6 +78,7 @@ static inline void numa_policy_reset(NUMAPolicy *p) { } int apply_numa_policy(const NUMAPolicy *policy); +int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret); const char* mpol_to_string(int i) _const_; int mpol_from_string(const char *s) _pure_; diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index d5acca384f..0fe4c14e48 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -58,6 +58,8 @@ static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext, static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL); static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI); static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC); +static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa); + static int property_get_environment_files( sd_bus *bus, @@ -215,6 +217,7 @@ static int property_get_cpu_affinity( sd_bus_error *error) { ExecContext *c = userdata; + _cleanup_(cpu_set_reset) CPUSet s = {}; _cleanup_free_ uint8_t *array = NULL; size_t allocated; @@ -222,7 +225,16 @@ static int property_get_cpu_affinity( assert(reply); assert(c); - (void) cpu_set_to_dbus(&c->cpu_set, &array, &allocated); + if (c->cpu_affinity_from_numa) { + int r; + + r = numa_to_cpu_set(&c->numa_policy, &s); + if (r < 0) + return r; + } + + (void) cpu_set_to_dbus(c->cpu_affinity_from_numa ? &s : &c->cpu_set, &array, &allocated); + return sd_bus_message_append_array(reply, 'y', array, allocated); } @@ -743,6 +755,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("CPUAffinityFromNUMA", "b", property_get_cpu_affinity_from_numa, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("NUMAPolicy", "i", property_get_numa_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("NUMAMask", "ay", property_get_numa_mask, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), @@ -1639,6 +1652,20 @@ int bus_exec_context_set_transient_property( return 1; + } else if (streq(name, "CPUAffinityFromNUMA")) { + int q; + + r = sd_bus_message_read_basic(message, 'b', &q); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->cpu_affinity_from_numa = q; + unit_write_settingf(u, flags, name, "%s=%s", "CPUAffinity", "numa"); + } + + return 1; + } else if (streq(name, "NUMAPolicy")) { int32_t type; @@ -1653,6 +1680,7 @@ int bus_exec_context_set_transient_property( c->numa_policy.type = type; return 1; + } else if (streq(name, "IOSchedulingClass")) { int32_t q; diff --git a/src/core/execute.c b/src/core/execute.c index 3c54ac1110..d528d08830 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2750,6 +2750,33 @@ static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p static char *exec_command_line(char **argv); +static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) { + _cleanup_(cpu_set_reset) CPUSet s = {}; + int r; + + assert(c); + assert(ret); + + if (!c->numa_policy.nodes.set) { + log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring"); + return 0; + } + + r = numa_to_cpu_set(&c->numa_policy, &s); + if (r < 0) + return r; + + cpu_set_reset(ret); + + return cpu_set_add_all(ret, &s); +} + +bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) { + assert(c); + + return c->cpu_affinity_from_numa; +} + static int exec_child( Unit *unit, const ExecCommand *command, @@ -3012,11 +3039,26 @@ static int exec_child( } } - if (context->cpu_set.set) - if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) { + if (context->cpu_affinity_from_numa || context->cpu_set.set) { + _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {}; + const CPUSet *cpu_set; + + if (context->cpu_affinity_from_numa) { + r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set); + if (r < 0) { + *exit_status = EXIT_CPUAFFINITY; + return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m"); + } + + cpu_set = &converted_cpu_set; + } else + cpu_set = &context->cpu_set; + + if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) { *exit_status = EXIT_CPUAFFINITY; return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m"); } + } if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) { r = apply_numa_policy(&context->numa_policy); diff --git a/src/core/execute.h b/src/core/execute.h index 86c1cee84c..62c6229621 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -152,6 +152,7 @@ struct ExecContext { CPUSet cpu_set; NUMAPolicy numa_policy; + bool cpu_affinity_from_numa; ExecInput std_input; ExecOutput std_output; @@ -375,6 +376,8 @@ int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds); void exec_runtime_vacuum(Manager *m); +bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c); + const char* exec_output_to_string(ExecOutput i) _const_; ExecOutput exec_output_from_string(const char *s) _pure_; diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 33fdb82754..740401a582 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -1251,13 +1251,25 @@ int config_parse_exec_cpu_affinity(const char *unit, void *userdata) { ExecContext *c = data; + int r; assert(filename); assert(lvalue); assert(rvalue); assert(data); - return parse_cpu_set_extend(rvalue, &c->cpu_set, true, unit, filename, line, lvalue); + if (streq(rvalue, "numa")) { + c->cpu_affinity_from_numa = true; + cpu_set_reset(&c->cpu_set); + + return 0; + } + + r = parse_cpu_set_extend(rvalue, &c->cpu_set, true, unit, filename, line, lvalue); + if (r >= 0) + c->cpu_affinity_from_numa = false; + + return r; } int config_parse_capability_set( diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 7029aa5615..daa2c2dce5 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -26,6 +26,8 @@ #include "securebits-util.h" #include "signal-util.h" #include "socket-protocol-list.h" +#include "socket-util.h" +#include "stdio-util.h" #include "string-util.h" #include "syslog-util.h" #include "terminal-util.h" @@ -997,6 +999,13 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con _cleanup_free_ uint8_t *array = NULL; size_t allocated; + if (eq && streq(eq, "numa")) { + r = sd_bus_message_append(m, "(sv)", "CPUAffinityFromNUMA", "b", true); + if (r < 0) + return bus_log_create_error(r); + return r; + } + r = parse_cpu_set(eq, &cpuset); if (r < 0) return log_error_errno(r, "Failed to parse %s value: %s", field, eq); diff --git a/src/test/test-cpu-set-util.c b/src/test/test-cpu-set-util.c index 136eaca82d..1b7be5df4e 100644 --- a/src/test/test-cpu-set-util.c +++ b/src/test/test-cpu-set-util.c @@ -218,12 +218,12 @@ static void test_parse_cpu_set_extend(void) { log_info("/* %s */", __func__); - assert_se(parse_cpu_set_extend("1 3", &c, true, NULL, "fake", 1, "CPUAffinity") == 0); + assert_se(parse_cpu_set_extend("1 3", &c, true, NULL, "fake", 1, "CPUAffinity") == 1); assert_se(CPU_COUNT_S(c.allocated, c.set) == 2); assert_se(s1 = cpu_set_to_string(&c)); log_info("cpu_set_to_string: %s", s1); - assert_se(parse_cpu_set_extend("4", &c, true, NULL, "fake", 1, "CPUAffinity") == 0); + assert_se(parse_cpu_set_extend("4", &c, true, NULL, "fake", 1, "CPUAffinity") == 1); assert_se(CPU_COUNT_S(c.allocated, c.set) == 3); assert_se(s2 = cpu_set_to_string(&c)); log_info("cpu_set_to_string: %s", s2); @@ -240,7 +240,7 @@ static void test_cpu_set_to_from_dbus(void) { log_info("/* %s */", __func__); - assert_se(parse_cpu_set_extend("1 3 8 100-200", &c, true, NULL, "fake", 1, "CPUAffinity") == 0); + assert_se(parse_cpu_set_extend("1 3 8 100-200", &c, true, NULL, "fake", 1, "CPUAffinity") == 1); assert_se(s = cpu_set_to_string(&c)); log_info("cpu_set_to_string: %s", s); assert_se(CPU_COUNT_S(c.allocated, c.set) == 104); diff --git a/test/TEST-36-NUMAPOLICY/testsuite.sh b/test/TEST-36-NUMAPOLICY/testsuite.sh index bffac4ffe6..7ccaa5b412 100755 --- a/test/TEST-36-NUMAPOLICY/testsuite.sh +++ b/test/TEST-36-NUMAPOLICY/testsuite.sh @@ -279,6 +279,18 @@ else # Maks must be ignored grep -E "set_mempolicy\((MPOL_LOCAL|0x4 [^,]*), NULL" $straceLog + echo "Unit file CPUAffinity=NUMA support" + writeTestUnitNUMAPolicy "bind" "0" + echo "CPUAffinity=numa" >> $testUnitNUMAConf + systemctl daemon-reload + systemctl start $testUnit + systemctlCheckNUMAProperties $testUnit "bind" "0" + pid=$(systemctl show --value -p MainPID $testUnit) + cpulist=$(cat /sys/devices/system/node/node0/cpulist) + affinity_systemd=$(systemctl show --value -p CPUAffinity $testUnit) + [ $cpulist = $affinity_systemd ] + pid1StopUnit $testUnit + echo "systemd-run NUMAPolicy support" runUnit='numa-systemd-run-test.service' @@ -309,6 +321,12 @@ else systemd-run -p NUMAPolicy=local -p NUMAMask=0 --unit $runUnit sleep 1000 systemctlCheckNUMAProperties $runUnit "local" "" pid1StopUnit $runUnit + + systemd-run -p NUMAPolicy=local -p NUMAMask=0 -p CPUAffinity=numa --unit $runUnit sleep 1000 + systemctlCheckNUMAProperties $runUnit "local" "" + systemctl cat $runUnit | grep -q 'CPUAffinity=numa' + pid1StopUnit $runUnit + fi # Cleanup