b9a53a
From a735699a8287c19e043b7d2fe9a387a3938e1e2f Mon Sep 17 00:00:00 2001
b9a53a
From: =?UTF-8?q?Michal=20Sekleta=CC=81r?= <msekleta@redhat.com>
b9a53a
Date: Mon, 18 Nov 2019 12:50:11 +0100
b9a53a
Subject: [PATCH] core: introduce NUMAPolicy and NUMAMask options
b9a53a
b9a53a
Make it possible to set NUMA allocation policy for manager. Manager's
b9a53a
policy is by default inherited by all forked off processes. However, it
b9a53a
is possible to override the policy on per-service basis. Currently we
b9a53a
support these policies: default, preferred, bind, interleave, local.
b9a53a
See man 2 set_mempolicy for details on each policy.
b9a53a
b9a53a
Overall NUMA policy actually consists of two parts. Policy itself and
b9a53a
bitmask representing NUMA nodes where the policy is effective. Node mask can
b9a53a
be specified using related option, NUMAMask. Default mask can be
b9a53a
overwritten on per-service level.
b9a53a
b9a53a
(cherry-picked from commit fe9c54b2188e6cd23262a319f96b13215f2c5e9c)
b9a53a
b9a53a
Resolves: #1734787
b9a53a
---
b9a53a
 man/systemd-system.conf.xml           | 19 ++++++
b9a53a
 man/systemd.exec.xml                  | 28 +++++++++
b9a53a
 meson.build                           |  4 ++
b9a53a
 src/basic/cpu-set-util.c              | 91 +++++++++++++++++++++++++++
b9a53a
 src/basic/cpu-set-util.h              | 28 +++++++++
b9a53a
 src/basic/exit-status.c               |  3 +
b9a53a
 src/basic/exit-status.h               |  1 +
b9a53a
 src/basic/missing_syscall.h           | 43 +++++++++++++
b9a53a
 src/core/dbus-execute.c               | 65 ++++++++++++++++++-
b9a53a
 src/core/execute.c                    | 20 ++++++
b9a53a
 src/core/execute.h                    |  1 +
b9a53a
 src/core/load-fragment-gperf.gperf.m4 |  2 +
b9a53a
 src/core/load-fragment.c              | 28 +++++++++
b9a53a
 src/core/load-fragment.h              |  2 +
b9a53a
 src/core/main.c                       | 27 ++++++++
b9a53a
 src/core/system.conf.in               |  2 +
b9a53a
 src/shared/bus-unit-util.c            | 28 +++++++++
b9a53a
 src/systemctl/systemctl.c             | 18 +++++-
b9a53a
 18 files changed, 405 insertions(+), 5 deletions(-)
b9a53a
b9a53a
diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml
b9a53a
index ab23779ec0..988c4e7665 100644
b9a53a
--- a/man/systemd-system.conf.xml
b9a53a
+++ b/man/systemd-system.conf.xml
b9a53a
@@ -132,6 +132,25 @@
b9a53a
         anymore.</para></listitem>
b9a53a
       </varlistentry>
b9a53a
 
b9a53a
+      <varlistentry>
b9a53a
+        <term><varname>NUMAPolicy=</varname></term>
b9a53a
+
b9a53a
+        <listitem><para>Configures the NUMA memory policy for the service manager and the default NUMA memory policy
b9a53a
+        for all forked off processes. Individual services may override the default policy with the
b9a53a
+        <varname>NUMAPolicy=</varname> setting in unit files, see
b9a53a
+        <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para></listitem>
b9a53a
+      </varlistentry>
b9a53a
+
b9a53a
+      <varlistentry>
b9a53a
+        <term><varname>NUMAMask=</varname></term>
b9a53a
+
b9a53a
+        <listitem><para>Configures the NUMA node mask that will be associated with the selected NUMA policy. Note that
b9a53a
+        <option>default</option> and <option>local</option> NUMA policies don't require an explicit NUMA node mask and
b9a53a
+        value of the option can be empty. Similarly to <varname>NUMAPolicy=</varname>, the value can be overridden
b9a53a
+        by individual services in unit files, see
b9a53a
+        <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para></listitem>
b9a53a
+      </varlistentry>
b9a53a
+
b9a53a
       <varlistentry>
b9a53a
         <term><varname>RuntimeWatchdogSec=</varname></term>
b9a53a
         <term><varname>ShutdownWatchdogSec=</varname></term>
b9a53a
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
b9a53a
index 342b8385bc..87fb8b34f4 100644
b9a53a
--- a/man/systemd.exec.xml
b9a53a
+++ b/man/systemd.exec.xml
b9a53a
@@ -710,6 +710,28 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
b9a53a
         details.</para></listitem>
b9a53a
       </varlistentry>
b9a53a
 
b9a53a
+      <varlistentry>
b9a53a
+        <term><varname>NUMAPolicy=</varname></term>
b9a53a
+
b9a53a
+        <listitem><para>Controls the NUMA memory policy of the executed processes. Takes a policy type, one of:
b9a53a
+        <option>default</option>, <option>preferred</option>, <option>bind</option>, <option>interleave</option> and
b9a53a
+        <option>local</option>. A list of NUMA nodes that should be associated with the policy must be specified
b9a53a
+        in <varname>NUMAMask=</varname>. For more details on each policy, please see
b9a53a
+        <citerefentry><refentrytitle>set_mempolicy</refentrytitle><manvolnum>2</manvolnum></citerefentry>. For overall
b9a53a
+        overview of NUMA support in Linux, see
b9a53a
+        <citerefentry><refentrytitle>numa</refentrytitle><manvolnum>7</manvolnum></citerefentry>.
b9a53a
+        </para></listitem>
b9a53a
+      </varlistentry>
b9a53a
+
b9a53a
+      <varlistentry>
b9a53a
+        <term><varname>NUMAMask=</varname></term>
b9a53a
+
b9a53a
+        <listitem><para>Controls the NUMA node list which will be applied alongside the selected NUMA policy.
b9a53a
+        Takes a list of NUMA nodes and has the same syntax as a list of CPUs for <varname>CPUAffinity=</varname>
b9a53a
+        option. Note that the list of NUMA nodes is not required for <option>default</option> and <option>local</option>
b9a53a
+        policies, and for the <option>preferred</option> policy a single NUMA node is expected.</para></listitem>
b9a53a
+      </varlistentry>
b9a53a
+
b9a53a
       <varlistentry>
b9a53a
         <term><varname>IOSchedulingClass=</varname></term>
b9a53a
 
b9a53a
@@ -2709,6 +2731,12 @@ StandardInputData=SWNrIHNpdHplIGRhIHVuJyBlc3NlIEtsb3BzLAp1ZmYgZWVtYWwga2xvcHAncy
b9a53a
             <entry><constant>EXIT_CONFIGURATION_DIRECTORY</constant></entry>
b9a53a
             <entry>Failed to set up unit's configuration directory. See <varname>ConfigurationDirectory=</varname> above.</entry>
b9a53a
           </row>
b9a53a
+          <row>
b9a53a
+            <entry>242</entry>
b9a53a
+            <entry><constant>EXIT_NUMA_POLICY</constant></entry>
b9a53a
+            <entry>Failed to set up unit's NUMA memory policy. See <varname>NUMAPolicy=</varname> and <varname>NUMAMask=</varname> above.</entry>
b9a53a
+          </row>
b9a53a
+
b9a53a
         
b9a53a
       </tgroup>
b9a53a
     
b9a53a
diff --git a/meson.build b/meson.build
b9a53a
index 613a5133b6..fe82ca4ac2 100644
b9a53a
--- a/meson.build
b9a53a
+++ b/meson.build
b9a53a
@@ -501,6 +501,10 @@ foreach ident : [
b9a53a
                                  #include <unistd.h>'''],
b9a53a
         ['explicit_bzero' ,   '''#include <string.h>'''],
b9a53a
         ['reallocarray',      '''#include <malloc.h>'''],
b9a53a
+        ['set_mempolicy',     '''#include <stdlib.h>
b9a53a
+                                 #include <unistd.h>'''],
b9a53a
+        ['get_mempolicy',     '''#include <stdlib.h>
b9a53a
+                                 #include <unistd.h>'''],
b9a53a
 ]
b9a53a
 
b9a53a
         have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE')
b9a53a
diff --git a/src/basic/cpu-set-util.c b/src/basic/cpu-set-util.c
b9a53a
index 103b9703b3..36cb017ae7 100644
b9a53a
--- a/src/basic/cpu-set-util.c
b9a53a
+++ b/src/basic/cpu-set-util.c
b9a53a
@@ -10,11 +10,17 @@
b9a53a
 
b9a53a
 #include "alloc-util.h"
b9a53a
 #include "cpu-set-util.h"
b9a53a
+#include "dirent-util.h"
b9a53a
 #include "extract-word.h"
b9a53a
+#include "fd-util.h"
b9a53a
 #include "log.h"
b9a53a
 #include "macro.h"
b9a53a
+#include "missing.h"
b9a53a
 #include "parse-util.h"
b9a53a
+#include "stat-util.h"
b9a53a
 #include "string-util.h"
b9a53a
+#include "string-table.h"
b9a53a
+#include "strv.h"
b9a53a
 #include "util.h"
b9a53a
 
b9a53a
 char* cpu_set_to_string(const CPUSet *a) {
b9a53a
@@ -290,3 +296,88 @@ int cpu_set_from_dbus(const uint8_t *bits, size_t size, CPUSet *set) {
b9a53a
         s = (CPUSet) {};
b9a53a
         return 0;
b9a53a
 }
b9a53a
+
b9a53a
+bool numa_policy_is_valid(const NUMAPolicy *policy) {
b9a53a
+        assert(policy);
b9a53a
+
b9a53a
+        if (!mpol_is_valid(numa_policy_get_type(policy)))
b9a53a
+                return false;
b9a53a
+
b9a53a
+        if (!policy->nodes.set &&
b9a53a
+            !IN_SET(numa_policy_get_type(policy), MPOL_DEFAULT, MPOL_LOCAL, MPOL_PREFERRED))
b9a53a
+                return false;
b9a53a
+
b9a53a
+        if (policy->nodes.set &&
b9a53a
+            numa_policy_get_type(policy) == MPOL_PREFERRED &&
b9a53a
+            CPU_COUNT_S(policy->nodes.allocated, policy->nodes.set) != 1)
b9a53a
+                return false;
b9a53a
+
b9a53a
+        return true;
b9a53a
+}
b9a53a
+
b9a53a
+static int numa_policy_to_mempolicy(const NUMAPolicy *policy, unsigned long *ret_maxnode, unsigned long **ret_nodes) {
b9a53a
+        unsigned node, bits = 0, ulong_bits;
b9a53a
+        _cleanup_free_ unsigned long *out = NULL;
b9a53a
+
b9a53a
+        assert(policy);
b9a53a
+        assert(ret_maxnode);
b9a53a
+        assert(ret_nodes);
b9a53a
+
b9a53a
+        if (IN_SET(numa_policy_get_type(policy), MPOL_DEFAULT, MPOL_LOCAL) ||
b9a53a
+            (numa_policy_get_type(policy) == MPOL_PREFERRED && !policy->nodes.set)) {
b9a53a
+                *ret_nodes = NULL;
b9a53a
+                *ret_maxnode = 0;
b9a53a
+                return 0;
b9a53a
+        }
b9a53a
+
b9a53a
+        bits = policy->nodes.allocated * 8;
b9a53a
+        ulong_bits = sizeof(unsigned long) * 8;
b9a53a
+
b9a53a
+        out = new0(unsigned long, DIV_ROUND_UP(policy->nodes.allocated, sizeof(unsigned long)));
b9a53a
+        if (!out)
b9a53a
+                return -ENOMEM;
b9a53a
+
b9a53a
+        /* We don't make any assumptions about internal type libc is using to store NUMA node mask.
b9a53a
+           Hence we need to convert the node mask to the representation expected by set_mempolicy() */
b9a53a
+        for (node = 0; node < bits; node++)
b9a53a
+                if (CPU_ISSET_S(node, policy->nodes.allocated, policy->nodes.set))
b9a53a
+                        out[node / ulong_bits] |= 1ul << (node % ulong_bits);
b9a53a
+
b9a53a
+        *ret_nodes = TAKE_PTR(out);
b9a53a
+        *ret_maxnode = bits + 1;
b9a53a
+        return 0;
b9a53a
+}
b9a53a
+
b9a53a
+int apply_numa_policy(const NUMAPolicy *policy) {
b9a53a
+        int r;
b9a53a
+        _cleanup_free_ unsigned long *nodes = NULL;
b9a53a
+        unsigned long maxnode;
b9a53a
+
b9a53a
+        assert(policy);
b9a53a
+
b9a53a
+        if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
b9a53a
+                return -EOPNOTSUPP;
b9a53a
+
b9a53a
+        if (!numa_policy_is_valid(policy))
b9a53a
+                return -EINVAL;
b9a53a
+
b9a53a
+        r = numa_policy_to_mempolicy(policy, &maxnode, &nodes);
b9a53a
+        if (r < 0)
b9a53a
+                return r;
b9a53a
+
b9a53a
+        r = set_mempolicy(numa_policy_get_type(policy), nodes, maxnode);
b9a53a
+        if (r < 0)
b9a53a
+                return -errno;
b9a53a
+
b9a53a
+        return 0;
b9a53a
+}
b9a53a
+
b9a53a
+static const char* const mpol_table[] = {
b9a53a
+        [MPOL_DEFAULT]    = "default",
b9a53a
+        [MPOL_PREFERRED]  = "preferred",
b9a53a
+        [MPOL_BIND]       = "bind",
b9a53a
+        [MPOL_INTERLEAVE] = "interleave",
b9a53a
+        [MPOL_LOCAL]      = "local",
b9a53a
+};
b9a53a
+
b9a53a
+DEFINE_STRING_TABLE_LOOKUP(mpol, int);
b9a53a
diff --git a/src/basic/cpu-set-util.h b/src/basic/cpu-set-util.h
b9a53a
index ec640b2ec9..295028cb54 100644
b9a53a
--- a/src/basic/cpu-set-util.h
b9a53a
+++ b/src/basic/cpu-set-util.h
b9a53a
@@ -8,6 +8,7 @@
b9a53a
 #include <sched.h>
b9a53a
 
b9a53a
 #include "macro.h"
b9a53a
+#include "missing.h"
b9a53a
 
b9a53a
 /* This wraps the libc interface with a variable to keep the allocated size. */
b9a53a
 typedef struct CPUSet {
b9a53a
@@ -52,3 +53,30 @@ int cpu_set_to_dbus(const CPUSet *set, uint8_t **ret, size_t *allocated);
b9a53a
 int cpu_set_from_dbus(const uint8_t *bits, size_t size, CPUSet *set);
b9a53a
 
b9a53a
 int cpus_in_affinity_mask(void);
b9a53a
+
b9a53a
+static inline bool mpol_is_valid(int t) {
b9a53a
+        return t >= MPOL_DEFAULT && t <= MPOL_LOCAL;
b9a53a
+}
b9a53a
+
b9a53a
+typedef struct NUMAPolicy {
b9a53a
+        /* Always use numa_policy_get_type() to read the value */
b9a53a
+        int type;
b9a53a
+        CPUSet nodes;
b9a53a
+} NUMAPolicy;
b9a53a
+
b9a53a
+bool numa_policy_is_valid(const NUMAPolicy *p);
b9a53a
+
b9a53a
+static inline int numa_policy_get_type(const NUMAPolicy *p) {
b9a53a
+        return p->type < 0 ? (p->nodes.set ? MPOL_PREFERRED : -1) : p->type;
b9a53a
+}
b9a53a
+
b9a53a
+static inline void numa_policy_reset(NUMAPolicy *p) {
b9a53a
+        assert(p);
b9a53a
+        cpu_set_reset(&p->nodes);
b9a53a
+        p->type = -1;
b9a53a
+}
b9a53a
+
b9a53a
+int apply_numa_policy(const NUMAPolicy *policy);
b9a53a
+
b9a53a
+const char* mpol_to_string(int i) _const_;
b9a53a
+int mpol_from_string(const char *s) _pure_;
b9a53a
diff --git a/src/basic/exit-status.c b/src/basic/exit-status.c
b9a53a
index 21af8c4c71..0a7a53b73d 100644
b9a53a
--- a/src/basic/exit-status.c
b9a53a
+++ b/src/basic/exit-status.c
b9a53a
@@ -155,6 +155,9 @@ const char* exit_status_to_string(int status, ExitStatusLevel level) {
b9a53a
 
b9a53a
                 case EXIT_CONFIGURATION_DIRECTORY:
b9a53a
                         return "CONFIGURATION_DIRECTORY";
b9a53a
+
b9a53a
+                case EXIT_NUMA_POLICY:
b9a53a
+                        return "NUMA_POLICY";
b9a53a
                 }
b9a53a
         }
b9a53a
 
b9a53a
diff --git a/src/basic/exit-status.h b/src/basic/exit-status.h
b9a53a
index c41e8b82c3..dc284aacb1 100644
b9a53a
--- a/src/basic/exit-status.h
b9a53a
+++ b/src/basic/exit-status.h
b9a53a
@@ -69,6 +69,7 @@ enum {
b9a53a
         EXIT_CACHE_DIRECTORY,
b9a53a
         EXIT_LOGS_DIRECTORY, /* 240 */
b9a53a
         EXIT_CONFIGURATION_DIRECTORY,
b9a53a
+        EXIT_NUMA_POLICY,
b9a53a
 };
b9a53a
 
b9a53a
 typedef enum ExitStatusLevel {
b9a53a
diff --git a/src/basic/missing_syscall.h b/src/basic/missing_syscall.h
b9a53a
index 93c60458bf..014dd2b326 100644
b9a53a
--- a/src/basic/missing_syscall.h
b9a53a
+++ b/src/basic/missing_syscall.h
b9a53a
@@ -428,3 +428,46 @@ static inline ssize_t missing_statx(int dfd, const char *filename, unsigned flag
b9a53a
 
b9a53a
 #  define statx missing_statx
b9a53a
 #endif
b9a53a
+
b9a53a
+#if !HAVE_SET_MEMPOLICY
b9a53a
+
b9a53a
+enum {
b9a53a
+        MPOL_DEFAULT,
b9a53a
+        MPOL_PREFERRED,
b9a53a
+        MPOL_BIND,
b9a53a
+        MPOL_INTERLEAVE,
b9a53a
+        MPOL_LOCAL,
b9a53a
+};
b9a53a
+
b9a53a
+static inline long missing_set_mempolicy(int mode, const unsigned long *nodemask,
b9a53a
+                           unsigned long maxnode) {
b9a53a
+        long i;
b9a53a
+#  ifdef __NR_set_mempolicy
b9a53a
+        i = syscall(__NR_set_mempolicy, mode, nodemask, maxnode);
b9a53a
+#  else
b9a53a
+        errno = ENOSYS;
b9a53a
+        i = -1;
b9a53a
+#  endif
b9a53a
+        return i;
b9a53a
+}
b9a53a
+
b9a53a
+#  define set_mempolicy missing_set_mempolicy
b9a53a
+#endif
b9a53a
+
b9a53a
+
b9a53a
+#if !HAVE_GET_MEMPOLICY
b9a53a
+static inline long missing_get_mempolicy(int *mode, unsigned long *nodemask,
b9a53a
+                           unsigned long maxnode, void *addr,
b9a53a
+                           unsigned long flags) {
b9a53a
+        long i;
b9a53a
+#  ifdef __NR_get_mempolicy
b9a53a
+        i = syscall(__NR_get_mempolicy, mode, nodemask, maxnode, addr, flags);
b9a53a
+#  else
b9a53a
+        errno = ENOSYS;
b9a53a
+        i = -1;
b9a53a
+#  endif
b9a53a
+        return i;
b9a53a
+}
b9a53a
+
b9a53a
+#define get_mempolicy missing_get_mempolicy
b9a53a
+#endif
b9a53a
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
b9a53a
index 50ea71a281..198f149210 100644
b9a53a
--- a/src/core/dbus-execute.c
b9a53a
+++ b/src/core/dbus-execute.c
b9a53a
@@ -223,6 +223,48 @@ static int property_get_cpu_affinity(
b9a53a
         return sd_bus_message_append_array(reply, 'y', c->cpu_set.set, c->cpu_set.allocated);
b9a53a
 }
b9a53a
 
b9a53a
+static int property_get_numa_mask(
b9a53a
+                sd_bus *bus,
b9a53a
+                const char *path,
b9a53a
+                const char *interface,
b9a53a
+                const char *property,
b9a53a
+                sd_bus_message *reply,
b9a53a
+                void *userdata,
b9a53a
+                sd_bus_error *error) {
b9a53a
+
b9a53a
+        ExecContext *c = userdata;
b9a53a
+        _cleanup_free_ uint8_t *array = NULL;
b9a53a
+        size_t allocated;
b9a53a
+
b9a53a
+        assert(bus);
b9a53a
+        assert(reply);
b9a53a
+        assert(c);
b9a53a
+
b9a53a
+        (void) cpu_set_to_dbus(&c->numa_policy.nodes, &array, &allocated);
b9a53a
+
b9a53a
+        return sd_bus_message_append_array(reply, 'y', array, allocated);
b9a53a
+}
b9a53a
+
b9a53a
+static int property_get_numa_policy(
b9a53a
+                sd_bus *bus,
b9a53a
+                const char *path,
b9a53a
+                const char *interface,
b9a53a
+                const char *property,
b9a53a
+                sd_bus_message *reply,
b9a53a
+                void *userdata,
b9a53a
+                sd_bus_error *error) {
b9a53a
+        ExecContext *c = userdata;
b9a53a
+        int32_t policy;
b9a53a
+
b9a53a
+        assert(bus);
b9a53a
+        assert(reply);
b9a53a
+        assert(c);
b9a53a
+
b9a53a
+        policy = numa_policy_get_type(&c->numa_policy);
b9a53a
+
b9a53a
+        return sd_bus_message_append_basic(reply, 'i', &policy);
b9a53a
+}
b9a53a
+
b9a53a
 static int property_get_timer_slack_nsec(
b9a53a
                 sd_bus *bus,
b9a53a
                 const char *path,
b9a53a
@@ -698,6 +740,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
b9a53a
         SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
b9a53a
         SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST),
b9a53a
         SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST),
b9a53a
+        SD_BUS_PROPERTY("NUMAPolicy", "i", property_get_numa_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
b9a53a
+        SD_BUS_PROPERTY("NUMAMask", "ay", property_get_numa_mask, 0, SD_BUS_VTABLE_PROPERTY_CONST),
b9a53a
         SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
b9a53a
         SD_BUS_PROPERTY("CPUSchedulingResetOnFork", "b", bus_property_get_bool, offsetof(ExecContext, cpu_sched_reset_on_fork), SD_BUS_VTABLE_PROPERTY_CONST),
b9a53a
         SD_BUS_PROPERTY("NonBlocking", "b", bus_property_get_bool, offsetof(ExecContext, non_blocking), SD_BUS_VTABLE_PROPERTY_CONST),
b9a53a
@@ -1550,9 +1594,10 @@ int bus_exec_context_set_transient_property(
b9a53a
                 return 1;
b9a53a
         }
b9a53a
 #endif
b9a53a
-        if (streq(name, "CPUAffinity")) {
b9a53a
+        if (STR_IN_SET(name, "CPUAffinity", "NUMAMask")) {
b9a53a
                 const void *a;
b9a53a
                 size_t n;
b9a53a
+                bool affinity = streq(name, "CPUAffinity");
b9a53a
                 _cleanup_(cpu_set_reset) CPUSet set = {};
b9a53a
 
b9a53a
                 r = sd_bus_message_read_array(message, 'y', &a, &n);
b9a53a
@@ -1565,7 +1610,7 @@ int bus_exec_context_set_transient_property(
b9a53a
 
b9a53a
                 if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
b9a53a
                         if (n == 0) {
b9a53a
-                                cpu_set_reset(&c->cpu_set);
b9a53a
+                                cpu_set_reset(affinity ? &c->cpu_set : &c->numa_policy.nodes);
b9a53a
                                 unit_write_settingf(u, flags, name, "%s=", name);
b9a53a
                         } else {
b9a53a
                                 _cleanup_free_ char *str = NULL;
b9a53a
@@ -1577,7 +1622,7 @@ int bus_exec_context_set_transient_property(
b9a53a
                                 /* We forego any optimizations here, and always create the structure using
b9a53a
                                  * cpu_set_add_all(), because we don't want to care if the existing size we
b9a53a
                                  * got over dbus is appropriate. */
b9a53a
-                                r = cpu_set_add_all(&c->cpu_set, &set);
b9a53a
+                                r = cpu_set_add_all(affinity ? &c->cpu_set : &c->numa_policy.nodes, &set);
b9a53a
                                 if (r < 0)
b9a53a
                                         return r;
b9a53a
 
b9a53a
@@ -1587,6 +1632,20 @@ int bus_exec_context_set_transient_property(
b9a53a
 
b9a53a
                 return 1;
b9a53a
 
b9a53a
+        } else if (streq(name, "NUMAPolicy")) {
b9a53a
+                int32_t type;
b9a53a
+
b9a53a
+                r = sd_bus_message_read(message, "i", &type);
b9a53a
+                if (r < 0)
b9a53a
+                        return r;
b9a53a
+
b9a53a
+                if (!mpol_is_valid(type))
b9a53a
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NUMAPolicy value: %i", type);
b9a53a
+
b9a53a
+                if (!UNIT_WRITE_FLAGS_NOOP(flags))
b9a53a
+                        c->numa_policy.type = type;
b9a53a
+
b9a53a
+                return 1;
b9a53a
         } else if (streq(name, "IOSchedulingClass")) {
b9a53a
                 int32_t q;
b9a53a
 
b9a53a
diff --git a/src/core/execute.c b/src/core/execute.c
b9a53a
index bc26aa66e7..56aa89e1ec 100644
b9a53a
--- a/src/core/execute.c
b9a53a
+++ b/src/core/execute.c
b9a53a
@@ -2997,6 +2997,16 @@ static int exec_child(
b9a53a
                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
b9a53a
                 }
b9a53a
 
b9a53a
+        if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
b9a53a
+                r = apply_numa_policy(&context->numa_policy);
b9a53a
+                if (r == -EOPNOTSUPP)
b9a53a
+                        log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b9a53a
+                else if (r < 0) {
b9a53a
+                        *exit_status = EXIT_NUMA_POLICY;
b9a53a
+                        return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
b9a53a
+                }
b9a53a
+        }
b9a53a
+
b9a53a
         if (context->ioprio_set)
b9a53a
                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
b9a53a
                         *exit_status = EXIT_IOPRIO;
b9a53a
@@ -3651,6 +3661,7 @@ void exec_context_init(ExecContext *c) {
b9a53a
         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
b9a53a
         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
b9a53a
         c->log_level_max = -1;
b9a53a
+        numa_policy_reset(&c->numa_policy);
b9a53a
 }
b9a53a
 
b9a53a
 void exec_context_done(ExecContext *c) {
b9a53a
@@ -3695,6 +3706,7 @@ void exec_context_done(ExecContext *c) {
b9a53a
         c->n_temporary_filesystems = 0;
b9a53a
 
b9a53a
         cpu_set_reset(&c->cpu_set);
b9a53a
+        numa_policy_reset(&c->numa_policy);
b9a53a
 
b9a53a
         c->utmp_id = mfree(c->utmp_id);
b9a53a
         c->selinux_context = mfree(c->selinux_context);
b9a53a
@@ -4104,6 +4116,14 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
b9a53a
                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
b9a53a
         }
b9a53a
 
b9a53a
+        if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
b9a53a
+                _cleanup_free_ char *nodes = NULL;
b9a53a
+
b9a53a
+                nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
b9a53a
+                fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
b9a53a
+                fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
b9a53a
+        }
b9a53a
+
b9a53a
         if (c->timer_slack_nsec != NSEC_INFINITY)
b9a53a
                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
b9a53a
 
b9a53a
diff --git a/src/core/execute.h b/src/core/execute.h
b9a53a
index e1e7a494cd..b2eb55f8f5 100644
b9a53a
--- a/src/core/execute.h
b9a53a
+++ b/src/core/execute.h
b9a53a
@@ -150,6 +150,7 @@ struct ExecContext {
b9a53a
         int cpu_sched_priority;
b9a53a
 
b9a53a
         CPUSet cpu_set;
b9a53a
+        NUMAPolicy numa_policy;
b9a53a
 
b9a53a
         ExecInput std_input;
b9a53a
         ExecOutput std_output;
b9a53a
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
b9a53a
index 1066bcfb8f..cdf4d14c4e 100644
b9a53a
--- a/src/core/load-fragment-gperf.gperf.m4
b9a53a
+++ b/src/core/load-fragment-gperf.gperf.m4
b9a53a
@@ -36,6 +36,8 @@ $1.CPUSchedulingPolicy,          config_parse_exec_cpu_sched_policy, 0,
b9a53a
 $1.CPUSchedulingPriority,        config_parse_exec_cpu_sched_prio,   0,                             offsetof($1, exec_context)
b9a53a
 $1.CPUSchedulingResetOnFork,     config_parse_bool,                  0,                             offsetof($1, exec_context.cpu_sched_reset_on_fork)
b9a53a
 $1.CPUAffinity,                  config_parse_exec_cpu_affinity,     0,                             offsetof($1, exec_context)
b9a53a
+$1.NUMAPolicy,                   config_parse_numa_policy,           0,                             offsetof($1, exec_context.numa_policy.type)
b9a53a
+$1.NUMAMask,                     config_parse_numa_mask,             0,                             offsetof($1, exec_context.numa_policy)
b9a53a
 $1.UMask,                        config_parse_mode,                  0,                             offsetof($1, exec_context.umask)
b9a53a
 $1.Environment,                  config_parse_environ,               0,                             offsetof($1, exec_context.environment)
b9a53a
 $1.EnvironmentFile,              config_parse_unit_env_file,         0,                             offsetof($1, exec_context.environment_files)
b9a53a
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
b9a53a
index 34ae834188..35dd595098 100644
b9a53a
--- a/src/core/load-fragment.c
b9a53a
+++ b/src/core/load-fragment.c
b9a53a
@@ -93,6 +93,7 @@ DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint
b9a53a
 DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");
b9a53a
 DEFINE_CONFIG_PARSE_PTR(config_parse_cpu_shares, cg_cpu_shares_parse, uint64_t, "Invalid CPU shares");
b9a53a
 DEFINE_CONFIG_PARSE_PTR(config_parse_exec_mount_flags, mount_propagation_flags_from_string, unsigned long, "Failed to parse mount flag");
b9a53a
+DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_numa_policy, mpol, int, -1, "Invalid NUMA policy type");
b9a53a
 
b9a53a
 int config_parse_unit_deps(
b9a53a
                 const char *unit,
b9a53a
@@ -1159,6 +1160,33 @@ int config_parse_exec_cpu_sched_policy(const char *unit,
b9a53a
         return 0;
b9a53a
 }
b9a53a
 
b9a53a
+int config_parse_numa_mask(const char *unit,
b9a53a
+                           const char *filename,
b9a53a
+                           unsigned line,
b9a53a
+                           const char *section,
b9a53a
+                           unsigned section_line,
b9a53a
+                           const char *lvalue,
b9a53a
+                           int ltype,
b9a53a
+                           const char *rvalue,
b9a53a
+                           void *data,
b9a53a
+                           void *userdata) {
b9a53a
+        int r;
b9a53a
+        NUMAPolicy *p = data;
b9a53a
+
b9a53a
+        assert(filename);
b9a53a
+        assert(lvalue);
b9a53a
+        assert(rvalue);
b9a53a
+        assert(data);
b9a53a
+
b9a53a
+        r = parse_cpu_set_extend(rvalue, &p->nodes, true, unit, filename, line, lvalue);
b9a53a
+        if (r < 0) {
b9a53a
+                log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse NUMA node mask, ignoring: %s", rvalue);
b9a53a
+                return 0;
b9a53a
+        }
b9a53a
+
b9a53a
+        return r;
b9a53a
+}
b9a53a
+
b9a53a
 int config_parse_exec_cpu_sched_prio(const char *unit,
b9a53a
                                      const char *filename,
b9a53a
                                      unsigned line,
b9a53a
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
b9a53a
index dad281ef72..f2ca1b8ee7 100644
b9a53a
--- a/src/core/load-fragment.h
b9a53a
+++ b/src/core/load-fragment.h
b9a53a
@@ -102,6 +102,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
b9a53a
 CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
b9a53a
 CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
b9a53a
 CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode);
b9a53a
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_policy);
b9a53a
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_mask);
b9a53a
 
b9a53a
 /* gperf prototypes */
b9a53a
 const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
b9a53a
diff --git a/src/core/main.c b/src/core/main.c
b9a53a
index c74dc641c1..83f9dd5878 100644
b9a53a
--- a/src/core/main.c
b9a53a
+++ b/src/core/main.c
b9a53a
@@ -134,6 +134,7 @@ static uint64_t arg_default_tasks_max;
b9a53a
 static sd_id128_t arg_machine_id;
b9a53a
 static EmergencyAction arg_cad_burst_action;
b9a53a
 static CPUSet arg_cpu_affinity;
b9a53a
+static NUMAPolicy arg_numa_policy;
b9a53a
 
b9a53a
 static int parse_configuration(void);
b9a53a
 
b9a53a
@@ -660,6 +661,8 @@ static int parse_config_file(void) {
b9a53a
                 { "Manager", "ShowStatus",                config_parse_show_status,      0, &arg_show_status                       },
b9a53a
                 { "Manager", "CPUAffinity",               config_parse_cpu_affinity2,    0, &arg_cpu_affinity                      },
b9a53a
                 { "Manager", "JoinControllers",           config_parse_join_controllers, 0, &arg_join_controllers                  },
b9a53a
+                { "Manager", "NUMAPolicy",                config_parse_numa_policy,      0, &arg_numa_policy.type                  },
b9a53a
+                { "Manager", "NUMAMask",                  config_parse_numa_mask,        0, &arg_numa_policy                       },
b9a53a
                 { "Manager", "RuntimeWatchdogSec",        config_parse_sec,              0, &arg_runtime_watchdog                  },
b9a53a
                 { "Manager", "ShutdownWatchdogSec",       config_parse_sec,              0, &arg_shutdown_watchdog                 },
b9a53a
                 { "Manager", "WatchdogDevice",            config_parse_path,             0, &arg_watchdog_device                   },
b9a53a
@@ -1501,6 +1504,27 @@ static void update_cpu_affinity(bool skip_setup) {
b9a53a
                 log_warning_errno(errno, "Failed to set CPU affinity: %m");
b9a53a
 }
b9a53a
 
b9a53a
+static void update_numa_policy(bool skip_setup) {
b9a53a
+        int r;
b9a53a
+        _cleanup_free_ char *nodes = NULL;
b9a53a
+        const char * policy = NULL;
b9a53a
+
b9a53a
+        if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
b9a53a
+                return;
b9a53a
+
b9a53a
+        if (DEBUG_LOGGING) {
b9a53a
+                policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
b9a53a
+                nodes = cpu_set_to_range_string(&arg_numa_policy.nodes);
b9a53a
+                log_debug("Setting NUMA policy to %s, with nodes %s.", strnull(policy), strnull(nodes));
b9a53a
+        }
b9a53a
+
b9a53a
+        r = apply_numa_policy(&arg_numa_policy);
b9a53a
+        if (r == -EOPNOTSUPP)
b9a53a
+                log_debug_errno(r, "NUMA support not available, ignoring.");
b9a53a
+        else if (r < 0)
b9a53a
+                log_warning_errno(r, "Failed to set NUMA memory policy: %m");
b9a53a
+}
b9a53a
+
b9a53a
 static void do_reexecute(
b9a53a
                 int argc,
b9a53a
                 char *argv[],
b9a53a
@@ -1672,6 +1696,7 @@ static int invoke_main_loop(
b9a53a
                         set_manager_defaults(m);
b9a53a
 
b9a53a
                         update_cpu_affinity(false);
b9a53a
+                        update_numa_policy(false);
b9a53a
 
b9a53a
                         if (saved_log_level >= 0)
b9a53a
                                 manager_override_log_level(m, saved_log_level);
b9a53a
@@ -1832,6 +1857,7 @@ static int initialize_runtime(
b9a53a
                 return 0;
b9a53a
 
b9a53a
         update_cpu_affinity(skip_setup);
b9a53a
+        update_numa_policy(skip_setup);
b9a53a
 
b9a53a
         if (arg_system) {
b9a53a
                 /* Make sure we leave a core dump without panicing the kernel. */
b9a53a
@@ -2011,6 +2037,7 @@ static void reset_arguments(void) {
b9a53a
         arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
b9a53a
 
b9a53a
         cpu_set_reset(&arg_cpu_affinity);
b9a53a
+        numa_policy_reset(&arg_numa_policy);
b9a53a
 }
b9a53a
 
b9a53a
 static int parse_configuration(void) {
b9a53a
diff --git a/src/core/system.conf.in b/src/core/system.conf.in
b9a53a
index 653ec6b8c9..0d93fbf147 100644
b9a53a
--- a/src/core/system.conf.in
b9a53a
+++ b/src/core/system.conf.in
b9a53a
@@ -24,6 +24,8 @@
b9a53a
 #CtrlAltDelBurstAction=reboot-force
b9a53a
 #CPUAffinity=1 2
b9a53a
 #JoinControllers=cpu,cpuacct net_cls,net_prio
b9a53a
+#NUMAPolicy=default
b9a53a
+#NUMAMask=
b9a53a
 #RuntimeWatchdogSec=0
b9a53a
 #ShutdownWatchdogSec=10min
b9a53a
 #CapabilityBoundingSet=
b9a53a
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
b9a53a
index ec8732c226..055edd6e22 100644
b9a53a
--- a/src/shared/bus-unit-util.c
b9a53a
+++ b/src/shared/bus-unit-util.c
b9a53a
@@ -947,6 +947,34 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
b9a53a
                 return bus_append_byte_array(m, field, array, allocated);
b9a53a
         }
b9a53a
 
b9a53a
+        if (streq(field, "NUMAPolicy")) {
b9a53a
+                r = mpol_from_string(eq);
b9a53a
+                if (r < 0)
b9a53a
+                        return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
b9a53a
+
b9a53a
+                r = sd_bus_message_append(m, "(sv)", field, "i", (int32_t) r);
b9a53a
+                if (r < 0)
b9a53a
+                        return bus_log_create_error(r);
b9a53a
+
b9a53a
+                return 1;
b9a53a
+        }
b9a53a
+
b9a53a
+        if (streq(field, "NUMAMask")) {
b9a53a
+                _cleanup_(cpu_set_reset) CPUSet nodes = {};
b9a53a
+                _cleanup_free_ uint8_t *array = NULL;
b9a53a
+                size_t allocated;
b9a53a
+
b9a53a
+                r = parse_cpu_set(eq, &nodes);
b9a53a
+                if (r < 0)
b9a53a
+                        return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
b9a53a
+
b9a53a
+                r = cpu_set_to_dbus(&nodes, &array, &allocated);
b9a53a
+                if (r < 0)
b9a53a
+                        return log_error_errno(r, "Failed to serialize NUMAMask: %m");
b9a53a
+
b9a53a
+                return bus_append_byte_array(m, field, array, allocated);
b9a53a
+        }
b9a53a
+
b9a53a
         if (STR_IN_SET(field, "RestrictAddressFamilies", "SystemCallFilter")) {
b9a53a
                 int whitelist = 1;
b9a53a
                 const char *p = eq;
b9a53a
diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c
b9a53a
index 0154b300a3..7274921e6d 100644
b9a53a
--- a/src/systemctl/systemctl.c
b9a53a
+++ b/src/systemctl/systemctl.c
b9a53a
@@ -4573,6 +4573,20 @@ static int print_property(const char *name, sd_bus_message *m, bool value, bool
b9a53a
 
b9a53a
         switch (bus_type) {
b9a53a
 
b9a53a
+        case SD_BUS_TYPE_INT32:
b9a53a
+                if (streq(name, "NUMAPolicy")) {
b9a53a
+                        int32_t i;
b9a53a
+
b9a53a
+                        r = sd_bus_message_read_basic(m, bus_type, &i);
b9a53a
+                        if (r < 0)
b9a53a
+                                return r;
b9a53a
+
b9a53a
+                        print_prop(name, "%s", strna(mpol_to_string(i)));
b9a53a
+
b9a53a
+                        return 1;
b9a53a
+                }
b9a53a
+                break;
b9a53a
+
b9a53a
         case SD_BUS_TYPE_STRUCT:
b9a53a
 
b9a53a
                 if (contents[0] == SD_BUS_TYPE_UINT32 && streq(name, "Job")) {
b9a53a
@@ -4878,7 +4892,7 @@ static int print_property(const char *name, sd_bus_message *m, bool value, bool
b9a53a
                         print_prop(name, "%s", h);
b9a53a
 
b9a53a
                         return 1;
b9a53a
-                } else if (contents[0] == SD_BUS_TYPE_BYTE && streq(name, "CPUAffinity")) {
b9a53a
+                } else if (contents[0] == SD_BUS_TYPE_BYTE && STR_IN_SET(name, "CPUAffinity", "NUMAMask")) {
b9a53a
                         _cleanup_free_ char *affinity = NULL;
b9a53a
                         _cleanup_(cpu_set_reset) CPUSet set = {};
b9a53a
                         const void *a;
b9a53a
@@ -4890,7 +4904,7 @@ static int print_property(const char *name, sd_bus_message *m, bool value, bool
b9a53a
 
b9a53a
                         r = cpu_set_from_dbus(a, n, &set);
b9a53a
                         if (r < 0)
b9a53a
-                                return log_error_errno(r, "Failed to deserialize CPUAffinity: %m");
b9a53a
+                                return log_error_errno(r, "Failed to deserialize %s: %m", name);
b9a53a
 
b9a53a
                         affinity = cpu_set_to_range_string(&set);
b9a53a
                         if (!affinity)