84b277
From 310c5654023cd224f4c016872e39335b1b3ac603 Mon Sep 17 00:00:00 2001
cc2231
From: Lennart Poettering <lennart@poettering.net>
cc2231
Date: Thu, 13 Nov 2014 14:34:42 +0100
cc2231
Subject: [PATCH] core: introduce new Delegate=yes/no property controlling
cc2231
 creation of cgroup subhierarchies
cc2231
cc2231
For privileged units this resource control property ensures that the
cc2231
processes have all controllers systemd manages enabled.
cc2231
cc2231
For unprivileged services (those with User= set) this ensures that
cc2231
access rights to the service cgroup are granted to the user in question,
cc2231
to create further subgroups. Note that this only applies to the
cc2231
name=systemd hierarchy though, as access to other controllers is not
cc2231
safe for unprivileged processes.
cc2231
cc2231
Delegate=yes should be set for container scopes where a systemd instance
cc2231
inside the container shall manage the hierarchies below its own cgroup
cc2231
and have access to all controllers.
cc2231
cc2231
Delegate=yes should also be set for user@.service, so that systemd
cc2231
--user can run, controlling its own cgroup tree.
cc2231
cc2231
This commit changes machined, systemd-nspawn@.service and user@.service
cc2231
to set this boolean, in order to ensure that container management will
cc2231
just work, and the user systemd instance can run fine.
cc2231
cc2231
(cherry picked from a931ad47a8623163a29d898224d8a8c1177ffdaf)
cc2231
84b277
Resolves: #1139223
cc2231
---
cc2231
 man/systemd.resource-control.xml      | 14 ++++++++++++
cc2231
 src/core/cgroup.c                     | 19 +++++++++++++++--
cc2231
 src/core/cgroup.h                     |  2 ++
cc2231
 src/core/dbus-cgroup.c                | 40 +++++++++++++++++++++++++++++++++++
cc2231
 src/core/execute.c                    | 23 +++++++++++++++++---
cc2231
 src/core/execute.h                    |  2 ++
cc2231
 src/core/load-fragment-gperf.gperf.m4 |  3 ++-
cc2231
 src/core/mount.c                      |  1 +
cc2231
 src/core/service.c                    |  1 +
cc2231
 src/core/socket.c                     |  1 +
cc2231
 src/core/swap.c                       |  1 +
cc2231
 src/machine/machined-dbus.c           | 10 +++++++++
cc2231
 src/shared/cgroup-util.h              |  3 ++-
cc2231
 units/systemd-nspawn@.service.in      |  1 +
cc2231
 14 files changed, 114 insertions(+), 7 deletions(-)
cc2231
cc2231
diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml
cc2231
index 8688905..3748c0c 100644
cc2231
--- a/man/systemd.resource-control.xml
cc2231
+++ b/man/systemd.resource-control.xml
cc2231
@@ -327,6 +327,20 @@ along with systemd; If not, see <http://www.gnu.org/licenses/>.
cc2231
         </listitem>
cc2231
       </varlistentry>
cc2231
 
cc2231
+      <varlistentry>
cc2231
+        <term><varname>Delegate=</varname></term>
cc2231
+
cc2231
+        <listitem>
cc2231
+          <para>Turns on delegation of further resource control
cc2231
+          partitioning to processes of the unit. For unprivileged
cc2231
+          services (i.e. those using the <varname>User=</varname>
cc2231
+          setting) this allows processes to create a subhierarchy
cc2231
+          beneath its control group path. For privileged services and
cc2231
+          scopes this ensures the processes will have all control
cc2231
+          group controllers enabled.</para>
cc2231
+        </listitem>
cc2231
+      </varlistentry>
cc2231
+
cc2231
     </variablelist>
cc2231
   </refsect1>
cc2231
 
cc2231
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
84b277
index c215a86..443937b 100644
cc2231
--- a/src/core/cgroup.c
cc2231
+++ b/src/core/cgroup.c
cc2231
@@ -94,14 +94,16 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
cc2231
                 "%sCPUShares=%lu\n"
cc2231
                 "%sBlockIOWeight=%lu\n"
cc2231
                 "%sMemoryLimit=%" PRIu64 "\n"
cc2231
-                "%sDevicePolicy=%s\n",
cc2231
+                "%sDevicePolicy=%s\n"
cc2231
+                "%sDelegate=%s\n",
cc2231
                 prefix, yes_no(c->cpu_accounting),
cc2231
                 prefix, yes_no(c->blockio_accounting),
cc2231
                 prefix, yes_no(c->memory_accounting),
cc2231
                 prefix, c->cpu_shares,
cc2231
                 prefix, c->blockio_weight,
cc2231
                 prefix, c->memory_limit,
cc2231
-                prefix, cgroup_device_policy_to_string(c->device_policy));
cc2231
+                prefix, cgroup_device_policy_to_string(c->device_policy),
cc2231
+                prefix, yes_no(c->delegate));
cc2231
 
cc2231
         LIST_FOREACH(device_allow, a, c->device_allow)
cc2231
                 fprintf(f,
cc2231
@@ -342,6 +344,19 @@ static CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
cc2231
         if (!c)
cc2231
                 return 0;
cc2231
 
cc2231
+        /* If delegation is turned on, then turn on all cgroups,
cc2231
+         * unless the process we fork into it is known to drop
cc2231
+         * privileges anyway, and shouldn't get access to the
cc2231
+         * controllers anyway. */
cc2231
+
cc2231
+        if (c->delegate) {
cc2231
+                ExecContext *e;
cc2231
+
cc2231
+                e = unit_get_exec_context(u);
cc2231
+                if (!e || exec_context_maintains_privileges(e))
cc2231
+                        return _CGROUP_CONTROLLER_MASK_ALL;
cc2231
+        }
cc2231
+
cc2231
         return cgroup_context_get_mask(c);
cc2231
 }
cc2231
 
cc2231
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
cc2231
index 0a079e9..d00bcac 100644
cc2231
--- a/src/core/cgroup.h
cc2231
+++ b/src/core/cgroup.h
cc2231
@@ -80,6 +80,8 @@ struct CGroupContext {
cc2231
 
cc2231
         CGroupDevicePolicy device_policy;
cc2231
         LIST_HEAD(CGroupDeviceAllow, device_allow);
cc2231
+
cc2231
+        bool delegate;
cc2231
 };
cc2231
 
cc2231
 #include "unit.h"
cc2231
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
cc2231
index 9ebcad9..a13c869 100644
cc2231
--- a/src/core/dbus-cgroup.c
cc2231
+++ b/src/core/dbus-cgroup.c
cc2231
@@ -124,6 +124,7 @@ static int bus_cgroup_append_device_allow(DBusMessageIter *i, const char *proper
cc2231
 }
cc2231
 
cc2231
 const BusProperty bus_cgroup_context_properties[] = {
cc2231
+        { "Delegate",                bus_property_append_bool,            "b",     offsetof(CGroupContext, delegate)           },
cc2231
         { "CPUAccounting",           bus_property_append_bool,            "b",     offsetof(CGroupContext, cpu_accounting)     },
cc2231
         { "CPUShares",               bus_property_append_ul,              "t",     offsetof(CGroupContext, cpu_shares)         },
cc2231
         { "BlockIOAccounting",       bus_property_append_bool,            "b",     offsetof(CGroupContext, blockio_accounting) },
cc2231
@@ -138,6 +139,38 @@ const BusProperty bus_cgroup_context_properties[] = {
cc2231
         {}
cc2231
 };
cc2231
 
cc2231
+static int bus_cgroup_set_transient_property(
cc2231
+                Unit *u,
cc2231
+                CGroupContext *c,
cc2231
+                const char *name,
cc2231
+                DBusMessageIter *i,
cc2231
+                UnitSetPropertiesMode mode,
cc2231
+                DBusError *error) {
cc2231
+
cc2231
+        assert(u);
cc2231
+        assert(c);
cc2231
+        assert(name);
cc2231
+        assert(i);
cc2231
+
cc2231
+        if (streq(name, "Delegate")) {
cc2231
+
cc2231
+                if (dbus_message_iter_get_arg_type(i) != DBUS_TYPE_BOOLEAN)
cc2231
+                        return -EINVAL;
cc2231
+
cc2231
+                if (mode != UNIT_CHECK) {
cc2231
+                        dbus_bool_t b;
cc2231
+
cc2231
+                        dbus_message_iter_get_basic(i, &b);
cc2231
+                        c->delegate = b;
cc2231
+                        unit_write_drop_in_private(u, mode, name, b ? "Delegate=yes" : "Delegate=no");
cc2231
+                }
cc2231
+
cc2231
+                return 1;
cc2231
+        }
cc2231
+
cc2231
+        return 0;
cc2231
+}
cc2231
+
cc2231
 int bus_cgroup_set_property(
cc2231
                 Unit *u,
cc2231
                 CGroupContext *c,
cc2231
@@ -550,5 +583,12 @@ int bus_cgroup_set_property(
cc2231
                 return 1;
cc2231
         }
cc2231
 
cc2231
+        if (u->transient && u->load_state == UNIT_STUB) {
cc2231
+                int r;
cc2231
+                r = bus_cgroup_set_transient_property(u, c, name, i, mode, error);
cc2231
+                if (r != 0)
cc2231
+                        return r;
cc2231
+        }
cc2231
+
cc2231
         return 0;
cc2231
 }
cc2231
diff --git a/src/core/execute.c b/src/core/execute.c
84b277
index 06713cc..d814c39 100644
cc2231
--- a/src/core/execute.c
cc2231
+++ b/src/core/execute.c
84b277
@@ -1042,6 +1042,7 @@ int exec_spawn(ExecCommand *command,
84b277
                bool selinux_context_net,
cc2231
                CGroupControllerMask cgroup_supported,
cc2231
                const char *cgroup_path,
cc2231
+               bool cgroup_delegate,
cc2231
                const char *unit_id,
cc2231
                int idle_pipe[4],
cc2231
                pid_t *ret) {
84b277
@@ -1306,8 +1307,10 @@ int exec_spawn(ExecCommand *command,
cc2231
                         }
cc2231
                 }
cc2231
 
cc2231
-#ifdef HAVE_PAM
cc2231
-                if (cgroup_path && context->user && context->pam_name) {
cc2231
+                /* If delegation is enabled we'll pass ownership of the cgroup
cc2231
+                 * (but only in systemd's own controller hierarchy!) to the
cc2231
+                 * user of the new process. */
cc2231
+               if (cgroup_path && context->user && cgroup_delegate) {
cc2231
                         err = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, 0644, uid, gid);
cc2231
                         if (err < 0) {
cc2231
                                 r = EXIT_CGROUP;
84b277
@@ -1321,7 +1324,6 @@ int exec_spawn(ExecCommand *command,
cc2231
                                 goto fail_child;
cc2231
                         }
cc2231
                 }
cc2231
-#endif
cc2231
 
cc2231
                 if (apply_permissions) {
cc2231
                         err = enforce_groups(context, username, gid);
84b277
@@ -2116,6 +2118,21 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
84b277
 
cc2231
 }
cc2231
 
cc2231
+bool exec_context_maintains_privileges(ExecContext *c) {
cc2231
+        assert(c);
cc2231
+
cc2231
+        /* Returns true if the process forked off would run under
cc2231
+         * an unchanged UID or as root. */
cc2231
+
cc2231
+        if (!c->user)
cc2231
+                return true;
cc2231
+
cc2231
+        if (streq(c->user, "root") || streq(c->user, "0"))
cc2231
+                return true;
cc2231
+
cc2231
+        return false;
cc2231
+}
cc2231
+
cc2231
 void exec_status_start(ExecStatus *s, pid_t pid) {
cc2231
         assert(s);
cc2231
 
cc2231
diff --git a/src/core/execute.h b/src/core/execute.h
84b277
index b66bd5f..03afdf3 100644
cc2231
--- a/src/core/execute.h
cc2231
+++ b/src/core/execute.h
84b277
@@ -177,6 +177,7 @@ int exec_spawn(ExecCommand *command,
84b277
                bool selinux_context_net,
cc2231
                CGroupControllerMask cgroup_mask,
cc2231
                const char *cgroup_path,
cc2231
+               bool cgroup_delegate,
cc2231
                const char *unit_id,
cc2231
                int pipe_fd[2],
cc2231
                pid_t *ret);
84b277
@@ -203,6 +204,7 @@ void exec_context_tty_reset(const ExecContext *context);
84b277
 int exec_context_load_environment(const ExecContext *c, const char *unit_id, char ***l);
cc2231
 
cc2231
 bool exec_context_may_touch_console(ExecContext *c);
cc2231
+bool exec_context_maintains_privileges(ExecContext *c);
cc2231
 void exec_context_serialize(const ExecContext *c, Unit *u, FILE *f);
cc2231
 
cc2231
 void exec_status_start(ExecStatus *s, pid_t pid);
cc2231
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
84b277
index ee7cd5d..c6eb757 100644
cc2231
--- a/src/core/load-fragment-gperf.gperf.m4
cc2231
+++ b/src/core/load-fragment-gperf.gperf.m4
84b277
@@ -96,7 +96,8 @@ $1.BlockIOAccounting,            config_parse_bool,                  0,
cc2231
 $1.BlockIOWeight,                config_parse_blockio_weight,        0,                             offsetof($1, cgroup_context)
cc2231
 $1.BlockIODeviceWeight,          config_parse_blockio_device_weight, 0,                             offsetof($1, cgroup_context)
cc2231
 $1.BlockIOReadBandwidth,         config_parse_blockio_bandwidth,     0,                             offsetof($1, cgroup_context)
cc2231
-$1.BlockIOWriteBandwidth,        config_parse_blockio_bandwidth,     0,                             offsetof($1, cgroup_context)'
cc2231
+$1.BlockIOWriteBandwidth,        config_parse_blockio_bandwidth,     0,                             offsetof($1, cgroup_context)
cc2231
+$1.Delegate,                     config_parse_bool,                  0,                             offsetof($1, cgroup_context.delegate)'
cc2231
 )m4_dnl
cc2231
 Unit.Description,                config_parse_unit_string_printf,    0,                             offsetof(Unit, description)
cc2231
 Unit.Documentation,              config_parse_documentation,         0,                             offsetof(Unit, documentation)
cc2231
diff --git a/src/core/mount.c b/src/core/mount.c
84b277
index bbceb92..a9cd28b 100644
cc2231
--- a/src/core/mount.c
cc2231
+++ b/src/core/mount.c
84b277
@@ -794,6 +794,7 @@ static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) {
84b277
                        false,
cc2231
                        UNIT(m)->manager->cgroup_supported,
cc2231
                        UNIT(m)->cgroup_path,
cc2231
+                       m->cgroup_context.delegate,
cc2231
                        UNIT(m)->id,
cc2231
                        NULL,
cc2231
                        &pid);
cc2231
diff --git a/src/core/service.c b/src/core/service.c
84b277
index 635a953..5fd69cf 100644
cc2231
--- a/src/core/service.c
cc2231
+++ b/src/core/service.c
84b277
@@ -1870,6 +1870,7 @@ static int service_spawn(
84b277
                        s->socket_fd_selinux_context_net,
cc2231
                        UNIT(s)->manager->cgroup_supported,
cc2231
                        path,
cc2231
+                       s->cgroup_context.delegate,
cc2231
                        UNIT(s)->id,
cc2231
                        s->type == SERVICE_IDLE ? UNIT(s)->manager->idle_pipe : NULL,
cc2231
                        &pid);
cc2231
diff --git a/src/core/socket.c b/src/core/socket.c
84b277
index 1a91700..f7fffbe 100644
cc2231
--- a/src/core/socket.c
cc2231
+++ b/src/core/socket.c
84b277
@@ -1237,6 +1237,7 @@ static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) {
84b277
                        false,
cc2231
                        UNIT(s)->manager->cgroup_supported,
cc2231
                        UNIT(s)->cgroup_path,
cc2231
+                       s->cgroup_context.delegate,
cc2231
                        UNIT(s)->id,
cc2231
                        NULL,
cc2231
                        &pid);
cc2231
diff --git a/src/core/swap.c b/src/core/swap.c
84b277
index 36ef88b..152a080 100644
cc2231
--- a/src/core/swap.c
cc2231
+++ b/src/core/swap.c
84b277
@@ -592,6 +592,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) {
84b277
                        false,
cc2231
                        UNIT(s)->manager->cgroup_supported,
cc2231
                        UNIT(s)->cgroup_path,
cc2231
+                       s->cgroup_context.delegate,
cc2231
                        UNIT(s)->id,
cc2231
                        NULL,
cc2231
                        &pid);
cc2231
diff --git a/src/machine/machined-dbus.c b/src/machine/machined-dbus.c
cc2231
index 22caadf..0cebdc5 100644
cc2231
--- a/src/machine/machined-dbus.c
cc2231
+++ b/src/machine/machined-dbus.c
cc2231
@@ -739,9 +739,11 @@ int manager_start_scope(
cc2231
         DBusMessageIter iter, sub, sub2, sub3, sub4;
cc2231
         const char *timeout_stop_property = "TimeoutStopUSec";
cc2231
         const char *pids_property = "PIDs";
cc2231
+        const char *delegate_property = "Delegate";
cc2231
         uint64_t timeout = 500 * USEC_PER_MSEC;
cc2231
         const char *fail = "fail";
cc2231
         uint32_t u;
cc2231
+        dbus_bool_t b = 1;
cc2231
         int r;
cc2231
 
cc2231
         assert(manager);
cc2231
@@ -814,6 +816,14 @@ int manager_start_scope(
cc2231
             !dbus_message_iter_close_container(&sub, &sub2))
cc2231
                 return log_oom();
cc2231
 
cc2231
+        if (!dbus_message_iter_open_container(&sub, DBUS_TYPE_STRUCT, NULL, &sub2) ||
cc2231
+            !dbus_message_iter_append_basic(&sub2, DBUS_TYPE_STRING, &delegate_property) ||
cc2231
+            !dbus_message_iter_open_container(&sub2, DBUS_TYPE_VARIANT, "b", &sub3) ||
cc2231
+            !dbus_message_iter_append_basic(&sub3, DBUS_TYPE_BOOLEAN, &b) ||
cc2231
+            !dbus_message_iter_close_container(&sub2, &sub3) ||
cc2231
+            !dbus_message_iter_close_container(&sub, &sub2))
cc2231
+                return log_oom();
cc2231
+
cc2231
         if (more_properties) {
cc2231
                 r = copy_many_fields(&sub, more_properties);
cc2231
                 if (r < 0)
cc2231
diff --git a/src/shared/cgroup-util.h b/src/shared/cgroup-util.h
cc2231
index 0963450..0608b9a 100644
cc2231
--- a/src/shared/cgroup-util.h
cc2231
+++ b/src/shared/cgroup-util.h
cc2231
@@ -34,7 +34,8 @@ typedef enum CGroupControllerMask {
cc2231
         CGROUP_CPUACCT = 2,
cc2231
         CGROUP_BLKIO = 4,
cc2231
         CGROUP_MEMORY = 8,
cc2231
-        CGROUP_DEVICE = 16
cc2231
+        CGROUP_DEVICE = 16,
cc2231
+        _CGROUP_CONTROLLER_MASK_ALL = 31
cc2231
 } CGroupControllerMask;
cc2231
 
cc2231
 /*
cc2231
diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in
cc2231
index 8e00736..bdfa89f 100644
cc2231
--- a/units/systemd-nspawn@.service.in
cc2231
+++ b/units/systemd-nspawn@.service.in
cc2231
@@ -12,6 +12,7 @@ Documentation=man:systemd-nspawn(1)
cc2231
 [Service]
cc2231
 ExecStart=@bindir@/systemd-nspawn -bjD /var/lib/container/%i
cc2231
 Type=notify
cc2231
+Delegate=yes
cc2231
 
cc2231
 [Install]
cc2231
 WantedBy=multi-user.target