From 80753fa3c0622fd8117c64e1ed2d7841edee3b00 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 13 Nov 2014 14:34:42 +0100 Subject: [PATCH] core: introduce new Delegate=yes/no property controlling creation of cgroup subhierarchies For priviliged units this resource control property ensures that the processes have all controllers systemd manages enabled. For unpriviliged services (those with User= set) this ensures that access rights to the service cgroup is granted to the user in question, to create further subgroups. Note that this only applies to the name=systemd hierarchy though, as access to other controllers is not safe for unpriviliged processes. Delegate=yes should be set for container scopes where a systemd instance inside the container shall manage the hierarchies below its own cgroup and have access to all controllers. Delegate=yes should also be set for user@.service, so that systemd --user can run, controlling its own cgroup tree. This commit changes machined, systemd-nspawn@.service and user@.service to set this boolean, in order to ensure that container management will just work, and the user systemd instance can run fine. (cherry picked from a931ad47a8623163a29d898224d8a8c1177ffdaf) Resolves: #1179715 --- man/systemd.resource-control.xml | 14 ++++++++++++ src/core/cgroup.c | 19 +++++++++++++++-- src/core/cgroup.h | 2 ++ src/core/dbus-cgroup.c | 40 +++++++++++++++++++++++++++++++++++ src/core/execute.c | 23 +++++++++++++++++--- src/core/execute.h | 2 ++ src/core/load-fragment-gperf.gperf.m4 | 3 ++- src/core/mount.c | 1 + src/core/service.c | 1 + src/core/socket.c | 1 + src/core/swap.c | 1 + src/machine/machined-dbus.c | 10 +++++++++ src/shared/cgroup-util.h | 3 ++- units/systemd-nspawn@.service.in | 1 + 14 files changed, 114 insertions(+), 7 deletions(-) diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 8688905..3748c0c 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -327,6 +327,20 @@ along with systemd; If not, see . + + Delegate= + + + Turns on delegation of further resource control + partitioning to processes of the unit. For unpriviliged + services (i.e. those using the User= + setting) this allows processes to create a subhierarchy + beneath its control group path. For priviliged services and + scopes this ensures the processes will have all control + group controllers enabled. + + + diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 32e2599..764311f 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -94,14 +94,16 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { "%sCPUShares=%lu\n" "%sBlockIOWeight=%lu\n" "%sMemoryLimit=%" PRIu64 "\n" - "%sDevicePolicy=%s\n", + "%sDevicePolicy=%s\n" + "%sDelegate=%s\n", prefix, yes_no(c->cpu_accounting), prefix, yes_no(c->blockio_accounting), prefix, yes_no(c->memory_accounting), prefix, c->cpu_shares, prefix, c->blockio_weight, prefix, c->memory_limit, - prefix, cgroup_device_policy_to_string(c->device_policy)); + prefix, cgroup_device_policy_to_string(c->device_policy), + prefix, yes_no(c->delegate)); LIST_FOREACH(device_allow, a, c->device_allow) fprintf(f, @@ -342,6 +344,19 @@ static CGroupControllerMask unit_get_cgroup_mask(Unit *u) { if (!c) return 0; + /* If delegation is turned on, then turn on all cgroups, + * unless the process we fork into it is known to drop + * privileges anyway, and shouldn't get access to the + * controllers anyway. */ + + if (c->delegate) { + ExecContext *e; + + e = unit_get_exec_context(u); + if (!e || exec_context_maintains_privileges(e)) + return _CGROUP_CONTROLLER_MASK_ALL; + } + return cgroup_context_get_mask(c); } diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 0a079e9..d00bcac 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -80,6 +80,8 @@ struct CGroupContext { CGroupDevicePolicy device_policy; LIST_HEAD(CGroupDeviceAllow, device_allow); + + bool delegate; }; #include "unit.h" diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 9ebcad9..a13c869 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -124,6 +124,7 @@ static int bus_cgroup_append_device_allow(DBusMessageIter *i, const char *proper } const BusProperty bus_cgroup_context_properties[] = { + { "Delegate", bus_property_append_bool, "b", offsetof(CGroupContext, delegate) }, { "CPUAccounting", bus_property_append_bool, "b", offsetof(CGroupContext, cpu_accounting) }, { "CPUShares", bus_property_append_ul, "t", offsetof(CGroupContext, cpu_shares) }, { "BlockIOAccounting", bus_property_append_bool, "b", offsetof(CGroupContext, blockio_accounting) }, @@ -138,6 +139,38 @@ const BusProperty bus_cgroup_context_properties[] = { {} }; +static int bus_cgroup_set_transient_property( + Unit *u, + CGroupContext *c, + const char *name, + DBusMessageIter *i, + UnitSetPropertiesMode mode, + DBusError *error) { + + assert(u); + assert(c); + assert(name); + assert(i); + + if (streq(name, "Delegate")) { + + if (dbus_message_iter_get_arg_type(i) != DBUS_TYPE_BOOLEAN) + return -EINVAL; + + if (mode != UNIT_CHECK) { + dbus_bool_t b; + + dbus_message_iter_get_basic(i, &b); + c->delegate = b; + unit_write_drop_in_private(u, mode, name, b ? "Delegate=yes" : "Delegate=no"); + } + + return 1; + } + + return 0; +} + int bus_cgroup_set_property( Unit *u, CGroupContext *c, @@ -550,5 +583,12 @@ int bus_cgroup_set_property( return 1; } + if (u->transient && u->load_state == UNIT_STUB) { + int r; + r = bus_cgroup_set_transient_property(u, c, name, i, mode, error); + if (r != 0) + return r; + } + return 0; } diff --git a/src/core/execute.c b/src/core/execute.c index 981b9e4..c3fd6a8 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1035,6 +1035,7 @@ int exec_spawn(ExecCommand *command, bool confirm_spawn, CGroupControllerMask cgroup_supported, const char *cgroup_path, + bool cgroup_delegate, const char *unit_id, int idle_pipe[4], pid_t *ret) { @@ -1299,8 +1300,10 @@ int exec_spawn(ExecCommand *command, } } -#ifdef HAVE_PAM - if (cgroup_path && context->user && context->pam_name) { + /* If delegation is enabled we'll pass ownership of the cgroup + * (but only in systemd's own controller hierarchy!) to the + * user of the new process. */ + if (cgroup_path && context->user && cgroup_delegate) { err = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, 0644, uid, gid); if (err < 0) { r = EXIT_CGROUP; @@ -1314,7 +1317,6 @@ int exec_spawn(ExecCommand *command, goto fail_child; } } -#endif if (apply_permissions) { err = enforce_groups(context, username, gid); @@ -2069,6 +2071,21 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, c->utmp_id); } +bool exec_context_maintains_privileges(ExecContext *c) { + assert(c); + + /* Returns true if the process forked off would run under + * an unchanged UID or as root. */ + + if (!c->user) + return true; + + if (streq(c->user, "root") || streq(c->user, "0")) + return true; + + return false; +} + void exec_status_start(ExecStatus *s, pid_t pid) { assert(s); diff --git a/src/core/execute.h b/src/core/execute.h index c1e9717..eca9d7d 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -173,6 +173,7 @@ int exec_spawn(ExecCommand *command, bool confirm_spawn, CGroupControllerMask cgroup_mask, const char *cgroup_path, + bool cgroup_delegate, const char *unit_id, int pipe_fd[2], pid_t *ret); @@ -199,6 +200,7 @@ void exec_context_tty_reset(const ExecContext *context); int exec_context_load_environment(const ExecContext *c, char ***l); bool exec_context_may_touch_console(ExecContext *c); +bool exec_context_maintains_privileges(ExecContext *c); void exec_context_serialize(const ExecContext *c, Unit *u, FILE *f); void exec_status_start(ExecStatus *s, pid_t pid); diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 0991cb9..3e88c8a 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -95,7 +95,8 @@ $1.BlockIOAccounting, config_parse_bool, 0, $1.BlockIOWeight, config_parse_blockio_weight, 0, offsetof($1, cgroup_context) $1.BlockIODeviceWeight, config_parse_blockio_device_weight, 0, offsetof($1, cgroup_context) $1.BlockIOReadBandwidth, config_parse_blockio_bandwidth, 0, offsetof($1, cgroup_context) -$1.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, offsetof($1, cgroup_context)' +$1.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, offsetof($1, cgroup_context) +$1.Delegate, config_parse_bool, 0, offsetof($1, cgroup_context.delegate)' )m4_dnl Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) Unit.Documentation, config_parse_documentation, 0, offsetof(Unit, documentation) diff --git a/src/core/mount.c b/src/core/mount.c index 3672338..f7c98c4 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -793,6 +793,7 @@ static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { UNIT(m)->manager->confirm_spawn, UNIT(m)->manager->cgroup_supported, UNIT(m)->cgroup_path, + m->cgroup_context.delegate, UNIT(m)->id, NULL, &pid); diff --git a/src/core/service.c b/src/core/service.c index f6fdbbc..da30d77 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1869,6 +1869,7 @@ static int service_spawn( UNIT(s)->manager->confirm_spawn, UNIT(s)->manager->cgroup_supported, path, + s->cgroup_context.delegate, UNIT(s)->id, s->type == SERVICE_IDLE ? UNIT(s)->manager->idle_pipe : NULL, &pid); diff --git a/src/core/socket.c b/src/core/socket.c index 32e0d35..f365125 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -1229,6 +1229,7 @@ static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) { UNIT(s)->manager->confirm_spawn, UNIT(s)->manager->cgroup_supported, UNIT(s)->cgroup_path, + s->cgroup_context.delegate, UNIT(s)->id, NULL, &pid); diff --git a/src/core/swap.c b/src/core/swap.c index 727bb95..597c8ca 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -591,6 +591,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { UNIT(s)->manager->confirm_spawn, UNIT(s)->manager->cgroup_supported, UNIT(s)->cgroup_path, + s->cgroup_context.delegate, UNIT(s)->id, NULL, &pid); diff --git a/src/machine/machined-dbus.c b/src/machine/machined-dbus.c index 22caadf..0cebdc5 100644 --- a/src/machine/machined-dbus.c +++ b/src/machine/machined-dbus.c @@ -739,9 +739,11 @@ int manager_start_scope( DBusMessageIter iter, sub, sub2, sub3, sub4; const char *timeout_stop_property = "TimeoutStopUSec"; const char *pids_property = "PIDs"; + const char *delegate_property = "Delegate"; uint64_t timeout = 500 * USEC_PER_MSEC; const char *fail = "fail"; uint32_t u; + dbus_bool_t b = 1; int r; assert(manager); @@ -814,6 +816,14 @@ int manager_start_scope( !dbus_message_iter_close_container(&sub, &sub2)) return log_oom(); + if (!dbus_message_iter_open_container(&sub, DBUS_TYPE_STRUCT, NULL, &sub2) || + !dbus_message_iter_append_basic(&sub2, DBUS_TYPE_STRING, &delegate_property) || + !dbus_message_iter_open_container(&sub2, DBUS_TYPE_VARIANT, "b", &sub3) || + !dbus_message_iter_append_basic(&sub3, DBUS_TYPE_BOOLEAN, &b) || + !dbus_message_iter_close_container(&sub2, &sub3) || + !dbus_message_iter_close_container(&sub, &sub2)) + return log_oom(); + if (more_properties) { r = copy_many_fields(&sub, more_properties); if (r < 0) diff --git a/src/shared/cgroup-util.h b/src/shared/cgroup-util.h index 0963450..0608b9a 100644 --- a/src/shared/cgroup-util.h +++ b/src/shared/cgroup-util.h @@ -34,7 +34,8 @@ typedef enum CGroupControllerMask { CGROUP_CPUACCT = 2, CGROUP_BLKIO = 4, CGROUP_MEMORY = 8, - CGROUP_DEVICE = 16 + CGROUP_DEVICE = 16, + _CGROUP_CONTROLLER_MASK_ALL = 31 } CGroupControllerMask; /* diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in index 8e00736..bdfa89f 100644 --- a/units/systemd-nspawn@.service.in +++ b/units/systemd-nspawn@.service.in @@ -12,6 +12,7 @@ Documentation=man:systemd-nspawn(1) [Service] ExecStart=@bindir@/systemd-nspawn -bjD /var/lib/container/%i Type=notify +Delegate=yes [Install] WantedBy=multi-user.target