9b3774
From 5b5571de21d1ddf9a00511a6b2f25d630a903f05 Mon Sep 17 00:00:00 2001
9b3774
From: Michal Sekletar <msekleta@redhat.com>
9b3774
Date: Wed, 1 Jun 2022 10:15:06 +0200
9b3774
Subject: [PATCH] scope: allow unprivileged delegation on scopes
9b3774
9b3774
Previously it was possible to set delegate property for scope, but you
9b3774
were not able to allow unprivileged process to manage the scope's cgroup
9b3774
hierarchy. This is useful when launching manager process that  will run
9b3774
unprivileged but is supposed to manage its own (scope) sub-hierarchy.
9b3774
9b3774
Fixes #21683
9b3774
9b3774
(cherry picked from commit 03860190fefce8bbea3a6f0e77919b882ade517c)
9b3774
9b3774
Resolves: #2068575
9b3774
---
9b3774
 src/basic/unit-def.c               |   1 +
9b3774
 src/basic/unit-def.h               |   1 +
9b3774
 src/core/dbus-scope.c              |   6 ++
9b3774
 src/core/scope.c                   | 135 +++++++++++++++++++++++++----
9b3774
 src/core/scope.h                   |   3 +
9b3774
 src/shared/bus-unit-util.c         |   5 ++
9b3774
 test/TEST-19-DELEGATE/testsuite.sh |  13 +++
9b3774
 7 files changed, 145 insertions(+), 19 deletions(-)
9b3774
9b3774
diff --git a/src/basic/unit-def.c b/src/basic/unit-def.c
9b3774
index e79cc73dd3..16c4d38d41 100644
9b3774
--- a/src/basic/unit-def.c
9b3774
+++ b/src/basic/unit-def.c
9b3774
@@ -160,6 +160,7 @@ DEFINE_STRING_TABLE_LOOKUP(path_state, PathState);
9b3774
 
9b3774
 static const char* const scope_state_table[_SCOPE_STATE_MAX] = {
9b3774
         [SCOPE_DEAD] = "dead",
9b3774
+        [SCOPE_START_CHOWN] = "start-chown",
9b3774
         [SCOPE_RUNNING] = "running",
9b3774
         [SCOPE_ABANDONED] = "abandoned",
9b3774
         [SCOPE_STOP_SIGTERM] = "stop-sigterm",
9b3774
diff --git a/src/basic/unit-def.h b/src/basic/unit-def.h
9b3774
index 8eea379a6d..03d151ec19 100644
9b3774
--- a/src/basic/unit-def.h
9b3774
+++ b/src/basic/unit-def.h
9b3774
@@ -99,6 +99,7 @@ typedef enum PathState {
9b3774
 
9b3774
 typedef enum ScopeState {
9b3774
         SCOPE_DEAD,
9b3774
+        SCOPE_START_CHOWN,
9b3774
         SCOPE_RUNNING,
9b3774
         SCOPE_ABANDONED,
9b3774
         SCOPE_STOP_SIGTERM,
9b3774
diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c
9b3774
index 0bbf64fff1..534302d188 100644
9b3774
--- a/src/core/dbus-scope.c
9b3774
+++ b/src/core/dbus-scope.c
9b3774
@@ -178,6 +178,12 @@ int bus_scope_set_property(
9b3774
                 r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error);
9b3774
                 if (r != 0)
9b3774
                         return r;
9b3774
+
9b3774
+                if (streq(name, "User"))
9b3774
+                        return bus_set_transient_user_relaxed(u, name, &s->user, message, flags, error);
9b3774
+
9b3774
+                if (streq(name, "Group"))
9b3774
+                        return bus_set_transient_user_relaxed(u, name, &s->group, message, flags, error);
9b3774
         }
9b3774
 
9b3774
         return 0;
9b3774
diff --git a/src/core/scope.c b/src/core/scope.c
9b3774
index 5a595c65a6..9cc5f89099 100644
9b3774
--- a/src/core/scope.c
9b3774
+++ b/src/core/scope.c
9b3774
@@ -5,6 +5,8 @@
9b3774
 
9b3774
 #include "alloc-util.h"
9b3774
 #include "dbus-scope.h"
9b3774
+#include "dbus-unit.h"
9b3774
+#include "exit-status.h"
9b3774
 #include "load-dropin.h"
9b3774
 #include "log.h"
9b3774
 #include "scope.h"
9b3774
@@ -14,9 +16,11 @@
9b3774
 #include "strv.h"
9b3774
 #include "unit-name.h"
9b3774
 #include "unit.h"
9b3774
+#include "user-util.h"
9b3774
 
9b3774
 static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
9b3774
         [SCOPE_DEAD] = UNIT_INACTIVE,
9b3774
+        [SCOPE_START_CHOWN] = UNIT_ACTIVATING,
9b3774
         [SCOPE_RUNNING] = UNIT_ACTIVE,
9b3774
         [SCOPE_ABANDONED] = UNIT_ACTIVE,
9b3774
         [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
9b3774
@@ -34,6 +38,7 @@ static void scope_init(Unit *u) {
9b3774
 
9b3774
         s->timeout_stop_usec = u->manager->default_timeout_stop_usec;
9b3774
         u->ignore_on_isolate = true;
9b3774
+        s->user = s->group = NULL;
9b3774
 }
9b3774
 
9b3774
 static void scope_done(Unit *u) {
9b3774
@@ -45,6 +50,9 @@ static void scope_done(Unit *u) {
9b3774
         s->controller_track = sd_bus_track_unref(s->controller_track);
9b3774
 
9b3774
         s->timer_event_source = sd_event_source_unref(s->timer_event_source);
9b3774
+
9b3774
+        s->user = mfree(s->user);
9b3774
+        s->group = mfree(s->group);
9b3774
 }
9b3774
 
9b3774
 static int scope_arm_timer(Scope *s, usec_t usec) {
9b3774
@@ -84,7 +92,7 @@ static void scope_set_state(Scope *s, ScopeState state) {
9b3774
         old_state = s->state;
9b3774
         s->state = state;
9b3774
 
9b3774
-        if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
9b3774
+        if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL, SCOPE_START_CHOWN))
9b3774
                 s->timer_event_source = sd_event_source_unref(s->timer_event_source);
9b3774
 
9b3774
         if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) {
9b3774
@@ -301,26 +309,72 @@ fail:
9b3774
         scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
9b3774
 }
9b3774
 
9b3774
-static int scope_start(Unit *u) {
9b3774
-        Scope *s = SCOPE(u);
9b3774
+static int scope_enter_start_chown(Scope *s) {
9b3774
+        Unit *u = UNIT(s);
9b3774
+        pid_t pid;
9b3774
         int r;
9b3774
 
9b3774
         assert(s);
9b3774
+        assert(s->user);
9b3774
 
9b3774
-        if (unit_has_name(u, SPECIAL_INIT_SCOPE))
9b3774
-                return -EPERM;
9b3774
+        r = scope_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), u->manager->default_timeout_start_usec));
9b3774
+        if (r < 0)
9b3774
+                return r;
9b3774
 
9b3774
-        if (s->state == SCOPE_FAILED)
9b3774
-                return -EPERM;
9b3774
+        r = unit_fork_helper_process(u, "(sd-chown-cgroup)", &pid;;
9b3774
+        if (r < 0)
9b3774
+                goto fail;
9b3774
 
9b3774
-        /* We can't fulfill this right now, please try again later */
9b3774
-        if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
9b3774
-                return -EAGAIN;
9b3774
+        if (r == 0) {
9b3774
+                uid_t uid = UID_INVALID;
9b3774
+                gid_t gid = GID_INVALID;
9b3774
 
9b3774
-        assert(s->state == SCOPE_DEAD);
9b3774
+                if (!isempty(s->user)) {
9b3774
+                        const char *user = s->user;
9b3774
 
9b3774
-        if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
9b3774
-                return -ENOENT;
9b3774
+                        r = get_user_creds(&user, &uid, &gid, NULL, NULL);
9b3774
+                        if (r < 0) {
9b3774
+                                log_unit_error_errno(UNIT(s), r, "Failed to resolve user \"%s\": %m", user);
9b3774
+                                _exit(EXIT_USER);
9b3774
+                        }
9b3774
+                }
9b3774
+
9b3774
+                if (!isempty(s->group)) {
9b3774
+                        const char *group = s->group;
9b3774
+
9b3774
+                        r = get_group_creds(&group, &gid;;
9b3774
+                        if (r < 0) {
9b3774
+                                log_unit_error_errno(UNIT(s), r, "Failed to resolve group \"%s\": %m", group);
9b3774
+                                _exit(EXIT_GROUP);
9b3774
+                        }
9b3774
+                }
9b3774
+
9b3774
+                r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, uid, gid);
9b3774
+                if (r < 0) {
9b3774
+                        log_unit_error_errno(UNIT(s), r, "Failed to adjust control group access: %m");
9b3774
+                        _exit(EXIT_CGROUP);
9b3774
+                }
9b3774
+
9b3774
+                _exit(EXIT_SUCCESS);
9b3774
+        }
9b3774
+
9b3774
+        r = unit_watch_pid(UNIT(s), pid, true);
9b3774
+        if (r < 0)
9b3774
+                goto fail;
9b3774
+
9b3774
+        scope_set_state(s, SCOPE_START_CHOWN);
9b3774
+
9b3774
+        return 1;
9b3774
+fail:
9b3774
+        s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
9b3774
+        return r;
9b3774
+}
9b3774
+
9b3774
+static int scope_enter_running(Scope *s) {
9b3774
+        Unit *u = UNIT(s);
9b3774
+        int r;
9b3774
+
9b3774
+        assert(s);
9b3774
 
9b3774
         (void) bus_scope_track_controller(s);
9b3774
 
9b3774
@@ -328,11 +382,7 @@ static int scope_start(Unit *u) {
9b3774
         if (r < 0)
9b3774
                 return r;
9b3774
 
9b3774
-        (void) unit_realize_cgroup(u);
9b3774
-        (void) unit_reset_cpu_accounting(u);
9b3774
-        (void) unit_reset_ip_accounting(u);
9b3774
-
9b3774
-        unit_export_state_files(UNIT(s));
9b3774
+        unit_export_state_files(u);
9b3774
 
9b3774
         r = unit_attach_pids_to_cgroup(u, UNIT(s)->pids, NULL);
9b3774
         if (r < 0) {
9b3774
@@ -350,6 +400,38 @@ static int scope_start(Unit *u) {
9b3774
         return 1;
9b3774
 }
9b3774
 
9b3774
+static int scope_start(Unit *u) {
9b3774
+        Scope *s = SCOPE(u);
9b3774
+
9b3774
+        assert(s);
9b3774
+
9b3774
+        if (unit_has_name(u, SPECIAL_INIT_SCOPE))
9b3774
+                return -EPERM;
9b3774
+
9b3774
+        if (s->state == SCOPE_FAILED)
9b3774
+                return -EPERM;
9b3774
+
9b3774
+        /* We can't fulfill this right now, please try again later */
9b3774
+        if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
9b3774
+                return -EAGAIN;
9b3774
+
9b3774
+        assert(s->state == SCOPE_DEAD);
9b3774
+
9b3774
+        if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
9b3774
+                return -ENOENT;
9b3774
+
9b3774
+        (void) unit_realize_cgroup(u);
9b3774
+        (void) unit_reset_cpu_accounting(u);
9b3774
+        (void) unit_reset_ip_accounting(u);
9b3774
+
9b3774
+        /* We check only for User= option to keep behavior consistent with logic for service units,
9b3774
+         * i.e. having 'Delegate=true Group=foo' w/o specifing User= has no effect. */
9b3774
+        if (s->user && unit_cgroup_delegate(u))
9b3774
+                return scope_enter_start_chown(s);
9b3774
+
9b3774
+        return scope_enter_running(s);
9b3774
+}
9b3774
+
9b3774
 static int scope_stop(Unit *u) {
9b3774
         Scope *s = SCOPE(u);
9b3774
 
9b3774
@@ -462,7 +544,17 @@ static void scope_notify_cgroup_empty_event(Unit *u) {
9b3774
 }
9b3774
 
9b3774
 static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
9b3774
-        assert(u);
9b3774
+        Scope *s = SCOPE(u);
9b3774
+
9b3774
+        assert(s);
9b3774
+
9b3774
+        if (s->state == SCOPE_START_CHOWN) {
9b3774
+                if (!is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
9b3774
+                        scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
9b3774
+                else
9b3774
+                        scope_enter_running(s);
9b3774
+                return;
9b3774
+        }
9b3774
 
9b3774
         /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to
9b3774
          * watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original
9b3774
@@ -495,6 +587,11 @@ static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *user
9b3774
                 scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
9b3774
                 break;
9b3774
 
9b3774
+        case SCOPE_START_CHOWN:
9b3774
+                log_unit_warning(UNIT(s), "User lookup timed out. Entering failed state.");
9b3774
+                scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
9b3774
+                break;
9b3774
+
9b3774
         default:
9b3774
                 assert_not_reached("Timeout at wrong time.");
9b3774
         }
9b3774
diff --git a/src/core/scope.h b/src/core/scope.h
9b3774
index c38afb5e5d..7bed3eed9e 100644
9b3774
--- a/src/core/scope.h
9b3774
+++ b/src/core/scope.h
9b3774
@@ -32,6 +32,9 @@ struct Scope {
9b3774
         bool was_abandoned;
9b3774
 
9b3774
         sd_event_source *timer_event_source;
9b3774
+
9b3774
+        char *user;
9b3774
+        char *group;
9b3774
 };
9b3774
 
9b3774
 extern const UnitVTable scope_vtable;
9b3774
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
9b3774
index 3910dfa812..c475bbafe0 100644
9b3774
--- a/src/shared/bus-unit-util.c
9b3774
+++ b/src/shared/bus-unit-util.c
9b3774
@@ -1615,6 +1615,11 @@ static int bus_append_unit_property(sd_bus_message *m, const char *field, const
9b3774
 
9b3774
                 return bus_append_parse_sec_rename(m, field, eq);
9b3774
 
9b3774
+        /* Scope units don't have execution context but we still want to allow setting these two,
9b3774
+         * so let's handle them separately. */
9b3774
+        if (STR_IN_SET(field, "User", "Group"))
9b3774
+                return bus_append_string(m, field, eq);
9b3774
+
9b3774
         if (streq(field, "StartLimitBurst"))
9b3774
 
9b3774
                 return bus_append_safe_atou(m, field, eq);
9b3774
diff --git a/test/TEST-19-DELEGATE/testsuite.sh b/test/TEST-19-DELEGATE/testsuite.sh
9b3774
index c738bea10e..c4c948cc11 100755
9b3774
--- a/test/TEST-19-DELEGATE/testsuite.sh
9b3774
+++ b/test/TEST-19-DELEGATE/testsuite.sh
9b3774
@@ -4,6 +4,16 @@
9b3774
 set -ex
9b3774
 set -o pipefail
9b3774
 
9b3774
+test_scope_unpriv_delegation() {
9b3774
+    useradd test ||:
9b3774
+    trap "userdel -r test" RETURN
9b3774
+
9b3774
+    systemd-run --uid=test -p User=test -p Delegate=yes --slice workload.slice --unit workload0.scope --scope \
9b3774
+            test -w /sys/fs/cgroup/workload.slice/workload0.scope -a \
9b3774
+            -w /sys/fs/cgroup/workload.slice/workload0.scope/cgroup.procs -a \
9b3774
+            -w /sys/fs/cgroup/workload.slice/workload0.scope/cgroup.subtree_control
9b3774
+}
9b3774
+
9b3774
 if grep -q cgroup2 /proc/filesystems ; then
9b3774
         systemd-run --wait --unit=test0.service -p "DynamicUser=1" -p "Delegate=" \
9b3774
                     test -w /sys/fs/cgroup/system.slice/test0.service/ -a \
9b3774
@@ -15,6 +25,9 @@ if grep -q cgroup2 /proc/filesystems ; then
9b3774
 
9b3774
         systemd-run --wait --unit=test2.service -p "DynamicUser=1" -p "Delegate=memory pids" \
9b3774
                     grep pids /sys/fs/cgroup/system.slice/test2.service/cgroup.controllers
9b3774
+
9b3774
+        # Check that unprivileged delegation works for scopes
9b3774
+        test_scope_unpriv_delegation
9b3774
 else
9b3774
         echo "Skipping TEST-19-DELEGATE, as the kernel doesn't actually support cgroupsv2" >&2
9b3774
 fi