From 705cc82938b67fa110f2f6f5d28bfb9ec2f339c0 Mon Sep 17 00:00:00 2001 From: Ryan Wilson Date: Fri, 15 Nov 2024 06:56:05 -0800 Subject: [PATCH 1/2] core: Add PrivateUsers=full Recently, PrivateUsers=identity was added to support mapping the first 65536 UIDs/GIDs from parent to the child namespace and mapping the other UID/GIDs to the nobody user. However, there are use cases where users have UIDs/GIDs > 65536 and need to do a similar identity mapping. Moreover, in some of those cases, users want a full identity mapping from 0 -> UID_MAX. Note to differentiate ourselves from the init user namespace, we need to set up the uid_map/gid_map like: ``` 0 0 1 1 1 UINT32_MAX - 1 ``` as the init user namedspace uses `0 0 UINT32_MAX` and some applications - like systemd itself - determine if its a non-init user namespace based on uid_map/gid_map files. Note systemd will remove this heuristic in running_in_userns() in version 258 and uses namespace inode. But some users may be running a container image with older systemd < 258 so we keep this hack until version 259. To support this, we add PrivateUsers=full that does identity mapping for all available UID/GIDs. Fixes: #35168 --- man/systemd.exec.xml | 8 +++++-- src/core/exec-invoke.c | 28 ++++++++++++++++++++++++ src/core/namespace.c | 1 + src/core/namespace.h | 1 + test/units/TEST-07-PID1.private-users.sh | 2 ++ 5 files changed, 38 insertions(+), 2 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 607c88128ded4..482dbbda80a84 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2009,8 +2009,8 @@ BindReadOnlyPaths=/var/lib/systemd PrivateUsers= - Takes a boolean argument or one of self or - identity. Defaults to false. If enabled, sets up a new user namespace for the + Takes a boolean argument or one of self, identity, + or full. Defaults to false. If enabled, sets up a new user namespace for the executed processes and configures a user and group mapping. If set to a true value or self, a minimal user and group mapping is configured that maps the root user and group as well as the unit's own user and group to themselves and @@ -2026,6 +2026,10 @@ BindReadOnlyPaths=/var/lib/systemd since all UIDs/GIDs are chosen identically it does provide process capability isolation, and hence is often a good choice if proper user namespacing with distinct UID maps is not appropriate. + If the parameter is full, user namespacing is set up with an identity + mapping for all UIDs/GIDs. Similar to identity, this does not provide UID/GID + isolation, but it does provide process capability isolation. + If this mode is enabled, all unit processes are run without privileges in the host user namespace (regardless if the unit's own user/group is root or not). Specifically this means that the process will have zero process capabilities on the host's user namespace, but diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 9d636f552950d..682d6449d76f3 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2103,6 +2103,29 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi uid_map = strdup("0 0 65536\n"); if (!uid_map) return -ENOMEM; + } else if (private_users == PRIVATE_USERS_FULL) { + /* Map all UID/GID from original to new user namespace. We can't use `0 0 UINT32_MAX` because + * this is the same UID/GID map as the init user namespace and systemd's running_in_userns() + * checks whether its in a user namespace by comparing uid_map/gid_map to `0 0 UINT32_MAX`. + * Thus, we still map all UIDs/GIDs but do it using two extents to differentiate the new user + * namespace from the init namespace: + * 0 0 1 + * 1 1 UINT32_MAX - 1 + * + * systemd will remove the heuristic in running_in_userns() and use namespace inodes in version 258 + * (PR #35382). But some users may be running a container image with older systemd < 258 so we keep + * this uid_map/gid_map hack until version 259 for version N-1 compatibility. + * + * TODO: Switch to `0 0 UINT32_MAX` in systemd v259. + * + * Note the kernel defines the UID range between 0 and UINT32_MAX so we map all UIDs even though + * the UID range beyond INT32_MAX (e.g. i.e. the range above the signed 32-bit range) is + * icky. For example, setfsuid() returns the old UID as signed integer. But units can decide to + * use these UIDs/GIDs so we need to map them. */ + r = asprintf(&uid_map, "0 0 1\n" + "1 1 " UID_FMT "\n", (uid_t) (UINT32_MAX - 1)); + if (r < 0) + return -ENOMEM; /* Can only set up multiple mappings with CAP_SETUID. */ } else if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid)) { r = asprintf(&uid_map, @@ -2123,6 +2146,11 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi gid_map = strdup("0 0 65536\n"); if (!gid_map) return -ENOMEM; + } else if (private_users == PRIVATE_USERS_FULL) { + r = asprintf(&gid_map, "0 0 1\n" + "1 1 " GID_FMT "\n", (gid_t) (UINT32_MAX - 1)); + if (r < 0) + return -ENOMEM; /* Can only set up multiple mappings with CAP_SETGID. */ } else if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid)) { r = asprintf(&gid_map, diff --git a/src/core/namespace.c b/src/core/namespace.c index 57dbbc4fc7dc5..c584ea35724d1 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -3364,6 +3364,7 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = { [PRIVATE_USERS_NO] = "no", [PRIVATE_USERS_SELF] = "self", [PRIVATE_USERS_IDENTITY] = "identity", + [PRIVATE_USERS_FULL] = "full", }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF); diff --git a/src/core/namespace.h b/src/core/namespace.h index bd48aa31da71c..5d466a8c1c724 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -65,6 +65,7 @@ typedef enum PrivateUsers { PRIVATE_USERS_NO, PRIVATE_USERS_SELF, PRIVATE_USERS_IDENTITY, + PRIVATE_USERS_FULL, _PRIVATE_USERS_MAX, _PRIVATE_USERS_INVALID = -EINVAL, } PrivateUsers; diff --git a/test/units/TEST-07-PID1.private-users.sh b/test/units/TEST-07-PID1.private-users.sh index 2475b5d365d59..ba85248f9607e 100755 --- a/test/units/TEST-07-PID1.private-users.sh +++ b/test/units/TEST-07-PID1.private-users.sh @@ -10,3 +10,5 @@ systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_ma systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 65536"' +systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' +systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' From 878e86f12b7184a87a9cc1ecd4f99c5d9744f931 Mon Sep 17 00:00:00 2001 From: Ryan Wilson Date: Sat, 30 Nov 2024 14:14:35 -0800 Subject: [PATCH 2/2] core: Set /proc/pid/setgroups to allow for PrivateUsers=full When trying to run dbus-broker in a systemd unit with PrivateUsers=full, we see dbus-broker fails with EPERM at `util_audit_drop_permissions`. The root cause is dbus-broker calls the setgroups() system call and this is disallowed via systemd's implementation of PrivateUsers= by setting /proc/pid/setgroups = deny. This is done to remediate potential privilege escalation vulnerabilities in user namespaces where an attacker can remove supplementary groups and gain access to resources where those groups are restricted. However, for OS-like containers, setgroups() is a pretty common API and disabling it is not feasible. So we allow setgroups() by setting /proc/pid/setgroups to allow in PrivateUsers=full. Note security conscious users can still use SystemCallFilter= to disable setgroups() if they want to specifically prevent this system call. Fixes: #35425 --- man/systemd.exec.xml | 7 +++++-- src/core/exec-invoke.c | 23 ++++++++++++++++------- test/units/TEST-07-PID1.private-users.sh | 3 +++ 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 482dbbda80a84..b31e64f57c844 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2027,8 +2027,11 @@ BindReadOnlyPaths=/var/lib/systemd often a good choice if proper user namespacing with distinct UID maps is not appropriate. If the parameter is full, user namespacing is set up with an identity - mapping for all UIDs/GIDs. Similar to identity, this does not provide UID/GID - isolation, but it does provide process capability isolation. + mapping for all UIDs/GIDs. In addition, for system services, full allows the unit + to call setgroups() system calls (by setting + /proc/pid/setgroups to allow). + Similar to identity, this does not provide UID/GID isolation, but it does provide + process capability isolation. If this mode is enabled, all unit processes are run without privileges in the host user namespace (regardless if the unit's own user/group is root or not). Specifically diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 682d6449d76f3..8305bb2bcf7da 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2077,7 +2077,7 @@ static int build_pass_environment(const ExecContext *c, char ***ret) { return 0; } -static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) { +static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) { _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; _cleanup_close_ int unshare_ready_fd = -EBADF; @@ -2196,7 +2196,8 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi if (read(unshare_ready_fd, &c, sizeof(c)) < 0) report_errno_and_exit(errno_pipe[1], -errno); - /* Disable the setgroups() system call in the child user namespace, for good. */ + /* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full + * and using the system service manager. */ a = procfs_file_alloca(ppid, "setgroups"); fd = open(a, O_WRONLY|O_CLOEXEC); if (fd < 0) { @@ -2207,10 +2208,15 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi /* If the file is missing the kernel is too old, let's continue anyway. */ } else { - if (write(fd, "deny\n", 5) < 0) { - r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a); - report_errno_and_exit(errno_pipe[1], r); + if (allow_setgroups) { + if (write(fd, "allow\n", 6) < 0) + r = log_debug_errno(errno, "Failed to write \"allow\" to %s: %m", a); + } else { + if (write(fd, "deny\n", 5) < 0) + r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a); } + if (r < 0) + report_errno_and_exit(errno_pipe[1], r); fd = safe_close(fd); } @@ -5007,7 +5013,9 @@ int exec_invoke( if (pu == PRIVATE_USERS_NO) pu = PRIVATE_USERS_SELF; - r = setup_private_users(pu, saved_uid, saved_gid, uid, gid); + /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in + * unprivileged user namespaces. */ + r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false); /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let * the actual requested operations fail (or silently continue). */ if (r < 0 && context->private_users != PRIVATE_USERS_NO) { @@ -5177,7 +5185,8 @@ int exec_invoke( * different user namespace). */ if (needs_sandboxing && !userns_set_up) { - r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid); + r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid, + /* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL); if (r < 0) { *exit_status = EXIT_USER; return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m"); diff --git a/test/units/TEST-07-PID1.private-users.sh b/test/units/TEST-07-PID1.private-users.sh index ba85248f9607e..e788f52a2f73f 100755 --- a/test/units/TEST-07-PID1.private-users.sh +++ b/test/units/TEST-07-PID1.private-users.sh @@ -6,9 +6,12 @@ set -o pipefail systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"' systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"' +systemd-run -p PrivateUsersEx=yes --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"' systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"' systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"' +systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 65536"' systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' +systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/setgroups)" == "allow"'