Blob Blame History Raw
From 705cc82938b67fa110f2f6f5d28bfb9ec2f339c0 Mon Sep 17 00:00:00 2001
From: Ryan Wilson <ryantimwilson@meta.com>
Date: Fri, 15 Nov 2024 06:56:05 -0800
Subject: [PATCH 1/2] core: Add PrivateUsers=full

Recently, PrivateUsers=identity was added to support mapping the first
65536 UIDs/GIDs from parent to the child namespace and mapping the other
UID/GIDs to the nobody user.

However, there are use cases where users have UIDs/GIDs > 65536 and need
to do a similar identity mapping. Moreover, in some of those cases, users
want a full identity mapping from 0 -> UID_MAX.

Note to differentiate ourselves from the init user namespace, we need to
set up the uid_map/gid_map like:
```
0 0 1
1 1 UINT32_MAX - 1
```

as the init user namedspace uses `0 0 UINT32_MAX` and some applications -
like systemd itself - determine if its a non-init user namespace based on
uid_map/gid_map files. Note systemd will remove this heuristic in
running_in_userns() in version 258 and uses namespace inode. But some users
may be running a container image with older systemd < 258 so we keep this
hack until version 259.

To support this, we add PrivateUsers=full that does identity mapping for
all available UID/GIDs.

Fixes: #35168
---
 man/systemd.exec.xml                     |  8 +++++--
 src/core/exec-invoke.c                   | 28 ++++++++++++++++++++++++
 src/core/namespace.c                     |  1 +
 src/core/namespace.h                     |  1 +
 test/units/TEST-07-PID1.private-users.sh |  2 ++
 5 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 607c88128ded4..482dbbda80a84 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -2009,8 +2009,8 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
       <varlistentry>
         <term><varname>PrivateUsers=</varname></term>
 
-        <listitem><para>Takes a boolean argument or one of <literal>self</literal> or
-        <literal>identity</literal>. Defaults to false. If enabled, sets up a new user namespace for the
+        <listitem><para>Takes a boolean argument or one of <literal>self</literal>, <literal>identity</literal>,
+        or <literal>full</literal>. Defaults to false. If enabled, sets up a new user namespace for the
         executed processes and configures a user and group mapping. If set to a true value or
         <literal>self</literal>, a minimal user and group mapping is configured that maps the
         <literal>root</literal> user and group as well as the unit's own user and group to themselves and
@@ -2026,6 +2026,10 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
         since all UIDs/GIDs are chosen identically it does provide process capability isolation, and hence is
         often a good choice if proper user namespacing with distinct UID maps is not appropriate.</para>
 
+        <para>If the parameter is <literal>full</literal>, user namespacing is set up with an identity
+        mapping for all UIDs/GIDs. Similar to <literal>identity</literal>, this does not provide UID/GID
+        isolation, but it does provide process capability isolation.</para>
+
         <para>If this mode is enabled, all unit processes are run without privileges in the host user
         namespace (regardless if the unit's own user/group is <literal>root</literal> or not). Specifically
         this means that the process will have zero process capabilities on the host's user namespace, but
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index 9d636f552950d..682d6449d76f3 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -2103,6 +2103,29 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
                 uid_map = strdup("0 0 65536\n");
                 if (!uid_map)
                         return -ENOMEM;
+        } else if (private_users == PRIVATE_USERS_FULL) {
+                /* Map all UID/GID from original to new user namespace. We can't use `0 0 UINT32_MAX` because
+                 * this is the same UID/GID map as the init user namespace and systemd's running_in_userns()
+                 * checks whether its in a user namespace by comparing uid_map/gid_map to `0 0 UINT32_MAX`.
+                 * Thus, we still map all UIDs/GIDs but do it using two extents to differentiate the new user
+                 * namespace from the init namespace:
+                 *   0 0 1
+                 *   1 1 UINT32_MAX - 1
+                 *
+                 * systemd will remove the heuristic in running_in_userns() and use namespace inodes in version 258
+                 * (PR #35382). But some users may be running a container image with older systemd < 258 so we keep
+                 * this uid_map/gid_map hack until version 259 for version N-1 compatibility.
+                 *
+                 * TODO: Switch to `0 0 UINT32_MAX` in systemd v259.
+                 *
+                 * Note the kernel defines the UID range between 0 and UINT32_MAX so we map all UIDs even though
+                 * the UID range beyond INT32_MAX (e.g. i.e. the range above the signed 32-bit range) is
+                 * icky. For example, setfsuid() returns the old UID as signed integer. But units can decide to
+                 * use these UIDs/GIDs so we need to map them. */
+                r = asprintf(&uid_map, "0 0 1\n"
+                                       "1 1 " UID_FMT "\n", (uid_t) (UINT32_MAX - 1));
+                if (r < 0)
+                        return -ENOMEM;
         /* Can only set up multiple mappings with CAP_SETUID. */
         } else if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid)) {
                 r = asprintf(&uid_map,
@@ -2123,6 +2146,11 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
                 gid_map = strdup("0 0 65536\n");
                 if (!gid_map)
                         return -ENOMEM;
+        } else if (private_users == PRIVATE_USERS_FULL) {
+                r = asprintf(&gid_map, "0 0 1\n"
+                                       "1 1 " GID_FMT "\n", (gid_t) (UINT32_MAX - 1));
+                if (r < 0)
+                        return -ENOMEM;
         /* Can only set up multiple mappings with CAP_SETGID. */
         } else if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid)) {
                 r = asprintf(&gid_map,
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 57dbbc4fc7dc5..c584ea35724d1 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -3364,6 +3364,7 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = {
         [PRIVATE_USERS_NO]       = "no",
         [PRIVATE_USERS_SELF]     = "self",
         [PRIVATE_USERS_IDENTITY] = "identity",
+        [PRIVATE_USERS_FULL]     = "full",
 };
 
 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF);
diff --git a/src/core/namespace.h b/src/core/namespace.h
index bd48aa31da71c..5d466a8c1c724 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -65,6 +65,7 @@ typedef enum PrivateUsers {
         PRIVATE_USERS_NO,
         PRIVATE_USERS_SELF,
         PRIVATE_USERS_IDENTITY,
+        PRIVATE_USERS_FULL,
         _PRIVATE_USERS_MAX,
         _PRIVATE_USERS_INVALID = -EINVAL,
 } PrivateUsers;
diff --git a/test/units/TEST-07-PID1.private-users.sh b/test/units/TEST-07-PID1.private-users.sh
index 2475b5d365d59..ba85248f9607e 100755
--- a/test/units/TEST-07-PID1.private-users.sh
+++ b/test/units/TEST-07-PID1.private-users.sh
@@ -10,3 +10,5 @@ systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_ma
 systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == "         0          0          1"'
 systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == "         0          0      65536"'
 systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == "         0          0      65536"'
+systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == "         0          0          1         1          1 4294967294"'
+systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == "         0          0          1         1          1 4294967294"'

From 878e86f12b7184a87a9cc1ecd4f99c5d9744f931 Mon Sep 17 00:00:00 2001
From: Ryan Wilson <ryantimwilson@meta.com>
Date: Sat, 30 Nov 2024 14:14:35 -0800
Subject: [PATCH 2/2] core: Set /proc/pid/setgroups to allow for
 PrivateUsers=full

When trying to run dbus-broker in a systemd unit with PrivateUsers=full,
we see dbus-broker fails with EPERM at `util_audit_drop_permissions`.

The root cause is dbus-broker calls the setgroups() system call and this
is disallowed via systemd's implementation of PrivateUsers= by setting
/proc/pid/setgroups = deny. This is done to remediate potential privilege
escalation vulnerabilities in user namespaces where an attacker can remove
supplementary groups and gain access to resources where those groups are
restricted.

However, for OS-like containers, setgroups() is a pretty common API and
disabling it is not feasible. So we allow setgroups() by setting
/proc/pid/setgroups to allow in PrivateUsers=full. Note security conscious
users can still use SystemCallFilter= to disable setgroups() if they want
to specifically prevent this system call.

Fixes: #35425
---
 man/systemd.exec.xml                     |  7 +++++--
 src/core/exec-invoke.c                   | 23 ++++++++++++++++-------
 test/units/TEST-07-PID1.private-users.sh |  3 +++
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 482dbbda80a84..b31e64f57c844 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -2027,8 +2027,11 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
         often a good choice if proper user namespacing with distinct UID maps is not appropriate.</para>
 
         <para>If the parameter is <literal>full</literal>, user namespacing is set up with an identity
-        mapping for all UIDs/GIDs. Similar to <literal>identity</literal>, this does not provide UID/GID
-        isolation, but it does provide process capability isolation.</para>
+        mapping for all UIDs/GIDs. In addition, for system services, <literal>full</literal> allows the unit
+        to call <function>setgroups()</function> system calls (by setting
+        <filename>/proc/<replaceable>pid</replaceable>/setgroups</filename> to <literal>allow</literal>).
+        Similar to <literal>identity</literal>, this does not provide UID/GID isolation, but it does provide
+        process capability isolation.</para>
 
         <para>If this mode is enabled, all unit processes are run without privileges in the host user
         namespace (regardless if the unit's own user/group is <literal>root</literal> or not). Specifically
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index 682d6449d76f3..8305bb2bcf7da 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -2077,7 +2077,7 @@ static int build_pass_environment(const ExecContext *c, char ***ret) {
         return 0;
 }
 
-static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
+static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
         _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
         _cleanup_close_ int unshare_ready_fd = -EBADF;
@@ -2196,7 +2196,8 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0)
                         report_errno_and_exit(errno_pipe[1], -errno);
 
-                /* Disable the setgroups() system call in the child user namespace, for good. */
+                /* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full
+                 * and using the system service manager. */
                 a = procfs_file_alloca(ppid, "setgroups");
                 fd = open(a, O_WRONLY|O_CLOEXEC);
                 if (fd < 0) {
@@ -2207,10 +2208,15 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
 
                         /* If the file is missing the kernel is too old, let's continue anyway. */
                 } else {
-                        if (write(fd, "deny\n", 5) < 0) {
-                                r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a);
-                                report_errno_and_exit(errno_pipe[1], r);
+                        if (allow_setgroups) {
+                                if (write(fd, "allow\n", 6) < 0)
+                                        r = log_debug_errno(errno, "Failed to write \"allow\" to %s: %m", a);
+                        } else {
+                                if (write(fd, "deny\n", 5) < 0)
+                                        r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a);
                         }
+                        if (r < 0)
+                                report_errno_and_exit(errno_pipe[1], r);
 
                         fd = safe_close(fd);
                 }
@@ -5007,7 +5013,9 @@ int exec_invoke(
                 if (pu == PRIVATE_USERS_NO)
                         pu = PRIVATE_USERS_SELF;
 
-                r = setup_private_users(pu, saved_uid, saved_gid, uid, gid);
+                /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
+                 * unprivileged user namespaces. */
+                r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false);
                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
                  * the actual requested operations fail (or silently continue). */
                 if (r < 0 && context->private_users != PRIVATE_USERS_NO) {
@@ -5177,7 +5185,8 @@ int exec_invoke(
          * different user namespace). */
 
         if (needs_sandboxing && !userns_set_up) {
-                r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid);
+                r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid,
+                                        /* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL);
                 if (r < 0) {
                         *exit_status = EXIT_USER;
                         return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
diff --git a/test/units/TEST-07-PID1.private-users.sh b/test/units/TEST-07-PID1.private-users.sh
index ba85248f9607e..e788f52a2f73f 100755
--- a/test/units/TEST-07-PID1.private-users.sh
+++ b/test/units/TEST-07-PID1.private-users.sh
@@ -6,9 +6,12 @@ set -o pipefail
 
 systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/uid_map)" == "         0          0          1"'
 systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/gid_map)" == "         0          0          1"'
+systemd-run -p PrivateUsersEx=yes --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"'
 systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_map)" == "         0          0          1"'
 systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == "         0          0          1"'
+systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"'
 systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == "         0          0      65536"'
 systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == "         0          0      65536"'
 systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == "         0          0          1         1          1 4294967294"'
 systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == "         0          0          1         1          1 4294967294"'
+systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/setgroups)" == "allow"'