923a60
From 581edd240f8dd68b1dbb4070353ddb2059eb8a67 Mon Sep 17 00:00:00 2001
923a60
From: Lennart Poettering <lennart@poettering.net>
923a60
Date: Fri, 27 Oct 2017 10:56:42 +0200
923a60
Subject: [PATCH] fd-util: add new acquire_data_fd() API helper
923a60
923a60
All this function does is place some data in an in-memory read-only fd,
923a60
that may be read back to get the original data back.
923a60
923a60
Doing this in a way that works everywhere, given the different kernels
923a60
we support as well as different privilege levels is surprisingly
923a60
complex.
923a60
923a60
(cherry picked from commit a548e14d690133dd8cca2d5ab8082bb23259fd5f)
923a60
923a60
Related: #1446095
923a60
---
923a60
 src/shared/util.c    | 156 +++++++++++++++++++++++++++++++++++++++++++
923a60
 src/shared/util.h    |  10 +++
923a60
 src/test/test-util.c |  49 ++++++++++++++
923a60
 3 files changed, 215 insertions(+)
923a60
923a60
diff --git a/src/shared/util.c b/src/shared/util.c
923a60
index af09532733..982f5e044f 100644
923a60
--- a/src/shared/util.c
923a60
+++ b/src/shared/util.c
923a60
@@ -95,6 +95,7 @@
923a60
 #include "sparse-endian.h"
923a60
 #include "conf-parser.h"
923a60
 #include "cgroup-util.h"
923a60
+#include "memfd-util.h"
923a60
 
923a60
 int saved_argc = 0;
923a60
 char **saved_argv = NULL;
923a60
@@ -8893,3 +8894,158 @@ uint64_t system_tasks_max_scale(uint64_t v, uint64_t max) {
923a60
 
923a60
         return m / max;
923a60
 }
923a60
+
923a60
+int acquire_data_fd(const void *data, size_t size, unsigned flags) {
923a60
+
923a60
+        char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
923a60
+        _cleanup_close_pair_ int pipefds[2] = { -1, -1 };
923a60
+        char pattern[] = "/dev/shm/data-fd-XXXXXX";
923a60
+        _cleanup_close_ int fd = -1;
923a60
+        int isz = 0, r;
923a60
+        ssize_t n;
923a60
+        off_t f;
923a60
+
923a60
+        assert(data || size == 0);
923a60
+
923a60
+        /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
923a60
+         * complex than I wish it was. But here's why:
923a60
+         *
923a60
+         * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
923a60
+         *    read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
923a60
+         *
923a60
+         * b) Then, we try classic pipes. They are the second best option, as we can close the writing side, retaining
923a60
+         *    a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
923a60
+         *    clients can only bump their size to a system-wide limit, which might be quite low.
923a60
+         *
923a60
+         * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
923a60
+         *    earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
923a60
+         *    /proc/self/fd/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
923a60
+         *
923a60
+         * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
923a60
+         *
923a60
+         * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
923a60
+         * figure. */
923a60
+
923a60
+        if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) {
923a60
+                /* As a special case, return /dev/null if we have been called for an empty data block */
923a60
+                r = open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY);
923a60
+                if (r < 0)
923a60
+                        return -errno;
923a60
+
923a60
+                return r;
923a60
+        }
923a60
+
923a60
+        if ((flags & ACQUIRE_NO_MEMFD) == 0) {
923a60
+                fd = memfd_new("data-fd");
923a60
+                if (fd < 0)
923a60
+                        goto try_pipe;
923a60
+
923a60
+                n = write(fd, data, size);
923a60
+                if (n < 0)
923a60
+                        return -errno;
923a60
+                if ((size_t) n != size)
923a60
+                        return -EIO;
923a60
+
923a60
+                f = lseek(fd, 0, SEEK_SET);
923a60
+                if (f != 0)
923a60
+                        return -errno;
923a60
+
923a60
+                r = memfd_set_sealed(fd);
923a60
+                if (r < 0)
923a60
+                        return r;
923a60
+
923a60
+                r = fd;
923a60
+                fd = -1;
923a60
+
923a60
+                return r;
923a60
+        }
923a60
+
923a60
+try_pipe:
923a60
+        if ((flags & ACQUIRE_NO_PIPE) == 0) {
923a60
+                if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
923a60
+                        return -errno;
923a60
+
923a60
+                isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
923a60
+                if (isz < 0)
923a60
+                        return -errno;
923a60
+
923a60
+                if ((size_t) isz < size) {
923a60
+                        isz = (int) size;
923a60
+                        if (isz < 0 || (size_t) isz != size)
923a60
+                                return -E2BIG;
923a60
+
923a60
+                        /* Try to bump the pipe size */
923a60
+                        (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
923a60
+
923a60
+                        /* See if that worked */
923a60
+                        isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
923a60
+                        if (isz < 0)
923a60
+                                return -errno;
923a60
+
923a60
+                        if ((size_t) isz < size)
923a60
+                                goto try_dev_shm;
923a60
+                }
923a60
+
923a60
+                n = write(pipefds[1], data, size);
923a60
+                if (n < 0)
923a60
+                        return -errno;
923a60
+                if ((size_t) n != size)
923a60
+                        return -EIO;
923a60
+
923a60
+                (void) fd_nonblock(pipefds[0], false);
923a60
+
923a60
+                r = pipefds[0];
923a60
+                pipefds[0] = -1;
923a60
+
923a60
+                return r;
923a60
+        }
923a60
+
923a60
+try_dev_shm:
923a60
+        if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
923a60
+                fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
923a60
+                if (fd < 0)
923a60
+                        goto try_dev_shm_without_o_tmpfile;
923a60
+
923a60
+                n = write(fd, data, size);
923a60
+                if (n < 0)
923a60
+                        return -errno;
923a60
+                if ((size_t) n != size)
923a60
+                        return -EIO;
923a60
+
923a60
+                /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
923a60
+                xsprintf(procfs_path, "/proc/self/fd/%i", fd);
923a60
+                r = open(procfs_path, O_RDONLY|O_CLOEXEC);
923a60
+                if (r < 0)
923a60
+                        return -errno;
923a60
+
923a60
+                return r;
923a60
+        }
923a60
+
923a60
+try_dev_shm_without_o_tmpfile:
923a60
+        if ((flags & ACQUIRE_NO_REGULAR) == 0) {
923a60
+                fd = mkostemp_safe(pattern, O_CLOEXEC);
923a60
+                if (fd < 0)
923a60
+                        return fd;
923a60
+
923a60
+                n = write(fd, data, size);
923a60
+                if (n < 0) {
923a60
+                        r = -errno;
923a60
+                        goto unlink_and_return;
923a60
+                }
923a60
+                if ((size_t) n != size) {
923a60
+                        r = -EIO;
923a60
+                        goto unlink_and_return;
923a60
+                }
923a60
+
923a60
+                /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
923a60
+                r = open(pattern, O_RDONLY|O_CLOEXEC);
923a60
+                if (r < 0)
923a60
+                        r = -errno;
923a60
+
923a60
+        unlink_and_return:
923a60
+                (void) unlink(pattern);
923a60
+                return r;
923a60
+        }
923a60
+
923a60
+        return -EOPNOTSUPP;
923a60
+}
923a60
diff --git a/src/shared/util.h b/src/shared/util.h
923a60
index 526a6fe848..9c4be02566 100644
923a60
--- a/src/shared/util.h
923a60
+++ b/src/shared/util.h
923a60
@@ -1112,3 +1112,13 @@ int parse_percent(const char *p);
923a60
 
923a60
 uint64_t system_tasks_max(void);
923a60
 uint64_t system_tasks_max_scale(uint64_t v, uint64_t max);
923a60
+
923a60
+enum {
923a60
+        ACQUIRE_NO_DEV_NULL = 1 << 0,
923a60
+        ACQUIRE_NO_MEMFD    = 1 << 1,
923a60
+        ACQUIRE_NO_PIPE     = 1 << 2,
923a60
+        ACQUIRE_NO_TMPFILE  = 1 << 3,
923a60
+        ACQUIRE_NO_REGULAR  = 1 << 4,
923a60
+};
923a60
+
923a60
+int acquire_data_fd(const void *data, size_t size, unsigned flags);
923a60
diff --git a/src/test/test-util.c b/src/test/test-util.c
923a60
index f2c52edcee..efb02ff530 100644
923a60
--- a/src/test/test-util.c
923a60
+++ b/src/test/test-util.c
923a60
@@ -1861,6 +1861,54 @@ static void test_system_tasks_max_scale(void) {
923a60
         assert_se(system_tasks_max_scale(UINT64_MAX/4, UINT64_MAX) == UINT64_MAX);
923a60
 }
923a60
 
923a60
+static void test_acquire_data_fd_one(unsigned flags) {
923a60
+        char wbuffer[196*1024 - 7];
923a60
+        char rbuffer[sizeof(wbuffer)];
923a60
+        int fd;
923a60
+
923a60
+        fd = acquire_data_fd("foo", 3, flags);
923a60
+        assert_se(fd >= 0);
923a60
+
923a60
+        zero(rbuffer);
923a60
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 3);
923a60
+        assert_se(streq(rbuffer, "foo"));
923a60
+
923a60
+        fd = safe_close(fd);
923a60
+
923a60
+        fd = acquire_data_fd("", 0, flags);
923a60
+        assert_se(fd >= 0);
923a60
+
923a60
+        zero(rbuffer);
923a60
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 0);
923a60
+        assert_se(streq(rbuffer, ""));
923a60
+
923a60
+        fd = safe_close(fd);
923a60
+
923a60
+        random_bytes(wbuffer, sizeof(wbuffer));
923a60
+
923a60
+        fd = acquire_data_fd(wbuffer, sizeof(wbuffer), flags);
923a60
+        assert_se(fd >= 0);
923a60
+
923a60
+        zero(rbuffer);
923a60
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == sizeof(rbuffer));
923a60
+        assert_se(memcmp(rbuffer, wbuffer, sizeof(rbuffer)) == 0);
923a60
+
923a60
+        fd = safe_close(fd);
923a60
+}
923a60
+
923a60
+static void test_acquire_data_fd(void) {
923a60
+
923a60
+        test_acquire_data_fd_one(0);
923a60
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL);
923a60
+        test_acquire_data_fd_one(ACQUIRE_NO_MEMFD);
923a60
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD);
923a60
+        test_acquire_data_fd_one(ACQUIRE_NO_PIPE);
923a60
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_PIPE);
923a60
+        test_acquire_data_fd_one(ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE);
923a60
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE);
923a60
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE|ACQUIRE_NO_TMPFILE);
923a60
+}
923a60
+
923a60
 int main(int argc, char *argv[]) {
923a60
         log_parse_environment();
923a60
         log_open();
923a60
@@ -1943,6 +1991,7 @@ int main(int argc, char *argv[]) {
923a60
         test_shell_maybe_quote();
923a60
         test_system_tasks_max();
923a60
         test_system_tasks_max_scale();
923a60
+        test_acquire_data_fd();
923a60
 
923a60
         return 0;
923a60
 }