a9339c
From 581edd240f8dd68b1dbb4070353ddb2059eb8a67 Mon Sep 17 00:00:00 2001
a9339c
From: Lennart Poettering <lennart@poettering.net>
a9339c
Date: Fri, 27 Oct 2017 10:56:42 +0200
a9339c
Subject: [PATCH] fd-util: add new acquire_data_fd() API helper
a9339c
a9339c
All this function does is place some data in an in-memory read-only fd,
a9339c
that may be read back to get the original data back.
a9339c
a9339c
Doing this in a way that works everywhere, given the different kernels
a9339c
we support as well as different privilege levels is surprisingly
a9339c
complex.
a9339c
a9339c
(cherry picked from commit a548e14d690133dd8cca2d5ab8082bb23259fd5f)
a9339c
a9339c
Related: #1446095
a9339c
---
a9339c
 src/shared/util.c    | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++
a9339c
 src/shared/util.h    |  10 ++++
a9339c
 src/test/test-util.c |  49 ++++++++++++++++
a9339c
 3 files changed, 215 insertions(+)
a9339c
a9339c
diff --git a/src/shared/util.c b/src/shared/util.c
a9339c
index af0953273..982f5e044 100644
a9339c
--- a/src/shared/util.c
a9339c
+++ b/src/shared/util.c
a9339c
@@ -95,6 +95,7 @@
a9339c
 #include "sparse-endian.h"
a9339c
 #include "conf-parser.h"
a9339c
 #include "cgroup-util.h"
a9339c
+#include "memfd-util.h"
a9339c
 
a9339c
 int saved_argc = 0;
a9339c
 char **saved_argv = NULL;
a9339c
@@ -8893,3 +8894,158 @@ uint64_t system_tasks_max_scale(uint64_t v, uint64_t max) {
a9339c
 
a9339c
         return m / max;
a9339c
 }
a9339c
+
a9339c
+int acquire_data_fd(const void *data, size_t size, unsigned flags) {
a9339c
+        /* Returns an fd >= 0 on success, or a negative errno-style error (-EOPNOTSUPP if no method is left to try). */
a9339c
+        char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
a9339c
+        _cleanup_close_pair_ int pipefds[2] = { -1, -1 };
a9339c
+        char pattern[] = "/dev/shm/data-fd-XXXXXX";
a9339c
+        _cleanup_close_ int fd = -1;
a9339c
+        int isz = 0, r;
a9339c
+        ssize_t n;
a9339c
+        off_t f;
a9339c
+
a9339c
+        assert(data || size == 0);
a9339c
+
a9339c
+        /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
a9339c
+         * complex than I wish it was. But here's why:
a9339c
+         *
a9339c
+         * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
a9339c
+         *    read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
a9339c
+         *
a9339c
+         * b) Then, we try classic pipes. They are the second-best option, as we can close the writing side, retaining
a9339c
+         *    a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
a9339c
+         *    clients can only bump their size to a system-wide limit, which might be quite low.
a9339c
+         *
a9339c
+         * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
a9339c
+         *    earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
a9339c
+         *    /proc/self/fd/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
a9339c
+         *
a9339c
+         * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
a9339c
+         *
a9339c
+         * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
a9339c
+         * figure. */
a9339c
+
a9339c
+        if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) {
a9339c
+                /* As a special case, return /dev/null if we have been called for an empty data block */
a9339c
+                r = open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY);
a9339c
+                if (r < 0)
a9339c
+                        return -errno;
a9339c
+
a9339c
+                return r;
a9339c
+        }
a9339c
+
a9339c
+        if ((flags & ACQUIRE_NO_MEMFD) == 0) {
a9339c
+                fd = memfd_new("data-fd");
a9339c
+                if (fd < 0)
a9339c
+                        goto try_pipe; /* memfd_new() failed: fall back to a pipe */
a9339c
+
a9339c
+                n = write(fd, data, size);
a9339c
+                if (n < 0)
a9339c
+                        return -errno;
a9339c
+                if ((size_t) n != size) /* a short write is treated as an I/O error, no retry */
a9339c
+                        return -EIO;
a9339c
+
a9339c
+                f = lseek(fd, 0, SEEK_SET); /* rewind, so that reading starts at the beginning of the data */
a9339c
+                if (f != 0)
a9339c
+                        return -errno;
a9339c
+
a9339c
+                r = memfd_set_sealed(fd);
a9339c
+                if (r < 0)
a9339c
+                        return r;
a9339c
+
a9339c
+                r = fd;
a9339c
+                fd = -1; /* fd is handed to the caller; resetting to -1 presumably disarms _cleanup_close_ */
a9339c
+
a9339c
+                return r;
a9339c
+        }
a9339c
+
a9339c
+try_pipe:
a9339c
+        if ((flags & ACQUIRE_NO_PIPE) == 0) {
a9339c
+                if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
a9339c
+                        return -errno;
a9339c
+
a9339c
+                isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
a9339c
+                if (isz < 0)
a9339c
+                        return -errno;
a9339c
+
a9339c
+                if ((size_t) isz < size) {
a9339c
+                        isz = (int) size;
a9339c
+                        if (isz < 0 || (size_t) isz != size) /* size does not fit into an int */
a9339c
+                                return -E2BIG;
a9339c
+
a9339c
+                        /* Try to bump the pipe size */
a9339c
+                        (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
a9339c
+
a9339c
+                        /* See if that worked */
a9339c
+                        isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
a9339c
+                        if (isz < 0)
a9339c
+                                return -errno;
a9339c
+
a9339c
+                        if ((size_t) isz < size)
a9339c
+                                goto try_dev_shm; /* pipe buffer is still too small for the data */
a9339c
+                }
a9339c
+
a9339c
+                n = write(pipefds[1], data, size);
a9339c
+                if (n < 0)
a9339c
+                        return -errno;
a9339c
+                if ((size_t) n != size)
a9339c
+                        return -EIO;
a9339c
+
a9339c
+                (void) fd_nonblock(pipefds[0], false); /* pipe was created O_NONBLOCK; presumably makes the read end blocking again */
a9339c
+
a9339c
+                r = pipefds[0];
a9339c
+                pipefds[0] = -1; /* only the read end is returned; the write end stays in pipefds[1] */
a9339c
+
a9339c
+                return r;
a9339c
+        }
a9339c
+
a9339c
+try_dev_shm:
a9339c
+        if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
a9339c
+                fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
a9339c
+                if (fd < 0)
a9339c
+                        goto try_dev_shm_without_o_tmpfile; /* O_TMPFILE open failed: fall back to a regular file */
a9339c
+
a9339c
+                n = write(fd, data, size);
a9339c
+                if (n < 0)
a9339c
+                        return -errno;
a9339c
+                if ((size_t) n != size)
a9339c
+                        return -EIO;
a9339c
+
a9339c
+                /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
a9339c
+                xsprintf(procfs_path, "/proc/self/fd/%i", fd);
a9339c
+                r = open(procfs_path, O_RDONLY|O_CLOEXEC);
a9339c
+                if (r < 0)
a9339c
+                        return -errno;
a9339c
+
a9339c
+                return r; /* the original O_RDWR fd stays in 'fd', which is declared _cleanup_close_ */
a9339c
+        }
a9339c
+
a9339c
+try_dev_shm_without_o_tmpfile:
a9339c
+        if ((flags & ACQUIRE_NO_REGULAR) == 0) {
a9339c
+                fd = mkostemp_safe(pattern, O_CLOEXEC);
a9339c
+                if (fd < 0)
a9339c
+                        return fd; /* propagate the negative return value of mkostemp_safe() */
a9339c
+
a9339c
+                n = write(fd, data, size);
a9339c
+                if (n < 0) {
a9339c
+                        r = -errno;
a9339c
+                        goto unlink_and_return;
a9339c
+                }
a9339c
+                if ((size_t) n != size) {
a9339c
+                        r = -EIO;
a9339c
+                        goto unlink_and_return;
a9339c
+                }
a9339c
+
a9339c
+                /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
a9339c
+                r = open(pattern, O_RDONLY|O_CLOEXEC);
a9339c
+                if (r < 0)
a9339c
+                        r = -errno;
a9339c
+
a9339c
+        unlink_and_return: /* also reached on success: the file is always unlinked, r is the new fd or an error */
a9339c
+                (void) unlink(pattern);
a9339c
+                return r;
a9339c
+        }
a9339c
+
a9339c
+        return -EOPNOTSUPP;
a9339c
+}
a9339c
diff --git a/src/shared/util.h b/src/shared/util.h
a9339c
index 526a6fe84..9c4be0256 100644
a9339c
--- a/src/shared/util.h
a9339c
+++ b/src/shared/util.h
a9339c
@@ -1112,3 +1112,13 @@ int parse_percent(const char *p);
a9339c
 
a9339c
 uint64_t system_tasks_max(void);
a9339c
 uint64_t system_tasks_max_scale(uint64_t v, uint64_t max);
a9339c
+
a9339c
+enum { /* flags for acquire_data_fd(); each one disables one way of obtaining the fd */
a9339c
+        ACQUIRE_NO_DEV_NULL = 1 << 0, /* don't return /dev/null for zero-sized data */
a9339c
+        ACQUIRE_NO_MEMFD    = 1 << 1, /* don't try a memfd */
a9339c
+        ACQUIRE_NO_PIPE     = 1 << 2, /* don't try a pipe */
a9339c
+        ACQUIRE_NO_TMPFILE  = 1 << 3, /* don't try an O_TMPFILE file in /dev/shm */
a9339c
+        ACQUIRE_NO_REGULAR  = 1 << 4, /* don't try a regular (immediately unlinked) file in /dev/shm */
a9339c
+};
a9339c
+
a9339c
+int acquire_data_fd(const void *data, size_t size, unsigned flags);
a9339c
diff --git a/src/test/test-util.c b/src/test/test-util.c
a9339c
index f2c52edce..efb02ff53 100644
a9339c
--- a/src/test/test-util.c
a9339c
+++ b/src/test/test-util.c
a9339c
@@ -1861,6 +1861,54 @@ static void test_system_tasks_max_scale(void) {
a9339c
         assert_se(system_tasks_max_scale(UINT64_MAX/4, UINT64_MAX) == UINT64_MAX);
a9339c
 }
a9339c
 
a9339c
+static void test_acquire_data_fd_one(unsigned flags) {
a9339c
+        char wbuffer[196*1024 - 7];
a9339c
+        char rbuffer[sizeof(wbuffer)];
a9339c
+        int fd;
a9339c
+        /* Round-trips three payloads through acquire_data_fd() with the given flags: "foo", empty, and random data. */
a9339c
+        fd = acquire_data_fd("foo", 3, flags); /* small payload */
a9339c
+        assert_se(fd >= 0);
a9339c
+
a9339c
+        zero(rbuffer);
a9339c
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 3);
a9339c
+        assert_se(streq(rbuffer, "foo"));
a9339c
+
a9339c
+        fd = safe_close(fd);
a9339c
+
a9339c
+        fd = acquire_data_fd("", 0, flags); /* empty payload: the first read() must return 0 */
a9339c
+        assert_se(fd >= 0);
a9339c
+
a9339c
+        zero(rbuffer);
a9339c
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 0);
a9339c
+        assert_se(streq(rbuffer, ""));
a9339c
+
a9339c
+        fd = safe_close(fd);
a9339c
+
a9339c
+        random_bytes(wbuffer, sizeof(wbuffer));
a9339c
+
a9339c
+        fd = acquire_data_fd(wbuffer, sizeof(wbuffer), flags); /* large payload, read back with a single read() */
a9339c
+        assert_se(fd >= 0);
a9339c
+
a9339c
+        zero(rbuffer);
a9339c
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == sizeof(rbuffer));
a9339c
+        assert_se(memcmp(rbuffer, wbuffer, sizeof(rbuffer)) == 0);
a9339c
+
a9339c
+        fd = safe_close(fd);
a9339c
+}
a9339c
+
a9339c
+static void test_acquire_data_fd(void) {
a9339c
+        /* Runs the round-trip test with various methods disabled via flags; ACQUIRE_NO_REGULAR is never set here. */
a9339c
+        test_acquire_data_fd_one(0);
a9339c
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL);
a9339c
+        test_acquire_data_fd_one(ACQUIRE_NO_MEMFD);
a9339c
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD);
a9339c
+        test_acquire_data_fd_one(ACQUIRE_NO_PIPE);
a9339c
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_PIPE);
a9339c
+        test_acquire_data_fd_one(ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE);
a9339c
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE);
a9339c
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE|ACQUIRE_NO_TMPFILE);
a9339c
+}
a9339c
+
a9339c
 int main(int argc, char *argv[]) {
a9339c
         log_parse_environment();
a9339c
         log_open();
a9339c
@@ -1943,6 +1991,7 @@ int main(int argc, char *argv[]) {
a9339c
         test_shell_maybe_quote();
a9339c
         test_system_tasks_max();
a9339c
         test_system_tasks_max_scale();
a9339c
+        test_acquire_data_fd();
a9339c
 
a9339c
         return 0;
a9339c
 }