aed857
From 581edd240f8dd68b1dbb4070353ddb2059eb8a67 Mon Sep 17 00:00:00 2001
aed857
From: Lennart Poettering <lennart@poettering.net>
aed857
Date: Fri, 27 Oct 2017 10:56:42 +0200
aed857
Subject: [PATCH] fd-util: add new acquire_data_fd() API helper
aed857
aed857
All this function does is place some data in an in-memory read-only fd,
aed857
that may be read to get the original data back.
aed857
aed857
Doing this in a way that works everywhere, given the different kernels
aed857
we support as well as different privilege levels is surprisingly
aed857
complex.
aed857
aed857
(cherry picked from commit a548e14d690133dd8cca2d5ab8082bb23259fd5f)
aed857
aed857
Related: #1446095
aed857
---
23b3cf
 src/shared/util.c    | 156 +++++++++++++++++++++++++++++++++++++++++++
23b3cf
 src/shared/util.h    |  10 +++
23b3cf
 src/test/test-util.c |  49 ++++++++++++++
aed857
 3 files changed, 215 insertions(+)
aed857
aed857
diff --git a/src/shared/util.c b/src/shared/util.c
c62b8e
index af09532733..982f5e044f 100644
aed857
--- a/src/shared/util.c
aed857
+++ b/src/shared/util.c
aed857
@@ -95,6 +95,7 @@
aed857
 #include "sparse-endian.h"
aed857
 #include "conf-parser.h"
aed857
 #include "cgroup-util.h"
aed857
+#include "memfd-util.h"
aed857
 
aed857
 int saved_argc = 0;
aed857
 char **saved_argv = NULL;
aed857
@@ -8893,3 +8894,158 @@ uint64_t system_tasks_max_scale(uint64_t v, uint64_t max) {
aed857
 
aed857
         return m / max;
aed857
 }
aed857
+
aed857
+int acquire_data_fd(const void *data, size_t size, unsigned flags) {
aed857
+
aed857
+        char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
aed857
+        _cleanup_close_pair_ int pipefds[2] = { -1, -1 };
aed857
+        char pattern[] = "/dev/shm/data-fd-XXXXXX";
aed857
+        _cleanup_close_ int fd = -1;
aed857
+        int isz = 0, r;
aed857
+        ssize_t n;
aed857
+        off_t f;
aed857
+
aed857
+        assert(data || size == 0);
aed857
+
aed857
+        /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
aed857
+         * complex than I wish it was. But here's why:
aed857
+         *
aed857
+         * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
aed857
+         *    read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
aed857
+         *
aed857
+         * b) Then, we try classic pipes. They are the second-best option, as we can close the writing side, retaining
aed857
+         *    a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
aed857
+         *    clients can only bump their size to a system-wide limit, which might be quite low.
aed857
+         *
aed857
+         * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
aed857
+         *    earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
aed857
+         *    /proc/self/fd/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
aed857
+         *
aed857
+         * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
aed857
+         *
aed857
+         * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
aed857
+         * figure. Returns the new fd (>= 0) on success, or a negative errno-style value on failure. */
aed857
+        /* Each ACQUIRE_NO_* bit set in flags disables the corresponding strategy (or the /dev/null shortcut). */
aed857
+        if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) {
aed857
+                /* As a special case, return /dev/null if we have been called for an empty data block */
aed857
+                r = open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY);
aed857
+                if (r < 0)
aed857
+                        return -errno;
aed857
+
aed857
+                return r;
aed857
+        }
aed857
+
aed857
+        if ((flags & ACQUIRE_NO_MEMFD) == 0) {
aed857
+                fd = memfd_new("data-fd");
aed857
+                if (fd < 0)
aed857
+                        goto try_pipe; /* memfd creation failed: fall back to a pipe */
aed857
+
aed857
+                n = write(fd, data, size);
aed857
+                if (n < 0)
aed857
+                        return -errno;
aed857
+                if ((size_t) n != size)
aed857
+                        return -EIO;
aed857
+
aed857
+                f = lseek(fd, 0, SEEK_SET); /* rewind so that the caller reads from the start */
aed857
+                if (f != 0)
aed857
+                        return -errno;
aed857
+
aed857
+                r = memfd_set_sealed(fd);
aed857
+                if (r < 0)
aed857
+                        return r;
aed857
+
aed857
+                r = fd; /* hand the fd to the caller; resetting to -1 presumably stops _cleanup_close_ closing it — confirm */
aed857
+                fd = -1;
aed857
+
aed857
+                return r;
aed857
+        }
aed857
+
aed857
+try_pipe:
aed857
+        if ((flags & ACQUIRE_NO_PIPE) == 0) {
aed857
+                if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
aed857
+                        return -errno;
aed857
+
aed857
+                isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
aed857
+                if (isz < 0)
aed857
+                        return -errno;
aed857
+
aed857
+                if ((size_t) isz < size) {
aed857
+                        isz = (int) size;
aed857
+                        if (isz < 0 || (size_t) isz != size) /* size does not fit into an int */
aed857
+                                return -E2BIG;
aed857
+
aed857
+                        /* Try to bump the pipe size */
aed857
+                        (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
aed857
+
aed857
+                        /* See if that worked */
aed857
+                        isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
aed857
+                        if (isz < 0)
aed857
+                                return -errno;
aed857
+
aed857
+                        if ((size_t) isz < size)
aed857
+                                goto try_dev_shm; /* pipe buffer is still too small for the data */
aed857
+                }
aed857
+
aed857
+                n = write(pipefds[1], data, size);
aed857
+                if (n < 0)
aed857
+                        return -errno;
aed857
+                if ((size_t) n != size)
aed857
+                        return -EIO;
aed857
+
aed857
+                (void) fd_nonblock(pipefds[0], false); /* presumably switches the read end to blocking mode; result ignored */
aed857
+
aed857
+                r = pipefds[0]; /* return only the read end; the write end stays in pipefds[1] */
aed857
+                pipefds[0] = -1;
aed857
+
aed857
+                return r;
aed857
+        }
aed857
+
aed857
+try_dev_shm:
aed857
+        if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
aed857
+                fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
aed857
+                if (fd < 0)
aed857
+                        goto try_dev_shm_without_o_tmpfile; /* O_TMPFILE open failed: fall back to a named file */
aed857
+
aed857
+                n = write(fd, data, size);
aed857
+                if (n < 0)
aed857
+                        return -errno;
aed857
+                if ((size_t) n != size)
aed857
+                        return -EIO;
aed857
+
aed857
+                /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
aed857
+                xsprintf(procfs_path, "/proc/self/fd/%i", fd);
aed857
+                r = open(procfs_path, O_RDONLY|O_CLOEXEC);
aed857
+                if (r < 0)
aed857
+                        return -errno;
aed857
+
aed857
+                return r; /* the O_RDWR fd stays in 'fd', presumably closed by _cleanup_close_ — confirm */
aed857
+        }
aed857
+
aed857
+try_dev_shm_without_o_tmpfile:
aed857
+        if ((flags & ACQUIRE_NO_REGULAR) == 0) {
aed857
+                fd = mkostemp_safe(pattern, O_CLOEXEC);
aed857
+                if (fd < 0)
aed857
+                        return fd;
aed857
+
aed857
+                n = write(fd, data, size);
aed857
+                if (n < 0) {
aed857
+                        r = -errno;
aed857
+                        goto unlink_and_return;
aed857
+                }
aed857
+                if ((size_t) n != size) {
aed857
+                        r = -EIO;
aed857
+                        goto unlink_and_return;
aed857
+                }
aed857
+
aed857
+                /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
aed857
+                r = open(pattern, O_RDONLY|O_CLOEXEC);
aed857
+                if (r < 0)
aed857
+                        r = -errno;
aed857
+
aed857
+        unlink_and_return:
aed857
+                (void) unlink(pattern); /* always drop the name from /dev/shm, on success and on error */
aed857
+                return r;
aed857
+        }
aed857
+
aed857
+        return -EOPNOTSUPP; /* regular-file fallback disabled and no earlier strategy produced an fd */
aed857
+}
aed857
diff --git a/src/shared/util.h b/src/shared/util.h
c62b8e
index 526a6fe848..9c4be02566 100644
aed857
--- a/src/shared/util.h
aed857
+++ b/src/shared/util.h
aed857
@@ -1112,3 +1112,13 @@ int parse_percent(const char *p);
aed857
 
aed857
 uint64_t system_tasks_max(void);
aed857
 uint64_t system_tasks_max_scale(uint64_t v, uint64_t max);
aed857
+
aed857
+enum { /* Flags for acquire_data_fd(): each bit disables one way of obtaining the fd */
aed857
+        ACQUIRE_NO_DEV_NULL = 1 << 0, /* do not return /dev/null when size == 0 */
aed857
+        ACQUIRE_NO_MEMFD    = 1 << 1, /* skip the memfd attempt */
aed857
+        ACQUIRE_NO_PIPE     = 1 << 2, /* skip the pipe attempt */
aed857
+        ACQUIRE_NO_TMPFILE  = 1 << 3, /* skip the O_TMPFILE attempt in /dev/shm */
aed857
+        ACQUIRE_NO_REGULAR  = 1 << 4, /* skip the regular (created, then unlinked) file in /dev/shm */
aed857
+};
aed857
+
aed857
+int acquire_data_fd(const void *data, size_t size, unsigned flags);
aed857
diff --git a/src/test/test-util.c b/src/test/test-util.c
c62b8e
index f2c52edcee..efb02ff530 100644
aed857
--- a/src/test/test-util.c
aed857
+++ b/src/test/test-util.c
aed857
@@ -1861,6 +1861,54 @@ static void test_system_tasks_max_scale(void) {
aed857
         assert_se(system_tasks_max_scale(UINT64_MAX/4, UINT64_MAX) == UINT64_MAX);
aed857
 }
aed857
 
aed857
+static void test_acquire_data_fd_one(unsigned flags) {
aed857
+        char wbuffer[196*1024 - 7]; /* large, odd size — presumably meant to exceed the default pipe capacity; TODO confirm */
aed857
+        char rbuffer[sizeof(wbuffer)];
aed857
+        int fd;
aed857
+
aed857
+        fd = acquire_data_fd("foo", 3, flags); /* case 1: short payload */
aed857
+        assert_se(fd >= 0);
aed857
+
aed857
+        zero(rbuffer); /* cleared first, so the 3 bytes read are followed by NUL for streq() */
aed857
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 3);
aed857
+        assert_se(streq(rbuffer, "foo"));
aed857
+
aed857
+        fd = safe_close(fd);
aed857
+
aed857
+        fd = acquire_data_fd("", 0, flags); /* case 2: empty payload, read must return 0 right away */
aed857
+        assert_se(fd >= 0);
aed857
+
aed857
+        zero(rbuffer);
aed857
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 0);
aed857
+        assert_se(streq(rbuffer, ""));
aed857
+
aed857
+        fd = safe_close(fd);
aed857
+
aed857
+        random_bytes(wbuffer, sizeof(wbuffer));
aed857
+
aed857
+        fd = acquire_data_fd(wbuffer, sizeof(wbuffer), flags); /* case 3: large random payload */
aed857
+        assert_se(fd >= 0);
aed857
+
aed857
+        zero(rbuffer);
aed857
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == sizeof(rbuffer)); /* NOTE(review): assumes one read() returns it all */
aed857
+        assert_se(memcmp(rbuffer, wbuffer, sizeof(rbuffer)) == 0);
aed857
+
aed857
+        fd = safe_close(fd);
aed857
+}
aed857
+
aed857
+static void test_acquire_data_fd(void) {
aed857
+        /* Reach each fallback by disabling the strategies tried before it; ACQUIRE_NO_REGULAR is never set here. */
aed857
+        test_acquire_data_fd_one(0);
aed857
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL);
aed857
+        test_acquire_data_fd_one(ACQUIRE_NO_MEMFD);
aed857
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD);
aed857
+        test_acquire_data_fd_one(ACQUIRE_NO_PIPE);
aed857
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_PIPE);
aed857
+        test_acquire_data_fd_one(ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE);
aed857
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE);
aed857
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE|ACQUIRE_NO_TMPFILE);
aed857
+}
aed857
+
aed857
 int main(int argc, char *argv[]) {
aed857
         log_parse_environment();
aed857
         log_open();
aed857
@@ -1943,6 +1991,7 @@ int main(int argc, char *argv[]) {
aed857
         test_shell_maybe_quote();
aed857
         test_system_tasks_max();
aed857
         test_system_tasks_max_scale();
aed857
+        test_acquire_data_fd();
aed857
 
aed857
         return 0;
aed857
 }