Blob Blame History Raw
From 581edd240f8dd68b1dbb4070353ddb2059eb8a67 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 27 Oct 2017 10:56:42 +0200
Subject: [PATCH] fd-util: add new acquire_data_fd() API helper

All this function does is place some data in an in-memory read-only fd,
that may be read back to get the original data back.

Doing this in a way that works everywhere, given the different kernels
we support as well as different privilege levels is surprisingly
complex.

(cherry picked from commit a548e14d690133dd8cca2d5ab8082bb23259fd5f)

Related: #1446095
---
 src/shared/util.c    | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/shared/util.h    |  10 ++++
 src/test/test-util.c |  49 ++++++++++++++++
 3 files changed, 215 insertions(+)

diff --git a/src/shared/util.c b/src/shared/util.c
index af0953273..982f5e044 100644
--- a/src/shared/util.c
+++ b/src/shared/util.c
@@ -95,6 +95,7 @@
 #include "sparse-endian.h"
 #include "conf-parser.h"
 #include "cgroup-util.h"
+#include "memfd-util.h"
 
 int saved_argc = 0;
 char **saved_argv = NULL;
@@ -8893,3 +8894,158 @@ uint64_t system_tasks_max_scale(uint64_t v, uint64_t max) {
 
         return m / max;
 }
+
+int acquire_data_fd(const void *data, size_t size, unsigned flags) {
+
+        char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
+        _cleanup_close_pair_ int pipefds[2] = { -1, -1 };
+        char pattern[] = "/dev/shm/data-fd-XXXXXX";
+        _cleanup_close_ int fd = -1;
+        int isz = 0, r;
+        ssize_t n;
+        off_t f;
+
+        assert(data || size == 0);
+
+        /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
+         * complex than I wish it was. But here's why:
+         *
+         * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
+         *    read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
+         *
+         * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining
+         *    a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
+         *    clients can only bump their size to a system-wide limit, which might be quite low.
+         *
+         * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
+         *    earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
+         *    /proc/self/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
+         *
+         * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
+         *
+         * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
+         * figure. */
+
+        if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) {
+                /* As a special case, return /dev/null if we have been called for an empty data block */
+                r = open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY);
+                if (r < 0)
+                        return -errno;
+
+                return r;
+        }
+
+        if ((flags & ACQUIRE_NO_MEMFD) == 0) {
+                fd = memfd_new("data-fd");
+                if (fd < 0)
+                        goto try_pipe;
+
+                n = write(fd, data, size);
+                if (n < 0)
+                        return -errno;
+                if ((size_t) n != size)
+                        return -EIO;
+
+                f = lseek(fd, 0, SEEK_SET);
+                if (f != 0)
+                        return -errno;
+
+                r = memfd_set_sealed(fd);
+                if (r < 0)
+                        return r;
+
+                r = fd;
+                fd = -1;
+
+                return r;
+        }
+
+try_pipe:
+        if ((flags & ACQUIRE_NO_PIPE) == 0) {
+                if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
+                        return -errno;
+
+                isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
+                if (isz < 0)
+                        return -errno;
+
+                if ((size_t) isz < size) {
+                        isz = (int) size;
+                        if (isz < 0 || (size_t) isz != size)
+                                return -E2BIG;
+
+                        /* Try to bump the pipe size */
+                        (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
+
+                        /* See if that worked */
+                        isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
+                        if (isz < 0)
+                                return -errno;
+
+                        if ((size_t) isz < size)
+                                goto try_dev_shm;
+                }
+
+                n = write(pipefds[1], data, size);
+                if (n < 0)
+                        return -errno;
+                if ((size_t) n != size)
+                        return -EIO;
+
+                (void) fd_nonblock(pipefds[0], false);
+
+                r = pipefds[0];
+                pipefds[0] = -1;
+
+                return r;
+        }
+
+try_dev_shm:
+        if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
+                fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
+                if (fd < 0)
+                        goto try_dev_shm_without_o_tmpfile;
+
+                n = write(fd, data, size);
+                if (n < 0)
+                        return -errno;
+                if ((size_t) n != size)
+                        return -EIO;
+
+                /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
+                xsprintf(procfs_path, "/proc/self/fd/%i", fd);
+                r = open(procfs_path, O_RDONLY|O_CLOEXEC);
+                if (r < 0)
+                        return -errno;
+
+                return r;
+        }
+
+try_dev_shm_without_o_tmpfile:
+        if ((flags & ACQUIRE_NO_REGULAR) == 0) {
+                fd = mkostemp_safe(pattern, O_CLOEXEC);
+                if (fd < 0)
+                        return fd;
+
+                n = write(fd, data, size);
+                if (n < 0) {
+                        r = -errno;
+                        goto unlink_and_return;
+                }
+                if ((size_t) n != size) {
+                        r = -EIO;
+                        goto unlink_and_return;
+                }
+
+                /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
+                r = open(pattern, O_RDONLY|O_CLOEXEC);
+                if (r < 0)
+                        r = -errno;
+
+        unlink_and_return:
+                (void) unlink(pattern);
+                return r;
+        }
+
+        return -EOPNOTSUPP;
+}
diff --git a/src/shared/util.h b/src/shared/util.h
index 526a6fe84..9c4be0256 100644
--- a/src/shared/util.h
+++ b/src/shared/util.h
@@ -1112,3 +1112,13 @@ int parse_percent(const char *p);
 
 uint64_t system_tasks_max(void);
 uint64_t system_tasks_max_scale(uint64_t v, uint64_t max);
+
+enum {
+        ACQUIRE_NO_DEV_NULL = 1 << 0,
+        ACQUIRE_NO_MEMFD    = 1 << 1,
+        ACQUIRE_NO_PIPE     = 1 << 2,
+        ACQUIRE_NO_TMPFILE  = 1 << 3,
+        ACQUIRE_NO_REGULAR  = 1 << 4,
+};
+
+int acquire_data_fd(const void *data, size_t size, unsigned flags);
diff --git a/src/test/test-util.c b/src/test/test-util.c
index f2c52edce..efb02ff53 100644
--- a/src/test/test-util.c
+++ b/src/test/test-util.c
@@ -1861,6 +1861,54 @@ static void test_system_tasks_max_scale(void) {
         assert_se(system_tasks_max_scale(UINT64_MAX/4, UINT64_MAX) == UINT64_MAX);
 }
 
+static void test_acquire_data_fd_one(unsigned flags) {
+        char wbuffer[196*1024 - 7];
+        char rbuffer[sizeof(wbuffer)];
+        int fd;
+
+        fd = acquire_data_fd("foo", 3, flags);
+        assert_se(fd >= 0);
+
+        zero(rbuffer);
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 3);
+        assert_se(streq(rbuffer, "foo"));
+
+        fd = safe_close(fd);
+
+        fd = acquire_data_fd("", 0, flags);
+        assert_se(fd >= 0);
+
+        zero(rbuffer);
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == 0);
+        assert_se(streq(rbuffer, ""));
+
+        fd = safe_close(fd);
+
+        random_bytes(wbuffer, sizeof(wbuffer));
+
+        fd = acquire_data_fd(wbuffer, sizeof(wbuffer), flags);
+        assert_se(fd >= 0);
+
+        zero(rbuffer);
+        assert_se(read(fd, rbuffer, sizeof(rbuffer)) == sizeof(rbuffer));
+        assert_se(memcmp(rbuffer, wbuffer, sizeof(rbuffer)) == 0);
+
+        fd = safe_close(fd);
+}
+
+static void test_acquire_data_fd(void) {
+
+        test_acquire_data_fd_one(0);
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL);
+        test_acquire_data_fd_one(ACQUIRE_NO_MEMFD);
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD);
+        test_acquire_data_fd_one(ACQUIRE_NO_PIPE);
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_PIPE);
+        test_acquire_data_fd_one(ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE);
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE);
+        test_acquire_data_fd_one(ACQUIRE_NO_DEV_NULL|ACQUIRE_NO_MEMFD|ACQUIRE_NO_PIPE|ACQUIRE_NO_TMPFILE);
+}
+
 int main(int argc, char *argv[]) {
         log_parse_environment();
         log_open();
@@ -1943,6 +1991,7 @@ int main(int argc, char *argv[]) {
         test_shell_maybe_quote();
         test_system_tasks_max();
         test_system_tasks_max_scale();
+        test_acquire_data_fd();
 
         return 0;
 }