ecbff1
From e5ac7ba7a16445f3ad23d9931979c20214eae913 Mon Sep 17 00:00:00 2001
ecbff1
From: Jan Synacek <jsynacek@redhat.com>
ecbff1
Date: Thu, 14 Sep 2017 16:27:08 +0200
ecbff1
Subject: [PATCH] path-util: make use of "mnt_id" field exported in
ecbff1
 /proc/self/fdinfo/<fd>
ecbff1
ecbff1
This commit is not a backport of a specific commit. It includes parts of
ecbff1
several upstream commits (3f72b427b44f39a1aec6806dad6f6b57103ae9ed,
ecbff1
5d409034017e9f9f8c4392157d95511fc2e05d87 and others).
ecbff1
ecbff1
The main goal was to bring path_is_mount_point() up to date, which meant
ecbff1
introducing fd_fdinfo_mnt_id() and fd_is_mount_point(). These were
ecbff1
needed mainly because we need to determine mount points based on
ecbff1
/proc/self/fdinfo/<fd> in containers. Also, there are more places in the
ecbff1
code where checks for mount points are performed, which would benefit from
ecbff1
this fix as well. Additionally, corresponding tests have been added.
ecbff1
ecbff1
Resolves: #1472439
ecbff1
---
ecbff1
 src/nspawn/nspawn.c       |   2 +-
ecbff1
 src/shared/path-util.c    | 219 ++++++++++++++++++++++++++++++++++++----------
ecbff1
 src/shared/path-util.h    |   1 +
ecbff1
 src/test/test-path-util.c |  62 +++++++++++++
ecbff1
 4 files changed, 235 insertions(+), 49 deletions(-)
ecbff1
ecbff1
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
ecbff1
index ea365b3f9..ee2e1832f 100644
ecbff1
--- a/src/nspawn/nspawn.c
ecbff1
+++ b/src/nspawn/nspawn.c
ecbff1
@@ -990,7 +990,7 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons
ecbff1
         to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
ecbff1
 
ecbff1
         r = path_is_mount_point(to, false);
ecbff1
-        if (r < 0)
ecbff1
+        if (r < 0 && r != -ENOENT)
ecbff1
                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
ecbff1
         if (r > 0)
ecbff1
                 return 0;
ecbff1
diff --git a/src/shared/path-util.c b/src/shared/path-util.c
ecbff1
index 1181ffb9d..5d4de9ec4 100644
ecbff1
--- a/src/shared/path-util.c
ecbff1
+++ b/src/shared/path-util.c
ecbff1
@@ -36,6 +36,7 @@
ecbff1
 #include "strv.h"
ecbff1
 #include "path-util.h"
ecbff1
 #include "missing.h"
ecbff1
+#include "fileio.h"
ecbff1
 
ecbff1
 bool path_is_absolute(const char *p) {
ecbff1
         return p[0] == '/';
ecbff1
@@ -473,87 +474,209 @@ char* path_join(const char *root, const char *path, const char *rest) {
ecbff1
                                NULL);
ecbff1
 }
ecbff1
 
ecbff1
-int path_is_mount_point(const char *t, bool allow_symlink) {
ecbff1
+static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *mnt_id) {
ecbff1
+        char path[strlen("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
ecbff1
+        _cleanup_free_ char *fdinfo = NULL;
ecbff1
+        _cleanup_close_ int subfd = -1;
ecbff1
+        char *p;
ecbff1
+        int r;
ecbff1
+
ecbff1
+        if ((flags & AT_EMPTY_PATH) && isempty(filename))
ecbff1
+                xsprintf(path, "/proc/self/fdinfo/%i", fd);
ecbff1
+        else {
ecbff1
+                subfd = openat(fd, filename, O_CLOEXEC|O_PATH);
ecbff1
+                if (subfd < 0)
ecbff1
+                        return -errno;
ecbff1
 
ecbff1
-        union file_handle_union h = FILE_HANDLE_INIT;
ecbff1
+                xsprintf(path, "/proc/self/fdinfo/%i", subfd);
ecbff1
+        }
ecbff1
+
ecbff1
+        r = read_full_file(path, &fdinfo, NULL);
ecbff1
+        if (r == -ENOENT) /* The fdinfo directory is a relatively new addition */
ecbff1
+                return -EOPNOTSUPP;
ecbff1
+        if (r < 0) /* propagate the helper's own error code; errno may be stale or 0 here */
ecbff1
+                return r;
ecbff1
+
ecbff1
+        p = startswith(fdinfo, "mnt_id:");
ecbff1
+        if (!p) {
ecbff1
+                p = strstr(fdinfo, "\nmnt_id:");
ecbff1
+                if (!p) /* The mnt_id field is a relatively new addition */
ecbff1
+                        return -EOPNOTSUPP;
ecbff1
+
ecbff1
+                p += 8; /* skip over the matched "\nmnt_id:" prefix */
ecbff1
+        }
ecbff1
+
ecbff1
+        p += strspn(p, WHITESPACE);
ecbff1
+        p[strcspn(p, WHITESPACE)] = 0;
ecbff1
+
ecbff1
+        return safe_atoi(p, mnt_id);
ecbff1
+}
ecbff1
+
ecbff1
+int fd_is_mount_point(int fd, const char *filename, int flags) {
ecbff1
+        union file_handle_union h = FILE_HANDLE_INIT, h_parent = FILE_HANDLE_INIT;
ecbff1
         int mount_id = -1, mount_id_parent = -1;
ecbff1
-        _cleanup_free_ char *parent = NULL;
ecbff1
+        bool nosupp = false, check_st_dev = true;
ecbff1
         struct stat a, b;
ecbff1
         int r;
ecbff1
-        bool nosupp = false;
ecbff1
 
ecbff1
-        /* We are not actually interested in the file handles, but
ecbff1
-         * name_to_handle_at() also passes us the mount ID, hence use
ecbff1
-         * it but throw the handle away */
ecbff1
+        assert(fd >= 0);
ecbff1
+        assert(filename);
ecbff1
 
ecbff1
-        if (path_equal(t, "/"))
ecbff1
-                return 1;
ecbff1
-
ecbff1
-        r = name_to_handle_at(AT_FDCWD, t, &h.handle, &mount_id, allow_symlink ? AT_SYMLINK_FOLLOW : 0);
ecbff1
+        /* First we will try the name_to_handle_at() syscall, which
ecbff1
+         * tells us the mount id and an opaque file "handle". It is
ecbff1
+         * not supported everywhere though (kernel compile-time
ecbff1
+         * option, not all file systems are hooked up). If it works
ecbff1
+         * the mount id is usually good enough to tell us whether
ecbff1
+         * something is a mount point.
ecbff1
+         *
ecbff1
+         * If that didn't work we will try to read the mount id from
ecbff1
+         * /proc/self/fdinfo/<fd>. This is almost as good as
ecbff1
+         * name_to_handle_at(), however, does not return the
ecbff1
+         * opaque file handle. The opaque file handle is pretty useful
ecbff1
+         * to detect the root directory, which we should always
ecbff1
+         * consider a mount point. Hence we use this only as
ecbff1
+         * fallback. Exporting the mnt_id in fdinfo is a pretty recent
ecbff1
+         * kernel addition.
ecbff1
+         *
ecbff1
+         * As last fallback we do traditional fstat() based st_dev
ecbff1
+         * comparisons. This is how things were traditionally done,
ecbff1
+         * but unionfs breaks this since it exposes file
ecbff1
+         * systems with a variety of st_dev reported. Also, btrfs
ecbff1
+         * subvolumes have different st_dev, even though they aren't
ecbff1
+         * real mounts of their own. */
ecbff1
+
ecbff1
+        r = name_to_handle_at(fd, filename, &h.handle, &mount_id, flags);
ecbff1
         if (r < 0) {
ecbff1
-                if (errno == ENOSYS)
ecbff1
-                        /* This kernel does not support name_to_handle_at()
ecbff1
-                         * fall back to the traditional stat() logic. */
ecbff1
-                        goto fallback;
ecbff1
+                if (IN_SET(errno, ENOSYS, EACCES, EPERM))
ecbff1
+                        /* This kernel does not support name_to_handle_at() at all, or the syscall was blocked (maybe
ecbff1
+                         * through seccomp, because we are running inside of a container?): fall back to simpler
ecbff1
+                         * logic. */
ecbff1
+                        goto fallback_fdinfo;
ecbff1
                 else if (errno == EOPNOTSUPP)
ecbff1
                         /* This kernel or file system does not support
ecbff1
-                         * name_to_handle_at(), hence fallback to the
ecbff1
+                         * name_to_handle_at(), hence let's see if the
ecbff1
+                         * upper fs supports it (in which case it is a
ecbff1
+                         * mount point), otherwise fallback to the
ecbff1
                          * traditional stat() logic */
ecbff1
                         nosupp = true;
ecbff1
-                else if (errno == ENOENT)
ecbff1
-                        return 0;
ecbff1
                 else
ecbff1
                         return -errno;
ecbff1
         }
ecbff1
 
ecbff1
-        r = path_get_parent(t, &parent);
ecbff1
-        if (r < 0)
ecbff1
-                return r;
ecbff1
-
ecbff1
-        h.handle.handle_bytes = MAX_HANDLE_SZ;
ecbff1
-        r = name_to_handle_at(AT_FDCWD, parent, &h.handle, &mount_id_parent, AT_SYMLINK_FOLLOW);
ecbff1
-        if (r < 0)
ecbff1
-                if (errno == EOPNOTSUPP)
ecbff1
+        r = name_to_handle_at(fd, "", &h_parent.handle, &mount_id_parent, AT_EMPTY_PATH);
ecbff1
+        if (r < 0) {
ecbff1
+                if (errno == EOPNOTSUPP) {
ecbff1
                         if (nosupp)
ecbff1
                                 /* Neither parent nor child do name_to_handle_at()?
ecbff1
                                    We have no choice but to fall back. */
ecbff1
-                                goto fallback;
ecbff1
+                                goto fallback_fdinfo;
ecbff1
                         else
ecbff1
-                                /* The parent can't do name_to_handle_at() but
ecbff1
-                                 * the directory we are interested in can?
ecbff1
-                                 * Or the other way around?
ecbff1
+                                /* The parent can't do name_to_handle_at() but the
ecbff1
+                                 * directory we are interested in can?
ecbff1
                                  * If so, it must be a mount point. */
ecbff1
                                 return 1;
ecbff1
-                else
ecbff1
+                } else
ecbff1
                         return -errno;
ecbff1
-        else
ecbff1
-                return mount_id != mount_id_parent;
ecbff1
+        }
ecbff1
 
ecbff1
-fallback:
ecbff1
-        if (allow_symlink)
ecbff1
-                r = stat(t, &a);
ecbff1
-        else
ecbff1
-                r = lstat(t, &a);
ecbff1
+        /* The parent can do name_to_handle_at() but the
ecbff1
+         * directory we are interested in can't? If so, it
ecbff1
+         * must be a mount point. */
ecbff1
+        if (nosupp)
ecbff1
+                return 1;
ecbff1
 
ecbff1
-        if (r < 0) {
ecbff1
-                if (errno == ENOENT)
ecbff1
-                        return 0;
ecbff1
+        /* If the file handle for the directory we are
ecbff1
+         * interested in and its parent are identical, we
ecbff1
+         * assume this is the root directory, which is a mount
ecbff1
+         * point. */
ecbff1
 
ecbff1
-                return -errno;
ecbff1
-        }
ecbff1
+        if (h.handle.handle_bytes == h_parent.handle.handle_bytes &&
ecbff1
+            h.handle.handle_type == h_parent.handle.handle_type &&
ecbff1
+            memcmp(h.handle.f_handle, h_parent.handle.f_handle, h.handle.handle_bytes) == 0)
ecbff1
+                return 1;
ecbff1
 
ecbff1
-        free(parent);
ecbff1
-        parent = NULL;
ecbff1
+        return mount_id != mount_id_parent;
ecbff1
 
ecbff1
-        r = path_get_parent(t, &parent);
ecbff1
+fallback_fdinfo:
ecbff1
+        r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id);
ecbff1
+        if (IN_SET(r, -EOPNOTSUPP, -EACCES, -EPERM))
ecbff1
+                goto fallback_fstat;
ecbff1
         if (r < 0)
ecbff1
                 return r;
ecbff1
 
ecbff1
-        r = stat(parent, &b);
ecbff1
+        r = fd_fdinfo_mnt_id(fd, "", AT_EMPTY_PATH, &mount_id_parent);
ecbff1
         if (r < 0)
ecbff1
+                return r;
ecbff1
+
ecbff1
+        if (mount_id != mount_id_parent)
ecbff1
+                return 1;
ecbff1
+
ecbff1
+        /* Hmm, so, the mount ids are the same. This leaves one
ecbff1
+         * special case though for the root file system. For that,
ecbff1
+         * let's see if the parent directory has the same inode as we
ecbff1
+         * are interested in. Hence, let's also do fstat() checks now,
ecbff1
+         * too, but avoid the st_dev comparisons, since they aren't
ecbff1
+         * that useful on unionfs mounts. */
ecbff1
+        check_st_dev = false;
ecbff1
+
ecbff1
+fallback_fstat:
ecbff1
+        /* yay for fstatat() taking a different set of flags than the other
ecbff1
+         * _at() above */
ecbff1
+        if (flags & AT_SYMLINK_FOLLOW)
ecbff1
+                flags &= ~AT_SYMLINK_FOLLOW;
ecbff1
+        else
ecbff1
+                flags |= AT_SYMLINK_NOFOLLOW;
ecbff1
+        if (fstatat(fd, filename, &a, flags) < 0)
ecbff1
+                return -errno;
ecbff1
+
ecbff1
+        if (fstatat(fd, "", &b, AT_EMPTY_PATH) < 0)
ecbff1
+                return -errno;
ecbff1
+
ecbff1
+        /* A directory with same device and inode as its parent? Must
ecbff1
+         * be the root directory */
ecbff1
+        if (a.st_dev == b.st_dev &&
ecbff1
+            a.st_ino == b.st_ino)
ecbff1
+                return 1;
ecbff1
+
ecbff1
+        return check_st_dev && (a.st_dev != b.st_dev);
ecbff1
+}
ecbff1
+
ecbff1
+
ecbff1
+
ecbff1
+int path_is_mount_point(const char *t, bool allow_symlink) {
ecbff1
+        _cleanup_free_ char *canonical = NULL, *parent = NULL;
ecbff1
+        _cleanup_close_ int fd = -1;
ecbff1
+        int flags = allow_symlink ? AT_SYMLINK_FOLLOW : 0;
ecbff1
+
ecbff1
+        assert(t);
ecbff1
+
ecbff1
+        if (path_equal(t, "/"))
ecbff1
+                return 1;
ecbff1
+
ecbff1
+        /* we need to resolve symlinks manually, we can't just rely on
ecbff1
+         * fd_is_mount_point() to do that for us; if we have a structure like
ecbff1
+         * /bin -> /usr/bin/ and /usr is a mount point, then the parent that we
ecbff1
+         * look at needs to be /usr, not /. */
ecbff1
+        if (flags & AT_SYMLINK_FOLLOW) {
ecbff1
+                canonical = canonicalize_file_name(t);
ecbff1
+                if (!canonical) {
ecbff1
+                        if (errno == ENOENT)
ecbff1
+                                return 0;
ecbff1
+                        else
ecbff1
+                                return -errno;
ecbff1
+                }
ecbff1
+                t = canonical;
ecbff1
+        }
ecbff1
+
ecbff1
+        parent = dirname_malloc(t);
ecbff1
+        if (!parent)
ecbff1
+                return -ENOMEM;
ecbff1
+
ecbff1
+        fd = openat(AT_FDCWD, parent, O_DIRECTORY|O_CLOEXEC|O_PATH);
ecbff1
+        if (fd < 0)
ecbff1
                 return -errno;
ecbff1
 
ecbff1
-        return a.st_dev != b.st_dev;
ecbff1
+        return fd_is_mount_point(fd, basename(t), flags);
ecbff1
 }
ecbff1
 
ecbff1
 int path_is_read_only_fs(const char *path) {
ecbff1
diff --git a/src/shared/path-util.h b/src/shared/path-util.h
ecbff1
index 71bb740e9..34c016229 100644
ecbff1
--- a/src/shared/path-util.h
ecbff1
+++ b/src/shared/path-util.h
ecbff1
@@ -53,6 +53,7 @@ char** path_strv_make_absolute_cwd(char **l);
ecbff1
 char** path_strv_resolve(char **l, const char *prefix);
ecbff1
 char** path_strv_resolve_uniq(char **l, const char *prefix);
ecbff1
 
ecbff1
+int fd_is_mount_point(int fd, const char *filename, int flags);
ecbff1
 int path_is_mount_point(const char *path, bool allow_symlink);
ecbff1
 int path_is_read_only_fs(const char *path);
ecbff1
 int path_is_os_tree(const char *path);
ecbff1
diff --git a/src/test/test-path-util.c b/src/test/test-path-util.c
ecbff1
index 6396fcb39..a4fec07e7 100644
ecbff1
--- a/src/test/test-path-util.c
ecbff1
+++ b/src/test/test-path-util.c
ecbff1
@@ -21,6 +21,7 @@
ecbff1
 
ecbff1
 #include <stdio.h>
ecbff1
 #include <unistd.h>
ecbff1
+#include <sys/mount.h>
ecbff1
 
ecbff1
 #include "path-util.h"
ecbff1
 #include "util.h"
ecbff1
@@ -99,6 +100,66 @@ static void test_path(void) {
ecbff1
         }
ecbff1
 }
ecbff1
 
ecbff1
+static void test_path_is_mount_point(void) {
ecbff1
+        int fd, rt, rf, rlt, rlf;
ecbff1
+        char tmp_dir[] = "/tmp/test-path-is-mount-point-XXXXXX";
ecbff1
+        _cleanup_free_ char *file1 = NULL, *file2 = NULL, *link1 = NULL, *link2 = NULL;
ecbff1
+
ecbff1
+        assert_se(path_is_mount_point("/", true) > 0);
ecbff1
+        assert_se(path_is_mount_point("/", false) > 0);
ecbff1
+
ecbff1
+        assert_se(path_is_mount_point("/proc", true) > 0);
ecbff1
+        assert_se(path_is_mount_point("/proc", false) > 0);
ecbff1
+
ecbff1
+        assert_se(path_is_mount_point("/proc/1", true) == 0);
ecbff1
+        assert_se(path_is_mount_point("/proc/1", false) == 0);
ecbff1
+
ecbff1
+        assert_se(path_is_mount_point("/sys", true) > 0);
ecbff1
+        assert_se(path_is_mount_point("/sys", false) > 0);
ecbff1
+
ecbff1
+        /* file mountpoints */
ecbff1
+        assert_se(mkdtemp(tmp_dir) != NULL);
ecbff1
+        file1 = path_join(NULL, tmp_dir, "file1");
ecbff1
+        assert_se(file1);
ecbff1
+        file2 = path_join(NULL, tmp_dir, "file2");
ecbff1
+        assert_se(file2);
ecbff1
+        fd = open(file1, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0664);
ecbff1
+        assert_se(fd >= 0); /* open() reports failure as -1; 0 is a valid descriptor */
ecbff1
+        close(fd);
ecbff1
+        fd = open(file2, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0664);
ecbff1
+        assert_se(fd >= 0);
ecbff1
+        close(fd);
ecbff1
+        link1 = path_join(NULL, tmp_dir, "link1");
ecbff1
+        assert_se(link1);
ecbff1
+        assert_se(symlink("file1", link1) == 0);
ecbff1
+        link2 = path_join(NULL, tmp_dir, "link2");
ecbff1
+        assert_se(link2); /* check the string just allocated, not link1 again */
ecbff1
+        assert_se(symlink("file2", link2) == 0);
ecbff1
+
ecbff1
+        assert_se(path_is_mount_point(file1, true) == 0);
ecbff1
+        assert_se(path_is_mount_point(file1, false) == 0);
ecbff1
+        assert_se(path_is_mount_point(link1, true) == 0);
ecbff1
+        assert_se(path_is_mount_point(link1, false) == 0);
ecbff1
+
ecbff1
+        /* this test will only work as root */
ecbff1
+        if (mount(file1, file2, NULL, MS_BIND, NULL) >= 0) {
ecbff1
+                rf = path_is_mount_point(file2, false);
ecbff1
+                rt = path_is_mount_point(file2, true);
ecbff1
+                rlf = path_is_mount_point(link2, false);
ecbff1
+                rlt = path_is_mount_point(link2, true);
ecbff1
+
ecbff1
+                assert_se(umount(file2) == 0); /* unmount before asserting on the saved results */
ecbff1
+
ecbff1
+                assert_se(rf == 1);
ecbff1
+                assert_se(rt == 1);
ecbff1
+                assert_se(rlf == 0);
ecbff1
+                assert_se(rlt == 1);
ecbff1
+        } else
ecbff1
+                printf("Skipping bind mount file test: %m\n");
ecbff1
+
ecbff1
+        assert_se(rm_rf(tmp_dir, false, true, false) == 0);
ecbff1
+}
ecbff1
+
ecbff1
 static void test_find_binary(const char *self, bool local) {
ecbff1
         char *p;
ecbff1
 
ecbff1
@@ -288,6 +349,7 @@ int main(int argc, char **argv) {
ecbff1
         test_make_relative();
ecbff1
         test_strv_resolve();
ecbff1
         test_path_startswith();
ecbff1
+        test_path_is_mount_point();
ecbff1
 
ecbff1
         return 0;
ecbff1
 }