|
|
b312fc |
From 743848de574b660972f457c28c02cbb19c8aa439 Mon Sep 17 00:00:00 2001
|
|
|
b312fc |
From: "T.kabe" <kabe@>
|
|
|
b312fc |
Date: Fri, 3 Mar 2017 17:06:44 +0900
|
|
|
b312fc |
Subject: [PATCH 4/4] vfs: Lazily remove mounts on unlinked files and directories.
|
|
|
b312fc |
|
|
|
b312fc |
[upstream commit 8ed936b5671bfb33d89bc60bdcc7cf0470ba52fe]
|
|
|
b312fc |
[upstream commit 7af1364ffa64db61e386628594836e13d2ef04b5]
|
|
|
b312fc |
|
|
|
b312fc |
commit 8ed936b5671bfb33d89bc60bdcc7cf0470ba52fe
|
|
|
b312fc |
Author: Eric W. Biederman <ebiederman@twitter.com>
|
|
|
b312fc |
Date: Tue Oct 1 18:33:48 2013 -0700
|
|
|
b312fc |
|
|
|
b312fc |
vfs: Lazily remove mounts on unlinked files and directories.
|
|
|
b312fc |
|
|
|
b312fc |
With the introduction of mount namespaces and bind mounts it became
|
|
|
b312fc |
possible to access files and directories that on some paths are mount
|
|
|
b312fc |
points but are not mount points on other paths. It is very confusing
|
|
|
b312fc |
when rm -rf somedir returns -EBUSY simply because somedir is mounted
|
|
|
b312fc |
somewhere else. With the addition of user namespaces allowing
|
|
|
b312fc |
unprivileged mounts this condition has gone from annoying to allowing
|
|
|
b312fc |
a DOS attack on other users in the system.
|
|
|
b312fc |
|
|
|
b312fc |
The possibility for mischief is removed by updating the vfs to support
|
|
|
b312fc |
rename, unlink and rmdir on a dentry that is a mountpoint and by
|
|
|
b312fc |
lazily unmounting mountpoints on deleted dentries.
|
|
|
b312fc |
|
|
|
b312fc |
In particular this change allows rename, unlink and rmdir system calls
|
|
|
b312fc |
on a dentry without a mountpoint in the current mount namespace to
|
|
|
b312fc |
succeed, and it allows rename, unlink, and rmdir performed on a
|
|
|
b312fc |
distributed filesystem to update the vfs cache even when there is a
|
|
|
b312fc |
mount in some namespace on the original dentry.
|
|
|
b312fc |
|
|
|
b312fc |
There are two common patterns of maintaining mounts: Mounts on trusted
|
|
|
b312fc |
paths with the parent directory of the mount point and all ancestor
|
|
|
b312fc |
directories up to / owned by root and modifiable only by root
|
|
|
b312fc |
(i.e. /media/xxx, /dev, /dev/pts, /proc, /sys, /sys/fs/cgroup/{cpu,
|
|
|
b312fc |
cpuacct, ...}, /usr, /usr/local). Mounts on unprivileged directories
|
|
|
b312fc |
maintained by fusermount.
|
|
|
b312fc |
|
|
|
b312fc |
In the case of mounts in trusted directories owned by root and
|
|
|
b312fc |
modifiable only by root the current parent directory permissions are
|
|
|
b312fc |
sufficient to ensure a mount point on a trusted path is not removed
|
|
|
b312fc |
or renamed by anyone other than root, even if there is a context
|
|
|
b312fc |
where there are no mount points to prevent this.
|
|
|
b312fc |
|
|
|
b312fc |
In the case of mounts in directories owned by less privileged users
|
|
|
b312fc |
races with users modifying the path of a mount point are already a
|
|
|
b312fc |
danger. fusermount already uses a combination of chdir,
|
|
|
b312fc |
/proc/<pid>/fd/NNN, and UMOUNT_NOFOLLOW to prevent these races. The
|
|
|
b312fc |
removal of global rename, unlink, and rmdir protection really adds
|
|
|
b312fc |
nothing new to consider only a widening of the attack window, and
|
|
|
b312fc |
fusermount is already safe against unprivileged users modifying the
|
|
|
b312fc |
directory simultaneously.
|
|
|
b312fc |
|
|
|
b312fc |
In principle for perfect userspace programs returning -EBUSY for
|
|
|
b312fc |
unlink, rmdir, and rename of dentries that have mounts in the local
|
|
|
b312fc |
namespace is actually unnecessary. Unfortunately not all userspace
|
|
|
b312fc |
programs are perfect so retaining -EBUSY for unlink, rmdir and rename
|
|
|
b312fc |
of dentries that have mounts in the current mount namespace plays an
|
|
|
b312fc |
important role of maintaining consistency with historical behavior and
|
|
|
b312fc |
making imperfect userspace applications hard to exploit.
|
|
|
b312fc |
|
|
|
b312fc |
v2: Remove spurious old_dentry.
|
|
|
b312fc |
v3: Optimized shrink_submounts_and_drop
|
|
|
b312fc |
Removed unused afs label
|
|
|
b312fc |
v4: Simplified the changes to check_submounts_and_drop
|
|
|
b312fc |
Do not rename check_submounts_and_drop shrink_submounts_and_drop
|
|
|
b312fc |
Document why we need atomicity in check_submounts_and_drop
|
|
|
b312fc |
Rely on the parent inode mutex to make d_revalidate and d_invalidate
|
|
|
b312fc |
an atomic unit.
|
|
|
b312fc |
v5: Refcount the mountpoint to detach in case of simultaneous
|
|
|
b312fc |
renames.
|
|
|
b312fc |
|
|
|
b312fc |
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
|
|
|
b312fc |
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
|
|
|
b312fc |
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
|
b312fc |
|
|
|
b312fc |
commit 7af1364ffa64db61e386628594836e13d2ef04b5
|
|
|
b312fc |
Author: Eric W. Biederman <ebiederm@xmission.com>
|
|
|
b312fc |
Date: Fri Oct 4 19:15:13 2013 -0700
|
|
|
b312fc |
|
|
|
b312fc |
vfs: Don't allow overwriting mounts in the current mount namespace
|
|
|
b312fc |
|
|
|
b312fc |
In preparation for allowing mountpoints to be renamed and unlinked
|
|
|
b312fc |
in remote filesystems and in other mount namespaces test if on a dentry
|
|
|
b312fc |
there is a mount in the local mount namespace before allowing it to
|
|
|
b312fc |
be renamed or unlinked.
|
|
|
b312fc |
|
|
|
b312fc |
The primary motivation here are old versions of fusermount unmount
|
|
|
b312fc |
which is not safe if a path can be renamed or unlinked while it is
|
|
|
b312fc |
verifying the mount is safe to unmount. More recent versions are simpler
|
|
|
b312fc |
and safer by simply using UMOUNT_NOFOLLOW when unmounting a mount
|
|
|
b312fc |
in a directory owned by an arbitrary user.
|
|
|
b312fc |
|
|
|
b312fc |
Miklos Szeredi <miklos@szeredi.hu> reports this approach is good
|
|
|
b312fc |
enough to remove concerns about new kernels mixed with old versions
|
|
|
b312fc |
of fusermount.
|
|
|
b312fc |
|
|
|
b312fc |
A secondary motivation for restrictions here is that removing empty
|
|
|
b312fc |
directories that have non-empty mount points on them appears to
|
|
|
b312fc |
violate the rule that rmdir can not remove non-empty directories. As
|
|
|
b312fc |
Linus Torvalds pointed out this is useful for programs (like git) that
|
|
|
b312fc |
test if a directory is empty with rmdir.
|
|
|
b312fc |
|
|
|
b312fc |
Therefore this patch arranges to enforce the existing mount point
|
|
|
b312fc |
semantics for local mount namespace.
|
|
|
b312fc |
|
|
|
b312fc |
v2: Rewrote the test to be a drop in replacement for d_mountpoint
|
|
|
b312fc |
v3: Use bool instead of int as the return type of is_local_mountpoint
|
|
|
b312fc |
|
|
|
b312fc |
Reviewed-by: Miklos Szeredi <miklos@szeredi.hu>
|
|
|
b312fc |
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
|
|
|
b312fc |
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
|
b312fc |
---
|
|
|
b312fc |
fs/dcache.c | 69 +++++++++++++++++++++++++++++++---------------------------
|
|
|
b312fc |
fs/mount.h | 9 ++++++++
|
|
|
b312fc |
fs/namei.c | 16 +++++++++-----
|
|
|
b312fc |
fs/namespace.c | 35 +++++++++++++++++++++++++++++
|
|
|
b312fc |
4 files changed, 91 insertions(+), 38 deletions(-)
|
|
|
b312fc |
|
|
|
b312fc |
diff --git a/fs/dcache.c b/fs/dcache.c
|
|
|
b312fc |
index 5dabe0e..a3e9e7a 100644
|
|
|
b312fc |
--- a/fs/dcache.c
|
|
|
b312fc |
+++ b/fs/dcache.c
|
|
|
b312fc |
@@ -1285,36 +1285,38 @@ void shrink_dcache_parent(struct dentry *parent)
|
|
|
b312fc |
}
|
|
|
b312fc |
EXPORT_SYMBOL(shrink_dcache_parent);
|
|
|
b312fc |
|
|
|
b312fc |
-static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
|
|
|
b312fc |
+struct detach_data {
|
|
|
b312fc |
+ struct select_data select;
|
|
|
b312fc |
+ struct dentry *mountpoint;
|
|
|
b312fc |
+};
|
|
|
b312fc |
+static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry)
|
|
|
b312fc |
{
|
|
|
b312fc |
- struct select_data *data = _data;
|
|
|
b312fc |
-
|
|
|
b312fc |
- if (d_mountpoint(dentry)) {
|
|
|
b312fc |
- data->found = -EBUSY;
|
|
|
b312fc |
- return D_WALK_QUIT;
|
|
|
b312fc |
- }
|
|
|
b312fc |
+ struct detach_data *data = _data;
|
|
|
b312fc |
|
|
|
b312fc |
- return select_collect(_data, dentry);
|
|
|
b312fc |
-}
|
|
|
b312fc |
+ if (d_mountpoint(dentry)) {
|
|
|
b312fc |
+ __dget_dlock(dentry);
|
|
|
b312fc |
+ data->mountpoint = dentry;
|
|
|
b312fc |
+ return D_WALK_QUIT;
|
|
|
b312fc |
+ }
|
|
|
b312fc |
+ return select_collect(&data->select, dentry);
|
|
|
b312fc |
+ }
|
|
|
b312fc |
|
|
|
b312fc |
static void check_and_drop(void *_data)
|
|
|
b312fc |
{
|
|
|
b312fc |
- struct select_data *data = _data;
|
|
|
b312fc |
+ struct detach_data *data = _data;
|
|
|
b312fc |
|
|
|
b312fc |
- if (d_mountpoint(data->start))
|
|
|
b312fc |
- data->found = -EBUSY;
|
|
|
b312fc |
- if (!data->found)
|
|
|
b312fc |
- __d_drop(data->start);
|
|
|
b312fc |
+ if (!data->mountpoint && !data->select.found)
|
|
|
b312fc |
+ __d_drop(data->select.start);
|
|
|
b312fc |
}
|
|
|
b312fc |
|
|
|
b312fc |
/**
|
|
|
b312fc |
- * check_submounts_and_drop - prune dcache, check for submounts and drop
|
|
|
b312fc |
+ * check_submounts_and_drop - detach submounts, prune dcache, and drop
|
|
|
b312fc |
*
|
|
|
b312fc |
- * All done as a single atomic operation relative to has_unlinked_ancestor().
|
|
|
b312fc |
- * Returns 0 if successfully unhashed @parent. If there were submounts then
|
|
|
b312fc |
- * return -EBUSY.
|
|
|
b312fc |
+ * The final d_drop is done as an atomic operation relative to
|
|
|
b312fc |
+ * rename_lock ensuring there are no races with d_set_mounted. This
|
|
|
b312fc |
+ * ensures there are no unhashed dentries on the path to a mountpoint.
|
|
|
b312fc |
*
|
|
|
b312fc |
- * @dentry: dentry to prune and drop
|
|
|
b312fc |
+ * @dentry: dentry to detach, prune and drop
|
|
|
b312fc |
*/
|
|
|
b312fc |
int check_submounts_and_drop(struct dentry *dentry)
|
|
|
b312fc |
{
|
|
|
b312fc |
@@ -1327,19 +1329,24 @@ int check_submounts_and_drop(struct dentry *dentry)
|
|
|
b312fc |
}
|
|
|
b312fc |
|
|
|
b312fc |
for (;;) {
|
|
|
b312fc |
- struct select_data data;
|
|
|
b312fc |
+ struct detach_data data;
|
|
|
b312fc |
|
|
|
b312fc |
- INIT_LIST_HEAD(&data.dispose);
|
|
|
b312fc |
- data.start = dentry;
|
|
|
b312fc |
- data.found = 0;
|
|
|
b312fc |
+ data.mountpoint = NULL;
|
|
|
b312fc |
+ INIT_LIST_HEAD(&data.select.dispose);
|
|
|
b312fc |
+ data.select.start = dentry;
|
|
|
b312fc |
+ data.select.found = 0;
|
|
|
b312fc |
|
|
|
b312fc |
- d_walk(dentry, &data, check_and_collect, check_and_drop);
|
|
|
b312fc |
- ret = data.found;
|
|
|
b312fc |
+ d_walk(dentry, &data, detach_and_collect, check_and_drop);
|
|
|
b312fc |
|
|
|
b312fc |
- if (!list_empty(&data.dispose))
|
|
|
b312fc |
- shrink_dentry_list(&data.dispose);
|
|
|
b312fc |
+ if (data.select.found)
|
|
|
b312fc |
+ shrink_dentry_list(&data.select.dispose);
|
|
|
b312fc |
|
|
|
b312fc |
- if (ret <= 0)
|
|
|
b312fc |
+ if (data.mountpoint) {
|
|
|
b312fc |
+ detach_mounts(data.mountpoint);
|
|
|
b312fc |
+ dput(data.mountpoint);
|
|
|
b312fc |
+ }
|
|
|
b312fc |
+
|
|
|
b312fc |
+ if (!data.mountpoint && !data.select.found)
|
|
|
b312fc |
break;
|
|
|
b312fc |
|
|
|
b312fc |
cond_resched();
|
|
|
b312fc |
@@ -2554,10 +2561,8 @@ static struct dentry *__d_unalias(struct inode *inode,
|
|
|
b312fc |
goto out_err;
|
|
|
b312fc |
m2 = &alias->d_parent->d_inode->i_mutex;
|
|
|
b312fc |
out_unalias:
|
|
|
b312fc |
- if (likely(!d_mountpoint(alias))) {
|
|
|
b312fc |
- __d_move(alias, dentry, false);
|
|
|
b312fc |
- ret = alias;
|
|
|
b312fc |
- }
|
|
|
b312fc |
+ __d_move(alias, dentry, false);
|
|
|
b312fc |
+ ret = alias;
|
|
|
b312fc |
out_err:
|
|
|
b312fc |
spin_unlock(&inode->i_lock);
|
|
|
b312fc |
if (m2)
|
|
|
b312fc |
diff --git a/fs/mount.h b/fs/mount.h
|
|
|
b312fc |
index 9959119..a373c86 100644
|
|
|
b312fc |
--- a/fs/mount.h
|
|
|
b312fc |
+++ b/fs/mount.h
|
|
|
b312fc |
@@ -107,3 +107,12 @@ struct proc_mounts {
|
|
|
b312fc |
#define proc_mounts(p) (container_of((p), struct proc_mounts, m))
|
|
|
b312fc |
|
|
|
b312fc |
extern const struct seq_operations mounts_op;
|
|
|
b312fc |
+
|
|
|
b312fc |
+extern bool __is_local_mountpoint(struct dentry *dentry);
|
|
|
b312fc |
+static inline bool is_local_mountpoint(struct dentry *dentry)
|
|
|
b312fc |
+{
|
|
|
b312fc |
+ if (!d_mountpoint(dentry))
|
|
|
b312fc |
+ return false;
|
|
|
b312fc |
+
|
|
|
b312fc |
+ return __is_local_mountpoint(dentry);
|
|
|
b312fc |
+}
|
|
|
b312fc |
diff --git a/fs/namei.c b/fs/namei.c
|
|
|
b312fc |
index 872e5e5..ef70aa8 100644
|
|
|
b312fc |
--- a/fs/namei.c
|
|
|
b312fc |
+++ b/fs/namei.c
|
|
|
b312fc |
@@ -3691,8 +3691,8 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
|
|
|
b312fc |
mutex_lock(&dentry->d_inode->i_mutex);
|
|
|
b312fc |
|
|
|
b312fc |
error = -EBUSY;
|
|
|
b312fc |
- if (d_mountpoint(dentry))
|
|
|
b312fc |
- goto out;
|
|
|
b312fc |
+ if (is_local_mountpoint(dentry))
|
|
|
b312fc |
+ goto out;
|
|
|
b312fc |
|
|
|
b312fc |
error = security_inode_rmdir(dir, dentry);
|
|
|
b312fc |
if (error)
|
|
|
b312fc |
@@ -3705,6 +3705,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
|
|
|
b312fc |
|
|
|
b312fc |
dentry->d_inode->i_flags |= S_DEAD;
|
|
|
b312fc |
dont_mount(dentry);
|
|
|
b312fc |
+ detach_mounts(dentry);
|
|
|
b312fc |
|
|
|
b312fc |
out:
|
|
|
b312fc |
mutex_unlock(&dentry->d_inode->i_mutex);
|
|
|
b312fc |
@@ -3806,7 +3807,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
|
|
|
b312fc |
return -EPERM;
|
|
|
b312fc |
|
|
|
b312fc |
mutex_lock(&target->i_mutex);
|
|
|
b312fc |
- if (d_mountpoint(dentry))
|
|
|
b312fc |
+ if (is_local_mountpoint(dentry))
|
|
|
b312fc |
error = -EBUSY;
|
|
|
b312fc |
else {
|
|
|
b312fc |
error = security_inode_unlink(dir, dentry);
|
|
|
b312fc |
@@ -3815,8 +3816,10 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
|
|
|
b312fc |
if (error)
|
|
|
b312fc |
goto out;
|
|
|
b312fc |
error = dir->i_op->unlink(dir, dentry);
|
|
|
b312fc |
- if (!error)
|
|
|
b312fc |
+ if (!error) {
|
|
|
b312fc |
dont_mount(dentry);
|
|
|
b312fc |
+ detach_mounts(dentry);
|
|
|
b312fc |
+ }
|
|
|
b312fc |
}
|
|
|
b312fc |
}
|
|
|
b312fc |
out:
|
|
|
b312fc |
@@ -4254,8 +4257,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
|
|
b312fc |
mutex_lock(&target->i_mutex);
|
|
|
b312fc |
|
|
|
b312fc |
error = -EBUSY;
|
|
|
b312fc |
- if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
|
|
|
b312fc |
- goto out;
|
|
|
b312fc |
+ if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
|
|
|
b312fc |
+ goto out;
|
|
|
b312fc |
|
|
|
b312fc |
if (max_links && new_dir != old_dir) {
|
|
|
b312fc |
error = -EMLINK;
|
|
|
b312fc |
@@ -4292,6 +4295,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
|
|
b312fc |
if (is_dir)
|
|
|
b312fc |
target->i_flags |= S_DEAD;
|
|
|
b312fc |
dont_mount(new_dentry);
|
|
|
b312fc |
+ detach_mounts(new_dentry);
|
|
|
b312fc |
}
|
|
|
b312fc |
if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
|
|
|
b312fc |
if (!(flags & RENAME_EXCHANGE))
|
|
|
b312fc |
diff --git a/fs/namespace.c b/fs/namespace.c
|
|
|
b312fc |
index e48fed3..d633562 100644
|
|
|
b312fc |
--- a/fs/namespace.c
|
|
|
b312fc |
+++ b/fs/namespace.c
|
|
|
b312fc |
@@ -625,6 +625,41 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
|
|
|
b312fc |
return NULL;
|
|
|
b312fc |
}
|
|
|
b312fc |
|
|
|
b312fc |
+/*
|
|
|
b312fc |
+ * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
|
|
|
b312fc |
+ * current mount namespace.
|
|
|
b312fc |
+ *
|
|
|
b312fc |
+ * The common case is dentries are not mountpoints at all and that
|
|
|
b312fc |
+ * test is handled inline. For the slow case when we are actually
|
|
|
b312fc |
+ * dealing with a mountpoint of some kind, walk through all of the
|
|
|
b312fc |
+ * mounts in the current mount namespace and test to see if the dentry
|
|
|
b312fc |
+ * is a mountpoint.
|
|
|
b312fc |
+ *
|
|
|
b312fc |
+ * The mount_hashtable is not usable in the context because we
|
|
|
b312fc |
+ * need to identify all mounts that may be in the current mount
|
|
|
b312fc |
+ * namespace not just a mount that happens to have some specified
|
|
|
b312fc |
+ * parent mount.
|
|
|
b312fc |
+ */
|
|
|
b312fc |
+bool __is_local_mountpoint(struct dentry *dentry)
|
|
|
b312fc |
+{
|
|
|
b312fc |
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
|
|
|
b312fc |
+ struct mount *mnt;
|
|
|
b312fc |
+ bool is_covered = false;
|
|
|
b312fc |
+
|
|
|
b312fc |
+ if (!d_mountpoint(dentry))
|
|
|
b312fc |
+ goto out;
|
|
|
b312fc |
+
|
|
|
b312fc |
+ down_read(&namespace_sem);
|
|
|
b312fc |
+ list_for_each_entry(mnt, &ns->list, mnt_list) {
|
|
|
b312fc |
+ is_covered = (mnt->mnt_mountpoint == dentry);
|
|
|
b312fc |
+ if (is_covered)
|
|
|
b312fc |
+ break;
|
|
|
b312fc |
+ }
|
|
|
b312fc |
+ up_read(&namespace_sem);
|
|
|
b312fc |
+out:
|
|
|
b312fc |
+ return is_covered;
|
|
|
b312fc |
+}
|
|
|
b312fc |
+
|
|
|
b312fc |
static struct mountpoint *new_mountpoint(struct dentry *dentry)
|
|
|
b312fc |
{
|
|
|
b312fc |
struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
|
|
|
b312fc |
--
|
|
|
b312fc |
1.8.3.1
|
|
|
b312fc |
|