|
|
1d442b |
From: Vivek Goyal <vgoyal@redhat.com>
|
|
|
1d442b |
Date: Mon, 27 Jan 2020 19:02:03 +0000
|
|
|
1d442b |
Subject: [PATCH] virtiofsd: Support remote posix locks
|
|
|
1d442b |
|
|
|
1d442b |
Doing posix locks within the guest kernel is not sufficient if a file/dir
|
|
|
1d442b |
is being shared by multiple guests. So we need the notion of daemon doing
|
|
|
1d442b |
the locks which are visible to rest of the guests.
|
|
|
1d442b |
|
|
|
1d442b |
Given posix locks are per process, one can not call posix lock API on host,
|
|
|
1d442b |
otherwise a bunch of basic posix lock properties are broken. For example,
|
|
|
1d442b |
if two processes (A and B) in guest open the file and take locks on different
|
|
|
1d442b |
sections of file, if one of the processes closes the fd, it will close
|
|
|
1d442b |
fd on virtiofsd and all posix locks on file will go away. This means if
|
|
|
1d442b |
process A closes the fd, then locks of process B will go away too.
|
|
|
1d442b |
|
|
|
1d442b |
Similar other problems exist too.
|
|
|
1d442b |
|
|
|
1d442b |
This patch set tries to emulate posix locks while using open file
|
|
|
1d442b |
description locks provided on Linux.
|
|
|
1d442b |
|
|
|
1d442b |
Daemon provides two options (-o posix_lock, -o no_posix_lock) to enable
|
|
|
1d442b |
or disable posix locking in daemon. By default it is enabled.
|
|
|
1d442b |
|
|
|
1d442b |
There are a few issues though.
|
|
|
1d442b |
|
|
|
1d442b |
- GETLK() returns pid of process holding lock. As we are emulating locks
|
|
|
1d442b |
using OFD, and these locks are not per process and don't return pid
|
|
|
1d442b |
of process, so GETLK() in guest does not return process pid.
|
|
|
1d442b |
|
|
|
1d442b |
- As of now only F_SETLK is supported and not F_SETLKW. We can't block
|
|
|
1d442b |
the thread in virtiofsd for arbitrary long duration as there is only
|
|
|
1d442b |
one thread serving the queue. That means unlock request will not make
|
|
|
1d442b |
it to daemon and F_SETLKW will block indefinitely and bring virtio-fs
|
|
|
1d442b |
to a halt. This is a solvable problem though and will require significant
|
|
|
1d442b |
changes in virtiofsd and kernel. Left as a TODO item for now.
|
|
|
1d442b |
|
|
|
1d442b |
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
|
|
|
1d442b |
Reviewed-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
|
|
|
1d442b |
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
|
|
|
1d442b |
(cherry picked from commit 0e81414c54161296212f6bc8a1c70526c4a9755a)
|
|
|
1d442b |
---
|
|
|
1d442b |
tools/virtiofsd/helper.c | 3 +
|
|
|
1d442b |
tools/virtiofsd/passthrough_ll.c | 189 +++++++++++++++++++++++++++++++
|
|
|
1d442b |
2 files changed, 192 insertions(+)
|
|
|
1d442b |
|
|
|
1d442b |
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
|
|
|
1d442b |
index 567202444a..33749bfcb7 100644
|
|
|
1d442b |
--- a/tools/virtiofsd/helper.c
|
|
|
1d442b |
+++ b/tools/virtiofsd/helper.c
|
|
|
1d442b |
@@ -156,6 +156,9 @@ void fuse_cmdline_help(void)
|
|
|
1d442b |
" allowed (default: 10)\n"
|
|
|
1d442b |
" -o norace disable racy fallback\n"
|
|
|
1d442b |
" default: false\n"
|
|
|
1d442b |
+ " -o posix_lock|no_posix_lock\n"
|
|
|
1d442b |
+ " enable/disable remote posix lock\n"
|
|
|
1d442b |
+ " default: posix_lock\n"
|
|
|
1d442b |
" -o readdirplus|no_readdirplus\n"
|
|
|
1d442b |
" enable/disable readirplus\n"
|
|
|
1d442b |
" default: readdirplus except with "
|
|
|
1d442b |
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
|
|
|
1d442b |
index 05b5f898db..9414935b52 100644
|
|
|
1d442b |
--- a/tools/virtiofsd/passthrough_ll.c
|
|
|
1d442b |
+++ b/tools/virtiofsd/passthrough_ll.c
|
|
|
1d442b |
@@ -67,6 +67,12 @@
|
|
|
1d442b |
#include "passthrough_helpers.h"
|
|
|
1d442b |
#include "seccomp.h"
|
|
|
1d442b |
|
|
|
1d442b |
+/* Keep track of inode posix locks for each owner. */
|
|
|
1d442b |
+struct lo_inode_plock {
|
|
|
1d442b |
+ uint64_t lock_owner;
|
|
|
1d442b |
+ int fd; /* fd for OFD locks */
|
|
|
1d442b |
+};
|
|
|
1d442b |
+
|
|
|
1d442b |
struct lo_map_elem {
|
|
|
1d442b |
union {
|
|
|
1d442b |
struct lo_inode *inode;
|
|
|
1d442b |
@@ -95,6 +101,8 @@ struct lo_inode {
|
|
|
1d442b |
struct lo_key key;
|
|
|
1d442b |
uint64_t refcount; /* protected by lo->mutex */
|
|
|
1d442b |
fuse_ino_t fuse_ino;
|
|
|
1d442b |
+ pthread_mutex_t plock_mutex;
|
|
|
1d442b |
+ GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
|
|
|
1d442b |
};
|
|
|
1d442b |
|
|
|
1d442b |
struct lo_cred {
|
|
|
1d442b |
@@ -114,6 +122,7 @@ struct lo_data {
|
|
|
1d442b |
int norace;
|
|
|
1d442b |
int writeback;
|
|
|
1d442b |
int flock;
|
|
|
1d442b |
+ int posix_lock;
|
|
|
1d442b |
int xattr;
|
|
|
1d442b |
char *source;
|
|
|
1d442b |
double timeout;
|
|
|
1d442b |
@@ -137,6 +146,8 @@ static const struct fuse_opt lo_opts[] = {
|
|
|
1d442b |
{ "source=%s", offsetof(struct lo_data, source), 0 },
|
|
|
1d442b |
{ "flock", offsetof(struct lo_data, flock), 1 },
|
|
|
1d442b |
{ "no_flock", offsetof(struct lo_data, flock), 0 },
|
|
|
1d442b |
+ { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
|
|
|
1d442b |
+ { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
|
|
|
1d442b |
{ "xattr", offsetof(struct lo_data, xattr), 1 },
|
|
|
1d442b |
{ "no_xattr", offsetof(struct lo_data, xattr), 0 },
|
|
|
1d442b |
{ "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
|
|
|
1d442b |
@@ -485,6 +496,17 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn)
|
|
|
1d442b |
fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
|
|
|
1d442b |
conn->want |= FUSE_CAP_FLOCK_LOCKS;
|
|
|
1d442b |
}
|
|
|
1d442b |
+
|
|
|
1d442b |
+ if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
|
|
|
1d442b |
+ if (lo->posix_lock) {
|
|
|
1d442b |
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
|
|
|
1d442b |
+ conn->want |= FUSE_CAP_POSIX_LOCKS;
|
|
|
1d442b |
+ } else {
|
|
|
1d442b |
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
|
|
|
1d442b |
+ conn->want &= ~FUSE_CAP_POSIX_LOCKS;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+
|
|
|
1d442b |
if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
|
|
|
1d442b |
lo->readdirplus_clear) {
|
|
|
1d442b |
fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
|
|
|
1d442b |
@@ -772,6 +794,19 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st)
|
|
|
1d442b |
return p;
|
|
|
1d442b |
}
|
|
|
1d442b |
|
|
|
1d442b |
+/* value_destroy_func for posix_locks GHashTable */
|
|
|
1d442b |
+static void posix_locks_value_destroy(gpointer data)
|
|
|
1d442b |
+{
|
|
|
1d442b |
+ struct lo_inode_plock *plock = data;
|
|
|
1d442b |
+
|
|
|
1d442b |
+ /*
|
|
|
1d442b |
+ * We had used open() for locks and had only one fd. So
|
|
|
1d442b |
+ * closing this fd should release all OFD locks.
|
|
|
1d442b |
+ */
|
|
|
1d442b |
+ close(plock->fd);
|
|
|
1d442b |
+ free(plock);
|
|
|
1d442b |
+}
|
|
|
1d442b |
+
|
|
|
1d442b |
static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
|
|
|
1d442b |
struct fuse_entry_param *e)
|
|
|
1d442b |
{
|
|
|
1d442b |
@@ -825,6 +860,9 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
|
|
|
1d442b |
newfd = -1;
|
|
|
1d442b |
inode->key.ino = e->attr.st_ino;
|
|
|
1d442b |
inode->key.dev = e->attr.st_dev;
|
|
|
1d442b |
+ pthread_mutex_init(&inode->plock_mutex, NULL);
|
|
|
1d442b |
+ inode->posix_locks = g_hash_table_new_full(
|
|
|
1d442b |
+ g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
|
|
|
1d442b |
|
|
|
1d442b |
pthread_mutex_lock(&lo->mutex);
|
|
|
1d442b |
inode->fuse_ino = lo_add_inode_mapping(req, inode);
|
|
|
1d442b |
@@ -1160,6 +1198,11 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
|
|
|
1d442b |
if (!inode->refcount) {
|
|
|
1d442b |
lo_map_remove(&lo->ino_map, inode->fuse_ino);
|
|
|
1d442b |
g_hash_table_remove(lo->inodes, &inode->key);
|
|
|
1d442b |
+ if (g_hash_table_size(inode->posix_locks)) {
|
|
|
1d442b |
+ fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+ g_hash_table_destroy(inode->posix_locks);
|
|
|
1d442b |
+ pthread_mutex_destroy(&inode->plock_mutex);
|
|
|
1d442b |
pthread_mutex_unlock(&lo->mutex);
|
|
|
1d442b |
close(inode->fd);
|
|
|
1d442b |
free(inode);
|
|
|
1d442b |
@@ -1516,6 +1559,136 @@ out:
|
|
|
1d442b |
}
|
|
|
1d442b |
}
|
|
|
1d442b |
|
|
|
1d442b |
+/* Should be called with inode->plock_mutex held */
|
|
|
1d442b |
+static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
|
|
|
1d442b |
+ struct lo_inode *inode,
|
|
|
1d442b |
+ uint64_t lock_owner,
|
|
|
1d442b |
+ pid_t pid, int *err)
|
|
|
1d442b |
+{
|
|
|
1d442b |
+ struct lo_inode_plock *plock;
|
|
|
1d442b |
+ char procname[64];
|
|
|
1d442b |
+ int fd;
|
|
|
1d442b |
+
|
|
|
1d442b |
+ plock =
|
|
|
1d442b |
+ g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
|
|
|
1d442b |
+
|
|
|
1d442b |
+ if (plock) {
|
|
|
1d442b |
+ return plock;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+
|
|
|
1d442b |
+ plock = malloc(sizeof(struct lo_inode_plock));
|
|
|
1d442b |
+ if (!plock) {
|
|
|
1d442b |
+ *err = ENOMEM;
|
|
|
1d442b |
+ return NULL;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+
|
|
|
1d442b |
+ /* Open another instance of file which can be used for ofd locks. */
|
|
|
1d442b |
+ sprintf(procname, "%i", inode->fd);
|
|
|
1d442b |
+
|
|
|
1d442b |
+ /* TODO: What if file is not writable? */
|
|
|
1d442b |
+ fd = openat(lo->proc_self_fd, procname, O_RDWR);
|
|
|
1d442b |
+ if (fd == -1) {
|
|
|
1d442b |
+ *err = errno;
|
|
|
1d442b |
+ free(plock);
|
|
|
1d442b |
+ return NULL;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+
|
|
|
1d442b |
+ plock->lock_owner = lock_owner;
|
|
|
1d442b |
+ plock->fd = fd;
|
|
|
1d442b |
+ g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
|
|
|
1d442b |
+ plock);
|
|
|
1d442b |
+ return plock;
|
|
|
1d442b |
+}
|
|
|
1d442b |
+
|
|
|
1d442b |
+static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
|
|
|
1d442b |
+ struct flock *lock)
|
|
|
1d442b |
+{
|
|
|
1d442b |
+ struct lo_data *lo = lo_data(req);
|
|
|
1d442b |
+ struct lo_inode *inode;
|
|
|
1d442b |
+ struct lo_inode_plock *plock;
|
|
|
1d442b |
+ int ret, saverr = 0;
|
|
|
1d442b |
+
|
|
|
1d442b |
+ fuse_log(FUSE_LOG_DEBUG,
|
|
|
1d442b |
+ "lo_getlk(ino=%" PRIu64 ", flags=%d)"
|
|
|
1d442b |
+ " owner=0x%lx, l_type=%d l_start=0x%lx"
|
|
|
1d442b |
+ " l_len=0x%lx\n",
|
|
|
1d442b |
+ ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
|
|
|
1d442b |
+ lock->l_len);
|
|
|
1d442b |
+
|
|
|
1d442b |
+ inode = lo_inode(req, ino);
|
|
|
1d442b |
+ if (!inode) {
|
|
|
1d442b |
+ fuse_reply_err(req, EBADF);
|
|
|
1d442b |
+ return;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+
|
|
|
1d442b |
+ pthread_mutex_lock(&inode->plock_mutex);
|
|
|
1d442b |
+ plock =
|
|
|
1d442b |
+ lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
|
|
|
1d442b |
+ if (!plock) {
|
|
|
1d442b |
+ pthread_mutex_unlock(&inode->plock_mutex);
|
|
|
1d442b |
+ fuse_reply_err(req, ret);
|
|
|
1d442b |
+ return;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+
|
|
|
1d442b |
+ ret = fcntl(plock->fd, F_OFD_GETLK, lock);
|
|
|
1d442b |
+ if (ret == -1) {
|
|
|
1d442b |
+ saverr = errno;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+ pthread_mutex_unlock(&inode->plock_mutex);
|
|
|
1d442b |
+
|
|
|
1d442b |
+ if (saverr) {
|
|
|
1d442b |
+ fuse_reply_err(req, saverr);
|
|
|
1d442b |
+ } else {
|
|
|
1d442b |
+ fuse_reply_lock(req, lock);
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+}
|
|
|
1d442b |
+
|
|
|
1d442b |
+static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
|
|
|
1d442b |
+ struct flock *lock, int sleep)
|
|
|
1d442b |
+{
|
|
|
1d442b |
+ struct lo_data *lo = lo_data(req);
|
|
|
1d442b |
+ struct lo_inode *inode;
|
|
|
1d442b |
+ struct lo_inode_plock *plock;
|
|
|
1d442b |
+ int ret, saverr = 0;
|
|
|
1d442b |
+
|
|
|
1d442b |
+ fuse_log(FUSE_LOG_DEBUG,
|
|
|
1d442b |
+ "lo_setlk(ino=%" PRIu64 ", flags=%d)"
|
|
|
1d442b |
+ " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
|
|
|
1d442b |
+ " l_start=0x%lx l_len=0x%lx\n",
|
|
|
1d442b |
+ ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
|
|
|
1d442b |
+ lock->l_whence, lock->l_start, lock->l_len);
|
|
|
1d442b |
+
|
|
|
1d442b |
+ if (sleep) {
|
|
|
1d442b |
+ fuse_reply_err(req, EOPNOTSUPP);
|
|
|
1d442b |
+ return;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+
|
|
|
1d442b |
+ inode = lo_inode(req, ino);
|
|
|
1d442b |
+ if (!inode) {
|
|
|
1d442b |
+ fuse_reply_err(req, EBADF);
|
|
|
1d442b |
+ return;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+
|
|
|
1d442b |
+ pthread_mutex_lock(&inode->plock_mutex);
|
|
|
1d442b |
+ plock =
|
|
|
1d442b |
+ lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
|
|
|
1d442b |
+
|
|
|
1d442b |
+ if (!plock) {
|
|
|
1d442b |
+ pthread_mutex_unlock(&inode->plock_mutex);
|
|
|
1d442b |
+ fuse_reply_err(req, ret);
|
|
|
1d442b |
+ return;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+
|
|
|
1d442b |
+ /* TODO: Is it alright to modify flock? */
|
|
|
1d442b |
+ lock->l_pid = 0;
|
|
|
1d442b |
+ ret = fcntl(plock->fd, F_OFD_SETLK, lock);
|
|
|
1d442b |
+ if (ret == -1) {
|
|
|
1d442b |
+ saverr = errno;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+ pthread_mutex_unlock(&inode->plock_mutex);
|
|
|
1d442b |
+ fuse_reply_err(req, saverr);
|
|
|
1d442b |
+}
|
|
|
1d442b |
+
|
|
|
1d442b |
static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
|
|
|
1d442b |
struct fuse_file_info *fi)
|
|
|
1d442b |
{
|
|
|
1d442b |
@@ -1617,6 +1790,19 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
|
|
|
1d442b |
{
|
|
|
1d442b |
int res;
|
|
|
1d442b |
(void)ino;
|
|
|
1d442b |
+ struct lo_inode *inode;
|
|
|
1d442b |
+
|
|
|
1d442b |
+ inode = lo_inode(req, ino);
|
|
|
1d442b |
+ if (!inode) {
|
|
|
1d442b |
+ fuse_reply_err(req, EBADF);
|
|
|
1d442b |
+ return;
|
|
|
1d442b |
+ }
|
|
|
1d442b |
+
|
|
|
1d442b |
+ /* An fd is going away. Cleanup associated posix locks */
|
|
|
1d442b |
+ pthread_mutex_lock(&inode->plock_mutex);
|
|
|
1d442b |
+ g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner));
|
|
|
1d442b |
+ pthread_mutex_unlock(&inode->plock_mutex);
|
|
|
1d442b |
+
|
|
|
1d442b |
res = close(dup(lo_fi_fd(req, fi)));
|
|
|
1d442b |
fuse_reply_err(req, res == -1 ? errno : 0);
|
|
|
1d442b |
}
|
|
|
1d442b |
@@ -2080,6 +2266,8 @@ static struct fuse_lowlevel_ops lo_oper = {
|
|
|
1d442b |
.releasedir = lo_releasedir,
|
|
|
1d442b |
.fsyncdir = lo_fsyncdir,
|
|
|
1d442b |
.create = lo_create,
|
|
|
1d442b |
+ .getlk = lo_getlk,
|
|
|
1d442b |
+ .setlk = lo_setlk,
|
|
|
1d442b |
.open = lo_open,
|
|
|
1d442b |
.release = lo_release,
|
|
|
1d442b |
.flush = lo_flush,
|
|
|
1d442b |
@@ -2434,6 +2622,7 @@ int main(int argc, char *argv[])
|
|
|
1d442b |
struct lo_data lo = {
|
|
|
1d442b |
.debug = 0,
|
|
|
1d442b |
.writeback = 0,
|
|
|
1d442b |
+ .posix_lock = 1,
|
|
|
1d442b |
.proc_self_fd = -1,
|
|
|
1d442b |
};
|
|
|
1d442b |
struct lo_map_elem *root_elem;
|