Blame 0094-virtiofsd-Support-remote-posix-locks.patch

1d442b
From: Vivek Goyal <vgoyal@redhat.com>
1d442b
Date: Mon, 27 Jan 2020 19:02:03 +0000
1d442b
Subject: [PATCH] virtiofsd: Support remote posix locks
1d442b
1d442b
Doing posix locks within the guest kernel is not sufficient if a file/dir
1d442b
is being shared by multiple guests. So we need the notion of daemon doing
1d442b
the locks which are visible to rest of the guests.
1d442b
1d442b
Given posix locks are per process, one can not call posix lock API on host,
1d442b
otherwise a bunch of basic posix lock properties are broken. For example,
1d442b
If two processes (A and B) in guest open the file and take locks on different
1d442b
sections of file, if one of the processes closes the fd, it will close
1d442b
fd on virtiofsd and all posix locks on file will go away. This means if
1d442b
process A closes the fd, then locks of process B will go away too.
1d442b
1d442b
Similar other problems exist too.
1d442b
1d442b
This patch set tries to emulate posix locks while using open file
1d442b
description locks provided on Linux.
1d442b
1d442b
Daemon provides two options (-o posix_lock, -o no_posix_lock) to enable
1d442b
or disable posix locking in daemon. By default it is enabled.
1d442b
1d442b
There are few issues though.
1d442b
1d442b
- GETLK() returns pid of process holding lock. As we are emulating locks
1d442b
  using OFD, and these locks are not per process and don't return pid
1d442b
  of process, so GETLK() in guest does not return process pid.
1d442b
1d442b
- As of now only F_SETLK is supported and not F_SETLKW. We can't block
1d442b
  the thread in virtiofsd for an arbitrarily long duration as there is only
1d442b
  one thread serving the queue. That means unlock request will not make
1d442b
  it to daemon and F_SETLKW will block infinitely and bring virtio-fs
1d442b
  to a halt. This is a solvable problem though and will require significant
1d442b
  changes in virtiofsd and kernel. Left as a TODO item for now.
1d442b
1d442b
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
1d442b
Reviewed-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
1d442b
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
1d442b
(cherry picked from commit 0e81414c54161296212f6bc8a1c70526c4a9755a)
1d442b
---
1d442b
 tools/virtiofsd/helper.c         |   3 +
1d442b
 tools/virtiofsd/passthrough_ll.c | 189 +++++++++++++++++++++++++++++++
1d442b
 2 files changed, 192 insertions(+)
1d442b
1d442b
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
1d442b
index 567202444a..33749bfcb7 100644
1d442b
--- a/tools/virtiofsd/helper.c
1d442b
+++ b/tools/virtiofsd/helper.c
1d442b
@@ -156,6 +156,9 @@ void fuse_cmdline_help(void)
1d442b
            "                               allowed (default: 10)\n"
1d442b
            "    -o norace                  disable racy fallback\n"
1d442b
            "                               default: false\n"
1d442b
+           "    -o posix_lock|no_posix_lock\n"
1d442b
+           "                               enable/disable remote posix lock\n"
1d442b
+           "                               default: posix_lock\n"
1d442b
            "    -o readdirplus|no_readdirplus\n"
1d442b
            "                               enable/disable readirplus\n"
1d442b
            "                               default: readdirplus except with "
1d442b
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
1d442b
index 05b5f898db..9414935b52 100644
1d442b
--- a/tools/virtiofsd/passthrough_ll.c
1d442b
+++ b/tools/virtiofsd/passthrough_ll.c
1d442b
@@ -67,6 +67,12 @@
1d442b
 #include "passthrough_helpers.h"
1d442b
 #include "seccomp.h"
1d442b
 
1d442b
+/* Keep track of inode posix locks for each owner. */
1d442b
+struct lo_inode_plock {
1d442b
+    uint64_t lock_owner;
1d442b
+    int fd; /* fd for OFD locks */
1d442b
+};
1d442b
+
1d442b
 struct lo_map_elem {
1d442b
     union {
1d442b
         struct lo_inode *inode;
1d442b
@@ -95,6 +101,8 @@ struct lo_inode {
1d442b
     struct lo_key key;
1d442b
     uint64_t refcount; /* protected by lo->mutex */
1d442b
     fuse_ino_t fuse_ino;
1d442b
+    pthread_mutex_t plock_mutex;
1d442b
+    GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
1d442b
 };
1d442b
 
1d442b
 struct lo_cred {
1d442b
@@ -114,6 +122,7 @@ struct lo_data {
1d442b
     int norace;
1d442b
     int writeback;
1d442b
     int flock;
1d442b
+    int posix_lock;
1d442b
     int xattr;
1d442b
     char *source;
1d442b
     double timeout;
1d442b
@@ -137,6 +146,8 @@ static const struct fuse_opt lo_opts[] = {
1d442b
     { "source=%s", offsetof(struct lo_data, source), 0 },
1d442b
     { "flock", offsetof(struct lo_data, flock), 1 },
1d442b
     { "no_flock", offsetof(struct lo_data, flock), 0 },
1d442b
+    { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
1d442b
+    { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
1d442b
     { "xattr", offsetof(struct lo_data, xattr), 1 },
1d442b
     { "no_xattr", offsetof(struct lo_data, xattr), 0 },
1d442b
     { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
1d442b
@@ -485,6 +496,17 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn)
1d442b
         fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
1d442b
         conn->want |= FUSE_CAP_FLOCK_LOCKS;
1d442b
     }
1d442b
+
1d442b
+    if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
1d442b
+        if (lo->posix_lock) {
1d442b
+            fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
1d442b
+            conn->want |= FUSE_CAP_POSIX_LOCKS;
1d442b
+        } else {
1d442b
+            fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
1d442b
+            conn->want &= ~FUSE_CAP_POSIX_LOCKS;
1d442b
+        }
1d442b
+    }
1d442b
+
1d442b
     if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
1d442b
         lo->readdirplus_clear) {
1d442b
         fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
1d442b
@@ -772,6 +794,19 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st)
1d442b
     return p;
1d442b
 }
1d442b
 
1d442b
+/* value_destroy_func for posix_locks GHashTable */
1d442b
+static void posix_locks_value_destroy(gpointer data)
1d442b
+{
1d442b
+    struct lo_inode_plock *plock = data;
1d442b
+
1d442b
+    /*
1d442b
+     * We had used open() for locks and had only one fd. So
1d442b
+     * closing this fd should release all OFD locks.
1d442b
+     */
1d442b
+    close(plock->fd);
1d442b
+    free(plock);
1d442b
+}
1d442b
+
1d442b
 static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
1d442b
                         struct fuse_entry_param *e)
1d442b
 {
1d442b
@@ -825,6 +860,9 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
1d442b
         newfd = -1;
1d442b
         inode->key.ino = e->attr.st_ino;
1d442b
         inode->key.dev = e->attr.st_dev;
1d442b
+        pthread_mutex_init(&inode->plock_mutex, NULL);
1d442b
+        inode->posix_locks = g_hash_table_new_full(
1d442b
+            g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
1d442b
 
1d442b
         pthread_mutex_lock(&lo->mutex);
1d442b
         inode->fuse_ino = lo_add_inode_mapping(req, inode);
1d442b
@@ -1160,6 +1198,11 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
1d442b
     if (!inode->refcount) {
1d442b
         lo_map_remove(&lo->ino_map, inode->fuse_ino);
1d442b
         g_hash_table_remove(lo->inodes, &inode->key);
1d442b
+        if (g_hash_table_size(inode->posix_locks)) {
1d442b
+            fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
1d442b
+        }
1d442b
+        g_hash_table_destroy(inode->posix_locks);
1d442b
+        pthread_mutex_destroy(&inode->plock_mutex);
1d442b
         pthread_mutex_unlock(&lo->mutex);
1d442b
         close(inode->fd);
1d442b
         free(inode);
1d442b
@@ -1516,6 +1559,136 @@ out:
1d442b
     }
1d442b
 }
1d442b
 
1d442b
+/* Should be called with inode->plock_mutex held */
1d442b
+static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
1d442b
+                                                      struct lo_inode *inode,
1d442b
+                                                      uint64_t lock_owner,
1d442b
+                                                      pid_t pid, int *err)
1d442b
+{
1d442b
+    struct lo_inode_plock *plock;
1d442b
+    char procname[64];
1d442b
+    int fd;
1d442b
+
1d442b
+    plock =
1d442b
+        g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
1d442b
+
1d442b
+    if (plock) {
1d442b
+        return plock;
1d442b
+    }
1d442b
+
1d442b
+    plock = malloc(sizeof(struct lo_inode_plock));
1d442b
+    if (!plock) {
1d442b
+        *err = ENOMEM;
1d442b
+        return NULL;
1d442b
+    }
1d442b
+
1d442b
+    /* Open another instance of file which can be used for ofd locks. */
1d442b
+    sprintf(procname, "%i", inode->fd);
1d442b
+
1d442b
+    /* TODO: What if file is not writable? */
1d442b
+    fd = openat(lo->proc_self_fd, procname, O_RDWR);
1d442b
+    if (fd == -1) {
1d442b
+        *err = errno;
1d442b
+        free(plock);
1d442b
+        return NULL;
1d442b
+    }
1d442b
+
1d442b
+    plock->lock_owner = lock_owner;
1d442b
+    plock->fd = fd;
1d442b
+    g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
1d442b
+                        plock);
1d442b
+    return plock;
1d442b
+}
1d442b
+
1d442b
+static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1d442b
+                     struct flock *lock)
1d442b
+{
1d442b
+    struct lo_data *lo = lo_data(req);
1d442b
+    struct lo_inode *inode;
1d442b
+    struct lo_inode_plock *plock;
1d442b
+    int ret, saverr = 0;
1d442b
+
1d442b
+    fuse_log(FUSE_LOG_DEBUG,
1d442b
+             "lo_getlk(ino=%" PRIu64 ", flags=%d)"
1d442b
+             " owner=0x%lx, l_type=%d l_start=0x%lx"
1d442b
+             " l_len=0x%lx\n",
1d442b
+             ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
1d442b
+             lock->l_len);
1d442b
+
1d442b
+    inode = lo_inode(req, ino);
1d442b
+    if (!inode) {
1d442b
+        fuse_reply_err(req, EBADF);
1d442b
+        return;
1d442b
+    }
1d442b
+
1d442b
+    pthread_mutex_lock(&inode->plock_mutex);
1d442b
+    plock =
1d442b
+        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1d442b
+    if (!plock) {
1d442b
+        pthread_mutex_unlock(&inode->plock_mutex);
1d442b
+        fuse_reply_err(req, ret);
1d442b
+        return;
1d442b
+    }
1d442b
+
1d442b
+    ret = fcntl(plock->fd, F_OFD_GETLK, lock);
1d442b
+    if (ret == -1) {
1d442b
+        saverr = errno;
1d442b
+    }
1d442b
+    pthread_mutex_unlock(&inode->plock_mutex);
1d442b
+
1d442b
+    if (saverr) {
1d442b
+        fuse_reply_err(req, saverr);
1d442b
+    } else {
1d442b
+        fuse_reply_lock(req, lock);
1d442b
+    }
1d442b
+}
1d442b
+
1d442b
+static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1d442b
+                     struct flock *lock, int sleep)
1d442b
+{
1d442b
+    struct lo_data *lo = lo_data(req);
1d442b
+    struct lo_inode *inode;
1d442b
+    struct lo_inode_plock *plock;
1d442b
+    int ret, saverr = 0;
1d442b
+
1d442b
+    fuse_log(FUSE_LOG_DEBUG,
1d442b
+             "lo_setlk(ino=%" PRIu64 ", flags=%d)"
1d442b
+             " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
1d442b
+             " l_start=0x%lx l_len=0x%lx\n",
1d442b
+             ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
1d442b
+             lock->l_whence, lock->l_start, lock->l_len);
1d442b
+
1d442b
+    if (sleep) {
1d442b
+        fuse_reply_err(req, EOPNOTSUPP);
1d442b
+        return;
1d442b
+    }
1d442b
+
1d442b
+    inode = lo_inode(req, ino);
1d442b
+    if (!inode) {
1d442b
+        fuse_reply_err(req, EBADF);
1d442b
+        return;
1d442b
+    }
1d442b
+
1d442b
+    pthread_mutex_lock(&inode->plock_mutex);
1d442b
+    plock =
1d442b
+        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1d442b
+
1d442b
+    if (!plock) {
1d442b
+        pthread_mutex_unlock(&inode->plock_mutex);
1d442b
+        fuse_reply_err(req, ret);
1d442b
+        return;
1d442b
+    }
1d442b
+
1d442b
+    /* TODO: Is it alright to modify flock? */
1d442b
+    lock->l_pid = 0;
1d442b
+    ret = fcntl(plock->fd, F_OFD_SETLK, lock);
1d442b
+    if (ret == -1) {
1d442b
+        saverr = errno;
1d442b
+    }
1d442b
+    pthread_mutex_unlock(&inode->plock_mutex);
1d442b
+    fuse_reply_err(req, saverr);
1d442b
+}
1d442b
+
1d442b
 static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
1d442b
                         struct fuse_file_info *fi)
1d442b
 {
1d442b
@@ -1617,6 +1790,19 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
1d442b
 {
1d442b
     int res;
1d442b
     (void)ino;
1d442b
+    struct lo_inode *inode;
1d442b
+
1d442b
+    inode = lo_inode(req, ino);
1d442b
+    if (!inode) {
1d442b
+        fuse_reply_err(req, EBADF);
1d442b
+        return;
1d442b
+    }
1d442b
+
1d442b
+    /* An fd is going away. Cleanup associated posix locks */
1d442b
+    pthread_mutex_lock(&inode->plock_mutex);
1d442b
+    g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner));
1d442b
+    pthread_mutex_unlock(&inode->plock_mutex);
1d442b
+
1d442b
     res = close(dup(lo_fi_fd(req, fi)));
1d442b
     fuse_reply_err(req, res == -1 ? errno : 0);
1d442b
 }
1d442b
@@ -2080,6 +2266,8 @@ static struct fuse_lowlevel_ops lo_oper = {
1d442b
     .releasedir = lo_releasedir,
1d442b
     .fsyncdir = lo_fsyncdir,
1d442b
     .create = lo_create,
1d442b
+    .getlk = lo_getlk,
1d442b
+    .setlk = lo_setlk,
1d442b
     .open = lo_open,
1d442b
     .release = lo_release,
1d442b
     .flush = lo_flush,
1d442b
@@ -2434,6 +2622,7 @@ int main(int argc, char *argv[])
1d442b
     struct lo_data lo = {
1d442b
         .debug = 0,
1d442b
         .writeback = 0,
1d442b
+        .posix_lock = 1,
1d442b
         .proc_self_fd = -1,
1d442b
     };
1d442b
     struct lo_map_elem *root_elem;