ddf19c
From 8e46d0862c4c204f92c08ce2ae961921f270efb5 Mon Sep 17 00:00:00 2001
ddf19c
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
ddf19c
Date: Mon, 27 Jan 2020 19:02:03 +0100
ddf19c
Subject: [PATCH 092/116] virtiofsd: Support remote posix locks
ddf19c
MIME-Version: 1.0
ddf19c
Content-Type: text/plain; charset=UTF-8
ddf19c
Content-Transfer-Encoding: 8bit
ddf19c
ddf19c
RH-Author: Dr. David Alan Gilbert <dgilbert@redhat.com>
ddf19c
Message-id: <20200127190227.40942-89-dgilbert@redhat.com>
ddf19c
Patchwork-id: 93537
ddf19c
O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 088/112] virtiofsd: Support remote posix locks
ddf19c
Bugzilla: 1694164
ddf19c
RH-Acked-by: Philippe Mathieu-Daudé <philmd@redhat.com>
ddf19c
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
ddf19c
RH-Acked-by: Sergio Lopez Pascual <slp@redhat.com>
ddf19c
ddf19c
From: Vivek Goyal <vgoyal@redhat.com>
ddf19c
ddf19c
Doing posix locks within the guest kernel is not sufficient if a file/dir
ddf19c
is being shared by multiple guests. So we need the notion of daemon doing
ddf19c
the locks which are visible to rest of the guests.
ddf19c
ddf19c
Given posix locks are per process, one can not call posix lock API on host,
ddf19c
otherwise bunch of basic posix locks properties are broken. For example,
ddf19c
If two processes (A and B) in guest open the file and take locks on different
ddf19c
sections of file, if one of the processes closes the fd, it will close
ddf19c
fd on virtiofsd and all posix locks on file will go away. This means if
ddf19c
process A closes the fd, then locks of process B will go away too.
ddf19c
ddf19c
Similar other problems exist too.
ddf19c
ddf19c
This patch set tries to emulate posix locks while using open file
ddf19c
description locks provided on Linux.
ddf19c
ddf19c
Daemon provides two options (-o posix_lock, -o no_posix_lock) to enable
ddf19c
or disable posix locking in daemon. By default it is enabled.
ddf19c
ddf19c
There are few issues though.
ddf19c
ddf19c
- GETLK() returns pid of process holding lock. As we are emulating locks
ddf19c
  using OFD, and these locks are not per process and don't return pid
ddf19c
  of process, so GETLK() in guest does not return process pid.
ddf19c
ddf19c
- As of now only F_SETLK is supported and not F_SETLKW. We can't block
ddf19c
  the thread in virtiofsd for arbitrary long duration as there is only
ddf19c
  one thread serving the queue. That means unlock request will not make
ddf19c
  it to daemon and F_SETLKW will block infinitely and bring virtio-fs
ddf19c
  to a halt. This is a solvable problem though and will require significant
ddf19c
  changes in virtiofsd and kernel. Left as a TODO item for now.
ddf19c
ddf19c
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
ddf19c
Reviewed-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
ddf19c
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
ddf19c
(cherry picked from commit 0e81414c54161296212f6bc8a1c70526c4a9755a)
ddf19c
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
ddf19c
---
ddf19c
 tools/virtiofsd/helper.c         |   3 +
ddf19c
 tools/virtiofsd/passthrough_ll.c | 189 +++++++++++++++++++++++++++++++++++++++
ddf19c
 2 files changed, 192 insertions(+)
ddf19c
ddf19c
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
ddf19c
index 5672024..33749bf 100644
ddf19c
--- a/tools/virtiofsd/helper.c
ddf19c
+++ b/tools/virtiofsd/helper.c
ddf19c
@@ -156,6 +156,9 @@ void fuse_cmdline_help(void)
ddf19c
            "                               allowed (default: 10)\n"
ddf19c
            "    -o norace                  disable racy fallback\n"
ddf19c
            "                               default: false\n"
ddf19c
+           "    -o posix_lock|no_posix_lock\n"
ddf19c
+           "                               enable/disable remote posix lock\n"
ddf19c
+           "                               default: posix_lock\n"
ddf19c
            "    -o readdirplus|no_readdirplus\n"
ddf19c
            "                               enable/disable readirplus\n"
ddf19c
            "                               default: readdirplus except with "
ddf19c
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
ddf19c
index 05b5f89..9414935 100644
ddf19c
--- a/tools/virtiofsd/passthrough_ll.c
ddf19c
+++ b/tools/virtiofsd/passthrough_ll.c
ddf19c
@@ -67,6 +67,12 @@
ddf19c
 #include "passthrough_helpers.h"
ddf19c
 #include "seccomp.h"
ddf19c
 
ddf19c
+/* Keep track of inode posix locks for each owner. */
ddf19c
+struct lo_inode_plock {
ddf19c
+    uint64_t lock_owner;
ddf19c
+    int fd; /* fd for OFD locks */
ddf19c
+};
ddf19c
+
ddf19c
 struct lo_map_elem {
ddf19c
     union {
ddf19c
         struct lo_inode *inode;
ddf19c
@@ -95,6 +101,8 @@ struct lo_inode {
ddf19c
     struct lo_key key;
ddf19c
     uint64_t refcount; /* protected by lo->mutex */
ddf19c
     fuse_ino_t fuse_ino;
ddf19c
+    pthread_mutex_t plock_mutex;
ddf19c
+    GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
ddf19c
 };
ddf19c
 
ddf19c
 struct lo_cred {
ddf19c
@@ -114,6 +122,7 @@ struct lo_data {
ddf19c
     int norace;
ddf19c
     int writeback;
ddf19c
     int flock;
ddf19c
+    int posix_lock;
ddf19c
     int xattr;
ddf19c
     char *source;
ddf19c
     double timeout;
ddf19c
@@ -137,6 +146,8 @@ static const struct fuse_opt lo_opts[] = {
ddf19c
     { "source=%s", offsetof(struct lo_data, source), 0 },
ddf19c
     { "flock", offsetof(struct lo_data, flock), 1 },
ddf19c
     { "no_flock", offsetof(struct lo_data, flock), 0 },
ddf19c
+    { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
ddf19c
+    { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
ddf19c
     { "xattr", offsetof(struct lo_data, xattr), 1 },
ddf19c
     { "no_xattr", offsetof(struct lo_data, xattr), 0 },
ddf19c
     { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
ddf19c
@@ -485,6 +496,17 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn)
ddf19c
         fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
ddf19c
         conn->want |= FUSE_CAP_FLOCK_LOCKS;
ddf19c
     }
ddf19c
+
ddf19c
+    if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
ddf19c
+        if (lo->posix_lock) {
ddf19c
+            fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
ddf19c
+            conn->want |= FUSE_CAP_POSIX_LOCKS;
ddf19c
+        } else {
ddf19c
+            fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
ddf19c
+            conn->want &= ~FUSE_CAP_POSIX_LOCKS;
ddf19c
+        }
ddf19c
+    }
ddf19c
+
ddf19c
     if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
ddf19c
         lo->readdirplus_clear) {
ddf19c
         fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
ddf19c
@@ -772,6 +794,19 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st)
ddf19c
     return p;
ddf19c
 }
ddf19c
 
ddf19c
+/* value_destroy_func for posix_locks GHashTable */
ddf19c
+static void posix_locks_value_destroy(gpointer data)
ddf19c
+{
ddf19c
+    struct lo_inode_plock *plock = data;
ddf19c
+
ddf19c
+    /*
ddf19c
+     * We had used open() for locks and had only one fd. So
ddf19c
+     * closing this fd should release all OFD locks.
ddf19c
+     */
ddf19c
+    close(plock->fd);
ddf19c
+    free(plock);
ddf19c
+}
ddf19c
+
ddf19c
 static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
ddf19c
                         struct fuse_entry_param *e)
ddf19c
 {
ddf19c
@@ -825,6 +860,9 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
ddf19c
         newfd = -1;
ddf19c
         inode->key.ino = e->attr.st_ino;
ddf19c
         inode->key.dev = e->attr.st_dev;
ddf19c
+        pthread_mutex_init(&inode->plock_mutex, NULL);
ddf19c
+        inode->posix_locks = g_hash_table_new_full(
ddf19c
+            g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
ddf19c
 
ddf19c
         pthread_mutex_lock(&lo->mutex);
ddf19c
         inode->fuse_ino = lo_add_inode_mapping(req, inode);
ddf19c
@@ -1160,6 +1198,11 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
ddf19c
     if (!inode->refcount) {
ddf19c
         lo_map_remove(&lo->ino_map, inode->fuse_ino);
ddf19c
         g_hash_table_remove(lo->inodes, &inode->key);
ddf19c
+        if (g_hash_table_size(inode->posix_locks)) {
ddf19c
+            fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
ddf19c
+        }
ddf19c
+        g_hash_table_destroy(inode->posix_locks);
ddf19c
+        pthread_mutex_destroy(&inode->plock_mutex);
ddf19c
         pthread_mutex_unlock(&lo->mutex);
ddf19c
         close(inode->fd);
ddf19c
         free(inode);
ddf19c
@@ -1516,6 +1559,136 @@ out:
ddf19c
     }
ddf19c
 }
ddf19c
 
ddf19c
+/* Should be called with inode->plock_mutex held */
ddf19c
+static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
ddf19c
+                                                      struct lo_inode *inode,
ddf19c
+                                                      uint64_t lock_owner,
ddf19c
+                                                      pid_t pid, int *err)
ddf19c
+{
ddf19c
+    struct lo_inode_plock *plock;
ddf19c
+    char procname[64];
ddf19c
+    int fd;
ddf19c
+
ddf19c
+    plock =
ddf19c
+        g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
ddf19c
+
ddf19c
+    if (plock) {
ddf19c
+        return plock;
ddf19c
+    }
ddf19c
+
ddf19c
+    plock = malloc(sizeof(struct lo_inode_plock));
ddf19c
+    if (!plock) {
ddf19c
+        *err = ENOMEM;
ddf19c
+        return NULL;
ddf19c
+    }
ddf19c
+
ddf19c
+    /* Open another instance of file which can be used for ofd locks. */
ddf19c
+    sprintf(procname, "%i", inode->fd);
ddf19c
+
ddf19c
+    /* TODO: What if file is not writable? */
ddf19c
+    fd = openat(lo->proc_self_fd, procname, O_RDWR);
ddf19c
+    if (fd == -1) {
ddf19c
+        *err = errno;
ddf19c
+        free(plock);
ddf19c
+        return NULL;
ddf19c
+    }
ddf19c
+
ddf19c
+    plock->lock_owner = lock_owner;
ddf19c
+    plock->fd = fd;
ddf19c
+    g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
ddf19c
+                        plock);
ddf19c
+    return plock;
ddf19c
+}
ddf19c
+
ddf19c
+static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
ddf19c
+                     struct flock *lock)
ddf19c
+{
ddf19c
+    struct lo_data *lo = lo_data(req);
ddf19c
+    struct lo_inode *inode;
ddf19c
+    struct lo_inode_plock *plock;
ddf19c
+    int ret, saverr = 0;
ddf19c
+
ddf19c
+    fuse_log(FUSE_LOG_DEBUG,
ddf19c
+             "lo_getlk(ino=%" PRIu64 ", flags=%d)"
ddf19c
+             " owner=0x%lx, l_type=%d l_start=0x%lx"
ddf19c
+             " l_len=0x%lx\n",
ddf19c
+             ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
ddf19c
+             lock->l_len);
ddf19c
+
ddf19c
+    inode = lo_inode(req, ino);
ddf19c
+    if (!inode) {
ddf19c
+        fuse_reply_err(req, EBADF);
ddf19c
+        return;
ddf19c
+    }
ddf19c
+
ddf19c
+    pthread_mutex_lock(&inode->plock_mutex);
ddf19c
+    plock =
ddf19c
+        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
ddf19c
+    if (!plock) {
ddf19c
+        pthread_mutex_unlock(&inode->plock_mutex);
ddf19c
+        fuse_reply_err(req, ret);
ddf19c
+        return;
ddf19c
+    }
ddf19c
+
ddf19c
+    ret = fcntl(plock->fd, F_OFD_GETLK, lock);
ddf19c
+    if (ret == -1) {
ddf19c
+        saverr = errno;
ddf19c
+    }
ddf19c
+    pthread_mutex_unlock(&inode->plock_mutex);
ddf19c
+
ddf19c
+    if (saverr) {
ddf19c
+        fuse_reply_err(req, saverr);
ddf19c
+    } else {
ddf19c
+        fuse_reply_lock(req, lock);
ddf19c
+    }
ddf19c
+}
ddf19c
+
ddf19c
+static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
ddf19c
+                     struct flock *lock, int sleep)
ddf19c
+{
ddf19c
+    struct lo_data *lo = lo_data(req);
ddf19c
+    struct lo_inode *inode;
ddf19c
+    struct lo_inode_plock *plock;
ddf19c
+    int ret, saverr = 0;
ddf19c
+
ddf19c
+    fuse_log(FUSE_LOG_DEBUG,
ddf19c
+             "lo_setlk(ino=%" PRIu64 ", flags=%d)"
ddf19c
+             " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
ddf19c
+             " l_start=0x%lx l_len=0x%lx\n",
ddf19c
+             ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
ddf19c
+             lock->l_whence, lock->l_start, lock->l_len);
ddf19c
+
ddf19c
+    if (sleep) {
ddf19c
+        fuse_reply_err(req, EOPNOTSUPP);
ddf19c
+        return;
ddf19c
+    }
ddf19c
+
ddf19c
+    inode = lo_inode(req, ino);
ddf19c
+    if (!inode) {
ddf19c
+        fuse_reply_err(req, EBADF);
ddf19c
+        return;
ddf19c
+    }
ddf19c
+
ddf19c
+    pthread_mutex_lock(&inode->plock_mutex);
ddf19c
+    plock =
ddf19c
+        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
ddf19c
+
ddf19c
+    if (!plock) {
ddf19c
+        pthread_mutex_unlock(&inode->plock_mutex);
ddf19c
+        fuse_reply_err(req, ret);
ddf19c
+        return;
ddf19c
+    }
ddf19c
+
ddf19c
+    /* TODO: Is it alright to modify flock? */
ddf19c
+    lock->l_pid = 0;
ddf19c
+    ret = fcntl(plock->fd, F_OFD_SETLK, lock);
ddf19c
+    if (ret == -1) {
ddf19c
+        saverr = errno;
ddf19c
+    }
ddf19c
+    pthread_mutex_unlock(&inode->plock_mutex);
ddf19c
+    fuse_reply_err(req, saverr);
ddf19c
+}
ddf19c
+
ddf19c
 static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
ddf19c
                         struct fuse_file_info *fi)
ddf19c
 {
ddf19c
@@ -1617,6 +1790,19 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
ddf19c
 {
ddf19c
     int res;
ddf19c
     (void)ino;
ddf19c
+    struct lo_inode *inode;
ddf19c
+
ddf19c
+    inode = lo_inode(req, ino);
ddf19c
+    if (!inode) {
ddf19c
+        fuse_reply_err(req, EBADF);
ddf19c
+        return;
ddf19c
+    }
ddf19c
+
ddf19c
+    /* An fd is going away. Cleanup associated posix locks */
ddf19c
+    pthread_mutex_lock(&inode->plock_mutex);
ddf19c
+    g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner));
ddf19c
+    pthread_mutex_unlock(&inode->plock_mutex);
ddf19c
+
ddf19c
     res = close(dup(lo_fi_fd(req, fi)));
ddf19c
     fuse_reply_err(req, res == -1 ? errno : 0);
ddf19c
 }
ddf19c
@@ -2080,6 +2266,8 @@ static struct fuse_lowlevel_ops lo_oper = {
ddf19c
     .releasedir = lo_releasedir,
ddf19c
     .fsyncdir = lo_fsyncdir,
ddf19c
     .create = lo_create,
ddf19c
+    .getlk = lo_getlk,
ddf19c
+    .setlk = lo_setlk,
ddf19c
     .open = lo_open,
ddf19c
     .release = lo_release,
ddf19c
     .flush = lo_flush,
ddf19c
@@ -2434,6 +2622,7 @@ int main(int argc, char *argv[])
ddf19c
     struct lo_data lo = {
ddf19c
         .debug = 0,
ddf19c
         .writeback = 0,
ddf19c
+        .posix_lock = 1,
ddf19c
         .proc_self_fd = -1,
ddf19c
     };
ddf19c
     struct lo_map_elem *root_elem;
ddf19c
-- 
ddf19c
1.8.3.1
ddf19c