e3c68b
From ea7f11b989896d76b8d091d26bc0241bce9413f8 Mon Sep 17 00:00:00 2001
e3c68b
From: Xavi Hernandez <xhernandez@redhat.com>
e3c68b
Date: Thu, 4 Jul 2019 13:21:33 +0200
e3c68b
Subject: [PATCH 253/255] core: fix deadlock between statedump and
e3c68b
 fd_anonymous()
e3c68b
e3c68b
There exists a deadlock between statedump generation and fd_anonymous()
e3c68b
function because they are acquiring inode table lock and inode lock in
e3c68b
reverse order.
e3c68b
e3c68b
This patch modifies fd_anonymous() so that it takes inode lock only when
e3c68b
it's really necessary, avoiding the deadlock.
e3c68b
e3c68b
Upstream patch:
e3c68b
> Change-Id: I24355447f0ea1b39e2546782ad07f0512cc381e7
e3c68b
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/22995
e3c68b
> BUG: 1727068
e3c68b
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
e3c68b
e3c68b
Change-Id: I24355447f0ea1b39e2546782ad07f0512cc381e7
e3c68b
Fixes: bz#1722209
e3c68b
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
e3c68b
Reviewed-on: https://code.engineering.redhat.com/gerrit/176096
e3c68b
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e3c68b
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
e3c68b
---
e3c68b
 libglusterfs/src/fd.c | 137 ++++++++++++++++++++++----------------------------
e3c68b
 1 file changed, 61 insertions(+), 76 deletions(-)
e3c68b
e3c68b
diff --git a/libglusterfs/src/fd.c b/libglusterfs/src/fd.c
e3c68b
index b8aac72..314546a 100644
e3c68b
--- a/libglusterfs/src/fd.c
e3c68b
+++ b/libglusterfs/src/fd.c
e3c68b
@@ -532,7 +532,7 @@ fd_unref(fd_t *fd)
e3c68b
     return;
e3c68b
 }
e3c68b
 
e3c68b
-fd_t *
e3c68b
+static fd_t *
e3c68b
 __fd_bind(fd_t *fd)
e3c68b
 {
e3c68b
     list_del_init(&fd->inode_list);
e3c68b
@@ -562,9 +562,9 @@ fd_bind(fd_t *fd)
e3c68b
 }
e3c68b
 
e3c68b
 static fd_t *
e3c68b
-__fd_create(inode_t *inode, uint64_t pid)
e3c68b
+fd_allocate(inode_t *inode, uint64_t pid)
e3c68b
 {
e3c68b
-    fd_t *fd = NULL;
e3c68b
+    fd_t *fd;
e3c68b
 
e3c68b
     if (inode == NULL) {
e3c68b
         gf_msg_callingfn("fd", GF_LOG_ERROR, EINVAL, LG_MSG_INVALID_ARG,
e3c68b
@@ -573,64 +573,67 @@ __fd_create(inode_t *inode, uint64_t pid)
e3c68b
     }
e3c68b
 
e3c68b
     fd = mem_get0(inode->table->fd_mem_pool);
e3c68b
-    if (!fd)
e3c68b
-        goto out;
e3c68b
+    if (fd == NULL) {
e3c68b
+        return NULL;
e3c68b
+    }
e3c68b
 
e3c68b
     fd->xl_count = inode->table->xl->graph->xl_count + 1;
e3c68b
 
e3c68b
     fd->_ctx = GF_CALLOC(1, (sizeof(struct _fd_ctx) * fd->xl_count),
e3c68b
                          gf_common_mt_fd_ctx);
e3c68b
-    if (!fd->_ctx)
e3c68b
-        goto free_fd;
e3c68b
+    if (fd->_ctx == NULL) {
e3c68b
+        goto failed;
e3c68b
+    }
e3c68b
 
e3c68b
     fd->lk_ctx = fd_lk_ctx_create();
e3c68b
-    if (!fd->lk_ctx)
e3c68b
-        goto free_fd_ctx;
e3c68b
-
e3c68b
-    fd->inode = inode_ref(inode);
e3c68b
-    fd->pid = pid;
e3c68b
-    INIT_LIST_HEAD(&fd->inode_list);
e3c68b
-
e3c68b
-    LOCK_INIT(&fd->lock);
e3c68b
-out:
e3c68b
-    return fd;
e3c68b
+    if (fd->lk_ctx != NULL) {
e3c68b
+        /* We need to take a reference from the inode, but we cannot do it
e3c68b
+         * here because this function can be called with the inode lock taken
e3c68b
+         * and inode_ref() takes the inode's table lock. This is the reverse
e3c68b
+         * of the logical lock acquisition order and can cause a deadlock. So
e3c68b
+         * we simply assign the inode here and we delefate the inode reference
e3c68b
+         * responsibility to the caller (when this function succeeds and the
e3c68b
+         * inode lock is released). This is safe because the caller must hold
e3c68b
+         * a reference of the inode to use it, so it's guaranteed that the
e3c68b
+         * number of references won't reach 0 before the caller finishes.
e3c68b
+         *
e3c68b
+         * TODO: minimize use of locks in favor of atomic operations to avoid
e3c68b
+         *       these dependencies. */
e3c68b
+        fd->inode = inode;
e3c68b
+        fd->pid = pid;
e3c68b
+        INIT_LIST_HEAD(&fd->inode_list);
e3c68b
+        LOCK_INIT(&fd->lock);
e3c68b
+        GF_ATOMIC_INIT(fd->refcount, 1);
e3c68b
+        return fd;
e3c68b
+    }
e3c68b
 
e3c68b
-free_fd_ctx:
e3c68b
     GF_FREE(fd->_ctx);
e3c68b
-free_fd:
e3c68b
+
e3c68b
+failed:
e3c68b
     mem_put(fd);
e3c68b
 
e3c68b
     return NULL;
e3c68b
 }
e3c68b
 
e3c68b
 fd_t *
e3c68b
-fd_create(inode_t *inode, pid_t pid)
e3c68b
+fd_create_uint64(inode_t *inode, uint64_t pid)
e3c68b
 {
e3c68b
-    fd_t *fd = NULL;
e3c68b
-
e3c68b
-    fd = __fd_create(inode, (uint64_t)pid);
e3c68b
-    if (!fd)
e3c68b
-        goto out;
e3c68b
+    fd_t *fd;
e3c68b
 
e3c68b
-    fd = fd_ref(fd);
e3c68b
+    fd = fd_allocate(inode, pid);
e3c68b
+    if (fd != NULL) {
e3c68b
+        /* fd_allocate() doesn't get a reference from the inode. We need to
e3c68b
+         * take it here in case of success. */
e3c68b
+        inode_ref(inode);
e3c68b
+    }
e3c68b
 
e3c68b
-out:
e3c68b
     return fd;
e3c68b
 }
e3c68b
 
e3c68b
 fd_t *
e3c68b
-fd_create_uint64(inode_t *inode, uint64_t pid)
e3c68b
+fd_create(inode_t *inode, pid_t pid)
e3c68b
 {
e3c68b
-    fd_t *fd = NULL;
e3c68b
-
e3c68b
-    fd = __fd_create(inode, pid);
e3c68b
-    if (!fd)
e3c68b
-        goto out;
e3c68b
-
e3c68b
-    fd = fd_ref(fd);
e3c68b
-
e3c68b
-out:
e3c68b
-    return fd;
e3c68b
+    return fd_create_uint64(inode, (uint64_t)pid);
e3c68b
 }
e3c68b
 
e3c68b
 static fd_t *
e3c68b
@@ -719,10 +722,13 @@ __fd_lookup_anonymous(inode_t *inode, int32_t flags)
e3c68b
     return fd;
e3c68b
 }
e3c68b
 
e3c68b
-static fd_t *
e3c68b
-__fd_anonymous(inode_t *inode, int32_t flags)
e3c68b
+fd_t *
e3c68b
+fd_anonymous_with_flags(inode_t *inode, int32_t flags)
e3c68b
 {
e3c68b
     fd_t *fd = NULL;
e3c68b
+    bool ref = false;
e3c68b
+
e3c68b
+    LOCK(&inode->lock);
e3c68b
 
e3c68b
     fd = __fd_lookup_anonymous(inode, flags);
e3c68b
 
e3c68b
@@ -730,54 +736,33 @@ __fd_anonymous(inode_t *inode, int32_t flags)
e3c68b
        __fd_lookup_anonymous(), so no need of one more fd_ref().
e3c68b
        if (!fd); then both create and bind won't bump up the ref
e3c68b
        count, so we have to call fd_ref() after bind. */
e3c68b
-    if (!fd) {
e3c68b
-        fd = __fd_create(inode, 0);
e3c68b
-
e3c68b
-        if (!fd)
e3c68b
-            return NULL;
e3c68b
-
e3c68b
-        fd->anonymous = _gf_true;
e3c68b
-        fd->flags = GF_ANON_FD_FLAGS | flags;
e3c68b
+    if (fd == NULL) {
e3c68b
+        fd = fd_allocate(inode, 0);
e3c68b
+        if (fd != NULL) {
e3c68b
+            fd->anonymous = _gf_true;
e3c68b
+            fd->flags = GF_ANON_FD_FLAGS | (flags & O_DIRECT);
e3c68b
 
e3c68b
-        __fd_bind(fd);
e3c68b
+            __fd_bind(fd);
e3c68b
 
e3c68b
-        __fd_ref(fd);
e3c68b
+            ref = true;
e3c68b
+        }
e3c68b
     }
e3c68b
 
e3c68b
-    return fd;
e3c68b
-}
e3c68b
-
e3c68b
-fd_t *
e3c68b
-fd_anonymous(inode_t *inode)
e3c68b
-{
e3c68b
-    fd_t *fd = NULL;
e3c68b
+    UNLOCK(&inode->lock);
e3c68b
 
e3c68b
-    LOCK(&inode->lock);
e3c68b
-    {
e3c68b
-        fd = __fd_anonymous(inode, GF_ANON_FD_FLAGS);
e3c68b
+    if (ref) {
e3c68b
+        /* fd_allocate() doesn't get a reference from the inode. We need to
e3c68b
+         * take it here in case of success. */
e3c68b
+        inode_ref(inode);
e3c68b
     }
e3c68b
-    UNLOCK(&inode->lock);
e3c68b
 
e3c68b
     return fd;
e3c68b
 }
e3c68b
 
e3c68b
 fd_t *
e3c68b
-fd_anonymous_with_flags(inode_t *inode, int32_t flags)
e3c68b
+fd_anonymous(inode_t *inode)
e3c68b
 {
e3c68b
-    fd_t *fd = NULL;
e3c68b
-
e3c68b
-    if (flags & O_DIRECT)
e3c68b
-        flags = GF_ANON_FD_FLAGS | O_DIRECT;
e3c68b
-    else
e3c68b
-        flags = GF_ANON_FD_FLAGS;
e3c68b
-
e3c68b
-    LOCK(&inode->lock);
e3c68b
-    {
e3c68b
-        fd = __fd_anonymous(inode, flags);
e3c68b
-    }
e3c68b
-    UNLOCK(&inode->lock);
e3c68b
-
e3c68b
-    return fd;
e3c68b
+    return fd_anonymous_with_flags(inode, 0);
e3c68b
 }
e3c68b
 
e3c68b
 fd_t *
e3c68b
-- 
e3c68b
1.8.3.1
e3c68b