12a457
From 7bec1b8efce0d60b45a29b0b1de81def3bc79f6c Mon Sep 17 00:00:00 2001
12a457
From: Mohammed Rafi KC <rkavunga@redhat.com>
12a457
Date: Tue, 3 May 2016 14:43:20 +0530
12a457
Subject: [PATCH 135/139] dht:remember locked subvol and send unlock to the same
12a457
12a457
During locking we send lock request to cached subvol,
12a457
and normally we unlock to the cached subvol
12a457
But with parallel fresh lookup on a directory, there
12a457
is a race window where the cached subvol can change
12a457
and the unlock can go into a different subvol from
12a457
which we took lock.
12a457
12a457
This will result in a stale lock held on one of the
12a457
subvol.
12a457
12a457
So we will store the details of subvol which we took the lock
12a457
and will unlock from the same subvol
12a457
12a457
back port of>
12a457
>Change-Id: I47df99491671b10624eb37d1d17e40bacf0b15eb
12a457
>BUG: 1311002
12a457
>Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
12a457
>Reviewed-on: http://review.gluster.org/13492
12a457
>Reviewed-by: N Balachandran <nbalacha@redhat.com>
12a457
>Smoke: Gluster Build System <jenkins@build.gluster.com>
12a457
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
12a457
>Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
12a457
>CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
12a457
12a457
>Change-Id: Ia847e7115d2296ae9811b14a956f3b6bf39bd86d
12a457
>BUG: 1333645
12a457
12a457
Change-Id: Ief799bdd19086dca614505452ed96493d6f5f8e7
12a457
BUG: 1306194
12a457
Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
12a457
Signed-off-by: N Balachandran <nbalacha@redhat.com>
12a457
Reviewed-on: https://code.engineering.redhat.com/gerrit/73806
12a457
---
12a457
 xlators/cluster/dht/src/dht-common.c     |    5 +-
12a457
 xlators/cluster/dht/src/dht-common.h     |   11 ++
12a457
 xlators/cluster/dht/src/dht-helper.c     |  155 ++++++++++++++++++++++++++++++
12a457
 xlators/cluster/dht/src/dht-inode-read.c |   62 ++++++++----
12a457
 xlators/cluster/dht/src/dht-messages.h   |    8 ++-
12a457
 5 files changed, 220 insertions(+), 21 deletions(-)
12a457
12a457
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
12a457
index 594eca0..fa053de 100644
12a457
--- a/xlators/cluster/dht/src/dht-common.c
12a457
+++ b/xlators/cluster/dht/src/dht-common.c
12a457
@@ -8406,7 +8406,10 @@ dht_entrylk_cbk (call_frame_t *frame, void *cookie,
12a457
         return 0;
12a457
 }
12a457
 
12a457
-
12a457
+/* TODO
12a457
+ * Sending entrylk to cached subvol can result in stale lock
12a457
+ * as described in the bug 1311002.
12a457
+ */
12a457
 int
12a457
 dht_entrylk (call_frame_t *frame, xlator_t *this,
12a457
              const char *volume, loc_t *loc, const char *basename,
12a457
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
12a457
index 058b149..9792189 100644
12a457
--- a/xlators/cluster/dht/src/dht-common.h
12a457
+++ b/xlators/cluster/dht/src/dht-common.h
12a457
@@ -106,6 +106,7 @@ typedef struct dht_stat_time dht_stat_time_t;
12a457
 struct dht_inode_ctx {
12a457
         dht_layout_t    *layout;
12a457
         dht_stat_time_t  time;
12a457
+        xlator_t        *lock_subvol;
12a457
 };
12a457
 
12a457
 typedef struct dht_inode_ctx dht_inode_ctx_t;
12a457
@@ -288,6 +289,8 @@ struct dht_local {
12a457
                 int                 op_errno;
12a457
         } lock;
12a457
 
12a457
+        short           lock_type;
12a457
+
12a457
         call_stub_t *stub;
12a457
         int32_t      parent_disk_layout[4];
12a457
 };
12a457
@@ -1208,4 +1211,12 @@ dht_release (xlator_t *this, fd_t *fd);
12a457
 
12a457
 int32_t
12a457
 dht_set_fixed_dir_stat (struct iatt *stat);
12a457
+
12a457
+xlator_t*
12a457
+dht_get_lock_subvolume (xlator_t *this, struct gf_flock *lock,
12a457
+                        dht_local_t *local);
12a457
+
12a457
+int
12a457
+dht_lk_inode_unref (call_frame_t *frame, int32_t op_ret);
12a457
+
12a457
 #endif/* _DHT_H */
12a457
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
12a457
index 81d49db..ac70a66 100644
12a457
--- a/xlators/cluster/dht/src/dht-helper.c
12a457
+++ b/xlators/cluster/dht/src/dht-helper.c
12a457
@@ -2300,3 +2300,158 @@ dht_heal_full_path_done (int op_ret, call_frame_t *heal_frame, void *data)
12a457
         DHT_STACK_DESTROY (heal_frame);
12a457
         return 0;
12a457
 }
12a457
+
12a457
+/* This function must be called inside an inode lock */
12a457
+int
12a457
+__dht_lock_subvol_set (inode_t *inode, xlator_t *this,
12a457
+                       xlator_t *lock_subvol)
12a457
+{
12a457
+        dht_inode_ctx_t         *ctx            = NULL;
12a457
+        int                      ret            = -1;
12a457
+        uint64_t                 value          = 0;
12a457
+
12a457
+        GF_VALIDATE_OR_GOTO (this->name, inode, out);
12a457
+
12a457
+        ret = __inode_ctx_get0 (inode, this, &value);
12a457
+        if (ret || !value) {
12a457
+                return -1;
12a457
+        }
12a457
+
12a457
+        ctx = (dht_inode_ctx_t *) value;
12a457
+        ctx->lock_subvol = lock_subvol;
12a457
+out:
12a457
+        return ret;
12a457
+}
12a457
+
12a457
+xlator_t*
12a457
+dht_get_lock_subvolume (xlator_t *this, struct gf_flock *lock,
12a457
+                        dht_local_t *local)
12a457
+{
12a457
+        xlator_t                *subvol                  = NULL;
12a457
+        inode_t                 *inode                   = NULL;
12a457
+        int32_t                  ret                     = -1;
12a457
+        uint64_t                 value                   = 0;
12a457
+        xlator_t                *cached_subvol           = NULL;
12a457
+        dht_inode_ctx_t         *ctx                     = NULL;
12a457
+        char                     gfid[GF_UUID_BUF_SIZE]  = {0};
12a457
+
12a457
+        GF_VALIDATE_OR_GOTO (this->name, lock, out);
12a457
+        GF_VALIDATE_OR_GOTO (this->name, local, out);
12a457
+
12a457
+        cached_subvol = local->cached_subvol;
12a457
+
12a457
+        if (local->loc.inode || local->fd) {
12a457
+                inode = local->loc.inode ? local->loc.inode : local->fd->inode;
12a457
+        }
12a457
+
12a457
+        if (!inode)
12a457
+                goto out;
12a457
+
12a457
+        if (!(IA_ISDIR (inode->ia_type) || IA_ISINVAL (inode->ia_type))) {
12a457
+                /*
12a457
+                 * We may get non-linked inode for directories as part
12a457
+                 * of the selfheal code path. So checking  for IA_INVAL
12a457
+                 * type also. This will only happen for directory.
12a457
+                 */
12a457
+                subvol = local->cached_subvol;
12a457
+                goto out;
12a457
+        }
12a457
+
12a457
+        if (lock->l_type != F_UNLCK) {
12a457
+                /*
12a457
+                 * inode purging might happen on NFS between a lk
12a457
+                 * and unlk. Due to this lk and unlk might be sent
12a457
+                 * to different subvols.
12a457
+                 * So during a lock request, taking a ref on inode
12a457
+                 * to prevent inode purging. inode unref will happen
12a457
+                 * in unlock cbk code path.
12a457
+                 */
12a457
+                inode_ref (inode);
12a457
+        }
12a457
+
12a457
+        LOCK (&inode->lock);
12a457
+                ret = __inode_ctx_get0 (inode, this, &value);
12a457
+                if (!ret && value) {
12a457
+                        ctx = (dht_inode_ctx_t *) value;
12a457
+                        subvol = ctx->lock_subvol;
12a457
+                }
12a457
+                if (!subvol && lock->l_type != F_UNLCK && cached_subvol) {
12a457
+                        ret = __dht_lock_subvol_set (inode, this,
12a457
+                                                     cached_subvol);
12a457
+                        if (ret) {
12a457
+                                gf_uuid_unparse(inode->gfid, gfid);
12a457
+                                gf_msg (this->name, GF_LOG_WARNING, 0,
12a457
+                                        DHT_MSG_SET_INODE_CTX_FAILED,
12a457
+                                        "Failed to set lock_subvol in "
12a457
+                                        "inode ctx for gfid %s",
12a457
+                                        gfid);
12a457
+                                goto unlock;
12a457
+                        }
12a457
+                        subvol = cached_subvol;
12a457
+                }
12a457
+unlock:
12a457
+        UNLOCK (&inode->lock);
12a457
+        if (!subvol && inode && lock->l_type != F_UNLCK) {
12a457
+                inode_unref (inode);
12a457
+        }
12a457
+out:
12a457
+        return subvol;
12a457
+}
12a457
+
12a457
+int
12a457
+dht_lk_inode_unref (call_frame_t *frame, int32_t op_ret)
12a457
+{
12a457
+        int                     ret                     = -1;
12a457
+        dht_local_t            *local                   = NULL;
12a457
+        inode_t                *inode                   = NULL;
12a457
+        xlator_t               *this                    = NULL;
12a457
+        char                    gfid[GF_UUID_BUF_SIZE]  = {0};
12a457
+
12a457
+        local = frame->local;
12a457
+        this = frame->this;
12a457
+
12a457
+        if (local->loc.inode || local->fd) {
12a457
+                inode = local->loc.inode ? local->loc.inode : local->fd->inode;
12a457
+        }
12a457
+        if (!inode) {
12a457
+                gf_msg (this->name, GF_LOG_WARNING, 0,
12a457
+                        DHT_MSG_LOCK_INODE_UNREF_FAILED,
12a457
+                        "Found a NULL inode. Failed to unref the inode");
12a457
+                goto out;
12a457
+        }
12a457
+
12a457
+        if (!(IA_ISDIR (inode->ia_type) || IA_ISINVAL (inode->ia_type))) {
12a457
+                ret = 0;
12a457
+                goto out;
12a457
+        }
12a457
+
12a457
+        switch (local->lock_type) {
12a457
+        case F_RDLCK:
12a457
+        case F_WRLCK:
12a457
+                if (op_ret) {
12a457
+                        gf_uuid_unparse(inode->gfid, gfid);
12a457
+                        gf_msg_debug (this->name, 0,
12a457
+                                "lock request failed for gfid %s", gfid);
12a457
+                        inode_unref (inode);
12a457
+                        goto out;
12a457
+                }
12a457
+                break;
12a457
+
12a457
+        case F_UNLCK:
12a457
+                if (!op_ret) {
12a457
+                        inode_unref (inode);
12a457
+                } else {
12a457
+                        gf_uuid_unparse(inode->gfid, gfid);
12a457
+                        gf_msg (this->name, GF_LOG_WARNING, 0,
12a457
+                                DHT_MSG_LOCK_INODE_UNREF_FAILED,
12a457
+                                "Unlock request failed for gfid %s."
12a457
+                                "Failed to unref the inode", gfid);
12a457
+                        goto out;
12a457
+                }
12a457
+        default:
12a457
+                break;
12a457
+        }
12a457
+        ret = 0;
12a457
+out:
12a457
+        return ret;
12a457
+}
12a457
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
12a457
index f038d40..9172cec 100644
12a457
--- a/xlators/cluster/dht/src/dht-inode-read.c
12a457
+++ b/xlators/cluster/dht/src/dht-inode-read.c
12a457
@@ -947,6 +947,7 @@ int
12a457
 dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
12a457
             int op_ret, int op_errno, struct gf_flock *flock, dict_t *xdata)
12a457
 {
12a457
+        dht_lk_inode_unref (frame, op_ret);
12a457
         DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock, xdata);
12a457
 
12a457
         return 0;
12a457
@@ -957,16 +958,24 @@ int
12a457
 dht_lk (call_frame_t *frame, xlator_t *this,
12a457
         fd_t *fd, int cmd, struct gf_flock *flock, dict_t *xdata)
12a457
 {
12a457
-        xlator_t     *subvol = NULL;
12a457
-        int           op_errno = -1;
12a457
+        xlator_t        *lock_subvol       = NULL;
12a457
+        dht_local_t     *local             = NULL;
12a457
+        int              op_errno          = -1;
12a457
 
12a457
 
12a457
         VALIDATE_OR_GOTO (frame, err);
12a457
         VALIDATE_OR_GOTO (this, err);
12a457
         VALIDATE_OR_GOTO (fd, err);
12a457
 
12a457
-        subvol = dht_subvol_get_cached (this, fd->inode);
12a457
-        if (!subvol) {
12a457
+        local = dht_local_init (frame, NULL, fd, GF_FOP_LK);
12a457
+        if (!local) {
12a457
+                op_errno = ENOMEM;
12a457
+                goto err;
12a457
+        }
12a457
+
12a457
+        local->lock_type = flock->l_type;
12a457
+        lock_subvol = dht_get_lock_subvolume (this, flock, local);
12a457
+        if (!lock_subvol) {
12a457
                 gf_msg_debug (this->name, 0,
12a457
                               "no cached subvolume for fd=%p", fd);
12a457
                 op_errno = EINVAL;
12a457
@@ -974,7 +983,7 @@ dht_lk (call_frame_t *frame, xlator_t *this,
12a457
         }
12a457
 
12a457
         /* TODO: for rebalance, we need to preserve the fop arguments */
12a457
-        STACK_WIND (frame, dht_lk_cbk, subvol, subvol->fops->lk, fd,
12a457
+        STACK_WIND (frame, dht_lk_cbk, lock_subvol, lock_subvol->fops->lk, fd,
12a457
                     cmd, flock, xdata);
12a457
 
12a457
         return 0;
12a457
@@ -1158,6 +1167,7 @@ dht_inodelk_cbk (call_frame_t *frame, void *cookie,
12a457
                  xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
12a457
 
12a457
 {
12a457
+        dht_lk_inode_unref (frame, op_ret);
12a457
         DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno, xdata);
12a457
         return 0;
12a457
 }
12a457
@@ -1167,9 +1177,9 @@ int32_t
12a457
 dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
12a457
              loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
12a457
 {
12a457
-        xlator_t     *subvol = NULL;
12a457
-        int           op_errno = -1;
12a457
-        dht_local_t  *local = NULL;
12a457
+        xlator_t        *lock_subvol            = NULL;
12a457
+        int              op_errno               = -1;
12a457
+        dht_local_t     *local                  = NULL;
12a457
 
12a457
 
12a457
         VALIDATE_OR_GOTO (frame, err);
12a457
@@ -1183,10 +1193,11 @@ dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
12a457
                 goto err;
12a457
         }
12a457
 
12a457
-        subvol = local->cached_subvol;
12a457
-        if (!subvol) {
12a457
+        local->lock_type = lock->l_type;
12a457
+        lock_subvol = dht_get_lock_subvolume (this, lock, local);
12a457
+        if (!lock_subvol) {
12a457
                 gf_msg_debug (this->name, 0,
12a457
-                              "no cached subvolume for path=%s", loc->path);
12a457
+                              "no lock subvolume for path=%s", loc->path);
12a457
                 op_errno = EINVAL;
12a457
                 goto err;
12a457
         }
12a457
@@ -1195,7 +1206,7 @@ dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
12a457
 
12a457
         STACK_WIND (frame,
12a457
                     dht_inodelk_cbk,
12a457
-                    subvol, subvol->fops->inodelk,
12a457
+                    lock_subvol, lock_subvol->fops->inodelk,
12a457
                     volume, loc, cmd, lock, xdata);
12a457
 
12a457
         return 0;
12a457
@@ -1207,12 +1218,13 @@ err:
12a457
         return 0;
12a457
 }
12a457
 
12a457
-
12a457
 int
12a457
 dht_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
12a457
                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
12a457
 
12a457
 {
12a457
+
12a457
+        dht_lk_inode_unref (frame, op_ret);
12a457
         DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno, xdata);
12a457
         return 0;
12a457
 }
12a457
@@ -1222,23 +1234,35 @@ int
12a457
 dht_finodelk (call_frame_t *frame, xlator_t *this, const char *volume,
12a457
               fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
12a457
 {
12a457
-        xlator_t     *subvol = NULL;
12a457
-        int           op_errno = -1;
12a457
+        xlator_t     *lock_subvol       = NULL;
12a457
+        dht_local_t  *local             = NULL;
12a457
+        int           op_errno          = -1;
12a457
+
12a457
 
12a457
         VALIDATE_OR_GOTO (frame, err);
12a457
         VALIDATE_OR_GOTO (this, err);
12a457
         VALIDATE_OR_GOTO (fd, err);
12a457
 
12a457
-        subvol = dht_subvol_get_cached (this, fd->inode);
12a457
-        if (!subvol) {
12a457
+        local = dht_local_init (frame, NULL, fd, GF_FOP_INODELK);
12a457
+        if (!local) {
12a457
+                op_errno = ENOMEM;
12a457
+                goto err;
12a457
+        }
12a457
+
12a457
+        local->call_cnt = 1;
12a457
+        local->lock_type = lock->l_type;
12a457
+
12a457
+        lock_subvol = dht_get_lock_subvolume (this, lock, local);
12a457
+        if (!lock_subvol) {
12a457
                 gf_msg_debug (this->name, 0,
12a457
-                              "no cached subvolume for fd=%p", fd);
12a457
+                              "no lock subvolume for fd=%p", fd);
12a457
                 op_errno = EINVAL;
12a457
                 goto err;
12a457
         }
12a457
 
12a457
 
12a457
-        STACK_WIND (frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk,
12a457
+        STACK_WIND (frame, dht_finodelk_cbk, lock_subvol,
12a457
+                    lock_subvol->fops->finodelk,
12a457
                     volume, fd, cmd, lock, xdata);
12a457
 
12a457
         return 0;
12a457
diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h
12a457
index eb0f1c8..6bc7237 100644
12a457
--- a/xlators/cluster/dht/src/dht-messages.h
12a457
+++ b/xlators/cluster/dht/src/dht-messages.h
12a457
@@ -45,7 +45,7 @@
12a457
  */
12a457
 
12a457
 #define GLFS_DHT_BASE                   GLFS_MSGID_COMP_DHT
12a457
-#define GLFS_DHT_NUM_MESSAGES           114
12a457
+#define GLFS_DHT_NUM_MESSAGES           116
12a457
 #define GLFS_MSGID_END          (GLFS_DHT_BASE + GLFS_DHT_NUM_MESSAGES + 1)
12a457
 
12a457
 /* Messages with message IDs */
12a457
@@ -1057,6 +1057,12 @@
12a457
  */
12a457
 #define DHT_MSG_PARENT_LAYOUT_CHANGED  (GLFS_DHT_BASE + 114)
12a457
 
12a457
+/*
12a457
+ * @messageid 109116
12a457
+ * @diagnosis
12a457
+ * @recommendedaction None
12a457
+ */
12a457
+#define DHT_MSG_LOCK_INODE_UNREF_FAILED  (GLFS_DHT_BASE + 116)
12a457
 
12a457
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
12a457
 #endif /* _DHT_MESSAGES_H_ */
12a457
-- 
12a457
1.7.1
12a457