e7a346
From 5579f616c2c21a2a2cd2ef70b58149df85550db7 Mon Sep 17 00:00:00 2001
e7a346
From: karthik-us <ksubrahm@redhat.com>
e7a346
Date: Mon, 27 Nov 2017 12:51:16 +0530
e7a346
Subject: [PATCH 084/128] cluster/afr: Fix for arbiter becoming source
e7a346
e7a346
Problem:
e7a346
When eager-lock is on, and two writes happen in parallel on a FD
e7a346
we were observing the following behaviour:
e7a346
- First write fails on one data brick
e7a346
- Since the post-op has not yet happened, the inode refresh will get
e7a346
  both the data bricks as readable and set it in the inode context
e7a346
- In-flight split brain check sees both the data bricks as readable
e7a346
  and allows the second write
e7a346
- Second write fails on the other data brick
e7a346
- Now the post-op happens and marks both the data bricks as bad and
e7a346
  arbiter will become source for healing
e7a346
e7a346
Fix:
e7a346
Adding one more variable called write_subvol in inode context and it
e7a346
will have the in memory representation of the writable subvols. Inode
e7a346
refresh will not update this value and its lifetime is pre-op through
e7a346
unlock in the afr transaction. Initially the pre-op will set this
e7a346
value same as read_subvol in inode context and then in the in flight
e7a346
split brain check we will use this value instead of read_subvol.
e7a346
After all the checks we will update the value of this and set the
e7a346
read_subvol same as this to avoid having incorrect value in that.
e7a346
e7a346
Upstream patch: https://review.gluster.org/#/c/18049/
e7a346
e7a346
> Change-Id: I2ef6904524ab91af861d59690974bbc529ab1af3
e7a346
> BUG: 1482064
e7a346
> Signed-off-by: karthik-us <ksubrahm@redhat.com>
e7a346
e7a346
Change-Id: I91cd21e378a7ae3757c2209fcb91a613d73e09ee
e7a346
BUG: 1401969
e7a346
Signed-off-by: karthik-us <ksubrahm@redhat.com>
e7a346
Reviewed-on: https://code.engineering.redhat.com/gerrit/124292
e7a346
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e7a346
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
e7a346
---
e7a346
 xlators/cluster/afr/src/afr-common.c      | 76 ++++++++++++++++++++++++++++++-
e7a346
 xlators/cluster/afr/src/afr-lk-common.c   | 18 ++++++--
e7a346
 xlators/cluster/afr/src/afr-transaction.c |  4 ++
e7a346
 xlators/cluster/afr/src/afr.h             | 10 ++++
e7a346
 4 files changed, 102 insertions(+), 6 deletions(-)
e7a346
e7a346
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
e7a346
index 9c96056..a8ba5a0 100644
e7a346
--- a/xlators/cluster/afr/src/afr-common.c
e7a346
+++ b/xlators/cluster/afr/src/afr-common.c
e7a346
@@ -149,6 +149,7 @@ __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
e7a346
                 }
e7a346
                 tmp_ctx->spb_choice = -1;
e7a346
                 tmp_ctx->read_subvol = 0;
e7a346
+                tmp_ctx->write_subvol = 0;
e7a346
         } else {
e7a346
                 tmp_ctx = (afr_inode_ctx_t *) ctx_int;
e7a346
         }
e7a346
@@ -216,7 +217,7 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
e7a346
         if (ret < 0)
e7a346
                 return ret;
e7a346
 
e7a346
-        val = ctx->read_subvol;
e7a346
+        val = ctx->write_subvol;
e7a346
 
e7a346
         metadatamap_old = metadatamap = (val & 0x000000000000ffff);
e7a346
         datamap_old = datamap = (val & 0x00000000ffff0000) >> 16;
e7a346
@@ -276,6 +277,7 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
e7a346
                 (((uint64_t) datamap) << 16) |
e7a346
                 (((uint64_t) event) << 32);
e7a346
 
e7a346
+        ctx->write_subvol = val;
e7a346
         ctx->read_subvol = val;
e7a346
 
e7a346
         return ret;
e7a346
@@ -6421,3 +6423,75 @@ afr_serialize_xattrs_with_delimiter (call_frame_t *frame, xlator_t *this,
e7a346
 out:
e7a346
         return ret;
e7a346
 }
e7a346
+
e7a346
+int
e7a346
+afr_write_subvol_set (call_frame_t *frame, xlator_t *this)
e7a346
+{
e7a346
+        afr_local_t      *local = NULL;
e7a346
+        afr_inode_ctx_t  *ctx   = NULL;
e7a346
+        uint64_t          val   = 0;
e7a346
+        uint64_t          val1  = 0;
e7a346
+        int               ret   = -1;
e7a346
+
e7a346
+        local = frame->local;
e7a346
+        LOCK(&local->inode->lock);
e7a346
+        {
e7a346
+                ret = __afr_inode_ctx_get (this, local->inode, &ctx);
e7a346
+                if (ret < 0) {
e7a346
+                        gf_msg (this->name, GF_LOG_ERROR, 0,
e7a346
+                                AFR_MSG_DICT_GET_FAILED,
e7a346
+                                "ERROR GETTING INODE CTX");
e7a346
+                        UNLOCK(&local->inode->lock);
e7a346
+                        return ret;
e7a346
+                }
e7a346
+
e7a346
+                val = ctx->write_subvol;
e7a346
+                /*
e7a346
+                 * We need to set the value of write_subvol to read_subvol in 2
e7a346
+                 * cases:
e7a346
+                 * 1. Initially when the value is 0. i.e., it's the first lock
e7a346
+                 * request.
e7a346
+                 * 2. If it's a metadata transaction. If metadata transactions
e7a346
+                 * come in between data transactions and we have a brick
e7a346
+                 * disconnect, the next metadata transaction won't get the
e7a346
+                 * latest value of readables, since we do resetting of
e7a346
+                 * write_subvol in unlock code path only if it's a data
e7a346
+                 * transaction. To handle those scenarios we need to set the
e7a346
+                 * value of write_subvol to read_subvol in case of metadata
e7a346
+                 * transactions.
e7a346
+                */
e7a346
+                if (val == 0 ||
e7a346
+                    local->transaction.type == AFR_METADATA_TRANSACTION) {
e7a346
+                        val1 = ctx->read_subvol;
e7a346
+                        ctx->write_subvol = val1;
e7a346
+                }
e7a346
+        }
e7a346
+        UNLOCK (&local->inode->lock);
e7a346
+
e7a346
+        return 0;
e7a346
+}
e7a346
+
e7a346
+int
e7a346
+afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)
e7a346
+{
e7a346
+        afr_local_t      *local = NULL;
e7a346
+        afr_inode_ctx_t  *ctx   = NULL;
e7a346
+        int               ret   = -1;
e7a346
+
e7a346
+        local = frame->local;
e7a346
+        LOCK(&local->inode->lock);
e7a346
+        {
e7a346
+                ret = __afr_inode_ctx_get (this, local->inode, &ctx);
e7a346
+                if (ret < 0) {
e7a346
+                        gf_msg (this->name, GF_LOG_ERROR, 0,
e7a346
+                                AFR_MSG_DICT_GET_FAILED,
e7a346
+                                "ERROR GETTING INODE CTX");
e7a346
+                        UNLOCK(&local->inode->lock);
e7a346
+                        return ret;
e7a346
+                }
e7a346
+                ctx->write_subvol = 0;
e7a346
+        }
e7a346
+        UNLOCK(&local->inode->lock);
e7a346
+
e7a346
+        return 0;
e7a346
+}
e7a346
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
e7a346
index 1f2a117..c17f60f 100644
e7a346
--- a/xlators/cluster/afr/src/afr-lk-common.c
e7a346
+++ b/xlators/cluster/afr/src/afr-lk-common.c
e7a346
@@ -613,12 +613,16 @@ static int32_t
e7a346
 afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
e7a346
 {
e7a346
-        afr_local_t         *local    = NULL;
e7a346
-        afr_internal_lock_t *int_lock = NULL;
e7a346
-        int call_count = 0;
e7a346
+        afr_local_t             *local          = NULL;
e7a346
+        afr_internal_lock_t     *int_lock       = NULL;
e7a346
+        afr_fd_ctx_t            *fd_ctx         = NULL;
e7a346
+        afr_private_t           *priv           = NULL;
e7a346
+        int                      call_count     = 0;
e7a346
+        int                      ret            = 0;
e7a346
 
e7a346
         local    = frame->local;
e7a346
         int_lock = &local->internal_lock;
e7a346
+        priv = this->private;
e7a346
 
e7a346
         LOCK (&frame->lock);
e7a346
         {
e7a346
@@ -629,11 +633,15 @@ afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
         if (call_count == 0) {
e7a346
                 gf_msg_trace (this->name, 0,
e7a346
                               "All internal locks unlocked");
e7a346
-
e7a346
+                if (local->fd) {
e7a346
+                        fd_ctx = afr_fd_ctx_get (local->fd, this);
e7a346
+                        if (0 == AFR_COUNT (fd_ctx->lock_acquired, priv->child_count))
e7a346
+                                ret = afr_write_subvol_reset (frame, this);
e7a346
+                }
e7a346
                 int_lock->lock_cbk (frame, this);
e7a346
         }
e7a346
 
e7a346
-        return 0;
e7a346
+        return ret;
e7a346
 }
e7a346
 
e7a346
 void
e7a346
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
e7a346
index 35621d9..91c4f78 100644
e7a346
--- a/xlators/cluster/afr/src/afr-transaction.c
e7a346
+++ b/xlators/cluster/afr/src/afr-transaction.c
e7a346
@@ -1791,6 +1791,10 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
e7a346
 	if (pre_nop)
e7a346
 		goto next;
e7a346
 
e7a346
+        ret = afr_write_subvol_set (frame, this);
e7a346
+        if (ret)
e7a346
+                goto err;
e7a346
+
e7a346
 	if (!local->pre_op_compat) {
e7a346
 		dict_copy (xdata_req, local->xdata_req);
e7a346
 		goto next;
e7a346
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
e7a346
index c4ceb66..672d053 100644
e7a346
--- a/xlators/cluster/afr/src/afr.h
e7a346
+++ b/xlators/cluster/afr/src/afr.h
e7a346
@@ -837,6 +837,7 @@ typedef struct _afr_local {
e7a346
 
e7a346
 typedef struct _afr_inode_ctx {
e7a346
         uint64_t        read_subvol;
e7a346
+        uint64_t        write_subvol;
e7a346
         int             spb_choice;
e7a346
         gf_timer_t      *timer;
e7a346
         gf_boolean_t    need_refresh;
e7a346
@@ -1262,4 +1263,13 @@ int
e7a346
 afr_serialize_xattrs_with_delimiter (call_frame_t *frame, xlator_t *this,
e7a346
                                      char *buf, const char *default_str,
e7a346
                                      int32_t *serz_len, char delimiter);
e7a346
+
e7a346
+int
e7a346
+__afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx);
e7a346
+
e7a346
+int
e7a346
+afr_write_subvol_set (call_frame_t *frame, xlator_t *this);
e7a346
+
e7a346
+int
e7a346
+afr_write_subvol_reset (call_frame_t *frame, xlator_t *this);
e7a346
 #endif /* __AFR_H__ */
e7a346
-- 
e7a346
1.8.3.1
e7a346