|
|
d1681e |
From 5579f616c2c21a2a2cd2ef70b58149df85550db7 Mon Sep 17 00:00:00 2001
|
|
|
d1681e |
From: karthik-us <ksubrahm@redhat.com>
|
|
|
d1681e |
Date: Mon, 27 Nov 2017 12:51:16 +0530
|
|
|
d1681e |
Subject: [PATCH 084/128] cluster/afr: Fix for arbiter becoming source
|
|
|
d1681e |
|
|
|
d1681e |
Problem:
|
|
|
d1681e |
When eager-lock is on, and two writes happen in parallel on a FD
|
|
|
d1681e |
we were observing the following behaviour:
|
|
|
d1681e |
- First write fails on one data brick
|
|
|
d1681e |
- Since the post-op is not yet happened, the inode refresh will get
|
|
|
d1681e |
both the data bricks as readable and set it in the inode context
|
|
|
d1681e |
- In flight split brain check see both the data bricks as readable
|
|
|
d1681e |
and allows the second write
|
|
|
d1681e |
- Second write fails on the other data brick
|
|
|
d1681e |
- Now the post-op happens and marks both the data bricks as bad and
|
|
|
d1681e |
arbiter will become source for healing
|
|
|
d1681e |
|
|
|
d1681e |
Fix:
|
|
|
d1681e |
Adding one more variable called write_suvol in inode context and it
|
|
|
d1681e |
will have the in memory representation of the writable subvols. Inode
|
|
|
d1681e |
refresh will not update this value and its lifetime is pre-op through
|
|
|
d1681e |
unlock in the afr transaction. Initially the pre-op will set this
|
|
|
d1681e |
value same as read_subvol in inode context and then in the in flight
|
|
|
d1681e |
split brain check we will use this value instead of read_subvol.
|
|
|
d1681e |
After all the checks we will update the value of this and set the
|
|
|
d1681e |
read_subvol same as this to avoid having incorrect value in that.
|
|
|
d1681e |
|
|
|
d1681e |
Upstream patch: https://review.gluster.org/#/c/18049/
|
|
|
d1681e |
|
|
|
d1681e |
> Change-Id: I2ef6904524ab91af861d59690974bbc529ab1af3
|
|
|
d1681e |
> BUG: 1482064
|
|
|
d1681e |
> Signed-off-by: karthik-us <ksubrahm@redhat.com>
|
|
|
d1681e |
|
|
|
d1681e |
Change-Id: I91cd21e378a7ae3757c2209fcb91a613d73e09ee
|
|
|
d1681e |
BUG: 1401969
|
|
|
d1681e |
Signed-off-by: karthik-us <ksubrahm@redhat.com>
|
|
|
d1681e |
Reviewed-on: https://code.engineering.redhat.com/gerrit/124292
|
|
|
d1681e |
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
|
d1681e |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
d1681e |
---
|
|
|
d1681e |
xlators/cluster/afr/src/afr-common.c | 76 ++++++++++++++++++++++++++++++-
|
|
|
d1681e |
xlators/cluster/afr/src/afr-lk-common.c | 18 ++++++--
|
|
|
d1681e |
xlators/cluster/afr/src/afr-transaction.c | 4 ++
|
|
|
d1681e |
xlators/cluster/afr/src/afr.h | 10 ++++
|
|
|
d1681e |
4 files changed, 102 insertions(+), 6 deletions(-)
|
|
|
d1681e |
|
|
|
d1681e |
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
|
|
|
d1681e |
index 9c96056..a8ba5a0 100644
|
|
|
d1681e |
--- a/xlators/cluster/afr/src/afr-common.c
|
|
|
d1681e |
+++ b/xlators/cluster/afr/src/afr-common.c
|
|
|
d1681e |
@@ -149,6 +149,7 @@ __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
|
|
|
d1681e |
}
|
|
|
d1681e |
tmp_ctx->spb_choice = -1;
|
|
|
d1681e |
tmp_ctx->read_subvol = 0;
|
|
|
d1681e |
+ tmp_ctx->write_subvol = 0;
|
|
|
d1681e |
} else {
|
|
|
d1681e |
tmp_ctx = (afr_inode_ctx_t *) ctx_int;
|
|
|
d1681e |
}
|
|
|
d1681e |
@@ -216,7 +217,7 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
|
|
|
d1681e |
if (ret < 0)
|
|
|
d1681e |
return ret;
|
|
|
d1681e |
|
|
|
d1681e |
- val = ctx->read_subvol;
|
|
|
d1681e |
+ val = ctx->write_subvol;
|
|
|
d1681e |
|
|
|
d1681e |
metadatamap_old = metadatamap = (val & 0x000000000000ffff);
|
|
|
d1681e |
datamap_old = datamap = (val & 0x00000000ffff0000) >> 16;
|
|
|
d1681e |
@@ -276,6 +277,7 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
|
|
|
d1681e |
(((uint64_t) datamap) << 16) |
|
|
|
d1681e |
(((uint64_t) event) << 32);
|
|
|
d1681e |
|
|
|
d1681e |
+ ctx->write_subvol = val;
|
|
|
d1681e |
ctx->read_subvol = val;
|
|
|
d1681e |
|
|
|
d1681e |
return ret;
|
|
|
d1681e |
@@ -6421,3 +6423,75 @@ afr_serialize_xattrs_with_delimiter (call_frame_t *frame, xlator_t *this,
|
|
|
d1681e |
out:
|
|
|
d1681e |
return ret;
|
|
|
d1681e |
}
|
|
|
d1681e |
+
|
|
|
d1681e |
+int
|
|
|
d1681e |
+afr_write_subvol_set (call_frame_t *frame, xlator_t *this)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ afr_local_t *local = NULL;
|
|
|
d1681e |
+ afr_inode_ctx_t *ctx = NULL;
|
|
|
d1681e |
+ uint64_t val = 0;
|
|
|
d1681e |
+ uint64_t val1 = 0;
|
|
|
d1681e |
+ int ret = -1;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ local = frame->local;
|
|
|
d1681e |
+ LOCK(&local->inode->lock);
|
|
|
d1681e |
+ {
|
|
|
d1681e |
+ ret = __afr_inode_ctx_get (this, local->inode, &ctx;;
|
|
|
d1681e |
+ if (ret < 0) {
|
|
|
d1681e |
+ gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
d1681e |
+ AFR_MSG_DICT_GET_FAILED,
|
|
|
d1681e |
+ "ERROR GETTING INODE CTX");
|
|
|
d1681e |
+ UNLOCK(&local->inode->lock);
|
|
|
d1681e |
+ return ret;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+
|
|
|
d1681e |
+ val = ctx->write_subvol;
|
|
|
d1681e |
+ /*
|
|
|
d1681e |
+ * We need to set the value of write_subvol to read_subvol in 2
|
|
|
d1681e |
+ * cases:
|
|
|
d1681e |
+ * 1. Initially when the value is 0. i.e., it's the first lock
|
|
|
d1681e |
+ * request.
|
|
|
d1681e |
+ * 2. If it's a metadata transaction. If metadata transactions
|
|
|
d1681e |
+ * comes in between data transactions and we have a brick
|
|
|
d1681e |
+ * disconnect, the next metadata transaction won't get the
|
|
|
d1681e |
+ * latest value of readables, since we do resetting of
|
|
|
d1681e |
+ * write_subvol in unlock code path only if it's a data
|
|
|
d1681e |
+ * transaction. To handle those scenarios we need to set the
|
|
|
d1681e |
+ * value of write_subvol to read_subvol in case of metadata
|
|
|
d1681e |
+ * transactions.
|
|
|
d1681e |
+ */
|
|
|
d1681e |
+ if (val == 0 ||
|
|
|
d1681e |
+ local->transaction.type == AFR_METADATA_TRANSACTION) {
|
|
|
d1681e |
+ val1 = ctx->read_subvol;
|
|
|
d1681e |
+ ctx->write_subvol = val1;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ UNLOCK (&local->inode->lock);
|
|
|
d1681e |
+
|
|
|
d1681e |
+ return 0;
|
|
|
d1681e |
+}
|
|
|
d1681e |
+
|
|
|
d1681e |
+int
|
|
|
d1681e |
+afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ afr_local_t *local = NULL;
|
|
|
d1681e |
+ afr_inode_ctx_t *ctx = NULL;
|
|
|
d1681e |
+ int ret = -1;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ local = frame->local;
|
|
|
d1681e |
+ LOCK(&local->inode->lock);
|
|
|
d1681e |
+ {
|
|
|
d1681e |
+ ret = __afr_inode_ctx_get (this, local->inode, &ctx;;
|
|
|
d1681e |
+ if (ret < 0) {
|
|
|
d1681e |
+ gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
d1681e |
+ AFR_MSG_DICT_GET_FAILED,
|
|
|
d1681e |
+ "ERROR GETTING INODE CTX");
|
|
|
d1681e |
+ UNLOCK(&local->inode->lock);
|
|
|
d1681e |
+ return ret;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ ctx->write_subvol = 0;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ UNLOCK(&local->inode->lock);
|
|
|
d1681e |
+
|
|
|
d1681e |
+ return 0;
|
|
|
d1681e |
+}
|
|
|
d1681e |
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
|
|
|
d1681e |
index 1f2a117..c17f60f 100644
|
|
|
d1681e |
--- a/xlators/cluster/afr/src/afr-lk-common.c
|
|
|
d1681e |
+++ b/xlators/cluster/afr/src/afr-lk-common.c
|
|
|
d1681e |
@@ -613,12 +613,16 @@ static int32_t
|
|
|
d1681e |
afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
d1681e |
int32_t op_ret, int32_t op_errno, dict_t *xdata)
|
|
|
d1681e |
{
|
|
|
d1681e |
- afr_local_t *local = NULL;
|
|
|
d1681e |
- afr_internal_lock_t *int_lock = NULL;
|
|
|
d1681e |
- int call_count = 0;
|
|
|
d1681e |
+ afr_local_t *local = NULL;
|
|
|
d1681e |
+ afr_internal_lock_t *int_lock = NULL;
|
|
|
d1681e |
+ afr_fd_ctx_t *fd_ctx = NULL;
|
|
|
d1681e |
+ afr_private_t *priv = NULL;
|
|
|
d1681e |
+ int call_count = 0;
|
|
|
d1681e |
+ int ret = 0;
|
|
|
d1681e |
|
|
|
d1681e |
local = frame->local;
|
|
|
d1681e |
int_lock = &local->internal_lock;
|
|
|
d1681e |
+ priv = this->private;
|
|
|
d1681e |
|
|
|
d1681e |
LOCK (&frame->lock);
|
|
|
d1681e |
{
|
|
|
d1681e |
@@ -629,11 +633,15 @@ afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
d1681e |
if (call_count == 0) {
|
|
|
d1681e |
gf_msg_trace (this->name, 0,
|
|
|
d1681e |
"All internal locks unlocked");
|
|
|
d1681e |
-
|
|
|
d1681e |
+ if (local->fd) {
|
|
|
d1681e |
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
|
|
|
d1681e |
+ if (0 == AFR_COUNT (fd_ctx->lock_acquired, priv->child_count))
|
|
|
d1681e |
+ ret = afr_write_subvol_reset (frame, this);
|
|
|
d1681e |
+ }
|
|
|
d1681e |
int_lock->lock_cbk (frame, this);
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
- return 0;
|
|
|
d1681e |
+ return ret;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
void
|
|
|
d1681e |
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
|
|
|
d1681e |
index 35621d9..91c4f78 100644
|
|
|
d1681e |
--- a/xlators/cluster/afr/src/afr-transaction.c
|
|
|
d1681e |
+++ b/xlators/cluster/afr/src/afr-transaction.c
|
|
|
d1681e |
@@ -1791,6 +1791,10 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
|
|
|
d1681e |
if (pre_nop)
|
|
|
d1681e |
goto next;
|
|
|
d1681e |
|
|
|
d1681e |
+ ret = afr_write_subvol_set (frame, this);
|
|
|
d1681e |
+ if (ret)
|
|
|
d1681e |
+ goto err;
|
|
|
d1681e |
+
|
|
|
d1681e |
if (!local->pre_op_compat) {
|
|
|
d1681e |
dict_copy (xdata_req, local->xdata_req);
|
|
|
d1681e |
goto next;
|
|
|
d1681e |
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
|
|
|
d1681e |
index c4ceb66..672d053 100644
|
|
|
d1681e |
--- a/xlators/cluster/afr/src/afr.h
|
|
|
d1681e |
+++ b/xlators/cluster/afr/src/afr.h
|
|
|
d1681e |
@@ -837,6 +837,7 @@ typedef struct _afr_local {
|
|
|
d1681e |
|
|
|
d1681e |
typedef struct _afr_inode_ctx {
|
|
|
d1681e |
uint64_t read_subvol;
|
|
|
d1681e |
+ uint64_t write_subvol;
|
|
|
d1681e |
int spb_choice;
|
|
|
d1681e |
gf_timer_t *timer;
|
|
|
d1681e |
gf_boolean_t need_refresh;
|
|
|
d1681e |
@@ -1262,4 +1263,13 @@ int
|
|
|
d1681e |
afr_serialize_xattrs_with_delimiter (call_frame_t *frame, xlator_t *this,
|
|
|
d1681e |
char *buf, const char *default_str,
|
|
|
d1681e |
int32_t *serz_len, char delimiter);
|
|
|
d1681e |
+
|
|
|
d1681e |
+int
|
|
|
d1681e |
+__afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx);
|
|
|
d1681e |
+
|
|
|
d1681e |
+int
|
|
|
d1681e |
+afr_write_subvol_set (call_frame_t *frame, xlator_t *this);
|
|
|
d1681e |
+
|
|
|
d1681e |
+int
|
|
|
d1681e |
+afr_write_subvol_reset (call_frame_t *frame, xlator_t *this);
|
|
|
d1681e |
#endif /* __AFR_H__ */
|
|
|
d1681e |
--
|
|
|
d1681e |
1.8.3.1
|
|
|
d1681e |
|