From 37535f45135b5f6f9b62c8e011b96fc0b90743f7 Mon Sep 17 00:00:00 2001
From: Ravishankar N <ravishankar@redhat.com>
Date: Fri, 26 Jun 2015 17:29:20 +0530
Subject: [PATCH 148/190] afr: Block fops when file is in split-brain
Patch in master: http://review.gluster.org/#/c/11371/
Patch in release-3.7: http://review.gluster.org/#/c/11420/
For directories, block metadata FOPs when the directory is in metadata split-brain.
For non-directories, block data and metadata FOPs when the file is in data or metadata split-brain.
Do not block entry FOPs.
Change-Id: I5d44bd8ece08ec683f930d797541eae719257857
BUG: 1223738
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/51691
---
tests/bugs/glusterfs/bug-873962.t | 8 ++--
xlators/cluster/afr/src/afr-common.c | 58 +++++++++++++++++++++++++++++
xlators/cluster/afr/src/afr-read-txn.c | 22 ++++-------
xlators/cluster/afr/src/afr-transaction.c | 7 +++
xlators/cluster/afr/src/afr.h | 3 +
5 files changed, 80 insertions(+), 18 deletions(-)
diff --git a/tests/bugs/glusterfs/bug-873962.t b/tests/bugs/glusterfs/bug-873962.t
index 492d028..7faa999 100755
--- a/tests/bugs/glusterfs/bug-873962.t
+++ b/tests/bugs/glusterfs/bug-873962.t
@@ -65,8 +65,8 @@ TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $
#Files are in split-brain, so open should fail
TEST ! cat $M0/a;
TEST ! cat $M1/a;
-TEST cat $M0/b;
-TEST cat $M1/b;
+TEST ! cat $M0/b;
+TEST ! cat $M1/b;
#Reset split-brain status
TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/a;
@@ -92,8 +92,8 @@ TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $
#Files are in split-brain, so open should fail
TEST ! cat $M0/c
TEST ! cat $M1/c
-TEST cat $M0/d
-TEST cat $M1/d
+TEST ! cat $M0/d
+TEST ! cat $M1/d
TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/c
TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/d
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 7150f0f..9129b27 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -341,6 +341,58 @@ out:
}
int
+afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type)
+{
+ /* Decide whether a fop of transaction 'type' may proceed on 'inode': returns 0 if so, -EIO if not. */
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+ unsigned char *data = alloca0 (priv->child_count); /* filled by afr_inode_read_subvol_get() below */
+ unsigned char *metadata = alloca0 (priv->child_count); /* likewise */
+ int data_count = 0;
+ int metadata_count = 0;
+ int event_generation = 0;
+ int ret = 0;
+
+ /* We don't care about split-brains for entry transactions: succeed without writing 'readable' or 'event_p'. */
+ if (type == AFR_ENTRY_TRANSACTION || type == AFR_ENTRY_RENAME_TRANSACTION)
+ return 0;
+ /* NOTE(review): failures are reported as -EIO (never -1) and leave *event_p unwritten; afr_read_txn_refresh_done() tests ret == -1 -- confirm it is meant to rely on event_generation == 0. */
+ ret = afr_inode_read_subvol_get (inode, this, data, metadata,
+ &event_generation);
+ if (ret == -1)
+ return -EIO;
+
+ data_count = AFR_COUNT (data, priv->child_count);
+ metadata_count = AFR_COUNT (metadata, priv->child_count);
+
+ if (inode->ia_type == IA_IFDIR) {
+ /* For directories, allow even if it is in data split-brain: only a metadata transaction with metadata_count == 0 fails. */
+ if (type == AFR_METADATA_TRANSACTION) {
+ if (!metadata_count)
+ return -EIO;
+ }
+ } else {
+ /* For files, abort in case of data/metadata split-brain, regardless of the requested (non-entry) transaction type. */
+ if (!data_count || !metadata_count)
+ return -EIO;
+ }
+ /* Copy-out is optional: 'readable' and 'event_p' are NULL-checked (afr_transaction() passes 0 for both). */
+ if (type == AFR_METADATA_TRANSACTION && readable)
+ memcpy (readable, metadata, priv->child_count * sizeof *metadata);
+ if (type == AFR_DATA_TRANSACTION && readable) {
+ if (!data_count) /* only reachable for directories; non-directories returned -EIO above */
+ memcpy (readable, local->child_up,
+ priv->child_count * sizeof *readable);
+ else
+ memcpy (readable, data, priv->child_count * sizeof *data);
+ }
+ if (event_p)
+ *event_p = event_generation;
+ return 0;
+}
+
+int
afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,
int *spb_choice)
{
@@ -598,6 +650,8 @@ afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
for (i = 0; i < priv->child_count; i++) {
if (data_accused[i])
continue;
+ if ((priv->arbiter_count == 1) && (i == ARBITER_BRICK_INDEX))
+ continue;
if (replies[i].poststat.ia_size < maxsize)
data_accused[i] = 1;
}
@@ -1682,6 +1736,10 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* the slowest local subvolume is far preferable to a remote one.
*/
if (is_local) {
+ /* Don't set arbiter as read child. */
+ if ((priv->arbiter_count == 1) &&
+ (child_index == ARBITER_BRICK_INDEX))
+ goto out;
gf_log (this->name, GF_LOG_INFO,
"selecting local read_child %s",
priv->children[child_index]->name);
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 6121108..6e54549 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -52,6 +52,9 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
local->op_ret = ret; \
local->op_errno = errnum; \
read_subvol = index; \
+ gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_SPLIT_BRAIN,\
+ "Failing %s on gfid %s: split-brain observed.",\
+ gf_fop_list[local->op], uuid_utoa (inode->gfid));\
goto label; \
} while (0)
@@ -59,7 +62,6 @@ int
afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
{
afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
int read_subvol = 0;
int event_generation = 0;
inode_t *inode = NULL;
@@ -68,27 +70,19 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
local = frame->local;
inode = local->inode;
- priv = frame->this->private;
if (err)
AFR_READ_TXN_SET_ERROR_AND_GOTO (-1, -err, -1, readfn);
- ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
- &event_generation,
- local->transaction.type);
+ ret = afr_inode_get_readable (frame, inode, this, local->readable,
+ &event_generation,
+ local->transaction.type);
if (ret == -1 || !event_generation)
/* Even after refresh, we don't have a good
read subvolume. Time to bail */
AFR_READ_TXN_SET_ERROR_AND_GOTO (-1, EIO, -1, readfn);
- /* For directories in split-brain, we need to allow all fops
- * except (f)getxattr and access. */
- if (!AFR_COUNT(local->readable, priv->child_count) &&
- local->transaction.type == AFR_DATA_TRANSACTION &&
- inode->ia_type == IA_IFDIR)
- memcpy (local->readable, local->child_up, priv->child_count);
-
read_subvol = afr_read_subvol_select_by_policy (inode, this,
local->readable, NULL);
if (read_subvol == -1)
@@ -237,8 +231,8 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
if (read_subvol < 0 || read_subvol > priv->child_count) {
gf_msg (this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN,
"Unreadable subvolume %d found with event generation "
- "%d. (Possible split-brain)",
- read_subvol, event_generation);
+ "%d for gfid %s. (Possible split-brain)",
+ read_subvol, event_generation, uuid_utoa(inode->gfid));
goto refresh;
}
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index a202388..b27cfed 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -1967,6 +1967,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
if (ret < 0)
goto out;
+ ret = afr_inode_get_readable (frame, local->inode, this, 0, 0, type);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_SPLIT_BRAIN,
+ "Failing %s on gfid %s: split-brain observed.",
+ gf_fop_list[local->op], uuid_utoa (local->inode->gfid));
+ goto out;
+ }
afr_transaction_eager_lock_init (local, this);
if (local->fd && local->transaction.eager_lock_on)
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index e6e7f3e..729ae82 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -766,6 +766,9 @@ typedef struct afr_read_subvol_args {
(op_errno == EBADFD)))
int
+afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type);
+int
afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
unsigned char *data_subvols,
unsigned char *metadata_subvols,
--
1.7.1