|
|
cb8e9e |
From 37535f45135b5f6f9b62c8e011b96fc0b90743f7 Mon Sep 17 00:00:00 2001
|
|
|
cb8e9e |
From: Ravishankar N <ravishankar@redhat.com>
|
|
|
cb8e9e |
Date: Fri, 26 Jun 2015 17:29:20 +0530
|
|
|
cb8e9e |
Subject: [PATCH 148/190] afr: Block fops when file is in split-brain
|
|
|
cb8e9e |
|
|
|
cb8e9e |
Patch in master: http://review.gluster.org/#/c/11371/
|
|
|
cb8e9e |
Patch in release-3.7: http://review.gluster.org/#/c/11420/
|
|
|
cb8e9e |
|
|
|
cb8e9e |
For directories, block metadata FOPS only; data FOPS are allowed even if the directory is in data split-brain.
|
|
|
cb8e9e |
For non-directories, block data and metadata FOPS if the file is in either data or metadata split-brain.
|
|
|
cb8e9e |
Do not block entry FOPS (entry and entry-rename transactions skip the split-brain check).
|
|
|
cb8e9e |
|
|
|
cb8e9e |
Change-Id: I5d44bd8ece08ec683f930d797541eae719257857
|
|
|
cb8e9e |
BUG: 1223738
|
|
|
cb8e9e |
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
|
|
|
cb8e9e |
Reviewed-on: https://code.engineering.redhat.com/gerrit/51691
|
|
|
cb8e9e |
---
|
|
|
cb8e9e |
tests/bugs/glusterfs/bug-873962.t | 8 ++--
|
|
|
cb8e9e |
xlators/cluster/afr/src/afr-common.c | 58 +++++++++++++++++++++++++++++
|
|
|
cb8e9e |
xlators/cluster/afr/src/afr-read-txn.c | 22 ++++-------
|
|
|
cb8e9e |
xlators/cluster/afr/src/afr-transaction.c | 7 +++
|
|
|
cb8e9e |
xlators/cluster/afr/src/afr.h | 3 +
|
|
|
cb8e9e |
5 files changed, 80 insertions(+), 18 deletions(-)
|
|
|
cb8e9e |
|
|
|
cb8e9e |
diff --git a/tests/bugs/glusterfs/bug-873962.t b/tests/bugs/glusterfs/bug-873962.t
|
|
|
cb8e9e |
index 492d028..7faa999 100755
|
|
|
cb8e9e |
--- a/tests/bugs/glusterfs/bug-873962.t
|
|
|
cb8e9e |
+++ b/tests/bugs/glusterfs/bug-873962.t
|
|
|
cb8e9e |
@@ -65,8 +65,8 @@ TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $
|
|
|
cb8e9e |
#Files are in split-brain, so open should fail
|
|
|
cb8e9e |
TEST ! cat $M0/a;
|
|
|
cb8e9e |
TEST ! cat $M1/a;
|
|
|
cb8e9e |
-TEST cat $M0/b;
|
|
|
cb8e9e |
-TEST cat $M1/b;
|
|
|
cb8e9e |
+TEST ! cat $M0/b;
|
|
|
cb8e9e |
+TEST ! cat $M1/b;
|
|
|
cb8e9e |
|
|
|
cb8e9e |
#Reset split-brain status
|
|
|
cb8e9e |
TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/a;
|
|
|
cb8e9e |
@@ -92,8 +92,8 @@ TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $
|
|
|
cb8e9e |
#Files are in split-brain, so open should fail
|
|
|
cb8e9e |
TEST ! cat $M0/c
|
|
|
cb8e9e |
TEST ! cat $M1/c
|
|
|
cb8e9e |
-TEST cat $M0/d
|
|
|
cb8e9e |
-TEST cat $M1/d
|
|
|
cb8e9e |
+TEST ! cat $M0/d
|
|
|
cb8e9e |
+TEST ! cat $M1/d
|
|
|
cb8e9e |
|
|
|
cb8e9e |
TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/c
|
|
|
cb8e9e |
TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/d
|
|
|
cb8e9e |
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
|
|
|
cb8e9e |
index 7150f0f..9129b27 100644
|
|
|
cb8e9e |
--- a/xlators/cluster/afr/src/afr-common.c
|
|
|
cb8e9e |
+++ b/xlators/cluster/afr/src/afr-common.c
|
|
|
cb8e9e |
@@ -341,6 +341,58 @@ out:
|
|
|
cb8e9e |
}
|
|
|
cb8e9e |
|
|
|
cb8e9e |
int
|
|
|
cb8e9e |
+afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,
|
|
|
cb8e9e |
+ unsigned char *readable, int *event_p, int type)
|
|
|
cb8e9e |
+{
|
|
|
cb8e9e |
+
|
|
|
cb8e9e |
+ afr_private_t *priv = this->private;
|
|
|
cb8e9e |
+ afr_local_t *local = frame->local;
|
|
|
cb8e9e |
+ unsigned char *data = alloca0 (priv->child_count);
|
|
|
cb8e9e |
+ unsigned char *metadata = alloca0 (priv->child_count);
|
|
|
cb8e9e |
+ int data_count = 0;
|
|
|
cb8e9e |
+ int metadata_count = 0;
|
|
|
cb8e9e |
+ int event_generation = 0;
|
|
|
cb8e9e |
+ int ret = 0;
|
|
|
cb8e9e |
+
|
|
|
cb8e9e |
+ /* We don't care about split-brains for entry transactions. */
|
|
|
cb8e9e |
+ if (type == AFR_ENTRY_TRANSACTION || type == AFR_ENTRY_RENAME_TRANSACTION)
|
|
|
cb8e9e |
+ return 0;
|
|
|
cb8e9e |
+
|
|
|
cb8e9e |
+ ret = afr_inode_read_subvol_get (inode, this, data, metadata,
|
|
|
cb8e9e |
+ &event_generation);
|
|
|
cb8e9e |
+ if (ret == -1)
|
|
|
cb8e9e |
+ return -EIO;
|
|
|
cb8e9e |
+
|
|
|
cb8e9e |
+ data_count = AFR_COUNT (data, priv->child_count);
|
|
|
cb8e9e |
+ metadata_count = AFR_COUNT (metadata, priv->child_count);
|
|
|
cb8e9e |
+
|
|
|
cb8e9e |
+ if (inode->ia_type == IA_IFDIR) {
|
|
|
cb8e9e |
+ /* For directories, allow even if it is in data split-brain. */
|
|
|
cb8e9e |
+ if (type == AFR_METADATA_TRANSACTION) {
|
|
|
cb8e9e |
+ if (!metadata_count)
|
|
|
cb8e9e |
+ return -EIO;
|
|
|
cb8e9e |
+ }
|
|
|
cb8e9e |
+ } else {
|
|
|
cb8e9e |
+ /* For files, abort in case of data/metadata split-brain. */
|
|
|
cb8e9e |
+ if (!data_count || !metadata_count)
|
|
|
cb8e9e |
+ return -EIO;
|
|
|
cb8e9e |
+ }
|
|
|
cb8e9e |
+
|
|
|
cb8e9e |
+ if (type == AFR_METADATA_TRANSACTION && readable)
|
|
|
cb8e9e |
+ memcpy (readable, metadata, priv->child_count * sizeof *metadata);
|
|
|
cb8e9e |
+ if (type == AFR_DATA_TRANSACTION && readable) {
|
|
|
cb8e9e |
+ if (!data_count)
|
|
|
cb8e9e |
+ memcpy (readable, local->child_up,
|
|
|
cb8e9e |
+ priv->child_count * sizeof *readable);
|
|
|
cb8e9e |
+ else
|
|
|
cb8e9e |
+ memcpy (readable, data, priv->child_count * sizeof *data);
|
|
|
cb8e9e |
+ }
|
|
|
cb8e9e |
+ if (event_p)
|
|
|
cb8e9e |
+ *event_p = event_generation;
|
|
|
cb8e9e |
+ return 0;
|
|
|
cb8e9e |
+}
|
|
|
cb8e9e |
+
|
|
|
cb8e9e |
+int
|
|
|
cb8e9e |
afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,
|
|
|
cb8e9e |
int *spb_choice)
|
|
|
cb8e9e |
{
|
|
|
cb8e9e |
@@ -598,6 +650,8 @@ afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
|
|
|
cb8e9e |
for (i = 0; i < priv->child_count; i++) {
|
|
|
cb8e9e |
if (data_accused[i])
|
|
|
cb8e9e |
continue;
|
|
|
cb8e9e |
+ if ((priv->arbiter_count == 1) && (i == ARBITER_BRICK_INDEX))
|
|
|
cb8e9e |
+ continue;
|
|
|
cb8e9e |
if (replies[i].poststat.ia_size < maxsize)
|
|
|
cb8e9e |
data_accused[i] = 1;
|
|
|
cb8e9e |
}
|
|
|
cb8e9e |
@@ -1682,6 +1736,10 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
cb8e9e |
* the slowest local subvolume is far preferable to a remote one.
|
|
|
cb8e9e |
*/
|
|
|
cb8e9e |
if (is_local) {
|
|
|
cb8e9e |
+ /* Don't set arbiter as read child. */
|
|
|
cb8e9e |
+ if ((priv->arbiter_count == 1) &&
|
|
|
cb8e9e |
+ (child_index == ARBITER_BRICK_INDEX))
|
|
|
cb8e9e |
+ goto out;
|
|
|
cb8e9e |
gf_log (this->name, GF_LOG_INFO,
|
|
|
cb8e9e |
"selecting local read_child %s",
|
|
|
cb8e9e |
priv->children[child_index]->name);
|
|
|
cb8e9e |
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
|
|
|
cb8e9e |
index 6121108..6e54549 100644
|
|
|
cb8e9e |
--- a/xlators/cluster/afr/src/afr-read-txn.c
|
|
|
cb8e9e |
+++ b/xlators/cluster/afr/src/afr-read-txn.c
|
|
|
cb8e9e |
@@ -52,6 +52,9 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
|
|
|
cb8e9e |
local->op_ret = ret; \
|
|
|
cb8e9e |
local->op_errno = errnum; \
|
|
|
cb8e9e |
read_subvol = index; \
|
|
|
cb8e9e |
+ gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_SPLIT_BRAIN,\
|
|
|
cb8e9e |
+ "Failing %s on gfid %s: split-brain observed.",\
|
|
|
cb8e9e |
+ gf_fop_list[local->op], uuid_utoa (inode->gfid));\
|
|
|
cb8e9e |
goto label; \
|
|
|
cb8e9e |
} while (0)
|
|
|
cb8e9e |
|
|
|
cb8e9e |
@@ -59,7 +62,6 @@ int
|
|
|
cb8e9e |
afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
|
|
|
cb8e9e |
{
|
|
|
cb8e9e |
afr_local_t *local = NULL;
|
|
|
cb8e9e |
- afr_private_t *priv = NULL;
|
|
|
cb8e9e |
int read_subvol = 0;
|
|
|
cb8e9e |
int event_generation = 0;
|
|
|
cb8e9e |
inode_t *inode = NULL;
|
|
|
cb8e9e |
@@ -68,27 +70,19 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
|
|
|
cb8e9e |
|
|
|
cb8e9e |
local = frame->local;
|
|
|
cb8e9e |
inode = local->inode;
|
|
|
cb8e9e |
- priv = frame->this->private;
|
|
|
cb8e9e |
|
|
|
cb8e9e |
if (err)
|
|
|
cb8e9e |
AFR_READ_TXN_SET_ERROR_AND_GOTO (-1, -err, -1, readfn);
|
|
|
cb8e9e |
|
|
|
cb8e9e |
- ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
|
|
|
cb8e9e |
- &event_generation,
|
|
|
cb8e9e |
- local->transaction.type);
|
|
|
cb8e9e |
+ ret = afr_inode_get_readable (frame, inode, this, local->readable,
|
|
|
cb8e9e |
+ &event_generation,
|
|
|
cb8e9e |
+ local->transaction.type);
|
|
|
cb8e9e |
|
|
|
cb8e9e |
if (ret == -1 || !event_generation)
|
|
|
cb8e9e |
/* Even after refresh, we don't have a good
|
|
|
cb8e9e |
read subvolume. Time to bail */
|
|
|
cb8e9e |
AFR_READ_TXN_SET_ERROR_AND_GOTO (-1, EIO, -1, readfn);
|
|
|
cb8e9e |
|
|
|
cb8e9e |
- /* For directories in split-brain, we need to allow all fops
|
|
|
cb8e9e |
- * except (f)getxattr and access. */
|
|
|
cb8e9e |
- if (!AFR_COUNT(local->readable, priv->child_count) &&
|
|
|
cb8e9e |
- local->transaction.type == AFR_DATA_TRANSACTION &&
|
|
|
cb8e9e |
- inode->ia_type == IA_IFDIR)
|
|
|
cb8e9e |
- memcpy (local->readable, local->child_up, priv->child_count);
|
|
|
cb8e9e |
-
|
|
|
cb8e9e |
read_subvol = afr_read_subvol_select_by_policy (inode, this,
|
|
|
cb8e9e |
local->readable, NULL);
|
|
|
cb8e9e |
if (read_subvol == -1)
|
|
|
cb8e9e |
@@ -237,8 +231,8 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
|
|
|
cb8e9e |
if (read_subvol < 0 || read_subvol > priv->child_count) {
|
|
|
cb8e9e |
gf_msg (this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN,
|
|
|
cb8e9e |
"Unreadable subvolume %d found with event generation "
|
|
|
cb8e9e |
- "%d. (Possible split-brain)",
|
|
|
cb8e9e |
- read_subvol, event_generation);
|
|
|
cb8e9e |
+ "%d for gfid %s. (Possible split-brain)",
|
|
|
cb8e9e |
+ read_subvol, event_generation, uuid_utoa(inode->gfid));
|
|
|
cb8e9e |
goto refresh;
|
|
|
cb8e9e |
}
|
|
|
cb8e9e |
|
|
|
cb8e9e |
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
|
|
|
cb8e9e |
index a202388..b27cfed 100644
|
|
|
cb8e9e |
--- a/xlators/cluster/afr/src/afr-transaction.c
|
|
|
cb8e9e |
+++ b/xlators/cluster/afr/src/afr-transaction.c
|
|
|
cb8e9e |
@@ -1967,6 +1967,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
|
|
|
cb8e9e |
if (ret < 0)
|
|
|
cb8e9e |
goto out;
|
|
|
cb8e9e |
|
|
|
cb8e9e |
+ ret = afr_inode_get_readable (frame, local->inode, this, 0, 0, type);
|
|
|
cb8e9e |
+ if (ret) {
|
|
|
cb8e9e |
+ gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_SPLIT_BRAIN,
|
|
|
cb8e9e |
+ "Failing %s on gfid %s: split-brain observed.",
|
|
|
cb8e9e |
+ gf_fop_list[local->op], uuid_utoa (local->inode->gfid));
|
|
|
cb8e9e |
+ goto out;
|
|
|
cb8e9e |
+ }
|
|
|
cb8e9e |
afr_transaction_eager_lock_init (local, this);
|
|
|
cb8e9e |
|
|
|
cb8e9e |
if (local->fd && local->transaction.eager_lock_on)
|
|
|
cb8e9e |
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
|
|
|
cb8e9e |
index e6e7f3e..729ae82 100644
|
|
|
cb8e9e |
--- a/xlators/cluster/afr/src/afr.h
|
|
|
cb8e9e |
+++ b/xlators/cluster/afr/src/afr.h
|
|
|
cb8e9e |
@@ -766,6 +766,9 @@ typedef struct afr_read_subvol_args {
|
|
|
cb8e9e |
(op_errno == EBADFD)))
|
|
|
cb8e9e |
|
|
|
cb8e9e |
int
|
|
|
cb8e9e |
+afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,
|
|
|
cb8e9e |
+ unsigned char *readable, int *event_p, int type);
|
|
|
cb8e9e |
+int
|
|
|
cb8e9e |
afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
|
|
|
cb8e9e |
unsigned char *data_subvols,
|
|
|
cb8e9e |
unsigned char *metadata_subvols,
|
|
|
cb8e9e |
--
|
|
|
cb8e9e |
1.7.1
|
|
|
cb8e9e |
|