From 37535f45135b5f6f9b62c8e011b96fc0b90743f7 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Fri, 26 Jun 2015 17:29:20 +0530 Subject: [PATCH 148/190] afr: Block fops when file is in split-brain Patch in master: http://review.gluster.org/#/c/11371/ Patch in release-3.7: http://review.gluster.org/#/c/11420/ For directories, block metadata FOPS. For non-directories, block data and metadata FOPS. Do not block entry FOPS. Change-Id: I5d44bd8ece08ec683f930d797541eae719257857 BUG: 1223738 Signed-off-by: Ravishankar N Reviewed-on: https://code.engineering.redhat.com/gerrit/51691 --- tests/bugs/glusterfs/bug-873962.t | 8 ++-- xlators/cluster/afr/src/afr-common.c | 58 +++++++++++++++++++++++++++++ xlators/cluster/afr/src/afr-read-txn.c | 22 ++++------- xlators/cluster/afr/src/afr-transaction.c | 7 +++ xlators/cluster/afr/src/afr.h | 3 + 5 files changed, 80 insertions(+), 18 deletions(-) diff --git a/tests/bugs/glusterfs/bug-873962.t b/tests/bugs/glusterfs/bug-873962.t index 492d028..7faa999 100755 --- a/tests/bugs/glusterfs/bug-873962.t +++ b/tests/bugs/glusterfs/bug-873962.t @@ -65,8 +65,8 @@ TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $ #Files are in split-brain, so open should fail TEST ! cat $M0/a; TEST ! cat $M1/a; -TEST cat $M0/b; -TEST cat $M1/b; +TEST ! cat $M0/b; +TEST ! cat $M1/b; #Reset split-brain status TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/a; @@ -92,8 +92,8 @@ TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $ #Files are in split-brain, so open should fail TEST ! cat $M0/c TEST ! cat $M1/c -TEST cat $M0/d -TEST cat $M1/d +TEST ! cat $M0/d +TEST ! cat $M1/d TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/c TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/d diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 7150f0f..9129b27 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -341,6 +341,58 @@ out: } int +afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type) +{ + + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + unsigned char *data = alloca0 (priv->child_count); + unsigned char *metadata = alloca0 (priv->child_count); + int data_count = 0; + int metadata_count = 0; + int event_generation = 0; + int ret = 0; + + /* We don't care about split-brains for entry transactions. */ + if (type == AFR_ENTRY_TRANSACTION || type == AFR_ENTRY_RENAME_TRANSACTION) + return 0; + + ret = afr_inode_read_subvol_get (inode, this, data, metadata, + &event_generation); + if (ret == -1) + return -EIO; + + data_count = AFR_COUNT (data, priv->child_count); + metadata_count = AFR_COUNT (metadata, priv->child_count); + + if (inode->ia_type == IA_IFDIR) { + /* For directories, allow even if it is in data split-brain. */ + if (type == AFR_METADATA_TRANSACTION) { + if (!metadata_count) + return -EIO; + } + } else { + /* For files, abort in case of data/metadata split-brain. */ + if (!data_count || !metadata_count) + return -EIO; + } + + if (type == AFR_METADATA_TRANSACTION && readable) + memcpy (readable, metadata, priv->child_count * sizeof *metadata); + if (type == AFR_DATA_TRANSACTION && readable) { + if (!data_count) + memcpy (readable, local->child_up, + priv->child_count * sizeof *readable); + else + memcpy (readable, data, priv->child_count * sizeof *data); + } + if (event_p) + *event_p = event_generation; + return 0; +} + +int afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, int *spb_choice) { @@ -598,6 +650,8 @@ afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies, for (i = 0; i < priv->child_count; i++) { if (data_accused[i]) continue; + if ((priv->arbiter_count == 1) && (i == ARBITER_BRICK_INDEX)) + continue; if (replies[i].poststat.ia_size < maxsize) data_accused[i] = 1; } @@ -1682,6 +1736,10 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * the slowest local subvolume is far preferable to a remote one. */ if (is_local) { + /* Don't set arbiter as read child. */ + if ((priv->arbiter_count == 1) && + (child_index == ARBITER_BRICK_INDEX)) + goto out; gf_log (this->name, GF_LOG_INFO, "selecting local read_child %s", priv->children[child_index]->name); diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 6121108..6e54549 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -52,6 +52,9 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this) local->op_ret = ret; \ local->op_errno = errnum; \ read_subvol = index; \ + gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_SPLIT_BRAIN,\ + "Failing %s on gfid %s: split-brain observed.",\ + gf_fop_list[local->op], uuid_utoa (inode->gfid));\ goto label; \ } while (0) @@ -59,7 +62,6 @@ int afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) { afr_local_t *local = NULL; - afr_private_t *priv = NULL; int read_subvol = 0; int event_generation = 0; inode_t *inode = NULL; @@ -68,27 +70,19 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) local = frame->local; inode = local->inode; - priv = frame->this->private; if (err) AFR_READ_TXN_SET_ERROR_AND_GOTO (-1, -err, -1, readfn); - ret = afr_inode_read_subvol_type_get (inode, this, local->readable, - &event_generation, - local->transaction.type); + ret = afr_inode_get_readable (frame, inode, this, local->readable, + &event_generation, + local->transaction.type); if (ret == -1 || !event_generation) /* Even after refresh, we don't have a good read subvolume. Time to bail */ AFR_READ_TXN_SET_ERROR_AND_GOTO (-1, EIO, -1, readfn); - /* For directories in split-brain, we need to allow all fops - * except (f)getxattr and access. */ - if (!AFR_COUNT(local->readable, priv->child_count) && - local->transaction.type == AFR_DATA_TRANSACTION && - inode->ia_type == IA_IFDIR) - memcpy (local->readable, local->child_up, priv->child_count); - read_subvol = afr_read_subvol_select_by_policy (inode, this, local->readable, NULL); if (read_subvol == -1) @@ -237,8 +231,8 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, if (read_subvol < 0 || read_subvol > priv->child_count) { gf_msg (this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN, "Unreadable subvolume %d found with event generation " - "%d. (Possible split-brain)", - read_subvol, event_generation); + "%d for gfid %s. (Possible split-brain)", + read_subvol, event_generation, uuid_utoa(inode->gfid)); goto refresh; } diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index a202388..b27cfed 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1967,6 +1967,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) if (ret < 0) goto out; + ret = afr_inode_get_readable (frame, local->inode, this, 0, 0, type); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_SPLIT_BRAIN, + "Failing %s on gfid %s: split-brain observed.", + gf_fop_list[local->op], uuid_utoa (local->inode->gfid)); + goto out; + } afr_transaction_eager_lock_init (local, this); if (local->fd && local->transaction.eager_lock_on) diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index e6e7f3e..729ae82 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -766,6 +766,9 @@ typedef struct afr_read_subvol_args { (op_errno == EBADFD))) int +afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type); +int afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data_subvols, unsigned char *metadata_subvols, -- 1.7.1