From 4c47d6dd7c5ddcaa2a1e159427c0f6713fd33907 Mon Sep 17 00:00:00 2001 From: karthik-us Date: Wed, 23 Dec 2020 14:57:51 +0530 Subject: [PATCH 514/517] afr: event gen changes The general idea of the changes is to prevent resetting event generation to zero in the inode ctx, since event gen is something that should follow 'causal order'. Change #1: For a read txn, in inode refresh cbk, if event_generation is found zero, we are failing the read fop. This is not needed because change in event gen is only a marker for the next inode refresh to happen and should not be taken into account by the current read txn. Change #2: The event gen being zero above can happen if there is a racing lookup, which resets even get (in afr_lookup_done) if there are non zero afr xattrs. The resetting is done only to trigger an inode refresh and a possible client side heal on the next lookup. That can be acheived by setting the need_refresh flag in the inode ctx. So replaced all occurences of resetting even gen to zero with a call to afr_inode_need_refresh_set(). Change #3: In both lookup and discover path, we are doing an inode refresh which is not required since all 3 essentially do the same thing- update the inode ctx with the good/bad copies from the brick replies. Inode refresh also triggers background heals, but I think it is okay to do it when we call refresh during the read and write txns and not in the lookup path. The .ts which relied on inode refresh in lookup path to trigger heals are now changed to do read txn so that inode refresh and the heal happens. Upstream patch details: > Change-Id: Iebf39a9be6ffd7ffd6e4046c96b0fa78ade6c5ec > Fixes: #1179 > Signed-off-by: Ravishankar N > Reported-by: Erik Jacobson Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24316/ BUG: 1640148 Change-Id: Iebf39a9be6ffd7ffd6e4046c96b0fa78ade6c5ec Signed-off-by: karthik-us Reviewed-on: https://code.engineering.redhat.com/gerrit/222074 Tested-by: RHGS Build Bot Reviewed-by: Ravishankar Narayanankutty --- ...fid-mismatch-resolution-with-fav-child-policy.t | 8 +- xlators/cluster/afr/src/afr-common.c | 92 +++++----------------- xlators/cluster/afr/src/afr-dir-write.c | 6 +- xlators/cluster/afr/src/afr.h | 5 +- 4 files changed, 29 insertions(+), 82 deletions(-) diff --git a/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t b/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t index f4aa351..12af0c8 100644 --- a/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t +++ b/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t @@ -168,8 +168,8 @@ TEST [ "$gfid_1" != "$gfid_2" ] #We know that second brick has the bigger size file BIGGER_FILE_MD5=$(md5sum $B0/${V0}1/f3 | cut -d\ -f1) -TEST ls $M0/f3 -TEST cat $M0/f3 +TEST ls $M0 #Trigger entry heal via readdir inode refresh +TEST cat $M0/f3 #Trigger data heal via readv inode refresh EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 #gfid split-brain should be resolved @@ -215,8 +215,8 @@ TEST $CLI volume start $V0 force EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2 -TEST ls $M0/f4 -TEST cat $M0/f4 +TEST ls $M0 #Trigger entry heal via readdir inode refresh +TEST cat $M0/f4 #Trigger data heal via readv inode refresh EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 #gfid split-brain should be resolved diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index fca2cd5..90b4f14 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -284,7 +284,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, metadatamap |= (1 << index); } if (metadatamap_old != metadatamap) { - event = 0; + __afr_inode_need_refresh_set(inode, this); } break; @@ -297,7 +297,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, datamap |= (1 << index); } if (datamap_old != datamap) - event = 0; + __afr_inode_need_refresh_set(inode, this); break; default: @@ -461,34 +461,6 @@ out: } int -__afr_inode_event_gen_reset_small(inode_t *inode, xlator_t *this) -{ - int ret = -1; - uint16_t datamap = 0; - uint16_t metadatamap = 0; - uint32_t event = 0; - uint64_t val = 0; - afr_inode_ctx_t *ctx = NULL; - - ret = __afr_inode_ctx_get(this, inode, &ctx); - if (ret) - return ret; - - val = ctx->read_subvol; - - metadatamap = (val & 0x000000000000ffff) >> 0; - datamap = (val & 0x00000000ffff0000) >> 16; - event = 0; - - val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | - (((uint64_t)event) << 32); - - ctx->read_subvol = val; - - return ret; -} - -int __afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, int *event_p) { @@ -559,22 +531,6 @@ out: } int -__afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) -{ - afr_private_t *priv = NULL; - int ret = -1; - - priv = this->private; - - if (priv->child_count <= 16) - ret = __afr_inode_event_gen_reset_small(inode, this); - else - ret = -1; - - return ret; -} - -int afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, int *event_p) { @@ -723,30 +679,22 @@ out: return need_refresh; } -static int -afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) +int +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) { int ret = -1; afr_inode_ctx_t *ctx = NULL; - GF_VALIDATE_OR_GOTO(this->name, inode, out); - - LOCK(&inode->lock); - { - ret = __afr_inode_ctx_get(this, inode, &ctx); - if (ret) - goto unlock; - + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret == 0) { ctx->need_refresh = _gf_true; } -unlock: - UNLOCK(&inode->lock); -out: + return ret; } int -afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) { int ret = -1; @@ -754,7 +702,7 @@ afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) LOCK(&inode->lock); { - ret = __afr_inode_event_gen_reset(inode, this); + ret = __afr_inode_need_refresh_set(inode, this); } UNLOCK(&inode->lock); out: @@ -1191,7 +1139,7 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) ret = afr_inode_get_readable(frame, inode, this, local->readable, &event_generation, local->transaction.type); - if (ret == -EIO || (local->is_read_txn && !event_generation)) { + if (ret == -EIO) { /* No readable subvolume even after refresh ==> splitbrain.*/ if (!priv->fav_child_policy) { err = EIO; @@ -2413,7 +2361,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) if (read_subvol == -1) goto cant_interpret; if (ret) { - afr_inode_event_gen_reset(local->inode, this); + afr_inode_need_refresh_set(local->inode, this); dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY); } } else { @@ -2971,6 +2919,7 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; afr_local_t *local = NULL; int read_subvol = -1; + int ret = 0; unsigned char *data_readable = NULL; unsigned char *success_replies = NULL; @@ -2992,7 +2941,10 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this) if (!afr_has_quorum(success_replies, this, frame)) goto unwind; - afr_replies_interpret(frame, this, local->inode, NULL); + ret = afr_replies_interpret(frame, this, local->inode, NULL); + if (ret) { + afr_inode_need_refresh_set(local->inode, this); + } read_subvol = afr_read_subvol_decide(local->inode, this, NULL, data_readable); @@ -3248,11 +3200,7 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) afr_read_subvol_get(loc->inode, this, NULL, NULL, &event, AFR_DATA_TRANSACTION, NULL); - if (afr_is_inode_refresh_reqd(loc->inode, this, event, - local->event_generation)) - afr_inode_refresh(frame, this, loc->inode, NULL, afr_discover_do); - else - afr_discover_do(frame, this, 0); + afr_discover_do(frame, this, 0); return 0; out: @@ -3393,11 +3341,7 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) afr_read_subvol_get(loc->parent, this, NULL, NULL, &event, AFR_DATA_TRANSACTION, NULL); - if (afr_is_inode_refresh_reqd(loc->inode, this, event, - local->event_generation)) - afr_inode_refresh(frame, this, loc->parent, NULL, afr_lookup_do); - else - afr_lookup_do(frame, this, 0); + afr_lookup_do(frame, this, 0); return 0; out: diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 416c19d..d419bfc 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -123,11 +123,11 @@ __afr_dir_write_finalize(call_frame_t *frame, xlator_t *this) continue; if (local->replies[i].op_ret < 0) { if (local->inode) - afr_inode_event_gen_reset(local->inode, this); + afr_inode_need_refresh_set(local->inode, this); if (local->parent) - afr_inode_event_gen_reset(local->parent, this); + afr_inode_need_refresh_set(local->parent, this); if (local->parent2) - afr_inode_event_gen_reset(local->parent2, this); + afr_inode_need_refresh_set(local->parent2, this); continue; } diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index ed5096e..3a2b26d 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -948,7 +948,10 @@ afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, int event_generation); int -afr_inode_event_gen_reset(inode_t *inode, xlator_t *this); +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); + +int +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); int afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, -- 1.8.3.1