From a06d455f2f3ad30e7aab2a6ac9594281dd1386c7 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Wed, 8 Jul 2015 17:52:11 +0530 Subject: [PATCH 236/244] cluster/ec: Prevent data corruptions Backport of http://review.gluster.org/11580 Backport of http://review.gluster.com/11640 - Don't read from bricks that are healing - On lock reuse preserve 'healing' bits - Don't set ctx->size outside locks in healing code - Allow xattrop internal fops also on the fop->mask. Change-Id: I35503039e4723cf7f33d6797f0ba90dd0aca130b BUG: 1230513 Signed-off-by: Pranith Kumar K Reviewed-on: https://code.engineering.redhat.com/gerrit/52663 --- xlators/cluster/ec/src/ec-common.c | 25 +++++++++++++++++++++---- xlators/cluster/ec/src/ec-data.h | 1 + xlators/cluster/ec/src/ec-heal.c | 22 +++++++++++----------- xlators/cluster/ec/src/ec.c | 2 +- 4 files changed, 34 insertions(+), 16 deletions(-) diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 439773e..b2d0348 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -401,6 +401,18 @@ ec_must_wind (ec_fop_data_t *fop) return _gf_false; } +static gf_boolean_t +ec_internal_op (ec_fop_data_t *fop) +{ + if (ec_must_wind (fop)) + return _gf_true; + if (fop->id == GF_FOP_XATTROP) + return _gf_true; + if (fop->id == GF_FOP_FXATTROP) + return _gf_true; + return _gf_false; +} + int32_t ec_child_select(ec_fop_data_t * fop) { ec_t * ec = fop->xl->private; @@ -413,8 +425,9 @@ int32_t ec_child_select(ec_fop_data_t * fop) /* Wind the fop on same subvols as parent for any internal extra fops like * head/tail read in case of writev fop. Unlocks shouldn't do this because * unlock should go on all subvols where lock is performed*/ - if (fop->parent && !ec_must_wind (fop)) - fop->mask &= fop->parent->mask; + if (fop->parent && !ec_internal_op (fop)) { + fop->mask &= (fop->parent->mask & ~fop->parent->healing); + } mask = ec->xl_up; if (fop->parent == NULL) @@ -972,6 +985,7 @@ out: parent->mask &= fop->good; /*As of now only data healing marks bricks as healing*/ + lock->healing |= fop->healing; if (ec_is_data_fop (parent->id)) { parent->healing |= fop->healing; } @@ -996,9 +1010,13 @@ void ec_get_size_version(ec_lock_link_t *link) lock = link->lock; ctx = lock->ctx; + fop = link->fop; /* If ec metadata has already been retrieved, do not try again. */ if (ctx->have_info) { + if (ec_is_data_fop (fop->id)) { + fop->healing |= lock->healing; + } return; } @@ -1008,8 +1026,6 @@ void ec_get_size_version(ec_lock_link_t *link) return; } - fop = link->fop; - uid = fop->frame->root->uid; gid = fop->frame->root->gid; @@ -1274,6 +1290,7 @@ int32_t ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, link = fop->data; lock = link->lock; lock->mask = lock->good_mask = fop->good; + lock->healing = 0; ec_lock_acquired(link); ec_lock(fop->parent); diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index 135ccdf..ec470e9 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -146,6 +146,7 @@ struct _ec_lock the next unlock/lock cycle. */ uintptr_t mask; uintptr_t good_mask; + uintptr_t healing; int32_t refs; int32_t refs_frozen; int32_t inserted; diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index 6f82203..a7c97a5 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -256,10 +256,13 @@ int32_t ec_heal_readv_cbk(call_frame_t * frame, void * cookie, xlator_t * this, } else { - gf_msg_debug (fop->xl->name, 0, "%s: read failed %s, failing " - "to heal block at %"PRIu64, - uuid_utoa (heal->fd->inode->gfid), strerror (op_errno), - heal->offset); + if (op_ret < 0) { + gf_msg_debug (fop->xl->name, 0, "%s: read failed %s, failing " + "to heal block at %"PRIu64, + uuid_utoa (heal->fd->inode->gfid), strerror (op_errno), + heal->offset); + heal->bad = 0; + } heal->done = 1; } @@ -1709,10 +1712,7 @@ ec_manager_heal_block (ec_fop_data_t *fop, int32_t state) case EC_STATE_HEAL_DATA_UNLOCK: ec_heal_inodelk(heal, F_UNLCK, 1, 0, 0); - if (state < 0) - return -EC_STATE_REPORT; - else - return EC_STATE_REPORT; + return EC_STATE_REPORT; case EC_STATE_REPORT: if (fop->cbks.heal) { @@ -1728,7 +1728,7 @@ ec_manager_heal_block (ec_fop_data_t *fop, int32_t state) EIO, 0, 0, 0, NULL); } - return -EC_STATE_END; + return EC_STATE_END; default: gf_msg (fop->xl->name, GF_LOG_ERROR, 0, EC_MSG_UNHANDLED_STATE, "Unhandled state %d for %s", @@ -1759,8 +1759,6 @@ ec_heal_block (call_frame_t *frame, xlator_t *this, uintptr_t target, if (fop == NULL) goto out; - GF_ASSERT(ec_set_inode_size(fop, heal->fd->inode, heal->total_size)); - error = 0; out: @@ -1834,6 +1832,8 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size, break; } + memset (healed_sinks, 0, ec->nodes); + ec_mask_to_char_array (heal->bad, healed_sinks, ec->nodes); fd_unref (heal->fd); LOCK_DESTROY (&heal->lock); syncbarrier_destroy (heal->data); diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index e28f402..4673c11 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -1303,7 +1303,7 @@ struct volume_options options[] = .type = GF_OPTION_TYPE_INT, .min = 0,/*Disabling background heals*/ .max = 256, - .default_value = "8", + .default_value = "0", .description = "This option can be used to control number of parallel" " heals", }, -- 1.7.1