cb8e9e
From a06d455f2f3ad30e7aab2a6ac9594281dd1386c7 Mon Sep 17 00:00:00 2001
cb8e9e
From: Pranith Kumar K <pkarampu@redhat.com>
cb8e9e
Date: Wed, 8 Jul 2015 17:52:11 +0530
cb8e9e
Subject: [PATCH 236/244] cluster/ec: Prevent data corruptions
cb8e9e
cb8e9e
        Backport of http://review.gluster.org/11580
cb8e9e
        Backport of http://review.gluster.com/11640
cb8e9e
cb8e9e
- Don't read from bricks that are healing
cb8e9e
- On lock reuse preserve 'healing' bits
cb8e9e
- Don't set ctx->size outside locks in healing code
cb8e9e
- Allow xattrop internal fops also on the fop->mask.
cb8e9e
cb8e9e
Change-Id: I35503039e4723cf7f33d6797f0ba90dd0aca130b
cb8e9e
BUG: 1230513
cb8e9e
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
cb8e9e
Reviewed-on: https://code.engineering.redhat.com/gerrit/52663
cb8e9e
---
cb8e9e
 xlators/cluster/ec/src/ec-common.c |   25 +++++++++++++++++++++----
cb8e9e
 xlators/cluster/ec/src/ec-data.h   |    1 +
cb8e9e
 xlators/cluster/ec/src/ec-heal.c   |   22 +++++++++++-----------
cb8e9e
 xlators/cluster/ec/src/ec.c        |    2 +-
cb8e9e
 4 files changed, 34 insertions(+), 16 deletions(-)
cb8e9e
cb8e9e
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
cb8e9e
index 439773e..b2d0348 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec-common.c
cb8e9e
+++ b/xlators/cluster/ec/src/ec-common.c
cb8e9e
@@ -401,6 +401,18 @@ ec_must_wind (ec_fop_data_t *fop)
cb8e9e
         return _gf_false;
cb8e9e
 }
cb8e9e
 
cb8e9e
+static gf_boolean_t
cb8e9e
+ec_internal_op (ec_fop_data_t *fop)
cb8e9e
+{
cb8e9e
+        if (ec_must_wind (fop))
cb8e9e
+                return _gf_true;
cb8e9e
+        if (fop->id == GF_FOP_XATTROP)
cb8e9e
+                return _gf_true;
cb8e9e
+        if (fop->id == GF_FOP_FXATTROP)
cb8e9e
+                return _gf_true;
cb8e9e
+        return _gf_false;
cb8e9e
+}
cb8e9e
+
cb8e9e
 int32_t ec_child_select(ec_fop_data_t * fop)
cb8e9e
 {
cb8e9e
     ec_t * ec = fop->xl->private;
cb8e9e
@@ -413,8 +425,9 @@ int32_t ec_child_select(ec_fop_data_t * fop)
cb8e9e
     /* Wind the fop on same subvols as parent for any internal extra fops like
cb8e9e
      * head/tail read in case of writev fop. Unlocks shouldn't do this because
cb8e9e
      * unlock should go on all subvols where lock is performed*/
cb8e9e
-    if (fop->parent && !ec_must_wind (fop))
cb8e9e
-            fop->mask &= fop->parent->mask;
cb8e9e
+    if (fop->parent && !ec_internal_op (fop)) {
cb8e9e
+            fop->mask &= (fop->parent->mask & ~fop->parent->healing);
cb8e9e
+    }
cb8e9e
 
cb8e9e
     mask = ec->xl_up;
cb8e9e
     if (fop->parent == NULL)
cb8e9e
@@ -972,6 +985,7 @@ out:
cb8e9e
         parent->mask &= fop->good;
cb8e9e
 
cb8e9e
         /*As of now only data healing marks bricks as healing*/
cb8e9e
+        lock->healing |= fop->healing;
cb8e9e
         if (ec_is_data_fop (parent->id)) {
cb8e9e
             parent->healing |= fop->healing;
cb8e9e
         }
cb8e9e
@@ -996,9 +1010,13 @@ void ec_get_size_version(ec_lock_link_t *link)
cb8e9e
 
cb8e9e
     lock = link->lock;
cb8e9e
     ctx = lock->ctx;
cb8e9e
+    fop = link->fop;
cb8e9e
 
cb8e9e
     /* If ec metadata has already been retrieved, do not try again. */
cb8e9e
     if (ctx->have_info) {
cb8e9e
+        if (ec_is_data_fop (fop->id)) {
cb8e9e
+            fop->healing |= lock->healing;
cb8e9e
+        }
cb8e9e
         return;
cb8e9e
     }
cb8e9e
 
cb8e9e
@@ -1008,8 +1026,6 @@ void ec_get_size_version(ec_lock_link_t *link)
cb8e9e
         return;
cb8e9e
     }
cb8e9e
 
cb8e9e
-    fop = link->fop;
cb8e9e
-
cb8e9e
     uid = fop->frame->root->uid;
cb8e9e
     gid = fop->frame->root->gid;
cb8e9e
 
cb8e9e
@@ -1274,6 +1290,7 @@ int32_t ec_locked(call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
         link = fop->data;
cb8e9e
         lock = link->lock;
cb8e9e
         lock->mask = lock->good_mask = fop->good;
cb8e9e
+        lock->healing = 0;
cb8e9e
 
cb8e9e
         ec_lock_acquired(link);
cb8e9e
         ec_lock(fop->parent);
cb8e9e
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
cb8e9e
index 135ccdf..ec470e9 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec-data.h
cb8e9e
+++ b/xlators/cluster/ec/src/ec-data.h
cb8e9e
@@ -146,6 +146,7 @@ struct _ec_lock
cb8e9e
                                    the next unlock/lock cycle. */
cb8e9e
     uintptr_t          mask;
cb8e9e
     uintptr_t          good_mask;
cb8e9e
+    uintptr_t          healing;
cb8e9e
     int32_t            refs;
cb8e9e
     int32_t            refs_frozen;
cb8e9e
     int32_t            inserted;
cb8e9e
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
cb8e9e
index 6f82203..a7c97a5 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec-heal.c
cb8e9e
+++ b/xlators/cluster/ec/src/ec-heal.c
cb8e9e
@@ -256,10 +256,13 @@ int32_t ec_heal_readv_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
cb8e9e
     }
cb8e9e
     else
cb8e9e
     {
cb8e9e
-            gf_msg_debug (fop->xl->name, 0, "%s: read failed %s, failing "
cb8e9e
-                    "to heal block at %"PRIu64,
cb8e9e
-                    uuid_utoa (heal->fd->inode->gfid), strerror (op_errno),
cb8e9e
-                    heal->offset);
cb8e9e
+        if (op_ret < 0) {
cb8e9e
+                gf_msg_debug (fop->xl->name, 0, "%s: read failed %s, failing "
cb8e9e
+                        "to heal block at %"PRIu64,
cb8e9e
+                        uuid_utoa (heal->fd->inode->gfid), strerror (op_errno),
cb8e9e
+                        heal->offset);
cb8e9e
+                heal->bad = 0;
cb8e9e
+        }
cb8e9e
         heal->done = 1;
cb8e9e
     }
cb8e9e
 
cb8e9e
@@ -1709,10 +1712,7 @@ ec_manager_heal_block (ec_fop_data_t *fop, int32_t state)
cb8e9e
     case EC_STATE_HEAL_DATA_UNLOCK:
cb8e9e
         ec_heal_inodelk(heal, F_UNLCK, 1, 0, 0);
cb8e9e
 
cb8e9e
-        if (state < 0)
cb8e9e
-                return -EC_STATE_REPORT;
cb8e9e
-        else
cb8e9e
-                return EC_STATE_REPORT;
cb8e9e
+         return EC_STATE_REPORT;
cb8e9e
 
cb8e9e
     case EC_STATE_REPORT:
cb8e9e
         if (fop->cbks.heal) {
cb8e9e
@@ -1728,7 +1728,7 @@ ec_manager_heal_block (ec_fop_data_t *fop, int32_t state)
cb8e9e
                             EIO, 0, 0, 0, NULL);
cb8e9e
         }
cb8e9e
 
cb8e9e
-        return -EC_STATE_END;
cb8e9e
+        return EC_STATE_END;
cb8e9e
     default:
cb8e9e
         gf_msg (fop->xl->name, GF_LOG_ERROR, 0,
cb8e9e
                 EC_MSG_UNHANDLED_STATE, "Unhandled state %d for %s",
cb8e9e
@@ -1759,8 +1759,6 @@ ec_heal_block (call_frame_t *frame, xlator_t *this, uintptr_t target,
cb8e9e
     if (fop == NULL)
cb8e9e
         goto out;
cb8e9e
 
cb8e9e
-    GF_ASSERT(ec_set_inode_size(fop, heal->fd->inode, heal->total_size));
cb8e9e
-
cb8e9e
     error = 0;
cb8e9e
 
cb8e9e
 out:
cb8e9e
@@ -1834,6 +1832,8 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
cb8e9e
                         break;
cb8e9e
 
cb8e9e
         }
cb8e9e
+        memset (healed_sinks, 0, ec->nodes);
cb8e9e
+        ec_mask_to_char_array (heal->bad, healed_sinks, ec->nodes);
cb8e9e
         fd_unref (heal->fd);
cb8e9e
         LOCK_DESTROY (&heal->lock);
cb8e9e
         syncbarrier_destroy (heal->data);
cb8e9e
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
cb8e9e
index e28f402..4673c11 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec.c
cb8e9e
+++ b/xlators/cluster/ec/src/ec.c
cb8e9e
@@ -1303,7 +1303,7 @@ struct volume_options options[] =
cb8e9e
       .type = GF_OPTION_TYPE_INT,
cb8e9e
       .min = 0,/*Disabling background heals*/
cb8e9e
       .max = 256,
cb8e9e
-      .default_value = "8",
cb8e9e
+      .default_value = "0",
cb8e9e
       .description = "This option can be used to control number of parallel"
cb8e9e
                      " heals",
cb8e9e
     },
cb8e9e
-- 
cb8e9e
1.7.1
cb8e9e