3604df
From 4a6bdc7f785b8a030142d8036a13ae2f78984934 Mon Sep 17 00:00:00 2001
3604df
From: Krutika Dhananjay <kdhananj@redhat.com>
3604df
Date: Thu, 6 Apr 2017 18:10:41 +0530
3604df
Subject: [PATCH 308/316] features/shard: Fix vm corruption upon fix-layout
3604df
3604df
        Backport of: https://review.gluster.org/17010
3604df
3604df
shard's writev implementation, as part of identifying
3604df
the presence of participant shards that aren't in memory,
3604df
first sends an MKNOD on these shards, and upon EEXIST error,
3604df
looks up the shards before proceeding with the writes.
3604df
3604df
The VM corruption was caused when the following happened:
3604df
1. DHT had n subvolumes initially.
3604df
2. Upon add-brick + fix-layout, the layout of .shard changed
3604df
   although the existing shards under it were yet to be migrated
3604df
   to their new hashed subvolumes.
3604df
3. During this time, there were writes on the VM falling in regions
3604df
   of the file whose corresponding shards were already existing under
3604df
   .shard.
3604df
4. Sharding xl sent MKNOD on these shards, now creating them in their
3604df
   new hashed subvolumes although there already exist shard blocks for
3604df
   this region with valid data.
3604df
5. All subsequent writes were wound on these newly created copies.
3604df
3604df
The net outcome is that neither copy of the shard had the correct
3604df
data. This caused the affected VMs to be unbootable.
3604df
3604df
FIX:
3604df
For want of better alternatives in DHT, the fix changes shard fops to do
3604df
a LOOKUP before the MKNOD and, upon an EEXIST error, perform another lookup.
3604df
3604df
Change-Id: I4086e7a9c27c9325b3830f4274be87847283a9f2
3604df
BUG: 1439753
3604df
RCA'd-by: Raghavendra Gowdappa <rgowdapp@redhat.com>
3604df
Reported-by: Mahdi Adnan <mahdi.adnan@outlook.com>
3604df
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
3604df
Reviewed-on: https://code.engineering.redhat.com/gerrit/105195
3604df
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
3604df
---
3604df
 xlators/features/shard/src/shard.c | 154 +++++++++++++++++++++++--------------
3604df
 xlators/features/shard/src/shard.h |   1 +
3604df
 2 files changed, 96 insertions(+), 59 deletions(-)
3604df
3604df
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
3604df
index 9504f12..eabed90 100644
3604df
--- a/xlators/features/shard/src/shard.c
3604df
+++ b/xlators/features/shard/src/shard.c
3604df
@@ -1693,11 +1693,30 @@ shard_common_lookup_shards_cbk (call_frame_t *frame, void *cookie,
3604df
 
3604df
         if (op_ret < 0) {
3604df
                 /* Ignore absence of shards in the backend in truncate fop. */
3604df
-                if (((local->fop == GF_FOP_TRUNCATE) ||
3604df
-                    (local->fop == GF_FOP_FTRUNCATE) ||
3604df
-                    (local->fop == GF_FOP_RENAME) ||
3604df
-                    (local->fop == GF_FOP_UNLINK)) && (op_errno == ENOENT))
3604df
-                        goto done;
3604df
+                switch (local->fop) {
3604df
+                case GF_FOP_TRUNCATE:
3604df
+                case GF_FOP_FTRUNCATE:
3604df
+                case GF_FOP_RENAME:
3604df
+                case GF_FOP_UNLINK:
3604df
+                        if (op_errno == ENOENT)
3604df
+                                goto done;
3604df
+                        break;
3604df
+                case GF_FOP_WRITE:
3604df
+                case GF_FOP_READ:
3604df
+                case GF_FOP_ZEROFILL:
3604df
+                case GF_FOP_DISCARD:
3604df
+                case GF_FOP_FALLOCATE:
3604df
+                        if ((!local->first_lookup_done) &&
3604df
+                            (op_errno == ENOENT)) {
3604df
+                                local->create_count++;
3604df
+                                goto done;
3604df
+                        }
3604df
+                        break;
3604df
+                default:
3604df
+                        break;
3604df
+                }
3604df
+
3604df
+                /* else */
3604df
                 gf_msg (this->name, GF_LOG_ERROR, op_errno,
3604df
                         SHARD_MSG_LOOKUP_SHARD_FAILED, "Lookup on shard %d "
3604df
                         "failed. Base file gfid = %s", shard_block_num,
3604df
@@ -1714,6 +1733,8 @@ shard_common_lookup_shards_cbk (call_frame_t *frame, void *cookie,
3604df
 done:
3604df
         call_count = shard_call_count_return (frame);
3604df
         if (call_count == 0) {
3604df
+                if (!local->first_lookup_done)
3604df
+                        local->first_lookup_done = _gf_true;
3604df
                 if (local->op_ret < 0)
3604df
                         goto unwind;
3604df
                 else
3604df
@@ -3197,47 +3218,6 @@ next:
3604df
 }
3604df
 
3604df
 int
3604df
-shard_post_lookup_shards_readv_handler (call_frame_t *frame, xlator_t *this)
3604df
-{
3604df
-        shard_local_t *local = NULL;
3604df
-
3604df
-        local = frame->local;
3604df
-
3604df
-        if (local->op_ret < 0) {
3604df
-                SHARD_STACK_UNWIND (readv, frame, local->op_ret,
3604df
-                                    local->op_errno, NULL, 0, NULL, NULL, NULL);
3604df
-                return 0;
3604df
-        }
3604df
-
3604df
-        shard_readv_do (frame, this);
3604df
-
3604df
-        return 0;
3604df
-}
3604df
-
3604df
-int
3604df
-shard_post_mknod_readv_handler (call_frame_t *frame, xlator_t *this)
3604df
-{
3604df
-        shard_local_t *local = NULL;
3604df
-
3604df
-        local = frame->local;
3604df
-
3604df
-        if (local->op_ret < 0) {
3604df
-                SHARD_STACK_UNWIND (readv, frame, local->op_ret,
3604df
-                                    local->op_errno, NULL, 0, NULL, NULL, NULL);
3604df
-                return 0;
3604df
-        }
3604df
-
3604df
-        if (!local->eexist_count) {
3604df
-                shard_readv_do (frame, this);
3604df
-        } else {
3604df
-                local->call_count = local->eexist_count;
3604df
-                shard_common_lookup_shards (frame, this, local->loc.inode,
3604df
-                                        shard_post_lookup_shards_readv_handler);
3604df
-        }
3604df
-        return 0;
3604df
-}
3604df
-
3604df
-int
3604df
 shard_common_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
3604df
                         int32_t op_ret, int32_t op_errno, inode_t *inode,
3604df
                         struct iatt *buf, struct iatt *preparent,
3604df
@@ -3267,6 +3247,7 @@ done:
3604df
         call_count = shard_call_count_return (frame);
3604df
         if (call_count == 0) {
3604df
                 SHARD_UNSET_ROOT_FS_ID (frame, local);
3604df
+                local->create_count = 0;
3604df
                 local->post_mknod_handler (frame, this);
3604df
         }
3604df
 
3604df
@@ -3397,6 +3378,55 @@ err:
3604df
 }
3604df
 
3604df
 int
3604df
+shard_post_mknod_readv_handler (call_frame_t *frame, xlator_t *this);
3604df
+
3604df
+int
3604df
+shard_post_lookup_shards_readv_handler (call_frame_t *frame, xlator_t *this)
3604df
+{
3604df
+        shard_local_t *local = NULL;
3604df
+
3604df
+        local = frame->local;
3604df
+
3604df
+        if (local->op_ret < 0) {
3604df
+                SHARD_STACK_UNWIND (readv, frame, local->op_ret,
3604df
+                                    local->op_errno, NULL, 0, NULL, NULL, NULL);
3604df
+                return 0;
3604df
+        }
3604df
+
3604df
+        if (local->create_count) {
3604df
+                shard_common_resume_mknod (frame, this,
3604df
+                                           shard_post_mknod_readv_handler);
3604df
+        } else {
3604df
+                shard_readv_do (frame, this);
3604df
+        }
3604df
+
3604df
+        return 0;
3604df
+}
3604df
+
3604df
+int
3604df
+shard_post_mknod_readv_handler (call_frame_t *frame, xlator_t *this)
3604df
+{
3604df
+        shard_local_t *local = NULL;
3604df
+
3604df
+        local = frame->local;
3604df
+
3604df
+        if (local->op_ret < 0) {
3604df
+                SHARD_STACK_UNWIND (readv, frame, local->op_ret,
3604df
+                                    local->op_errno, NULL, 0, NULL, NULL, NULL);
3604df
+                return 0;
3604df
+        }
3604df
+
3604df
+        if (!local->eexist_count) {
3604df
+                shard_readv_do (frame, this);
3604df
+        } else {
3604df
+                local->call_count = local->eexist_count;
3604df
+                shard_common_lookup_shards (frame, this, local->loc.inode,
3604df
+                                        shard_post_lookup_shards_readv_handler);
3604df
+        }
3604df
+        return 0;
3604df
+}
3604df
+
3604df
+int
3604df
 shard_post_resolve_readv_handler (call_frame_t *frame, xlator_t *this)
3604df
 {
3604df
         shard_local_t *local = NULL;
3604df
@@ -3422,9 +3452,9 @@ shard_post_resolve_readv_handler (call_frame_t *frame, xlator_t *this)
3604df
         }
3604df
 
3604df
         if (local->call_count) {
3604df
-                local->create_count = local->call_count;
3604df
-                shard_common_resume_mknod (frame, this,
3604df
-                                           shard_post_mknod_readv_handler);
3604df
+                shard_common_lookup_shards (frame, this,
3604df
+                                            local->resolver_base_inode,
3604df
+                                        shard_post_lookup_shards_readv_handler);
3604df
         } else {
3604df
                 shard_readv_do (frame, this);
3604df
         }
3604df
@@ -3575,14 +3605,11 @@ shard_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
3604df
 
3604df
         shard_lookup_base_file (frame, this, &local->loc,
3604df
                                 shard_post_lookup_readv_handler);
3604df
-
3604df
         return 0;
3604df
-
3604df
 err:
3604df
         SHARD_STACK_UNWIND (readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL,
3604df
                             NULL);
3604df
         return 0;
3604df
-
3604df
 }
3604df
 
3604df
 int
3604df
@@ -3875,6 +3902,10 @@ next:
3604df
 }
3604df
 
3604df
 int
3604df
+shard_common_inode_write_post_mknod_handler (call_frame_t *frame,
3604df
+                                             xlator_t *this);
3604df
+
3604df
+int
3604df
 shard_common_inode_write_post_lookup_shards_handler (call_frame_t *frame,
3604df
                                                      xlator_t *this)
3604df
 {
3604df
@@ -3889,7 +3920,12 @@ shard_common_inode_write_post_lookup_shards_handler (call_frame_t *frame,
3604df
                 return 0;
3604df
         }
3604df
 
3604df
-        shard_common_inode_write_do (frame, this);
3604df
+        if (local->create_count) {
3604df
+                shard_common_resume_mknod (frame, this,
3604df
+                                   shard_common_inode_write_post_mknod_handler);
3604df
+        } else {
3604df
+                shard_common_inode_write_do (frame, this);
3604df
+        }
3604df
 
3604df
         return 0;
3604df
 }
3604df
@@ -3937,11 +3973,13 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
3604df
 
3604df
         local->postbuf = local->prebuf;
3604df
 
3604df
-        if (local->create_count)
3604df
-                shard_common_resume_mknod (frame, this,
3604df
-                                   shard_common_inode_write_post_mknod_handler);
3604df
-        else
3604df
+        if (local->call_count) {
3604df
+                shard_common_lookup_shards (frame, this,
3604df
+                                            local->resolver_base_inode,
3604df
+                           shard_common_inode_write_post_lookup_shards_handler);
3604df
+        } else {
3604df
                 shard_common_inode_write_do (frame, this);
3604df
+        }
3604df
 
3604df
         return 0;
3604df
 }
3604df
@@ -3961,8 +3999,6 @@ shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
3604df
                 return 0;
3604df
         }
3604df
 
3604df
-        local->create_count = local->call_count;
3604df
-
3604df
         shard_lookup_base_file (frame, this, &local->loc,
3604df
                                 shard_common_inode_write_post_lookup_handler);
3604df
         return 0;
3604df
diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h
3604df
index 09232a4..7319598 100644
3604df
--- a/xlators/features/shard/src/shard.h
3604df
+++ b/xlators/features/shard/src/shard.h
3604df
@@ -255,6 +255,7 @@ typedef struct shard_local {
3604df
                 shard_lock_t *shard_lock;
3604df
         } lock;
3604df
         inode_t *resolver_base_inode;
3604df
+        gf_boolean_t first_lookup_done;
3604df
 } shard_local_t;
3604df
 
3604df
 typedef struct shard_inode_ctx {
3604df
-- 
3604df
1.8.3.1
3604df