From 4e4c3c581daeeb63034bfc89dd4b7a9bcb23e1d2 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Mon, 26 Dec 2016 21:08:03 +0530
Subject: [PATCH 259/267] cluster/afr: Fix missing name indices due to EEXIST
 error

        Backport of: http://review.gluster.org/16286

PROBLEM:
Consider a volume with granular-entry-heal and sharding enabled. When
a replica is down and a shard is created as part of a write, the name
index is correctly created under indices/entry-changes/<dot-shard-gfid>.
Now, when a read on the same region triggers another MKNOD, the fop
fails on the online bricks with EEXIST. By virtue of this being a
symmetric error, the failed_subvols[] array is reset to all zeroes.
Because of this, before post-op, the GF_XATTROP_ENTRY_OUT_KEY will be
set, causing the name index, which was created in the previous MKNOD
operation, to be wrongly deleted in THIS MKNOD operation.
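For illustration only, the failure described above can be reduced to
the self-contained sketch below; the names (NBRICKS, failed_subvols,
error_is_symmetric) are hypothetical stand-ins for the shape of the
AFR logic, not the actual structures or helpers:

    /* Hypothetical sketch of the failure mode; not the real AFR code. */
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define NBRICKS 3

    /* EEXIST is "symmetric": every online brick reports it alike. */
    static int
    error_is_symmetric (int op_errno)
    {
            return (op_errno == EEXIST);
    }

    int
    main (void)
    {
            int op_errno = EEXIST;                    /* second MKNOD fails */
            int failed_subvols[NBRICKS] = {1, 0, 0};  /* brick-0 is down    */
            int i, any_failed = 0;

            if (error_is_symmetric (op_errno))
                    /* This reset loses the fact that brick-0 still
                     * needs the name index for entry self-heal. */
                    memset (failed_subvols, 0, sizeof (failed_subvols));

            for (i = 0; i < NBRICKS; i++)
                    any_failed |= failed_subvols[i];

            if (!any_failed)
                    /* Post-op now sets the ENTRY_OUT key, deleting the
                     * name index created by the earlier MKNOD. */
                    printf ("name index wrongly deleted\n");

            return 0;
    }
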
FIX:
The ideal fix would have been for a transaction to delete the name
index ONLY if it knows it is the one that created the index in the
first place. This would involve gathering, from the individual bricks,
information on whether THIS xattrop created the index, aggregating
their responses and, based on the various possible combinations of
responses, deciding whether or not to delete the index. This is rather
complex. A simpler fix is for post-op to examine local->op_ret, when
there are no failed subvolumes, to figure out whether to delete the
name index or not. This can occasionally lead to the creation of stale
name indices, but they do not affect the IO path or mess with pending
changelogs in any way, and self-heal, in its crawl of the
"entry-changes" directory, takes care of deleting such indices.
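Restated as a self-contained sketch (the real change is the one-line
local->op_ret check added to afr_changelog_populate_xdata() in the
afr-transaction.c hunk below; may_delete_name_index() is a hypothetical
name used only for this illustration), the post-op decision becomes:

    /* Hypothetical sketch of the fixed post-op decision. */
    #include <stdio.h>

    #define NBRICKS 3

    /* Returns 1 if post-op may set the ENTRY_OUT key and thereby
     * delete the name index, 0 otherwise. */
    static int
    may_delete_name_index (int op_ret, const int failed_subvols[NBRICKS])
    {
            int i;

            for (i = 0; i < NBRICKS; i++)
                    if (failed_subvols[i])
                            return 0;   /* some brick still needs heal */

            /* NEW: a failed fop (e.g. a symmetric EEXIST) must never
             * delete an index that an earlier txn may have created. */
            if (op_ret)
                    return 0;

            return 1;
    }

    int
    main (void)
    {
            int none[NBRICKS] = {0, 0, 0};

            /* Failed MKNOD with wiped failed_subvols[]: keep index. */
            printf ("%d\n", may_delete_name_index (-1, none));  /* 0 */

            /* Genuinely successful entry fop: index may be removed. */
            printf ("%d\n", may_delete_name_index (0, none));   /* 1 */

            return 0;
    }
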
Change-Id: I4e7717d66ed7d5edb942ecd28ab765b65d90ed98
BUG: 1408426
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/93754
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
---
 tests/bugs/replicate/bug-1408712.t        | 87 +++++++++++++++++++++++++++++++
 tests/include.rc                          |  1 +
 xlators/cluster/afr/src/afr-transaction.c | 10 ++++
 3 files changed, 98 insertions(+)
 create mode 100644 tests/bugs/replicate/bug-1408712.t
diff --git a/tests/bugs/replicate/bug-1408712.t b/tests/bugs/replicate/bug-1408712.t
new file mode 100644
index 0000000..b26e8a0
--- /dev/null
+++ b/tests/bugs/replicate/bug-1408712.t
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup
+
+TESTS_EXPECTED_IN_LOOP=12
+
+TEST glusterd
+TEST pidof glusterd
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume heal $V0 granular-entry-heal enable
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST $CLI volume set $V0 performance.flush-behind off
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M1
+
+cd $M0
+TEST dd if=/dev/zero of=file bs=1M count=8
+
+# Kill brick-0.
+TEST kill_brick $V0 $H0 $B0/${V0}0
+
+TEST "dd if=/dev/zero bs=1M count=8 >> file"
+
+FILE_GFID=$(get_gfid_string $M0/file)
+
+# Test that the index associated with '/.shard' is created on B1 and B2.
+TEST stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+TEST stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+# Check for successful creation of granular entry indices
+for i in {2..3}
+do
+        TEST_IN_LOOP stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+        TEST_IN_LOOP stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+done
+
+cd ~
+TEST md5sum $M1/file
+
+# Test that the index associated with '/.shard' and the created shards do not disappear on B1 and B2.
+TEST stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+TEST stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+for i in {2..3}
+do
+        TEST_IN_LOOP stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+        TEST_IN_LOOP stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+done
+
+# Start the brick that was down
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+
+# Enable shd
+TEST gluster volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+
+TEST $CLI volume heal $V0
+
+# Wait for heal to complete
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+# Now verify that there are no name indices left after self-heal
+TEST ! stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+TEST ! stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+
+for i in {2..3}
+do
+        TEST_IN_LOOP ! stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+        TEST_IN_LOOP ! stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+done
+
+cleanup
diff --git a/tests/include.rc b/tests/include.rc
index 44f5798..954fb42 100644
--- a/tests/include.rc
+++ b/tests/include.rc
@@ -11,6 +11,7 @@ B0=${B0:=/d/backends};        # top level of brick directories
 WORKDIRS="$B0 $M0 $M1 $M2 $N0 $N1" 
 
 ROOT_GFID="00000000-0000-0000-0000-000000000001"
+DOT_SHARD_GFID="be318638-e8a0-4c6d-977d-7a937aa84806"
 
 META_VOL=${META_VOL:=gluster_shared_storage}; # shared gluster storage volume used by snapshot scheduler, nfs ganesha and geo-rep.
 META_MNT=${META_MNT:=/var/run/gluster/shared_storage}; # Mount point of shared gluster volume.
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 9a3b2e9..1e70c1c 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -1206,6 +1206,16 @@ afr_changelog_populate_xdata (call_frame_t *frame, afr_xattrop_type_t op,
                                 need_entry_key_set = _gf_false;
                                 break;
                         }
+                        /* If the transaction itself did not fail and there
+                         * are no failed subvolumes, check whether the fop
+                         * failed due to a symmetric error. If it did, do
+                         * not set the ENTRY_OUT xattr which would end up
+                         * deleting a name index which was created possibly by
+                         * an earlier entry txn that may have failed on some
+                         * of the sub-volumes.
+                         */
+                        if (local->op_ret)
+                                need_entry_key_set = _gf_false;
                 } else {
                         key = GF_XATTROP_ENTRY_IN_KEY;
                 }
-- 
2.9.3