d1681e
From 77c33f6c257928576d328e6e735f7e7a086202a3 Mon Sep 17 00:00:00 2001
d1681e
From: karthik-us <ksubrahm@redhat.com>
d1681e
Date: Tue, 17 Jul 2018 11:56:10 +0530
d1681e
Subject: [PATCH 323/325] cluster/afr: Mark dirty for entry transactions for
d1681e
 quorum failures
d1681e
d1681e
Backport of: https://review.gluster.org/#/c/20153/
d1681e
Problem:
d1681e
If an entry creation transaction fails on quorum number of bricks
d1681e
it might end up setting the pending changelogs on the file itself
d1681e
on the brick where it got created. But the parent does not have
d1681e
any entry pending marker set. This will lead to the entry not
d1681e
getting healed by the self heal daemon automatically.
d1681e
d1681e
Fix:
d1681e
For entry transactions mark dirty on the parent if it fails on
d1681e
quorum number of bricks, so that the heal can do conservative
d1681e
merge and entry gets healed by shd.
d1681e
d1681e
Change-Id: I8bbd02da7c4c9edd9c3f947e9a4ed3d37c9bec1c
d1681e
BUG: 1566336
d1681e
Signed-off-by: karthik-us <ksubrahm@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/144145
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
d1681e
---
d1681e
 ...20-mark-dirty-for-entry-txn-on-quorum-failure.t | 73 ++++++++++++++++++++++
d1681e
 xlators/cluster/afr/src/afr-transaction.c          | 62 ++++++++++++++----
d1681e
 2 files changed, 124 insertions(+), 11 deletions(-)
d1681e
 create mode 100644 tests/bugs/replicate/bug-1586020-mark-dirty-for-entry-txn-on-quorum-failure.t
d1681e
d1681e
diff --git a/tests/bugs/replicate/bug-1586020-mark-dirty-for-entry-txn-on-quorum-failure.t b/tests/bugs/replicate/bug-1586020-mark-dirty-for-entry-txn-on-quorum-failure.t
d1681e
new file mode 100644
d1681e
index 0000000..7fec3b4
d1681e
--- /dev/null
d1681e
+++ b/tests/bugs/replicate/bug-1586020-mark-dirty-for-entry-txn-on-quorum-failure.t
d1681e
@@ -0,0 +1,73 @@
d1681e
+#!/bin/bash
d1681e
+
d1681e
+. $(dirname $0)/../../include.rc
d1681e
+. $(dirname $0)/../../volume.rc
d1681e
+
d1681e
+cleanup;
d1681e
+
d1681e
+function create_files {
d1681e
+        local i=1
d1681e
+        while (true)
d1681e
+        do
d1681e
+                dd if=/dev/zero of=$M0/file$i bs=1M count=10
d1681e
+                if [ -e $B0/${V0}0/file$i ] && [ -e $B0/${V0}1/file$i ]; then
d1681e
+                        ((i++))
d1681e
+                else
d1681e
+                        break
d1681e
+                fi
d1681e
+        done
d1681e
+        echo $i
d1681e
+}
d1681e
+
d1681e
+TEST glusterd
d1681e
+
d1681e
+#Create brick partitions
d1681e
+TEST truncate -s 100M $B0/brick0
d1681e
+TEST truncate -s 100M $B0/brick1
d1681e
+#Have the 3rd brick of a higher size to test the scenario of entry transaction
d1681e
+#passing on only one brick and not on other bricks.
d1681e
+TEST truncate -s 110M $B0/brick2
d1681e
+LO1=`SETUP_LOOP $B0/brick0`
d1681e
+TEST [ $? -eq 0 ]
d1681e
+TEST MKFS_LOOP $LO1
d1681e
+LO2=`SETUP_LOOP $B0/brick1`
d1681e
+TEST [ $? -eq 0 ]
d1681e
+TEST MKFS_LOOP $LO2
d1681e
+LO3=`SETUP_LOOP $B0/brick2`
d1681e
+TEST [ $? -eq 0 ]
d1681e
+TEST MKFS_LOOP $LO3
d1681e
+TEST mkdir -p $B0/${V0}0 $B0/${V0}1 $B0/${V0}2
d1681e
+TEST MOUNT_LOOP $LO1 $B0/${V0}0
d1681e
+TEST MOUNT_LOOP $LO2 $B0/${V0}1
d1681e
+TEST MOUNT_LOOP $LO3 $B0/${V0}2
d1681e
+
d1681e
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
d1681e
+TEST $CLI volume start $V0
d1681e
+TEST $CLI volume set $V0 performance.write-behind off
d1681e
+TEST $CLI volume set $V0 self-heal-daemon off
d1681e
+TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0
d1681e
+
d1681e
+i=$(create_files)
d1681e
+TEST ! ls $B0/${V0}0/file$i
d1681e
+TEST ! ls $B0/${V0}1/file$i
d1681e
+TEST ls $B0/${V0}2/file$i
d1681e
+EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.dirty $B0/${V0}2
d1681e
+EXPECT "000000010000000100000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file$i
d1681e
+EXPECT "000000010000000100000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file$i
d1681e
+
d1681e
+TEST $CLI volume set $V0 self-heal-daemon on
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
d1681e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
d1681e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
d1681e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
d1681e
+TEST rm -f $M0/file1
d1681e
+
d1681e
+TEST $CLI volume heal $V0
d1681e
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
d1681e
+TEST force_umount $M0
d1681e
+TEST $CLI volume stop $V0
d1681e
+EXPECT 'Stopped' volinfo_field $V0 'Status';
d1681e
+TEST $CLI volume delete $V0;
d1681e
+UMOUNT_LOOP ${B0}/${V0}{0,1,2}
d1681e
+rm -f ${B0}/brick{0,1,2}
d1681e
+cleanup;
d1681e
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
d1681e
index 5b18f63..321b6f1 100644
d1681e
--- a/xlators/cluster/afr/src/afr-transaction.c
d1681e
+++ b/xlators/cluster/afr/src/afr-transaction.c
d1681e
@@ -774,8 +774,38 @@ afr_has_fop_cbk_quorum (call_frame_t *frame)
d1681e
         return afr_has_quorum (success, this);
d1681e
 }
d1681e
 
d1681e
+gf_boolean_t
d1681e
+afr_need_dirty_marking (call_frame_t *frame, xlator_t *this)
d1681e
+{
d1681e
+        afr_private_t           *priv           = this->private;
d1681e
+        afr_local_t             *local          = NULL;
d1681e
+        gf_boolean_t            need_dirty      = _gf_false;
d1681e
+
d1681e
+        local = frame->local;
d1681e
+
d1681e
+        if (!priv->quorum_count || !local->optimistic_change_log)
d1681e
+                return _gf_false;
d1681e
+
d1681e
+        if (local->transaction.type == AFR_DATA_TRANSACTION ||
d1681e
+            local->transaction.type == AFR_METADATA_TRANSACTION)
d1681e
+                return _gf_false;
d1681e
+
d1681e
+        if (AFR_COUNT (local->transaction.failed_subvols, priv->child_count) ==
d1681e
+            priv->child_count)
d1681e
+                return _gf_false;
d1681e
+
d1681e
+        if (priv->arbiter_count) {
d1681e
+                if (!afr_has_arbiter_fop_cbk_quorum (frame))
d1681e
+                        need_dirty = _gf_true;
d1681e
+        } else if (!afr_has_fop_cbk_quorum (frame)) {
d1681e
+                need_dirty = _gf_true;
d1681e
+        }
d1681e
+
d1681e
+        return need_dirty;
d1681e
+}
d1681e
+
d1681e
 void
d1681e
-afr_handle_quorum (call_frame_t *frame)
d1681e
+afr_handle_quorum (call_frame_t *frame, xlator_t *this)
d1681e
 {
d1681e
         afr_local_t   *local = NULL;
d1681e
         afr_private_t *priv  = NULL;
d1681e
@@ -826,11 +856,15 @@ afr_handle_quorum (call_frame_t *frame)
d1681e
                 return;
d1681e
         }
d1681e
 
d1681e
+        if (afr_need_dirty_marking (frame, this))
d1681e
+                goto set_response;
d1681e
+
d1681e
         for (i = 0; i < priv->child_count; i++) {
d1681e
                 if (local->transaction.pre_op[i])
d1681e
                         afr_transaction_fop_failed (frame, frame->this, i);
d1681e
         }
d1681e
 
d1681e
+set_response:
d1681e
         local->op_ret = -1;
d1681e
         local->op_errno = afr_final_errno (local, priv);
d1681e
         if (local->op_errno == 0)
d1681e
@@ -874,9 +908,17 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
d1681e
         int                     nothing_failed  = 1;
d1681e
         gf_boolean_t            need_undirty    = _gf_false;
d1681e
 
d1681e
-        afr_handle_quorum (frame);
d1681e
+        afr_handle_quorum (frame, this);
d1681e
         local = frame->local;
d1681e
-	idx = afr_index_for_transaction_type (local->transaction.type);
d1681e
+        idx = afr_index_for_transaction_type (local->transaction.type);
d1681e
+
d1681e
+        xattr = dict_new ();
d1681e
+        if (!xattr) {
d1681e
+                local->op_ret = -1;
d1681e
+                local->op_errno = ENOMEM;
d1681e
+                afr_changelog_post_op_done (frame, this);
d1681e
+                goto out;
d1681e
+        }
d1681e
 
d1681e
         nothing_failed = afr_txn_nothing_failed (frame, this);
d1681e
 
d1681e
@@ -886,6 +928,11 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
d1681e
 		need_undirty = _gf_true;
d1681e
 
d1681e
         if (local->op_ret < 0 && !nothing_failed) {
d1681e
+                if (afr_need_dirty_marking (frame, this)) {
d1681e
+                        local->dirty[idx] = hton32(1);
d1681e
+                        goto set_dirty;
d1681e
+                }
d1681e
+
d1681e
                 afr_changelog_post_op_done (frame, this);
d1681e
                 goto out;
d1681e
         }
d1681e
@@ -902,14 +949,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
d1681e
                 goto out;
d1681e
         }
d1681e
 
d1681e
-	xattr = dict_new ();
d1681e
-	if (!xattr) {
d1681e
-		local->op_ret = -1;
d1681e
-		local->op_errno = ENOMEM;
d1681e
-		afr_changelog_post_op_done (frame, this);
d1681e
-		goto out;
d1681e
-	}
d1681e
-
d1681e
 	for (i = 0; i < priv->child_count; i++) {
d1681e
 		if (local->transaction.failed_subvols[i])
d1681e
 			local->pending[i][idx] = hton32(1);
d1681e
@@ -928,6 +967,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
d1681e
 	else
d1681e
 		local->dirty[idx] = hton32(0);
d1681e
 
d1681e
+set_dirty:
d1681e
 	ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty,
d1681e
 				   sizeof(int) * AFR_NUM_CHANGE_LOGS);
d1681e
 	if (ret) {
d1681e
-- 
d1681e
1.8.3.1
d1681e