Blob Blame History Raw
From 8b56e3a35b31a01d31092d24520e125f0b189f2a Mon Sep 17 00:00:00 2001
From: Ravishankar N <ravishankar@redhat.com>
Date: Mon, 28 Nov 2016 14:12:43 +0530
Subject: [PATCH 196/206] afr: allow I/O when favorite-child-policy is enabled

Problem:
Currently, I/O on a split-brained file fails even when the
favorite-child-policy is set until the self-heal is complete.

Fix:
If a valid 'source' is found using the set favorite-child-policy, inspect
and reset the afr pending xattrs on the 'sinks' (inside appropriate locks),
refresh the inode and then proceed with the read or write transaction.

The resetting itself happens in the self-heal code and hence can also
happen in the client side background-heal or by the shd's index-heal in
addition to the txn code path explained above. When it happens in via
heal, we also add checks in undo-pending to not reset the sink xattrs
again.

>Change-Id: Ic8c1317720cb26bd114b6fe6af4e58c73b864626
>BUG: 1386188
>Signed-off-by: Ravishankar N <ravishankar@redhat.com>
>Reported-by: Simon Turcotte-Langevin <simon.turcotte-langevin@ubisoft.com>
>Reviewed-on: http://review.gluster.org/15673
>Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
>Smoke: Gluster Build System <jenkins@build.gluster.org>
>Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>

Change-Id: Icbb11b6e3556614e6054e576a7876b4c165add82
BUG: 1387501
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/91354
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 .../bugs/replicate/bug-1386188-sbrain-fav-child.t  |  82 +++++++++
 xlators/cluster/afr/src/afr-common.c               | 205 ++++++++++++++++++++-
 xlators/cluster/afr/src/afr-read-txn.c             |  13 +-
 xlators/cluster/afr/src/afr-self-heal-common.c     | 159 +++++++++++++---
 xlators/cluster/afr/src/afr-self-heal-data.c       |  20 +-
 xlators/cluster/afr/src/afr-self-heal-entry.c      |   3 +
 xlators/cluster/afr/src/afr-self-heal-metadata.c   |  15 +-
 xlators/cluster/afr/src/afr-self-heal.h            |  24 ++-
 xlators/cluster/afr/src/afr-transaction.c          |   9 +-
 9 files changed, 469 insertions(+), 61 deletions(-)
 create mode 100644 tests/bugs/replicate/bug-1386188-sbrain-fav-child.t

diff --git a/tests/bugs/replicate/bug-1386188-sbrain-fav-child.t b/tests/bugs/replicate/bug-1386188-sbrain-fav-child.t
new file mode 100644
index 0000000..d049d95
--- /dev/null
+++ b/tests/bugs/replicate/bug-1386188-sbrain-fav-child.t
@@ -0,0 +1,82 @@
+#!/bin/bash
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST $CLI volume set $V0 data-self-heal off
+TEST $CLI volume set $V0 entry-self-heal off
+TEST $CLI volume set $V0 metadata-self-heal off
+TEST $CLI volume start $V0
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST touch $M0/data.txt
+TEST touch $M0/mdata.txt
+
+#Create data and metadata split-brain
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/data.txt bs=1024 count=1024
+TEST setfattr -n user.value -v value1 $M0/mdata.txt
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/data.txt bs=1024 count=1024
+TEST setfattr -n user.value -v value2 $M0/mdata.txt
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+
+## Check that the file still in split-brain,
+  ## I/O fails
+  cat $M0/data.txt > /dev/null
+  EXPECT "1" echo $?
+  ## pending xattrs blame each other.
+  brick0_pending=$(get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/data.txt)
+  brick1_pending=$(get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/data.txt)
+  TEST [ $brick0_pending -ne "000000000000000000000000" ]
+  TEST [ $brick1_pending -ne "000000000000000000000000" ]
+
+  ## I/O fails
+  getfattr -n user.value $M0/mdata.txt
+  EXPECT "1" echo $?
+  brick0_pending=$(get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/mdata.txt)
+  brick1_pending=$(get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/mdata.txt)
+  TEST [ $brick0_pending -ne "000000000000000000000000" ]
+  TEST [ $brick1_pending -ne "000000000000000000000000" ]
+
+## Let us use mtime as fav-child policy. So brick0 will be source.
+   # Set dirty (data part) on the sink brick to check if it is reset later along with the pending xattr.
+   TEST setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/${V0}1/data.txt
+   # Set dirty (metadata part) on the sink brick to check if it is reset later along with the pending xattr.
+   TEST setfattr -n trusted.afr.dirty -v 0x000000000000000100000000 $B0/${V0}1/mdata.txt
+
+   TEST $CLI volume set $V0 favorite-child-policy mtime
+
+   # Reading the file should be allowed and sink brick xattrs must be reset.
+   cat $M0/data.txt > /dev/null
+   EXPECT "0" echo $?
+   TEST brick1_pending=$(get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/data.txt)
+   TEST brick1_dirty=$(get_hex_xattr trusted.afr.dirty $B0/${V0}1/data.txt)
+   TEST [ $brick1_dirty -eq "000000000000000000000000" ]
+   TEST [ $brick1_pending -eq "000000000000000000000000" ]
+
+   # Accessing the file should be allowed and sink brick xattrs must be reset.
+   EXPECT "value2" echo $(getfattr --only-values -n user.value  $M0/mdata.txt)
+   TEST brick1_pending=$(get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/data.txt)
+   TEST brick1_dirty=$(get_hex_xattr trusted.afr.dirty $B0/${V0}1/data.txt)
+   TEST [ $brick1_dirty -eq "000000000000000000000000" ]
+   TEST [ $brick1_pending -eq "000000000000000000000000" ]
+
+#Enable shd and heal the file.
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+EXPECT 0 get_pending_heal_count $V0
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index ab60406..2d9b496 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -982,6 +982,13 @@ ret:
 	return -err;
 }
 
+int
+afr_fav_child_reset_sink_xattrs (void *opaque);
+
+int
+afr_fav_child_reset_sink_xattrs_cbk (int ret, call_frame_t *frame,
+                                     void *opaque);
+
 gf_boolean_t
 afr_selfheal_enabled (xlator_t *this)
 {
@@ -997,6 +1004,82 @@ afr_selfheal_enabled (xlator_t *this)
 	return data || priv->metadata_self_heal || priv->entry_self_heal;
 }
 
+
+int
+afr_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
+{
+
+        call_frame_t *heal_frame = NULL;
+        afr_local_t *heal_local = NULL;
+        afr_local_t *local = NULL;
+        afr_private_t *priv = NULL;
+        inode_t *inode = NULL;
+        int event_generation = 0;
+        int read_subvol = -1;
+        int op_errno = ENOMEM;
+        int ret = 0;
+
+        local = frame->local;
+        inode = local->inode;
+        priv = this->private;
+
+        if (err)
+                goto refresh_done;
+
+        if (local->op == GF_FOP_LOOKUP)
+                goto refresh_done;
+
+        ret = afr_inode_get_readable (frame, inode, this, local->readable,
+                                      &event_generation,
+                                      local->transaction.type);
+
+        if (ret == -EIO || !event_generation) {
+                /* No readable subvolume even after refresh ==> splitbrain.*/
+                if (!priv->fav_child_policy) {
+                        err = -EIO;
+                        goto refresh_done;
+                }
+                read_subvol = afr_sh_get_fav_by_policy (this, local->replies,
+                                                        inode, NULL);
+                if (read_subvol == -1) {
+                        err = -EIO;
+                        goto refresh_done;
+                }
+
+                heal_frame = copy_frame (frame);
+                if (!heal_frame) {
+                        err = -EIO;
+                        goto refresh_done;
+                }
+                heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+                heal_local = AFR_FRAME_INIT (heal_frame, op_errno);
+                if (!heal_local) {
+                        err = -EIO;
+                        AFR_STACK_DESTROY (heal_frame);
+                        goto refresh_done;
+                }
+                heal_local->xdata_req = dict_new();
+                if (!heal_local->xdata_req) {
+                        err = -EIO;
+                        AFR_STACK_DESTROY (heal_frame);
+                        goto refresh_done;
+                }
+                heal_local->heal_frame = frame;
+                ret = synctask_new (this->ctx->env,
+                                    afr_fav_child_reset_sink_xattrs,
+                                    afr_fav_child_reset_sink_xattrs_cbk,
+                                    heal_frame,
+                                    heal_frame);
+                return 0;
+        }
+
+refresh_done:
+        afr_local_replies_wipe (local, this->private);
+        local->refreshfn (frame, this, err);
+
+        return 0;
+}
+
 int
 afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
 {
@@ -1015,8 +1098,6 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
 
 	err = afr_inode_refresh_err (frame, this);
 
-        afr_local_replies_wipe (local, this->private);
-
 	if (ret && afr_selfheal_enabled (this) && start_heal) {
                 heal_frame = copy_frame (frame);
                 if (!heal_frame)
@@ -1033,7 +1114,7 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
         }
 
 refresh_done:
-        local->refreshfn (frame, this, err);
+        afr_txn_refresh_done (frame, this, err);
 
 	return 0;
 }
@@ -5013,6 +5094,7 @@ afr_selfheal_locked_metadata_inspect (call_frame_t *frame, xlator_t *this,
         unsigned char *sources = NULL;
         unsigned char *sinks = NULL;
         unsigned char *healed_sinks = NULL;
+        unsigned char *undid_pending = NULL;
         struct afr_reply *locked_replies = NULL;
 
         afr_private_t *priv = this->private;
@@ -5021,6 +5103,7 @@ afr_selfheal_locked_metadata_inspect (call_frame_t *frame, xlator_t *this,
         sources = alloca0 (priv->child_count);
         sinks = alloca0 (priv->child_count);
         healed_sinks = alloca0 (priv->child_count);
+        undid_pending = alloca0 (priv->child_count);
 
         locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
 
@@ -5037,6 +5120,7 @@ afr_selfheal_locked_metadata_inspect (call_frame_t *frame, xlator_t *this,
                 ret = __afr_selfheal_metadata_prepare (frame, this, inode,
                                                        locked_on, sources,
                                                        sinks, healed_sinks,
+                                                       undid_pending,
                                                        locked_replies,
                                                        pending);
                 *msh = afr_decide_heal_info (priv, sources, ret);
@@ -5060,6 +5144,7 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
         unsigned char *sources = NULL;
         unsigned char *sinks = NULL;
         unsigned char *healed_sinks = NULL;
+        unsigned char *undid_pending = NULL;
         afr_private_t   *priv = NULL;
         fd_t          *fd = NULL;
         struct afr_reply *locked_replies = NULL;
@@ -5073,6 +5158,7 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
         sources = alloca0 (priv->child_count);
         sinks = alloca0 (priv->child_count);
         healed_sinks = alloca0 (priv->child_count);
+        undid_pending = alloca0 (priv->child_count);
 
         /* Heal-info does an open() on the file being examined so that the
          * current eager-lock holding client, if present, at some point sees
@@ -5112,6 +5198,7 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
                         ret = __afr_selfheal_data_prepare (frame, this, inode,
                                                            data_lock, sources,
                                                            sinks, healed_sinks,
+                                                           undid_pending,
                                                            locked_replies,
                                                            pflag);
                         *dsh = afr_decide_heal_info (priv, sources, ret);
@@ -5699,3 +5786,115 @@ afr_compound_cleanup (compound_args_t *args, dict_t *xdata,
         if (newloc_xdata)
                 dict_unref (newloc_xdata);
 }
+
+int
+afr_fav_child_reset_sink_xattrs_cbk (int ret, call_frame_t *heal_frame,
+                                     void *opaque)
+{
+
+        call_frame_t *txn_frame = NULL;
+        afr_local_t *local = NULL;
+        afr_local_t *heal_local = NULL;
+        xlator_t *this = NULL;
+
+        heal_local = heal_frame->local;
+        txn_frame = heal_local->heal_frame;
+        local = txn_frame->local;
+        this = txn_frame->this;
+
+        /* Refresh the inode agan and proceed with the transaction.*/
+        afr_inode_refresh (txn_frame, this, local->inode, NULL,
+                           local->refreshfn);
+
+        if (heal_frame)
+                AFR_STACK_DESTROY (heal_frame);
+
+        return 0;
+}
+
+int
+afr_fav_child_reset_sink_xattrs (void *opaque)
+{
+        call_frame_t *heal_frame = NULL;
+        call_frame_t *txn_frame = NULL;
+        xlator_t *this = NULL;
+        gf_boolean_t d_spb = _gf_false;
+        gf_boolean_t m_spb = _gf_false;
+        afr_local_t *heal_local = NULL;
+        afr_local_t *txn_local = NULL;
+        afr_private_t *priv = NULL;
+        inode_t *inode  = NULL;
+        unsigned char *locked_on = NULL;
+        unsigned char *sources = NULL;
+        unsigned char *sinks = NULL;
+        unsigned char *healed_sinks = NULL;
+        unsigned char *undid_pending = NULL;
+        struct afr_reply *locked_replies = NULL;
+        int ret = 0;
+
+        heal_frame = (call_frame_t *) opaque;
+        heal_local = heal_frame->local;
+        txn_frame = heal_local->heal_frame;
+        txn_local = txn_frame->local;
+        this = txn_frame->this;
+        inode = txn_local->inode;
+        priv = this->private;
+        locked_on = alloca0 (priv->child_count);
+        sources = alloca0 (priv->child_count);
+        sinks = alloca0 (priv->child_count);
+        healed_sinks = alloca0 (priv->child_count);
+        undid_pending = alloca0 (priv->child_count);
+        locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+
+        ret = _afr_is_split_brain (txn_frame, this, txn_local->replies,
+                                   AFR_DATA_TRANSACTION, &d_spb);
+
+        ret = _afr_is_split_brain (txn_frame, this, txn_local->replies,
+                                   AFR_METADATA_TRANSACTION, &m_spb);
+
+        /* Take appropriate locks and reset sink xattrs. */
+        if (d_spb) {
+                ret = afr_selfheal_inodelk (heal_frame, this, inode, this->name,
+                                            0, 0, locked_on);
+                {
+                        if (ret < AFR_SH_MIN_PARTICIPANTS)
+                                goto data_unlock;
+                        ret = __afr_selfheal_data_prepare (heal_frame, this,
+                                                           inode, locked_on,
+                                                           sources, sinks,
+                                                           healed_sinks,
+                                                           undid_pending,
+                                                           locked_replies,
+                                                           NULL);
+                }
+data_unlock:
+                afr_selfheal_uninodelk (heal_frame, this, inode, this->name,
+                                        0, 0, locked_on);
+        }
+
+        if (m_spb) {
+                memset (locked_on, 0, sizeof (*locked_on) * priv->child_count);
+                memset (undid_pending, 0,
+                        sizeof (*undid_pending) * priv->child_count);
+                ret = afr_selfheal_inodelk (heal_frame, this, inode, this->name,
+                                            LLONG_MAX-1, 0, locked_on);
+                {
+                        if (ret < AFR_SH_MIN_PARTICIPANTS)
+                                goto mdata_unlock;
+                        ret = __afr_selfheal_metadata_prepare (heal_frame, this,
+                                                               inode, locked_on,
+                                                               sources, sinks,
+                                                               healed_sinks,
+                                                               undid_pending,
+                                                               locked_replies,
+                                                               NULL);
+
+                }
+mdata_unlock:
+                afr_selfheal_uninodelk (heal_frame, this, inode, this->name,
+                                        LLONG_MAX-1, 0, locked_on);
+        }
+
+        return ret;
+
+}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 74749f0..1c7b4f9 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -64,7 +64,6 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
 {
 	afr_local_t *local = NULL;
 	int read_subvol = 0;
-	int event_generation = 0;
 	inode_t *inode = NULL;
 	int ret = -1;
         int spb_choice = -1;
@@ -76,18 +75,12 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
                 local->op_errno = -err;
                 local->op_ret = -1;
                 read_subvol = -1;
+                gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_SPLIT_BRAIN,
+                        "Failing %s on gfid %s: split-brain observed.",
+                        gf_fop_list[local->op], uuid_utoa (inode->gfid));
                 goto readfn;
         }
 
-	ret = afr_inode_get_readable (frame, inode, this, local->readable,
-			              &event_generation,
-				      local->transaction.type);
-
-	if (ret == -EIO || !event_generation)
-		/* Even after refresh, we don't have a good
-		   read subvolume. Time to bail */
-                AFR_READ_TXN_SET_ERROR_AND_GOTO (-1, EIO, -1, readfn);
-
 	read_subvol = afr_read_subvol_select_by_policy (inode, this,
 							local->readable, NULL);
 	if (read_subvol == -1)
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 3110c1a..8f8b76c 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -145,8 +145,10 @@ err:
 int
 afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
 			   unsigned char *sources, unsigned char *sinks,
-			   unsigned char *healed_sinks, afr_transaction_type type,
-			   struct afr_reply *replies, unsigned char *locked_on)
+			   unsigned char *healed_sinks,
+                           unsigned char *undid_pending,
+                           afr_transaction_type type, struct afr_reply *replies,
+                           unsigned char *locked_on)
 {
 	afr_private_t *priv = NULL;
         afr_local_t *local = NULL;
@@ -214,6 +216,10 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
 			   and inspected on.
 			*/
 			continue;
+                if (undid_pending[i])
+                        /* We already unset the pending xattrs in
+                         * _afr_fav_child_reset_sink_xattrs(). */
+                        continue;
 
 		xattr = afr_selfheal_output_xattr (this, local->need_full_crawl,
                                                    type, output_dirty,
@@ -735,6 +741,42 @@ afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies, inode_t *inode)
         return fav_child;
 }
 
+int
+afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies,
+                          inode_t *inode, char **policy_str)
+{
+        afr_private_t *priv = NULL;
+        int fav_child = -1;
+
+        priv = this->private;
+        switch (priv->fav_child_policy) {
+        case AFR_FAV_CHILD_BY_SIZE:
+                fav_child = afr_sh_fav_by_size (this, replies, inode);
+                if (policy_str && fav_child >= 0)
+                        *policy_str = "SIZE";
+                break;
+        case AFR_FAV_CHILD_BY_CTIME:
+                fav_child = afr_sh_fav_by_ctime (this, replies, inode);
+                if (policy_str && fav_child >= 0)
+                        *policy_str = "CTIME";
+                break;
+        case AFR_FAV_CHILD_BY_MTIME:
+                fav_child = afr_sh_fav_by_mtime (this, replies, inode);
+                if (policy_str && fav_child >= 0)
+                        *policy_str = "MTIME";
+                break;
+        case AFR_FAV_CHILD_BY_MAJORITY:
+                fav_child = afr_sh_fav_by_majority (this, replies, inode);
+                if (policy_str && fav_child >= 0)
+                        *policy_str = "MAJORITY";
+                break;
+        case AFR_FAV_CHILD_NONE:
+        default:
+                break;
+        }
+
+        return fav_child;
+}
 
 int
 afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame,
@@ -756,24 +798,9 @@ afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame,
         time_t time;
 
         priv = this->private;
-        if (priv->fav_child_policy == AFR_FAV_CHILD_BY_MAJORITY)  {
-                fav_child = afr_sh_fav_by_majority (this, replies, inode);
-                if (fav_child >= 0)
-                        policy_str = "MAJORITY";
-        } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_MTIME)  {
-                fav_child = afr_sh_fav_by_mtime (this, replies, inode);
-                if (fav_child >= 0)
-                        policy_str = "MTIME";
-        } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_CTIME)  {
-                fav_child = afr_sh_fav_by_ctime (this, replies, inode);
-                if (fav_child >= 0)
-                        policy_str = "CTIME";
-        } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_SIZE)  {
-                fav_child = afr_sh_fav_by_size (this, replies, inode);
-                if (fav_child >= 0)
-                        policy_str = "SIZE";
-        }
 
+        fav_child = afr_sh_get_fav_by_policy (this, replies, inode,
+                                              &policy_str);
         if (fav_child > priv->child_count - 1) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
                         AFR_MSG_SBRAIN_FAV_CHILD_POLICY, "Invalid child (%d) "
@@ -829,6 +856,7 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
         dict_t *xdata_req = NULL;
         int heal_op = -1;
         int ret = -1;
+        int source = -1;
 
         local = frame->local;
         priv = this->private;
@@ -838,27 +866,96 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
         if (ret)
                 goto autoheal;
 
-        ret = afr_mark_split_brain_source_sinks_by_heal_op (frame, this,
+        source = afr_mark_split_brain_source_sinks_by_heal_op (frame, this,
                                                             sources, sinks,
                                                             healed_sinks,
                                                             locked_on, replies,
                                                             type, heal_op);
-        return ret;
+        return source;
 
 autoheal:
         /* Automatically heal if fav_child_policy is set. */
         if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
-                ret = afr_mark_split_brain_source_sinks_by_policy (frame, this,
-                                                                   inode,
-                                                                   sources,
-                                                                   sinks,
+                source = afr_mark_split_brain_source_sinks_by_policy (frame,
+                                                                      this,
+                                                                      inode,
+                                                                      sources,
+                                                                      sinks,
                                                                    healed_sinks,
-                                                                   locked_on,
-                                                                   replies,
-                                                                   type);
+                                                                      locked_on,
+                                                                      replies,
+                                                                      type);
+                if (source != -1) {
+                        ret = dict_set_int32 (xdata_req, "fav-child-policy", 1);
+                        if (ret)
+                                return -1;
+                }
         }
 
-        return ret;
+        return source;
+}
+
+int
+_afr_fav_child_reset_sink_xattrs (call_frame_t *frame, xlator_t *this,
+                                  inode_t *inode, int source,
+                                  unsigned char *healed_sinks,
+                                  unsigned char *undid_pending,
+                                  afr_transaction_type type,
+                                  unsigned char *locked_on,
+                                  struct afr_reply *replies)
+{
+        afr_private_t *priv = NULL;
+        afr_local_t *local = NULL;
+        int *input_dirty = NULL;
+        int **input_matrix = NULL;
+	int *output_dirty = NULL;
+	int **output_matrix = NULL;
+        dict_t *xattr = NULL;
+        dict_t *xdata = NULL;
+        int i = 0;
+
+        priv = this->private;
+        local = frame->local;
+
+        if (!dict_get (local->xdata_req, "fav-child-policy"))
+                return 0;
+
+        xdata = dict_new();
+        if (!xdata)
+                return -1;
+
+        input_dirty = alloca0 (priv->child_count * sizeof (int));
+	input_matrix = ALLOC_MATRIX (priv->child_count, int);
+	output_dirty = alloca0 (priv->child_count * sizeof (int));
+	output_matrix = ALLOC_MATRIX (priv->child_count, int);
+
+        afr_selfheal_extract_xattr (this, replies, type, input_dirty,
+                                    input_matrix);
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (i == source || !healed_sinks[i])
+                        continue;
+                output_dirty[i] = -input_dirty[i];
+                output_matrix[i][source] = -input_matrix[i][source];
+        }
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (!healed_sinks[i] || !locked_on[i])
+                        continue;
+                xattr = afr_selfheal_output_xattr (this, _gf_false, type,
+                                                   output_dirty, output_matrix,
+                                                   i, NULL);
+
+                afr_selfheal_post_op (frame, this, inode, i, xattr, xdata);
+
+                undid_pending[i] = 1;
+                dict_unref (xattr);
+        }
+
+        if (xdata)
+                dict_unref (xdata);
+
+        return 0;
 }
 
 gf_boolean_t
@@ -1906,11 +2003,15 @@ afr_selfheal (xlator_t *this, uuid_t gfid)
 {
         int           ret   = -1;
 	call_frame_t *frame = NULL;
+        afr_local_t *local = NULL;
 
 	frame = afr_frame_create (this);
 	if (!frame)
 		return ret;
 
+        local = frame->local;
+        local->xdata_req = dict_new();
+
         ret = afr_selfheal_do (frame, this, gfid);
 
 	if (frame)
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index fbbbd19..d032284 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -580,6 +580,7 @@ __afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
                                      unsigned char *sinks,
 				     unsigned char *healed_sinks,
 				     unsigned char *locked_on,
+				     unsigned char *undid_pending,
 				     struct afr_reply *replies,
                                      uint64_t *witness)
 {
@@ -603,6 +604,11 @@ __afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
                                  "file=%s", this->name, uuid_utoa(inode->gfid));
                         return -EIO;
                 }
+
+                _afr_fav_child_reset_sink_xattrs (frame, this, inode, source,
+                                                 healed_sinks, undid_pending,
+                                                 AFR_DATA_TRANSACTION,
+                                                 locked_on, replies);
                 return source;
 	}
 
@@ -642,6 +648,7 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
                              inode_t *inode, unsigned char *locked_on,
                              unsigned char *sources, unsigned char *sinks,
                              unsigned char *healed_sinks,
+                             unsigned char *undid_pending,
 			     struct afr_reply *replies, gf_boolean_t *pflag)
 {
 	int ret = -1;
@@ -677,8 +684,8 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
 	source = __afr_selfheal_data_finalize_source (frame, this, inode,
                                                       sources, sinks,
                                                       healed_sinks,
-                                                      locked_on, replies,
-                                                      witness);
+                                                      locked_on, undid_pending,
+                                                      replies, witness);
 	if (source < 0)
 		return -EIO;
 
@@ -696,6 +703,7 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
 	unsigned char *sinks = NULL;
 	unsigned char *data_lock = NULL;
 	unsigned char *healed_sinks = NULL;
+	unsigned char *undid_pending = NULL;
 	struct afr_reply *locked_replies = NULL;
 	int source = -1;
         gf_boolean_t did_sh = _gf_true;
@@ -707,6 +715,7 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
 	sinks = alloca0 (priv->child_count);
 	healed_sinks = alloca0 (priv->child_count);
 	data_lock = alloca0 (priv->child_count);
+        undid_pending = alloca0 (priv->child_count);
 
 	locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
 
@@ -726,9 +735,8 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
 
 		ret = __afr_selfheal_data_prepare (frame, this, fd->inode,
                                                    data_lock, sources, sinks,
-                                                   healed_sinks,
-						   locked_replies,
-                                                   NULL);
+                                                   healed_sinks, undid_pending,
+                                                   locked_replies, NULL);
 		if (ret < 0)
 			goto unlock;
 
@@ -787,7 +795,7 @@ restore_time:
         }
         ret = afr_selfheal_undo_pending (frame, this, fd->inode,
                                          sources, sinks, healed_sinks,
-                                         AFR_DATA_TRANSACTION,
+                                         undid_pending, AFR_DATA_TRANSACTION,
                                          locked_replies, data_lock);
 skip_undo_pending:
 	afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index add9e28..6b5340a 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -914,6 +914,7 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
 	unsigned char          *data_lock             = NULL;
         unsigned char          *postop_lock           = NULL;
 	unsigned char          *healed_sinks          = NULL;
+	unsigned char          *undid_pending         = NULL;
 	struct afr_reply       *locked_replies        = NULL;
         afr_local_t            *local                 = NULL;
 	afr_private_t          *priv                  = NULL;
@@ -925,6 +926,7 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
 	sources = alloca0 (priv->child_count);
 	sinks = alloca0 (priv->child_count);
 	healed_sinks = alloca0 (priv->child_count);
+	undid_pending = alloca0 (priv->child_count);
 	data_lock = alloca0 (priv->child_count);
         postop_lock = alloca0 (priv->child_count);
 
@@ -997,6 +999,7 @@ unlock:
 
                 ret = afr_selfheal_undo_pending (frame, this, fd->inode,
                                                  sources, sinks, healed_sinks,
+                                                 undid_pending,
                                                  AFR_ENTRY_TRANSACTION,
                                                  locked_replies, postop_lock);
         }
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 9dfe4a1..5839ddc 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -203,6 +203,7 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
                                          unsigned char *sources,
                                          unsigned char *sinks,
 					 unsigned char *healed_sinks,
+					 unsigned char *undid_pending,
 					 unsigned char *locked_on,
 					 struct afr_reply *replies)
 {
@@ -224,8 +225,14 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
                                                             healed_sinks,
                                                             locked_on, replies,
                                                       AFR_METADATA_TRANSACTION);
-                if (source >= 0)
+                if (source >= 0) {
+                        _afr_fav_child_reset_sink_xattrs (frame, this, inode,
+                                                         source, healed_sinks,
+                                                         undid_pending,
+                                                       AFR_METADATA_TRANSACTION,
+                                                         locked_on, replies);
                         return source;
+                }
 
 		/* If this is a directory mtime/ctime only split brain
 		   use the most recent */
@@ -308,6 +315,7 @@ int
 __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode,
 				 unsigned char *locked_on, unsigned char *sources,
 				 unsigned char *sinks, unsigned char *healed_sinks,
+                                 unsigned char *undid_pending,
 				 struct afr_reply *replies, gf_boolean_t *pflag)
 {
 	int ret = -1;
@@ -362,6 +370,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
 	source = __afr_selfheal_metadata_finalize_source (frame, this, inode,
                                                           sources, sinks,
                                                           healed_sinks,
+                                                          undid_pending,
                                                           locked_on, replies);
 
 	if (source < 0)
@@ -379,6 +388,7 @@ afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
 	unsigned char *sinks = NULL;
 	unsigned char *data_lock = NULL;
 	unsigned char *healed_sinks = NULL;
+	unsigned char *undid_pending = NULL;
 	struct afr_reply *locked_replies = NULL;
         gf_boolean_t did_sh = _gf_true;
 	int source = -1;
@@ -388,6 +398,7 @@ afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
 	sources = alloca0 (priv->child_count);
 	sinks = alloca0 (priv->child_count);
 	healed_sinks = alloca0 (priv->child_count);
+	undid_pending = alloca0 (priv->child_count);
 	data_lock = alloca0 (priv->child_count);
 
 	locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
@@ -403,6 +414,7 @@ afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
 		ret = __afr_selfheal_metadata_prepare (frame, this, inode,
                                                        data_lock, sources,
                                                        sinks, healed_sinks,
+                                                       undid_pending,
 						       locked_replies, NULL);
 		if (ret < 0)
 			goto unlock;
@@ -421,6 +433,7 @@ afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
 
                 ret = afr_selfheal_undo_pending (frame, this, inode, sources,
                                                  sinks, healed_sinks,
+                                                 undid_pending,
                                                  AFR_METADATA_TRANSACTION,
                                                  locked_replies, data_lock);
 	}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index ec5337e..80b7f3a 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -167,8 +167,10 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
 int
 afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
 			   unsigned char *sources, unsigned char *sinks,
-			   unsigned char *healed_sinks, afr_transaction_type type,
-			   struct afr_reply *replies, unsigned char *locked_on);
+			   unsigned char *healed_sinks,
+                           unsigned char *undid_pending,
+                           afr_transaction_type type, struct afr_reply *replies,
+                           unsigned char *locked_on);
 
 int
 afr_selfheal_recreate_entry (xlator_t *this, int dst, int source, inode_t *dir,
@@ -229,6 +231,19 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
                                    afr_transaction_type type);
 
 int
+afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies,
+                          inode_t *inode, char **policy_str);
+
+int
+_afr_fav_child_reset_sink_xattrs (call_frame_t *frame, xlator_t *this,
+                                 inode_t *inode, int source,
+                                 unsigned char *healed_sinks,
+                                 unsigned char *undid_pending,
+                                 afr_transaction_type type,
+                                 unsigned char *locked_on,
+                                 struct afr_reply *replies);
+
+int
 afr_get_child_index_from_name (xlator_t *this, char *name);
 
 gf_boolean_t
@@ -239,8 +254,8 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
                              inode_t *inode, unsigned char *locked_on,
                              unsigned char *sources,
                              unsigned char *sinks, unsigned char *healed_sinks,
-                             struct afr_reply *replies,
-                             gf_boolean_t *flag);
+                             unsigned char *undid_pending,
+                             struct afr_reply *replies, gf_boolean_t *flag);
 
 int
 __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this,
@@ -248,6 +263,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this,
                                  unsigned char *sources,
                                  unsigned char *sinks,
                                  unsigned char *healed_sinks,
+                                 unsigned char *undid_pending,
                                  struct afr_reply *replies,
                                  gf_boolean_t *flag);
 int
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 1ed327c..02a968e 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -2534,19 +2534,12 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
         if (err) {
                 local->op_errno = -err;
                 local->op_ret = -1;
-                goto fail;
-        }
-	ret = afr_inode_get_readable (frame, local->inode, this,
-                                      local->readable, NULL,
-				      local->transaction.type);
-        if (ret < 0) {
                 gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_SPLIT_BRAIN,
                         "Failing %s on gfid %s: split-brain observed.",
                         gf_fop_list[local->op], uuid_utoa (local->inode->gfid));
-                local->op_ret = -1;
-                local->op_errno = -ret;
                 goto fail;
         }
+
         afr_transaction_start (frame, this);
         return 0;
 fail:
-- 
2.9.3