12a457
From 231cea9678734044d2294e74d3742e61058bc760 Mon Sep 17 00:00:00 2001
12a457
From: Pranith Kumar K <pkarampu@redhat.com>
12a457
Date: Fri, 27 May 2016 15:47:07 +0530
12a457
Subject: [PATCH 175/178] cluster/afr: Unwind xdata_rsp even in case of failures
12a457
12a457
DHT expects GF_PREOP_CHECK_FAILED to be present in xdata_rsp when mkdir fails
12a457
because of a stale layout, but AFR was unwinding a NULL xdata_rsp in case of
12a457
failures. This was leading to mkdir failures just after remove-brick. Unwind
12a457
the xdata_rsp even on failure so that the response from the brick reaches
12a457
DHT.
12a457
12a457
 >BUG: 1340623
12a457
 >Change-Id: Idd3f7b95730e8ea987b608e892011ff190e181d1
12a457
 >Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
12a457
 >Reviewed-on: http://review.gluster.org/14553
12a457
 >NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
12a457
 >Reviewed-by: Ravishankar N <ravishankar@redhat.com>
12a457
 >Smoke: Gluster Build System <jenkins@build.gluster.com>
12a457
 >CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
12a457
 >Reviewed-by: Anuradha Talur <atalur@redhat.com>
12a457
 >Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
12a457
12a457
BUG: 1340085
12a457
Change-Id: I7fb653cea3e888de9f0db6f96a32156ace3c3fea
12a457
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
12a457
Reviewed-on: https://code.engineering.redhat.com/gerrit/75455
12a457
---
12a457
 xlators/cluster/afr/src/afr-common.c      |   29 +++++++++---
12a457
 xlators/cluster/afr/src/afr-dir-read.c    |    3 +-
12a457
 xlators/cluster/afr/src/afr-dir-write.c   |   20 ++++++---
12a457
 xlators/cluster/afr/src/afr-inode-write.c |    6 +-
12a457
 xlators/cluster/afr/src/afr-transaction.c |   71 +++++++++++++++++++++++++++++
12a457
 xlators/cluster/afr/src/afr-transaction.h |    4 ++
12a457
 xlators/cluster/afr/src/afr.h             |    8 ++-
12a457
 7 files changed, 120 insertions(+), 21 deletions(-)
12a457
12a457
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
12a457
index 3bed0c5..605c641 100644
12a457
--- a/xlators/cluster/afr/src/afr-common.c
12a457
+++ b/xlators/cluster/afr/src/afr-common.c
12a457
@@ -1274,6 +1274,7 @@ afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
12a457
 
12a457
 int
12a457
 afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
12a457
+                     unsigned char *readables,
12a457
 		     int *event_p, afr_transaction_type type,
12a457
                      afr_read_subvol_args_t *args)
12a457
 {
12a457
@@ -1310,6 +1311,9 @@ afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
12a457
 		*subvol_p = subvol;
12a457
 	if (event_p)
12a457
 		*event_p = event;
12a457
+        if (readables)
12a457
+                memcpy (readables, readable,
12a457
+                        sizeof (*readables) * priv->child_count);
12a457
 	return subvol;
12a457
 }
12a457
 
12a457
@@ -1439,6 +1443,7 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
12a457
         GF_FREE (local->read_attempted);
12a457
 
12a457
         GF_FREE (local->readable);
12a457
+        GF_FREE (local->readable2);
12a457
 
12a457
 	if (local->inode)
12a457
 		inode_unref (local->inode);
12a457
@@ -1598,8 +1603,8 @@ afr_get_parent_read_subvol (xlator_t *this, inode_t *parent,
12a457
         priv = this->private;
12a457
 
12a457
         if (parent)
12a457
-                par_read_subvol = afr_data_subvol_get (parent, this, 0, 0,
12a457
-                                                       NULL);
12a457
+                par_read_subvol = afr_data_subvol_get (parent, this, NULL, NULL,
12a457
+                                                       NULL, NULL);
12a457
 
12a457
         for (i = 0; i < priv->child_count; i++) {
12a457
                 if (!replies[i].valid)
12a457
@@ -1638,8 +1643,7 @@ afr_read_subvol_decide (inode_t *inode, xlator_t *this,
12a457
         int data_subvol  = -1;
12a457
         int mdata_subvol = -1;
12a457
 
12a457
-        data_subvol = afr_data_subvol_get (inode, this,
12a457
-                                           0, 0, args);
12a457
+        data_subvol = afr_data_subvol_get (inode, this, NULL, NULL, NULL, args);
12a457
         mdata_subvol = afr_metadata_subvol_get (inode, this,
12a457
                                                 0, 0, args);
12a457
         if (data_subvol == -1 || mdata_subvol == -1)
12a457
@@ -1787,7 +1791,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
12a457
 			goto cant_interpret;
12a457
 		} else {
12a457
                         read_subvol = afr_data_subvol_get (local->inode, this,
12a457
-                                                           0, 0, &args);
12a457
+                                                       NULL, NULL, NULL, &args);
12a457
 		}
12a457
 	} else {
12a457
 	cant_interpret:
12a457
@@ -2414,7 +2418,7 @@ afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req
12a457
 		return 0;
12a457
 	}
12a457
 
12a457
-	afr_read_subvol_get (loc->inode, this, NULL, &event,
12a457
+	afr_read_subvol_get (loc->inode, this, NULL, NULL, &event,
12a457
 			     AFR_DATA_TRANSACTION, NULL);
12a457
 
12a457
 	if (event != local->event_generation)
12a457
@@ -2565,7 +2569,7 @@ afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
12a457
                 }
12a457
         }
12a457
 
12a457
-	afr_read_subvol_get (loc->parent, this, NULL, &event,
12a457
+	afr_read_subvol_get (loc->parent, this, NULL, NULL, &event,
12a457
 			     AFR_DATA_TRANSACTION, NULL);
12a457
 
12a457
 	if (event != local->event_generation)
12a457
@@ -2888,7 +2892,8 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
12a457
 
12a457
         local = frame->local;
12a457
 
12a457
-	read_subvol = afr_data_subvol_get (local->inode, this, 0, 0, NULL);
12a457
+	read_subvol = afr_data_subvol_get (local->inode, this, NULL, NULL,
12a457
+                                           NULL, NULL);
12a457
 
12a457
         LOCK (&frame->lock);
12a457
         {
12a457
@@ -4288,6 +4293,14 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
12a457
 		goto out;
12a457
 	}
12a457
 
12a457
+        local->readable2 = GF_CALLOC (priv->child_count, sizeof (char),
12a457
+                                      gf_afr_mt_char);
12a457
+        if (!local->readable2) {
12a457
+                if (op_errno)
12a457
+                        *op_errno = ENOMEM;
12a457
+                goto out;
12a457
+        }
12a457
+
12a457
 	local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
12a457
 				   gf_afr_mt_reply_t);
12a457
 	if (!local->replies) {
12a457
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
12a457
index 559211e..f7da55f 100644
12a457
--- a/xlators/cluster/afr/src/afr-dir-read.c
12a457
+++ b/xlators/cluster/afr/src/afr-dir-read.c
12a457
@@ -158,7 +158,8 @@ afr_validate_read_subvol (inode_t *inode, xlator_t *this, int par_read_subvol)
12a457
          * -1 above due to gen being 0, which is why it is OK to pass NULL for
12a457
          *  read_subvol_args here.
12a457
          */
12a457
-        entry_read_subvol = afr_data_subvol_get (inode, this, 0, 0, NULL);
12a457
+        entry_read_subvol = afr_data_subvol_get (inode, this, NULL, NULL,
12a457
+                                                 NULL, NULL);
12a457
         if (entry_read_subvol != par_read_subvol)
12a457
                 return -1;
12a457
 
12a457
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
12a457
index b1ff557..bd24276 100644
12a457
--- a/xlators/cluster/afr/src/afr-dir-write.c
12a457
+++ b/xlators/cluster/afr/src/afr-dir-write.c
12a457
@@ -106,18 +106,21 @@ __afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)
12a457
 	if (local->inode) {
12a457
 		afr_replies_interpret (frame, this, local->inode, NULL);
12a457
 		inode_read_subvol = afr_data_subvol_get (local->inode, this,
12a457
-							 NULL, NULL, &args);
12a457
+                                                       NULL, NULL, NULL, &args);
12a457
 	}
12a457
 
12a457
 	if (local->parent)
12a457
 		parent_read_subvol = afr_data_subvol_get (local->parent, this,
12a457
-							  NULL, NULL, NULL);
12a457
+                                             NULL, local->readable, NULL, NULL);
12a457
+
12a457
 	if (local->parent2)
12a457
 		parent2_read_subvol = afr_data_subvol_get (local->parent2, this,
12a457
-							   NULL, NULL, NULL);
12a457
+                                            NULL, local->readable2, NULL, NULL);
12a457
 
12a457
 	local->op_ret = -1;
12a457
 	local->op_errno = afr_final_errno (local, priv);
12a457
+        afr_pick_error_xdata (local, priv, local->parent, local->readable,
12a457
+                              local->parent2, local->readable2);
12a457
 
12a457
 	for (i = 0; i < priv->child_count; i++) {
12a457
 		if (!local->replies[i].valid)
12a457
@@ -149,6 +152,11 @@ __afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)
12a457
 				local->replies[i].preparent2;
12a457
 			local->cont.dir_fop.postnewparent =
12a457
 				local->replies[i].postparent2;
12a457
+                        if (local->xdata_rsp) {
12a457
+                                dict_unref (local->xdata_rsp);
12a457
+                                local->xdata_rsp = NULL;
12a457
+                        }
12a457
+
12a457
 			if (local->replies[i].xdata)
12a457
 				local->xdata_rsp =
12a457
 					dict_ref (local->replies[i].xdata);
12a457
@@ -201,6 +209,9 @@ __afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
12a457
 	local->replies[child_index].valid = 1;
12a457
 	local->replies[child_index].op_ret = op_ret;
12a457
 	local->replies[child_index].op_errno = op_errno;
12a457
+        if (xdata)
12a457
+                local->replies[child_index].xdata = dict_ref (xdata);
12a457
+
12a457
 
12a457
 	if (op_ret >= 0) {
12a457
 		if (poststat)
12a457
@@ -213,9 +224,6 @@ __afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
12a457
 			local->replies[child_index].preparent2 = *preparent2;
12a457
 		if (postparent2)
12a457
 			local->replies[child_index].postparent2 = *postparent2;
12a457
-		if (xdata)
12a457
-			local->replies[child_index].xdata = dict_ref (xdata);
12a457
-
12a457
 		if (fd_ctx)
12a457
 			fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
12a457
 	} else {
12a457
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
12a457
index baa5871..9206681 100644
12a457
--- a/xlators/cluster/afr/src/afr-inode-write.c
12a457
+++ b/xlators/cluster/afr/src/afr-inode-write.c
12a457
@@ -89,7 +89,7 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
12a457
                                                                &args);
12a457
 		else
12a457
 			read_subvol = afr_data_subvol_get (local->inode, this,
12a457
-							   NULL, NULL, &args);
12a457
+                                                       NULL, NULL, NULL, &args);
12a457
 	}
12a457
 
12a457
 	local->op_ret = -1;
12a457
@@ -169,8 +169,8 @@ __afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
12a457
 			local->replies[child_index].poststat = *postbuf;
12a457
 		if (xattr)
12a457
 			local->replies[child_index].xattr = dict_ref (xattr);
12a457
-		if (xdata)
12a457
-			local->replies[child_index].xdata = dict_ref (xdata);
12a457
+                if (xdata)
12a457
+                        local->replies[child_index].xdata = dict_ref (xdata);
12a457
 	} else {
12a457
 		afr_transaction_fop_failed (frame, this, child_index);
12a457
 	}
12a457
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
12a457
index 8b667c9..316c7cd 100644
12a457
--- a/xlators/cluster/afr/src/afr-transaction.c
12a457
+++ b/xlators/cluster/afr/src/afr-transaction.c
12a457
@@ -51,6 +51,66 @@ afr_zero_fill_stat (afr_local_t *local)
12a457
         }
12a457
 }
12a457
 
12a457
+/* On failure afr must choose which reply's xdata from the lower xlators to
12a457
+ * unwind with. Failed replies whose errno is not ENOTCONN are preferred, and
12a457
+ * among those a readable (good) subvol wins. If none qualifies, the first
12a457
+ * failed reply that carries xdata is used. */
12a457
+void
12a457
+afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv,
12a457
+                      inode_t *inode1, unsigned char *readable1,
12a457
+                      inode_t *inode2, unsigned char *readable2)
12a457
+{
12a457
+        int     s = -1;/*selection*/
12a457
+        int     i = 0;
12a457
+        unsigned char *readable = NULL;
12a457
+
12a457
+        if (local->xdata_rsp) {
12a457
+                dict_unref (local->xdata_rsp);
12a457
+                local->xdata_rsp = NULL;
12a457
+        }
12a457
+
12a457
+        readable = alloca0 (priv->child_count * sizeof (*readable));
12a457
+        if (inode2 && readable2) {/*rename fop*/
12a457
+                AFR_INTERSECT (readable, readable1, readable2,
12a457
+                               priv->child_count);
12a457
+        } else {
12a457
+                memcpy (readable, readable1,
12a457
+                        sizeof (*readable) * priv->child_count);
12a457
+        }
12a457
+
12a457
+        for (i = 0; i < priv->child_count; i++) {
12a457
+                if (!local->replies[i].valid)
12a457
+                        continue;
12a457
+
12a457
+                if (local->replies[i].op_ret >= 0)
12a457
+                        continue;
12a457
+
12a457
+                if (local->replies[i].op_errno == ENOTCONN)
12a457
+                        continue;
12a457
+
12a457
+                /* s < 0 must be tested first so readable[-1] is never read */
12a457
+                if ((s < 0) || (!readable[s] && readable[i]))
12a457
+                        s = i;
12a457
+        }
12a457
+
12a457
+        if (s != -1 && local->replies[s].xdata) {
12a457
+                local->xdata_rsp = dict_ref (local->replies[s].xdata);
12a457
+        } else if (s == -1) {
12a457
+                for (i = 0; i < priv->child_count; i++) {
12a457
+                        if (!local->replies[i].valid)
12a457
+                                continue;
12a457
+
12a457
+                        if (local->replies[i].op_ret >= 0)
12a457
+                                continue;
12a457
+
12a457
+                        if (!local->replies[i].xdata)
12a457
+                                continue;
12a457
+                        local->xdata_rsp = dict_ref (local->replies[i].xdata);
12a457
+                        break;
12a457
+                }
12a457
+        }
12a457
+}
12a457
+
12a457
 gf_boolean_t
12a457
 afr_needs_changelog_update (afr_local_t *local)
12a457
 {
12a457
@@ -741,6 +801,17 @@ afr_handle_quorum (call_frame_t *frame)
12a457
         local->op_errno = afr_final_errno (local, priv);
12a457
         if (local->op_errno == 0)
12a457
                 local->op_errno = afr_quorum_errno (priv);
12a457
+        switch (local->transaction.type) {
12a457
+        case AFR_ENTRY_TRANSACTION:
12a457
+        case AFR_ENTRY_RENAME_TRANSACTION:
12a457
+                afr_pick_error_xdata (local, priv, local->parent,
12a457
+                                      local->readable, local->parent2,
12a457
+                                      local->readable2);
12a457
+                break;
12a457
+        default:
12a457
+                /* TBD: no error xdata is picked for other types yet */
12a457
+                break;
12a457
+        }
12a457
 }
12a457
 
12a457
 int
12a457
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
12a457
index c58531e..ca8fcfe 100644
12a457
--- a/xlators/cluster/afr/src/afr-transaction.h
12a457
+++ b/xlators/cluster/afr/src/afr-transaction.h
12a457
@@ -55,4 +55,8 @@ gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this);
12a457
 gf_boolean_t afr_needs_changelog_update (afr_local_t *local);
12a457
 void afr_zero_fill_stat (afr_local_t *local);
12a457
 
12a457
+void
12a457
+afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv,
12a457
+                      inode_t *inode1, unsigned char *readable1,
12a457
+                      inode_t *inode2, unsigned char *readable2);
12a457
 #endif /* __TRANSACTION_H__ */
12a457
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
12a457
index 6370577..63be1fb 100644
12a457
--- a/xlators/cluster/afr/src/afr.h
12a457
+++ b/xlators/cluster/afr/src/afr.h
12a457
@@ -417,6 +417,7 @@ typedef struct _afr_local {
12a457
 	   performed. This is the output of afr_inode_refresh()
12a457
 	*/
12a457
 	unsigned char *readable;
12a457
+	unsigned char *readable2; /*For rename transaction*/
12a457
 
12a457
 	afr_inode_refresh_cbk_t refreshfn;
12a457
 
12a457
@@ -836,14 +837,15 @@ afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
12a457
 				int type);
12a457
 int
12a457
 afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
12a457
+                     unsigned char *readables,
12a457
 		     int *event_p, afr_transaction_type type,
12a457
                      afr_read_subvol_args_t *args);
12a457
 
12a457
-#define afr_data_subvol_get(i, t, s, e, a) \
12a457
-	afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION, a)
12a457
+#define afr_data_subvol_get(i, t, s, r, e, a) \
12a457
+	afr_read_subvol_get(i, t, s, r, e, AFR_DATA_TRANSACTION, a)
12a457
 
12a457
 #define afr_metadata_subvol_get(i, t, s, e, a) \
12a457
-	afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION, a)
12a457
+	afr_read_subvol_get(i, t, s, NULL, e, AFR_METADATA_TRANSACTION, a)
12a457
 
12a457
 int
12a457
 afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
12a457
-- 
12a457
1.7.1
12a457