From bebc9766e75e2d7dd1f744206e16c296189762aa Mon Sep 17 00:00:00 2001
From: Pranith Kumar K <pkarampu@redhat.com>
Date: Mon, 12 Jun 2017 22:06:18 +0530
Subject: [PATCH 520/525] cluster/afr: Implement quorum for lk fop

Problem:
At the moment, with a replica 3 or arbiter setup, we return success to
the application even when lk succeeds on just one brick, which is
wrong.

Fix:
When quorum is enabled, treat lk as successful only if it succeeds on
at least a quorum of bricks; otherwise unlock the bricks that did grant
the lock and fail the fop.
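
A minimal sketch of the behaviour expected after this patch, assuming a
replica 3 volume $V0 mounted at $M0 and the kill_brick helper from the
test framework's volume.rc (this mirrors tests/basic/afr/lk-quorum.t
added below):

    # Open an fd on a file in the mount
    exec 200>$M0/a

    # Bring down two of the three bricks: quorum is lost
    kill_brick $V0 $H0 $B0/${V0}0
    kill_brick $V0 $H0 $B0/${V0}1

    # Previously the lock was granted on the lone surviving brick;
    # with quorum enforced the request now fails
    flock -x 200 || echo "lock denied: no quorum"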

 >BUG: 1461792
 >Change-Id: I5789e6eb5defb68f8a0eb9cd594d316f5cdebaea
 >Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
 >Reviewed-on: https://review.gluster.org/17524
 >Smoke: Gluster Build System <jenkins@build.gluster.org>
 >NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
 >CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
 >Reviewed-by: Ravishankar N <ravishankar@redhat.com>

BUG: 1463104
Change-Id: I5789e6eb5defb68f8a0eb9cd594d316f5cdebaea
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/109473
---
 tests/basic/afr/lk-quorum.t          | 255 +++++++++++++++++++++++++++++++++++
 xlators/cluster/afr/src/afr-common.c |  56 +++++---
 xlators/cluster/afr/src/afr.h        |   5 -
 3 files changed, 293 insertions(+), 23 deletions(-)
 create mode 100644 tests/basic/afr/lk-quorum.t

diff --git a/tests/basic/afr/lk-quorum.t b/tests/basic/afr/lk-quorum.t
new file mode 100644
index 0000000..ad14365
--- /dev/null
+++ b/tests/basic/afr/lk-quorum.t
@@ -0,0 +1,255 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fileio.rc
+cleanup;
+
+TEST glusterd;
+TEST pidof glusterd
+
+#Tests for quorum-type option for replica 2
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1};
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.open-behind off
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume start $V0
+TEST $GFS -s $H0 --volfile-id=$V0 --direct-io-mode=enable $M0;
+
+TEST touch $M0/a
+
+#When all bricks are up, lock and unlock should succeed
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST flock -x $fd1
+TEST fd_close $fd1
+
+#When all bricks are down, lock/unlock should fail
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST $CLI volume stop $V0
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST fd_close $fd1
+
+#Check locking behavior with quorum 'fixed' and quorum-count 2
+TEST $CLI volume set $V0 cluster.quorum-type fixed
+TEST $CLI volume set $V0 cluster.quorum-count 2
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "^fixed$" mount_get_option_value $M0 $V0-replicate-0 quorum-type
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "^2$" mount_get_option_value $M0 $V0-replicate-0 quorum-count
+
+#When all bricks are up, lock and unlock should succeed
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST flock -x $fd1
+TEST fd_close $fd1
+
+#When all bricks are down, lock/unlock should fail
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST $CLI volume stop $V0
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST fd_close $fd1
+
+#When any of the bricks is down, lock/unlock should fail
+#kill first brick
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST fd_close $fd1
+
+#kill 2nd brick
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST fd_close $fd1
+
+#Check locking behavior with quorum 'fixed' and quorum-count 1
+TEST $CLI volume set $V0 cluster.quorum-count 1
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "^1$" mount_get_option_value $M0 $V0-replicate-0 quorum-count
+
+#When all bricks are up, lock and unlock should succeed
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST flock -x $fd1
+TEST fd_close $fd1
+
+#When all bricks are down, lock/unlock should fail
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST $CLI volume stop $V0
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST fd_close $fd1
+
+#When any of the bricks is down, lock/unlock should succeed
+#kill first brick
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST fd_close $fd1
+
+#kill 2nd brick
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST fd_close $fd1
+
+#Check locking behavior with quorum 'auto'
+TEST $CLI volume set $V0 cluster.quorum-type auto
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "^auto$" mount_get_option_value $M0 $V0-replicate-0 quorum-type
+
+#When all bricks are up, lock and unlock should succeed
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST flock -x $fd1
+TEST fd_close $fd1
+
+#When all bricks are down, lock/unlock should fail
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST $CLI volume stop $V0
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST fd_close $fd1
+
+#When the first brick is down, lock/unlock should fail
+#kill first brick
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST fd_close $fd1
+
+#When the second brick is down, lock/unlock should succeed
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST fd_close $fd1
+
+cleanup;
+TEST glusterd;
+TEST pidof glusterd
+
+#Tests for replica 3
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.open-behind off
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume start $V0
+TEST $GFS -s $H0 --volfile-id=$V0 --direct-io-mode=enable $M0;
+
+TEST touch $M0/a
+
+#When all bricks are up, lock and unlock should succeed
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST flock -x $fd1
+TEST fd_close $fd1
+
+#When all bricks are down, lock/unlock should fail
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST $CLI volume stop $V0
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+TEST fd_close $fd1
+
+#When any of the bricks is down, lock/unlock should succeed
+#kill first brick
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST fd_close $fd1
+
+#kill 2nd brick
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST fd_close $fd1
+
+#kill 3rd brick
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+TEST fd_close $fd1
+
+#When any two of the bricks are down, lock/unlock should fail
+#kill first,second bricks
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST fd_close $fd1
+
+#kill 2nd,3rd bricks
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+TEST fd_close $fd1
+
+#kill 1st,3rd brick
+TEST fd1=`fd_available`
+TEST fd_open $fd1 'w' $M0/a
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST ! flock -x $fd1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST fd_close $fd1
+
+cleanup
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 4b8334d..d96a819 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -3754,7 +3754,7 @@ unwind:
 
 static int
 afr_common_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
         afr_local_t *local = NULL;
         int child_index = (long)cookie;
@@ -4123,15 +4123,27 @@ afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                    int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
                    dict_t *xdata)
 {
-        afr_local_t * local = NULL;
+        afr_local_t *local = NULL;
+        afr_private_t *priv = this->private;
         int call_count = -1;
+        int child_index = (long)cookie;
 
         local = frame->local;
-        call_count = afr_frame_return (frame);
 
+        if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+                gf_msg (this->name, GF_LOG_ERROR, op_errno,
+                        AFR_MSG_UNLOCK_FAIL,
+                        "gfid=%s: unlock failed on subvolume %s "
+                        "with lock owner %s",
+                        uuid_utoa (local->fd->inode->gfid),
+                        priv->children[child_index]->name,
+                        lkowner_utoa (&frame->root->lk_owner));
+        }
+
+        call_count = afr_frame_return (frame);
         if (call_count == 0)
                 AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
-                                  lock, xdata);
+                                  NULL, local->xdata_rsp);
 
         return 0;
 }
@@ -4153,7 +4165,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this)
 
         if (call_count == 0) {
                 AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
-                                  &local->cont.lk.ret_flock, NULL);
+                                  NULL, local->xdata_rsp);
                 return 0;
         }
 
@@ -4163,8 +4175,8 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this)
 
         for (i = 0; i < priv->child_count; i++) {
                 if (local->cont.lk.locked_nodes[i]) {
-                        STACK_WIND (frame, afr_lk_unlock_cbk,
-                                    priv->children[i],
+                        STACK_WIND_COOKIE (frame, afr_lk_unlock_cbk,
+                                    (void *) (long) i, priv->children[i],
                                     priv->children[i]->fops->lk,
                                     local->fd, F_SETLK,
                                     &local->cont.lk.user_flock, NULL);
@@ -4180,12 +4192,12 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this)
 
 int32_t
 afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-            int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+            int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+            dict_t *xdata)
 {
         afr_local_t *local = NULL;
         afr_private_t *priv = NULL;
         int child_index = -1;
-/*        int            ret  = 0; */
 
 
         local = frame->local;
@@ -4193,9 +4205,10 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
 
         child_index = (long) cookie;
 
-        if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) {
+        afr_common_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata);
+        if (op_ret < 0 && op_errno == EAGAIN) {
                 local->op_ret   = -1;
-                local->op_errno = op_errno;
+                local->op_errno = EAGAIN;
 
                 afr_lk_unlock (frame, this);
                 return 0;
@@ -4215,15 +4228,20 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                                    priv->children[child_index],
                                    priv->children[child_index]->fops->lk,
                                    local->fd, local->cont.lk.cmd,
-                                   &local->cont.lk.user_flock, xdata);
-        } else if (local->op_ret == -1) {
-                /* all nodes have gone down */
+                                   &local->cont.lk.user_flock,
+                                   local->xdata_req);
+        } else if (priv->quorum_count &&
+                   !afr_has_quorum (local->cont.lk.locked_nodes, this)) {
+                local->op_ret   = -1;
+                local->op_errno = afr_final_errno (local, priv);
 
-                AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN,
-                                  &local->cont.lk.ret_flock, NULL);
+                afr_lk_unlock (frame, this);
         } else {
+                if (local->op_ret < 0)
+                        local->op_errno = afr_final_errno (local, priv);
+
                 AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
-                                  &local->cont.lk.ret_flock, NULL);
+                                  &local->cont.lk.ret_flock, local->xdata_rsp);
         }
 
         return 0;
@@ -4258,11 +4276,13 @@ afr_lk (call_frame_t *frame, xlator_t *this,
         local->cont.lk.cmd   = cmd;
         local->cont.lk.user_flock = *flock;
         local->cont.lk.ret_flock = *flock;
+        if (xdata)
+                local->xdata_req = dict_ref (xdata);
 
         STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0,
                            priv->children[i],
                            priv->children[i]->fops->lk,
-                           fd, cmd, flock, xdata);
+                           fd, cmd, flock, local->xdata_req);
 
 	return 0;
 out:
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 05f8249..f6a1a6a 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -864,11 +864,6 @@ typedef struct afr_granular_esh_args {
                                   mismatch */
 } afr_granular_esh_args_t;
 
-/* did a call fail due to a child failing? */
-#define child_went_down(op_ret, op_errno) (((op_ret) < 0) &&            \
-                                           ((op_errno == ENOTCONN) ||   \
-                                            (op_errno == EBADFD)))
-
 int
 afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,
                         unsigned char *readable, int *event_p, int type);
-- 
1.8.3.1