Blob Blame History Raw
From fc94bc56c6ba0eb9876be606092d6aa8407af0ae Mon Sep 17 00:00:00 2001
From: Poornima G <pgurusid@redhat.com>
Date: Thu, 13 Apr 2017 16:20:29 +0530
Subject: [PATCH 383/393] dht: Add readdir-ahead in rebalance graph if
 parallel-readdir is on

Issue:
The value of the linkto xattr is generally the name of dht's
next subvol. This requires that the next subvol of dht does not
change for the lifetime of the volume. But with parallel readdir,
the readdir-ahead xlator loaded below dht is optional.
The linkto xattr for the first subvol, when:
- parallel readdir is enabled : "<volname>-readdir-ahead-0"
- plain distribute volume : "<volname>-client-0"
- distribute replicate volume : "<volname>-afr-0"

The value of the linkto xattr is "<volname>-readdir-ahead-0" when
parallel readdir is enabled, and "<volname>-client-0" when it is
disabled. But dht_lookup takes care of healing if it cannot
identify which subvol the linkto xattr points to.

In dht_lookup_cbk, if the linkto xattr is found to be
"<volname>-client-0" and parallel readdir is enabled, dht cannot
understand the value "<volname>-client-0", as it expects
"<volname>-readdir-ahead-0". In that case, dht_lookup_everywhere is
issued, and then the linkto file is unlinked and recreated with the
right linkto xattr. The issue arises when parallel readdir is enabled
and the mount point accesses a file that is currently being migrated.
Since the rebalance process doesn't have the parallel-readdir feature,
it expects "<volname>-client-0", whereas the mount expects
"<volname>-readdir-ahead-0". Thus at some point either the mount or
the rebalance will fail.

Solution:
Enable parallel-readdir for rebalance as well, and do not
allow enabling/disabling parallel-readdir while rebalance is in
progress.

>Reviewed-on: https://review.gluster.org/17056
>Smoke: Gluster Build System <jenkins@build.gluster.org>
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
>Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
>Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
>Signed-off-by: Poornima G <pgurusid@redhat.com>

Change-Id: I241ab966bdd850e667f7768840540546f5289483
BUG: 1442026
Signed-off-by: Poornima G <pgurusid@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/103637
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 libglusterfs/src/inode.c                        |  9 +++++
 tests/bugs/distribute/bug-1161311.t             |  1 +
 tests/bugs/readdir-ahead/bug-1436090.t          | 44 +++++++++++++++++++++++++
 xlators/mgmt/glusterd/src/glusterd-volgen.c     |  6 ++--
 xlators/mgmt/glusterd/src/glusterd-volume-set.c | 26 ++++++++++++++-
 5 files changed, 81 insertions(+), 5 deletions(-)
 create mode 100755 tests/bugs/readdir-ahead/bug-1436090.t

diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c
index 650a301..747c1f1 100644
--- a/libglusterfs/src/inode.c
+++ b/libglusterfs/src/inode.c
@@ -2528,6 +2528,15 @@ inode_ctx_size (inode_t *inode)
                         old_THIS = THIS;
                         THIS = xl;
 
+                        /* If inode ref is taken when THIS is global xlator,
+                         * the ctx xl_key is set, but the value is NULL.
+                         * For global xlator the cbks can be NULL, hence check
+                         * for the same */
+                        if (!xl->cbks) {
+                                THIS = old_THIS;
+                                continue;
+                        }
+
                         if (xl->cbks->ictxsize)
                                 size += xl->cbks->ictxsize (xl, inode);
 
diff --git a/tests/bugs/distribute/bug-1161311.t b/tests/bugs/distribute/bug-1161311.t
index c5a7f04..93e9d03 100755
--- a/tests/bugs/distribute/bug-1161311.t
+++ b/tests/bugs/distribute/bug-1161311.t
@@ -43,6 +43,7 @@ EXPECT "$V0" volinfo_field $V0 'Volume Name';
 EXPECT 'Created' volinfo_field $V0 'Status';
 EXPECT '3' brick_count $V0
 
+TEST $CLI volume set $V0 parallel-readdir on
 TEST $CLI volume start $V0;
 EXPECT 'Started' volinfo_field $V0 'Status';
 
diff --git a/tests/bugs/readdir-ahead/bug-1436090.t b/tests/bugs/readdir-ahead/bug-1436090.t
new file mode 100755
index 0000000..58e9093
--- /dev/null
+++ b/tests/bugs/readdir-ahead/bug-1436090.t
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../cluster.rc
+
+cleanup;
+
+TEST launch_cluster 2;
+TEST $CLI_1 peer probe $H2;
+EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count
+
+$CLI_1 volume create $V0 $H1:$B1/$V0  $H2:$B2/$V0
+EXPECT 'Created' cluster_volinfo_field 1 $V0 'Status';
+
+$CLI_1 volume start $V0
+EXPECT 'Started' cluster_volinfo_field 1 $V0 'Status';
+
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
+TEST mkdir $M0/dir1
+
+# Create a large file (3.2 GB), so that rebalance takes time
+# Reading from /dev/urandom is slow, so we will cat it together
+dd if=/dev/urandom of=/tmp/FILE2 bs=64k count=10240
+for i in {1..5}; do
+  cat /tmp/FILE2 >> $M0/dir1/foo
+done
+
+TEST mv $M0/dir1/foo $M0/dir1/bar
+
+TEST $CLI_1 volume rebalance $V0 start force
+TEST ! $CLI_1 volume set $V0 parallel-readdir on
+EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" cluster_rebalance_status_field 1 $V0
+EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" cluster_rebalance_status_field 2 $V0
+TEST $CLI_1 volume set $V0 parallel-readdir on
+TEST mv $M0/dir1/bar $M0/dir1/foo
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
+TEST $CLI_1 volume rebalance $V0 start force
+TEST ln $M0/dir1/foo $M0/dir1/bar
+EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" cluster_rebalance_status_field 1 $V0
+EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" cluster_rebalance_status_field 2 $V0
+cleanup;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index 6e52d44..faa6c72 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -3308,8 +3308,7 @@ volgen_graph_build_readdir_ahead (volgen_graph_t *graph,
 {
         int32_t                 clusters                 = 0;
 
-        if (graph->type == GF_REBALANCED ||
-            graph->type == GF_QUOTAD ||
+        if (graph->type == GF_QUOTAD ||
             graph->type == GF_SNAPD ||
             !glusterd_volinfo_get_boolean (volinfo, VKEY_PARALLEL_READDIR) ||
             !glusterd_volinfo_get_boolean (volinfo, VKEY_READDIR_AHEAD))
@@ -3617,8 +3616,7 @@ client_graph_set_rda_options (volgen_graph_t *graph,
         if (dist_count <= 1)
                 goto out;
 
-        if (graph->type == GF_REBALANCED ||
-            graph->type == GF_QUOTAD ||
+        if (graph->type == GF_QUOTAD ||
             graph->type == GF_SNAPD ||
             !glusterd_volinfo_get_boolean (volinfo, VKEY_PARALLEL_READDIR) ||
             !glusterd_volinfo_get_boolean (volinfo, VKEY_READDIR_AHEAD))
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 4a1c780..8e729da 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -919,6 +919,30 @@ out:
 
 
 static int
+validate_parallel_readdir (glusterd_volinfo_t *volinfo, dict_t *dict,
+                           char *key, char *value, char **op_errstr)
+{
+        int ret             =       -1;
+
+        ret = validate_boolean (volinfo, dict, key, value, op_errstr);
+        if (ret)
+                goto out;
+
+        ret = glusterd_is_defrag_on (volinfo);
+        if (ret) {
+                gf_asprintf (op_errstr, "%s option should be set "
+                             "after rebalance is complete", key);
+                gf_msg ("glusterd", GF_LOG_ERROR, 0,
+                        GD_MSG_INVALID_ENTRY, "%s", *op_errstr);
+        }
+out:
+        gf_msg_debug ("glusterd", 0, "Returning %d", ret);
+
+        return ret;
+}
+
+
+static int
 validate_worm_period (glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
                char *value, char **op_errstr)
 {
@@ -3024,7 +3048,7 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .value       = "off",
           .type        = DOC,
           .op_version  = GD_OP_VERSION_3_10_0,
-          .validate_fn = validate_boolean,
+          .validate_fn = validate_parallel_readdir,
           .description = "If this option is enabled, the readdir operation is "
                          "performed parallely on all the bricks, thus improving"
                          " the performance of readdir. Note that the performance"
-- 
1.8.3.1