7f4c2a
From ede515d765e55744bcbaa199a9ed703d265aa88b Mon Sep 17 00:00:00 2001
7f4c2a
From: Shyam <srangana@redhat.com>
7f4c2a
Date: Fri, 15 May 2015 15:50:42 -0400
7f4c2a
Subject: [PATCH 19/57] dht: Add lookup-optimize configuration option for DHT
7f4c2a
7f4c2a
Currently with commit 4eaaf5 a mixed version cluster would
7f4c2a
have issues if lookup-unhashed is set to auto, as older clients
7f4c2a
would fail to validate the layouts if newer clients (i.e. 3.7 or
7f4c2a
upwards) create directories. Also, in a mixed version cluster
7f4c2a
rebalance daemon would set commit hash for some subvolumes and
7f4c2a
not for the others.
7f4c2a
7f4c2a
This commit fixes this problem by moving the enabling of the
7f4c2a
functionality introduced in the above mentioned commit to a
7f4c2a
new dht option. This option also has an op_version of 3_7_1
7f4c2a
thereby preventing it from being set in a mixed version
7f4c2a
cluster. It brings in the following changes,
7f4c2a
- Option can be set only if min version of the cluster is
7f4c2a
3.7.1 or more
7f4c2a
- Rebalance and mkdir update the layout with the commit hashes
7f4c2a
only if this option is set, hence ensuring rebalance works in a
7f4c2a
mixed version cluster, and also directories created by newer
7f4c2a
clients do not cause layout errors when read by older clients
7f4c2a
- This option also supersedes lookup-unhashed, to make enabling the
7f4c2a
optimization for lookups more deterministic and not conflict
7f4c2a
with lookup-unhashed settings.
7f4c2a
7f4c2a
Option added is cluster.lookup-optimize, which is a boolean.
7f4c2a
7f4c2a
Usage: # gluster volume set VOLNAME cluster.lookup-optimize on
7f4c2a
7f4c2a
Change-Id: Ifd1d4ce3f6438fcbcd60ffbfdbfb647355ea1ae0
7f4c2a
BUG: 1222053
7f4c2a
Signed-off-by: Shyam <srangana@redhat.com>
7f4c2a
Reviewed-on: https://code.engineering.redhat.com/gerrit/50238
7f4c2a
Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
7f4c2a
Reviewed-by: Raghavendra Gowdappa <rgowdapp@redhat.com>
7f4c2a
Tested-by: Raghavendra Gowdappa <rgowdapp@redhat.com>
7f4c2a
---
7f4c2a
 tests/features/unhashed-auto.t                  |   28 ++++++++++-
7f4c2a
 xlators/cluster/dht/src/dht-common.c            |   62 +++++++++++++++++-----
7f4c2a
 xlators/cluster/dht/src/dht-common.h            |    1 +
7f4c2a
 xlators/cluster/dht/src/dht-rebalance.c         |    6 ++-
7f4c2a
 xlators/cluster/dht/src/dht-shared.c            |   13 +++++
7f4c2a
 xlators/mgmt/glusterd/src/glusterd-volume-set.c |    5 ++
7f4c2a
 6 files changed, 98 insertions(+), 17 deletions(-)
7f4c2a
7f4c2a
diff --git a/tests/features/unhashed-auto.t b/tests/features/unhashed-auto.t
7f4c2a
index 97663c2..cba5b77 100755
7f4c2a
--- a/tests/features/unhashed-auto.t
7f4c2a
+++ b/tests/features/unhashed-auto.t
7f4c2a
@@ -39,6 +39,11 @@ get_xattr () {
7f4c2a
 	$cmd $1 | od -tx1 -An | tr -d ' '
7f4c2a
 }
7f4c2a
 
7f4c2a
+get_xattr_hash () {
7f4c2a
+        cmd="getfattr --absolute-names --only-values -n trusted.glusterfs.dht"
7f4c2a
+        $cmd $1 | od -tx1 -An | awk '{printf("%s%s%s%s\n", $1, $2, $3, $4);}'
7f4c2a
+}
7f4c2a
+
7f4c2a
 cleanup
7f4c2a
 
7f4c2a
 TEST glusterd
7f4c2a
@@ -49,7 +54,7 @@ TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2}
7f4c2a
 EXPECT "$V0" volinfo_field $V0 'Volume Name'
7f4c2a
 EXPECT 'Created' volinfo_field $V0 'Status'
7f4c2a
 
7f4c2a
-TEST $CLI volume set $V0 cluster.lookup-unhashed auto
7f4c2a
+TEST $CLI volume set $V0 cluster.lookup-optimize ON
7f4c2a
 
7f4c2a
 TEST $CLI volume start $V0
7f4c2a
 EXPECT 'Started' volinfo_field $V0 'Status'
7f4c2a
@@ -96,4 +101,25 @@ TEST wait_for_rebalance
7f4c2a
 new_val=$(get_xattr $B0/${V0}1/dir)
7f4c2a
 TEST [ ! x"$old_val" = x"$new_val" ]
7f4c2a
 
7f4c2a
+# Force an anomaly on an existing layout and heal it
7f4c2a
+## The healed layout should not carry a commit-hash (or should carry 1 in the
7f4c2a
+## commit-hash)
7f4c2a
+TEST setfattr -x trusted.glusterfs.dht $B0/${V0}1/dir
7f4c2a
+TEST $GFS -s $H0 --volfile-id $V0 $M0
7f4c2a
+TEST [ -d $M0/dir ]
7f4c2a
+new_hash=$(get_xattr_hash $B0/${V0}1/dir)
7f4c2a
+TEST [ x"$new_hash" = x"00000001" ]
7f4c2a
+new_hash=$(get_xattr_hash $B0/${V0}2/dir)
7f4c2a
+TEST [ x"$new_hash" = x"00000001" ]
7f4c2a
+
7f4c2a
+# Unset the option and check that newly created directories get 1 in the
7f4c2a
+# disk layout
7f4c2a
+TEST $CLI volume reset $V0 cluster.lookup-optimize
7f4c2a
+TEST mkdir $M0/dir1
7f4c2a
+new_hash=$(get_xattr_hash $B0/${V0}1/dir1)
7f4c2a
+TEST [ x"$new_hash" = x"00000001" ]
7f4c2a
+new_hash=$(get_xattr_hash $B0/${V0}2/dir1)
7f4c2a
+TEST [ x"$new_hash" = x"00000001" ]
7f4c2a
+
7f4c2a
+
7f4c2a
 cleanup
7f4c2a
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
7f4c2a
index 8e78746..48a003c 100644
7f4c2a
--- a/xlators/cluster/dht/src/dht-common.c
7f4c2a
+++ b/xlators/cluster/dht/src/dht-common.c
7f4c2a
@@ -1923,25 +1923,51 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7f4c2a
                               "Entry %s missing on subvol %s",
7f4c2a
                               loc->path, prev->this->name);
7f4c2a
 
7f4c2a
-                if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_ON) {
7f4c2a
-                        local->op_errno = ENOENT;
7f4c2a
-                        dht_lookup_everywhere (frame, this, loc);
7f4c2a
-                        return 0;
7f4c2a
-                }
7f4c2a
-                if ((conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) &&
7f4c2a
-                    (loc->parent)) {
7f4c2a
+                /* lookup-optimize supersedes lookup-unhashed settings,
7f4c2a
+                 *   - so if it is set, do not process search_unhashed
7f4c2a
+                 *   - except, in the case of rebalance daemon, we want to
7f4c2a
+                 *     force the lookup_everywhere behavior */
7f4c2a
+                if (!conf->defrag && conf->lookup_optimize && loc->parent) {
7f4c2a
                         ret = dht_inode_ctx_layout_get (loc->parent, this,
7f4c2a
                                                         &parent_layout);
7f4c2a
-                        if (ret || !parent_layout)
7f4c2a
-                                goto out;
7f4c2a
-                        if (parent_layout->commit_hash
7f4c2a
-                                  != conf->vol_commit_hash) {
7f4c2a
-                                gf_log (this->name, GF_LOG_DEBUG,
7f4c2a
-                                        "hashes don't match, do global lookup");
7f4c2a
+                        if (ret || !parent_layout ||
7f4c2a
+                            (parent_layout->commit_hash !=
7f4c2a
+                             conf->vol_commit_hash)) {
7f4c2a
+                                gf_msg_debug (this->name, 0,
7f4c2a
+                                        "hashes don't match (ret - %d,"
7f4c2a
+                                        " parent_layout - %p, parent_hash - %x,"
7f4c2a
+                                        " vol_hash - %x), do global lookup",
7f4c2a
+                                        ret, parent_layout,
7f4c2a
+                                        (parent_layout ?
7f4c2a
+                                         parent_layout->commit_hash : -1),
7f4c2a
+                                        conf->vol_commit_hash);
7f4c2a
+                                local->op_errno = ENOENT;
7f4c2a
+                                dht_lookup_everywhere (frame, this, loc);
7f4c2a
+                                return 0;
7f4c2a
+                        }
7f4c2a
+                } else {
7f4c2a
+                        if (conf->search_unhashed ==
7f4c2a
+                            GF_DHT_LOOKUP_UNHASHED_ON) {
7f4c2a
                                 local->op_errno = ENOENT;
7f4c2a
                                 dht_lookup_everywhere (frame, this, loc);
7f4c2a
                                 return 0;
7f4c2a
                         }
7f4c2a
+
7f4c2a
+                        if ((conf->search_unhashed ==
7f4c2a
+                            GF_DHT_LOOKUP_UNHASHED_AUTO) &&
7f4c2a
+                            (loc->parent)) {
7f4c2a
+                                ret = dht_inode_ctx_layout_get (loc->parent,
7f4c2a
+                                                                this,
7f4c2a
+                                                                &parent_layout);
7f4c2a
+                                if (ret || !parent_layout)
7f4c2a
+                                        goto out;
7f4c2a
+                                if (parent_layout->search_unhashed) {
7f4c2a
+                                        local->op_errno = ENOENT;
7f4c2a
+                                        dht_lookup_everywhere (frame, this,
7f4c2a
+                                                               loc);
7f4c2a
+                                        return 0;
7f4c2a
+                                }
7f4c2a
+                        }
7f4c2a
                 }
7f4c2a
         }
7f4c2a
 
7f4c2a
@@ -5800,7 +5826,15 @@ dht_mkdir (call_frame_t *frame, xlator_t *this,
7f4c2a
                 goto err;
7f4c2a
         }
7f4c2a
 
7f4c2a
-        local->layout->commit_hash = conf->vol_commit_hash;
7f4c2a
+        /* set the newly created directory hash to the commit hash
7f4c2a
+         * if the configuration option is set. If configuration option
7f4c2a
+         * is not set, the older clients may still be connecting to the
7f4c2a
+         * volume and hence we need to preserve the 1 in disk[0] part of the
7f4c2a
+         * layout xattr */
7f4c2a
+        if (conf->lookup_optimize)
7f4c2a
+                local->layout->commit_hash = conf->vol_commit_hash;
7f4c2a
+        else
7f4c2a
+                local->layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
7f4c2a
 
7f4c2a
         STACK_WIND (frame, dht_mkdir_hashed_cbk,
7f4c2a
                     hashed_subvol,
7f4c2a
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
7f4c2a
index 7a5d40f..4b6531c 100644
7f4c2a
--- a/xlators/cluster/dht/src/dht-common.h
7f4c2a
+++ b/xlators/cluster/dht/src/dht-common.h
7f4c2a
@@ -398,6 +398,7 @@ struct dht_conf {
7f4c2a
         dht_layout_t **file_layouts;
7f4c2a
         dht_layout_t **dir_layouts;
7f4c2a
         gf_boolean_t   search_unhashed;
7f4c2a
+        gf_boolean_t   lookup_optimize;
7f4c2a
         int            gen;
7f4c2a
         dht_du_t      *du_stats;
7f4c2a
         double         min_free_disk;
7f4c2a
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
7f4c2a
index 3ab73d4..89cc3a8 100644
7f4c2a
--- a/xlators/cluster/dht/src/dht-rebalance.c
7f4c2a
+++ b/xlators/cluster/dht/src/dht-rebalance.c
7f4c2a
@@ -2362,8 +2362,10 @@ gf_defrag_settle_hash (xlator_t *this, gf_defrag_info_t *defrag,
7f4c2a
                 return -1;
7f4c2a
         }
7f4c2a
 
7f4c2a
-        if (conf->local_subvols_cnt == 0) {
7f4c2a
-                /* Commit hash updates are only done on local subvolumes
7f4c2a
+        if (conf->local_subvols_cnt == 0 || !conf->lookup_optimize) {
7f4c2a
+                /* Commit hash updates are only done on local subvolumes and
7f4c2a
+                 * only when lookup optimization is needed (for older client
7f4c2a
+                 * support)
7f4c2a
                  */
7f4c2a
                 return 0;
7f4c2a
         }
7f4c2a
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
7f4c2a
index a1f72a8..456d831 100644
7f4c2a
--- a/xlators/cluster/dht/src/dht-shared.c
7f4c2a
+++ b/xlators/cluster/dht/src/dht-shared.c
7f4c2a
@@ -431,6 +431,9 @@ dht_reconfigure (xlator_t *this, dict_t *options)
7f4c2a
                 }
7f4c2a
         }
7f4c2a
 
7f4c2a
+        GF_OPTION_RECONF ("lookup-optimize", conf->lookup_optimize, options,
7f4c2a
+                          bool, out);
7f4c2a
+
7f4c2a
 	GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options,
7f4c2a
                           percent_or_size, out);
7f4c2a
         /* option can be any one of percent or bytes */
7f4c2a
@@ -667,6 +670,8 @@ dht_init (xlator_t *this)
7f4c2a
                         conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
7f4c2a
         }
7f4c2a
 
7f4c2a
+        GF_OPTION_INIT ("lookup-optimize", conf->lookup_optimize, bool, err);
7f4c2a
+
7f4c2a
         GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool,
7f4c2a
                         err);
7f4c2a
 
7f4c2a
@@ -838,6 +843,14 @@ struct volume_options options[] = {
7f4c2a
           "from the hash subvolume. If set to OFF, it does not do a lookup "
7f4c2a
           "on the remaining subvolumes."
7f4c2a
         },
7f4c2a
+        { .key = {"lookup-optimize"},
7f4c2a
+          .type = GF_OPTION_TYPE_BOOL,
7f4c2a
+          .default_value = "off",
7f4c2a
+          .description = "This option if set to ON enables the optimization "
7f4c2a
+          "of -ve lookups, by not doing a lookup on non-hashed subvolumes for "
7f4c2a
+          "files, in case the hashed subvolume does not return any result. "
7f4c2a
+          "This option disregards the lookup-unhashed setting, when enabled."
7f4c2a
+        },
7f4c2a
         { .key  = {"min-free-disk"},
7f4c2a
           .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
7f4c2a
           .default_value = "10%",
7f4c2a
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
7f4c2a
index b8ca6be..65cbfc0 100644
7f4c2a
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
7f4c2a
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
7f4c2a
@@ -333,6 +333,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
7f4c2a
           .op_version = 1,
7f4c2a
           .flags      = OPT_FLAG_CLIENT_OPT
7f4c2a
         },
7f4c2a
+        { .key        = "cluster.lookup-optimize",
7f4c2a
+          .voltype    = "cluster/distribute",
7f4c2a
+          .op_version  = GD_OP_VERSION_3_7_2,
7f4c2a
+          .flags      = OPT_FLAG_CLIENT_OPT
7f4c2a
+        },
7f4c2a
         { .key        = "cluster.min-free-disk",
7f4c2a
           .voltype    = "cluster/distribute",
7f4c2a
           .op_version = 1,
7f4c2a
-- 
7f4c2a
1.7.1
7f4c2a