21ab4e
From d5ccab5920894fedbd6b9cbf2c90819494ffe890 Mon Sep 17 00:00:00 2001
21ab4e
From: N Balachandran <nbalacha@redhat.com>
21ab4e
Date: Thu, 18 May 2017 23:24:38 +0530
21ab4e
Subject: [PATCH 458/473] cluster/dht: Rebalance on all nodes should migrate
21ab4e
 files
21ab4e
21ab4e
Problem:
21ab4e
Rebalance compares the node-uuid of a file against its own
21ab4e
and migrates a file only if they match. However, the
21ab4e
current behaviour in both AFR and EC is to return
21ab4e
the node-uuid of the first brick in a replica set for all
21ab4e
files. This means a single node ends up migrating all
21ab4e
the files if the first brick of every replica set is on the
21ab4e
same node.
21ab4e
21ab4e
Fix:
21ab4e
AFR and EC will return all node-uuids for the replica set.
21ab4e
The rebalance process will divide the files to be migrated
21ab4e
among all the nodes by hashing the gfid of the file and
21ab4e
using that value to select a node to perform the migration.
21ab4e
This patch makes the required DHT and tiering changes.
21ab4e
21ab4e
Some tests in rebal-all-nodes-migrate.t will need to be
21ab4e
uncommented once the AFR and EC changes are merged.
21ab4e
21ab4e
> BUG: 1366817
21ab4e
> Signed-off-by: N Balachandran <nbalacha@redhat.com>
21ab4e
> Reviewed-on: https://review.gluster.org/17239
21ab4e
> Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
21ab4e
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> Reviewed-by: Amar Tumballi <amarts@redhat.com>
21ab4e
> Reviewed-by: Jeff Darcy <jeff@pl.atyp.us>
21ab4e
> Reviewed-by: Shyamsundar Ranganathan <srangana@redhat.com>
21ab4e
21ab4e
Change-Id: I7058d9246050832a7c496b0f912b9751435328c3
21ab4e
BUG: 1315781
21ab4e
Signed-off-by: N Balachandran <nbalacha@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/106602
21ab4e
Reviewed-by: Susant Palai <spalai@redhat.com>
21ab4e
---
21ab4e
 tests/basic/distribute/rebal-all-nodes-migrate.t | 143 +++++++++++++++++++++++
21ab4e
 tests/dht.rc                                     |  22 +++-
21ab4e
 xlators/cluster/dht/src/dht-common.c             |  64 +++++++++-
21ab4e
 xlators/cluster/dht/src/dht-common.h             |   9 ++
21ab4e
 xlators/cluster/dht/src/dht-helper.c             |   6 +-
21ab4e
 xlators/cluster/dht/src/dht-mem-types.h          |   2 +
21ab4e
 xlators/cluster/dht/src/dht-rebalance.c          |  95 +++++++++++++--
21ab4e
 xlators/cluster/dht/src/tier.c                   |  58 ++++++++-
21ab4e
 8 files changed, 379 insertions(+), 20 deletions(-)
21ab4e
 create mode 100644 tests/basic/distribute/rebal-all-nodes-migrate.t
21ab4e
21ab4e
diff --git a/tests/basic/distribute/rebal-all-nodes-migrate.t b/tests/basic/distribute/rebal-all-nodes-migrate.t
21ab4e
new file mode 100644
21ab4e
index 0000000..14f0a53
21ab4e
--- /dev/null
21ab4e
+++ b/tests/basic/distribute/rebal-all-nodes-migrate.t
21ab4e
@@ -0,0 +1,143 @@
21ab4e
+#!/bin/bash
21ab4e
+
21ab4e
+. $(dirname $0)/../../include.rc
21ab4e
+. $(dirname $0)/../../cluster.rc
21ab4e
+. $(dirname $0)/../../dht.rc
21ab4e
+
21ab4e
+
21ab4e
+# Check if every single rebalance process migrated some files
21ab4e
+
21ab4e
+function cluster_rebal_all_nodes_migrated_files {
21ab4e
+        val=0
21ab4e
+        a=$($CLI_1 volume rebalance $V0 status | grep "completed" | awk '{print $2}');
21ab4e
+#        echo $a
21ab4e
+        b=($a)
21ab4e
+        for i in "${b[@]}"
21ab4e
+        do
21ab4e
+#                echo "$i";
21ab4e
+                if [ "$i" -eq "0" ]; then
21ab4e
+                        echo "false";
21ab4e
+                        val=1;
21ab4e
+                fi
21ab4e
+        done
21ab4e
+        echo $val
21ab4e
+}
21ab4e
+
21ab4e
+cleanup
21ab4e
+
21ab4e
+TEST launch_cluster 3;
21ab4e
+TEST $CLI_1 peer probe $H2;
21ab4e
+TEST $CLI_1 peer probe $H3;
21ab4e
+EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count
21ab4e
+
21ab4e
+
21ab4e
+#Start with a pure distribute volume (multiple bricks on the same node)
21ab4e
+TEST $CLI_1 volume create $V0 $H1:$B1/dist1 $H1:$B1/dist2 $H2:$B2/dist3 $H2:$B2/dist4
21ab4e
+
21ab4e
+TEST $CLI_1 volume start $V0
21ab4e
+$CLI_1 volume info $V0
21ab4e
+
21ab4e
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
21ab4e
+
21ab4e
+## Mount FUSE
21ab4e
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
21ab4e
+
21ab4e
+TEST mkdir $M0/dir1 2>/dev/null;
21ab4e
+TEST touch $M0/dir1/file-{1..500}
21ab4e
+
21ab4e
+## Add-brick and run rebalance to force file migration
21ab4e
+TEST $CLI_1 volume add-brick $V0 $H1:$B1/dist5 $H2:$B2/dist6
21ab4e
+
21ab4e
+#Start a rebalance
21ab4e
+TEST $CLI_1 volume rebalance $V0 start force
21ab4e
+
21ab4e
+#volume rebalance status should work
21ab4e
+#TEST $CLI_1 volume rebalance $V0 status
21ab4e
+#$CLI_1 volume rebalance $V0 status
21ab4e
+
21ab4e
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
21ab4e
+EXPECT "0" cluster_rebal_all_nodes_migrated_files
21ab4e
+$CLI_1 volume rebalance $V0 status
21ab4e
+
21ab4e
+
21ab4e
+TEST umount -f $M0
21ab4e
+TEST $CLI_1 volume stop $V0
21ab4e
+TEST $CLI_1 volume delete $V0
21ab4e
+
21ab4e
+
21ab4e
+##############################################################
21ab4e
+
21ab4e
+# Next, a dist-rep volume
21ab4e
+TEST $CLI_1 volume create $V0 replica 2 $H1:$B1/drep1 $H2:$B2/drep1 $H1:$B1/drep2 $H2:$B2/drep2
21ab4e
+
21ab4e
+TEST $CLI_1 volume start $V0
21ab4e
+$CLI_1 volume info $V0
21ab4e
+
21ab4e
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
21ab4e
+
21ab4e
+## Mount FUSE
21ab4e
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
21ab4e
+
21ab4e
+TEST mkdir $M0/dir1 2>/dev/null;
21ab4e
+TEST touch $M0/dir1/file-{1..500}
21ab4e
+
21ab4e
+## Add-brick and run rebalance to force file migration
21ab4e
+TEST $CLI_1 volume add-brick $V0 replica 2 $H1:$B1/drep3 $H2:$B2/drep3
21ab4e
+
21ab4e
+#Start a rebalance
21ab4e
+TEST $CLI_1 volume rebalance $V0 start force
21ab4e
+
21ab4e
+#volume rebalance status should work
21ab4e
+#TEST $CLI_1 volume rebalance $V0 status
21ab4e
+#$CLI_1 volume rebalance $V0 status
21ab4e
+
21ab4e
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
21ab4e
+#EXPECT "0" cluster_rebal_all_nodes_migrated_files
21ab4e
+$CLI_1 volume rebalance $V0 status
21ab4e
+
21ab4e
+
21ab4e
+TEST umount -f $M0
21ab4e
+TEST $CLI_1 volume stop $V0
21ab4e
+TEST $CLI_1 volume delete $V0
21ab4e
+
21ab4e
+##############################################################
21ab4e
+
21ab4e
+# Next, a disperse volume
21ab4e
+TEST $CLI_1 volume create $V0 disperse 3 $H1:$B1/ec1 $H2:$B1/ec2 $H3:$B1/ec3 force
21ab4e
+
21ab4e
+TEST $CLI_1 volume start $V0
21ab4e
+$CLI_1 volume info $V0
21ab4e
+
21ab4e
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
21ab4e
+
21ab4e
+## Mount FUSE
21ab4e
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
21ab4e
+
21ab4e
+TEST mkdir $M0/dir1 2>/dev/null;
21ab4e
+TEST touch $M0/dir1/file-{1..500}
21ab4e
+
21ab4e
+## Add-brick and run rebalance to force file migration
21ab4e
+TEST $CLI_1 volume add-brick $V0 $H1:$B2/ec4 $H2:$B2/ec5 $H3:$B2/ec6
21ab4e
+
21ab4e
+#Start a rebalance
21ab4e
+TEST $CLI_1 volume rebalance $V0 start force
21ab4e
+
21ab4e
+#volume rebalance status should work
21ab4e
+#TEST $CLI_1 volume rebalance $V0 status
21ab4e
+#$CLI_1 volume rebalance $V0 status
21ab4e
+
21ab4e
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
21ab4e
+
21ab4e
+# this will not work unless EC is changed to return all node-uuids
21ab4e
+# comment this out once that patch is ready
21ab4e
+#EXPECT "0" cluster_rebal_all_nodes_migrated_files
21ab4e
+$CLI_1 volume rebalance $V0 status
21ab4e
+
21ab4e
+
21ab4e
+TEST umount -f $M0
21ab4e
+TEST $CLI_1 volume stop $V0
21ab4e
+TEST $CLI_1 volume delete $V0
21ab4e
+
21ab4e
+##############################################################
21ab4e
+
21ab4e
+cleanup
21ab4e
diff --git a/tests/dht.rc b/tests/dht.rc
21ab4e
index bf5e08b..051b075 100644
21ab4e
--- a/tests/dht.rc
21ab4e
+++ b/tests/dht.rc
21ab4e
@@ -65,11 +65,31 @@ function get_hashed_brick()
21ab4e
         return $hashed
21ab4e
 }
21ab4e
 
21ab4e
+function cluster_rebalance_completed()
21ab4e
+{
21ab4e
+       val=1
21ab4e
+
21ab4e
+       # Rebalance status will be either "failed" or "completed"
21ab4e
+
21ab4e
+       test=$($CLI_1 volume rebalance $V0 status | grep "in progress" 2>&1)
21ab4e
+       if [ $? -ne 0 ]
21ab4e
+       then
21ab4e
+               val=0
21ab4e
+       fi
21ab4e
+
21ab4e
+       echo $val
21ab4e
+       # Do not *return* the value here.  If it's non-zero, that will cause
21ab4e
+       # EXPECT_WITHIN (e.g. in bug-884455.t) to return prematurely, leading to
21ab4e
+       # a spurious test failure.  Nothing else checks the return value anyway
21ab4e
+       # (they all check the output) so there's no need for it to be non-zero
21ab4e
+       # just because grep didn't find what we want.
21ab4e
+}
21ab4e
+
21ab4e
 
21ab4e
 function rebalance_completed()
21ab4e
 {
21ab4e
        val=1
21ab4e
-       test=$(gluster volume rebalance $V0 status | grep localhost | grep "completed" 2>&1)
21ab4e
+       test=$($CLI volume rebalance $V0 status | grep localhost | grep "completed" 2>&1)
21ab4e
        if [ $? -eq 0 ]
21ab4e
        then
21ab4e
                 val=0
21ab4e
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
21ab4e
index 264ca65..9286125 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-common.c
21ab4e
+++ b/xlators/cluster/dht/src/dht-common.c
21ab4e
@@ -2997,6 +2997,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
21ab4e
  out:
21ab4e
         return ret;
21ab4e
 }
21ab4e
+
21ab4e
+
21ab4e
 int
21ab4e
 dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
21ab4e
                            int op_ret, int op_errno, dict_t *xattr,
21ab4e
@@ -3012,6 +3014,11 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
21ab4e
         char         *next_uuid_str = NULL;
21ab4e
         char         *saveptr       = NULL;
21ab4e
         uuid_t        node_uuid     = {0,};
21ab4e
+        char         *uuid_list_copy = NULL;
21ab4e
+        int           count          = 0;
21ab4e
+        int           i              = 0;
21ab4e
+        int           index          = 0;
21ab4e
+        int           found          = 0;
21ab4e
 
21ab4e
 
21ab4e
         VALIDATE_OR_GOTO (frame, out);
21ab4e
@@ -3021,6 +3028,10 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
21ab4e
         prev = cookie;
21ab4e
         conf = this->private;
21ab4e
 
21ab4e
+        VALIDATE_OR_GOTO (conf->defrag, out);
21ab4e
+
21ab4e
+        gf_msg_debug (this->name, 0, "subvol %s returned", prev->name);
21ab4e
+
21ab4e
         LOCK (&frame->lock);
21ab4e
         {
21ab4e
                 this_call_cnt = --local->call_cnt;
21ab4e
@@ -3044,6 +3055,15 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
21ab4e
                         goto unlock;
21ab4e
                 }
21ab4e
 
21ab4e
+                /* As DHT will not know details of its child xlators
21ab4e
+                 * we need to parse this twice to get the count first
21ab4e
+                 * and allocate memory later.
21ab4e
+                 */
21ab4e
+                count = 0;
21ab4e
+                index = conf->local_subvols_cnt;
21ab4e
+
21ab4e
+                uuid_list_copy = gf_strdup (uuid_list);
21ab4e
+
21ab4e
                 for (uuid_str = strtok_r (uuid_list, " ", &saveptr);
21ab4e
                      uuid_str;
21ab4e
                      uuid_str = next_uuid_str) {
21ab4e
@@ -3053,24 +3073,58 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
21ab4e
                                 gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
                                         DHT_MSG_UUID_PARSE_ERROR,
21ab4e
                                         "Failed to parse uuid"
21ab4e
-                                        " failed for %s", prev->name);
21ab4e
+                                        " for %s", prev->name);
21ab4e
                                 local->op_ret = -1;
21ab4e
                                 local->op_errno = EINVAL;
21ab4e
                                 goto unlock;
21ab4e
                         }
21ab4e
 
21ab4e
+                        count++;
21ab4e
+
21ab4e
                         if (gf_uuid_compare (node_uuid, conf->defrag->node_uuid)) {
21ab4e
                                 gf_msg_debug (this->name, 0, "subvol %s does not"
21ab4e
                                               "belong to this node",
21ab4e
                                               prev->name);
21ab4e
                         } else {
21ab4e
+
21ab4e
+                                /* handle multiple bricks of the same replica
21ab4e
+                                 * on the same node */
21ab4e
+                                if (found)
21ab4e
+                                        continue;
21ab4e
                                 conf->local_subvols[(conf->local_subvols_cnt)++]
21ab4e
-                                        = prev;
21ab4e
+                                                = prev;
21ab4e
+                                found = 1;
21ab4e
                                 gf_msg_debug (this->name, 0, "subvol %s belongs to"
21ab4e
                                               " this node", prev->name);
21ab4e
-                                break;
21ab4e
                         }
21ab4e
                 }
21ab4e
+
21ab4e
+                if (!found) {
21ab4e
+                        local->op_ret = 0;
21ab4e
+                        goto unlock;
21ab4e
+                }
21ab4e
+
21ab4e
+                conf->local_nodeuuids[index].count = count;
21ab4e
+                conf->local_nodeuuids[index].uuids
21ab4e
+                                 = GF_CALLOC (count, sizeof (uuid_t), 1);
21ab4e
+
21ab4e
+                /* The node-uuids are guaranteed to be returned in the same
21ab4e
+                 * order as the bricks
21ab4e
+                 * A null node-uuid is returned for a brick that is down.
21ab4e
+                 */
21ab4e
+
21ab4e
+                saveptr = NULL;
21ab4e
+                i = 0;
21ab4e
+
21ab4e
+                for (uuid_str = strtok_r (uuid_list_copy, " ", &saveptr);
21ab4e
+                     uuid_str;
21ab4e
+                     uuid_str = next_uuid_str) {
21ab4e
+
21ab4e
+                        next_uuid_str = strtok_r (NULL, " ", &saveptr);
21ab4e
+                        gf_uuid_parse (uuid_str,
21ab4e
+                                       conf->local_nodeuuids[index].uuids[i]);
21ab4e
+                        i++;
21ab4e
+                }
21ab4e
         }
21ab4e
 
21ab4e
         local->op_ret = 0;
21ab4e
@@ -3088,8 +3142,12 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
21ab4e
         goto out;
21ab4e
 
21ab4e
  unwind:
21ab4e
+        GF_FREE (conf->local_nodeuuids[index].uuids);
21ab4e
+        conf->local_nodeuuids[index].uuids = NULL;
21ab4e
+
21ab4e
         DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, xdata);
21ab4e
  out:
21ab4e
+        GF_FREE (uuid_list_copy);
21ab4e
         return 0;
21ab4e
 }
21ab4e
 
21ab4e
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
21ab4e
index b4d9e84..184bd22 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-common.h
21ab4e
+++ b/xlators/cluster/dht/src/dht-common.h
21ab4e
@@ -356,6 +356,7 @@ struct dht_container {
21ab4e
         xlator_t        *this;
21ab4e
         loc_t           *parent_loc;
21ab4e
         dict_t          *migrate_data;
21ab4e
+        int             local_subvol_index;
21ab4e
 };
21ab4e
 
21ab4e
 typedef enum tier_mode_ {
21ab4e
@@ -410,6 +411,13 @@ typedef struct gf_tier_conf {
21ab4e
         char                         volname[GD_VOLUME_NAME_MAX + 1];
21ab4e
 } gf_tier_conf_t;
21ab4e
 
21ab4e
+
21ab4e
+typedef struct subvol_nodeuuids {
21ab4e
+        uuid_t *uuids;
21ab4e
+        int count;
21ab4e
+} subvol_nodeuuid_t;
21ab4e
+
21ab4e
+
21ab4e
 struct gf_defrag_info_ {
21ab4e
         uint64_t                     total_files;
21ab4e
         uint64_t                     total_data;
21ab4e
@@ -543,6 +551,7 @@ struct dht_conf {
21ab4e
 
21ab4e
         /*local subvol storage for rebalance*/
21ab4e
         xlator_t       **local_subvols;
21ab4e
+        subvol_nodeuuid_t       *local_nodeuuids;
21ab4e
         int32_t          local_subvols_cnt;
21ab4e
 
21ab4e
         /*
21ab4e
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
21ab4e
index c22f700..40c6eb5 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-helper.c
21ab4e
+++ b/xlators/cluster/dht/src/dht-helper.c
21ab4e
@@ -992,7 +992,11 @@ dht_init_local_subvolumes (xlator_t *this, dht_conf_t *conf)
21ab4e
 
21ab4e
         conf->local_subvols = GF_CALLOC (cnt, sizeof (xlator_t *),
21ab4e
                                         gf_dht_mt_xlator_t);
21ab4e
-        if (!conf->local_subvols) {
21ab4e
+
21ab4e
+        /* FIX FIX : do this dynamically*/
21ab4e
+        conf->local_nodeuuids = GF_CALLOC (cnt, sizeof (subvol_nodeuuid_t),
21ab4e
+                                           gf_dht_nodeuuids_t);
21ab4e
+        if (!conf->local_subvols || !conf->local_nodeuuids) {
21ab4e
                 return -1;
21ab4e
         }
21ab4e
 
21ab4e
diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h
21ab4e
index 5de5d18..19cccef 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-mem-types.h
21ab4e
+++ b/xlators/cluster/dht/src/dht-mem-types.h
21ab4e
@@ -38,6 +38,8 @@ enum gf_dht_mem_types_ {
21ab4e
         gf_tier_mt_ipc_ctr_params_t,
21ab4e
         gf_dht_mt_fd_ctx_t,
21ab4e
         gf_tier_mt_qfile_array_t,
21ab4e
+        gf_dht_ret_cache_t,
21ab4e
+        gf_dht_nodeuuids_t,
21ab4e
         gf_dht_mt_end
21ab4e
 };
21ab4e
 #endif
21ab4e
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
index f1189e9..507ca81 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
+++ b/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
@@ -2441,6 +2441,43 @@ gf_defrag_ctx_subvols_init (dht_dfoffset_ctx_t *offset_var, xlator_t *this) {
21ab4e
         return 0;
21ab4e
 }
21ab4e
 
21ab4e
+
21ab4e
+/* Return value
21ab4e
+ * 0 : this node does not migrate the file
21ab4e
+ * 1 : this node migrates the file
21ab4e
+ */
21ab4e
+int
21ab4e
+gf_defrag_should_i_migrate (xlator_t *this, int local_subvol_index, uuid_t gfid)
21ab4e
+{
21ab4e
+        int         ret               = 0;
21ab4e
+        int         i                 = local_subvol_index;
21ab4e
+        char       *str               = NULL;
21ab4e
+        uint32_t    hashval           = 0;
21ab4e
+        int32_t     index        = 0;
21ab4e
+        dht_conf_t *conf              = NULL;
21ab4e
+        char        buf[UUID_CANONICAL_FORM_LEN + 1] = {0, };
21ab4e
+
21ab4e
+        conf = this->private;
21ab4e
+
21ab4e
+        /* Pure distribute */
21ab4e
+
21ab4e
+        if (conf->local_nodeuuids[i].count == 1) {
21ab4e
+                return 1;
21ab4e
+        }
21ab4e
+
21ab4e
+        str = uuid_utoa_r (gfid, buf);
21ab4e
+
21ab4e
+        ret = dht_hash_compute (this, 0, str, &hashval);
21ab4e
+        if (ret == 0) {
21ab4e
+                index = (hashval % conf->local_nodeuuids[i].count);
21ab4e
+                if (!gf_uuid_compare (conf->defrag->node_uuid,
21ab4e
+                                      conf->local_nodeuuids[i].uuids[index]))
21ab4e
+                        ret = 1;
21ab4e
+        }
21ab4e
+        return ret;
21ab4e
+}
21ab4e
+
21ab4e
+
21ab4e
 int
21ab4e
 gf_defrag_migrate_single_file (void *opaque)
21ab4e
 {
21ab4e
@@ -2519,6 +2556,13 @@ gf_defrag_migrate_single_file (void *opaque)
21ab4e
                 goto out;
21ab4e
         }
21ab4e
 
21ab4e
+        if (!gf_defrag_should_i_migrate (this, rebal_entry->local_subvol_index,
21ab4e
+                                         entry->d_stat.ia_gfid)) {
21ab4e
+                gf_msg_debug (this->name, 0, "Don't migrate %s ",
21ab4e
+                              entry_loc.path);
21ab4e
+                goto out;
21ab4e
+        }
21ab4e
+
21ab4e
         gf_uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid);
21ab4e
 
21ab4e
         gf_uuid_copy (entry_loc.pargfid, loc->gfid);
21ab4e
@@ -2955,6 +2999,8 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,
21ab4e
                         goto out;
21ab4e
                 }
21ab4e
 
21ab4e
+                tmp_container->local_subvol_index = i;
21ab4e
+
21ab4e
                 tmp_container->df_entry->d_stat = df_entry->d_stat;
21ab4e
 
21ab4e
                 tmp_container->df_entry->d_ino  = df_entry->d_ino;
21ab4e
@@ -4034,6 +4080,32 @@ int gf_defrag_total_file_cnt (xlator_t *this, loc_t *root_loc)
21ab4e
 
21ab4e
 
21ab4e
 int
21ab4e
+dht_get_local_subvols_and_nodeuuids (xlator_t *this, dht_conf_t *conf,
21ab4e
+                                     loc_t *loc)
21ab4e
+{
21ab4e
+
21ab4e
+        dict_t                  *dict                   = NULL;
21ab4e
+        int                      ret                    = -1;
21ab4e
+
21ab4e
+                /* Find local subvolumes */
21ab4e
+        ret = syncop_getxattr (this, loc, &dict,
21ab4e
+                               GF_REBAL_FIND_LOCAL_SUBVOL,
21ab4e
+                               NULL, NULL);
21ab4e
+        if (ret) {
21ab4e
+                gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local "
21ab4e
+                        "subvolume determination failed with error: %d",
21ab4e
+                        -ret);
21ab4e
+                ret = -1;
21ab4e
+                goto out;
21ab4e
+        }
21ab4e
+
21ab4e
+        ret = 0;
21ab4e
+out:
21ab4e
+        return ret;
21ab4e
+}
21ab4e
+
21ab4e
+
21ab4e
+int
21ab4e
 gf_defrag_start_crawl (void *data)
21ab4e
 {
21ab4e
         xlator_t                *this                   = NULL;
21ab4e
@@ -4050,13 +4122,14 @@ gf_defrag_start_crawl (void *data)
21ab4e
         glusterfs_ctx_t         *ctx                    = NULL;
21ab4e
         dht_methods_t           *methods                = NULL;
21ab4e
         int                      i                      = 0;
21ab4e
-        int                     thread_index            = 0;
21ab4e
-        int                     err                     = 0;
21ab4e
-        int                     thread_spawn_count      = 0;
21ab4e
+        int                      thread_index           = 0;
21ab4e
+        int                      err                    = 0;
21ab4e
+        int                      thread_spawn_count     = 0;
21ab4e
         pthread_t               *tid                    = NULL;
21ab4e
-        gf_boolean_t            is_tier_detach          = _gf_false;
21ab4e
+        gf_boolean_t             is_tier_detach         = _gf_false;
21ab4e
         call_frame_t            *statfs_frame           = NULL;
21ab4e
         xlator_t                *old_THIS               = NULL;
21ab4e
+        int                      j                      = 0;
21ab4e
 
21ab4e
         this = data;
21ab4e
         if (!this)
21ab4e
@@ -4185,14 +4258,9 @@ gf_defrag_start_crawl (void *data)
21ab4e
                         goto out;
21ab4e
                 }
21ab4e
 
21ab4e
-                /* Find local subvolumes */
21ab4e
-                ret = syncop_getxattr (this, &loc, &dict,
21ab4e
-                                       GF_REBAL_FIND_LOCAL_SUBVOL,
21ab4e
-                                       NULL, NULL);
21ab4e
+                ret = dht_get_local_subvols_and_nodeuuids (this, conf, &loc);
21ab4e
                 if (ret) {
21ab4e
-                        gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local "
21ab4e
-                                "subvolume determination failed with error: %d",
21ab4e
-                                -ret);
21ab4e
+
21ab4e
                         ret = -1;
21ab4e
                         goto out;
21ab4e
                 }
21ab4e
@@ -4200,6 +4268,11 @@ gf_defrag_start_crawl (void *data)
21ab4e
                 for (i = 0 ; i < conf->local_subvols_cnt; i++) {
21ab4e
                         gf_msg (this->name, GF_LOG_INFO, 0, 0, "local subvols "
21ab4e
                                 "are %s", conf->local_subvols[i]->name);
21ab4e
+                        for (j = 0; j < conf->local_nodeuuids[i].count; j++) {
21ab4e
+                                gf_msg (this->name, GF_LOG_INFO, 0, 0,
21ab4e
+                                        "node uuids are %s",
21ab4e
+                                  uuid_utoa(conf->local_nodeuuids[i].uuids[j]));
21ab4e
+                        }
21ab4e
                 }
21ab4e
 
21ab4e
                 ret = gf_defrag_total_file_cnt (this, &loc);
21ab4e
diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
21ab4e
index abd8925..1fba88a 100644
21ab4e
--- a/xlators/cluster/dht/src/tier.c
21ab4e
+++ b/xlators/cluster/dht/src/tier.c
21ab4e
@@ -199,10 +199,18 @@ out:
21ab4e
 static int
21ab4e
 tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
21ab4e
 {
21ab4e
-        int     ret            = -1;
21ab4e
-        dict_t *dict           = NULL;
21ab4e
-        char   *uuid_str       = NULL;
21ab4e
-        uuid_t  node_uuid      = {0,};
21ab4e
+        int         ret                     = -1;
21ab4e
+        dict_t     *dict                    = NULL;
21ab4e
+        char       *uuid_str                = NULL;
21ab4e
+        uuid_t      node_uuid               = {0,};
21ab4e
+        char       *dup_str                 = NULL;
21ab4e
+        char       *str                     = NULL;
21ab4e
+        char       *save_ptr                = NULL;
21ab4e
+        int         count                   = 0;
21ab4e
+        uint32_t    hashval                 = 0;
21ab4e
+        int32_t     index                   = 0;
21ab4e
+        char        buf[GF_UUID_BUF_SIZE]   = {0,};
21ab4e
+
21ab4e
 
21ab4e
         GF_VALIDATE_OR_GOTO ("tier", this, out);
21ab4e
         GF_VALIDATE_OR_GOTO (this->name, loc, out);
21ab4e
@@ -216,15 +224,56 @@ tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
21ab4e
                 goto out;
21ab4e
         }
21ab4e
 
21ab4e
+
21ab4e
+        /*  This returns multiple node-uuids now - one for each brick
21ab4e
+         *  of the subvol.
21ab4e
+         */
21ab4e
+
21ab4e
         if (dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, &uuid_str) < 0) {
21ab4e
                 gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
21ab4e
                         "Failed to get node-uuid for %s", loc->path);
21ab4e
                 goto out;
21ab4e
         }
21ab4e
 
21ab4e
+        dup_str = gf_strdup (uuid_str);
21ab4e
+        str = dup_str;
21ab4e
+
21ab4e
+        /* How many uuids returned?
21ab4e
+         * No need to check if one of these is that of the current node.
21ab4e
+         */
21ab4e
+
21ab4e
+        count = 1;
21ab4e
+        while ((str = strchr (str, ' '))) {
21ab4e
+                count++;
21ab4e
+                str++;
21ab4e
+        }
21ab4e
+
21ab4e
+        /* Only one node-uuid - pure distribute? */
21ab4e
+        if (count == 1)
21ab4e
+                goto check_node;
21ab4e
+
21ab4e
+        uuid_utoa_r (loc->gfid, buf);
21ab4e
+        ret = dht_hash_compute (this, 0, buf, &hashval);
21ab4e
+        if (ret == 0) {
21ab4e
+                index = (hashval % count);
21ab4e
+        }
21ab4e
+
21ab4e
+        count = 0;
21ab4e
+        str = dup_str;
21ab4e
+        while ((uuid_str = strtok_r (str, " ", &save_ptr))) {
21ab4e
+                if (count == index)
21ab4e
+                        break;
21ab4e
+                count++;
21ab4e
+                str = NULL;
21ab4e
+        }
21ab4e
+
21ab4e
+
21ab4e
+check_node:
21ab4e
+
21ab4e
         if (gf_uuid_parse (uuid_str, node_uuid)) {
21ab4e
                 gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
21ab4e
                         "uuid_parse failed for %s", loc->path);
21ab4e
+                ret = -1;
21ab4e
                 goto out;
21ab4e
         }
21ab4e
 
21ab4e
@@ -240,6 +289,7 @@ out:
21ab4e
         if (dict)
21ab4e
                 dict_unref(dict);
21ab4e
 
21ab4e
+        GF_FREE (dup_str);
21ab4e
         return ret;
21ab4e
 }
21ab4e
 
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e