|
|
21ab4e |
From d5ccab5920894fedbd6b9cbf2c90819494ffe890 Mon Sep 17 00:00:00 2001
|
|
|
21ab4e |
From: N Balachandran <nbalacha@redhat.com>
|
|
|
21ab4e |
Date: Thu, 18 May 2017 23:24:38 +0530
|
|
|
21ab4e |
Subject: [PATCH 458/473] cluster/dht: Rebalance on all nodes should migrate
|
|
|
21ab4e |
files
|
|
|
21ab4e |
|
|
|
21ab4e |
Problem:
|
|
|
21ab4e |
Rebalance compares the node-uuid of a file against its own
|
|
|
21ab4e |
to and migrates a file only if they match. However, the
|
|
|
21ab4e |
current behaviour in both AFR and EC is to return
|
|
|
21ab4e |
the node-uuid of the first brick in a replica set for all
|
|
|
21ab4e |
files. This means a single node ends up migrating all
|
|
|
21ab4e |
the files if the first brick of every replica set is on the
|
|
|
21ab4e |
same node.
|
|
|
21ab4e |
|
|
|
21ab4e |
Fix:
|
|
|
21ab4e |
AFR and EC will return all node-uuids for the replica set.
|
|
|
21ab4e |
The rebalance process will divide the files to be migrated
|
|
|
21ab4e |
among all the nodes by hashing the gfid of the file and
|
|
|
21ab4e |
using that value to select a node to perform the migration.
|
|
|
21ab4e |
This patch makes the required DHT and tiering changes.
|
|
|
21ab4e |
|
|
|
21ab4e |
Some tests in rebal-all-nodes-migrate.t will need to be
|
|
|
21ab4e |
uncommented once the AFR and EC changes are merged.
|
|
|
21ab4e |
|
|
|
21ab4e |
> BUG: 1366817
|
|
|
21ab4e |
> Signed-off-by: N Balachandran <nbalacha@redhat.com>
|
|
|
21ab4e |
> Reviewed-on: https://review.gluster.org/17239
|
|
|
21ab4e |
> Smoke: Gluster Build System <jenkins@build.gluster.org>
|
|
|
21ab4e |
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
|
|
|
21ab4e |
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
|
|
|
21ab4e |
> Reviewed-by: Amar Tumballi <amarts@redhat.com>
|
|
|
21ab4e |
> Reviewed-by: Jeff Darcy <jeff@pl.atyp.us>
|
|
|
21ab4e |
> Reviewed-by: Shyamsundar Ranganathan <srangana@redhat.com>
|
|
|
21ab4e |
|
|
|
21ab4e |
Change-Id: I7058d9246050832a7c496b0f912b9751435328c3
|
|
|
21ab4e |
BUG: 1315781
|
|
|
21ab4e |
Signed-off-by: N Balachandran <nbalacha@redhat.com>
|
|
|
21ab4e |
Reviewed-on: https://code.engineering.redhat.com/gerrit/106602
|
|
|
21ab4e |
Reviewed-by: Susant Palai <spalai@redhat.com>
|
|
|
21ab4e |
---
|
|
|
21ab4e |
tests/basic/distribute/rebal-all-nodes-migrate.t | 143 +++++++++++++++++++++++
|
|
|
21ab4e |
tests/dht.rc | 22 +++-
|
|
|
21ab4e |
xlators/cluster/dht/src/dht-common.c | 64 +++++++++-
|
|
|
21ab4e |
xlators/cluster/dht/src/dht-common.h | 9 ++
|
|
|
21ab4e |
xlators/cluster/dht/src/dht-helper.c | 6 +-
|
|
|
21ab4e |
xlators/cluster/dht/src/dht-mem-types.h | 2 +
|
|
|
21ab4e |
xlators/cluster/dht/src/dht-rebalance.c | 95 +++++++++++++--
|
|
|
21ab4e |
xlators/cluster/dht/src/tier.c | 58 ++++++++-
|
|
|
21ab4e |
8 files changed, 379 insertions(+), 20 deletions(-)
|
|
|
21ab4e |
create mode 100644 tests/basic/distribute/rebal-all-nodes-migrate.t
|
|
|
21ab4e |
|
|
|
21ab4e |
diff --git a/tests/basic/distribute/rebal-all-nodes-migrate.t b/tests/basic/distribute/rebal-all-nodes-migrate.t
|
|
|
21ab4e |
new file mode 100644
|
|
|
21ab4e |
index 0000000..14f0a53
|
|
|
21ab4e |
--- /dev/null
|
|
|
21ab4e |
+++ b/tests/basic/distribute/rebal-all-nodes-migrate.t
|
|
|
21ab4e |
@@ -0,0 +1,143 @@
|
|
|
21ab4e |
+#!/bin/bash
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+. $(dirname $0)/../../include.rc
|
|
|
21ab4e |
+. $(dirname $0)/../../cluster.rc
|
|
|
21ab4e |
+. $(dirname $0)/../../dht.rc
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+# Check if every single rebalance process migrated some files
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+function cluster_rebal_all_nodes_migrated_files {
|
|
|
21ab4e |
+ val=0
|
|
|
21ab4e |
+ a=$($CLI_1 volume rebalance $V0 status | grep "completed" | awk '{print $2}');
|
|
|
21ab4e |
+# echo $a
|
|
|
21ab4e |
+ b=($a)
|
|
|
21ab4e |
+ for i in "${b[@]}"
|
|
|
21ab4e |
+ do
|
|
|
21ab4e |
+# echo "$i";
|
|
|
21ab4e |
+ if [ "$i" -eq "0" ]; then
|
|
|
21ab4e |
+ echo "false";
|
|
|
21ab4e |
+ val=1;
|
|
|
21ab4e |
+ fi
|
|
|
21ab4e |
+ done
|
|
|
21ab4e |
+ echo $val
|
|
|
21ab4e |
+}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+cleanup
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST launch_cluster 3;
|
|
|
21ab4e |
+TEST $CLI_1 peer probe $H2;
|
|
|
21ab4e |
+TEST $CLI_1 peer probe $H3;
|
|
|
21ab4e |
+EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#Start with a pure distribute volume (multiple bricks on the same node)
|
|
|
21ab4e |
+TEST $CLI_1 volume create $V0 $H1:$B1/dist1 $H1:$B1/dist2 $H2:$B2/dist3 $H2:$B2/dist4
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST $CLI_1 volume start $V0
|
|
|
21ab4e |
+$CLI_1 volume info $V0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+## Mount FUSE
|
|
|
21ab4e |
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST mkdir $M0/dir1 2>/dev/null;
|
|
|
21ab4e |
+TEST touch $M0/dir1/file-{1..500}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+## Add-brick and run rebalance to force file migration
|
|
|
21ab4e |
+TEST $CLI_1 volume add-brick $V0 $H1:$B1/dist5 $H2:$B2/dist6
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#Start a rebalance
|
|
|
21ab4e |
+TEST $CLI_1 volume rebalance $V0 start force
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#volume rebalance status should work
|
|
|
21ab4e |
+#TEST $CLI_1 volume rebalance $V0 status
|
|
|
21ab4e |
+#$CLI_1 volume rebalance $V0 status
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
|
|
|
21ab4e |
+EXPECT "0" cluster_rebal_all_nodes_migrated_files
|
|
|
21ab4e |
+$CLI_1 volume rebalance $V0 status
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST umount -f $M0
|
|
|
21ab4e |
+TEST $CLI_1 volume stop $V0
|
|
|
21ab4e |
+TEST $CLI_1 volume delete $V0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+##############################################################
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+# Next, a dist-rep volume
|
|
|
21ab4e |
+TEST $CLI_1 volume create $V0 replica 2 $H1:$B1/drep1 $H2:$B2/drep1 $H1:$B1/drep2 $H2:$B2/drep2
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST $CLI_1 volume start $V0
|
|
|
21ab4e |
+$CLI_1 volume info $V0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+## Mount FUSE
|
|
|
21ab4e |
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST mkdir $M0/dir1 2>/dev/null;
|
|
|
21ab4e |
+TEST touch $M0/dir1/file-{1..500}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+## Add-brick and run rebalance to force file migration
|
|
|
21ab4e |
+TEST $CLI_1 volume add-brick $V0 replica 2 $H1:$B1/drep3 $H2:$B2/drep3
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#Start a rebalance
|
|
|
21ab4e |
+TEST $CLI_1 volume rebalance $V0 start force
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#volume rebalance status should work
|
|
|
21ab4e |
+#TEST $CLI_1 volume rebalance $V0 status
|
|
|
21ab4e |
+#$CLI_1 volume rebalance $V0 status
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
|
|
|
21ab4e |
+#EXPECT "0" cluster_rebal_all_nodes_migrated_files
|
|
|
21ab4e |
+$CLI_1 volume rebalance $V0 status
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST umount -f $M0
|
|
|
21ab4e |
+TEST $CLI_1 volume stop $V0
|
|
|
21ab4e |
+TEST $CLI_1 volume delete $V0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+##############################################################
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+# Next, a disperse volume
|
|
|
21ab4e |
+TEST $CLI_1 volume create $V0 disperse 3 $H1:$B1/ec1 $H2:$B1/ec2 $H3:$B1/ec3 force
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST $CLI_1 volume start $V0
|
|
|
21ab4e |
+$CLI_1 volume info $V0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+## Mount FUSE
|
|
|
21ab4e |
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST mkdir $M0/dir1 2>/dev/null;
|
|
|
21ab4e |
+TEST touch $M0/dir1/file-{1..500}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+## Add-brick and run rebalance to force file migration
|
|
|
21ab4e |
+TEST $CLI_1 volume add-brick $V0 $H1:$B2/ec4 $H2:$B2/ec5 $H3:$B2/ec6
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#Start a rebalance
|
|
|
21ab4e |
+TEST $CLI_1 volume rebalance $V0 start force
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#volume rebalance status should work
|
|
|
21ab4e |
+#TEST $CLI_1 volume rebalance $V0 status
|
|
|
21ab4e |
+#$CLI_1 volume rebalance $V0 status
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+# this will not work unless EC is changed to return all node-uuids
|
|
|
21ab4e |
+# comment this out once that patch is ready
|
|
|
21ab4e |
+#EXPECT "0" cluster_rebal_all_nodes_migrated_files
|
|
|
21ab4e |
+$CLI_1 volume rebalance $V0 status
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST umount -f $M0
|
|
|
21ab4e |
+TEST $CLI_1 volume stop $V0
|
|
|
21ab4e |
+TEST $CLI_1 volume delete $V0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+##############################################################
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+cleanup
|
|
|
21ab4e |
diff --git a/tests/dht.rc b/tests/dht.rc
|
|
|
21ab4e |
index bf5e08b..051b075 100644
|
|
|
21ab4e |
--- a/tests/dht.rc
|
|
|
21ab4e |
+++ b/tests/dht.rc
|
|
|
21ab4e |
@@ -65,11 +65,31 @@ function get_hashed_brick()
|
|
|
21ab4e |
return $hashed
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
+function cluster_rebalance_completed()
|
|
|
21ab4e |
+{
|
|
|
21ab4e |
+ val=1
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ # Rebalance status will be either "failed" or "completed"
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ test=$($CLI_1 volume rebalance $V0 status | grep "in progress" 2>&1)
|
|
|
21ab4e |
+ if [ $? -ne 0 ]
|
|
|
21ab4e |
+ then
|
|
|
21ab4e |
+ val=0
|
|
|
21ab4e |
+ fi
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ echo $val
|
|
|
21ab4e |
+ # Do not *return* the value here. If it's non-zero, that will cause
|
|
|
21ab4e |
+ # EXPECT_WITHIN (e.g. in bug-884455.t) to return prematurely, leading to
|
|
|
21ab4e |
+ # a spurious test failure. Nothing else checks the return value anyway
|
|
|
21ab4e |
+ # (they all check the output) so there's no need for it to be non-zero
|
|
|
21ab4e |
+ # just because grep didn't find what we want.
|
|
|
21ab4e |
+}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
|
|
|
21ab4e |
function rebalance_completed()
|
|
|
21ab4e |
{
|
|
|
21ab4e |
val=1
|
|
|
21ab4e |
- test=$(gluster volume rebalance $V0 status | grep localhost | grep "completed" 2>&1)
|
|
|
21ab4e |
+ test=$($CLI volume rebalance $V0 status | grep localhost | grep "completed" 2>&1)
|
|
|
21ab4e |
if [ $? -eq 0 ]
|
|
|
21ab4e |
then
|
|
|
21ab4e |
val=0
|
|
|
21ab4e |
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
|
|
|
21ab4e |
index 264ca65..9286125 100644
|
|
|
21ab4e |
--- a/xlators/cluster/dht/src/dht-common.c
|
|
|
21ab4e |
+++ b/xlators/cluster/dht/src/dht-common.c
|
|
|
21ab4e |
@@ -2997,6 +2997,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
|
|
|
21ab4e |
out:
|
|
|
21ab4e |
return ret;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
int
|
|
|
21ab4e |
dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
21ab4e |
int op_ret, int op_errno, dict_t *xattr,
|
|
|
21ab4e |
@@ -3012,6 +3014,11 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
21ab4e |
char *next_uuid_str = NULL;
|
|
|
21ab4e |
char *saveptr = NULL;
|
|
|
21ab4e |
uuid_t node_uuid = {0,};
|
|
|
21ab4e |
+ char *uuid_list_copy = NULL;
|
|
|
21ab4e |
+ int count = 0;
|
|
|
21ab4e |
+ int i = 0;
|
|
|
21ab4e |
+ int index = 0;
|
|
|
21ab4e |
+ int found = 0;
|
|
|
21ab4e |
|
|
|
21ab4e |
|
|
|
21ab4e |
VALIDATE_OR_GOTO (frame, out);
|
|
|
21ab4e |
@@ -3021,6 +3028,10 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
21ab4e |
prev = cookie;
|
|
|
21ab4e |
conf = this->private;
|
|
|
21ab4e |
|
|
|
21ab4e |
+ VALIDATE_OR_GOTO (conf->defrag, out);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ gf_msg_debug (this->name, 0, "subvol %s returned", prev->name);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
LOCK (&frame->lock);
|
|
|
21ab4e |
{
|
|
|
21ab4e |
this_call_cnt = --local->call_cnt;
|
|
|
21ab4e |
@@ -3044,6 +3055,15 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
21ab4e |
goto unlock;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
+ /* As DHT will not know details of its child xlators
|
|
|
21ab4e |
+ * we need to parse this twice to get the count first
|
|
|
21ab4e |
+ * and allocate memory later.
|
|
|
21ab4e |
+ */
|
|
|
21ab4e |
+ count = 0;
|
|
|
21ab4e |
+ index = conf->local_subvols_cnt;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ uuid_list_copy = gf_strdup (uuid_list);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
for (uuid_str = strtok_r (uuid_list, " ", &saveptr);
|
|
|
21ab4e |
uuid_str;
|
|
|
21ab4e |
uuid_str = next_uuid_str) {
|
|
|
21ab4e |
@@ -3053,24 +3073,58 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
21ab4e |
gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
21ab4e |
DHT_MSG_UUID_PARSE_ERROR,
|
|
|
21ab4e |
"Failed to parse uuid"
|
|
|
21ab4e |
- " failed for %s", prev->name);
|
|
|
21ab4e |
+ " for %s", prev->name);
|
|
|
21ab4e |
local->op_ret = -1;
|
|
|
21ab4e |
local->op_errno = EINVAL;
|
|
|
21ab4e |
goto unlock;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
+ count++;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
if (gf_uuid_compare (node_uuid, conf->defrag->node_uuid)) {
|
|
|
21ab4e |
gf_msg_debug (this->name, 0, "subvol %s does not"
|
|
|
21ab4e |
"belong to this node",
|
|
|
21ab4e |
prev->name);
|
|
|
21ab4e |
} else {
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ /* handle multiple bricks of the same replica
|
|
|
21ab4e |
+ * on the same node */
|
|
|
21ab4e |
+ if (found)
|
|
|
21ab4e |
+ continue;
|
|
|
21ab4e |
conf->local_subvols[(conf->local_subvols_cnt)++]
|
|
|
21ab4e |
- = prev;
|
|
|
21ab4e |
+ = prev;
|
|
|
21ab4e |
+ found = 1;
|
|
|
21ab4e |
gf_msg_debug (this->name, 0, "subvol %s belongs to"
|
|
|
21ab4e |
" this node", prev->name);
|
|
|
21ab4e |
- break;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ if (!found) {
|
|
|
21ab4e |
+ local->op_ret = 0;
|
|
|
21ab4e |
+ goto unlock;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ conf->local_nodeuuids[index].count = count;
|
|
|
21ab4e |
+ conf->local_nodeuuids[index].uuids
|
|
|
21ab4e |
+ = GF_CALLOC (count, sizeof (uuid_t), 1);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ /* The node-uuids are guaranteed to be returned in the same
|
|
|
21ab4e |
+ * order as the bricks
|
|
|
21ab4e |
+ * A null node-uuid is returned for a brick that is down.
|
|
|
21ab4e |
+ */
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ saveptr = NULL;
|
|
|
21ab4e |
+ i = 0;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ for (uuid_str = strtok_r (uuid_list_copy, " ", &saveptr);
|
|
|
21ab4e |
+ uuid_str;
|
|
|
21ab4e |
+ uuid_str = next_uuid_str) {
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ next_uuid_str = strtok_r (NULL, " ", &saveptr);
|
|
|
21ab4e |
+ gf_uuid_parse (uuid_str,
|
|
|
21ab4e |
+ conf->local_nodeuuids[index].uuids[i]);
|
|
|
21ab4e |
+ i++;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
local->op_ret = 0;
|
|
|
21ab4e |
@@ -3088,8 +3142,12 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
|
|
|
21ab4e |
unwind:
|
|
|
21ab4e |
+ GF_FREE (conf->local_nodeuuids[index].uuids);
|
|
|
21ab4e |
+ conf->local_nodeuuids[index].uuids = NULL;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, xdata);
|
|
|
21ab4e |
out:
|
|
|
21ab4e |
+ GF_FREE (uuid_list_copy);
|
|
|
21ab4e |
return 0;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
|
|
|
21ab4e |
index b4d9e84..184bd22 100644
|
|
|
21ab4e |
--- a/xlators/cluster/dht/src/dht-common.h
|
|
|
21ab4e |
+++ b/xlators/cluster/dht/src/dht-common.h
|
|
|
21ab4e |
@@ -356,6 +356,7 @@ struct dht_container {
|
|
|
21ab4e |
xlator_t *this;
|
|
|
21ab4e |
loc_t *parent_loc;
|
|
|
21ab4e |
dict_t *migrate_data;
|
|
|
21ab4e |
+ int local_subvol_index;
|
|
|
21ab4e |
};
|
|
|
21ab4e |
|
|
|
21ab4e |
typedef enum tier_mode_ {
|
|
|
21ab4e |
@@ -410,6 +411,13 @@ typedef struct gf_tier_conf {
|
|
|
21ab4e |
char volname[GD_VOLUME_NAME_MAX + 1];
|
|
|
21ab4e |
} gf_tier_conf_t;
|
|
|
21ab4e |
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+typedef struct subvol_nodeuuids {
|
|
|
21ab4e |
+ uuid_t *uuids;
|
|
|
21ab4e |
+ int count;
|
|
|
21ab4e |
+} subvol_nodeuuid_t;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
struct gf_defrag_info_ {
|
|
|
21ab4e |
uint64_t total_files;
|
|
|
21ab4e |
uint64_t total_data;
|
|
|
21ab4e |
@@ -543,6 +551,7 @@ struct dht_conf {
|
|
|
21ab4e |
|
|
|
21ab4e |
/*local subvol storage for rebalance*/
|
|
|
21ab4e |
xlator_t **local_subvols;
|
|
|
21ab4e |
+ subvol_nodeuuid_t *local_nodeuuids;
|
|
|
21ab4e |
int32_t local_subvols_cnt;
|
|
|
21ab4e |
|
|
|
21ab4e |
/*
|
|
|
21ab4e |
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
|
|
|
21ab4e |
index c22f700..40c6eb5 100644
|
|
|
21ab4e |
--- a/xlators/cluster/dht/src/dht-helper.c
|
|
|
21ab4e |
+++ b/xlators/cluster/dht/src/dht-helper.c
|
|
|
21ab4e |
@@ -992,7 +992,11 @@ dht_init_local_subvolumes (xlator_t *this, dht_conf_t *conf)
|
|
|
21ab4e |
|
|
|
21ab4e |
conf->local_subvols = GF_CALLOC (cnt, sizeof (xlator_t *),
|
|
|
21ab4e |
gf_dht_mt_xlator_t);
|
|
|
21ab4e |
- if (!conf->local_subvols) {
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ /* FIX FIX : do this dynamically*/
|
|
|
21ab4e |
+ conf->local_nodeuuids = GF_CALLOC (cnt, sizeof (subvol_nodeuuid_t),
|
|
|
21ab4e |
+ gf_dht_nodeuuids_t);
|
|
|
21ab4e |
+ if (!conf->local_subvols || !conf->local_nodeuuids) {
|
|
|
21ab4e |
return -1;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h
|
|
|
21ab4e |
index 5de5d18..19cccef 100644
|
|
|
21ab4e |
--- a/xlators/cluster/dht/src/dht-mem-types.h
|
|
|
21ab4e |
+++ b/xlators/cluster/dht/src/dht-mem-types.h
|
|
|
21ab4e |
@@ -38,6 +38,8 @@ enum gf_dht_mem_types_ {
|
|
|
21ab4e |
gf_tier_mt_ipc_ctr_params_t,
|
|
|
21ab4e |
gf_dht_mt_fd_ctx_t,
|
|
|
21ab4e |
gf_tier_mt_qfile_array_t,
|
|
|
21ab4e |
+ gf_dht_ret_cache_t,
|
|
|
21ab4e |
+ gf_dht_nodeuuids_t,
|
|
|
21ab4e |
gf_dht_mt_end
|
|
|
21ab4e |
};
|
|
|
21ab4e |
#endif
|
|
|
21ab4e |
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
21ab4e |
index f1189e9..507ca81 100644
|
|
|
21ab4e |
--- a/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
21ab4e |
+++ b/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
21ab4e |
@@ -2441,6 +2441,43 @@ gf_defrag_ctx_subvols_init (dht_dfoffset_ctx_t *offset_var, xlator_t *this) {
|
|
|
21ab4e |
return 0;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+/* Return value
|
|
|
21ab4e |
+ * 0 : this node does not migrate the file
|
|
|
21ab4e |
+ * 1 : this node migrates the file
|
|
|
21ab4e |
+ */
|
|
|
21ab4e |
+int
|
|
|
21ab4e |
+gf_defrag_should_i_migrate (xlator_t *this, int local_subvol_index, uuid_t gfid)
|
|
|
21ab4e |
+{
|
|
|
21ab4e |
+ int ret = 0;
|
|
|
21ab4e |
+ int i = local_subvol_index;
|
|
|
21ab4e |
+ char *str = NULL;
|
|
|
21ab4e |
+ uint32_t hashval = 0;
|
|
|
21ab4e |
+ int32_t index = 0;
|
|
|
21ab4e |
+ dht_conf_t *conf = NULL;
|
|
|
21ab4e |
+ char buf[UUID_CANONICAL_FORM_LEN + 1] = {0, };
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ conf = this->private;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ /* Pure distribute */
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ if (conf->local_nodeuuids[i].count == 1) {
|
|
|
21ab4e |
+ return 1;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ str = uuid_utoa_r (gfid, buf);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ ret = dht_hash_compute (this, 0, str, &hashval);
|
|
|
21ab4e |
+ if (ret == 0) {
|
|
|
21ab4e |
+ index = (hashval % conf->local_nodeuuids[i].count);
|
|
|
21ab4e |
+ if (!gf_uuid_compare (conf->defrag->node_uuid,
|
|
|
21ab4e |
+ conf->local_nodeuuids[i].uuids[index]))
|
|
|
21ab4e |
+ ret = 1;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+ return ret;
|
|
|
21ab4e |
+}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
int
|
|
|
21ab4e |
gf_defrag_migrate_single_file (void *opaque)
|
|
|
21ab4e |
{
|
|
|
21ab4e |
@@ -2519,6 +2556,13 @@ gf_defrag_migrate_single_file (void *opaque)
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
+ if (!gf_defrag_should_i_migrate (this, rebal_entry->local_subvol_index,
|
|
|
21ab4e |
+ entry->d_stat.ia_gfid)) {
|
|
|
21ab4e |
+ gf_msg_debug (this->name, 0, "Don't migrate %s ",
|
|
|
21ab4e |
+ entry_loc.path);
|
|
|
21ab4e |
+ goto out;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
gf_uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid);
|
|
|
21ab4e |
|
|
|
21ab4e |
gf_uuid_copy (entry_loc.pargfid, loc->gfid);
|
|
|
21ab4e |
@@ -2955,6 +2999,8 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
+ tmp_container->local_subvol_index = i;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
tmp_container->df_entry->d_stat = df_entry->d_stat;
|
|
|
21ab4e |
|
|
|
21ab4e |
tmp_container->df_entry->d_ino = df_entry->d_ino;
|
|
|
21ab4e |
@@ -4034,6 +4080,32 @@ int gf_defrag_total_file_cnt (xlator_t *this, loc_t *root_loc)
|
|
|
21ab4e |
|
|
|
21ab4e |
|
|
|
21ab4e |
int
|
|
|
21ab4e |
+dht_get_local_subvols_and_nodeuuids (xlator_t *this, dht_conf_t *conf,
|
|
|
21ab4e |
+ loc_t *loc)
|
|
|
21ab4e |
+{
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ dict_t *dict = NULL;
|
|
|
21ab4e |
+ int ret = -1;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ /* Find local subvolumes */
|
|
|
21ab4e |
+ ret = syncop_getxattr (this, loc, &dict,
|
|
|
21ab4e |
+ GF_REBAL_FIND_LOCAL_SUBVOL,
|
|
|
21ab4e |
+ NULL, NULL);
|
|
|
21ab4e |
+ if (ret) {
|
|
|
21ab4e |
+ gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local "
|
|
|
21ab4e |
+ "subvolume determination failed with error: %d",
|
|
|
21ab4e |
+ -ret);
|
|
|
21ab4e |
+ ret = -1;
|
|
|
21ab4e |
+ goto out;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ ret = 0;
|
|
|
21ab4e |
+out:
|
|
|
21ab4e |
+ return ret;
|
|
|
21ab4e |
+}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+int
|
|
|
21ab4e |
gf_defrag_start_crawl (void *data)
|
|
|
21ab4e |
{
|
|
|
21ab4e |
xlator_t *this = NULL;
|
|
|
21ab4e |
@@ -4050,13 +4122,14 @@ gf_defrag_start_crawl (void *data)
|
|
|
21ab4e |
glusterfs_ctx_t *ctx = NULL;
|
|
|
21ab4e |
dht_methods_t *methods = NULL;
|
|
|
21ab4e |
int i = 0;
|
|
|
21ab4e |
- int thread_index = 0;
|
|
|
21ab4e |
- int err = 0;
|
|
|
21ab4e |
- int thread_spawn_count = 0;
|
|
|
21ab4e |
+ int thread_index = 0;
|
|
|
21ab4e |
+ int err = 0;
|
|
|
21ab4e |
+ int thread_spawn_count = 0;
|
|
|
21ab4e |
pthread_t *tid = NULL;
|
|
|
21ab4e |
- gf_boolean_t is_tier_detach = _gf_false;
|
|
|
21ab4e |
+ gf_boolean_t is_tier_detach = _gf_false;
|
|
|
21ab4e |
call_frame_t *statfs_frame = NULL;
|
|
|
21ab4e |
xlator_t *old_THIS = NULL;
|
|
|
21ab4e |
+ int j = 0;
|
|
|
21ab4e |
|
|
|
21ab4e |
this = data;
|
|
|
21ab4e |
if (!this)
|
|
|
21ab4e |
@@ -4185,14 +4258,9 @@ gf_defrag_start_crawl (void *data)
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
- /* Find local subvolumes */
|
|
|
21ab4e |
- ret = syncop_getxattr (this, &loc, &dict,
|
|
|
21ab4e |
- GF_REBAL_FIND_LOCAL_SUBVOL,
|
|
|
21ab4e |
- NULL, NULL);
|
|
|
21ab4e |
+        ret = dht_get_local_subvols_and_nodeuuids (this, conf, &loc);
|
|
|
21ab4e |
if (ret) {
|
|
|
21ab4e |
- gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local "
|
|
|
21ab4e |
- "subvolume determination failed with error: %d",
|
|
|
21ab4e |
- -ret);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
ret = -1;
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
@@ -4200,6 +4268,11 @@ gf_defrag_start_crawl (void *data)
|
|
|
21ab4e |
for (i = 0 ; i < conf->local_subvols_cnt; i++) {
|
|
|
21ab4e |
gf_msg (this->name, GF_LOG_INFO, 0, 0, "local subvols "
|
|
|
21ab4e |
"are %s", conf->local_subvols[i]->name);
|
|
|
21ab4e |
+ for (j = 0; j < conf->local_nodeuuids[i].count; j++) {
|
|
|
21ab4e |
+ gf_msg (this->name, GF_LOG_INFO, 0, 0,
|
|
|
21ab4e |
+ "node uuids are %s",
|
|
|
21ab4e |
+ uuid_utoa(conf->local_nodeuuids[i].uuids[j]));
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
        ret = gf_defrag_total_file_cnt (this, &loc);
|
|
|
21ab4e |
diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
|
|
|
21ab4e |
index abd8925..1fba88a 100644
|
|
|
21ab4e |
--- a/xlators/cluster/dht/src/tier.c
|
|
|
21ab4e |
+++ b/xlators/cluster/dht/src/tier.c
|
|
|
21ab4e |
@@ -199,10 +199,18 @@ out:
|
|
|
21ab4e |
static int
|
|
|
21ab4e |
tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
|
|
|
21ab4e |
{
|
|
|
21ab4e |
- int ret = -1;
|
|
|
21ab4e |
- dict_t *dict = NULL;
|
|
|
21ab4e |
- char *uuid_str = NULL;
|
|
|
21ab4e |
- uuid_t node_uuid = {0,};
|
|
|
21ab4e |
+ int ret = -1;
|
|
|
21ab4e |
+ dict_t *dict = NULL;
|
|
|
21ab4e |
+ char *uuid_str = NULL;
|
|
|
21ab4e |
+ uuid_t node_uuid = {0,};
|
|
|
21ab4e |
+ char *dup_str = NULL;
|
|
|
21ab4e |
+ char *str = NULL;
|
|
|
21ab4e |
+ char *save_ptr = NULL;
|
|
|
21ab4e |
+ int count = 0;
|
|
|
21ab4e |
+ uint32_t hashval = 0;
|
|
|
21ab4e |
+ int32_t index = 0;
|
|
|
21ab4e |
+ char buf[GF_UUID_BUF_SIZE] = {0,};
|
|
|
21ab4e |
+
|
|
|
21ab4e |
|
|
|
21ab4e |
GF_VALIDATE_OR_GOTO ("tier", this, out);
|
|
|
21ab4e |
GF_VALIDATE_OR_GOTO (this->name, loc, out);
|
|
|
21ab4e |
@@ -216,15 +224,56 @@ tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ /* This returns multiple node-uuids now - one for each brick
|
|
|
21ab4e |
+ * of the subvol.
|
|
|
21ab4e |
+ */
|
|
|
21ab4e |
+
|
|
|
21ab4e |
if (dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, &uuid_str) < 0) {
|
|
|
21ab4e |
gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
|
|
|
21ab4e |
"Failed to get node-uuid for %s", loc->path);
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
+ dup_str = gf_strdup (uuid_str);
|
|
|
21ab4e |
+ str = dup_str;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ /* How many uuids returned?
|
|
|
21ab4e |
+ * No need to check if one of these is that of the current node.
|
|
|
21ab4e |
+ */
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ count = 1;
|
|
|
21ab4e |
+ while ((str = strchr (str, ' '))) {
|
|
|
21ab4e |
+ count++;
|
|
|
21ab4e |
+ str++;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ /* Only one node-uuid - pure distribute? */
|
|
|
21ab4e |
+ if (count == 1)
|
|
|
21ab4e |
+ goto check_node;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ uuid_utoa_r (loc->gfid, buf);
|
|
|
21ab4e |
+ ret = dht_hash_compute (this, 0, buf, &hashval);
|
|
|
21ab4e |
+ if (ret == 0) {
|
|
|
21ab4e |
+ index = (hashval % count);
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ count = 0;
|
|
|
21ab4e |
+ str = dup_str;
|
|
|
21ab4e |
+ while ((uuid_str = strtok_r (str, " ", &save_ptr))) {
|
|
|
21ab4e |
+ if (count == index)
|
|
|
21ab4e |
+ break;
|
|
|
21ab4e |
+ count++;
|
|
|
21ab4e |
+ str = NULL;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+check_node:
|
|
|
21ab4e |
+
|
|
|
21ab4e |
if (gf_uuid_parse (uuid_str, node_uuid)) {
|
|
|
21ab4e |
gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
|
|
|
21ab4e |
"uuid_parse failed for %s", loc->path);
|
|
|
21ab4e |
+ ret = -1;
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
@@ -240,6 +289,7 @@ out:
|
|
|
21ab4e |
if (dict)
|
|
|
21ab4e |
dict_unref(dict);
|
|
|
21ab4e |
|
|
|
21ab4e |
+ GF_FREE (dup_str);
|
|
|
21ab4e |
return ret;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
--
|
|
|
21ab4e |
1.8.3.1
|
|
|
21ab4e |
|