21ab4e
From 362f13dda23925530a9d89bdf3568160af2bc07c Mon Sep 17 00:00:00 2001
21ab4e
From: Xavier Hernandez <xhernandez@datalab.es>
21ab4e
Date: Fri, 12 May 2017 09:23:47 +0200
21ab4e
Subject: [PATCH 460/473] cluster/ec: return all node uuids from all subvolumes
21ab4e
21ab4e
EC was returning the UUID of the brick with the smallest value. This had
21ab4e
the side effect of not evenly balancing the load between bricks on
21ab4e
rebalance operations.
21ab4e
21ab4e
This patch modifies the common functions that combine multiple subvolume
21ab4e
values into a single result to take into account the subvolume order
21ab4e
and, optionally, other subvolumes that could be damaged.
21ab4e
21ab4e
This makes it easier to add future features where brick order is important.
21ab4e
It also makes it possible to easily identify the originating brick of each
21ab4e
answer, in case some brick has a special meaning in the future.
21ab4e
21ab4e
>Change-Id: Iee0a4da710b41224a6dc8e13fa8dcddb36c73a2f
21ab4e
>BUG: 1366817
21ab4e
>Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
21ab4e
>Reviewed-on: https://review.gluster.org/17297
21ab4e
>Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
21ab4e
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
>Reviewed-by: Ashish Pandey <aspandey@redhat.com>
21ab4e
>Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
21ab4e
>Signed-off-by: Ashish Pandey <aspandey@redhat.com>
21ab4e
21ab4e
Change-Id: Iee0a4da710b41224a6dc8e13fa8dcddb36c73a2f
21ab4e
BUG: 1315781
21ab4e
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/106643
21ab4e
Reviewed-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
21ab4e
Tested-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
21ab4e
---
21ab4e
 xlators/cluster/ec/src/ec-combine.c | 241 ++++++++++++++++++++----------------
21ab4e
 xlators/cluster/ec/src/ec.c         |   5 +-
21ab4e
 2 files changed, 141 insertions(+), 105 deletions(-)
21ab4e
21ab4e
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c
21ab4e
index fb2e933..60605be 100644
21ab4e
--- a/xlators/cluster/ec/src/ec-combine.c
21ab4e
+++ b/xlators/cluster/ec/src/ec-combine.c
21ab4e
@@ -22,6 +22,8 @@
21ab4e
 
21ab4e
 #define EC_QUOTA_PREFIX "trusted.glusterfs.quota."
21ab4e
 
21ab4e
+#define EC_MISSING_DATA ((data_t *)1ULL)
21ab4e
+
21ab4e
 struct _ec_dict_info;
21ab4e
 typedef struct _ec_dict_info ec_dict_info_t;
21ab4e
 
21ab4e
@@ -285,35 +287,45 @@ ec_dict_compare (dict_t *dict1, dict_t *dict2)
21ab4e
         return 0;
21ab4e
 }
21ab4e
 
21ab4e
-int32_t ec_dict_list(data_t ** list, int32_t * count, ec_cbk_data_t * cbk,
21ab4e
-                     int32_t which, char * key)
21ab4e
+static uint32_t
21ab4e
+ec_dict_list(data_t **list, ec_cbk_data_t *cbk, int32_t which, char *key,
21ab4e
+             gf_boolean_t global)
21ab4e
 {
21ab4e
-    ec_cbk_data_t *ans = NULL;
21ab4e
-    dict_t *dict = NULL;
21ab4e
-    int32_t i, max;
21ab4e
-
21ab4e
-    max = *count;
21ab4e
-    i = 0;
21ab4e
-    for (ans = cbk; ans != NULL; ans = ans->next) {
21ab4e
-        if (i >= max) {
21ab4e
-            gf_msg (cbk->fop->xl->name, GF_LOG_ERROR, EINVAL,
21ab4e
-                    EC_MSG_INVALID_DICT_NUMS,
21ab4e
-                    "Unexpected number of "
21ab4e
-                    "dictionaries");
21ab4e
-
21ab4e
-            return -EINVAL;
21ab4e
+        ec_t *ec = cbk->fop->xl->private;
21ab4e
+        ec_cbk_data_t *ans = NULL;
21ab4e
+        dict_t *dict = NULL;
21ab4e
+        data_t *data;
21ab4e
+        uint32_t count;
21ab4e
+        int32_t i;
21ab4e
+
21ab4e
+        for (i = 0; i < ec->nodes; i++) {
21ab4e
+                /* We initialize the list with EC_MISSING_DATA if we are
21ab4e
+                 * returning a global list or the current subvolume belongs
21ab4e
+                 * to the group of the accepted answer. Note that if some
21ab4e
+                 * subvolume is known to be down before issuing the request,
21ab4e
+                 * we won't have any answer from it, so we set here the
21ab4e
+                 * appropriate default value. */
21ab4e
+                if (global || ((cbk->mask & (1ULL << i)) != 0)) {
21ab4e
+                        list[i] = EC_MISSING_DATA;
21ab4e
+                } else {
21ab4e
+                        list[i] = NULL;
21ab4e
+                }
21ab4e
         }
21ab4e
 
21ab4e
-        dict = (which == EC_COMBINE_XDATA) ? ans->xdata : ans->dict;
21ab4e
-        list[i] = dict_get(dict, key);
21ab4e
-        if (list[i] != NULL) {
21ab4e
-            i++;
21ab4e
+        count = 0;
21ab4e
+        list_for_each_entry(ans, &cbk->fop->answer_list, answer_list) {
21ab4e
+                if (global || ((cbk->mask & ans->mask) != 0)) {
21ab4e
+                        dict = (which == EC_COMBINE_XDATA) ? ans->xdata
21ab4e
+                                                           : ans->dict;
21ab4e
+                        data = dict_get(dict, key);
21ab4e
+                        if (data != NULL) {
21ab4e
+                                list[ans->idx] = data;
21ab4e
+                                count++;
21ab4e
+                        }
21ab4e
+                }
21ab4e
         }
21ab4e
-    }
21ab4e
-
21ab4e
-    *count = i;
21ab4e
 
21ab4e
-    return 0;
21ab4e
+        return count;
21ab4e
 }
21ab4e
 
21ab4e
 int32_t ec_concat_prepare(xlator_t *xl, char **str, char **sep, char **post,
21ab4e
@@ -352,23 +364,21 @@ out:
21ab4e
     return -EINVAL;
21ab4e
 }
21ab4e
 
21ab4e
-int32_t ec_dict_data_concat(const char * fmt, ec_cbk_data_t * cbk,
21ab4e
-                            int32_t which, char * key, ...)
21ab4e
+static int32_t
21ab4e
+ec_dict_data_concat(const char *fmt, ec_cbk_data_t *cbk, int32_t which,
21ab4e
+                    char *key, const char *def, gf_boolean_t global, ...)
21ab4e
 {
21ab4e
-    data_t * data[cbk->count];
21ab4e
-    char * str = NULL, * pre = NULL, * sep, * post;
21ab4e
-    dict_t * dict;
21ab4e
+    ec_t *ec = cbk->fop->xl->private;
21ab4e
+    data_t *data[ec->nodes];
21ab4e
+    char *str = NULL, *pre = NULL, *sep, *post;
21ab4e
+    dict_t *dict;
21ab4e
     va_list args;
21ab4e
-    int32_t i, num, len, prelen, postlen, seplen, tmp;
21ab4e
+    int32_t i, num, len, deflen, prelen, postlen, seplen, tmp;
21ab4e
     int32_t err;
21ab4e
 
21ab4e
-    num = cbk->count;
21ab4e
-    err = ec_dict_list(data, &num, cbk, which, key);
21ab4e
-    if (err != 0) {
21ab4e
-        return err;
21ab4e
-    }
21ab4e
+    ec_dict_list(data, cbk, which, key, global);
21ab4e
 
21ab4e
-    va_start(args, key);
21ab4e
+    va_start(args, global);
21ab4e
     err = ec_concat_prepare(cbk->fop->xl, &pre, &sep, &post, fmt, args);
21ab4e
     va_end(args);
21ab4e
 
21ab4e
@@ -380,9 +390,29 @@ int32_t ec_dict_data_concat(const char * fmt, ec_cbk_data_t * cbk,
21ab4e
     seplen = strlen(sep);
21ab4e
     postlen = strlen(post);
21ab4e
 
21ab4e
-    len = prelen + (num - 1) * seplen + postlen + 1;
21ab4e
-    for (i = 0; i < num; i++) {
21ab4e
-        len += data[i]->len - 1;
21ab4e
+    deflen = 0;
21ab4e
+    if (def != NULL) {
21ab4e
+        deflen = strlen(def);
21ab4e
+    }
21ab4e
+
21ab4e
+    len = prelen + postlen + 1;
21ab4e
+    num = -1;
21ab4e
+    for (i = 0; i < ec->nodes; i++) {
21ab4e
+        if (data[i] == NULL) {
21ab4e
+            continue;
21ab4e
+        }
21ab4e
+        if (data[i] == EC_MISSING_DATA) {
21ab4e
+            if (def == NULL) {
21ab4e
+                continue;
21ab4e
+            }
21ab4e
+            len += deflen;
21ab4e
+        } else {
21ab4e
+            len += data[i]->len - 1;
21ab4e
+        }
21ab4e
+        if (num >= 0) {
21ab4e
+            len += seplen;
21ab4e
+        }
21ab4e
+        num++;
21ab4e
     }
21ab4e
 
21ab4e
     err = -ENOMEM;
21ab4e
@@ -394,14 +424,25 @@ int32_t ec_dict_data_concat(const char * fmt, ec_cbk_data_t * cbk,
21ab4e
 
21ab4e
     memcpy(str, pre, prelen);
21ab4e
     len = prelen;
21ab4e
-    for (i = 0; i < num; i++) {
21ab4e
-        if (i > 0) {
21ab4e
+    for (i = 0; i < ec->nodes; i++) {
21ab4e
+        if (data[i] == NULL) {
21ab4e
+            continue;
21ab4e
+        }
21ab4e
+        if (data[i] == EC_MISSING_DATA) {
21ab4e
+            if (deflen == 0) {
21ab4e
+                continue;
21ab4e
+            }
21ab4e
+            tmp = deflen;
21ab4e
+            memcpy(str + len, def, tmp);
21ab4e
+        } else {
21ab4e
+            tmp = data[i]->len - 1;
21ab4e
+            memcpy(str + len, data[i]->data, tmp);
21ab4e
+        }
21ab4e
+        len += tmp;
21ab4e
+        if (i < num) {
21ab4e
             memcpy(str + len, sep, seplen);
21ab4e
             len += seplen;
21ab4e
         }
21ab4e
-        tmp = data[i]->len - 1;
21ab4e
-        memcpy(str + len, data[i]->data, tmp);
21ab4e
-        len += tmp;
21ab4e
     }
21ab4e
     memcpy(str + len, post, postlen + 1);
21ab4e
 
21ab4e
@@ -422,30 +463,26 @@ out:
21ab4e
 
21ab4e
 int32_t ec_dict_data_merge(ec_cbk_data_t *cbk, int32_t which, char *key)
21ab4e
 {
21ab4e
-    data_t *data[cbk->count];
21ab4e
+    ec_t *ec = cbk->fop->xl->private;
21ab4e
+    data_t *data[ec->nodes];
21ab4e
     dict_t *dict, *lockinfo, *tmp = NULL;
21ab4e
     char *ptr = NULL;
21ab4e
-    int32_t i, num, len;
21ab4e
+    int32_t i, len;
21ab4e
     int32_t err;
21ab4e
 
21ab4e
-    num = cbk->count;
21ab4e
-    err = ec_dict_list(data, &num, cbk, which, key);
21ab4e
-    if (err != 0) {
21ab4e
-        return err;
21ab4e
-    }
21ab4e
+
21ab4e
+    ec_dict_list(data, cbk, which, key, _gf_false);
21ab4e
 
21ab4e
     lockinfo = dict_new();
21ab4e
     if (lockinfo == NULL) {
21ab4e
         return -ENOMEM;
21ab4e
     }
21ab4e
 
21ab4e
-    err = dict_unserialize(data[0]->data, data[0]->len, &lockinfo);
21ab4e
-    if (err != 0) {
21ab4e
-        goto out;
21ab4e
-    }
21ab4e
+    for (i = 0; i < ec->nodes; i++) {
21ab4e
+        if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
21ab4e
+            continue;
21ab4e
+        }
21ab4e
 
21ab4e
-    for (i = 1; i < num; i++)
21ab4e
-    {
21ab4e
         tmp = dict_new();
21ab4e
         if (tmp == NULL) {
21ab4e
             err = -ENOMEM;
21ab4e
@@ -532,19 +569,20 @@ int32_t ec_dict_data_uuid(ec_cbk_data_t * cbk, int32_t which, char * key)
21ab4e
 
21ab4e
 int32_t ec_dict_data_max32(ec_cbk_data_t *cbk, int32_t which, char *key)
21ab4e
 {
21ab4e
-    data_t * data[cbk->count];
21ab4e
-    dict_t * dict;
21ab4e
-    int32_t i, num, err;
21ab4e
+    ec_t *ec = cbk->fop->xl->private;
21ab4e
+    data_t *data[ec->nodes];
21ab4e
+    dict_t *dict;
21ab4e
+    int32_t i;
21ab4e
     uint32_t max, tmp;
21ab4e
 
21ab4e
-    num = cbk->count;
21ab4e
-    err = ec_dict_list(data, &num, cbk, which, key);
21ab4e
-    if (err != 0) {
21ab4e
-        return err;
21ab4e
-    }
21ab4e
+    ec_dict_list(data, cbk, which, key, _gf_false);
21ab4e
+
21ab4e
+    max = 0;
21ab4e
+    for (i = 0; i < ec->nodes; i++) {
21ab4e
+        if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
21ab4e
+            continue;
21ab4e
+        }
21ab4e
 
21ab4e
-    max = data_to_uint32(data[0]);
21ab4e
-    for (i = 1; i < num; i++) {
21ab4e
         tmp = data_to_uint32(data[i]);
21ab4e
         if (max < tmp) {
21ab4e
             max = tmp;
21ab4e
@@ -557,19 +595,20 @@ int32_t ec_dict_data_max32(ec_cbk_data_t *cbk, int32_t which, char *key)
21ab4e
 
21ab4e
 int32_t ec_dict_data_max64(ec_cbk_data_t *cbk, int32_t which, char *key)
21ab4e
 {
21ab4e
-    data_t *data[cbk->count];
21ab4e
+    ec_t *ec = cbk->fop->xl->private;
21ab4e
+    data_t *data[ec->nodes];
21ab4e
     dict_t *dict;
21ab4e
-    int32_t i, num, err;
21ab4e
+    int32_t i;
21ab4e
     uint64_t max, tmp;
21ab4e
 
21ab4e
-    num = cbk->count;
21ab4e
-    err = ec_dict_list(data, &num, cbk, which, key);
21ab4e
-    if (err != 0) {
21ab4e
-        return err;
21ab4e
-    }
21ab4e
+    ec_dict_list(data, cbk, which, key, _gf_false);
21ab4e
+
21ab4e
+    max = 0;
21ab4e
+    for (i = 0; i < ec->nodes; i++) {
21ab4e
+        if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
21ab4e
+            continue;
21ab4e
+        }
21ab4e
 
21ab4e
-    max = data_to_uint64(data[0]);
21ab4e
-    for (i = 1; i < num; i++) {
21ab4e
         tmp = data_to_uint64(data[i]);
21ab4e
         if (max < tmp) {
21ab4e
             max = tmp;
21ab4e
@@ -582,22 +621,14 @@ int32_t ec_dict_data_max64(ec_cbk_data_t *cbk, int32_t which, char *key)
21ab4e
 
21ab4e
 int32_t ec_dict_data_quota(ec_cbk_data_t *cbk, int32_t which, char *key)
21ab4e
 {
21ab4e
-    data_t      *data[cbk->count];
21ab4e
+    ec_t        *ec               = cbk->fop->xl->private;
21ab4e
+    data_t      *data[ec->nodes];
21ab4e
     dict_t      *dict             = NULL;
21ab4e
-    ec_t        *ec               = NULL;
21ab4e
     int32_t      i                = 0;
21ab4e
-    int32_t      num              = 0;
21ab4e
-    int32_t      err              = 0;
21ab4e
     quota_meta_t size             = {0, };
21ab4e
     quota_meta_t max_size         = {0, };
21ab4e
 
21ab4e
-    num = cbk->count;
21ab4e
-    err = ec_dict_list(data, &num, cbk, which, key);
21ab4e
-    if (err != 0) {
21ab4e
-        return err;
21ab4e
-    }
21ab4e
-
21ab4e
-    if (num == 0) {
21ab4e
+    if (ec_dict_list(data, cbk, which, key, _gf_false) == 0) {
21ab4e
         return 0;
21ab4e
     }
21ab4e
 
21ab4e
@@ -606,8 +637,9 @@ int32_t ec_dict_data_quota(ec_cbk_data_t *cbk, int32_t which, char *key)
21ab4e
      * bricks and we can receive slightly different values. If that's the
21ab4e
      * case, we take the maximum of all received values.
21ab4e
      */
21ab4e
-    for (i = 0; i < num; i++) {
21ab4e
-        if (quota_data_to_meta (data[i], QUOTA_SIZE_KEY, &size) < 0) {
21ab4e
+    for (i = 0; i < ec->nodes; i++) {
21ab4e
+        if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA) ||
21ab4e
+            (quota_data_to_meta (data[i], QUOTA_SIZE_KEY, &size) < 0)) {
21ab4e
                 continue;
21ab4e
         }
21ab4e
 
21ab4e
@@ -619,7 +651,6 @@ int32_t ec_dict_data_quota(ec_cbk_data_t *cbk, int32_t which, char *key)
21ab4e
                 max_size.dir_count = size.dir_count;
21ab4e
     }
21ab4e
 
21ab4e
-    ec = cbk->fop->xl->private;
21ab4e
     max_size.size *= ec->fragments;
21ab4e
 
21ab4e
     dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
21ab4e
@@ -628,18 +659,18 @@ int32_t ec_dict_data_quota(ec_cbk_data_t *cbk, int32_t which, char *key)
21ab4e
 
21ab4e
 int32_t ec_dict_data_stime(ec_cbk_data_t * cbk, int32_t which, char * key)
21ab4e
 {
21ab4e
-    data_t * data[cbk->count];
21ab4e
-    dict_t * dict;
21ab4e
-    int32_t i, num, err;
21ab4e
+    ec_t *ec = cbk->fop->xl->private;
21ab4e
+    data_t *data[ec->nodes];
21ab4e
+    dict_t *dict;
21ab4e
+    int32_t i, err;
21ab4e
 
21ab4e
-    num = cbk->count;
21ab4e
-    err = ec_dict_list(data, &num, cbk, which, key);
21ab4e
-    if (err != 0) {
21ab4e
-        return err;
21ab4e
-    }
21ab4e
+    ec_dict_list(data, cbk, which, key, _gf_false);
21ab4e
 
21ab4e
     dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
21ab4e
-    for (i = 1; i < num; i++) {
21ab4e
+    for (i = 0; i < ec->nodes; i++) {
21ab4e
+        if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
21ab4e
+            continue;
21ab4e
+        }
21ab4e
         err = gf_get_max_stime(cbk->fop->xl, dict, key, data[i]);
21ab4e
         if (err != 0) {
21ab4e
             gf_msg (cbk->fop->xl->name, GF_LOG_ERROR, -err,
21ab4e
@@ -661,12 +692,14 @@ int32_t ec_dict_data_combine(dict_t * dict, char * key, data_t * value,
21ab4e
         (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0))
21ab4e
     {
21ab4e
         return ec_dict_data_concat("(<EC:%s> { })", data->cbk, data->which,
21ab4e
-                                   key, data->cbk->fop->xl->name);
21ab4e
+                                   key, NULL, _gf_false,
21ab4e
+                                   data->cbk->fop->xl->name);
21ab4e
     }
21ab4e
 
21ab4e
     if (strncmp(key, GF_XATTR_CLRLK_CMD, strlen(GF_XATTR_CLRLK_CMD)) == 0)
21ab4e
     {
21ab4e
-        return ec_dict_data_concat("{\n}", data->cbk, data->which, key);
21ab4e
+        return ec_dict_data_concat("{\n}", data->cbk, data->which, key, NULL,
21ab4e
+                                   _gf_false);
21ab4e
     }
21ab4e
 
21ab4e
     if (strncmp(key, GF_XATTR_LOCKINFO_KEY,
21ab4e
@@ -696,9 +729,9 @@ int32_t ec_dict_data_combine(dict_t * dict, char * key, data_t * value,
21ab4e
         return 0;
21ab4e
     }
21ab4e
 
21ab4e
-    if (XATTR_IS_NODE_UUID(key))
21ab4e
-    {
21ab4e
-        return ec_dict_data_uuid(data->cbk, data->which, key);
21ab4e
+    if (XATTR_IS_NODE_UUID(key)) {
21ab4e
+        return ec_dict_data_concat("{ }", data->cbk, data->which, key,
21ab4e
+                                   UUID0_STR, _gf_true);
21ab4e
     }
21ab4e
 
21ab4e
     if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
21ab4e
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
21ab4e
index dfd8129..f687050 100644
21ab4e
--- a/xlators/cluster/ec/src/ec.c
21ab4e
+++ b/xlators/cluster/ec/src/ec.c
21ab4e
@@ -828,8 +828,11 @@ ec_gf_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
21ab4e
                                             NULL, ec_marker_populate_args) == 0)
21ab4e
                 return 0;
21ab4e
 
21ab4e
-        if (name && (fnmatch (GF_XATTR_STIME_PATTERN, name, 0) == 0))
21ab4e
+        if (name &&
21ab4e
+            ((fnmatch (GF_XATTR_STIME_PATTERN, name, 0) == 0) ||
21ab4e
+             (XATTR_IS_NODE_UUID(name)))) {
21ab4e
                 minimum = EC_MINIMUM_ALL;
21ab4e
+        }
21ab4e
 
21ab4e
         ec_getxattr (frame, this, -1, minimum, default_getxattr_cbk,
21ab4e
                      NULL, loc, name, xdata);
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e