From d7c52ddd2cbadb1d9a55767c2f7fe6ba38d9a2ed Mon Sep 17 00:00:00 2001 From: Sheetal Pamecha Date: Wed, 20 Nov 2019 12:42:12 +0530 Subject: [PATCH 431/449] glusterd: check for same node while adding bricks in disperse volume The optimal way for configuring disperse and replicate volumes is to have all bricks in different nodes. During create operation it fails saying it is not optimal, user must use force to over-ride this behavior. Implementing same during add-brick operation to avoid situation where all the added bricks end up from same host. Operation will error out accordingly. and this can be over-ridden by using force same as create. > Upstream Patch Link: https://review.gluster.org/#/c/glusterfs/+/23729 > fixes: #1047 > Change-Id: I3ee9c97c1a14b73f4532893bc00187ef9355238b > Signed-off-by: Sheetal Pamecha BUG: 1524457 Change-Id: I3ee9c97c1a14b73f4532893bc00187ef9355238b Signed-off-by: Sheetal Pamecha Reviewed-on: https://code.engineering.redhat.com/gerrit/202621 Tested-by: RHGS Build Bot Reviewed-by: Sanju Rakonde Reviewed-by: Sunil Kumar Heggodu Gopala Acharya --- xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 20 +- xlators/mgmt/glusterd/src/glusterd-utils.c | 224 ++++++++++++++++++ xlators/mgmt/glusterd/src/glusterd-utils.h | 4 + xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 293 +++--------------------- 4 files changed, 276 insertions(+), 265 deletions(-) diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index c5141de..d424f31 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -21,7 +21,6 @@ #include "glusterd-messages.h" #include "glusterd-server-quorum.h" #include -#include "glusterd-volgen.h" #include #include @@ -1575,6 +1574,25 @@ glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict) is_force = dict_get_str_boolean(dict, "force", _gf_false); + /* Check brick order if the volume type is replicate or disperse. If + * force at the end of command not given then check brick order. + */ + + if (!is_force) { + if ((volinfo->type == GF_CLUSTER_TYPE_REPLICATE) || + (volinfo->type == GF_CLUSTER_TYPE_DISPERSE)) { + ret = glusterd_check_brick_order(dict, msg, volinfo->type); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BAD_BRKORDER, + "Not adding brick because of " + "bad brick order. %s", + msg); + *op_errstr = gf_strdup(msg); + goto out; + } + } + } + if (volinfo->replica_count < replica_count && !is_force) { cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list) { diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index a1299bc..14e23d1 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -14759,3 +14759,227 @@ glusterd_is_profile_on(glusterd_volinfo_t *volinfo) return _gf_true; return _gf_false; } + +static gf_ai_compare_t +glusterd_compare_addrinfo(struct addrinfo *first, struct addrinfo *next) +{ + int ret = -1; + struct addrinfo *tmp1 = NULL; + struct addrinfo *tmp2 = NULL; + char firstip[NI_MAXHOST] = {0.}; + char nextip[NI_MAXHOST] = { + 0, + }; + + for (tmp1 = first; tmp1 != NULL; tmp1 = tmp1->ai_next) { + ret = getnameinfo(tmp1->ai_addr, tmp1->ai_addrlen, firstip, NI_MAXHOST, + NULL, 0, NI_NUMERICHOST); + if (ret) + return GF_AI_COMPARE_ERROR; + for (tmp2 = next; tmp2 != NULL; tmp2 = tmp2->ai_next) { + ret = getnameinfo(tmp2->ai_addr, tmp2->ai_addrlen, nextip, + NI_MAXHOST, NULL, 0, NI_NUMERICHOST); + if (ret) + return GF_AI_COMPARE_ERROR; + if (!strcmp(firstip, nextip)) { + return GF_AI_COMPARE_MATCH; + } + } + } + return GF_AI_COMPARE_NO_MATCH; +} + +/* Check for non optimal brick order for Replicate/Disperse : + * Checks if bricks belonging to a replicate or disperse + * volume are present on the same server + */ +int32_t +glusterd_check_brick_order(dict_t *dict, char *err_str, int32_t type) +{ + int ret = -1; + int i = 0; + int j = 0; + int k = 0; + xlator_t *this = NULL; + addrinfo_list_t *ai_list = NULL; + addrinfo_list_t *ai_list_tmp1 = NULL; + addrinfo_list_t *ai_list_tmp2 = NULL; + char *brick = NULL; + char *brick_list = NULL; + char *brick_list_dup = NULL; + char *brick_list_ptr = NULL; + char *tmpptr = NULL; + char *volname = NULL; + int32_t brick_count = 0; + int32_t sub_count = 0; + struct addrinfo *ai_info = NULL; + char brick_addr[128] = { + 0, + }; + int addrlen = 0; + + const char failed_string[2048] = + "Failed to perform brick order " + "check. Use 'force' at the end of the command" + " if you want to override this behavior. "; + const char found_string[2048] = + "Multiple bricks of a %s " + "volume are present on the same server. This " + "setup is not optimal. Bricks should be on " + "different nodes to have best fault tolerant " + "configuration. Use 'force' at the end of the " + "command if you want to override this " + "behavior. "; + + this = THIS; + + GF_ASSERT(this); + + ai_list = MALLOC(sizeof(addrinfo_list_t)); + ai_list->info = NULL; + CDS_INIT_LIST_HEAD(&ai_list->list); + + ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, + "Unable to get volume name"); + goto out; + } + + ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &brick_list); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, + "Bricks check : Could not " + "retrieve bricks list"); + goto out; + } + + ret = dict_get_int32n(dict, "count", SLEN("count"), &brick_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, + "Bricks check : Could not " + "retrieve brick count"); + goto out; + } + + if (type != GF_CLUSTER_TYPE_DISPERSE) { + ret = dict_get_int32n(dict, "replica-count", SLEN("replica-count"), + &sub_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, + "Bricks check : Could" + " not retrieve replica count"); + goto out; + } + gf_msg_debug(this->name, 0, + "Replicate cluster type " + "found. Checking brick order."); + } else { + ret = dict_get_int32n(dict, "disperse-count", SLEN("disperse-count"), + &sub_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, + "Bricks check : Could" + " not retrieve disperse count"); + goto out; + } + gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DISPERSE_CLUSTER_FOUND, + "Disperse cluster type" + " found. Checking brick order."); + } + brick_list_dup = brick_list_ptr = gf_strdup(brick_list); + /* Resolve hostnames and get addrinfo */ + while (i < brick_count) { + ++i; + brick = strtok_r(brick_list_dup, " \n", &tmpptr); + brick_list_dup = tmpptr; + if (brick == NULL) + goto check_failed; + tmpptr = strrchr(brick, ':'); + if (tmpptr == NULL) + goto check_failed; + addrlen = strlen(brick) - strlen(tmpptr); + strncpy(brick_addr, brick, addrlen); + brick_addr[addrlen] = '\0'; + ret = getaddrinfo(brick_addr, NULL, NULL, &ai_info); + if (ret != 0) { + ret = 0; + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HOSTNAME_RESOLVE_FAIL, + "unable to resolve host name for addr %s", brick_addr); + goto out; + } + ai_list_tmp1 = MALLOC(sizeof(addrinfo_list_t)); + if (ai_list_tmp1 == NULL) { + ret = 0; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY, + "failed to allocate " + "memory"); + freeaddrinfo(ai_info); + goto out; + } + ai_list_tmp1->info = ai_info; + cds_list_add_tail(&ai_list_tmp1->list, &ai_list->list); + ai_list_tmp1 = NULL; + } + + i = 0; + ai_list_tmp1 = cds_list_entry(ai_list->list.next, addrinfo_list_t, list); + + /* Check for bad brick order */ + while (i < brick_count) { + ++i; + ai_info = ai_list_tmp1->info; + ai_list_tmp1 = cds_list_entry(ai_list_tmp1->list.next, addrinfo_list_t, + list); + if (0 == i % sub_count) { + j = 0; + continue; + } + ai_list_tmp2 = ai_list_tmp1; + k = j; + while (k < sub_count - 1) { + ++k; + ret = glusterd_compare_addrinfo(ai_info, ai_list_tmp2->info); + if (GF_AI_COMPARE_ERROR == ret) + goto check_failed; + if (GF_AI_COMPARE_MATCH == ret) + goto found_bad_brick_order; + ai_list_tmp2 = cds_list_entry(ai_list_tmp2->list.next, + addrinfo_list_t, list); + } + ++j; + } + gf_msg_debug(this->name, 0, "Brick order okay"); + ret = 0; + goto out; + +check_failed: + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BAD_BRKORDER_CHECK_FAIL, + "Failed bad brick order check"); + snprintf(err_str, sizeof(failed_string), failed_string); + ret = -1; + goto out; + +found_bad_brick_order: + gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_BAD_BRKORDER, + "Bad brick order found"); + if (type == GF_CLUSTER_TYPE_DISPERSE) { + snprintf(err_str, sizeof(found_string), found_string, "disperse"); + } else { + snprintf(err_str, sizeof(found_string), found_string, "replicate"); + } + + ret = -1; +out: + ai_list_tmp2 = NULL; + GF_FREE(brick_list_ptr); + cds_list_for_each_entry(ai_list_tmp1, &ai_list->list, list) + { + if (ai_list_tmp1->info) + freeaddrinfo(ai_list_tmp1->info); + free(ai_list_tmp2); + ai_list_tmp2 = ai_list_tmp1; + } + free(ai_list_tmp2); + return ret; +} diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index ead16b2..e2e2454 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -881,4 +881,8 @@ glusterd_is_profile_on(glusterd_volinfo_t *volinfo); char * search_brick_path_from_proc(pid_t brick_pid, char *brickpath); + +int32_t +glusterd_check_brick_order(dict_t *dict, char *err_str, int32_t type); + #endif diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index 93042ab..8da2ff3 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -41,240 +41,6 @@ #define glusterd_op_start_volume_args_get(dict, volname, flags) \ glusterd_op_stop_volume_args_get(dict, volname, flags) -gf_ai_compare_t -glusterd_compare_addrinfo(struct addrinfo *first, struct addrinfo *next) -{ - int ret = -1; - struct addrinfo *tmp1 = NULL; - struct addrinfo *tmp2 = NULL; - char firstip[NI_MAXHOST] = {0.}; - char nextip[NI_MAXHOST] = { - 0, - }; - - for (tmp1 = first; tmp1 != NULL; tmp1 = tmp1->ai_next) { - ret = getnameinfo(tmp1->ai_addr, tmp1->ai_addrlen, firstip, NI_MAXHOST, - NULL, 0, NI_NUMERICHOST); - if (ret) - return GF_AI_COMPARE_ERROR; - for (tmp2 = next; tmp2 != NULL; tmp2 = tmp2->ai_next) { - ret = getnameinfo(tmp2->ai_addr, tmp2->ai_addrlen, nextip, - NI_MAXHOST, NULL, 0, NI_NUMERICHOST); - if (ret) - return GF_AI_COMPARE_ERROR; - if (!strcmp(firstip, nextip)) { - return GF_AI_COMPARE_MATCH; - } - } - } - return GF_AI_COMPARE_NO_MATCH; -} - -/* Check for non optimal brick order for replicate : - * Checks if bricks belonging to a replicate volume - * are present on the same server - */ -int32_t -glusterd_check_brick_order(dict_t *dict, char *err_str) -{ - int ret = -1; - int i = 0; - int j = 0; - int k = 0; - xlator_t *this = NULL; - addrinfo_list_t *ai_list = NULL; - addrinfo_list_t *ai_list_tmp1 = NULL; - addrinfo_list_t *ai_list_tmp2 = NULL; - char *brick = NULL; - char *brick_list = NULL; - char *brick_list_dup = NULL; - char *brick_list_ptr = NULL; - char *tmpptr = NULL; - char *volname = NULL; - int32_t brick_count = 0; - int32_t type = GF_CLUSTER_TYPE_NONE; - int32_t sub_count = 0; - struct addrinfo *ai_info = NULL; - char brick_addr[128] = { - 0, - }; - int addrlen = 0; - - const char failed_string[2048] = - "Failed to perform brick order " - "check. Use 'force' at the end of the command" - " if you want to override this behavior. "; - const char found_string[2048] = - "Multiple bricks of a %s " - "volume are present on the same server. This " - "setup is not optimal. Bricks should be on " - "different nodes to have best fault tolerant " - "configuration. Use 'force' at the end of the " - "command if you want to override this " - "behavior. "; - - this = THIS; - - GF_ASSERT(this); - - ai_list = MALLOC(sizeof(addrinfo_list_t)); - ai_list->info = NULL; - CDS_INIT_LIST_HEAD(&ai_list->list); - - ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, - "Unable to get volume name"); - goto out; - } - - ret = dict_get_int32n(dict, "type", SLEN("type"), &type); - if (ret) { - snprintf(err_str, 512, "Unable to get type of volume %s", volname); - gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED, "%s", - err_str); - goto out; - } - - ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &brick_list); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, - "Bricks check : Could not " - "retrieve bricks list"); - goto out; - } - - ret = dict_get_int32n(dict, "count", SLEN("count"), &brick_count); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, - "Bricks check : Could not " - "retrieve brick count"); - goto out; - } - - if (type != GF_CLUSTER_TYPE_DISPERSE) { - ret = dict_get_int32n(dict, "replica-count", SLEN("replica-count"), - &sub_count); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, - "Bricks check : Could" - " not retrieve replica count"); - goto out; - } - gf_msg_debug(this->name, 0, - "Replicate cluster type " - "found. Checking brick order."); - } else { - ret = dict_get_int32n(dict, "disperse-count", SLEN("disperse-count"), - &sub_count); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, - "Bricks check : Could" - " not retrieve disperse count"); - goto out; - } - gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DISPERSE_CLUSTER_FOUND, - "Disperse cluster type" - " found. Checking brick order."); - } - - brick_list_dup = brick_list_ptr = gf_strdup(brick_list); - /* Resolve hostnames and get addrinfo */ - while (i < brick_count) { - ++i; - brick = strtok_r(brick_list_dup, " \n", &tmpptr); - brick_list_dup = tmpptr; - if (brick == NULL) - goto check_failed; - tmpptr = strrchr(brick, ':'); - if (tmpptr == NULL) - goto check_failed; - addrlen = strlen(brick) - strlen(tmpptr); - strncpy(brick_addr, brick, addrlen); - brick_addr[addrlen] = '\0'; - ret = getaddrinfo(brick_addr, NULL, NULL, &ai_info); - if (ret != 0) { - ret = 0; - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HOSTNAME_RESOLVE_FAIL, - "unable to resolve host name for addr %s", brick_addr); - goto out; - } - ai_list_tmp1 = MALLOC(sizeof(addrinfo_list_t)); - if (ai_list_tmp1 == NULL) { - ret = 0; - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY, - "failed to allocate " - "memory"); - freeaddrinfo(ai_info); - goto out; - } - ai_list_tmp1->info = ai_info; - cds_list_add_tail(&ai_list_tmp1->list, &ai_list->list); - ai_list_tmp1 = NULL; - } - - i = 0; - ai_list_tmp1 = cds_list_entry(ai_list->list.next, addrinfo_list_t, list); - - /* Check for bad brick order */ - while (i < brick_count) { - ++i; - ai_info = ai_list_tmp1->info; - ai_list_tmp1 = cds_list_entry(ai_list_tmp1->list.next, addrinfo_list_t, - list); - if (0 == i % sub_count) { - j = 0; - continue; - } - ai_list_tmp2 = ai_list_tmp1; - k = j; - while (k < sub_count - 1) { - ++k; - ret = glusterd_compare_addrinfo(ai_info, ai_list_tmp2->info); - if (GF_AI_COMPARE_ERROR == ret) - goto check_failed; - if (GF_AI_COMPARE_MATCH == ret) - goto found_bad_brick_order; - ai_list_tmp2 = cds_list_entry(ai_list_tmp2->list.next, - addrinfo_list_t, list); - } - ++j; - } - gf_msg_debug(this->name, 0, "Brick order okay"); - ret = 0; - goto out; - -check_failed: - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BAD_BRKORDER_CHECK_FAIL, - "Failed bad brick order check"); - snprintf(err_str, sizeof(failed_string), failed_string); - ret = -1; - goto out; - -found_bad_brick_order: - gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_BAD_BRKORDER, - "Bad brick order found"); - if (type == GF_CLUSTER_TYPE_DISPERSE) { - snprintf(err_str, sizeof(found_string), found_string, "disperse"); - } else { - snprintf(err_str, sizeof(found_string), found_string, "replicate"); - } - - ret = -1; -out: - ai_list_tmp2 = NULL; - GF_FREE(brick_list_ptr); - cds_list_for_each_entry(ai_list_tmp1, &ai_list->list, list) - { - if (ai_list_tmp1->info) - freeaddrinfo(ai_list_tmp1->info); - free(ai_list_tmp2); - ai_list_tmp2 = ai_list_tmp1; - } - free(ai_list_tmp2); - return ret; -} - int __glusterd_handle_create_volume(rpcsvc_request_t *req) { @@ -1337,6 +1103,35 @@ glusterd_op_stage_create_volume(dict_t *dict, char **op_errstr, } } + /*Check brick order if the volume type is replicate or disperse. If + * force at the end of command not given then check brick order. + */ + if (is_origin_glusterd(dict)) { + ret = dict_get_int32n(dict, "type", SLEN("type"), &type); + if (ret) { + snprintf(msg, sizeof(msg), + "Unable to get type of " + "volume %s", + volname); + gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED, "%s", + msg); + goto out; + } + + if (!is_force) { + if ((type == GF_CLUSTER_TYPE_REPLICATE) || + (type == GF_CLUSTER_TYPE_DISPERSE)) { + ret = glusterd_check_brick_order(dict, msg, type); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BAD_BRKORDER, + "Not creating volume because of " + "bad brick order"); + goto out; + } + } + } + } + while (i < brick_count) { i++; brick = strtok_r(brick_list, " \n", &tmpptr); @@ -1423,36 +1218,6 @@ glusterd_op_stage_create_volume(dict_t *dict, char **op_errstr, brick_info = NULL; } - /*Check brick order if the volume type is replicate or disperse. If - * force at the end of command not given then check brick order. - */ - if (is_origin_glusterd(dict)) { - ret = dict_get_int32n(dict, "type", SLEN("type"), &type); - if (ret) { - snprintf(msg, sizeof(msg), - "Unable to get type of " - "volume %s", - volname); - gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED, "%s", - msg); - goto out; - } - - if (!is_force) { - if ((type == GF_CLUSTER_TYPE_REPLICATE) || - (type == GF_CLUSTER_TYPE_DISPERSE)) { - ret = glusterd_check_brick_order(dict, msg); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BAD_BRKORDER, - "Not " - "creating volume because of " - "bad brick order"); - goto out; - } - } - } - } - ret = dict_set_int32n(rsp_dict, "brick_count", SLEN("brick_count"), local_brick_count); if (ret) { -- 1.8.3.1