From df8fdd1d7fab692169a667fcc07c652fcc5e2ad7 Mon Sep 17 00:00:00 2001 From: Atin Mukherjee Date: Wed, 5 Oct 2016 14:59:51 +0530 Subject: [PATCH 317/361] glusterd: daemon restart logic should adhere server side quorum Just like brick processes, other daemon services should also follow the same logic of quorum checks to see if a particular service needs to come up if glusterd is restarted or an incoming friend add/update request is received (in the glusterd_restart_bricks () function). mainline: > BUG: 1383893 > Reviewed-on: https://review.gluster.org/15626 > Smoke: Gluster Build System > NetBSD-regression: NetBSD Build System > CentOS-regression: Gluster Build System > Reviewed-by: Prashanth Pai (cherry picked from commit 5a6f509263a810ca21a22bbbd1e6ffcf43b70d18) BUG: 1381825 Change-Id: I54a1fbdaa1571cc45eed627181b81463fead47a3 Signed-off-by: Atin Mukherjee Reviewed-on: https://code.engineering.redhat.com/gerrit/101298 Tested-by: Milind Changire --- .../bug-1383893-daemons-to-follow-quorum.t | 57 +++++ xlators/mgmt/glusterd/src/glusterd-utils.c | 235 +-------------------- 2 files changed, 64 insertions(+), 228 deletions(-) create mode 100644 tests/bugs/glusterd/bug-1383893-daemons-to-follow-quorum.t diff --git a/tests/bugs/glusterd/bug-1383893-daemons-to-follow-quorum.t b/tests/bugs/glusterd/bug-1383893-daemons-to-follow-quorum.t new file mode 100644 index 0000000..105292a --- /dev/null +++ b/tests/bugs/glusterd/bug-1383893-daemons-to-follow-quorum.t @@ -0,0 +1,57 @@ +#!/bin/bash + +# This test checks that shd or any other daemon that was brought down (apart +# from brick processes) is not brought up automatically when glusterd on the +# other node is (re)started + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. 
$(dirname $0)/../../cluster.rc + +function shd_up_status_1 { + $CLI_1 volume status | grep "localhost" | grep "Self-heal Daemon" | awk '{print $7}' +} + +function shd_up_status_2 { + $CLI_2 volume status | grep "localhost" | grep "Self-heal Daemon" | awk '{print $7}' +} + +function get_shd_pid_2 { + $CLI_2 volume status | grep "localhost" | grep "Self-heal Daemon" | awk '{print $8}' +} +cleanup; + +TEST launch_cluster 3 + +TEST $CLI_1 peer probe $H2; +EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count + +TEST $CLI_1 peer probe $H3; +EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count + +# Lets create the volume +TEST $CLI_1 volume create $V0 replica 2 $H1:$B1/${V0}1 $H2:$B2/${V0}2 + +# Start the volume +TEST $CLI_1 volume start $V0 + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status_1 $V0 $H1 $B1/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status_1 $V0 $H2 $B2/${V0}2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" shd_up_status_1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" shd_up_status_2 + +# Bring down shd on 2nd node +kill -15 $(get_shd_pid_2) + +# Bring down glusterd on 1st node +TEST kill_glusterd 1 + +#Bring back 1st glusterd +TEST $glusterd_1 + +# We need to wait till PROCESS_UP_TIMEOUT and then check shd service does not +# come up on node 2 +sleep $PROCESS_UP_TIMEOUT +EXPECT "N" shd_up_status_2 + +cleanup; diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 9e9d609..91cc12e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -4902,10 +4902,6 @@ glusterd_restart_bricks (glusterd_conf_t *conf) cds_list_for_each_entry (volinfo, &conf->volumes, vol_list) { if (volinfo->status != GLUSTERD_STATUS_STARTED) continue; - if (start_svcs == _gf_false) { - start_svcs = _gf_true; - glusterd_svcs_manager (NULL); - } gf_msg_debug (this->name, 0, "starting the volume %s", volinfo->volname); @@ -4928,6 +4924,11 @@ glusterd_restart_bricks (glusterd_conf_t *conf) */ continue; } else 
{ + if (start_svcs == _gf_false) { + start_svcs = _gf_true; + glusterd_svcs_manager (NULL); + } + cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { glusterd_brick_start (volinfo, brickinfo, @@ -4940,8 +4941,8 @@ glusterd_restart_bricks (glusterd_conf_t *conf) cds_list_for_each_entry (volinfo, &snap->volumes, vol_list) { if (volinfo->status != GLUSTERD_STATUS_STARTED) continue; - /* Check the quorum, if quorum is not met, don't start the - bricks + /* Check the quorum, if quorum is not met, don't start + * the bricks */ ret = check_quorum_for_brick_start (volinfo, node_quorum); @@ -9403,228 +9404,6 @@ out: } int -<<<<<<< 07a9e00a5702e76932142e9d9cdc2df601632b7a -======= -glusterd_volume_tier_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict) -{ - char key[256] = {0,}; - char *node_uuid = NULL; - char *node_uuid_str = NULL; - char *volname = NULL; - dict_t *ctx_dict = NULL; - double elapsed_time = 0; - glusterd_volinfo_t *volinfo = NULL; - int ret = 0; - int32_t index = 0; - int32_t count = 0; - int32_t value32 = 0; - uint64_t value = 0; - xlator_t *this = NULL; - char *task_id_str = NULL; - - this = THIS; - GF_VALIDATE_OR_GOTO (this->name, this, out); - GF_VALIDATE_OR_GOTO (this->name, rsp_dict, out); - - if (aggr) { - ctx_dict = aggr; - - } else { - gf_msg (this->name, GF_LOG_ERROR, 0, - GD_MSG_OPCTX_GET_FAIL, - "Operation Context is not present"); - goto out; - } - - if (!ctx_dict) - goto out; - - ret = dict_get_str (ctx_dict, "volname", &volname); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, - GD_MSG_DICT_GET_FAILED, - "Unable to get volume name"); - goto out; - } - - ret = glusterd_volinfo_find (volname, &volinfo); - - if (ret) - goto out; - - ret = dict_get_int32 (rsp_dict, "count", &index); - if (ret) - gf_msg (this->name, GF_LOG_ERROR, 0, - GD_MSG_DICT_GET_FAILED, - "failed to get index"); - - memset (key, 0, 256); - snprintf (key, 256, "node-uuid-%d", index); - ret = dict_get_str (rsp_dict, key, &node_uuid); - if (!ret) { - node_uuid_str 
= gf_strdup (node_uuid); - - } - ret = dict_get_int32 (ctx_dict, "count", &count); - count++; - ret = dict_set_int32 (ctx_dict, "count", count); - if (ret) - gf_msg (this->name, GF_LOG_ERROR, 0, - GD_MSG_DICT_SET_FAILED, - "Failed to set count"); - - memset (key, 0, 256); - snprintf (key, 256, "node-uuid-%d", count); - ret = dict_set_dynstr (ctx_dict, key, node_uuid_str); - if (ret) { - gf_msg_debug (this->name, 0, - "failed to set node-uuid"); - } - - snprintf (key, 256, "files-%d", index); - ret = dict_get_uint64 (rsp_dict, key, &value); - if (!ret) { - memset (key, 0, 256); - snprintf (key, 256, "files-%d", count); - ret = dict_set_uint64 (ctx_dict, key, value); - if (ret) { - gf_msg_debug (this->name, 0, - "failed to set the file count"); - } - } - - memset (key, 0, 256); - snprintf (key, 256, "size-%d", index); - ret = dict_get_uint64 (rsp_dict, key, &value); - if (!ret) { - memset (key, 0, 256); - snprintf (key, 256, "size-%d", count); - ret = dict_set_uint64 (ctx_dict, key, value); - if (ret) { - gf_msg_debug (this->name, 0, - "failed to set the size of migration"); - } - } - - memset (key, 0, 256); - snprintf (key, 256, "lookups-%d", index); - ret = dict_get_uint64 (rsp_dict, key, &value); - if (!ret) { - memset (key, 0, 256); - snprintf (key, 256, "lookups-%d", count); - ret = dict_set_uint64 (ctx_dict, key, value); - if (ret) { - gf_msg_debug (this->name, 0, - "failed to set looked up file count"); - } - } - - memset (key, 0, 256); - snprintf (key, 256, "status-%d", index); - ret = dict_get_int32 (rsp_dict, key, &value32); - if (!ret) { - memset (key, 0, 256); - snprintf (key, 256, "status-%d", count); - ret = dict_set_int32 (ctx_dict, key, value32); - if (ret) { - gf_msg_debug (this->name, 0, - "failed to set status"); - } - } - - memset (key, 0, 256); - snprintf (key, 256, "failures-%d", index); - ret = dict_get_uint64 (rsp_dict, key, &value); - if (!ret) { - memset (key, 0, 256); - snprintf (key, 256, "failures-%d", count); - ret = dict_set_uint64 
(ctx_dict, key, value); - if (ret) { - gf_msg_debug (this->name, 0, - "failed to set failure count"); - } - } - - memset (key, 0, 256); - snprintf (key, 256, "skipped-%d", index); - ret = dict_get_uint64 (rsp_dict, key, &value); - if (!ret) { - memset (key, 0, 256); - snprintf (key, 256, "skipped-%d", count); - ret = dict_set_uint64 (ctx_dict, key, value); - if (ret) { - gf_msg_debug (this->name, 0, - "failed to set skipped count"); - } - } - memset (key, 0, 256); - snprintf (key, 256, "run-time-%d", index); - ret = dict_get_double (rsp_dict, key, &elapsed_time); - if (!ret) { - memset (key, 0, 256); - snprintf (key, 256, "run-time-%d", count); - ret = dict_set_double (ctx_dict, key, elapsed_time); - if (ret) { - gf_msg_debug (this->name, 0, - "failed to set run-time"); - } - } - - memset (key, 0, 256); - snprintf (key, 256, "demoted-%d", index); - ret = dict_get_uint64 (rsp_dict, key, &value); - if (!ret) { - memset (key, 0, 256); - snprintf (key, 256, "demoted-%d", count); - ret = dict_set_uint64 (ctx_dict, key, value); - if (ret) { - gf_msg_debug (this->name, 0, - "failed to set demoted count"); - } - } - memset (key, 0, 256); - snprintf (key, 256, "promoted-%d", index); - ret = dict_get_uint64 (rsp_dict, key, &value); - if (!ret) { - memset (key, 0, 256); - snprintf (key, 256, "promoted-%d", count); - ret = dict_set_uint64 (ctx_dict, key, value); - if (ret) { - gf_msg_debug (this->name, 0, - "failed to set promoted count"); - } - } - - memset (key, 0, 256); - snprintf (key, 256, "time-left-%d", index); - ret = dict_get_uint64 (rsp_dict, key, &value); - if (!ret) { - memset (key, 0, 256); - snprintf (key, 256, "time-left-%d", count); - ret = dict_set_uint64 (ctx_dict, key, value); - if (ret) { - gf_msg_debug (THIS->name, 0, - "failed to set time-left"); - } - } - - ret = dict_get_str (rsp_dict, GF_REMOVE_BRICK_TID_KEY, - &task_id_str); - if (ret) { - gf_msg_debug (this->name, errno, - "Missing remove-brick-id"); - } else - ret = dict_set_str (ctx_dict, 
GF_REMOVE_BRICK_TID_KEY, - task_id_str); - - ret = 0; - -out: - return ret; -} - -int ->>>>>>> dht/rebalance Estimate time to complete rebalance glusterd_sys_exec_output_rsp_dict (dict_t *dst, dict_t *src) { char output_name[PATH_MAX] = ""; -- 1.8.3.1