From 12b3a9ec4c661ae6bc8fc546eb29114191c9c404 Mon Sep 17 00:00:00 2001
From: anand
Date: Tue, 21 Jul 2015 15:42:24 +0530
Subject: [PATCH 255/279] glusterd: getting txn_id from frame->cookie in op_sm call back

RCA: If a rebalance start is triggered from one node and one of the other
nodes in the cluster goes down at the same time, we can end up in a case
where the callback uses the txn_id from priv->global_txn_id, which is
always zero. Injecting an event with this incorrect txn_id leaves the
op-sm stuck.

Fix: Set the txn_id in frame->cookie before calling glusterd_submit_request,
so that the callback functions can retrieve the correct txn_id from the
frame.

>Change-Id: I519176c259ea9d37897791a77a7c92eb96d10052
>BUG: 1245142
>Signed-off-by: anand
>Reviewed-on: http://review.gluster.org/11728
>Reviewed-by: Atin Mukherjee
>Tested-by: NetBSD Build System
>Tested-by: Gluster Build System
>Signed-off-by: Anand

Change-Id: I4f5df48890953651fc80bc3c3e66711b230d8fd1
BUG: 1244527
Signed-off-by: Anand
Reviewed-on: https://code.engineering.redhat.com/gerrit/55080
Reviewed-by: Atin Mukherjee
Tested-by: Atin Mukherjee
---
 tests/bugs/glusterd/bug-1245142-rebalance_test.t | 28 ++++++++
 xlators/mgmt/glusterd/src/glusterd-rpc-ops.c     | 77 +++++++++++++++++-----
 2 files changed, 89 insertions(+), 16 deletions(-)
 create mode 100644 tests/bugs/glusterd/bug-1245142-rebalance_test.t

diff --git a/tests/bugs/glusterd/bug-1245142-rebalance_test.t b/tests/bugs/glusterd/bug-1245142-rebalance_test.t
new file mode 100644
index 0000000..a28810e
--- /dev/null
+++ b/tests/bugs/glusterd/bug-1245142-rebalance_test.t
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../cluster.rc
+. $(dirname $0)/../../volume.rc
+
+
+cleanup;
+TEST launch_cluster 2;
+TEST $CLI_1 peer probe $H2;
+
+EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count
+
+$CLI_1 volume create $V0 $H1:$B1/$V0 $H2:$B2/$V0
+EXPECT 'Created' cluster_volinfo_field 1 $V0 'Status';
+
+$CLI_1 volume start $V0
+EXPECT 'Started' cluster_volinfo_field 1 $V0 'Status';
+
+$CLI_1 volume rebalance $V0 start &
+#Kill glusterd2 after the request is sent, so that the callback is invoked
+#with a failed rpc->status; a ~1 sec delay is introduced to hit this scenario.
+sleep 1 +kill_glusterd 2 +#check glusterd commands are working after rebalance start command +EXPECT 'Started' cluster_volinfo_field 1 $V0 'Status'; + +cleanup; diff --git a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c index 0890b02..bcdffdc 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c @@ -1131,14 +1131,17 @@ __glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov, xlator_t *this = NULL; glusterd_conf_t *priv = NULL; uuid_t *txn_id = NULL; + call_frame_t *frame = NULL; this = THIS; GF_ASSERT (this); GF_ASSERT (req); priv = this->private; GF_ASSERT (priv); + GF_ASSERT(myframe); - txn_id = &priv->global_txn_id; + frame = myframe; + txn_id = frame->cookie; if (-1 == req->rpc_status) { rsp.op_ret = -1; @@ -1196,10 +1199,6 @@ out: uuid_utoa (rsp.uuid)); } - ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id); - gf_msg_debug (this->name, 0, "transaction ID = %s", - uuid_utoa (*txn_id)); - rcu_read_lock (); peerinfo = glusterd_peerinfo_find (rsp.uuid, NULL); if (peerinfo == NULL) { @@ -1246,6 +1245,7 @@ out: } else { free (rsp.dict.dict_val); //malloced by xdr } + GF_FREE (frame->cookie); GLUSTERD_STACK_DESTROY (((call_frame_t *)myframe)); return ret; } @@ -1274,14 +1274,17 @@ __glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov, glusterd_conf_t *priv = NULL; uuid_t *txn_id = NULL; glusterd_op_info_t txn_op_info = {{0},}; + call_frame_t *frame = NULL; this = THIS; GF_ASSERT (this); GF_ASSERT (req); priv = this->private; GF_ASSERT (priv); + GF_ASSERT(myframe); - txn_id = &priv->global_txn_id; + frame = myframe; + txn_id = frame->cookie; if (-1 == req->rpc_status) { rsp.op_ret = -1; @@ -1339,9 +1342,6 @@ __glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov, "Received commit ACC from uuid: %s", uuid_utoa (rsp.uuid)); } - ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id); - gf_msg_debug (this->name, 0, "transaction ID = %s", - uuid_utoa (*txn_id)); ret = glusterd_get_txn_opinfo (txn_id, &txn_op_info); if (ret) { @@ -1414,6 +1414,7 @@ out: if (dict) dict_unref (dict); free (rsp.op_errstr); //malloced by xdr + GF_FREE (frame->cookie); GLUSTERD_STACK_DESTROY (((call_frame_t *)myframe)); return ret; } @@ -1898,9 +1899,9 @@ glusterd_stage_op (call_frame_t *frame, xlator_t *this, int ret = -1; glusterd_peerinfo_t *peerinfo = NULL; glusterd_conf_t *priv = NULL; - call_frame_t *dummy_frame = NULL; dict_t *dict = NULL; gf_boolean_t is_alloc = _gf_true; + uuid_t *txn_id = NULL; if (!this) { goto out; @@ -1930,13 +1931,34 @@ glusterd_stage_op (call_frame_t *frame, xlator_t *this, "to request buffer"); goto out; } + /* Sending valid transaction ID to peers */ + ret = dict_get_bin (dict, "transaction_id", + (void **)&txn_id); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + GD_MSG_TRANS_ID_GET_FAIL, + "Failed to get transaction id."); + goto out; + } else { + gf_msg_debug (this->name, 0, + "Transaction_id = %s", uuid_utoa (*txn_id)); + } + if (!frame) + frame = create_frame (this, this->ctx->pool); - dummy_frame = create_frame (this, this->ctx->pool); - if (!dummy_frame) + if (!frame) { + ret = -1; goto out; + } + frame->cookie = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); + if (!frame->cookie) { + ret = -1; + goto out; + } + gf_uuid_copy (frame->cookie, *txn_id); - ret = glusterd_submit_request (peerinfo->rpc, &req, dummy_frame, + ret = glusterd_submit_request (peerinfo->rpc, &req, frame, peerinfo->mgmt, GLUSTERD_MGMT_STAGE_OP, NULL, 
this, glusterd_stage_op_cbk, @@ -1961,6 +1983,7 @@ glusterd_commit_op (call_frame_t *frame, xlator_t *this, call_frame_t *dummy_frame = NULL; dict_t *dict = NULL; gf_boolean_t is_alloc = _gf_true; + uuid_t *txn_id = NULL; if (!this) { goto out; @@ -1989,12 +2012,34 @@ glusterd_commit_op (call_frame_t *frame, xlator_t *this, "request buffer"); goto out; } + /* Sending valid transaction ID to peers */ + ret = dict_get_bin (dict, "transaction_id", + (void **)&txn_id); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + GD_MSG_TRANS_ID_GET_FAIL, + "Failed to get transaction id."); + goto out; + } else { + gf_msg_debug (this->name, 0, + "Transaction_id = %s", uuid_utoa (*txn_id)); + } - dummy_frame = create_frame (this, this->ctx->pool); - if (!dummy_frame) + if (!frame) + frame = create_frame (this, this->ctx->pool); + + if (!frame) { + ret = -1; goto out; + } + frame->cookie = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); + if (!frame->cookie) { + ret = -1; + goto out; + } + gf_uuid_copy (frame->cookie, *txn_id); - ret = glusterd_submit_request (peerinfo->rpc, &req, dummy_frame, + ret = glusterd_submit_request (peerinfo->rpc, &req, frame, peerinfo->mgmt, GLUSTERD_MGMT_COMMIT_OP, NULL, this, glusterd_commit_op_cbk, -- 1.7.1
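
For reference, below is a minimal standalone sketch of the ownership pattern
the patch applies. It is not GlusterFS code: call_frame_t here is a stripped
stand-in for the real frame type, and txn_uuid_t, submit_op, and op_cbk are
hypothetical names. It illustrates the fix's core idea: the submit path copies
the transaction ID into a heap-allocated frame->cookie, and the callback reads
the ID from the frame (and frees it there) instead of falling back to a shared
global ID that is always zero.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned char txn_uuid_t[16];        /* stand-in for uuid_t */

typedef struct call_frame {
        void *cookie;                        /* opaque per-call context */
} call_frame_t;

/* Callback: recover the txn_id from the frame cookie rather than from a
 * global, so even a failed RPC (rpc_status == -1) carries the right ID. */
static int
op_cbk (call_frame_t *frame, int rpc_status)
{
        txn_uuid_t *txn_id = frame->cookie;

        printf ("rpc %s; txn_id[0] = %u\n",
                rpc_status == -1 ? "failed" : "succeeded",
                (unsigned) (*txn_id)[0]);

        free (frame->cookie);                /* the callback owns the cookie */
        frame->cookie = NULL;
        return 0;
}

/* Submit path: copy the per-transaction ID into a cookie that travels
 * with the frame to the callback. */
static int
submit_op (call_frame_t *frame, const txn_uuid_t *txn_id, int peer_down)
{
        frame->cookie = malloc (sizeof (txn_uuid_t));
        if (!frame->cookie)
                return -1;
        memcpy (frame->cookie, txn_id, sizeof (txn_uuid_t));

        /* Stand-in for the RPC layer invoking the callback, possibly with
         * a failed status because the peer went down mid-transaction. */
        return op_cbk (frame, peer_down ? -1 : 0);
}

int
main (void)
{
        call_frame_t frame  = { 0 };
        txn_uuid_t   txn_id = { 42 };        /* pretend non-zero txn ID */

        return submit_op (&frame, &txn_id, 1 /* peer goes down */);
}

The split of responsibilities mirrors the patch itself: glusterd_stage_op and
glusterd_commit_op GF_CALLOC the cookie and gf_uuid_copy the transaction ID
into it, while __glusterd_stage_op_cbk and __glusterd_commit_op_cbk read it
and GF_FREE it before destroying the frame.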