74096c
From 6c3b21ce5bb76b35856a6c270eb65d11f869061f Mon Sep 17 00:00:00 2001
74096c
From: Sanju Rakonde <srakonde@redhat.com>
74096c
Date: Fri, 26 Jun 2020 12:10:31 +0530
74096c
Subject: [PATCH 484/511] glusterd: rebalance status displays stats as 0 after
74096c
 reboot
74096c
74096c
problem: while the rebalance is in progress, if a node is
74096c
rebooted rebalance v status shows the stats of this node as
74096c
0 once the node is back.
74096c
74096c
Reason: when the node is rebooted, once it is back
74096c
glusterd_volume_defrag_restart() starts the rebalance and
74096c
creates the rpc. But due to some race, the rebalance process is
74096c
sending disconnect event, so rpc object is getting destroyed. As
74096c
the rpc object is null, request for fetching the latest stats is
74096c
not sent to rebalance process. And stats are shown as default values
74096c
which is 0.
74096c
74096c
Solution: When the rpc object is null, we should create the rpc if the
74096c
rebalance process is up. so that request can be sent to rebalance
74096c
process using the rpc.
74096c
74096c
>fixes: #1339
74096c
>Change-Id: I1c7533fedd17dcaffc0f7a5a918c87356133a81c
74096c
>Signed-off-by: Sanju Rakonde <srakonde@redhat.com>
74096c
Upstream Patch : https://review.gluster.org/c/glusterfs/+/24641
74096c
74096c
BUG: 1832306
74096c
Change-Id: I1c7533fedd17dcaffc0f7a5a918c87356133a81c
74096c
Signed-off-by: Srijan Sivakumar <ssivakum@redhat.com>
74096c
Reviewed-on: https://code.engineering.redhat.com/gerrit/220369
74096c
Tested-by: RHGS Build Bot <nigelb@redhat.com>
74096c
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
74096c
---
74096c
 xlators/mgmt/glusterd/src/glusterd-syncop.c | 29 ++++++++++++++++++++---------
74096c
 1 file changed, 20 insertions(+), 9 deletions(-)
74096c
74096c
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c
74096c
index c78983a..df78fef 100644
74096c
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.c
74096c
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c
74096c
@@ -1693,6 +1693,7 @@ gd_brick_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
74096c
     rpc_clnt_t *rpc = NULL;
74096c
     dict_t *rsp_dict = NULL;
74096c
     int32_t cmd = GF_OP_CMD_NONE;
74096c
+    glusterd_volinfo_t *volinfo = NULL;
74096c
 
74096c
     this = THIS;
74096c
     rsp_dict = dict_new();
74096c
@@ -1724,18 +1725,28 @@ gd_brick_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
74096c
     cds_list_for_each_entry_safe(pending_node, tmp, &selected, list)
74096c
     {
74096c
         rpc = glusterd_pending_node_get_rpc(pending_node);
74096c
+        /* In the case of rebalance if the rpc object is null, we try to
74096c
+         * create the rpc object. if the rebalance daemon is down, it returns
74096c
+         * -1. otherwise, rpc object will be created and referenced.
74096c
+         */
74096c
         if (!rpc) {
74096c
-            if (pending_node->type == GD_NODE_REBALANCE) {
74096c
-                ret = 0;
74096c
-                glusterd_defrag_volume_node_rsp(req_dict, NULL, op_ctx);
74096c
+            if (pending_node->type == GD_NODE_REBALANCE && pending_node->node) {
74096c
+                volinfo = pending_node->node;
74096c
+                ret = glusterd_rebalance_rpc_create(volinfo);
74096c
+                if (ret) {
74096c
+                    ret = 0;
74096c
+                    glusterd_defrag_volume_node_rsp(req_dict, NULL, op_ctx);
74096c
+                    goto out;
74096c
+                } else {
74096c
+                    rpc = glusterd_defrag_rpc_get(volinfo->rebal.defrag);
74096c
+                }
74096c
+            } else {
74096c
+                ret = -1;
74096c
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_FAILURE,
74096c
+                       "Brick Op failed "
74096c
+                       "due to rpc failure.");
74096c
                 goto out;
74096c
             }
74096c
-
74096c
-            ret = -1;
74096c
-            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_FAILURE,
74096c
-                   "Brick Op failed "
74096c
-                   "due to rpc failure.");
74096c
-            goto out;
74096c
         }
74096c
 
74096c
         /* Redirect operation to be detach tier via rebalance flow. */
74096c
-- 
74096c
1.8.3.1
74096c