Blob Blame History Raw
From d5c01bd7537c05cd6606751ac36ff4ae73b2de81 Mon Sep 17 00:00:00 2001
From: Jeff Darcy <jdarcy@redhat.com>
Date: Tue, 7 Mar 2017 18:36:58 -0500
Subject: [PATCH 351/361] glusterd: don't queue attach reqs before connecting

Queueing attach requests before the brick's rpc connection was
established was causing USS tests to fail.  The underlying problem is
that if we try to queue the attach request too soon after starting a
brick process then the socket code will get an error trying to write
to the still-unconnected socket.  Its response is to shut down the
socket, which causes the queued attach requests to be force-unwound.
There's nothing to retry them, so they effectively never happen and
those bricks (the second and subsequent ones for a snapshot) never
become available.

We *do* have a retry loop for attach requests, but it currently breaks
out as soon as a request is queued - not actually sent.  The fix is to
modify that loop so it will wait some more if the rpc connection isn't
even complete yet.  Now we break out only when we have a completed
connection *and* a queued request.  The loop's retry bound is also
raised from 10 to 15 to leave room for the extra waiting.

mainline:
> BUG: 1430148
> Reviewed-on: https://review.gluster.org/16868
> Smoke: Gluster Build System <jenkins@build.gluster.org>
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
> Reviewed-by: Prashanth Pai <ppai@redhat.com>
(cherry picked from commit a410273c20ac7f616f4652d07124f76ee8f3f59f)

BUG: 1417815
Change-Id: Ib6be13646f1fa9072b4a944ab5f13e1b29084841
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/101327
Tested-by: Milind Changire <mchangir@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 xlators/mgmt/glusterd/src/glusterd-utils.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 42887e9..eee2224 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -4900,22 +4900,29 @@ my_callback (struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
 int
 send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op)
 {
-        int            ret      = -1;
-        struct iobuf  *iobuf    = NULL;
-        struct iobref *iobref   = NULL;
-        struct iovec   iov      = {0, };
-        ssize_t        req_size = 0;
-        call_frame_t  *frame    = NULL;
-        gd1_mgmt_brick_op_req   brick_req;
-        void                    *req = &brick_req;
-        void          *errlbl   = &&err;
-        extern struct rpc_clnt_program gd_brick_prog;
+        int                             ret      = -1;
+        struct iobuf                    *iobuf    = NULL;
+        struct iobref                   *iobref   = NULL;
+        struct iovec                    iov      = {0, };
+        ssize_t                         req_size = 0;
+        call_frame_t                    *frame    = NULL;
+        gd1_mgmt_brick_op_req           brick_req;
+        void                            *req = &brick_req;
+        void                            *errlbl   = &&err;
+        struct rpc_clnt_connection      *conn;
+        extern struct rpc_clnt_program  gd_brick_prog;
 
         if (!rpc) {
                 gf_log (this->name, GF_LOG_ERROR, "called with null rpc");
                 return -1;
         }
 
+        conn = &rpc->conn;
+        if (!conn->connected || conn->disconnected) {
+                gf_log (this->name, GF_LOG_INFO, "not connected yet");
+                return -1;
+        }
+
         brick_req.op = op;
         brick_req.name = path;
         brick_req.input.input_val = NULL;
@@ -5033,7 +5040,7 @@ attach_brick (xlator_t *this,
         (void) build_volfile_path (full_id, path, sizeof(path), NULL);
 
         int tries = 0;
-        while (tries++ <= 10) {
+        while (tries++ <= 15) {
                 ret = send_attach_req (this, other_brick->rpc, path,
                                        GLUSTERD_BRICK_ATTACH);
                 if (!ret) {
-- 
1.8.3.1