Blob Blame History Raw
From f96f711ddacfe8c758de3a994e339339325a1539 Mon Sep 17 00:00:00 2001
From: Rajesh Joseph <rjoseph@redhat.com>
Date: Sat, 3 Dec 2016 01:10:51 +0530
Subject: [PATCH 227/227] rpc: fix for race between rpc and protocol/client

It is possible that the notification thread which notifies
protocol/client layer about the disconnection is put to sleep
and meanwhile, a fuse thread or a timer thread initiates and
completes reconnection to the brick. The notification thread
is then woken up and protocol/client layer updates its flags
to indicate that network is disconnected. No reconnection is
initiated because reconnection is rpc-lib layer's responsibility
and its flags indicate that connection is connected.

Fix: Serialize connect and disconnect notify

Backport-of: http://review.gluster.org/15916

Credit: Raghavendra Talur <rtalur@redhat.com>
Change-Id: I8ff5d1a3283b47f5c26848a42016a40bc34ffc1d
BUG: 1385605
Signed-off-by: Rajesh Joseph <rjoseph@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/92095
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 rpc/rpc-lib/src/rpc-clnt.c | 98 +++++++++++++++++++++++++++-------------------
 rpc/rpc-lib/src/rpc-clnt.h |  1 +
 2 files changed, 59 insertions(+), 40 deletions(-)

diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c
index a9e43eb..fe099f9 100644
--- a/rpc/rpc-lib/src/rpc-clnt.c
+++ b/rpc/rpc-lib/src/rpc-clnt.c
@@ -872,6 +872,41 @@ rpc_clnt_destroy (struct rpc_clnt *rpc);
 
 #define RPC_THIS_RESTORE        (THIS = old_THIS)
 
+static int
+rpc_clnt_handle_disconnect (struct rpc_clnt *clnt, rpc_clnt_connection_t *conn)
+{
+        struct timespec         ts          = {0, };
+        gf_boolean_t            unref_clnt  = _gf_false;
+
+        rpc_clnt_connection_cleanup (conn);
+
+        pthread_mutex_lock (&conn->lock);
+        {
+                if (!conn->rpc_clnt->disabled && (conn->reconnect == NULL)) {
+                        ts.tv_sec = 10;
+                        ts.tv_nsec = 0;
+
+                        rpc_clnt_ref (clnt);
+                        conn->reconnect = gf_timer_call_after (clnt->ctx, ts,
+                                                rpc_clnt_reconnect, conn);
+                        if (conn->reconnect == NULL) {
+                                gf_log (conn->name, GF_LOG_WARNING,
+                                                "Cannot create rpc_clnt_reconnect timer");
+                                unref_clnt = _gf_true;
+                        }
+                }
+        }
+        pthread_mutex_unlock (&conn->lock);
+
+        if (clnt->notifyfn)
+                clnt->notifyfn (clnt, clnt->mydata, RPC_CLNT_DISCONNECT, NULL);
+
+        if (unref_clnt)
+                rpc_clnt_ref (clnt);
+
+        return 0;
+}
+
 int
 rpc_clnt_notify (rpc_transport_t *trans, void *mydata,
                  rpc_transport_event_t event, void *data, ...)
@@ -881,9 +916,7 @@ rpc_clnt_notify (rpc_transport_t *trans, void *mydata,
         int                     ret         = -1;
         rpc_request_info_t     *req_info    = NULL;
         rpc_transport_pollin_t *pollin      = NULL;
-        struct timespec         ts          = {0, };
         void                   *clnt_mydata = NULL;
-        gf_boolean_t            unref_clnt  = _gf_false;
         DECLARE_OLD_THIS;
 
         conn = mydata;
@@ -899,35 +932,11 @@ rpc_clnt_notify (rpc_transport_t *trans, void *mydata,
         switch (event) {
         case RPC_TRANSPORT_DISCONNECT:
         {
-                rpc_clnt_connection_cleanup (conn);
-
-                pthread_mutex_lock (&conn->lock);
+                pthread_mutex_lock (&clnt->notifylock);
                 {
-                        if (!conn->rpc_clnt->disabled
-                            && (conn->reconnect == NULL)) {
-                                ts.tv_sec = 10;
-                                ts.tv_nsec = 0;
-
-                                rpc_clnt_ref (clnt);
-                                conn->reconnect =
-                                        gf_timer_call_after (clnt->ctx, ts,
-                                                             rpc_clnt_reconnect,
-                                                             conn);
-                                if (conn->reconnect == NULL) {
-                                        gf_log (conn->name, GF_LOG_WARNING,
-                                                "Cannot create rpc_clnt_reconnect timer");
-                                        unref_clnt = _gf_true;
-                                }
-                        }
+                        rpc_clnt_handle_disconnect (clnt, conn);
                 }
-                pthread_mutex_unlock (&conn->lock);
-
-                if (clnt->notifyfn)
-                        ret = clnt->notifyfn (clnt, clnt->mydata,
-                                              RPC_CLNT_DISCONNECT, NULL);
-                if (unref_clnt)
-                        rpc_clnt_ref (clnt);
-
+                pthread_mutex_unlock (&clnt->notifylock);
                 break;
         }
 
@@ -982,17 +991,21 @@ rpc_clnt_notify (rpc_transport_t *trans, void *mydata,
 
         case RPC_TRANSPORT_CONNECT:
         {
-                /* Every time there is a disconnection, processes
-                   should try to connect to 'glusterd' (ie, default
-                   port) or whichever port given as 'option remote-port'
-                   in volume file. */
-                /* Below code makes sure the (re-)configured port lasts
-                   for just one successful attempt */
-                conn->config.remote_port = 0;
-
-                if (clnt->notifyfn)
-                        ret = clnt->notifyfn (clnt, clnt->mydata,
-                                              RPC_CLNT_CONNECT, NULL);
+                pthread_mutex_lock (&clnt->notifylock);
+                {
+                        /* Every time there is a disconnection, processes
+                         * should try to connect to 'glusterd' (ie, default
+                         * port) or whichever port given as 'option remote-port'
+                         * in volume file. */
+                        /* Below code makes sure the (re-)configured port lasts
+                         * for just one successful attempt */
+                        conn->config.remote_port = 0;
+
+                        if (clnt->notifyfn)
+                                ret = clnt->notifyfn (clnt, clnt->mydata,
+                                                RPC_CLNT_CONNECT, NULL);
+                }
+                pthread_mutex_unlock (&clnt->notifylock);
                 break;
         }
 
@@ -1116,6 +1129,7 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,
         }
 
         pthread_mutex_init (&rpc->lock, NULL);
+        pthread_mutex_init (&rpc->notifylock, NULL);
         rpc->ctx = ctx;
         rpc->owner = owner;
 
@@ -1125,6 +1139,7 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,
         rpc->reqpool = mem_pool_new (struct rpc_req, reqpool_size);
         if (rpc->reqpool == NULL) {
                 pthread_mutex_destroy (&rpc->lock);
+                pthread_mutex_destroy (&rpc->notifylock);
                 GF_FREE (rpc);
                 rpc = NULL;
                 goto out;
@@ -1134,6 +1149,7 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,
                                                reqpool_size);
         if (rpc->saved_frames_pool == NULL) {
                 pthread_mutex_destroy (&rpc->lock);
+                pthread_mutex_destroy (&rpc->notifylock);
                 mem_pool_destroy (rpc->reqpool);
                 GF_FREE (rpc);
                 rpc = NULL;
@@ -1143,6 +1159,7 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,
         ret = rpc_clnt_connection_init (rpc, ctx, options, name);
         if (ret == -1) {
                 pthread_mutex_destroy (&rpc->lock);
+                pthread_mutex_destroy (&rpc->notifylock);
                 mem_pool_destroy (rpc->reqpool);
                 mem_pool_destroy (rpc->saved_frames_pool);
                 GF_FREE (rpc);
@@ -1738,6 +1755,7 @@ rpc_clnt_destroy (struct rpc_clnt *rpc)
         saved_frames_destroy (rpc->conn.saved_frames);
         pthread_mutex_destroy (&rpc->lock);
         pthread_mutex_destroy (&rpc->conn.lock);
+        pthread_mutex_destroy (&rpc->notifylock);
 
         /* mem-pool should be destroyed, otherwise,
            it will cause huge memory leaks */
diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h
index f84b4cb..3a5b287 100644
--- a/rpc/rpc-lib/src/rpc-clnt.h
+++ b/rpc/rpc-lib/src/rpc-clnt.h
@@ -172,6 +172,7 @@ struct rpc_req {
 
 typedef struct rpc_clnt {
         pthread_mutex_t        lock;
+        pthread_mutex_t        notifylock;
         rpc_clnt_notify_t      notifyfn;
         rpc_clnt_connection_t  conn;
         void                  *mydata;
-- 
2.9.3