Blob Blame History Raw
From cded8a819bee945041f86ba42fc531dcb075d521 Mon Sep 17 00:00:00 2001
From: Kotresh HR <khiremat@redhat.com>
Date: Tue, 4 Apr 2017 15:39:46 -0400
Subject: [PATCH 456/473] geo-rep: Improve worker log messages

Monitor process expects worker to establish SSH Tunnel to slave node
and mount master volume locally with in 60 secs and acknowledge monitor
process by closing feedback fd. If something goes wrong and worker
does not close feedback fd with in 60 secs, monitor kills the worker.
But there was no clue in log message about the actual issue. This patch
adds log and indicates whether the worker is hung during SSH
or master mount.

> Change-Id: Id08a12fa6f3bba1d4fe8036728dbc290e6c14c8c
> BUG: 1261689
> Signed-off-by: Kotresh HR <khiremat@redhat.com>
> Reviewed-on: https://review.gluster.org/16997
> Smoke: Gluster Build System <jenkins@build.gluster.org>
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
> Reviewed-by: Aravinda VK <avishwan@redhat.com>

Change-Id: Id08a12fa6f3bba1d4fe8036728dbc290e6c14c8c
BUG: 1435587
Signed-off-by: Kotresh HR <khiremat@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/106655
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 geo-replication/syncdaemon/gsyncd.py   | 1 +
 geo-replication/syncdaemon/monitor.py  | 9 +++++++--
 geo-replication/syncdaemon/resource.py | 8 ++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py
index 3092728..f9471e4 100644
--- a/geo-replication/syncdaemon/gsyncd.py
+++ b/geo-replication/syncdaemon/gsyncd.py
@@ -775,6 +775,7 @@ def main_i():
             remote.connect_remote(go_daemon='done')
     local.connect()
     if ffd:
+        logging.info ("Closing feedback fd, waking up the monitor")
         os.close(ffd)
     local.service_loop(*[r for r in [remote] if r])
 
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index d9a9c33..dc0211e 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -369,8 +369,13 @@ class Monitor(object):
 
                         time.sleep(1)
             else:
-                logging.info("worker(%s) not confirmed in %d sec, "
-                             "aborting it" % (w[0]['dir'], conn_timeout))
+                logging.info("worker(%s) not confirmed in %d sec, aborting it. "
+                             "Gsyncd invocation on remote slave via SSH or "
+                             "gluster master mount might have hung. Please "
+                             "check the above logs for exact issue and check "
+                             "master or slave volume for errors. Restarting "
+                             "master/slave volume accordingly might help."
+                             % (w[0]['dir'], conn_timeout))
                 errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                 nwait(apid)  # wait for agent
                 ret = nwait(cpid)
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
index e4b4e55..55bebe1 100644
--- a/geo-replication/syncdaemon/resource.py
+++ b/geo-replication/syncdaemon/resource.py
@@ -1413,6 +1413,8 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
         with given backend
         """
 
+        logging.info ("Mounting gluster volume locally...")
+        t0 = time.time()
         label = getattr(gconf, 'mountbroker', None)
         if not label and not privileged():
             label = syncdutils.getusername()
@@ -1423,6 +1425,8 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
             ['log-file=' + gconf.gluster_log_file, 'volfile-server=' +
              self.host, 'volfile-id=' + self.volume, 'client-pid=-1']
         mounter(params).inhibit(*[l for l in [label] if l])
+        logging.info ("Mounted gluster volume. Time taken: {0:.4f} "
+                      "secs".format((time.time() - t0)))
 
     def connect_remote(self, *a, **kw):
         sup(self, *a, **kw)
@@ -1669,10 +1673,14 @@ class SSH(AbstractUrl, SlaveRemote):
                                  self.inner_rsc.url)
 
         deferred = go_daemon == 'postconn'
+        logging.info ("Initializing SSH connection between master and slave...")
+        t0 = time.time()
         ret = sup(self, gconf.ssh_command.split() +
                   ["-p", str(gconf.ssh_port)] +
                   gconf.ssh_ctl_args + [self.remote_addr],
                   slave=self.inner_rsc.url, deferred=deferred)
+        logging.info ("SSH connection between master and slave established. "
+                      "Time taken: {0:.4f} secs".format((time.time() - t0)))
 
         if deferred:
             # send a message to peer so that we can wait for
-- 
1.8.3.1