21ab4e
From cded8a819bee945041f86ba42fc531dcb075d521 Mon Sep 17 00:00:00 2001
21ab4e
From: Kotresh HR <khiremat@redhat.com>
21ab4e
Date: Tue, 4 Apr 2017 15:39:46 -0400
21ab4e
Subject: [PATCH 456/473] geo-rep: Improve worker log messages
21ab4e
21ab4e
Monitor process expects worker to establish SSH Tunnel to slave node
21ab4e
and mount master volume locally with in 60 secs and acknowledge monitor
21ab4e
process by closing feedback fd. If something goes wrong and worker
21ab4e
does not close feedback fd with in 60 secs, monitor kills the worker.
21ab4e
But there was no clue in log message about the actual issue. This patch
21ab4e
adds log and indicates whether the worker is hung during SSH
21ab4e
or master mount.
21ab4e
21ab4e
> Change-Id: Id08a12fa6f3bba1d4fe8036728dbc290e6c14c8c
21ab4e
> BUG: 1261689
21ab4e
> Signed-off-by: Kotresh HR <khiremat@redhat.com>
21ab4e
> Reviewed-on: https://review.gluster.org/16997
21ab4e
> Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
21ab4e
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> Reviewed-by: Aravinda VK <avishwan@redhat.com>
21ab4e
21ab4e
Change-Id: Id08a12fa6f3bba1d4fe8036728dbc290e6c14c8c
21ab4e
BUG: 1435587
21ab4e
Signed-off-by: Kotresh HR <khiremat@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/106655
21ab4e
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
---
21ab4e
 geo-replication/syncdaemon/gsyncd.py   | 1 +
21ab4e
 geo-replication/syncdaemon/monitor.py  | 9 +++++++--
21ab4e
 geo-replication/syncdaemon/resource.py | 8 ++++++++
21ab4e
 3 files changed, 16 insertions(+), 2 deletions(-)
21ab4e
21ab4e
diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py
21ab4e
index 3092728..f9471e4 100644
21ab4e
--- a/geo-replication/syncdaemon/gsyncd.py
21ab4e
+++ b/geo-replication/syncdaemon/gsyncd.py
21ab4e
@@ -775,6 +775,7 @@ def main_i():
21ab4e
             remote.connect_remote(go_daemon='done')
21ab4e
     local.connect()
21ab4e
     if ffd:
21ab4e
+        logging.info ("Closing feedback fd, waking up the monitor")
21ab4e
         os.close(ffd)
21ab4e
     local.service_loop(*[r for r in [remote] if r])
21ab4e
 
21ab4e
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
21ab4e
index d9a9c33..dc0211e 100644
21ab4e
--- a/geo-replication/syncdaemon/monitor.py
21ab4e
+++ b/geo-replication/syncdaemon/monitor.py
21ab4e
@@ -369,8 +369,13 @@ class Monitor(object):
21ab4e
 
21ab4e
                         time.sleep(1)
21ab4e
             else:
21ab4e
-                logging.info("worker(%s) not confirmed in %d sec, "
21ab4e
-                             "aborting it" % (w[0]['dir'], conn_timeout))
21ab4e
+                logging.info("worker(%s) not confirmed in %d sec, aborting it. "
21ab4e
+                             "Gsyncd invocation on remote slave via SSH or "
21ab4e
+                             "gluster master mount might have hung. Please "
21ab4e
+                             "check the above logs for exact issue and check "
21ab4e
+                             "master or slave volume for errors. Restarting "
21ab4e
+                             "master/slave volume accordingly might help."
21ab4e
+                             % (w[0]['dir'], conn_timeout))
21ab4e
                 errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
21ab4e
                 nwait(apid)  # wait for agent
21ab4e
                 ret = nwait(cpid)
21ab4e
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
21ab4e
index e4b4e55..55bebe1 100644
21ab4e
--- a/geo-replication/syncdaemon/resource.py
21ab4e
+++ b/geo-replication/syncdaemon/resource.py
21ab4e
@@ -1413,6 +1413,8 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
21ab4e
         with given backend
21ab4e
         """
21ab4e
 
21ab4e
+        logging.info ("Mounting gluster volume locally...")
21ab4e
+        t0 = time.time()
21ab4e
         label = getattr(gconf, 'mountbroker', None)
21ab4e
         if not label and not privileged():
21ab4e
             label = syncdutils.getusername()
21ab4e
@@ -1423,6 +1425,8 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
21ab4e
             ['log-file=' + gconf.gluster_log_file, 'volfile-server=' +
21ab4e
              self.host, 'volfile-id=' + self.volume, 'client-pid=-1']
21ab4e
         mounter(params).inhibit(*[l for l in [label] if l])
21ab4e
+        logging.info ("Mounted gluster volume. Time taken: {0:.4f} "
21ab4e
+                      "secs".format((time.time() - t0)))
21ab4e
 
21ab4e
     def connect_remote(self, *a, **kw):
21ab4e
         sup(self, *a, **kw)
21ab4e
@@ -1669,10 +1673,14 @@ class SSH(AbstractUrl, SlaveRemote):
21ab4e
                                  self.inner_rsc.url)
21ab4e
 
21ab4e
         deferred = go_daemon == 'postconn'
21ab4e
+        logging.info ("Initializing SSH connection between master and slave...")
21ab4e
+        t0 = time.time()
21ab4e
         ret = sup(self, gconf.ssh_command.split() +
21ab4e
                   ["-p", str(gconf.ssh_port)] +
21ab4e
                   gconf.ssh_ctl_args + [self.remote_addr],
21ab4e
                   slave=self.inner_rsc.url, deferred=deferred)
21ab4e
+        logging.info ("SSH connection between master and slave established. "
21ab4e
+                      "Time taken: {0:.4f} secs".format((time.time() - t0)))
21ab4e
 
21ab4e
         if deferred:
21ab4e
             # send a message to peer so that we can wait for
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e