12a457
From 8295424dcd1c28e80fa576bc5ae0febc43f58744 Mon Sep 17 00:00:00 2001
12a457
From: Aravinda VK <avishwan@redhat.com>
12a457
Date: Tue, 24 May 2016 14:13:29 +0530
12a457
Subject: [PATCH 177/178] geo-rep: Handle Worker kill gracefully if worker already died
12a457
12a457
If Agent dies for any reason, monitor tries to kill Worker also. But
12a457
if the worker has also died, the kill call raises ESRCH: No such
12a457
process.
12a457
12a457
[2016-05-23 16:49:33.903965] I [monitor(monitor):326:monitor] Monitor:
12a457
    Changelog Agent died, Aborting Worker(/bricks/brick0/master_brick0)
12a457
[2016-05-23 16:49:33.904535] E [syncdutils(monitor):276:log_raise_exception]
12a457
    <top>: FAIL:
12a457
Traceback (most recent call last):
12a457
  File "/usr/libexec/glusterfs/python/syncdaemon/syncdutils.py", line 306 in
12a457
  twrap
12a457
    tf(*aa)
12a457
  File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 393, in
12a457
  wmon
12a457
     slave_host, master)
12a457
  File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 327, in
12a457
  monitor
12a457
     os.kill(cpid, signal.SIGKILL)
12a457
     OSError: [Errno 3] No such process
12a457
12a457
With this patch, the monitor gracefully handles a worker that has already died.
12a457
12a457
Change-Id: I3ae5f816a3a197343b64540cf46f5453167fb660
12a457
Signed-off-by: Aravinda VK <avishwan@redhat.com>
12a457
BUG: 1339163
12a457
Reviewed-on: http://review.gluster.org/14512
12a457
Reviewed-on: http://review.gluster.org/14562
12a457
Reviewed-on: http://review.gluster.org/14563
12a457
Reviewed-on: https://code.engineering.redhat.com/gerrit/75474
12a457
Reviewed-by: Milind Changire <mchangir@redhat.com>
12a457
Reviewed-by: Kotresh Hiremath Ravishankar <khiremat@redhat.com>
12a457
---
12a457
 geo-replication/syncdaemon/monitor.py |   18 +++++++++---------
12a457
 1 files changed, 9 insertions(+), 9 deletions(-)
12a457
12a457
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
12a457
index 2b570a9..050218b 100644
12a457
--- a/geo-replication/syncdaemon/monitor.py
12a457
+++ b/geo-replication/syncdaemon/monitor.py
12a457
@@ -18,11 +18,11 @@ import xml.etree.ElementTree as XET
12a457
 from subprocess import PIPE
12a457
 from resource import Popen, FILE, GLUSTER, SSH
12a457
 from threading import Lock
12a457
-from errno import ECHILD
12a457
+from errno import ECHILD, ESRCH
12a457
 import re
12a457
 import random
12a457
 from gconf import gconf
12a457
-from syncdutils import select, waitpid
12a457
+from syncdutils import select, waitpid, errno_wrap
12a457
 from syncdutils import set_term_handler, is_host_local, GsyncdError
12a457
 from syncdutils import escape, Thread, finalize, memoize
12a457
 
12a457
@@ -187,7 +187,7 @@ class Monitor(object):
12a457
         # standard handler
12a457
         set_term_handler(lambda *a: set_term_handler())
12a457
         # give a chance to graceful exit
12a457
-        os.kill(-os.getpid(), signal.SIGTERM)
12a457
+        errno_wrap(os.kill, [-os.getpid(), signal.SIGTERM], [ESRCH])
12a457
 
12a457
     def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master):
12a457
         """the monitor loop
12a457
@@ -324,7 +324,7 @@ class Monitor(object):
12a457
                     # Agent is died Kill Worker
12a457
                     logging.info("Changelog Agent died, "
12a457
                                  "Aborting Worker(%s)" % w[0])
12a457
-                    os.kill(cpid, signal.SIGKILL)
12a457
+                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
12a457
                     nwait(cpid)
12a457
                     nwait(apid)
12a457
 
12a457
@@ -348,7 +348,7 @@ class Monitor(object):
12a457
                             # Agent is died Kill Worker
12a457
                             logging.info("Changelog Agent died, Aborting "
12a457
                                          "Worker(%s)" % w[0])
12a457
-                            os.kill(cpid, signal.SIGKILL)
12a457
+                            errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
12a457
                             nwait(cpid)
12a457
                             nwait(apid)
12a457
                             break
12a457
@@ -357,7 +357,7 @@ class Monitor(object):
12a457
             else:
12a457
                 logging.info("worker(%s) not confirmed in %d sec, "
12a457
                              "aborting it" % (w[0], conn_timeout))
12a457
-                os.kill(cpid, signal.SIGKILL)
12a457
+                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
12a457
                 nwait(apid)  # wait for agent
12a457
                 ret = nwait(cpid)
12a457
             if ret is None:
12a457
@@ -394,9 +394,9 @@ class Monitor(object):
12a457
                 time.sleep(1)
12a457
                 self.lock.acquire()
12a457
                 for cpid in cpids:
12a457
-                    os.kill(cpid, signal.SIGKILL)
12a457
+                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
12a457
                 for apid in agents:
12a457
-                    os.kill(apid, signal.SIGKILL)
12a457
+                    errno_wrap(os.kill, [apid, signal.SIGKILL], [ESRCH])
12a457
                 self.lock.release()
12a457
                 finalize(exval=1)
12a457
             t = Thread(target=wmon, args=[wx])
12a457
@@ -464,7 +464,7 @@ def monitor(*resources):
12a457
     # yes, send SIGSTOP to negative of monitor pid
12a457
     # to go back to pause state.
12a457
     if gconf.pause_on_start:
12a457
-        os.kill(-os.getpid(), signal.SIGSTOP)
12a457
+        errno_wrap(os.kill, [-os.getpid(), signal.SIGSTOP], [ESRCH])
12a457
 
12a457
     """oh yeah, actually Monitor is used as singleton, too"""
12a457
     return Monitor().multiplex(*distribute(*resources))
12a457
-- 
12a457
1.7.1
12a457