From 62fe36178cab588b658b44808cc954a57a1fc452 Mon Sep 17 00:00:00 2001
From: Kotresh HR <khiremat@redhat.com>
Date: Fri, 10 Aug 2018 08:14:14 -0400
Subject: [PATCH 369/385] geo-rep: Fix deadlock during worker start

Analysis:
The monitor process spawns monitor threads (one per brick).
Each monitor thread forks worker and agent processes.
Each monitor thread, while initializing, updates the
monitor status file. This is synchronized using flock.
The race is that one thread can fork a worker while
another thread has the status file open, leaving a
reference to that fd held in the worker process.

Cause:
flock is released either by explicitly unlocking it
or by closing all duplicate fds referring to the file.
The code relied on fd close, so a reference inherited
by a worker/agent process across fork could cause the deadlock.
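
For illustration, a minimal standalone sketch (not part of this patch;
the file path is made up) of how a fd inherited across fork keeps an
flock held even after the parent closes its own fd:

    import fcntl
    import os
    import time

    f = open("/tmp/monitor.status", "w")  # hypothetical status file
    fcntl.flock(f, fcntl.LOCK_EX)         # parent takes the lock
    if os.fork() == 0:
        # Child (worker/agent analogue) inherits the open file and
        # keeps it open; it never touches the lock itself.
        time.sleep(30)
        os._exit(0)
    f.close()  # closing only the parent's fd does NOT release the lock;
               # any other flock(LOCK_EX) on this file now blocks until
               # the child exits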

Fix:
1. flock is unlocked explicitly.
2. The status file is also updated in appropriate places so that
the fd reference is not leaked to worker/agent processes.

With this fix, both the deadlock and the possible fd
leaks are solved.
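
A simplified sketch (an assumption of the shape of LockedOpen in
gsyncdstatus.py, not a copy of it) of the explicit unlock done by the
first part of the fix:

    import fcntl

    class LockedOpen(object):
        def __init__(self, filename, *args, **kwargs):
            self.filename = filename
            self.open_args = args
            self.open_kwargs = kwargs

        def __enter__(self):
            self.fileobj = open(self.filename, *self.open_args,
                                **self.open_kwargs)
            fcntl.flock(self.fileobj, fcntl.LOCK_EX)
            return self.fileobj

        def __exit__(self, _exc_type, _exc_value, _traceback):
            # Unlock explicitly instead of relying on close(), so a
            # duplicate fd held by a forked worker/agent cannot keep
            # the lock held.
            fcntl.flock(self.fileobj, fcntl.LOCK_UN)
            self.fileobj.close()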

Upstream Patch : https://review.gluster.org/#/c/glusterfs/+/20704/
>fixes: bz#1614799
>Signed-off-by: Kotresh HR <khiremat@redhat.com>

Change-Id: I0d1ce93072dab07d0dbcc7e779287368cd9f093d
BUG: 1623749
Signed-off-by: Sunny Kumar <sunkumar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/149760
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 geo-replication/syncdaemon/gsyncdstatus.py |  1 +
 geo-replication/syncdaemon/monitor.py      | 17 ++++++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/geo-replication/syncdaemon/gsyncdstatus.py b/geo-replication/syncdaemon/gsyncdstatus.py
index 909c669..67493ca 100644
--- a/geo-replication/syncdaemon/gsyncdstatus.py
+++ b/geo-replication/syncdaemon/gsyncdstatus.py
@@ -99,6 +99,7 @@ class LockedOpen(object):
         return f
 
     def __exit__(self, _exc_type, _exc_value, _traceback):
+        fcntl.flock(self.fileobj, fcntl.LOCK_UN)
         self.fileobj.close()
 
 
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index 9245572..3451fe4 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -144,9 +144,6 @@ class Monitor(object):
                                                     "%s::%s" % (slave_host,
                                                                 slave_vol))
 
-        set_monitor_status(gconf.state_file, self.ST_STARTED)
-        self.status[w[0]['dir']].set_worker_status(self.ST_INIT)
-
         ret = 0
 
         def nwait(p, o=0):
@@ -196,6 +193,7 @@ class Monitor(object):
             # Spawn the worker and agent in lock to avoid fd leak
             self.lock.acquire()
 
+            self.status[w[0]['dir']].set_worker_status(self.ST_INIT)
             logging.info(lf('starting gsyncd worker',
                             brick=w[0]['dir'],
                             slave_node=remote_host))
@@ -375,6 +373,19 @@ class Monitor(object):
             t = Thread(target=wmon, args=[wx])
             t.start()
             ta.append(t)
+
+        # Monitor status was being updated in each monitor thread. It
+        # should not be done there as it can cause a deadlock during
+        # worker start. set_monitor_status uses flock to synchronize
+        # multiple instances updating the file. Since each monitor thread
+        # forks the worker and agent, those processes can hold a reference
+        # to the fd of the status file, deadlocking workers which start
+        # later because flock is not released until all references to the
+        # same fd are closed. It would also cause fd leaks.
+
+        self.lock.acquire()
+        set_monitor_status(gconf.get("state-file"), self.ST_STARTED)
+        self.lock.release()
         for t in ta:
             t.join()
 
-- 
1.8.3.1