d1681e
From 4623ebe9a69a5284565b4ad253084a1236478988 Mon Sep 17 00:00:00 2001
d1681e
From: Soumya Koduri <skoduri@redhat.com>
d1681e
Date: Fri, 3 Nov 2017 15:41:34 +0530
d1681e
Subject: [PATCH 243/260] timer: Fix possible race during cleanup
d1681e
d1681e
As mentioned in bug1509189, there is a possible race
d1681e
between gf_timer_cancel(), gf_timer_proc() and
d1681e
gf_timer_registry_destroy() leading to use_after_free.
d1681e
d1681e
Problem:
d1681e
d1681e
1) gf_timer_proc() is called, locks reg, and gets an event.
d1681e
It unlocks reg, and calls the callback.
d1681e
d1681e
2) Meanwhile gf_timer_registry_destroy() is called, and removes
d1681e
reg from ctx, and joins on gf_timer_proc().
d1681e
d1681e
3) gf_timer_call_cancel() is called on the event being
d1681e
processed.  It cannot find reg (since it's been removed from reg),
d1681e
so it frees event.
d1681e
d1681e
4) the callback returns into gf_timer_proc(), and it tries to free
d1681e
event, but it's already free, so double free.
d1681e
d1681e
Solution:
d1681e
The fix is to bail out in gf_timer_cancel() when registry
d1681e
is not found. The logic behind this is that, gf_timer_cancel()
d1681e
is called only on any existing event. That means there was a valid
d1681e
registry earlier while creating that event. And the only reason
d1681e
we cannot find that registry now is that it must have got set to
d1681e
NULL when context cleanup is started.
d1681e
Since gf_timer_proc() takes care of releasing all the remaining
d1681e
events active on that registry, it seems safe to bail out
d1681e
in gf_timer_cancel().
d1681e
d1681e
>Change-Id: Ia9b088533141c3bb335eff2fe06b52d1575bb34f
d1681e
>BUG: 1509189
d1681e
>Reported-by: Daniel Gryniewicz <dang@redhat.com>
d1681e
>Signed-off-by: Soumya Koduri <skoduri@redhat.com>
d1681e
Upstream Link: https://review.gluster.org/#/c/18652/
d1681e
d1681e
BUG: 1568374
d1681e
Change-Id: Ia9b088533141c3bb335eff2fe06b52d1575bb34f
d1681e
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/135896
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
---
d1681e
 libglusterfs/src/timer.c | 15 ++++++++++++---
d1681e
 1 file changed, 12 insertions(+), 3 deletions(-)
d1681e
d1681e
diff --git a/libglusterfs/src/timer.c b/libglusterfs/src/timer.c
d1681e
index 34dfd35..d6f008d 100644
d1681e
--- a/libglusterfs/src/timer.c
d1681e
+++ b/libglusterfs/src/timer.c
d1681e
@@ -83,6 +83,13 @@ gf_timer_call_cancel (glusterfs_ctx_t *ctx,
d1681e
                 return 0;
d1681e
         }
d1681e
 
d1681e
+        if (ctx->cleanup_started) {
d1681e
+                gf_msg_callingfn ("timer", GF_LOG_INFO, 0,
d1681e
+                                  LG_MSG_CTX_CLEANUP_STARTED,
d1681e
+                                  "ctx cleanup started");
d1681e
+                return 0;
d1681e
+        }
d1681e
+
d1681e
         LOCK (&ctx->lock);
d1681e
         {
d1681e
                 reg = ctx->timer;
d1681e
@@ -90,9 +97,11 @@ gf_timer_call_cancel (glusterfs_ctx_t *ctx,
d1681e
         UNLOCK (&ctx->lock);
d1681e
 
d1681e
         if (!reg) {
d1681e
-                gf_msg ("timer", GF_LOG_ERROR, 0, LG_MSG_INIT_TIMER_FAILED,
d1681e
-                        "!reg");
d1681e
-                GF_FREE (event);
d1681e
+                /* This can happen when cleanup may have just started and
d1681e
+                 * gf_timer_registry_destroy() sets ctx->timer to NULL.
d1681e
+                 * Just bail out as success as gf_timer_proc() takes
d1681e
+                 * care of cleaning up the events.
d1681e
+                 */
d1681e
                 return 0;
d1681e
         }
d1681e
 
d1681e
-- 
d1681e
1.8.3.1
d1681e