ca3909
From bf24623765817ede84ea47f3265f5e6c2ae17ee7 Mon Sep 17 00:00:00 2001
ca3909
From: Mohit Agrawal <moagrawal@redhat.com>
ca3909
Date: Tue, 16 Jul 2019 20:36:57 +0530
ca3909
Subject: [PATCH 279/284] posix: In brick_mux brick is crashed while start/stop
ca3909
 volume in loop
ca3909
ca3909
Problem: In brick_mux environment sometime brick is crashed while
ca3909
         volume stop/start in a loop.Brick is crashed in janitor task
ca3909
         at the time of accessing priv.If posix priv is cleaned up before
ca3909
         call janitor task then janitor task is crashed.
ca3909
ca3909
Solution: To avoid the crash in brick_mux environment introduce a new
ca3909
          flag janitor_task_stop in posix_private and before send CHILD_DOWN event
ca3909
          wait for update the flag by janitor_task_done
ca3909
ca3909
> Change-Id: Id9fa5d183a463b2b682774ab5cb9868357d139a4
ca3909
> fixes: bz#1730409
ca3909
> Signed-off-by: Mohit Agrawal <moagrawal@redhat.com>
ca3909
> (Cherry picked from commit f138d3fa2237e7fa940ecf17153fd700350c4138)
ca3909
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/23060/)
ca3909
ca3909
Change-Id: Id9fa5d183a463b2b682774ab5cb9868357d139a4
ca3909
fixex: bz#1729971
ca3909
Signed-off-by: Mohit Agrawal <moagrawal@redhat.com>
ca3909
Reviewed-on: https://code.engineering.redhat.com/gerrit/178934
ca3909
Tested-by: Mohit Agrawal <moagrawa@redhat.com>
ca3909
Tested-by: RHGS Build Bot <nigelb@redhat.com>
ca3909
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
ca3909
---
ca3909
 libglusterfs/src/glusterfs/xlator.h        |  3 +++
ca3909
 xlators/mgmt/glusterd/src/glusterd-utils.c |  5 ++--
ca3909
 xlators/protocol/server/src/server.c       |  6 ++++-
ca3909
 xlators/storage/posix/src/posix-common.c   | 40 +++++++++++++++++++++++++++++-
ca3909
 xlators/storage/posix/src/posix-helpers.c  | 16 ++++++++++++
ca3909
 xlators/storage/posix/src/posix.h          |  3 +++
ca3909
 6 files changed, 69 insertions(+), 4 deletions(-)
ca3909
ca3909
diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h
ca3909
index b78daad..da551e9 100644
ca3909
--- a/libglusterfs/src/glusterfs/xlator.h
ca3909
+++ b/libglusterfs/src/glusterfs/xlator.h
ca3909
@@ -861,6 +861,9 @@ struct _xlator {
ca3909
 
ca3909
     /* Flag to notify got CHILD_DOWN event for detach brick */
ca3909
     uint32_t notify_down;
ca3909
+
ca3909
+    /* Flag to avoid throw duplicate PARENT_DOWN event */
ca3909
+    uint32_t parent_down;
ca3909
 };
ca3909
 
ca3909
 /* This would be the only structure which needs to be exported by
ca3909
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
ca3909
index 2aa975b..812c698 100644
ca3909
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
ca3909
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
ca3909
@@ -4082,8 +4082,9 @@ out:
ca3909
     if (msg[0]) {
ca3909
         gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRICK_IMPORT_FAIL, "%s",
ca3909
                msg);
ca3909
-        gf_event(EVENT_IMPORT_BRICK_FAILED, "peer=%s;brick=%s",
ca3909
-                 new_brickinfo->hostname, new_brickinfo->path);
ca3909
+        if (new_brickinfo)
ca3909
+            gf_event(EVENT_IMPORT_BRICK_FAILED, "peer=%s;brick=%s",
ca3909
+                     new_brickinfo->hostname, new_brickinfo->path);
ca3909
     }
ca3909
     gf_msg_debug("glusterd", 0, "Returning with %d", ret);
ca3909
     return ret;
ca3909
diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c
ca3909
index 6ae63ba..a5f09fe 100644
ca3909
--- a/xlators/protocol/server/src/server.c
ca3909
+++ b/xlators/protocol/server/src/server.c
ca3909
@@ -580,6 +580,7 @@ server_graph_janitor_threads(void *data)
ca3909
     gf_boolean_t victim_found = _gf_false;
ca3909
     xlator_list_t **trav_p = NULL;
ca3909
     xlator_t *top = NULL;
ca3909
+    uint32_t parent_down = 0;
ca3909
 
ca3909
     GF_ASSERT(data);
ca3909
 
ca3909
@@ -598,7 +599,10 @@ server_graph_janitor_threads(void *data)
ca3909
         victim = (*trav_p)->xlator;
ca3909
         if (victim->cleanup_starting &&
ca3909
             strcmp(victim->name, victim_name) == 0) {
ca3909
-            victim_found = _gf_true;
ca3909
+            parent_down = victim->parent_down;
ca3909
+            victim->parent_down = 1;
ca3909
+            if (!parent_down)
ca3909
+                victim_found = _gf_true;
ca3909
             break;
ca3909
         }
ca3909
     }
ca3909
diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c
ca3909
index d738692..69857d9 100644
ca3909
--- a/xlators/storage/posix/src/posix-common.c
ca3909
+++ b/xlators/storage/posix/src/posix-common.c
ca3909
@@ -146,10 +146,15 @@ int32_t
ca3909
 posix_notify(xlator_t *this, int32_t event, void *data, ...)
ca3909
 {
ca3909
     xlator_t *victim = data;
ca3909
+    struct posix_private *priv = this->private;
ca3909
+    int ret = 0;
ca3909
+    struct timespec sleep_till = {
ca3909
+        0,
ca3909
+    };
ca3909
 
ca3909
     switch (event) {
ca3909
         case GF_EVENT_PARENT_UP: {
ca3909
-            /* Tell the parent that posix xlator is up */
ca3909
+            /* the parent that posix xlator is up */
ca3909
             default_notify(this, GF_EVENT_CHILD_UP, data);
ca3909
         } break;
ca3909
 
ca3909
@@ -158,6 +163,31 @@ posix_notify(xlator_t *this, int32_t event, void *data, ...)
ca3909
                 break;
ca3909
             gf_log(this->name, GF_LOG_INFO, "Sending CHILD_DOWN for brick %s",
ca3909
                    victim->name);
ca3909
+
ca3909
+            if (priv->janitor) {
ca3909
+                pthread_mutex_lock(&priv->janitor_mutex);
ca3909
+                {
ca3909
+                    priv->janitor_task_stop = _gf_true;
ca3909
+                    ret = gf_tw_del_timer(this->ctx->tw->timer_wheel,
ca3909
+                                          priv->janitor);
ca3909
+                    if (!ret) {
ca3909
+                        clock_gettime(CLOCK_REALTIME, &sleep_till);
ca3909
+                        sleep_till.tv_sec += 1;
ca3909
+                        /* Wait to set janitor_task flag to _gf_false by
ca3909
+                         * janitor_task_done */
ca3909
+                        while (priv->janitor_task_stop) {
ca3909
+                            (void)pthread_cond_timedwait(&priv->janitor_cond,
ca3909
+                                                         &priv->janitor_mutex,
ca3909
+                                                         &sleep_till);
ca3909
+                            clock_gettime(CLOCK_REALTIME, &sleep_till);
ca3909
+                            sleep_till.tv_sec += 1;
ca3909
+                        }
ca3909
+                    }
ca3909
+                }
ca3909
+                pthread_mutex_unlock(&priv->janitor_mutex);
ca3909
+                GF_FREE(priv->janitor);
ca3909
+            }
ca3909
+            priv->janitor = NULL;
ca3909
             default_notify(this->parents->xlator, GF_EVENT_CHILD_DOWN, data);
ca3909
         } break;
ca3909
         default:
ca3909
@@ -1008,6 +1038,8 @@ posix_init(xlator_t *this)
ca3909
 
ca3909
     pthread_mutex_init(&_private->fsync_mutex, NULL);
ca3909
     pthread_cond_init(&_private->fsync_cond, NULL);
ca3909
+    pthread_mutex_init(&_private->janitor_mutex, NULL);
ca3909
+    pthread_cond_init(&_private->janitor_cond, NULL);
ca3909
     INIT_LIST_HEAD(&_private->fsyncs);
ca3909
     ret = posix_spawn_ctx_janitor_thread(this);
ca3909
     if (ret)
ca3909
@@ -1128,6 +1160,7 @@ posix_fini(xlator_t *this)
ca3909
         (void)gf_thread_cleanup_xint(priv->disk_space_check);
ca3909
         priv->disk_space_check = 0;
ca3909
     }
ca3909
+
ca3909
     if (priv->janitor) {
ca3909
         /*TODO: Make sure the synctask is also complete */
ca3909
         ret = gf_tw_del_timer(this->ctx->tw->timer_wheel, priv->janitor);
ca3909
@@ -1135,8 +1168,10 @@ posix_fini(xlator_t *this)
ca3909
             gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_TIMER_DELETE_FAILED,
ca3909
                    "Failed to delete janitor timer");
ca3909
         }
ca3909
+        GF_FREE(priv->janitor);
ca3909
         priv->janitor = NULL;
ca3909
     }
ca3909
+
ca3909
     if (priv->fsyncer) {
ca3909
         (void)gf_thread_cleanup_xint(priv->fsyncer);
ca3909
         priv->fsyncer = 0;
ca3909
@@ -1148,6 +1183,9 @@ posix_fini(xlator_t *this)
ca3909
     GF_FREE(priv->base_path);
ca3909
     LOCK_DESTROY(&priv->lock);
ca3909
     pthread_mutex_destroy(&priv->fsync_mutex);
ca3909
+    pthread_cond_destroy(&priv->fsync_cond);
ca3909
+    pthread_mutex_destroy(&priv->janitor_mutex);
ca3909
+    pthread_cond_destroy(&priv->janitor_cond);
ca3909
     GF_FREE(priv->hostname);
ca3909
     GF_FREE(priv->trash_path);
ca3909
     GF_FREE(priv);
ca3909
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
ca3909
index 07169b5..ef5bfd5 100644
ca3909
--- a/xlators/storage/posix/src/posix-helpers.c
ca3909
+++ b/xlators/storage/posix/src/posix-helpers.c
ca3909
@@ -1432,12 +1432,24 @@ posix_janitor_task_done(int ret, call_frame_t *frame, void *data)
ca3909
     this = data;
ca3909
     priv = this->private;
ca3909
 
ca3909
+    pthread_mutex_lock(&priv->janitor_mutex);
ca3909
+    {
ca3909
+        if (priv->janitor_task_stop) {
ca3909
+            priv->janitor_task_stop = _gf_false;
ca3909
+            pthread_cond_signal(&priv->janitor_cond);
ca3909
+            pthread_mutex_unlock(&priv->janitor_mutex);
ca3909
+            goto out;
ca3909
+        }
ca3909
+    }
ca3909
+    pthread_mutex_unlock(&priv->janitor_mutex);
ca3909
+
ca3909
     LOCK(&priv->lock);
ca3909
     {
ca3909
         __posix_janitor_timer_start(this);
ca3909
     }
ca3909
     UNLOCK(&priv->lock);
ca3909
 
ca3909
+out:
ca3909
     return 0;
ca3909
 }
ca3909
 
ca3909
@@ -1456,6 +1468,9 @@ posix_janitor_task(void *data)
ca3909
     old_this = THIS;
ca3909
     THIS = this;
ca3909
 
ca3909
+    if (!priv)
ca3909
+        goto out;
ca3909
+
ca3909
     time(&now;;
ca3909
     if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) {
ca3909
         if (priv->disable_landfill_purge) {
ca3909
@@ -1475,6 +1490,7 @@ posix_janitor_task(void *data)
ca3909
 
ca3909
     THIS = old_this;
ca3909
 
ca3909
+out:
ca3909
     return 0;
ca3909
 }
ca3909
 
ca3909
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
ca3909
index b0935a7..64288a7 100644
ca3909
--- a/xlators/storage/posix/src/posix.h
ca3909
+++ b/xlators/storage/posix/src/posix.h
ca3909
@@ -203,6 +203,8 @@ struct posix_private {
ca3909
     struct list_head fsyncs;
ca3909
     pthread_mutex_t fsync_mutex;
ca3909
     pthread_cond_t fsync_cond;
ca3909
+    pthread_mutex_t janitor_mutex;
ca3909
+    pthread_cond_t janitor_cond;
ca3909
     int fsync_queue_count;
ca3909
 
ca3909
     enum {
ca3909
@@ -257,6 +259,7 @@ struct posix_private {
ca3909
 
ca3909
     gf_boolean_t fips_mode_rchecksum;
ca3909
     gf_boolean_t ctime;
ca3909
+    gf_boolean_t janitor_task_stop;
ca3909
 };
ca3909
 
ca3909
 typedef struct {
ca3909
-- 
ca3909
1.8.3.1
ca3909