d1681e
From a4f3087ecbd1979525add83a149acaf2443d8e59 Mon Sep 17 00:00:00 2001
d1681e
From: Xavier Hernandez <jahernan@redhat.com>
d1681e
Date: Wed, 22 Nov 2017 11:10:32 +0100
d1681e
Subject: [PATCH 101/128] cluster/ec: Prevent self-heal to work after
d1681e
 PARENT_DOWN
d1681e
d1681e
When the volume is being stopped, PARENT_DOWN event is received.
d1681e
This instructs EC to wait until all pending operations are completed
d1681e
before declaring itself down. However heal operations are ignored
d1681e
and allowed to continue even after having said it was down.
d1681e
d1681e
This may cause unexpected results and crashes.
d1681e
d1681e
To solve this, heal operations are treated exactly the same as any
d1681e
other operation and EC won't propagate PARENT_DOWN until all
d1681e
operations, including healing, are complete. To avoid big delays
d1681e
if this happens in the middle of a big heal, a check has been
d1681e
added to quit current heal if shutdown is detected.
d1681e
d1681e
>Change-Id: I26645e236ebd115eb22c7ad4972461111a2d2034
d1681e
>BUG: 1515266
d1681e
>Signed-off-by: Xavier Hernandez <jahernan@redhat.com>
d1681e
Upstream Patch: https://review.gluster.org/#/c/18840/
d1681e
d1681e
BUG: 1505570
d1681e
Change-Id: I26645e236ebd115eb22c7ad4972461111a2d2034
d1681e
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/125199
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
---
d1681e
 xlators/cluster/ec/src/ec-data.c | 21 ++------------
d1681e
 xlators/cluster/ec/src/ec-heal.c | 59 +++++++++++++++++++++++++++++++++-------
d1681e
 2 files changed, 52 insertions(+), 28 deletions(-)
d1681e
d1681e
diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c
d1681e
index 28bf988..54c708a 100644
d1681e
--- a/xlators/cluster/ec/src/ec-data.c
d1681e
+++ b/xlators/cluster/ec/src/ec-data.c
d1681e
@@ -103,19 +103,6 @@ void ec_cbk_data_destroy(ec_cbk_data_t * cbk)
d1681e
     mem_put(cbk);
d1681e
 }
d1681e
 
d1681e
-/* PARENT_DOWN will be notified to children only after these fops are complete
d1681e
- * when graph switch happens.  We do not want graph switch to be waiting on
d1681e
- * heal to complete as healing big file/directory could take a while. Which
d1681e
- * will lead to hang on the mount.
d1681e
- */
d1681e
-static gf_boolean_t
d1681e
-ec_needs_graceful_completion (ec_fop_data_t *fop)
d1681e
-{
d1681e
-        if ((fop->id != EC_FOP_HEAL) && (fop->id != EC_FOP_FHEAL))
d1681e
-                return _gf_true;
d1681e
-        return _gf_false;
d1681e
-}
d1681e
-
d1681e
 ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
d1681e
                                      int32_t id, uint32_t flags,
d1681e
                                      uintptr_t target, int32_t minimum,
d1681e
@@ -202,13 +189,11 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
d1681e
         fop->parent = parent;
d1681e
     }
d1681e
 
d1681e
-    if (ec_needs_graceful_completion (fop)) {
d1681e
-            LOCK(&ec->lock);
d1681e
+    LOCK(&ec->lock);
d1681e
 
d1681e
-            list_add_tail(&fop->pending_list, &ec->pending_fops);
d1681e
+    list_add_tail(&fop->pending_list, &ec->pending_fops);
d1681e
 
d1681e
-            UNLOCK(&ec->lock);
d1681e
-    }
d1681e
+    UNLOCK(&ec->lock);
d1681e
 
d1681e
     return fop;
d1681e
 }
d1681e
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
d1681e
index fd8c902..b8518d6 100644
d1681e
--- a/xlators/cluster/ec/src/ec-heal.c
d1681e
+++ b/xlators/cluster/ec/src/ec-heal.c
d1681e
@@ -1418,6 +1418,12 @@ ec_name_heal_handler (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
d1681e
         int                 i          = 0;
d1681e
         int                 ret        = 0;
d1681e
 
d1681e
+        if (ec->shutdown) {
d1681e
+                gf_msg_debug(this->name, 0, "Cancelling directory heal "
d1681e
+                                            "because EC is stopping.");
d1681e
+                return -ENOTCONN;
d1681e
+        }
d1681e
+
d1681e
         memcpy (name_on, name_data->participants, ec->nodes);
d1681e
         ret = ec_heal_name (name_data->frame, ec, parent->inode,
d1681e
                             entry->d_name, name_on);
d1681e
@@ -1439,6 +1445,7 @@ ec_heal_names (call_frame_t *frame, ec_t *ec, inode_t *inode,
d1681e
         int j = 0;
d1681e
         loc_t loc = {0};
d1681e
         struct ec_name_data name_data = {0};
d1681e
+        int ret = 0;
d1681e
 
d1681e
         loc.inode = inode_ref (inode);
d1681e
         gf_uuid_copy (loc.gfid, inode->gfid);
d1681e
@@ -1449,18 +1456,23 @@ ec_heal_names (call_frame_t *frame, ec_t *ec, inode_t *inode,
d1681e
         for (i = 0; i < ec->nodes; i++) {
d1681e
                 if (!participants[i])
d1681e
                         continue;
d1681e
-                syncop_dir_scan (ec->xl_list[i], &loc,
d1681e
-                                GF_CLIENT_PID_SELF_HEALD, &name_data,
d1681e
-                                ec_name_heal_handler);
d1681e
+                ret = syncop_dir_scan (ec->xl_list[i], &loc,
d1681e
+                                       GF_CLIENT_PID_SELF_HEALD, &name_data,
d1681e
+                                       ec_name_heal_handler);
d1681e
+                if (ret < 0) {
d1681e
+                        break;
d1681e
+                }
d1681e
                 for (j = 0; j < ec->nodes; j++)
d1681e
                         if (name_data.failed_on[j])
d1681e
                                 participants[j] = 0;
d1681e
 
d1681e
-                if (EC_COUNT (participants, ec->nodes) <= ec->fragments)
d1681e
-                        return -ENOTCONN;
d1681e
+                if (EC_COUNT (participants, ec->nodes) <= ec->fragments) {
d1681e
+                        ret = -ENOTCONN;
d1681e
+                        break;
d1681e
+                }
d1681e
         }
d1681e
         loc_wipe (&loc);
d1681e
-        return 0;
d1681e
+        return ret;
d1681e
 }
d1681e
 
d1681e
 int
d1681e
@@ -1999,6 +2011,17 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
d1681e
 
d1681e
         for (heal->offset = 0; (heal->offset < size) && !heal->done;
d1681e
                                                    heal->offset += heal->size) {
d1681e
+                /* We immediately abort any heal if a shutdown request has been
d1681e
+                 * received to avoid delays. The healing of this file will be
d1681e
+                 * restarted by another SHD or other client that accesses the
d1681e
+                 * file. */
d1681e
+                if (ec->shutdown) {
d1681e
+                        gf_msg_debug(ec->xl->name, 0, "Cancelling heal because "
d1681e
+                                                      "EC is stopping.");
d1681e
+                        ret = -ENOTCONN;
d1681e
+                        break;
d1681e
+                }
d1681e
+
d1681e
                 gf_msg_debug (ec->xl->name, 0, "%s: sources: %d, sinks: "
d1681e
                         "%d, offset: %"PRIu64" bsize: %"PRIu64,
d1681e
                         uuid_utoa (fd->inode->gfid),
d1681e
@@ -2595,16 +2618,32 @@ ec_handle_healers_done (ec_fop_data_t *fop)
d1681e
                 return;
d1681e
 
d1681e
         LOCK (&ec->lock);
d1681e
-        {
d1681e
-                list_del_init (&fop->healer);
d1681e
+
d1681e
+        list_del_init (&fop->healer);
d1681e
+
d1681e
+        do {
d1681e
                 ec->healers--;
d1681e
                 heal_fop = __ec_dequeue_heals (ec);
d1681e
-        }
d1681e
+
d1681e
+                if ((heal_fop != NULL) && ec->shutdown) {
d1681e
+                        /* This will prevent ec_handle_healers_done() to be
d1681e
+                         * called recursively. That would be problematic if
d1681e
+                         * the queue is too big. */
d1681e
+                        list_del_init(&heal_fop->healer);
d1681e
+
d1681e
+                        UNLOCK(&ec->lock);
d1681e
+
d1681e
+                        ec_fop_set_error(fop, ENOTCONN);
d1681e
+                        ec_heal_fail(ec, heal_fop);
d1681e
+
d1681e
+                        LOCK(&ec->lock);
d1681e
+                }
d1681e
+        } while ((heal_fop != NULL) && ec->shutdown);
d1681e
+
d1681e
         UNLOCK (&ec->lock);
d1681e
 
d1681e
         if (heal_fop)
d1681e
                 ec_launch_heal (ec, heal_fop);
d1681e
-
d1681e
 }
d1681e
 
d1681e
 void
d1681e
-- 
d1681e
1.8.3.1
d1681e