Blob Blame History Raw
From 488a5aa4932842334e2749224e9c39f8b6fd379c Mon Sep 17 00:00:00 2001
From: Ashish Pandey <aspandey@redhat.com>
Date: Wed, 20 May 2020 11:30:17 +0530
Subject: [PATCH 561/584] cluster/ec: Inform failure when some bricks are
 unavailable.

Provide proper information about the failure when a fop
fails on some of the bricks.
Also provide information about the parent fop and
the map of the bricks on which it is failing.

Upstream patch details:
>Change-Id: If812739617df65cd146c8e667fbacff653717248
>updates #1434
>Signed-off-by: Ashish Pandey <aspandey@redhat.com>
>https://review.gluster.org/#/c/glusterfs/+/24858/

Change-Id: I3549d637e7345f05f21ac1c0e8106973c69d1be9
BUG: 1908635
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244926
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 xlators/cluster/ec/src/ec-common.c | 76 +++++++++++++++++++++++---------------
 xlators/cluster/ec/src/ec.c        | 14 ++++++-
 2 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index e3f8769..a9624d8 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -316,17 +316,19 @@ ec_check_status(ec_fop_data_t *fop)
         }
     }
 
-    gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
-           "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
-           "remaining=%s, good=%s, bad=%s, %s)",
-           gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
-           ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
-           ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
-           ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
-           ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
-           ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
-                  ec->nodes),
-           ec_msg_str(fop));
+    gf_msg(
+        fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
+        "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
+        "remaining=%s, good=%s, bad=%s,"
+        "(Least significant bit represents first client/brick of subvol), %s)",
+        gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
+        ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+        ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+        ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
+        ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
+        ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
+               ec->nodes),
+        ec_msg_str(fop));
     if (fop->use_fd) {
         if (fop->fd != NULL) {
             ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
@@ -614,10 +616,10 @@ ec_msg_str(ec_fop_data_t *fop)
     loc_t *loc2 = NULL;
     char gfid1[64] = {0};
     char gfid2[64] = {0};
+    ec_fop_data_t *parent = fop->parent;
 
     if (fop->errstr)
         return fop->errstr;
-
     if (!fop->use_fd) {
         loc1 = &fop->loc[0];
         loc2 = &fop->loc[1];
@@ -625,23 +627,45 @@ ec_msg_str(ec_fop_data_t *fop)
         if (fop->id == GF_FOP_RENAME) {
             gf_asprintf(&fop->errstr,
                         "FOP : '%s' failed on '%s' and '%s' with gfids "
-                        "%s and %s respectively",
+                        "%s and %s respectively. Parent FOP: %s",
                         ec_fop_name(fop->id), loc1->path, loc2->path,
                         uuid_utoa_r(loc1->gfid, gfid1),
-                        uuid_utoa_r(loc2->gfid, gfid2));
+                        uuid_utoa_r(loc2->gfid, gfid2),
+                        parent ? ec_fop_name(parent->id) : "No Parent");
         } else {
-            gf_asprintf(&fop->errstr, "FOP : '%s' failed on '%s' with gfid %s",
-                        ec_fop_name(fop->id), loc1->path,
-                        uuid_utoa_r(loc1->gfid, gfid1));
+            gf_asprintf(
+                &fop->errstr,
+                "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s",
+                ec_fop_name(fop->id), loc1->path,
+                uuid_utoa_r(loc1->gfid, gfid1),
+                parent ? ec_fop_name(parent->id) : "No Parent");
         }
     } else {
-        gf_asprintf(&fop->errstr, "FOP : '%s' failed on gfid %s",
-                    ec_fop_name(fop->id),
-                    uuid_utoa_r(fop->fd->inode->gfid, gfid1));
+        gf_asprintf(
+            &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s",
+            ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1),
+            parent ? ec_fop_name(parent->id) : "No Parent");
     }
     return fop->errstr;
 }
 
+static void
+ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need,
+                        int32_t loglevel)
+{
+    ec_t *ec = fop->xl->private;
+    char str1[32], str2[32], str3[32];
+
+    gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT,
+           "Insufficient available children for this request: "
+           "Have : %d, Need : %u : Child UP : %s "
+           "Mask: %s, Healing : %s : %s ",
+           have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+           ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+           ec_bin(str3, sizeof(str3), fop->healing, ec->nodes),
+           ec_msg_str(fop));
+}
+
 static int32_t
 ec_child_select(ec_fop_data_t *fop)
 {
@@ -699,11 +723,7 @@ ec_child_select(ec_fop_data_t *fop)
     ec_trace("SELECT", fop, "");
 
     if ((num < fop->minimum) && (num < ec->fragments)) {
-        gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT,
-               "Insufficient available children "
-               "for this request (have %d, need "
-               "%d). %s",
-               num, fop->minimum, ec_msg_str(fop));
+        ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR);
         return 0;
     }
 
@@ -711,11 +731,7 @@ ec_child_select(ec_fop_data_t *fop)
         (fop->locks[0].update[EC_DATA_TXN] ||
          fop->locks[0].update[EC_METADATA_TXN])) {
         if (ec->quorum_count && (num < ec->quorum_count)) {
-            gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT,
-                   "Insufficient available children "
-                   "for this request (have %d, need "
-                   "%d). %s",
-                   num, ec->quorum_count, ec_msg_str(fop));
+            ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR);
             return 0;
         }
     }
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index a930089..047cdd8 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -325,13 +325,18 @@ ec_get_event_from_state(ec_t *ec)
 void
 ec_up(xlator_t *this, ec_t *ec)
 {
+    char str1[32], str2[32];
+
     if (ec->timer != NULL) {
         gf_timer_call_cancel(this->ctx, ec->timer);
         ec->timer = NULL;
     }
 
     ec->up = 1;
-    gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP");
+    gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP,
+           "Going UP : Child UP = %s Child Notify = %s",
+           ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+           ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes));
 
     gf_event(EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
 }
@@ -339,13 +344,18 @@ ec_up(xlator_t *this, ec_t *ec)
 void
 ec_down(xlator_t *this, ec_t *ec)
 {
+    char str1[32], str2[32];
+
     if (ec->timer != NULL) {
         gf_timer_call_cancel(this->ctx, ec->timer);
         ec->timer = NULL;
     }
 
     ec->up = 0;
-    gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN");
+    gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN,
+           "Going DOWN : Child UP = %s Child Notify = %s",
+           ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+           ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes));
 
     gf_event(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
 }
-- 
1.8.3.1