Blob Blame History Raw
From 7e14fd1769c2e5e189efbeebed997ebcf7a020c1 Mon Sep 17 00:00:00 2001
From: Pranith Kumar K <pkarampu@redhat.com>
Date: Thu, 2 Mar 2017 07:14:14 +0530
Subject: [PATCH 302/302] cluster/ec: Introduce optimistic changelog in EC

	Backport of https://review.gluster.org/16821

Problem: Fix to https://bugzilla.redhat.com/show_bug.cgi?id=1316873 has made
changes to set dirty flag before every update fop, data or metadata, and unset
it after successful operation. That makes some of the fops very slow such as
entry operations or metadata operations.

Solution: File data operations are the only operation which take some time and
setting dirty flag before a fop and unsetting it after serves the purpose as
probability of failure of a fop is high when the time duration is more. For all
the other operations, set dirty flag at the end of the fop, if any brick is
down and need heal.

Providing following option to choose between high performance or better heal
marking for metadata and entry fops.

Set/Unset dirty flag for every update fop at the start of the fop. If ON, this
option impacts performance of entry operations or metadata operations as it
will set dirty flag at the start and unset it at the end of ALL update fop. If
OFF and all the bricks are good, dirty flag will be set at the start only for
file fops For metadata and entry fops dirty flag will not be set at the start,
if all the bricks are good. This does not impact performance for metadata
operations and entry operation but has a very small window to miss marking
entry as dirty in case it is required to be healed.

Thanks to Xavi and Ashish for the design
Picked the .t file from Ashish' patch https://review.gluster.org/16298

 >BUG: 1408809
 >Change-Id: I3ce860063f0e2901e50754dcfc3e4ed22daf819f
 >Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>

BUG: 1408655
Change-Id: Ia8f2e9c5f39d8306ab8e8dcda7cf75a92519e3d7
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/99318
---
 libglusterfs/src/globals.h                      |   4 +-
 tests/basic/ec/ec-optimistic-changelog.t        | 152 ++++++++++++++++++++++++
 xlators/cluster/ec/src/ec-common.c              |  49 +++++++-
 xlators/cluster/ec/src/ec-data.h                |   3 +-
 xlators/cluster/ec/src/ec-generic.c             |  14 ++-
 xlators/cluster/ec/src/ec.c                     |  21 +++-
 xlators/cluster/ec/src/ec.h                     |   1 +
 xlators/mgmt/glusterd/src/glusterd-volume-set.c |   6 +
 8 files changed, 243 insertions(+), 7 deletions(-)
 create mode 100644 tests/basic/ec/ec-optimistic-changelog.t

diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
index f6164c6..bbddb21 100644
--- a/libglusterfs/src/globals.h
+++ b/libglusterfs/src/globals.h
@@ -43,7 +43,7 @@
  */
 #define GD_OP_VERSION_MIN  1 /* MIN is the fresh start op-version, mostly
                                 should not change */
-#define GD_OP_VERSION_MAX  GD_OP_VERSION_3_9_1 /* MAX VERSION is the maximum
+#define GD_OP_VERSION_MAX  GD_OP_VERSION_3_10_1 /* MAX VERSION is the maximum
                                                   count in VME table, should
                                                   keep changing with
                                                   introduction of newer
@@ -85,6 +85,8 @@
 
 #define GD_OP_VERSION_3_9_1    30901 /* Op-version for GlusterFS 3.9.1 */
 
+#define GD_OP_VERSION_3_10_1   31001 /* Op-version for GlusterFS 3.10.1 */
+
 #include "xlator.h"
 
 /* THIS */
diff --git a/tests/basic/ec/ec-optimistic-changelog.t b/tests/basic/ec/ec-optimistic-changelog.t
new file mode 100644
index 0000000..1277da6
--- /dev/null
+++ b/tests/basic/ec/ec-optimistic-changelog.t
@@ -0,0 +1,152 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# This test checks optimistic-change-log option
+
+cleanup
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
+TEST $CLI volume heal $V0 disable
+
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume set $V0 disperse.background-heals 0
+TEST $CLI volume set $V0 disperse.optimistic-change-log off
+TEST $CLI volume set $V0 disperse.eager-lock off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 background-heals
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 heal-wait-qlength
+
+TEST $CLI volume set $V0 disperse.background-heals 1
+TEST touch $M0/a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}1
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}2
+
+
+
+### optimistic-change-log = off ; All bricks good. Test file operation
+echo abc > $M0/a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; Kill one brick . Test file operation
+TEST kill_brick $V0 $H0 $B0/${V0}2
+echo abc > $M0/a
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+#Accessing file should heal the file now
+EXPECT "abc" cat $M0/a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; All bricks good. Test entry operation
+TEST touch $M0/b
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; All bricks good. Test metadata operation
+TEST chmod 0777 $M0/b
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; Kill one brick. Test entry operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST touch $M0/c
+EXPECT 4 get_pending_heal_count $V0 #two for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0 2>&1 > /dev/null
+getfattr -d -m. -e hex $M0/c 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; Kill one brick. Test metadata operation
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST chmod 0777 $M0/c
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0/c 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+TEST $CLI volume set $V0 disperse.optimistic-change-log on
+
+### optimistic-change-log = on ; All bricks good. Test file operation
+
+echo abc > $M0/aa
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; Kill one brick. Test file operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+echo abc > $M0/aa
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+#Accessing file should heal the file now
+getfattr -d -m. -e hex $M0/aa 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; All bricks good. Test entry operation
+
+TEST touch $M0/bb
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; All bricks good. Test metadata operation
+
+TEST chmod 0777 $M0/bb
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; Kill one brick. Test entry operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST touch $M0/cc
+EXPECT 4 get_pending_heal_count $V0 #two for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0 2>&1 > /dev/null
+getfattr -d -m. -e hex $M0/cc 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; Kill one brick. Test metadata operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST chmod 0777 $M0/cc
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0/cc 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+cleanup
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 3064af6..647e750 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -926,16 +926,19 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config)
 }
 
 gf_boolean_t
-ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty)
+ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx,
+                   uint64_t *dirty)
 {
 
     gf_boolean_t set_dirty = _gf_false;
 
     if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) {
+            if (!link->optimistic_changelog)
                 dirty[EC_DATA_TXN] = 1;
     }
 
     if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) {
+            if (!link->optimistic_changelog)
                 dirty[EC_METADATA_TXN] = 1;
     }
 
@@ -956,6 +959,7 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
     ec_lock_link_t *link = fop->data;
     ec_lock_t *lock = NULL;
     ec_inode_t *ctx;
+    gf_boolean_t release = _gf_false;
 
     lock = link->lock;
     parent = link->fop;
@@ -1049,6 +1053,26 @@ unlock:
     UNLOCK(&lock->loc.inode->lock);
 
     if (op_errno == 0) {
+        /* If the fop fails on any of the good bricks, it is important to mark
+         * it dirty and update versions right away if dirty was not set before.
+         */
+        if (lock->good_mask & ~(fop->good | fop->remaining)) {
+                release = _gf_true;
+        }
+
+        /* lock->release is a critical field that is checked and modified most
+         * of the time inside a locked region. This use here is safe because we
+         * are in a modifying fop and we currently don't allow two modifying
+         * fops to be processed concurrently, so no one else could be checking
+         * or modifying it.*/
+        if (link->update[0] && !link->dirty[0]) {
+                lock->release |= release;
+        }
+
+        if (link->update[1] && !link->dirty[1]) {
+                lock->release |= release;
+        }
+
         /* We don't allow the main fop to be executed on bricks that have not
          * succeeded the initial xattrop. */
         parent->mask &= fop->good;
@@ -1091,6 +1115,7 @@ void ec_get_size_version(ec_lock_link_t *link)
     ec_inode_t *ctx;
     ec_fop_data_t *fop;
     dict_t *dict = NULL;
+    ec_t   *ec = NULL;
     int32_t error = 0;
     gf_boolean_t getting_xattr;
     gf_boolean_t set_dirty = _gf_false;
@@ -1099,6 +1124,17 @@ void ec_get_size_version(ec_lock_link_t *link)
     lock = link->lock;
     ctx = lock->ctx;
     fop = link->fop;
+    ec  = fop->xl->private;
+
+    if (ec->optimistic_changelog &&
+        !(ec->node_mask & ~link->lock->good_mask) && !ec_is_data_fop (fop->id))
+            link->optimistic_changelog = _gf_true;
+
+    /* If ctx->have_info is false and lock->query is true, it means that we'll
+     * send the xattrop anyway, so we can use it to update dirty counts, even
+     * if it's not necessary to do it right now. */
+    if (!ctx->have_info && lock->query)
+            link->optimistic_changelog = _gf_false;
 
     set_dirty = ec_set_dirty_flag (link, ctx, dirty);
 
@@ -1709,6 +1745,13 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
         if (link->update[1]) {
             ctx->post_version[1]++;
         }
+        /* If the fop fails on any of the good bricks, it is important to mark
+         * it dirty and update versions right away. */
+        if (link->update[0] || link->update[1]) {
+                if (lock->good_mask & ~(fop->good | fop->remaining)) {
+                        lock->release = _gf_true;
+                }
+        }
     }
 
     ec_lock_update_good(lock, fop);
@@ -2024,9 +2067,13 @@ ec_update_info(ec_lock_link_t *link)
                     if (ctx->dirty[1] != 0) {
                         dirty[1] = -1;
                     }
+            } else {
+                    link->optimistic_changelog = _gf_false;
+                    ec_set_dirty_flag (link, ctx, dirty);
             }
             memset(ctx->dirty, 0, sizeof(ctx->dirty));
     }
+
     if ((version[0] != 0) || (version[1] != 0) ||
         (dirty[0] != 0) || (dirty[1] != 0)) {
         ec_update_size_version(link, version, size, dirty);
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index c3ec5cb..ddb9fab 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -184,6 +184,8 @@ struct _ec_lock_link
     struct list_head  owner_list;
     struct list_head  wait_list;
     gf_boolean_t      update[2];
+    gf_boolean_t      dirty[2];
+    gf_boolean_t      optimistic_changelog;
     loc_t            *base;
     uint64_t          size;
 };
@@ -271,7 +273,6 @@ struct _ec_cbk_data
     int32_t          op_errno;
     int32_t          count;
     uintptr_t        mask;
-    uint64_t         dirty[2];
 
     dict_t *         xdata;
     dict_t *         dict;
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index 37b3b78..878277f 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -696,6 +696,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
     ec_fop_data_t * fop = NULL;
     ec_cbk_data_t * cbk = NULL;
     int32_t idx = (int32_t)(uintptr_t)cookie;
+    uint64_t       dirty[2] = {0};
 
     VALIDATE_OR_GOTO(this, out);
     GF_VALIDATE_OR_GOTO(this->name, frame, out);
@@ -745,8 +746,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
 
                 goto out;
             }
-            ec_dict_del_array (xdata, EC_XATTR_DIRTY, cbk->dirty,
-                               EC_VERSION_SIZE);
+            ec_dict_del_array (xdata, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
         }
 
         ec_combine(cbk, ec_combine_lookup);
@@ -1141,7 +1141,9 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                 dict_t *xdata)
 {
         ec_fop_data_t *fop = NULL;
+        ec_lock_link_t *link = NULL;
         ec_cbk_data_t *cbk = NULL;
+        uint64_t       dirty[2] = {0};
         data_t *data;
         uint64_t *version;
         int32_t idx = (int32_t)(uintptr_t)cookie;
@@ -1177,8 +1179,14 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                     }
                 }
 
-                ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty,
+                ec_dict_del_array (xattr, EC_XATTR_DIRTY, dirty,
                                    EC_VERSION_SIZE);
+                link = fop->data;
+                if (link) {
+                        /*Keep a note of if the dirty is already set or not*/
+                        link->dirty[0] |= (dirty[0] != 0);
+                        link->dirty[1] |= (dirty[1] != 0);
+                }
         }
 
         if (xdata)
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index bcdb9fa..7eeff30 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -281,6 +281,8 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("shd-wait-qlength", ec->shd.wait_qlength,
                           options, uint32, failed);
 
+        GF_OPTION_RECONF ("optimistic-change-log", ec->optimistic_changelog,
+                          options, bool, failed);
         return 0;
 failed:
         return -1;
@@ -639,6 +641,7 @@ init (xlator_t *this)
 
     GF_OPTION_INIT ("shd-max-threads", ec->shd.max_threads, uint32, failed);
     GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
+    GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
 
     this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
     if (!this->itable)
@@ -1415,5 +1418,21 @@ struct volume_options options[] =
       .description = "This option can be used to control number of heals"
                      " that can wait in SHD per subvolume"
     },
-    { }
+    {   .key = {"optimistic-change-log"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "on",
+        .description =  "Set/Unset dirty flag for every update fop at the start"
+                        "of the fop. If OFF, this option impacts performance of"
+                        "entry  operations or metadata operations as it will"
+                        "set dirty flag at the start and unset it at the end of"
+                        "ALL update fop. If ON and all the bricks are good,"
+                        "dirty flag will be set at the start only for file fops"
+                        "For metadata and entry fops dirty flag will not be set"
+                        "at the start, if all the bricks are good. This does"
+                        "not impact performance for metadata operations and"
+                        "entry operation but has a very small window to miss"
+                        "marking entry as dirty in case it is required to be"
+                        "healed"
+    },
+    { .key = {NULL} }
 };
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
index 49af5c2..bded652 100644
--- a/xlators/cluster/ec/src/ec.h
+++ b/xlators/cluster/ec/src/ec.h
@@ -55,6 +55,7 @@ struct _ec
     gf_timer_t *      timer;
     gf_boolean_t      shutdown;
     gf_boolean_t      eager_lock;
+    gf_boolean_t      optimistic_changelog;
     uint32_t          background_heals;
     uint32_t          heal_wait_qlen;
     struct list_head  pending_fops;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 873ff99..36874f5 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3038,6 +3038,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .flags       = OPT_FLAG_CLIENT_OPT,
           .op_version  = GD_OP_VERSION_3_9_1,
         },
+        { .key        = "disperse.optimistic-change-log",
+          .voltype    = "cluster/disperse",
+          .type       = NO_DOC,
+          .op_version = GD_OP_VERSION_3_10_1,
+          .flags      = OPT_FLAG_CLIENT_OPT
+        },
         { .key         = NULL
         }
 };
-- 
2.9.3