|
|
3604df |
From 7e14fd1769c2e5e189efbeebed997ebcf7a020c1 Mon Sep 17 00:00:00 2001
|
|
|
3604df |
From: Pranith Kumar K <pkarampu@redhat.com>
|
|
|
3604df |
Date: Thu, 2 Mar 2017 07:14:14 +0530
|
|
|
3604df |
Subject: [PATCH 302/302] cluster/ec: Introduce optimistic changelog in EC
|
|
|
3604df |
|
|
|
3604df |
Backport of https://review.gluster.org/16821
|
|
|
3604df |
|
|
|
3604df |
Problem: Fix to https://bugzilla.redhat.com/show_bug.cgi?id=1316873 has made
|
|
|
3604df |
changes to set dirty flag before every update fop, data or metadata, and unset
|
|
|
3604df |
it after successful operation. That makes some of the fops very slow such as
|
|
|
3604df |
entry operations or metadata operations.
|
|
|
3604df |
|
|
|
3604df |
Solution: File data operations are the only operation which take some time and
|
|
|
3604df |
setting dirty flag before a fop and unsetting it after serves the purpose as
|
|
|
3604df |
probability of failure of a fop is high when the time duration is more. For all
|
|
|
3604df |
the other operations, set dirty flag at the end of the fop, if any brick is
|
|
|
3604df |
down and needs heal.
|
|
|
3604df |
|
|
|
3604df |
Providing following option to choose between high performance or better heal
|
|
|
3604df |
marking for metadata and entry fops.
|
|
|
3604df |
|
|
|
3604df |
Set/Unset dirty flag for every update fop at the start of the fop. If OFF, this
|
|
|
3604df |
option impacts performance of entry operations or metadata operations as it
|
|
|
3604df |
will set dirty flag at the start and unset it at the end of ALL update fop. If
|
|
|
3604df |
ON and all the bricks are good, dirty flag will be set at the start only for
|
|
|
3604df |
file fops. For metadata and entry fops, dirty flag will not be set at the start,
|
|
|
3604df |
if all the bricks are good. This does not impact performance for metadata
|
|
|
3604df |
operations and entry operation but has a very small window to miss marking
|
|
|
3604df |
entry as dirty in case it is required to be healed.
|
|
|
3604df |
|
|
|
3604df |
Thanks to Xavi and Ashish for the design
|
|
|
3604df |
Picked the .t file from Ashish' patch https://review.gluster.org/16298
|
|
|
3604df |
|
|
|
3604df |
>BUG: 1408809
|
|
|
3604df |
>Change-Id: I3ce860063f0e2901e50754dcfc3e4ed22daf819f
|
|
|
3604df |
>Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
|
|
|
3604df |
|
|
|
3604df |
BUG: 1408655
|
|
|
3604df |
Change-Id: Ia8f2e9c5f39d8306ab8e8dcda7cf75a92519e3d7
|
|
|
3604df |
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
|
|
|
3604df |
Reviewed-on: https://code.engineering.redhat.com/gerrit/99318
|
|
|
3604df |
---
|
|
|
3604df |
libglusterfs/src/globals.h | 4 +-
|
|
|
3604df |
tests/basic/ec/ec-optimistic-changelog.t | 152 ++++++++++++++++++++++++
|
|
|
3604df |
xlators/cluster/ec/src/ec-common.c | 49 +++++++-
|
|
|
3604df |
xlators/cluster/ec/src/ec-data.h | 3 +-
|
|
|
3604df |
xlators/cluster/ec/src/ec-generic.c | 14 ++-
|
|
|
3604df |
xlators/cluster/ec/src/ec.c | 21 +++-
|
|
|
3604df |
xlators/cluster/ec/src/ec.h | 1 +
|
|
|
3604df |
xlators/mgmt/glusterd/src/glusterd-volume-set.c | 6 +
|
|
|
3604df |
8 files changed, 243 insertions(+), 7 deletions(-)
|
|
|
3604df |
create mode 100644 tests/basic/ec/ec-optimistic-changelog.t
|
|
|
3604df |
|
|
|
3604df |
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
|
|
|
3604df |
index f6164c6..bbddb21 100644
|
|
|
3604df |
--- a/libglusterfs/src/globals.h
|
|
|
3604df |
+++ b/libglusterfs/src/globals.h
|
|
|
3604df |
@@ -43,7 +43,7 @@
|
|
|
3604df |
*/
|
|
|
3604df |
#define GD_OP_VERSION_MIN 1 /* MIN is the fresh start op-version, mostly
|
|
|
3604df |
should not change */
|
|
|
3604df |
-#define GD_OP_VERSION_MAX GD_OP_VERSION_3_9_1 /* MAX VERSION is the maximum
|
|
|
3604df |
+#define GD_OP_VERSION_MAX GD_OP_VERSION_3_10_1 /* MAX VERSION is the maximum
|
|
|
3604df |
count in VME table, should
|
|
|
3604df |
keep changing with
|
|
|
3604df |
introduction of newer
|
|
|
3604df |
@@ -85,6 +85,8 @@
|
|
|
3604df |
|
|
|
3604df |
#define GD_OP_VERSION_3_9_1 30901 /* Op-version for GlusterFS 3.9.1 */
|
|
|
3604df |
|
|
|
3604df |
+#define GD_OP_VERSION_3_10_1 31001 /* Op-version for GlusterFS 3.10.1 */
|
|
|
3604df |
+
|
|
|
3604df |
#include "xlator.h"
|
|
|
3604df |
|
|
|
3604df |
/* THIS */
|
|
|
3604df |
diff --git a/tests/basic/ec/ec-optimistic-changelog.t b/tests/basic/ec/ec-optimistic-changelog.t
|
|
|
3604df |
new file mode 100644
|
|
|
3604df |
index 0000000..1277da6
|
|
|
3604df |
--- /dev/null
|
|
|
3604df |
+++ b/tests/basic/ec/ec-optimistic-changelog.t
|
|
|
3604df |
@@ -0,0 +1,152 @@
|
|
|
3604df |
+#!/bin/bash
|
|
|
3604df |
+
|
|
|
3604df |
+. $(dirname $0)/../../include.rc
|
|
|
3604df |
+. $(dirname $0)/../../volume.rc
|
|
|
3604df |
+
|
|
|
3604df |
+# This test checks optimistic-change-log option
|
|
|
3604df |
+
|
|
|
3604df |
+cleanup
|
|
|
3604df |
+TEST glusterd
|
|
|
3604df |
+TEST pidof glusterd
|
|
|
3604df |
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
|
|
|
3604df |
+TEST $CLI volume heal $V0 disable
|
|
|
3604df |
+
|
|
|
3604df |
+TEST $CLI volume set $V0 performance.stat-prefetch off
|
|
|
3604df |
+TEST $CLI volume set $V0 performance.write-behind off
|
|
|
3604df |
+TEST $CLI volume set $V0 performance.quick-read off
|
|
|
3604df |
+TEST $CLI volume set $V0 performance.read-ahead off
|
|
|
3604df |
+TEST $CLI volume set $V0 performance.io-cache off
|
|
|
3604df |
+TEST $CLI volume set $V0 disperse.background-heals 0
|
|
|
3604df |
+TEST $CLI volume set $V0 disperse.optimistic-change-log off
|
|
|
3604df |
+TEST $CLI volume set $V0 disperse.eager-lock off
|
|
|
3604df |
+TEST $CLI volume start $V0
|
|
|
3604df |
+
|
|
|
3604df |
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
|
|
|
3604df |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
3604df |
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 background-heals
|
|
|
3604df |
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 heal-wait-qlength
|
|
|
3604df |
+
|
|
|
3604df |
+TEST $CLI volume set $V0 disperse.background-heals 1
|
|
|
3604df |
+TEST touch $M0/a
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}0
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}1
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}2
|
|
|
3604df |
+
|
|
|
3604df |
+
|
|
|
3604df |
+
|
|
|
3604df |
+### optimistic-change-log = off ; All bricks good. Test file operation
|
|
|
3604df |
+echo abc > $M0/a
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+## optimistic-change-log = off ; Kill one brick . Test file operation
|
|
|
3604df |
+TEST kill_brick $V0 $H0 $B0/${V0}2
|
|
|
3604df |
+echo abc > $M0/a
|
|
|
3604df |
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
|
|
|
3604df |
+$CLI volume start $V0 force
|
|
|
3604df |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
3604df |
+#Accessing file should heal the file now
|
|
|
3604df |
+EXPECT "abc" cat $M0/a
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+## optimistic-change-log = off ; All bricks good. Test entry operation
|
|
|
3604df |
+TEST touch $M0/b
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+## optimistic-change-log = off ; All bricks good. Test metadata operation
|
|
|
3604df |
+TEST chmod 0777 $M0/b
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+## optimistic-change-log = off ; Kill one brick. Test entry operation
|
|
|
3604df |
+
|
|
|
3604df |
+TEST kill_brick $V0 $H0 $B0/${V0}2
|
|
|
3604df |
+TEST touch $M0/c
|
|
|
3604df |
+EXPECT 4 get_pending_heal_count $V0 #two for each active brick
|
|
|
3604df |
+$CLI volume start $V0 force
|
|
|
3604df |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
3604df |
+getfattr -d -m. -e hex $M0 2>&1 > /dev/null
|
|
|
3604df |
+getfattr -d -m. -e hex $M0/c 2>&1 > /dev/null
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+## optimistic-change-log = off ; Kill one brick. Test metadata operation
|
|
|
3604df |
+TEST kill_brick $V0 $H0 $B0/${V0}2
|
|
|
3604df |
+TEST chmod 0777 $M0/c
|
|
|
3604df |
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
|
|
|
3604df |
+$CLI volume start $V0 force
|
|
|
3604df |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
3604df |
+getfattr -d -m. -e hex $M0/c 2>&1 > /dev/null
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+TEST $CLI volume set $V0 disperse.optimistic-change-log on
|
|
|
3604df |
+
|
|
|
3604df |
+### optimistic-change-log = on ; All bricks good. Test file operation
|
|
|
3604df |
+
|
|
|
3604df |
+echo abc > $M0/aa
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+## optimistic-change-log = on ; Kill one brick. Test file operation
|
|
|
3604df |
+
|
|
|
3604df |
+TEST kill_brick $V0 $H0 $B0/${V0}2
|
|
|
3604df |
+echo abc > $M0/aa
|
|
|
3604df |
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
|
|
|
3604df |
+$CLI volume start $V0 force
|
|
|
3604df |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
3604df |
+#Accessing file should heal the file now
|
|
|
3604df |
+getfattr -d -m. -e hex $M0/aa 2>&1 > /dev/null
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+## optimistic-change-log = on ; All bricks good. Test entry operation
|
|
|
3604df |
+
|
|
|
3604df |
+TEST touch $M0/bb
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+## optimistic-change-log = on ; All bricks good. Test metadata operation
|
|
|
3604df |
+
|
|
|
3604df |
+TEST chmod 0777 $M0/bb
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+## optimistic-change-log = on ; Kill one brick. Test entry operation
|
|
|
3604df |
+
|
|
|
3604df |
+TEST kill_brick $V0 $H0 $B0/${V0}2
|
|
|
3604df |
+TEST touch $M0/cc
|
|
|
3604df |
+EXPECT 4 get_pending_heal_count $V0 #two for each active brick
|
|
|
3604df |
+$CLI volume start $V0 force
|
|
|
3604df |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
3604df |
+getfattr -d -m. -e hex $M0 2>&1 > /dev/null
|
|
|
3604df |
+getfattr -d -m. -e hex $M0/cc 2>&1 > /dev/null
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+## optimistic-change-log = on ; Kill one brick. Test metadata operation
|
|
|
3604df |
+
|
|
|
3604df |
+TEST kill_brick $V0 $H0 $B0/${V0}2
|
|
|
3604df |
+TEST chmod 0777 $M0/cc
|
|
|
3604df |
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
|
|
|
3604df |
+$CLI volume start $V0 force
|
|
|
3604df |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
3604df |
+getfattr -d -m. -e hex $M0/cc 2>&1 > /dev/null
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+
|
|
|
3604df |
+############################################################
|
|
|
3604df |
+
|
|
|
3604df |
+cleanup
|
|
|
3604df |
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
|
|
|
3604df |
index 3064af6..647e750 100644
|
|
|
3604df |
--- a/xlators/cluster/ec/src/ec-common.c
|
|
|
3604df |
+++ b/xlators/cluster/ec/src/ec-common.c
|
|
|
3604df |
@@ -926,16 +926,19 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config)
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
gf_boolean_t
|
|
|
3604df |
-ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty)
|
|
|
3604df |
+ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx,
|
|
|
3604df |
+ uint64_t *dirty)
|
|
|
3604df |
{
|
|
|
3604df |
|
|
|
3604df |
gf_boolean_t set_dirty = _gf_false;
|
|
|
3604df |
|
|
|
3604df |
if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) {
|
|
|
3604df |
+ if (!link->optimistic_changelog)
|
|
|
3604df |
dirty[EC_DATA_TXN] = 1;
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) {
|
|
|
3604df |
+ if (!link->optimistic_changelog)
|
|
|
3604df |
dirty[EC_METADATA_TXN] = 1;
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
@@ -956,6 +959,7 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
|
|
|
3604df |
ec_lock_link_t *link = fop->data;
|
|
|
3604df |
ec_lock_t *lock = NULL;
|
|
|
3604df |
ec_inode_t *ctx;
|
|
|
3604df |
+ gf_boolean_t release = _gf_false;
|
|
|
3604df |
|
|
|
3604df |
lock = link->lock;
|
|
|
3604df |
parent = link->fop;
|
|
|
3604df |
@@ -1049,6 +1053,26 @@ unlock:
|
|
|
3604df |
UNLOCK(&lock->loc.inode->lock);
|
|
|
3604df |
|
|
|
3604df |
if (op_errno == 0) {
|
|
|
3604df |
+ /* If the fop fails on any of the good bricks, it is important to mark
|
|
|
3604df |
+ * it dirty and update versions right away if dirty was not set before.
|
|
|
3604df |
+ */
|
|
|
3604df |
+ if (lock->good_mask & ~(fop->good | fop->remaining)) {
|
|
|
3604df |
+ release = _gf_true;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
+ /* lock->release is a critical field that is checked and modified most
|
|
|
3604df |
+ * of the time inside a locked region. This use here is safe because we
|
|
|
3604df |
+ * are in a modifying fop and we currently don't allow two modifying
|
|
|
3604df |
+ * fops to be processed concurrently, so no one else could be checking
|
|
|
3604df |
+ * or modifying it.*/
|
|
|
3604df |
+ if (link->update[0] && !link->dirty[0]) {
|
|
|
3604df |
+ lock->release |= release;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
+ if (link->update[1] && !link->dirty[1]) {
|
|
|
3604df |
+ lock->release |= release;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
/* We don't allow the main fop to be executed on bricks that have not
|
|
|
3604df |
* succeeded the initial xattrop. */
|
|
|
3604df |
parent->mask &= fop->good;
|
|
|
3604df |
@@ -1091,6 +1115,7 @@ void ec_get_size_version(ec_lock_link_t *link)
|
|
|
3604df |
ec_inode_t *ctx;
|
|
|
3604df |
ec_fop_data_t *fop;
|
|
|
3604df |
dict_t *dict = NULL;
|
|
|
3604df |
+ ec_t *ec = NULL;
|
|
|
3604df |
int32_t error = 0;
|
|
|
3604df |
gf_boolean_t getting_xattr;
|
|
|
3604df |
gf_boolean_t set_dirty = _gf_false;
|
|
|
3604df |
@@ -1099,6 +1124,17 @@ void ec_get_size_version(ec_lock_link_t *link)
|
|
|
3604df |
lock = link->lock;
|
|
|
3604df |
ctx = lock->ctx;
|
|
|
3604df |
fop = link->fop;
|
|
|
3604df |
+ ec = fop->xl->private;
|
|
|
3604df |
+
|
|
|
3604df |
+ if (ec->optimistic_changelog &&
|
|
|
3604df |
+ !(ec->node_mask & ~link->lock->good_mask) && !ec_is_data_fop (fop->id))
|
|
|
3604df |
+ link->optimistic_changelog = _gf_true;
|
|
|
3604df |
+
|
|
|
3604df |
+ /* If ctx->have_info is false and lock->query is true, it means that we'll
|
|
|
3604df |
+ * send the xattrop anyway, so we can use it to update dirty counts, even
|
|
|
3604df |
+ * if it's not necessary to do it right now. */
|
|
|
3604df |
+ if (!ctx->have_info && lock->query)
|
|
|
3604df |
+ link->optimistic_changelog = _gf_false;
|
|
|
3604df |
|
|
|
3604df |
set_dirty = ec_set_dirty_flag (link, ctx, dirty);
|
|
|
3604df |
|
|
|
3604df |
@@ -1709,6 +1745,13 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
|
|
|
3604df |
if (link->update[1]) {
|
|
|
3604df |
ctx->post_version[1]++;
|
|
|
3604df |
}
|
|
|
3604df |
+ /* If the fop fails on any of the good bricks, it is important to mark
|
|
|
3604df |
+ * it dirty and update versions right away. */
|
|
|
3604df |
+ if (link->update[0] || link->update[1]) {
|
|
|
3604df |
+ if (lock->good_mask & ~(fop->good | fop->remaining)) {
|
|
|
3604df |
+ lock->release = _gf_true;
|
|
|
3604df |
+ }
|
|
|
3604df |
+ }
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
ec_lock_update_good(lock, fop);
|
|
|
3604df |
@@ -2024,9 +2067,13 @@ ec_update_info(ec_lock_link_t *link)
|
|
|
3604df |
if (ctx->dirty[1] != 0) {
|
|
|
3604df |
dirty[1] = -1;
|
|
|
3604df |
}
|
|
|
3604df |
+ } else {
|
|
|
3604df |
+ link->optimistic_changelog = _gf_false;
|
|
|
3604df |
+ ec_set_dirty_flag (link, ctx, dirty);
|
|
|
3604df |
}
|
|
|
3604df |
memset(ctx->dirty, 0, sizeof(ctx->dirty));
|
|
|
3604df |
}
|
|
|
3604df |
+
|
|
|
3604df |
if ((version[0] != 0) || (version[1] != 0) ||
|
|
|
3604df |
(dirty[0] != 0) || (dirty[1] != 0)) {
|
|
|
3604df |
ec_update_size_version(link, version, size, dirty);
|
|
|
3604df |
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
|
|
|
3604df |
index c3ec5cb..ddb9fab 100644
|
|
|
3604df |
--- a/xlators/cluster/ec/src/ec-data.h
|
|
|
3604df |
+++ b/xlators/cluster/ec/src/ec-data.h
|
|
|
3604df |
@@ -184,6 +184,8 @@ struct _ec_lock_link
|
|
|
3604df |
struct list_head owner_list;
|
|
|
3604df |
struct list_head wait_list;
|
|
|
3604df |
gf_boolean_t update[2];
|
|
|
3604df |
+ gf_boolean_t dirty[2];
|
|
|
3604df |
+ gf_boolean_t optimistic_changelog;
|
|
|
3604df |
loc_t *base;
|
|
|
3604df |
uint64_t size;
|
|
|
3604df |
};
|
|
|
3604df |
@@ -271,7 +273,6 @@ struct _ec_cbk_data
|
|
|
3604df |
int32_t op_errno;
|
|
|
3604df |
int32_t count;
|
|
|
3604df |
uintptr_t mask;
|
|
|
3604df |
- uint64_t dirty[2];
|
|
|
3604df |
|
|
|
3604df |
dict_t * xdata;
|
|
|
3604df |
dict_t * dict;
|
|
|
3604df |
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
|
|
|
3604df |
index 37b3b78..878277f 100644
|
|
|
3604df |
--- a/xlators/cluster/ec/src/ec-generic.c
|
|
|
3604df |
+++ b/xlators/cluster/ec/src/ec-generic.c
|
|
|
3604df |
@@ -696,6 +696,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
|
|
|
3604df |
ec_fop_data_t * fop = NULL;
|
|
|
3604df |
ec_cbk_data_t * cbk = NULL;
|
|
|
3604df |
int32_t idx = (int32_t)(uintptr_t)cookie;
|
|
|
3604df |
+ uint64_t dirty[2] = {0};
|
|
|
3604df |
|
|
|
3604df |
VALIDATE_OR_GOTO(this, out);
|
|
|
3604df |
GF_VALIDATE_OR_GOTO(this->name, frame, out);
|
|
|
3604df |
@@ -745,8 +746,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
|
|
|
3604df |
|
|
|
3604df |
goto out;
|
|
|
3604df |
}
|
|
|
3604df |
- ec_dict_del_array (xdata, EC_XATTR_DIRTY, cbk->dirty,
|
|
|
3604df |
- EC_VERSION_SIZE);
|
|
|
3604df |
+ ec_dict_del_array (xdata, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
ec_combine(cbk, ec_combine_lookup);
|
|
|
3604df |
@@ -1141,7 +1141,9 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
3604df |
dict_t *xdata)
|
|
|
3604df |
{
|
|
|
3604df |
ec_fop_data_t *fop = NULL;
|
|
|
3604df |
+ ec_lock_link_t *link = NULL;
|
|
|
3604df |
ec_cbk_data_t *cbk = NULL;
|
|
|
3604df |
+ uint64_t dirty[2] = {0};
|
|
|
3604df |
data_t *data;
|
|
|
3604df |
uint64_t *version;
|
|
|
3604df |
int32_t idx = (int32_t)(uintptr_t)cookie;
|
|
|
3604df |
@@ -1177,8 +1179,14 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
3604df |
}
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
- ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty,
|
|
|
3604df |
+ ec_dict_del_array (xattr, EC_XATTR_DIRTY, dirty,
|
|
|
3604df |
EC_VERSION_SIZE);
|
|
|
3604df |
+ link = fop->data;
|
|
|
3604df |
+ if (link) {
|
|
|
3604df |
+ /*Keep a note of if the dirty is already set or not*/
|
|
|
3604df |
+ link->dirty[0] |= (dirty[0] != 0);
|
|
|
3604df |
+ link->dirty[1] |= (dirty[1] != 0);
|
|
|
3604df |
+ }
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
if (xdata)
|
|
|
3604df |
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
|
|
|
3604df |
index bcdb9fa..7eeff30 100644
|
|
|
3604df |
--- a/xlators/cluster/ec/src/ec.c
|
|
|
3604df |
+++ b/xlators/cluster/ec/src/ec.c
|
|
|
3604df |
@@ -281,6 +281,8 @@ reconfigure (xlator_t *this, dict_t *options)
|
|
|
3604df |
GF_OPTION_RECONF ("shd-wait-qlength", ec->shd.wait_qlength,
|
|
|
3604df |
options, uint32, failed);
|
|
|
3604df |
|
|
|
3604df |
+ GF_OPTION_RECONF ("optimistic-change-log", ec->optimistic_changelog,
|
|
|
3604df |
+ options, bool, failed);
|
|
|
3604df |
return 0;
|
|
|
3604df |
failed:
|
|
|
3604df |
return -1;
|
|
|
3604df |
@@ -639,6 +641,7 @@ init (xlator_t *this)
|
|
|
3604df |
|
|
|
3604df |
GF_OPTION_INIT ("shd-max-threads", ec->shd.max_threads, uint32, failed);
|
|
|
3604df |
GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
|
|
|
3604df |
+ GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
|
|
|
3604df |
|
|
|
3604df |
this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
|
|
|
3604df |
if (!this->itable)
|
|
|
3604df |
@@ -1415,5 +1418,21 @@ struct volume_options options[] =
|
|
|
3604df |
.description = "This option can be used to control number of heals"
|
|
|
3604df |
" that can wait in SHD per subvolume"
|
|
|
3604df |
},
|
|
|
3604df |
- { }
|
|
|
3604df |
+ { .key = {"optimistic-change-log"},
|
|
|
3604df |
+ .type = GF_OPTION_TYPE_BOOL,
|
|
|
3604df |
+ .default_value = "on",
|
|
|
3604df |
+ .description = "Set/Unset dirty flag for every update fop at the start"
|
|
|
3604df |
+ "of the fop. If OFF, this option impacts performance of"
|
|
|
3604df |
+ "entry operations or metadata operations as it will"
|
|
|
3604df |
+ "set dirty flag at the start and unset it at the end of"
|
|
|
3604df |
+ "ALL update fop. If ON and all the bricks are good,"
|
|
|
3604df |
+ "dirty flag will be set at the start only for file fops"
|
|
|
3604df |
+ "For metadata and entry fops dirty flag will not be set"
|
|
|
3604df |
+ "at the start, if all the bricks are good. This does"
|
|
|
3604df |
+ "not impact performance for metadata operations and"
|
|
|
3604df |
+ "entry operation but has a very small window to miss"
|
|
|
3604df |
+ "marking entry as dirty in case it is required to be"
|
|
|
3604df |
+ "healed"
|
|
|
3604df |
+ },
|
|
|
3604df |
+ { .key = {NULL} }
|
|
|
3604df |
};
|
|
|
3604df |
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
|
|
|
3604df |
index 49af5c2..bded652 100644
|
|
|
3604df |
--- a/xlators/cluster/ec/src/ec.h
|
|
|
3604df |
+++ b/xlators/cluster/ec/src/ec.h
|
|
|
3604df |
@@ -55,6 +55,7 @@ struct _ec
|
|
|
3604df |
gf_timer_t * timer;
|
|
|
3604df |
gf_boolean_t shutdown;
|
|
|
3604df |
gf_boolean_t eager_lock;
|
|
|
3604df |
+ gf_boolean_t optimistic_changelog;
|
|
|
3604df |
uint32_t background_heals;
|
|
|
3604df |
uint32_t heal_wait_qlen;
|
|
|
3604df |
struct list_head pending_fops;
|
|
|
3604df |
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
|
|
|
3604df |
index 873ff99..36874f5 100644
|
|
|
3604df |
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
|
|
|
3604df |
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
|
|
|
3604df |
@@ -3038,6 +3038,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
|
|
|
3604df |
.flags = OPT_FLAG_CLIENT_OPT,
|
|
|
3604df |
.op_version = GD_OP_VERSION_3_9_1,
|
|
|
3604df |
},
|
|
|
3604df |
+ { .key = "disperse.optimistic-change-log",
|
|
|
3604df |
+ .voltype = "cluster/disperse",
|
|
|
3604df |
+ .type = NO_DOC,
|
|
|
3604df |
+ .op_version = GD_OP_VERSION_3_10_1,
|
|
|
3604df |
+ .flags = OPT_FLAG_CLIENT_OPT
|
|
|
3604df |
+ },
|
|
|
3604df |
{ .key = NULL
|
|
|
3604df |
}
|
|
|
3604df |
};
|
|
|
3604df |
--
|
|
|
3604df |
2.9.3
|
|
|
3604df |
|