|
|
d1681e |
From 0f2adea7ae377ea2efbab388f3af7e2a048f5f68 Mon Sep 17 00:00:00 2001
|
|
|
d1681e |
From: karthik-us <ksubrahm@redhat.com>
|
|
|
d1681e |
Date: Wed, 17 Jan 2018 17:30:06 +0530
|
|
|
d1681e |
Subject: [PATCH 190/201] cluster/afr: Adding option to take full file lock
|
|
|
d1681e |
|
|
|
d1681e |
Problem:
|
|
|
d1681e |
In replica 3 volumes there is a possibility of ending up in a split
|
|
|
d1681e |
brain scenario, when multiple clients are writing data on the same file
|
|
|
d1681e |
at non-overlapping regions in parallel.
|
|
|
d1681e |
|
|
|
d1681e |
Scenario:
|
|
|
d1681e |
- Initially all the copies are good and all the clients get the value
|
|
|
d1681e |
of data readables as all good.
|
|
|
d1681e |
- Client C0 performs write W1 which fails on brick B0 and succeeds on
|
|
|
d1681e |
other two bricks.
|
|
|
d1681e |
- C1 performs write W2 which fails on B1 and succeeds on other two bricks.
|
|
|
d1681e |
- C2 performs write W3 which fails on B2 and succeeds on other two bricks.
|
|
|
d1681e |
- All the 3 writes above happen in parallel and fall on different ranges
|
|
|
d1681e |
so afr takes granular locks and all the writes are performed in parallel.
|
|
|
d1681e |
Since each client had data-readables as good, it does not see
|
|
|
d1681e |
file going into split-brain in the in_flight_split_brain check, hence
|
|
|
d1681e |
performs the post-op marking the pending xattrs. Now all the bricks
|
|
|
d1681e |
are being blamed by each other, ending up in split-brain.
|
|
|
d1681e |
|
|
|
d1681e |
Fix:
|
|
|
d1681e |
Have an option to take either full lock or range lock on files while
|
|
|
d1681e |
doing data transactions, to prevent the possibility of ending up in
|
|
|
d1681e |
split brains. With this change, by default the files will take full
|
|
|
d1681e |
lock while doing IO. If you want to make use of the old range lock,
|
|
|
d1681e |
change the value of "cluster.full-lock" to "no".
|
|
|
d1681e |
|
|
|
d1681e |
Upstream patch: https://review.gluster.org/#/c/19218/
|
|
|
d1681e |
|
|
|
d1681e |
> Change-Id: I7893fa33005328ed63daa2f7c35eeed7c5218962
|
|
|
d1681e |
> BUG: 1535438
|
|
|
d1681e |
> Signed-off-by: karthik-us <ksubrahm@redhat.com>
|
|
|
d1681e |
|
|
|
d1681e |
Change-Id: I4d8b1c90bfff8f597cf7f7e49a71f5f6eb19f986
|
|
|
d1681e |
BUG: 1552414
|
|
|
d1681e |
Signed-off-by: karthik-us <ksubrahm@redhat.com>
|
|
|
d1681e |
Reviewed-on: https://code.engineering.redhat.com/gerrit/131966
|
|
|
d1681e |
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
|
d1681e |
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
|
|
|
d1681e |
---
|
|
|
d1681e |
libglusterfs/src/globals.h | 4 +++-
|
|
|
d1681e |
xlators/cluster/afr/src/afr-transaction.c | 2 +-
|
|
|
d1681e |
xlators/cluster/afr/src/afr.c | 8 ++++++++
|
|
|
d1681e |
xlators/cluster/afr/src/afr.h | 5 +++--
|
|
|
d1681e |
xlators/mgmt/glusterd/src/glusterd-volume-set.c | 7 +++++++
|
|
|
d1681e |
5 files changed, 22 insertions(+), 4 deletions(-)
|
|
|
d1681e |
|
|
|
d1681e |
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
|
|
|
d1681e |
index 6bbe3e6..8fd3318 100644
|
|
|
d1681e |
--- a/libglusterfs/src/globals.h
|
|
|
d1681e |
+++ b/libglusterfs/src/globals.h
|
|
|
d1681e |
@@ -43,7 +43,7 @@
|
|
|
d1681e |
*/
|
|
|
d1681e |
#define GD_OP_VERSION_MIN 1 /* MIN is the fresh start op-version, mostly
|
|
|
d1681e |
should not change */
|
|
|
d1681e |
-#define GD_OP_VERSION_MAX GD_OP_VERSION_3_13_1 /* MAX VERSION is the maximum
|
|
|
d1681e |
+#define GD_OP_VERSION_MAX GD_OP_VERSION_3_13_2 /* MAX VERSION is the maximum
|
|
|
d1681e |
count in VME table, should
|
|
|
d1681e |
keep changing with
|
|
|
d1681e |
introduction of newer
|
|
|
d1681e |
@@ -107,6 +107,8 @@
|
|
|
d1681e |
|
|
|
d1681e |
#define GD_OP_VERSION_3_13_1 31301 /* Op-version for GlusterFS 3.13.1 */
|
|
|
d1681e |
|
|
|
d1681e |
+#define GD_OP_VERSION_3_13_2 31302 /* Op-version for GlusterFS 3.13.2 */
|
|
|
d1681e |
+
|
|
|
d1681e |
#include "xlator.h"
|
|
|
d1681e |
|
|
|
d1681e |
/* THIS */
|
|
|
d1681e |
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
|
|
|
d1681e |
index 97f9dd4..1c80c6b 100644
|
|
|
d1681e |
--- a/xlators/cluster/afr/src/afr-transaction.c
|
|
|
d1681e |
+++ b/xlators/cluster/afr/src/afr-transaction.c
|
|
|
d1681e |
@@ -1991,7 +1991,7 @@ afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
|
|
|
d1681e |
inodelk = afr_get_inodelk (int_lock, int_lock->domain);
|
|
|
d1681e |
priv = this->private;
|
|
|
d1681e |
|
|
|
d1681e |
- if (priv->arbiter_count &&
|
|
|
d1681e |
+ if ((priv->arbiter_count || priv->full_lock) &&
|
|
|
d1681e |
local->transaction.type == AFR_DATA_TRANSACTION) {
|
|
|
d1681e |
/*Lock entire file to avoid network split brains.*/
|
|
|
d1681e |
inodelk->flock.l_len = 0;
|
|
|
d1681e |
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
|
|
|
d1681e |
index d3aee77..9493fbb 100644
|
|
|
d1681e |
--- a/xlators/cluster/afr/src/afr.c
|
|
|
d1681e |
+++ b/xlators/cluster/afr/src/afr.c
|
|
|
d1681e |
@@ -244,6 +244,7 @@ reconfigure (xlator_t *this, dict_t *options)
|
|
|
d1681e |
out);
|
|
|
d1681e |
GF_OPTION_RECONF ("locking-scheme", priv->locking_scheme, options, str,
|
|
|
d1681e |
out);
|
|
|
d1681e |
+ GF_OPTION_RECONF ("full-lock", priv->full_lock, options, bool, out);
|
|
|
d1681e |
GF_OPTION_RECONF ("use-compound-fops", priv->use_compound_fops,
|
|
|
d1681e |
options, bool,
|
|
|
d1681e |
out);
|
|
|
d1681e |
@@ -534,6 +535,7 @@ init (xlator_t *this)
|
|
|
d1681e |
|
|
|
d1681e |
GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out);
|
|
|
d1681e |
GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out);
|
|
|
d1681e |
+ GF_OPTION_INIT ("full-lock", priv->full_lock, bool, out);
|
|
|
d1681e |
GF_OPTION_INIT ("use-compound-fops", priv->use_compound_fops,
|
|
|
d1681e |
bool, out);
|
|
|
d1681e |
GF_OPTION_INIT ("granular-entry-heal", priv->esh_granular, bool, out);
|
|
|
d1681e |
@@ -1084,6 +1086,12 @@ struct volume_options options[] = {
|
|
|
d1681e |
"stop being compatible with afr-v1, which helps afr "
|
|
|
d1681e |
"be more granular while self-healing",
|
|
|
d1681e |
},
|
|
|
d1681e |
+ { .key = {"full-lock"},
|
|
|
d1681e |
+ .type = GF_OPTION_TYPE_BOOL,
|
|
|
d1681e |
+ .default_value = "yes",
|
|
|
d1681e |
+ .description = "If this option is disabled, then the IOs will take "
|
|
|
d1681e |
+ "range locks same as versions till 3.13.1."
|
|
|
d1681e |
+ },
|
|
|
d1681e |
{ .key = {"granular-entry-heal"},
|
|
|
d1681e |
.type = GF_OPTION_TYPE_BOOL,
|
|
|
d1681e |
.default_value = "no",
|
|
|
d1681e |
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
|
|
|
d1681e |
index c822221..b6f5388 100644
|
|
|
d1681e |
--- a/xlators/cluster/afr/src/afr.h
|
|
|
d1681e |
+++ b/xlators/cluster/afr/src/afr.h
|
|
|
d1681e |
@@ -178,9 +178,10 @@ typedef struct _afr_private {
|
|
|
d1681e |
void *pump_private;
|
|
|
d1681e |
gf_boolean_t use_afr_in_pump;
|
|
|
d1681e |
char *locking_scheme;
|
|
|
d1681e |
- gf_boolean_t esh_granular;
|
|
|
d1681e |
+ gf_boolean_t full_lock;
|
|
|
d1681e |
+ gf_boolean_t esh_granular;
|
|
|
d1681e |
gf_boolean_t consistent_io;
|
|
|
d1681e |
- gf_boolean_t use_compound_fops;
|
|
|
d1681e |
+ gf_boolean_t use_compound_fops;
|
|
|
d1681e |
} afr_private_t;
|
|
|
d1681e |
|
|
|
d1681e |
|
|
|
d1681e |
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
|
|
|
d1681e |
index b603c7f..8d3407d 100644
|
|
|
d1681e |
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
|
|
|
d1681e |
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
|
|
|
d1681e |
@@ -1507,6 +1507,13 @@ struct volopt_map_entry glusterd_volopt_map[] = {
|
|
|
d1681e |
.flags = OPT_FLAG_CLIENT_OPT
|
|
|
d1681e |
},
|
|
|
d1681e |
|
|
|
d1681e |
+ { .key = "cluster.full-lock",
|
|
|
d1681e |
+ .voltype = "cluster/replicate",
|
|
|
d1681e |
+ .type = NO_DOC,
|
|
|
d1681e |
+ .op_version = GD_OP_VERSION_3_13_2,
|
|
|
d1681e |
+ .flags = OPT_FLAG_CLIENT_OPT
|
|
|
d1681e |
+ },
|
|
|
d1681e |
+
|
|
|
d1681e |
/* stripe xlator options */
|
|
|
d1681e |
{ .key = "cluster.stripe-block-size",
|
|
|
d1681e |
.voltype = "cluster/stripe",
|
|
|
d1681e |
--
|
|
|
d1681e |
1.8.3.1
|
|
|
d1681e |
|