e7a346
From 0f2adea7ae377ea2efbab388f3af7e2a048f5f68 Mon Sep 17 00:00:00 2001
e7a346
From: karthik-us <ksubrahm@redhat.com>
e7a346
Date: Wed, 17 Jan 2018 17:30:06 +0530
e7a346
Subject: [PATCH 190/201] cluster/afr: Adding option to take full file lock
e7a346
e7a346
Problem:
e7a346
In replica 3 volumes there is a possibility of ending up in a
e7a346
split-brain scenario when multiple clients write data to the same file
e7a346
at non-overlapping regions in parallel.
e7a346
e7a346
Scenario:
e7a346
- Initially all the copies are good and all the clients get the value
e7a346
  of data readables as all good.
e7a346
- Client C0 performs write W1 which fails on brick B0 and succeeds on
e7a346
  other two bricks.
e7a346
- C1 performs write W2 which fails on B1 and succeeds on other two bricks.
e7a346
- C2 performs write W3 which fails on B2 and succeeds on other two bricks.
e7a346
- All the 3 writes above happen in parallel and fall on different ranges
e7a346
  so afr takes granular locks and all the writes are performed in parallel.
e7a346
  Since each client had data-readables as good, it does not see the
e7a346
  file going into split-brain in the in_flight_split_brain check, and hence
e7a346
  performs the post-op, marking the pending xattrs. Now all the bricks
e7a346
  are being blamed by each other, ending up in split-brain.
e7a346
e7a346
Fix:
e7a346
Have an option to take either full lock or range lock on files while
e7a346
doing data transactions, to prevent the possibility of ending up in
e7a346
split brains. With this change, files take a full lock by default
e7a346
while doing IO. To go back to the old range locks, set
e7a346
"cluster.full-lock" to "no".
e7a346
e7a346
Upstream patch: https://review.gluster.org/#/c/19218/
e7a346
e7a346
> Change-Id: I7893fa33005328ed63daa2f7c35eeed7c5218962
e7a346
> BUG: 1535438
e7a346
> Signed-off-by: karthik-us <ksubrahm@redhat.com>
e7a346
e7a346
Change-Id: I4d8b1c90bfff8f597cf7f7e49a71f5f6eb19f986
e7a346
BUG: 1552414
e7a346
Signed-off-by: karthik-us <ksubrahm@redhat.com>
e7a346
Reviewed-on: https://code.engineering.redhat.com/gerrit/131966
e7a346
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e7a346
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
e7a346
---
e7a346
 libglusterfs/src/globals.h                      | 4 +++-
e7a346
 xlators/cluster/afr/src/afr-transaction.c       | 2 +-
e7a346
 xlators/cluster/afr/src/afr.c                   | 8 ++++++++
e7a346
 xlators/cluster/afr/src/afr.h                   | 5 +++--
e7a346
 xlators/mgmt/glusterd/src/glusterd-volume-set.c | 7 +++++++
e7a346
 5 files changed, 22 insertions(+), 4 deletions(-)
e7a346
e7a346
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
e7a346
index 6bbe3e6..8fd3318 100644
e7a346
--- a/libglusterfs/src/globals.h
e7a346
+++ b/libglusterfs/src/globals.h
e7a346
@@ -43,7 +43,7 @@
e7a346
  */
e7a346
 #define GD_OP_VERSION_MIN  1 /* MIN is the fresh start op-version, mostly
e7a346
                                 should not change */
e7a346
-#define GD_OP_VERSION_MAX  GD_OP_VERSION_3_13_1 /* MAX VERSION is the maximum
e7a346
+#define GD_OP_VERSION_MAX  GD_OP_VERSION_3_13_2 /* MAX VERSION is the maximum
e7a346
                                                   count in VME table, should
e7a346
                                                   keep changing with
e7a346
                                                   introduction of newer
e7a346
@@ -107,6 +107,8 @@
e7a346
 
e7a346
 #define GD_OP_VERSION_3_13_1   31301 /* Op-version for GlusterFS 3.13.1 */
e7a346
 
e7a346
+#define GD_OP_VERSION_3_13_2   31302 /* Op-version for GlusterFS 3.13.2 */
e7a346
+
e7a346
 #include "xlator.h"
e7a346
 
e7a346
 /* THIS */
e7a346
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
e7a346
index 97f9dd4..1c80c6b 100644
e7a346
--- a/xlators/cluster/afr/src/afr-transaction.c
e7a346
+++ b/xlators/cluster/afr/src/afr-transaction.c
e7a346
@@ -1991,7 +1991,7 @@ afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
e7a346
         inodelk = afr_get_inodelk (int_lock, int_lock->domain);
e7a346
         priv = this->private;
e7a346
 
e7a346
-        if (priv->arbiter_count &&
e7a346
+        if ((priv->arbiter_count || priv->full_lock) &&
e7a346
             local->transaction.type == AFR_DATA_TRANSACTION) {
e7a346
                 /*Lock entire file to avoid network split brains.*/
e7a346
                 inodelk->flock.l_len   = 0;
e7a346
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
e7a346
index d3aee77..9493fbb 100644
e7a346
--- a/xlators/cluster/afr/src/afr.c
e7a346
+++ b/xlators/cluster/afr/src/afr.c
e7a346
@@ -244,6 +244,7 @@ reconfigure (xlator_t *this, dict_t *options)
e7a346
                           out);
e7a346
         GF_OPTION_RECONF ("locking-scheme", priv->locking_scheme, options, str,
e7a346
                           out);
e7a346
+        GF_OPTION_RECONF ("full-lock", priv->full_lock, options, bool, out);
e7a346
         GF_OPTION_RECONF ("use-compound-fops", priv->use_compound_fops,
e7a346
                           options, bool,
e7a346
                           out);
e7a346
@@ -534,6 +535,7 @@ init (xlator_t *this)
e7a346
 
e7a346
         GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out);
e7a346
         GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out);
e7a346
+        GF_OPTION_INIT ("full-lock", priv->full_lock, bool, out);
e7a346
         GF_OPTION_INIT ("use-compound-fops", priv->use_compound_fops,
e7a346
                         bool, out);
e7a346
         GF_OPTION_INIT ("granular-entry-heal", priv->esh_granular, bool, out);
e7a346
@@ -1084,6 +1086,12 @@ struct volume_options options[] = {
e7a346
                          "stop being compatible with afr-v1, which helps afr "
e7a346
                          "be more granular while self-healing",
e7a346
         },
e7a346
+        { .key = {"full-lock"},
e7a346
+          .type = GF_OPTION_TYPE_BOOL,
e7a346
+          .default_value = "yes",
e7a346
+          .description = "If this option is disabled, then the IOs will take "
e7a346
+                         "range locks same as versions till 3.13.1."
e7a346
+        },
e7a346
         { .key = {"granular-entry-heal"},
e7a346
           .type = GF_OPTION_TYPE_BOOL,
e7a346
           .default_value = "no",
e7a346
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
e7a346
index c822221..b6f5388 100644
e7a346
--- a/xlators/cluster/afr/src/afr.h
e7a346
+++ b/xlators/cluster/afr/src/afr.h
e7a346
@@ -178,9 +178,10 @@ typedef struct _afr_private {
e7a346
 	void                   *pump_private;
e7a346
 	gf_boolean_t           use_afr_in_pump;
e7a346
 	char                   *locking_scheme;
e7a346
-        gf_boolean_t            esh_granular;
e7a346
+        gf_boolean_t           full_lock;
e7a346
+        gf_boolean_t           esh_granular;
e7a346
         gf_boolean_t           consistent_io;
e7a346
-        gf_boolean_t            use_compound_fops;
e7a346
+        gf_boolean_t           use_compound_fops;
e7a346
 } afr_private_t;
e7a346
 
e7a346
 
e7a346
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
e7a346
index b603c7f..8d3407d 100644
e7a346
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
e7a346
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
e7a346
@@ -1507,6 +1507,13 @@ struct volopt_map_entry glusterd_volopt_map[] = {
e7a346
           .flags      = OPT_FLAG_CLIENT_OPT
e7a346
         },
e7a346
 
e7a346
+        { .key        = "cluster.full-lock",
e7a346
+          .voltype    = "cluster/replicate",
e7a346
+          .type       = NO_DOC,
e7a346
+          .op_version = GD_OP_VERSION_3_13_2,
e7a346
+          .flags      = OPT_FLAG_CLIENT_OPT
e7a346
+        },
e7a346
+
e7a346
         /* stripe xlator options */
e7a346
         { .key         = "cluster.stripe-block-size",
e7a346
           .voltype     = "cluster/stripe",
e7a346
-- 
e7a346
1.8.3.1
e7a346