diff --git a/SOURCES/0633-mgmt-glusterd-Cleanup-memory-leaks-in-handshake.patch b/SOURCES/0633-mgmt-glusterd-Cleanup-memory-leaks-in-handshake.patch
new file mode 100644
index 0000000..d9b85f4
--- /dev/null
+++ b/SOURCES/0633-mgmt-glusterd-Cleanup-memory-leaks-in-handshake.patch
@@ -0,0 +1,54 @@
+From 9a3f1f2d4348e50919cd2cbb109ef1c7f7372c43 Mon Sep 17 00:00:00 2001
+From: Vijay Bellur
+Date: Tue, 4 Oct 2016 13:55:53 -0400
+Subject: [PATCH 633/642] mgmt/glusterd: Cleanup memory leaks in handshake
+
+Thanks to bingxuan.zhang at nokia dot com for the report and patch.
+
+> Change-Id: I994f82493fec7827f31592340af5bda83322f878
+> BUG: 1377584
+> Signed-off-by: Vijay Bellur
+> Reviewed-on: http://review.gluster.org/15612
+> NetBSD-regression: NetBSD Build System
+> Smoke: Gluster Build System
+> CentOS-regression: Gluster Build System
+> Reviewed-by: Atin Mukherjee
+> (cherry pick from commit f0c588e5e6fa1552325a31e0e01704ecf063c7e1)
+
+BUG: 1526363
+Change-Id: I994f82493fec7827f31592340af5bda83322f878
+Signed-off-by: Mohit Agrawal
+Reviewed-on: https://code.engineering.redhat.com/gerrit/125953
+Tested-by: RHGS Build Bot
+Reviewed-by: Atin Mukherjee
+---
+ xlators/mgmt/glusterd/src/glusterd-handshake.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c
+index a09437e..652c868 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-handshake.c
++++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c
+@@ -476,6 +476,10 @@ out:
+         peerinfo->max_op_version = client_max_op_version;
+         peerinfo->min_op_version = client_min_op_version;
+ 
++        if (dict)
++                dict_unref (dict);
++
++
+         return ret;
+ }
+ 
+@@ -930,6 +934,8 @@ fail:
+                                 (xdrproc_t)xdr_gf_getspec_rsp);
+         free (args.key);//malloced by xdr
+         free (rsp.spec);
++        if (args.xdata.xdata_val)
++                free (args.xdata.xdata_val);
+ 
+         return 0;
+ }
+-- 
+2.9.3
+
diff --git a/SOURCES/0634-glusterd-Fix-glusterd-mem-leaks.patch b/SOURCES/0634-glusterd-Fix-glusterd-mem-leaks.patch
new file mode 100644
index 0000000..ad64af8
--- /dev/null
+++ b/SOURCES/0634-glusterd-Fix-glusterd-mem-leaks.patch
@@ -0,0 +1,105 @@
+From 8e25c6abc4f71b0323f56efb4f001e1f1dc2482b Mon Sep 17 00:00:00 2001
+From: Mohit Agrawal
+Date: Thu, 7 Dec 2017 10:32:05 +0530
+Subject: [PATCH 634/642] glusterd: Fix glusterd mem leaks
+
+Problem: glusterd eats a huge amount of memory during volume set/stop/start.
+
+Solution: At the time of comparing graph topologies we create a graph and
+          populate key values in the dictionary; once the comparison is
+          finished we destroy that new graph. Graph construction itself
+          does not take any reference (for server xlators the reference
+          is taken in server_setvolume), so glusterd must take a reference
+          explicitly after constructing the graph used for the comparison.
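+
+A minimal sketch of the resulting call pattern (illustrative only;
+glusterfs_graph_destroy stands in for the existing cleanup path, and
+the real changes are in glusterd_check_topology_identical () below):
+
+    graph = glusterfs_graph_construct (fp);       /* takes no refs */
+    glusterd_graph_take_reference (graph->first); /* ref each options dict */
+    /* ... compare topologies, then ... */
+    glusterfs_graph_destroy (graph);              /* refs drop back cleanly */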
+
+> BUG: 1520245
+> Change-Id: I573133d57771b7dc431a04422c5001a06b7dda9a
+> Reviewed on https://review.gluster.org/#/c/18915/
+> Signed-off-by: Mohit Agrawal
+> (cherry pick from commit e016bcaf8171373cbc327faf42a6b2f2c5449b0e)
+
+BUG: 1526363
+Change-Id: I573133d57771b7dc431a04422c5001a06b7dda9a
+Signed-off-by: Mohit Agrawal
+Reviewed-on: https://code.engineering.redhat.com/gerrit/125952
+Reviewed-by: Atin Mukherjee
+Tested-by: RHGS Build Bot
+---
+ xlators/mgmt/glusterd/src/glusterd-handshake.c |  3 +++
+ xlators/mgmt/glusterd/src/glusterd-utils.c     | 34 ++++++++++++++++++++++++++
+ 2 files changed, 37 insertions(+)
+
+diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c
+index 652c868..b72df90 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-handshake.c
++++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c
+@@ -1221,6 +1221,9 @@ out:
+         if (rsp.hndsk.hndsk_val)
+                 GF_FREE (rsp.hndsk.hndsk_val);
+ 
++        if (args_dict)
++                dict_unref (args_dict);
++
+         return ret;
+ }
+ 
+diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
+index 7b57884..1e399f4 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
+@@ -8805,6 +8805,36 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo,
+ 
+         return ret;
+ }
++/*
++   This function takes a dict ref for every xlator in a graph. While
++   comparing graph topologies we construct a graph and populate key
++   values in its dictionaries, and destroy it once the comparison is
++   done. Graph construction takes no reference, so to avoid a leak
++   from a ref-counter underflow we need to call dict_ref here.
++
++*/
++
++void
++glusterd_graph_take_reference (xlator_t *tree)
++{        xlator_t *trav = tree;
++        xlator_t *prev = tree;
++
++        if (!tree) {
++                gf_msg ("parser", GF_LOG_ERROR, 0, LG_MSG_TREE_NOT_FOUND,
++                        "Translator tree not found");
++                return;
++        }
++
++        while (prev) {
++                trav = prev->next;
++                if (prev->options)
++                        dict_ref (prev->options);
++                prev = trav;
++        }
++        return;
++}
++
++
+ 
+ int
+ glusterd_check_topology_identical (const char *filename1,
+@@ -8852,11 +8882,15 @@ glusterd_check_topology_identical (const char *filename1,
+         if (grph1 == NULL)
+                 goto out;
+ 
++        glusterd_graph_take_reference (grph1->first);
++
+         /* create the graph for filename2 */
+         grph2 = glusterfs_graph_construct(fp2);
+         if (grph2 == NULL)
+                 goto out;
+ 
++        glusterd_graph_take_reference (grph2->first);
++
+         /* compare the graph topology */
+         *identical = is_graph_topology_equal(grph1, grph2);
+         ret = 0; /* SUCCESS */
+-- 
+2.9.3
+
diff --git a/SOURCES/0635-glusterd-Marking-all-the-brick-status-as-stopped-whe.patch b/SOURCES/0635-glusterd-Marking-all-the-brick-status-as-stopped-whe.patch
new file mode 100644
index 0000000..0009ab8
--- /dev/null
+++ b/SOURCES/0635-glusterd-Marking-all-the-brick-status-as-stopped-whe.patch
@@ -0,0 +1,164 @@
+From 778a8e338c40103ff7837e30413ed62d3c1a3c8c Mon Sep 17 00:00:00 2001
+From: Sanju Rakonde
+Date: Sat, 7 Oct 2017 03:33:40 +0530
+Subject: [PATCH 635/642] glusterd: Marking all the brick status as stopped when
+ a process goes down in brick multiplexing
+
+In a brick multiplexing environment, if a brick process goes down,
+i.e., if we kill it with SIGKILL, only the status of the brick for
+which the process originally came up changes to stopped;
+all the other brick statuses remain started. This is happening because
+the process was killed abruptly with the SIGKILL signal, so the signal
+handler wasn't invoked and further cleanup wasn't triggered.
+
+When we try to start a volume using force, it shows an error saying
+"Request timed out": since all the brickinfo->status values are still
+in started state, we keep waiting for one of the brick processes to
+come up, which is never going to happen since they were killed.
+
+To resolve this, in the disconnect event we check all the processes
+to find the one to which the disconnected brick belongs.
+Once we get the process, we call a function named
+glusterd_mark_bricks_stopped_by_proc() and send the brick_proc_t object as
+an argument.
+
+From the glusterd_brick_proc_t we can get all the bricks attached
+to that process, but these are duplicated copies. To get the original
+brickinfo we read the volinfo from each brick; in the volinfo we have
+the original brickinfo copies. We change brickinfo->status to
+stopped for all the bricks.
+
+>upstream patch : https://review.gluster.org/#/c/18444/
+
+Change-Id: Ifb9054b3ee081ef56b39b2903ae686984fe827e7
+BUG: 1526373
+Signed-off-by: Sanju Rakonde
+Reviewed-on: https://code.engineering.redhat.com/gerrit/125949
+Tested-by: RHGS Build Bot
+Reviewed-by: Atin Mukherjee
+---
+ .../glusterd/bug-1499509-disconnect-in-brick-mux.t | 27 ++++++++++
+ xlators/mgmt/glusterd/src/glusterd-handler.c       | 59 +++++++++++++++++++++-
+ 2 files changed, 85 insertions(+), 1 deletion(-)
+ create mode 100644 tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
+
+diff --git a/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t b/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
+new file mode 100644
+index 0000000..3c5bebe
+--- /dev/null
++++ b/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
+@@ -0,0 +1,27 @@
++#!/bin/bash
++
++. $(dirname $0)/../../include.rc
++. $(dirname $0)/../../volume.rc
++. 
$(dirname $0)/../../cluster.rc ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++ ++## Enable brick multiplexing ++TEST $CLI volume set all cluster.brick-multiplex on ++ ++## creating 1x3 replicated volumes ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}_{1..3} ++TEST $CLI volume create $V1 replica 3 $H0:$B1/${V1}_{1..3} ++ ++## Start the volume ++TEST $CLI volume start $V0 ++TEST $CLI volume start $V1 ++ ++kill -9 $(pgrep glusterfsd) ++ ++EXPECT 0 online_brick_count ++ ++cleanup +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index ae8ddde..350ef23 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -5987,6 +5987,31 @@ out: + + static int gd_stale_rpc_disconnect_log; + ++static int ++glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) { ++ glusterd_brickinfo_t *brickinfo = NULL; ++ glusterd_brickinfo_t *brickinfo_tmp = NULL; ++ glusterd_volinfo_t *volinfo = NULL; ++ int ret = -1; ++ ++ cds_list_for_each_entry (brickinfo, &brick_proc->bricks, brick_list) { ++ ret = glusterd_get_volinfo_from_brick (brickinfo->path, &volinfo); ++ if (ret) { ++ gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL, ++ "Failed to get volinfo from brick(%s)", ++ brickinfo->path); ++ goto out; ++ } ++ cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, brick_list) { ++ if (strcmp (brickinfo->path, brickinfo_tmp->path) == 0) ++ glusterd_set_brick_status (brickinfo_tmp, GF_BRICK_STOPPED); ++ } ++ } ++ return 0; ++out: ++ return ret; ++} ++ + int + __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, + rpc_clnt_event_t event, void *data) +@@ -5997,6 +6022,9 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, + glusterd_brickinfo_t *brickinfo = NULL; + glusterd_volinfo_t *volinfo = NULL; + xlator_t *this = NULL; ++ int temp = 0; ++ glusterd_brickinfo_t *brickinfo_tmp = NULL; ++ glusterd_brick_proc_t *brick_proc = NULL; + + brickid = mydata; + if (!brickid) +@@ -6097,7 +6125,36 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, + brickinfo->path); + } + +- glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED); ++ if (is_brick_mx_enabled()) { ++ cds_list_for_each_entry (brick_proc, &conf->brick_procs, ++ brick_proc_list) { ++ cds_list_for_each_entry (brickinfo_tmp, ++ &brick_proc->bricks, ++ brick_list) { ++ if (strcmp (brickinfo_tmp->path, ++ brickinfo->path) == 0) { ++ ret = glusterd_mark_bricks_stopped_by_proc ++ (brick_proc); ++ if (ret) { ++ gf_msg(THIS->name, ++ GF_LOG_ERROR, 0, ++ GD_MSG_BRICK_STOP_FAIL, ++ "Unable to stop " ++ "bricks of process" ++ " to which brick(%s)" ++ " belongs", ++ brickinfo->path); ++ goto out; ++ } ++ temp = 1; ++ break; ++ } ++ } ++ if (temp == 1) ++ break; ++ } ++ } else ++ glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED); + break; + + case RPC_CLNT_DESTROY: +-- +2.9.3 + diff --git a/SOURCES/0636-glusterd-clean-up-portmap-on-brick-disconnect.patch b/SOURCES/0636-glusterd-clean-up-portmap-on-brick-disconnect.patch new file mode 100644 index 0000000..97a5c25 --- /dev/null +++ b/SOURCES/0636-glusterd-clean-up-portmap-on-brick-disconnect.patch @@ -0,0 +1,171 @@ +From 7af9edd57ea7222dad59321d0779df00301d48bb Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Tue, 17 Oct 2017 21:32:44 +0530 +Subject: [PATCH 636/642] glusterd: clean up portmap on brick disconnect + +GlusterD's portmap entry for a brick is cleaned up when a PMAP_SIGNOUT event is +initiated by the brick 
process at the shutdown. But if the brick process crashes +or gets killed through SIGKILL then this event is not initiated and glusterd +ends up with a stale port. Since GlusterD's portmap traversal happens both ways, +forward for allocation and backward for registry search, there is a possibility +that glusterd might end up running with a stale port for a brick which +eventually will end up with clients to fail to connect to the bricks. + +Solution is to clean up the port entry in case the process is down as +part of the brick disconnect event. Although with this the handling +PMAP_SIGNOUT event becomes redundant in most of the cases, but this is +the safeguard method to avoid glusterd getting into the stale port +issues. + +>upstream patch : https://review.gluster.org/#/c/18541 + +Change-Id: I04c5be6d11e772ee4de16caf56dbb37d5c944303 +BUG: 1526371 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/125945 +Tested-by: RHGS Build Bot +--- + xlators/mgmt/glusterd/src/glusterd-handler.c | 25 +++++++++++++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-pmap.c | 26 +++++++++++++++++--------- + xlators/mgmt/glusterd/src/glusterd-pmap.h | 3 ++- + xlators/mgmt/glusterd/src/glusterd.c | 3 ++- + 4 files changed, 46 insertions(+), 11 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index 350ef23..83f0e7d 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -6025,6 +6025,8 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, + int temp = 0; + glusterd_brickinfo_t *brickinfo_tmp = NULL; + glusterd_brick_proc_t *brick_proc = NULL; ++ int32_t pid = -1; ++ char pidfile[PATH_MAX] = {0}; + + brickid = mydata; + if (!brickid) +@@ -6123,6 +6125,29 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, + "peer=%s;volume=%s;brick=%s", + brickinfo->hostname, volinfo->volname, + brickinfo->path); ++ /* In case of an abrupt shutdown of a brick PMAP_SIGNOUT ++ * event is not received by glusterd which can lead to a ++ * stale port entry in glusterd, so forcibly clean up ++ * the same if the process is not running ++ */ ++ GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, ++ brickinfo, conf); ++ if (!gf_is_service_running (pidfile, &pid)) { ++ ret = pmap_registry_remove ( ++ THIS, brickinfo->port, ++ brickinfo->path, ++ GF_PMAP_PORT_BRICKSERVER, ++ NULL, _gf_true); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_WARNING, ++ GD_MSG_PMAP_REGISTRY_REMOVE_FAIL, ++ 0, "Failed to remove pmap " ++ "registry for port %d for " ++ "brick %s", brickinfo->port, ++ brickinfo->path); ++ ret = 0; ++ } ++ } + } + + if (is_brick_mx_enabled()) { +diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c +index 7acee19..1789ef3 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c ++++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c +@@ -237,7 +237,8 @@ pmap_assign_port (xlator_t *this, int old_port, const char *path) + + if (old_port) { + ret = pmap_registry_remove (this, 0, path, +- GF_PMAP_PORT_BRICKSERVER, NULL); ++ GF_PMAP_PORT_BRICKSERVER, NULL, ++ _gf_false); + if (ret) { + gf_msg (this->name, GF_LOG_WARNING, + GD_MSG_PMAP_REGISTRY_REMOVE_FAIL, 0, "Failed to" +@@ -340,7 +341,8 @@ pmap_registry_extend (xlator_t *this, int port, const char *brickname) + + int + pmap_registry_remove (xlator_t *this, int port, const char *brickname, +- gf_pmap_port_type_t type, void *xprt) ++ gf_pmap_port_type_t type, 
void *xprt, ++ gf_boolean_t brick_disconnect) + { + struct pmap_registry *pmap = NULL; + int p = 0; +@@ -387,11 +389,16 @@ remove: + * can delete the entire entry. + */ + if (!pmap->ports[p].xprt) { +- brick_str = pmap->ports[p].brickname; +- if (brick_str) { +- while (*brick_str != '\0') { +- if (*(brick_str++) != ' ') { +- goto out; ++ /* If the signout call is being triggered by brick disconnect ++ * then clean up all the bricks (in case of brick mux) ++ */ ++ if (!brick_disconnect) { ++ brick_str = pmap->ports[p].brickname; ++ if (brick_str) { ++ while (*brick_str != '\0') { ++ if (*(brick_str++) != ' ') { ++ goto out; ++ } + } + } + } +@@ -578,14 +585,15 @@ __gluster_pmap_signout (rpcsvc_request_t *req) + goto fail; + } + rsp.op_ret = pmap_registry_remove (THIS, args.port, args.brick, +- GF_PMAP_PORT_BRICKSERVER, req->trans); ++ GF_PMAP_PORT_BRICKSERVER, req->trans, ++ _gf_false); + + ret = glusterd_get_brickinfo (THIS, args.brick, args.port, &brickinfo); + if (args.rdma_port) { + snprintf(brick_path, PATH_MAX, "%s.rdma", args.brick); + rsp.op_ret = pmap_registry_remove (THIS, args.rdma_port, + brick_path, GF_PMAP_PORT_BRICKSERVER, +- req->trans); ++ req->trans, _gf_false); + } + /* Update portmap status on brickinfo */ + if (brickinfo) +diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.h b/xlators/mgmt/glusterd/src/glusterd-pmap.h +index 9965a95..253b4cc 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-pmap.h ++++ b/xlators/mgmt/glusterd/src/glusterd-pmap.h +@@ -42,7 +42,8 @@ int pmap_registry_bind (xlator_t *this, int port, const char *brickname, + gf_pmap_port_type_t type, void *xprt); + int pmap_registry_extend (xlator_t *this, int port, const char *brickname); + int pmap_registry_remove (xlator_t *this, int port, const char *brickname, +- gf_pmap_port_type_t type, void *xprt); ++ gf_pmap_port_type_t type, void *xprt, ++ gf_boolean_t brick_disconnect); + int pmap_registry_search (xlator_t *this, const char *brickname, + gf_pmap_port_type_t type, gf_boolean_t destroy); + struct pmap_registry *pmap_registry_get (xlator_t *this); +diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c +index 34182b0..45587c0 100644 +--- a/xlators/mgmt/glusterd/src/glusterd.c ++++ b/xlators/mgmt/glusterd/src/glusterd.c +@@ -423,7 +423,8 @@ glusterd_rpcsvc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, + pthread_mutex_lock (&priv->xprt_lock); + list_del (&xprt->list); + pthread_mutex_unlock (&priv->xprt_lock); +- pmap_registry_remove (this, 0, NULL, GF_PMAP_PORT_NONE, xprt); ++ pmap_registry_remove (this, 0, NULL, GF_PMAP_PORT_NONE, xprt, ++ _gf_false); + break; + } + +-- +2.9.3 + diff --git a/SOURCES/0637-glusterd-fix-brick-restart-parallelism.patch b/SOURCES/0637-glusterd-fix-brick-restart-parallelism.patch new file mode 100644 index 0000000..eca49ac --- /dev/null +++ b/SOURCES/0637-glusterd-fix-brick-restart-parallelism.patch @@ -0,0 +1,283 @@ +From 580975c2850b2a2440a43bc05396d7ff55f42a9f Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Thu, 26 Oct 2017 14:26:30 +0530 +Subject: [PATCH 637/642] glusterd: fix brick restart parallelism + +glusterd's brick restart logic is not always sequential as there is +atleast three different ways how the bricks are restarted. +1. through friend-sm and glusterd_spawn_daemons () +2. through friend-sm and handling volume quorum action +3. through friend handshaking when there is a mimatch on quorum on +friend import. 
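+
+All three of these paths funnel into glusterd_brick_start (). For
+illustration, the guard this patch places around every such call is
+(condensed from the actual hunks below):
+
+        if (!brickinfo->start_triggered) {
+                pthread_mutex_lock (&brickinfo->restart_mutex);
+                {
+                        glusterd_brick_start (volinfo, brickinfo,
+                                              _gf_false);
+                }
+                pthread_mutex_unlock (&brickinfo->restart_mutex);
+        }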
+ +In a brick multiplexing setup, glusterd ended up trying to spawn the +same brick process couple of times as almost in fraction of milliseconds +two threads hit glusterd_brick_start () because of which glusterd didn't +have any choice of rejecting any one of them as for both the case brick +start criteria met. + +As a solution, it'd be better to control this madness by two different +flags, one is a boolean called start_triggered which indicates a brick +start has been triggered and it continues to be true till a brick dies +or killed, the second is a mutex lock to ensure for a particular brick +we don't end up getting into glusterd_brick_start () more than once at +same point of time. + +>upstream patch : https://review.gluster.org/#/c/18577 + +Change-Id: I292f1e58d6971e111725e1baea1fe98b890b43e2 +BUG: 1526368 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/125950 +Tested-by: RHGS Build Bot +--- + xlators/mgmt/glusterd/src/glusterd-handler.c | 24 ++++++++----- + xlators/mgmt/glusterd/src/glusterd-op-sm.c | 31 ++++++++++------- + xlators/mgmt/glusterd/src/glusterd-server-quorum.c | 15 +++++++-- + xlators/mgmt/glusterd/src/glusterd-utils.c | 39 +++++++++++++++++----- + xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 8 +++++ + xlators/mgmt/glusterd/src/glusterd.h | 2 ++ + 6 files changed, 87 insertions(+), 32 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index 83f0e7d..0f97573 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -5995,16 +5995,22 @@ glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) { + int ret = -1; + + cds_list_for_each_entry (brickinfo, &brick_proc->bricks, brick_list) { +- ret = glusterd_get_volinfo_from_brick (brickinfo->path, &volinfo); ++ ret = glusterd_get_volinfo_from_brick (brickinfo->path, ++ &volinfo); + if (ret) { +- gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL, +- "Failed to get volinfo from brick(%s)", +- brickinfo->path); ++ gf_msg (THIS->name, GF_LOG_ERROR, 0, ++ GD_MSG_VOLINFO_GET_FAIL, "Failed to get volinfo" ++ " from brick(%s)", brickinfo->path); + goto out; + } +- cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, brick_list) { +- if (strcmp (brickinfo->path, brickinfo_tmp->path) == 0) +- glusterd_set_brick_status (brickinfo_tmp, GF_BRICK_STOPPED); ++ cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, ++ brick_list) { ++ if (strcmp (brickinfo->path, ++ brickinfo_tmp->path) == 0) { ++ glusterd_set_brick_status (brickinfo_tmp, ++ GF_BRICK_STOPPED); ++ brickinfo_tmp->start_triggered = _gf_false; ++ } + } + } + return 0; +@@ -6178,8 +6184,10 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, + if (temp == 1) + break; + } +- } else ++ } else { + glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED); ++ brickinfo->start_triggered = _gf_false; ++ } + break; + + case RPC_CLNT_DESTROY: +diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +index 2ee9af4..51eae2d 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c ++++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +@@ -2346,18 +2346,25 @@ glusterd_start_bricks (glusterd_volinfo_t *volinfo) + GF_ASSERT (volinfo); + + cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { +- ret = glusterd_brick_start (volinfo, brickinfo, _gf_false); +- if (ret) { +- gf_msg (THIS->name, GF_LOG_ERROR, 0, +- 
GD_MSG_BRICK_DISCONNECTED, +- "Failed to start %s:%s for %s", +- brickinfo->hostname, brickinfo->path, +- volinfo->volname); +- gf_event (EVENT_BRICK_START_FAILED, +- "peer=%s;volume=%s;brick=%s", +- brickinfo->hostname, volinfo->volname, +- brickinfo->path); +- goto out; ++ if (!brickinfo->start_triggered) { ++ pthread_mutex_lock (&brickinfo->restart_mutex); ++ { ++ ret = glusterd_brick_start (volinfo, brickinfo, ++ _gf_false); ++ } ++ pthread_mutex_unlock (&brickinfo->restart_mutex); ++ if (ret) { ++ gf_msg (THIS->name, GF_LOG_ERROR, 0, ++ GD_MSG_BRICK_DISCONNECTED, ++ "Failed to start %s:%s for %s", ++ brickinfo->hostname, brickinfo->path, ++ volinfo->volname); ++ gf_event (EVENT_BRICK_START_FAILED, ++ "peer=%s;volume=%s;brick=%s", ++ brickinfo->hostname, volinfo->volname, ++ brickinfo->path); ++ goto out; ++ } + } + + } +diff --git a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c +index b37f808..d6af0d9 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c ++++ b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c +@@ -353,10 +353,19 @@ glusterd_do_volume_quorum_action (xlator_t *this, glusterd_volinfo_t *volinfo, + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { + if (!glusterd_is_local_brick (this, volinfo, brickinfo)) + continue; +- if (quorum_status == DOESNT_MEET_QUORUM) ++ if (quorum_status == DOESNT_MEET_QUORUM) { + glusterd_brick_stop (volinfo, brickinfo, _gf_false); +- else +- glusterd_brick_start (volinfo, brickinfo, _gf_false); ++ } else { ++ if (!brickinfo->start_triggered) { ++ pthread_mutex_lock (&brickinfo->restart_mutex); ++ { ++ glusterd_brick_start (volinfo, ++ brickinfo, ++ _gf_false); ++ } ++ pthread_mutex_unlock (&brickinfo->restart_mutex); ++ } ++ } + } + volinfo->quorum_status = quorum_status; + if (quorum_status == MEETS_QUORUM) { +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 1e399f4..6288683 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -1049,7 +1049,7 @@ glusterd_brickinfo_new (glusterd_brickinfo_t **brickinfo) + goto out; + + CDS_INIT_LIST_HEAD (&new_brickinfo->brick_list); +- ++ pthread_mutex_init (&new_brickinfo->restart_mutex, NULL); + *brickinfo = new_brickinfo; + + ret = 0; +@@ -2429,7 +2429,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo, + (void) sys_unlink (pidfile); + + brickinfo->status = GF_BRICK_STOPPED; +- ++ brickinfo->start_triggered = _gf_false; + if (del_brick) + glusterd_delete_brick (volinfo, brickinfo); + out: +@@ -5764,13 +5764,14 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo, + * three different triggers for an attempt to start the brick process + * due to the quorum handling code in glusterd_friend_sm. 
+ */ +- if (brickinfo->status == GF_BRICK_STARTING) { ++ if (brickinfo->status == GF_BRICK_STARTING || ++ brickinfo->start_triggered) { + gf_msg_debug (this->name, 0, "brick %s is already in starting " + "phase", brickinfo->path); + ret = 0; + goto out; + } +- ++ brickinfo->start_triggered = _gf_true; + GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf); + if (gf_is_service_running (pidfile, &pid)) { + if (brickinfo->status != GF_BRICK_STARTING && +@@ -5883,6 +5884,9 @@ run: + } + + out: ++ if (ret && brickinfo) { ++ brickinfo->start_triggered = _gf_false; ++ } + gf_msg_debug (this->name, 0, "returning %d ", ret); + return ret; + } +@@ -5944,11 +5948,19 @@ glusterd_restart_bricks (glusterd_conf_t *conf) + start_svcs = _gf_true; + glusterd_svcs_manager (NULL); + } +- + cds_list_for_each_entry (brickinfo, &volinfo->bricks, + brick_list) { +- glusterd_brick_start (volinfo, brickinfo, +- _gf_false); ++ if (!brickinfo->start_triggered) { ++ pthread_mutex_lock ++ (&brickinfo->restart_mutex); ++ { ++ glusterd_brick_start ++ (volinfo, brickinfo, ++ _gf_false); ++ } ++ pthread_mutex_unlock ++ (&brickinfo->restart_mutex); ++ } + } + ret = glusterd_store_volinfo + (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE); +@@ -5987,8 +5999,17 @@ glusterd_restart_bricks (glusterd_conf_t *conf) + "volume %s", volinfo->volname); + cds_list_for_each_entry (brickinfo, &volinfo->bricks, + brick_list) { +- glusterd_brick_start (volinfo, brickinfo, +- _gf_false); ++ if (!brickinfo->start_triggered) { ++ pthread_mutex_lock ++ (&brickinfo->restart_mutex); ++ { ++ glusterd_brick_start ++ (volinfo, brickinfo, ++ _gf_false); ++ } ++ pthread_mutex_unlock ++ (&brickinfo->restart_mutex); ++ } + } + ret = glusterd_store_volinfo + (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE); +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +index 0c985db..4e410ce 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +@@ -2505,6 +2505,14 @@ glusterd_start_volume (glusterd_volinfo_t *volinfo, int flags, + GF_ASSERT (volinfo); + + cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { ++ /* Mark start_triggered to false so that in case if this brick ++ * was brought down through gf_attach utility, the ++ * brickinfo->start_triggered wouldn't have been updated to ++ * _gf_false ++ */ ++ if (flags & GF_CLI_FLAG_OP_FORCE) { ++ brickinfo->start_triggered = _gf_false; ++ } + ret = glusterd_brick_start (volinfo, brickinfo, wait); + /* If 'force' try to start all bricks regardless of success or + * failure +diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h +index 0beb6e1..fa39201 100644 +--- a/xlators/mgmt/glusterd/src/glusterd.h ++++ b/xlators/mgmt/glusterd/src/glusterd.h +@@ -228,6 +228,8 @@ struct glusterd_brickinfo { + */ + uint16_t group; + gf_boolean_t port_registered; ++ gf_boolean_t start_triggered; ++ pthread_mutex_t restart_mutex; + }; + + typedef struct glusterd_brickinfo glusterd_brickinfo_t; +-- +2.9.3 + diff --git a/SOURCES/0638-glusterd-Free-up-svc-conn-on-volume-delete.patch b/SOURCES/0638-glusterd-Free-up-svc-conn-on-volume-delete.patch new file mode 100644 index 0000000..450e51b --- /dev/null +++ b/SOURCES/0638-glusterd-Free-up-svc-conn-on-volume-delete.patch @@ -0,0 +1,41 @@ +From 2cf7f8a0216b5364e2f96504a45d66bee964a4cc Mon Sep 17 00:00:00 2001 +From: moagrawa +Date: Fri, 15 Dec 2017 17:21:59 +0530 +Subject: [PATCH 638/642] glusterd: Free up svc->conn on 
volume delete
+
+The snapd daemon is maintained on a per-volume basis, and on a volume
+delete we should destroy the rpc connection established for it.
+
+Change-Id: Id1440e39da07b990fdb9b207df18da04b1ca8014
+> BUG: 1522775
+> Signed-off-by: Atin Mukherjee
+> Reviewed on https://review.gluster.org/18957
+> (Cherry pick from commit 36ce4c614a3391043a3417aa061d0aa16e60b2d3)
+
+BUG: 1526363
+Signed-off-by: moagrawa
+Change-Id: I82e8d44dc2b33335bd632414a564be53100f0b70
+Reviewed-on: https://code.engineering.redhat.com/gerrit/125978
+Tested-by: RHGS Build Bot
+Reviewed-by: Atin Mukherjee
+---
+ xlators/mgmt/glusterd/src/glusterd-utils.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
+index 6288683..b594b9e 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
+@@ -994,6 +994,9 @@ glusterd_volinfo_delete (glusterd_volinfo_t *volinfo)
+         if (volinfo->rebal.dict)
+                 dict_unref (volinfo->rebal.dict);
+ 
++        /* Destroy the connection object for per volume svc daemons */
++        glusterd_conn_term (&volinfo->snapd.svc.conn);
++
+         gf_store_handle_destroy (volinfo->quota_conf_shandle);
+ 
+         glusterd_auth_cleanup (volinfo);
+-- 
+2.9.3
+
diff --git a/SOURCES/0639-glusterd-introduce-timer-in-mgmt_v3_lock.patch b/SOURCES/0639-glusterd-introduce-timer-in-mgmt_v3_lock.patch
new file mode 100644
index 0000000..daa6f08
--- /dev/null
+++ b/SOURCES/0639-glusterd-introduce-timer-in-mgmt_v3_lock.patch
@@ -0,0 +1,473 @@
+From 57e33cb14a264b0b728befa81955484f5842f09f Mon Sep 17 00:00:00 2001
+From: Gaurav Yadav
+Date: Thu, 5 Oct 2017 23:44:46 +0530
+Subject: [PATCH 639/642] glusterd: introduce timer in mgmt_v3_lock
+
+Problem:
+In a multinode environment, if two of the op-sm transactions
+are initiated on one of the receiver nodes at the same time,
+there is a possibility that glusterd may end up holding a
+stale lock.
+ +Solution: +During mgmt_v3_lock a registration is made to gf_timer_call_after +which release the lock after certain period of time + +>mainline patch : https://review.gluster.org/#/c/18437 + +Change-Id: I16cc2e5186a2e8a5e35eca2468b031811e093843 +BUG: 1526372 +Signed-off-by: Gaurav Yadav +Reviewed-on: https://code.engineering.redhat.com/gerrit/126026 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + extras/glusterd.vol.in | 1 + + libglusterfs/src/common-utils.h | 2 +- + libglusterfs/src/mem-types.h | 1 + + xlators/mgmt/glusterd/src/glusterd-locks.c | 220 +++++++++++++++++++++++++++-- + xlators/mgmt/glusterd/src/glusterd-locks.h | 13 ++ + xlators/mgmt/glusterd/src/glusterd.c | 27 +++- + xlators/mgmt/glusterd/src/glusterd.h | 2 + + 7 files changed, 247 insertions(+), 19 deletions(-) + +diff --git a/extras/glusterd.vol.in b/extras/glusterd.vol.in +index 957b277..5338aa2 100644 +--- a/extras/glusterd.vol.in ++++ b/extras/glusterd.vol.in +@@ -7,6 +7,7 @@ volume management + option transport.socket.read-fail-log off + option ping-timeout 0 + option event-threads 1 ++# option lock-timer 180 + # option transport.address-family inet6 + # option base-port 49152 + end-volume +diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h +index 1cff48b..d982b1d 100644 +--- a/libglusterfs/src/common-utils.h ++++ b/libglusterfs/src/common-utils.h +@@ -100,7 +100,7 @@ void trap (void); + #define GF_CLNT_INSECURE_PORT_CEILING (GF_IANA_PRIV_PORTS_START - 1) + #define GF_PORT_MAX 65535 + #define GF_PORT_ARRAY_SIZE ((GF_PORT_MAX + 7) / 8) +- ++#define GF_LOCK_TIMER 180 + #define GF_MINUTE_IN_SECONDS 60 + #define GF_HOUR_IN_SECONDS (60*60) + #define GF_DAY_IN_SECONDS (24*60*60) +diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h +index ac3f878..55b1630 100644 +--- a/libglusterfs/src/mem-types.h ++++ b/libglusterfs/src/mem-types.h +@@ -170,6 +170,7 @@ enum gf_common_mem_types_ { + gf_common_mt_lock_mig, + gf_common_mt_pthread_t, + gf_common_volfile_t, ++ gf_common_mt_mgmt_v3_lock_timer_t, + gf_common_mt_end + }; + #endif +diff --git a/xlators/mgmt/glusterd/src/glusterd-locks.c b/xlators/mgmt/glusterd/src/glusterd-locks.c +index 146092d..c7951b3 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-locks.c ++++ b/xlators/mgmt/glusterd/src/glusterd-locks.c +@@ -94,6 +94,50 @@ glusterd_mgmt_v3_lock_fini () + dict_unref (priv->mgmt_v3_lock); + } + ++/* Initialize the global mgmt_v3_timer lock list(dict) when ++ * glusterd is spawned */ ++int32_t ++glusterd_mgmt_v3_lock_timer_init () ++{ ++ int32_t ret = -1; ++ xlator_t *this = NULL; ++ glusterd_conf_t *priv = NULL; ++ ++ this = THIS; ++ GF_VALIDATE_OR_GOTO ("glusterd", this, out); ++ ++ priv = this->private; ++ GF_VALIDATE_OR_GOTO (this->name, priv, out); ++ ++ priv->mgmt_v3_lock_timer = dict_new (); ++ if (!priv->mgmt_v3_lock_timer) ++ goto out; ++ ++ ret = 0; ++out: ++ return ret; ++} ++ ++/* Destroy the global mgmt_v3_timer lock list(dict) when ++ * glusterd cleanup is performed */ ++void ++glusterd_mgmt_v3_lock_timer_fini () ++{ ++ xlator_t *this = NULL; ++ glusterd_conf_t *priv = NULL; ++ ++ this = THIS; ++ GF_VALIDATE_OR_GOTO ("glusterd", this, out); ++ ++ priv = this->private; ++ GF_VALIDATE_OR_GOTO (this->name, priv, out); ++ ++ if (priv->mgmt_v3_lock_timer) ++ dict_unref (priv->mgmt_v3_lock_timer); ++out: ++ return; ++} ++ + int32_t + glusterd_get_mgmt_v3_lock_owner (char *key, uuid_t *uuid) + { +@@ -513,17 +557,23 @@ int32_t + glusterd_mgmt_v3_lock (const char *name, uuid_t uuid, uint32_t *op_errno, + char 
*type) + { +- char key[PATH_MAX] = ""; +- int32_t ret = -1; +- glusterd_mgmt_v3_lock_obj *lock_obj = NULL; +- glusterd_conf_t *priv = NULL; +- gf_boolean_t is_valid = _gf_true; +- uuid_t owner = {0}; +- xlator_t *this = NULL; +- char *bt = NULL; ++ char key[PATH_MAX] = ""; ++ int32_t ret = -1; ++ glusterd_mgmt_v3_lock_obj *lock_obj = NULL; ++ glusterd_mgmt_v3_lock_timer *mgmt_lock_timer = NULL; ++ glusterd_conf_t *priv = NULL; ++ gf_boolean_t is_valid = _gf_true; ++ uuid_t owner = {0}; ++ xlator_t *this = NULL; ++ char *bt = NULL; ++ struct timespec delay = {0}; ++ char *key_dup = NULL; ++ glusterfs_ctx_t *mgmt_lock_timer_ctx = NULL; ++ xlator_t *mgmt_lock_timer_xl = NULL; + + this = THIS; + GF_ASSERT (this); ++ + priv = this->private; + GF_ASSERT (priv); + +@@ -594,6 +644,42 @@ glusterd_mgmt_v3_lock (const char *name, uuid_t uuid, uint32_t *op_errno, + goto out; + } + ++ mgmt_lock_timer = GF_CALLOC (1, sizeof(glusterd_mgmt_v3_lock_timer), ++ gf_common_mt_mgmt_v3_lock_timer_t); ++ ++ if (!mgmt_lock_timer) { ++ ret = -1; ++ goto out; ++ } ++ ++ mgmt_lock_timer->xl = THIS; ++ key_dup = gf_strdup (key); ++ delay.tv_sec = priv->mgmt_v3_lock_timeout; ++ delay.tv_nsec = 0; ++ ++ ret = -1; ++ mgmt_lock_timer_xl = mgmt_lock_timer->xl; ++ GF_VALIDATE_OR_GOTO (this->name, mgmt_lock_timer_xl, out); ++ ++ mgmt_lock_timer_ctx = mgmt_lock_timer_xl->ctx; ++ GF_VALIDATE_OR_GOTO (this->name, mgmt_lock_timer_ctx, out); ++ ++ mgmt_lock_timer->timer = gf_timer_call_after ++ (mgmt_lock_timer_ctx, delay, ++ gd_mgmt_v3_unlock_timer_cbk, ++ key_dup); ++ ++ ret = dict_set_bin (priv->mgmt_v3_lock_timer, key, mgmt_lock_timer, ++ sizeof (glusterd_mgmt_v3_lock_timer)); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_DICT_SET_FAILED, ++ "Unable to set timer in mgmt_v3 lock"); ++ GF_FREE (mgmt_lock_timer); ++ goto out; ++ } ++ ++ + /* Saving the backtrace into the pre-allocated buffer, ctx->btbuf*/ + if ((bt = gf_backtrace_save (NULL))) { + snprintf (key, sizeof (key), "debug.last-success-bt-%s-%s", +@@ -617,18 +703,99 @@ out: + return ret; + } + ++/* ++ * This call back will ensure to unlock the lock_obj, in case we hit a situation ++ * where unlocking failed and stale lock exist*/ ++void ++gd_mgmt_v3_unlock_timer_cbk (void *data) ++{ ++ xlator_t *this = NULL; ++ glusterd_conf_t *conf = NULL; ++ glusterd_mgmt_v3_lock_timer *mgmt_lock_timer = NULL; ++ char *key = NULL; ++ char *type = NULL; ++ char bt_key[PATH_MAX] = ""; ++ char name[PATH_MAX] = ""; ++ int32_t ret = -1; ++ glusterfs_ctx_t *mgmt_lock_timer_ctx = NULL; ++ xlator_t *mgmt_lock_timer_xl = NULL; ++ ++ this = THIS; ++ GF_VALIDATE_OR_GOTO ("glusterd", this, out); ++ ++ conf = this->private; ++ GF_VALIDATE_OR_GOTO (this->name, conf, out); ++ ++ gf_log (THIS->name, GF_LOG_INFO, "In gd_mgmt_v3_unlock_timer_cbk"); ++ GF_ASSERT (NULL != data); ++ key = (char *)data; ++ ++ dict_del (conf->mgmt_v3_lock, key); ++ ++ type = strrchr (key, '_'); ++ strncpy (name, key, strlen (key) - strlen (type) - 1); ++ ++ ret = snprintf (bt_key, PATH_MAX, "debug.last-success-bt-%s-%s", ++ name, type + 1); ++ if (ret != strlen ("debug.last-success-bt-") + strlen (name) + ++ strlen (type)) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_CREATE_KEY_FAIL, "Unable to create backtrace " ++ "key"); ++ goto out; ++ } ++ ++ dict_del (conf->mgmt_v3_lock, bt_key); ++ ++ ret = dict_get_bin (conf->mgmt_v3_lock_timer, key, ++ (void **)&mgmt_lock_timer); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_DICT_SET_FAILED, ++ "Unable to get lock owner in mgmt_v3 lock"); 
++ goto out; ++ } ++ ++out: ++ if (mgmt_lock_timer->timer) { ++ mgmt_lock_timer_xl = mgmt_lock_timer->xl; ++ GF_VALIDATE_OR_GOTO (this->name, mgmt_lock_timer_xl, ++ ret_function); ++ ++ mgmt_lock_timer_ctx = mgmt_lock_timer_xl->ctx; ++ GF_VALIDATE_OR_GOTO (this->name, mgmt_lock_timer_ctx, ++ ret_function); ++ ++ gf_timer_call_cancel (mgmt_lock_timer_ctx, ++ mgmt_lock_timer->timer); ++ GF_FREE(key); ++ dict_del (conf->mgmt_v3_lock_timer, bt_key); ++ mgmt_lock_timer->timer = NULL; ++ } ++ ++ret_function: ++ ++ return; ++ ++} ++ + int32_t + glusterd_mgmt_v3_unlock (const char *name, uuid_t uuid, char *type) + { +- char key[PATH_MAX] = ""; +- int32_t ret = -1; +- gf_boolean_t is_valid = _gf_true; +- glusterd_conf_t *priv = NULL; +- uuid_t owner = {0}; +- xlator_t *this = NULL; ++ char key[PATH_MAX] = ""; ++ char key_dup[PATH_MAX] = ""; ++ int32_t ret = -1; ++ gf_boolean_t is_valid = _gf_true; ++ glusterd_conf_t *priv = NULL; ++ glusterd_mgmt_v3_lock_timer *mgmt_lock_timer = NULL; ++ uuid_t owner = {0}; ++ xlator_t *this = NULL; ++ glusterfs_ctx_t *mgmt_lock_timer_ctx = NULL; ++ xlator_t *mgmt_lock_timer_xl = NULL; + + this = THIS; + GF_ASSERT (this); ++ + priv = this->private; + GF_ASSERT (priv); + +@@ -657,6 +824,7 @@ glusterd_mgmt_v3_unlock (const char *name, uuid_t uuid, char *type) + ret = -1; + goto out; + } ++ strncpy (key_dup, key, strlen(key)); + + gf_msg_debug (this->name, 0, + "Trying to release lock of %s %s for %s as %s", +@@ -690,6 +858,15 @@ glusterd_mgmt_v3_unlock (const char *name, uuid_t uuid, char *type) + /* Removing the mgmt_v3 lock from the global list */ + dict_del (priv->mgmt_v3_lock, key); + ++ ret = dict_get_bin (priv->mgmt_v3_lock_timer, key, ++ (void **)&mgmt_lock_timer); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_DICT_SET_FAILED, ++ "Unable to get mgmt lock key in mgmt_v3 lock"); ++ goto out; ++ } ++ + /* Remove the backtrace key as well */ + ret = snprintf (key, sizeof(key), "debug.last-success-bt-%s-%s", name, + type); +@@ -708,7 +885,22 @@ glusterd_mgmt_v3_unlock (const char *name, uuid_t uuid, char *type) + type, name); + + ret = 0; ++ /* Release owner refernce which was held during lock */ ++ if (mgmt_lock_timer->timer) { ++ ret = -1; ++ mgmt_lock_timer_xl = mgmt_lock_timer->xl; ++ GF_VALIDATE_OR_GOTO (this->name, mgmt_lock_timer_xl, out); ++ ++ mgmt_lock_timer_ctx = mgmt_lock_timer_xl->ctx; ++ GF_VALIDATE_OR_GOTO (this->name, mgmt_lock_timer_ctx, out); ++ ret = 0; ++ gf_timer_call_cancel (mgmt_lock_timer_ctx, ++ mgmt_lock_timer->timer); ++ dict_del (priv->mgmt_v3_lock_timer, key_dup); ++ mgmt_lock_timer->timer = NULL; ++ } + out: ++ + gf_msg_trace (this->name, 0, "Returning %d", ret); + return ret; + } +diff --git a/xlators/mgmt/glusterd/src/glusterd-locks.h b/xlators/mgmt/glusterd/src/glusterd-locks.h +index 437053d..226d5c6 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-locks.h ++++ b/xlators/mgmt/glusterd/src/glusterd-locks.h +@@ -14,6 +14,11 @@ typedef struct glusterd_mgmt_v3_lock_object_ { + uuid_t lock_owner; + } glusterd_mgmt_v3_lock_obj; + ++typedef struct glusterd_mgmt_v3_lock_timer_ { ++ gf_timer_t *timer; ++ xlator_t *xl; ++} glusterd_mgmt_v3_lock_timer; ++ + typedef struct glusterd_mgmt_v3_lock_valid_entities { + char *type; /* Entity type like vol, snap */ + gf_boolean_t default_value; /* The default value that * +@@ -29,6 +34,12 @@ void + glusterd_mgmt_v3_lock_fini (); + + int32_t ++glusterd_mgmt_v3_lock_timer_init (); ++ ++void ++glusterd_mgmt_v3_lock_timer_fini (); ++ ++int32_t + glusterd_get_mgmt_v3_lock_owner (char 
*volname, uuid_t *uuid); + + int32_t +@@ -44,4 +55,6 @@ glusterd_multiple_mgmt_v3_lock (dict_t *dict, uuid_t uuid, uint32_t *op_errno); + int32_t + glusterd_multiple_mgmt_v3_unlock (dict_t *dict, uuid_t uuid); + ++void ++gd_mgmt_v3_unlock_timer_cbk(void *data); + #endif +diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c +index 45587c0..71261af 100644 +--- a/xlators/mgmt/glusterd/src/glusterd.c ++++ b/xlators/mgmt/glusterd/src/glusterd.c +@@ -1852,12 +1852,21 @@ init (xlator_t *this) + if (ret) + goto out; + +- conf->base_port = GF_IANA_PRIV_PORTS_START; +- if (dict_get_uint32(this->options, "base-port", &conf->base_port) == 0) { ++ conf->base_port = GF_IANA_PRIV_PORTS_START; ++ if (dict_get_uint32(this->options, "base-port", ++ &conf->base_port) == 0) { ++ gf_msg (this->name, GF_LOG_INFO, 0, ++ GD_MSG_DICT_SET_FAILED, ++ "base-port override: %d", conf->base_port); ++ } ++ ++ conf->mgmt_v3_lock_timeout = GF_LOCK_TIMER; ++ if (dict_get_uint32 (this->options, "lock-timer", ++ &conf->mgmt_v3_lock_timeout) == 0) { + gf_msg (this->name, GF_LOG_INFO, 0, + GD_MSG_DICT_SET_FAILED, +- "base-port override: %d", conf->base_port); +- } ++ "lock-timer override: %d", conf->mgmt_v3_lock_timeout); ++ } + + /* Set option to run bricks on valgrind if enabled in glusterd.vol */ + conf->valgrind = _gf_false; +@@ -1881,6 +1890,7 @@ init (xlator_t *this) + + this->private = conf; + glusterd_mgmt_v3_lock_init (); ++ glusterd_mgmt_v3_lock_timer_init(); + glusterd_txn_opinfo_dict_init (); + glusterd_svcs_build (); + +@@ -2022,6 +2032,7 @@ fini (xlator_t *this) + gf_store_handle_destroy (conf->handle); + glusterd_sm_tr_log_delete (&conf->op_sm_log); + glusterd_mgmt_v3_lock_fini (); ++ glusterd_mgmt_v3_lock_timer_fini (); + glusterd_txn_opinfo_dict_fini (); + GF_FREE (conf); + +@@ -2140,6 +2151,14 @@ struct volume_options options[] = { + .type = GF_OPTION_TYPE_INT, + .description = "Sets the base port for portmap query" + }, ++ { .key = {"mgmt-v3-lock-timeout"}, ++ .type = GF_OPTION_TYPE_INT, ++ .max = 600, ++ .description = "Sets the mgmt-v3-lock-timeout for transactions." ++ "Specifes the default timeout value after which " ++ "lock acquired while performing transaction will " ++ "be released." 
++ }, + { .key = {"snap-brick-path"}, + .type = GF_OPTION_TYPE_STR, + .description = "directory where the bricks for the snapshots will be created" +diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h +index fa39201..d8a0a6f 100644 +--- a/xlators/mgmt/glusterd/src/glusterd.h ++++ b/xlators/mgmt/glusterd/src/glusterd.h +@@ -167,6 +167,7 @@ typedef struct { + * cluster with no + * transaction ids */ + ++ dict_t *mgmt_v3_lock_timer; + struct cds_list_head mount_specs; + gf_boolean_t valgrind; + pthread_t brick_thread; +@@ -188,6 +189,7 @@ typedef struct { + uint32_t generation; + int32_t workers; + uint32_t blockers; ++ uint32_t mgmt_v3_lock_timeout; + } glusterd_conf_t; + + +-- +2.9.3 + diff --git a/SOURCES/0640-dict-Don-t-expose-get_new_dict-dict_destroy.patch b/SOURCES/0640-dict-Don-t-expose-get_new_dict-dict_destroy.patch new file mode 100644 index 0000000..75de796 --- /dev/null +++ b/SOURCES/0640-dict-Don-t-expose-get_new_dict-dict_destroy.patch @@ -0,0 +1,439 @@ +From bb41d03cf8a907794749cd83266461b06c1b1172 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Wed, 6 Jan 2016 14:30:08 +0530 +Subject: [PATCH 640/642] dict: Don't expose get_new_dict/dict_destroy + +get_new_dict/dict_destroy is causing confusion where, dict_new/dict_destroy or +get_new_dict/dict_unref are used instead of dict_new/dict_unref. + +The downstream patch only covers the dictionary changes in glusterd and +cli codebase and skip the other parts as the bug is only tracked for +glusterd memory leak issues. + +>Reviewed-on: http://review.gluster.org/13183 +>Smoke: Gluster Build System +>CentOS-regression: Gluster Build System +>NetBSD-regression: NetBSD Build System +>Reviewed-by: Jeff Darcy +>Reviewed-by: Krutika Dhananjay + +Change-Id: I4cc69f5b6711d720823395e20fd624a0c6c1168c +BUG: 1526363 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/125957 +Tested-by: Atin Mukherjee +Tested-by: RHGS Build Bot +--- + cli/src/cli-cmd-parser.c | 44 ++++++++++++++-------------- + cli/src/cli-cmd-system.c | 6 ++-- + cli/src/cli-cmd-volume.c | 2 +- + cli/src/cli-rpc-ops.c | 4 +-- + cli/src/cli.c | 2 +- + libglusterfs/src/dict.h | 4 +-- + xlators/mgmt/glusterd/src/glusterd-geo-rep.c | 4 +-- + xlators/mgmt/glusterd/src/glusterd-op-sm.c | 4 +-- + xlators/mgmt/glusterd/src/glusterd-volgen.c | 12 ++++---- + 9 files changed, 40 insertions(+), 42 deletions(-) + +diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c +index 12717cd..0056581 100644 +--- a/cli/src/cli-cmd-parser.c ++++ b/cli/src/cli-cmd-parser.c +@@ -741,7 +741,7 @@ out: + if (ret) { + gf_log ("cli", GF_LOG_ERROR, "Unable to parse create volume CLI"); + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + } + + GF_FREE (trans_type); +@@ -817,7 +817,7 @@ cli_cmd_volume_reset_parse (const char **words, int wordcount, dict_t **options) + + out: + if (ret && dict) { +- dict_destroy (dict); ++ dict_unref (dict); + } + + return ret; +@@ -1138,7 +1138,7 @@ cli_cmd_inode_quota_parse (const char **words, int wordcount, dict_t **options) + out: + if (ret < 0) { + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + } + + return ret; +@@ -1477,7 +1477,7 @@ set_type: + out: + if (ret < 0) { + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + } + + return ret; +@@ -1748,8 +1748,8 @@ cli_cmd_volume_set_parse (struct cli_state *state, const char **words, + *options = dict; + + out: +- if (ret) +- dict_destroy (dict); ++ if (ret && dict) ++ dict_unref (dict); + + return ret; + } +@@ 
-1889,7 +1889,7 @@ out: + if (ret) { + gf_log ("cli", GF_LOG_ERROR, "Unable to parse add-brick CLI"); + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + } + + return ret; +@@ -1968,7 +1968,7 @@ out: + if (ret) { + gf_log ("cli", GF_LOG_ERROR, "Unable to parse tier CLI"); + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + } + + return ret; +@@ -2197,7 +2197,7 @@ out: + if (ret) { + gf_log ("cli", GF_LOG_ERROR, "Unable to parse remove-brick CLI"); + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + } + + GF_FREE (tmp_brick); +@@ -2392,7 +2392,7 @@ out: + if (ret) { + gf_log ("cli", GF_LOG_ERROR, "Unable to parse reset-brick CLI"); + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + } + + return ret; +@@ -2453,7 +2453,7 @@ cli_cmd_log_filename_parse (const char **words, int wordcount, dict_t **options) + + out: + if (ret && dict) +- dict_destroy (dict); ++ dict_unref (dict); + + return ret; + } +@@ -2510,7 +2510,7 @@ cli_cmd_log_level_parse (const char **words, int worcount, dict_t **options) + + out: + if (ret && dict) +- dict_destroy (dict); ++ dict_unref (dict); + + return ret; + } +@@ -2561,7 +2561,7 @@ cli_cmd_log_locate_parse (const char **words, int wordcount, dict_t **options) + + out: + if (ret && dict) +- dict_destroy (dict); ++ dict_unref (dict); + + return ret; + } +@@ -2615,7 +2615,7 @@ cli_cmd_log_rotate_parse (const char **words, int wordcount, dict_t **options) + + out: + if (ret && dict) +- dict_destroy (dict); ++ dict_unref (dict); + + return ret; + } +@@ -3057,7 +3057,7 @@ out: + GF_FREE (slave_temp); + if (ret) { + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + } else + *options = dict; + +@@ -3154,7 +3154,7 @@ cli_cmd_volume_profile_parse (const char **words, int wordcount, + *options = dict; + out: + if (ret && dict) +- dict_destroy (dict); ++ dict_unref (dict); + return ret; + } + +@@ -3343,7 +3343,7 @@ cli_cmd_volume_top_parse (const char **words, int wordcount, + *options = dict; + out: + if (ret && dict) +- dict_destroy (dict); ++ dict_unref (dict); + return ret; + } + +@@ -3556,7 +3556,7 @@ cli_cmd_volume_status_parse (const char **words, int wordcount, + + out: + if (ret && dict) +- dict_destroy (dict); ++ dict_unref (dict); + + return ret; + } +@@ -3644,7 +3644,7 @@ cli_cmd_volume_statedump_options_parse (const char **words, int wordcount, + out: + GF_FREE (tmp); + if (ret && dict) +- dict_destroy (dict); ++ dict_unref (dict); + if (ret) + gf_log ("cli", GF_LOG_ERROR, "Error parsing dumpoptions"); + return ret; +@@ -4134,7 +4134,7 @@ done: + + out: + if (ret && dict) +- dict_destroy (dict); ++ dict_unref (dict); + + return ret; + } +@@ -5458,7 +5458,7 @@ cli_cmd_snapshot_parse (const char **words, int wordcount, dict_t **options, + out: + if (ret) { + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + } else + *options = dict; + +@@ -5716,7 +5716,7 @@ out: + if (ret) { + gf_log ("cli", GF_LOG_ERROR, "Unable to parse bitrot command"); + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + } + + return ret; +diff --git a/cli/src/cli-cmd-system.c b/cli/src/cli-cmd-system.c +index ad6bb73..93aac0b 100644 +--- a/cli/src/cli-cmd-system.c ++++ b/cli/src/cli-cmd-system.c +@@ -66,7 +66,7 @@ cli_cmd_getspec_cbk (struct cli_state *state, struct cli_cmd_word *word, + out: + if (!proc && ret) { + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + if (wordcount > 1) + cli_out ("Fetching spec for volume %s failed", + (char *)words[2]); +@@ -109,7 +109,7 @@ cli_cmd_pmap_b2p_cbk (struct cli_state *state, struct 
cli_cmd_word *word, + out: + if (!proc && ret) { + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + if (wordcount > 1) + cli_out ("Fetching spec for volume %s failed", + (char *)words[3]); +@@ -188,7 +188,7 @@ make_seq_dict (int argc, char **argv) + } + + if (ret) { +- dict_destroy (dict); ++ dict_unref (dict); + dict = NULL; + } + +diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c +index 3ec39fd..927c802 100644 +--- a/cli/src/cli-cmd-volume.c ++++ b/cli/src/cli-cmd-volume.c +@@ -592,7 +592,7 @@ cli_cmd_volume_rename_cbk (struct cli_state *state, struct cli_cmd_word *word, + + out: + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + + if (ret) { + cli_cmd_sent_status_get (&sent); +diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c +index 7ded5f9..24037aa 100644 +--- a/cli/src/cli-rpc-ops.c ++++ b/cli/src/cli-rpc-ops.c +@@ -493,7 +493,7 @@ out: + cli_cmd_broadcast_response (ret); + + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + + return ret; + } +@@ -1127,7 +1127,7 @@ out: + cli_cmd_broadcast_response (ret); + + if (dict) +- dict_destroy (dict); ++ dict_unref (dict); + + free (rsp.dict.dict_val); + +diff --git a/cli/src/cli.c b/cli/src/cli.c +index e0d6b3e..33250d3 100644 +--- a/cli/src/cli.c ++++ b/cli/src/cli.c +@@ -607,7 +607,7 @@ cli_quotad_clnt_rpc_init (void) + out: + if (ret) { + if (rpc_opts) +- dict_destroy(rpc_opts); ++ dict_unref(rpc_opts); + } + return rpc; + } +diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h +index b5d9f3e..455391a 100644 +--- a/libglusterfs/src/dict.h ++++ b/libglusterfs/src/dict.h +@@ -156,10 +156,7 @@ char *data_to_str (data_t *data); + void *data_to_bin (data_t *data); + void *data_to_ptr (data_t *data); + +-data_t *get_new_data (); + data_t * data_copy (data_t *old); +-dict_t *get_new_dict_full (int size_hint); +-dict_t *get_new_dict (); + + int dict_foreach (dict_t *this, + int (*fn)(dict_t *this, +@@ -193,6 +190,7 @@ int dict_null_foreach_fn (dict_t *d, char *k, + int dict_remove_foreach_fn (dict_t *d, char *k, + data_t *v, void *tmp); + dict_t *dict_copy (dict_t *this, dict_t *new); ++dict_t *get_new_dict (void); + int dict_keys_join (void *value, int size, dict_t *dict, + int (*filter_fn)(char *key)); + +diff --git a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c +index fa3b151..907c29f 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c ++++ b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c +@@ -1976,7 +1976,7 @@ is_geo_rep_active (glusterd_volinfo_t *volinfo, char *slave, + ret = 0; + out: + if (confd) +- dict_destroy (confd); ++ dict_unref (confd); + return ret; + } + +@@ -2617,7 +2617,7 @@ fetch_data: + } + + if (confd) +- dict_destroy (confd); ++ dict_unref (confd); + + gf_msg_debug (this->name, 0, "Returning %d ", ret); + return ret; +diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +index 51eae2d..f034ae8 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c ++++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +@@ -4369,7 +4369,7 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) + if (ret) + goto out; + } +- dict_destroy (req_dict); ++ dict_unref (req_dict); + req_dict = dict_ref (dict); + } + break; +@@ -4395,7 +4395,7 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) + goto out; + } + +- dict_destroy (req_dict); ++ dict_unref (req_dict); + req_dict = dict_ref (dict); + } + break; +diff --git 
a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c +index 701cccf..859a932 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c +@@ -93,7 +93,7 @@ xlator_instantiate_va (const char *type, const char *format, va_list arg) + ret = xlator_set_type_virtual (xl, type); + if (ret) + goto error; +- xl->options = get_new_dict(); ++ xl->options = dict_new (); + if (!xl->options) + goto error; + xl->name = volname; +@@ -1052,7 +1052,7 @@ build_graph_generic (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + int ret = 0; + + if (mod_dict) { +- set_dict = dict_copy (volinfo->dict, NULL); ++ set_dict = dict_copy_with_ref (volinfo->dict, NULL); + if (!set_dict) + return -1; + dict_copy (mod_dict, set_dict); +@@ -1066,7 +1066,7 @@ build_graph_generic (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + ret = volgen_graph_set_options (graph, set_dict); + + if (mod_dict) +- dict_destroy (set_dict); ++ dict_unref (set_dict); + + return ret; + } +@@ -4728,7 +4728,7 @@ build_rebalance_volfile (glusterd_volinfo_t *volinfo, char *filepath, + return 0; + } + +- set_dict = dict_copy (volinfo->dict, NULL); ++ set_dict = dict_copy_with_ref (volinfo->dict, NULL); + if (!set_dict) + return -1; + +@@ -4770,7 +4770,7 @@ build_rebalance_volfile (glusterd_volinfo_t *volinfo, char *filepath, + out: + volgen_graph_free (&graph); + +- dict_destroy (set_dict); ++ dict_unref (set_dict); + + return ret; + } +@@ -5037,7 +5037,7 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict) + + out: + gf_msg_debug ("glusterd", 0, "Returning %d", ret); +- dict_destroy (set_dict); ++ dict_unref (set_dict); + + return ret; + } +-- +2.9.3 + diff --git a/SOURCES/0641-features-locks-Fix-memory-leaks.patch b/SOURCES/0641-features-locks-Fix-memory-leaks.patch new file mode 100644 index 0000000..9983cb7 --- /dev/null +++ b/SOURCES/0641-features-locks-Fix-memory-leaks.patch @@ -0,0 +1,140 @@ +From dcf9f6fd363820f4e9b7b15bd6fb631e9fa3e049 Mon Sep 17 00:00:00 2001 +From: Xavier Hernandez +Date: Mon, 20 Nov 2017 10:51:09 +0100 +Subject: [PATCH 641/642] features/locks: Fix memory leaks + +> Upstream patch: https://review.gluster.org/18812 + +Change-Id: Ic1d2e17a7d14389b6734d1b88bd28c0a2907bbd6 +BUG: 1526377 +Signed-off-by: Xavier Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/125954 +Reviewed-by: Pranith Kumar Karampuri +Tested-by: RHGS Build Bot +--- + xlators/features/locks/src/clear.c | 2 +- + xlators/features/locks/src/common.c | 8 +++++++- + xlators/features/locks/src/entrylk.c | 8 ++++---- + xlators/features/locks/src/inodelk.c | 7 ++++--- + xlators/features/locks/src/posix.c | 3 +++ + 5 files changed, 19 insertions(+), 9 deletions(-) + +diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c +index 640c6bb..fc1a150 100644 +--- a/xlators/features/locks/src/clear.c ++++ b/xlators/features/locks/src/clear.c +@@ -184,7 +184,7 @@ clrlk_clear_posixlk (xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args, + } else { + gcount++; + } +- GF_FREE (plock); ++ __destroy_lock(plock); + } + } + pthread_mutex_unlock (&pl_inode->mutex); +diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c +index 015464c..b929786 100644 +--- a/xlators/features/locks/src/common.c ++++ b/xlators/features/locks/src/common.c +@@ -436,7 +436,13 @@ pl_inode_get (xlator_t *this, inode_t *inode) + INIT_LIST_HEAD (&pl_inode->queued_locks); + gf_uuid_copy (pl_inode->gfid, inode->gfid); + +- 
__inode_ctx_put (inode, this, (uint64_t)(long)(pl_inode));
++ ret = __inode_ctx_put (inode, this, (uint64_t)(long)(pl_inode));
++ if (ret) {
++ pthread_mutex_destroy (&pl_inode->mutex);
++ GF_FREE (pl_inode);
++ pl_inode = NULL;
++ goto unlock;
++ }
+ }
+ unlock:
+ UNLOCK (&inode->lock);
+diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c
+index 783c57e..8cd6f85 100644
+--- a/xlators/features/locks/src/entrylk.c
++++ b/xlators/features/locks/src/entrylk.c
+@@ -672,15 +672,12 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
+ break;
+
+ default:
+- inode_unref (pinode->inode);
++ need_inode_unref = _gf_true;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unexpected case in entrylk (cmd=%d). Please file"
+ "a bug report at http://bugs.gluster.com", cmd);
+ goto out;
+ }
+- if (need_inode_unref)
+- inode_unref (pinode->inode);
+-
+ /* The following (extra) unref corresponds to the ref that
+ * was done at the time the lock was granted.
+ */
+@@ -688,6 +685,8 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
+ inode_unref (pinode->inode);
+
+ out:
++ if (need_inode_unref)
++ inode_unref (pinode->inode);
+
+ if (unwind) {
+ entrylk_trace_out (this, frame, volume, fd, loc, basename,
+@@ -848,6 +847,7 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
+ __pl_entrylk_unref (l);
+ }
+ pthread_mutex_unlock (&pinode->mutex);
++
+ inode_unref (pinode->inode);
+ }
+
+diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
+index 1564f26..f0aa766 100644
+--- a/xlators/features/locks/src/inodelk.c
++++ b/xlators/features/locks/src/inodelk.c
+@@ -600,9 +600,6 @@ out:
+ if (ctx)
+ pthread_mutex_unlock (&ctx->lock);
+
+- if (need_inode_unref)
+- inode_unref (pl_inode->inode);
+-
+ /* The following (extra) unref corresponds to the ref that
+ * was done at the time the lock was granted.
+ */
+@@ -611,6 +608,10 @@ out:
+ grant_blocked_inode_locks (this, pl_inode, dom);
+ }
+
++ if (need_inode_unref) {
++ inode_unref (pl_inode->inode);
++ }
++
+ return ret;
+ }
+
+diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
+index feca2f1..932831a 100644
+--- a/xlators/features/locks/src/posix.c
++++ b/xlators/features/locks/src/posix.c
+@@ -2482,6 +2482,7 @@ pl_forget (xlator_t *this,
+ list) {
+
+ list_del (&rw_req->list);
++ call_stub_destroy(rw_req->stub);
+ GF_FREE (rw_req);
+ }
+ }
+@@ -2565,6 +2566,8 @@ pl_forget (xlator_t *this,
+
+ }
+
++ pthread_mutex_destroy(&pl_inode->mutex);
++
+ GF_FREE (pl_inode);
+
+ return 0;
+--
+2.9.3
+
diff --git a/SOURCES/0642-gfapi-set-lkowner-in-glfd.patch b/SOURCES/0642-gfapi-set-lkowner-in-glfd.patch
new file mode 100644
index 0000000..d01caed
--- /dev/null
+++ b/SOURCES/0642-gfapi-set-lkowner-in-glfd.patch
@@ -0,0 +1,454 @@
+From dacb230663c9f6deeebc62139e675a594ec778f0 Mon Sep 17 00:00:00 2001
+From: Soumya Koduri
+Date: Tue, 17 Oct 2017 16:12:06 +0530
+Subject: [PATCH 642/642] gfapi: set lkowner in glfd
+
+We need a provision to be able to set lkowner (which is
+used to distinguish locks maintained by the server) in gfapi.
+Since the same lk_owner needs to be used to be able to
+flush the lock while closing the fd, store the lkowner
+in the glfd structure itself.
+
+A new API has been added to be able to set lkowner in glfd.
+
+Upstream reference :
+1.) This is a backport of the below mainline fixes -
+ https://review.gluster.org/#/c/18429
+ https://review.gluster.org/#/c/18522/
+
+2.)
3.12 branch +https://review.gluster.org/#/c/18524/ + +>Change-Id: I67591d6b9a89c20b9617d52616513ff9e6c06b47 +>BUG: 1501956 +>Signed-off-by: Soumya Koduri + +Change-Id: I67591d6b9a89c20b9617d52616513ff9e6c06b47 +BUG: 1526378 +Signed-off-by: Jiffin Tony Thottan +Reviewed-on: https://code.engineering.redhat.com/gerrit/126039 +Tested-by: RHGS Build Bot +Reviewed-by: Poornima Gurusiddaiah +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + api/src/gfapi.aliases | 1 + + api/src/gfapi.map | 7 +- + api/src/glfs-fops.c | 51 ++++++++++ + api/src/glfs-internal.h | 1 + + api/src/glfs.h | 28 ++++++ + tests/basic/gfapi/glfd-lkowner.c | 212 +++++++++++++++++++++++++++++++++++++++ + tests/basic/gfapi/glfd-lkowner.t | 27 +++++ + 7 files changed, 326 insertions(+), 1 deletion(-) + create mode 100644 tests/basic/gfapi/glfd-lkowner.c + create mode 100755 tests/basic/gfapi/glfd-lkowner.t + +diff --git a/api/src/gfapi.aliases b/api/src/gfapi.aliases +index c69d650..bb60aa3 100644 +--- a/api/src/gfapi.aliases ++++ b/api/src/gfapi.aliases +@@ -158,5 +158,6 @@ _pub_glfs_upcall_inode_get_oldpstat _glfs_upcall_inode_get_oldpstat$GFAPI_3.7.16 + _pub_glfs_realpath _glfs_realpath$GFAPI_3.7.17 + + _pub_glfs_sysrq _glfs_sysrq$GFAPI_3.10.0 ++_pub_glfs_fd_set_lkowner _glfs_fd_set_lkowner$GFAPI_3.10.7 + + _pub_glfs_ipc _glfs_ipc$GFAPI_4.0.0 +diff --git a/api/src/gfapi.map b/api/src/gfapi.map +index 3bf2ffe..a8c686f 100644 +--- a/api/src/gfapi.map ++++ b/api/src/gfapi.map +@@ -198,7 +198,12 @@ GFAPI_3.10.0 { + glfs_sysrq; + } GFAPI_3.7.17; + ++GFAPI_3.10.7 { ++ global: ++ glfs_fd_set_lkowner; ++} GFAPI_3.10.0; ++ + GFAPI_4.0.0 { + global: + glfs_ipc; +-} GFAPI_3.10.0; ++} GFAPI_3.10.7; +diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c +index 3bba7d6..9a5e7b5 100644 +--- a/api/src/glfs-fops.c ++++ b/api/src/glfs-fops.c +@@ -267,6 +267,12 @@ pub_glfs_close (struct glfs_fd *glfd) + goto out; + } + ++ if (glfd->lk_owner.len != 0) { ++ ret = syncopctx_setfslkowner (&glfd->lk_owner); ++ if (ret) ++ goto out; ++ } ++ + ret = syncop_flush (subvol, fd, NULL, NULL); + DECODE_SYNCOP_ERR (ret); + out: +@@ -4242,6 +4248,14 @@ pub_glfs_posix_lock (struct glfs_fd *glfd, int cmd, struct flock *flock) + + gf_flock_from_flock (&gf_flock, flock); + gf_flock_from_flock (&saved_flock, flock); ++ ++ if (glfd->lk_owner.len != 0) { ++ ret = syncopctx_setfslkowner (&glfd->lk_owner); ++ ++ if (ret) ++ goto out; ++ } ++ + ret = syncop_lk (subvol, fd, cmd, &gf_flock, NULL, NULL); + DECODE_SYNCOP_ERR (ret); + gf_flock_to_flock (&gf_flock, flock); +@@ -4264,6 +4278,43 @@ invalid_fs: + + GFAPI_SYMVER_PUBLIC_DEFAULT(glfs_posix_lock, 3.4.0); + ++int ++pub_glfs_fd_set_lkowner (glfs_fd_t *glfd, void *data, int len) ++{ ++ int ret = -1; ++ ++ DECLARE_OLD_THIS; ++ __GLFS_ENTRY_VALIDATE_FD (glfd, invalid_fs); ++ ++ if (!GF_REF_GET (glfd)) { ++ goto invalid_fs; ++ } ++ ++ GF_VALIDATE_OR_GOTO (THIS->name, data, out); ++ ++ if ((len <= 0) || (len > GFAPI_MAX_LOCK_OWNER_LEN)) { ++ errno = EINVAL; ++ gf_msg (THIS->name, GF_LOG_ERROR, errno, ++ LG_MSG_INVALID_ARG, ++ "Invalid lk_owner len (%d)", len); ++ goto out; ++ } ++ ++ glfd->lk_owner.len = len; ++ ++ memcpy (glfd->lk_owner.data, data, len); ++ ++ ret = 0; ++out: ++ if (glfd) ++ GF_REF_PUT (glfd); ++ ++ __GLFS_EXIT_FS; ++ ++invalid_fs: ++ return ret; ++} ++GFAPI_SYMVER_PUBLIC_DEFAULT(glfs_fd_set_lkowner, 3.10.7); + + struct glfs_fd * + pub_glfs_dup (struct glfs_fd *glfd) +diff --git a/api/src/glfs-internal.h b/api/src/glfs-internal.h +index 3f4ed71..f17a854 100644 +--- a/api/src/glfs-internal.h ++++ 
b/api/src/glfs-internal.h +@@ -215,6 +215,7 @@ struct glfs_fd { + struct list_head entries; + gf_dirent_t *next; + struct dirent *readdirbuf; ++ gf_lkowner_t lk_owner; + }; + + /* glfs object handle introduced for the alternate gfapi implementation based +diff --git a/api/src/glfs.h b/api/src/glfs.h +index 9780b52..770d138 100644 +--- a/api/src/glfs.h ++++ b/api/src/glfs.h +@@ -798,6 +798,34 @@ int glfs_sysrq (glfs_t *fs, char sysrq) __THROW + int glfs_ipc (glfs_fd_t *fd, int cmd) __THROW + GFAPI_PUBLIC(glfs_ipc, 3.7.0); + ++#define GFAPI_MAX_LOCK_OWNER_LEN 255 ++ ++/* ++ * ++ * DESCRIPTION ++ * ++ * This API allows application to set lk_owner on a fd. ++ * A glfd can be associated with only single lk_owner. In case if there ++ * is need to set another lk_owner, applications can make use of ++ * 'glfs_dup' to get duplicate glfd and set new lk_owner on that second ++ * glfd. ++ * ++ * Also its not recommended to override or clear lk_owner value as the ++ * same shall be used to flush any outstanding locks while closing the fd. ++ * ++ * PARAMETERS ++ * ++ * INPUT: ++ * @glfd: GFAPI file descriptor ++ * @len: Size of lk_owner buffer. Max value can be GFAPI_MAX_LOCK_OWNER_LEN ++ * @data: lk_owner data buffer. ++ * ++ * OUTPUT: ++ * 0: SUCCESS ++ * -1: FAILURE ++ */ ++int glfs_fd_set_lkowner (glfs_fd_t *glfd, void *data, int len); ++ GFAPI_PUBLIC(glfs_fd_set_lkowner, 3.10.7); + __END_DECLS + + #endif /* !_GLFS_H */ +diff --git a/tests/basic/gfapi/glfd-lkowner.c b/tests/basic/gfapi/glfd-lkowner.c +new file mode 100644 +index 0000000..031a076 +--- /dev/null ++++ b/tests/basic/gfapi/glfd-lkowner.c +@@ -0,0 +1,212 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int gfapi = 1; ++ ++#define LOG_ERR(func, ret) do { \ ++ if (ret != 0) { \ ++ fprintf (stderr, "%s : returned error %d (%s)\n", \ ++ func, ret, strerror (errno)); \ ++ goto out; \ ++ } else { \ ++ fprintf (stderr, "%s : returned %d\n", func, ret); \ ++ } \ ++ } while (0) ++ ++char lownera[8] = "lownera", lownerb[8] = "lownerb"; ++char lownerc[8] = "lownerc"; ++ ++int lock_test (glfs_fd_t *glfd1, glfs_fd_t *glfd2, bool should_fail, ++ int l1_start, int l1_len, char *l1_owner, int lo1_len, ++ int l2_start, int l2_len, char *l2_owner, int lo2_len) ++{ ++ int ret = -1, f_ret = -1; ++ struct flock lock1 = {0, }, lock2 = {0, }; ++ ++lock1: ++ if (!glfd1) ++ goto lock2; ++ ++ /* lock on glfd1 */ ++ lock1.l_type = F_WRLCK; ++ lock1.l_whence = SEEK_SET; ++ lock1.l_start = l1_start; ++ lock1.l_len = l1_len; ++ ++ ret = glfs_fd_set_lkowner (glfd1, l1_owner, lo1_len); ++ LOG_ERR ("glfs_fd_set_lkowner on glfd1", ret); ++ ++ ret = glfs_posix_lock (glfd1, F_SETLK, &lock1); ++ LOG_ERR ("glfs_posix_lock on glfd1", ret); ++ ++lock2: ++ if (!glfd2) ++ goto out; ++ ++ /* lock on glfd2 */ ++ lock2.l_type = F_WRLCK; ++ lock2.l_whence = SEEK_SET; ++ lock2.l_start = l2_start; ++ lock2.l_len = l2_len; ++ ++ ret = glfs_fd_set_lkowner (glfd2, l2_owner, lo2_len); ++ LOG_ERR ("glfs_fd_set_lkowner on glfd2", ret); ++ ++ ret = glfs_posix_lock (glfd2, F_SETLK, &lock2); ++ ++ if (should_fail && ret) { ++ f_ret = 0; ++ } else if (!ret && !should_fail) { ++ f_ret = 0; ++ } else { ++ f_ret = -1; ++ } ++out: ++ fprintf (stderr, "Lock test on glfd1 (start(%d), len(%d)," ++ " lk_owner(%s)) and glfd2 (start(%d), len(%d), " ++ "lk_owner(%s)) - expected(%s) - result(%s)\n", ++ l1_start, l1_len, l1_owner, l2_start, l2_len, l2_owner, ++ (should_fail ? "FAIL" : "SUCCESS"), ++ (ret ? 
"FAIL" : "SUCCESS")); ++ return f_ret; ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ glfs_t *fs = NULL; ++ int ret = 0, i, status = 0; ++ glfs_fd_t *fd1 = NULL; ++ glfs_fd_t *fd2 = NULL; ++ glfs_fd_t *fd3 = NULL; ++ char *filename = "file_tmp"; ++ char *volname = NULL; ++ char *logfile = NULL; ++ char *hostname = NULL; ++ ++ if (argc != 4) { ++ fprintf (stderr, "Invalid argument\n"); ++ exit(1); ++ } ++ ++ hostname = argv[1]; ++ volname = argv[2]; ++ logfile = argv[3]; ++ ++ fs = glfs_new (volname); ++ if (!fs) { ++ fprintf (stderr, "glfs_new: returned NULL\n"); ++ return -1; ++ } ++ ++ ret = glfs_set_volfile_server (fs, "tcp", hostname, 24007); ++ LOG_ERR("glfs_set_volfile_server", ret); ++ ++ ret = glfs_set_logging (fs, logfile, 7); ++ LOG_ERR("glfs_set_logging", ret); ++ ++ ret = glfs_init (fs); ++ LOG_ERR("glfs_init", ret); ++ ++ fd1 = glfs_creat(fs, filename, O_RDWR|O_SYNC, 0644); ++ if (fd1 <= 0) { ++ ret = -1; ++ LOG_ERR ("glfs_creat", ret); ++ } ++ fprintf (stderr, "glfs-create fd1 - %d\n", fd1); ++ ++ fd2 = glfs_dup(fd1); ++ fprintf (stderr, "glfs-dup fd2 - %d\n", fd2); ++ ++ fd3 = glfs_open(fs, filename, O_RDWR|O_SYNC); ++ if (fd2 <= 0) { ++ ret = -1; ++ LOG_ERR ("glfs_open", ret); ++ } ++ fprintf (stderr, "glfs-open fd3 - %d\n", fd3); ++ ++ /* TEST 1: Conflicting ranges, same lk_owner ++ * lock1 (0, 10, lownera) ++ * lock2 (5, 10, lownera) ++ * Expected: should not fail but get merged ++ */ ++ ret = lock_test (fd1, fd2, false, 0, 10, lownera, 8, ++ 5, 10, lownera, 8); ++ LOG_ERR ("==== glfs_lock_test_1", ret); ++ ++ /* TEST 2: Conflicting ranges, different lk_owner ++ * lock1 (0, 10, lownera) - already taken ++ * lock2 (5, 10, lownerb) ++ * Expected: should fail and not get merged ++ */ ++ ret = lock_test (NULL, fd2, true, 0, 10, lownera, 8, ++ 5, 10, lownerb, 8); ++ LOG_ERR ("==== glfs_lock_test_2", ret); ++ ++ /* TEST 3: Different ranges, same lk_owner ++ * lock1 (0, 10, lownera) - already taken ++ * lock2 (30, 10, lownera) ++ * Expected: should not fail ++ */ ++ ret = lock_test (NULL, fd2, false, 0, 10, lownera, 8, ++ 30, 10, lownera, 8); ++ LOG_ERR ("==== glfs_lock_test_3", ret); ++ ++ /* TEST 4: Conflicting ranges, different lk_owner ++ * lock1 (0, 10, lownera) - already taken ++ * lock2 (50, 10, lownerb) ++ * Expected: should not fail ++ */ ++ ret = lock_test (NULL, fd2, false, 0, 10, lownera, 8, ++ 50, 10, lownerb, 8); ++ LOG_ERR ("==== glfs_lock_test_4", ret); ++ ++ /* TEST 5: Close fd1 & retry TEST2 ++ * lock1 (not applicable) ++ * lock2 (5, 10, lownerb) ++ * Expected: should succeed now ++ */ ++ ret = glfs_close(fd1); ++ LOG_ERR ("glfs_close", ret); ++ ++ ret = lock_test (NULL, fd2, false, 0, 10, lownera, 8, ++ 5, 10, lownerb, 8); ++ LOG_ERR ("==== glfs_lock_test_5", ret); ++ ++ /* TEST 6: Check closing fd1 doesn't flush fd2 locks ++ * retry TEST 4 but with fd2 and fd3. 
++ * lock1 (50, 10, lownerb) - already taken ++ * lock2 (55, 10, lownerc) ++ * Expected: should fail ++ */ ++ ret = lock_test (NULL, fd3, true, 50, 10, lownerb, 8, ++ 55, 10, lownerc, 8); ++ LOG_ERR ("==== glfs_lock_test_6", ret); ++ ++err: ++ ret = glfs_close(fd2); ++ LOG_ERR ("glfs_close", ret); ++ ++ ret = glfs_close(fd3); ++ LOG_ERR ("glfs_close", ret); ++ ++out: ++ if (fs) { ++ ret = glfs_fini(fs); ++ fprintf (stderr, "glfs_fini(fs) returned %d\n", ret); ++ } ++ ++ if (ret) ++ exit(1); ++ exit(0); ++} +diff --git a/tests/basic/gfapi/glfd-lkowner.t b/tests/basic/gfapi/glfd-lkowner.t +new file mode 100755 +index 0000000..ad7b026 +--- /dev/null ++++ b/tests/basic/gfapi/glfd-lkowner.t +@@ -0,0 +1,27 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup; ++ ++TEST glusterd ++ ++TEST $CLI volume create $V0 $H0:$B0/brick1; ++EXPECT 'Created' volinfo_field $V0 'Status'; ++ ++TEST $CLI volume start $V0; ++EXPECT 'Started' volinfo_field $V0 'Status'; ++ ++logdir=`gluster --print-logdir` ++ ++TEST build_tester $(dirname $0)/glfd-lkowner.c -lgfapi ++ ++TEST ./$(dirname $0)/glfd-lkowner $H0 $V0 $logdir/glfd-lkowner.log ++ ++cleanup_tester $(dirname $0)/glfd-lkowner ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup; +-- +2.9.3 + diff --git a/SOURCES/0643-build-remove-ExclusiveArch-from-spec-file.patch b/SOURCES/0643-build-remove-ExclusiveArch-from-spec-file.patch new file mode 100644 index 0000000..2148c1d --- /dev/null +++ b/SOURCES/0643-build-remove-ExclusiveArch-from-spec-file.patch @@ -0,0 +1,48 @@ +From cfe482bf1ee566b8fd91d85638cff332fbae7ee1 Mon Sep 17 00:00:00 2001 +From: Milind Changire +Date: Wed, 20 Dec 2017 14:12:41 +0530 +Subject: [PATCH 643/643] build: remove ExclusiveArch from spec file + +ExclusiveArch directive limits builds for specific architectures. +This gets in the way of building for non-x86_64 architectures. + +So, removing the directive keeps the sources open to be built +for non-x86_64 architectures as well. 
+
+Label: DOWNSTREAM ONLY
+
+Change-Id: I8626e6e8c38cacdadbf2144329b09e14d8f9ebe2
+BUG: 1527772
+Signed-off-by: Milind Changire
+Reviewed-on: https://code.engineering.redhat.com/gerrit/126253
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ glusterfs.spec.in | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/glusterfs.spec.in b/glusterfs.spec.in
+index 63fbeba..fc9125b 100644
+--- a/glusterfs.spec.in
++++ b/glusterfs.spec.in
+@@ -191,7 +191,6 @@ Vendor: Fedora Project
+ Name: @PACKAGE_NAME@
+ Version: @PACKAGE_VERSION@
+ Release: @PACKAGE_RELEASE@%{?dist}
+-ExclusiveArch: x86_64 aarch64
+ %endif
+ License: GPLv2 or LGPLv3+
+ Group: System Environment/Base
+@@ -2068,6 +2067,9 @@ fi
+ %endif
+
+ %changelog
++* Wed Dec 20 2017 Milind Changire
++- Remove ExclusiveArch directive to help building on non-x86_64 arches (#1527772)
++
+ * Tue Oct 10 2017 Milind Changire
+ - DOWNSTREAM ONLY patch - launch glusterd in upgrade mode after all new bits have been installed
+
+--
+1.8.3.1
+
diff --git a/SOURCES/0644-libglusterfs-fix-the-call_stack_set_group-function.patch b/SOURCES/0644-libglusterfs-fix-the-call_stack_set_group-function.patch
new file mode 100644
index 0000000..bfbbfc3
--- /dev/null
+++ b/SOURCES/0644-libglusterfs-fix-the-call_stack_set_group-function.patch
@@ -0,0 +1,261 @@
+From a413f8754a7389c1b5dd93c219b47b9f904711fb Mon Sep 17 00:00:00 2001
+From: Csaba Henk
+Date: Fri, 15 Dec 2017 08:02:30 +0100
+Subject: [PATCH 644/644] libglusterfs: fix the call_stack_set_group() function
+
+- call_stack_set_group() will take ownership of the passed
+ buffer from the caller;
+- to indicate the change, its signature is changed from
+ including the buffer directly to taking a pointer to it;
+- either the content of the buffer is copied to the
+ groups_small embedded buffer of the call stack, or
+ the buffer is set as the groups_large member of the call
+ stack;
+- the groups member of the call stack is set to,
+ respectively, groups_small or groups_large, according
+ to the memory management conventions of the call stack;
+- the buffer address is overwritten with junk to effectively
+ prevent the caller from using it further on.
+
+Also move call_stack_set_group to stack.c from stack.h
+to prevent "defined but not used [-Wunused-function]"
+warnings (it is not used anymore in the call_stack_alloc_groups()
+implementation, which saved us from this so far).
+
+protocol/server: refactor gid_resolve()
+
+In gid_resolve there are two cases:
+either the gid_cache_lookup() call returns
+a value or not. The result is captured in
+the agl variable, and throughout the function,
+each particular stage of the implementation
+comes with an agl and a no-agl variant.
+
+In most cases this is explicitly indicated
+via an
+
+ if (agl) {
+ ...
+ } else {
+ ...
+ }
+
+but some of this branching is expressed via
+goto constructs (obfuscating the fact we stated
+above, that is, each particular stage having
+an agl/no-agl variant).
+
+In the current refactor, we bring the agl
+conditional to the top, and present the
+agl/non-agl implementations sequentially.
+
+Also we take the opportunity to clean up and
+fix the agl case:
+- remove the spurious
+ gl.gl_list = agl->gl_list;
+ setting, as gl is not used in the agl case
+- populate the group list of the call stack from
+ agl, thus fixing the referred BUG.
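+
+As an aside (illustration only, not part of this patch), the
+ownership-transfer idiom described above boils down to the following
+standalone sketch; the struct layout, the SMALL_GROUP_COUNT value and
+all names here are simplified stand-ins for the real call stack types:
+
+ #include <stdlib.h>
+ #include <string.h>
+ #include <sys/types.h>
+
+ #define SMALL_GROUP_COUNT 4
+
+ struct stack_sketch {
+         gid_t  groups_small[SMALL_GROUP_COUNT]; /* embedded buffer */
+         gid_t *groups_large;                    /* adopted heap buffer */
+         gid_t *groups;                          /* points at one of the two */
+         int    ngrps;
+ };
+
+ /* Takes ownership of *groupbuf_p: the contents are either copied into
+  * the embedded array (and the heap buffer freed), or the buffer itself
+  * is adopted. The caller's pointer is poisoned either way, so any
+  * further use is caught early. */
+ static void
+ stack_set_groups (struct stack_sketch *stack, int ngrps,
+                   gid_t **groupbuf_p)
+ {
+         if (ngrps <= SMALL_GROUP_COUNT) {
+                 memcpy (stack->groups_small, *groupbuf_p,
+                         sizeof (gid_t) * ngrps);
+                 stack->groups = stack->groups_small;
+                 free (*groupbuf_p);
+         } else {
+                 stack->groups_large = *groupbuf_p;
+                 stack->groups = stack->groups_large;
+         }
+         stack->ngrps = ngrps;
+         *groupbuf_p = (void *)0xdeadf00d; /* canary */
+ }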
+ +> Change-Id: I61f4574ba21969f7661b9ff0c9dce202b874025d +> BUG: 1513928 +> Signed-off-by: Csaba Henk +> Reviewed-on: https://review.gluster.org/18789 + +Change-Id: I61f4574ba21969f7661b9ff0c9dce202b874025d +BUG: 1527147 +Signed-off-by: Csaba Henk +Reviewed-on: https://code.engineering.redhat.com/gerrit/126437 +Reviewed-by: Csaba Henk +Tested-by: Csaba Henk +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + libglusterfs/src/stack.c | 20 +++++++++ + libglusterfs/src/stack.h | 14 +++--- + xlators/mount/fuse/src/fuse-helpers.c | 2 +- + xlators/protocol/server/src/server-helpers.c | 65 +++++++++++++--------------- + 4 files changed, 57 insertions(+), 44 deletions(-) + +diff --git a/libglusterfs/src/stack.c b/libglusterfs/src/stack.c +index 6977814..d64ac8a 100644 +--- a/libglusterfs/src/stack.c ++++ b/libglusterfs/src/stack.c +@@ -65,6 +65,26 @@ create_frame (xlator_t *xl, call_pool_t *pool) + } + + void ++call_stack_set_groups (call_stack_t *stack, int ngrps, gid_t **groupbuf_p) ++{ ++ /* We take the ownership of the passed group buffer. */ ++ ++ if (ngrps <= SMALL_GROUP_COUNT) { ++ memcpy (stack->groups_small, *groupbuf_p, ++ sizeof (gid_t) * ngrps); ++ stack->groups = stack->groups_small; ++ GF_FREE (*groupbuf_p); ++ } else { ++ stack->groups_large = *groupbuf_p; ++ stack->groups = stack->groups_large; ++ } ++ ++ stack->ngrps = ngrps; ++ /* Set a canary. */ ++ *groupbuf_p = (void *)0xdeadf00d; ++} ++ ++void + gf_proc_dump_call_frame (call_frame_t *call_frame, const char *key_buf,...) + { + +diff --git a/libglusterfs/src/stack.h b/libglusterfs/src/stack.h +index c2848b7..a71a150 100644 +--- a/libglusterfs/src/stack.h ++++ b/libglusterfs/src/stack.h +@@ -422,26 +422,21 @@ STACK_RESET (call_stack_t *stack) + } while (0) + + +-static void +-call_stack_set_groups (call_stack_t *stack, int ngrps, gid_t *groupbuf) +-{ +- stack->groups = groupbuf; +- stack->ngrps = ngrps; +-} +- + static inline int + call_stack_alloc_groups (call_stack_t *stack, int ngrps) + { + if (ngrps <= SMALL_GROUP_COUNT) { +- call_stack_set_groups (stack, ngrps, stack->groups_small); ++ stack->groups = stack->groups_small; + } else { + stack->groups_large = GF_CALLOC (ngrps, sizeof (gid_t), + gf_common_mt_groups_t); + if (!stack->groups_large) + return -1; +- call_stack_set_groups (stack, ngrps, stack->groups_large); ++ stack->groups = stack->groups_large; + } + ++ stack->ngrps = ngrps; ++ + return 0; + } + +@@ -530,6 +525,7 @@ copy_frame (call_frame_t *frame) + return newframe; + } + ++void call_stack_set_groups (call_stack_t *stack, int ngrps, gid_t **groupbuf_p); + void gf_proc_dump_pending_frames(call_pool_t *call_pool); + void gf_proc_dump_pending_frames_to_dict (call_pool_t *call_pool, + dict_t *dict); +diff --git a/xlators/mount/fuse/src/fuse-helpers.c b/xlators/mount/fuse/src/fuse-helpers.c +index 5ccb0a5..cd43ded 100644 +--- a/xlators/mount/fuse/src/fuse-helpers.c ++++ b/xlators/mount/fuse/src/fuse-helpers.c +@@ -181,7 +181,7 @@ frame_fill_groups (call_frame_t *frame) + return; + } + +- call_stack_set_groups (frame->root, ngroups, mygroups); ++ call_stack_set_groups (frame->root, ngroups, &mygroups); + } else { + ret = snprintf (filename, sizeof filename, "/proc/%d/status", + frame->root->pid); +diff --git a/xlators/protocol/server/src/server-helpers.c b/xlators/protocol/server/src/server-helpers.c +index 647f144..a01e72f 100644 +--- a/xlators/protocol/server/src/server-helpers.c ++++ b/xlators/protocol/server/src/server-helpers.c +@@ -31,13 +31,24 @@ gid_resolve (server_conf_t *conf, call_stack_t *root) 
+ struct passwd *result;
+ gid_t *mygroups;
+ gid_list_t gl;
+- const gid_list_t *agl;
+ int ngroups;
++ const gid_list_t *agl;
+
+ agl = gid_cache_lookup (&conf->gid_cache, root->uid, 0, 0);
+ if (agl) {
+ root->ngrps = agl->gl_count;
+- goto fill_groups;
++
++ if (root->ngrps > 0) {
++ ret = call_stack_alloc_groups (root, agl->gl_count);
++ if (ret == 0) {
++ memcpy (root->groups, agl->gl_list,
++ sizeof (gid_t) * agl->gl_count);
++ }
++ }
++
++ gid_cache_release (&conf->gid_cache, agl);
++
++ return ret;
+ }
+
+ ret = getpwuid_r (root->uid, &mypw, mystrs, sizeof(mystrs), &result);
+@@ -66,42 +77,28 @@ gid_resolve (server_conf_t *conf, call_stack_t *root)
+ }
+ root->ngrps = (uint16_t) ngroups;
+
+-fill_groups:
+- if (agl) {
+- /* the gl is not complete, we only use gl.gl_list later on */
+- gl.gl_list = agl->gl_list;
+- } else {
+- /* setup a full gid_list_t to add it to the gid_cache */
+- gl.gl_id = root->uid;
+- gl.gl_uid = root->uid;
+- gl.gl_gid = root->gid;
+- gl.gl_count = root->ngrps;
+-
+- gl.gl_list = GF_MALLOC (root->ngrps * sizeof(gid_t),
+- gf_common_mt_groups_t);
+- if (gl.gl_list)
+- memcpy (gl.gl_list, mygroups,
+- sizeof(gid_t) * root->ngrps);
+- else {
+- GF_FREE (mygroups);
+- return -1;
+- }
++ /* setup a full gid_list_t to add it to the gid_cache */
++ gl.gl_id = root->uid;
++ gl.gl_uid = root->uid;
++ gl.gl_gid = root->gid;
++ gl.gl_count = root->ngrps;
++
++ gl.gl_list = GF_MALLOC (root->ngrps * sizeof(gid_t),
++ gf_common_mt_groups_t);
++ if (gl.gl_list)
++ memcpy (gl.gl_list, mygroups,
++ sizeof(gid_t) * root->ngrps);
++ else {
++ GF_FREE (mygroups);
++ return -1;
+ }
+
+- if (root->ngrps == 0) {
+- ret = 0;
+- goto out;
++ if (root->ngrps > 0) {
++ call_stack_set_groups (root, root->ngrps, &mygroups);
+ }
+
+- call_stack_set_groups (root, root->ngrps, mygroups);
+-
+-out:
+- if (agl) {
+- gid_cache_release (&conf->gid_cache, agl);
+- } else {
+- if (gid_cache_add (&conf->gid_cache, &gl) != 1)
+- GF_FREE (gl.gl_list);
+- }
++ if (gid_cache_add (&conf->gid_cache, &gl) != 1)
++ GF_FREE (gl.gl_list);
+
+ return ret;
+ }
+--
+1.8.3.1
+
diff --git a/SOURCES/0645-glusterd-Nullify-pmap-entry-for-bricks-belonging-to-.patch b/SOURCES/0645-glusterd-Nullify-pmap-entry-for-bricks-belonging-to-.patch
new file mode 100644
index 0000000..0fe5260
--- /dev/null
+++ b/SOURCES/0645-glusterd-Nullify-pmap-entry-for-bricks-belonging-to-.patch
@@ -0,0 +1,38 @@
+From d44694d9b16fb6ca686bd4ce46a40cf5f91232d6 Mon Sep 17 00:00:00 2001
+From: Atin Mukherjee
+Date: Tue, 2 Jan 2018 20:26:31 +0530
+Subject: [PATCH 645/645] glusterd: Nullify pmap entry for bricks belonging to
+ same port
+
+Commit 30e0b86 tried to address all the stale port issues glusterd had
+in case a brick is abruptly killed. In the brick multiplexing case,
+because of a bug, the portmap entry was not getting removed. This patch
+addresses the same.
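+
+For illustration only (types and names below are simplified
+placeholders, not the glusterd structures), the signout decision
+after this fix looks like:
+
+ #include <string.h>
+
+ /* one registry slot of the sketch portmap */
+ struct port_entry {
+         char *brickname;   /* possibly several multiplexed bricks */
+         void *xprt;        /* rpc transport still attached, if any */
+ };
+
+ static void
+ pmap_signout_sketch (struct port_entry *entry, int brick_disconnect)
+ {
+         /* Previously only "!entry->xprt" was checked, so an explicit
+          * brick disconnect left a stale entry behind whenever an xprt
+          * was still attached (brick multiplexing). Either condition
+          * now wipes the whole slot. */
+         if (brick_disconnect || !entry->xprt)
+                 memset (entry, 0, sizeof (*entry));
+ }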
+
+>upstream mainline patch : https://review.gluster.org/#/c/19119/
+
+Change-Id: Ib020b967a9b92f1abae9cab9492f0cacec59aaa1
+BUG: 1530217
+Signed-off-by: Atin Mukherjee
+Reviewed-on: https://code.engineering.redhat.com/gerrit/126775
+Tested-by: RHGS Build Bot
+---
+ xlators/mgmt/glusterd/src/glusterd-pmap.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
+index 1789ef3..68f0f27 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c
++++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
+@@ -388,7 +388,7 @@ remove:
+ * there's no xprt either, then we have nothing left worth saving and
+ * can delete the entire entry.
+ */
+- if (!pmap->ports[p].xprt) {
++ if (brick_disconnect || !pmap->ports[p].xprt) {
+ /* If the signout call is being triggered by brick disconnect
+ * then clean up all the bricks (in case of brick mux)
+ */
+--
+1.8.3.1
+
diff --git a/SOURCES/0646-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch b/SOURCES/0646-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch
new file mode 100644
index 0000000..d2a7473
--- /dev/null
+++ b/SOURCES/0646-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch
@@ -0,0 +1,271 @@
+From fa2086eaa9545b472acd3fcc07be776d9f2cb38a Mon Sep 17 00:00:00 2001
+From: Kotresh HR
+Date: Thu, 15 Feb 2018 01:46:29 -0500
+Subject: [PATCH 646/646] geo-rep: Remove lazy umount and use mount namespaces
+
+Lazy umounting the master volume by the worker causes
+issues with rsync's usage of getcwd. Hence removing
+the lazy umount and using a private mount namespace
+for the same. On the slave, the lazy umount is
+retained as we can't use a private namespace in a
+non-root geo-rep setup, because gsyncd is spawned as
+a non-privileged user.
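+
+For illustration (a standalone sketch, not part of the patch), what
+running the worker under "unshare -m --propagation private" amounts
+to at the syscall level is roughly:
+
+ #define _GNU_SOURCE
+ #include <sched.h>
+ #include <stdio.h>
+ #include <sys/mount.h>
+ #include <unistd.h>
+
+ int
+ main (int argc, char *argv[])
+ {
+         if (argc < 2)
+                 return 1;
+         /* detach into a private mount namespace (needs CAP_SYS_ADMIN,
+          * which is why this only works for root geo-rep sessions) */
+         if (unshare (CLONE_NEWNS) != 0) {
+                 perror ("unshare");
+                 return 1;
+         }
+         /* stop mount events from propagating back to the parent, so
+          * the worker's aux mount simply vanishes when it exits */
+         if (mount ("", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0) {
+                 perror ("mount");
+                 return 1;
+         }
+         execvp (argv[1], &argv[1]); /* e.g. the gsyncd worker */
+         perror ("execvp");
+         return 1;
+ }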
+ +Backport of https://review.gluster.org/#/c/19544/ + +Change-Id: I851e8dc2b8523dc5668a97e87ef619ab70471dfd +BUG: 1547931 +Signed-off-by: Kotresh HR +Reviewed-on: https://code.engineering.redhat.com/gerrit/131128 +Tested-by: RHGS Build Bot +Reviewed-by: Aravinda Vishwanathapura Krishna Murthy +Reviewed-by: Atin Mukherjee +--- + geo-replication/syncdaemon/gconf.py | 3 +++ + geo-replication/syncdaemon/gsyncd.py | 14 +++++++----- + geo-replication/syncdaemon/monitor.py | 38 ++++++++++++++++++++------------ + geo-replication/syncdaemon/resource.py | 16 ++++++++++++-- + geo-replication/syncdaemon/syncdutils.py | 18 ++++++++++++++- + glusterfs.spec.in | 4 ++++ + 6 files changed, 70 insertions(+), 23 deletions(-) + +diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py +index 97395b4..2280f44 100644 +--- a/geo-replication/syncdaemon/gconf.py ++++ b/geo-replication/syncdaemon/gconf.py +@@ -28,5 +28,8 @@ class GConf(object): + active_earlier = False + passive_earlier = False + mgmt_lock_fd = None ++ mountbroker = False ++ mount_point = None ++ mbr_umount_cmd = [] + + gconf = GConf() +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index f9471e4..96256cf 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -269,6 +269,8 @@ def main_i(): + type=str, action='callback', callback=store_abs) + op.add_option('--georep-session-working-dir', metavar='STATF', + type=str, action='callback', callback=store_abs) ++ op.add_option('--access-mount', default=False, action='store_true') ++ op.add_option('--slave-access-mount', default=False, action='store_true') + op.add_option('--ignore-deletes', default=False, action='store_true') + op.add_option('--isolated-slave', default=False, action='store_true') + op.add_option('--use-rsync-xattrs', default=False, action='store_true') +@@ -414,7 +416,7 @@ def main_i(): + o.get_opt_string() not in ('--version', '--help'))] + remote_tunables = ['listen', 'go_daemon', 'timeout', + 'session_owner', 'config_file', 'use_rsync_xattrs', +- 'local_id', 'local_node'] ++ 'local_id', 'local_node', 'slave_access_mount'] + rq_remote_tunables = {'listen': True} + + # precedence for sources of values: 1) commandline, 2) cfg file, 3) +@@ -748,15 +750,15 @@ def main_i(): + else: + log_file = gconf.log_file + if be_monitor: +- label = 'monitor' ++ gconf.label = 'monitor' + elif be_agent: +- label = gconf.local_path ++ gconf.label = gconf.local_path + elif remote: + # master +- label = gconf.local_path ++ gconf.label = gconf.local_path + else: +- label = 'slave' +- startup(go_daemon=go_daemon, log_file=log_file, label=label) ++ gconf.label = 'slave' ++ startup(go_daemon=go_daemon, log_file=log_file, label=gconf.label) + resource.Popen.init_errhandler() + + if be_agent: +diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py +index dc0211e..087a202 100644 +--- a/geo-replication/syncdaemon/monitor.py ++++ b/geo-replication/syncdaemon/monitor.py +@@ -24,7 +24,7 @@ import random + from gconf import gconf + from syncdutils import select, waitpid, errno_wrap + from syncdutils import set_term_handler, is_host_local, GsyncdError +-from syncdutils import escape, Thread, finalize, memoize ++from syncdutils import escape, Thread, finalize, memoize, boolify + from syncdutils import gf_event, EVENT_GEOREP_FAULTY + + from gsyncdstatus import GeorepStatus, set_monitor_status +@@ -301,19 +301,29 @@ class Monitor(object): + os.close(pr) + os.close(ra) + 
os.close(wa) +- os.execv(sys.executable, argv + ['--feedback-fd', str(pw), +- '--local-path', w[0]['dir'], +- '--local-node', w[0]['host'], +- '--local-node-id', +- w[0]['uuid'], +- '--local-id', +- '.' + escape(w[0]['dir']), +- '--rpc-fd', +- ','.join([str(rw), str(ww), +- str(ra), str(wa)]), +- '--subvol-num', str(w[2])] + +- (['--is-hottier'] if w[3] else []) + +- ['--resource-remote', remote_host]) ++ args_to_worker = argv + ['--feedback-fd', str(pw), ++ '--local-path', w[0]['dir'], ++ '--local-node', w[0]['host'], ++ '--local-node-id', ++ w[0]['uuid'], ++ '--local-id', ++ '.' + escape(w[0]['dir']), ++ '--rpc-fd', ++ ','.join([str(rw), str(ww), ++ str(ra), str(wa)]), ++ '--subvol-num', str(w[2])] ++ ++ if w[3]: ++ args_to_worker.append('--is-hottier') ++ args_to_worker += ['--resource-remote', remote_host] ++ ++ access_mount = boolify(gconf.access_mount) ++ if access_mount: ++ os.execv(sys.executable, args_to_worker) ++ else: ++ unshare_cmd = ['unshare', '-m', '--propagation', 'private'] ++ cmd = unshare_cmd + args_to_worker ++ os.execvp("unshare", cmd) + + cpids.add(cpid) + agents.add(apid) +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 943e3ec..39d537b 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -989,6 +989,8 @@ class SlaveRemote(object): + extra_opts += ['--local-node', ln] + if boolify(gconf.use_rsync_xattrs): + extra_opts.append('--use-rsync-xattrs') ++ if boolify(gconf.slave_access_mount): ++ extra_opts.append('--slave-access-mount') + po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + + ['-N', '--listen', '--timeout', str(gconf.timeout), + slave], +@@ -1258,6 +1260,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + def __init__(self, params): + self.params = params + self.mntpt = None ++ self.umount_cmd = [] + + @classmethod + def get_glusterprog(cls): +@@ -1348,13 +1351,16 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + assert(mntdata[-1] == '\0') + mntpt = mntdata[:-1] + assert(mntpt) +- if mounted: ++ if mounted and gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): + po = self.umount_l(mntpt) + po.terminate_geterr(fail_on_err=False) + if po.returncode != 0: + po.errlog() + rv = po.returncode +- self.cleanup_mntpt(mntpt) ++ if gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): ++ self.cleanup_mntpt(mntpt) + except: + logging.exception('mount cleanup failure:') + rv = 200 +@@ -1374,6 +1380,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def make_mount_argv(self): + self.mntpt = tempfile.mkdtemp(prefix='gsyncd-aux-mount-') ++ gconf.mount_point = self.mntpt + return [self.get_glusterprog()] + \ + ['--' + p for p in self.params] + [self.mntpt] + +@@ -1405,6 +1412,11 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def handle_mounter(self, po): + self.mntpt = po.stdout.readline()[:-1] ++ gconf.mount_point = self.mntpt ++ gconf.mountbroker = True ++ self.umount_cmd = self.make_cli_argv() + ['umount'] ++ gconf.mbr_umount_cmd = self.umount_cmd ++ + po.stdout.close() + sup(self, po) + if po.returncode != 0: +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index a22289e..8dc6c96 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -16,6 +16,7 @@ import fcntl + import shutil + import logging + import socket ++import subprocess + from threading import Lock, Thread as baseThread + 
from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED
+ from errno import EINTR, ENOENT, EPERM, ESTALE, EBUSY, errorcode
+@@ -188,7 +189,6 @@ def grabpidfile(fname=None, setpid=True):
+
+ final_lock = Lock()
+
+-
+ def finalize(*a, **kw):
+ """all those messy final steps we go trough upon termination
+
+@@ -233,6 +233,22 @@ def finalize(*a, **kw):
+ if sys.exc_info()[0] == OSError:
+ pass
+
++ """ Unmount if not done """
++ if gconf.mount_point:
++ if gconf.mountbroker:
++ umount_cmd = gconf.mbr_umount_cmd + [gconf.mount_point, 'lazy']
++ else:
++ umount_cmd = ['umount', '-l', gconf.mount_point]
++ p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE)
++ _, errdata = p0.communicate()
++ if p0.returncode == 0:
++ try:
++ os.rmdir(gconf.mount_point)
++ except OSError:
++ pass
++ else:
++ pass
++
+ if gconf.log_exit:
+ logging.info("exiting.")
+ sys.stdout.flush()
+diff --git a/glusterfs.spec.in b/glusterfs.spec.in
+index fc9125b..b7bfcac 100644
+--- a/glusterfs.spec.in
++++ b/glusterfs.spec.in
+@@ -439,6 +439,7 @@ Requires: %{name}%{?_isa} = %{version}-%{release}
+ Requires: %{name}-server%{?_isa} = %{version}-%{release}
+ Requires: python python-ctypes
+ Requires: rsync
++Requires: util-linux
+
+ %description geo-replication
+ GlusterFS is a distributed file-system capable of scaling to several
+@@ -2067,6 +2068,9 @@ fi
+ %endif
+
+ %changelog
++* Thu Feb 22 2018 Kotresh HR
++- Added util-linux as dependency to georeplication rpm (#1544382)
++
+ * Wed Dec 20 2017 Milind Changire
+ - Remove ExclusiveArch directive to help building on non-x86_64 arches (#1527772)
+
+--
+1.8.3.1
+
diff --git a/SOURCES/0647-glusterd-optimize-glusterd-import-volumes-code-path.patch b/SOURCES/0647-glusterd-optimize-glusterd-import-volumes-code-path.patch
new file mode 100644
index 0000000..9275f05
--- /dev/null
+++ b/SOURCES/0647-glusterd-optimize-glusterd-import-volumes-code-path.patch
@@ -0,0 +1,51 @@
+From d2c270576da3f887b5933805e7efb029e6b0ecd0 Mon Sep 17 00:00:00 2001
+From: Atin Mukherjee
+Date: Mon, 29 Jan 2018 10:23:52 +0530
+Subject: [PATCH 647/649] glusterd: optimize glusterd import volumes code path
+
+In case a version mismatch is detected for one of the volumes,
+glusterd was ending up updating all the volumes, which is
+overkill.
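+
+The shape of the change can be sketched as follows (stubbed helpers
+and illustrative names only, not the actual glusterd code): the single
+mismatching volume is imported on the spot, instead of being flagged
+and re-imported together with every other volume in a second pass.
+
+ #include <stdbool.h>
+
+ enum comp_status { COMP_NONE, COMP_UPDATE_REQ, COMP_RJT };
+
+ extern enum comp_status compare_friend_volume (void *peer_data, int i);
+ extern int import_friend_volume (void *peer_data, int i);
+ extern void restart_services (void);
+
+ int
+ compare_friend_data_sketch (void *peer_data, int count)
+ {
+         bool update = false;
+         int  i;
+
+         for (i = 1; i <= count; i++) {
+                 if (compare_friend_volume (peer_data, i) == COMP_UPDATE_REQ) {
+                         /* import just this one volume */
+                         if (import_friend_volume (peer_data, i) != 0)
+                                 return -1;
+                         update = true;
+                 }
+         }
+         if (update)
+                 restart_services (); /* no bulk re-import pass anymore */
+         return 0;
+ }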
+
+> upstream mainline patch : https://review.gluster.org/#/c/19358/
+
+Change-Id: I6df792db391ce3a1697cfa9260f7dbc3f59aa62d
+BUG: 1556670
+Signed-off-by: Atin Mukherjee
+Reviewed-on: https://code.engineering.redhat.com/gerrit/132723
+Tested-by: RHGS Build Bot
+---
+ xlators/mgmt/glusterd/src/glusterd-utils.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
+index b594b9e..59ef282 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
+@@ -4753,16 +4753,18 @@ glusterd_compare_friend_data (dict_t *peer_data, int32_t *status,
+ ret = 0;
+ goto out;
+ }
+- if (GLUSTERD_VOL_COMP_UPDATE_REQ == *status)
++ if (GLUSTERD_VOL_COMP_UPDATE_REQ == *status) {
++ ret = glusterd_import_friend_volume (peer_data, i);
++ if (ret) {
++ goto out;
++ }
+ update = _gf_true;
+-
++ *status = GLUSTERD_VOL_COMP_NONE;
++ }
+ i++;
+ }
+
+ if (update) {
+- ret = glusterd_import_friend_volumes (peer_data);
+- if (ret)
+- goto out;
+ glusterd_svcs_manager (NULL);
+ }
+
+--
+1.8.3.1
+
diff --git a/SOURCES/0648-glusterd-import-volumes-in-separate-synctask.patch b/SOURCES/0648-glusterd-import-volumes-in-separate-synctask.patch
new file mode 100644
index 0000000..a17d3dd
--- /dev/null
+++ b/SOURCES/0648-glusterd-import-volumes-in-separate-synctask.patch
@@ -0,0 +1,751 @@
+From 359b99fc520d16bc2a2013b555fda774db03aa90 Mon Sep 17 00:00:00 2001
+From: Atin Mukherjee
+Date: Thu, 8 Feb 2018 09:09:00 +0530
+Subject: [PATCH 648/649] glusterd: import volumes in separate synctask
+
+With brick multiplexing, to attach a brick to an existing brick process
+the prerequisite is to have the compatible brick finish its
+initialization and portmap sign-in, and hence the thread might have to
+go to sleep and context switch the synctask to allow the brick process
+to communicate with glusterd. In the normal code path this works fine,
+as glusterd_restart_bricks () is launched through a separate synctask.
+
+In case there's a mismatch of the volume when glusterd restarts,
+glusterd_import_friend_volume is invoked and then it tries to call
+glusterd_start_bricks () from the main thread, which eventually may
+land in the same situation. Since this is not done through a separate
+synctask, the 1st brick will never get its turn to finish all of its
+handshaking and, as a consequence, all the bricks will fail to get
+attached to it.
+
+Solution : Execute import volume and glusterd restart bricks in
+separate synctasks. Importing snaps also had to be done through a
+synctask, as the parent volume needs to be available for the snap
+import functionality to work.
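+
+The serialization idiom both synctasks rely on can be sketched as
+follows (the names are simplified stand-ins for the glusterd_conf_t
+fields and the big lock, not the actual code):
+
+ #include <stdbool.h>
+ #include <unistd.h>
+
+ struct conf_sketch {
+         bool restart_bricks; /* "someone is touching bricks" flag */
+ };
+
+ extern void big_lock (struct conf_sketch *conf);
+ extern void big_unlock (struct conf_sketch *conf);
+
+ static void
+ serialized_brick_work (struct conf_sketch *conf, void (*work) (void))
+ {
+         big_lock (conf);
+         while (conf->restart_bricks) {
+                 big_unlock (conf); /* yield, let the other synctask run */
+                 sleep (2);
+                 big_lock (conf);
+         }
+         conf->restart_bricks = true;  /* claim */
+         work ();                      /* import volumes / restart bricks */
+         conf->restart_bricks = false; /* release */
+         big_unlock (conf);
+ }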
+ +>upstream mainline patch : https://review.gluster.org/#/c/19357 + +Change-Id: I290b244d456afcc9b913ab30be4af040d340428c +BUG: 1556670 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/132724 +Tested-by: RHGS Build Bot +--- + xlators/mgmt/glusterd/src/glusterd-op-sm.c | 9 +- + xlators/mgmt/glusterd/src/glusterd-op-sm.h | 2 + + .../mgmt/glusterd/src/glusterd-snapshot-utils.c | 226 +++++++++++++++++---- + xlators/mgmt/glusterd/src/glusterd-utils.c | 166 ++++++++++++--- + xlators/mgmt/glusterd/src/glusterd-utils.h | 4 + + xlators/mgmt/glusterd/src/glusterd.h | 3 +- + 6 files changed, 340 insertions(+), 70 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +index f034ae8..ab2886e 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c ++++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +@@ -2339,6 +2339,7 @@ glusterd_stop_bricks (glusterd_volinfo_t *volinfo) + + int + glusterd_start_bricks (glusterd_volinfo_t *volinfo) ++ + { + int ret = -1; + glusterd_brickinfo_t *brickinfo = NULL; +@@ -2366,14 +2367,6 @@ glusterd_start_bricks (glusterd_volinfo_t *volinfo) + goto out; + } + } +- +- } +- ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE); +- if (ret) { +- gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_STORE_FAIL, +- "Failed to write volinfo for volume %s", +- volinfo->volname); +- goto out; + } + ret = 0; + out: +diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.h b/xlators/mgmt/glusterd/src/glusterd-op-sm.h +index 571905f..9f857b6 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.h ++++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.h +@@ -269,8 +269,10 @@ glusterd_volume_stats_write_perf (char *brick_path, int32_t blk_size, + int32_t blk_count, double *throughput, double *time); + gf_boolean_t + glusterd_is_volume_started (glusterd_volinfo_t *volinfo); ++ + int + glusterd_start_bricks (glusterd_volinfo_t *volinfo); ++ + gf_boolean_t + glusterd_are_all_volumes_stopped (); + int +diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c +index 3fe424a..e32fb29 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c +@@ -1738,8 +1738,11 @@ out: + * state, i.e either both would be hosting bricks or both would not be hosting + * bricks, then a decision can't be taken and a peer-reject will happen. + * +- * glusterd_compare_and_update_snap() implements the following algorithm to +- * perform the above task: ++ * glusterd_compare_snap() & glusterd_update_snaps () implement the following ++ * algorithm to perform the above task. Please note the former function tries to ++ * iterate over the snaps one at a time and updating the relevant fields in the ++ * dictionary and then glusterd_update_snaps () go over all the snaps and update ++ * them at one go as part of a synctask. + * Step 1: Start. + * Step 2: Check if the peer is missing a delete or restore on the said snap. + * If yes, goto step 6. 
+@@ -1764,21 +1767,18 @@ out: + * + */ + int32_t +-glusterd_compare_and_update_snap (dict_t *peer_data, int32_t snap_count, +- char *peername, uuid_t peerid) ++glusterd_compare_snap (dict_t *peer_data, int32_t snap_count, ++ char *peername, uuid_t peerid) + { + char buf[NAME_MAX] = ""; + char prefix[NAME_MAX] = ""; + char *peer_snap_name = NULL; + char *peer_snap_id = NULL; +- dict_t *dict = NULL; + glusterd_snap_t *snap = NULL; + gf_boolean_t conflict = _gf_false; + gf_boolean_t is_local = _gf_false; + gf_boolean_t is_hosted = _gf_false; + gf_boolean_t missed_delete = _gf_false; +- gf_boolean_t remove_lvm = _gf_true; +- + int32_t ret = -1; + int32_t volcount = 0; + xlator_t *this = NULL; +@@ -1790,6 +1790,14 @@ glusterd_compare_and_update_snap (dict_t *peer_data, int32_t snap_count, + + snprintf (prefix, sizeof(prefix), "snap%d", snap_count); + ++ ret = dict_set_uint32 (peer_data, buf, 0); ++ snprintf (buf, sizeof(buf), "%s.accept_peer_data", prefix); ++ ret = dict_set_uint32 (peer_data, buf, 0); ++ snprintf (buf, sizeof(buf), "%s.remove_lvm", prefix); ++ ret = dict_set_uint32 (peer_data, buf, 0); ++ snprintf (buf, sizeof(buf), "%s.remove_my_data", prefix); ++ ret = dict_set_uint32 (peer_data, buf, 0); ++ + /* Fetch the peer's snapname */ + snprintf (buf, sizeof(buf), "%s.snapname", prefix); + ret = dict_get_str (peer_data, buf, &peer_snap_name); +@@ -1846,7 +1854,10 @@ glusterd_compare_and_update_snap (dict_t *peer_data, int32_t snap_count, + /* Peer has snap with the same snapname + * and snap_id, which local node doesn't have. + */ +- goto accept_peer_data; ++ snprintf (buf, sizeof(buf), "%s.accept_peer_data", ++ prefix); ++ ret = dict_set_uint32 (peer_data, buf, 1); ++ goto out; + } + /* Peer has snap with the same snapname + * and snap_id. Now check if peer has a +@@ -1873,12 +1884,15 @@ glusterd_compare_and_update_snap (dict_t *peer_data, int32_t snap_count, + * When removing data from local node, make sure + * we are not removing backend lvm of the snap. + */ +- remove_lvm = _gf_false; +- goto remove_my_data; ++ snprintf (buf, sizeof(buf), "%s.remove_lvm", prefix); ++ ret = dict_set_uint32 (peer_data, buf, 0); ++ snprintf (buf, sizeof(buf), "%s.remove_my_data", ++ prefix); ++ ret = dict_set_uint32 (peer_data, buf, 1); + } else { + ret = 0; +- goto out; + } ++ goto out; + } + + /* There is a conflict. Check if the current node is +@@ -1930,50 +1944,176 @@ glusterd_compare_and_update_snap (dict_t *peer_data, int32_t snap_count, + * And local node isn't. Hence remove local node's + * data and accept peer data + */ +- + gf_msg_debug (this->name, 0, "Peer hosts bricks for conflicting " + "snap(%s). Removing local data. 
Accepting peer data.", + peer_snap_name); +- remove_lvm = _gf_true; ++ snprintf (buf, sizeof(buf), "%s.remove_lvm", prefix); ++ ret = dict_set_uint32 (peer_data, buf, 1); ++ snprintf (buf, sizeof(buf), "%s.remove_my_data", ++ prefix); ++ ret = dict_set_uint32 (peer_data, buf, 1); ++ snprintf (buf, sizeof(buf), "%s.accept_peer_data", prefix); ++ ret = dict_set_uint32 (peer_data, buf, 1); + +-remove_my_data: ++out: ++ gf_msg_trace (this->name, 0, "Returning %d", ret); ++ return ret; ++} + +- dict = dict_new(); +- if (!dict) { +- gf_msg (this->name, GF_LOG_ERROR, 0, +- GD_MSG_DICT_CREATE_FAIL, +- "Unable to create dict"); +- ret = -1; +- goto out; ++int32_t ++glusterd_update_snaps_synctask (void *opaque) ++{ ++ int32_t ret = -1; ++ int32_t snap_count = 0; ++ int i = 1; ++ xlator_t *this = NULL; ++ dict_t *peer_data = NULL; ++ char buf[NAME_MAX] = ""; ++ char prefix[NAME_MAX] = ""; ++ char *peer_snap_name = NULL; ++ char *peer_snap_id = NULL; ++ char *peername = NULL; ++ gf_boolean_t remove_lvm = _gf_false; ++ gf_boolean_t remove_my_data = _gf_false; ++ gf_boolean_t accept_peer_data = _gf_false; ++ int32_t val = 0; ++ glusterd_snap_t *snap = NULL; ++ dict_t *dict = NULL; ++ glusterd_conf_t *conf = NULL; ++ ++ this = THIS; ++ GF_ASSERT (this); ++ ++ conf = this->private; ++ GF_ASSERT (conf); ++ ++ peer_data = (dict_t *)opaque; ++ GF_ASSERT (peer_data); ++ ++ synclock_lock (&conf->big_lock); ++ ++ while (conf->restart_bricks) { ++ synclock_unlock (&conf->big_lock); ++ sleep (2); ++ synclock_lock (&conf->big_lock); + } ++ conf->restart_bricks = _gf_true; + +- ret = glusterd_snap_remove (dict, snap, remove_lvm, _gf_false, +- _gf_false); ++ ret = dict_get_int32 (peer_data, "snap_count", &snap_count); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, +- GD_MSG_SNAP_REMOVE_FAIL, +- "Failed to remove snap %s", snap->snapname); ++ GD_MSG_DICT_GET_FAILED, "Failed to fetch snap_count"); + goto out; + } +- +-accept_peer_data: +- +- /* Accept Peer Data */ +- ret = glusterd_import_friend_snap (peer_data, snap_count, +- peer_snap_name, peer_snap_id); ++ ret = dict_get_str (peer_data, "peername", &peername); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, +- GD_MSG_SNAP_IMPORT_FAIL, +- "Failed to import snap %s from peer %s", +- peer_snap_name, peername); ++ GD_MSG_DICT_GET_FAILED, "Failed to fetch peername"); + goto out; + } + ++ for (i = 1; i <= snap_count; i++) { ++ snprintf (prefix, sizeof(prefix), "snap%d", i); ++ ++ /* Fetch the peer's snapname */ ++ snprintf (buf, sizeof(buf), "%s.snapname", prefix); ++ ret = dict_get_str (peer_data, buf, &peer_snap_name); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_DICT_GET_FAILED, ++ "Unable to fetch snapname from peer: %s", ++ peername); ++ goto out; ++ } ++ ++ /* Fetch the peer's snap_id */ ++ snprintf (buf, sizeof(buf), "%s.snap_id", prefix); ++ ret = dict_get_str (peer_data, buf, &peer_snap_id); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_DICT_GET_FAILED, ++ "Unable to fetch snap_id from peer: %s", ++ peername); ++ goto out; ++ } ++ ++ /* remove_my_data */ ++ snprintf (buf, sizeof(buf), "%s.remove_my_data", prefix); ++ ret = dict_get_int32 (peer_data, buf, &val); ++ if (val) ++ remove_my_data = _gf_true; ++ else ++ remove_my_data = _gf_false; ++ ++ if (remove_my_data) { ++ snprintf (buf, sizeof(buf), "%s.remove_lvm", prefix); ++ ret = dict_get_int32 (peer_data, buf, &val); ++ if (val) ++ remove_lvm = _gf_true; ++ else ++ remove_lvm = _gf_false; ++ ++ dict = dict_new(); ++ if (!dict) { ++ gf_msg (this->name, 
GF_LOG_ERROR, 0, ++ GD_MSG_DICT_CREATE_FAIL, ++ "Unable to create dict"); ++ ret = -1; ++ goto out; ++ } ++ snap = glusterd_find_snap_by_name (peer_snap_name); ++ if (!snap) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_MISSED_SNAP_PRESENT, ++ "Snapshot %s from peer %s missing on " ++ "localhost", peer_snap_name, ++ peername); ++ ret = -1; ++ goto out; ++ } ++ ++ ret = glusterd_snap_remove (dict, snap, remove_lvm, ++ _gf_false, _gf_false); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_SNAP_REMOVE_FAIL, ++ "Failed to remove snap %s", ++ snap->snapname); ++ goto out; ++ } ++ if (dict) ++ dict_unref (dict); ++ } ++ snprintf (buf, sizeof(buf), "%s.accept_peer_data", prefix); ++ ret = dict_get_int32 (peer_data, buf, &val); ++ if (val) ++ accept_peer_data = _gf_true; ++ else ++ accept_peer_data = _gf_false; ++ ++ if (accept_peer_data) { ++ /* Accept Peer Data */ ++ ret = glusterd_import_friend_snap (peer_data, ++ i, ++ peer_snap_name, ++ peer_snap_id); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_SNAP_IMPORT_FAIL, ++ "Failed to import snap %s from peer %s", ++ peer_snap_name, peername); ++ goto out; ++ } ++ } ++ } ++ + out: ++ if (peer_data) ++ dict_unref (peer_data); + if (dict) + dict_unref (dict); ++ conf->restart_bricks = _gf_false; + +- gf_msg_trace (this->name, 0, "Returning %d", ret); + return ret; + } + +@@ -1988,6 +2128,7 @@ glusterd_compare_friend_snapshots (dict_t *peer_data, char *peername, + int32_t snap_count = 0; + int i = 1; + xlator_t *this = NULL; ++ dict_t *peer_data_copy = NULL; + + this = THIS; + GF_ASSERT (this); +@@ -2003,8 +2144,7 @@ glusterd_compare_friend_snapshots (dict_t *peer_data, char *peername, + + for (i = 1; i <= snap_count; i++) { + /* Compare one snapshot from peer_data at a time */ +- ret = glusterd_compare_and_update_snap (peer_data, i, peername, +- peerid); ++ ret = glusterd_compare_snap (peer_data, i, peername, peerid); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + GD_MSG_SNAPSHOT_OP_FAILED, +@@ -2013,6 +2153,18 @@ glusterd_compare_friend_snapshots (dict_t *peer_data, char *peername, + goto out; + } + } ++ /* Update the snaps at one go */ ++ peer_data_copy = dict_copy_with_ref (peer_data, NULL); ++ ret = dict_set_str (peer_data_copy, "peername", peername); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, ++ "Failed to set peername into the dict"); ++ if (peer_data_copy) ++ dict_unref (peer_data_copy); ++ goto out; ++ } ++ glusterd_launch_synctask (glusterd_update_snaps_synctask, ++ peer_data_copy); + + out: + gf_msg_trace (this->name, 0, "Returning %d", ret); +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 59ef282..a04ed99 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -3378,6 +3378,14 @@ glusterd_compare_friend_volume (dict_t *peer_data, int32_t count, + *status = GLUSTERD_VOL_COMP_SCS; + + out: ++ memset (key, 0, sizeof (key)); ++ snprintf (key, sizeof (key), "volume%d.update", count); ++ ++ if (*status == GLUSTERD_VOL_COMP_UPDATE_REQ) { ++ ret = dict_set_int32 (peer_data, key, 1); ++ } else { ++ ret = dict_set_int32 (peer_data, key, 0); ++ } + if (*status == GLUSTERD_VOL_COMP_RJT) { + gf_event (EVENT_COMPARE_FRIEND_VOLUME_FAILED, "volume=%s", + volinfo->volname); +@@ -3450,11 +3458,10 @@ glusterd_spawn_daemons (void *opaque) + int ret = -1; + + synclock_lock (&conf->big_lock); +- glusterd_restart_bricks (conf); ++ glusterd_restart_bricks (); + 
glusterd_restart_gsyncds (conf); + glusterd_restart_rebalance (conf); + ret = glusterd_snapdsvc_restart (); +- + return ret; + } + +@@ -4224,20 +4231,35 @@ out: + int32_t + glusterd_volume_disconnect_all_bricks (glusterd_volinfo_t *volinfo) + { +- int ret = 0; +- glusterd_brickinfo_t *brickinfo = NULL; ++ int ret = 0; ++ glusterd_brickinfo_t *brickinfo = NULL; ++ glusterd_brick_proc_t *brick_proc = NULL; ++ int brick_count = 0; ++ + GF_ASSERT (volinfo); + + cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { + if (glusterd_is_brick_started (brickinfo)) { +- ret = glusterd_brick_disconnect (brickinfo); +- if (ret) { +- gf_msg ("glusterd", GF_LOG_ERROR, 0, +- GD_MSD_BRICK_DISCONNECT_FAIL, +- "Failed to " +- "disconnect %s:%s", brickinfo->hostname, +- brickinfo->path); +- break; ++ /* If brick multiplexing is enabled then we can't ++ * blindly set brickinfo->rpc to NULL as it might impact ++ * the other attached bricks. ++ */ ++ ret = glusterd_brick_proc_for_port (brickinfo->port, ++ &brick_proc); ++ if (!ret) { ++ brick_count = brick_proc->brick_count; ++ } ++ if (!is_brick_mx_enabled () || brick_count == 0) { ++ ret = glusterd_brick_disconnect (brickinfo); ++ if (ret) { ++ gf_msg ("glusterd", GF_LOG_ERROR, 0, ++ GD_MSD_BRICK_DISCONNECT_FAIL, ++ "Failed to " ++ "disconnect %s:%s", ++ brickinfo->hostname, ++ brickinfo->path); ++ break; ++ } + } + } + } +@@ -4477,7 +4499,7 @@ out: + } + + int32_t +-glusterd_import_friend_volume (dict_t *peer_data, size_t count) ++glusterd_import_friend_volume (dict_t *peer_data, int count) + { + + int32_t ret = -1; +@@ -4486,6 +4508,8 @@ glusterd_import_friend_volume (dict_t *peer_data, size_t count) + glusterd_volinfo_t *old_volinfo = NULL; + glusterd_volinfo_t *new_volinfo = NULL; + glusterd_svc_t *svc = NULL; ++ int32_t update = 0; ++ char key[512] = {0,}; + + GF_ASSERT (peer_data); + +@@ -4493,6 +4517,15 @@ glusterd_import_friend_volume (dict_t *peer_data, size_t count) + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); ++ ++ memset (key, 0, sizeof (key)); ++ snprintf (key, sizeof (key), "volume%d.update", count); ++ ret = dict_get_int32 (peer_data, key, &update); ++ if (ret || !update) { ++ /* if update is 0 that means the volume is not imported */ ++ goto out; ++ } ++ + ret = glusterd_import_volinfo (peer_data, count, + &new_volinfo, "volume"); + if (ret) +@@ -4506,6 +4539,14 @@ glusterd_import_friend_volume (dict_t *peer_data, size_t count) + + ret = glusterd_volinfo_find (new_volinfo->volname, &old_volinfo); + if (0 == ret) { ++ if (new_volinfo->version <= old_volinfo->version) { ++ /* When this condition is true, it already means that ++ * the other synctask thread of import volume has ++ * already up to date volume, so just ignore this volume ++ * now ++ */ ++ goto out; ++ } + /* Ref count the old_volinfo such that deleting it doesn't crash + * if its been already in use by other thread + */ +@@ -4536,7 +4577,8 @@ glusterd_import_friend_volume (dict_t *peer_data, size_t count) + } + } + +- ret = glusterd_store_volinfo (new_volinfo, GLUSTERD_VOLINFO_VER_AC_NONE); ++ ret = glusterd_store_volinfo (new_volinfo, ++ GLUSTERD_VOLINFO_VER_AC_NONE); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + GD_MSG_VOLINFO_STORE_FAIL, "Failed to store " +@@ -4564,6 +4606,60 @@ out: + } + + int32_t ++glusterd_import_friend_volumes_synctask (void *opaque) ++{ ++ int32_t ret = -1; ++ int32_t count = 0; ++ int i = 1; ++ xlator_t *this = NULL; ++ glusterd_conf_t *conf = NULL; ++ dict_t *peer_data = NULL; ++ ++ this = THIS; ++ GF_ASSERT 
(this); ++ ++ conf = this->private; ++ GF_ASSERT (conf); ++ ++ peer_data = (dict_t *)opaque; ++ GF_ASSERT (peer_data); ++ ++ ret = dict_get_int32 (peer_data, "count", &count); ++ if (ret) ++ goto out; ++ ++ synclock_lock (&conf->big_lock); ++ ++ /* We need to ensure that importing a volume shouldn't race with an ++ * other thread where as part of restarting glusterd, bricks are ++ * restarted (refer glusterd_restart_bricks ()) ++ */ ++ while (conf->restart_bricks) { ++ synclock_unlock (&conf->big_lock); ++ sleep (2); ++ synclock_lock (&conf->big_lock); ++ } ++ conf->restart_bricks = _gf_true; ++ ++ while (i <= count) { ++ ret = glusterd_import_friend_volume (peer_data, i); ++ if (ret) { ++ conf->restart_bricks = _gf_false; ++ goto out; ++ } ++ i++; ++ } ++ glusterd_svcs_manager (NULL); ++ conf->restart_bricks = _gf_false; ++out: ++ if (peer_data) ++ dict_unref (peer_data); ++ ++ gf_msg_debug ("glusterd", 0, "Returning with %d", ret); ++ return ret; ++} ++ ++int32_t + glusterd_import_friend_volumes (dict_t *peer_data) + { + int32_t ret = -1; +@@ -4702,8 +4798,10 @@ glusterd_import_global_opts (dict_t *friend_data) + * recompute if quorum is met. If quorum is not met bricks are + * not started and those already running are stopped + */ +- if (old_quorum != new_quorum) +- glusterd_restart_bricks (conf); ++ if (old_quorum != new_quorum) { ++ glusterd_launch_synctask (glusterd_restart_bricks, ++ NULL); ++ } + } + + ret = 0; +@@ -4723,6 +4821,7 @@ glusterd_compare_friend_data (dict_t *peer_data, int32_t *status, + gf_boolean_t update = _gf_false; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; ++ dict_t *peer_data_copy = NULL; + + this = THIS; + GF_ASSERT (this); +@@ -4754,18 +4853,23 @@ glusterd_compare_friend_data (dict_t *peer_data, int32_t *status, + goto out; + } + if (GLUSTERD_VOL_COMP_UPDATE_REQ == *status) { +- ret = glusterd_import_friend_volume (peer_data, i); +- if (ret) { +- goto out; +- } + update = _gf_true; +- *status = GLUSTERD_VOL_COMP_NONE; + } + i++; + } + + if (update) { +- glusterd_svcs_manager (NULL); ++ /* Launch the import friend volume as a separate synctask as it ++ * has to trigger start bricks where we may need to wait for the ++ * first brick to come up before attaching the subsequent bricks ++ * in case brick multiplexing is enabled ++ */ ++ peer_data_copy = dict_copy_with_ref (peer_data, NULL); ++ glusterd_launch_synctask ++ (glusterd_import_friend_volumes_synctask, ++ peer_data_copy); ++ if (ret) ++ goto out; + } + + out: +@@ -5897,7 +6001,7 @@ out: + } + + int +-glusterd_restart_bricks (glusterd_conf_t *conf) ++glusterd_restart_bricks (void *opaque) + { + int ret = 0; + glusterd_volinfo_t *volinfo = NULL; +@@ -5905,6 +6009,7 @@ glusterd_restart_bricks (glusterd_conf_t *conf) + glusterd_snap_t *snap = NULL; + gf_boolean_t start_svcs = _gf_false; + xlator_t *this = NULL; ++ glusterd_conf_t *conf = NULL; + int active_count = 0; + int quorum_count = 0; + gf_boolean_t node_quorum = _gf_false; +@@ -5915,6 +6020,17 @@ glusterd_restart_bricks (glusterd_conf_t *conf) + conf = this->private; + GF_VALIDATE_OR_GOTO (this->name, conf, return_block); + ++ /* We need to ensure that restarting the bricks during glusterd restart ++ * shouldn't race with the import volume thread (refer ++ * glusterd_compare_friend_data ()) ++ */ ++ while (conf->restart_bricks) { ++ synclock_unlock (&conf->big_lock); ++ sleep (2); ++ synclock_lock (&conf->big_lock); ++ } ++ conf->restart_bricks = _gf_true; ++ + ++(conf->blockers); + ret = glusterd_get_quorum_cluster_counts (this, 
&active_count, + &quorum_count); +@@ -5925,8 +6041,9 @@ glusterd_restart_bricks (glusterd_conf_t *conf) + node_quorum = _gf_true; + + cds_list_for_each_entry (volinfo, &conf->volumes, vol_list) { +- if (volinfo->status != GLUSTERD_STATUS_STARTED) ++ if (volinfo->status != GLUSTERD_STATUS_STARTED) { + continue; ++ } + gf_msg_debug (this->name, 0, "starting the volume %s", + volinfo->volname); + +@@ -6033,6 +6150,7 @@ glusterd_restart_bricks (glusterd_conf_t *conf) + out: + --(conf->blockers); + conf->restart_done = _gf_true; ++ conf->restart_bricks = _gf_false; + + return_block: + return ret; +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h +index adc3cb1..7a5bfd9 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.h ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.h +@@ -243,6 +243,10 @@ glusterd_pending_node_put_rpc (glusterd_pending_node_t *pending_node); + int + glusterd_remote_hostname_get (rpcsvc_request_t *req, + char *remote_host, int len); ++ ++int32_t ++glusterd_import_friend_volumes_synctask (void *opaque); ++ + int32_t + glusterd_import_friend_volumes (dict_t *peer_data); + void +diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h +index d8a0a6f..b94ccc9 100644 +--- a/xlators/mgmt/glusterd/src/glusterd.h ++++ b/xlators/mgmt/glusterd/src/glusterd.h +@@ -190,6 +190,7 @@ typedef struct { + int32_t workers; + uint32_t blockers; + uint32_t mgmt_v3_lock_timeout; ++ gf_boolean_t restart_bricks; + } glusterd_conf_t; + + +@@ -1033,7 +1034,7 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, + dict_t *volumes, int count); + + int +-glusterd_restart_bricks (glusterd_conf_t *conf); ++glusterd_restart_bricks (); + + int32_t + glusterd_volume_txn (rpcsvc_request_t *req, char *volname, int flags, +-- +1.8.3.1 + diff --git a/SOURCES/0649-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch b/SOURCES/0649-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch new file mode 100644 index 0000000..a3d777f --- /dev/null +++ b/SOURCES/0649-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch @@ -0,0 +1,262 @@ +From c1ce1c64f1d18cadeaa4f67fc9b9557b6a0dc390 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Thu, 15 Mar 2018 09:08:21 +0530 +Subject: [PATCH 649/649] Revert "geo-rep: Remove lazy umount and use mount + namespaces" + +This reverts commit fa2086eaa9545b472acd3fcc07be776d9f2cb38a. 
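An aside on the restart/import coordination in the glusterd hunks above: the locking pattern is easier to see in isolation. Below is a minimal Python sketch with hypothetical names standing in for conf->big_lock and conf->restart_bricks; it is illustrative only, not the glusterd C code.

    import threading
    import time

    big_lock = threading.Lock()      # stands in for glusterd's conf->big_lock
    restart_in_progress = False      # stands in for conf->restart_bricks

    def run_exclusively(task):
        # Same shape as the patch: poll the flag under the big lock,
        # dropping the lock while sleeping so the other side can progress.
        global restart_in_progress
        big_lock.acquire()
        while restart_in_progress:
            big_lock.release()
            time.sleep(2)
            big_lock.acquire()
        restart_in_progress = True
        try:
            task()
        finally:
            restart_in_progress = False
            big_lock.release()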
+ +BUG: 1556680 +Change-Id: Id099d32902afadaf2039dc3486eff3c287397839 +Reviewed-on: https://code.engineering.redhat.com/gerrit/132725 +Reviewed-by: Atin Mukherjee +Tested-by: RHGS Build Bot +--- + geo-replication/syncdaemon/gconf.py | 3 --- + geo-replication/syncdaemon/gsyncd.py | 14 +++++------- + geo-replication/syncdaemon/monitor.py | 38 ++++++++++++-------------------- + geo-replication/syncdaemon/resource.py | 16 ++------------ + geo-replication/syncdaemon/syncdutils.py | 18 +-------------- + glusterfs.spec.in | 4 ---- + 6 files changed, 23 insertions(+), 70 deletions(-) + +diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py +index 2280f44..97395b4 100644 +--- a/geo-replication/syncdaemon/gconf.py ++++ b/geo-replication/syncdaemon/gconf.py +@@ -28,8 +28,5 @@ class GConf(object): + active_earlier = False + passive_earlier = False + mgmt_lock_fd = None +- mountbroker = False +- mount_point = None +- mbr_umount_cmd = [] + + gconf = GConf() +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index 96256cf..f9471e4 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -269,8 +269,6 @@ def main_i(): + type=str, action='callback', callback=store_abs) + op.add_option('--georep-session-working-dir', metavar='STATF', + type=str, action='callback', callback=store_abs) +- op.add_option('--access-mount', default=False, action='store_true') +- op.add_option('--slave-access-mount', default=False, action='store_true') + op.add_option('--ignore-deletes', default=False, action='store_true') + op.add_option('--isolated-slave', default=False, action='store_true') + op.add_option('--use-rsync-xattrs', default=False, action='store_true') +@@ -416,7 +414,7 @@ def main_i(): + o.get_opt_string() not in ('--version', '--help'))] + remote_tunables = ['listen', 'go_daemon', 'timeout', + 'session_owner', 'config_file', 'use_rsync_xattrs', +- 'local_id', 'local_node', 'slave_access_mount'] ++ 'local_id', 'local_node'] + rq_remote_tunables = {'listen': True} + + # precedence for sources of values: 1) commandline, 2) cfg file, 3) +@@ -750,15 +748,15 @@ def main_i(): + else: + log_file = gconf.log_file + if be_monitor: +- gconf.label = 'monitor' ++ label = 'monitor' + elif be_agent: +- gconf.label = gconf.local_path ++ label = gconf.local_path + elif remote: + # master +- gconf.label = gconf.local_path ++ label = gconf.local_path + else: +- gconf.label = 'slave' +- startup(go_daemon=go_daemon, log_file=log_file, label=gconf.label) ++ label = 'slave' ++ startup(go_daemon=go_daemon, log_file=log_file, label=label) + resource.Popen.init_errhandler() + + if be_agent: +diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py +index 087a202..dc0211e 100644 +--- a/geo-replication/syncdaemon/monitor.py ++++ b/geo-replication/syncdaemon/monitor.py +@@ -24,7 +24,7 @@ import random + from gconf import gconf + from syncdutils import select, waitpid, errno_wrap + from syncdutils import set_term_handler, is_host_local, GsyncdError +-from syncdutils import escape, Thread, finalize, memoize, boolify ++from syncdutils import escape, Thread, finalize, memoize + from syncdutils import gf_event, EVENT_GEOREP_FAULTY + + from gsyncdstatus import GeorepStatus, set_monitor_status +@@ -301,29 +301,19 @@ class Monitor(object): + os.close(pr) + os.close(ra) + os.close(wa) +- args_to_worker = argv + ['--feedback-fd', str(pw), +- '--local-path', w[0]['dir'], +- '--local-node', w[0]['host'], +- 
'--local-node-id', +- w[0]['uuid'], +- '--local-id', +- '.' + escape(w[0]['dir']), +- '--rpc-fd', +- ','.join([str(rw), str(ww), +- str(ra), str(wa)]), +- '--subvol-num', str(w[2])] +- +- if w[3]: +- args_to_worker.append('--is-hottier') +- args_to_worker += ['--resource-remote', remote_host] +- +- access_mount = boolify(gconf.access_mount) +- if access_mount: +- os.execv(sys.executable, args_to_worker) +- else: +- unshare_cmd = ['unshare', '-m', '--propagation', 'private'] +- cmd = unshare_cmd + args_to_worker +- os.execvp("unshare", cmd) ++ os.execv(sys.executable, argv + ['--feedback-fd', str(pw), ++ '--local-path', w[0]['dir'], ++ '--local-node', w[0]['host'], ++ '--local-node-id', ++ w[0]['uuid'], ++ '--local-id', ++ '.' + escape(w[0]['dir']), ++ '--rpc-fd', ++ ','.join([str(rw), str(ww), ++ str(ra), str(wa)]), ++ '--subvol-num', str(w[2])] + ++ (['--is-hottier'] if w[3] else []) + ++ ['--resource-remote', remote_host]) + + cpids.add(cpid) + agents.add(apid) +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 39d537b..943e3ec 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -989,8 +989,6 @@ class SlaveRemote(object): + extra_opts += ['--local-node', ln] + if boolify(gconf.use_rsync_xattrs): + extra_opts.append('--use-rsync-xattrs') +- if boolify(gconf.slave_access_mount): +- extra_opts.append('--slave-access-mount') + po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + + ['-N', '--listen', '--timeout', str(gconf.timeout), + slave], +@@ -1260,7 +1258,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + def __init__(self, params): + self.params = params + self.mntpt = None +- self.umount_cmd = [] + + @classmethod + def get_glusterprog(cls): +@@ -1351,16 +1348,13 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + assert(mntdata[-1] == '\0') + mntpt = mntdata[:-1] + assert(mntpt) +- if mounted and gconf.label == 'slave' \ +- and not boolify(gconf.slave_access_mount): ++ if mounted: + po = self.umount_l(mntpt) + po.terminate_geterr(fail_on_err=False) + if po.returncode != 0: + po.errlog() + rv = po.returncode +- if gconf.label == 'slave' \ +- and not boolify(gconf.slave_access_mount): +- self.cleanup_mntpt(mntpt) ++ self.cleanup_mntpt(mntpt) + except: + logging.exception('mount cleanup failure:') + rv = 200 +@@ -1380,7 +1374,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def make_mount_argv(self): + self.mntpt = tempfile.mkdtemp(prefix='gsyncd-aux-mount-') +- gconf.mount_point = self.mntpt + return [self.get_glusterprog()] + \ + ['--' + p for p in self.params] + [self.mntpt] + +@@ -1412,11 +1405,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def handle_mounter(self, po): + self.mntpt = po.stdout.readline()[:-1] +- gconf.mount_point = self.mntpt +- gconf.mountbroker = True +- self.umount_cmd = self.make_cli_argv() + ['umount'] +- gconf.mbr_umount_cmd = self.umount_cmd +- + po.stdout.close() + sup(self, po) + if po.returncode != 0: +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index 8dc6c96..a22289e 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -16,7 +16,6 @@ import fcntl + import shutil + import logging + import socket +-import subprocess + from threading import Lock, Thread as baseThread + from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED + from errno import EINTR, ENOENT, EPERM, ESTALE, EBUSY, errorcode +@@ 
-189,6 +188,7 @@ def grabpidfile(fname=None, setpid=True): + + final_lock = Lock() + ++ + def finalize(*a, **kw): + """all those messy final steps we go trough upon termination + +@@ -233,22 +233,6 @@ def finalize(*a, **kw): + if sys.exc_info()[0] == OSError: + pass + +- """ Unmount if not done """ +- if gconf.mount_point: +- if gconf.mountbroker: +- umount_cmd = gconf.mbr_umount_cmd + [gconf.mount_point, 'lazy'] +- else: +- umount_cmd = ['umount', '-l', gconf.mount_point] +- p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE) +- _, errdata = p0.communicate() +- if p0.returncode == 0: +- try: +- os.rmdir(gconf.mount_point) +- except OSError: +- pass +- else: +- pass +- + if gconf.log_exit: + logging.info("exiting.") + sys.stdout.flush() +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index b7bfcac..fc9125b 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -439,7 +439,6 @@ Requires: %{name}%{?_isa} = %{version}-%{release} + Requires: %{name}-server%{?_isa} = %{version}-%{release} + Requires: python python-ctypes + Requires: rsync +-Requires: util-linux + + %description geo-replication + GlusterFS is a distributed file-system capable of scaling to several +@@ -2068,9 +2067,6 @@ fi + %endif + + %changelog +-* Thu Feb 22 2018 Kotresh HR +-- Added util-linux as dependency to georeplication rpm (#1544382) +- + * Wed Dec 20 2017 Milind Changire + - Remove ExclusiveArch directive to help building on non-x86_64 arches (#1527772) + +-- +1.8.3.1 + diff --git a/SOURCES/0650-glusterd-snapshot-fix-the-compare-snap-logic.patch b/SOURCES/0650-glusterd-snapshot-fix-the-compare-snap-logic.patch new file mode 100644 index 0000000..66e0cc7 --- /dev/null +++ b/SOURCES/0650-glusterd-snapshot-fix-the-compare-snap-logic.patch @@ -0,0 +1,41 @@ +From af9022dc13863965feb5b6a3b30d817ef3d94216 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Wed, 21 Mar 2018 09:08:15 +0530 +Subject: [PATCH 650/650] glusterd/snapshot : fix the compare snap logic + +In one particular case in commit cb0339f, after removing the old snap +the code wasn't writing the new snap version, and this resulted in one +of the tests failing spuriously.
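A minimal sketch of what this fix amounts to, using a plain Python dict to stand in for the peer_data dict_t (the key names are the real ones from the hunk below; everything else is illustrative):

    def mark_stale_snap(peer_data, prefix):
        # With the fix, a mismatching snap requests both actions from the
        # peer exchange: drop the local copy and accept the peer's data.
        # The second key is the one the old code forgot to set.
        peer_data["%s.remove_my_data" % prefix] = 1
        peer_data["%s.accept_peer_data" % prefix] = 1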
+ +> upstream mainline patch : https://review.gluster.org/#/c/19536/ + +>BUG: 1540607 +>Change-Id: I3b737754edb0c7aba93ca1f149f2ffe16f3f48f4 +>Signed-off-by: Atin Mukherjee + +BUG: 1556670 +Change-Id: I7a2448c8f8cf7efdb668474cb9df4c7725b6b36b +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/133381 +Tested-by: RHGS Build Bot +--- + xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c +index e32fb29..680802b 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c +@@ -1889,6 +1889,9 @@ glusterd_compare_snap (dict_t *peer_data, int32_t snap_count, + snprintf (buf, sizeof(buf), "%s.remove_my_data", + prefix); + ret = dict_set_uint32 (peer_data, buf, 1); ++ snprintf (buf, sizeof(buf), "%s.accept_peer_data", ++ prefix); ++ ret = dict_set_uint32 (peer_data, buf, 1); + } else { + ret = 0; + } +-- +1.8.3.1 + diff --git a/SOURCES/0651-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch b/SOURCES/0651-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch new file mode 100644 index 0000000..a677ca7 --- /dev/null +++ b/SOURCES/0651-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch @@ -0,0 +1,270 @@ +From 7bf9b8de3e59c3d31edfe5284fe283708d6b4ea0 Mon Sep 17 00:00:00 2001 +From: Kotresh HR +Date: Thu, 15 Feb 2018 01:46:29 -0500 +Subject: [PATCH 651/651] geo-rep: Remove lazy umount and use mount namespaces + +Lazy umounting the master volume by the worker causes +issues with rsync's usage of getcwd. Hence, removing +the lazy umount and using a private mount namespace +instead. On the slave, the lazy umount is retained, as +we can't use a private namespace in a non-root geo-rep +setup because gsyncd is spawned as a non-privileged user.
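The mechanism shows up in the monitor.py hunk further down; as a standalone Python sketch (same unshare(1) invocation as the patch, with simplified argument handling):

    import os
    import sys

    def spawn_worker(args_to_worker, access_mount=False):
        # access-mount keeps the worker in the shared mount namespace so
        # the aux mount stays visible to the admin; otherwise the worker
        # runs under unshare(1) with a private namespace, and its mounts
        # vanish automatically when the worker exits.
        if access_mount:
            os.execv(sys.executable, args_to_worker)
        else:
            cmd = ['unshare', '-m', '--propagation', 'private'] + args_to_worker
            os.execvp('unshare', cmd)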
+ +Backport of https://review.gluster.org/#/c/19544/ + +BUG: 1547931 +Change-Id: I0ef69639708f123b633f236d26b5ce8c336a4302 +Signed-off-by: Kotresh HR +Reviewed-on: https://code.engineering.redhat.com/gerrit/133551 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + geo-replication/syncdaemon/gconf.py | 3 +++ + geo-replication/syncdaemon/gsyncd.py | 14 +++++++----- + geo-replication/syncdaemon/monitor.py | 38 ++++++++++++++++++++------------ + geo-replication/syncdaemon/resource.py | 16 ++++++++++++-- + geo-replication/syncdaemon/syncdutils.py | 18 ++++++++++++++- + glusterfs.spec.in | 4 ++++ + 6 files changed, 70 insertions(+), 23 deletions(-) + +diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py +index 97395b4..2280f44 100644 +--- a/geo-replication/syncdaemon/gconf.py ++++ b/geo-replication/syncdaemon/gconf.py +@@ -28,5 +28,8 @@ class GConf(object): + active_earlier = False + passive_earlier = False + mgmt_lock_fd = None ++ mountbroker = False ++ mount_point = None ++ mbr_umount_cmd = [] + + gconf = GConf() +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index f9471e4..96256cf 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -269,6 +269,8 @@ def main_i(): + type=str, action='callback', callback=store_abs) + op.add_option('--georep-session-working-dir', metavar='STATF', + type=str, action='callback', callback=store_abs) ++ op.add_option('--access-mount', default=False, action='store_true') ++ op.add_option('--slave-access-mount', default=False, action='store_true') + op.add_option('--ignore-deletes', default=False, action='store_true') + op.add_option('--isolated-slave', default=False, action='store_true') + op.add_option('--use-rsync-xattrs', default=False, action='store_true') +@@ -414,7 +416,7 @@ def main_i(): + o.get_opt_string() not in ('--version', '--help'))] + remote_tunables = ['listen', 'go_daemon', 'timeout', + 'session_owner', 'config_file', 'use_rsync_xattrs', +- 'local_id', 'local_node'] ++ 'local_id', 'local_node', 'slave_access_mount'] + rq_remote_tunables = {'listen': True} + + # precedence for sources of values: 1) commandline, 2) cfg file, 3) +@@ -748,15 +750,15 @@ def main_i(): + else: + log_file = gconf.log_file + if be_monitor: +- label = 'monitor' ++ gconf.label = 'monitor' + elif be_agent: +- label = gconf.local_path ++ gconf.label = gconf.local_path + elif remote: + # master +- label = gconf.local_path ++ gconf.label = gconf.local_path + else: +- label = 'slave' +- startup(go_daemon=go_daemon, log_file=log_file, label=label) ++ gconf.label = 'slave' ++ startup(go_daemon=go_daemon, log_file=log_file, label=gconf.label) + resource.Popen.init_errhandler() + + if be_agent: +diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py +index dc0211e..087a202 100644 +--- a/geo-replication/syncdaemon/monitor.py ++++ b/geo-replication/syncdaemon/monitor.py +@@ -24,7 +24,7 @@ import random + from gconf import gconf + from syncdutils import select, waitpid, errno_wrap + from syncdutils import set_term_handler, is_host_local, GsyncdError +-from syncdutils import escape, Thread, finalize, memoize ++from syncdutils import escape, Thread, finalize, memoize, boolify + from syncdutils import gf_event, EVENT_GEOREP_FAULTY + + from gsyncdstatus import GeorepStatus, set_monitor_status +@@ -301,19 +301,29 @@ class Monitor(object): + os.close(pr) + os.close(ra) + os.close(wa) +- os.execv(sys.executable, argv + 
['--feedback-fd', str(pw), +- '--local-path', w[0]['dir'], +- '--local-node', w[0]['host'], +- '--local-node-id', +- w[0]['uuid'], +- '--local-id', +- '.' + escape(w[0]['dir']), +- '--rpc-fd', +- ','.join([str(rw), str(ww), +- str(ra), str(wa)]), +- '--subvol-num', str(w[2])] + +- (['--is-hottier'] if w[3] else []) + +- ['--resource-remote', remote_host]) ++ args_to_worker = argv + ['--feedback-fd', str(pw), ++ '--local-path', w[0]['dir'], ++ '--local-node', w[0]['host'], ++ '--local-node-id', ++ w[0]['uuid'], ++ '--local-id', ++ '.' + escape(w[0]['dir']), ++ '--rpc-fd', ++ ','.join([str(rw), str(ww), ++ str(ra), str(wa)]), ++ '--subvol-num', str(w[2])] ++ ++ if w[3]: ++ args_to_worker.append('--is-hottier') ++ args_to_worker += ['--resource-remote', remote_host] ++ ++ access_mount = boolify(gconf.access_mount) ++ if access_mount: ++ os.execv(sys.executable, args_to_worker) ++ else: ++ unshare_cmd = ['unshare', '-m', '--propagation', 'private'] ++ cmd = unshare_cmd + args_to_worker ++ os.execvp("unshare", cmd) + + cpids.add(cpid) + agents.add(apid) +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 943e3ec..39d537b 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -989,6 +989,8 @@ class SlaveRemote(object): + extra_opts += ['--local-node', ln] + if boolify(gconf.use_rsync_xattrs): + extra_opts.append('--use-rsync-xattrs') ++ if boolify(gconf.slave_access_mount): ++ extra_opts.append('--slave-access-mount') + po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + + ['-N', '--listen', '--timeout', str(gconf.timeout), + slave], +@@ -1258,6 +1260,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + def __init__(self, params): + self.params = params + self.mntpt = None ++ self.umount_cmd = [] + + @classmethod + def get_glusterprog(cls): +@@ -1348,13 +1351,16 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + assert(mntdata[-1] == '\0') + mntpt = mntdata[:-1] + assert(mntpt) +- if mounted: ++ if mounted and gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): + po = self.umount_l(mntpt) + po.terminate_geterr(fail_on_err=False) + if po.returncode != 0: + po.errlog() + rv = po.returncode +- self.cleanup_mntpt(mntpt) ++ if gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): ++ self.cleanup_mntpt(mntpt) + except: + logging.exception('mount cleanup failure:') + rv = 200 +@@ -1374,6 +1380,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def make_mount_argv(self): + self.mntpt = tempfile.mkdtemp(prefix='gsyncd-aux-mount-') ++ gconf.mount_point = self.mntpt + return [self.get_glusterprog()] + \ + ['--' + p for p in self.params] + [self.mntpt] + +@@ -1405,6 +1412,11 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def handle_mounter(self, po): + self.mntpt = po.stdout.readline()[:-1] ++ gconf.mount_point = self.mntpt ++ gconf.mountbroker = True ++ self.umount_cmd = self.make_cli_argv() + ['umount'] ++ gconf.mbr_umount_cmd = self.umount_cmd ++ + po.stdout.close() + sup(self, po) + if po.returncode != 0: +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index a22289e..8dc6c96 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -16,6 +16,7 @@ import fcntl + import shutil + import logging + import socket ++import subprocess + from threading import Lock, Thread as baseThread + from errno import EACCES, EAGAIN, EPIPE, 
ENOTCONN, ECONNABORTED + from errno import EINTR, ENOENT, EPERM, ESTALE, EBUSY, errorcode +@@ -188,7 +189,6 @@ def grabpidfile(fname=None, setpid=True): + + final_lock = Lock() + +- + def finalize(*a, **kw): + """all those messy final steps we go trough upon termination + +@@ -233,6 +233,22 @@ def finalize(*a, **kw): + if sys.exc_info()[0] == OSError: + pass + ++ """ Unmount if not done """ ++ if gconf.mount_point: ++ if gconf.mountbroker: ++ umount_cmd = gconf.mbr_umount_cmd + [gconf.mount_point, 'lazy'] ++ else: ++ umount_cmd = ['umount', '-l', gconf.mount_point] ++ p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE) ++ _, errdata = p0.communicate() ++ if p0.returncode == 0: ++ try: ++ os.rmdir(gconf.mount_point) ++ except OSError: ++ pass ++ else: ++ pass ++ + if gconf.log_exit: + logging.info("exiting.") + sys.stdout.flush() +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index fc9125b..d1aa3ea 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -439,6 +439,7 @@ Requires: %{name}%{?_isa} = %{version}-%{release} + Requires: %{name}-server%{?_isa} = %{version}-%{release} + Requires: python python-ctypes + Requires: rsync ++Requires: util-linux + + %description geo-replication + GlusterFS is a distributed file-system capable of scaling to several +@@ -2067,6 +2068,9 @@ fi + %endif + + %changelog ++* Thu Mar 22 2018 Kotresh HR ++- Added util-linux as dependency to georeplication rpm (#1544382) ++ + * Wed Dec 20 2017 Milind Changire + - Remove ExclusiveArch directive to help building on non-x86_64 arches (#1527772) + +-- +1.8.3.1 + diff --git a/SOURCES/0652-shared-storage-Prevent-mounting-shared-storage-from-.patch b/SOURCES/0652-shared-storage-Prevent-mounting-shared-storage-from-.patch new file mode 100644 index 0000000..4f4faa4 --- /dev/null +++ b/SOURCES/0652-shared-storage-Prevent-mounting-shared-storage-from-.patch @@ -0,0 +1,66 @@ +From aee1f5c071719aa36c500b89cb99706ddea67e47 Mon Sep 17 00:00:00 2001 +From: Mohammed Rafi KC +Date: Mon, 26 Mar 2018 20:27:34 +0530 +Subject: [PATCH 652/653] shared storage: Prevent mounting shared storage from + non-trusted client + +gluster shared storage is a volume used for internal storage for +various features including ganesha, geo-rep, snapshot. + +So this volume should not be exposed to the client, as it is +a special volume for internal use. + +This fix won't generate a non-trusted volfile for the shared +storage volume. + +Change-Id: I8ffe30ae99ec05196d75466210b84db311611a4c +BUG: 1559331 +Signed-off-by: Mohammed Rafi KC +Reviewed-on: https://code.engineering.redhat.com/gerrit/134590 +Reviewed-by: Atin Mukherjee +Tested-by: Atin Mukherjee +--- + xlators/mgmt/glusterd/src/glusterd-volgen.c | 21 +++++++++++++++++++++ + 1 file changed, 21 insertions(+) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c +index 859a932..fb7c91f 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c +@@ -5494,6 +5494,7 @@ generate_client_volfiles (glusterd_volinfo_t *volinfo, + int i = 0; + int ret = -1; + char filepath[PATH_MAX] = {0,}; ++ char *volname = NULL; + char *types[] = {NULL, NULL, NULL}; + dict_t *dict = NULL; + xlator_t *this = NULL; + + this = THIS; + ++ volname = volinfo->is_snap_volume ?
++ volinfo->parent_volname : volinfo->volname; ++ ++ ++ if (volname && !strcmp (volname, GLUSTER_SHARED_STORAGE) && ++ client_type != GF_CLIENT_TRUSTED) { ++ /* ++ * shared storage volume cannot be mounted from non trusted ++ * nodes. So we are not creating volfiles for non-trusted ++ * clients for shared volumes as well as snapshot of shared ++ * volumes. ++ */ ++ ++ ret = 0; ++ gf_msg_debug ("glusterd", 0, "Skipping the non-trusted volfile" ++ "creation for shared storage volume. Volume %s", ++ volname); ++ goto out; ++ } ++ + enumerate_transport_reqs (volinfo->transport_type, types); + dict = dict_new (); + if (!dict) +-- +1.8.3.1 + diff --git a/SOURCES/0653-server-auth-add-option-for-strict-authentication.patch b/SOURCES/0653-server-auth-add-option-for-strict-authentication.patch new file mode 100644 index 0000000..a23854d --- /dev/null +++ b/SOURCES/0653-server-auth-add-option-for-strict-authentication.patch @@ -0,0 +1,441 @@ +From 16d6904d54e7409f95e9e893e131b6bf11f0e4c7 Mon Sep 17 00:00:00 2001 +From: Mohammed Rafi KC +Date: Mon, 2 Apr 2018 12:20:47 +0530 +Subject: [PATCH 653/653] server/auth: add option for strict authentication + +When this option is enabled, we will check for a matching +username and password; if they are not found, the connection +will be rejected. This also does a checksum validation of the +volfile. + +Change-Id: I2ac4f0cfa5b59cc789cc5a265358389b04556b59 +BUG: 1559331 +Signed-off-by: Mohammed Rafi KC +Reviewed-on: https://code.engineering.redhat.com/gerrit/134591 +Reviewed-by: Pranith Kumar Karampuri +Reviewed-by: Mohit Agrawal +Reviewed-by: Atin Mukherjee +Tested-by: Atin Mukherjee +--- + xlators/mgmt/glusterd/src/glusterd-volgen.c | 15 ++++ + xlators/protocol/auth/login/src/login.c | 104 +++++++++++++++++-------- + xlators/protocol/server/src/authenticate.c | 32 ++++++-- + xlators/protocol/server/src/authenticate.h | 6 +- + xlators/protocol/server/src/server-handshake.c | 4 +- + xlators/protocol/server/src/server.c | 20 ++++- + xlators/protocol/server/src/server.h | 4 + + 7 files changed, 136 insertions(+), 49 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c +index fb7c91f..02e8393 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c +@@ -2145,6 +2145,7 @@ brick_graph_add_server (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + char key[1024] = {0}; + char *ssl_user = NULL; + char *value = NULL; ++ char *volname = NULL; + char *address_family_data = NULL; + + if (!graph || !volinfo || !set_dict || !brickinfo) +@@ -2220,6 +2221,20 @@ brick_graph_add_server (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + if (ret) + return -1; + ++ volname = volinfo->is_snap_volume ?
++ volinfo->parent_volname : volinfo->volname; ++ ++ ++ if (volname && !strcmp (volname, GLUSTER_SHARED_STORAGE)) { ++ ++ memset (key, 0, sizeof (key)); ++ snprintf (key, sizeof (key), "strict-auth-accept"); ++ ++ ret = xlator_set_option (xl, key, "true"); ++ if (ret) ++ return -1; ++ } ++ + if (dict_get_str (volinfo->dict, "auth.ssl-allow", &ssl_user) == 0) { + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "auth.login.%s.ssl-allow", +diff --git a/xlators/protocol/auth/login/src/login.c b/xlators/protocol/auth/login/src/login.c +index e799dd2..e918d38 100644 +--- a/xlators/protocol/auth/login/src/login.c ++++ b/xlators/protocol/auth/login/src/login.c +@@ -11,45 +11,18 @@ + #include + #include "authenticate.h" + +-auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) +-{ ++auth_result_t gf_authenticate_user (dict_t *input_params, dict_t *config_params, ++ char *username, char *password, ++ gf_boolean_t using_ssl) { + auth_result_t result = AUTH_DONT_CARE; + int ret = 0; + data_t *allow_user = NULL; +- data_t *username_data = NULL; + data_t *passwd_data = NULL; +- data_t *password_data = NULL; +- char *username = NULL; +- char *password = NULL; + char *brick_name = NULL; + char *searchstr = NULL; + char *username_str = NULL; + char *tmp = NULL; + char *username_cpy = NULL; +- gf_boolean_t using_ssl = _gf_false; +- +- username_data = dict_get (input_params, "ssl-name"); +- if (username_data) { +- gf_log ("auth/login", GF_LOG_INFO, +- "connecting user name: %s", username_data->data); +- using_ssl = _gf_true; +- } +- else { +- username_data = dict_get (input_params, "username"); +- if (!username_data) { +- gf_log ("auth/login", GF_LOG_DEBUG, +- "username not found, returning DONT-CARE"); +- goto out; +- } +- password_data = dict_get (input_params, "password"); +- if (!password_data) { +- gf_log ("auth/login", GF_LOG_WARNING, +- "password not found, returning DONT-CARE"); +- goto out; +- } +- password = data_to_str (password_data); +- } +- username = data_to_str (username_data); + + brick_name = data_to_str (dict_get (input_params, "remote-subvolume")); + if (!brick_name) { +@@ -64,7 +37,8 @@ auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) + if (-1 == ret) { + gf_log ("auth/login", GF_LOG_WARNING, + "asprintf failed while setting search string, " +- "returning DONT-CARE"); ++ "returning AUTH_STRICT_ACCEPT"); ++ result = AUTH_STRICT_ACCEPT; + goto out; + } + +@@ -84,7 +58,7 @@ auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) + * who do provide a valid username and password (in fact that's + * pretty much the only thing we use non-SSL login auth for), + * but they are allowed to connect. It's wrong, but it's not +- * worth changing elsewhere. Therefore, we do the sane thing ++ * worth changing elsewhere. Therefore, we do the same thing + * only for SSL here. + * + * For SSL, if there's a list *you must be on it*. Note that +@@ -92,13 +66,20 @@ auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) + * ssl-allow=* case as well) authorization is effectively + * disabled, though authentication and encryption are still + * active. ++ * ++ * If option strict-auth-accept is enabled, connection ++ * will be rejected if you don't have any matching ++ * username and password (password is only for non-ssl users). 
+ */ + if (using_ssl) { + result = AUTH_REJECT; + } + username_cpy = gf_strdup (allow_user->data); +- if (!username_cpy) ++ if (!username_cpy) { ++ if (!using_ssl) ++ result = AUTH_STRICT_ACCEPT; + goto out; ++ } + + username_str = strtok_r (username_cpy, " ,", &tmp); + +@@ -120,6 +101,7 @@ auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) + if (-1 == ret) { + gf_log ("auth/login", GF_LOG_WARNING, + "asprintf failed while setting search string"); ++ result = AUTH_STRICT_ACCEPT; + goto out; + } + passwd_data = dict_get (config_params, searchstr); +@@ -145,11 +127,65 @@ auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) + } + username_str = strtok_r (NULL, " ,", &tmp); + } ++ } else { ++ result = AUTH_STRICT_ACCEPT; + } +- + out: + GF_FREE (username_cpy); ++ return result; ++} + ++auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) ++{ ++ auth_result_t result = AUTH_DONT_CARE; ++ auth_result_t ssl_result = AUTH_DONT_CARE; ++ data_t *username_data = NULL; ++ data_t *password_data = NULL; ++ char *username = NULL; ++ char *password = NULL; ++ gf_boolean_t using_ssl = _gf_false; ++ ++ username_data = dict_get (input_params, "username"); ++ if (!username_data) { ++ gf_log ("auth/login", GF_LOG_DEBUG, ++ "username not found, returning AUTH_STRICT_ACCEPT"); ++ result = AUTH_STRICT_ACCEPT; ++ goto out; ++ } ++ password_data = dict_get (input_params, "password"); ++ if (!password_data) { ++ gf_log ("auth/login", GF_LOG_WARNING, ++ "password not found, returning AUTH_STRICT_ACCEPT"); ++ result = AUTH_STRICT_ACCEPT; ++ goto out; ++ } ++ password = data_to_str (password_data); ++ username = data_to_str (username_data); ++ ++ result = gf_authenticate_user (input_params, config_params, ++ username, password, using_ssl); ++ ++ username_data = dict_get (input_params, "ssl-name"); ++ if (username_data) { ++ gf_log ("auth/login", GF_LOG_INFO, ++ "connecting user name: %s", username_data->data); ++ username = data_to_str (username_data); ++ using_ssl = _gf_true; ++ ssl_result = gf_authenticate_user (input_params, config_params, ++ username, NULL, using_ssl); ++ if (ssl_result == AUTH_ACCEPT && result != AUTH_ACCEPT) { ++ /* ++ * Here, ssl authentication returns true, but non-ssl ++ * authentication returns differnt result. We are ++ * calling for a strict auth check in this case. 
++ */ ++ result = AUTH_STRICT_ACCEPT; ++ } else { ++ result = ssl_result; ++ } ++ } ++ ++out: + return result; + } + +diff --git a/xlators/protocol/server/src/authenticate.c b/xlators/protocol/server/src/authenticate.c +index c000776..977ed75 100644 +--- a/xlators/protocol/server/src/authenticate.c ++++ b/xlators/protocol/server/src/authenticate.c +@@ -18,7 +18,9 @@ + #include + #include + #include "authenticate.h" ++#include "authenticate.h" + #include "server-messages.h" ++#include "server.h" + + static int + init (dict_t *this, char *key, data_t *value, void *data) +@@ -173,6 +175,7 @@ gf_auth_one_method (dict_t *this, char *key, data_t *value, void *data) + { + gf_auth_args_t *args = data; + auth_handle_t *handle = NULL; ++ int64_t result; + + if (!value) { + return 0; +@@ -183,10 +186,13 @@ gf_auth_one_method (dict_t *this, char *key, data_t *value, void *data) + return 0; + } + +- switch (handle->authenticate (args->iparams, args->cparams)) { ++ result = handle->authenticate (args->iparams, args->cparams); ++ switch (result) { + case AUTH_ACCEPT: +- if (args->result != AUTH_REJECT) { +- args->result = AUTH_ACCEPT; ++ case AUTH_STRICT_ACCEPT: ++ if (args->result != AUTH_REJECT && ++ args->result != AUTH_STRICT_ACCEPT) { ++ args->result = result; + } + /* FALLTHROUGH */ + default: +@@ -198,9 +204,8 @@ gf_auth_one_method (dict_t *this, char *key, data_t *value, void *data) + } + + auth_result_t +-gf_authenticate (dict_t *input_params, +- dict_t *config_params, +- dict_t *auth_modules) ++gf_authenticate (server_conf_t *conf, dict_t *input_params, ++ dict_t *config_params, dict_t *auth_modules) + { + char *name = NULL; + data_t *peerinfo_data = NULL; +@@ -210,9 +215,19 @@ gf_authenticate (dict_t *input_params, + args.cparams = config_params; + args.result = AUTH_DONT_CARE; + ++ GF_VALIDATE_OR_GOTO ("authentication", conf, out); + dict_foreach (auth_modules, gf_auth_one_method, &args); + +- if (AUTH_DONT_CARE == args.result) { ++ switch (args.result) { ++ case AUTH_STRICT_ACCEPT: ++ if (!conf->strict_auth_enabled) { ++ args.result = AUTH_ACCEPT; ++ break; ++ } ++ gf_msg ("auth", GF_LOG_ERROR, 0, PS_MSG_REMOTE_CLIENT_REFUSED, ++ "Authentication is failed due to the strict options."); ++ /* Fallthrough */ ++ case AUTH_DONT_CARE: + peerinfo_data = dict_get (input_params, "peer-info-name"); + + if (peerinfo_data) { +@@ -223,8 +238,9 @@ gf_authenticate (dict_t *input_params, + "no authentication module is interested in " + "accepting remote-client %s", name); + args.result = AUTH_REJECT; ++ /* Fallthrough */ + } +- ++out: + return args.result; + } + +diff --git a/xlators/protocol/server/src/authenticate.h b/xlators/protocol/server/src/authenticate.h +index 3f80231..d5971d3 100644 +--- a/xlators/protocol/server/src/authenticate.h ++++ b/xlators/protocol/server/src/authenticate.h +@@ -25,7 +25,8 @@ + typedef enum { + AUTH_ACCEPT, + AUTH_REJECT, +- AUTH_DONT_CARE ++ AUTH_DONT_CARE, ++ AUTH_STRICT_ACCEPT + } auth_result_t; + + typedef auth_result_t (*auth_fn_t) (dict_t *input_params, +@@ -37,9 +38,6 @@ typedef struct { + volume_opt_list_t *vol_opt; + } auth_handle_t; + +-auth_result_t gf_authenticate (dict_t *input_params, +- dict_t *config_params, +- dict_t *auth_modules); + int32_t gf_auth_init (xlator_t *xl, dict_t *auth_modules); + void gf_auth_fini (dict_t *auth_modules); + +diff --git a/xlators/protocol/server/src/server-handshake.c b/xlators/protocol/server/src/server-handshake.c +index ba5b11a..b67ed47 100644 +--- a/xlators/protocol/server/src/server-handshake.c ++++ 
b/xlators/protocol/server/src/server-handshake.c +@@ -709,7 +709,7 @@ server_setvolume (rpcsvc_request_t *req) + ret = dict_get_str (params, "volfile-key", + &volfile_key); + if (ret) +- gf_msg_debug (this->name, 0, "failed to set " ++ gf_msg_debug (this->name, 0, "failed to get " + "'volfile-key'"); + + ret = _validate_volfile_checksum (this, volfile_key, +@@ -765,7 +765,7 @@ server_setvolume (rpcsvc_request_t *req) + PS_MSG_CLIENT_VERSION_NOT_SET, + "client-version not set, may be of older version"); + +- ret = gf_authenticate (params, config_params, ++ ret = gf_authenticate (conf, params, config_params, + conf->auth_modules); + + if (ret == AUTH_ACCEPT) { +diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c +index 96aa012..f89e743 100644 +--- a/xlators/protocol/server/src/server.c ++++ b/xlators/protocol/server/src/server.c +@@ -866,6 +866,10 @@ do_rpc: + goto out; + } + ++ GF_OPTION_RECONF ("strict-auth-accept", conf->strict_auth_enabled, ++ options, bool, out); ++ ++ + GF_OPTION_RECONF ("dynamic-auth", conf->dync_auth, options, + bool, out); + +@@ -906,7 +910,7 @@ do_rpc: + if (strcmp (xprt_path, auth_path) != 0) { + continue; + } +- ret = gf_authenticate (xprt->clnt_options, ++ ret = gf_authenticate (conf, xprt->clnt_options, + options, + conf->auth_modules); + if (ret == AUTH_ACCEPT) { +@@ -1098,6 +1102,14 @@ init (xlator_t *this) + "Failed to initialize group cache."); + goto out; + } ++ ++ ret = dict_get_str_boolean (this->options, "strict-auth-accept", ++ _gf_false); ++ if (ret == -1) ++ conf->strict_auth_enabled = _gf_false; ++ else ++ conf->strict_auth_enabled = ret; ++ + ret = dict_get_str_boolean (this->options, "dynamic-auth", + _gf_true); + if (ret == -1) +@@ -1738,5 +1750,11 @@ struct volume_options options[] = { + "transport connection immediately in response to " + "*.allow | *.reject volume set options." + }, ++ { .key = {"strict-auth-accept"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .default_value = "off", ++ .description = "strict-auth-accept reject connection with out" ++ "a valid username and password." 
++ }, + { .key = {NULL} }, + }; +diff --git a/xlators/protocol/server/src/server.h b/xlators/protocol/server/src/server.h +index 1be9a38..4c35aaa 100644 +--- a/xlators/protocol/server/src/server.h ++++ b/xlators/protocol/server/src/server.h +@@ -24,6 +24,7 @@ + #include "client_t.h" + #include "gidcache.h" + #include "defaults.h" ++#include "authenticate.h" + + #define DEFAULT_BLOCK_SIZE 4194304 /* 4MB */ + #define DEFAULT_VOLUME_FILE_PATH CONFDIR "/glusterfs.vol" +@@ -111,6 +112,7 @@ struct server_conf { + struct _child_status *child_status; + + gf_lock_t itable_lock; ++ gf_boolean_t strict_auth_enabled; + }; + typedef struct server_conf server_conf_t; + +@@ -243,6 +245,8 @@ serialize_rsp_dirent (gf_dirent_t *entries, gfs3_readdir_rsp *rsp); + + int + serialize_rsp_direntp (gf_dirent_t *entries, gfs3_readdirp_rsp *rsp); ++auth_result_t gf_authenticate (server_conf_t *conf, dict_t *input_params, ++ dict_t *config_params, dict_t *auth_modules); + + server_ctx_t* + server_ctx_get (client_t *client, xlator_t *xlator); +-- +1.8.3.1 + diff --git a/SOURCES/0654-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch b/SOURCES/0654-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch new file mode 100644 index 0000000..597bbdc --- /dev/null +++ b/SOURCES/0654-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch @@ -0,0 +1,267 @@ +From e7436848f4e27c31d6777a6531c62a83cc313aae Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Mon, 16 Apr 2018 20:38:31 +0530 +Subject: [PATCH 654/654] Revert "geo-rep: Remove lazy umount and use mount + namespaces" + +This reverts commit 7bf9b8de3e59c3d31edfe5284fe283708d6b4ea0. This patch +is put up to ensure we don't include this fix in the RHEL6 build, as this +patch breaks geo-rep functionality in RHEL6. Once the RHEL6 build is done +in RHGS 3.3.1-async, we will put this patch back, and then at RHGS 3.4.0 +we'll try to address this scenario for both cases.
+ +Change-Id: If279c0f2b80a78edde874beaf7fb11eb6cab25ef +BUG: 1556680 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/135792 +Tested-by: RHGS Build Bot +Reviewed-by: Kotresh Hiremath Ravishankar +--- + geo-replication/syncdaemon/gconf.py | 3 --- + geo-replication/syncdaemon/gsyncd.py | 14 +++++------- + geo-replication/syncdaemon/monitor.py | 38 ++++++++++++-------------------- + geo-replication/syncdaemon/resource.py | 16 ++------------ + geo-replication/syncdaemon/syncdutils.py | 18 +-------------- + glusterfs.spec.in | 4 ---- + 6 files changed, 23 insertions(+), 70 deletions(-) + +diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py +index 2280f44..97395b4 100644 +--- a/geo-replication/syncdaemon/gconf.py ++++ b/geo-replication/syncdaemon/gconf.py +@@ -28,8 +28,5 @@ class GConf(object): + active_earlier = False + passive_earlier = False + mgmt_lock_fd = None +- mountbroker = False +- mount_point = None +- mbr_umount_cmd = [] + + gconf = GConf() +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index 96256cf..f9471e4 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -269,8 +269,6 @@ def main_i(): + type=str, action='callback', callback=store_abs) + op.add_option('--georep-session-working-dir', metavar='STATF', + type=str, action='callback', callback=store_abs) +- op.add_option('--access-mount', default=False, action='store_true') +- op.add_option('--slave-access-mount', default=False, action='store_true') + op.add_option('--ignore-deletes', default=False, action='store_true') + op.add_option('--isolated-slave', default=False, action='store_true') + op.add_option('--use-rsync-xattrs', default=False, action='store_true') +@@ -416,7 +414,7 @@ def main_i(): + o.get_opt_string() not in ('--version', '--help'))] + remote_tunables = ['listen', 'go_daemon', 'timeout', + 'session_owner', 'config_file', 'use_rsync_xattrs', +- 'local_id', 'local_node', 'slave_access_mount'] ++ 'local_id', 'local_node'] + rq_remote_tunables = {'listen': True} + + # precedence for sources of values: 1) commandline, 2) cfg file, 3) +@@ -750,15 +748,15 @@ def main_i(): + else: + log_file = gconf.log_file + if be_monitor: +- gconf.label = 'monitor' ++ label = 'monitor' + elif be_agent: +- gconf.label = gconf.local_path ++ label = gconf.local_path + elif remote: + # master +- gconf.label = gconf.local_path ++ label = gconf.local_path + else: +- gconf.label = 'slave' +- startup(go_daemon=go_daemon, log_file=log_file, label=gconf.label) ++ label = 'slave' ++ startup(go_daemon=go_daemon, log_file=log_file, label=label) + resource.Popen.init_errhandler() + + if be_agent: +diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py +index 087a202..dc0211e 100644 +--- a/geo-replication/syncdaemon/monitor.py ++++ b/geo-replication/syncdaemon/monitor.py +@@ -24,7 +24,7 @@ import random + from gconf import gconf + from syncdutils import select, waitpid, errno_wrap + from syncdutils import set_term_handler, is_host_local, GsyncdError +-from syncdutils import escape, Thread, finalize, memoize, boolify ++from syncdutils import escape, Thread, finalize, memoize + from syncdutils import gf_event, EVENT_GEOREP_FAULTY + + from gsyncdstatus import GeorepStatus, set_monitor_status +@@ -301,29 +301,19 @@ class Monitor(object): + os.close(pr) + os.close(ra) + os.close(wa) +- args_to_worker = argv + ['--feedback-fd', str(pw), +- '--local-path', 
w[0]['dir'], +- '--local-node', w[0]['host'], +- '--local-node-id', +- w[0]['uuid'], +- '--local-id', +- '.' + escape(w[0]['dir']), +- '--rpc-fd', +- ','.join([str(rw), str(ww), +- str(ra), str(wa)]), +- '--subvol-num', str(w[2])] +- +- if w[3]: +- args_to_worker.append('--is-hottier') +- args_to_worker += ['--resource-remote', remote_host] +- +- access_mount = boolify(gconf.access_mount) +- if access_mount: +- os.execv(sys.executable, args_to_worker) +- else: +- unshare_cmd = ['unshare', '-m', '--propagation', 'private'] +- cmd = unshare_cmd + args_to_worker +- os.execvp("unshare", cmd) ++ os.execv(sys.executable, argv + ['--feedback-fd', str(pw), ++ '--local-path', w[0]['dir'], ++ '--local-node', w[0]['host'], ++ '--local-node-id', ++ w[0]['uuid'], ++ '--local-id', ++ '.' + escape(w[0]['dir']), ++ '--rpc-fd', ++ ','.join([str(rw), str(ww), ++ str(ra), str(wa)]), ++ '--subvol-num', str(w[2])] + ++ (['--is-hottier'] if w[3] else []) + ++ ['--resource-remote', remote_host]) + + cpids.add(cpid) + agents.add(apid) +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 39d537b..943e3ec 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -989,8 +989,6 @@ class SlaveRemote(object): + extra_opts += ['--local-node', ln] + if boolify(gconf.use_rsync_xattrs): + extra_opts.append('--use-rsync-xattrs') +- if boolify(gconf.slave_access_mount): +- extra_opts.append('--slave-access-mount') + po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + + ['-N', '--listen', '--timeout', str(gconf.timeout), + slave], +@@ -1260,7 +1258,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + def __init__(self, params): + self.params = params + self.mntpt = None +- self.umount_cmd = [] + + @classmethod + def get_glusterprog(cls): +@@ -1351,16 +1348,13 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + assert(mntdata[-1] == '\0') + mntpt = mntdata[:-1] + assert(mntpt) +- if mounted and gconf.label == 'slave' \ +- and not boolify(gconf.slave_access_mount): ++ if mounted: + po = self.umount_l(mntpt) + po.terminate_geterr(fail_on_err=False) + if po.returncode != 0: + po.errlog() + rv = po.returncode +- if gconf.label == 'slave' \ +- and not boolify(gconf.slave_access_mount): +- self.cleanup_mntpt(mntpt) ++ self.cleanup_mntpt(mntpt) + except: + logging.exception('mount cleanup failure:') + rv = 200 +@@ -1380,7 +1374,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def make_mount_argv(self): + self.mntpt = tempfile.mkdtemp(prefix='gsyncd-aux-mount-') +- gconf.mount_point = self.mntpt + return [self.get_glusterprog()] + \ + ['--' + p for p in self.params] + [self.mntpt] + +@@ -1412,11 +1405,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def handle_mounter(self, po): + self.mntpt = po.stdout.readline()[:-1] +- gconf.mount_point = self.mntpt +- gconf.mountbroker = True +- self.umount_cmd = self.make_cli_argv() + ['umount'] +- gconf.mbr_umount_cmd = self.umount_cmd +- + po.stdout.close() + sup(self, po) + if po.returncode != 0: +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index 8dc6c96..a22289e 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -16,7 +16,6 @@ import fcntl + import shutil + import logging + import socket +-import subprocess + from threading import Lock, Thread as baseThread + from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED + from errno import 
EINTR, ENOENT, EPERM, ESTALE, EBUSY, errorcode +@@ -189,6 +188,7 @@ def grabpidfile(fname=None, setpid=True): + + final_lock = Lock() + ++ + def finalize(*a, **kw): + """all those messy final steps we go trough upon termination + +@@ -233,22 +233,6 @@ def finalize(*a, **kw): + if sys.exc_info()[0] == OSError: + pass + +- """ Unmount if not done """ +- if gconf.mount_point: +- if gconf.mountbroker: +- umount_cmd = gconf.mbr_umount_cmd + [gconf.mount_point, 'lazy'] +- else: +- umount_cmd = ['umount', '-l', gconf.mount_point] +- p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE) +- _, errdata = p0.communicate() +- if p0.returncode == 0: +- try: +- os.rmdir(gconf.mount_point) +- except OSError: +- pass +- else: +- pass +- + if gconf.log_exit: + logging.info("exiting.") + sys.stdout.flush() +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index d1aa3ea..fc9125b 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -439,7 +439,6 @@ Requires: %{name}%{?_isa} = %{version}-%{release} + Requires: %{name}-server%{?_isa} = %{version}-%{release} + Requires: python python-ctypes + Requires: rsync +-Requires: util-linux + + %description geo-replication + GlusterFS is a distributed file-system capable of scaling to several +@@ -2068,9 +2067,6 @@ fi + %endif + + %changelog +-* Thu Mar 22 2018 Kotresh HR +-- Added util-linux as dependency to georeplication rpm (#1544382) +- + * Wed Dec 20 2017 Milind Changire + - Remove ExclusiveArch directive to help building on non-x86_64 arches (#1527772) + +-- +1.8.3.1 + diff --git a/SOURCES/0655-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch b/SOURCES/0655-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch new file mode 100644 index 0000000..2e2f63c --- /dev/null +++ b/SOURCES/0655-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch @@ -0,0 +1,272 @@ +From 1747de037b7fa7513d5a3755d273002076165784 Mon Sep 17 00:00:00 2001 +From: Kotresh HR +Date: Thu, 15 Feb 2018 01:46:29 -0500 +Subject: [PATCH 655/656] geo-rep: Remove lazy umount and use mount namespaces + +Lazy umounting the master volume by the worker causes +issues with rsync's usage of getcwd. Hence, removing +the lazy umount and using a private mount namespace +instead. On the slave, the lazy umount is retained, as +we can't use a private namespace in a non-root geo-rep +setup because gsyncd is spawned as a non-privileged user. + +Putting this fix back into the branch post the RHEL6 build.
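The slave-side cleanup that is retained is small; here is a standalone Python sketch of the finalize() umount logic from the syncdutils.py hunk below, simplified so the gconf state is passed in as arguments:

    import os
    import subprocess

    def cleanup_aux_mount(mount_point, mbr_umount_cmd=None):
        # Mountbroker setups go through the gluster CLI's umount with the
        # 'lazy' flag; plain setups use umount -l. Either way the temp
        # mount point is removed only if the umount actually succeeded.
        if mbr_umount_cmd:
            umount_cmd = mbr_umount_cmd + [mount_point, 'lazy']
        else:
            umount_cmd = ['umount', '-l', mount_point]
        p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE)
        p0.communicate()
        if p0.returncode == 0:
            try:
                os.rmdir(mount_point)
            except OSError:
                pass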
+ +Backport of https://review.gluster.org/#/c/19544/ + +BUG: 1556680 +Change-Id: Ibb3b6f5b59173e1df555ae506419f6274f4c754f +Signed-off-by: Kotresh HR +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/135797 +--- + geo-replication/syncdaemon/gconf.py | 3 +++ + geo-replication/syncdaemon/gsyncd.py | 14 +++++++----- + geo-replication/syncdaemon/monitor.py | 38 ++++++++++++++++++++------------ + geo-replication/syncdaemon/resource.py | 16 ++++++++++++-- + geo-replication/syncdaemon/syncdutils.py | 18 ++++++++++++++- + glusterfs.spec.in | 4 ++++ + 6 files changed, 70 insertions(+), 23 deletions(-) + +diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py +index 97395b4..2280f44 100644 +--- a/geo-replication/syncdaemon/gconf.py ++++ b/geo-replication/syncdaemon/gconf.py +@@ -28,5 +28,8 @@ class GConf(object): + active_earlier = False + passive_earlier = False + mgmt_lock_fd = None ++ mountbroker = False ++ mount_point = None ++ mbr_umount_cmd = [] + + gconf = GConf() +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index f9471e4..96256cf 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -269,6 +269,8 @@ def main_i(): + type=str, action='callback', callback=store_abs) + op.add_option('--georep-session-working-dir', metavar='STATF', + type=str, action='callback', callback=store_abs) ++ op.add_option('--access-mount', default=False, action='store_true') ++ op.add_option('--slave-access-mount', default=False, action='store_true') + op.add_option('--ignore-deletes', default=False, action='store_true') + op.add_option('--isolated-slave', default=False, action='store_true') + op.add_option('--use-rsync-xattrs', default=False, action='store_true') +@@ -414,7 +416,7 @@ def main_i(): + o.get_opt_string() not in ('--version', '--help'))] + remote_tunables = ['listen', 'go_daemon', 'timeout', + 'session_owner', 'config_file', 'use_rsync_xattrs', +- 'local_id', 'local_node'] ++ 'local_id', 'local_node', 'slave_access_mount'] + rq_remote_tunables = {'listen': True} + + # precedence for sources of values: 1) commandline, 2) cfg file, 3) +@@ -748,15 +750,15 @@ def main_i(): + else: + log_file = gconf.log_file + if be_monitor: +- label = 'monitor' ++ gconf.label = 'monitor' + elif be_agent: +- label = gconf.local_path ++ gconf.label = gconf.local_path + elif remote: + # master +- label = gconf.local_path ++ gconf.label = gconf.local_path + else: +- label = 'slave' +- startup(go_daemon=go_daemon, log_file=log_file, label=label) ++ gconf.label = 'slave' ++ startup(go_daemon=go_daemon, log_file=log_file, label=gconf.label) + resource.Popen.init_errhandler() + + if be_agent: +diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py +index dc0211e..087a202 100644 +--- a/geo-replication/syncdaemon/monitor.py ++++ b/geo-replication/syncdaemon/monitor.py +@@ -24,7 +24,7 @@ import random + from gconf import gconf + from syncdutils import select, waitpid, errno_wrap + from syncdutils import set_term_handler, is_host_local, GsyncdError +-from syncdutils import escape, Thread, finalize, memoize ++from syncdutils import escape, Thread, finalize, memoize, boolify + from syncdutils import gf_event, EVENT_GEOREP_FAULTY + + from gsyncdstatus import GeorepStatus, set_monitor_status +@@ -301,19 +301,29 @@ class Monitor(object): + os.close(pr) + os.close(ra) + os.close(wa) +- os.execv(sys.executable, argv + 
['--feedback-fd', str(pw), +- '--local-path', w[0]['dir'], +- '--local-node', w[0]['host'], +- '--local-node-id', +- w[0]['uuid'], +- '--local-id', +- '.' + escape(w[0]['dir']), +- '--rpc-fd', +- ','.join([str(rw), str(ww), +- str(ra), str(wa)]), +- '--subvol-num', str(w[2])] + +- (['--is-hottier'] if w[3] else []) + +- ['--resource-remote', remote_host]) ++ args_to_worker = argv + ['--feedback-fd', str(pw), ++ '--local-path', w[0]['dir'], ++ '--local-node', w[0]['host'], ++ '--local-node-id', ++ w[0]['uuid'], ++ '--local-id', ++ '.' + escape(w[0]['dir']), ++ '--rpc-fd', ++ ','.join([str(rw), str(ww), ++ str(ra), str(wa)]), ++ '--subvol-num', str(w[2])] ++ ++ if w[3]: ++ args_to_worker.append('--is-hottier') ++ args_to_worker += ['--resource-remote', remote_host] ++ ++ access_mount = boolify(gconf.access_mount) ++ if access_mount: ++ os.execv(sys.executable, args_to_worker) ++ else: ++ unshare_cmd = ['unshare', '-m', '--propagation', 'private'] ++ cmd = unshare_cmd + args_to_worker ++ os.execvp("unshare", cmd) + + cpids.add(cpid) + agents.add(apid) +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 943e3ec..39d537b 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -989,6 +989,8 @@ class SlaveRemote(object): + extra_opts += ['--local-node', ln] + if boolify(gconf.use_rsync_xattrs): + extra_opts.append('--use-rsync-xattrs') ++ if boolify(gconf.slave_access_mount): ++ extra_opts.append('--slave-access-mount') + po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + + ['-N', '--listen', '--timeout', str(gconf.timeout), + slave], +@@ -1258,6 +1260,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + def __init__(self, params): + self.params = params + self.mntpt = None ++ self.umount_cmd = [] + + @classmethod + def get_glusterprog(cls): +@@ -1348,13 +1351,16 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + assert(mntdata[-1] == '\0') + mntpt = mntdata[:-1] + assert(mntpt) +- if mounted: ++ if mounted and gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): + po = self.umount_l(mntpt) + po.terminate_geterr(fail_on_err=False) + if po.returncode != 0: + po.errlog() + rv = po.returncode +- self.cleanup_mntpt(mntpt) ++ if gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): ++ self.cleanup_mntpt(mntpt) + except: + logging.exception('mount cleanup failure:') + rv = 200 +@@ -1374,6 +1380,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def make_mount_argv(self): + self.mntpt = tempfile.mkdtemp(prefix='gsyncd-aux-mount-') ++ gconf.mount_point = self.mntpt + return [self.get_glusterprog()] + \ + ['--' + p for p in self.params] + [self.mntpt] + +@@ -1405,6 +1412,11 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def handle_mounter(self, po): + self.mntpt = po.stdout.readline()[:-1] ++ gconf.mount_point = self.mntpt ++ gconf.mountbroker = True ++ self.umount_cmd = self.make_cli_argv() + ['umount'] ++ gconf.mbr_umount_cmd = self.umount_cmd ++ + po.stdout.close() + sup(self, po) + if po.returncode != 0: +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index a22289e..8dc6c96 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -16,6 +16,7 @@ import fcntl + import shutil + import logging + import socket ++import subprocess + from threading import Lock, Thread as baseThread + from errno import EACCES, EAGAIN, EPIPE, 
ENOTCONN, ECONNABORTED + from errno import EINTR, ENOENT, EPERM, ESTALE, EBUSY, errorcode +@@ -188,7 +189,6 @@ def grabpidfile(fname=None, setpid=True): + + final_lock = Lock() + +- + def finalize(*a, **kw): + """all those messy final steps we go trough upon termination + +@@ -233,6 +233,22 @@ def finalize(*a, **kw): + if sys.exc_info()[0] == OSError: + pass + ++ """ Unmount if not done """ ++ if gconf.mount_point: ++ if gconf.mountbroker: ++ umount_cmd = gconf.mbr_umount_cmd + [gconf.mount_point, 'lazy'] ++ else: ++ umount_cmd = ['umount', '-l', gconf.mount_point] ++ p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE) ++ _, errdata = p0.communicate() ++ if p0.returncode == 0: ++ try: ++ os.rmdir(gconf.mount_point) ++ except OSError: ++ pass ++ else: ++ pass ++ + if gconf.log_exit: + logging.info("exiting.") + sys.stdout.flush() +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index fc9125b..d1aa3ea 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -439,6 +439,7 @@ Requires: %{name}%{?_isa} = %{version}-%{release} + Requires: %{name}-server%{?_isa} = %{version}-%{release} + Requires: python python-ctypes + Requires: rsync ++Requires: util-linux + + %description geo-replication + GlusterFS is a distributed file-system capable of scaling to several +@@ -2067,6 +2068,9 @@ fi + %endif + + %changelog ++* Thu Mar 22 2018 Kotresh HR ++- Added util-linux as dependency to georeplication rpm (#1544382) ++ + * Wed Dec 20 2017 Milind Changire + - Remove ExclusiveArch directive to help building on non-x86_64 arches (#1527772) + +-- +1.8.3.1 + diff --git a/SOURCES/0656-server-auth-fix-regression-in-honouring-auth.allow.patch b/SOURCES/0656-server-auth-fix-regression-in-honouring-auth.allow.patch new file mode 100644 index 0000000..5ec8280 --- /dev/null +++ b/SOURCES/0656-server-auth-fix-regression-in-honouring-auth.allow.patch @@ -0,0 +1,415 @@ +From 8f44b5a73f2c838ed0c498370d525ab702be6da2 Mon Sep 17 00:00:00 2001 +From: Mohammed Rafi KC +Date: Mon, 23 Apr 2018 09:02:09 +0530 +Subject: [PATCH 656/656] server/auth: fix regression in honouring auth.allow + +The patch 16d6904d54e7409f95e9e893e131b6bf11f0e4c7 to fix a +problem with shared storage introduced a regression: auth.allow +is now not honoured. This change fixes the issue with +auth.allow. + +The option is invalid when SSL/TLS is in use, at which point +the SSL/TLS certificate user name is used to validate and +hence authorize the right user. This expects TLS allow rules +to be set up correctly rather than the default *.
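As an illustrative Python sketch of the decision table described above (this is not the actual gf_auth() C code; `allow` is assumed to map allowed usernames to passwords):

    def gf_auth_sketch(allow, username, password, using_ssl, strict_auth):
        # SSL/TLS names are always matched strictly against the allow list.
        if using_ssl:
            return 'ACCEPT' if username in allow else 'REJECT'
        # Plain username/password: a missing or wrong credential is fatal
        # only when strict-auth-accept is enabled; otherwise other auth
        # modules may still admit the client.
        if username in allow and allow[username] == password:
            return 'ACCEPT'
        return 'REJECT' if strict_auth else 'DONT_CARE'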
+ +credit : Shyam + +Change-Id: Ifcb202b16a451e5ef2ea04a6486276ba163c07e4 +BUG: 1570906 +Signed-off-by: Mohammed Rafi KC +Reviewed-on: https://code.engineering.redhat.com/gerrit/136453 +Reviewed-by: Atin Mukherjee +--- + xlators/mgmt/glusterd/src/glusterd-volgen.c | 3 +- + xlators/protocol/auth/login/src/login.c | 142 ++++++++++++------------- + xlators/protocol/server/src/authenticate.c | 32 ++---- + xlators/protocol/server/src/authenticate.h | 4 +- + xlators/protocol/server/src/server-handshake.c | 2 +- + xlators/protocol/server/src/server.c | 4 +- + xlators/protocol/server/src/server.h | 2 - + 7 files changed, 85 insertions(+), 104 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c +index 02e8393..4198be8 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c +@@ -2226,7 +2226,6 @@ brick_graph_add_server (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + + + if (volname && !strcmp (volname, GLUSTER_SHARED_STORAGE)) { +- + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "strict-auth-accept"); + +@@ -5522,7 +5521,7 @@ generate_client_volfiles (glusterd_volinfo_t *volinfo, + + + if (volname && !strcmp (volname, GLUSTER_SHARED_STORAGE) && +- client_type != GF_CLIENT_TRUSTED) { ++ client_type != GF_CLIENT_TRUSTED) { + /* + * shared storage volume cannot be mounted from non trusted + * nodes. So we are not creating volfiles for non-trusted +diff --git a/xlators/protocol/auth/login/src/login.c b/xlators/protocol/auth/login/src/login.c +index e918d38..4310027 100644 +--- a/xlators/protocol/auth/login/src/login.c ++++ b/xlators/protocol/auth/login/src/login.c +@@ -11,18 +11,78 @@ + #include + #include "authenticate.h" + +-auth_result_t gf_authenticate_user (dict_t *input_params, dict_t *config_params, +- char *username, char *password, +- gf_boolean_t using_ssl) { ++/* Note on strict_auth ++ * - Strict auth kicks in when authentication is using the username, password ++ * in the volfile to login ++ * - If enabled, auth is rejected if the username and password is not matched ++ * or is not present ++ * - When using SSL names, this is automatically strict, and allows only those ++ * names that are present in the allow list, IOW strict auth checking has no ++ * implication when using SSL names ++*/ ++ ++auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) ++{ + auth_result_t result = AUTH_DONT_CARE; + int ret = 0; + data_t *allow_user = NULL; ++ data_t *username_data = NULL; + data_t *passwd_data = NULL; ++ data_t *password_data = NULL; ++ char *username = NULL; ++ char *password = NULL; + char *brick_name = NULL; + char *searchstr = NULL; + char *username_str = NULL; + char *tmp = NULL; + char *username_cpy = NULL; ++ gf_boolean_t using_ssl = _gf_false; ++ gf_boolean_t strict_auth = _gf_false; ++ ++ username_data = dict_get (input_params, "ssl-name"); ++ if (username_data) { ++ gf_log ("auth/login", GF_LOG_INFO, ++ "connecting user name: %s", username_data->data); ++ using_ssl = _gf_true; ++ } else { ++ ret = dict_get_str_boolean (config_params, "strict-auth-accept", ++ _gf_false); ++ if (ret == -1) ++ strict_auth = _gf_false; ++ else ++ strict_auth = ret; ++ ++ username_data = dict_get (input_params, "username"); ++ if (!username_data) { ++ if (strict_auth) { ++ gf_log ("auth/login", GF_LOG_DEBUG, ++ "username not found, strict auth" ++ " configured returning REJECT"); ++ result = AUTH_REJECT; ++ } else { ++ gf_log ("auth/login", GF_LOG_DEBUG, 
++ "username not found, returning" ++ " DONT-CARE"); ++ } ++ goto out; ++ } ++ password_data = dict_get (input_params, "password"); ++ if (!password_data) { ++ if (strict_auth) { ++ gf_log ("auth/login", GF_LOG_DEBUG, ++ "password not found, strict auth" ++ " configured returning REJECT"); ++ result = AUTH_REJECT; ++ } else { ++ gf_log ("auth/login", GF_LOG_WARNING, ++ "password not found, returning" ++ " DONT-CARE"); ++ } ++ goto out; ++ } ++ password = data_to_str (password_data); ++ } ++ username = data_to_str (username_data); + + brick_name = data_to_str (dict_get (input_params, "remote-subvolume")); + if (!brick_name) { +@@ -35,10 +95,10 @@ auth_result_t gf_authenticate_user (dict_t *input_params, dict_t *config_params, + ret = gf_asprintf (&searchstr, "auth.login.%s.%s", brick_name, + using_ssl ? "ssl-allow" : "allow"); + if (-1 == ret) { +- gf_log ("auth/login", GF_LOG_WARNING, ++ gf_log ("auth/login", GF_LOG_ERROR, + "asprintf failed while setting search string, " +- "returning AUTH_STRICT_ACCEPT"); +- result = AUTH_STRICT_ACCEPT; ++ "returning REJECT"); ++ result = AUTH_REJECT; + goto out; + } + +@@ -58,7 +118,7 @@ auth_result_t gf_authenticate_user (dict_t *input_params, dict_t *config_params, + * who do provide a valid username and password (in fact that's + * pretty much the only thing we use non-SSL login auth for), + * but they are allowed to connect. It's wrong, but it's not +- * worth changing elsewhere. Therefore, we do the same thing ++ * worth changing elsewhere. Therefore, we do the sane thing + * only for SSL here. + * + * For SSL, if there's a list *you must be on it*. Note that +@@ -67,19 +127,14 @@ auth_result_t gf_authenticate_user (dict_t *input_params, dict_t *config_params, + * disabled, though authentication and encryption are still + * active. + * +- * If option strict-auth-accept is enabled, connection +- * will be rejected if you don't have any matching +- * username and password (password is only for non-ssl users). ++ * Read NOTE on strict_auth above. 
+ */ +- if (using_ssl) { ++ if (using_ssl || strict_auth) { + result = AUTH_REJECT; + } + username_cpy = gf_strdup (allow_user->data); +- if (!username_cpy) { +- if (!using_ssl) +- result = AUTH_STRICT_ACCEPT; ++ if (!username_cpy) + goto out; +- } + + username_str = strtok_r (username_cpy, " ,", &tmp); + +@@ -101,7 +156,6 @@ auth_result_t gf_authenticate_user (dict_t *input_params, dict_t *config_params, + if (-1 == ret) { + gf_log ("auth/login", GF_LOG_WARNING, + "asprintf failed while setting search string"); +- result = AUTH_STRICT_ACCEPT; + goto out; + } + passwd_data = dict_get (config_params, searchstr); +@@ -127,65 +181,11 @@ auth_result_t gf_authenticate_user (dict_t *input_params, dict_t *config_params, + } + username_str = strtok_r (NULL, " ,", &tmp); + } +- } else { +- result = AUTH_STRICT_ACCEPT; + } ++ + out: + GF_FREE (username_cpy); +- return result; +-} +- +-auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) +-{ +- auth_result_t result = AUTH_DONT_CARE; +- auth_result_t ssl_result = AUTH_DONT_CARE; +- data_t *username_data = NULL; +- data_t *password_data = NULL; +- char *username = NULL; +- char *password = NULL; +- gf_boolean_t using_ssl = _gf_false; + +- username_data = dict_get (input_params, "username"); +- if (!username_data) { +- gf_log ("auth/login", GF_LOG_DEBUG, +- "username not found, returning AUTH_STRICT_ACCEPT"); +- result = AUTH_STRICT_ACCEPT; +- goto out; +- } +- password_data = dict_get (input_params, "password"); +- if (!password_data) { +- gf_log ("auth/login", GF_LOG_WARNING, +- "password not found, returning AUTH_STRICT_ACCEPT"); +- result = AUTH_STRICT_ACCEPT; +- goto out; +- } +- password = data_to_str (password_data); +- username = data_to_str (username_data); +- +- result = gf_authenticate_user (input_params, config_params, +- username, password, using_ssl); +- +- username_data = dict_get (input_params, "ssl-name"); +- if (username_data) { +- gf_log ("auth/login", GF_LOG_INFO, +- "connecting user name: %s", username_data->data); +- username = data_to_str (username_data); +- using_ssl = _gf_true; +- ssl_result = gf_authenticate_user (input_params, config_params, +- username, NULL, using_ssl); +- if (ssl_result == AUTH_ACCEPT && result != AUTH_ACCEPT) { +- /* +- * Here, ssl authentication returns true, but non-ssl +- * authentication returns differnt result. We are +- * calling for a strict auth check in this case. 
+- */ +- result = AUTH_STRICT_ACCEPT; +- } else { +- result = ssl_result; +- } +- } +- +-out: + return result; + } + +diff --git a/xlators/protocol/server/src/authenticate.c b/xlators/protocol/server/src/authenticate.c +index 977ed75..c000776 100644 +--- a/xlators/protocol/server/src/authenticate.c ++++ b/xlators/protocol/server/src/authenticate.c +@@ -18,9 +18,7 @@ + #include + #include + #include "authenticate.h" +-#include "authenticate.h" + #include "server-messages.h" +-#include "server.h" + + static int + init (dict_t *this, char *key, data_t *value, void *data) +@@ -175,7 +173,6 @@ gf_auth_one_method (dict_t *this, char *key, data_t *value, void *data) + { + gf_auth_args_t *args = data; + auth_handle_t *handle = NULL; +- int64_t result; + + if (!value) { + return 0; +@@ -186,13 +183,10 @@ gf_auth_one_method (dict_t *this, char *key, data_t *value, void *data) + return 0; + } + +- result = handle->authenticate (args->iparams, args->cparams); +- switch (result) { ++ switch (handle->authenticate (args->iparams, args->cparams)) { + case AUTH_ACCEPT: +- case AUTH_STRICT_ACCEPT: +- if (args->result != AUTH_REJECT && +- args->result != AUTH_STRICT_ACCEPT) { +- args->result = result; ++ if (args->result != AUTH_REJECT) { ++ args->result = AUTH_ACCEPT; + } + /* FALLTHROUGH */ + default: +@@ -204,8 +198,9 @@ gf_auth_one_method (dict_t *this, char *key, data_t *value, void *data) + } + + auth_result_t +-gf_authenticate (server_conf_t *conf, dict_t *input_params, +- dict_t *config_params, dict_t *auth_modules) ++gf_authenticate (dict_t *input_params, ++ dict_t *config_params, ++ dict_t *auth_modules) + { + char *name = NULL; + data_t *peerinfo_data = NULL; +@@ -215,19 +210,9 @@ gf_authenticate (server_conf_t *conf, dict_t *input_params, + args.cparams = config_params; + args.result = AUTH_DONT_CARE; + +- GF_VALIDATE_OR_GOTO ("authentication", conf, out); + dict_foreach (auth_modules, gf_auth_one_method, &args); + +- switch (args.result) { +- case AUTH_STRICT_ACCEPT: +- if (!conf->strict_auth_enabled) { +- args.result = AUTH_ACCEPT; +- break; +- } +- gf_msg ("auth", GF_LOG_ERROR, 0, PS_MSG_REMOTE_CLIENT_REFUSED, +- "Authentication is failed due to the strict options."); +- /* Fallthrough */ +- case AUTH_DONT_CARE: ++ if (AUTH_DONT_CARE == args.result) { + peerinfo_data = dict_get (input_params, "peer-info-name"); + + if (peerinfo_data) { +@@ -238,9 +223,8 @@ gf_authenticate (server_conf_t *conf, dict_t *input_params, + "no authentication module is interested in " + "accepting remote-client %s", name); + args.result = AUTH_REJECT; +- /* Fallthrough */ + } +-out: ++ + return args.result; + } + +diff --git a/xlators/protocol/server/src/authenticate.h b/xlators/protocol/server/src/authenticate.h +index d5971d3..5f92183 100644 +--- a/xlators/protocol/server/src/authenticate.h ++++ b/xlators/protocol/server/src/authenticate.h +@@ -25,8 +25,7 @@ + typedef enum { + AUTH_ACCEPT, + AUTH_REJECT, +- AUTH_DONT_CARE, +- AUTH_STRICT_ACCEPT ++ AUTH_DONT_CARE + } auth_result_t; + + typedef auth_result_t (*auth_fn_t) (dict_t *input_params, +@@ -40,5 +39,6 @@ typedef struct { + + int32_t gf_auth_init (xlator_t *xl, dict_t *auth_modules); + void gf_auth_fini (dict_t *auth_modules); ++auth_result_t gf_authenticate (dict_t *, dict_t *, dict_t *); + + #endif /* _AUTHENTICATE_H */ +diff --git a/xlators/protocol/server/src/server-handshake.c b/xlators/protocol/server/src/server-handshake.c +index b67ed47..f6d5a0b 100644 +--- a/xlators/protocol/server/src/server-handshake.c ++++ 
b/xlators/protocol/server/src/server-handshake.c +@@ -765,7 +765,7 @@ server_setvolume (rpcsvc_request_t *req) + PS_MSG_CLIENT_VERSION_NOT_SET, + "client-version not set, may be of older version"); + +- ret = gf_authenticate (conf, params, config_params, ++ ret = gf_authenticate (params, config_params, + conf->auth_modules); + + if (ret == AUTH_ACCEPT) { +diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c +index f89e743..7dd2a5a 100644 +--- a/xlators/protocol/server/src/server.c ++++ b/xlators/protocol/server/src/server.c +@@ -910,7 +910,7 @@ do_rpc: + if (strcmp (xprt_path, auth_path) != 0) { + continue; + } +- ret = gf_authenticate (conf, xprt->clnt_options, ++ ret = gf_authenticate (xprt->clnt_options, + options, + conf->auth_modules); + if (ret == AUTH_ACCEPT) { +@@ -1104,7 +1104,7 @@ init (xlator_t *this) + } + + ret = dict_get_str_boolean (this->options, "strict-auth-accept", +- _gf_false); ++ _gf_false); + if (ret == -1) + conf->strict_auth_enabled = _gf_false; + else +diff --git a/xlators/protocol/server/src/server.h b/xlators/protocol/server/src/server.h +index 4c35aaa..6af4657 100644 +--- a/xlators/protocol/server/src/server.h ++++ b/xlators/protocol/server/src/server.h +@@ -245,8 +245,6 @@ serialize_rsp_dirent (gf_dirent_t *entries, gfs3_readdir_rsp *rsp); + + int + serialize_rsp_direntp (gf_dirent_t *entries, gfs3_readdirp_rsp *rsp); +-auth_result_t gf_authenticate (server_conf_t *conf, dict_t *input_params, +- dict_t *config_params, dict_t *auth_modules); + + server_ctx_t* + server_ctx_get (client_t *client, xlator_t *xlator); +-- +1.8.3.1 + diff --git a/SOURCES/0657-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch b/SOURCES/0657-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch new file mode 100644 index 0000000..0081784 --- /dev/null +++ b/SOURCES/0657-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch @@ -0,0 +1,264 @@ +From f30ea46871ea2b9a9923c13c9e9e85f2db813eb8 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Tue, 24 Apr 2018 20:12:02 +0530 +Subject: [PATCH 657/657] Revert "geo-rep: Remove lazy umount and use mount + namespaces" + +This reverts commit 1747de037b7fa7513d5a3755d273002076165784 to make the +RHEL6 build + +BUG: 1556680 +Change-Id: I6d7ddbaff0191782f78c7192dd175ae48b00bbf8 +Signed-off-by: Kotresh HR +Reviewed-on: https://code.engineering.redhat.com/gerrit/136717 +Reviewed-by: Atin Mukherjee +Tested-by: RHGS Build Bot +--- + geo-replication/syncdaemon/gconf.py | 3 --- + geo-replication/syncdaemon/gsyncd.py | 14 +++++------- + geo-replication/syncdaemon/monitor.py | 38 ++++++++++++-------------------- + geo-replication/syncdaemon/resource.py | 16 ++------------ + geo-replication/syncdaemon/syncdutils.py | 18 +-------------- + glusterfs.spec.in | 4 ---- + 6 files changed, 23 insertions(+), 70 deletions(-) + +diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py +index 2280f44..97395b4 100644 +--- a/geo-replication/syncdaemon/gconf.py ++++ b/geo-replication/syncdaemon/gconf.py +@@ -28,8 +28,5 @@ class GConf(object): + active_earlier = False + passive_earlier = False + mgmt_lock_fd = None +- mountbroker = False +- mount_point = None +- mbr_umount_cmd = [] + + gconf = GConf() +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index 96256cf..f9471e4 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -269,8 +269,6 @@ def main_i(): + type=str, action='callback', 
callback=store_abs) + op.add_option('--georep-session-working-dir', metavar='STATF', + type=str, action='callback', callback=store_abs) +- op.add_option('--access-mount', default=False, action='store_true') +- op.add_option('--slave-access-mount', default=False, action='store_true') + op.add_option('--ignore-deletes', default=False, action='store_true') + op.add_option('--isolated-slave', default=False, action='store_true') + op.add_option('--use-rsync-xattrs', default=False, action='store_true') +@@ -416,7 +414,7 @@ def main_i(): + o.get_opt_string() not in ('--version', '--help'))] + remote_tunables = ['listen', 'go_daemon', 'timeout', + 'session_owner', 'config_file', 'use_rsync_xattrs', +- 'local_id', 'local_node', 'slave_access_mount'] ++ 'local_id', 'local_node'] + rq_remote_tunables = {'listen': True} + + # precedence for sources of values: 1) commandline, 2) cfg file, 3) +@@ -750,15 +748,15 @@ def main_i(): + else: + log_file = gconf.log_file + if be_monitor: +- gconf.label = 'monitor' ++ label = 'monitor' + elif be_agent: +- gconf.label = gconf.local_path ++ label = gconf.local_path + elif remote: + # master +- gconf.label = gconf.local_path ++ label = gconf.local_path + else: +- gconf.label = 'slave' +- startup(go_daemon=go_daemon, log_file=log_file, label=gconf.label) ++ label = 'slave' ++ startup(go_daemon=go_daemon, log_file=log_file, label=label) + resource.Popen.init_errhandler() + + if be_agent: +diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py +index 087a202..dc0211e 100644 +--- a/geo-replication/syncdaemon/monitor.py ++++ b/geo-replication/syncdaemon/monitor.py +@@ -24,7 +24,7 @@ import random + from gconf import gconf + from syncdutils import select, waitpid, errno_wrap + from syncdutils import set_term_handler, is_host_local, GsyncdError +-from syncdutils import escape, Thread, finalize, memoize, boolify ++from syncdutils import escape, Thread, finalize, memoize + from syncdutils import gf_event, EVENT_GEOREP_FAULTY + + from gsyncdstatus import GeorepStatus, set_monitor_status +@@ -301,29 +301,19 @@ class Monitor(object): + os.close(pr) + os.close(ra) + os.close(wa) +- args_to_worker = argv + ['--feedback-fd', str(pw), +- '--local-path', w[0]['dir'], +- '--local-node', w[0]['host'], +- '--local-node-id', +- w[0]['uuid'], +- '--local-id', +- '.' + escape(w[0]['dir']), +- '--rpc-fd', +- ','.join([str(rw), str(ww), +- str(ra), str(wa)]), +- '--subvol-num', str(w[2])] +- +- if w[3]: +- args_to_worker.append('--is-hottier') +- args_to_worker += ['--resource-remote', remote_host] +- +- access_mount = boolify(gconf.access_mount) +- if access_mount: +- os.execv(sys.executable, args_to_worker) +- else: +- unshare_cmd = ['unshare', '-m', '--propagation', 'private'] +- cmd = unshare_cmd + args_to_worker +- os.execvp("unshare", cmd) ++ os.execv(sys.executable, argv + ['--feedback-fd', str(pw), ++ '--local-path', w[0]['dir'], ++ '--local-node', w[0]['host'], ++ '--local-node-id', ++ w[0]['uuid'], ++ '--local-id', ++ '.' 
+ escape(w[0]['dir']), ++ '--rpc-fd', ++ ','.join([str(rw), str(ww), ++ str(ra), str(wa)]), ++ '--subvol-num', str(w[2])] + ++ (['--is-hottier'] if w[3] else []) + ++ ['--resource-remote', remote_host]) + + cpids.add(cpid) + agents.add(apid) +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 39d537b..943e3ec 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -989,8 +989,6 @@ class SlaveRemote(object): + extra_opts += ['--local-node', ln] + if boolify(gconf.use_rsync_xattrs): + extra_opts.append('--use-rsync-xattrs') +- if boolify(gconf.slave_access_mount): +- extra_opts.append('--slave-access-mount') + po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + + ['-N', '--listen', '--timeout', str(gconf.timeout), + slave], +@@ -1260,7 +1258,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + def __init__(self, params): + self.params = params + self.mntpt = None +- self.umount_cmd = [] + + @classmethod + def get_glusterprog(cls): +@@ -1351,16 +1348,13 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + assert(mntdata[-1] == '\0') + mntpt = mntdata[:-1] + assert(mntpt) +- if mounted and gconf.label == 'slave' \ +- and not boolify(gconf.slave_access_mount): ++ if mounted: + po = self.umount_l(mntpt) + po.terminate_geterr(fail_on_err=False) + if po.returncode != 0: + po.errlog() + rv = po.returncode +- if gconf.label == 'slave' \ +- and not boolify(gconf.slave_access_mount): +- self.cleanup_mntpt(mntpt) ++ self.cleanup_mntpt(mntpt) + except: + logging.exception('mount cleanup failure:') + rv = 200 +@@ -1380,7 +1374,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def make_mount_argv(self): + self.mntpt = tempfile.mkdtemp(prefix='gsyncd-aux-mount-') +- gconf.mount_point = self.mntpt + return [self.get_glusterprog()] + \ + ['--' + p for p in self.params] + [self.mntpt] + +@@ -1412,11 +1405,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def handle_mounter(self, po): + self.mntpt = po.stdout.readline()[:-1] +- gconf.mount_point = self.mntpt +- gconf.mountbroker = True +- self.umount_cmd = self.make_cli_argv() + ['umount'] +- gconf.mbr_umount_cmd = self.umount_cmd +- + po.stdout.close() + sup(self, po) + if po.returncode != 0: +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index 8dc6c96..a22289e 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -16,7 +16,6 @@ import fcntl + import shutil + import logging + import socket +-import subprocess + from threading import Lock, Thread as baseThread + from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED + from errno import EINTR, ENOENT, EPERM, ESTALE, EBUSY, errorcode +@@ -189,6 +188,7 @@ def grabpidfile(fname=None, setpid=True): + + final_lock = Lock() + ++ + def finalize(*a, **kw): + """all those messy final steps we go trough upon termination + +@@ -233,22 +233,6 @@ def finalize(*a, **kw): + if sys.exc_info()[0] == OSError: + pass + +- """ Unmount if not done """ +- if gconf.mount_point: +- if gconf.mountbroker: +- umount_cmd = gconf.mbr_umount_cmd + [gconf.mount_point, 'lazy'] +- else: +- umount_cmd = ['umount', '-l', gconf.mount_point] +- p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE) +- _, errdata = p0.communicate() +- if p0.returncode == 0: +- try: +- os.rmdir(gconf.mount_point) +- except OSError: +- pass +- else: +- pass +- + if gconf.log_exit: + logging.info("exiting.") + 
sys.stdout.flush() +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index d1aa3ea..fc9125b 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -439,7 +439,6 @@ Requires: %{name}%{?_isa} = %{version}-%{release} + Requires: %{name}-server%{?_isa} = %{version}-%{release} + Requires: python python-ctypes + Requires: rsync +-Requires: util-linux + + %description geo-replication + GlusterFS is a distributed file-system capable of scaling to several +@@ -2068,9 +2067,6 @@ fi + %endif + + %changelog +-* Thu Mar 22 2018 Kotresh HR +-- Added util-linux as dependency to georeplication rpm (#1544382) +- + * Wed Dec 20 2017 Milind Changire + - Remove ExclusiveArch directive to help building on non-x86_64 arches (#1527772) + +-- +1.8.3.1 + diff --git a/SOURCES/0658-Revert-Revert-geo-rep-Remove-lazy-umount-and-use-mou.patch b/SOURCES/0658-Revert-Revert-geo-rep-Remove-lazy-umount-and-use-mou.patch new file mode 100644 index 0000000..5696170 --- /dev/null +++ b/SOURCES/0658-Revert-Revert-geo-rep-Remove-lazy-umount-and-use-mou.patch @@ -0,0 +1,263 @@ +From 041578506f3c61870b1214c8cf3618eae844a0b3 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Wed, 23 May 2018 09:05:15 +0530 +Subject: [PATCH 658/659] Revert "Revert "geo-rep: Remove lazy umount and use + mount namespaces"" + +This reverts commit f30ea46871ea2b9a9923c13c9e9e85f2db813eb8. + +BUG: 1556680 +Change-Id: I8978f4eedc6d3f248e6304b4a4ca133369b70a05 +Signed-off-by: Kotresh HR +Reviewed-on: https://code.engineering.redhat.com/gerrit/139572 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + geo-replication/syncdaemon/gconf.py | 3 +++ + geo-replication/syncdaemon/gsyncd.py | 14 +++++++----- + geo-replication/syncdaemon/monitor.py | 38 ++++++++++++++++++++------------ + geo-replication/syncdaemon/resource.py | 16 ++++++++++++-- + geo-replication/syncdaemon/syncdutils.py | 18 ++++++++++++++- + glusterfs.spec.in | 4 ++++ + 6 files changed, 70 insertions(+), 23 deletions(-) + +diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py +index 97395b4..2280f44 100644 +--- a/geo-replication/syncdaemon/gconf.py ++++ b/geo-replication/syncdaemon/gconf.py +@@ -28,5 +28,8 @@ class GConf(object): + active_earlier = False + passive_earlier = False + mgmt_lock_fd = None ++ mountbroker = False ++ mount_point = None ++ mbr_umount_cmd = [] + + gconf = GConf() +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index f9471e4..96256cf 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -269,6 +269,8 @@ def main_i(): + type=str, action='callback', callback=store_abs) + op.add_option('--georep-session-working-dir', metavar='STATF', + type=str, action='callback', callback=store_abs) ++ op.add_option('--access-mount', default=False, action='store_true') ++ op.add_option('--slave-access-mount', default=False, action='store_true') + op.add_option('--ignore-deletes', default=False, action='store_true') + op.add_option('--isolated-slave', default=False, action='store_true') + op.add_option('--use-rsync-xattrs', default=False, action='store_true') +@@ -414,7 +416,7 @@ def main_i(): + o.get_opt_string() not in ('--version', '--help'))] + remote_tunables = ['listen', 'go_daemon', 'timeout', + 'session_owner', 'config_file', 'use_rsync_xattrs', +- 'local_id', 'local_node'] ++ 'local_id', 'local_node', 'slave_access_mount'] + rq_remote_tunables = {'listen': True} + + # precedence for sources of values: 1) commandline, 2) cfg file, 3) +@@ 
-748,15 +750,15 @@ def main_i(): + else: + log_file = gconf.log_file + if be_monitor: +- label = 'monitor' ++ gconf.label = 'monitor' + elif be_agent: +- label = gconf.local_path ++ gconf.label = gconf.local_path + elif remote: + # master +- label = gconf.local_path ++ gconf.label = gconf.local_path + else: +- label = 'slave' +- startup(go_daemon=go_daemon, log_file=log_file, label=label) ++ gconf.label = 'slave' ++ startup(go_daemon=go_daemon, log_file=log_file, label=gconf.label) + resource.Popen.init_errhandler() + + if be_agent: +diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py +index dc0211e..087a202 100644 +--- a/geo-replication/syncdaemon/monitor.py ++++ b/geo-replication/syncdaemon/monitor.py +@@ -24,7 +24,7 @@ import random + from gconf import gconf + from syncdutils import select, waitpid, errno_wrap + from syncdutils import set_term_handler, is_host_local, GsyncdError +-from syncdutils import escape, Thread, finalize, memoize ++from syncdutils import escape, Thread, finalize, memoize, boolify + from syncdutils import gf_event, EVENT_GEOREP_FAULTY + + from gsyncdstatus import GeorepStatus, set_monitor_status +@@ -301,19 +301,29 @@ class Monitor(object): + os.close(pr) + os.close(ra) + os.close(wa) +- os.execv(sys.executable, argv + ['--feedback-fd', str(pw), +- '--local-path', w[0]['dir'], +- '--local-node', w[0]['host'], +- '--local-node-id', +- w[0]['uuid'], +- '--local-id', +- '.' + escape(w[0]['dir']), +- '--rpc-fd', +- ','.join([str(rw), str(ww), +- str(ra), str(wa)]), +- '--subvol-num', str(w[2])] + +- (['--is-hottier'] if w[3] else []) + +- ['--resource-remote', remote_host]) ++ args_to_worker = argv + ['--feedback-fd', str(pw), ++ '--local-path', w[0]['dir'], ++ '--local-node', w[0]['host'], ++ '--local-node-id', ++ w[0]['uuid'], ++ '--local-id', ++ '.' 
+ escape(w[0]['dir']), ++ '--rpc-fd', ++ ','.join([str(rw), str(ww), ++ str(ra), str(wa)]), ++ '--subvol-num', str(w[2])] ++ ++ if w[3]: ++ args_to_worker.append('--is-hottier') ++ args_to_worker += ['--resource-remote', remote_host] ++ ++ access_mount = boolify(gconf.access_mount) ++ if access_mount: ++ os.execv(sys.executable, args_to_worker) ++ else: ++ unshare_cmd = ['unshare', '-m', '--propagation', 'private'] ++ cmd = unshare_cmd + args_to_worker ++ os.execvp("unshare", cmd) + + cpids.add(cpid) + agents.add(apid) +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 943e3ec..39d537b 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -989,6 +989,8 @@ class SlaveRemote(object): + extra_opts += ['--local-node', ln] + if boolify(gconf.use_rsync_xattrs): + extra_opts.append('--use-rsync-xattrs') ++ if boolify(gconf.slave_access_mount): ++ extra_opts.append('--slave-access-mount') + po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + + ['-N', '--listen', '--timeout', str(gconf.timeout), + slave], +@@ -1258,6 +1260,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + def __init__(self, params): + self.params = params + self.mntpt = None ++ self.umount_cmd = [] + + @classmethod + def get_glusterprog(cls): +@@ -1348,13 +1351,16 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + assert(mntdata[-1] == '\0') + mntpt = mntdata[:-1] + assert(mntpt) +- if mounted: ++ if mounted and gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): + po = self.umount_l(mntpt) + po.terminate_geterr(fail_on_err=False) + if po.returncode != 0: + po.errlog() + rv = po.returncode +- self.cleanup_mntpt(mntpt) ++ if gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): ++ self.cleanup_mntpt(mntpt) + except: + logging.exception('mount cleanup failure:') + rv = 200 +@@ -1374,6 +1380,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def make_mount_argv(self): + self.mntpt = tempfile.mkdtemp(prefix='gsyncd-aux-mount-') ++ gconf.mount_point = self.mntpt + return [self.get_glusterprog()] + \ + ['--' + p for p in self.params] + [self.mntpt] + +@@ -1405,6 +1412,11 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def handle_mounter(self, po): + self.mntpt = po.stdout.readline()[:-1] ++ gconf.mount_point = self.mntpt ++ gconf.mountbroker = True ++ self.umount_cmd = self.make_cli_argv() + ['umount'] ++ gconf.mbr_umount_cmd = self.umount_cmd ++ + po.stdout.close() + sup(self, po) + if po.returncode != 0: +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index a22289e..8dc6c96 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -16,6 +16,7 @@ import fcntl + import shutil + import logging + import socket ++import subprocess + from threading import Lock, Thread as baseThread + from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED + from errno import EINTR, ENOENT, EPERM, ESTALE, EBUSY, errorcode +@@ -188,7 +189,6 @@ def grabpidfile(fname=None, setpid=True): + + final_lock = Lock() + +- + def finalize(*a, **kw): + """all those messy final steps we go trough upon termination + +@@ -233,6 +233,22 @@ def finalize(*a, **kw): + if sys.exc_info()[0] == OSError: + pass + ++ """ Unmount if not done """ ++ if gconf.mount_point: ++ if gconf.mountbroker: ++ umount_cmd = gconf.mbr_umount_cmd + [gconf.mount_point, 'lazy'] ++ else: ++ umount_cmd = ['umount', 
'-l', gconf.mount_point] ++ p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE) ++ _, errdata = p0.communicate() ++ if p0.returncode == 0: ++ try: ++ os.rmdir(gconf.mount_point) ++ except OSError: ++ pass ++ else: ++ pass ++ + if gconf.log_exit: + logging.info("exiting.") + sys.stdout.flush() +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index fc9125b..d1aa3ea 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -439,6 +439,7 @@ Requires: %{name}%{?_isa} = %{version}-%{release} + Requires: %{name}-server%{?_isa} = %{version}-%{release} + Requires: python python-ctypes + Requires: rsync ++Requires: util-linux + + %description geo-replication + GlusterFS is a distributed file-system capable of scaling to several +@@ -2067,6 +2068,9 @@ fi + %endif + + %changelog ++* Thu Mar 22 2018 Kotresh HR ++- Added util-linux as dependency to georeplication rpm (#1544382) ++ + * Wed Dec 20 2017 Milind Changire + - Remove ExclusiveArch directive to help building on non-x86_64 arches (#1527772) + +-- +1.8.3.1 + diff --git a/SOURCES/0659-gluster-Allow-only-read-only-CLI-commands-via-remote.patch b/SOURCES/0659-gluster-Allow-only-read-only-CLI-commands-via-remote.patch new file mode 100644 index 0000000..316e6c5 --- /dev/null +++ b/SOURCES/0659-gluster-Allow-only-read-only-CLI-commands-via-remote.patch @@ -0,0 +1,41 @@ +From 89879f5303a35fa725d4d0f08f2e29be6cd75da7 Mon Sep 17 00:00:00 2001 +From: moagrawa +Date: Mon, 28 May 2018 19:20:54 +0530 +Subject: [PATCH 659/659] gluster: Allow only read-only CLI commands via + remote-host + +Problem: Current CLI code allows to run all commands via remote-host + while SSL is enabled even node is not added in trusted storage + pool + +Solution: Change condition in init function in glusterd.c to allow only read-only + CLI commands via remote-host while SSL is enabled. + +BUG: 1582128 +Change-Id: I1ef653efe3ea7fb9a1677cd80e09e0ea97b0177c +Signed-off-by: moagrawa +Reviewed-on: https://code.engineering.redhat.com/gerrit/140053 +Reviewed-by: Atin Mukherjee +--- + xlators/mgmt/glusterd/src/glusterd.c | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c +index 71261af..12ed558 100644 +--- a/xlators/mgmt/glusterd/src/glusterd.c ++++ b/xlators/mgmt/glusterd/src/glusterd.c +@@ -1750,11 +1750,6 @@ init (xlator_t *this) + goto out; + } + /* +- * With strong authentication, we can afford to allow +- * privileged operations over TCP. +- */ +- gd_inet_programs[1] = &gd_svc_cli_prog; +- /* + * This is the only place where we want secure_srvr to reflect + * the management-plane setting. 
+ */ +-- +1.8.3.1 + diff --git a/SOURCES/0660-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch b/SOURCES/0660-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch new file mode 100644 index 0000000..59fefe5 --- /dev/null +++ b/SOURCES/0660-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch @@ -0,0 +1,264 @@ +From 367d7f814114df2be19b743409d998d73aa893c0 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Wed, 30 May 2018 11:44:24 +0530 +Subject: [PATCH 660/660] Revert "geo-rep: Remove lazy umount and use mount + namespaces" + +This reverts commit 1747de037b7fa7513d5a3755d273002076165784 to make the +RHEL6 build + +BUG: 1556680 +Change-Id: I9d8263ea2fb5c7226539fccd028f89146d4b42a5 +Signed-off-by: Kotresh HR +Reviewed-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/140201 +Tested-by: RHGS Build Bot +--- + geo-replication/syncdaemon/gconf.py | 3 --- + geo-replication/syncdaemon/gsyncd.py | 14 +++++------- + geo-replication/syncdaemon/monitor.py | 38 ++++++++++++-------------------- + geo-replication/syncdaemon/resource.py | 16 ++------------ + geo-replication/syncdaemon/syncdutils.py | 18 +-------------- + glusterfs.spec.in | 4 ---- + 6 files changed, 23 insertions(+), 70 deletions(-) + +diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py +index 2280f44..97395b4 100644 +--- a/geo-replication/syncdaemon/gconf.py ++++ b/geo-replication/syncdaemon/gconf.py +@@ -28,8 +28,5 @@ class GConf(object): + active_earlier = False + passive_earlier = False + mgmt_lock_fd = None +- mountbroker = False +- mount_point = None +- mbr_umount_cmd = [] + + gconf = GConf() +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index 96256cf..f9471e4 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -269,8 +269,6 @@ def main_i(): + type=str, action='callback', callback=store_abs) + op.add_option('--georep-session-working-dir', metavar='STATF', + type=str, action='callback', callback=store_abs) +- op.add_option('--access-mount', default=False, action='store_true') +- op.add_option('--slave-access-mount', default=False, action='store_true') + op.add_option('--ignore-deletes', default=False, action='store_true') + op.add_option('--isolated-slave', default=False, action='store_true') + op.add_option('--use-rsync-xattrs', default=False, action='store_true') +@@ -416,7 +414,7 @@ def main_i(): + o.get_opt_string() not in ('--version', '--help'))] + remote_tunables = ['listen', 'go_daemon', 'timeout', + 'session_owner', 'config_file', 'use_rsync_xattrs', +- 'local_id', 'local_node', 'slave_access_mount'] ++ 'local_id', 'local_node'] + rq_remote_tunables = {'listen': True} + + # precedence for sources of values: 1) commandline, 2) cfg file, 3) +@@ -750,15 +748,15 @@ def main_i(): + else: + log_file = gconf.log_file + if be_monitor: +- gconf.label = 'monitor' ++ label = 'monitor' + elif be_agent: +- gconf.label = gconf.local_path ++ label = gconf.local_path + elif remote: + # master +- gconf.label = gconf.local_path ++ label = gconf.local_path + else: +- gconf.label = 'slave' +- startup(go_daemon=go_daemon, log_file=log_file, label=gconf.label) ++ label = 'slave' ++ startup(go_daemon=go_daemon, log_file=log_file, label=label) + resource.Popen.init_errhandler() + + if be_agent: +diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py +index 087a202..dc0211e 100644 +--- a/geo-replication/syncdaemon/monitor.py ++++ 
b/geo-replication/syncdaemon/monitor.py +@@ -24,7 +24,7 @@ import random + from gconf import gconf + from syncdutils import select, waitpid, errno_wrap + from syncdutils import set_term_handler, is_host_local, GsyncdError +-from syncdutils import escape, Thread, finalize, memoize, boolify ++from syncdutils import escape, Thread, finalize, memoize + from syncdutils import gf_event, EVENT_GEOREP_FAULTY + + from gsyncdstatus import GeorepStatus, set_monitor_status +@@ -301,29 +301,19 @@ class Monitor(object): + os.close(pr) + os.close(ra) + os.close(wa) +- args_to_worker = argv + ['--feedback-fd', str(pw), +- '--local-path', w[0]['dir'], +- '--local-node', w[0]['host'], +- '--local-node-id', +- w[0]['uuid'], +- '--local-id', +- '.' + escape(w[0]['dir']), +- '--rpc-fd', +- ','.join([str(rw), str(ww), +- str(ra), str(wa)]), +- '--subvol-num', str(w[2])] +- +- if w[3]: +- args_to_worker.append('--is-hottier') +- args_to_worker += ['--resource-remote', remote_host] +- +- access_mount = boolify(gconf.access_mount) +- if access_mount: +- os.execv(sys.executable, args_to_worker) +- else: +- unshare_cmd = ['unshare', '-m', '--propagation', 'private'] +- cmd = unshare_cmd + args_to_worker +- os.execvp("unshare", cmd) ++ os.execv(sys.executable, argv + ['--feedback-fd', str(pw), ++ '--local-path', w[0]['dir'], ++ '--local-node', w[0]['host'], ++ '--local-node-id', ++ w[0]['uuid'], ++ '--local-id', ++ '.' + escape(w[0]['dir']), ++ '--rpc-fd', ++ ','.join([str(rw), str(ww), ++ str(ra), str(wa)]), ++ '--subvol-num', str(w[2])] + ++ (['--is-hottier'] if w[3] else []) + ++ ['--resource-remote', remote_host]) + + cpids.add(cpid) + agents.add(apid) +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 39d537b..943e3ec 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -989,8 +989,6 @@ class SlaveRemote(object): + extra_opts += ['--local-node', ln] + if boolify(gconf.use_rsync_xattrs): + extra_opts.append('--use-rsync-xattrs') +- if boolify(gconf.slave_access_mount): +- extra_opts.append('--slave-access-mount') + po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + + ['-N', '--listen', '--timeout', str(gconf.timeout), + slave], +@@ -1260,7 +1258,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + def __init__(self, params): + self.params = params + self.mntpt = None +- self.umount_cmd = [] + + @classmethod + def get_glusterprog(cls): +@@ -1351,16 +1348,13 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + assert(mntdata[-1] == '\0') + mntpt = mntdata[:-1] + assert(mntpt) +- if mounted and gconf.label == 'slave' \ +- and not boolify(gconf.slave_access_mount): ++ if mounted: + po = self.umount_l(mntpt) + po.terminate_geterr(fail_on_err=False) + if po.returncode != 0: + po.errlog() + rv = po.returncode +- if gconf.label == 'slave' \ +- and not boolify(gconf.slave_access_mount): +- self.cleanup_mntpt(mntpt) ++ self.cleanup_mntpt(mntpt) + except: + logging.exception('mount cleanup failure:') + rv = 200 +@@ -1380,7 +1374,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def make_mount_argv(self): + self.mntpt = tempfile.mkdtemp(prefix='gsyncd-aux-mount-') +- gconf.mount_point = self.mntpt + return [self.get_glusterprog()] + \ + ['--' + p for p in self.params] + [self.mntpt] + +@@ -1412,11 +1405,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def handle_mounter(self, po): + self.mntpt = po.stdout.readline()[:-1] +- gconf.mount_point = self.mntpt +- gconf.mountbroker = 
True +- self.umount_cmd = self.make_cli_argv() + ['umount'] +- gconf.mbr_umount_cmd = self.umount_cmd +- + po.stdout.close() + sup(self, po) + if po.returncode != 0: +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index 8dc6c96..a22289e 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -16,7 +16,6 @@ import fcntl + import shutil + import logging + import socket +-import subprocess + from threading import Lock, Thread as baseThread + from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED + from errno import EINTR, ENOENT, EPERM, ESTALE, EBUSY, errorcode +@@ -189,6 +188,7 @@ def grabpidfile(fname=None, setpid=True): + + final_lock = Lock() + ++ + def finalize(*a, **kw): + """all those messy final steps we go trough upon termination + +@@ -233,22 +233,6 @@ def finalize(*a, **kw): + if sys.exc_info()[0] == OSError: + pass + +- """ Unmount if not done """ +- if gconf.mount_point: +- if gconf.mountbroker: +- umount_cmd = gconf.mbr_umount_cmd + [gconf.mount_point, 'lazy'] +- else: +- umount_cmd = ['umount', '-l', gconf.mount_point] +- p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE) +- _, errdata = p0.communicate() +- if p0.returncode == 0: +- try: +- os.rmdir(gconf.mount_point) +- except OSError: +- pass +- else: +- pass +- + if gconf.log_exit: + logging.info("exiting.") + sys.stdout.flush() +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index d1aa3ea..fc9125b 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -439,7 +439,6 @@ Requires: %{name}%{?_isa} = %{version}-%{release} + Requires: %{name}-server%{?_isa} = %{version}-%{release} + Requires: python python-ctypes + Requires: rsync +-Requires: util-linux + + %description geo-replication + GlusterFS is a distributed file-system capable of scaling to several +@@ -2068,9 +2067,6 @@ fi + %endif + + %changelog +-* Thu Mar 22 2018 Kotresh HR +-- Added util-linux as dependency to georeplication rpm (#1544382) +- + * Wed Dec 20 2017 Milind Changire + - Remove ExclusiveArch directive to help building on non-x86_64 arches (#1527772) + +-- +1.8.3.1 + diff --git a/SOURCES/0661-Revert-Revert-geo-rep-Remove-lazy-umount-and-use-mou.patch b/SOURCES/0661-Revert-Revert-geo-rep-Remove-lazy-umount-and-use-mou.patch new file mode 100644 index 0000000..49d4197 --- /dev/null +++ b/SOURCES/0661-Revert-Revert-geo-rep-Remove-lazy-umount-and-use-mou.patch @@ -0,0 +1,263 @@ +From 370ed7bc3ee003bbc0cc88af7e470e170370e255 Mon Sep 17 00:00:00 2001 +From: Milind Changire +Date: Wed, 30 May 2018 14:49:17 +0530 +Subject: [PATCH 661/675] Revert "Revert "geo-rep: Remove lazy umount and use + mount namespaces"" + +This reverts commit 367d7f814114df2be19b743409d998d73aa893c0. 
+ +BUG: 1556680 +Change-Id: I2cb350b399576836a9ad3e046c794ae014167c7a +Signed-off-by: Milind Changire +Reviewed-on: https://code.engineering.redhat.com/gerrit/140230 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + geo-replication/syncdaemon/gconf.py | 3 +++ + geo-replication/syncdaemon/gsyncd.py | 14 +++++++----- + geo-replication/syncdaemon/monitor.py | 38 ++++++++++++++++++++------------ + geo-replication/syncdaemon/resource.py | 16 ++++++++++++-- + geo-replication/syncdaemon/syncdutils.py | 18 ++++++++++++++- + glusterfs.spec.in | 4 ++++ + 6 files changed, 70 insertions(+), 23 deletions(-) + +diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py +index 97395b4..2280f44 100644 +--- a/geo-replication/syncdaemon/gconf.py ++++ b/geo-replication/syncdaemon/gconf.py +@@ -28,5 +28,8 @@ class GConf(object): + active_earlier = False + passive_earlier = False + mgmt_lock_fd = None ++ mountbroker = False ++ mount_point = None ++ mbr_umount_cmd = [] + + gconf = GConf() +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index f9471e4..96256cf 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -269,6 +269,8 @@ def main_i(): + type=str, action='callback', callback=store_abs) + op.add_option('--georep-session-working-dir', metavar='STATF', + type=str, action='callback', callback=store_abs) ++ op.add_option('--access-mount', default=False, action='store_true') ++ op.add_option('--slave-access-mount', default=False, action='store_true') + op.add_option('--ignore-deletes', default=False, action='store_true') + op.add_option('--isolated-slave', default=False, action='store_true') + op.add_option('--use-rsync-xattrs', default=False, action='store_true') +@@ -414,7 +416,7 @@ def main_i(): + o.get_opt_string() not in ('--version', '--help'))] + remote_tunables = ['listen', 'go_daemon', 'timeout', + 'session_owner', 'config_file', 'use_rsync_xattrs', +- 'local_id', 'local_node'] ++ 'local_id', 'local_node', 'slave_access_mount'] + rq_remote_tunables = {'listen': True} + + # precedence for sources of values: 1) commandline, 2) cfg file, 3) +@@ -748,15 +750,15 @@ def main_i(): + else: + log_file = gconf.log_file + if be_monitor: +- label = 'monitor' ++ gconf.label = 'monitor' + elif be_agent: +- label = gconf.local_path ++ gconf.label = gconf.local_path + elif remote: + # master +- label = gconf.local_path ++ gconf.label = gconf.local_path + else: +- label = 'slave' +- startup(go_daemon=go_daemon, log_file=log_file, label=label) ++ gconf.label = 'slave' ++ startup(go_daemon=go_daemon, log_file=log_file, label=gconf.label) + resource.Popen.init_errhandler() + + if be_agent: +diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py +index dc0211e..087a202 100644 +--- a/geo-replication/syncdaemon/monitor.py ++++ b/geo-replication/syncdaemon/monitor.py +@@ -24,7 +24,7 @@ import random + from gconf import gconf + from syncdutils import select, waitpid, errno_wrap + from syncdutils import set_term_handler, is_host_local, GsyncdError +-from syncdutils import escape, Thread, finalize, memoize ++from syncdutils import escape, Thread, finalize, memoize, boolify + from syncdutils import gf_event, EVENT_GEOREP_FAULTY + + from gsyncdstatus import GeorepStatus, set_monitor_status +@@ -301,19 +301,29 @@ class Monitor(object): + os.close(pr) + os.close(ra) + os.close(wa) +- os.execv(sys.executable, argv + ['--feedback-fd', str(pw), +- '--local-path', 
w[0]['dir'], +- '--local-node', w[0]['host'], +- '--local-node-id', +- w[0]['uuid'], +- '--local-id', +- '.' + escape(w[0]['dir']), +- '--rpc-fd', +- ','.join([str(rw), str(ww), +- str(ra), str(wa)]), +- '--subvol-num', str(w[2])] + +- (['--is-hottier'] if w[3] else []) + +- ['--resource-remote', remote_host]) ++ args_to_worker = argv + ['--feedback-fd', str(pw), ++ '--local-path', w[0]['dir'], ++ '--local-node', w[0]['host'], ++ '--local-node-id', ++ w[0]['uuid'], ++ '--local-id', ++ '.' + escape(w[0]['dir']), ++ '--rpc-fd', ++ ','.join([str(rw), str(ww), ++ str(ra), str(wa)]), ++ '--subvol-num', str(w[2])] ++ ++ if w[3]: ++ args_to_worker.append('--is-hottier') ++ args_to_worker += ['--resource-remote', remote_host] ++ ++ access_mount = boolify(gconf.access_mount) ++ if access_mount: ++ os.execv(sys.executable, args_to_worker) ++ else: ++ unshare_cmd = ['unshare', '-m', '--propagation', 'private'] ++ cmd = unshare_cmd + args_to_worker ++ os.execvp("unshare", cmd) + + cpids.add(cpid) + agents.add(apid) +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 943e3ec..39d537b 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -989,6 +989,8 @@ class SlaveRemote(object): + extra_opts += ['--local-node', ln] + if boolify(gconf.use_rsync_xattrs): + extra_opts.append('--use-rsync-xattrs') ++ if boolify(gconf.slave_access_mount): ++ extra_opts.append('--slave-access-mount') + po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + + ['-N', '--listen', '--timeout', str(gconf.timeout), + slave], +@@ -1258,6 +1260,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + def __init__(self, params): + self.params = params + self.mntpt = None ++ self.umount_cmd = [] + + @classmethod + def get_glusterprog(cls): +@@ -1348,13 +1351,16 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + assert(mntdata[-1] == '\0') + mntpt = mntdata[:-1] + assert(mntpt) +- if mounted: ++ if mounted and gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): + po = self.umount_l(mntpt) + po.terminate_geterr(fail_on_err=False) + if po.returncode != 0: + po.errlog() + rv = po.returncode +- self.cleanup_mntpt(mntpt) ++ if gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): ++ self.cleanup_mntpt(mntpt) + except: + logging.exception('mount cleanup failure:') + rv = 200 +@@ -1374,6 +1380,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def make_mount_argv(self): + self.mntpt = tempfile.mkdtemp(prefix='gsyncd-aux-mount-') ++ gconf.mount_point = self.mntpt + return [self.get_glusterprog()] + \ + ['--' + p for p in self.params] + [self.mntpt] + +@@ -1405,6 +1412,11 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def handle_mounter(self, po): + self.mntpt = po.stdout.readline()[:-1] ++ gconf.mount_point = self.mntpt ++ gconf.mountbroker = True ++ self.umount_cmd = self.make_cli_argv() + ['umount'] ++ gconf.mbr_umount_cmd = self.umount_cmd ++ + po.stdout.close() + sup(self, po) + if po.returncode != 0: +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index a22289e..8dc6c96 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -16,6 +16,7 @@ import fcntl + import shutil + import logging + import socket ++import subprocess + from threading import Lock, Thread as baseThread + from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED + from errno import EINTR, 
ENOENT, EPERM, ESTALE, EBUSY, errorcode +@@ -188,7 +189,6 @@ def grabpidfile(fname=None, setpid=True): + + final_lock = Lock() + +- + def finalize(*a, **kw): + """all those messy final steps we go trough upon termination + +@@ -233,6 +233,22 @@ def finalize(*a, **kw): + if sys.exc_info()[0] == OSError: + pass + ++ """ Unmount if not done """ ++ if gconf.mount_point: ++ if gconf.mountbroker: ++ umount_cmd = gconf.mbr_umount_cmd + [gconf.mount_point, 'lazy'] ++ else: ++ umount_cmd = ['umount', '-l', gconf.mount_point] ++ p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE) ++ _, errdata = p0.communicate() ++ if p0.returncode == 0: ++ try: ++ os.rmdir(gconf.mount_point) ++ except OSError: ++ pass ++ else: ++ pass ++ + if gconf.log_exit: + logging.info("exiting.") + sys.stdout.flush() +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index fc9125b..d1aa3ea 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -439,6 +439,7 @@ Requires: %{name}%{?_isa} = %{version}-%{release} + Requires: %{name}-server%{?_isa} = %{version}-%{release} + Requires: python python-ctypes + Requires: rsync ++Requires: util-linux + + %description geo-replication + GlusterFS is a distributed file-system capable of scaling to several +@@ -2067,6 +2068,9 @@ fi + %endif + + %changelog ++* Thu Mar 22 2018 Kotresh HR ++- Added util-linux as dependency to georeplication rpm (#1544382) ++ + * Wed Dec 20 2017 Milind Changire + - Remove ExclusiveArch directive to help building on non-x86_64 arches (#1527772) + +-- +1.8.3.1 + diff --git a/SOURCES/0662-storage-posix-Use-the-ret-value-of-posix_gfid_heal.patch b/SOURCES/0662-storage-posix-Use-the-ret-value-of-posix_gfid_heal.patch new file mode 100644 index 0000000..e9fca90 --- /dev/null +++ b/SOURCES/0662-storage-posix-Use-the-ret-value-of-posix_gfid_heal.patch @@ -0,0 +1,82 @@ +From 6380d281a86905f149a3fbdb1f9675647d086ee8 Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Wed, 19 Jul 2017 16:14:59 +0530 +Subject: [PATCH 662/675] storage/posix: Use the ret value of posix_gfid_heal() + +... to make the change in commit acf8cfdf truly useful. + +Without this, a race between entry creation fops and lookup +at posix layer can cause lookups to fail with ENODATA, as +opposed to ENOENT. 
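+
+The pattern is the common negative-errno convention: the helper
+returns 0 or -errno and the caller maps that onto op_ret/op_errno.
+A minimal standalone sketch of the pattern (gfid_heal_model and
+lookup_model are stand-ins, not the real posix_gfid_heal() and
+posix_lookup() in the hunks below):
+
+    #include <errno.h>
+    #include <sys/stat.h>
+
+    static int
+    gfid_heal_model (const char *path)
+    {
+            struct stat st;
+
+            if (lstat (path, &st) != 0)
+                    return -errno;  /* e.g. -ENOENT, not a bare -1 */
+            /* ... heal the gfid xattr; failures here are ignored ... */
+            return 0;
+    }
+
+    static int
+    lookup_model (const char *path, int *op_errno)
+    {
+            int ret = gfid_heal_model (path);
+
+            if (ret < 0) {
+                    *op_errno = -ret;  /* client sees ENOENT, not ENODATA */
+                    return -1;
+            }
+            return 0;
+    }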
+ +> Upstream: https://review.gluster.org/17821 +> BUG: 1472758 +> Change-Id: I44a226872283a25f1f4812f03f68921c5eb335bb + +Change-Id: I44a226872283a25f1f4812f03f68921c5eb335bb +BUG: 1583464 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/140383 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/storage/posix/src/posix-helpers.c | 16 +++++++--------- + xlators/storage/posix/src/posix.c | 7 ++++++- + 2 files changed, 13 insertions(+), 10 deletions(-) + +diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c +index ad3639a..073465b 100644 +--- a/xlators/storage/posix/src/posix-helpers.c ++++ b/xlators/storage/posix/src/posix-helpers.c +@@ -1540,23 +1540,21 @@ posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req + struct stat stat = {0, }; + + if (!xattr_req) +- goto out; ++ return 0; + +- if (sys_lstat (path, &stat) != 0) +- goto out; ++ if (sys_lstat (path, &stat) != 0) { ++ return -errno; ++ } + + ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (ret != 16) { + if (is_fresh_file (&stat)) { +- ret = -1; +- errno = ENOENT; +- goto out; ++ return -ENOENT; + } + } + +- ret = posix_gfid_set (this, path, loc, xattr_req); +-out: +- return ret; ++ posix_gfid_set (this, path, loc, xattr_req); ++ return 0; + } + + +diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c +index 331461c..d499e09 100644 +--- a/xlators/storage/posix/src/posix.c ++++ b/xlators/storage/posix/src/posix.c +@@ -190,7 +190,12 @@ posix_lookup (call_frame_t *frame, xlator_t *this, + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); + + if (gf_uuid_is_null (loc->inode->gfid)) { +- posix_gfid_heal (this, real_path, loc, xdata); ++ op_ret = posix_gfid_heal (this, real_path, loc, xdata); ++ if (op_ret < 0) { ++ op_errno = -op_ret; ++ op_ret = -1; ++ goto out; ++ } + MAKE_ENTRY_HANDLE (real_path, par_path, this, + loc, &buf); + } +-- +1.8.3.1 + diff --git a/SOURCES/0663-features-shard-Pass-the-correct-block-num-to-store-i.patch b/SOURCES/0663-features-shard-Pass-the-correct-block-num-to-store-i.patch new file mode 100644 index 0000000..f1fd287 --- /dev/null +++ b/SOURCES/0663-features-shard-Pass-the-correct-block-num-to-store-i.patch @@ -0,0 +1,43 @@ +From f9854c27c373101670f1e543e3f05fca0cd55797 Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Mon, 26 Feb 2018 15:22:58 +0530 +Subject: [PATCH 663/675] features/shard: Pass the correct block-num to store + in inode ctx + +> Upstream: https://review.gluster.org/19630 +> BUG: 1468483 +> Change-Id: Icf3a5d0598a081adb7d234a60bd15250a5ce1532 + +Change-Id: Icf3a5d0598a081adb7d234a60bd15250a5ce1532 +BUG: 1583462 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/140379 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/shard/src/shard.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 4311f74..3345883 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -641,7 +641,6 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + gf_msg_debug (this->name, 0, "Shard %d already " + "present. gfid=%s. 
Saving inode for future.", + shard_idx_iter, uuid_utoa(inode->gfid)); +- shard_idx_iter++; + local->inode_list[i] = inode; + /* Let the ref on the inodes that are already present + * in inode table still be held so that they don't get +@@ -655,6 +654,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + shard_idx_iter); + } + UNLOCK(&priv->lock); ++ shard_idx_iter++; + + continue; + } else { +-- +1.8.3.1 + diff --git a/SOURCES/0664-features-shard-Leverage-block_num-info-in-inode-ctx-.patch b/SOURCES/0664-features-shard-Leverage-block_num-info-in-inode-ctx-.patch new file mode 100644 index 0000000..d44ccec --- /dev/null +++ b/SOURCES/0664-features-shard-Leverage-block_num-info-in-inode-ctx-.patch @@ -0,0 +1,79 @@ +From 0509f14090296344aeac15f27685900cf8277de0 Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Mon, 26 Feb 2018 15:58:13 +0530 +Subject: [PATCH 664/675] features/shard: Leverage block_num info in inode-ctx + in read callback + +... instead of adding this information in fd_ctx in call path and +retrieving it again in the callback. + +> Upstream: https://review.gluster.org/19633 +> BUG: 1468483 +> Change-Id: Ibbddbbe85baadb7e24aacf5ec8a1250d493d7800 + +Change-Id: Ibbddbbe85baadb7e24aacf5ec8a1250d493d7800 +BUG: 1583462 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/140380 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/shard/src/shard.c | 21 +++------------------ + 1 file changed, 3 insertions(+), 18 deletions(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 3345883..4351220 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -3104,6 +3104,7 @@ shard_readv_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + struct iovec vec = {0,}; + shard_local_t *local = NULL; + fd_t *anon_fd = cookie; ++ shard_inode_ctx_t *ctx = NULL; + + local = frame->local; + +@@ -3122,7 +3123,8 @@ shard_readv_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + if (local->op_ret >= 0) + local->op_ret += op_ret; + +- fd_ctx_get (anon_fd, this, &block_num); ++ shard_inode_ctx_get (anon_fd->inode, this, &ctx); ++ block_num = ctx->block_num; + + if (block_num == local->first_block) { + address = local->iobuf->ptr; +@@ -3175,7 +3177,6 @@ int + shard_readv_do (call_frame_t *frame, xlator_t *this) + { + int i = 0; +- int ret = 0; + int call_count = 0; + int last_block = 0; + int cur_block = 0; +@@ -3232,22 +3233,6 @@ shard_readv_do (call_frame_t *frame, xlator_t *this) + } + } + +- ret = fd_ctx_set (anon_fd, this, cur_block); +- if (ret) { +- gf_msg (this->name, GF_LOG_ERROR, 0, +- SHARD_MSG_FD_CTX_SET_FAILED, +- "Failed to set fd ctx for block %d, gfid=%s", +- cur_block, +- uuid_utoa (local->inode_list[i]->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- shard_readv_do_cbk (frame, (void *) (long) anon_fd, +- this, -1, ENOMEM, NULL, 0, NULL, +- NULL, NULL); +- goto next; +- } +- + STACK_WIND_COOKIE (frame, shard_readv_do_cbk, anon_fd, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, anon_fd, +-- +1.8.3.1 + diff --git a/SOURCES/0665-features-shard-Fix-shard-inode-refcount-when-it-s-pa.patch b/SOURCES/0665-features-shard-Fix-shard-inode-refcount-when-it-s-pa.patch new file mode 100644 index 0000000..22eaa40 --- /dev/null +++ b/SOURCES/0665-features-shard-Fix-shard-inode-refcount-when-it-s-pa.patch @@ -0,0 +1,150 @@ +From 
247bea1a8a76a1ac37b84d679d9767bbce4782cd Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Thu, 15 Feb 2018 16:12:12 +0530 +Subject: [PATCH 665/675] features/shard: Fix shard inode refcount when it's + part of priv->lru_list. + +For as long as a shard's inode is in priv->lru_list, it should have a non-zero +ref-count. This patch achieves it by taking a ref on the inode when it +is added to lru list. When it's time for the inode to be evicted +from the lru list, a corresponding unref is done. + +> Upstream: https://review.gluster.org/19608 +> BUG: 1468483 +> Change-Id: I289ffb41e7be5df7489c989bc1bbf53377433c86 + +Change-Id: I289ffb41e7be5df7489c989bc1bbf53377433c86 +BUG: 1583462 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/140381 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/shard-inode-refcount-test.t | 27 +++++++++++++++++++++++++++ + tests/volume.rc | 18 ++++++++++++++++++ + xlators/features/shard/src/shard.c | 26 +++++++++++++++++--------- + 3 files changed, 62 insertions(+), 9 deletions(-) + create mode 100644 tests/bugs/shard/shard-inode-refcount-test.t + +diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t +new file mode 100644 +index 0000000..6358097 +--- /dev/null ++++ b/tests/bugs/shard/shard-inode-refcount-test.t +@@ -0,0 +1,27 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/${V0}0 ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume start $V0 ++ ++TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++TEST dd if=/dev/zero of=$M0/one-plus-five-shards bs=1M count=23 ++ ++ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0) ++TEST rm -f $M0/one-plus-five-shards ++EXPECT `expr $ACTIVE_INODES_BEFORE - 5` get_mount_active_size_value $V0 ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup +diff --git a/tests/volume.rc b/tests/volume.rc +index 5d6c96c..9ab31af 100644 +--- a/tests/volume.rc ++++ b/tests/volume.rc +@@ -760,3 +760,21 @@ function count_sh_entries() + { + ls $1/.glusterfs/indices/xattrop | grep -v "xattrop-" | wc -l + } ++ ++function get_mount_active_size_value { ++ local vol=$1 ++ local statedump=$(generate_mount_statedump $vol) ++ sleep 1 ++ local val=$(grep "active_size" $statedump | cut -f2 -d'=' | tail -1) ++ rm -f $statedump ++ echo $val ++} ++ ++function get_mount_lru_size_value { ++ local vol=$1 ++ local statedump=$(generate_mount_statedump $vol) ++ sleep 1 ++ local val=$(grep "lru_size" $statedump | cut -f2 -d'=' | tail -1) ++ rm -f $statedump ++ echo $val ++} +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 4351220..c57a426 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -502,6 +502,10 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + * by empty list), and if there is still space in the priv list, + * add this ctx to the tail of the list. + */ ++ /* For as long as an inode is in lru list, we try to ++ * keep it alive by holding a ref on it. 
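++                         * The matching unref is done when the shard is
++                         * evicted from the lru list or unlinked.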
++ */ ++ inode_ref (linked_inode); + gf_uuid_copy (ctx->base_gfid, base_inode->gfid); + ctx->block_num = block_num; + list_add_tail (&ctx->ilist, &priv->ilist_head); +@@ -527,8 +531,16 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + /* The following unref corresponds to the ref held by + * inode_find() above. + */ +- inode_forget (lru_inode, 0); + inode_unref (lru_inode); ++ /* The following unref corresponds to the ref held at ++ * the time the shard was created or looked up ++ */ ++ inode_unref (lru_inode); ++ inode_forget (lru_inode, 0); ++ /* For as long as an inode is in lru list, we try to ++ * keep it alive by holding a ref on it. ++ */ ++ inode_ref (linked_inode); + gf_uuid_copy (ctx->base_gfid, base_inode->gfid); + ctx->block_num = block_num; + list_add_tail (&ctx->ilist, &priv->ilist_head); +@@ -1658,11 +1670,6 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode, + buf); + inode_lookup (linked_inode); + list_index = block_num - local->first_block; +- +- /* Defer unref'ing the inodes until write is complete. These inodes are +- * unref'd in the event of a failure or after successful fop completion +- * in shard_local_wipe(). +- */ + local->inode_list[list_index] = linked_inode; + + LOCK(&priv->lock); +@@ -2520,10 +2527,11 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + if (!list_empty (&ctx->ilist)) { + list_del_init (&ctx->ilist); + priv->inode_count--; ++ GF_ASSERT (priv->inode_count >= 0); ++ inode_unlink (inode, priv->dot_shard_inode, block_bname); ++ inode_unref (inode); ++ inode_forget (inode, 0); + } +- GF_ASSERT (priv->inode_count >= 0); +- inode_unlink (inode, priv->dot_shard_inode, block_bname); +- inode_forget (inode, 0); + } + UNLOCK(&priv->lock); + +-- +1.8.3.1 + diff --git a/SOURCES/0666-features-shard-Upon-FSYNC-from-upper-layers-wind-fsy.patch b/SOURCES/0666-features-shard-Upon-FSYNC-from-upper-layers-wind-fsy.patch new file mode 100644 index 0000000..d434776 --- /dev/null +++ b/SOURCES/0666-features-shard-Upon-FSYNC-from-upper-layers-wind-fsy.patch @@ -0,0 +1,887 @@ +From 590cf967946fd5195876adb1ab449fd2242b03ed Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Wed, 6 Dec 2017 16:55:33 +0530 +Subject: [PATCH 666/675] features/shard: Upon FSYNC from upper layers, wind + fsync on all changed shards + +> Upstream: https://review.gluster.org/19566 +> BUG: 1468483 +> Change-Id: Ib74354f57a18569762ad45a51f182822a2537421 + +Change-Id: Ib74354f57a18569762ad45a51f182822a2537421 +BUG: 1583462 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/140382 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/bug-1468483.t | 58 +++ + tests/bugs/shard/shard-inode-refcount-test.t | 2 +- + xlators/features/shard/src/shard-messages.h | 9 +- + xlators/features/shard/src/shard.c | 534 +++++++++++++++++++++++++-- + xlators/features/shard/src/shard.h | 6 + + 5 files changed, 570 insertions(+), 39 deletions(-) + create mode 100644 tests/bugs/shard/bug-1468483.t + +diff --git a/tests/bugs/shard/bug-1468483.t b/tests/bugs/shard/bug-1468483.t +new file mode 100644 +index 0000000..e462b8d +--- /dev/null ++++ b/tests/bugs/shard/bug-1468483.t +@@ -0,0 +1,58 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. 
$(dirname $0)/../../common-utils.rc ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/${V0}0 ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 16MB ++TEST $CLI volume start $V0 ++TEST $CLI volume profile $V0 start ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++TEST dd if=/dev/zero conv=fsync of=$M0/foo bs=1M count=100 ++ ++#This should ensure /.shard is created on the bricks. ++TEST stat $B0/${V0}0/.shard ++ ++gfid_foo=$(get_gfid_string $M0/foo) ++ ++TEST stat $B0/${V0}0/.shard/$gfid_foo.1 ++TEST stat $B0/${V0}0/.shard/$gfid_foo.2 ++TEST stat $B0/${V0}0/.shard/$gfid_foo.3 ++TEST stat $B0/${V0}0/.shard/$gfid_foo.4 ++TEST stat $B0/${V0}0/.shard/$gfid_foo.5 ++TEST stat $B0/${V0}0/.shard/$gfid_foo.6 ++ ++# For a file with 7 shards, there should be 7 fsyncs on the brick. Without this ++# fix, I was seeing only 1 fsync (on the base shard alone). ++ ++EXPECT "7" echo `$CLI volume profile $V0 info incremental | grep -w FSYNC | awk '{print $8}'` ++ ++useradd -M test_user 2>/dev/null ++ ++TEST touch $M0/bar ++ ++# Change ownership to non-root on bar. ++TEST chown test_user:test_user $M0/bar ++ ++TEST $CLI volume profile $V0 stop ++TEST $CLI volume profile $V0 start ++ ++# Write 100M of data on bar as non-root. ++TEST run_cmd_as_user test_user "dd if=/dev/zero conv=fsync of=$M0/bar bs=1M count=100" ++ ++EXPECT "7" echo `$CLI volume profile $V0 info incremental | grep -w FSYNC | awk '{print $8}'` ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++userdel test_user ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup +diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t +index 6358097..03e0cc9 100644 +--- a/tests/bugs/shard/shard-inode-refcount-test.t ++++ b/tests/bugs/shard/shard-inode-refcount-test.t +@@ -14,7 +14,7 @@ TEST $CLI volume start $V0 + + TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +-TEST dd if=/dev/zero of=$M0/one-plus-five-shards bs=1M count=23 ++TEST dd if=/dev/zero conv=fsync of=$M0/one-plus-five-shards bs=1M count=23 + + ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0) + TEST rm -f $M0/one-plus-five-shards +diff --git a/xlators/features/shard/src/shard-messages.h b/xlators/features/shard/src/shard-messages.h +index 588cb68..8e61630 100644 +--- a/xlators/features/shard/src/shard-messages.h ++++ b/xlators/features/shard/src/shard-messages.h +@@ -40,7 +40,7 @@ + */ + + #define GLFS_COMP_BASE_SHARD GLFS_MSGID_COMP_SHARD +-#define GLFS_NUM_MESSAGES 18 ++#define GLFS_NUM_MESSAGES 19 + #define GLFS_MSGID_END (GLFS_COMP_BASE_SHARD + GLFS_NUM_MESSAGES + 1) + + #define glfs_msg_start_x GLFS_COMP_BASE_SHARD, "Invalid: Start of messages" +@@ -180,5 +180,12 @@ + */ + #define SHARD_MSG_INVALID_FOP (GLFS_COMP_BASE_SHARD + 18) + ++/*! 
++ * @messageid 133019 ++ * @diagnosis ++ * @recommendedaction ++*/ ++#define SHARD_MSG_MEMALLOC_FAILED (GLFS_COMP_BASE_SHARD + 19) ++ + #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" + #endif /* !_SHARD_MESSAGES_H_ */ +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index c57a426..68d1a3a 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -76,6 +76,7 @@ __shard_inode_ctx_get (inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) + return ret; + + INIT_LIST_HEAD (&ctx_p->ilist); ++ INIT_LIST_HEAD (&ctx_p->to_fsync_list); + + ret = __inode_ctx_set (inode, this, (uint64_t *)&ctx_p); + if (ret < 0) { +@@ -205,6 +206,65 @@ shard_inode_ctx_set_refreshed_flag (inode_t *inode, xlator_t *this) + return ret; + } + ++int ++__shard_inode_ctx_add_to_fsync_list (inode_t *base_inode, xlator_t *this, ++ inode_t *shard_inode) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *base_ictx = NULL; ++ shard_inode_ctx_t *shard_ictx = NULL; ++ ++ ret = __shard_inode_ctx_get (base_inode, this, &base_ictx); ++ if (ret) ++ return ret; ++ ++ ret = __shard_inode_ctx_get (shard_inode, this, &shard_ictx); ++ if (ret) ++ return ret; ++ ++ if (shard_ictx->fsync_needed) { ++ shard_ictx->fsync_needed++; ++ return 1; ++ } ++ ++ list_add_tail (&shard_ictx->to_fsync_list, &base_ictx->to_fsync_list); ++ shard_ictx->inode = shard_inode; ++ shard_ictx->fsync_needed++; ++ base_ictx->fsync_count++; ++ shard_ictx->base_inode = base_inode; ++ ++ return 0; ++} ++ ++int ++shard_inode_ctx_add_to_fsync_list (inode_t *base_inode, xlator_t *this, ++ inode_t *shard_inode) ++{ ++ int ret = -1; ++ ++ /* This ref acts as a refkeepr on the base inode. We ++ * need to keep this inode alive as it holds the head ++ * of the to_fsync_list. ++ */ ++ inode_ref (base_inode); ++ ++ LOCK (&base_inode->lock); ++ LOCK (&shard_inode->lock); ++ { ++ ret = __shard_inode_ctx_add_to_fsync_list (base_inode, this, ++ shard_inode); ++ } ++ UNLOCK (&shard_inode->lock); ++ UNLOCK (&base_inode->lock); ++ ++ /* Unref the base inode corresponding to the ref above, if the shard is ++ * found to be already part of the fsync list. 
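++         * (__shard_inode_ctx_add_to_fsync_list () returns 1 in that
++         * case; on a fresh add it returns 0 and the ref is retained.)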
++ */ ++ if (ret != 0) ++ inode_unref (base_inode); ++ return ret; ++} ++ + gf_boolean_t + __shard_inode_ctx_needs_lookup (inode_t *inode, xlator_t *this) + { +@@ -301,6 +361,40 @@ shard_inode_ctx_get_block_size (inode_t *inode, xlator_t *this, + } + + int ++__shard_inode_ctx_get_fsync_count (inode_t *inode, xlator_t *this, ++ int *fsync_count) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; ++ ++ ret = __inode_ctx_get (inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; ++ ++ ctx = (shard_inode_ctx_t *) ctx_uint; ++ ++ *fsync_count = ctx->fsync_needed; ++ ++ return 0; ++} ++ ++int ++shard_inode_ctx_get_fsync_count (inode_t *inode, xlator_t *this, ++ int *fsync_count) ++{ ++ int ret = -1; ++ ++ LOCK (&inode->lock); ++ { ++ ret = __shard_inode_ctx_get_fsync_count (inode, this, ++ fsync_count); ++ } ++ UNLOCK (&inode->lock); ++ ++ return ret; ++} ++int + __shard_inode_ctx_get_all (inode_t *inode, xlator_t *this, + shard_inode_ctx_t *ctx_out) + { +@@ -482,15 +576,19 @@ out: + return ret; + } + +-void ++inode_t * + __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + inode_t *base_inode, int block_num) + { +- char block_bname[256] = {0,}; +- inode_t *lru_inode = NULL; +- shard_priv_t *priv = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_inode_ctx_t *lru_inode_ctx = NULL; ++ char block_bname[256] = {0,}; ++ inode_t *lru_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *lru_inode_ctx = NULL; ++ shard_inode_ctx_t *lru_base_inode_ctx = NULL; ++ inode_t *fsync_inode = NULL; ++ inode_t *lru_base_inode = NULL; ++ gf_boolean_t do_fsync = _gf_false; + + priv = this->private; + +@@ -510,6 +608,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + ctx->block_num = block_num; + list_add_tail (&ctx->ilist, &priv->ilist_head); + priv->inode_count++; ++ ctx->base_inode = base_inode; + } else { + /*If on the other hand there is no available slot for this inode + * in the list, delete the lru inode from the head of the list, +@@ -519,30 +618,56 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + shard_inode_ctx_t, + ilist); + GF_ASSERT (lru_inode_ctx->block_num > 0); ++ lru_base_inode = lru_inode_ctx->base_inode; + list_del_init (&lru_inode_ctx->ilist); + lru_inode = inode_find (linked_inode->table, + lru_inode_ctx->stat.ia_gfid); +- shard_make_block_bname (lru_inode_ctx->block_num, +- lru_inode_ctx->base_gfid, +- block_bname, +- sizeof (block_bname)); +- inode_unlink (lru_inode, priv->dot_shard_inode, +- block_bname); +- /* The following unref corresponds to the ref held by +- * inode_find() above. ++ /* If the lru inode was part of the pending-fsync list, ++ * the base inode needs to be unref'd, the lru inode ++ * deleted from fsync list and fsync'd in a new frame, ++ * and then unlinked in memory and forgotten. 
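++                         * Otherwise the shard can simply be unlinked
++                         * from the inode table and forgotten right away.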
+ */ +- inode_unref (lru_inode); ++ LOCK (&lru_base_inode->lock); ++ LOCK (&lru_inode->lock); ++ { ++ if (!list_empty(&lru_inode_ctx->to_fsync_list)) { ++ list_del_init (&lru_inode_ctx->to_fsync_list); ++ lru_inode_ctx->fsync_needed = 0; ++ do_fsync = _gf_true; ++ __shard_inode_ctx_get (lru_base_inode, this, &lru_base_inode_ctx); ++ lru_base_inode_ctx->fsync_count--; ++ } ++ } ++ UNLOCK (&lru_inode->lock); ++ UNLOCK (&lru_base_inode->lock); ++ ++ if (!do_fsync) { ++ shard_make_block_bname (lru_inode_ctx->block_num, ++ lru_inode_ctx->base_gfid, ++ block_bname, ++ sizeof (block_bname)); + /* The following unref corresponds to the ref held at +- * the time the shard was created or looked up ++ * the time the shard was added to the lru list. ++ */ ++ inode_unref (lru_inode); ++ inode_unlink (lru_inode, priv->dot_shard_inode, ++ block_bname); ++ inode_forget (lru_inode, 0); ++ } else { ++ fsync_inode = lru_inode; ++ inode_unref (lru_base_inode); ++ } ++ /* The following unref corresponds to the ref ++ * held by inode_find() above. + */ + inode_unref (lru_inode); +- inode_forget (lru_inode, 0); + /* For as long as an inode is in lru list, we try to + * keep it alive by holding a ref on it. + */ + inode_ref (linked_inode); + gf_uuid_copy (ctx->base_gfid, base_inode->gfid); + ctx->block_num = block_num; ++ ctx->base_inode = base_inode; + list_add_tail (&ctx->ilist, &priv->ilist_head); + } + } else { +@@ -551,6 +676,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + */ + list_move_tail (&ctx->ilist, &priv->ilist_head); + } ++ return fsync_inode; + } + + int +@@ -617,6 +743,85 @@ shard_common_inode_write_success_unwind (glusterfs_fop_t fop, + } + + int ++shard_evicted_inode_fsync_cbk (call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, int32_t op_errno, ++ struct iatt *prebuf, struct iatt *postbuf, ++ dict_t *xdata) ++{ ++ char block_bname[256] = {0,}; ++ fd_t *anon_fd = cookie; ++ inode_t *shard_inode = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ shard_inode = anon_fd->inode; ++ ++ if (op_ret < 0) { ++ gf_msg (this->name, GF_LOG_WARNING, op_errno, ++ SHARD_MSG_MEMALLOC_FAILED, "fsync failed on shard"); ++ goto out; ++ } ++ ++ LOCK (&priv->lock); ++ LOCK(&shard_inode->lock); ++ { ++ __shard_inode_ctx_get (shard_inode, this, &ctx); ++ if ((list_empty(&ctx->to_fsync_list)) && ++ (list_empty(&ctx->ilist))) { ++ shard_make_block_bname (ctx->block_num, ++ shard_inode->gfid, block_bname, ++ sizeof (block_bname)); ++ inode_unlink (shard_inode, priv->dot_shard_inode, ++ block_bname); ++ /* The following unref corresponds to the ref held by ++ * inode_link() at the time the shard was created or ++ * looked up ++ */ ++ inode_unref (shard_inode); ++ inode_forget (shard_inode, 0); ++ } ++ } ++ UNLOCK(&shard_inode->lock); ++ UNLOCK(&priv->lock); ++ ++out: ++ if (anon_fd) ++ fd_unref (anon_fd); ++ STACK_DESTROY (frame->root); ++ return 0; ++} ++ ++int ++shard_initiate_evicted_inode_fsync (xlator_t *this, inode_t *inode) ++{ ++ fd_t *anon_fd = NULL; ++ call_frame_t *fsync_frame = NULL; ++ ++ fsync_frame = create_frame (this, this->ctx->pool); ++ if (!fsync_frame) { ++ gf_msg (this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, "Failed to create new frame " ++ "to fsync shard"); ++ return -1; ++ } ++ ++ anon_fd = fd_anonymous (inode); ++ if (!anon_fd) { ++ gf_msg (this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, "Failed to create anon fd to" ++ " fsync shard"); ++ STACK_DESTROY 
(fsync_frame->root); ++ return -1; ++ } ++ ++ STACK_WIND_COOKIE (fsync_frame, shard_evicted_inode_fsync_cbk, anon_fd, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, ++ anon_fd, 1, NULL); ++ return 0; ++} ++ ++int + shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t post_res_handler) + { +@@ -625,6 +830,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + char path[PATH_MAX] = {0,}; + inode_t *inode = NULL; + inode_t *res_inode = NULL; ++ inode_t *fsync_inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + +@@ -661,20 +867,22 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + */ + LOCK(&priv->lock); + { +- __shard_update_shards_inode_list (inode, this, ++ fsync_inode = __shard_update_shards_inode_list (inode, ++ this, + res_inode, + shard_idx_iter); + } + UNLOCK(&priv->lock); + shard_idx_iter++; +- ++ if (fsync_inode) ++ shard_initiate_evicted_inode_fsync (this, ++ fsync_inode); + continue; + } else { + local->call_count++; + shard_idx_iter++; + } + } +- + out: + post_res_handler (frame, this); + return 0; +@@ -1657,6 +1865,7 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode, + char block_bname[256] = {0,}; + inode_t *linked_inode = NULL; + xlator_t *this = NULL; ++ inode_t *fsync_inode = NULL; + shard_priv_t *priv = NULL; + + this = THIS; +@@ -1674,10 +1883,14 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode, + + LOCK(&priv->lock); + { +- __shard_update_shards_inode_list (linked_inode, this, +- local->loc.inode, block_num); ++ fsync_inode = __shard_update_shards_inode_list (linked_inode, ++ this, ++ local->loc.inode, ++ block_num); + } + UNLOCK(&priv->lock); ++ if (fsync_inode) ++ shard_initiate_evicted_inode_fsync (this, fsync_inode); + } + + int +@@ -2120,6 +2333,7 @@ shard_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + local->xattr_req = (xdata) ? 
dict_ref (xdata) : dict_new (); + if (!local->xattr_req) + goto err; ++ local->resolver_base_inode = loc->inode; + + shard_lookup_base_file (frame, this, &local->loc, + shard_post_lookup_truncate_handler); +@@ -2172,6 +2386,7 @@ shard_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + + local->loc.inode = inode_ref (fd->inode); + gf_uuid_copy (local->loc.gfid, fd->inode->gfid); ++ local->resolver_base_inode = fd->inode; + + shard_lookup_base_file (frame, this, &local->loc, + shard_post_lookup_truncate_handler); +@@ -2509,32 +2724,48 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + { + char block_bname[256] = {0,}; + inode_t *inode = NULL; ++ inode_t *base_inode = NULL; + xlator_t *this = NULL; + shard_priv_t *priv = NULL; + shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *base_ictx = NULL; ++ gf_boolean_t unlink_unref_forget = _gf_false; + + this = THIS; + priv = this->private; + + inode = local->inode_list[shard_block_num - local->first_block]; ++ base_inode = local->resolver_base_inode; + + shard_make_block_bname (shard_block_num, (local->loc.inode)->gfid, + block_bname, sizeof (block_bname)); + + LOCK(&priv->lock); ++ LOCK(&base_inode->lock); ++ LOCK(&inode->lock); + { +- shard_inode_ctx_get (inode, this, &ctx); ++ __shard_inode_ctx_get (inode, this, &ctx); + if (!list_empty (&ctx->ilist)) { + list_del_init (&ctx->ilist); + priv->inode_count--; + GF_ASSERT (priv->inode_count >= 0); +- inode_unlink (inode, priv->dot_shard_inode, block_bname); +- inode_unref (inode); +- inode_forget (inode, 0); ++ unlink_unref_forget = _gf_true; ++ } ++ if (ctx->fsync_needed) { ++ inode_unref (base_inode); ++ list_del_init (&ctx->to_fsync_list); ++ __shard_inode_ctx_get (base_inode, this, &base_ictx); ++ base_ictx->fsync_count--; + } + } ++ UNLOCK(&inode->lock); ++ UNLOCK(&base_inode->lock); ++ if (unlink_unref_forget) { ++ inode_unlink (inode, priv->dot_shard_inode, block_bname); ++ inode_unref (inode); ++ inode_forget (inode, 0); ++ } + UNLOCK(&priv->lock); +- + } + + int +@@ -2755,6 +2986,7 @@ shard_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + local->xflag = xflag; + local->xattr_req = (xdata) ? dict_ref (xdata) : dict_new (); + local->block_size = block_size; ++ local->resolver_base_inode = loc->inode; + local->fop = GF_FOP_UNLINK; + if (!this->itable) + this->itable = (local->loc.inode)->table; +@@ -2991,6 +3223,7 @@ shard_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + frame->local = local; + loc_copy (&local->loc, oldloc); + loc_copy (&local->loc2, newloc); ++ local->resolver_base_inode = newloc->inode; + local->fop = GF_FOP_RENAME; + local->xattr_req = (xdata) ? 
dict_ref (xdata) : dict_new(); + if (!local->xattr_req) +@@ -3757,6 +3990,10 @@ shard_common_inode_write_do_cbk (call_frame_t *frame, void *cookie, + local->delta_size += (post->ia_size - pre->ia_size); + shard_inode_ctx_set (local->fd->inode, this, post, 0, + SHARD_MASK_TIMES); ++ if (local->fd->inode != anon_fd->inode) ++ shard_inode_ctx_add_to_fsync_list (local->fd->inode, ++ this, ++ anon_fd->inode); + } + } + UNLOCK (&frame->lock); +@@ -4207,18 +4444,199 @@ shard_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + } + + int +-shard_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +- struct iatt *postbuf, dict_t *xdata) ++__shard_get_timestamps_from_inode_ctx (shard_local_t *local, inode_t *inode, ++ xlator_t *this) + { +- if (op_ret < 0) ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; ++ ++ ret = __inode_ctx_get (inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; ++ ++ ctx = (shard_inode_ctx_t *) ctx_uint; ++ ++ local->postbuf.ia_ctime = ctx->stat.ia_ctime; ++ local->postbuf.ia_ctime_nsec = ctx->stat.ia_ctime_nsec; ++ local->postbuf.ia_atime = ctx->stat.ia_atime; ++ local->postbuf.ia_atime_nsec = ctx->stat.ia_atime_nsec; ++ local->postbuf.ia_mtime = ctx->stat.ia_mtime; ++ local->postbuf.ia_mtime_nsec = ctx->stat.ia_mtime_nsec; ++ ++ return 0; ++} ++ ++int ++shard_get_timestamps_from_inode_ctx (shard_local_t *local, inode_t *inode, ++ xlator_t *this) ++{ ++ int ret = 0; ++ ++ LOCK (&inode->lock); ++ { ++ ret = __shard_get_timestamps_from_inode_ctx (local, inode, ++ this); ++ } ++ UNLOCK (&inode->lock); ++ ++ return ret; ++} ++ ++int ++shard_fsync_shards_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ int call_count = 0; ++ uint64_t fsync_count = 0; ++ fd_t *anon_fd = cookie; ++ shard_local_t *local = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *base_ictx = NULL; ++ inode_t *base_inode = NULL; ++ ++ local = frame->local; ++ base_inode = local->fd->inode; ++ ++ if (local->op_ret < 0) + goto out; + +- /* To-Do: Wind fsync on all shards of the file */ +- postbuf->ia_ctime = 0; ++ LOCK (&frame->lock); ++ { ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ UNLOCK (&frame->lock); ++ goto out; ++ } ++ shard_inode_ctx_set (local->fd->inode, this, postbuf, 0, ++ SHARD_MASK_TIMES); ++ } ++ UNLOCK (&frame->lock); ++ fd_ctx_get (anon_fd, this, &fsync_count); + out: +- SHARD_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, +- xdata); ++ if (base_inode != anon_fd->inode) { ++ LOCK (&base_inode->lock); ++ LOCK (&anon_fd->inode->lock); ++ { ++ __shard_inode_ctx_get (anon_fd->inode, this, &ctx); ++ __shard_inode_ctx_get (base_inode, this, &base_ictx); ++ if (op_ret == 0) ++ ctx->fsync_needed -= fsync_count; ++ GF_ASSERT (ctx->fsync_needed >= 0); ++ list_del_init (&ctx->to_fsync_list); ++ if (ctx->fsync_needed != 0) { ++ list_add_tail (&ctx->to_fsync_list, ++ &base_ictx->to_fsync_list); ++ base_ictx->fsync_count++; ++ } ++ } ++ UNLOCK (&anon_fd->inode->lock); ++ UNLOCK (&base_inode->lock); ++ } ++ if (anon_fd) ++ fd_unref (anon_fd); ++ ++ call_count = shard_call_count_return (frame); ++ if (call_count != 0) ++ return 0; ++ ++ if (local->op_ret < 0) { ++ SHARD_STACK_UNWIND (fsync, frame, local->op_ret, ++ local->op_errno, NULL, NULL, NULL); ++ } else { ++ shard_get_timestamps_from_inode_ctx (local, base_inode, 
this); ++ SHARD_STACK_UNWIND (fsync, frame, local->op_ret, ++ local->op_errno, &local->prebuf, ++ &local->postbuf, local->xattr_rsp); ++ } ++ return 0; ++} ++ ++int ++shard_post_lookup_fsync_handler (call_frame_t *frame, xlator_t *this) ++{ ++ int ret = 0; ++ int call_count = 0; ++ int fsync_count = 0; ++ fd_t *anon_fd = NULL; ++ inode_t *base_inode = NULL; ++ shard_local_t *local = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *iter = NULL; ++ struct list_head copy = {0,}; ++ shard_inode_ctx_t *tmp = NULL; ++ ++ local = frame->local; ++ base_inode = local->fd->inode; ++ local->postbuf = local->prebuf; ++ INIT_LIST_HEAD (©); ++ ++ if (local->op_ret < 0) { ++ SHARD_STACK_UNWIND (fsync, frame, local->op_ret, ++ local->op_errno, NULL, NULL, NULL); ++ return 0; ++ } ++ ++ LOCK (&base_inode->lock); ++ { ++ __shard_inode_ctx_get (base_inode, this, &ctx); ++ list_splice_init (&ctx->to_fsync_list, ©); ++ call_count = ctx->fsync_count; ++ ctx->fsync_count = 0; ++ } ++ UNLOCK (&base_inode->lock); ++ ++ local->call_count = ++call_count; ++ ++ /* Send fsync() on the base shard first */ ++ anon_fd = fd_ref (local->fd); ++ STACK_WIND_COOKIE (frame, shard_fsync_shards_cbk, anon_fd, ++ FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsync, anon_fd, ++ local->datasync, local->xattr_req); ++ call_count--; ++ anon_fd = NULL; ++ ++ list_for_each_entry_safe (iter, tmp, ©, to_fsync_list) { ++ fsync_count = 0; ++ shard_inode_ctx_get_fsync_count (iter->inode, this, ++ &fsync_count); ++ GF_ASSERT (fsync_count > 0); ++ anon_fd = fd_anonymous (iter->inode); ++ if (!anon_fd) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ gf_msg (this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, "Failed to create " ++ "anon fd to fsync shard"); ++ shard_fsync_shards_cbk (frame, (void *) (long) anon_fd, ++ this, -1, ENOMEM, NULL, NULL, ++ NULL); ++ continue; ++ } ++ ++ ret = fd_ctx_set (anon_fd, this, fsync_count); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ SHARD_MSG_FD_CTX_SET_FAILED, "Failed to set fd " ++ "ctx for shard inode gfid=%s", ++ uuid_utoa (iter->inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ shard_fsync_shards_cbk (frame, (void *) (long) anon_fd, ++ this, -1, ENOMEM, NULL, NULL, ++ NULL); ++ continue; ++ } ++ STACK_WIND_COOKIE (frame, shard_fsync_shards_cbk, anon_fd, ++ FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsync, anon_fd, ++ local->datasync, local->xattr_req); ++ call_count--; ++ } ++ + return 0; + } + +@@ -4226,8 +4644,50 @@ int + shard_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) + { +- STACK_WIND (frame, shard_fsync_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); ++ int ret = 0; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size (fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ SHARD_MSG_INODE_CTX_GET_FAILED, "Failed to get block " ++ "size for %s from its inode ctx", ++ uuid_utoa (fd->inode->gfid)); ++ goto err; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsync, fd, datasync, ++ xdata); ++ return 0; ++ } ++ ++ if (!this->itable) ++ this->itable = fd->inode->table; ++ ++ local = mem_get0 (this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ++ local->fd = fd_ref (fd); ++ local->fop = GF_FOP_FSYNC; ++ local->datasync = datasync; 
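++        /* Remember the fsync mode: shard_post_lookup_fsync_handler()
++         * winds fsync with this same datasync flag on the base shard
++         * and on every shard in the base inode's to_fsync_list. */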
++        local->xattr_req = (xdata) ? dict_ref (xdata) : dict_new ();
++        if (!local->xattr_req)
++                goto err;
++
++        local->loc.inode = inode_ref (fd->inode);
++        gf_uuid_copy (local->loc.gfid, fd->inode->gfid);
++
++        shard_lookup_base_file (frame, this, &local->loc,
++                                shard_post_lookup_fsync_handler);
++        return 0;
++err:
++        SHARD_STACK_UNWIND (fsync, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+ }
+
+diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h
+index 7319598..75d39a1 100644
+--- a/xlators/features/shard/src/shard.h
++++ b/xlators/features/shard/src/shard.h
+@@ -215,6 +215,7 @@ typedef struct shard_local {
+         uint32_t gid;
+         uint64_t block_size;
+         uint64_t dst_block_size;
++        int32_t datasync;
+         off_t offset;
+         size_t total_size;
+         size_t written_size;
+@@ -270,6 +271,11 @@ typedef struct shard_inode_ctx {
+         uuid_t base_gfid;
+         int block_num;
+         gf_boolean_t refreshed;
++        struct list_head to_fsync_list;
++        int fsync_needed;
++        inode_t *inode;
++        int fsync_count;
++        inode_t *base_inode;
+ } shard_inode_ctx_t;
+
+ #endif /* __SHARD_H__ */
+--
+1.8.3.1
+
diff --git a/SOURCES/0667-features-shard-Do-list_del_init-while-list-memory-is.patch b/SOURCES/0667-features-shard-Do-list_del_init-while-list-memory-is.patch
new file mode 100644
index 0000000..1b466b3
--- /dev/null
+++ b/SOURCES/0667-features-shard-Do-list_del_init-while-list-memory-is.patch
@@ -0,0 +1,55 @@
+From d005ede12e1da99dc077bf9e1c563a0f9f4066e3 Mon Sep 17 00:00:00 2001
+From: Pranith Kumar K
+Date: Mon, 19 Mar 2018 12:09:18 +0530
+Subject: [PATCH 667/675] features/shard: Do list_del_init() while list memory
+ is valid
+
+> Upstream: https://review.gluster.org/19737
+> BUG: 1557876
+> Change-Id: If429d3634219e1a435bd0da0ed985c646c59c2ca
+
+Problem:
+shard_post_lookup_fsync_handler() goes over the list of inode-ctxs that need
+to be fsynced, and in the cbk it removes each inode-ctx from the list. When
+the first member of the list is removed, it tries to modify the list head's
+memory with the latest next/prev pointers, and when this happens there is no
+guarantee that the list head, which lives on the stack of
+shard_post_lookup_fsync_handler(), is still valid.
+
+Fix:
+Do list_del_init() in the loop, before winding fsync.
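+
+A minimal sketch of the safe pattern (illustrative only, not part of this
+patch): it assumes the Linux-kernel-style list macros that libglusterfs's
+list.h also provides, plus a hypothetical asynchronous wind_fsync() helper.
+
+    struct shard_ctx {
+            struct list_head to_fsync_list;
+    };
+
+    static void
+    drain_fsync_list (struct list_head *copy) /* 'copy' lives on the stack */
+    {
+            struct shard_ctx *iter = NULL;
+            struct shard_ctx *tmp = NULL;
+
+            list_for_each_entry_safe (iter, tmp, copy, to_fsync_list) {
+                    /* Detach the entry now, while 'copy' is still in
+                     * scope; deferring this to the fsync callback could
+                     * touch a stack frame that has already unwound. */
+                    list_del_init (&iter->to_fsync_list);
+                    wind_fsync (iter); /* hypothetical async call */
+            }
+    }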
+
+Change-Id: If429d3634219e1a435bd0da0ed985c646c59c2ca
+BUG: 1585046
+Signed-off-by: Pranith Kumar K
+Reviewed-on: https://code.engineering.redhat.com/gerrit/140483
+Tested-by: Krutika Dhananjay
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/features/shard/src/shard.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
+index 68d1a3a..651d08a 100644
+--- a/xlators/features/shard/src/shard.c
++++ b/xlators/features/shard/src/shard.c
+@@ -4525,7 +4525,6 @@ out:
+                 if (op_ret == 0)
+                         ctx->fsync_needed -= fsync_count;
+                 GF_ASSERT (ctx->fsync_needed >= 0);
+-                list_del_init (&ctx->to_fsync_list);
+                 if (ctx->fsync_needed != 0) {
+                         list_add_tail (&ctx->to_fsync_list,
+                                        &base_ictx->to_fsync_list);
+@@ -4600,6 +4599,7 @@ shard_post_lookup_fsync_handler (call_frame_t *frame, xlator_t *this)
+         anon_fd = NULL;
+
+         list_for_each_entry_safe (iter, tmp, &copy, to_fsync_list) {
++                list_del_init (&iter->to_fsync_list);
+                 fsync_count = 0;
+                 shard_inode_ctx_get_fsync_count (iter->inode, this,
+                                                  &fsync_count);
+--
+1.8.3.1
+
diff --git a/SOURCES/0668-storage-posix-Add-active-fd-count-option-in-gluster.patch b/SOURCES/0668-storage-posix-Add-active-fd-count-option-in-gluster.patch
new file mode 100644
index 0000000..2b0fc9c
--- /dev/null
+++ b/SOURCES/0668-storage-posix-Add-active-fd-count-option-in-gluster.patch
@@ -0,0 +1,219 @@
+From 56cb9390434a26a0b2d9c364899eb436799fdf30 Mon Sep 17 00:00:00 2001
+From: Pranith Kumar K
+Date: Mon, 19 Mar 2018 15:12:14 +0530
+Subject: [PATCH 668/675] storage/posix: Add active-fd-count option in gluster
+
+Problem:
+When dd happens on a sharded replicate volume, all the writes on shards go
+through an anon-fd. When the writes don't come in quickly enough, the old
+anon-fd closes and a new fd gets created to serve the new writes.
+open-fd-count is decremented only after the fd is closed, as part of
+fd_destroy(). So even while one fd is on its way to being closed, a new fd
+is created, and during this short period it appears as though there are
+multiple fds opened on the file. AFR thinks another application has opened
+the same file and switches off eager-lock, leading to extra latency.
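+
+As a rough timeline of the window (an illustrative sketch, not part of the
+patch; fd_unref() and fd_anonymous() are the real libglusterfs calls, and
+the comments narrate the sequence described above):
+
+    fd_unref (old_anon_fd);         /* refcount drops to 0, but the      */
+                                    /* inode's fd_count only drops later,*/
+                                    /* inside fd_destroy()               */
+    new_fd = fd_anonymous (inode);  /* the next write binds a fresh anon */
+                                    /* fd, so fd_count briefly reads 2   */
+                                    /* and AFR switches off eager-lock   */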
+ +Fix: +Have a different option called active-fd whose life cycle starts at +fd_bind() and ends just before fd_destroy() + + >BUG: 1557932 + +Change-Id: If6cadbe97d183e124f9dc4672a5b621cbe3324cb +Upstream-patch: https://review.gluster.org/19740 +BUG: 1583733 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/140573 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/fd.c | 2 ++ + libglusterfs/src/glusterfs.h | 1 + + libglusterfs/src/inode.c | 2 ++ + libglusterfs/src/inode.h | 1 + + tests/volume.rc | 12 +++++++ + xlators/storage/posix/src/posix-helpers.c | 52 ++++++++++++------------------- + xlators/storage/posix/src/posix.c | 12 +++++++ + 7 files changed, 50 insertions(+), 32 deletions(-) + +diff --git a/libglusterfs/src/fd.c b/libglusterfs/src/fd.c +index 118f876..d91f4de 100644 +--- a/libglusterfs/src/fd.c ++++ b/libglusterfs/src/fd.c +@@ -557,6 +557,7 @@ fd_unref (fd_t *fd) + if (refcount == 0) { + if (!list_empty (&fd->inode_list)) { + list_del_init (&fd->inode_list); ++ fd->inode->active_fd_count--; + bound = _gf_true; + } + } +@@ -578,6 +579,7 @@ __fd_bind (fd_t *fd) + list_del_init (&fd->inode_list); + list_add (&fd->inode_list, &fd->inode->fd_list); + fd->inode->fd_count++; ++ fd->inode->active_fd_count++; + + return fd; + } +diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h +index d13e5bd..277511a 100644 +--- a/libglusterfs/src/glusterfs.h ++++ b/libglusterfs/src/glusterfs.h +@@ -159,6 +159,7 @@ + #define GLUSTERFS_WRITE_IS_APPEND "glusterfs.write-is-append" + #define GLUSTERFS_WRITE_UPDATE_ATOMIC "glusterfs.write-update-atomic" + #define GLUSTERFS_OPEN_FD_COUNT "glusterfs.open-fd-count" ++#define GLUSTERFS_ACTIVE_FD_COUNT "glusterfs.open-active-fd-count" + #define GLUSTERFS_INODELK_COUNT "glusterfs.inodelk-count" + #define GLUSTERFS_ENTRYLK_COUNT "glusterfs.entrylk-count" + #define GLUSTERFS_POSIXLK_COUNT "glusterfs.posixlk-count" +diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c +index 0353825..2f67cf4 100644 +--- a/libglusterfs/src/inode.c ++++ b/libglusterfs/src/inode.c +@@ -2332,6 +2332,8 @@ inode_dump (inode_t *inode, char *prefix) + gf_proc_dump_write("gfid", "%s", uuid_utoa (inode->gfid)); + gf_proc_dump_write("nlookup", "%ld", inode->nlookup); + gf_proc_dump_write("fd-count", "%u", inode->fd_count); ++ gf_proc_dump_write("active-fd-count", "%u", ++ inode->active_fd_count); + gf_proc_dump_write("ref", "%u", inode->ref); + gf_proc_dump_write("ia_type", "%d", inode->ia_type); + if (inode->_ctx) { +diff --git a/libglusterfs/src/inode.h b/libglusterfs/src/inode.h +index e4ad046..506f495 100644 +--- a/libglusterfs/src/inode.h ++++ b/libglusterfs/src/inode.h +@@ -89,6 +89,7 @@ struct _inode { + gf_lock_t lock; + uint64_t nlookup; + uint32_t fd_count; /* Open fd count */ ++ uint32_t active_fd_count; /* Active open fd count */ + uint32_t ref; /* reference count on this inode */ + ia_type_t ia_type; /* what kind of file */ + struct list_head fd_list; /* list of open files on this inode */ +diff --git a/tests/volume.rc b/tests/volume.rc +index 9ab31af..0d20b0e 100644 +--- a/tests/volume.rc ++++ b/tests/volume.rc +@@ -778,3 +778,15 @@ function get_mount_lru_size_value { + rm -f $statedump + echo $val + } ++ ++function get_active_fd_count { ++ local vol=$1 ++ local host=$2 ++ local brick=$3 ++ local fname=$4 ++ local gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $brick/$fname)) ++ local statedump=$(generate_brick_statedump $vol $host $brick) ++ local 
count=$(grep "gfid=$gfid_str" $statedump -A2 -B1 | grep $brick -A3 | grep -w active-fd-count | cut -f2 -d'=' | tail -1) ++ rm -f $statedump ++ echo $count ++} +diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c +index 073465b..5f8304e 100644 +--- a/xlators/storage/posix/src/posix-helpers.c ++++ b/xlators/storage/posix/src/posix-helpers.c +@@ -371,27 +371,6 @@ _get_filler_inode (posix_xattr_filler_t *filler) + } + + static int +-_posix_filler_get_openfd_count (posix_xattr_filler_t *filler, char *key) +-{ +- inode_t *inode = NULL; +- int ret = -1; +- +- inode = _get_filler_inode (filler); +- if (!inode || gf_uuid_is_null (inode->gfid)) +- goto out; +- +- ret = dict_set_uint32 (filler->xattr, key, inode->fd_count); +- if (ret < 0) { +- gf_msg (filler->this->name, GF_LOG_WARNING, 0, +- P_MSG_DICT_SET_FAILED, +- "Failed to set dictionary value for %s", key); +- goto out; +- } +-out: +- return ret; +-} +- +-static int + _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data, + void *xattrargs) + { +@@ -399,8 +378,8 @@ _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data, + int ret = -1; + char *databuf = NULL; + int _fd = -1; +- loc_t *loc = NULL; + ssize_t req_size = 0; ++ inode_t *inode = NULL; + + + if (posix_xattr_ignorable (key)) +@@ -477,16 +456,25 @@ _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data, + GF_FREE (databuf); + } + } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { +- ret = _posix_filler_get_openfd_count (filler, key); +- loc = filler->loc; +- if (loc) { +- ret = dict_set_uint32 (filler->xattr, key, +- loc->inode->fd_count); +- if (ret < 0) +- gf_msg (filler->this->name, GF_LOG_WARNING, 0, +- P_MSG_XDATA_GETXATTR, +- "Failed to set dictionary value for %s", +- key); ++ inode = _get_filler_inode (filler); ++ if (!inode || gf_uuid_is_null (inode->gfid)) ++ goto out; ++ ret = dict_set_uint32 (filler->xattr, key, inode->fd_count); ++ if (ret < 0) { ++ gf_msg (filler->this->name, GF_LOG_WARNING, 0, ++ P_MSG_DICT_SET_FAILED, ++ "Failed to set dictionary value for %s", key); ++ } ++ } else if (!strcmp (key, GLUSTERFS_ACTIVE_FD_COUNT)) { ++ inode = _get_filler_inode (filler); ++ if (!inode || gf_uuid_is_null (inode->gfid)) ++ goto out; ++ ret = dict_set_uint32 (filler->xattr, key, ++ inode->active_fd_count); ++ if (ret < 0) { ++ gf_msg (filler->this->name, GF_LOG_WARNING, 0, ++ P_MSG_DICT_SET_FAILED, ++ "Failed to set dictionary value for %s", key); + } + } else if (!strcmp (key, GET_ANCESTRY_PATH_KEY)) { + /* As of now, the only consumers of POSIX_ANCESTRY_PATH attempt +diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c +index d499e09..b0d7037 100644 +--- a/xlators/storage/posix/src/posix.c ++++ b/xlators/storage/posix/src/posix.c +@@ -3353,6 +3353,18 @@ _fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) + } + } + ++ if (dict_get (xdata, GLUSTERFS_ACTIVE_FD_COUNT)) { ++ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_ACTIVE_FD_COUNT, ++ fd->inode->active_fd_count); ++ if (ret < 0) { ++ gf_msg (this->name, GF_LOG_WARNING, 0, ++ P_MSG_DICT_SET_FAILED, "%s: Failed to set " ++ "dictionary value for %s", ++ uuid_utoa (fd->inode->gfid), ++ GLUSTERFS_ACTIVE_FD_COUNT); ++ } ++ } ++ + if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); +-- +1.8.3.1 + diff --git a/SOURCES/0669-cluster-afr-Make-afr_fsync-a-transaction.patch 
b/SOURCES/0669-cluster-afr-Make-afr_fsync-a-transaction.patch new file mode 100644 index 0000000..749b005 --- /dev/null +++ b/SOURCES/0669-cluster-afr-Make-afr_fsync-a-transaction.patch @@ -0,0 +1,362 @@ +From 6191553d871dc004185a504682176cd7afb9deef Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Fri, 23 Feb 2018 15:12:19 +0530 +Subject: [PATCH 669/675] cluster/afr: Make afr_fsync a transaction + +Upstream patch: https://review.gluster.org/#/c/19621/ + +BUG: 1583733 +Change-Id: I5a336e8d33dd9612c0c81f1b1020aa10ebc09220 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/140574 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-common.c | 165 ------------------------------ + xlators/cluster/afr/src/afr-inode-write.c | 106 +++++++++++++++++++ + xlators/cluster/afr/src/afr-inode-write.h | 4 + + xlators/cluster/afr/src/afr.c | 2 +- + xlators/cluster/afr/src/afr.h | 4 + + 5 files changed, 115 insertions(+), 166 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index d96a819..cfd3d60 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -3218,171 +3218,6 @@ out: + return 0; + } + +-/* }}} */ +- +- +-/* {{{ fsync */ +- +-int +-afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +- struct iatt *postbuf, dict_t *xdata) +-{ +- AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, +- xdata); +- return 0; +-} +- +-int +-afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +- struct iatt *postbuf, dict_t *xdata) +-{ +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- int i = 0; +- int call_count = -1; +- int child_index = (long) cookie; +- int read_subvol = 0; +- call_stub_t *stub = NULL; +- +- local = frame->local; +- priv = this->private; +- +- LOCK (&frame->lock); +- { +- local->replies[child_index].valid = 1; +- local->replies[child_index].op_ret = op_ret; +- local->replies[child_index].op_errno = op_errno; +- if (op_ret == 0) { +- if (prebuf) +- local->replies[child_index].prestat = *prebuf; +- if (postbuf) +- local->replies[child_index].poststat = *postbuf; +- if (xdata) +- local->replies[child_index].xdata = +- dict_ref (xdata); +- } +- } +- UNLOCK (&frame->lock); +- +- call_count = afr_frame_return (frame); +- +- if (call_count == 0) { +- local->op_ret = -1; +- local->op_errno = afr_final_errno (local, priv); +- read_subvol = afr_data_subvol_get (local->inode, this, NULL, +- local->readable, NULL, NULL); +- /* Pick a reply that is valid and readable, with a preference +- * given to read_subvol. */ +- for (i = 0; i < priv->child_count; i++) { +- if (!local->replies[i].valid) +- continue; +- if (local->replies[i].op_ret != 0) +- continue; +- if (!local->readable[i]) +- continue; +- local->op_ret = local->replies[i].op_ret; +- local->op_errno = local->replies[i].op_errno; +- local->cont.inode_wfop.prebuf = +- local->replies[i].prestat; +- local->cont.inode_wfop.postbuf = +- local->replies[i].poststat; +- if (local->replies[i].xdata) { +- if (local->xdata_rsp) +- dict_unref (local->xdata_rsp); +- local->xdata_rsp = +- dict_ref (local->replies[i].xdata); +- } +- if (i == read_subvol) +- break; +- } +- +- /* Make a stub out of the frame, and register it +- with the waking up post-op. 
When the call-stub resumes, +- we are guaranteed that there was no post-op pending +- (i.e changelogs were unset in the server). This is an +- essential "guarantee", that fsync() returns only after +- completely finishing EVERYTHING, including the delayed +- post-op. This guarantee is expected by FUSE graph switching +- for example. +- */ +- stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, +- local->op_ret, local->op_errno, +- &local->cont.inode_wfop.prebuf, +- &local->cont.inode_wfop.postbuf, +- local->xdata_rsp); +- if (!stub) { +- AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); +- return 0; +- } +- +- /* If no new unstable writes happened between the +- time we cleared the unstable write witness flag in afr_fsync +- and now, calling afr_delayed_changelog_wake_up() should +- wake up and skip over the fsync phase and go straight to +- afr_changelog_post_op_now() +- */ +- afr_delayed_changelog_wake_resume (this, local->fd, stub); +- } +- +- return 0; +-} +- +- +-int +-afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, +- dict_t *xdata) +-{ +- afr_private_t *priv = NULL; +- afr_local_t *local = NULL; +- int i = 0; +- int32_t call_count = 0; +- int32_t op_errno = ENOMEM; +- +- priv = this->private; +- +- local = AFR_FRAME_INIT (frame, op_errno); +- if (!local) +- goto out; +- +- call_count = local->call_count; +- if (!call_count) { +- op_errno = ENOTCONN; +- goto out; +- } +- +- local->fd = fd_ref (fd); +- +- if (afr_fd_has_witnessed_unstable_write (this, fd)) { +- /* don't care. we only wanted to CLEAR the bit */ +- } +- +- local->inode = inode_ref (fd->inode); +- +- for (i = 0; i < priv->child_count; i++) { +- if (local->child_up[i]) { +- STACK_WIND_COOKIE (frame, afr_fsync_cbk, +- (void *) (long) i, +- priv->children[i], +- priv->children[i]->fops->fsync, +- fd, datasync, xdata); +- if (!--call_count) +- break; +- } +- } +- +- return 0; +-out: +- AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); +- +- return 0; +-} +- +-/* }}} */ +- +-/* {{{ fsync */ +- + int + afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c +index 04bbf21..c740599 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.c ++++ b/xlators/cluster/afr/src/afr-inode-write.c +@@ -2511,3 +2511,109 @@ out: + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; + } ++ ++ ++int ++afr_fsync_unwind (call_frame_t *frame, xlator_t *this) ++{ ++ afr_local_t *local = NULL; ++ call_frame_t *main_frame = NULL; ++ ++ local = frame->local; ++ ++ main_frame = afr_transaction_detach_fop_frame (frame); ++ if (!main_frame) ++ return 0; ++ ++ AFR_STACK_UNWIND (fsync, main_frame, local->op_ret, local->op_errno, ++ &local->cont.inode_wfop.prebuf, ++ &local->cont.inode_wfop.postbuf, local->xdata_rsp); ++ ++ return 0; ++} ++ ++ ++int ++afr_fsync_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, ++ prebuf, postbuf, NULL, xdata); ++} ++ ++ ++int ++afr_fsync_wind (call_frame_t *frame, xlator_t *this, int subvol) ++{ ++ afr_local_t *local = NULL; ++ afr_private_t *priv = NULL; ++ ++ local = frame->local; ++ priv = this->private; ++ ++ STACK_WIND_COOKIE (frame, afr_fsync_wind_cbk, (void *)(long) subvol, ++ priv->children[subvol], ++ 
priv->children[subvol]->fops->fsync, ++ local->fd, local->cont.fsync.datasync, ++ local->xdata_req); ++ return 0; ++} ++ ++int ++afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, ++ dict_t *xdata) ++{ ++ afr_local_t *local = NULL; ++ call_frame_t *transaction_frame = NULL; ++ int ret = -1; ++ int32_t op_errno = ENOMEM; ++ ++ transaction_frame = copy_frame (frame); ++ if (!transaction_frame) ++ goto out; ++ ++ local = AFR_FRAME_INIT (transaction_frame, op_errno); ++ if (!local) ++ goto out; ++ ++ if (xdata) ++ local->xdata_req = dict_copy_with_ref (xdata, NULL); ++ else ++ local->xdata_req = dict_new (); ++ ++ if (!local->xdata_req) ++ goto out; ++ ++ local->fd = fd_ref (fd); ++ local->inode = inode_ref (fd->inode); ++ ++ local->op = GF_FOP_FSYNC; ++ local->cont.fsync.datasync = datasync; ++ ++ if (afr_fd_has_witnessed_unstable_write (this, fd)) { ++ /* don't care. we only wanted to CLEAR the bit */ ++ } ++ ++ local->transaction.wind = afr_fsync_wind; ++ local->transaction.fop = __afr_txn_write_fop; ++ local->transaction.done = __afr_txn_write_done; ++ local->transaction.unwind = afr_fsync_unwind; ++ ++ local->transaction.main_frame = frame; ++ ++ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); ++ if (ret < 0) { ++ op_errno = -ret; ++ goto out; ++ } ++ ++ return 0; ++out: ++ if (transaction_frame) ++ AFR_STACK_DESTROY (transaction_frame); ++ ++ AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); ++ ++ return 0; ++} +diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h +index e174cc2..1e8bb5c 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.h ++++ b/xlators/cluster/afr/src/afr-inode-write.h +@@ -87,4 +87,8 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t + afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); ++ ++int ++afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, ++ dict_t *xdata); + #endif /* __INODE_WRITE_H__ */ +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index 375bd1f..fc56486 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -562,7 +562,6 @@ struct xlator_fops fops = { + .lk = afr_lk, + .flush = afr_flush, + .statfs = afr_statfs, +- .fsync = afr_fsync, + .fsyncdir = afr_fsyncdir, + .xattrop = afr_xattrop, + .fxattrop = afr_fxattrop, +@@ -594,6 +593,7 @@ struct xlator_fops fops = { + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, ++ .fsync = afr_fsync, + + /* dir read */ + .opendir = afr_opendir, +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index f6a1a6a..304efa1 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -702,6 +702,10 @@ typedef struct _afr_local { + gf_seek_what_t what; + } seek; + ++ struct { ++ int32_t datasync; ++ } fsync; ++ + } cont; + + struct { +-- +1.8.3.1 + diff --git a/SOURCES/0670-cluster-afr-Remove-compound-fops-usage-in-afr.patch b/SOURCES/0670-cluster-afr-Remove-compound-fops-usage-in-afr.patch new file mode 100644 index 0000000..bd73b66 --- /dev/null +++ b/SOURCES/0670-cluster-afr-Remove-compound-fops-usage-in-afr.patch @@ -0,0 +1,640 @@ +From b90d513a6e9685fd660f4997c035873a490ff577 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Fri, 2 Mar 2018 10:13:20 +0530 +Subject: [PATCH 670/675] cluster/afr: Remove compound-fops usage in afr + +We are not seeing 
much improvement with this change. So removing the +feature so that it doesn't need to be maintained anymore. + + > Fixes: #414 +Upstream-patch: https://review.gluster.org/19655 + +BUG: 1583733 +Change-Id: I438b37a070f4b36ab4582f99c4b5d9fa37f29099 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/140575 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/basic/afr/compounded-write-txns.t | 37 ---- + xlators/cluster/afr/src/afr-common.c | 43 ---- + xlators/cluster/afr/src/afr-transaction.c | 340 +----------------------------- + xlators/cluster/afr/src/afr-transaction.h | 4 +- + xlators/cluster/afr/src/afr.c | 10 +- + xlators/cluster/afr/src/afr.h | 14 -- + 6 files changed, 7 insertions(+), 441 deletions(-) + delete mode 100644 tests/basic/afr/compounded-write-txns.t + +diff --git a/tests/basic/afr/compounded-write-txns.t b/tests/basic/afr/compounded-write-txns.t +deleted file mode 100644 +index 7cecd87..0000000 +--- a/tests/basic/afr/compounded-write-txns.t ++++ /dev/null +@@ -1,37 +0,0 @@ +-#!/bin/bash +-. $(dirname $0)/../../include.rc +-. $(dirname $0)/../../volume.rc +- +-cleanup +- +-TEST glusterd +-TEST pidof glusterd +-TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +-TEST $CLI volume set $V0 write-behind off +-TEST $CLI volume set $V0 client-io-threads off +-TEST $CLI volume start $V0 +-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 +- +-# Create and generate data into a src file +- +-TEST `printf %1024s |tr " " "1" > /tmp/source` +-TEST `printf %1024s |tr " " "2" >> /tmp/source` +- +-TEST dd if=/tmp/source of=$M0/file bs=1024 count=2 2>/dev/null +-md5sum_file=$(md5sum $M0/file | awk '{print $1}') +- +-TEST $CLI volume set $V0 cluster.use-compound-fops on +- +-TEST dd if=$M0/file of=$M0/file-copy bs=1024 count=2 2>/dev/null +- +-EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 +- +-EXPECT "$md5sum_file" echo `md5sum $M0/file-copy | awk '{print $1}'` +- +-EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +-TEST $CLI volume stop $V0 +-TEST $CLI volume delete $V0 +- +-TEST rm -f /tmp/source +-cleanup +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index cfd3d60..bffa71b 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -43,7 +43,6 @@ + #include "afr-self-heal.h" + #include "afr-self-heald.h" + #include "afr-messages.h" +-#include "compound-fop-utils.h" + + int32_t + afr_quorum_errno (afr_private_t *priv) +@@ -4747,7 +4746,6 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) + + local->need_full_crawl = _gf_false; + +- local->compound = _gf_false; + INIT_LIST_HEAD (&local->healer); + return 0; + out: +@@ -4894,7 +4892,6 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) + if (!local->pending) + goto out; + +- local->compound = _gf_false; + INIT_LIST_HEAD (&local->transaction.eager_locked); + + ret = 0; +@@ -5680,46 +5677,6 @@ afr_get_msg_id (char *op_type) + return -1; + } + +-gf_boolean_t +-afr_can_compound_pre_op_and_op (afr_private_t *priv, glusterfs_fop_t fop) +-{ +- if (priv->arbiter_count != 0) +- return _gf_false; +- +- if (!priv->use_compound_fops) +- return _gf_false; +- +- switch (fop) { +- case GF_FOP_WRITE: +- return _gf_true; +- default: +- return _gf_false; +- } +-} +- +-afr_compound_cbk_t +-afr_pack_fop_args (call_frame_t *frame, compound_args_t *args, +- glusterfs_fop_t fop, int index) +-{ 
+- afr_local_t *local = frame->local; +- +- switch (fop) { +- case GF_FOP_WRITE: +- COMPOUND_PACK_ARGS (writev, GF_FOP_WRITE, +- args, index, +- local->fd, local->cont.writev.vector, +- local->cont.writev.count, +- local->cont.writev.offset, +- local->cont.writev.flags, +- local->cont.writev.iobref, +- local->xdata_req); +- return afr_pre_op_writev_cbk; +- default: +- break; +- } +- return NULL; +-} +- + int + afr_fav_child_reset_sink_xattrs_cbk (int ret, call_frame_t *heal_frame, + void *opaque) +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index 644ebe2..6672816 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -17,7 +17,6 @@ + #include "afr-transaction.h" + #include "afr-self-heal.h" + #include "afr-messages.h" +-#include "compound-fop-utils.h" + + #include + +@@ -38,10 +37,6 @@ afr_changelog_call_count (afr_transaction_type type, + unsigned char *failed_subvols, + unsigned int child_count); + int +-afr_post_op_unlock_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, +- afr_changelog_resume_t changelog_resume, +- afr_xattrop_type_t op); +-int + afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op); +@@ -833,12 +828,10 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) + afr_private_t *priv = this->private; + afr_local_t *local = NULL; + dict_t *xattr = NULL; +- afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + int ret = 0; + int idx = 0; + int nothing_failed = 1; +- gf_boolean_t compounded_unlock = _gf_true; + gf_boolean_t need_undirty = _gf_false; + + afr_handle_quorum (frame); +@@ -904,36 +897,8 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) + goto out; + } + +- if (local->compound && local->fd) { +- LOCK (&local->fd->lock); +- { +- fd_ctx = __afr_fd_ctx_get (local->fd, this); +- for (i = 0; i < priv->child_count; i++) { +- if (local->transaction.pre_op[i] && +- local->transaction.eager_lock[i]) { +- if (fd_ctx->lock_piggyback[i]) +- compounded_unlock = _gf_false; +- else if (fd_ctx->lock_acquired[i]) +- compounded_unlock = _gf_false; +- } +- if (compounded_unlock == _gf_false) +- break; +- } +- } +- UNLOCK (&local->fd->lock); +- } +- +- /* Do not compound if any brick got piggybacked lock as +- * unlock should not be done for that. 
*/ +- if (local->compound && compounded_unlock) { +- afr_post_op_unlock_do (frame, this, xattr, +- afr_changelog_post_op_done, +- AFR_TRANSACTION_POST_OP); +- } else { +- afr_changelog_do (frame, this, xattr, +- afr_changelog_post_op_done, +- AFR_TRANSACTION_POST_OP); +- } ++ afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done, ++ AFR_TRANSACTION_POST_OP); + out: + if (xattr) + dict_unref (xattr); +@@ -1267,68 +1232,6 @@ out: + } + + int +-afr_pre_op_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int op_ret, int op_errno, +- void *data, dict_t *xdata) +-{ +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- call_frame_t *fop_frame = NULL; +- default_args_cbk_t *write_args_cbk = NULL; +- compound_args_cbk_t *args_cbk = data; +- int call_count = -1; +- int child_index = -1; +- +- local = frame->local; +- priv = this->private; +- child_index = (long) cookie; +- +- if (local->pre_op_compat) +- afr_changelog_pre_op_update (frame, this); +- +- if (op_ret == -1) { +- local->op_errno = op_errno; +- afr_transaction_fop_failed (frame, this, child_index); +- } +- +- /* If the compound fop failed due to saved_frame_unwind(), then +- * protocol/client fails it even before args_cbk is allocated. +- * Handle that case by passing the op_ret, op_errno values explicitly. +- */ +- if ((op_ret == -1) && (args_cbk == NULL)) { +- afr_inode_write_fill (frame, this, child_index, op_ret, +- op_errno, NULL, NULL, NULL); +- } else { +- write_args_cbk = &args_cbk->rsp_list[1]; +- afr_inode_write_fill (frame, this, child_index, +- write_args_cbk->op_ret, +- write_args_cbk->op_errno, +- &write_args_cbk->prestat, +- &write_args_cbk->poststat, +- write_args_cbk->xdata); +- } +- +- call_count = afr_frame_return (frame); +- +- if (call_count == 0) { +- compound_args_cleanup (local->c_args); +- local->c_args = NULL; +- afr_process_post_writev (frame, this); +- if (!afr_txn_nothing_failed (frame, this)) { +- /* Don't unwind until post-op is complete */ +- local->transaction.resume (frame, this); +- } else { +- /* frame change, place frame in post-op delay and unwind */ +- fop_frame = afr_transaction_detach_fop_frame (frame); +- afr_writev_copy_outvars (frame, fop_frame); +- local->transaction.resume (frame, this); +- afr_writev_unwind (fop_frame, this); +- } +- } +- return 0; +-} +- +-int + afr_changelog_prepare (xlator_t *this, call_frame_t *frame, int *call_count, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op, dict_t **xdata, +@@ -1358,228 +1261,6 @@ afr_changelog_prepare (xlator_t *this, call_frame_t *frame, int *call_count, + } + + int +-afr_pre_op_fop_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, +- afr_changelog_resume_t changelog_resume, +- afr_xattrop_type_t op) +-{ +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- dict_t *xdata = NULL; +- dict_t *newloc_xdata = NULL; +- compound_args_t *args = NULL; +- int i = 0, call_count = 0; +- afr_compound_cbk_t compound_cbk; +- int ret = 0; +- int op_errno = ENOMEM; +- +- local = frame->local; +- priv = this->private; +- +- /* If lock failed on all, just unlock and unwind */ +- ret = afr_changelog_prepare (this, frame, &call_count, changelog_resume, +- op, &xdata, &newloc_xdata); +- +- if (ret) +- return 0; +- +- local->call_count = call_count; +- +- afr_save_lk_owner (frame); +- frame->root->lk_owner = +- local->transaction.main_frame->root->lk_owner; +- +- args = compound_fop_alloc (2, GF_CFOP_XATTROP_WRITEV, NULL); +- +- if (!args) +- goto err; +- +- /* pack pre-op part */ +- i = 0; +- 
COMPOUND_PACK_ARGS (fxattrop, GF_FOP_FXATTROP, +- args, i, +- local->fd, GF_XATTROP_ADD_ARRAY, +- xattr, xdata); +- i++; +- /* pack whatever fop needs to be packed +- * @compound_cbk holds the cbk that would need to be called +- */ +- compound_cbk = afr_pack_fop_args (frame, args, local->op, i); +- +- local->c_args = args; +- +- for (i = 0; i < priv->child_count; i++) { +- /* Means lock did not succeed on this brick */ +- if (!local->transaction.pre_op[i] || +- local->transaction.failed_subvols[i]) +- continue; +- +- STACK_WIND_COOKIE (frame, compound_cbk, +- (void *) (long) i, +- priv->children[i], +- priv->children[i]->fops->compound, +- args, +- NULL); +- if (!--call_count) +- break; +- } +- +- if (xdata) +- dict_unref (xdata); +- if (newloc_xdata) +- dict_unref (newloc_xdata); +- return 0; +-err: +- local->internal_lock.lock_cbk = local->transaction.done; +- local->op_ret = -1; +- local->op_errno = op_errno; +- +- afr_restore_lk_owner (frame); +- afr_unlock (frame, this); +- +- if (xdata) +- dict_unref (xdata); +- if (newloc_xdata) +- dict_unref (newloc_xdata); +- return 0; +-} +- +-int +-afr_post_op_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int op_ret, int op_errno, +- void *data, dict_t *xdata) +-{ +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- compound_args_cbk_t *args_cbk = data; +- int call_count = -1; +- afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; +- int32_t child_index = (long)cookie; +- int i = 0; +- +- local = frame->local; +- priv = this->private; +- child_index = (long) cookie; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- afr_update_uninodelk (local, int_lock, child_index); +- +- LOCK (&frame->lock); +- { +- call_count = --int_lock->lk_call_count; +- } +- UNLOCK (&frame->lock); +- +- if (call_count == 0) { +- compound_args_cleanup (local->c_args); +- local->c_args = NULL; +- if (local->transaction.resume_stub) { +- call_resume (local->transaction.resume_stub); +- local->transaction.resume_stub = NULL; +- } +- gf_msg_trace (this->name, 0, +- "All internal locks unlocked"); +- int_lock->lock_cbk (frame, this); +- } +- +- return 0; +-} +- +-int +-afr_post_op_unlock_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, +- afr_changelog_resume_t changelog_resume, +- afr_xattrop_type_t op) +-{ +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- dict_t *xdata = NULL; +- dict_t *newloc_xdata = NULL; +- compound_args_t *args = NULL; +- afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; +- int i = 0; +- int call_count = 0; +- struct gf_flock flock = {0,}; +- int ret = 0; +- +- local = frame->local; +- priv = this->private; +- int_lock = &local->internal_lock; +- +- if (afr_is_inodelk_transaction(local)) { +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- +- flock.l_start = inodelk->flock.l_start; +- flock.l_len = inodelk->flock.l_len; +- flock.l_type = F_UNLCK; +- } +- +- ret = afr_changelog_prepare (this, frame, &call_count, changelog_resume, +- op, &xdata, &newloc_xdata); +- +- if (ret) +- return 0; +- +- int_lock->lk_call_count = call_count; +- +- int_lock->lock_cbk = local->transaction.done; +- +- args = compound_fop_alloc (2, GF_CFOP_XATTROP_UNLOCK, NULL); +- +- if (!args) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- afr_changelog_post_op_done (frame, this); +- goto out; +- } +- +- i = 0; +- COMPOUND_PACK_ARGS (fxattrop, GF_FOP_FXATTROP, +- args, i, +- local->fd, GF_XATTROP_ADD_ARRAY, +- xattr, xdata); +- i++; +- +- if 
(afr_is_inodelk_transaction(local)) { +- if (local->fd) { +- COMPOUND_PACK_ARGS (finodelk, GF_FOP_FINODELK, +- args, i, +- int_lock->domain, local->fd, +- F_SETLK, &flock, NULL); +- } else { +- COMPOUND_PACK_ARGS (inodelk, GF_FOP_INODELK, +- args, i, +- int_lock->domain, &local->loc, +- F_SETLK, &flock, NULL); +- } +- } +- +- local->c_args = args; +- +- for (i = 0; i < priv->child_count; i++) { +- if (!local->transaction.pre_op[i] || +- local->transaction.failed_subvols[i]) +- continue; +- STACK_WIND_COOKIE (frame, afr_post_op_unlock_cbk, +- (void *) (long) i, +- priv->children[i], +- priv->children[i]->fops->compound, +- args, +- NULL); +- if (!--call_count) +- break; +- } +-out: +- if (xdata) +- dict_unref (xdata); +- if (newloc_xdata) +- dict_unref (newloc_xdata); +- return 0; +-} +- +-int + afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op) +@@ -1783,21 +1464,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) + goto next; + } + +- /* Till here we have already decided if pre-op needs to be done, +- * based on various criteria. The only thing that needs to be checked +- * now on is whether compound-fops is enabled or not. +- * If it is, then perform pre-op and fop together for writev op. +- */ +- if (afr_can_compound_pre_op_and_op (priv, local->op)) { +- local->compound = _gf_true; +- afr_pre_op_fop_do (frame, this, xdata_req, +- afr_transaction_perform_fop, +- AFR_TRANSACTION_PRE_OP); +- } else { +- afr_changelog_do (frame, this, xdata_req, +- afr_transaction_perform_fop, +- AFR_TRANSACTION_PRE_OP); +- } ++ afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop, ++ AFR_TRANSACTION_PRE_OP); + + if (xdata_req) + dict_unref (xdata_req); +diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h +index dd19e5b..d01e144 100644 +--- a/xlators/cluster/afr/src/afr-transaction.h ++++ b/xlators/cluster/afr/src/afr-transaction.h +@@ -58,7 +58,5 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv, + inode_t *inode1, unsigned char *readable1, + inode_t *inode2, unsigned char *readable2); + int +-afr_pre_op_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int op_ret, int op_errno, +- void *data, dict_t *xdata); ++afr_transaction_resume (call_frame_t *frame, xlator_t *this); + #endif /* __TRANSACTION_H__ */ +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index fc56486..c0e9d9c 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -216,9 +216,6 @@ reconfigure (xlator_t *this, dict_t *options) + out); + GF_OPTION_RECONF ("locking-scheme", priv->locking_scheme, options, str, + out); +- GF_OPTION_RECONF ("use-compound-fops", priv->use_compound_fops, +- options, bool, +- out); + GF_OPTION_RECONF ("granular-entry-heal", priv->esh_granular, options, + bool, out); + +@@ -422,8 +419,6 @@ init (xlator_t *this) + + GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out); + GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out); +- GF_OPTION_INIT ("use-compound-fops", priv->use_compound_fops, +- bool, out); + GF_OPTION_INIT ("granular-entry-heal", priv->esh_granular, bool, out); + + GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); +@@ -949,9 +944,8 @@ struct volume_options options[] = { + { .key = {"use-compound-fops"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", +- .description = "Use compound fops framework to modify afr " +- 
"transaction such that network roundtrips are " +- "reduced, thus improving the performance.", ++ .description = "this option exists only for backward compatibility " ++ "and configuring it doesn't have any effect" + }, + { .key = {NULL} }, + }; +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index 304efa1..58f881e 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -44,11 +44,6 @@ typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int + + typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); + +-typedef int (*afr_compound_cbk_t) (call_frame_t *frame, void *cookie, +- xlator_t *this, int op_ret, int op_errno, +- void *data, dict_t *xdata); +- +- + #define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;}) + #define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;}) + #define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];}) +@@ -169,7 +164,6 @@ typedef struct _afr_private { + gf_boolean_t use_afr_in_pump; + char *locking_scheme; + gf_boolean_t esh_granular; +- gf_boolean_t use_compound_fops; + } afr_private_t; + + +@@ -820,9 +814,7 @@ typedef struct _afr_local { + call_frame_t *heal_frame; + + gf_boolean_t need_full_crawl; +- gf_boolean_t compound; + afr_fop_lock_state_t fop_lock_state; +- compound_args_t *c_args; + + gf_boolean_t is_read_txn; + } afr_local_t; +@@ -1213,12 +1205,6 @@ afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame); + void + afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock, + int32_t child_index); +-gf_boolean_t +-afr_can_compound_pre_op_and_op (afr_private_t *priv, glusterfs_fop_t fop); +- +-afr_compound_cbk_t +-afr_pack_fop_args (call_frame_t *frame, compound_args_t *args, +- glusterfs_fop_t fop, int index); + int + afr_is_inodelk_transaction(afr_local_t *local); + +-- +1.8.3.1 + diff --git a/SOURCES/0671-cluster-afr-Remove-unused-code-paths.patch b/SOURCES/0671-cluster-afr-Remove-unused-code-paths.patch new file mode 100644 index 0000000..70b2a3c --- /dev/null +++ b/SOURCES/0671-cluster-afr-Remove-unused-code-paths.patch @@ -0,0 +1,1522 @@ +From 07d47befe41ed54c0340e297e7f1ce9bf87bc3e6 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Fri, 2 Mar 2018 12:37:42 +0530 +Subject: [PATCH 671/675] cluster/afr: Remove unused code paths + +Removed +1) afr-v1 self-heal locks related code which is not used anymore +2) transaction has some data types that are not needed, so removed them +3) Never used lock tracing available in afr as gluster's network tracing does +the job. So removed that as well. +4) Changelog is always enabled and afr is always used with locks, so +__changelog_enabled, afr_lock_server_count etc functions can be deleted. +5) transaction.fop/done/resume always call the same functions, so no need +to have these variables. 
+ + > BUG: 1549606 + +Change-Id: Ie33a3c8f9ddbe6741f0950d961af3d4daf497a8a +Upstream-patch: https://review.gluster.org/19661 +BUG: 1583733 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/140576 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-common.c | 10 +- + xlators/cluster/afr/src/afr-dir-write.c | 18 +- + xlators/cluster/afr/src/afr-inode-write.c | 39 +-- + xlators/cluster/afr/src/afr-lk-common.c | 508 +----------------------------- + xlators/cluster/afr/src/afr-transaction.c | 137 +------- + xlators/cluster/afr/src/afr-transaction.h | 2 - + xlators/cluster/afr/src/afr.c | 35 +- + xlators/cluster/afr/src/afr.h | 52 +-- + xlators/cluster/afr/src/pump.c | 3 - + 9 files changed, 41 insertions(+), 763 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index bffa71b..708182a 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -4294,9 +4294,6 @@ afr_priv_dump (xlator_t *this) + gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal); + gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); + gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); +- gf_proc_dump_write("data_change_log", "%d", priv->data_change_log); +- gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log); +- gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log); + gf_proc_dump_write("read_child", "%d", priv->read_child); + gf_proc_dump_write("favorite_child", "%d", priv->favorite_child); + gf_proc_dump_write("wait_count", "%u", priv->wait_count); +@@ -4753,8 +4750,7 @@ out: + } + + int +-afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, +- transaction_lk_type_t lk_type) ++afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count) + { + int ret = -ENOMEM; + +@@ -4770,7 +4766,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, + + lk->lock_op_ret = -1; + lk->lock_op_errno = EUCLEAN; +- lk->transaction_lk_type = lk_type; + + ret = 0; + out: +@@ -4837,8 +4832,7 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) + afr_private_t *priv = NULL; + + priv = this->private; +- ret = afr_internal_lock_init (&local->internal_lock, priv->child_count, +- AFR_TRANSACTION_LK); ++ ret = afr_internal_lock_init (&local->internal_lock, priv->child_count); + if (ret < 0) + goto out; + +diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c +index 9099b8c..408c7de 100644 +--- a/xlators/cluster/afr/src/afr-dir-write.c ++++ b/xlators/cluster/afr/src/afr-dir-write.c +@@ -267,7 +267,7 @@ __afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + + afr_mark_entry_pending_changelog (frame, this); + +- local->transaction.resume (frame, this); ++ afr_transaction_resume (frame, this); + } + + return 0; +@@ -496,8 +496,6 @@ afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + goto out; + + local->transaction.wind = afr_create_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_create_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +@@ -626,8 +624,6 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + goto out; + + local->transaction.wind = afr_mknod_wind; +- local->transaction.fop = 
__afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_mknod_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +@@ -762,8 +758,6 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + + local->op = GF_FOP_MKDIR; + local->transaction.wind = afr_mkdir_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_mkdir_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +@@ -891,8 +885,6 @@ afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + local->op = GF_FOP_LINK; + + local->transaction.wind = afr_link_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_link_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, +@@ -1021,8 +1013,6 @@ afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + + local->op = GF_FOP_SYMLINK; + local->transaction.wind = afr_symlink_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_symlink_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +@@ -1156,8 +1146,6 @@ afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + + local->op = GF_FOP_RENAME; + local->transaction.wind = afr_rename_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_rename_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, +@@ -1308,8 +1296,6 @@ afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + + local->op = GF_FOP_UNLINK; + local->transaction.wind = afr_unlink_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_unlink_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +@@ -1436,8 +1422,6 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + + local->op = GF_FOP_RMDIR; + local->transaction.wind = afr_rmdir_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_rmdir_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c +index c740599..e0f6541 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.c ++++ b/xlators/cluster/afr/src/afr-inode-write.c +@@ -207,7 +207,7 @@ __afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local->transaction.unwind (frame, this); + } + +- local->transaction.resume (frame, this); ++ afr_transaction_resume (frame, this); + } + + return 0; +@@ -357,14 +357,11 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) + { +- afr_local_t *local = NULL; + call_frame_t *fop_frame = NULL; + int child_index = (long) cookie; + int call_count = -1; + int ret = 0; + +- local = frame->local; +- + afr_inode_write_fill (frame, this, child_index, op_ret, op_errno, + prebuf, postbuf, xdata); + +@@ -375,7 +372,7 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t 
*this, + + if (!afr_txn_nothing_failed (frame, this)) { + //Don't unwind until post-op is complete +- local->transaction.resume (frame, this); ++ afr_transaction_resume (frame, this); + } else { + /* + * Generally inode-write fops do transaction.unwind then +@@ -390,7 +387,7 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + + fop_frame = afr_transaction_detach_fop_frame (frame); + afr_writev_copy_outvars (frame, fop_frame); +- local->transaction.resume (frame, this); ++ afr_transaction_resume (frame, this); + afr_writev_unwind (fop_frame, this); + } + } +@@ -463,8 +460,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) + local->op = GF_FOP_WRITE; + + local->transaction.wind = afr_writev_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_transaction_writev_unwind; + + local->transaction.main_frame = frame; +@@ -650,8 +645,6 @@ afr_truncate (call_frame_t *frame, xlator_t *this, + goto out; + + local->transaction.wind = afr_truncate_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_truncate_unwind; + + loc_copy (&local->loc, loc); +@@ -774,8 +767,6 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + local->op = GF_FOP_FTRUNCATE; + + local->transaction.wind = afr_ftruncate_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_ftruncate_unwind; + + local->transaction.main_frame = frame; +@@ -882,8 +873,6 @@ afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + goto out; + + local->transaction.wind = afr_setattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_setattr_unwind; + + loc_copy (&local->loc, loc); +@@ -987,8 +976,6 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, + goto out; + + local->transaction.wind = afr_fsetattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fsetattr_unwind; + + local->fd = fd_ref (fd); +@@ -1629,8 +1616,6 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + goto out; + + local->transaction.wind = afr_setxattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_setxattr_unwind; + + loc_copy (&local->loc, loc); +@@ -1741,8 +1726,6 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, + goto out; + + local->transaction.wind = afr_fsetxattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fsetxattr_unwind; + + local->fd = fd_ref (fd); +@@ -1854,8 +1837,6 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, + goto out; + + local->transaction.wind = afr_removexattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_removexattr_unwind; + + loc_copy (&local->loc, loc); +@@ -1961,8 +1942,6 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + goto out; + + local->transaction.wind = afr_fremovexattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + 
local->transaction.unwind = afr_fremovexattr_unwind; + + local->fd = fd_ref (fd); +@@ -2074,8 +2053,6 @@ afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + local->op = GF_FOP_FALLOCATE; + + local->transaction.wind = afr_fallocate_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fallocate_unwind; + + local->transaction.main_frame = frame; +@@ -2186,8 +2163,6 @@ afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + local->op = GF_FOP_DISCARD; + + local->transaction.wind = afr_discard_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_discard_unwind; + + local->transaction.main_frame = frame; +@@ -2295,8 +2270,6 @@ afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + local->op = GF_FOP_ZEROFILL; + + local->transaction.wind = afr_zerofill_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_zerofill_unwind; + + local->transaction.main_frame = frame; +@@ -2389,8 +2362,6 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + local->xdata_req = dict_ref (xdata); + + local->transaction.wind = afr_xattrop_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_xattrop_unwind; + + loc_copy (&local->loc, loc); +@@ -2483,8 +2454,6 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + local->xdata_req = dict_ref (xdata); + + local->transaction.wind = afr_fxattrop_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fxattrop_unwind; + + local->fd = fd_ref (fd); +@@ -2596,8 +2565,6 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + } + + local->transaction.wind = afr_fsync_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fsync_unwind; + + local->transaction.main_frame = frame; +diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c +index 6019454..38cc87b 100644 +--- a/xlators/cluster/afr/src/afr-lk-common.c ++++ b/xlators/cluster/afr/src/afr-lk-common.c +@@ -23,38 +23,6 @@ + #define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ + #define LOCKED_LOWER 0x2 /* for lower path */ + +-#define AFR_TRACE_INODELK_IN(frame, this, params ...) \ +- do { \ +- afr_private_t *_priv = this->private; \ +- if (!_priv->inodelk_trace) \ +- break; \ +- afr_trace_inodelk_in (frame, this, params); \ +- } while (0); +- +-#define AFR_TRACE_INODELK_OUT(frame, this, params ...) \ +- do { \ +- afr_private_t *_priv = this->private; \ +- if (!_priv->inodelk_trace) \ +- break; \ +- afr_trace_inodelk_out (frame, this, params); \ +- } while (0); +- +-#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \ +- do { \ +- afr_private_t *_priv = this->private; \ +- if (!_priv->entrylk_trace) \ +- break; \ +- afr_trace_entrylk_in (frame, this, params); \ +- } while (0); +- +-#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) 
\ +- do { \ +- afr_private_t *_priv = this->private; \ +- if (!_priv->entrylk_trace) \ +- break; \ +- afr_trace_entrylk_out (frame, this, params); \ +- } while (0); +- + int + afr_entry_lockee_cmp (const void *l1, const void *l2) + { +@@ -119,28 +87,6 @@ afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) + set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner); + } + +-static int +-is_afr_lock_selfheal (afr_local_t *local) +-{ +- afr_internal_lock_t *int_lock = NULL; +- int ret = -1; +- +- int_lock = &local->internal_lock; +- +- switch (int_lock->selfheal_lk_type) { +- case AFR_DATA_SELF_HEAL_LK: +- case AFR_METADATA_SELF_HEAL_LK: +- ret = 1; +- break; +- case AFR_ENTRY_SELF_HEAL_LK: +- ret = 0; +- break; +- } +- +- return ret; +- +-} +- + int32_t + internal_lock_count (call_frame_t *frame, xlator_t *this) + { +@@ -160,315 +106,12 @@ internal_lock_count (call_frame_t *frame, xlator_t *this) + return call_count; + } + +-static void +-afr_print_inodelk (char *str, int size, int cmd, +- struct gf_flock *flock, gf_lkowner_t *owner) +-{ +- char *cmd_str = NULL; +- char *type_str = NULL; +- +- switch (cmd) { +-#if F_GETLK != F_GETLK64 +- case F_GETLK64: +-#endif +- case F_GETLK: +- cmd_str = "GETLK"; +- break; +- +-#if F_SETLK != F_SETLK64 +- case F_SETLK64: +-#endif +- case F_SETLK: +- cmd_str = "SETLK"; +- break; +- +-#if F_SETLKW != F_SETLKW64 +- case F_SETLKW64: +-#endif +- case F_SETLKW: +- cmd_str = "SETLKW"; +- break; +- +- default: +- cmd_str = ""; +- break; +- } +- +- switch (flock->l_type) { +- case F_RDLCK: +- type_str = "READ"; +- break; +- case F_WRLCK: +- type_str = "WRITE"; +- break; +- case F_UNLCK: +- type_str = "UNLOCK"; +- break; +- default: +- type_str = "UNKNOWN"; +- break; +- } +- +- snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " +- "start=%llu, len=%llu, pid=%llu, lk-owner=%s", +- cmd_str, type_str, (unsigned long long) flock->l_start, +- (unsigned long long) flock->l_len, +- (unsigned long long) flock->l_pid, +- lkowner_utoa (owner)); +- +-} +- +-static void +-afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, +- int child_index) +-{ +- snprintf (str, size, "path=%s, fd=%p, child=%d", +- loc->path ? loc->path : "", +- fd ? fd : NULL, +- child_index); +-} +- +-void +-afr_print_entrylk (char *str, int size, const char *basename, +- gf_lkowner_t *owner) +-{ +- snprintf (str, size, "Basename=%s, lk-owner=%s", +- basename ? 
basename : "", +- lkowner_utoa (owner)); +-} +- +-static void +-afr_print_verdict (int op_ret, int op_errno, char *str) +-{ +- if (op_ret < 0) { +- if (op_errno == EAGAIN) +- strcpy (str, "EAGAIN"); +- else +- strcpy (str, "FAILED"); +- } +- else +- strcpy (str, "GRANTED"); +-} +- +-static void +-afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, +- char *lock_call_type_str, +- afr_internal_lock_t *int_lock) +-{ +- switch (lock_call_type) { +- case AFR_INODELK_TRANSACTION: +- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +- strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION"); +- else +- strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL"); +- break; +- case AFR_INODELK_NB_TRANSACTION: +- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +- strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION"); +- else +- strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL"); +- break; +- case AFR_ENTRYLK_TRANSACTION: +- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +- strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION"); +- else +- strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL"); +- break; +- case AFR_ENTRYLK_NB_TRANSACTION: +- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +- strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION"); +- else +- strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL"); +- break; +- default: +- strcpy (lock_call_type_str, "UNKNOWN"); +- break; +- } +- +-} +- +-static void +-afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this, +- afr_lock_call_type_t lock_call_type, +- afr_lock_op_type_t lk_op_type, struct gf_flock *flock, +- int op_ret, int op_errno, int32_t child_index) +-{ +- afr_internal_lock_t *int_lock = NULL; +- afr_local_t *local = NULL; +- +- char lockee[256]; +- char lock_call_type_str[256]; +- char verdict[16]; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); +- +- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); +- +- afr_print_verdict (op_ret, op_errno, verdict); +- +- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, +- "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}", +- lock_call_type_str, +- lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", +- verdict, lkowner_utoa (&frame->root->lk_owner), lockee, +- (unsigned long long) int_lock->lock_number); +- +-} +- +-static void +-afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this, +- afr_lock_call_type_t lock_call_type, +- afr_lock_op_type_t lk_op_type, struct gf_flock *flock, +- int32_t cmd, int32_t child_index) +-{ +- afr_local_t *local = NULL; +- afr_internal_lock_t *int_lock = NULL; +- +- char lock[256]; +- char lockee[256]; +- char lock_call_type_str[256]; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner); +- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); +- +- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); +- +- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, +- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", +- lock_call_type_str, +- lk_op_type == AFR_LOCK_OP ? 
"LOCK REQUEST" : "UNLOCK REQUEST", +- lock, lockee, +- (unsigned long long) int_lock->lock_number); +- +-} +- +-static void +-afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, +- afr_lock_call_type_t lock_call_type, +- afr_lock_op_type_t lk_op_type, const char *basename, +- int32_t cookie) +-{ +- afr_local_t *local = NULL; +- afr_internal_lock_t *int_lock = NULL; +- afr_private_t *priv = NULL; +- int child_index = 0; +- int lockee_no = 0; +- +- char lock[256]; +- char lockee[256]; +- char lock_call_type_str[256]; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- priv = this->private; +- +- if (!priv->entrylk_trace) { +- return; +- } +- lockee_no = cookie / priv->child_count; +- child_index = cookie % priv->child_count; +- +- afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); +- afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, +- child_index); +- +- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); +- +- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, +- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", +- lock_call_type_str, +- lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", +- lock, lockee, +- (unsigned long long) int_lock->lock_number, +- cookie); +-} +- +-static void +-afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, +- afr_lock_call_type_t lock_call_type, +- afr_lock_op_type_t lk_op_type, const char *basename, +- int op_ret, int op_errno, int32_t cookie) +-{ +- afr_internal_lock_t *int_lock = NULL; +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- int lockee_no = 0; +- int child_index = 0; +- +- char lock[256]; +- char lockee[256]; +- char lock_call_type_str[256]; +- char verdict[16]; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- priv = this->private; +- +- if (!priv->entrylk_trace) { +- return; +- } +- lockee_no = cookie / priv->child_count; +- child_index = cookie % priv->child_count; +- +- afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); +- afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, +- child_index); +- +- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); +- +- afr_print_verdict (op_ret, op_errno, verdict); +- +- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, +- "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", +- lock_call_type_str, +- lk_op_type == AFR_LOCK_OP ? 
"LOCK REPLY" : "UNLOCK REPLY", +- verdict, +- lock, lockee, +- (unsigned long long) int_lock->lock_number, +- cookie); +- +-} +- +-static int +-transaction_lk_op (afr_local_t *local) +-{ +- afr_internal_lock_t *int_lock = NULL; +- int ret = -1; +- +- int_lock = &local->internal_lock; +- +- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) { +- gf_msg_debug (THIS->name, 0, +- "lk op is for a transaction"); +- ret = 1; +- } +- else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) { +- gf_msg_debug (THIS->name, 0, +- "lk op is for a self heal"); +- +- ret = 0; +- } +- +- if (ret == -1) +- gf_msg_debug (THIS->name, 0, +- "lk op is not set"); +- +- return ret; +- +-} +- + int +-afr_is_inodelk_transaction(afr_local_t *local) ++afr_is_inodelk_transaction(afr_transaction_type type) + { + int ret = 0; + +- switch (local->transaction.type) { ++ switch (type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + ret = 1; +@@ -661,10 +304,6 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local = frame->local; + int_lock = &local->internal_lock; + +- AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, +- AFR_UNLOCK_OP, NULL, op_ret, +- op_errno, child_index); +- + priv = this->private; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { +@@ -761,11 +400,6 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) + + flock_use = &full_flock; + wind: +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_TRANSACTION, +- AFR_UNLOCK_OP, flock_use, F_SETLK, +- i); +- + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], +@@ -777,9 +411,6 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) + break; + + } else { +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_TRANSACTION, +- AFR_UNLOCK_OP, &flock, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, +@@ -813,11 +444,6 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local = frame->local; + int_lock = &local->internal_lock; + +- AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, +- AFR_UNLOCK_OP, +- int_lock->lockee[lockee_no].basename, op_ret, +- op_errno, (int) ((long)cookie)); +- + if (op_ret < 0) { + gf_msg (this->name, GF_LOG_ERROR, op_errno, + AFR_MSG_ENTRY_UNLOCK_FAIL, +@@ -863,10 +489,6 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) + lockee_no = i / copies; + index = i % copies; + if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { +- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, +- AFR_UNLOCK_OP, +- int_lock->lockee[lockee_no].basename, +- i); + + STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, + (void *) (long) i, +@@ -951,10 +573,6 @@ static int32_t + afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + { +- AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, +- AFR_LOCK_OP, NULL, op_ret, +- op_errno, (long) cookie); +- + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); + return 0; + +@@ -964,10 +582,6 @@ static int32_t + afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + { +- AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, +- AFR_LOCK_OP, NULL, op_ret, +- op_errno, (long)cookie); +- + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); + return 0; + } +@@ -1005,27 +619,11 @@ afr_copy_locked_nodes 
(call_frame_t *frame, xlator_t *this) + } + + static gf_boolean_t +-afr_is_entrylk (afr_internal_lock_t *int_lock, +- afr_transaction_type trans_type) ++afr_is_entrylk (afr_transaction_type trans_type) + { +- gf_boolean_t is_entrylk = _gf_false; +- +- if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && +- int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { +- +- is_entrylk = _gf_true; +- +- } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) && +- (trans_type == AFR_ENTRY_TRANSACTION || +- trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { +- +- is_entrylk = _gf_true; +- +- } else { +- is_entrylk = _gf_false; +- } +- +- return is_entrylk; ++ if (afr_is_inodelk_transaction (trans_type)) ++ return _gf_false; ++ return _gf_true; + } + + static gf_boolean_t +@@ -1080,7 +678,7 @@ is_blocking_locks_count_sufficient (call_frame_t *frame, xlator_t *this) + priv = this->private; + int_lock = &local->internal_lock; + lockee_count = int_lock->lockee_count; +- is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); ++ is_entrylk = afr_is_entrylk (local->transaction.type); + + if (!is_entrylk) { + if (int_lock->lock_count == 0) { +@@ -1138,7 +736,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + priv = this->private; + child_index = cookie % priv->child_count; + lockee_no = cookie / priv->child_count; +- is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); ++ is_entrylk = afr_is_entrylk (local->transaction.type); + + + if (!is_entrylk) { +@@ -1205,10 +803,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + case AFR_METADATA_TRANSACTION: + + if (local->fd) { +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_TRANSACTION, +- AFR_LOCK_OP, &flock, F_SETLKW, +- child_index); + + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, +@@ -1218,10 +812,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + F_SETLKW, &flock, NULL); + + } else { +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_TRANSACTION, +- AFR_LOCK_OP, &flock, F_SETLKW, +- child_index); + + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, +@@ -1239,10 +829,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + *and 'fd-less' children */ + + if (local->fd) { +- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, +- AFR_LOCK_OP, +- int_lock->lockee[lockee_no].basename, +- cookie); + + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, +@@ -1252,10 +838,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } else { +- AFR_TRACE_ENTRYLK_IN (frame, this, +- AFR_ENTRYLK_TRANSACTION, +- AFR_LOCK_OP, local->transaction.basename, +- child_index); + + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, +@@ -1328,10 +910,6 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local = frame->local; + int_lock = &local->internal_lock; + +- AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, +- AFR_LOCK_OP, +- int_lock->lockee[lockee_no].basename, op_ret, +- op_errno, (long) cookie); + + LOCK (&frame->lock); + { +@@ -1441,10 +1019,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { +- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, +- AFR_LOCK_OP, +- 
int_lock->lockee[lockee_no].basename, +- i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, + (void *) (long) i, +@@ -1467,10 +1041,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { +- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, +- AFR_LOCK_OP, +- int_lock->lockee[lockee_no].basename, +- i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, + (void *) (long) i, +@@ -1506,10 +1076,6 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + +- AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, +- AFR_LOCK_OP, NULL, op_ret, +- op_errno, (long) cookie); +- + if (local->fd) + fd_ctx = afr_fd_ctx_get (local->fd, this); + +@@ -1671,9 +1237,6 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) + } + flock_use = &full_flock; + wind: +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_NB_TRANSACTION, +- AFR_LOCK_OP, flock_use, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, +@@ -1693,9 +1256,6 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_NB_TRANSACTION, +- AFR_LOCK_OP, &flock, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, +@@ -1719,54 +1279,10 @@ afr_unlock (call_frame_t *frame, xlator_t *this) + + local = frame->local; + +- if (transaction_lk_op (local)) { +- if (afr_is_inodelk_transaction(local)) +- afr_unlock_inodelk (frame, this); +- else +- afr_unlock_entrylk (frame, this); +- +- } else { +- if (is_afr_lock_selfheal (local)) +- afr_unlock_inodelk (frame, this); +- else +- afr_unlock_entrylk (frame, this); +- } ++ if (afr_is_inodelk_transaction(local->transaction.type)) ++ afr_unlock_inodelk (frame, this); ++ else ++ afr_unlock_entrylk (frame, this); + + return 0; + } +- +-int +-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, +- unsigned int child_count) +-{ +- afr_local_t *dst_local = NULL; +- afr_local_t *src_local = NULL; +- afr_internal_lock_t *dst_lock = NULL; +- afr_internal_lock_t *src_lock = NULL; +- afr_inodelk_t *dst_inodelk = NULL; +- afr_inodelk_t *src_inodelk = NULL; +- int ret = -1; +- +- src_local = src->local; +- src_lock = &src_local->internal_lock; +- src_inodelk = afr_get_inodelk (src_lock, dom); +- dst_local = dst->local; +- dst_lock = &dst_local->internal_lock; +- dst_inodelk = afr_get_inodelk (dst_lock, dom); +- if (!dst_inodelk || !src_inodelk) +- goto out; +- if (src_inodelk->locked_nodes) { +- memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, +- sizeof (*dst_inodelk->locked_nodes) * child_count); +- memset (src_inodelk->locked_nodes, 0, +- sizeof (*src_inodelk->locked_nodes) * child_count); +- } +- +- dst_lock->transaction_lk_type = src_lock->transaction_lk_type; +- dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; +- dst_inodelk->lock_count = src_inodelk->lock_count; +- src_inodelk->lock_count = 0; +- ret = 0; +-out: +- return ret; +-} +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index 6672816..46a65a7 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -133,7 +133,7 @@ 
afr_needs_changelog_update (afr_local_t *local) + } + + int +-__afr_txn_write_fop (call_frame_t *frame, xlator_t *this) ++afr_transaction_fop (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +@@ -150,7 +150,7 @@ __afr_txn_write_fop (call_frame_t *frame, xlator_t *this) + priv->child_count); + + if (call_count == 0) { +- local->transaction.resume (frame, this); ++ afr_transaction_resume (frame, this); + return 0; + } + +@@ -170,7 +170,7 @@ __afr_txn_write_fop (call_frame_t *frame, xlator_t *this) + + + int +-__afr_txn_write_done (call_frame_t *frame, xlator_t *this) ++afr_transaction_done (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +@@ -351,13 +351,13 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this) + /* If arbiter is the only source, do not proceed. */ + if (pre_op_sources_count < 2 && + local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { +- local->internal_lock.lock_cbk = local->transaction.done; ++ local->internal_lock.lock_cbk = afr_transaction_done; + local->op_ret = -1; + local->op_errno = ENOTCONN; + afr_restore_lk_owner (frame); + afr_unlock (frame, this); + } else { +- local->transaction.fop (frame, this); ++ afr_transaction_fop (frame, this); + } + + return; +@@ -400,75 +400,12 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) + if (priv->arbiter_count == 1) { + afr_txn_arbitrate_fop (frame, this); + } else { +- local->transaction.fop (frame, this); ++ afr_transaction_fop (frame, this); + } + + return 0; + } + +-static int +-__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +-{ +- int ret = 0; +- +- switch (type) { +- case AFR_DATA_TRANSACTION: +- if (priv->data_change_log) +- ret = 1; +- +- break; +- +- case AFR_METADATA_TRANSACTION: +- if (priv->metadata_change_log) +- ret = 1; +- +- break; +- +- case AFR_ENTRY_TRANSACTION: +- case AFR_ENTRY_RENAME_TRANSACTION: +- if (priv->entry_change_log) +- ret = 1; +- +- break; +- } +- +- return ret; +-} +- +- +-static int +-__fop_changelog_needed (call_frame_t *frame, xlator_t *this) +-{ +- afr_private_t * priv = NULL; +- afr_local_t * local = NULL; +- int op_ret = 0; +- afr_transaction_type type = -1; +- +- priv = this->private; +- local = frame->local; +- type = local->transaction.type; +- +- if (__changelog_enabled (priv, type)) { +- switch (local->op) { +- +- case GF_FOP_WRITE: +- case GF_FOP_FTRUNCATE: +- op_ret = 1; +- break; +- +- case GF_FOP_FLUSH: +- op_ret = 0; +- break; +- +- default: +- op_ret = 1; +- } +- } +- +- return op_ret; +-} +- +- + int + afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending) + { +@@ -489,29 +426,6 @@ afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending) + return ret; + } + +-int +-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +-{ +- int ret = 0; +- +- switch (type) { +- case AFR_DATA_TRANSACTION: +- ret = priv->child_count; +- break; +- +- case AFR_METADATA_TRANSACTION: +- ret = priv->child_count; +- break; +- +- case AFR_ENTRY_TRANSACTION: +- case AFR_ENTRY_RENAME_TRANSACTION: +- ret = priv->child_count; +- break; +- } +- +- return ret; +-} +- + /* {{{ pending */ + + +@@ -519,11 +433,9 @@ int + afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; +- afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + + local = frame->local; +- priv = this->private; + int_lock = &local->internal_lock; + + if 
(local->transaction.resume_stub) { +@@ -531,12 +443,8 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) + local->transaction.resume_stub = NULL; + } + +- if (afr_lock_server_count (priv, local->transaction.type) == 0) { +- local->transaction.done (frame, this); +- } else { +- int_lock->lock_cbk = local->transaction.done; +- afr_unlock (frame, this); +- } ++ int_lock->lock_cbk = afr_transaction_done; ++ afr_unlock (frame, this); + + return 0; + } +@@ -1479,7 +1387,7 @@ next: + + return 0; + err: +- local->internal_lock.lock_cbk = local->transaction.done; ++ local->internal_lock.lock_cbk = afr_transaction_done; + local->op_ret = -1; + local->op_errno = op_errno; + +@@ -1505,7 +1413,7 @@ afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) + gf_msg (this->name, GF_LOG_INFO, + 0, AFR_MSG_BLOCKING_LKS_FAILED, + "Blocking inodelks failed."); +- local->transaction.done (frame, this); ++ afr_transaction_done (frame, this); + } else { + + gf_msg_debug (this->name, 0, +@@ -1556,7 +1464,7 @@ afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) + gf_msg (this->name, GF_LOG_INFO, 0, + AFR_MSG_BLOCKING_LKS_FAILED, + "Blocking entrylks failed."); +- local->transaction.done (frame, this); ++ afr_transaction_done (frame, this); + } else { + + gf_msg_debug (this->name, 0, +@@ -1609,7 +1517,7 @@ afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this) + AFR_MSG_BLOCKING_LKS_FAILED, + "Blocking entrylks failed."); + +- local->transaction.done (frame, this); ++ afr_transaction_done (frame, this); + } else { + + gf_msg_debug (this->name, 0, +@@ -1672,7 +1580,6 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) + local = frame->local; + int_lock = &local->internal_lock; + +- int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + int_lock->domain = this->name; + + switch (local->transaction.type) { +@@ -1721,11 +1628,7 @@ afr_lock (call_frame_t *frame, xlator_t *this) + int + afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) + { +- if (__fop_changelog_needed (frame, this)) { +- afr_changelog_pre_op (frame, this); +- } else { +- afr_transaction_perform_fop (frame, this); +- } ++ afr_changelog_pre_op (frame, this); + + return 0; + } +@@ -2135,11 +2038,7 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) + with OP */ + afr_changelog_pre_op_update (frame, this); + +- if (__fop_changelog_needed (frame, this)) { +- afr_changelog_post_op (frame, this); +- } else { +- afr_changelog_post_op_done (frame, this); +- } ++ afr_changelog_post_op (frame, this); + + return 0; + } +@@ -2246,7 +2145,6 @@ void + afr_transaction_start (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = frame->local; +- afr_private_t *priv = this->private; + fd_t *fd = NULL; + + afr_transaction_eager_lock_init (local, this); +@@ -2268,11 +2166,7 @@ afr_transaction_start (call_frame_t *frame, xlator_t *this) + } + } + +- if (afr_lock_server_count (priv, local->transaction.type) == 0) { +- afr_internal_lock_finish (frame, this); +- } else { +- afr_lock (frame, this); +- } ++ afr_lock (frame, this); + } + + int +@@ -2304,7 +2198,6 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) + local = frame->local; + priv = this->private; + +- local->transaction.resume = afr_transaction_resume; + local->transaction.type = type; + + ret = afr_transaction_local_init (local, this); +diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h +index d01e144..ddcb1eb 100644 +--- 
a/xlators/cluster/afr/src/afr-transaction.h ++++ b/xlators/cluster/afr/src/afr-transaction.h +@@ -46,8 +46,6 @@ int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + + int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol); + +-int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this); +-int __afr_txn_write_done (call_frame_t *frame, xlator_t *this); + call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame); + gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this); + gf_boolean_t afr_needs_changelog_update (afr_local_t *local); +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index c0e9d9c..78b61b7 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -171,15 +171,6 @@ reconfigure (xlator_t *this, dict_t *options) + priv->data_self_heal_window_size, options, + uint32, out); + +- GF_OPTION_RECONF ("data-change-log", priv->data_change_log, options, +- bool, out); +- +- GF_OPTION_RECONF ("metadata-change-log", +- priv->metadata_change_log, options, bool, out); +- +- GF_OPTION_RECONF ("entry-change-log", priv->entry_change_log, options, +- bool, out); +- + GF_OPTION_RECONF ("data-self-heal-algorithm", + priv->data_self_heal_algorithm, options, str, out); + +@@ -403,20 +394,9 @@ init (xlator_t *this) + + GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); + +- GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); +- +- GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, +- out); +- +- GF_OPTION_INIT ("entry-change-log", priv->entry_change_log, bool, out); +- + GF_OPTION_INIT ("optimistic-change-log", priv->optimistic_change_log, + bool, out); + +- GF_OPTION_INIT ("inodelk-trace", priv->inodelk_trace, bool, out); +- +- GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out); +- + GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out); + GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out); + GF_OPTION_INIT ("granular-entry-heal", priv->esh_granular, bool, out); +@@ -730,23 +710,20 @@ struct volume_options options[] = { + { .key = {"data-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", +- .description = "Data fops like write/truncate will not perform " +- "pre/post fop changelog operations in afr transaction " +- "if this option is disabled" ++ .description = "This option exists only for backward compatibility " ++ "and configuring it doesn't have any effect" + }, + { .key = {"metadata-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", +- .description = "Metadata fops like setattr/setxattr will not perform " +- "pre/post fop changelog operations in afr transaction " +- "if this option is disabled" ++ .description = "This option exists only for backward compatibility " ++ "and configuring it doesn't have any effect" + }, + { .key = {"entry-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", +- .description = "Entry fops like create/unlink will not perform " +- "pre/post fop changelog operations in afr transaction " +- "if this option is disabled" ++ .description = "This option exists only for backward compatibility " ++ "and configuring it doesn't have any effect" + }, + { .key = {"optimistic-change-log"}, + .type = GF_OPTION_TYPE_BOOL, +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index 58f881e..dec2a37 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -105,10 
+105,6 @@ typedef struct _afr_private { + gf_boolean_t metadata_self_heal; /* on/off */ + gf_boolean_t entry_self_heal; /* on/off */ + +- gf_boolean_t data_change_log; /* on/off */ +- gf_boolean_t metadata_change_log; /* on/off */ +- gf_boolean_t entry_change_log; /* on/off */ +- + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ + int read_child; /* read-subvolume */ + unsigned int hash_mode; /* for when read_child is not set */ +@@ -118,9 +114,6 @@ typedef struct _afr_private { + afr_favorite_child_policy fav_child_policy;/*Policy to use for automatic + resolution of split-brains.*/ + +- gf_boolean_t inodelk_trace; +- gf_boolean_t entrylk_trace; +- + unsigned int wait_count; /* # of servers to wait for success */ + + gf_timer_t *timer; /* launched when parent up is received */ +@@ -174,33 +167,6 @@ typedef enum { + AFR_ENTRY_RENAME_TRANSACTION, /* rename */ + } afr_transaction_type; + +-typedef enum { +- AFR_TRANSACTION_LK, +- AFR_SELFHEAL_LK, +-} transaction_lk_type_t; +- +-typedef enum { +- AFR_LOCK_OP, +- AFR_UNLOCK_OP, +-} afr_lock_op_type_t; +- +-typedef enum { +- AFR_DATA_SELF_HEAL_LK, +- AFR_METADATA_SELF_HEAL_LK, +- AFR_ENTRY_SELF_HEAL_LK, +-}selfheal_lk_type_t; +- +-typedef enum { +- AFR_INODELK_TRANSACTION, +- AFR_INODELK_NB_TRANSACTION, +- AFR_ENTRYLK_TRANSACTION, +- AFR_ENTRYLK_NB_TRANSACTION, +- AFR_INODELK_SELFHEAL, +- AFR_INODELK_NB_SELFHEAL, +- AFR_ENTRYLK_SELFHEAL, +- AFR_ENTRYLK_NB_SELFHEAL, +-} afr_lock_call_type_t; +- + /* + xattr format: trusted.afr.volume = [x y z] + x - data pending +@@ -273,9 +239,6 @@ typedef struct { + unsigned char *locked_nodes; + unsigned char *lower_locked_nodes; + +- selfheal_lk_type_t selfheal_lk_type; +- transaction_lk_type_t transaction_lk_type; +- + int32_t lock_count; + int32_t entrylk_lock_count; + +@@ -785,12 +748,6 @@ typedef struct _afr_local { + + int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); + +- int (*fop) (call_frame_t *frame, xlator_t *this); +- +- int (*done) (call_frame_t *frame, xlator_t *this); +- +- int (*resume) (call_frame_t *frame, xlator_t *this); +- + int (*unwind) (call_frame_t *frame, xlator_t *this); + + /* post-op hook */ +@@ -955,10 +912,6 @@ int + afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); + + int +-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, +- unsigned int child_count); +- +-int + __afr_fd_ctx_set (xlator_t *this, fd_t *fd); + + afr_fd_ctx_t * +@@ -1073,8 +1026,7 @@ int + afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); + + int +-afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, +- transaction_lk_type_t lk_type); ++afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count); + + int + afr_higher_errno (int32_t old_errno, int32_t new_errno); +@@ -1206,7 +1158,7 @@ void + afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock, + int32_t child_index); + int +-afr_is_inodelk_transaction(afr_local_t *local); ++afr_is_inodelk_transaction(afr_transaction_type type); + + afr_fd_ctx_t * + __afr_fd_ctx_get (fd_t *fd, xlator_t *this); +diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c +index d322a9d..06b2cc8 100644 +--- a/xlators/cluster/afr/src/pump.c ++++ b/xlators/cluster/afr/src/pump.c +@@ -2268,9 +2268,6 @@ init (xlator_t *this) + + priv->data_self_heal_window_size = 16; + +- priv->data_change_log = 1; +- priv->metadata_change_log = 1; +- priv->entry_change_log = 1; + priv->use_afr_in_pump = 1; + priv->sh_readdir_size = 65536; + +-- 
+1.8.3.1
+
diff --git a/SOURCES/0672-cluster-afr-Make-AFR-eager-locking-similar-to-EC.patch b/SOURCES/0672-cluster-afr-Make-AFR-eager-locking-similar-to-EC.patch
new file mode 100644
index 0000000..bdfe877
--- /dev/null
+++ b/SOURCES/0672-cluster-afr-Make-AFR-eager-locking-similar-to-EC.patch
@@ -0,0 +1,3183 @@
+From eb82d5cf848793d61956829bcc07cf107f55e467 Mon Sep 17 00:00:00 2001
+From: Pranith Kumar K
+Date: Wed, 31 Jan 2018 16:41:14 +0530
+Subject: [PATCH 672/675] cluster/afr: Make AFR eager-locking similar to EC
+
+Problem:
+1) AFR's eager-lock only works for data transactions.
+2) When there are conflicting writes, a write with a conflicting region
+initiates unlock of the eager-lock, leading to extra pre-ops and post-ops on
+the file. When eager-lock goes off, it leads to extra fsyncs for random-write
+workloads in afr.
+
+Solution (that is modeled after EC):
+In EC, when there is a conflicting write, it waits for the current write to
+complete before it winds the conflicted write. This leads to better utilization
+of network and disk, because we will not be doing extra xattrops, FSYNCs and
+inodelk/unlock calls. Moved fd based counters to inode based counters.
+
+I tried to model the solution based on EC's locking, but it is not identical,
+because in AFR we had to keep backward compatibility.
+
+Lifecycle of lock:
+==================
+The first transaction is added to the inode->owners list and an inodelk is sent
+on the wire. All subsequent transactions are put in the inode->waiters list
+until the first transaction completes both inodelk and [f]xattrop. Once
+[f]xattrop also completes, all the requests in the inode->waiters list are
+checked to see if they conflict with any of the existing locks in the
+inode->owners list and, if not, are added to the inode->owners list and resumed
+to perform the transaction. When these transactions complete the fop phase they
+are moved to the inode->post_op list and the transactions that were paused
+because of conflicts are resumed. Post-op and unlock will not be issued on the
+wire until that is the last transaction on that inode. The last transaction,
+when it has to perform the post-op, can choose to sleep for the
+delayed-post-op-secs value. During that time, if any other transaction comes,
+it will wake up the sleeping transaction and take over the ownership of the
+lock, and the cycle continues. If delayed-post-op-secs expires, the timer
+thread will wake up the sleeping transaction, which sets lock->release to true
+and starts doing the post-op and then the unlock. During this time, if any
+other transactions come, they are put in the inode->frozen list. Once the
+previous unlock completes, the frozen list is moved to the waiters list, the
+first element of the waiters list is moved to the owners list, the lock is
+attempted, and the cycle continues. This is the general idea. There is logic at
+the time of delaying, at the time of a new transaction, and in the flush fop to
+wake up existing sleeping transactions or to choose whether to delay a
+transaction, which is subject to change based on future enhancements.
+
+ >Fixes: #418
+ >BUG: 1549606
+
+Upstream-patch: https://review.gluster.org/19503
+Also had to take the local->inode_ctx setting part of
+https://review.gluster.org/19045
+to get this to work in this branch.
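
The lifecycle above is easier to follow as explicit state transitions. The following minimal, single-threaded C sketch illustrates only the ordering of those transitions: txn_t and lock_t are invented stand-ins for afr_local_t and afr_lock_t, the owners/waiters/post_op/frozen lists are compressed into counters, and conflict detection is reduced to a flag, so this is not the patch's actual implementation.

/* Illustrative sketch of the eager-lock lifecycle; all names here are
 * simplified stand-ins, not the real GlusterFS structures. */
#include <stdio.h>

enum txn_state { TXN_WAITING, TXN_OWNER, TXN_POST_OP, TXN_FROZEN, TXN_DONE };

typedef struct { int id; enum txn_state state; } txn_t;

typedef struct {
        int acquired;   /* inodelk currently held on the wire          */
        int release;    /* delayed post-op expired; unlock in progress */
        int owners;     /* transactions in the owners list             */
        int post_op;    /* transactions in the post_op list            */
} lock_t;

/* A new transaction becomes an owner, waits, or is frozen. */
static void txn_start (lock_t *lk, txn_t *t, int conflicts)
{
        if (lk->release) {
                t->state = TXN_FROZEN;   /* parked until the unlock returns */
        } else if (lk->owners == 0 && lk->post_op == 0) {
                t->state = TXN_OWNER;    /* first transaction: wind inodelk */
                lk->owners++;
                lk->acquired = 1;
        } else if (lk->acquired && !conflicts) {
                t->state = TXN_OWNER;    /* share the already-held lock */
                lk->owners++;
        } else {
                t->state = TXN_WAITING;  /* resumed when the conflict ends */
        }
}

/* Fop phase done: move to post_op; the last owner may delay the post-op. */
static void txn_fop_done (lock_t *lk, txn_t *t)
{
        t->state = TXN_POST_OP;
        lk->owners--;
        lk->post_op++;
        if (lk->owners == 0)
                printf ("txn %d: last owner, may sleep delayed-post-op-secs\n",
                        t->id);
}

/* Timer expiry (or a flush) wakes the sleeper: post-op, then unlock. */
static void txn_release (lock_t *lk, txn_t *t)
{
        lk->release = 1;
        lk->post_op--;
        t->state = TXN_DONE;
        if (lk->post_op == 0) {
                lk->acquired = 0;
                lk->release = 0;         /* frozen list -> waiters here */
                printf ("txn %d: post-op and unlock issued on the wire\n",
                        t->id);
        }
}

int main (void)
{
        lock_t lk = {0, 0, 0, 0};
        txn_t a = {1, TXN_WAITING}, b = {2, TXN_WAITING};

        txn_start (&lk, &a, 0);    /* a owns the lock, winds inodelk      */
        txn_start (&lk, &b, 0);    /* b does not conflict: becomes owner  */
        txn_fop_done (&lk, &a);
        txn_fop_done (&lk, &b);    /* b is the last owner: may delay      */
        txn_release (&lk, &b);
        txn_release (&lk, &a);     /* last post_op member: unlock goes    */
        return 0;
}

Run on its own, the sketch traces two non-conflicting writes sharing one eager-lock, mirroring the lock sharing and the deferred post-op/unlock described above.
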
+ +Change-Id: I540e9e1ec7556e2c576261025d115315afc5b4d1 +BUG: 1583733 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/140577 +Reviewed-by: Ravishankar Narayanankutty +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/replicate/bug-966018.t | 36 - + xlators/cluster/afr/src/afr-common.c | 324 +++++---- + xlators/cluster/afr/src/afr-inode-write.c | 68 +- + xlators/cluster/afr/src/afr-lk-common.c | 331 +++------ + xlators/cluster/afr/src/afr-self-heal-common.c | 13 +- + xlators/cluster/afr/src/afr-self-heal-data.c | 14 +- + xlators/cluster/afr/src/afr-self-heal.h | 2 +- + xlators/cluster/afr/src/afr-transaction.c | 909 ++++++++++++++----------- + xlators/cluster/afr/src/afr-transaction.h | 13 +- + xlators/cluster/afr/src/afr.h | 117 ++-- + 10 files changed, 875 insertions(+), 952 deletions(-) + delete mode 100644 tests/bugs/replicate/bug-966018.t + +diff --git a/tests/bugs/replicate/bug-966018.t b/tests/bugs/replicate/bug-966018.t +deleted file mode 100644 +index 1b5296b..0000000 +--- a/tests/bugs/replicate/bug-966018.t ++++ /dev/null +@@ -1,36 +0,0 @@ +-#!/bin/bash +- +-. $(dirname $0)/../../include.rc +-. $(dirname $0)/../../volume.rc +-. $(dirname $0)/../../nfs.rc +- +-#This tests if cluster.eager-lock blocks metadata operations on nfs/fuse mounts. +-#If it is not woken up, INODELK from the next command waits +-#for post-op-delay secs. +- +-cleanup; +-TEST glusterd +-TEST pidof glusterd +- +-TEST $CLI volume create $V0 replica 2 $H0:$B0/r2_0 $H0:$B0/r2_1 +-TEST $CLI volume set $V0 ensure-durability off +-TEST $CLI volume set $V0 cluster.eager-lock on +-TEST $CLI volume set $V0 cluster.post-op-delay-secs 3 +-TEST $CLI volume set $V0 nfs.disable false +- +-TEST $CLI volume start $V0 +-TEST $CLI volume profile $V0 start +-EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available; +-TEST mount_nfs $H0:/$V0 $N0 nolock; +-TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0 +-echo 1 > $N0/1 && chmod +x $N0/1 +-echo 1 > $M0/1 && chmod +x $M0/1 +- +-#Check that INODELK MAX latency is not in the order of seconds +-#Test if the MAX INODELK fop latency is of the order of seconds. +-inodelk_max_latency=$($CLI volume profile $V0 info | grep INODELK | awk 'BEGIN {max = 0} {if ($6 > max) max=$6;} END {print max}' | cut -d. 
-f 1 | egrep "[0-9]{7,}") +- +-TEST [ -z $inodelk_max_latency ] +-EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0 +- +-cleanup; +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 708182a..6141fc6 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -102,35 +102,75 @@ afr_is_possibly_under_txn (afr_transaction_type type, afr_local_t *local, + return _gf_false; + } + ++static void ++afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) ++{ ++ int i = 0; ++ ++ if (!ctx) ++ return; ++ ++ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { ++ GF_FREE (ctx->pre_op_done[i]); ++ } ++ ++ GF_FREE (ctx); ++} ++ + int + __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx) + { +- uint64_t ctx_int = 0; +- int ret = -1; +- afr_inode_ctx_t *tmp_ctx = NULL; ++ uint64_t ctx_int = 0; ++ int ret = -1; ++ int i = -1; ++ int num_locks = -1; ++ afr_inode_ctx_t *ictx = NULL; ++ afr_lock_t *lock = NULL; ++ afr_private_t *priv = this->private; + + ret = __inode_ctx_get (inode, this, &ctx_int); +- if (ret) { +- tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), +- gf_afr_mt_inode_ctx_t); +- if (!tmp_ctx) +- goto out; ++ if (ret == 0) { ++ *ctx = (afr_inode_ctx_t *)ctx_int; ++ return 0; ++ } + +- ctx_int = (long) tmp_ctx; +- ret = __inode_ctx_set (inode, this, &ctx_int); +- if (ret) { +- GF_FREE (tmp_ctx); ++ ictx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), gf_afr_mt_inode_ctx_t); ++ if (!ictx) ++ goto out; ++ ++ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { ++ ictx->pre_op_done[i] = GF_CALLOC (sizeof *ictx->pre_op_done[i], ++ priv->child_count, ++ gf_afr_mt_int32_t); ++ if (!ictx->pre_op_done[i]) { ++ ret = -ENOMEM; + goto out; + } +- tmp_ctx->spb_choice = -1; +- tmp_ctx->read_subvol = 0; +- } else { +- tmp_ctx = (afr_inode_ctx_t *) ctx_int; + } + +- *ctx = tmp_ctx; ++ num_locks = sizeof(ictx->lock)/sizeof(afr_lock_t); ++ for (i = 0; i < num_locks; i++) { ++ lock = &ictx->lock[i]; ++ INIT_LIST_HEAD (&lock->post_op); ++ INIT_LIST_HEAD (&lock->frozen); ++ INIT_LIST_HEAD (&lock->waiting); ++ INIT_LIST_HEAD (&lock->owners); ++ } ++ ++ ctx_int = (uint64_t)ictx; ++ ret = __inode_ctx_set (inode, this, &ctx_int); ++ if (ret) { ++ goto out; ++ } ++ ++ ictx->spb_choice = -1; ++ ictx->read_subvol = 0; + ret = 0; ++ *ctx = ictx; + out: ++ if (ret) { ++ afr_inode_ctx_destroy (ictx); ++ } + return ret; + } + +@@ -1647,10 +1687,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) + + GF_FREE (local->internal_lock.locked_nodes); + +- for (i = 0; local->internal_lock.inodelk[i].domain; i++) { +- GF_FREE (local->internal_lock.inodelk[i].locked_nodes); +- } +- + GF_FREE (local->internal_lock.lower_locked_nodes); + + afr_entry_lockee_cleanup (&local->internal_lock); +@@ -1667,7 +1703,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) + GF_FREE (local->transaction.pre_op_xdata); + } + +- GF_FREE (local->transaction.eager_lock); + GF_FREE (local->transaction.failed_subvols); + + GF_FREE (local->transaction.basename); +@@ -1709,16 +1744,6 @@ afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv) + memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); + } + +-void +-afr_remove_eager_lock_stub (afr_local_t *local) +-{ +- LOCK (&local->fd->lock); +- { +- list_del_init (&local->transaction.eager_locked); +- } +- UNLOCK (&local->fd->lock); +-} +- + static gf_boolean_t + afr_fop_lock_is_unlock (call_frame_t *frame) + { +@@ -1752,10 +1777,6 @@ afr_local_cleanup (afr_local_t *local, 
xlator_t *this) + + syncbarrier_destroy (&local->barrier); + +- if (local->transaction.eager_lock_on && +- !list_empty (&local->transaction.eager_locked)) +- afr_remove_eager_lock_stub (local); +- + afr_local_transaction_cleanup (local, this); + + priv = this->private; +@@ -2942,22 +2963,8 @@ out: + void + _afr_cleanup_fd_ctx (afr_fd_ctx_t *fd_ctx) + { +- int i = 0; +- +- +- for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) +- GF_FREE (fd_ctx->pre_op_done[i]); +- + GF_FREE (fd_ctx->opened_on); +- +- GF_FREE (fd_ctx->lock_piggyback); +- +- GF_FREE (fd_ctx->lock_acquired); +- +- pthread_mutex_destroy (&fd_ctx->delay_lock); +- + GF_FREE (fd_ctx); +- + return; + } + +@@ -2975,15 +2982,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + if (fd_ctx) { +- /*no need to take any locks*/ +- if (!list_empty (&fd_ctx->eager_locked)) +- gf_msg (this->name, GF_LOG_WARNING, 0, +- AFR_MSG_INVALID_DATA, "%s: Stale " +- "Eager-lock stubs found", +- uuid_utoa (fd->inode->gfid)); +- + _afr_cleanup_fd_ctx (fd_ctx); +- + } + + out: +@@ -3064,23 +3063,6 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) + goto out; + } + +- ret = pthread_mutex_init (&fd_ctx->delay_lock, NULL); +- if (ret) { +- GF_FREE (fd_ctx); +- fd_ctx = NULL; +- goto out; +- } +- +- for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { +- fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), +- priv->child_count, +- gf_afr_mt_int32_t); +- if (!fd_ctx->pre_op_done[i]) { +- ret = -ENOMEM; +- goto out; +- } +- } +- + fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), + priv->child_count, + gf_afr_mt_int32_t); +@@ -3096,26 +3078,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } + +- fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), +- priv->child_count, +- gf_afr_mt_char); +- if (!fd_ctx->lock_piggyback) { +- ret = -ENOMEM; +- goto out; +- } +- +- fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), +- priv->child_count, +- gf_afr_mt_char); +- if (!fd_ctx->lock_acquired) { +- ret = -ENOMEM; +- goto out; +- } +- + fd_ctx->readdir_subvol = -1; + +- INIT_LIST_HEAD (&fd_ctx->eager_locked); +- + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); + if (ret) + gf_msg_debug (this->name, 0, +@@ -3187,12 +3151,70 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + return 0; + } + ++afr_local_t* ++afr_wakeup_same_fd_delayed_op (xlator_t *this, afr_lock_t *lock, fd_t *fd) ++{ ++ afr_local_t *local = NULL; ++ ++ if (lock->delay_timer) { ++ local = list_entry(lock->post_op.next, afr_local_t, ++ transaction.owner_list); ++ if (fd == local->fd) { ++ if (gf_timer_call_cancel (this->ctx, ++ lock->delay_timer)) { ++ local = NULL; ++ } else { ++ lock->delay_timer = NULL; ++ } ++ } else { ++ local = NULL; ++ } ++ } ++ ++ return local; ++} ++ ++void ++afr_delayed_changelog_wake_resume (xlator_t *this, inode_t *inode, ++ call_stub_t *stub) ++{ ++ afr_inode_ctx_t *ctx = NULL; ++ afr_lock_t *lock = NULL; ++ afr_local_t *metadata_local = NULL; ++ afr_local_t *data_local = NULL; ++ LOCK (&inode->lock); ++ { ++ (void)__afr_inode_ctx_get (this, inode, &ctx); ++ lock = &ctx->lock[AFR_DATA_TRANSACTION]; ++ data_local = afr_wakeup_same_fd_delayed_op (this, lock, ++ stub->args.fd); ++ lock = &ctx->lock[AFR_METADATA_TRANSACTION]; ++ metadata_local = afr_wakeup_same_fd_delayed_op (this, lock, ++ stub->args.fd); ++ } ++ UNLOCK (&inode->lock); ++ ++ if (data_local) { ++ data_local->transaction.resume_stub = stub; ++ } 
else if (metadata_local) { ++ metadata_local->transaction.resume_stub = stub; ++ } else { ++ call_resume (stub); ++ } ++ if (data_local) { ++ afr_delayed_changelog_wake_up_cbk (data_local); ++ } ++ if (metadata_local) { ++ afr_delayed_changelog_wake_up_cbk (metadata_local); ++ } ++} ++ + int + afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + { +- afr_local_t *local = NULL; +- call_stub_t *stub = NULL; +- int op_errno = ENOMEM; ++ afr_local_t *local = NULL; ++ call_stub_t *stub = NULL; ++ int op_errno = ENOMEM; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) +@@ -3209,7 +3231,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + if (!stub) + goto out; + +- afr_delayed_changelog_wake_resume (this, fd, stub); ++ afr_delayed_changelog_wake_resume (this, fd->inode, stub); + + return 0; + out: +@@ -4265,7 +4287,7 @@ afr_forget (xlator_t *this, inode_t *inode) + return 0; + + ctx = (afr_inode_ctx_t *)ctx_int; +- GF_FREE (ctx); ++ afr_inode_ctx_destroy (ctx); + return 0; + } + +@@ -4811,21 +4833,6 @@ out: + } + + int +-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +-{ +- int ret = -ENOMEM; +- +- lk->domain = dom; +- lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), +- child_count, gf_afr_mt_char); +- if (NULL == lk->locked_nodes) +- goto out; +- ret = 0; +-out: +- return ret; +-} +- +-int + afr_transaction_local_init (afr_local_t *local, xlator_t *this) + { + int ret = -ENOMEM; +@@ -4836,25 +4843,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) + if (ret < 0) + goto out; + +- if ((local->transaction.type == AFR_DATA_TRANSACTION) || +- (local->transaction.type == AFR_METADATA_TRANSACTION)) { +- ret = afr_inodelk_init (&local->internal_lock.inodelk[0], +- this->name, priv->child_count); +- if (ret < 0) +- goto out; +- } +- + ret = -ENOMEM; + local->pre_op_compat = priv->pre_op_compat; + +- local->transaction.eager_lock = +- GF_CALLOC (sizeof (*local->transaction.eager_lock), +- priv->child_count, +- gf_afr_mt_int32_t); +- +- if (!local->transaction.eager_lock) +- goto out; +- + local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), + priv->child_count, + gf_afr_mt_char); +@@ -4886,9 +4877,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) + if (!local->pending) + goto out; + +- INIT_LIST_HEAD (&local->transaction.eager_locked); +- + ret = 0; ++ INIT_LIST_HEAD (&local->transaction.wait_list); ++ INIT_LIST_HEAD (&local->transaction.owner_list); + out: + return ret; + } +@@ -4960,24 +4951,6 @@ out: + return; + } + +-void +-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +-{ +- afr_local_t *local = NULL; +- afr_fd_ctx_t *fd_ctx = NULL; +- +- local = frame->local; +- +- if (!local->fd) +- return; +- +- fd_ctx = afr_fd_ctx_get (local->fd, this); +- if (!fd_ctx) +- return; +- +- fd_ctx->open_fd_count = local->open_fd_count; +-} +- + int** + afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending, + dict_t *xattr, ia_type_t iat) +@@ -5086,7 +5059,7 @@ out: + + int + afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, +- inode_t *inode, gf_boolean_t *dsh, ++ fd_t *fd, gf_boolean_t *dsh, + gf_boolean_t *pflag) + { + int ret = -1; +@@ -5096,8 +5069,8 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + afr_private_t *priv = NULL; +- fd_t *fd = NULL; + struct afr_reply *locked_replies = NULL; ++ inode_t *inode = 
fd->inode; + + priv = this->private; + data_lock = alloca0 (priv->child_count); +@@ -5106,18 +5079,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, + healed_sinks = alloca0 (priv->child_count); + undid_pending = alloca0 (priv->child_count); + +- /* Heal-info does an open() on the file being examined so that the +- * current eager-lock holding client, if present, at some point sees +- * open-fd count being > 1 and releases the eager-lock so that heal-info +- * doesn't remain blocked forever until IO completes. +- */ +- ret = afr_selfheal_data_open (this, inode, &fd); +- if (ret < 0) { +- gf_msg_debug (this->name, -ret, "%s: Failed to open", +- uuid_utoa (inode->gfid)); +- goto out; +- } +- + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, inode, this->name, +@@ -5140,8 +5101,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, + out: + if (locked_replies) + afr_replies_wipe (locked_replies, priv->child_count); +- if (fd) +- fd_unref (fd); + return ret; + } + +@@ -5226,6 +5185,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid, + + { + int ret = -1; ++ fd_t *fd = NULL; + gf_boolean_t dsh = _gf_false; + gf_boolean_t msh = _gf_false; + gf_boolean_t esh = _gf_false; +@@ -5237,6 +5197,21 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid, + + /* For every heal type hold locks and check if it indeed needs heal */ + ++ ++ /* Heal-info does an open() on the file being examined so that the ++ * current eager-lock holding client, if present, at some point sees ++ * open-fd count being > 1 and releases the eager-lock so that heal-info ++ * doesn't remain blocked forever until IO completes. ++ */ ++ if ((*inode)->ia_type == IA_IFREG) { ++ ret = afr_selfheal_data_open (this, *inode, &fd); ++ if (ret < 0) { ++ gf_msg_debug (this->name, -ret, "%s: Failed to open", ++ uuid_utoa ((*inode)->gfid)); ++ goto out; ++ } ++ } ++ + if (msh) { + ret = afr_selfheal_locked_metadata_inspect (frame, this, + *inode, &msh, +@@ -5246,7 +5221,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid, + } + + if (dsh) { +- ret = afr_selfheal_locked_data_inspect (frame, this, *inode, ++ ret = afr_selfheal_locked_data_inspect (frame, this, fd, + &dsh, pending); + if (ret == -EIO || (ret == -EAGAIN)) + goto out; +@@ -5261,6 +5236,8 @@ out: + *data_selfheal = dsh; + *entry_selfheal = esh; + *metadata_selfheal = msh; ++ if (fd) ++ fd_unref (fd); + return ret; + } + +@@ -5831,3 +5808,18 @@ afr_serialize_xattrs_with_delimiter (call_frame_t *frame, xlator_t *this, + out: + return ret; + } ++ ++int ++afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode) ++{ ++ int ret = 0; ++ ++ local->inode = inode_ref (inode); ++ LOCK(&local->inode->lock); ++ { ++ ret = __afr_inode_ctx_get (this, local->inode, ++ &local->inode_ctx); ++ } ++ UNLOCK (&local->inode->lock); ++ return ret; ++} +diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c +index e0f6541..33c1015 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.c ++++ b/xlators/cluster/afr/src/afr-inode-write.c +@@ -341,14 +341,14 @@ afr_process_post_writev (call_frame_t *frame, xlator_t *this) + the xattrs are not reliably pointing at + a stale file. 
+ */ +- afr_fd_report_unstable_write (this, local->fd); ++ afr_fd_report_unstable_write (this, local); + + __afr_inode_write_finalize (frame, this); + + afr_writev_handle_short_writes (frame, this); + + if (local->update_open_fd_count) +- afr_handle_open_fd_count (frame, this); ++ local->inode_ctx->open_fd_count = local->open_fd_count; + + } + +@@ -503,6 +503,7 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + { + afr_local_t *local = NULL; + int op_errno = ENOMEM; ++ int ret = -1; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) +@@ -525,7 +526,9 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + goto out; + + local->fd = fd_ref (fd); +- local->inode = inode_ref (fd->inode); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; + + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) { + op_errno = ENOMEM; +@@ -648,7 +651,9 @@ afr_truncate (call_frame_t *frame, xlator_t *this, + local->transaction.unwind = afr_truncate_unwind; + + loc_copy (&local->loc, loc); +- local->inode = inode_ref (loc->inode); ++ ret = afr_set_inode_local (this, local, loc->inode); ++ if (ret) ++ goto out; + + local->op = GF_FOP_TRUNCATE; + +@@ -762,7 +767,9 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + goto out; + + local->fd = fd_ref (fd); +- local->inode = inode_ref (fd->inode); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; + + local->op = GF_FOP_FTRUNCATE; + +@@ -876,7 +883,9 @@ afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + local->transaction.unwind = afr_setattr_unwind; + + loc_copy (&local->loc, loc); +- local->inode = inode_ref (loc->inode); ++ ret = afr_set_inode_local (this, local, loc->inode); ++ if (ret) ++ goto out; + + local->op = GF_FOP_SETATTR; + +@@ -979,7 +988,9 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, + local->transaction.unwind = afr_fsetattr_unwind; + + local->fd = fd_ref (fd); +- local->inode = inode_ref (fd->inode); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; + + local->op = GF_FOP_FSETATTR; + +@@ -1619,7 +1630,9 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + local->transaction.unwind = afr_setxattr_unwind; + + loc_copy (&local->loc, loc); +- local->inode = inode_ref (loc->inode); ++ ret = afr_set_inode_local (this, local, loc->inode); ++ if (ret) ++ goto out; + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; +@@ -1729,7 +1742,9 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, + local->transaction.unwind = afr_fsetxattr_unwind; + + local->fd = fd_ref (fd); +- local->inode = inode_ref (fd->inode); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; + + local->op = GF_FOP_FSETXATTR; + +@@ -1840,7 +1855,9 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, + local->transaction.unwind = afr_removexattr_unwind; + + loc_copy (&local->loc, loc); +- local->inode = inode_ref (loc->inode); ++ ret = afr_set_inode_local (this, local, loc->inode); ++ if (ret) ++ goto out; + + local->op = GF_FOP_REMOVEXATTR; + +@@ -1945,7 +1962,9 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + local->transaction.unwind = afr_fremovexattr_unwind; + + local->fd = fd_ref (fd); +- local->inode = inode_ref (fd->inode); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; + + local->op = GF_FOP_FREMOVEXATTR; + +@@ -2040,7 +2059,9 @@ 
afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + local->cont.fallocate.len = len; + + local->fd = fd_ref (fd); +- local->inode = inode_ref (fd->inode); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); +@@ -2150,7 +2171,9 @@ afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + local->cont.discard.len = len; + + local->fd = fd_ref (fd); +- local->inode = inode_ref (fd->inode); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); +@@ -2257,7 +2280,9 @@ afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + local->cont.zerofill.len = len; + + local->fd = fd_ref (fd); +- local->inode = inode_ref (fd->inode); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); +@@ -2365,7 +2390,9 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + local->transaction.unwind = afr_xattrop_unwind; + + loc_copy (&local->loc, loc); +- local->inode = inode_ref (loc->inode); ++ ret = afr_set_inode_local (this, local, loc->inode); ++ if (ret) ++ goto out; + + local->op = GF_FOP_XATTROP; + +@@ -2457,7 +2484,9 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + local->transaction.unwind = afr_fxattrop_unwind; + + local->fd = fd_ref (fd); +- local->inode = inode_ref (fd->inode); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; + + local->op = GF_FOP_FXATTROP; + +@@ -2555,12 +2584,15 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + goto out; + + local->fd = fd_ref (fd); +- local->inode = inode_ref (fd->inode); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; ++ + + local->op = GF_FOP_FSYNC; + local->cont.fsync.datasync = datasync; + +- if (afr_fd_has_witnessed_unstable_write (this, fd)) { ++ if (afr_fd_has_witnessed_unstable_write (this, fd->inode)) { + /* don't care. 
we only wanted to CLEAR the bit */ + } + +diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c +index 38cc87b..494a63c 100644 +--- a/xlators/cluster/afr/src/afr-lk-common.c ++++ b/xlators/cluster/afr/src/afr-lk-common.c +@@ -52,31 +52,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2) + + int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); + +-static int +-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); +- +-static uint64_t afr_lock_number = 1; +- +-static uint64_t +-get_afr_lock_number () +-{ +- return (++afr_lock_number); +-} +- +-int +-afr_set_lock_number (call_frame_t *frame, xlator_t *this) +-{ +- afr_local_t *local = NULL; +- afr_internal_lock_t *int_lock = NULL; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- int_lock->lock_number = get_afr_lock_number (); +- +- return 0; +-} +- + void + afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) + { +@@ -203,21 +178,16 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; +- afr_inodelk_t *inodelk = NULL; + + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- +- inodelk->lock_count = 0; ++ int_lock->lock_count = 0; + int_lock->lk_attempted_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + +- memset (inodelk->locked_nodes, 0, +- sizeof (*inodelk->locked_nodes) * priv->child_count); + memset (int_lock->locked_nodes, 0, + sizeof (*int_lock->locked_nodes) * priv->child_count); + +@@ -283,12 +253,7 @@ void + afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock, + int32_t child_index) + { +- afr_inodelk_t *inodelk = NULL; +- +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- inodelk->locked_nodes[child_index] &= LOCKED_NO; +- if (local->transaction.eager_lock) +- local->transaction.eager_lock[child_index] = 0; ++ int_lock->locked_nodes[child_index] &= LOCKED_NO; + + } + +@@ -328,35 +293,27 @@ static int + afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) + { + afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = {0,}; +- struct gf_flock full_flock = {0,}; +- struct gf_flock *flock_use = NULL; + int call_count = 0; + int i = 0; +- int piggyback = 0; +- afr_fd_ctx_t *fd_ctx = NULL; +- + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- +- flock.l_start = inodelk->flock.l_start; +- flock.l_len = inodelk->flock.l_len; ++ flock.l_start = int_lock->flock.l_start; ++ flock.l_len = int_lock->flock.l_len; + flock.l_type = F_UNLCK; + +- full_flock.l_type = F_UNLCK; +- call_count = afr_locked_nodes_count (inodelk->locked_nodes, ++ call_count = afr_locked_nodes_count (int_lock->locked_nodes, + priv->child_count); + + int_lock->lk_call_count = call_count; + + if (!call_count) { ++ GF_ASSERT (!local->transaction.do_eager_unlock); + gf_msg_trace (this->name, 0, + "No internal locks unlocked"); + +@@ -364,64 +321,28 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) + goto out; + } + +- if (local->fd) +- fd_ctx = afr_fd_ctx_get (local->fd, this); +- + for (i = 0; i < priv->child_count; i++) { +- if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) ++ if 
((int_lock->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) + continue; + + if (local->fd) { +- flock_use = &flock; +- if (!local->transaction.eager_lock[i]) { +- goto wind; +- } +- +- piggyback = 0; +- +- LOCK (&local->fd->lock); +- { +- if (fd_ctx->lock_piggyback[i]) { +- fd_ctx->lock_piggyback[i]--; +- piggyback = 1; +- } else { +- fd_ctx->lock_acquired[i]--; +- } +- } +- UNLOCK (&local->fd->lock); +- +- if (piggyback) { +- afr_unlock_inodelk_cbk (frame, (void *) (long) i, +- this, 1, 0, NULL); +- if (!--call_count) +- break; +- continue; +- } +- +- flock_use = &full_flock; +- wind: + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], + priv->children[i]->fops->finodelk, + int_lock->domain, local->fd, +- F_SETLK, flock_use, NULL); +- +- if (!--call_count) +- break; +- ++ F_SETLK, &flock, NULL); + } else { +- + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], + priv->children[i]->fops->inodelk, + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); +- +- if (!--call_count) +- break; + } ++ ++ if (!--call_count) ++ break; + } + out: + return 0; +@@ -509,6 +430,18 @@ out: + + } + ++int32_t ++afr_unlock_now (call_frame_t *frame, xlator_t *this) ++{ ++ afr_local_t *local = frame->local; ++ ++ if (afr_is_inodelk_transaction(local->transaction.type)) ++ afr_unlock_inodelk (frame, this); ++ else ++ afr_unlock_entrylk (frame, this); ++ return 0; ++} ++ + static int32_t + afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +@@ -550,7 +483,7 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + + if ((op_ret == -1) && + (op_errno == ENOSYS)) { +- afr_unlock (frame, this); ++ afr_unlock_now (frame, this); + } else { + if (op_ret == 0) { + if (local->transaction.type == AFR_ENTRY_TRANSACTION || +@@ -586,38 +519,6 @@ afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + return 0; + } + +-static int +-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) +-{ +- afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- +- priv = this->private; +- local = frame->local; +- int_lock = &local->internal_lock; +- +- switch (local->transaction.type) { +- case AFR_DATA_TRANSACTION: +- case AFR_METADATA_TRANSACTION: +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- memcpy (inodelk->locked_nodes, int_lock->locked_nodes, +- sizeof (*inodelk->locked_nodes) * priv->child_count); +- inodelk->lock_count = int_lock->lock_count; +- break; +- +- case AFR_ENTRY_RENAME_TRANSACTION: +- case AFR_ENTRY_TRANSACTION: +- /*entrylk_count is being used in both non-blocking and blocking +- * modes */ +- break; +- } +- +- return 0; +- +-} +- + static gf_boolean_t + afr_is_entrylk (afr_transaction_type trans_type) + { +@@ -721,7 +622,6 @@ int + afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + { + afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = {0,}; +@@ -740,10 +640,9 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + + + if (!is_entrylk) { +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- flock.l_start = inodelk->flock.l_start; +- flock.l_len = inodelk->flock.l_len; +- flock.l_type = inodelk->flock.l_type; ++ flock.l_start = int_lock->flock.l_start; ++ flock.l_len = int_lock->flock.l_len; ++ 
flock.l_type = int_lock->flock.l_type; + } + + if (local->fd) { +@@ -758,9 +657,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + local->op_ret = -1; + int_lock->lock_op_ret = -1; + +- afr_copy_locked_nodes (frame, this); +- +- afr_unlock (frame, this); ++ afr_unlock_now (frame, this); + + return 0; + } +@@ -772,9 +669,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + local->op_ret = -1; + int_lock->lock_op_ret = -1; + +- afr_copy_locked_nodes (frame, this); +- +- afr_unlock(frame, this); ++ afr_unlock_now(frame, this); + + return 0; + } +@@ -786,8 +681,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + gf_msg_debug (this->name, 0, + "we're done locking"); + +- afr_copy_locked_nodes (frame, this); +- + int_lock->lock_op_ret = 0; + int_lock->lock_cbk (frame, this); + return 0; +@@ -803,7 +696,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + case AFR_METADATA_TRANSACTION: + + if (local->fd) { +- + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, + priv->children[child_index], +@@ -812,7 +704,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + F_SETLKW, &flock, NULL); + + } else { +- + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, + priv->children[child_index], +@@ -829,7 +720,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + *and 'fd-less' children */ + + if (local->fd) { +- + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, + priv->children[child_index], +@@ -838,7 +728,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } else { +- + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, + priv->children[child_index], +@@ -910,7 +799,6 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local = frame->local; + int_lock = &local->internal_lock; + +- + LOCK (&frame->lock); + { + if (op_ret < 0 ) { +@@ -957,7 +845,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + "with blocking calls", + int_lock->lock_count); + +- afr_unlock(frame, this); ++ afr_unlock_now(frame, this); + } + } + +@@ -997,7 +885,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + local->op_errno = EINVAL; + int_lock->lock_op_errno = EINVAL; + +- afr_unlock (frame, this); ++ afr_unlock_now (frame, this); + return -1; + } + +@@ -1009,7 +897,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + gf_msg (this->name, GF_LOG_INFO, 0, + AFR_MSG_INFO_COMMON, + "fd not open on any subvolumes. 
aborting."); +- afr_unlock (frame, this); ++ afr_unlock_now (frame, this); + goto out; + } + +@@ -1019,7 +907,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { +- + STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, + (void *) (long) i, + priv->children[index], +@@ -1041,7 +928,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { +- + STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, + (void *) (long) i, + priv->children[index], +@@ -1065,19 +951,13 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + { + afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + int call_count = 0; + int child_index = (long) cookie; +- afr_fd_ctx_t *fd_ctx = NULL; + + + local = frame->local; + int_lock = &local->internal_lock; +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- +- if (local->fd) +- fd_ctx = afr_fd_ctx_get (local->fd, this); + + LOCK (&frame->lock); + { +@@ -1094,23 +974,8 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } +- if (local->transaction.eager_lock) +- local->transaction.eager_lock[child_index] = 0; + } else { +- inodelk->locked_nodes[child_index] |= LOCKED_YES; +- inodelk->lock_count++; +- +- if (local->transaction.eager_lock && +- local->transaction.eager_lock[child_index] && +- local->fd) { +- /* piggybacked */ +- if (op_ret == 1) { +- /* piggybacked */ +- } else if (op_ret == 0) { +- /* lock acquired from server */ +- fd_ctx->lock_acquired[child_index]++; +- } +- } ++ int_lock->locked_nodes[child_index] |= LOCKED_YES; + } + + call_count = --int_lock->lk_call_count; +@@ -1121,7 +986,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + gf_msg_trace (this->name, 0, + "Last inode locking reply received"); + /* all locks successful. Proceed to call FOP */ +- if (inodelk->lock_count == int_lock->lk_expected_count) { ++ if (int_lock->lock_count == int_lock->lk_expected_count) { + gf_msg_trace (this->name, 0, + "All servers locked. 
Calling the cbk"); + int_lock->lock_op_ret = 0; +@@ -1135,7 +1000,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + "Trying again with blocking calls", + int_lock->lock_count); + +- afr_unlock(frame, this); ++ afr_unlock_now(frame, this); + } + } + +@@ -1146,30 +1011,17 @@ int + afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) + { + afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int32_t call_count = 0; + int i = 0; + int ret = 0; +- struct gf_flock flock = {0,}; +- struct gf_flock full_flock = {0,}; +- struct gf_flock *flock_use = NULL; +- int piggyback = 0; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- +- flock.l_start = inodelk->flock.l_start; +- flock.l_len = inodelk->flock.l_len; +- flock.l_type = inodelk->flock.l_type; +- +- full_flock.l_type = inodelk->flock.l_type; +- + initialize_inodelk_variables (frame, this); + + if (local->fd) { +@@ -1185,88 +1037,48 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) + local->op_errno = EINVAL; + int_lock->lock_op_errno = EINVAL; + +- afr_unlock (frame, this); ++ afr_unlock_now (frame, this); + ret = -1; + goto out; + } ++ } + +- call_count = internal_lock_count (frame, this); +- int_lock->lk_call_count = call_count; +- int_lock->lk_expected_count = call_count; +- +- if (!call_count) { +- gf_msg (this->name, GF_LOG_INFO, 0, +- AFR_MSG_ALL_SUBVOLS_DOWN, +- "All bricks are down, aborting."); +- afr_unlock (frame, this); +- goto out; +- } +- +- /* Send non-blocking inodelk calls only on up children +- and where the fd has been opened */ +- for (i = 0; i < priv->child_count; i++) { +- if (!local->child_up[i]) +- continue; +- +- flock_use = &flock; +- if (!local->transaction.eager_lock_on) { +- goto wind; +- } +- +- piggyback = 0; +- local->transaction.eager_lock[i] = 1; +- +- afr_set_delayed_post_op (frame, this); ++ call_count = internal_lock_count (frame, this); ++ int_lock->lk_call_count = call_count; ++ int_lock->lk_expected_count = call_count; + +- LOCK (&local->fd->lock); +- { +- if (fd_ctx->lock_acquired[i]) { +- fd_ctx->lock_piggyback[i]++; +- piggyback = 1; +- } +- } +- UNLOCK (&local->fd->lock); ++ if (!call_count) { ++ gf_msg (this->name, GF_LOG_INFO, 0, ++ AFR_MSG_ALL_SUBVOLS_DOWN, ++ "All bricks are down, aborting."); ++ afr_unlock_now (frame, this); ++ goto out; ++ } + +- if (piggyback) { +- /* (op_ret == 1) => indicate piggybacked lock */ +- afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, +- this, 1, 0, NULL); +- if (!--call_count) +- break; +- continue; +- } +- flock_use = &full_flock; +- wind: ++ /* Send non-blocking inodelk calls only on up children ++ and where the fd has been opened */ ++ for (i = 0; i < priv->child_count; i++) { ++ if (!local->child_up[i]) ++ continue; + ++ if (local->fd) { + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->finodelk, + int_lock->domain, local->fd, +- F_SETLK, flock_use, NULL); +- +- if (!--call_count) +- break; +- } +- } else { +- call_count = internal_lock_count (frame, this); +- int_lock->lk_call_count = call_count; +- int_lock->lk_expected_count = call_count; +- +- for (i = 0; i < priv->child_count; i++) { +- if (!local->child_up[i]) +- continue; ++ F_SETLK, &int_lock->flock, NULL); ++ } else { + + STACK_WIND_COOKIE (frame, 
afr_nonblocking_inodelk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + int_lock->domain, &local->loc, +- F_SETLK, &flock, NULL); +- +- if (!--call_count) +- break; ++ F_SETLK, &int_lock->flock, NULL); + } ++ if (!--call_count) ++ break; + } + out: + return ret; +@@ -1276,13 +1088,32 @@ int32_t + afr_unlock (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; ++ afr_lock_t *lock = NULL; + + local = frame->local; + +- if (afr_is_inodelk_transaction(local->transaction.type)) +- afr_unlock_inodelk (frame, this); +- else +- afr_unlock_entrylk (frame, this); ++ if (!local->transaction.eager_lock_on) ++ goto out; ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ LOCK (&local->inode->lock); ++ { ++ list_del_init (&local->transaction.owner_list); ++ if (list_empty (&lock->owners) && list_empty (&lock->post_op)) { ++ local->transaction.do_eager_unlock = _gf_true; ++ /*TODO: Need to get metadata use on_disk and inherit/uninherit ++ *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]); ++ *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]); ++ */ ++ GF_ASSERT (lock->release); ++ } ++ } ++ UNLOCK (&local->inode->lock); ++ if (!local->transaction.do_eager_unlock) { ++ local->internal_lock.lock_cbk (frame, this); ++ return 0; ++ } + ++out: ++ afr_unlock_now (frame, this); + return 0; + } +diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c +index 9f6cbcd..8afea5e 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-common.c ++++ b/xlators/cluster/afr/src/afr-self-heal-common.c +@@ -1981,6 +1981,7 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) + int data_ret = 1; + int or_ret = 0; + inode_t *inode = NULL; ++ fd_t *fd = NULL; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; +@@ -2005,8 +2006,16 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) + goto out; + } + ++ if (inode->ia_type == IA_IFREG) { ++ ret = afr_selfheal_data_open (this, inode, &fd); ++ if (!fd) { ++ ret = -EIO; ++ goto out; ++ } ++ } ++ + if (data_selfheal && dataheal_enabled) +- data_ret = afr_selfheal_data (frame, this, inode); ++ data_ret = afr_selfheal_data (frame, this, fd); + + if (metadata_selfheal && priv->metadata_self_heal) + metadata_ret = afr_selfheal_metadata (frame, this, inode); +@@ -2028,6 +2037,8 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) + out: + if (inode) + inode_unref (inode); ++ if (fd) ++ fd_unref (fd); + return ret; + } + /* +diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c +index d032284..8e00469 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-data.c ++++ b/xlators/cluster/afr/src/afr-self-heal-data.c +@@ -844,22 +844,15 @@ out: + } + + int +-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) ++afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd) + { + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; +- fd_t *fd = NULL; ++ inode_t *inode = fd->inode; + + priv = this->private; + +- ret = afr_selfheal_data_open (this, inode, &fd); +- if (!fd) { +- gf_msg_debug (this->name, -ret, "%s: Failed to open", +- uuid_utoa (inode->gfid)); +- return -EIO; +- } +- + locked_on = alloca0 (priv->child_count); + + ret = afr_selfheal_tie_breaker_inodelk (frame, this, inode, +@@ -886,8 +879,5 @@ unlock: + 
afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, + locked_on); + +- if (fd) +- fd_unref (fd); +- + return ret; + } +diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h +index 2b3a87e..49b344a 100644 +--- a/xlators/cluster/afr/src/afr-self-heal.h ++++ b/xlators/cluster/afr/src/afr-self-heal.h +@@ -94,7 +94,7 @@ afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name, + void *gfid_req); + + int +-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode); ++afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd); + + int + afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode); +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index 46a65a7..caa83c8 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -25,6 +25,18 @@ typedef enum { + AFR_TRANSACTION_POST_OP, + } afr_xattrop_type_t; + ++static void ++afr_lock_resume_shared (struct list_head *list); ++ ++void ++__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared); ++ ++void ++afr_changelog_post_op (call_frame_t *frame, xlator_t *this); ++ ++int ++afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this); ++ + gf_boolean_t + afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this); + +@@ -168,13 +180,14 @@ afr_transaction_fop (call_frame_t *frame, xlator_t *this) + return 0; + } + +- + int + afr_transaction_done (call_frame_t *frame, xlator_t *this) + { +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- gf_boolean_t unwind = _gf_false; ++ afr_local_t *local = NULL; ++ afr_private_t *priv = NULL; ++ gf_boolean_t unwind = _gf_false; ++ afr_lock_t *lock = NULL; ++ afr_local_t *lock_local = NULL; + + priv = this->private; + local = frame->local; +@@ -188,6 +201,31 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this) + if (unwind)/*It definitely did post-op*/ + afr_zero_fill_stat (local); + } ++ ++ if (local->transaction.do_eager_unlock) { ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ LOCK (&local->inode->lock); ++ { ++ lock->acquired = _gf_false; ++ lock->release = _gf_false; ++ list_splice_init (&lock->frozen, ++ &lock->waiting); ++ if (list_empty (&lock->waiting)) ++ goto unlock; ++ lock_local = list_entry (lock->waiting.next, ++ afr_local_t, ++ transaction.wait_list); ++ list_del_init (&lock_local->transaction.wait_list); ++ list_add (&lock_local->transaction.owner_list, ++ &lock->owners); ++ } ++unlock: ++ UNLOCK (&local->inode->lock); ++ } ++ if (lock_local) { ++ afr_lock (lock_local->transaction.frame, ++ lock_local->transaction.frame->this); ++ } + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); +@@ -195,6 +233,52 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this) + return 0; + } + ++static void ++afr_lock_fail_shared (afr_local_t *local, struct list_head *list) ++{ ++ afr_local_t *each = NULL; ++ ++ while (!list_empty(list)) { ++ each = list_entry (list->next, afr_local_t, ++ transaction.wait_list); ++ list_del_init(&each->transaction.wait_list); ++ each->op_ret = -1; ++ each->op_errno = local->op_errno; ++ afr_transaction_done (each->transaction.frame, ++ each->transaction.frame->this); ++ } ++} ++ ++static void ++afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked) ++{ ++ struct list_head shared; ++ afr_lock_t *lock = NULL; ++ ++ if (!local->transaction.eager_lock_on) ++ goto out; ++ ++ 
lock = &local->inode_ctx->lock[local->transaction.type]; ++ ++ INIT_LIST_HEAD (&shared); ++ LOCK (&local->inode->lock); ++ { ++ list_splice_init (&lock->waiting, &shared); ++ } ++ UNLOCK (&local->inode->lock); ++ ++ afr_lock_fail_shared (local, &shared); ++ local->transaction.do_eager_unlock = _gf_true; ++out: ++ if (locked) { ++ local->internal_lock.lock_cbk = afr_transaction_done; ++ afr_unlock (local->transaction.frame, ++ local->transaction.frame->this); ++ } else { ++ afr_transaction_done (local->transaction.frame, ++ local->transaction.frame->this); ++ } ++} + + call_frame_t* + afr_transaction_detach_fop_frame (call_frame_t *frame) +@@ -340,6 +424,7 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this) + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_sources_count = 0; ++ int i = 0; + + priv = this->private; + local = frame->local; +@@ -351,11 +436,11 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this) + /* If arbiter is the only source, do not proceed. */ + if (pre_op_sources_count < 2 && + local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { +- local->internal_lock.lock_cbk = afr_transaction_done; + local->op_ret = -1; + local->op_errno = ENOTCONN; +- afr_restore_lk_owner (frame); +- afr_unlock (frame, this); ++ for (i = 0; i < priv->child_count; i++) ++ local->transaction.failed_subvols[i] = 1; ++ afr_changelog_post_op (frame, this);/*uninherit should happen*/ + } else { + afr_transaction_fop (frame, this); + } +@@ -368,12 +453,14 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +- fd_t *fd = NULL; ++ int failure_count = 0; ++ struct list_head shared; ++ afr_lock_t *lock = NULL; + + local = frame->local; + priv = this->private; +- fd = local->fd; + ++ INIT_LIST_HEAD (&shared); + /* Perform fops with the lk-owner from top xlator. + * Eg: lk-owner of posix-lk and flush should be same, + * flush cant clear the posix-lks without that lk-owner. +@@ -387,22 +474,31 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) + just now, before OP */ + afr_changelog_pre_op_update (frame, this); + +- /* The wake up needs to happen independent of +- what type of fop arrives here. If it was +- a write, then it has already inherited the +- lock and changelog. If it was not a write, +- then the presumption of the optimization (of +- optimizing for successive write operations) +- fails. 
+- */ +- if (fd) +- afr_delayed_changelog_wake_up (this, fd); ++ if (!local->transaction.eager_lock_on || ++ local->transaction.inherited) ++ goto fop; ++ failure_count = AFR_COUNT (local->transaction.failed_subvols, ++ priv->child_count); ++ if (failure_count == priv->child_count) { ++ afr_handle_lock_acquire_failure (local, _gf_true); ++ } else { ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ LOCK (&local->inode->lock); ++ { ++ lock->acquired = _gf_true; ++ __afr_transaction_wake_shared (local, &shared); ++ } ++ UNLOCK (&local->inode->lock); ++ } ++ ++fop: + if (priv->arbiter_count == 1) { + afr_txn_arbitrate_fop (frame, this); + } else { + afr_transaction_fop (frame, this); + } + ++ afr_lock_resume_shared (&shared); + return 0; + } + +@@ -450,30 +546,14 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) + } + + +-afr_inodelk_t* +-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +-{ +- afr_inodelk_t *inodelk = NULL; +- int i = 0; +- +- for (i = 0; int_lock->inodelk[i].domain; i++) { +- inodelk = &int_lock->inodelk[i]; +- if (strcmp (dom, inodelk->domain) == 0) +- return inodelk; +- } +- return NULL; +-} +- + unsigned char* + afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) + { + unsigned char *locked_nodes = NULL; +- afr_inodelk_t *inodelk = NULL; + switch (type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- locked_nodes = inodelk->locked_nodes; ++ locked_nodes = int_lock->locked_nodes; + break; + + case AFR_ENTRY_TRANSACTION: +@@ -820,27 +900,19 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +- fd_t *fd = NULL; ++ afr_inode_ctx_t *ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; +- afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; + + local = frame->local; + priv = this->private; +- fd = local->fd; ++ ctx = local->inode_ctx; + + type = afr_index_for_transaction_type (local->transaction.type); + if (type != AFR_DATA_TRANSACTION) + return !local->transaction.dirtied; + +- if (!fd) +- return !local->transaction.dirtied; +- +- fd_ctx = afr_fd_ctx_get (fd, this); +- if (!fd_ctx) +- return _gf_false; +- + if (local->transaction.no_uninherit) + return _gf_false; + +@@ -854,34 +926,34 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) + if (local->transaction.uninherit_done) + return local->transaction.uninherit_value; + +- LOCK(&fd->lock); ++ LOCK(&local->inode->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != +- fd_ctx->pre_op_done[type][i]) { ++ ctx->pre_op_done[type][i]) { + ret = !local->transaction.dirtied; + goto unlock; + } + } + +- if (fd_ctx->inherited[type]) { ++ if (ctx->inherited[type]) { + ret = _gf_true; +- fd_ctx->inherited[type]--; +- } else if (fd_ctx->on_disk[type]) { ++ ctx->inherited[type]--; ++ } else if (ctx->on_disk[type]) { + ret = _gf_false; +- fd_ctx->on_disk[type]--; ++ ctx->on_disk[type]--; + } else { + /* ASSERT */ + ret = _gf_false; + } + +- if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) { ++ if (!ctx->inherited[type] && !ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) +- fd_ctx->pre_op_done[type][i] = 0; ++ ctx->pre_op_done[type][i] = 0; + } + } + unlock: +- UNLOCK(&fd->lock); ++ UNLOCK(&local->inode->lock); + + local->transaction.uninherit_done = _gf_true; + local->transaction.uninherit_value = ret; +@@ -895,31 +967,21 @@ 
afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +- fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; +- afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; + + local = frame->local; + priv = this->private; +- fd = local->fd; + + if (local->transaction.type != AFR_DATA_TRANSACTION) + return _gf_false; + + type = afr_index_for_transaction_type (local->transaction.type); + +- if (!fd) +- return _gf_false; +- +- fd_ctx = afr_fd_ctx_get (fd, this); +- if (!fd_ctx) +- return _gf_false; +- +- LOCK(&fd->lock); ++ LOCK(&local->inode->lock); + { +- if (!fd_ctx->on_disk[type]) { ++ if (!local->inode_ctx->on_disk[type]) { + /* nothing to inherit yet */ + ret = _gf_false; + goto unlock; +@@ -927,21 +989,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != +- fd_ctx->pre_op_done[type][i]) { ++ local->inode_ctx->pre_op_done[type][i]) { + /* either inherit exactly, or don't */ + ret = _gf_false; + goto unlock; + } + } + +- fd_ctx->inherited[type]++; ++ local->inode_ctx->inherited[type]++; + + ret = _gf_true; + + local->transaction.inherited = _gf_true; + } + unlock: +- UNLOCK(&fd->lock); ++ UNLOCK(&local->inode->lock); + + return ret; + } +@@ -952,22 +1014,16 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +- fd_t *fd = NULL; +- afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; + + local = frame->local; + priv = this->private; +- fd = local->fd; + +- if (!fd) +- return _gf_false; +- +- fd_ctx = afr_fd_ctx_get (fd, this); +- if (!fd_ctx) +- return _gf_false; ++ if (local->transaction.type == AFR_ENTRY_TRANSACTION || ++ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) ++ return _gf_false; + + if (local->transaction.inherited) + /* was already inherited in afr_changelog_pre_op */ +@@ -983,26 +1039,26 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) + + ret = _gf_false; + +- LOCK(&fd->lock); ++ LOCK(&local->inode->lock); + { +- if (!fd_ctx->on_disk[type]) { ++ if (!local->inode_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) +- fd_ctx->pre_op_done[type][i] = ++ local->inode_ctx->pre_op_done[type][i] = + (!local->transaction.failed_subvols[i]); + } else { + for (i = 0; i < priv->child_count; i++) +- if (fd_ctx->pre_op_done[type][i] != ++ if (local->inode_ctx->pre_op_done[type][i] != + (!local->transaction.failed_subvols[i])) { + local->transaction.no_uninherit = 1; + goto unlock; + } + } +- fd_ctx->on_disk[type]++; ++ local->inode_ctx->on_disk[type]++; + + ret = _gf_true; + } + unlock: +- UNLOCK(&fd->lock); ++ UNLOCK(&local->inode->lock); + + return ret; + } +@@ -1307,6 +1363,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) + + afr_init_optimistic_changelog_for_txn (this, local); + ++ if (afr_changelog_pre_op_inherit (frame, this)) ++ goto next; ++ + /* This condition should not be met with present code, as + * transaction.done will be called if locks are not acquired on even a + * single node. 
+@@ -1332,9 +1391,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) + goto err; + } + +- if (afr_changelog_pre_op_inherit (frame, this)) +- goto next; +- + if (call_count < priv->child_count) + pre_nop = _gf_false; + +@@ -1391,7 +1447,7 @@ err: + local->op_ret = -1; + local->op_errno = op_errno; + +- afr_unlock (frame, this); ++ afr_handle_lock_acquire_failure (local, _gf_true); + + if (xdata_req) + dict_unref (xdata_req); +@@ -1401,31 +1457,6 @@ err: + + + int +-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +-{ +- afr_internal_lock_t *int_lock = NULL; +- afr_local_t *local = NULL; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- if (int_lock->lock_op_ret < 0) { +- gf_msg (this->name, GF_LOG_INFO, +- 0, AFR_MSG_BLOCKING_LKS_FAILED, +- "Blocking inodelks failed."); +- afr_transaction_done (frame, this); +- } else { +- +- gf_msg_debug (this->name, 0, +- "Blocking inodelks done. Proceeding to FOP"); +- afr_internal_lock_finish (frame, this); +- } +- +- return 0; +-} +- +- +-int + afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) + { + afr_internal_lock_t *int_lock = NULL; +@@ -1438,7 +1469,7 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) + if (int_lock->lock_op_ret < 0) { + gf_msg_debug (this->name, 0, + "Non blocking inodelks failed. Proceeding to blocking"); +- int_lock->lock_cbk = afr_post_blocking_inodelk_cbk; ++ int_lock->lock_cbk = afr_internal_lock_finish; + afr_blocking_lock (frame, this); + } else { + +@@ -1452,31 +1483,6 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) + + + int +-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +-{ +- afr_internal_lock_t *int_lock = NULL; +- afr_local_t *local = NULL; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- if (int_lock->lock_op_ret < 0) { +- gf_msg (this->name, GF_LOG_INFO, 0, +- AFR_MSG_BLOCKING_LKS_FAILED, +- "Blocking entrylks failed."); +- afr_transaction_done (frame, this); +- } else { +- +- gf_msg_debug (this->name, 0, +- "Blocking entrylks done. Proceeding to FOP"); +- afr_internal_lock_finish (frame, this); +- } +- +- return 0; +-} +- +- +-int + afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) + { + afr_internal_lock_t *int_lock = NULL; +@@ -1489,7 +1495,7 @@ afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) + if (int_lock->lock_op_ret < 0) { + gf_msg_debug (this->name, 0, + "Non blocking entrylks failed. 
Proceeding to blocking"); +- int_lock->lock_cbk = afr_post_blocking_entrylk_cbk; ++ int_lock->lock_cbk = afr_internal_lock_finish; + afr_blocking_lock (frame, this); + } else { + +@@ -1550,29 +1556,27 @@ int + afr_set_transaction_flock (xlator_t *this, afr_local_t *local) + { + afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; + afr_private_t *priv = NULL; + + int_lock = &local->internal_lock; +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); + priv = this->private; + +- if (priv->arbiter_count && ++ if ((priv->arbiter_count || local->transaction.eager_lock_on) && + local->transaction.type == AFR_DATA_TRANSACTION) { + /*Lock entire file to avoid network split brains.*/ +- inodelk->flock.l_len = 0; +- inodelk->flock.l_start = 0; ++ int_lock->flock.l_len = 0; ++ int_lock->flock.l_start = 0; + } else { +- inodelk->flock.l_len = local->transaction.len; +- inodelk->flock.l_start = local->transaction.start; ++ int_lock->flock.l_len = local->transaction.len; ++ int_lock->flock.l_start = local->transaction.start; + } +- inodelk->flock.l_type = F_WRLCK; ++ int_lock->flock.l_type = F_WRLCK; + + return 0; + } + + int +-afr_lock_rec (call_frame_t *frame, xlator_t *this) ++afr_lock (call_frame_t *frame, xlator_t *this) + { + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; +@@ -1613,74 +1617,153 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) + return 0; + } + ++static gf_boolean_t ++afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) ++{ ++ uint64_t start1 = local1->transaction.start; ++ uint64_t start2 = local2->transaction.start; ++ uint64_t end1 = 0; ++ uint64_t end2 = 0; ++ ++ if (local1->transaction.len) ++ end1 = start1 + local1->transaction.len - 1; ++ else ++ end1 = ULLONG_MAX; ++ ++ if (local2->transaction.len) ++ end2 = start2 + local2->transaction.len - 1; ++ else ++ end2 = ULLONG_MAX; + +-int +-afr_lock (call_frame_t *frame, xlator_t *this) ++ return ((end1 >= start2) && (end2 >= start1)); ++} ++ ++gf_boolean_t ++afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check) + { +- afr_set_lock_number (frame, this); ++ afr_local_t *each = NULL; ++ afr_lock_t *lock = NULL; + +- return afr_lock_rec (frame, this); ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ /* ++ * Once full file lock is acquired in eager-lock phase, overlapping ++ * writes do not compete for inode-locks, instead are transferred to the ++ * next writes. Because of this overlapping writes are not ordered. ++ * This can cause inconsistencies in replication. ++ * Example: ++ * Two overlapping writes w1, w2 are sent in parallel on same fd ++ * in two threads t1, t2. ++ * Both threads can execute afr_writev_wind in the following manner. ++ * t1 winds w1 on brick-0 ++ * t2 winds w2 on brick-0 ++ * t2 winds w2 on brick-1 ++ * t1 winds w1 on brick-1 ++ * ++ * This check makes sure the locks are not transferred for ++ * overlapping writes. 
++ */ ++ list_for_each_entry (each, &lock->owners, transaction.owner_list) { ++ if (afr_locals_overlap (each, local)) { ++ return _gf_true; ++ } ++ } ++ ++ if (!waitlist_check) ++ return _gf_false; ++ list_for_each_entry (each, &lock->waiting, transaction.wait_list) { ++ if (afr_locals_overlap (each, local)) { ++ return _gf_true; ++ } ++ } ++ return _gf_false; + } + + + /* }}} */ +- +-int +-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) ++static void ++afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src, ++ xlator_t *this) + { +- afr_changelog_pre_op (frame, this); ++ afr_private_t *priv = this->private; + +- return 0; ++ dst->domain = src->domain; ++ dst->flock.l_len = src->flock.l_len; ++ dst->flock.l_start = src->flock.l_start; ++ dst->flock.l_type = src->flock.l_type; ++ dst->lock_count = src->lock_count; ++ memcpy (dst->locked_nodes, src->locked_nodes, ++ priv->child_count * sizeof (*dst->locked_nodes)); + } + +- + void +-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) ++__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared) + { +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; ++ gf_boolean_t conflict = _gf_false; ++ afr_local_t *each = NULL; ++ afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type]; + +- /* call this function from any of the related optimizations +- which benefit from delaying post op are enabled, namely: +- +- - changelog piggybacking +- - eager locking +- */ ++ while (!conflict) { ++ if (list_empty (&lock->waiting)) ++ return; ++ each = list_entry(lock->waiting.next, afr_local_t, ++ transaction.wait_list); ++ if (afr_has_lock_conflict (each, _gf_false)) { ++ conflict = _gf_true; ++ } ++ if (conflict && !list_empty (&lock->owners)) ++ return; ++ afr_copy_inodelk_vars (&each->internal_lock, ++ &local->internal_lock, ++ each->transaction.frame->this); ++ list_move_tail (&each->transaction.wait_list, shared); ++ list_add_tail(&each->transaction.owner_list, &lock->owners); ++ } ++} + +- priv = this->private; +- if (!priv) +- return; ++static void ++afr_lock_resume_shared (struct list_head *list) ++{ ++ afr_local_t *each = NULL; + +- if (!priv->post_op_delay_secs) +- return; ++ while (!list_empty(list)) { ++ each = list_entry(list->next, afr_local_t, ++ transaction.wait_list); ++ list_del_init(&each->transaction.wait_list); ++ afr_changelog_pre_op (each->transaction.frame, ++ each->transaction.frame->this); ++ } ++} + +- local = frame->local; +- if (!local) +- return; ++int ++afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) ++{ ++ afr_local_t *local = frame->local; ++ afr_lock_t *lock = NULL; + +- if (!local->transaction.eager_lock_on) +- return; + +- if (!local->fd) +- return; ++ local->internal_lock.lock_cbk = NULL; ++ if (!local->transaction.eager_lock_on) { ++ if (local->internal_lock.lock_op_ret < 0) { ++ afr_transaction_done (frame, this); ++ return 0; ++ } ++ afr_changelog_pre_op (frame, this); ++ } else { ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ if (local->internal_lock.lock_op_ret < 0) { ++ afr_handle_lock_acquire_failure (local, _gf_false); ++ } else { ++ lock->event_generation = local->event_generation; ++ afr_changelog_pre_op (frame, this); ++ } ++ } + +- if (local->op == GF_FOP_WRITE) +- local->delayed_post_op = _gf_true; ++ return 0; + } + + gf_boolean_t +-afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) ++afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this) + { +- afr_fd_ctx_t *fd_ctx = NULL; +- +- if 
(!fd) { +- /* If false is returned, it may keep on taking eager-lock +- * which may lead to starvation, so return true to avoid that. +- */ +- gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF, +- AFR_MSG_INVALID_ARG, "Invalid fd"); +- return _gf_true; +- } + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any data operations until mount1 releases eager-lock. +@@ -1688,11 +1771,7 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) + * if open-fd-count is > 1 + */ + +- fd_ctx = afr_fd_ctx_get (fd, this); +- if (!fd_ctx) +- return _gf_true; +- +- if (fd_ctx->open_fd_count > 1) ++ if (local->inode_ctx->open_fd_count > 1) + return _gf_true; + + return _gf_false; +@@ -1700,24 +1779,45 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) + + + gf_boolean_t +-is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) ++afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this, ++ int delay) + { +- afr_local_t *local = NULL; +- gf_boolean_t res = _gf_false; ++ afr_local_t *local = NULL; ++ afr_lock_t *lock = NULL; ++ gf_boolean_t res = _gf_false; + + local = frame->local; +- if (!local) ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ ++ if (!afr_txn_nothing_failed (frame, this)) { ++ lock->release = _gf_true; + goto out; ++ } + +- if (!local->delayed_post_op) ++ if (afr_are_multiple_fds_opened (local, this)) { ++ lock->release = _gf_true; + goto out; ++ } + +- //Mark pending changelog ASAP +- if (!afr_txn_nothing_failed (frame, this)) ++ if (!list_empty (&lock->owners)) + goto out; ++ else ++ GF_ASSERT (list_empty (&lock->waiting)); ++ ++ if (lock->release) { ++ goto out; ++ } ++ ++ if (!delay) { ++ goto out; ++ } + +- if (local->fd && afr_are_multiple_fds_opened (local->fd, this)) ++ if ((local->op != GF_FOP_WRITE) && ++ (local->op != GF_FOP_FXATTROP)) { ++ /*Only allow writes but shard does [f]xattrops on writes, so ++ * they are fine too*/ + goto out; ++ } + + res = _gf_true; + out: +@@ -1728,50 +1828,61 @@ out: + void + afr_delayed_changelog_wake_up_cbk (void *data) + { +- fd_t *fd = NULL; +- +- fd = data; ++ afr_lock_t *lock = NULL; ++ afr_local_t *local = data; ++ afr_local_t *timer_local = NULL; ++ struct list_head shared; + +- afr_delayed_changelog_wake_up (THIS, fd); ++ INIT_LIST_HEAD (&shared); ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ LOCK (&local->inode->lock); ++ { ++ timer_local = list_entry(lock->post_op.next, ++ afr_local_t, ++ transaction.owner_list); ++ if (list_empty (&lock->owners) && (local == timer_local)) { ++ GF_ASSERT (list_empty (&lock->waiting)); ++ /*Last owner*/ ++ lock->release = _gf_true; ++ lock->delay_timer = NULL; ++ } ++ } ++ UNLOCK (&local->inode->lock); ++ afr_changelog_post_op_now (local->transaction.frame, ++ local->transaction.frame->this); + } + + + /* SET operation */ + int +-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) ++afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local) + { +- afr_fd_ctx_t *fdctx = NULL; +- +- fdctx = afr_fd_ctx_get (fd, this); +- +- LOCK(&fd->lock); ++ LOCK(&local->inode->lock); + { +- fdctx->witnessed_unstable_write = _gf_true; ++ local->inode_ctx->witnessed_unstable_write = _gf_true; + } +- UNLOCK(&fd->lock); ++ UNLOCK(&local->inode->lock); + + return 0; + } + + /* TEST and CLEAR operation */ + gf_boolean_t +-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) ++afr_fd_has_witnessed_unstable_write (xlator_t *this, 
inode_t *inode) + { +- afr_fd_ctx_t *fdctx = NULL; ++ afr_inode_ctx_t *ctx = NULL; + gf_boolean_t witness = _gf_false; + +- fdctx = afr_fd_ctx_get (fd, this); +- if (!fdctx) +- return _gf_true; +- +- LOCK(&fd->lock); ++ LOCK(&inode->lock); + { +- if (fdctx->witnessed_unstable_write) { ++ (void)__afr_inode_ctx_get (this, inode, &ctx); ++ ++ if (ctx->witnessed_unstable_write) { + witness = _gf_true; +- fdctx->witnessed_unstable_write = _gf_false; ++ ctx->witnessed_unstable_write = _gf_false; + } + } +- UNLOCK (&fd->lock); ++ UNLOCK (&inode->lock); + + return witness; + } +@@ -1914,7 +2025,7 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + +- if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { ++ if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) { + afr_changelog_post_op_now (frame, this); + return 0; + } +@@ -1932,87 +2043,64 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) + return 0; + } + +- + void +-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, +- call_stub_t *stub) ++afr_changelog_post_op (call_frame_t *frame, xlator_t *this) + { +- afr_fd_ctx_t *fd_ctx = NULL; +- call_frame_t *prev_frame = NULL; +- struct timespec delta = {0, }; +- afr_private_t *priv = NULL; +- afr_local_t *local = NULL; ++ struct timespec delta = {0, }; ++ afr_private_t *priv = NULL; ++ afr_local_t *local = frame->local; ++ afr_lock_t *lock = NULL; ++ gf_boolean_t post_op = _gf_true; ++ struct list_head shared; + + priv = this->private; +- +- fd_ctx = afr_fd_ctx_get (fd, this); +- if (!fd_ctx) +- goto out; +- + delta.tv_sec = priv->post_op_delay_secs; + delta.tv_nsec = 0; + +- pthread_mutex_lock (&fd_ctx->delay_lock); +- { +- prev_frame = fd_ctx->delay_frame; +- fd_ctx->delay_frame = NULL; +- if (fd_ctx->delay_timer) +- gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); +- fd_ctx->delay_timer = NULL; +- if (!frame) +- goto unlock; +- fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, +- afr_delayed_changelog_wake_up_cbk, +- fd); +- fd_ctx->delay_frame = frame; +- } +-unlock: +- pthread_mutex_unlock (&fd_ctx->delay_lock); +- +-out: +- if (prev_frame) { +- local = prev_frame->local; +- local->transaction.resume_stub = stub; +- afr_changelog_post_op_now (prev_frame, this); +- } else if (stub) { +- call_resume (stub); +- } +-} +- +- +-void +-afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +-{ +- afr_local_t *local = NULL; +- +- local = frame->local; +- +- if (is_afr_delayed_changelog_post_op_needed (frame, this)) +- afr_delayed_changelog_post_op (this, frame, local->fd, NULL); +- else +- afr_changelog_post_op_safe (frame, this); +-} +- ++ INIT_LIST_HEAD (&shared); ++ if (!local->transaction.eager_lock_on) ++ goto out; + ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ LOCK (&local->inode->lock); ++ { ++ list_del_init (&local->transaction.owner_list); ++ list_add (&local->transaction.owner_list, &lock->post_op); ++ __afr_transaction_wake_shared (local, &shared); ++ ++ if (!afr_is_delayed_changelog_post_op_needed (frame, this, ++ delta.tv_sec)) { ++ if (list_empty (&lock->owners)) ++ lock->release = _gf_true; ++ goto unlock; ++ } + +-/* Wake up the sleeping/delayed post-op, and also register +- a stub to have it resumed after this transaction +- completely finishes. 
++ GF_ASSERT (lock->delay_timer == NULL); ++ lock->delay_timer = gf_timer_call_after (this->ctx, delta, ++ afr_delayed_changelog_wake_up_cbk, ++ local); ++ if (!lock->delay_timer) { ++ lock->release = _gf_true; ++ } else { ++ post_op = _gf_false; ++ } + +- The @stub gets saved in @local and gets resumed in +- afr_local_cleanup() +- */ +-void +-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +-{ +- afr_delayed_changelog_post_op (this, NULL, fd, stub); +-} ++ } ++unlock: ++ UNLOCK (&local->inode->lock); + ++ if (!list_empty (&shared)) { ++ afr_lock_resume_shared (&shared); ++ } + +-void +-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) +-{ +- afr_delayed_changelog_post_op (this, NULL, fd, NULL); ++out: ++ if (post_op) { ++ if (!local->transaction.eager_lock_on || lock->release) { ++ afr_changelog_post_op_safe (frame, this); ++ } else { ++ afr_changelog_post_op_now (frame, this); ++ } ++ } + } + + int +@@ -2022,13 +2110,6 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) + + local = frame->local; + +- if (local->transaction.eager_lock_on) { +- /* We don't need to retain "local" in the +- fd list anymore, writes to all subvols +- are finished by now */ +- afr_remove_eager_lock_stub (local); +- } +- + afr_restore_lk_owner (frame); + + afr_handle_symmetric_errors (frame, this); +@@ -2059,114 +2140,149 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + local->transaction.failed_subvols[child_index] = 1; + } + +- +- + static gf_boolean_t +-afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) ++__need_previous_lock_unlocked (afr_local_t *local) + { +- uint64_t start1 = local1->transaction.start; +- uint64_t start2 = local2->transaction.start; +- uint64_t end1 = 0; +- uint64_t end2 = 0; +- +- if (local1->transaction.len) +- end1 = start1 + local1->transaction.len - 1; +- else +- end1 = ULLONG_MAX; ++ afr_lock_t *lock = NULL; + +- if (local2->transaction.len) +- end2 = start2 + local2->transaction.len - 1; +- else +- end2 = ULLONG_MAX; ++ if (!local->transaction.eager_lock_on) ++ return _gf_true; + +- return ((end1 >= start2) && (end2 >= start1)); ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ if (!lock->acquired) ++ return _gf_false; ++ if (lock->acquired && lock->event_generation != local->event_generation) ++ return _gf_true; ++ return _gf_false; + } + + void +-afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) ++__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock, ++ gf_boolean_t *do_pre_op, afr_local_t **timer_local) + { +- afr_private_t *priv = NULL; +- afr_fd_ctx_t *fdctx = NULL; +- afr_local_t *each = NULL; +- +- priv = this->private; +- +- if (!local->fd) +- return; ++ afr_lock_t *lock = NULL; ++ afr_local_t *owner_local = NULL; ++ xlator_t *this = local->transaction.frame->this; + +- if (local->transaction.type != AFR_DATA_TRANSACTION) +- return; ++ if (local->fd && !afr_are_multiple_fds_opened (local, this)) { ++ local->transaction.eager_lock_on = _gf_true; ++ } + +- if (!priv->eager_lock) +- return; ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ if (__need_previous_lock_unlocked (local)) { ++ if (!list_empty (&lock->owners)) { ++ lock->release = _gf_true; ++ } else if (lock->delay_timer) { ++ lock->release = _gf_true; ++ if (gf_timer_call_cancel (this->ctx, ++ lock->delay_timer)) { ++ /* It will be put in frozen list ++ * in the code flow below*/ ++ } else { ++ *timer_local = list_entry(lock->post_op.next, ++ afr_local_t, ++ 
transaction.owner_list); ++ lock->delay_timer = NULL; ++ } ++ } ++ if (!local->transaction.eager_lock_on) ++ goto out; ++ } + +- fdctx = afr_fd_ctx_get (local->fd, this); +- if (!fdctx) +- return; ++ if (lock->release) { ++ list_add_tail (&local->transaction.wait_list, ++ &lock->frozen); ++ *take_lock = _gf_false; ++ goto out; ++ } + +- if (afr_are_multiple_fds_opened (local->fd, this)) +- return; +- /* +- * Once full file lock is acquired in eager-lock phase, overlapping +- * writes do not compete for inode-locks, instead are transferred to the +- * next writes. Because of this overlapping writes are not ordered. +- * This can cause inconsistencies in replication. +- * Example: +- * Two overlapping writes w1, w2 are sent in parallel on same fd +- * in two threads t1, t2. +- * Both threads can execute afr_writev_wind in the following manner. +- * t1 winds w1 on brick-0 +- * t2 winds w2 on brick-0 +- * t2 winds w2 on brick-1 +- * t1 winds w1 on brick-1 +- * +- * This check makes sure the locks are not transferred for +- * overlapping writes. +- */ +- LOCK (&local->fd->lock); +- { +- list_for_each_entry (each, &fdctx->eager_locked, +- transaction.eager_locked) { +- if (afr_locals_overlap (each, local)) { +- local->transaction.eager_lock_on = _gf_false; +- goto unlock; +- } ++ if (lock->delay_timer) { ++ *take_lock = _gf_false; ++ if (gf_timer_call_cancel (this->ctx, ++ lock->delay_timer)) { ++ list_add_tail (&local->transaction.wait_list, ++ &lock->frozen); ++ } else { ++ *timer_local = list_entry(lock->post_op.next, ++ afr_local_t, ++ transaction.owner_list); ++ afr_copy_inodelk_vars (&local->internal_lock, ++ &(*timer_local)->internal_lock, ++ this); ++ lock->delay_timer = NULL; ++ *do_pre_op = _gf_true; ++ list_add_tail (&local->transaction.owner_list, ++ &lock->owners); + } ++ goto out; ++ } + +- local->transaction.eager_lock_on = _gf_true; +- list_add_tail (&local->transaction.eager_locked, +- &fdctx->eager_locked); ++ if (!list_empty (&lock->owners)) { ++ if (!lock->acquired || ++ afr_has_lock_conflict (local, _gf_true)) { ++ list_add_tail (&local->transaction.wait_list, ++ &lock->waiting); ++ *take_lock = _gf_false; ++ goto out; ++ } ++ owner_local = list_entry (lock->owners.next, ++ afr_local_t, ++ transaction.owner_list); ++ afr_copy_inodelk_vars (&local->internal_lock, ++ &owner_local->internal_lock, ++ this); ++ *take_lock = _gf_false; ++ *do_pre_op = _gf_true; + } +-unlock: +- UNLOCK (&local->fd->lock); ++ ++ if (lock->acquired) ++ GF_ASSERT (!(*take_lock)); ++ list_add_tail (&local->transaction.owner_list, &lock->owners); ++out: ++ return; + } + + void +-afr_transaction_start (call_frame_t *frame, xlator_t *this) ++afr_transaction_start (afr_local_t *local, xlator_t *this) + { +- afr_local_t *local = frame->local; +- fd_t *fd = NULL; ++ afr_private_t *priv = NULL; ++ gf_boolean_t take_lock = _gf_true; ++ gf_boolean_t do_pre_op = _gf_false; ++ afr_local_t *timer_local = NULL; + +- afr_transaction_eager_lock_init (local, this); ++ priv = this->private; + +- if (local->fd && local->transaction.eager_lock_on) +- afr_set_lk_owner (frame, this, local->fd); +- else +- afr_set_lk_owner (frame, this, frame->root); ++ if (local->transaction.type != AFR_DATA_TRANSACTION && ++ local->transaction.type != AFR_METADATA_TRANSACTION) ++ goto lock_phase; + +- if (!local->transaction.eager_lock_on && local->loc.inode) { +- fd = fd_lookup (local->loc.inode, frame->root->pid); +- if (fd == NULL) +- fd = fd_lookup_anonymous (local->loc.inode, +- GF_ANON_FD_FLAGS); ++ if (!priv->eager_lock) ++ goto 
lock_phase; + +- if (fd) { +- afr_delayed_changelog_wake_up (this, fd); +- fd_unref (fd); +- } ++ LOCK (&local->inode->lock); ++ { ++ __afr_eager_lock_handle (local, &take_lock, &do_pre_op, ++ &timer_local); + } ++ UNLOCK (&local->inode->lock); ++lock_phase: ++ if (!local->transaction.eager_lock_on) { ++ afr_set_lk_owner (local->transaction.frame, this, ++ local->transaction.frame->root); ++ } else { ++ afr_set_lk_owner (local->transaction.frame, this, local->inode); ++ } ++ + +- afr_lock (frame, this); ++ if (take_lock) { ++ afr_lock (local->transaction.frame, this); ++ } else if (do_pre_op) { ++ afr_changelog_pre_op (local->transaction.frame, this); ++ } ++ /*Always call delayed_changelog_wake_up_cbk after calling pre-op above ++ * so that any inheriting can happen*/ ++ if (timer_local) ++ afr_delayed_changelog_wake_up_cbk (timer_local); + } + + int +@@ -2179,7 +2295,7 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) + goto fail; + } + +- afr_transaction_start (frame, this); ++ afr_transaction_start (local, this); + return 0; + fail: + local->transaction.unwind (frame, this); +@@ -2197,6 +2313,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) + + local = frame->local; + priv = this->private; ++ local->transaction.frame = frame; + + local->transaction.type = type; + +@@ -2204,11 +2321,10 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) + if (ret < 0) + goto out; + ++ + if (type == AFR_ENTRY_TRANSACTION || + type == AFR_ENTRY_RENAME_TRANSACTION) { +- afr_transaction_start (frame, this); +- ret = 0; +- goto out; ++ goto txn_start; + } + + ret = afr_inode_get_readable (frame, local->inode, this, +@@ -2218,10 +2334,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) + event_generation)) { + afr_inode_refresh (frame, this, local->inode, local->loc.gfid, + afr_write_txn_refresh_done); +- } else { +- afr_transaction_start (frame, this); ++ ret = 0; ++ goto out; + } ++ ++txn_start: + ret = 0; ++ afr_transaction_start (local, this); + out: + return ret; + } +diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h +index ddcb1eb..a27e9a3 100644 +--- a/xlators/cluster/afr/src/afr-transaction.h ++++ b/xlators/cluster/afr/src/afr-transaction.h +@@ -17,12 +17,6 @@ void + afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + int child_index); + +-int +-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +- +-afr_inodelk_t* +-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); +- + int32_t + afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); + +@@ -30,9 +24,6 @@ int + afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending); + + void +-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); +- +-void + afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); + + void +@@ -57,4 +48,8 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv, + inode_t *inode2, unsigned char *readable2); + int + afr_transaction_resume (call_frame_t *frame, xlator_t *this); ++int ++afr_lock (call_frame_t *frame, xlator_t *this); ++void ++afr_delayed_changelog_wake_up_cbk (void *data); + #endif /* __TRANSACTION_H__ */ +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index dec2a37..ef8de3e 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -217,19 +217,12 @@ int + afr_entry_lockee_cmp 
(const void *l1, const void *l2); + + typedef struct { +- char *domain; /* Domain on which inodelk is taken */ +- struct gf_flock flock; +- unsigned char *locked_nodes; +- int32_t lock_count; +-} afr_inodelk_t; +- +-typedef struct { + loc_t *lk_loc; + + int lockee_count; + afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + +- afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; ++ struct gf_flock flock; + const char *lk_basename; + const char *lower_basename; + const char *higher_basename; +@@ -242,7 +235,6 @@ typedef struct { + int32_t lock_count; + int32_t entrylk_lock_count; + +- uint64_t lock_number; + int32_t lk_call_count; + int32_t lk_expected_count; + int32_t lk_attempted_count; +@@ -279,37 +271,9 @@ typedef enum { + } afr_fd_open_status_t; + + typedef struct { +- unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; +- int inherited[AFR_NUM_CHANGE_LOGS]; +- int on_disk[AFR_NUM_CHANGE_LOGS]; + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ +- +- unsigned int *lock_piggyback; +- unsigned int *lock_acquired; +- + int flags; + +- /* used for delayed-post-op optimization */ +- pthread_mutex_t delay_lock; +- gf_timer_t *delay_timer; +- call_frame_t *delay_frame; +- +- /* set if any write on this fd was a non stable write +- (i.e, without O_SYNC or O_DSYNC) +- */ +- gf_boolean_t witnessed_unstable_write; +- +- /* @open_fd_count: +- Number of open FDs queried from the server, as queried through +- xdata in FOPs. Currently, used to decide if eager-locking must be +- temporarily disabled. +- */ +- uint32_t open_fd_count; +- +- +- /* list of frames currently in progress */ +- struct list_head eager_locked; +- + /* the subvolume on which the latest sequence of readdirs (starting + at offset 0) has begun. Till the next readdir request with 0 offset + arrives, we continue to read off this subvol. +@@ -323,6 +287,44 @@ typedef enum { + AFR_FOP_LOCK_QUORUM_FAILED, + } afr_fop_lock_state_t; + ++typedef struct _afr_inode_lock_t { ++ unsigned int event_generation; ++ gf_boolean_t release; ++ gf_boolean_t acquired; ++ gf_timer_t *delay_timer; ++ struct list_head owners; /*Transactions that are performing fop*/ ++ struct list_head post_op;/*Transactions that are done with the fop ++ *So can not conflict with the fops*/ ++ struct list_head waiting;/*Transaction that are waiting for ++ *conflicting transactions to complete*/ ++ struct list_head frozen;/*Transactions that need to go as part of ++ * next batch of eager-lock*/ ++} afr_lock_t; ++ ++typedef struct _afr_inode_ctx { ++ uint64_t read_subvol; ++ int spb_choice; ++ gf_timer_t *timer; ++ gf_boolean_t need_refresh; ++ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; ++ int inherited[AFR_NUM_CHANGE_LOGS]; ++ int on_disk[AFR_NUM_CHANGE_LOGS]; ++ ++ /* set if any write on this fd was a non stable write ++ (i.e, without O_SYNC or O_DSYNC) ++ */ ++ gf_boolean_t witnessed_unstable_write; ++ ++ /* @open_fd_count: ++ Number of open FDs queried from the server, as queried through ++ xdata in FOPs. Currently, used to decide if eager-locking must be ++ temporarily disabled. ++ */ ++ uint32_t open_fd_count; ++ /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/ ++ afr_lock_t lock[2]; ++} afr_inode_ctx_t; ++ + typedef struct _afr_local { + glusterfs_fop_t op; + unsigned int call_count; +@@ -434,7 +436,6 @@ typedef struct _afr_local { + dict_t *dict; + + int optimistic_change_log; +- gf_boolean_t delayed_post_op; + + /* Is the current writev() going to perform a stable write? 
+ i.e, is fd->flags or @flags writev param have O_SYNC or +@@ -669,7 +670,7 @@ typedef struct _afr_local { + off_t start, len; + + gf_boolean_t eager_lock_on; +- int *eager_lock; ++ gf_boolean_t do_eager_unlock; + + char *basename; + char *new_basename; +@@ -683,7 +684,8 @@ typedef struct _afr_local { + of the transaction frame */ + call_stub_t *resume_stub; + +- struct list_head eager_locked; ++ struct list_head owner_list; ++ struct list_head wait_list; + + unsigned char *pre_op; + +@@ -744,7 +746,8 @@ typedef struct _afr_local { + */ + afr_changelog_resume_t changelog_resume; + +- call_frame_t *main_frame; ++ call_frame_t *main_frame; /*Fop frame*/ ++ call_frame_t *frame; /*Transaction frame*/ + + int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); + +@@ -774,16 +777,9 @@ typedef struct _afr_local { + afr_fop_lock_state_t fop_lock_state; + + gf_boolean_t is_read_txn; ++ afr_inode_ctx_t *inode_ctx; + } afr_local_t; + +- +-typedef struct _afr_inode_ctx { +- uint64_t read_subvol; +- int spb_choice; +- gf_timer_t *timer; +- gf_boolean_t need_refresh; +-} afr_inode_ctx_t; +- + typedef struct afr_spbc_timeout { + call_frame_t *frame; + gf_boolean_t d_spb; +@@ -982,7 +978,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); + afr_local_cleanup (frame->local, THIS); \ + mem_put (frame->local); \ + frame->local = NULL; }; \ +- frame->local;}) ++ frame->local; }) + + #define AFR_STACK_RESET(frame) \ + do { \ +@@ -1069,22 +1065,10 @@ afr_filter_xattrs (dict_t *xattr); + #define AFR_QUORUM_AUTO INT_MAX + + int +-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); ++afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local); + + gf_boolean_t +-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); +- +-void +-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); +- +-int +-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); +- +-void +-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); +- +-void +-afr_remove_eager_lock_stub (afr_local_t *local); ++afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode); + + void + afr_replies_wipe (struct afr_reply *replies, int count); +@@ -1174,4 +1158,9 @@ int + afr_serialize_xattrs_with_delimiter (call_frame_t *frame, xlator_t *this, + char *buf, const char *default_str, + int32_t *serz_len, char delimiter); ++int ++__afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx); ++ ++int ++afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode); + #endif /* __AFR_H__ */ +-- +1.8.3.1 + diff --git a/SOURCES/0673-cluster-afr-Switch-to-active-fd-count-for-open-fd-ch.patch b/SOURCES/0673-cluster-afr-Switch-to-active-fd-count-for-open-fd-ch.patch new file mode 100644 index 0000000..25d559b --- /dev/null +++ b/SOURCES/0673-cluster-afr-Switch-to-active-fd-count-for-open-fd-ch.patch @@ -0,0 +1,85 @@ +From c73641d9d7647ea63b5f9f8f6a9aced88e243d0c Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Mon, 19 Mar 2018 15:26:40 +0530 +Subject: [PATCH 673/675] cluster/afr: Switch to active-fd-count for open-fd + checks + + >BUG: 1557932 + +Change-Id: I1476ebc84336250f10c82ad913eba88a575c9913 +Upstream-patch: https://review.gluster.org/19741 +BUG: 1583733 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/140578 +Tested-by: RHGS Build Bot +Tested-by: Sunil Kumar Heggodu Gopala Acharya +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/basic/afr/afr-no-fsync.t | 20 ++++++++++++++++++++ + 
xlators/cluster/afr/src/afr-inode-write.c | 16 ++++++++-------- + 2 files changed, 28 insertions(+), 8 deletions(-) + create mode 100644 tests/basic/afr/afr-no-fsync.t + +diff --git a/tests/basic/afr/afr-no-fsync.t b/tests/basic/afr/afr-no-fsync.t +new file mode 100644 +index 0000000..0966d9b +--- /dev/null ++++ b/tests/basic/afr/afr-no-fsync.t +@@ -0,0 +1,20 @@ ++#!/bin/bash ++#Tests that sequential write workload doesn't lead to FSYNCs ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,3} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 performance.flush-behind off ++TEST $CLI volume start $V0 ++TEST $CLI volume profile $V0 start ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++TEST dd if=/dev/zero of=$M0/a bs=1M count=500 ++TEST ! "$CLI volume profile $V0 info incremental | grep FSYNC" ++ ++cleanup; +diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c +index 33c1015..5eb2964 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.c ++++ b/xlators/cluster/afr/src/afr-inode-write.c +@@ -314,10 +314,10 @@ afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + if (ret || !write_is_append) + local->append_write = _gf_false; + +- ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, +- &open_fd_count); +- if (ret == -1) +- goto unlock; ++ ret = dict_get_uint32 (xdata, GLUSTERFS_ACTIVE_FD_COUNT, ++ &open_fd_count); ++ if (ret < 0) ++ goto unlock; + if (open_fd_count > local->open_fd_count) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; +@@ -530,10 +530,10 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + if (ret) + goto out; + +- if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) { +- op_errno = ENOMEM; +- goto out; +- } ++ if (dict_set_uint32 (local->xdata_req, GLUSTERFS_ACTIVE_FD_COUNT, 4)) { ++ op_errno = ENOMEM; ++ goto out; ++ } + + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { + op_errno = ENOMEM; + goto out; +-- +1.8.3.1 + diff --git a/SOURCES/0674-afr-fixes-to-afr-eager-locking.patch b/SOURCES/0674-afr-fixes-to-afr-eager-locking.patch new file mode 100644 index 0000000..d563846 --- /dev/null +++ b/SOURCES/0674-afr-fixes-to-afr-eager-locking.patch @@ -0,0 +1,80 @@ +From d09e4d40ed3a41e1a468a2da851e59c0ce6ea228 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Mon, 16 Apr 2018 15:38:34 +0530 +Subject: [PATCH 674/675] afr: fixes to afr-eager locking + +Upstream patch: https://review.gluster.org/#/c/19879/ + +1. If pre-op fails on all bricks, set lock->release to true in +afr_handle_lock_acquire_failure so that the GF_ASSERT in afr_unlock() does not +crash. + +2. Added a missing 'return' after handling pre-op failure in +afr_transaction_perform_fop(), fixing a use-after-free issue.
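To make the second fix easier to follow, here is a minimal C sketch of the affected control flow; it is not compilable on its own, the identifiers are taken from the hunk in this patch, and the comments and surrounding structure are paraphrased:

    failure_count = AFR_COUNT (local->transaction.failed_subvols,
                               priv->child_count);
    if (failure_count == priv->child_count) {
            /* pre-op failed on every brick: tear the transaction down */
            afr_handle_lock_acquire_failure (local, _gf_true);
            return 0; /* the missing return: without it, execution fell
                       * through and later dereferenced transaction state
                       * that the failure path had already released */
    } else {
            /* normal path: mark the lock acquired and wake waiters */
    }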
+ +BUG: 1583733 +Change-Id: Iae6572f6ca3c4e9c03becb7eef0fb2033c3ff0e5 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/140579 +Tested-by: Sunil Kumar Heggodu Gopala Acharya +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/replicate/bug-1561129-enospc.t | 24 ++++++++++++++++++++++++ + xlators/cluster/afr/src/afr-transaction.c | 2 ++ + 2 files changed, 26 insertions(+) + create mode 100644 tests/bugs/replicate/bug-1561129-enospc.t + +diff --git a/tests/bugs/replicate/bug-1561129-enospc.t b/tests/bugs/replicate/bug-1561129-enospc.t +new file mode 100644 +index 0000000..1b402fc +--- /dev/null ++++ b/tests/bugs/replicate/bug-1561129-enospc.t +@@ -0,0 +1,24 @@ ++#!/bin/bash ++#Tests that sequential write workload doesn't lead to FSYNCs ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup; ++ ++TEST truncate -s 128M $B0/xfs_image ++TEST mkfs.xfs -f $B0/xfs_image ++TEST mkdir $B0/bricks ++TEST mount -t xfs -o loop $B0/xfs_image $B0/bricks ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 3 $H0:$B0/bricks/brick{0,1,3} ++TEST $CLI volume start $V0 ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++ ++# Write 50MB of data, which will try to consume 50x3=150MB on $B0/bricks. ++# Before that, we hit ENOSPC in pre-op cbk, which should not crash the mount. ++TEST ! dd if=/dev/zero of=$M0/a bs=1M count=50 ++TEST stat $M0/a ++cleanup; +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index caa83c8..d19795d 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -263,6 +263,7 @@ afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked) + INIT_LIST_HEAD (&shared); + LOCK (&local->inode->lock); + { ++ lock->release = _gf_true; + list_splice_init (&lock->waiting, &shared); + } + UNLOCK (&local->inode->lock); +@@ -481,6 +482,7 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) + priv->child_count); + if (failure_count == priv->child_count) { + afr_handle_lock_acquire_failure (local, _gf_true); ++ return 0; + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK (&local->inode->lock); +-- +1.8.3.1 + diff --git a/SOURCES/0675-block-profile-enable-cluster.eager-lock-in-block-pro.patch b/SOURCES/0675-block-profile-enable-cluster.eager-lock-in-block-pro.patch new file mode 100644 index 0000000..6afb9c2 --- /dev/null +++ b/SOURCES/0675-block-profile-enable-cluster.eager-lock-in-block-pro.patch @@ -0,0 +1,41 @@ +From fe41cdfbff3ce17df3902e21261911f6677ee7d6 Mon Sep 17 00:00:00 2001 +From: Prasanna Kumar Kalever +Date: Fri, 20 Apr 2018 17:26:12 +0530 +Subject: [PATCH 675/675] block-profile: enable cluster.eager-lock in + block-profile + +Eager-lock gave a 2.5X perf improvement. On top of that, with the +batching fix in tcmu-runner and client-io-threads enabled, we are +seeing close to a 3X perf improvement. However, client-io-threads +should be enabled on a case-by-case basis rather than in the default +profile, so that option is not added here.
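For context, a group profile such as this one is applied per volume through the gluster CLI; a brief usage sketch (VOLNAME is a placeholder):

    # apply the gluster-block profile, which now enables eager-lock
    gluster volume set VOLNAME group gluster-block
    # verify the effective value afterwards
    gluster volume get VOLNAME cluster.eager-lock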
+ >BUG: 1573119 + +Upstream-patch: https://review.gluster.org/19913 +BUG: 1583733 +Change-Id: Ida53c3ef9a041a73b65fdd06158ac082da437206 +Signed-off-by: Prasanna Kumar Kalever +Reviewed-on: https://code.engineering.redhat.com/gerrit/140580 +Tested-by: RHGS Build Bot +Tested-by: Sunil Kumar Heggodu Gopala Acharya +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/group-gluster-block | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/extras/group-gluster-block b/extras/group-gluster-block +index e94f834..933dd32 100644 +--- a/extras/group-gluster-block ++++ b/extras/group-gluster-block +@@ -6,7 +6,7 @@ performance.open-behind=off + performance.readdir-ahead=off + performance.strict-o-direct=on + network.remote-dio=disable +-cluster.eager-lock=disable ++cluster.eager-lock=enable + cluster.quorum-type=auto + cluster.data-self-heal-algorithm=full + cluster.locking-scheme=granular +-- +1.8.3.1 + diff --git a/SOURCES/0676-storage-posix-Handle-ENOSPC-correctly-in-zero_fill.patch b/SOURCES/0676-storage-posix-Handle-ENOSPC-correctly-in-zero_fill.patch new file mode 100644 index 0000000..f56b69a --- /dev/null +++ b/SOURCES/0676-storage-posix-Handle-ENOSPC-correctly-in-zero_fill.patch @@ -0,0 +1,188 @@ +From f4bfa2d984edf17ca85f19b2a2adb190bdade6fc Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Wed, 13 Jun 2018 12:17:28 +0530 +Subject: [PATCH 676/678] storage/posix: Handle ENOSPC correctly in zero_fill + + Upstream patch: https://review.gluster.org/20254 + +BUG: 1594656 +Change-Id: Icc521d86cc510f88b67d334b346095713899087a +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/142310 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + tests/basic/posix/zero-fill-enospace.c | 64 ++++++++++++++++++++++++++++++++++ + tests/basic/posix/zero-fill-enospace.t | 35 +++++++++++++++++++ + xlators/storage/posix/src/posix.c | 23 +++++++++++- + 3 files changed, 121 insertions(+), 1 deletion(-) + create mode 100644 tests/basic/posix/zero-fill-enospace.c + create mode 100644 tests/basic/posix/zero-fill-enospace.t + +diff --git a/tests/basic/posix/zero-fill-enospace.c b/tests/basic/posix/zero-fill-enospace.c +new file mode 100644 +index 0000000..b1aaa57 +--- /dev/null ++++ b/tests/basic/posix/zero-fill-enospace.c +@@ -0,0 +1,64 @@ ++#include ++#include ++#include ++ ++int ++main (int argc, char *argv[]) ++{ ++ glfs_t *fs = NULL; ++ glfs_fd_t *fd = NULL; ++ int ret = 1; ++ int size = 0; ++ ++ if (argc != 6) { ++ fprintf (stderr, "Syntax: %s \n", argv[0]); ++ return 1; ++ } ++ ++ fs = glfs_new (argv[2]); ++ if (!fs) { ++ fprintf (stderr, "glfs_new: returned NULL\n"); ++ return 1; ++ } ++ ++ ret = glfs_set_volfile_server (fs, "tcp", argv[1], 24007); ++ if (ret != 0) { ++ fprintf (stderr, "glfs_set_volfile_server: retuned %d\n", ret); ++ goto out; ++ } ++ ret = glfs_set_logging (fs, argv[4], 7); ++ if (ret != 0) { ++ fprintf (stderr, "glfs_set_logging: returned %d\n", ret); ++ goto out; ++ } ++ ret = glfs_init (fs); ++ if (ret != 0) { ++ fprintf (stderr, "glfs_init: returned %d\n", ret); ++ goto out; ++ } ++ ++ fd = glfs_open (fs, argv[3], O_RDWR); ++ if (fd == NULL) { ++ fprintf (stderr, "glfs_open: returned NULL\n"); ++ goto out; ++ } ++ ++ size = atoi(argv[5]); ++ if (size < 0) { ++ fprintf (stderr, "Wrong size %s", argv[5]); ++ goto out; ++ } ++ ret = glfs_zerofill (fd, 0, atoi(argv[5])); ++ if (ret <= 0) { ++ fprintf (stderr, "glfs_zerofill: returned %d\n", ret); ++ goto out; ++ } ++ ++ ret = 0; ++ ++out: ++ if (fd) ++ 
glfs_close(fd); ++ glfs_fini (fs); ++ return ret; ++} +diff --git a/tests/basic/posix/zero-fill-enospace.t b/tests/basic/posix/zero-fill-enospace.t +new file mode 100644 +index 0000000..ac2e61b +--- /dev/null ++++ b/tests/basic/posix/zero-fill-enospace.t +@@ -0,0 +1,35 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../dht.rc ++ ++cleanup; ++ ++TEST glusterd; ++TEST pidof glusterd; ++ ++TEST truncate -s 100M $B0/brick1 ++ ++TEST L1=`SETUP_LOOP $B0/brick1` ++TEST MKFS_LOOP $L1 ++ ++TEST mkdir -p $B0/${V0}1 ++ ++TEST MOUNT_LOOP $L1 $B0/${V0}1 ++ ++TEST $CLI volume create $V0 $H0:$B0/${V0}1 ++ ++TEST $CLI volume start $V0; ++ ++TEST glusterfs -s $H0 --volfile-id=$V0 $M0 ++TEST touch $M0/foo ++TEST build_tester $(dirname $0)/zero-fill-enospace.c -lgfapi -Wall -O2 ++TEST ! $(dirname $0)/zero-fill-enospace $H0 $V0 /foo `gluster --print-logdir`/glfs-$V0.log 104857600 ++ ++TEST force_umount $M0 ++TEST $CLI volume stop $V0 ++UMOUNT_LOOP ${B0}/${V0}1 ++rm -f ${B0}/brick1 ++ ++cleanup +diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c +index b0d7037..84a5d78 100644 +--- a/xlators/storage/posix/src/posix.c ++++ b/xlators/storage/posix/src/posix.c +@@ -824,17 +824,32 @@ _posix_do_zerofill(int fd, off_t offset, off_t len, int o_direct) + op_ret = sys_writev (fd, vector, num_vect); + if (op_ret < 0) + goto err; ++ if (op_ret != (vect_size * num_vect)) { ++ op_ret = -1; ++ errno = ENOSPC; ++ goto err; ++ } + } + if (extra) { + op_ret = sys_writev (fd, vector, extra); + if (op_ret < 0) + goto err; ++ if (op_ret != (vect_size * extra)) { ++ op_ret = -1; ++ errno = ENOSPC; ++ goto err; ++ } + } + if (remain) { + vector[0].iov_len = remain; + op_ret = sys_writev (fd, vector , 1); + if (op_ret < 0) + goto err; ++ if (op_ret != remain) { ++ op_ret = -1; ++ errno = ENOSPC; ++ goto err; ++ } + } + err: + if (o_direct) +@@ -895,8 +910,14 @@ posix_do_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + */ + flags = FALLOC_FL_ZERO_RANGE; + ret = sys_fallocate (pfd->fd, flags, offset, len); +- if (ret == 0) ++ if (ret == 0) { + goto fsync; ++ } else { ++ ret = -errno; ++ if ((ret != -ENOSYS) && (ret != -EOPNOTSUPP)) { ++ goto out; ++ } ++ } + + ret = _posix_do_zerofill (pfd->fd, offset, len, pfd->flags & O_DIRECT); + if (ret < 0) { +-- +1.8.3.1 + diff --git a/SOURCES/0677-cluster-afr-Increase-the-lock-count-on-success.patch b/SOURCES/0677-cluster-afr-Increase-the-lock-count-on-success.patch new file mode 100644 index 0000000..701549c --- /dev/null +++ b/SOURCES/0677-cluster-afr-Increase-the-lock-count-on-success.patch @@ -0,0 +1,34 @@ +From 6ff5d5187500246e662421cef24cc81125c5c47f Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Wed, 20 Jun 2018 14:56:25 +0530 +Subject: [PATCH 677/678] cluster/afr: Increase the lock-count on success + +While backporting the eager-lock feature I missed this line. Because of +this, afr always falls back to blocking locks, thinking it did not get +all the nonblocking locks even when it did.
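A rough C sketch of the logic this one-liner completes; the two statements inside the else branch appear verbatim in the hunk in this patch, while the surrounding flow and comments are paraphrased:

    /* per-child reply to the nonblocking inodelk attempt */
    if (op_ret < 0) {
            /* this child did not grant the lock */
    } else {
            int_lock->locked_nodes[child_index] |= LOCKED_YES;
            int_lock->lock_count++; /* the missed line: count the grant */
    }
    /* once all replies are in, afr roughly compares lock_count against
     * the number of locks it wanted; with the counter never incremented
     * that check always failed, forcing the blocking-lock fallback */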
+ +BUG: 1594682 +Change-Id: I047aebd23528b872a6b556bcf3a770d612ae550d +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/142316 +Reviewed-by: Xavi Hernandez +Tested-by: RHGS Build Bot +--- + xlators/cluster/afr/src/afr-lk-common.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c +index 494a63c..0c3ac60 100644 +--- a/xlators/cluster/afr/src/afr-lk-common.c ++++ b/xlators/cluster/afr/src/afr-lk-common.c +@@ -976,6 +976,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + } + } else { + int_lock->locked_nodes[child_index] |= LOCKED_YES; ++ int_lock->lock_count++; + } + + call_count = --int_lock->lk_call_count; +-- +1.8.3.1 + diff --git a/SOURCES/0678-extras-group-add-database-workload-profile.patch b/SOURCES/0678-extras-group-add-database-workload-profile.patch new file mode 100644 index 0000000..e8b9c98 --- /dev/null +++ b/SOURCES/0678-extras-group-add-database-workload-profile.patch @@ -0,0 +1,102 @@ +From 8c1a79da71b6b99b70ded53b405fb534987c4107 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Thu, 28 Jun 2018 10:42:56 +0530 +Subject: [PATCH 678/678] extras/group : add database workload profile + +Running DB workload patterns with all perf xlators enabled by default has +resulted in some inconsistency issues. Based on the internal testing done by +Elko Kuric (ekuric@redhat.com), there is a certain set of perf xlators that +need to be turned off for Gluster to support this type of workload. + +The proposal is to leverage the group profile infrastructure to collect all +those tunables in one place, so that users just need to apply the profile to +the volume to use it for a database workload.
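Once the profile file is installed under the glusterd groups directory, applying it is a one-liner; a usage sketch (VOLNAME is a placeholder):

    # apply every tunable listed in the db-workload group in one shot
    gluster volume set VOLNAME group db-workload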
+ +Credits : Elko Kuric (ekuric@redhat.com) + +> upstream patch : https://review.gluster.org/#/c/20414/ + +>Change-Id: I8a50e915278ad4085b9aaa3f160a33af7c0b0444 +>fixes: bz#1596020 +>Signed-off-by: Atin Mukherjee + +Change-Id: I8a50e915278ad4085b9aaa3f160a33af7c0b0444 +BUG: 1596076 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/142750 +Tested-by: RHGS Build Bot +Reviewed-by: Milind Changire +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/Makefile.am | 4 +++- + extras/group-db-workload | 8 ++++++++ + glusterfs.spec.in | 5 +++++ + 3 files changed, 16 insertions(+), 1 deletion(-) + create mode 100644 extras/group-db-workload + +diff --git a/extras/Makefile.am b/extras/Makefile.am +index 5a340da..6f17906 100644 +--- a/extras/Makefile.am ++++ b/extras/Makefile.am +@@ -12,7 +12,7 @@ SUBDIRS = init.d systemd benchmarking hook-scripts $(OCF_SUBDIR) LinuxRPM \ + + confdir = $(sysconfdir)/glusterfs + conf_DATA = glusterfs-logrotate gluster-rsyslog-7.2.conf gluster-rsyslog-5.8.conf \ +- logger.conf.example glusterfs-georep-logrotate group-virt.example group-metadata-cache group-gluster-block group-nl-cache ++ logger.conf.example glusterfs-georep-logrotate group-virt.example group-metadata-cache group-gluster-block group-nl-cache group-db-workload + + voldir = $(sysconfdir)/glusterfs + vol_DATA = glusterd.vol +@@ -42,3 +42,5 @@ install-data-local: + $(DESTDIR)$(GLUSTERD_WORKDIR)/groups/gluster-block + $(INSTALL_DATA) $(top_srcdir)/extras/group-nl-cache \ + $(DESTDIR)$(GLUSTERD_WORKDIR)/groups/nl-cache ++ $(INSTALL_DATA) $(top_srcdir)/extras/group-db-workload \ ++ $(DESTDIR)$(GLUSTERD_WORKDIR)/groups/db-workload +diff --git a/extras/group-db-workload b/extras/group-db-workload +new file mode 100644 +index 0000000..c9caf21 +--- /dev/null ++++ b/extras/group-db-workload +@@ -0,0 +1,8 @@ ++performance.open-behind=off ++performance.write-behind=off ++performance.stat-prefetch=off ++performance.quick-read=off ++performance.strict-o-direct=on ++performance.read-ahead=off ++performance.io-cache=off ++performance.readdir-ahead=off +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index d1aa3ea..6d59fef 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -1062,6 +1062,7 @@ exit 0 + %exclude %{_sysconfdir}/glusterfs/group-metadata-cache + %exclude %{_sysconfdir}/glusterfs/group-nl-cache + %exclude %{_sysconfdir}/glusterfs/group-gluster-block ++%exclude %{_sysconfdir}/glusterfs/group-db-workload + %exclude %{_sysconfdir}/glusterfs/logger.conf.example + %exclude %_init_glusterd + %exclude %{_sysconfdir}/sysconfig/glusterd +@@ -1389,6 +1390,7 @@ exit 0 + %attr(0644,-,-) %{_sharedstatedir}/glusterd/groups/metadata-cache + %attr(0644,-,-) %{_sharedstatedir}/glusterd/groups/nl-cache + %attr(0644,-,-) %{_sharedstatedir}/glusterd/groups/gluster-block ++ %attr(0644,-,-) %{_sharedstatedir}/glusterd/groups/db-workload + %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/glusterfind + %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/glusterfind/.keys + %ghost %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/glustershd +@@ -2068,6 +2070,9 @@ fi + %endif + + %changelog ++* Fri Jun 29 2018 Atin Mukherjee ++- Added db group profile (#1596076) ++ + * Thu Mar 22 2018 Kotresh HR + - Added util-linux as dependency to georeplication rpm (#1544382) + +-- +1.8.3.1 + diff --git a/SOURCES/0679-glusterd-Introduce-daemon-log-level-cluster-wide-opt.patch b/SOURCES/0679-glusterd-Introduce-daemon-log-level-cluster-wide-opt.patch new file mode 100644 index 0000000..954dcef --- 
/dev/null +++ b/SOURCES/0679-glusterd-Introduce-daemon-log-level-cluster-wide-opt.patch @@ -0,0 +1,345 @@ +From 8239d2fec5c182c0501b2fc292024b15cb6bdc91 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Mon, 2 Jul 2018 20:48:22 +0530 +Subject: [PATCH 679/685] glusterd: Introduce daemon-log-level cluster wide + option + +This option, applicable to the node-level daemons, can be very helpful in +controlling the log level of these services. Please note that any daemon +started before this option is set to a specific value (other than INFO) +will need a restart for the change to take effect. + +> upstream patch : https://review.gluster.org/#/c/20442/ + +Please note there's a difference in the downstream delta. The op-version +has to be handled a bit differently, as we don't want to get into an +op-version (4_2_0) higher than RHGS 3.4.0. So we bump up the current +op-version of RHGS 3.3.1 by 1 to get that control, which is a downstream +only change. Marking this with the DOWNSTREAM ONLY label because of it. + +Label: DOWNSTREAM ONLY + +>Change-Id: I7f6d2620bab2b094c737f5cc816bc093e9c9c4c9 +>fixes: bz#1597473 +>Signed-off-by: Atin Mukherjee + +Change-Id: I45cf09dbccd27daf8c2bcfd06cc1557953ca0230 +BUG: 1597509 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/143106 +Reviewed-by: Sanju Rakonde +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/globals.h | 5 +- + tests/bugs/glusterd/daemon-log-level-option.t | 93 +++++++++++++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-handler.c | 3 +- + xlators/mgmt/glusterd/src/glusterd-messages.h | 10 ++- + xlators/mgmt/glusterd/src/glusterd-op-sm.c | 50 +++++++++++++ + xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c | 8 +++ + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 9 ++- + xlators/mgmt/glusterd/src/glusterd.h | 1 + + 8 files changed, 175 insertions(+), 4 deletions(-) + create mode 100644 tests/bugs/glusterd/daemon-log-level-option.t + +diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h +index caa7f64..6682dc2 100644 +--- a/libglusterfs/src/globals.h ++++ b/libglusterfs/src/globals.h +@@ -43,7 +43,7 @@ + */ + #define GD_OP_VERSION_MIN 1 /* MIN is the fresh start op-version, mostly + should not change */ +-#define GD_OP_VERSION_MAX GD_OP_VERSION_3_11_1 /* MAX VERSION is the maximum ++#define GD_OP_VERSION_MAX GD_OP_VERSION_3_11_2 /* MAX VERSION is the maximum + count in VME table, should + keep changing with + introduction of newer +@@ -93,6 +93,9 @@ + + #define GD_OP_VERSION_3_11_1 31101 /* Op-version for GlusterFS 3.11.1 */ + ++/* Downstream only change */ ++#define GD_OP_VERSION_3_11_2 31102 /* Op-version for RHGS 3.3.1-async */ ++ + #include "xlator.h" + + /* THIS */ +diff --git a/tests/bugs/glusterd/daemon-log-level-option.t b/tests/bugs/glusterd/daemon-log-level-option.t +new file mode 100644 +index 0000000..66e55e3 +--- /dev/null ++++ b/tests/bugs/glusterd/daemon-log-level-option.t +@@ -0,0 +1,93 @@ ++#!/bin/bash ++ +. 
$(dirname $0)/../../include.rc ++ ++function Info_messages_count() { ++ local shd_log=$1 ++ cat $shd_log | grep " I " | wc -l ++} ++ ++function Warning_messages_count() { ++ local shd_log=$1 ++ cat $shd_log | grep " W " | wc -l ++} ++ ++function Debug_messages_count() { ++ local shd_log=$1 ++ cat $shd_log | grep " D " | wc -l ++} ++ ++function Trace_messages_count() { ++ local shd_log=$1 ++ cat $shd_log | grep " T " | wc -l ++} ++ ++cleanup; ++ ++# Basic checks ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume info ++ ++# set cluster.daemon-log-level option to DEBUG ++TEST $CLI volume set all cluster.daemon-log-level DEBUG ++ ++#Create a 3X2 distributed-replicate volume ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1..6}; ++TEST $CLI volume start $V0 ++ ++# log should not have any trace messages ++EXPECT 0 Trace_messages_count "/var/log/glusterfs/glustershd.log" ++ ++# stop the volume and remove glustershd log ++TEST $CLI volume stop $V0 ++rm -f /var/log/glusterfs/glustershd.log ++ ++# set cluster.daemon-log-level option to INFO and start the volume ++TEST $CLI volume set all cluster.daemon-log-level INFO ++TEST $CLI volume start $V0 ++ ++# log should not have any debug messages ++EXPECT 0 Debug_messages_count "/var/log/glusterfs/glustershd.log" ++ ++# log should not have any trace messages ++EXPECT 0 Trace_messages_count "/var/log/glusterfs/glustershd.log" ++ ++# stop the volume and remove glustershd log ++TEST $CLI volume stop $V0 ++rm -f /var/log/glusterfs/glustershd.log ++ ++# set cluster.daemon-log-level option to WARNING and start the volume ++TEST $CLI volume set all cluster.daemon-log-level WARNING ++TEST $CLI volume start $V0 ++ ++# log should not have any info messages ++EXPECT 0 Info_messages_count "/var/log/glusterfs/glustershd.log" ++ ++# log should not have any debug messages ++EXPECT 0 Debug_messages_count "/var/log/glusterfs/glustershd.log" ++ ++# log should not have any trace messages ++EXPECT 0 Trace_messages_count "/var/log/glusterfs/glustershd.log" ++ ++# stop the volume and remove glustershd log ++TEST $CLI volume stop $V0 ++rm -f /var/log/glusterfs/glustershd.log ++ ++# set cluster.daemon-log-level option to ERROR and start the volume ++TEST $CLI volume set all cluster.daemon-log-level ERROR ++TEST $CLI volume start $V0 ++ ++# log should not have any info messages ++EXPECT 0 Info_messages_count "/var/log/glusterfs/glustershd.log" ++ ++# log should not have any warning messages ++EXPECT 0 Warning_messages_count "/var/log/glusterfs/glustershd.log" ++ ++# log should not have any debug messages ++EXPECT 0 Debug_messages_count "/var/log/glusterfs/glustershd.log" ++ ++# log should not have any trace messages ++EXPECT 0 Trace_messages_count "/var/log/glusterfs/glustershd.log" ++ ++cleanup +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index 0f97573..6d66301 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -4642,7 +4642,8 @@ gd_is_global_option (char *opt_key) + return (strcmp (opt_key, GLUSTERD_SHARED_STORAGE_KEY) == 0 || + strcmp (opt_key, GLUSTERD_QUORUM_RATIO_KEY) == 0 || + strcmp (opt_key, GLUSTERD_GLOBAL_OP_VERSION_KEY) == 0 || +- strcmp (opt_key, GLUSTERD_BRICK_MULTIPLEX_KEY) == 0); ++ strcmp (opt_key, GLUSTERD_BRICK_MULTIPLEX_KEY) == 0 || ++ strcmp (opt_key, GLUSTERD_DAEMON_LOG_LEVEL_KEY) == 0); + + out: + return _gf_false; +diff --git a/xlators/mgmt/glusterd/src/glusterd-messages.h b/xlators/mgmt/glusterd/src/glusterd-messages.h 
+index 0548bd2..a5dc6a3 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-messages.h ++++ b/xlators/mgmt/glusterd/src/glusterd-messages.h +@@ -41,7 +41,7 @@ + + #define GLUSTERD_COMP_BASE GLFS_MSGID_GLUSTERD + +-#define GLFS_NUM_MESSAGES 599 ++#define GLFS_NUM_MESSAGES 600 + + #define GLFS_MSGID_END (GLUSTERD_COMP_BASE + GLFS_NUM_MESSAGES + 1) + /* Messaged with message IDs */ +@@ -4857,6 +4857,14 @@ + */ + #define GD_MSG_CLIENTS_GET_STATE_FAILED (GLUSTERD_COMP_BASE + 599) + ++/*! ++ * @messageid ++ * @diagnosis ++ * @recommendedaction ++ * ++ */ ++#define GD_MSG_DAEMON_LOG_LEVEL_VOL_OPT_VALIDATE_FAIL (GLUSTERD_COMP_BASE + 600) ++ + /*------------*/ + + #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" +diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +index ab2886e..91df502 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c ++++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +@@ -84,6 +84,7 @@ glusterd_all_vol_opts valid_all_vol_opts[] = { + * TBD: Discuss the default value for this. Maybe this should be a + * dynamic value depending on the memory specifications per node */ + { GLUSTERD_BRICKMUX_LIMIT_KEY, "0"}, ++ { GLUSTERD_DAEMON_LOG_LEVEL_KEY, "INFO"}, + { NULL }, + }; + +@@ -865,6 +866,47 @@ out: + } + + static int ++glusterd_validate_daemon_log_level (char *key, char *value, char *errstr) ++{ ++ int32_t ret = -1; ++ xlator_t *this = NULL; ++ glusterd_conf_t *conf = NULL; ++ ++ this = THIS; ++ GF_VALIDATE_OR_GOTO ("glusterd", this, out); ++ ++ conf = this->private; ++ GF_VALIDATE_OR_GOTO (this->name, conf, out); ++ ++ GF_VALIDATE_OR_GOTO (this->name, key, out); ++ GF_VALIDATE_OR_GOTO (this->name, value, out); ++ GF_VALIDATE_OR_GOTO (this->name, errstr, out); ++ ++ ret = 0; ++ ++ if (strcmp (key, GLUSTERD_DAEMON_LOG_LEVEL_KEY)) { ++ goto out; ++ } ++ ++ if ((strcmp (value, "INFO")) && ++ (strcmp (value, "WARNING")) && ++ (strcmp (value, "DEBUG")) && ++ (strcmp (value, "TRACE")) && ++ (strcmp (value, "ERROR"))) { ++ snprintf (errstr, PATH_MAX, ++ "Invalid option(%s). 
Valid options " ++ "are 'INFO' or 'WARNING' or 'ERROR' or 'DEBUG' or " ++ " 'TRACE'", value); ++ gf_msg (this->name, GF_LOG_ERROR, EINVAL, ++ GD_MSG_INVALID_ENTRY, "%s", errstr); ++ ret = -1; ++ } ++ ++out: ++ return ret; ++} ++ ++static int + glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr) + { + int ret = -1; +@@ -1265,6 +1307,14 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr) + "storage volume options"); + goto out; + } ++ ret = glusterd_validate_daemon_log_level (key, value, errstr); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_DAEMON_LOG_LEVEL_VOL_OPT_VALIDATE_FAIL, ++ "Failed to validate daemon-log-level volume " ++ "options"); ++ goto out; ++ } + + if (!strcmp(key, "features.trash-dir") && trash_enabled) { + if (strchr (value, '/')) { +diff --git a/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c +index 9dcf503..03a2ee0 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c ++++ b/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c +@@ -150,6 +150,8 @@ glusterd_svc_start (glusterd_svc_t *svc, int flags, dict_t *cmdline) + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + char valgrind_logfile[PATH_MAX] = {0}; ++ char *log_level = NULL; ++ char daemon_log_level[30] = {0}; + + this = THIS; + GF_ASSERT (this); +@@ -190,6 +192,12 @@ glusterd_svc_start (glusterd_svc_t *svc, int flags, dict_t *cmdline) + "-S", svc->conn.sockpath, + NULL); + ++ if (dict_get_str (priv->opts, GLUSTERD_DAEMON_LOG_LEVEL_KEY, ++ &log_level) == 0) { ++ snprintf (daemon_log_level, 30, "--log-level=%s", log_level); ++ runner_add_arg (&runner, daemon_log_level); ++ } ++ + if (cmdline) + dict_foreach (cmdline, svc_add_args, (void *) &runner); + +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +index ddda66e..d894679 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +@@ -3295,6 +3295,13 @@ struct volopt_map_entry glusterd_volopt_map[] = { + "process. Also this option can't be set when the " + "brick-multiplexing feature is disabled." 
+ }, +- { .key = NULL ++ { .key = GLUSTERD_DAEMON_LOG_LEVEL_KEY, ++ .voltype = "mgmt/glusterd", ++ .type = GLOBAL_NO_DOC, ++ .value = "INFO", ++ .op_version = GD_OP_VERSION_3_11_2, ++ }, ++ { ++ .key = NULL + } + }; +diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h +index b94ccc9..3750fe8 100644 +--- a/xlators/mgmt/glusterd/src/glusterd.h ++++ b/xlators/mgmt/glusterd/src/glusterd.h +@@ -53,6 +53,7 @@ + #define GLUSTERD_SHARED_STORAGE_KEY "cluster.enable-shared-storage" + #define GLUSTERD_BRICK_MULTIPLEX_KEY "cluster.brick-multiplex" + #define GLUSTERD_BRICKMUX_LIMIT_KEY "cluster.max-bricks-per-process" ++#define GLUSTERD_DAEMON_LOG_LEVEL_KEY "cluster.daemon-log-level" + + #define GANESHA_HA_CONF CONFDIR "/ganesha-ha.conf" + #define GANESHA_EXPORT_DIRECTORY CONFDIR"/exports" +-- +1.8.3.1 + diff --git a/SOURCES/0680-glusterfsd-Do-not-process-GLUSTERD_BRICK_XLATOR_OP-i.patch b/SOURCES/0680-glusterfsd-Do-not-process-GLUSTERD_BRICK_XLATOR_OP-i.patch new file mode 100644 index 0000000..e14e33e --- /dev/null +++ b/SOURCES/0680-glusterfsd-Do-not-process-GLUSTERD_BRICK_XLATOR_OP-i.patch @@ -0,0 +1,75 @@ +From 6722966d5e73f52d702b2e4d9987c5c372767a89 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Mon, 2 Jul 2018 16:05:39 +0530 +Subject: [PATCH 680/685] glusterfsd: Do not process GLUSTERD_BRICK_XLATOR_OP + if graph is not ready + +Patch in upstream master: https://review.gluster.org/#/c/20435/ +Patch in release-3.12: https://review.gluster.org/#/c/20436/ + +Problem: +If glustershd gets restarted by glusterd due to node reboot/volume start force/ +or any thing that changes shd graph (add/remove brick), and index heal +is launched via CLI, there can be a chance that shd receives this IPC +before the graph is fully active. Thus when it accesses +glusterfsd_ctx->active, it crashes. + +Fix: +Since glusterd does not really wait for the daemons it spawned to be +fully initialized and can send the request as soon as rpc initialization has +succeeded, we just handle it at shd. If glusterfs_graph_activate() is +not yet done in shd but glusterd sends GD_OP_HEAL_VOLUME to shd, +we fail the request. + +Change-Id: If6cc07bc5455c4ba03458a36c28b63664496b17d +BUG: 1595752 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/143109 +Reviewed-by: Atin Mukherjee +--- + glusterfsd/src/glusterfsd-messages.h | 4 +++- + glusterfsd/src/glusterfsd-mgmt.c | 6 ++++++ + 2 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/glusterfsd/src/glusterfsd-messages.h b/glusterfsd/src/glusterfsd-messages.h +index e9c28f7..e38a88b 100644 +--- a/glusterfsd/src/glusterfsd-messages.h ++++ b/glusterfsd/src/glusterfsd-messages.h +@@ -36,7 +36,7 @@ + */ + + #define GLFS_COMP_BASE GLFS_MSGID_COMP_GLUSTERFSD +-#define GLFS_NUM_MESSAGES 37 ++#define GLFS_NUM_MESSAGES 38 + #define GLFS_MSGID_END (GLFS_COMP_BASE + GLFS_NUM_MESSAGES + 1) + /* Messaged with message IDs */ + #define glfs_msg_start_x GLFS_COMP_BASE, "Invalid: Start of messages" +@@ -109,6 +109,8 @@ + #define glusterfsd_msg_36 (GLFS_COMP_BASE + 36), "problem in xlator " \ + " loading." + #define glusterfsd_msg_37 (GLFS_COMP_BASE + 37), "failed to get dict value" ++#define glusterfsd_msg_38 (GLFS_COMP_BASE + 38), "Not processing brick-op no."\ ++ " %d since volume graph is not yet active." 
+
+ /*------------*/
+ #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
+diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c
+index bde49ba..076a68a 100644
+--- a/glusterfsd/src/glusterfsd-mgmt.c
++++ b/glusterfsd/src/glusterfsd-mgmt.c
+@@ -671,6 +671,12 @@ glusterfs_handle_translator_op (rpcsvc_request_t *req)
+
+         ctx = glusterfsd_ctx;
+         active = ctx->active;
++        if (!active) {
++                ret = -1;
++                gf_msg (this->name, GF_LOG_ERROR, EAGAIN, glusterfsd_msg_38,
++                        xlator_req.op);
++                goto out;
++        }
+         any = active->first;
+         input = dict_new ();
+         ret = dict_unserialize (xlator_req.input.input_val,
+--
+1.8.3.1
+
diff --git a/SOURCES/0681-geo-rep-Fix-for-EINVAL-errors-while-syncing-symlinks.patch b/SOURCES/0681-geo-rep-Fix-for-EINVAL-errors-while-syncing-symlinks.patch
new file mode 100644
index 0000000..45ed5b5
--- /dev/null
+++ b/SOURCES/0681-geo-rep-Fix-for-EINVAL-errors-while-syncing-symlinks.patch
@@ -0,0 +1,63 @@
+From a9c27ff4deeddf68acd37efdba9788e3119d2d92 Mon Sep 17 00:00:00 2001
+From: Kotresh HR
+Date: Tue, 3 Jul 2018 06:53:04 -0400
+Subject: [PATCH 681/685] geo-rep: Fix for EINVAL errors while syncing symlinks
+
+geo-rep goes faulty in the following scenario,
+failing to proceed further. It is the workload
+involving symlink, rename and creation of a
+non-symlink file with the same name:
+
+1. touch /mastermnt/file1
+2. ln -s "./file1" /mastermnt/symlink
+3. mv /mastermnt/symlink /mastermnt/rn_symlink
+4. mkdir /mastermnt/symlink
+
+Fixed the same.
+
+This is a partial logical backport of the below patch, which
+addresses the EINVAL errors. Note that this patch does not
+compare gfids, but the upstream patch does gfid comparison
+and other checks.
+
+Backport of:
+ > Patch: https://review.gluster.org/#/c/18011/
+ > BUG: 1432046
+ > Signed-off-by: Kotresh HR
+ > Change-Id: Iaa12d6f99de47b18e0650e7c4eb455f23f8390f2
+ > Reviewed-by: Aravinda VK
+
+BUG: 1590774
+Change-Id: Ib89a12f9c957254442117260aa26af337dcac6d4
+Signed-off-by: Kotresh HR
+Reviewed-on: https://code.engineering.redhat.com/gerrit/143031
+Reviewed-by: Aravinda Vishwanathapura Krishna Murthy
+Tested-by: RHGS Build Bot
+Reviewed-by: Atin Mukherjee
+---
+ geo-replication/syncdaemon/master.py | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
+index 52537ff..b251d3a 100644
+--- a/geo-replication/syncdaemon/master.py
++++ b/geo-replication/syncdaemon/master.py
+@@ -892,6 +892,15 @@ class GMasterChangelogMixin(GMasterCommon):
+                     entry_update()
+                     entries.append(edct(ty, stat=st, entry=en, gfid=gfid))
+                 elif ty == 'SYMLINK':
++                    # stat the name and check whether it's still symlink
++                    # or same named file/dir is created deleting the symlink
++                    st1 = lstat(en)
++                    if (isinstance(st1, int) or not
++                            stat.S_ISLNK(st1.st_mode)):
++                        logging.debug('file %s got purged in the interim'
++                                      % go)
++                        continue
++
+                     rl = errno_wrap(os.readlink, [en], [ENOENT], [ESTALE])
+                     if isinstance(rl, int):
+                         continue
+--
+1.8.3.1
+
diff --git a/SOURCES/0682-cluster-afr-Make-sure-lk-owner-is-assigned-at-the-ti.patch b/SOURCES/0682-cluster-afr-Make-sure-lk-owner-is-assigned-at-the-ti.patch
new file mode 100644
index 0000000..3e7f1b6
--- /dev/null
+++ b/SOURCES/0682-cluster-afr-Make-sure-lk-owner-is-assigned-at-the-ti.patch
@@ -0,0 +1,51 @@
+From 5dcb9acb287ad740ae80081211f5e92249beed75 Mon Sep 17 00:00:00 2001
+From: Pranith Kumar K
+Date: Tue, 3 Jul 2018 20:38:23 +0530
+Subject: [PATCH 682/685] cluster/afr: Make sure lk-owner is 
assigned at the
+ time of lock
+
+ Upstream patch: https://review.gluster.org/20455
+
+Problem:
+In the new eager-lock implementation the lk-owner is assigned after the
+'local' is added to the eager-lock list, so there exists a possibility
+of the lock being sent even before the lk-owner is assigned.
+
+Fix:
+Make sure to assign the lk-owner before adding local to the eager-lock list.
+
+BUG: 1597648
+Change-Id: I26d1b7bcf3e8b22531f1dc0b952cae2d92889ef2
+Signed-off-by: Pranith Kumar K
+Reviewed-on: https://code.engineering.redhat.com/gerrit/143174
+Reviewed-by: Ravishankar Narayanankutty
+Reviewed-by: Atin Mukherjee
+Tested-by: RHGS Build Bot
+---
+ xlators/cluster/afr/src/afr-transaction.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
+index d19795d..750e3f8 100644
+--- a/xlators/cluster/afr/src/afr-transaction.c
++++ b/xlators/cluster/afr/src/afr-transaction.c
+@@ -2168,6 +2168,7 @@ __afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock,
+
+         if (local->fd && !afr_are_multiple_fds_opened (local, this)) {
+                 local->transaction.eager_lock_on = _gf_true;
++                afr_set_lk_owner (local->transaction.frame, this, local->inode);
+         }
+
+         lock = &local->inode_ctx->lock[local->transaction.type];
+@@ -2271,8 +2272,6 @@ lock_phase:
+         if (!local->transaction.eager_lock_on) {
+                 afr_set_lk_owner (local->transaction.frame, this,
+                                   local->transaction.frame->root);
+-        } else {
+-                afr_set_lk_owner (local->transaction.frame, this, local->inode);
+         }
+
+
+--
+1.8.3.1
+
diff --git a/SOURCES/0683-glusterd-fix-client-io-threads-option-for-replicate-.patch b/SOURCES/0683-glusterd-fix-client-io-threads-option-for-replicate-.patch
new file mode 100644
index 0000000..805aad7
--- /dev/null
+++ b/SOURCES/0683-glusterd-fix-client-io-threads-option-for-replicate-.patch
@@ -0,0 +1,366 @@
+From c209e7656bc3eb80b210924a0f756eebd0befc60 Mon Sep 17 00:00:00 2001
+From: Ravishankar N
+Date: Thu, 5 Jul 2018 14:02:34 +0530
+Subject: [PATCH 683/685] glusterd: fix client io-threads option for replicate
+ volumes
+
+Backport of https://review.gluster.org/#/c/18430/5
+...but changes the op_version checks to GD_OP_VERSION_3_11_2
+
+This backport is for honouring `gluster v set client-io-threads`
+{on/off} on the rhgs-3.3.1 async release. After upgrading from an older version
+of rhgs to 331 or when upgrading from this version to rhgs-3.4.0, it is
+recommended to explicitly run the volume set after upgrading all nodes and
+bumping up the cluster op-version. As an additional check, please also check
+the fuse volfile to see if the io-threads xlator was loaded/unloaded depending
+on whether you did an 'on' or 'off' respectively.
+ +Change-Id: I47d5717bf137b01eea88678cca8624c3aabd8bb5 +BUG: 1598416 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/143250 +Reviewed-by: Atin Mukherjee +Tested-by: RHGS Build Bot +--- + .../replicate/bug-1498570-client-iot-graph-check.t | 49 ++++++++++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 34 +++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-handler.c | 7 ++-- + xlators/mgmt/glusterd/src/glusterd-utils.c | 20 +-------- + xlators/mgmt/glusterd/src/glusterd-utils.h | 3 +- + xlators/mgmt/glusterd/src/glusterd-volgen.c | 28 ++++++++----- + xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 34 +++++++++++++++ + 7 files changed, 141 insertions(+), 34 deletions(-) + create mode 100644 tests/bugs/replicate/bug-1498570-client-iot-graph-check.t + +diff --git a/tests/bugs/replicate/bug-1498570-client-iot-graph-check.t b/tests/bugs/replicate/bug-1498570-client-iot-graph-check.t +new file mode 100644 +index 0000000..4574ccb +--- /dev/null ++++ b/tests/bugs/replicate/bug-1498570-client-iot-graph-check.t +@@ -0,0 +1,49 @@ ++#!/bin/bash ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../afr.rc ++ ++TESTS_EXPECTED_IN_LOOP=21 ++function reset_cluster ++{ ++ cleanup ++ TEST glusterd ++ TEST pidof glusterd ++ ++} ++function check_iot_option ++{ ++ local enabled=$1 ++ local is_loaded_in_graph=$2 ++ ++ EXPECT "$enabled" volume_get_field $V0 client-io-threads ++ IOT_STRING="volume\ $V0-io-threads" ++ grep "$IOT_STRING" $GLUSTERD_WORKDIR/vols/$V0/trusted-$V0.tcp-fuse.vol ++ TEST ret=$? ++ EXPECT_NOT "$is_loaded_in_graph" echo $ret ++} ++ ++reset_cluster ++TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1} ++check_iot_option on 1 ++ ++reset_cluster ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} ++check_iot_option off 0 ++ ++reset_cluster ++TEST $CLI volume create $V0 $H0:$B0/${V0}0 ++TEST $CLI volume start $V0 ++TEST $CLI volume add-brick $V0 replica 2 $H0:$B0/${V0}1 ++check_iot_option off 0 ++TEST $CLI volume remove-brick $V0 replica 1 $H0:$B0/${V0}1 force ++check_iot_option on 1 ++ ++reset_cluster ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0..5} ++TEST $CLI volume set $V0 client-io-threads on ++check_iot_option on 1 ++TEST $CLI volume remove-brick $V0 replica 2 $H0:$B0/${V0}2 $H0:$B0/${V0}5 force ++check_iot_option on 1 ++ ++cleanup +diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +index f4cd927..fadbc00 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c ++++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +@@ -1415,6 +1415,24 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count, + /* Gets changed only if the options are given in add-brick cli */ + if (type) + volinfo->type = type; ++ /* performance.client-io-threads is turned on by default, ++ * however this has adverse effects on replicate volumes due to ++ * replication design issues, till that get addressed ++ * performance.client-io-threads option is turned off for all ++ * replicate volumes if not already explicitly enabled. 
++ */ ++ if (type && glusterd_is_volume_replicate (volinfo) && ++ conf->op_version >= GD_OP_VERSION_3_11_2) { ++ ret = dict_set_str (volinfo->dict, ++ "performance.client-io-threads", ++ "off"); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_DICT_SET_FAILED, "Failed to set " ++ "performance.client-io-threads to off"); ++ goto out; ++ } ++ } + + if (replica_count) { + volinfo->replica_count = replica_count; +@@ -2583,9 +2601,12 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) + char *cold_shd_key = NULL; + char *hot_shd_key = NULL; + int delete_key = 1; ++ glusterd_conf_t *conf = NULL; + + this = THIS; + GF_ASSERT (this); ++ conf = this->private; ++ GF_VALIDATE_OR_GOTO (this->name, conf, out); + + ret = dict_get_str (dict, "volname", &volname); + +@@ -2875,6 +2896,19 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) + volinfo->subvol_count = (volinfo->brick_count / + volinfo->dist_leaf_count); + ++ if (!glusterd_is_volume_replicate (volinfo) && ++ conf->op_version >= GD_OP_VERSION_3_11_2) { ++ ret = dict_set_str (volinfo->dict, ++ "performance.client-io-threads", ++ "on"); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ GD_MSG_DICT_SET_FAILED, "Failed to set " ++ "performance.client-io-threads to on"); ++ goto out; ++ } ++ } ++ + ret = glusterd_create_volfiles_and_notify_services (volinfo); + if (ret) { + gf_msg (this->name, GF_LOG_WARNING, 0, +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index 6d66301..6bcfc6b 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -4886,7 +4886,7 @@ glusterd_get_volume_opts (rpcsvc_request_t *req, dict_t *dict) + (dict, + _gf_false, + key, orig_key, +- volinfo, ++ volinfo->dict, + &rsp.op_errstr); + if (ret && !rsp.op_errstr) { + snprintf (err_str, +@@ -4912,7 +4912,7 @@ glusterd_get_volume_opts (rpcsvc_request_t *req, dict_t *dict) + } else { + /* Handle the "all" volume option request */ + ret = glusterd_get_default_val_for_volopt (dict, _gf_true, NULL, +- NULL, volinfo, ++ NULL, volinfo->dict, + &rsp.op_errstr); + if (ret && !rsp.op_errstr) { + snprintf (err_str, sizeof(err_str), +@@ -5505,7 +5505,8 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + vol_all_opts = dict_new (); + + ret = glusterd_get_default_val_for_volopt (vol_all_opts, +- _gf_true, NULL, NULL, volinfo, &rsp.op_errstr); ++ _gf_true, NULL, NULL, volinfo->dict, ++ &rsp.op_errstr); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + GD_MSG_VOL_OPTS_IMPORT_FAIL, "Failed to " +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index a04ed99..f219fd5 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -12368,8 +12368,7 @@ out: + int + glusterd_get_default_val_for_volopt (dict_t *ctx, gf_boolean_t all_opts, + char *input_key, char *orig_key, +- glusterd_volinfo_t *volinfo, +- char **op_errstr) ++ dict_t *vol_dict, char **op_errstr) + { + struct volopt_map_entry *vme = NULL; + int ret = -1; +@@ -12380,7 +12379,6 @@ glusterd_get_default_val_for_volopt (dict_t *ctx, gf_boolean_t all_opts, + char dict_key[50] = {0,}; + gf_boolean_t key_found = _gf_false; + glusterd_conf_t *priv = NULL; +- dict_t *vol_dict = NULL; + + this = THIS; + GF_ASSERT (this); +@@ -12388,7 +12386,6 @@ glusterd_get_default_val_for_volopt (dict_t *ctx, gf_boolean_t all_opts, + priv = this->private; + GF_VALIDATE_OR_GOTO (this->name, priv, 
out); + +- vol_dict = volinfo->dict; + GF_VALIDATE_OR_GOTO (this->name, vol_dict, out); + + /* Check whether key is passed for a single option */ +@@ -12410,20 +12407,6 @@ glusterd_get_default_val_for_volopt (dict_t *ctx, gf_boolean_t all_opts, + if (!def_val) { + ret = dict_get_str (vol_dict, vme->key, &def_val); + if (!def_val) { +- /* For replicate volumes +- * performance.client-io-threads will be set to +- * off by default until explicitly turned on +- */ +- if (!strcmp (vme->key, +- "performance.client-io-threads")) { +- if (volinfo->type == +- GF_CLUSTER_TYPE_REPLICATE || +- volinfo->type == +- GF_CLUSTER_TYPE_STRIPE_REPLICATE) { +- def_val = "off"; +- goto set_count; +- } +- } + if (vme->value) { + def_val = vme->value; + } else { +@@ -12436,7 +12419,6 @@ glusterd_get_default_val_for_volopt (dict_t *ctx, gf_boolean_t all_opts, + } + } + } +-set_count: + count++; + sprintf (dict_key, "key%d", count); + ret = dict_set_str(ctx, dict_key, vme->key); +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h +index 7a5bfd9..259088b 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.h ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.h +@@ -695,8 +695,7 @@ glusterd_get_global_options_for_all_vols (dict_t *dict, char **op_errstr); + int + glusterd_get_default_val_for_volopt (dict_t *dict, gf_boolean_t all_opts, + char *key, char *orig_key, +- glusterd_volinfo_t *volinfo, +- char **err_str); ++ dict_t *vol_dict, char **err_str); + + int + glusterd_check_client_op_version_support (char *volname, uint32_t op_version, +diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c +index 4198be8..e22c3d2 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c +@@ -2550,9 +2550,15 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme, + { + gf_boolean_t enabled = _gf_false; + glusterd_volinfo_t *volinfo = NULL; ++ xlator_t *this = NULL; ++ glusterd_conf_t *priv = NULL; + +- GF_ASSERT (param); ++ GF_VALIDATE_OR_GOTO ("glusterd", param, out); + volinfo = param; ++ this = THIS; ++ GF_VALIDATE_OR_GOTO ("glusterd", this, out); ++ priv = this->private; ++ GF_VALIDATE_OR_GOTO ("glusterd", priv, out); + + if (strcmp (vme->option, "!perf") != 0) + return 0; +@@ -2568,13 +2574,15 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme, + (vme->op_version > volinfo->client_op_version)) + return 0; + +- /* For replicate volumes do not load io-threads as it affects +- * performance +- */ +- if (!strcmp (vme->key, "performance.client-io-threads") && +- (GF_CLUSTER_TYPE_STRIPE_REPLICATE == volinfo->type || +- GF_CLUSTER_TYPE_REPLICATE == volinfo->type)) +- return 0; ++ if (priv->op_version < GD_OP_VERSION_3_11_2) { ++ /* For replicate volumes do not load io-threads as it affects ++ * performance ++ */ ++ if (!strcmp (vme->key, "performance.client-io-threads") && ++ (GF_CLUSTER_TYPE_STRIPE_REPLICATE == volinfo->type || ++ GF_CLUSTER_TYPE_REPLICATE == volinfo->type)) ++ return 0; ++ } + + /* if VKEY_READDIR_AHEAD is enabled and parallel readdir is + * not enabled then load readdir-ahead here else it will be +@@ -2585,8 +2593,8 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme, + + if (volgen_graph_add (graph, vme->voltype, volinfo->volname)) + return 0; +- else +- return -1; ++out: ++ return -1; + } + + static int +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c 
b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+index 4e410ce..f552c83 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
++++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+@@ -2236,6 +2236,23 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
+                 volinfo->stripe_count = 1;
+
+         if (GF_CLUSTER_TYPE_REPLICATE == volinfo->type) {
++                /* performance.client-io-threads is turned on to default,
++                 * however this has adverse effects on replicate volumes due to
++                 * replication design issues, till that get addressed
++                 * performance.client-io-threads option is turned off for all
++                 * replicate volumes
++                 */
++                if (priv->op_version >= GD_OP_VERSION_3_11_2) {
++                        ret = dict_set_str (volinfo->dict,
++                                            "performance.client-io-threads",
++                                            "off");
++                        if (ret) {
++                                gf_msg (this->name, GF_LOG_ERROR, 0,
++                                        GD_MSG_DICT_SET_FAILED, "Failed to set "
++                                        "performance.client-io-threads to off");
++                                goto out;
++                        }
++                }
+                 ret = dict_get_int32 (dict, "replica-count",
+                                       &volinfo->replica_count);
+                 if (ret) {
+@@ -2256,6 +2273,23 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
+                         goto out;
+                 }
+         } else if (GF_CLUSTER_TYPE_STRIPE_REPLICATE == volinfo->type) {
++                /* performance.client-io-threads is turned on to default,
++                 * however this has adverse effects on replicate volumes due to
++                 * replication design issues, till that get addressed
++                 * performance.client-io-threads option is turned off for all
++                 * replicate volumes
++                 */
++                if (priv->op_version >= GD_OP_VERSION_3_11_2) {
++                        ret = dict_set_str (volinfo->dict,
++                                            "performance.client-io-threads",
++                                            "off");
++                        if (ret) {
++                                gf_msg (this->name, GF_LOG_ERROR, 0,
++                                        GD_MSG_DICT_SET_FAILED, "Failed to set "
++                                        "performance.client-io-threads to off");
++                                goto out;
++                        }
++                }
+                 ret = dict_get_int32 (dict, "stripe-count",
+                                       &volinfo->stripe_count);
+                 if (ret) {
+--
+1.8.3.1
+
diff --git a/SOURCES/0684-glusterd-show-brick-online-after-port-registration.patch b/SOURCES/0684-glusterd-show-brick-online-after-port-registration.patch
new file mode 100644
index 0000000..8baf454
--- /dev/null
+++ b/SOURCES/0684-glusterd-show-brick-online-after-port-registration.patch
@@ -0,0 +1,53 @@
+From 016554cf28524238526d959ccd5456e232785780 Mon Sep 17 00:00:00 2001
+From: Atin Mukherjee
+Date: Sun, 1 Apr 2018 22:10:30 +0530
+Subject: [PATCH 684/685] glusterd: show brick online after port registration
+
+ Upstream patch: https://review.gluster.org/19804
+
+The gluster-block project needs a dependency check to see if all the bricks
+are online before bringing up the relevant gluster-block services. While
+the patch https://review.gluster.org/#/c/19785/ attempts to write that
+script, a brick should be marked as online only when the
+pmap_signin is completed.
+
+While this is perfectly fine for non brick multiplexing, with brick
+multiplexing this patch still doesn't eliminate the race completely, as
+the attach_req call is asynchronous and glusterd immediately marks the
+port as registered.
+
+ >Fixes: bz#1563273
+BUG: 1598353
+Change-Id: I81db54b88f7315e1b24e0234beebe00de6429f9d
+Signed-off-by: Atin Mukherjee
+Reviewed-on: https://code.engineering.redhat.com/gerrit/143268
+Tested-by: RHGS Build Bot
+---
+ xlators/mgmt/glusterd/src/glusterd-utils.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
+index f219fd5..828155d 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
+@@ -5891,6 +5891,7 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
+                         (void) pmap_registry_bind (this,
+                                         brickinfo->port, brickinfo->path,
+                                         GF_PMAP_PORT_BRICKSERVER, NULL);
++                        brickinfo->port_registered = _gf_true;
+                         /*
+                          * This will unfortunately result in a separate RPC
+                          * connection per brick, even though they're all in
+@@ -6990,7 +6991,8 @@ glusterd_add_brick_to_dict (glusterd_volinfo_t *volinfo,
+         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
+
+         if (glusterd_is_brick_started (brickinfo)) {
+-                if (gf_is_service_running (pidfile, &pid)) {
++                if (gf_is_service_running (pidfile, &pid) &&
++                    brickinfo->port_registered) {
+                         brick_online = _gf_true;
+                 } else {
+                         pid = -1;
+--
+1.8.3.1
+
diff --git a/SOURCES/0685-glusterd-show-brick-online-after-port-registration-e.patch b/SOURCES/0685-glusterd-show-brick-online-after-port-registration-e.patch
new file mode 100644
index 0000000..ca84ae0
--- /dev/null
+++ b/SOURCES/0685-glusterd-show-brick-online-after-port-registration-e.patch
@@ -0,0 +1,155 @@
+From 15d48e766095a3ade29f82a6d1a26c65ef6c97d4 Mon Sep 17 00:00:00 2001
+From: Pranith Kumar K
+Date: Tue, 3 Jul 2018 14:14:59 +0530
+Subject: [PATCH 685/685] glusterd: show brick online after port registration
+ even in brick-mux
+
+ Upstream patch: https://review.gluster.org/20451
+
+Problem:
+With brick-mux, even before brick attach is complete on the bricks,
+glusterd marks them as online. This can lead to a race where
+scripts that check if the bricks are online assume that the
+brick is online before it is completely online.
+
+Fix:
+Wait for the callback from the brick before marking the port
+as registered so that volume status will show the correct status
+of the brick.
+ +BUG: 1598353 + >fixes bz#1597568 +Change-Id: Icd3dc62506af0cf75195e96746695db823312051 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/143269 +Reviewed-by: Atin Mukherjee +Tested-by: RHGS Build Bot +--- + xlators/mgmt/glusterd/src/glusterd-snapshot.c | 2 +- + xlators/mgmt/glusterd/src/glusterd-utils.c | 36 +++++++++++++++++++++------ + xlators/mgmt/glusterd/src/glusterd-utils.h | 3 ++- + 3 files changed, 31 insertions(+), 10 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c +index 639282e..497bdba 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-snapshot.c ++++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c +@@ -2827,7 +2827,7 @@ glusterd_do_lvm_snapshot_remove (glusterd_volinfo_t *snap_vol, + GLUSTERD_GET_BRICK_PIDFILE (pidfile, snap_vol, brickinfo, priv); + if (gf_is_service_running (pidfile, &pid)) { + (void) send_attach_req (this, brickinfo->rpc, +- brickinfo->path, ++ brickinfo->path, NULL, + GLUSTERD_BRICK_TERMINATE); + brickinfo->status = GF_BRICK_STOPPED; + } +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 828155d..bbf6f7d 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -92,9 +92,6 @@ + #define NLMV4_VERSION 4 + #define NLMV1_VERSION 1 + +-int +-send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op); +- + gf_boolean_t + is_brick_mx_enabled (void) + { +@@ -2409,7 +2406,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo, + brickinfo->hostname, brickinfo->path); + + (void) send_attach_req (this, brickinfo->rpc, +- brickinfo->path, ++ brickinfo->path, NULL, + GLUSTERD_BRICK_TERMINATE); + } else { + gf_msg_debug (this->name, 0, "About to stop glusterfsd" +@@ -5330,8 +5327,27 @@ my_callback (struct rpc_req *req, struct iovec *iov, int count, void *v_frame) + return 0; + } + ++static int32_t ++attach_brick_callback (struct rpc_req *req, struct iovec *iov, int count, ++ void *v_frame) ++{ ++ call_frame_t *frame = v_frame; ++ glusterd_conf_t *conf = frame->this->private; ++ glusterd_brickinfo_t *brickinfo = frame->local; ++ ++ frame->local = NULL; ++ brickinfo->port_registered = _gf_true; ++ synclock_lock (&conf->big_lock); ++ --(conf->blockers); ++ synclock_unlock (&conf->big_lock); ++ ++ STACK_DESTROY (frame->root); ++ return 0; ++} ++ + int +-send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op) ++send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, ++ glusterd_brickinfo_t *brickinfo, int op) + { + int ret = -1; + struct iobuf *iobuf = NULL; +@@ -5345,6 +5361,7 @@ send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op) + struct rpc_clnt_connection *conn; + glusterd_conf_t *conf = this->private; + extern struct rpc_clnt_program gd_brick_prog; ++ fop_cbk_fn_t cbkfn = my_callback; + + if (!rpc) { + gf_log (this->name, GF_LOG_ERROR, "called with null rpc"); +@@ -5402,10 +5419,14 @@ send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op) + + iov.iov_len = ret; + ++ if (op == GLUSTERD_BRICK_ATTACH) { ++ frame->local = brickinfo; ++ cbkfn = attach_brick_callback; ++ } + /* Send the msg */ + ++(conf->blockers); + ret = rpc_clnt_submit (rpc, &gd_brick_prog, op, +- my_callback, &iov, 1, NULL, 0, iobref, ++ cbkfn, &iov, 1, NULL, 0, iobref, + frame, NULL, 0, NULL, 0, NULL); + return ret; + +@@ -5465,7 +5486,7 @@ attach_brick (xlator_t *this, + for (tries = 
15; tries > 0; --tries) { + rpc = rpc_clnt_ref (other_brick->rpc); + if (rpc) { +- ret = send_attach_req (this, rpc, path, ++ ret = send_attach_req (this, rpc, path, brickinfo, + GLUSTERD_BRICK_ATTACH); + rpc_clnt_unref (rpc); + if (!ret) { +@@ -5485,7 +5506,6 @@ attach_brick (xlator_t *this, + brickinfo->status = GF_BRICK_STARTED; + brickinfo->rpc = + rpc_clnt_ref (other_brick->rpc); +- brickinfo->port_registered = _gf_true; + ret = glusterd_brick_process_add_brick (brickinfo, + volinfo); + if (ret) { +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h +index 259088b..d1156dc 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.h ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.h +@@ -197,7 +197,8 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo, + gf_boolean_t del_brick); + + int +-send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op); ++send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, ++ glusterd_brickinfo_t *brick, int op); + + glusterd_volinfo_t * + glusterd_volinfo_ref (glusterd_volinfo_t *volinfo); +-- +1.8.3.1 + diff --git a/SOURCES/0686-glusterd-_is_prefix-should-handle-0-length-paths.patch b/SOURCES/0686-glusterd-_is_prefix-should-handle-0-length-paths.patch new file mode 100644 index 0000000..7a262e4 --- /dev/null +++ b/SOURCES/0686-glusterd-_is_prefix-should-handle-0-length-paths.patch @@ -0,0 +1,47 @@ +From 1c1e872ad43ab0860e05be679632097296eef174 Mon Sep 17 00:00:00 2001 +From: Kaushal M +Date: Tue, 10 Jul 2018 20:56:08 +0530 +Subject: [PATCH 686/689] glusterd: _is_prefix should handle 0-length paths + +If one of the paths given to _is_prefix is 0-length, then it is not a +prefix of the other. Hence, _is_prefix should return false. + +>Change-Id: I54aa577a64a58940ec91872d0d74dc19cff9106d +>fixes: bz#1599783 +>Signed-off-by: Kaushal M + +upstream patch: https://review.gluster.org/#/c/20490/ + +Change-Id: I54aa577a64a58940ec91872d0d74dc19cff9106d +BUG: 1599803 +Signed-off-by: Sanju Rakonde +Reviewed-on: https://code.engineering.redhat.com/gerrit/143746 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + xlators/mgmt/glusterd/src/glusterd-utils.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index bbf6f7d..8e1048c 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -1269,6 +1269,15 @@ _is_prefix (char *str1, char *str2) + len1 = strlen (str1); + len2 = strlen (str2); + small_len = min (len1, len2); ++ ++ /* ++ * If either one (not both) of the strings are 0-length, they are not ++ * prefixes of each other. ++ */ ++ if ((small_len == 0) && (len1 != len2)) { ++ return _gf_false; ++ } ++ + for (i = 0; i < small_len; i++) { + if (str1[i] != str2[i]) { + prefix = _gf_false; +-- +1.8.3.1 + diff --git a/SOURCES/0687-glusterd-log-improvements-on-brick-creation-validati.patch b/SOURCES/0687-glusterd-log-improvements-on-brick-creation-validati.patch new file mode 100644 index 0000000..720c7a2 --- /dev/null +++ b/SOURCES/0687-glusterd-log-improvements-on-brick-creation-validati.patch @@ -0,0 +1,70 @@ +From f6164a5007ea553daf63fb6e661121f11bcdd7d2 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Tue, 10 Jul 2018 21:33:41 +0530 +Subject: [PATCH 687/689] glusterd: log improvements on brick creation + validation + +Added few log entries in glusterd_is_brickpath_available (). 
+ +>Change-Id: I8b758578f9db90d2974f7c79126c50ad3a001d71 +>Updates: bz#1193929 +>Signed-off-by: Atin Mukherjee + +upstream patch: https://review.gluster.org/#/c/20493/ + +Change-Id: I8b758578f9db90d2974f7c79126c50ad3a001d71 +BUG: 1599803 +Signed-off-by: Sanju Rakonde +Reviewed-on: https://code.engineering.redhat.com/gerrit/143749 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + xlators/mgmt/glusterd/src/glusterd-utils.c | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 8e1048c..794e8e1 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -1233,7 +1233,8 @@ glusterd_brickinfo_new_from_brick (char *brick, + GD_MSG_BRICKINFO_CREATE_FAIL, "realpath" + " () failed for brick %s. The " + "underlying filesystem may be in bad " +- "state", new_brickinfo->path); ++ "state. Error - %s", ++ new_brickinfo->path, strerror(errno)); + ret = -1; + goto out; + } +@@ -1318,6 +1319,12 @@ glusterd_is_brickpath_available (uuid_t uuid, char *path) + /* path may not yet exist */ + if (!realpath (path, tmp_path)) { + if (errno != ENOENT) { ++ gf_msg (THIS->name, GF_LOG_CRITICAL, errno, ++ GD_MSG_BRICKINFO_CREATE_FAIL, "realpath" ++ " () failed for brick %s. The " ++ "underlying filesystem may be in bad " ++ "state. Error - %s", ++ path, strerror(errno)); + goto out; + } + /* When realpath(3) fails, tmp_path is undefined. */ +@@ -1329,8 +1336,14 @@ glusterd_is_brickpath_available (uuid_t uuid, char *path) + brick_list) { + if (gf_uuid_compare (uuid, brickinfo->uuid)) + continue; +- if (_is_prefix (brickinfo->real_path, tmp_path)) ++ if (_is_prefix (brickinfo->real_path, tmp_path)) { ++ gf_msg (THIS->name, GF_LOG_CRITICAL, 0, ++ GD_MSG_BRICKINFO_CREATE_FAIL, ++ "_is_prefix call failed for brick %s " ++ "against brick %s", tmp_path, ++ brickinfo->real_path); + goto out; ++ } + } + } + available = _gf_true; +-- +1.8.3.1 + diff --git a/SOURCES/0688-logging-Avoid-re-initing-log-level-in-io-stats.patch b/SOURCES/0688-logging-Avoid-re-initing-log-level-in-io-stats.patch new file mode 100644 index 0000000..c29f52a --- /dev/null +++ b/SOURCES/0688-logging-Avoid-re-initing-log-level-in-io-stats.patch @@ -0,0 +1,59 @@ +From 483b8a539824dd9dcce6f31b00c630c7ff238489 Mon Sep 17 00:00:00 2001 +From: Vijay Bellur +Date: Mon, 8 Aug 2016 13:11:29 -0400 +Subject: [PATCH 688/689] logging: Avoid re-initing log level in io-stats + +If log level is already set via api or command line, initialization of +io-stats xlator overwrites the log level to GF_LOG_INFO. This patch +prevents re-initialization of log level if already set. 
+
+>Change-Id: I1f74d94ef8068b95ec696638c0a8b17d8d71aabe
+>BUG: 1368882
+>Signed-off-by: Vijay Bellur
+>Reported-by: Colin Lord
+>Reviewed-on: http://review.gluster.org/15112
+>Reviewed-by: Niels de Vos
+>Reviewed-by: Pranith Kumar Karampuri
+
+Change-Id: I1f74d94ef8068b95ec696638c0a8b17d8d71aabe
+BUG: 1597509
+Signed-off-by: Vijay Bellur
+Reviewed-on: https://code.engineering.redhat.com/gerrit/143803
+Tested-by: RHGS Build Bot
+Reviewed-by: Amar Tumballi
+Reviewed-by: Atin Mukherjee
+---
+ libglusterfs/src/ctx.c                | 2 ++
+ xlators/debug/io-stats/src/io-stats.c | 3 ++-
+ 2 files changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/libglusterfs/src/ctx.c b/libglusterfs/src/ctx.c
+index 35f1928..29849e2 100644
+--- a/libglusterfs/src/ctx.c
++++ b/libglusterfs/src/ctx.c
+@@ -37,6 +37,8 @@ glusterfs_ctx_new ()
+         ctx->daemon_pipe[0] = -1;
+         ctx->daemon_pipe[1] = -1;
+
++        ctx->log.loglevel = DEFAULT_LOG_LEVEL;
++
+         /* lock is never destroyed! */
+         ret = LOCK_INIT (&ctx->lock);
+         if (ret) {
+diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
+index 92f05e5..6f4206a 100644
+--- a/xlators/debug/io-stats/src/io-stats.c
++++ b/xlators/debug/io-stats/src/io-stats.c
+@@ -3991,7 +3991,8 @@ init (xlator_t *this)
+         GF_OPTION_INIT ("log-level", log_str, str, out);
+         if (log_str) {
+                 log_level = glusterd_check_log_level (log_str);
+-                gf_log_set_loglevel (log_level);
++                if (DEFAULT_LOG_LEVEL != log_level)
++                        gf_log_set_loglevel (log_level);
+         }
+
+         GF_OPTION_INIT ("logger", logger_str, str, out);
+--
+1.8.3.1
+
diff --git a/SOURCES/0689-geo-rep-Fix-symlink-rename-syncing-issue.patch b/SOURCES/0689-geo-rep-Fix-symlink-rename-syncing-issue.patch
new file mode 100644
index 0000000..3f85391
--- /dev/null
+++ b/SOURCES/0689-geo-rep-Fix-symlink-rename-syncing-issue.patch
@@ -0,0 +1,110 @@
+From a6968941cacc1ddc8b554da0d142dab71a340e33 Mon Sep 17 00:00:00 2001
+From: Kotresh HR
+Date: Thu, 12 Jul 2018 02:14:01 -0400
+Subject: [PATCH 689/689] geo-rep: Fix symlink rename syncing issue
+
+Problem:
+   Geo-rep sometimes fails to sync the rename of a symlink
+if the I/O is as follows
+
+   1. touch file1
+   2. ln -s "./file1" sym_400
+   3. mv sym_400 renamed_sym_400
+   4. mkdir sym_400
+
+   The file 'renamed_sym_400' failed to sync to the slave
+
+Cause:
+   Assume there are three distribute subvolumes (brick1, brick2, brick3).
+   The changelogs are recorded as follows for the above I/O pattern.
+   Note that the MKDIR is recorded on all bricks.
+
+   1. brick1:
+   -------
+
+    CREATE file1
+    SYMLINK sym_400
+    RENAME sym_400 renamed_sym_400
+    MKDIR sym_400
+
+   2. brick2:
+   -------
+
+    MKDIR sym_400
+
+   3. brick3:
+   -------
+
+    MKDIR sym_400
+
+   The operations on 'brick1' should be processed sequentially. But
+   since MKDIR is recorded on all the bricks, the bricks 'brick2/brick3'
+   processed MKDIR before 'brick1', causing out-of-order syncing,
+   and created the directory sym_400 first.
+
+   Now 'brick1' processed its changelog.
+
+    CREATE file1 -> succeeds
+    SYMLINK sym_400 -> No longer present in master. Ignored
+    RENAME sym_400 renamed_sym_400
+         While processing RENAME, if the source ('sym_400') isn't
+         present, the destination ('renamed_sym_400') is created. But
+         geo-rep stats the name 'sym_400' to confirm the source file's
+         presence. In this race, since the source name 'sym_400' is
+         present as a directory, it doesn't create the destination.
+         Hence the RENAME is ignored.
+
+Fix:
+   The fix is to not rely only on the stat of the source name during RENAME.
+   It should stat the name and, if the name is present, the gfid should
+   be the same. Only then can it conclude the presence of the source.
Only then it can conclude the presence of source. + +Backport of: + > Patch: https://review.gluster.org/20496 + > BUG: 1600405 + > Change-Id: I9fbec4f13ca6a182798a7f81b356fe2003aff969 + > Signed-off-by: Kotresh HR + +BUG: 1590774 +Change-Id: I9fbec4f13ca6a182798a7f81b356fe2003aff969 +Signed-off-by: Kotresh HR +Reviewed-on: https://code.engineering.redhat.com/gerrit/143826 +Tested-by: RHGS Build Bot +Reviewed-by: Aravinda Vishwanathapura Krishna Murthy +Reviewed-by: Milind Changire +--- + geo-replication/syncdaemon/resource.py | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 39d537b..f345ae3 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -777,8 +777,14 @@ class Server(object): + blob = entry_pack_symlink(gfid, bname, e['link'], e['stat']) + elif op == 'RENAME': + en = e['entry1'] +- st = lstat(entry) +- if isinstance(st, int): ++ # The matching disk gfid check validates two things ++ # 1. Validates name is present, return false otherwise ++ # 2. Validates gfid is same, returns false otherwise ++ # So both validations are necessary to decide src doesn't ++ # exist. We can't rely on only gfid stat as hardlink could ++ # be present and we can't rely only on name as name could ++ # exist with differnt gfid. ++ if not matching_disk_gfid(gfid, entry): + if e['stat'] and not stat.S_ISDIR(e['stat']['mode']): + if stat.S_ISLNK(e['stat']['mode']) and \ + e['link'] is not None: +@@ -789,6 +795,7 @@ class Server(object): + (pg, bname) = entry2pb(en) + blob = entry_pack_reg_stat(gfid, bname, e['stat']) + else: ++ st = lstat(entry) + st1 = lstat(en) + if isinstance(st1, int): + rename_with_disk_gfid_confirmation(gfid, entry, en) +-- +1.8.3.1 + diff --git a/SPECS/glusterfs.spec b/SPECS/glusterfs.spec index ad59e33..03f0032 100644 --- a/SPECS/glusterfs.spec +++ b/SPECS/glusterfs.spec @@ -186,11 +186,11 @@ Summary: Distributed File System Name: glusterfs Version: 3.8.0 Release: 0.1%{?prereltag:.%{prereltag}}%{?dist} -#Vendor removed +Vendor: Fedora Project %else Name: glusterfs Version: 3.8.4 -Release: 53%{?dist} +Release: 54.15%{?dist} %endif License: GPLv2 or LGPLv3+ Group: System Environment/Base @@ -893,6 +893,63 @@ Patch0629: 0629-fuse-fix-the-read-only-mount-flag-issue.patch Patch0630: 0630-glusterd-delete-source-brick-only-once-in-reset-bric.patch Patch0631: 0631-glusterd-persist-brickinfo-s-port-change-into-gluste.patch Patch0632: 0632-build-remove-pretrans-script-for-python-gluster.patch +Patch0633: 0633-mgmt-glusterd-Cleanup-memory-leaks-in-handshake.patch +Patch0634: 0634-glusterd-Fix-glusterd-mem-leaks.patch +Patch0635: 0635-glusterd-Marking-all-the-brick-status-as-stopped-whe.patch +Patch0636: 0636-glusterd-clean-up-portmap-on-brick-disconnect.patch +Patch0637: 0637-glusterd-fix-brick-restart-parallelism.patch +Patch0638: 0638-glusterd-Free-up-svc-conn-on-volume-delete.patch +Patch0639: 0639-glusterd-introduce-timer-in-mgmt_v3_lock.patch +Patch0640: 0640-dict-Don-t-expose-get_new_dict-dict_destroy.patch +Patch0641: 0641-features-locks-Fix-memory-leaks.patch +Patch0642: 0642-gfapi-set-lkowner-in-glfd.patch +Patch0643: 0643-build-remove-ExclusiveArch-from-spec-file.patch +Patch0644: 0644-libglusterfs-fix-the-call_stack_set_group-function.patch +Patch0645: 0645-glusterd-Nullify-pmap-entry-for-bricks-belonging-to-.patch +Patch0646: 0646-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch +Patch0647: 
0647-glusterd-optimize-glusterd-import-volumes-code-path.patch +Patch0648: 0648-glusterd-import-volumes-in-separate-synctask.patch +Patch0649: 0649-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch +Patch0650: 0650-glusterd-snapshot-fix-the-compare-snap-logic.patch +Patch0651: 0651-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch +Patch0652: 0652-shared-storage-Prevent-mounting-shared-storage-from-.patch +Patch0653: 0653-server-auth-add-option-for-strict-authentication.patch +Patch0654: 0654-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch +Patch0655: 0655-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch +Patch0656: 0656-server-auth-fix-regression-in-honouring-auth.allow.patch +Patch0657: 0657-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch +Patch0658: 0658-Revert-Revert-geo-rep-Remove-lazy-umount-and-use-mou.patch +Patch0659: 0659-gluster-Allow-only-read-only-CLI-commands-via-remote.patch +Patch0660: 0660-Revert-geo-rep-Remove-lazy-umount-and-use-mount-name.patch +Patch0661: 0661-Revert-Revert-geo-rep-Remove-lazy-umount-and-use-mou.patch +Patch0662: 0662-storage-posix-Use-the-ret-value-of-posix_gfid_heal.patch +Patch0663: 0663-features-shard-Pass-the-correct-block-num-to-store-i.patch +Patch0664: 0664-features-shard-Leverage-block_num-info-in-inode-ctx-.patch +Patch0665: 0665-features-shard-Fix-shard-inode-refcount-when-it-s-pa.patch +Patch0666: 0666-features-shard-Upon-FSYNC-from-upper-layers-wind-fsy.patch +Patch0667: 0667-features-shard-Do-list_del_init-while-list-memory-is.patch +Patch0668: 0668-storage-posix-Add-active-fd-count-option-in-gluster.patch +Patch0669: 0669-cluster-afr-Make-afr_fsync-a-transaction.patch +Patch0670: 0670-cluster-afr-Remove-compound-fops-usage-in-afr.patch +Patch0671: 0671-cluster-afr-Remove-unused-code-paths.patch +Patch0672: 0672-cluster-afr-Make-AFR-eager-locking-similar-to-EC.patch +Patch0673: 0673-cluster-afr-Switch-to-active-fd-count-for-open-fd-ch.patch +Patch0674: 0674-afr-fixes-to-afr-eager-locking.patch +Patch0675: 0675-block-profile-enable-cluster.eager-lock-in-block-pro.patch +Patch0676: 0676-storage-posix-Handle-ENOSPC-correctly-in-zero_fill.patch +Patch0677: 0677-cluster-afr-Increase-the-lock-count-on-success.patch +Patch0678: 0678-extras-group-add-database-workload-profile.patch +Patch0679: 0679-glusterd-Introduce-daemon-log-level-cluster-wide-opt.patch +Patch0680: 0680-glusterfsd-Do-not-process-GLUSTERD_BRICK_XLATOR_OP-i.patch +Patch0681: 0681-geo-rep-Fix-for-EINVAL-errors-while-syncing-symlinks.patch +Patch0682: 0682-cluster-afr-Make-sure-lk-owner-is-assigned-at-the-ti.patch +Patch0683: 0683-glusterd-fix-client-io-threads-option-for-replicate-.patch +Patch0684: 0684-glusterd-show-brick-online-after-port-registration.patch +Patch0685: 0685-glusterd-show-brick-online-after-port-registration-e.patch +Patch0686: 0686-glusterd-_is_prefix-should-handle-0-length-paths.patch +Patch0687: 0687-glusterd-log-improvements-on-brick-creation-validati.patch +Patch0688: 0688-logging-Avoid-re-initing-log-level-in-io-stats.patch +Patch0689: 0689-geo-rep-Fix-symlink-rename-syncing-issue.patch %description GlusterFS is a distributed file-system capable of scaling to several @@ -1073,6 +1130,7 @@ Requires: %{name}%{?_isa} = %{version}-%{release} Requires: %{name}-server%{?_isa} = %{version}-%{release} Requires: python python-ctypes Requires: rsync +Requires: util-linux %description geo-replication GlusterFS is a distributed file-system capable of scaling to several @@ -1744,6 +1802,7 @@ exit 0 %exclude 
%{_sysconfdir}/glusterfs/group-metadata-cache %exclude %{_sysconfdir}/glusterfs/group-nl-cache %exclude %{_sysconfdir}/glusterfs/group-gluster-block +%exclude %{_sysconfdir}/glusterfs/group-db-workload %exclude %{_sysconfdir}/glusterfs/logger.conf.example %exclude %_init_glusterd %exclude %{_sysconfdir}/sysconfig/glusterd @@ -2071,6 +2130,7 @@ exit 0 %attr(0644,-,-) %{_sharedstatedir}/glusterd/groups/metadata-cache %attr(0644,-,-) %{_sharedstatedir}/glusterd/groups/nl-cache %attr(0644,-,-) %{_sharedstatedir}/glusterd/groups/gluster-block + %attr(0644,-,-) %{_sharedstatedir}/glusterd/groups/db-workload %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/glusterfind %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/glusterfind/.keys %ghost %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/glustershd @@ -2750,11 +2810,66 @@ fi %endif %changelog -* Tue Apr 10 2018 CentOS Sources - 3.8.4-53.el7.centos -- remove vendor and/or packager lines +* Thu Jul 12 2018 Milind Changire - 3.8.4-54.15 +- fixes bugs bz#1590774 bz#1597509 bz#1599803 -* Fri Dec 01 2017 Yaakov Selkowitz - 3.8.4-53 -- Rebuilt for multi-arch enablement bz#1493586 +* Thu Jul 05 2018 Milind Changire - 3.8.4-54.14 +- fixes bugs bz#1590774 bz#1595752 bz#1597509 bz#1597648 bz#1598353 bz#1598416 + +* Fri Jun 29 2018 Milind Changire - 3.8.4-54.13 +- fixes bugs bz#1594656 bz#1594682 bz#1596076 + +* Mon Jun 04 2018 Milind Changire - 3.8.4-54.12 +- fixes bugs bz#1556680 bz#1583462 bz#1583464 bz#1583733 bz#1585046 + +* Wed May 30 2018 Milind Changire - 3.8.4-54.11 +- fixes bugs bz#1556680 + +* Wed May 30 2018 Milind Changire - 3.8.4-54.10 +- fixes bugs bz#1556680 + +* Tue Apr 24 2018 Milind Changire - 3.8.4-54.9 +- fixes bugs bz#1556680 + +* Tue Apr 24 2018 Milind Changire - 3.8.4-54.8 +- fixes bugs bz#1556680 + +* Mon Apr 16 2018 Milind Changire - 3.8.4-54.7 +- fixes bugs bz#1556680 + +* Fri Apr 06 2018 Milind Changire - 3.8.4-54.6 +- rebuild for RHEL 7.5 + +* Fri Apr 06 2018 Milind Changire - 3.8.4-54.5 +- fixes bugs bz#1559331 + +* Fri Mar 23 2018 Milind Changire - 3.8.4-54.4 +- fixes bugs bz#1547931 + +* Wed Mar 21 2018 Milind Changire - 3.8.4-54.3 +- fixes bugs bz#1556670 + +* Thu Mar 15 2018 Milind Changire - 3.8.4-54.2 +- fixes bugs bz#1556670 bz#1556680 + +* Mon Feb 26 2018 Milind Changire - 3.8.4-54.1 +- fixes bugs bz#1547931 + +* Thu Jan 04 2018 Milind Changire - 3.8.4-54 +- version bump to accomodate build at RHEL 7.5 - bz#1530916 + +* Wed Jan 03 2018 Milind Changire - 3.8.4-52.4 +- fixes bugs bz#1530217 + +* Fri Dec 22 2017 Milind Changire - 3.8.4-52.3 +- fixes bugs bz#1527147 + +* Wed Dec 20 2017 Sunil Kumar Acharya - 3.8.4-52.2 +- fixes bugs bz#1527772 + +* Mon Dec 18 2017 Sunil Kumar Acharya - 3.8.4-52.1 +- fixes bugs bz#1526363 bz#1526368 bz#1526371 bz#1526372 bz#1526373 + bz#1526377 bz#1526378 * Wed Nov 08 2017 Milind Changire - 3.8.4-52 - fixes bugs bz#1257520