|
|
21ab4e |
From 71f895fb6c0fd6f4146cf0b103be3ed4079d210e Mon Sep 17 00:00:00 2001
|
|
|
21ab4e |
From: Mohit Agrawal <moagrawa@redhat.com>
|
|
|
21ab4e |
Date: Tue, 8 Aug 2017 14:36:17 +0530
|
|
|
21ab4e |
Subject: [PATCH 600/601] glusterd: Block brick attach request till the brick's
|
|
|
21ab4e |
ctx is set
|
|
|
21ab4e |
MIME-Version: 1.0
|
|
|
21ab4e |
Content-Type: text/plain; charset=UTF-8
|
|
|
21ab4e |
Content-Transfer-Encoding: 8bit
|
|
|
21ab4e |
|
|
|
21ab4e |
Problem: In multiplexing setup in a container environment we hit a race
|
|
|
21ab4e |
where before the first brick finishes its handshake with glusterd, the
|
|
|
21ab4e |
subsequent attach requests went through and they actually failed and
|
|
|
21ab4e |
glusterd has no mechanism to realize it. This resulted in all such
|
|
|
21ab4e |
bricks not being active, resulting in clients not being able to connect.
|
|
|
21ab4e |
|
|
|
21ab4e |
Solution: Introduce a new flag port_registered in glusterd_brickinfo
|
|
|
21ab4e |
to make sure the pmap_signin has finished before the subsequent
|
|
|
21ab4e |
attach bricks can be processed.
|
|
|
21ab4e |
|
|
|
21ab4e |
Test: To reproduce the issue, follow the steps below
|
|
|
21ab4e |
1) Create 100 volumes on 3 nodes(1x3) in CNS environment
|
|
|
21ab4e |
2) Enable brick multiplexing
|
|
|
21ab4e |
3) Reboot one container
|
|
|
21ab4e |
4) Run below command
|
|
|
21ab4e |
for v in `gluster v list`
|
|
|
21ab4e |
do
|
|
|
21ab4e |
glfsheal $v | grep -i "transport"
|
|
|
21ab4e |
done
|
|
|
21ab4e |
After applying the patch, the command should not fail.
|
|
|
21ab4e |
|
|
|
21ab4e |
Note: A big thanks to Atin for suggesting the fix.
|
|
|
21ab4e |
|
|
|
21ab4e |
>Reviewed-on: https://review.gluster.org/17984
|
|
|
21ab4e |
>Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
21ab4e |
>Smoke: Gluster Build System <jenkins@build.gluster.org>
|
|
|
21ab4e |
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
|
|
|
21ab4e |
>Reviewed-by: Jeff Darcy <jeff@pl.atyp.us>
|
|
|
21ab4e |
|
|
|
21ab4e |
BUG: 1477024
|
|
|
21ab4e |
Change-Id: I8e1bd6132122b3a5b0dd49606cea564122f2609b
|
|
|
21ab4e |
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
|
|
|
21ab4e |
Reviewed-on: https://code.engineering.redhat.com/gerrit/114764
|
|
|
21ab4e |
Tested-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
21ab4e |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
21ab4e |
---
|
|
|
21ab4e |
xlators/mgmt/glusterd/src/glusterd-pmap.c | 7 +++
|
|
|
21ab4e |
xlators/mgmt/glusterd/src/glusterd-utils.c | 73 ++++++++++++++++++++----------
|
|
|
21ab4e |
xlators/mgmt/glusterd/src/glusterd.h | 1 +
|
|
|
21ab4e |
3 files changed, 57 insertions(+), 24 deletions(-)
|
|
|
21ab4e |
|
|
|
21ab4e |
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
|
|
|
21ab4e |
index 6fead59..7acee19 100644
|
|
|
21ab4e |
--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c
|
|
|
21ab4e |
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
|
|
|
21ab4e |
@@ -532,6 +532,9 @@ __gluster_pmap_signin (rpcsvc_request_t *req)
|
|
|
21ab4e |
GF_PMAP_PORT_BRICKSERVER, req->trans);
|
|
|
21ab4e |
|
|
|
21ab4e |
ret = glusterd_get_brickinfo (THIS, args.brick, args.port, &brickinfo);
|
|
|
21ab4e |
+ /* Update portmap status in brickinfo */
|
|
|
21ab4e |
+ if (brickinfo)
|
|
|
21ab4e |
+ brickinfo->port_registered = _gf_true;
|
|
|
21ab4e |
|
|
|
21ab4e |
fail:
|
|
|
21ab4e |
glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
|
|
|
21ab4e |
@@ -584,6 +587,10 @@ __gluster_pmap_signout (rpcsvc_request_t *req)
|
|
|
21ab4e |
brick_path, GF_PMAP_PORT_BRICKSERVER,
|
|
|
21ab4e |
req->trans);
|
|
|
21ab4e |
}
|
|
|
21ab4e |
+ /* Update portmap status on brickinfo */
|
|
|
21ab4e |
+ if (brickinfo)
|
|
|
21ab4e |
+ brickinfo->port_registered = _gf_false;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
/* Clean up the pidfile for this brick given glusterfsd doesn't clean it
|
|
|
21ab4e |
* any more. This is required to ensure we don't end up with having
|
|
|
21ab4e |
* stale pid files in case a brick is killed from the backend
|
|
|
21ab4e |
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
|
|
|
21ab4e |
index 3afed1e..407e5b8 100644
|
|
|
21ab4e |
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
|
|
|
21ab4e |
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
|
|
|
21ab4e |
@@ -2057,6 +2057,7 @@ retry:
|
|
|
21ab4e |
brickinfo->port = port;
|
|
|
21ab4e |
brickinfo->rdma_port = rdma_port;
|
|
|
21ab4e |
brickinfo->status = GF_BRICK_STARTING;
|
|
|
21ab4e |
+ brickinfo->port_registered = _gf_false;
|
|
|
21ab4e |
|
|
|
21ab4e |
if (wait) {
|
|
|
21ab4e |
synclock_unlock (&priv->big_lock);
|
|
|
21ab4e |
@@ -5339,13 +5340,6 @@ attach_brick (xlator_t *this,
|
|
|
21ab4e |
|
|
|
21ab4e |
GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, unslashed);
|
|
|
21ab4e |
|
|
|
21ab4e |
- ret = pmap_registry_extend (this, other_brick->port,
|
|
|
21ab4e |
- brickinfo->path);
|
|
|
21ab4e |
- if (ret != 0) {
|
|
|
21ab4e |
- gf_log (this->name, GF_LOG_ERROR,
|
|
|
21ab4e |
- "adding brick to process failed");
|
|
|
21ab4e |
- return -1;
|
|
|
21ab4e |
- }
|
|
|
21ab4e |
GLUSTERD_GET_BRICK_PIDFILE (pidfile1, other_vol, other_brick, conf);
|
|
|
21ab4e |
GLUSTERD_GET_BRICK_PIDFILE (pidfile2, volinfo, brickinfo, conf);
|
|
|
21ab4e |
|
|
|
21ab4e |
@@ -5368,6 +5362,14 @@ attach_brick (xlator_t *this,
|
|
|
21ab4e |
GLUSTERD_BRICK_ATTACH);
|
|
|
21ab4e |
rpc_clnt_unref (rpc);
|
|
|
21ab4e |
if (!ret) {
|
|
|
21ab4e |
+ ret = pmap_registry_extend (this, other_brick->port,
|
|
|
21ab4e |
+ brickinfo->path);
|
|
|
21ab4e |
+ if (ret != 0) {
|
|
|
21ab4e |
+ gf_log (this->name, GF_LOG_ERROR,
|
|
|
21ab4e |
+ "adding brick to process failed");
|
|
|
21ab4e |
+ return ret;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
/* PID file is copied once brick has attached
|
|
|
21ab4e |
successfully
|
|
|
21ab4e |
*/
|
|
|
21ab4e |
@@ -5376,6 +5378,7 @@ attach_brick (xlator_t *this,
|
|
|
21ab4e |
brickinfo->status = GF_BRICK_STARTED;
|
|
|
21ab4e |
brickinfo->rpc =
|
|
|
21ab4e |
rpc_clnt_ref (other_brick->rpc);
|
|
|
21ab4e |
+ brickinfo->port_registered = _gf_true;
|
|
|
21ab4e |
ret = glusterd_brick_process_add_brick (brickinfo,
|
|
|
21ab4e |
volinfo);
|
|
|
21ab4e |
if (ret) {
|
|
|
21ab4e |
@@ -5429,6 +5432,7 @@ find_compat_brick_in_vol (glusterd_conf_t *conf,
|
|
|
21ab4e |
int16_t retries = 15;
|
|
|
21ab4e |
int mux_limit = -1;
|
|
|
21ab4e |
int ret = -1;
|
|
|
21ab4e |
+ gf_boolean_t brick_status = _gf_false;
|
|
|
21ab4e |
|
|
|
21ab4e |
/*
|
|
|
21ab4e |
* If comp_vol is provided, we have to check *volume* compatibility
|
|
|
21ab4e |
@@ -5503,31 +5507,45 @@ find_compat_brick_in_vol (glusterd_conf_t *conf,
|
|
|
21ab4e |
"set. Continuing with no limit set for "
|
|
|
21ab4e |
"brick multiplexing.");
|
|
|
21ab4e |
}
|
|
|
21ab4e |
-
|
|
|
21ab4e |
- GLUSTERD_GET_BRICK_PIDFILE (pidfile2, srch_vol, other_brick,
|
|
|
21ab4e |
- conf);
|
|
|
21ab4e |
-
|
|
|
21ab4e |
- /* It is possible that the pidfile hasn't yet been populated,
|
|
|
21ab4e |
- * when bricks are started in "no-wait" mode; for example
|
|
|
21ab4e |
- * when bricks are started by glusterd_restart_bricks(). So
|
|
|
21ab4e |
- * wait for the pidfile to be populated with a value before
|
|
|
21ab4e |
- * checking if the service is running */
|
|
|
21ab4e |
+ /* The first brick process might take some time to finish its
|
|
|
21ab4e |
+ * handshake with glusterd and prepare the graph. We can't
|
|
|
21ab4e |
+ * afford to send attach_req for other bricks till that time.
|
|
|
21ab4e |
+ * brick process sends PMAP_SIGNIN event after processing the
|
|
|
21ab4e |
+ * volfile and hence it's safe to assume that if glusterd has
|
|
|
21ab4e |
+ * received a pmap signin request for the same brick, we are
|
|
|
21ab4e |
+ * good for subsequent attach requests.
|
|
|
21ab4e |
+ */
|
|
|
21ab4e |
+ retries = 15;
|
|
|
21ab4e |
while (retries > 0) {
|
|
|
21ab4e |
- if (sys_access (pidfile2, F_OK) == 0 &&
|
|
|
21ab4e |
- gf_is_service_running (pidfile2, &pid2)) {
|
|
|
21ab4e |
- break;
|
|
|
21ab4e |
+ if (other_brick->port_registered) {
|
|
|
21ab4e |
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile2, srch_vol,
|
|
|
21ab4e |
+ other_brick, conf);
|
|
|
21ab4e |
+ if (sys_access (pidfile2, F_OK) == 0 &&
|
|
|
21ab4e |
+ gf_is_service_running (pidfile2, &pid2)) {
|
|
|
21ab4e |
+ gf_msg_debug (this->name, 0,
|
|
|
21ab4e |
+ "brick %s is running as a pid %d ",
|
|
|
21ab4e |
+ other_brick->path, pid2);
|
|
|
21ab4e |
+ brick_status = _gf_true;
|
|
|
21ab4e |
+ break;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
- sleep (1);
|
|
|
21ab4e |
+ synclock_unlock (&conf->big_lock);
|
|
|
21ab4e |
+ gf_msg_debug (this->name, 0, "brick %s is still"
|
|
|
21ab4e |
+ " starting, waiting for 2 seconds ",
|
|
|
21ab4e |
+ other_brick->path);
|
|
|
21ab4e |
+ sleep(2);
|
|
|
21ab4e |
+ synclock_lock (&conf->big_lock);
|
|
|
21ab4e |
retries--;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
- if (retries == 0) {
|
|
|
21ab4e |
+ if (!brick_status) {
|
|
|
21ab4e |
gf_log (this->name, GF_LOG_INFO,
|
|
|
21ab4e |
- "cleaning up dead brick %s:%s",
|
|
|
21ab4e |
+ "brick has not come up so cleaning up dead brick %s:%s",
|
|
|
21ab4e |
other_brick->hostname, other_brick->path);
|
|
|
21ab4e |
other_brick->status = GF_BRICK_STOPPED;
|
|
|
21ab4e |
- sys_unlink (pidfile2);
|
|
|
21ab4e |
+ if (pidfile2[0])
|
|
|
21ab4e |
+ sys_unlink (pidfile2);
|
|
|
21ab4e |
continue;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
return other_brick;
|
|
|
21ab4e |
@@ -5803,6 +5821,12 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
|
|
|
21ab4e |
brickinfo->path);
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
+ /* We need to set the status back to STARTING so that
|
|
|
21ab4e |
+ * while the other (re)start brick requests come in for
|
|
|
21ab4e |
+ * other bricks, this brick can be considered as
|
|
|
21ab4e |
+ * compatible.
|
|
|
21ab4e |
+ */
|
|
|
21ab4e |
+ brickinfo->status = GF_BRICK_STARTING;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
return 0;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
@@ -5846,7 +5870,8 @@ run:
|
|
|
21ab4e |
*
|
|
|
21ab4e |
* TBD: pray for GlusterD 2 to be ready soon.
|
|
|
21ab4e |
*/
|
|
|
21ab4e |
-
|
|
|
21ab4e |
+ gf_log (this->name, GF_LOG_INFO, "starting a fresh brick process for "
|
|
|
21ab4e |
+ "brick %s", brickinfo->path);
|
|
|
21ab4e |
ret = glusterd_volume_start_glusterfs (volinfo, brickinfo, wait);
|
|
|
21ab4e |
if (ret) {
|
|
|
21ab4e |
gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
21ab4e |
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
|
|
|
21ab4e |
index c493773..533f9a8 100644
|
|
|
21ab4e |
--- a/xlators/mgmt/glusterd/src/glusterd.h
|
|
|
21ab4e |
+++ b/xlators/mgmt/glusterd/src/glusterd.h
|
|
|
21ab4e |
@@ -226,6 +226,7 @@ struct glusterd_brickinfo {
|
|
|
21ab4e |
* a replica 3 volume with arbiter enabled.
|
|
|
21ab4e |
*/
|
|
|
21ab4e |
uint16_t group;
|
|
|
21ab4e |
+ gf_boolean_t port_registered;
|
|
|
21ab4e |
};
|
|
|
21ab4e |
|
|
|
21ab4e |
typedef struct glusterd_brickinfo glusterd_brickinfo_t;
|
|
|
21ab4e |
--
|
|
|
21ab4e |
1.8.3.1
|
|
|
21ab4e |
|