From d2787b90cdbddd7c866ca9b070fd190dfccb7b93 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Apr 05 2022 13:51:38 +0000 Subject: import glusterfs-6.0-61.el7 --- diff --git a/README.debrand b/README.debrand deleted file mode 100644 index 01c46d2..0000000 --- a/README.debrand +++ /dev/null @@ -1,2 +0,0 @@ -Warning: This package was configured for automatic debranding, but the changes -failed to apply. diff --git a/SOURCES/0481-RHGS-3.5.3-rebuild-to-ship-with-RHEL.patch b/SOURCES/0481-RHGS-3.5.3-rebuild-to-ship-with-RHEL.patch deleted file mode 100644 index dd9b0ab..0000000 --- a/SOURCES/0481-RHGS-3.5.3-rebuild-to-ship-with-RHEL.patch +++ /dev/null @@ -1,33 +0,0 @@ -From 346aa7cbc34b9bbbaca45180215a4d9ffd5055df Mon Sep 17 00:00:00 2001 -From: Rinku Kothiya -Date: Fri, 19 Feb 2021 06:19:07 +0000 -Subject: [PATCH 481/481] RHGS-3.5.3 rebuild to ship with RHEL. - -Label: DOWNSTREAM ONLY -BUG: 1930561 - -Change-Id: I9c7f30cc6bc616344b27072bfde056c7bba1e143 -Signed-off-by: Rinku Kothiya -Reviewed-on: https://code.engineering.redhat.com/gerrit/228413 -Tested-by: RHGS Build Bot -Reviewed-by: Sunil Kumar Heggodu Gopala Acharya ---- - glusterfs.spec.in | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/glusterfs.spec.in b/glusterfs.spec.in -index 30d7162..52f9b40 100644 ---- a/glusterfs.spec.in -+++ b/glusterfs.spec.in -@@ -1983,6 +1983,8 @@ fi - %endif - - %changelog -+* Fri Feb 19 2021 Rinku Kothiya -+- Build RGHS clients for RHEL (#1930561) - - * Mon May 11 2020 Sunny Kumar - - added requires policycoreutils-python-utils on rhel8 for geo-replication --- -1.8.3.1 - diff --git a/SOURCES/0481-Update-rfc.sh-to-rhgs-3.5.4.patch b/SOURCES/0481-Update-rfc.sh-to-rhgs-3.5.4.patch new file mode 100644 index 0000000..0ba12d2 --- /dev/null +++ b/SOURCES/0481-Update-rfc.sh-to-rhgs-3.5.4.patch @@ -0,0 +1,26 @@ +From 828be8e789db3c77587c708f930d7fe8c9456e3b Mon Sep 17 00:00:00 2001 +From: Rinku Kothiya +Date: Fri, 4 Dec 2020 05:18:45 +0530 +Subject: [PATCH 481/511] Update rfc.sh to rhgs-3.5.4 + +Signed-off-by: Rinku Kothiya +--- + rfc.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/rfc.sh b/rfc.sh +index 1dca29f..c0559b9 100755 +--- a/rfc.sh ++++ b/rfc.sh +@@ -18,7 +18,7 @@ done + shift $((OPTIND-1)) + + +-branch="rhgs-3.5.3"; ++branch="rhgs-3.5.4"; + + set_hooks_commit_msg() + { +-- +1.8.3.1 + diff --git a/SOURCES/0482-logger-Always-print-errors-in-english.patch b/SOURCES/0482-logger-Always-print-errors-in-english.patch new file mode 100644 index 0000000..e454bec --- /dev/null +++ b/SOURCES/0482-logger-Always-print-errors-in-english.patch @@ -0,0 +1,49 @@ +From e43af5b15d14e43c3201fd0fb7bf02663e3e0127 Mon Sep 17 00:00:00 2001 +From: Rinku Kothiya +Date: Sat, 7 Nov 2020 12:09:36 +0530 +Subject: [PATCH 482/511] logger: Always print errors in english + +Upstream: +> Reviewed-on: https://github.com/gluster/glusterfs/pull/1657 +> fixes: #1302 +> Change-Id: If0e21f016155276a953c64a8dd13ff3eb281d09d +> Signed-off-by: Rinku Kothiya + +BUG: 1896425 + +Change-Id: If0e21f016155276a953c64a8dd13ff3eb281d09d +Signed-off-by: Rinku Kothiya +Reviewed-on: https://code.engineering.redhat.com/gerrit/219999 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/logging.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/libglusterfs/src/logging.c b/libglusterfs/src/logging.c +index 7f0eff6..5874c34 100644 +--- a/libglusterfs/src/logging.c ++++ b/libglusterfs/src/logging.c +@@ -513,6 +513,7 @@ gf_openlog(const char *ident, int 
option, int facility) + { + int _option = option; + int _facility = facility; ++ char *language = NULL; + + if (-1 == _option) { + _option = LOG_PID | LOG_NDELAY; +@@ -522,7 +523,10 @@ gf_openlog(const char *ident, int option, int facility) + } + + /* TODO: Should check for errors here and return appropriately */ +- setlocale(LC_ALL, ""); ++ language = setlocale(LC_ALL, "en_US.UTF-8"); ++ if (!language) ++ setlocale(LC_ALL, ""); ++ + setlocale(LC_NUMERIC, "C"); /* C-locale for strtod, ... */ + /* close the previous syslog if open as we are changing settings */ + closelog(); +-- +1.8.3.1 + diff --git a/SOURCES/0483-afr-more-quorum-checks-in-lookup-and-new-entry-marki.patch b/SOURCES/0483-afr-more-quorum-checks-in-lookup-and-new-entry-marki.patch new file mode 100644 index 0000000..c0f2118 --- /dev/null +++ b/SOURCES/0483-afr-more-quorum-checks-in-lookup-and-new-entry-marki.patch @@ -0,0 +1,150 @@ +From 8c366f34a279a5ab2a6301bfd93534fe746a23e8 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Mon, 7 Dec 2020 09:53:27 +0530 +Subject: [PATCH 483/511] afr: more quorum checks in lookup and new entry + marking + +Problem: See upstream github issue for details. + +Fix: +-In lookup if the entry exists in 2 out of 3 bricks, don't fail the +lookup with ENOENT just because there is an entrylk on the parent. +Consider quorum before deciding. + +-If entry FOP does not succeed on quorum no. of bricks, do not perform +new entry mark. + +Upstream patch details: +> Reviewed-on: https://review.gluster.org/#/c/glusterfs/+/24499/ +> Fixes: #1303 +> Change-Id: I56df8c89ad53b29fa450c7930a7b7ccec9f4a6c5 +> Signed-off-by: Ravishankar N + +BUG: 1821599 +Change-Id: If513e8a7d6088a676288927630d8e616269bf5d5 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/220363 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + ...20-mark-dirty-for-entry-txn-on-quorum-failure.t | 2 -- + xlators/cluster/afr/src/afr-common.c | 24 ++++++++++++---------- + xlators/cluster/afr/src/afr-dir-write.c | 8 ++++++++ + xlators/cluster/afr/src/afr.h | 4 ++++ + 4 files changed, 25 insertions(+), 13 deletions(-) + +diff --git a/tests/bugs/replicate/bug-1586020-mark-dirty-for-entry-txn-on-quorum-failure.t b/tests/bugs/replicate/bug-1586020-mark-dirty-for-entry-txn-on-quorum-failure.t +index 26f9049..49c4dea 100644 +--- a/tests/bugs/replicate/bug-1586020-mark-dirty-for-entry-txn-on-quorum-failure.t ++++ b/tests/bugs/replicate/bug-1586020-mark-dirty-for-entry-txn-on-quorum-failure.t +@@ -53,8 +53,6 @@ TEST ! 
ls $B0/${V0}1/file$i + TEST ls $B0/${V0}2/file$i + dirty=$(get_hex_xattr trusted.afr.dirty $B0/${V0}2) + TEST [ "$dirty" != "000000000000000000000000" ] +-EXPECT "000000010000000100000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file$i +-EXPECT "000000010000000100000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file$i + + TEST $CLI volume set $V0 self-heal-daemon on + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 89e2483..851ccad 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -1236,7 +1236,7 @@ refresh_done: + return 0; + } + +-static void ++void + afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies) + { +@@ -2290,6 +2290,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) + 0, + }; + gf_boolean_t locked_entry = _gf_false; ++ gf_boolean_t in_flight_create = _gf_false; + gf_boolean_t can_interpret = _gf_true; + inode_t *parent = NULL; + ia_type_t ia_type = IA_INVAL; +@@ -2333,17 +2334,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) + if (!replies[i].valid) + continue; + +- if (locked_entry && replies[i].op_ret == -1 && +- replies[i].op_errno == ENOENT) { +- /* Second, check entry is still +- "underway" in creation */ +- local->op_ret = -1; +- local->op_errno = ENOENT; +- goto error; +- } +- +- if (replies[i].op_ret == -1) ++ if (replies[i].op_ret == -1) { ++ if (locked_entry && replies[i].op_errno == ENOENT) { ++ in_flight_create = _gf_true; ++ } + continue; ++ } + + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; +@@ -2353,6 +2349,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) + } + } + ++ if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) { ++ local->op_ret = -1; ++ local->op_errno = ENOENT; ++ goto error; ++ } ++ + if (read_subvol == -1) + goto error; + /* We now have a read_subvol, which is readable[] (if there +diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c +index 84e2a34..416c19d 100644 +--- a/xlators/cluster/afr/src/afr-dir-write.c ++++ b/xlators/cluster/afr/src/afr-dir-write.c +@@ -349,6 +349,7 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this) + afr_private_t *priv = NULL; + int pre_op_count = 0; + int failed_count = 0; ++ unsigned char *success_replies = NULL; + + local = frame->local; + priv = this->private; +@@ -364,9 +365,16 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this) + failed_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); + ++ /* FOP succeeded on all bricks. */ + if (pre_op_count == priv->child_count && !failed_count) + return; + ++ /* FOP did not suceed on quorum no. of bricks. 
*/
++    success_replies = alloca0(priv->child_count);
++    afr_fill_success_replies(local, priv, success_replies);
++    if (!afr_has_quorum(success_replies, this, NULL))
++        return;
++
+     if (priv->thin_arbiter_count) {
+         /*Mark new entry using ta file*/
+         local->is_new_entry = _gf_true;
+diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
+index ff96246..ed5096e 100644
+--- a/xlators/cluster/afr/src/afr.h
++++ b/xlators/cluster/afr/src/afr.h
+@@ -1334,4 +1334,8 @@ afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this);
+ 
+ void
+ afr_selfheal_childup(xlator_t *this, afr_private_t *priv);
++
++void
++afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
++                         unsigned char *replies);
+ #endif /* __AFR_H__ */
+--
+1.8.3.1
+
diff --git a/SOURCES/0484-glusterd-rebalance-status-displays-stats-as-0-after-.patch b/SOURCES/0484-glusterd-rebalance-status-displays-stats-as-0-after-.patch
new file mode 100644
index 0000000..56d4feb
--- /dev/null
+++ b/SOURCES/0484-glusterd-rebalance-status-displays-stats-as-0-after-.patch
@@ -0,0 +1,90 @@
+From 6c3b21ce5bb76b35856a6c270eb65d11f869061f Mon Sep 17 00:00:00 2001
+From: Sanju Rakonde
+Date: Fri, 26 Jun 2020 12:10:31 +0530
+Subject: [PATCH 484/511] glusterd: rebalance status displays stats as 0 after
+ reboot
+
+Problem: while the rebalance is in progress, if a node is
+rebooted, rebalance v status shows the stats of this node as
+0 once the node is back.
+
+Reason: when the node is rebooted, once it is back,
+glusterd_volume_defrag_restart() starts the rebalance and
+creates the rpc. But due to a race, the rebalance process
+sends a disconnect event, so the rpc object gets destroyed. As
+the rpc object is null, the request for fetching the latest stats
+is not sent to the rebalance process, and the stats are shown as
+the default values, which are 0.
+
+Solution: When the rpc object is null, we should create the rpc if
+the rebalance process is up, so that the request can be sent to the
+rebalance process using the rpc.
+
+>fixes: #1339
+>Change-Id: I1c7533fedd17dcaffc0f7a5a918c87356133a81c
+>Signed-off-by: Sanju Rakonde
+Upstream Patch : https://review.gluster.org/c/glusterfs/+/24641
+
+BUG: 1832306
+Change-Id: I1c7533fedd17dcaffc0f7a5a918c87356133a81c
+Signed-off-by: Srijan Sivakumar
+Reviewed-on: https://code.engineering.redhat.com/gerrit/220369
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/mgmt/glusterd/src/glusterd-syncop.c | 29 ++++++++++++++++++++---------
+ 1 file changed, 20 insertions(+), 9 deletions(-)
+
+diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c
+index c78983a..df78fef 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-syncop.c
++++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c
+@@ -1693,6 +1693,7 @@ gd_brick_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+     rpc_clnt_t *rpc = NULL;
+     dict_t *rsp_dict = NULL;
+     int32_t cmd = GF_OP_CMD_NONE;
++    glusterd_volinfo_t *volinfo = NULL;
+ 
+     this = THIS;
+     rsp_dict = dict_new();
+@@ -1724,18 +1725,28 @@ gd_brick_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+     cds_list_for_each_entry_safe(pending_node, tmp, &selected, list)
+     {
+         rpc = glusterd_pending_node_get_rpc(pending_node);
++        /* In the case of rebalance if the rpc object is null, we try to
++         * create the rpc object. if the rebalance daemon is down, it returns
++         * -1. otherwise, rpc object will be created and referenced.
++         */
+         if (!rpc) {
+-            if (pending_node->type == GD_NODE_REBALANCE) {
+-                ret = 0;
+-                glusterd_defrag_volume_node_rsp(req_dict, NULL, op_ctx);
++            if (pending_node->type == GD_NODE_REBALANCE && pending_node->node) {
++                volinfo = pending_node->node;
++                ret = glusterd_rebalance_rpc_create(volinfo);
++                if (ret) {
++                    ret = 0;
++                    glusterd_defrag_volume_node_rsp(req_dict, NULL, op_ctx);
++                    goto out;
++                } else {
++                    rpc = glusterd_defrag_rpc_get(volinfo->rebal.defrag);
++                }
++            } else {
++                ret = -1;
++                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_FAILURE,
++                       "Brick Op failed "
++                       "due to rpc failure.");
+                 goto out;
+             }
+-
+-            ret = -1;
+-            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_FAILURE,
+-                   "Brick Op failed "
+-                   "due to rpc failure.");
+-            goto out;
+         }
+ 
+         /* Redirect operation to be detach tier via rebalance flow. */
+--
+1.8.3.1
+
diff --git a/SOURCES/0485-cli-rpc-conditional-init-of-global-quota-rpc-1578.patch b/SOURCES/0485-cli-rpc-conditional-init-of-global-quota-rpc-1578.patch
new file mode 100644
index 0000000..6ed4f1c
--- /dev/null
+++ b/SOURCES/0485-cli-rpc-conditional-init-of-global-quota-rpc-1578.patch
@@ -0,0 +1,87 @@
+From 2e6a5e504e66bc95208420e4882e453a53ac9ea2 Mon Sep 17 00:00:00 2001
+From: schaffung
+Date: Mon, 2 Nov 2020 11:18:01 +0530
+Subject: [PATCH 485/511] cli-rpc: conditional init of global quota rpc (#1578)
+
+Issue: It seems that the initialization of the rpc to
+connect with quotad is done in every glusterfs cli command,
+irrespective of whether the quota feature is enabled or disabled.
+This seems to be overkill.
+
+Code change: The presence of the file /var/run/quotad/quotad.pid
+signals that quotad is enabled. Hence we can put a conditional
+check for whether this file exists, and if it doesn't, we
+just skip over the initialization of the global quotad rpc.
+
+This will go on to reduce the extra rpc calls and operations
+being performed in kernel space.
+
+>Fixes: #1577
+>Change-Id: Icb69d35330f76ce95626f59af75a12726eb620ff
+>Signed-off-by: srijan-sivakumar
+Upstream Patch : https://github.com/gluster/glusterfs/pull/1578
+
+BUG: 1885966
+Change-Id: Icb69d35330f76ce95626f59af75a12726eb620ff
+Signed-off-by: Srijan Sivakumar
+Reviewed-on: https://code.engineering.redhat.com/gerrit/220371
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ cli/src/cli.c | 18 +++++++++++++-----
+ cli/src/cli.h |  3 +++
+ 2 files changed, 16 insertions(+), 5 deletions(-)
+
+diff --git a/cli/src/cli.c b/cli/src/cli.c
+index 99a16a0..a76c5a2 100644
+--- a/cli/src/cli.c
++++ b/cli/src/cli.c
+@@ -64,8 +64,7 @@
+ extern int connected;
+ /* using argp for command line parsing */
+ 
+-const char *argp_program_version =
+-    PACKAGE_NAME" "PACKAGE_VERSION;
++const char *argp_program_version = PACKAGE_NAME " " PACKAGE_VERSION;
+ const char *argp_program_bug_address = "<" PACKAGE_BUGREPORT ">";
+ 
+ struct rpc_clnt *global_quotad_rpc;
+@@ -840,9 +839,18 @@ main(int argc, char *argv[])
+     if (!global_rpc)
+         goto out;
+ 
+-    global_quotad_rpc = cli_quotad_clnt_rpc_init();
+-    if (!global_quotad_rpc)
+-        goto out;
++    /*
++     * Now, one doesn't need to initialize global rpc
++     * for quota unless and until quota is enabled.
++     * So why not put a check to save all the rpc related
++     * ops here.
++     */
++    ret = sys_access(QUOTAD_PID_PATH, F_OK);
++    if (!ret) {
++        global_quotad_rpc = cli_quotad_clnt_rpc_init();
++        if (!global_quotad_rpc)
++            goto out;
++    }
+ 
+     ret = cli_cmds_register(&state);
+     if (ret)
+diff --git a/cli/src/cli.h b/cli/src/cli.h
+index 37e4d9d..c30ae9c 100644
+--- a/cli/src/cli.h
++++ b/cli/src/cli.h
+@@ -30,6 +30,9 @@
+ #define CLI_TAB_LENGTH 8
+ #define CLI_BRICK_STATUS_LINE_LEN 78
+ 
++// Quotad pid path.
++#define QUOTAD_PID_PATH "/var/run/gluster/quotad/quotad.pid"
++
+ /* Geo-rep command positional arguments' index */
+ #define GEO_REP_CMD_INDEX 1
+ #define GEO_REP_CMD_CONFIG_INDEX 4
+--
+1.8.3.1
+
diff --git a/SOURCES/0486-glusterd-brick-sock-file-deleted-log-error-1560.patch b/SOURCES/0486-glusterd-brick-sock-file-deleted-log-error-1560.patch
new file mode 100644
index 0000000..60750db
--- /dev/null
+++ b/SOURCES/0486-glusterd-brick-sock-file-deleted-log-error-1560.patch
@@ -0,0 +1,87 @@
+From 9b19d4841fc3002d30ec3e44c85ec37682c11bfb Mon Sep 17 00:00:00 2001
+From: schaffung
+Date: Thu, 22 Oct 2020 13:07:09 +0530
+Subject: [PATCH 486/511] glusterd: brick sock file deleted, log error (#1560)
+
+Issue: The status of the brick as tracked by glusterd is
+stopped if the socket file corresponding to a running
+brick process is absent in /var/run/gluster. The glusterd
+keeps on trying to reconnect (rpc layer) but it fails.
+
+Code change: Rather than registering the rpc connection
+with the given sockfilepath, which is not even present,
+and endlessly reconnecting, log this as an error and do
+not try to reconnect using the non-existent sock file
+path.
+
+>Fixes: #1526
+>Change-Id: I6c81691ab1624c66dec74f5ffcc6c383201ac757
+>Signed-off-by: srijan-sivakumar
+Upstream Patch : https://github.com/gluster/glusterfs/pull/1560
+
+BUG: 1882923
+Change-Id: I6c81691ab1624c66dec74f5ffcc6c383201ac757
+Signed-off-by: Srijan Sivakumar
+Reviewed-on: https://code.engineering.redhat.com/gerrit/220376
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/mgmt/glusterd/src/glusterd-utils.c | 27 +++++++++++++++++++++++++--
+ 1 file changed, 25 insertions(+), 2 deletions(-)
+
+diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
+index d25fc8a..a72c494 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
+@@ -6310,7 +6310,7 @@ find_compatible_brick(glusterd_conf_t *conf, glusterd_volinfo_t *volinfo,
+    check if passed pid is match with running glusterfs process
+ */
+ 
+-int
++static int
+ glusterd_get_sock_from_brick_pid(int pid, char *sockpath, size_t len)
+ {
+     char fname[128] = "";
+@@ -6383,7 +6383,17 @@ glusterd_get_sock_from_brick_pid(int pid, char *sockpath, size_t len)
+ 
+     if (tmpsockpath[0]) {
+         strncpy(sockpath, tmpsockpath, i);
+-        ret = 0;
++        /*
++         * Condition to check if the brick socket file is present
++         * in the stated path or not. This helps in preventing
++         * constant re-connect triggered in the RPC layer and also
++         * a log message would help out the user.
++         */
++        ret = sys_access(sockpath, F_OK);
++        if (ret) {
++            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_NOT_FOUND,
++                    "%s not found", sockpath, NULL);
++        }
+     }
+ 
+     return ret;
+@@ -6581,7 +6591,20 @@ glusterd_brick_start(glusterd_volinfo_t *volinfo,
+         if (!is_brick_mx_enabled()) {
+             glusterd_set_brick_socket_filepath(
+                 volinfo, brickinfo, socketpath, sizeof(socketpath));
++            /*
++             * Condition to check if the brick socket file is present
++             * in the stated path or not. This helps in preventing
++             * constant re-connect triggered in the RPC layer and also
++             * a log message would help out the user.
++             */
++            ret = sys_access(socketpath, F_OK);
++            if (ret) {
++                gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_NOT_FOUND,
++                        "%s not found", socketpath, NULL);
++                goto out;
++            }
+         }
++
+         gf_log(this->name, GF_LOG_DEBUG,
+                "Using %s as sockfile for brick %s of volume %s ",
+                socketpath, brickinfo->path, volinfo->volname);
+--
+1.8.3.1
+
diff --git a/SOURCES/0487-Events-Log-file-not-re-opened-after-logrotate.patch b/SOURCES/0487-Events-Log-file-not-re-opened-after-logrotate.patch
new file mode 100644
index 0000000..ac0d1cc
--- /dev/null
+++ b/SOURCES/0487-Events-Log-file-not-re-opened-after-logrotate.patch
@@ -0,0 +1,56 @@
+From c961ee1d7c1abb2552b79ed39ed7fd1bd1b3962f Mon Sep 17 00:00:00 2001
+From: srijan-sivakumar
+Date: Fri, 7 Aug 2020 15:02:07 +0530
+Subject: [PATCH 487/511] Events: Log file not re-opened after logrotate.
+
+Issue: The logging is being done in the same file
+even after the logrotate utility has changed the file.
+This causes the logfile to grow indefinitely.
+
+Code Changes: Using the WatchedFileHandler class instead
+of the FileHandler class. This watches the file it is logging
+into, and if the file changes, it is closed and reopened
+using the file name. Hence after a file rotation, a new file
+will be used for logging instead of continuing with
+the same old file.
+
+>Fixes: #1289
+>Change-Id: I773d04f17613a03709cb682692efb39fd8e664e2
+>Signed-off-by: srijan-sivakumar
+Upstream Patch : https://review.gluster.org/c/glusterfs/+/24820
+
+BUG: 1814744
+Change-Id: I773d04f17613a03709cb682692efb39fd8e664e2
+Signed-off-by: srijan-sivakumar
+Reviewed-on: https://code.engineering.redhat.com/gerrit/220370
+Reviewed-by: Shwetha Acharya
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ events/src/utils.py | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/events/src/utils.py b/events/src/utils.py
+index 38b707a..6d4e079 100644
+--- a/events/src/utils.py
++++ b/events/src/utils.py
+@@ -13,6 +13,7 @@ import sys
+ import json
+ import os
+ import logging
++import logging.handlers
+ import fcntl
+ from errno import EBADF
+ from threading import Thread
+@@ -98,7 +99,7 @@ def setup_logger():
+     logger.setLevel(logging.INFO)
+ 
+     # create the logging file handler
+-    fh = logging.FileHandler(LOG_FILE)
++    fh = logging.handlers.WatchedFileHandler(LOG_FILE)
+ 
+     formatter = logging.Formatter("[%(asctime)s] %(levelname)s "
+                                   "[%(module)s - %(lineno)s:%(funcName)s] "
+--
+1.8.3.1
+
diff --git a/SOURCES/0488-glusterd-afr-enable-granular-entry-heal-by-default.patch b/SOURCES/0488-glusterd-afr-enable-granular-entry-heal-by-default.patch
new file mode 100644
index 0000000..310bc53
--- /dev/null
+++ b/SOURCES/0488-glusterd-afr-enable-granular-entry-heal-by-default.patch
@@ -0,0 +1,864 @@
+From 0502383024cbf7e4776816e0a992dccc484a3cf2 Mon Sep 17 00:00:00 2001
+From: Ravishankar N
+Date: Tue, 8 Dec 2020 17:23:22 +0530
+Subject: [PATCH 488/511] glusterd/afr: enable granular-entry-heal by default
+
+XXXXXXXXXXXXXXXXXXX
+ IMPORTANT:
+XXXXXXXXXXXXXXXXXXXX
+I see that for rhgs-3.5.3, GD_OP_VERSION_MAX is GD_OP_VERSION_7_0. Since
+this patch should only act on new volumes in rhgs-3.5.4, I am bumping
+the op-version to GD_OP_VERSION_7_1. In glusterfs upstream, the patch
+acts only if op-version >= GD_OP_VERSION_9_0 as seen in the commit
+message below.
+
+Upstream patch details:
+/------------------------------------------------------------------------------/
+1. The option has been enabled and tested for quite some time now in RHHI-V
+downstream and I think it is safe to make it 'on' by default. Since it
+is not possible to simply change it from 'off' to 'on' without breaking
+rolling upgrades, old clients etc., I have made it default only for new volumes
+starting from op-version GD_OP_VERSION_9_0.
+
+Note: If you do a volume reset, the option will be turned back off.
+This is okay as the dir's gfid will be captured in 'xattrop' folder and heals
+will proceed. There might be stale entries inside entry-changes' folder,
+which will be removed when we enable the option again.
+
+2. I encountered a customer issue where entry heal was pending on a dir. with
+236436 files in it and the glustershd.log output was just stuck at
+"performing entry selfheal", so I have added logs to give us
+more info in DEBUG level about whether entry heal and data heal are
+progressing (metadata heal doesn't take much time). That way, we have a
+quick visual indication to say things are not 'stuck' if we briefly
+enable debug logs, instead of taking statedumps or checking profile info
+etc.
+ +>Fixes: #1483 +>Change-Id: I4f116f8c92f8cd33f209b758ff14f3c7e1981422 +>Signed-off-by: Ravishankar N +Upstream Patch: https://github.com/gluster/glusterfs/pull/1621 +/------------------------------------------------------------------------------/ + +BUG: 1890506 +Change-Id: If449a1e873633616cfc508d74b5c22eb434b55ae +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/220555 +Tested-by: Sunil Kumar Heggodu Gopala Acharya +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/globals.h | 4 +- + libglusterfs/src/syncop-utils.c | 4 +- + tests/basic/afr/add-brick-self-heal-non-granular.t | 75 +++++++++++++ + tests/basic/afr/add-brick-self-heal.t | 4 +- + tests/basic/afr/bug-1130892-non-granular.t | 77 ++++++++++++++ + .../basic/afr/bug-1493415-gfid-heal-non-granular.t | 79 ++++++++++++++ + ...507-type-mismatch-error-handling-non-granular.t | 117 +++++++++++++++++++++ + ...1749322-entry-heal-not-happening-non-granular.t | 90 ++++++++++++++++ + .../afr/replace-brick-self-heal-non-granular.t | 65 ++++++++++++ + tests/basic/afr/replace-brick-self-heal.t | 2 +- + tests/bugs/replicate/bug-1130892.t | 2 +- + tests/bugs/replicate/bug-1493415-gfid-heal.t | 2 +- + .../bug-1722507-type-mismatch-error-handling.t | 26 +++-- + .../bug-1749322-entry-heal-not-happening.t | 7 +- + xlators/cluster/afr/src/afr-self-heal-common.c | 5 + + xlators/cluster/afr/src/afr-self-heal-data.c | 3 + + xlators/cluster/afr/src/afr-self-heal-entry.c | 7 +- + xlators/mgmt/glusterd/src/glusterd-utils.c | 13 +++ + 18 files changed, 558 insertions(+), 24 deletions(-) + create mode 100644 tests/basic/afr/add-brick-self-heal-non-granular.t + create mode 100644 tests/basic/afr/bug-1130892-non-granular.t + create mode 100644 tests/basic/afr/bug-1493415-gfid-heal-non-granular.t + create mode 100644 tests/basic/afr/bug-1722507-type-mismatch-error-handling-non-granular.t + create mode 100644 tests/basic/afr/bug-1749322-entry-heal-not-happening-non-granular.t + create mode 100644 tests/basic/afr/replace-brick-self-heal-non-granular.t + +diff --git a/libglusterfs/src/glusterfs/globals.h b/libglusterfs/src/glusterfs/globals.h +index 31717ed..cc145cd 100644 +--- a/libglusterfs/src/glusterfs/globals.h ++++ b/libglusterfs/src/glusterfs/globals.h +@@ -50,7 +50,7 @@ + 1 /* MIN is the fresh start op-version, mostly \ + should not change */ + #define GD_OP_VERSION_MAX \ +- GD_OP_VERSION_7_0 /* MAX VERSION is the maximum \ ++ GD_OP_VERSION_7_1 /* MAX VERSION is the maximum \ + count in VME table, should \ + keep changing with \ + introduction of newer \ +@@ -138,6 +138,8 @@ + + #define GD_OP_VERSION_7_0 70000 /* Op-version for GlusterFS 7.0 */ + ++#define GD_OP_VERSION_7_1 70100 /* Op-version for GlusterFS 7.1 */ ++ + #include "glusterfs/xlator.h" + #include "glusterfs/options.h" + +diff --git a/libglusterfs/src/syncop-utils.c b/libglusterfs/src/syncop-utils.c +index be03527..2269c76 100644 +--- a/libglusterfs/src/syncop-utils.c ++++ b/libglusterfs/src/syncop-utils.c +@@ -495,9 +495,7 @@ syncop_dir_scan(xlator_t *subvol, loc_t *loc, int pid, void *data, + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + +- ret = fn(subvol, entry, loc, data); +- if (ret) +- break; ++ ret |= fn(subvol, entry, loc, data); + } + gf_dirent_free(&entries); + if (ret) +diff --git a/tests/basic/afr/add-brick-self-heal-non-granular.t b/tests/basic/afr/add-brick-self-heal-non-granular.t +new file mode 100644 +index 0000000..19caf24 +--- /dev/null ++++ 
b/tests/basic/afr/add-brick-self-heal-non-granular.t +@@ -0,0 +1,75 @@ ++#!/bin/bash ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} ++EXPECT 'Created' volinfo_field $V0 'Status'; ++TEST $CLI volume set $V0 cluster.granular-entry-heal off ++TEST $CLI volume start $V0 ++EXPECT 'Started' volinfo_field $V0 'Status'; ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 ++ ++TEST $CLI volume set $V0 cluster.data-self-heal off ++TEST $CLI volume set $V0 cluster.metadata-self-heal off ++TEST $CLI volume set $V0 cluster.entry-self-heal off ++TEST $CLI volume set $V0 cluster.heal-timeout 5 ++ ++TEST $CLI volume set $V0 self-heal-daemon off ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++ ++# Create files ++for i in {1..5} ++do ++ echo $i > $M0/file$i.txt ++done ++ ++# Metadata changes ++TEST setfattr -n user.test -v qwerty $M0/file5.txt ++ ++# Add brick1 ++TEST $CLI volume add-brick $V0 replica 3 $H0:$B0/${V0}2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 ++ ++# New-brick should accuse the old-bricks (Simulating case for data-loss) ++TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/${V0}2/ ++TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 $B0/${V0}2/ ++ ++# Check if pending xattr and dirty-xattr are set for newly-added-brick ++EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}0 ++EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}1 ++EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.dirty $B0/${V0}2 ++ ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2 ++ ++TEST $CLI volume set $V0 self-heal-daemon on ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++TEST $CLI volume heal $V0 ++ ++# Wait for heal to complete ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++ ++# Check if entry-heal has happened ++TEST diff <(ls $B0/${V0}0 | sort) <(ls $B0/${V0}2 | sort) ++TEST diff <(ls $B0/${V0}1 | sort) <(ls $B0/${V0}2 | sort) ++ ++# Test if data was healed ++TEST diff $B0/${V0}0/file1.txt $B0/${V0}2/file1.txt ++ ++# Test if metadata was healed and exists on both the bricks ++EXPECT "qwerty" get_text_xattr user.test $B0/${V0}2/file5.txt ++EXPECT "qwerty" get_text_xattr user.test $B0/${V0}0/file5.txt ++ ++EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}0 ++EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}1 ++EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.dirty $B0/${V0}2 ++ ++cleanup; +diff --git a/tests/basic/afr/add-brick-self-heal.t b/tests/basic/afr/add-brick-self-heal.t +index c847e22..7ebf4f6 100644 +--- a/tests/basic/afr/add-brick-self-heal.t ++++ b/tests/basic/afr/add-brick-self-heal.t +@@ -38,8 +38,8 @@ TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/${V0 + TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 
$B0/${V0}2/ + + # Check if pending xattr and dirty-xattr are set for newly-added-brick +-EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}0 +-EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}1 ++EXPECT "000000010000000100000001" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}0 ++EXPECT "000000010000000100000001" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}1 + EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.dirty $B0/${V0}2 + + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +diff --git a/tests/basic/afr/bug-1130892-non-granular.t b/tests/basic/afr/bug-1130892-non-granular.t +new file mode 100644 +index 0000000..3cdbc7d +--- /dev/null ++++ b/tests/basic/afr/bug-1130892-non-granular.t +@@ -0,0 +1,77 @@ ++#!/bin/bash ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../afr.rc ++ ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume info; ++ ++# Create a 1X2 replica ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}-{0,1} ++EXPECT 'Created' volinfo_field $V0 'Status'; ++TEST $CLI volume set $V0 cluster.granular-entry-heal off ++ ++# Disable self-heal daemon ++TEST gluster volume set $V0 self-heal-daemon off ++ ++# Enable Client side heal ++TEST $CLI volume set $V0 cluster.data-self-heal off ++TEST $CLI volume set $V0 cluster.metadata-self-heal off ++TEST $CLI volume set $V0 cluster.entry-self-heal off ++ ++# Disable all perf-xlators ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.stat-prefetch off ++TEST $CLI volume set $V0 performance.read-ahead off ++ ++# Volume start ++TEST $CLI volume start $V0; ++EXPECT 'Started' volinfo_field $V0 'Status'; ++ ++# FUSE Mount ++TEST ${GFS} -s $H0 --volfile-id $V0 $M0 ++ ++# Create files and dirs ++TEST mkdir -p $M0/one/two/ ++TEST `echo "Carpe diem" > $M0/one/two/three` ++ ++# Simulate disk-replacement ++TEST kill_brick $V0 $H0 $B0/${V0}-1 ++EXPECT_WITHIN ${PROCESS_DOWN_TIMEOUT} "^0$" afr_child_up_status $V0 1 ++TEST rm -rf $B0/${V0}-1/one ++TEST rm -rf $B0/${V0}-1/.glusterfs ++ ++#Ideally, disk replacement is done using reset-brick or replace-brick gluster CLI ++#which will create .glusterfs folder. 
++mkdir $B0/${V0}-1/.glusterfs && chmod 600 $B0/${V0}-1/.glusterfs ++ ++# Start force ++TEST $CLI volume start $V0 force ++ ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++ ++TEST stat $M0/one ++ ++sleep 1 ++ ++# Check pending xattrs ++EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}-0/one trusted.afr.$V0-client-1 data ++EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}-0/one trusted.afr.$V0-client-1 entry ++EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}-0/one trusted.afr.$V0-client-1 metadata ++ ++TEST gluster volume set $V0 self-heal-daemon on ++ ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "Y" is_dir_heal_done $B0/${V0}-0 $B0/${V0}-1 one ++EXPECT_WITHIN $HEAL_TIMEOUT "Y" is_dir_heal_done $B0/${V0}-0 $B0/${V0}-1 one/two ++EXPECT_WITHIN $HEAL_TIMEOUT "Y" is_file_heal_done $B0/${V0}-0 $B0/${V0}-1 one/two/three ++ ++cleanup; +diff --git a/tests/basic/afr/bug-1493415-gfid-heal-non-granular.t b/tests/basic/afr/bug-1493415-gfid-heal-non-granular.t +new file mode 100644 +index 0000000..aff001c +--- /dev/null ++++ b/tests/basic/afr/bug-1493415-gfid-heal-non-granular.t +@@ -0,0 +1,79 @@ ++#!/bin/bash ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../afr.rc ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} ++TEST $CLI volume set $V0 cluster.granular-entry-heal off ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0; ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++TEST $CLI volume set $V0 self-heal-daemon off ++ ++# Create base entry in indices/xattrop ++echo "Data" > $M0/FILE ++ ++#------------------------------------------------------------------------------# ++TEST touch $M0/f1 ++gfid_f1=$(gf_get_gfid_xattr $B0/${V0}0/f1) ++gfid_str_f1=$(gf_gfid_xattr_to_str $gfid_f1) ++ ++# Remove gfid xattr and .glusterfs hard link from 2nd brick. This simulates a ++# brick crash at the point where file got created but no xattrs were set. ++TEST setfattr -x trusted.gfid $B0/${V0}1/f1 ++TEST rm $B0/${V0}1/.glusterfs/${gfid_str_f1:0:2}/${gfid_str_f1:2:2}/$gfid_str_f1 ++ ++# storage/posix considers that a file without gfid changed less than a second ++# before doesn't exist, so we need to wait for a second to force posix to ++# consider that this is a valid file but without gfid. ++sleep 2 ++ ++# Assume there were no pending xattrs on parent dir due to 1st brick crashing ++# too. Then name heal from client must heal the gfid. 
++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0; ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++TEST stat $M0/f1 ++EXPECT "$gfid_f1" gf_get_gfid_xattr $B0/${V0}1/f1 ++TEST stat $B0/${V0}1/.glusterfs/${gfid_str_f1:0:2}/${gfid_str_f1:2:2}/$gfid_str_f1 ++ ++#------------------------------------------------------------------------------# ++TEST mkdir $M0/dir ++TEST touch $M0/dir/f2 ++gfid_f2=$(gf_get_gfid_xattr $B0/${V0}0/dir/f2) ++gfid_str_f2=$(gf_gfid_xattr_to_str $gfid_f2) ++ ++# Remove gfid xattr and .glusterfs hard link from 2nd brick. This simulates a ++# brick crash at the point where file got created but no xattrs were set. ++TEST setfattr -x trusted.gfid $B0/${V0}1/dir/f2 ++TEST rm $B0/${V0}1/.glusterfs/${gfid_str_f2:0:2}/${gfid_str_f2:2:2}/$gfid_str_f2 ++ ++#Now simulate setting of pending entry xattr on parent dir of 1st brick. ++TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 $B0/${V0}0/dir ++create_brick_xattrop_entry $B0/${V0}0 dir ++ ++# storage/posix considers that a file without gfid changed less than a second ++# before doesn't exist, so we need to wait for a second to force posix to ++# consider that this is a valid file but without gfid. ++sleep 2 ++ ++#Trigger entry-heal via shd ++TEST $CLI volume set $V0 self-heal-daemon on ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++ ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++ ++EXPECT "$gfid_f2" gf_get_gfid_xattr $B0/${V0}1/dir/f2 ++TEST stat $B0/${V0}1/.glusterfs/${gfid_str_f2:0:2}/${gfid_str_f2:2:2}/$gfid_str_f2 ++ ++#------------------------------------------------------------------------------# ++cleanup; +diff --git a/tests/basic/afr/bug-1722507-type-mismatch-error-handling-non-granular.t b/tests/basic/afr/bug-1722507-type-mismatch-error-handling-non-granular.t +new file mode 100644 +index 0000000..9079c93 +--- /dev/null ++++ b/tests/basic/afr/bug-1722507-type-mismatch-error-handling-non-granular.t +@@ -0,0 +1,117 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../afr.rc ++ ++cleanup; ++ ++## Start and create a volume ++TEST glusterd; ++TEST pidof glusterd; ++TEST $CLI volume info; ++ ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}; ++TEST $CLI volume set $V0 cluster.granular-entry-heal off ++TEST $CLI volume start $V0; ++TEST $CLI volume set $V0 cluster.heal-timeout 5 ++TEST $CLI volume heal $V0 disable ++EXPECT 'Started' volinfo_field $V0 'Status'; ++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 ++ ++TEST mkdir $M0/dir ++ ++########################################################################################## ++# GFID link file and the GFID is missing on one brick and all the bricks are being blamed. ++ ++TEST touch $M0/dir/file ++TEST `echo append>> $M0/dir/file` ++ ++#B0 and B2 must blame B1 ++setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/$V0"2"/dir ++setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 $B0/$V0"0"/dir ++setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000000000001 $B0/$V0"0"/dir ++ ++# Add entry to xattrop dir to trigger index heal. 
++xattrop_dir0=$(afr_get_index_path $B0/$V0"0") ++base_entry_b0=`ls $xattrop_dir0` ++gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/)) ++ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str ++EXPECT "^1$" get_pending_heal_count $V0 ++ ++# Remove the gfid xattr and the link file on one brick. ++gfid_file=$(gf_get_gfid_xattr $B0/$V0"0"/dir/file) ++gfid_str_file=$(gf_gfid_xattr_to_str $gfid_file) ++TEST setfattr -x trusted.gfid $B0/${V0}0/dir/file ++TEST rm -f $B0/${V0}0/.glusterfs/${gfid_str_file:0:2}/${gfid_str_file:2:2}/$gfid_str_file ++ ++# Launch heal ++TEST $CLI volume heal $V0 enable ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2 ++ ++# Wait for 2 second to force posix to consider that this is a valid file but ++# without gfid. ++sleep 2 ++TEST $CLI volume heal $V0 ++ ++# Heal should not fail as the file is missing gfid xattr and the link file, ++# which is not actually the gfid or type mismatch. ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++ ++EXPECT "$gfid_file" gf_get_gfid_xattr $B0/${V0}0/dir/file ++TEST stat $B0/${V0}0/.glusterfs/${gfid_str_file:0:2}/${gfid_str_file:2:2}/$gfid_str_file ++rm -f $M0/dir/file ++ ++ ++########################################################################################### ++# GFID link file and the GFID is missing on two bricks and all the bricks are being blamed. ++ ++TEST $CLI volume heal $V0 disable ++TEST touch $M0/dir/file ++#TEST kill_brick $V0 $H0 $B0/$V0"1" ++ ++#B0 and B2 must blame B1 ++setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/$V0"2"/dir ++setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 $B0/$V0"0"/dir ++setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000000000001 $B0/$V0"0"/dir ++ ++# Add entry to xattrop dir to trigger index heal. ++xattrop_dir0=$(afr_get_index_path $B0/$V0"0") ++base_entry_b0=`ls $xattrop_dir0` ++gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/)) ++ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str ++EXPECT "^1$" get_pending_heal_count $V0 ++ ++# Remove the gfid xattr and the link file on two bricks. ++gfid_file=$(gf_get_gfid_xattr $B0/$V0"0"/dir/file) ++gfid_str_file=$(gf_gfid_xattr_to_str $gfid_file) ++TEST setfattr -x trusted.gfid $B0/${V0}0/dir/file ++TEST rm -f $B0/${V0}0/.glusterfs/${gfid_str_file:0:2}/${gfid_str_file:2:2}/$gfid_str_file ++TEST setfattr -x trusted.gfid $B0/${V0}1/dir/file ++TEST rm -f $B0/${V0}1/.glusterfs/${gfid_str_file:0:2}/${gfid_str_file:2:2}/$gfid_str_file ++ ++# Launch heal ++TEST $CLI volume heal $V0 enable ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2 ++ ++# Wait for 2 second to force posix to consider that this is a valid file but ++# without gfid. ++sleep 2 ++TEST $CLI volume heal $V0 ++ ++# Heal should not fail as the file is missing gfid xattr and the link file, ++# which is not actually the gfid or type mismatch. 
++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++ ++EXPECT "$gfid_file" gf_get_gfid_xattr $B0/${V0}0/dir/file ++TEST stat $B0/${V0}0/.glusterfs/${gfid_str_file:0:2}/${gfid_str_file:2:2}/$gfid_str_file ++EXPECT "$gfid_file" gf_get_gfid_xattr $B0/${V0}1/dir/file ++TEST stat $B0/${V0}1/.glusterfs/${gfid_str_file:0:2}/${gfid_str_file:2:2}/$gfid_str_file ++ ++cleanup +diff --git a/tests/basic/afr/bug-1749322-entry-heal-not-happening-non-granular.t b/tests/basic/afr/bug-1749322-entry-heal-not-happening-non-granular.t +new file mode 100644 +index 0000000..4f27da4 +--- /dev/null ++++ b/tests/basic/afr/bug-1749322-entry-heal-not-happening-non-granular.t +@@ -0,0 +1,90 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../afr.rc ++ ++cleanup ++ ++function check_gfid_and_link_count ++{ ++ local file=$1 ++ ++ file_gfid_b0=$(gf_get_gfid_xattr $B0/${V0}0/$file) ++ TEST [ ! -z $file_gfid_b0 ] ++ file_gfid_b1=$(gf_get_gfid_xattr $B0/${V0}1/$file) ++ file_gfid_b2=$(gf_get_gfid_xattr $B0/${V0}2/$file) ++ EXPECT $file_gfid_b0 echo $file_gfid_b1 ++ EXPECT $file_gfid_b0 echo $file_gfid_b2 ++ ++ EXPECT "2" stat -c %h $B0/${V0}0/$file ++ EXPECT "2" stat -c %h $B0/${V0}1/$file ++ EXPECT "2" stat -c %h $B0/${V0}2/$file ++} ++TESTS_EXPECTED_IN_LOOP=18 ++ ++################################################################################ ++## Start and create a volume ++TEST glusterd; ++TEST pidof glusterd; ++TEST $CLI volume info; ++ ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}; ++TEST $CLI volume set $V0 cluster.granular-entry-heal off ++TEST $CLI volume start $V0; ++TEST $CLI volume set $V0 cluster.heal-timeout 5 ++TEST $CLI volume heal $V0 disable ++EXPECT 'Started' volinfo_field $V0 'Status'; ++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 ++ ++TEST mkdir $M0/dir ++TEST `echo "File 1 " > $M0/dir/file1` ++TEST touch $M0/dir/file{2..4} ++ ++# Remove file2 from 1st & 3rd bricks ++TEST rm -f $B0/$V0"0"/dir/file2 ++TEST rm -f $B0/$V0"2"/dir/file2 ++ ++# Remove file3 and the .glusterfs hardlink from 1st & 2nd bricks ++gfid_file3=$(gf_get_gfid_xattr $B0/$V0"0"/dir/file3) ++gfid_str_file3=$(gf_gfid_xattr_to_str $gfid_file3) ++TEST rm $B0/$V0"0"/.glusterfs/${gfid_str_file3:0:2}/${gfid_str_file3:2:2}/$gfid_str_file3 ++TEST rm $B0/$V0"1"/.glusterfs/${gfid_str_file3:0:2}/${gfid_str_file3:2:2}/$gfid_str_file3 ++TEST rm -f $B0/$V0"0"/dir/file3 ++TEST rm -f $B0/$V0"1"/dir/file3 ++ ++# Remove the .glusterfs hardlink and the gfid xattr of file4 on 3rd brick ++gfid_file4=$(gf_get_gfid_xattr $B0/$V0"0"/dir/file4) ++gfid_str_file4=$(gf_gfid_xattr_to_str $gfid_file4) ++TEST rm $B0/$V0"2"/.glusterfs/${gfid_str_file4:0:2}/${gfid_str_file4:2:2}/$gfid_str_file4 ++TEST setfattr -x trusted.gfid $B0/$V0"2"/dir/file4 ++ ++# B0 and B2 blame each other ++setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/$V0"2"/dir ++setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000000000001 $B0/$V0"0"/dir ++ ++# Add entry to xattrop dir on first brick. 
++xattrop_dir0=$(afr_get_index_path $B0/$V0"0") ++base_entry_b0=`ls $xattrop_dir0` ++gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/)) ++TEST ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str ++ ++EXPECT "^1$" get_pending_heal_count $V0 ++ ++# Launch heal ++TEST $CLI volume heal $V0 enable ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2 ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++ ++# All the files must be present on all the bricks after conservative merge and ++# should have the gfid xattr and the .glusterfs hardlink. ++check_gfid_and_link_count dir/file1 ++check_gfid_and_link_count dir/file2 ++check_gfid_and_link_count dir/file3 ++check_gfid_and_link_count dir/file4 ++ ++cleanup +diff --git a/tests/basic/afr/replace-brick-self-heal-non-granular.t b/tests/basic/afr/replace-brick-self-heal-non-granular.t +new file mode 100644 +index 0000000..c86bff1 +--- /dev/null ++++ b/tests/basic/afr/replace-brick-self-heal-non-granular.t +@@ -0,0 +1,65 @@ ++#!/bin/bash ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} ++TEST $CLI volume set $V0 cluster.granular-entry-heal off ++TEST $CLI volume start $V0 ++TEST $CLI volume set $V0 cluster.data-self-heal off ++TEST $CLI volume set $V0 cluster.metadata-self-heal off ++TEST $CLI volume set $V0 cluster.entry-self-heal off ++TEST $CLI volume set $V0 cluster.heal-timeout 5 ++TEST $CLI volume set $V0 self-heal-daemon off ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++ ++# Create files ++for i in {1..5} ++do ++ echo $i > $M0/file$i.txt ++done ++ ++# Metadata changes ++TEST setfattr -n user.test -v qwerty $M0/file5.txt ++ ++# Replace brick1 ++TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}1_new commit force ++ ++# Replaced-brick should accuse the non-replaced-brick (Simulating case for data-loss) ++TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/${V0}1_new/ ++ ++# Check if pending xattr and dirty-xattr are set for replaced-brick ++EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0 ++EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.dirty $B0/${V0}1_new ++ ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++ ++TEST $CLI volume set $V0 self-heal-daemon on ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++TEST $CLI volume heal $V0 ++ ++# Wait for heal to complete ++EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0 ++ ++# Check if entry-heal has happened ++TEST diff <(ls $B0/${V0}0 | sort) <(ls $B0/${V0}1_new | sort) ++ ++# To make sure that files were not lost from brick0 ++TEST diff <(ls $B0/${V0}0 | sort) <(ls $B0/${V0}1 | sort) ++EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0 ++ ++# Test if data was healed ++TEST diff $B0/${V0}0/file1.txt $B0/${V0}1_new/file1.txt ++# To make sure that data was not lost from brick0 ++TEST diff $B0/${V0}0/file1.txt 
$B0/${V0}1/file1.txt ++ ++# Test if metadata was healed and exists on both the bricks ++EXPECT "qwerty" get_text_xattr user.test $B0/${V0}1_new/file5.txt ++EXPECT "qwerty" get_text_xattr user.test $B0/${V0}0/file5.txt ++ ++cleanup; +diff --git a/tests/basic/afr/replace-brick-self-heal.t b/tests/basic/afr/replace-brick-self-heal.t +index 0360db7..da31c87 100644 +--- a/tests/basic/afr/replace-brick-self-heal.t ++++ b/tests/basic/afr/replace-brick-self-heal.t +@@ -30,7 +30,7 @@ TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}1_new commit forc + TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/${V0}1_new/ + + # Check if pending xattr and dirty-xattr are set for replaced-brick +-EXPECT "000000000000000100000001" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0 ++EXPECT "000000010000000100000001" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0 + EXPECT "000000000000000000000001" get_hex_xattr trusted.afr.dirty $B0/${V0}1_new + + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +diff --git a/tests/bugs/replicate/bug-1130892.t b/tests/bugs/replicate/bug-1130892.t +index 0f57d66..e23eb26 100644 +--- a/tests/bugs/replicate/bug-1130892.t ++++ b/tests/bugs/replicate/bug-1130892.t +@@ -56,7 +56,7 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1 + TEST stat $M0/one + + # Check pending xattrs +-EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}-0/one trusted.afr.$V0-client-1 data ++EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}-0/one trusted.afr.$V0-client-1 data + EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}-0/one trusted.afr.$V0-client-1 entry + EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}-0/one trusted.afr.$V0-client-1 metadata + +diff --git a/tests/bugs/replicate/bug-1493415-gfid-heal.t b/tests/bugs/replicate/bug-1493415-gfid-heal.t +index 125c35a..9714d5e 100644 +--- a/tests/bugs/replicate/bug-1493415-gfid-heal.t ++++ b/tests/bugs/replicate/bug-1493415-gfid-heal.t +@@ -49,7 +49,7 @@ TEST setfattr -x trusted.gfid $B0/${V0}1/dir/f2 + TEST rm $B0/${V0}1/.glusterfs/${gfid_str_f2:0:2}/${gfid_str_f2:2:2}/$gfid_str_f2 + + #Now simulate setting of pending entry xattr on parent dir of 1st brick. +-TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 $B0/${V0}0/dir ++TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000001 $B0/${V0}0/dir + create_brick_xattrop_entry $B0/${V0}0 dir + + #Trigger entry-heal via shd +diff --git a/tests/bugs/replicate/bug-1722507-type-mismatch-error-handling.t b/tests/bugs/replicate/bug-1722507-type-mismatch-error-handling.t +index 0aeaaaf..1fdf7ea 100644 +--- a/tests/bugs/replicate/bug-1722507-type-mismatch-error-handling.t ++++ b/tests/bugs/replicate/bug-1722507-type-mismatch-error-handling.t +@@ -23,19 +23,21 @@ TEST mkdir $M0/dir + ########################################################################################## + # GFID link file and the GFID is missing on one brick and all the bricks are being blamed. + +-TEST touch $M0/dir/file +-#TEST kill_brick $V0 $H0 $B0/$V0"1" ++TEST `echo append>> $M0/dir/file` + + #B0 and B2 must blame B1 +-setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/$V0"2"/dir +-setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 $B0/$V0"0"/dir +-setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000000000001 $B0/$V0"0"/dir ++# Set data part of the xattr also to 1 so that local->need_full_crawl is true. 
++# Another way is to create the needed entries inside indices/entry-changes ++# folder. ++setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000001 $B0/$V0"2"/dir ++setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000001 $B0/$V0"0"/dir ++setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000001 $B0/$V0"0"/dir + + # Add entry to xattrop dir to trigger index heal. + xattrop_dir0=$(afr_get_index_path $B0/$V0"0") + base_entry_b0=`ls $xattrop_dir0` + gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/)) +-ln -s $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str ++ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str + EXPECT "^1$" get_pending_heal_count $V0 + + # Remove the gfid xattr and the link file on one brick. +@@ -70,18 +72,20 @@ rm -f $M0/dir/file + + TEST $CLI volume heal $V0 disable + TEST touch $M0/dir/file +-#TEST kill_brick $V0 $H0 $B0/$V0"1" + + #B0 and B2 must blame B1 +-setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/$V0"2"/dir +-setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 $B0/$V0"0"/dir +-setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000000000001 $B0/$V0"0"/dir ++# Set data part of the xattr also to 1 so that local->need_full_crawl is true. ++# Another way is to create the needed entries inside indices/entry-changes ++# folder. ++setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000001 $B0/$V0"2"/dir ++setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000001 $B0/$V0"0"/dir ++setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000001 $B0/$V0"0"/dir + + # Add entry to xattrop dir to trigger index heal. + xattrop_dir0=$(afr_get_index_path $B0/$V0"0") + base_entry_b0=`ls $xattrop_dir0` + gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/)) +-ln -s $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str ++ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str + EXPECT "^1$" get_pending_heal_count $V0 + + # Remove the gfid xattr and the link file on two bricks. +diff --git a/tests/bugs/replicate/bug-1749322-entry-heal-not-happening.t b/tests/bugs/replicate/bug-1749322-entry-heal-not-happening.t +index 9627908..3da873a 100644 +--- a/tests/bugs/replicate/bug-1749322-entry-heal-not-happening.t ++++ b/tests/bugs/replicate/bug-1749322-entry-heal-not-happening.t +@@ -59,8 +59,11 @@ TEST rm $B0/$V0"2"/.glusterfs/${gfid_str_file4:0:2}/${gfid_str_file4:2:2}/$gfid_ + TEST setfattr -x trusted.gfid $B0/$V0"2"/dir/file4 + + # B0 and B2 blame each other +-setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/$V0"2"/dir +-setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000000000001 $B0/$V0"0"/dir ++# Set data part of the xattr also to 1 so that local->need_full_crawl is true. ++# Another way is to create the needed entries inside indices/entry-changes ++# folder. ++setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000001 $B0/$V0"2"/dir ++setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000001 $B0/$V0"0"/dir + + # Add entry to xattrop dir on first brick. 
+ xattrop_dir0=$(afr_get_index_path $B0/$V0"0") +diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c +index 1608f75..36fd3a9 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-common.c ++++ b/xlators/cluster/afr/src/afr-self-heal-common.c +@@ -2549,6 +2549,11 @@ afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid) + } + } + ++ gf_msg_debug( ++ this->name, 0, ++ "heals needed for %s: [entry-heal=%d, metadata-heal=%d, data-heal=%d]", ++ uuid_utoa(gfid), entry_selfheal, metadata_selfheal, data_selfheal); ++ + if (data_selfheal && priv->data_self_heal) + data_ret = afr_selfheal_data(frame, this, fd); + +diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c +index cdff4a5..b97c66b 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-data.c ++++ b/xlators/cluster/afr/src/afr-self-heal-data.c +@@ -239,6 +239,9 @@ afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd, + sink_count = AFR_COUNT(healed_sinks, priv->child_count); + data_lock = alloca0(priv->child_count); + ++ gf_msg_debug(this->name, 0, "gfid:%s, offset=%jd, size=%zu", ++ uuid_utoa(fd->inode->gfid), offset, size); ++ + ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size, + data_lock); + { +diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c +index 40be898..00b5b2d 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-entry.c ++++ b/xlators/cluster/afr/src/afr-self-heal-entry.c +@@ -206,8 +206,11 @@ __afr_selfheal_heal_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + replies); + } else { + if (!gf_uuid_compare(replies[i].poststat.ia_gfid, +- replies[source].poststat.ia_gfid)) ++ replies[source].poststat.ia_gfid)) { ++ gf_msg_debug(this->name, 0, "skipping %s, no heal needed.", ++ name); + continue; ++ } + + ret = afr_selfheal_recreate_entry(frame, i, source, sources, + fd->inode, name, inode, replies); +@@ -839,7 +842,7 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, + + out: + loc_wipe(&loc); +- return 0; ++ return ret; + } + + static int +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index a72c494..bd17a82 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -13181,6 +13181,19 @@ glusterd_enable_default_options(glusterd_volinfo_t *volinfo, char *option) + goto out; + } + } ++ ++ if ((conf->op_version >= GD_OP_VERSION_7_1) && ++ (volinfo->status == GLUSTERD_STATUS_NONE)) { ++ ret = dict_set_dynstr_with_alloc(volinfo->dict, ++ "cluster.granular-entry-heal", "on"); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED, ++ "Failed to set option 'cluster.granular-entry-heal' " ++ "on volume %s", ++ volinfo->volname); ++ goto out; ++ } ++ } + out: + return ret; + } +-- +1.8.3.1 + diff --git a/SOURCES/0489-glusterd-fix-bug-in-enabling-granular-entry-heal.patch b/SOURCES/0489-glusterd-fix-bug-in-enabling-granular-entry-heal.patch new file mode 100644 index 0000000..dde2156 --- /dev/null +++ b/SOURCES/0489-glusterd-fix-bug-in-enabling-granular-entry-heal.patch @@ -0,0 +1,141 @@ +From 2d172144810956225eac3599c943416c4a7e25d0 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Tue, 8 Dec 2020 20:30:23 +0530 +Subject: [PATCH 489/511] glusterd: fix bug in enabling granular-entry-heal + +Upstream patch details: 
+/------------------------------------------------------------------------------/ +commit f5e1eb87d4af44be3b317b7f99ab88f89c2f0b1a meant to enable the +volume option only for replica volumes but inadvertently enabled +it for all volume types. Fixing it now. + +Also found a bug in glusterd where disabling the option on plain +distribute was succeeding even though setting it in the fist place +fails. Fixed that too. + +>Fixes: #1483 +>Change-Id: Icb6c169a8eec44cc4fb4dd636405d3b3485e91b4 +>Reported-by: Sheetal Pamecha +>Signed-off-by: Ravishankar N +Upstream Patch: https://github.com/gluster/glusterfs/pull/1752 +/------------------------------------------------------------------------------/ + +BUG: 1890506 +Change-Id: Id63655dac08d2cfda4899d7ee0efe96e72cd6986 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/220556 +Tested-by: Sunil Kumar Heggodu Gopala Acharya +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/basic/afr/granular-esh/cli.t | 30 ++++++++++++++++++++----- + xlators/mgmt/glusterd/src/glusterd-utils.c | 3 ++- + xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 12 +++++----- + 3 files changed, 34 insertions(+), 11 deletions(-) + +diff --git a/tests/basic/afr/granular-esh/cli.t b/tests/basic/afr/granular-esh/cli.t +index 995d93e..5ab2e39 100644 +--- a/tests/basic/afr/granular-esh/cli.t ++++ b/tests/basic/afr/granular-esh/cli.t +@@ -11,25 +11,38 @@ TESTS_EXPECTED_IN_LOOP=4 + TEST glusterd + TEST pidof glusterd + +-TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +-# Test that enabling the option should work on a newly created volume +-TEST $CLI volume set $V0 cluster.granular-entry-heal on +-TEST $CLI volume set $V0 cluster.granular-entry-heal off +- + ######################### + ##### DISPERSE TEST ##### + ######################### + # Execute the same command on a disperse volume and make sure it fails. + TEST $CLI volume create $V1 disperse 3 redundancy 1 $H0:$B0/${V1}{0,1,2} ++EXPECT "no" volume_get_field $V1 cluster.granular-entry-heal ++TEST $CLI volume start $V1 ++TEST ! $CLI volume heal $V1 granular-entry-heal enable ++TEST ! $CLI volume heal $V1 granular-entry-heal disable ++ ++TEST $CLI volume stop $V1 ++TEST $CLI volume delete $V1 ++ ++######################### ++##### PLAIN DISTRIBUTE TEST ##### ++######################### ++# Execute the same command on a distribute volume and make sure it fails. ++TEST $CLI volume create $V1 $H0:$B0/${V1}{0,1,2} ++EXPECT "no" volume_get_field $V1 cluster.granular-entry-heal + TEST $CLI volume start $V1 + TEST ! $CLI volume heal $V1 granular-entry-heal enable + TEST ! $CLI volume heal $V1 granular-entry-heal disable ++TEST $CLI volume stop $V1 ++TEST $CLI volume delete $V1 + + ####################### + ###### TIER TEST ###### + ####################### + # Execute the same command on a disperse + replicate tiered volume and make + # sure the option is set on the replicate leg of the volume ++TEST $CLI volume create $V1 disperse 3 redundancy 1 $H0:$B0/${V1}{0,1,2} ++TEST $CLI volume start $V1 + TEST $CLI volume tier $V1 attach replica 2 $H0:$B0/${V1}{3,4} + TEST $CLI volume heal $V1 granular-entry-heal enable + EXPECT "enable" volume_get_field $V1 cluster.granular-entry-heal +@@ -52,10 +65,17 @@ TEST kill_brick $V1 $H0 $B0/${V1}3 + # failed. + TEST ! 
$CLI volume heal $V1 granular-entry-heal enable + EXPECT "disable" volume_get_field $V1 cluster.granular-entry-heal ++TEST $CLI volume stop $V1 ++TEST $CLI volume delete $V1 + + ###################### + ### REPLICATE TEST ### + ###################### ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} ++EXPECT "on" volume_get_field $V0 cluster.granular-entry-heal ++# Test that enabling the option should work on a newly created volume ++TEST $CLI volume set $V0 cluster.granular-entry-heal on ++TEST $CLI volume set $V0 cluster.granular-entry-heal off + TEST $CLI volume start $V0 + TEST $CLI volume set $V0 cluster.data-self-heal off + TEST $CLI volume set $V0 cluster.metadata-self-heal off +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index bd17a82..ad3750e 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -13183,7 +13183,8 @@ glusterd_enable_default_options(glusterd_volinfo_t *volinfo, char *option) + } + + if ((conf->op_version >= GD_OP_VERSION_7_1) && +- (volinfo->status == GLUSTERD_STATUS_NONE)) { ++ (volinfo->status == GLUSTERD_STATUS_NONE) && ++ (volinfo->type == GF_CLUSTER_TYPE_REPLICATE)) { + ret = dict_set_dynstr_with_alloc(volinfo->dict, + "cluster.granular-entry-heal", "on"); + if (ret) { +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +index 134b04c..09e6ead 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +@@ -621,11 +621,13 @@ glusterd_handle_heal_options_enable_disable(rpcsvc_request_t *req, dict_t *dict, + goto out; + } + +- if (((heal_op == GF_SHD_OP_GRANULAR_ENTRY_HEAL_ENABLE) || +- (heal_op == GF_SHD_OP_GRANULAR_ENTRY_HEAL_DISABLE)) && +- (volinfo->type == GF_CLUSTER_TYPE_DISPERSE)) { +- ret = -1; +- goto out; ++ if ((heal_op == GF_SHD_OP_GRANULAR_ENTRY_HEAL_ENABLE) || ++ (heal_op == GF_SHD_OP_GRANULAR_ENTRY_HEAL_DISABLE)) { ++ if ((volinfo->type != GF_CLUSTER_TYPE_REPLICATE) && ++ (volinfo->type != GF_CLUSTER_TYPE_TIER)) { ++ ret = -1; ++ goto out; ++ } + } + + if ((heal_op == GF_SHD_OP_HEAL_ENABLE) || +-- +1.8.3.1 + diff --git a/SOURCES/0490-Segmentation-fault-occurs-during-truncate.patch b/SOURCES/0490-Segmentation-fault-occurs-during-truncate.patch new file mode 100644 index 0000000..bd3c777 --- /dev/null +++ b/SOURCES/0490-Segmentation-fault-occurs-during-truncate.patch @@ -0,0 +1,57 @@ +From 5a110946b41619577b365cdceddc4da551ff49f0 Mon Sep 17 00:00:00 2001 +From: kinsu +Date: Thu, 19 Sep 2019 08:34:32 +0000 +Subject: [PATCH 490/511] Segmentation fault occurs during truncate + +Problem: +Segmentation fault occurs when bricks are nearly full 100% and in +parallel truncate of a file is attempted (No space left on device). +Prerequicite is that performance xlators are activated +(read-ahead, write-behind etc) +while stack unwind of the frames following an error responce +from brick (No space left on device) frame->local includes a memory +location that is not allocated via mem_get but via calloc. +The destroyed frame is always ra_truncate_cbk winded from ra_ftruncate +and the inode ptr is copied to the frame local in the wb_ftruncate. 
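+
+For context, frame->local here can originate from two different
+allocators, and only one of them produces the pool header that
+mem_put() expects. A minimal sketch, assuming the usual libglusterfs
+helpers (illustrative, not taken from the crashing code path):
+
+    /* pool-managed: mem_get0() prepends a pool header, so mem_put()
+     * can locate hdr->pool_list and return the object to its pool */
+    local = mem_get0(this->local_pool);
+
+    /* plain heap: GF_CALLOC() leaves no pool header behind the
+     * pointer, so passing this to mem_put() makes it read garbage
+     * metadata and crash */
+    local = GF_CALLOC(1, sizeof(*local), gf_common_mt_char);
+
+The guard below detects the second case and returns instead of
+dereferencing a NULL pool_list.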
+ +Fix: +extra check is added for the pool ptr + +>Change-Id: Ic5d3bd0ab7011e40b2811c6dece063b256e4d9d1 +>Fixes: bz#1797882 +>Signed-off-by: kinsu + +Upstream-patch: https://review.gluster.org/c/glusterfs/+/23445 + +BUG: 1842449 +Change-Id: Ic5d3bd0ab7011e40b2811c6dece063b256e4d9d1 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/220540 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/mem-pool.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c +index 73503e0..1390747 100644 +--- a/libglusterfs/src/mem-pool.c ++++ b/libglusterfs/src/mem-pool.c +@@ -857,6 +857,14 @@ mem_put(void *ptr) + /* Not one of ours; don't touch it. */ + return; + } ++ ++ if (!hdr->pool_list) { ++ gf_msg_callingfn("mem-pool", GF_LOG_CRITICAL, EINVAL, ++ LG_MSG_INVALID_ARG, ++ "invalid argument hdr->pool_list NULL"); ++ return; ++ } ++ + pool_list = hdr->pool_list; + pt_pool = &pool_list->pools[hdr->power_of_two - POOL_SMALLEST]; + +-- +1.8.3.1 + diff --git a/SOURCES/0491-glusterd-mount-directory-getting-truncated-on-mounti.patch b/SOURCES/0491-glusterd-mount-directory-getting-truncated-on-mounti.patch new file mode 100644 index 0000000..375cfd2 --- /dev/null +++ b/SOURCES/0491-glusterd-mount-directory-getting-truncated-on-mounti.patch @@ -0,0 +1,56 @@ +From 0fed8ca9c6c9e3a9041951bc748c7936d0abc8cf Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Tue, 15 Sep 2020 16:20:19 +0530 +Subject: [PATCH 491/511] glusterd: mount directory getting truncated on + mounting shared_storage + +Issue: +In case of a user created volume the mount point +is the brick path 'ex: /data/brick' but in case of +shared_storage the mount point is '/'.So, here +we increment the array by one so as to get the exact +path of brick without '/', which works fine for other +volumes as the pointer of the brick_dir variable is +at '/', but for shared_storage it is at 'v'(where v is +starting letter of 'var' directory). So, on incrementing +the path we get in case of shared_storage starts from +'ar/lib/glusterd/...' + +Fix: +Only, increment the pointer if the current position is '/', +else the path will be wrong. 
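+
+A minimal sketch of the two cases (paths are examples only):
+
+    /* user volume:    mnt_pt = "/data", brickpath = "/data/brick"
+     *                 => brick_dir points at "/brick"; skip the '/' */
+    /* shared storage: mnt_pt = "/", brickpath = "/var/lib/glusterd/..."
+     *                 => brick_dir already points at "var/..."; an
+     *                 unconditional increment would leave "ar/lib/..." */
+    brick_dir = &brickpath[strlen(mnt_pt)];
+    if (brick_dir[0] == '/')
+        brick_dir++;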
+ +>Fixes: #1480 + +>Change-Id: Id31bb13f58134ae2099884fbc5984c4e055fb357 +>Signed-off-by: nik-redhat + +Upstream patch: https://review.gluster.org/c/glusterfs/+/24989 + +BUG: 1878077 +Change-Id: Id31bb13f58134ae2099884fbc5984c4e055fb357 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/220536 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-utils.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index ad3750e..b343eee 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -1221,7 +1221,8 @@ glusterd_get_brick_mount_dir(char *brickpath, char *hostname, char *mount_dir) + } + + brick_dir = &brickpath[strlen(mnt_pt)]; +- brick_dir++; ++ if (brick_dir[0] == '/') ++ brick_dir++; + + snprintf(mount_dir, VALID_GLUSTERD_PATHMAX, "/%s", brick_dir); + } +-- +1.8.3.1 + diff --git a/SOURCES/0492-afr-lookup-Pass-xattr_req-in-while-doing-a-selfheal-.patch b/SOURCES/0492-afr-lookup-Pass-xattr_req-in-while-doing-a-selfheal-.patch new file mode 100644 index 0000000..a983baa --- /dev/null +++ b/SOURCES/0492-afr-lookup-Pass-xattr_req-in-while-doing-a-selfheal-.patch @@ -0,0 +1,188 @@ +From bde1ad97f8739f8370a2bbb92229b1b397ecd82c Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Tue, 8 Dec 2020 19:06:03 +0530 +Subject: [PATCH 492/511] afr/lookup: Pass xattr_req in while doing a selfheal + in lookup + +We were not passing xattr_req when doing a name self heal +as well as a meta data heal. Because of this, some xdata +was missing which causes i/o errors + +Upstream patch details: +> Change-Id: Ibfb1205a7eb0195632dc3820116ffbbb8043545f +> Fixes: bz#1728770 +> Signed-off-by: Mohammed Rafi KC +Upstream Patch : https://review.gluster.org/#/c/glusterfs/+/23024/ + +BUG: 1726673 +Change-Id: Ibfb1205a7eb0195632dc3820116ffbbb8043545f +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/220538 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/replicate/bug-1728770-pass-xattrs.t | 52 ++++++++++++++++++++++++++ + tests/include.rc | 1 + + xlators/cluster/afr/src/afr-common.c | 8 +++- + xlators/cluster/afr/src/afr-self-heal-common.c | 9 ++++- + xlators/cluster/afr/src/afr-self-heal.h | 2 +- + 5 files changed, 67 insertions(+), 5 deletions(-) + create mode 100644 tests/bugs/replicate/bug-1728770-pass-xattrs.t + +diff --git a/tests/bugs/replicate/bug-1728770-pass-xattrs.t b/tests/bugs/replicate/bug-1728770-pass-xattrs.t +new file mode 100644 +index 0000000..159c4fc +--- /dev/null ++++ b/tests/bugs/replicate/bug-1728770-pass-xattrs.t +@@ -0,0 +1,52 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../snapshot.rc ++ ++cleanup; ++ ++function fop_on_bad_disk { ++ local path=$1 ++ mkdir $path/dir{1..1000} 2>/dev/null ++ mv $path/dir1 $path/newdir ++ touch $path/foo.txt ++ echo $? ++} ++ ++function ls_fop_on_bad_disk { ++ local path=$1 ++ ls $path ++ echo $? 
++} ++ ++TEST init_n_bricks 6; ++TEST setup_lvm 6; ++ ++TEST glusterd; ++TEST pidof glusterd; ++ ++TEST $CLI volume create $V0 replica 3 $H0:$L1 $H0:$L2 $H0:$L3 $H0:$L4 $H0:$L5 $H0:$L6; ++TEST $CLI volume set $V0 health-check-interval 1000; ++ ++TEST $CLI volume start $V0; ++ ++TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0; ++#corrupt last disk ++dd if=/dev/urandom of=/dev/mapper/patchy_snap_vg_6-brick_lvm bs=512K count=200 status=progress && sync ++ ++ ++# Test the disk is now returning EIO for touch and ls ++EXPECT_WITHIN $DISK_FAIL_TIMEOUT "^1$" fop_on_bad_disk "$L6" ++EXPECT_WITHIN $DISK_FAIL_TIMEOUT "^2$" ls_fop_on_bad_disk "$L6" ++ ++TEST touch $M0/foo{1..100} ++TEST $CLI volume remove-brick $V0 replica 3 $H0:$L4 $H0:$L5 $H0:$L6 start ++EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" remove_brick_status_completed_field "$V0" "$H0:$L4 $H0:$L5 $H0:$L6"; ++ ++#check that remove-brick status should not have any failed or skipped files ++var=`$CLI volume remove-brick $V0 $H0:$L4 $H0:$L5 $H0:$L6 status | grep completed` ++TEST [ `echo $var | awk '{print $5}'` = "0" ] ++TEST [ `echo $var | awk '{print $6}'` = "0" ] ++ ++cleanup; +diff --git a/tests/include.rc b/tests/include.rc +index 762c5e2..c925941 100644 +--- a/tests/include.rc ++++ b/tests/include.rc +@@ -89,6 +89,7 @@ GRAPH_SWITCH_TIMEOUT=10 + UNLINK_TIMEOUT=5 + MDC_TIMEOUT=5 + IO_WAIT_TIMEOUT=5 ++DISK_FAIL_TIMEOUT=80 + + LOGDIR=$(gluster --print-logdir) + +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 851ccad..fca2cd5 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -2609,6 +2609,10 @@ afr_lookup_sh_metadata_wrap(void *opaque) + dict = dict_new(); + if (!dict) + goto out; ++ if (local->xattr_req) { ++ dict_copy(local->xattr_req, dict); ++ } ++ + ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); +@@ -2617,7 +2621,7 @@ afr_lookup_sh_metadata_wrap(void *opaque) + if (loc_is_nameless(&local->loc)) { + ret = afr_selfheal_unlocked_discover_on(frame, local->inode, + local->loc.gfid, local->replies, +- local->child_up); ++ local->child_up, dict); + } else { + inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent, + local->loc.name, local->replies, +@@ -2791,7 +2795,7 @@ afr_lookup_selfheal_wrap(void *opaque) + + inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent, + local->loc.name, local->replies, +- local->child_up, NULL); ++ local->child_up, local->xattr_req); + if (inode) + inode_unref(inode); + +diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c +index 36fd3a9..9b6575f 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-common.c ++++ b/xlators/cluster/afr/src/afr-self-heal-common.c +@@ -1861,7 +1861,7 @@ afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict) + int + afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, +- unsigned char *discover_on) ++ unsigned char *discover_on, dict_t *dict) + { + loc_t loc = { + 0, +@@ -1876,6 +1876,8 @@ afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, + xattr_req = dict_new(); + if (!xattr_req) + return -ENOMEM; ++ if (dict) ++ dict_copy(dict, xattr_req); + + if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) { + dict_unref(xattr_req); +@@ -1906,11 +1908,14 @@ afr_selfheal_unlocked_discover(call_frame_t *frame, 
inode_t *inode, uuid_t gfid, + struct afr_reply *replies) + { + afr_local_t *local = NULL; ++ dict_t *dict = NULL; + + local = frame->local; ++ if (local && local->xattr_req) ++ dict = local->xattr_req; + + return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies, +- local->child_up); ++ local->child_up, dict); + } + + unsigned int +diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h +index b39af02..8f6fb00 100644 +--- a/xlators/cluster/afr/src/afr-self-heal.h ++++ b/xlators/cluster/afr/src/afr-self-heal.h +@@ -188,7 +188,7 @@ afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid, + int + afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, +- unsigned char *discover_on); ++ unsigned char *discover_on, dict_t *dict); + inode_t * + afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, +-- +1.8.3.1 + diff --git a/SOURCES/0493-geo-rep-Note-section-is-required-for-ignore_deletes.patch b/SOURCES/0493-geo-rep-Note-section-is-required-for-ignore_deletes.patch new file mode 100644 index 0000000..e712886 --- /dev/null +++ b/SOURCES/0493-geo-rep-Note-section-is-required-for-ignore_deletes.patch @@ -0,0 +1,283 @@ +From 03de45e5fb1c8aa5369848ed9e52abd1365e1d21 Mon Sep 17 00:00:00 2001 +From: Shwetha K Acharya +Date: Wed, 31 Jul 2019 11:34:19 +0530 +Subject: [PATCH 493/511] geo-rep: Note section is required for ignore_deletes + +There exists a window of 15 sec, where the deletes are picked up +by history crawl when the ignore_deletes is set to true. +And it eventually deletes the file/s from slave which is/are not +supposed to be deleted. Though it is working as per design, a +note regarding this is needed. + +Added a warning message indicating the same. +Also logged info when the worker restarts after ignore-deletes +option set. 
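+
+The CLI change below follows the usual confirm-or-abort pattern; in
+outline (question text abbreviated):
+
+    answer = cli_cmd_get_confirmation(state, question);
+    if (GF_ANSWER_NO == answer) {
+        *errstr = gf_strdup("Aborted by user.");
+        ret = -1;   /* the config option is left unchanged */
+        goto out;
+    }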
+ +>fixes: bz#1708603 +>Change-Id: I103be882fac18b4cef935efa355f5037a396f7c1 +>Signed-off-by: Shwetha K Acharya +Upstream patch: https://review.gluster.org/c/glusterfs/+/22702 + +BUG: 1224906 +Change-Id: I103be882fac18b4cef935efa355f5037a396f7c1 +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/220757 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + cli/src/cli-cmd-parser.c | 45 ++++++++++++++++++++------ + cli/src/cli-cmd-volume.c | 20 ++++++++---- + cli/src/cli.h | 3 +- + geo-replication/syncdaemon/gsyncd.py | 2 +- + geo-replication/syncdaemon/master.py | 6 ++++ + tests/00-geo-rep/bug-1708603.t | 63 ++++++++++++++++++++++++++++++++++++ + 6 files changed, 120 insertions(+), 19 deletions(-) + create mode 100644 tests/00-geo-rep/bug-1708603.t + +diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c +index 5fd05f4..34f17c9 100644 +--- a/cli/src/cli-cmd-parser.c ++++ b/cli/src/cli-cmd-parser.c +@@ -2901,7 +2901,8 @@ out: + } + + int32_t +-cli_cmd_gsync_set_parse(const char **words, int wordcount, dict_t **options) ++cli_cmd_gsync_set_parse(struct cli_state *state, const char **words, ++ int wordcount, dict_t **options, char **errstr) + { + int32_t ret = -1; + dict_t *dict = NULL; +@@ -2918,6 +2919,8 @@ cli_cmd_gsync_set_parse(const char **words, int wordcount, dict_t **options) + char *save_ptr = NULL; + char *slave_temp = NULL; + char *token = NULL; ++ gf_answer_t answer = GF_ANSWER_NO; ++ const char *question = NULL; + + GF_ASSERT(words); + GF_ASSERT(options); +@@ -2990,8 +2993,10 @@ cli_cmd_gsync_set_parse(const char **words, int wordcount, dict_t **options) + + if (masteri && gsyncd_url_check(words[masteri])) + goto out; +- if (slavei && !glob && !gsyncd_url_check(words[slavei])) ++ if (slavei && !glob && !gsyncd_url_check(words[slavei])) { ++ gf_asprintf(errstr, "Invalid slave url: %s", words[slavei]); + goto out; ++ } + + w = str_getunamb(words[cmdi], opwords); + if (!w) +@@ -3101,16 +3106,36 @@ cli_cmd_gsync_set_parse(const char **words, int wordcount, dict_t **options) + } + if (!ret) + ret = dict_set_int32(dict, "type", type); +- if (!ret && type == GF_GSYNC_OPTION_TYPE_CONFIG) ++ if (!ret && type == GF_GSYNC_OPTION_TYPE_CONFIG) { ++ if (!strcmp((char *)words[wordcount - 2], "ignore-deletes") && ++ !strcmp((char *)words[wordcount - 1], "true")) { ++ question = ++ "There exists ~15 seconds delay for the option to take" ++ " effect from stime of the corresponding brick. Please" ++ " check the log for the time, the option is effective." 
++ " Proceed"; ++ ++ answer = cli_cmd_get_confirmation(state, question); ++ ++ if (GF_ANSWER_NO == answer) { ++ gf_log("cli", GF_LOG_INFO, ++ "Operation " ++ "cancelled, exiting"); ++ *errstr = gf_strdup("Aborted by user."); ++ ret = -1; ++ goto out; ++ } ++ } ++ + ret = config_parse(words, wordcount, dict, cmdi, glob); ++ } + + out: + if (slave_temp) + GF_FREE(slave_temp); +- if (ret) { +- if (dict) +- dict_unref(dict); +- } else ++ if (ret && dict) ++ dict_unref(dict); ++ else + *options = dict; + + return ret; +@@ -5659,9 +5684,9 @@ cli_cmd_bitrot_parse(const char **words, int wordcount, dict_t **options) + int32_t ret = -1; + char *w = NULL; + char *volname = NULL; +- char *opwords[] = { +- "enable", "disable", "scrub-throttle", "scrub-frequency", "scrub", +- "signing-time", "signer-threads", NULL}; ++ char *opwords[] = {"enable", "disable", "scrub-throttle", ++ "scrub-frequency", "scrub", "signing-time", ++ "signer-threads", NULL}; + char *scrub_throt_values[] = {"lazy", "normal", "aggressive", NULL}; + char *scrub_freq_values[] = {"hourly", "daily", "weekly", "biweekly", + "monthly", "minute", NULL}; +diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c +index 72504ca..6f5bf8b 100644 +--- a/cli/src/cli-cmd-volume.c ++++ b/cli/src/cli-cmd-volume.c +@@ -2457,6 +2457,7 @@ cli_cmd_volume_gsync_set_cbk(struct cli_state *state, struct cli_cmd_word *word, + rpc_clnt_procedure_t *proc = NULL; + call_frame_t *frame = NULL; + cli_local_t *local = NULL; ++ char *errstr = NULL; + #if (USE_EVENTS) + int ret1 = -1; + int cmd_type = -1; +@@ -2468,16 +2469,21 @@ cli_cmd_volume_gsync_set_cbk(struct cli_state *state, struct cli_cmd_word *word, + + proc = &cli_rpc_prog->proctable[GLUSTER_CLI_GSYNC_SET]; + +- frame = create_frame(THIS, THIS->ctx->pool); +- if (frame == NULL) { +- ret = -1; ++ ret = cli_cmd_gsync_set_parse(state, words, wordcount, &options, &errstr); ++ if (ret) { ++ if (errstr) { ++ cli_err("%s", errstr); ++ GF_FREE(errstr); ++ } else { ++ cli_usage_out(word->pattern); ++ } ++ parse_err = 1; + goto out; + } + +- ret = cli_cmd_gsync_set_parse(words, wordcount, &options); +- if (ret) { +- cli_usage_out(word->pattern); +- parse_err = 1; ++ frame = create_frame(THIS, THIS->ctx->pool); ++ if (frame == NULL) { ++ ret = -1; + goto out; + } + +diff --git a/cli/src/cli.h b/cli/src/cli.h +index c30ae9c..7b4f446 100644 +--- a/cli/src/cli.h ++++ b/cli/src/cli.h +@@ -269,7 +269,8 @@ int32_t + cli_cmd_volume_reset_parse(const char **words, int wordcount, dict_t **opt); + + int32_t +-cli_cmd_gsync_set_parse(const char **words, int wordcount, dict_t **opt); ++cli_cmd_gsync_set_parse(struct cli_state *state, const char **words, ++ int wordcount, dict_t **opt, char **errstr); + + int32_t + cli_cmd_quota_parse(const char **words, int wordcount, dict_t **opt); +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index 8940384..215c62d 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -315,7 +315,7 @@ def main(): + + # Log message for loaded config file + if config_file is not None: +- logging.info(lf("Using session config file", path=config_file)) ++ logging.debug(lf("Using session config file", path=config_file)) + + set_term_handler() + excont = FreeObject(exval=0) +diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py +index 08e98f8..98637e7 100644 +--- a/geo-replication/syncdaemon/master.py ++++ b/geo-replication/syncdaemon/master.py +@@ -1549,6 +1549,12 @@ class 
GMasterChangeloghistoryMixin(GMasterChangelogMixin): + data_stime = self.get_data_stime() + + end_time = int(time.time()) ++ ++ #as start of historical crawl marks Geo-rep worker restart ++ if gconf.get("ignore-deletes"): ++ logging.info(lf('ignore-deletes config option is set', ++ stime=data_stime)) ++ + logging.info(lf('starting history crawl', + turns=self.history_turns, + stime=data_stime, +diff --git a/tests/00-geo-rep/bug-1708603.t b/tests/00-geo-rep/bug-1708603.t +new file mode 100644 +index 0000000..26913f1 +--- /dev/null ++++ b/tests/00-geo-rep/bug-1708603.t +@@ -0,0 +1,63 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../include.rc ++. $(dirname $0)/../volume.rc ++. $(dirname $0)/../geo-rep.rc ++. $(dirname $0)/../env.rc ++ ++SCRIPT_TIMEOUT=300 ++ ++##Cleanup and start glusterd ++cleanup; ++TEST glusterd; ++TEST pidof glusterd ++ ++ ++##Variables ++GEOREP_CLI="gluster volume geo-replication" ++master=$GMV0 ++SH0="127.0.0.1" ++slave=${SH0}::${GSV0} ++num_active=2 ++num_passive=2 ++master_mnt=$M0 ++slave_mnt=$M1 ++ ++############################################################ ++#SETUP VOLUMES AND GEO-REPLICATION ++############################################################ ++ ++##create_and_start_master_volume ++TEST $CLI volume create $GMV0 replica 2 $H0:$B0/${GMV0}{1,2,3,4}; ++TEST $CLI volume start $GMV0 ++ ++##create_and_start_slave_volume ++TEST $CLI volume create $GSV0 replica 2 $H0:$B0/${GSV0}{1,2,3,4}; ++TEST $CLI volume start $GSV0 ++ ++##Mount master ++TEST glusterfs -s $H0 --volfile-id $GMV0 $M0 ++ ++##Mount slave ++TEST glusterfs -s $H0 --volfile-id $GSV0 $M1 ++ ++#Create geo-rep session ++TEST create_georep_session $master $slave ++ ++echo n | $GEOREP_CLI $master $slave config ignore-deletes true >/dev/null 2>&1 ++EXPECT "false" echo $($GEOREP_CLI $master $slave config ignore-deletes) ++echo y | $GEOREP_CLI $master $slave config ignore-deletes true ++EXPECT "true" echo $($GEOREP_CLI $master $slave config ignore-deletes) ++ ++#Stop Geo-rep ++TEST $GEOREP_CLI $master $slave stop ++ ++#Delete Geo-rep ++TEST $GEOREP_CLI $master $slave delete ++ ++#Cleanup authorized keys ++sed -i '/^command=.*SSH_ORIGINAL_COMMAND#.*/d' ~/.ssh/authorized_keys ++sed -i '/^command=.*gsyncd.*/d' ~/.ssh/authorized_keys ++ ++cleanup; ++#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000 +-- +1.8.3.1 + diff --git a/SOURCES/0494-glusterd-start-the-brick-on-a-different-port.patch b/SOURCES/0494-glusterd-start-the-brick-on-a-different-port.patch new file mode 100644 index 0000000..d11b138 --- /dev/null +++ b/SOURCES/0494-glusterd-start-the-brick-on-a-different-port.patch @@ -0,0 +1,54 @@ +From 1b24bc4319203128a9ff7f97fe14f4b3622c4eec Mon Sep 17 00:00:00 2001 +From: Sanju Rakonde +Date: Wed, 26 Aug 2020 20:05:35 +0530 +Subject: [PATCH 494/511] glusterd: start the brick on a different port + +Problem: brick fails to start when the port provided by +glusterd is in use by any other process + +Solution: glusterd should check errno set by runner_run() +and if it is set to EADDRINUSE, it should allocate a new +port to the brick and try to start it again. + +Previously ret value is checked instead of errno, so the +retry part never executed. Now, we initialize errno to 0 +before calling runner framework. and afterwards store the +errno into ret to avoid modification of errno in subsequent +function calls. 
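+
+In outline, the pattern is to trust errno rather than the return value
+(this mirrors the hunk below):
+
+    errno = 0;                 /* clear any stale value first */
+    ret = runner_run(&runner);
+    if (errno != 0)
+        ret = errno;           /* capture before later calls clobber it */
+
+    if (ret == EADDRINUSE) {
+        /* allocate a fresh port and retry starting the brick */
+    }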
+ +>fixes: #1101 + +>Change-Id: I1aa048a77c5f8b035dece36976d60602d9753b1a +>Signed-off-by: Sanju Rakonde +>Signed-off-by: nik-redhat + +Upstream patch: https://review.gluster.org/c/glusterfs/+/24923/ + +BUG: 1865796 +Change-Id: I1aa048a77c5f8b035dece36976d60602d9753b1a +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/220541 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-utils.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index b343eee..f7030fb 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -2289,7 +2289,10 @@ retry: + + if (wait) { + synclock_unlock(&priv->big_lock); ++ errno = 0; + ret = runner_run(&runner); ++ if (errno != 0) ++ ret = errno; + synclock_lock(&priv->big_lock); + + if (ret == EADDRINUSE) { +-- +1.8.3.1 + diff --git a/SOURCES/0495-geo-rep-descriptive-message-when-worker-crashes-due-.patch b/SOURCES/0495-geo-rep-descriptive-message-when-worker-crashes-due-.patch new file mode 100644 index 0000000..6b3f6f5 --- /dev/null +++ b/SOURCES/0495-geo-rep-descriptive-message-when-worker-crashes-due-.patch @@ -0,0 +1,60 @@ +From 17a2a880290d2038c913c23985df620e3c9741b3 Mon Sep 17 00:00:00 2001 +From: Sunny Kumar +Date: Mon, 16 Mar 2020 15:17:23 +0000 +Subject: [PATCH 495/511] geo-rep: descriptive message when worker crashes due + to EIO + +With this patch now you can notice log if it is due to EIO: + +[2020-03-16 16:24:48.293837] E [syncdutils(worker /bricks/brick1/mbr3):348:log_raise_exception] : Getting "Input/Output error" is most likely due to a. Brick is down or b. Split brain issue. +[2020-03-16 16:24:48.293915] E [syncdutils(worker /bricks/brick1/mbr3):352:log_raise_exception] : This is expected as per design to keep the consistency of the file system. Once the above issue is resolved geo-rep would automatically proceed further. + +>Change-Id: Ie33f2440bc96089731ce12afa8dab91d9550a7ca +>Fixes: #1104 +>Signed-off-by: Sunny Kumar +>Upstream Patch : https://review.gluster.org/c/glusterfs/+/24228/ + +BUG: 1412494 +Change-Id: Ie33f2440bc96089731ce12afa8dab91d9550a7ca +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/220874 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + geo-replication/syncdaemon/syncdutils.py | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index f43e13b..d5a94d4 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -22,7 +22,7 @@ import socket + from subprocess import PIPE + from threading import Lock, Thread as baseThread + from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ENOMEM, ECONNABORTED +-from errno import EINTR, ENOENT, ESTALE, EBUSY, ENODATA, errorcode ++from errno import EINTR, ENOENT, ESTALE, EBUSY, ENODATA, errorcode, EIO + from signal import signal, SIGTERM + import select as oselect + from os import waitpid as owaitpid +@@ -346,6 +346,17 @@ def log_raise_exception(excont): + ECONNABORTED): + logging.error(lf('Gluster Mount process exited', + error=errorcode[exc.errno])) ++ elif isinstance(exc, OSError) and exc.errno == EIO: ++ logging.error("Getting \"Input/Output error\" " ++ "is most likely due to " ++ "a. Brick is down or " ++ "b. 
Split brain issue.") ++ logging.error("This is expected as per design to " ++ "keep the consistency of the file system. " ++ "Once the above issue is resolved " ++ "geo-replication would automatically " ++ "proceed further.") ++ logtag = "FAIL" + else: + logtag = "FAIL" + if not logtag and logging.getLogger().isEnabledFor(logging.DEBUG): +-- +1.8.3.1 + diff --git a/SOURCES/0496-posix-Use-MALLOC-instead-of-alloca-to-allocate-memor.patch b/SOURCES/0496-posix-Use-MALLOC-instead-of-alloca-to-allocate-memor.patch new file mode 100644 index 0000000..590aea3 --- /dev/null +++ b/SOURCES/0496-posix-Use-MALLOC-instead-of-alloca-to-allocate-memor.patch @@ -0,0 +1,139 @@ +From 5893e64ca8c147b7acfa12cd9824f254d53ee261 Mon Sep 17 00:00:00 2001 +From: mohit84 +Date: Wed, 4 Nov 2020 09:02:03 +0530 +Subject: [PATCH 496/511] posix: Use MALLOC instead of alloca to allocate + memory for xattrs list (#1730) + +In case of file is having huge xattrs on backend a brick process is +crashed while alloca(size) limit has been crossed 256k because iot_worker +stack size is 256k. + +> Fixes: #1699 +> Signed-off-by: Mohit Agrawal +> Change-Id: I100468234f83329a7d65b43cbe4e10450c1ccecd +> (Cherry pick from commit fd666caa35ac84dd1cba55399761982011b77112) +> (Reviewed on upstream link https://github.com/gluster/glusterfs/pull/1828) + +Change-Id: I100468234f83329a7d65b43cbe4e10450c1ccecd +Bug: 1903468 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/220872 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/storage/posix/src/posix-gfid-path.c | 5 ++++- + xlators/storage/posix/src/posix-helpers.c | 3 ++- + xlators/storage/posix/src/posix-inode-fd-ops.c | 12 +++++++++--- + 3 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/xlators/storage/posix/src/posix-gfid-path.c b/xlators/storage/posix/src/posix-gfid-path.c +index 64b5c6c..01315ac 100644 +--- a/xlators/storage/posix/src/posix-gfid-path.c ++++ b/xlators/storage/posix/src/posix-gfid-path.c +@@ -195,7 +195,8 @@ posix_get_gfid2path(xlator_t *this, inode_t *inode, const char *real_path, + if (size == 0) + goto done; + } +- list = alloca(size); ++ ++ list = GF_MALLOC(size, gf_posix_mt_char); + if (!list) { + *op_errno = errno; + goto err; +@@ -309,6 +310,7 @@ done: + GF_FREE(paths[j]); + } + ret = 0; ++ GF_FREE(list); + return ret; + err: + if (path) +@@ -317,5 +319,6 @@ err: + if (paths[j]) + GF_FREE(paths[j]); + } ++ GF_FREE(list); + return ret; + } +diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c +index 73a44be..ceac52a 100644 +--- a/xlators/storage/posix/src/posix-helpers.c ++++ b/xlators/storage/posix/src/posix-helpers.c +@@ -349,7 +349,7 @@ _posix_get_marker_all_contributions(posix_xattr_filler_t *filler) + goto out; + } + +- list = alloca(size); ++ list = GF_MALLOC(size, gf_posix_mt_char); + if (!list) { + goto out; + } +@@ -379,6 +379,7 @@ _posix_get_marker_all_contributions(posix_xattr_filler_t *filler) + ret = 0; + + out: ++ GF_FREE(list); + return ret; + } + +diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c +index 21119ea..1d37aed 100644 +--- a/xlators/storage/posix/src/posix-inode-fd-ops.c ++++ b/xlators/storage/posix/src/posix-inode-fd-ops.c +@@ -3305,7 +3305,7 @@ posix_get_ancestry_non_directory(xlator_t *this, inode_t *leaf_inode, + goto out; + } + +- list = alloca(size); ++ list = GF_MALLOC(size, gf_posix_mt_char); + if (!list) { + *op_errno = errno; + goto out; 
+@@ -3385,6 +3385,7 @@ posix_get_ancestry_non_directory(xlator_t *this, inode_t *leaf_inode, + op_ret = 0; + + out: ++ GF_FREE(list); + return op_ret; + } + +@@ -3810,7 +3811,8 @@ posix_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + if (size == 0) + goto done; + } +- list = alloca(size); ++ ++ list = GF_MALLOC(size, gf_posix_mt_char); + if (!list) { + op_errno = errno; + goto out; +@@ -3937,6 +3939,7 @@ out: + dict_unref(dict); + } + ++ GF_FREE(list); + return 0; + } + +@@ -4136,7 +4139,8 @@ posix_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + if (size == 0) + goto done; + } +- list = alloca(size + 1); ++ ++ list = GF_MALLOC(size, gf_posix_mt_char); + if (!list) { + op_ret = -1; + op_errno = ENOMEM; +@@ -4240,6 +4244,8 @@ out: + if (dict) + dict_unref(dict); + ++ GF_FREE(list); ++ + return 0; + } + +-- +1.8.3.1 + diff --git a/SOURCES/0497-socket-Use-AES128-cipher-in-SSL-if-AES-is-supported-.patch b/SOURCES/0497-socket-Use-AES128-cipher-in-SSL-if-AES-is-supported-.patch new file mode 100644 index 0000000..9d477ae --- /dev/null +++ b/SOURCES/0497-socket-Use-AES128-cipher-in-SSL-if-AES-is-supported-.patch @@ -0,0 +1,80 @@ +From 85a5cce40dba0393e636c0eb5af9d8f8746f2315 Mon Sep 17 00:00:00 2001 +From: Mohit Agrawal +Date: Thu, 2 Jan 2020 10:23:52 +0530 +Subject: [PATCH 497/511] socket: Use AES128 cipher in SSL if AES is supported + by CPU + +SSL performance is improved after configuring AES128 cipher +so use AES128 cipher as a default cipher on the CPU those +enabled AES bits otherwise ssl use AES256 cipher + +> Change-Id: I91c50fe987cbb22ed76f8012094730c592c63506 +> Fixes: #1050 +> Signed-off-by: Mohit Agrawal +> (Cherry pick from commit 177cc09d24515596eb51739ce0a276c26e3c52f1) +> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/23952/) + +Change-Id: I91c50fe987cbb22ed76f8012094730c592c63506 +Bug: 1612973 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/220870 +Tested-by: Mohit Agrawal +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + rpc/rpc-transport/socket/src/socket.c | 32 ++++++++++++++++++++++++++++++++ + 1 file changed, 32 insertions(+) + +diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c +index 54cd5df..1ee7320 100644 +--- a/rpc/rpc-transport/socket/src/socket.c ++++ b/rpc/rpc-transport/socket/src/socket.c +@@ -4238,6 +4238,34 @@ static void __attribute__((destructor)) fini_openssl_mt(void) + ERR_free_strings(); + } + ++/* The function returns 0 if AES bit is enabled on the CPU */ ++static int ++ssl_check_aes_bit(void) ++{ ++ FILE *fp = fopen("/proc/cpuinfo", "r"); ++ int ret = 1; ++ size_t len = 0; ++ char *line = NULL; ++ char *match = NULL; ++ ++ GF_ASSERT(fp != NULL); ++ ++ while (getline(&line, &len, fp) > 0) { ++ if (!strncmp(line, "flags", 5)) { ++ match = strstr(line, " aes"); ++ if ((match != NULL) && ((match[4] == ' ') || (match[4] == 0))) { ++ ret = 0; ++ break; ++ } ++ } ++ } ++ ++ free(line); ++ fclose(fp); ++ ++ return ret; ++} ++ + static int + ssl_setup_connection_params(rpc_transport_t *this) + { +@@ -4261,6 +4289,10 @@ ssl_setup_connection_params(rpc_transport_t *this) + return 0; + } + ++ if (!ssl_check_aes_bit()) { ++ cipher_list = "AES128:" DEFAULT_CIPHER_LIST; ++ } ++ + priv->ssl_own_cert = DEFAULT_CERT_PATH; + if (dict_get_str(this->options, SSL_OWN_CERT_OPT, &optstr) == 0) { + if (!priv->ssl_enabled) { +-- +1.8.3.1 + diff --git 
a/SOURCES/0498-geo-rep-Fix-corner-case-in-rename-on-mkdir-during-hy.patch b/SOURCES/0498-geo-rep-Fix-corner-case-in-rename-on-mkdir-during-hy.patch new file mode 100644 index 0000000..078c390 --- /dev/null +++ b/SOURCES/0498-geo-rep-Fix-corner-case-in-rename-on-mkdir-during-hy.patch @@ -0,0 +1,69 @@ +From 11d648660b8bd246756f87b2f40c72fbabf084d1 Mon Sep 17 00:00:00 2001 +From: Sunny Kumar +Date: Tue, 19 May 2020 16:13:01 +0100 +Subject: [PATCH 498/511] geo-rep: Fix corner case in rename on mkdir during + hybrid crawl +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Problem: +The issue is being hit during hybrid mode while handling rename on slave. +In this special case the rename is recorded as mkdir and geo-rep process it +by resolving the path form backend. + +While resolving the backend path during this special handling one corner case is not considered. + + +Traceback (most recent call last): +  File "/usr/libexec/glusterfs/python/syncdaemon/repce.py", line 118, in worker +    res = getattr(self.obj, rmeth)(*in_data[2:]) +  File "/usr/libexec/glusterfs/python/syncdaemon/resource.py", line 588, in entry_ops +    src_entry = get_slv_dir_path(slv_host, slv_volume, gfid) +  File "/usr/libexec/glusterfs/python/syncdaemon/syncdutils.py", line 710, in get_slv_dir_path +    dir_entry = os.path.join(pfx, pargfid, basename) +  File "/usr/lib64/python2.7/posixpath.py", line 75, in join +    if b.startswith('/'): +AttributeError: 'int' object has no attribute 'startswith' + +In pyhthon3: +Traceback (most recent call last): + File "", line 1, in + File "/usr/lib64/python3.8/posixpath.py", line 90, in join + genericpath._check_arg_types('join', a, *p) + File "/usr/lib64/python3.8/genericpath.py", line 152, in _check_arg_types + raise TypeError(f'{funcname}() argument must be str, bytes, or ' +TypeError: join() argument must be str, bytes, or os.PathLike object, not 'int' + + +>Change-Id: I8b926899c60ad8c4ffc886d57028ba70fd21e332 +>Fixes: #1250 +>Signed-off-by: Sunny Kumar +Upstream Patch: https://review.gluster.org/c/glusterfs/+/24468/ + +BUG: 1835229 +Change-Id: I8b926899c60ad8c4ffc886d57028ba70fd21e332 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/220867 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + geo-replication/syncdaemon/syncdutils.py | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index d5a94d4..26c79d0 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -732,6 +732,8 @@ def get_slv_dir_path(slv_host, slv_volume, gfid): + else: + dirpath = dirpath.strip("/") + pargfid = get_gfid_from_mnt(dirpath) ++ if isinstance(pargfid, int): ++ return None + dir_entry = os.path.join(pfx, pargfid, basename) + return dir_entry + +-- +1.8.3.1 + diff --git a/SOURCES/0499-gfapi-give-appropriate-error-when-size-exceeds.patch b/SOURCES/0499-gfapi-give-appropriate-error-when-size-exceeds.patch new file mode 100644 index 0000000..edeca1a --- /dev/null +++ b/SOURCES/0499-gfapi-give-appropriate-error-when-size-exceeds.patch @@ -0,0 +1,63 @@ +From f78a5d86c55149d80b6efdf60eae7221c238654e Mon Sep 17 00:00:00 2001 +From: Rinku Kothiya +Date: Thu, 24 Sep 2020 12:43:51 +0000 +Subject: [PATCH 499/511] gfapi: give appropriate error when size exceeds + +This patch help generate appropriate error message +when the gfapi tries to write data equal to or 
+greater than 1 Gb due to the limitation at the +socket layer. + +Upstream: +> Reviewed-on: https://github.com/gluster/glusterfs/pull/1557 +> fixes: #1518 +> Change-Id: I1234a0b5a6e675a0b20c6b1afe0f4390fd721f6f +> Signed-off-by: Rinku Kothiya + +BUG: 1691320 +Change-Id: I1234a0b5a6e675a0b20c6b1afe0f4390fd721f6f +Signed-off-by: Rinku Kothiya +Signed-off-by: Sunil Kumar Acharya +Reviewed-on: https://code.engineering.redhat.com/gerrit/219998 +Tested-by: RHGS Build Bot +--- + api/src/gfapi-messages.h | 4 +++- + api/src/glfs-fops.c | 8 ++++++++ + 2 files changed, 11 insertions(+), 1 deletion(-) + +diff --git a/api/src/gfapi-messages.h b/api/src/gfapi-messages.h +index 68d1242..2ffd5ac 100644 +--- a/api/src/gfapi-messages.h ++++ b/api/src/gfapi-messages.h +@@ -49,6 +49,8 @@ GLFS_MSGID(API, API_MSG_MEM_ACCT_INIT_FAILED, API_MSG_MASTER_XLATOR_INIT_FAILED, + API_MSG_INODE_LINK_FAILED, API_MSG_STATEDUMP_FAILED, + API_MSG_XREADDIRP_R_FAILED, API_MSG_LOCK_INSERT_MERGE_FAILED, + API_MSG_SETTING_LOCK_TYPE_FAILED, API_MSG_INODE_FIND_FAILED, +- API_MSG_FDCTX_SET_FAILED, API_MSG_UPCALL_SYNCOP_FAILED); ++ API_MSG_FDCTX_SET_FAILED, API_MSG_UPCALL_SYNCOP_FAILED, ++ API_MSG_INVALID_ARG); + ++#define API_MSG_INVALID_ARG_STR "Invalid" + #endif /* !_GFAPI_MESSAGES_H__ */ +diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c +index e6adea5..051541f 100644 +--- a/api/src/glfs-fops.c ++++ b/api/src/glfs-fops.c +@@ -1525,6 +1525,14 @@ glfs_pwritev_common(struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt, + + GF_REF_GET(glfd); + ++ if (iovec->iov_len >= GF_UNIT_GB) { ++ ret = -1; ++ errno = EINVAL; ++ gf_smsg(THIS->name, GF_LOG_ERROR, errno, API_MSG_INVALID_ARG, ++ "size >= %llu is not allowed", GF_UNIT_GB, NULL); ++ goto out; ++ } ++ + subvol = glfs_active_subvol(glfd->fs); + if (!subvol) { + ret = -1; +-- +1.8.3.1 + diff --git a/SOURCES/0500-features-shard-Convert-shard-block-indices-to-uint64.patch b/SOURCES/0500-features-shard-Convert-shard-block-indices-to-uint64.patch new file mode 100644 index 0000000..4898422 --- /dev/null +++ b/SOURCES/0500-features-shard-Convert-shard-block-indices-to-uint64.patch @@ -0,0 +1,104 @@ +From 60789c658ea22063c26168cb4ce15ac5fd279e58 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Mon, 14 Dec 2020 10:57:03 +0530 +Subject: [PATCH 500/511] features/shard: Convert shard block indices to uint64 + +This patch fixes a crash in FOPs that operate on really large sharded +files where number of participant shards could sometimes exceed +signed int32 max. + +The patch also adds GF_ASSERTs to ensure that number of participating +shards is always greater than 0 for files that do have more than one +shard. 
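+
+A worked illustration of the overflow, using example numbers (the shard
+block size is configurable; 4MB is chosen here only for the arithmetic):
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    int main(void)
+    {
+        uint64_t block_size = 4ULL << 20;     /* 4MB shards */
+        uint64_t offset = 1ULL << 62;         /* huge (sparse) offset */
+        uint64_t block = offset / block_size; /* 2^40 blocks */
+        printf("block index %llu vs INT32_MAX %d\n",
+               (unsigned long long)block, INT32_MAX);
+        return 0;
+    }
+
+A signed 32-bit index cannot hold 2^40, so a FOP addressing such a
+shard would truncate the index; the uint64_t conversion below avoids
+that.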
+ +Upstream: +> https://review.gluster.org/#/c/glusterfs/+/23407/ +> Change-Id: I354de58796f350eb1aa42fcdf8092ca2e69ccbb6 +> Fixes: #1348 +> Signed-off-by: Krutika Dhananjay + +BUG: 1752739 +Change-Id: I354de58796f350eb1aa42fcdf8092ca2e69ccbb6 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/221061 +Tested-by: Ravishankar Narayanankutty +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +--- + xlators/features/shard/src/shard.c | 14 ++++++++------ + xlators/features/shard/src/shard.h | 6 +++--- + 2 files changed, 11 insertions(+), 9 deletions(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 16d557b..a967f35 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -1855,10 +1855,9 @@ int shard_truncate_last_shard(call_frame_t *frame, xlator_t *this, + */ + if (!inode) { + gf_msg_debug(this->name, 0, +- "Last shard to be truncated absent" +- " in backend: %s. Directly proceeding to update " +- "file size", +- uuid_utoa(inode->gfid)); ++ "Last shard to be truncated absent in backend: " PRIu64 ++ " of gfid: %s. Directly proceeding to update file size", ++ local->first_block, uuid_utoa(local->loc.inode->gfid)); + shard_update_file_size(frame, this, NULL, &local->loc, + shard_post_update_size_truncate_handler); + return 0; +@@ -2389,6 +2388,7 @@ int shard_truncate_begin(call_frame_t *frame, xlator_t *this) { + get_highest_block(0, local->prebuf.ia_size, local->block_size); + + local->num_blocks = local->last_block - local->first_block + 1; ++ GF_ASSERT(local->num_blocks > 0); + local->resolver_base_inode = + (local->fop == GF_FOP_TRUNCATE) ? local->loc.inode : local->fd->inode; + +@@ -4809,6 +4809,7 @@ int shard_post_lookup_readv_handler(call_frame_t *frame, xlator_t *this) { + get_highest_block(local->offset, local->total_size, local->block_size); + + local->num_blocks = local->last_block - local->first_block + 1; ++ GF_ASSERT(local->num_blocks > 0); + local->resolver_base_inode = local->loc.inode; + + local->inode_list = +@@ -5266,6 +5267,7 @@ int shard_common_inode_write_post_lookup_handler(call_frame_t *frame, + local->last_block = + get_highest_block(local->offset, local->total_size, local->block_size); + local->num_blocks = local->last_block - local->first_block + 1; ++ GF_ASSERT(local->num_blocks > 0); + local->inode_list = + GF_CALLOC(local->num_blocks, sizeof(inode_t *), gf_shard_mt_inode_list); + if (!local->inode_list) { +@@ -5274,8 +5276,8 @@ int shard_common_inode_write_post_lookup_handler(call_frame_t *frame, + } + + gf_msg_trace( +- this->name, 0, "%s: gfid=%s first_block=%" PRIu32 " " +- "last_block=%" PRIu32 " num_blocks=%" PRIu32 ++ this->name, 0, "%s: gfid=%s first_block=%" PRIu64 " " ++ "last_block=%" PRIu64 " num_blocks=%" PRIu64 + " offset=%" PRId64 " total_size=%zu flags=%" PRId32 "", + gf_fop_list[local->fop], uuid_utoa(local->resolver_base_inode->gfid), + local->first_block, local->last_block, local->num_blocks, local->offset, +diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h +index 1721417..4fe181b 100644 +--- a/xlators/features/shard/src/shard.h ++++ b/xlators/features/shard/src/shard.h +@@ -254,9 +254,9 @@ typedef int32_t (*shard_post_update_size_fop_handler_t)(call_frame_t *frame, + typedef struct shard_local { + int op_ret; + int op_errno; +- int first_block; +- int last_block; +- int num_blocks; ++ uint64_t first_block; ++ uint64_t last_block; ++ uint64_t num_blocks; + int 
call_count; + int eexist_count; + int create_count; +-- +1.8.3.1 + diff --git a/SOURCES/0501-Cli-Removing-old-syntax-of-tier-cmds-from-help-menu.patch b/SOURCES/0501-Cli-Removing-old-syntax-of-tier-cmds-from-help-menu.patch new file mode 100644 index 0000000..5152df8 --- /dev/null +++ b/SOURCES/0501-Cli-Removing-old-syntax-of-tier-cmds-from-help-menu.patch @@ -0,0 +1,48 @@ +From 070698ede9c3765c95364e8207c8311dbf895499 Mon Sep 17 00:00:00 2001 +From: kiyer +Date: Tue, 8 Dec 2020 15:18:49 +0530 +Subject: [PATCH 501/511] Cli: Removing old syntax of tier cmds from help menu + +Remove old syntax of attach-tier and detach-tier +commands from help menu. + +Label: DOWNSTREAM ONLY +BUG: 1813866 + +Change-Id: If86e4828b475fb593a5105ca8deac96374f9542d +Signed-off-by: kiyer +Reviewed-on: https://code.engineering.redhat.com/gerrit/220510 +Tested-by: RHGS Build Bot +Reviewed-by: Mohit Agrawal +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + cli/src/cli-cmd-volume.c | 13 ------------- + 1 file changed, 13 deletions(-) + +diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c +index 6f5bf8b..b6bef80 100644 +--- a/cli/src/cli-cmd-volume.c ++++ b/cli/src/cli-cmd-volume.c +@@ -3331,19 +3331,6 @@ struct cli_cmd tier_cmds[] = { + {"volume tier detach ", + cli_cmd_volume_tier_cbk, "Detach the hot tier from "}, + +- {"volume attach-tier [] ...", +- cli_cmd_volume_tier_cbk, +- "NOTE: this is old syntax, will be deprecated in next release. " +- "Please use gluster volume tier attach " +- "[] ..."}, +- +- {"volume detach-tier " +- "", +- cli_cmd_volume_tier_cbk, +- "NOTE: this is old syntax, will be deprecated in next release. " +- "Please use gluster volume tier detach " +- "{start|stop|commit} [force]"}, +- + {"volume tier status\n" + "volume tier start [force]\n" + "volume tier stop\n" +-- +1.8.3.1 + diff --git a/SOURCES/0502-dht-fixing-a-permission-update-issue.patch b/SOURCES/0502-dht-fixing-a-permission-update-issue.patch new file mode 100644 index 0000000..7c136d0 --- /dev/null +++ b/SOURCES/0502-dht-fixing-a-permission-update-issue.patch @@ -0,0 +1,225 @@ +From 3f1eee125a35c33ecb078e5d3bfd80d80e63881d Mon Sep 17 00:00:00 2001 +From: Barak Sason Rofman +Date: Wed, 15 Jan 2020 12:02:05 +0200 +Subject: [PATCH 502/511] dht - fixing a permission update issue + +When bringing back a downed brick and performing lookup from the client +side, the permission on said brick aren't updated on the first lookup, +but only on the second. + +This patch modifies permission update logic so the first lookup will +trigger a permission update on the downed brick. + +LIMITATIONS OF THE PATCH: +As the choice of source depends on whether the directory has layout or not. +Even the directories on the newly added brick will have layout xattr[zeroed], but the same is not true for a root directory. +Hence, in case in the entire cluster only the newly added bricks are up [and others are down], then any change in permission during this time will be overwritten by the older permissions when the cluster is restarted. 
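+
+A short sketch of the source-selection rule applied for the root
+directory (identifiers as in the dht-common.c hunk below):
+
+    /* among replies that carry a layout, take ownership and
+     * permissions from the subvolume with the newest ctime */
+    if (is_greater_time(local->prebuf.ia_ctime, local->prebuf.ia_ctime_nsec,
+                        stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
+        local->prebuf.ia_uid = stbuf->ia_uid;
+        local->prebuf.ia_gid = stbuf->ia_gid;
+        local->prebuf.ia_prot = stbuf->ia_prot;
+    }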
+ +Upstream: +> Reviewed-on: https://review.gluster.org/#/c/glusterfs/+/24020/ +> fixes: #999 +> Change-Id: Ieb70246d41e59f9cae9f70bc203627a433dfbd33 +> Signed-off-by: Barak Sason Rofman + +BUG: 1663821 +Change-Id: Ieb70246d41e59f9cae9f70bc203627a433dfbd33 +Signed-off-by: Barak Sason Rofman +Reviewed-on: https://code.engineering.redhat.com/gerrit/221116 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/bug-1064147.t | 71 ++++++++++++++++++++++++++++++++ + xlators/cluster/dht/src/dht-common.c | 28 ++++++++++--- + xlators/cluster/dht/src/dht-selfheal.c | 15 +++++-- + xlators/storage/posix/src/posix-common.c | 16 +++---- + 4 files changed, 111 insertions(+), 19 deletions(-) + create mode 100755 tests/bugs/bug-1064147.t + +diff --git a/tests/bugs/bug-1064147.t b/tests/bugs/bug-1064147.t +new file mode 100755 +index 0000000..617a1aa +--- /dev/null ++++ b/tests/bugs/bug-1064147.t +@@ -0,0 +1,71 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../include.rc ++. $(dirname $0)/../volume.rc ++ ++# Initialize ++#------------------------------------------------------------ ++cleanup; ++ ++# Start glusterd ++TEST glusterd; ++TEST pidof glusterd; ++TEST $CLI volume info; ++ ++# Create a volume ++TEST $CLI volume create $V0 $H0:/${V0}{1,2}; ++ ++# Verify volume creation ++ EXPECT "$V0" volinfo_field $V0 'Volume Name'; ++ EXPECT 'Created' volinfo_field $V0 'Status'; ++ ++# Start volume and verify successful start ++ TEST $CLI volume start $V0; ++ EXPECT 'Started' volinfo_field $V0 'Status'; ++ TEST glusterfs -s $H0 --volfile-id=$V0 $M0 ++#------------------------------------------------------------ ++ ++# Test case 1 - Subvolume down + Healing ++#------------------------------------------------------------ ++# Kill 2nd brick process ++TEST kill -9 `ps aux | grep glusterfsd | grep ${V0}2 | grep -v grep | awk '{print $2}'`; ++ ++# Change root permissions ++TEST chmod 444 $M0 ++ ++# Store permission for comparision ++TEST permission_new=`stat -c "%A" $M0` ++ ++# Bring up the killed brick process ++TEST $CLI volume start $V0 force ++ ++# Perform lookup ++sleep 5 ++TEST ls $M0 ++ ++# Check brick permissions ++TEST brick_perm=`stat -c "%A" /${V0}2` ++TEST [ ${brick_perm} = ${permission_new} ] ++#------------------------------------------------------------ ++ ++# Test case 2 - Add-brick + Healing ++#------------------------------------------------------------ ++# Change root permissions ++TEST chmod 777 $M0 ++ ++# Store permission for comparision ++TEST permission_new_2=`stat -c "%A" $M0` ++ ++# Add a 3rd brick ++TEST $CLI volume add-brick $V0 $H0:/${V0}3 ++ ++# Perform lookup ++sleep 5 ++TEST ls $M0 ++ ++# Check permissions on the new brick ++TEST brick_perm2=`stat -c "%A" /${V0}3` ++ ++TEST [ ${brick_perm2} = ${permission_new_2} ] ++ ++cleanup; +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index 4db89df..fe1d0ee 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -1363,13 +1363,29 @@ dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + dht_aggregate_xattr(local->xattr, xattr); + } + ++ if (__is_root_gfid(stbuf->ia_gfid)) { ++ ret = dht_dir_has_layout(xattr, conf->xattr_name); ++ if (ret >= 0) { ++ if (is_greater_time(local->prebuf.ia_ctime, ++ local->prebuf.ia_ctime_nsec, ++ stbuf->ia_ctime, stbuf->ia_ctime_nsec)) { ++ /* Choose source */ ++ local->prebuf.ia_gid = stbuf->ia_gid; ++ local->prebuf.ia_uid = stbuf->ia_uid; ++ ++ local->prebuf.ia_ctime = stbuf->ia_ctime; ++ 
local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec; ++ local->prebuf.ia_prot = stbuf->ia_prot; ++ } ++ } ++ } ++ + if (local->stbuf.ia_type != IA_INVAL) { + /* This is not the first subvol to respond */ +- if (!__is_root_gfid(stbuf->ia_gfid) && +- ((local->stbuf.ia_gid != stbuf->ia_gid) || +- (local->stbuf.ia_uid != stbuf->ia_uid) || +- (is_permission_different(&local->stbuf.ia_prot, +- &stbuf->ia_prot)))) { ++ if ((local->stbuf.ia_gid != stbuf->ia_gid) || ++ (local->stbuf.ia_uid != stbuf->ia_uid) || ++ (is_permission_different(&local->stbuf.ia_prot, ++ &stbuf->ia_prot))) { + local->need_attrheal = 1; + } + } +@@ -10969,7 +10985,7 @@ dht_notify(xlator_t *this, int event, void *data, ...) + if ((cmd == GF_DEFRAG_CMD_STATUS) || + (cmd == GF_DEFRAG_CMD_STATUS_TIER) || + (cmd == GF_DEFRAG_CMD_DETACH_STATUS)) +- gf_defrag_status_get(conf, output, _gf_false); ++ gf_defrag_status_get(conf, output, _gf_false); + else if (cmd == GF_DEFRAG_CMD_START_DETACH_TIER) + gf_defrag_start_detach_tier(defrag); + else if (cmd == GF_DEFRAG_CMD_DETACH_START) +diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c +index f5dfff9..f4e17d1 100644 +--- a/xlators/cluster/dht/src/dht-selfheal.c ++++ b/xlators/cluster/dht/src/dht-selfheal.c +@@ -2097,9 +2097,18 @@ dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref(this, layout); + +- if (local->need_attrheal && !IA_ISINVAL(local->mds_stbuf.ia_type)) { +- /*Use the one in the mds_stbuf*/ +- local->stbuf = local->mds_stbuf; ++ if (local->need_attrheal) { ++ if (__is_root_gfid(local->stbuf.ia_gfid)) { ++ local->stbuf.ia_gid = local->prebuf.ia_gid; ++ local->stbuf.ia_uid = local->prebuf.ia_uid; ++ ++ local->stbuf.ia_ctime = local->prebuf.ia_ctime; ++ local->stbuf.ia_ctime_nsec = local->prebuf.ia_ctime_nsec; ++ local->stbuf.ia_prot = local->prebuf.ia_prot; ++ ++ } else if (!IA_ISINVAL(local->mds_stbuf.ia_type)) { ++ local->stbuf = local->mds_stbuf; ++ } + } + + if (!__is_root_gfid(local->stbuf.ia_gfid)) { +diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c +index c5a43a1..e5c6e62 100644 +--- a/xlators/storage/posix/src/posix-common.c ++++ b/xlators/storage/posix/src/posix-common.c +@@ -598,6 +598,7 @@ posix_init(xlator_t *this) + int force_directory = -1; + int create_mask = -1; + int create_directory_mask = -1; ++ char value; + + dir_data = dict_get(this->options, "directory"); + +@@ -654,16 +655,11 @@ posix_init(xlator_t *this) + } + + /* Check for Extended attribute support, if not present, log it */ +- op_ret = sys_lsetxattr(dir_data->data, "trusted.glusterfs.test", "working", +- 8, 0); +- if (op_ret != -1) { +- ret = sys_lremovexattr(dir_data->data, "trusted.glusterfs.test"); +- if (ret) { +- gf_msg(this->name, GF_LOG_DEBUG, errno, P_MSG_INVALID_OPTION, +- "failed to remove xattr: " +- "trusted.glusterfs.test"); +- } +- } else { ++ size = sys_lgetxattr(dir_data->data, "user.x", &value, sizeof(value)); ++ ++ if ((size == -1) && (errno == EOPNOTSUPP)) { ++ gf_msg(this->name, GF_LOG_DEBUG, 0, P_MSG_XDATA_GETXATTR, ++ "getxattr returned %zd", size); + tmp_data = dict_get(this->options, "mandate-attribute"); + if (tmp_data) { + if (gf_string2boolean(tmp_data->data, &tmp_bool) == -1) { +-- +1.8.3.1 + diff --git a/SOURCES/0503-gfapi-Suspend-synctasks-instead-of-blocking-them.patch b/SOURCES/0503-gfapi-Suspend-synctasks-instead-of-blocking-them.patch new file mode 100644 index 
0000000..466bf4e --- /dev/null +++ b/SOURCES/0503-gfapi-Suspend-synctasks-instead-of-blocking-them.patch @@ -0,0 +1,179 @@ +From 5946a6ec18976c0f52162fe0f47e9b5171af87ec Mon Sep 17 00:00:00 2001 +From: Soumya Koduri +Date: Mon, 6 Apr 2020 12:36:44 +0530 +Subject: [PATCH 503/511] gfapi: Suspend synctasks instead of blocking them + +There are certain conditions which blocks the current +execution thread (like waiting on mutex lock or condition +variable or I/O response). In such cases, if it is a +synctask thread, we should suspend the task instead +of blocking it (like done in SYNCOP using synctask_yield) + +This is to avoid deadlock like the one mentioned below - + +1) synctaskA sets fs->migration_in_progress to 1 and + does I/O (LOOKUP) +2) Other synctask threads wait for fs->migration_in_progress + to be reset to 0 by synctaskA and hence blocked +3) but synctaskA cannot resume as all synctask threads are blocked + on (2). + +Note: this same approach is already used by few other components +like syncbarrier etc. + +>Change-Id: If90f870d663bb242c702a5b86ac52eeda67c6f0d +>Fixes: #1146 +>Signed-off-by: Soumya Koduri +Upstream patch: https://review.gluster.org/c/glusterfs/+/24276 + +BUG: 1779238 +Change-Id: If90f870d663bb242c702a5b86ac52eeda67c6f0d +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/221081 +Tested-by: RHGS Build Bot +Reviewed-by: Soumya Koduri +--- + api/src/glfs-internal.h | 34 ++++++++++++++++++++++++++++++++-- + api/src/glfs-resolve.c | 9 +++++++++ + api/src/glfs.c | 9 +++++++++ + 3 files changed, 50 insertions(+), 2 deletions(-) + +diff --git a/api/src/glfs-internal.h b/api/src/glfs-internal.h +index 55401b2..15cf0ee 100644 +--- a/api/src/glfs-internal.h ++++ b/api/src/glfs-internal.h +@@ -16,6 +16,7 @@ + #include + #include "glfs-handles.h" + #include ++#include + + #define GLFS_SYMLINK_MAX_FOLLOW 2048 + +@@ -207,6 +208,7 @@ struct glfs { + glfs_upcall_cbk up_cbk; /* upcall cbk function to be registered */ + void *up_data; /* Opaque data provided by application + * during upcall registration */ ++ struct list_head waitq; /* waiting synctasks */ + }; + + /* This enum is used to maintain the state of glfd. In case of async fops +@@ -442,6 +444,34 @@ glfs_process_upcall_event(struct glfs *fs, void *data) + THIS = glfd->fd->inode->table->xl->ctx->master; \ + } while (0) + ++#define __GLFS_LOCK_WAIT(fs) \ ++ do { \ ++ struct synctask *task = NULL; \ ++ \ ++ task = synctask_get(); \ ++ \ ++ if (task) { \ ++ list_add_tail(&task->waitq, &fs->waitq); \ ++ pthread_mutex_unlock(&fs->mutex); \ ++ synctask_yield(task, NULL); \ ++ pthread_mutex_lock(&fs->mutex); \ ++ } else { \ ++ /* non-synctask */ \ ++ pthread_cond_wait(&fs->cond, &fs->mutex); \ ++ } \ ++ } while (0) ++ ++#define __GLFS_SYNCTASK_WAKE(fs) \ ++ do { \ ++ struct synctask *waittask = NULL; \ ++ \ ++ while (!list_empty(&fs->waitq)) { \ ++ waittask = list_entry(fs->waitq.next, struct synctask, waitq); \ ++ list_del_init(&waittask->waitq); \ ++ synctask_wake(waittask); \ ++ } \ ++ } while (0) ++ + /* + By default all lock attempts from user context must + use glfs_lock() and glfs_unlock(). 
This allows +@@ -466,10 +496,10 @@ glfs_lock(struct glfs *fs, gf_boolean_t wait_for_migration) + pthread_mutex_lock(&fs->mutex); + + while (!fs->init) +- pthread_cond_wait(&fs->cond, &fs->mutex); ++ __GLFS_LOCK_WAIT(fs); + + while (wait_for_migration && fs->migration_in_progress) +- pthread_cond_wait(&fs->cond, &fs->mutex); ++ __GLFS_LOCK_WAIT(fs); + + return 0; + } +diff --git a/api/src/glfs-resolve.c b/api/src/glfs-resolve.c +index 062b7dc..58b6ace 100644 +--- a/api/src/glfs-resolve.c ++++ b/api/src/glfs-resolve.c +@@ -65,6 +65,9 @@ __glfs_first_lookup(struct glfs *fs, xlator_t *subvol) + fs->migration_in_progress = 0; + pthread_cond_broadcast(&fs->cond); + ++ /* wake up other waiting tasks */ ++ __GLFS_SYNCTASK_WAKE(fs); ++ + return ret; + } + +@@ -154,6 +157,9 @@ __glfs_refresh_inode(struct glfs *fs, xlator_t *subvol, inode_t *inode, + fs->migration_in_progress = 0; + pthread_cond_broadcast(&fs->cond); + ++ /* wake up other waiting tasks */ ++ __GLFS_SYNCTASK_WAKE(fs); ++ + return newinode; + } + +@@ -841,6 +847,9 @@ __glfs_migrate_fd(struct glfs *fs, xlator_t *newsubvol, struct glfs_fd *glfd) + fs->migration_in_progress = 0; + pthread_cond_broadcast(&fs->cond); + ++ /* wake up other waiting tasks */ ++ __GLFS_SYNCTASK_WAKE(fs); ++ + return newfd; + } + +diff --git a/api/src/glfs.c b/api/src/glfs.c +index f36616d..ae994fa 100644 +--- a/api/src/glfs.c ++++ b/api/src/glfs.c +@@ -740,6 +740,7 @@ glfs_new_fs(const char *volname) + + INIT_LIST_HEAD(&fs->openfds); + INIT_LIST_HEAD(&fs->upcall_list); ++ INIT_LIST_HEAD(&fs->waitq); + + PTHREAD_MUTEX_INIT(&fs->mutex, NULL, fs->pthread_flags, GLFS_INIT_MUTEX, + err); +@@ -1228,6 +1229,7 @@ pub_glfs_fini(struct glfs *fs) + call_pool_t *call_pool = NULL; + int fs_init = 0; + int err = -1; ++ struct synctask *waittask = NULL; + + DECLARE_OLD_THIS; + +@@ -1249,6 +1251,13 @@ pub_glfs_fini(struct glfs *fs) + + call_pool = fs->ctx->pool; + ++ /* Wake up any suspended synctasks */ ++ while (!list_empty(&fs->waitq)) { ++ waittask = list_entry(fs->waitq.next, struct synctask, waitq); ++ list_del_init(&waittask->waitq); ++ synctask_wake(waittask); ++ } ++ + while (countdown--) { + /* give some time for background frames to finish */ + pthread_mutex_lock(&fs->mutex); +-- +1.8.3.1 + diff --git a/SOURCES/0504-io-stats-Configure-ios_sample_buf_size-based-on-samp.patch b/SOURCES/0504-io-stats-Configure-ios_sample_buf_size-based-on-samp.patch new file mode 100644 index 0000000..21d7f7f --- /dev/null +++ b/SOURCES/0504-io-stats-Configure-ios_sample_buf_size-based-on-samp.patch @@ -0,0 +1,109 @@ +From baa566be8832a56fdea7068d84844ec1ec84d8d9 Mon Sep 17 00:00:00 2001 +From: mohit84 +Date: Thu, 15 Oct 2020 16:28:58 +0530 +Subject: [PATCH 504/511] io-stats: Configure ios_sample_buf_size based on + sample_interval value (#1574) + +The io-stats xlator declares a 64k-entry (10M) ios_sample_buf_size object per +xlator, but when sample_interval is 0 this big buffer is not required, so declare +the default value only while sample_interval is not 0. The change helps reduce the +RSS size of brick and shd processes while the number of volumes is huge.
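The sizing rule above is easier to see in isolation. Below is a minimal standalone C sketch; sample_conf, SAMPLE_BUF_FULL and SAMPLE_BUF_MIN are illustrative stand-ins rather than the actual io-stats types, and the real patch takes the enabled-case size from the ios-sample-buf-size option instead of a constant:

#include <stdio.h>
#include <stdlib.h>

#define SAMPLE_BUF_FULL (64 * 1024) /* full ring buffer, only useful when sampling */
#define SAMPLE_BUF_MIN 1024         /* small fallback when sampling is off */

struct sample_conf {
    int sample_interval; /* 0 means sampling is disabled */
    size_t buf_size;
    double *buf;
};

/* Allocate the big buffer only when sampling is actually enabled. */
static int sample_conf_init(struct sample_conf *conf, int interval)
{
    conf->sample_interval = interval;
    conf->buf_size = interval ? SAMPLE_BUF_FULL : SAMPLE_BUF_MIN;
    conf->buf = calloc(conf->buf_size, sizeof(*conf->buf));
    return conf->buf ? 0 : -1;
}

int main(void)
{
    struct sample_conf c;
    if (sample_conf_init(&c, 0) == 0) {
        printf("allocated %zu slots\n", c.buf_size);
        free(c.buf);
    }
    return 0;
}

With sampling disabled, the per-xlator cost drops from the full ring buffer to the small fallback, which is what adds up when hundreds of volumes share one process.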
+ +> Change-Id: I3e82cca92e40549355edfac32580169f3ce51af8 +> Fixes: #1542 +> Signed-off-by: Mohit Agrawal +> (Cherry picked from commit f71660eb879a9cd5761e5adbf10c783e959a990a) +> (Reviewed on upstream link https://github.com/gluster/glusterfs/issues/1542) + +Change-Id: I3e82cca92e40549355edfac32580169f3ce51af8 +BUG: 1898778 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/221183 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/glusterd/daemon-log-level-option.t | 8 ++++---- + xlators/debug/io-stats/src/io-stats.c | 26 ++++++++++++++++++++++---- + 2 files changed, 26 insertions(+), 8 deletions(-) + +diff --git a/tests/bugs/glusterd/daemon-log-level-option.t b/tests/bugs/glusterd/daemon-log-level-option.t +index 66e55e3..5352a63 100644 +--- a/tests/bugs/glusterd/daemon-log-level-option.t ++++ b/tests/bugs/glusterd/daemon-log-level-option.t +@@ -61,8 +61,8 @@ rm -f /var/log/glusterfs/glustershd.log + TEST $CLI volume set all cluster.daemon-log-level WARNING + TEST $CLI volume start $V0 + +-# log should not have any info messages +-EXPECT 0 Info_messages_count "/var/log/glusterfs/glustershd.log" ++# log does have 1 info message specific to configure ios_sample_buf_size in io-stats xlator ++EXPECT 1 Info_messages_count "/var/log/glusterfs/glustershd.log" + + # log should not have any debug messages + EXPECT 0 Debug_messages_count "/var/log/glusterfs/glustershd.log" +@@ -78,8 +78,8 @@ rm -f /var/log/glusterfs/glustershd.log + TEST $CLI volume set all cluster.daemon-log-level ERROR + TEST $CLI volume start $V0 + +-# log should not have any info messages +-EXPECT 0 Info_messages_count "/var/log/glusterfs/glustershd.log" ++# log does have 1 info message specific to configure ios_sample_buf_size in io-stats xlator ++EXPECT 1 Info_messages_count "/var/log/glusterfs/glustershd.log" + + # log should not have any warning messages + EXPECT 0 Warning_messages_count "/var/log/glusterfs/glustershd.log" +diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c +index aa91a0a..9b34895 100644 +--- a/xlators/debug/io-stats/src/io-stats.c ++++ b/xlators/debug/io-stats/src/io-stats.c +@@ -3724,6 +3724,15 @@ xlator_set_loglevel(xlator_t *this, int log_level) + } + } + ++void ++ios_sample_buf_size_configure(char *name, struct ios_conf *conf) ++{ ++ conf->ios_sample_buf_size = 1024; ++ gf_log(name, GF_LOG_INFO, ++ "Configure ios_sample_buf " ++ " size is 1024 because ios_sample_interval is 0"); ++} ++ + int + reconfigure(xlator_t *this, dict_t *options) + { +@@ -3779,8 +3788,13 @@ reconfigure(xlator_t *this, dict_t *options) + int32, out); + GF_OPTION_RECONF("ios-dump-format", dump_format_str, options, str, out); + ios_set_log_format_code(conf, dump_format_str); +- GF_OPTION_RECONF("ios-sample-buf-size", conf->ios_sample_buf_size, options, +- int32, out); ++ if (conf->ios_sample_interval) { ++ GF_OPTION_RECONF("ios-sample-buf-size", conf->ios_sample_buf_size, ++ options, int32, out); ++ } else { ++ ios_sample_buf_size_configure(this->name, conf); ++ } ++ + GF_OPTION_RECONF("sys-log-level", sys_log_str, options, str, out); + if (sys_log_str) { + sys_log_level = glusterd_check_log_level(sys_log_str); +@@ -3947,8 +3961,12 @@ init(xlator_t *this) + GF_OPTION_INIT("ios-dump-format", dump_format_str, str, out); + ios_set_log_format_code(conf, dump_format_str); + +- GF_OPTION_INIT("ios-sample-buf-size", conf->ios_sample_buf_size, int32, +- out); ++ if (conf->ios_sample_interval) { ++ 
GF_OPTION_INIT("ios-sample-buf-size", conf->ios_sample_buf_size, int32, ++ out); ++ } else { ++ ios_sample_buf_size_configure(this->name, conf); ++ } + + ret = ios_init_sample_buf(conf); + if (ret) { +-- +1.8.3.1 + diff --git a/SOURCES/0505-trash-Create-inode_table-only-while-feature-is-enabl.patch b/SOURCES/0505-trash-Create-inode_table-only-while-feature-is-enabl.patch new file mode 100644 index 0000000..a0f6b62 --- /dev/null +++ b/SOURCES/0505-trash-Create-inode_table-only-while-feature-is-enabl.patch @@ -0,0 +1,107 @@ +From 43a8e2c7441b14f5f238cb11d83f32f248b16abb Mon Sep 17 00:00:00 2001 +From: Mohit Agrawal +Date: Tue, 13 Oct 2020 18:56:20 +0530 +Subject: [PATCH 505/511] trash: Create inode_table only while feature is + enabled + +Currently the trash xlator creates an inode table (1M) even if the +feature is not enabled. In a brick_mux environment, while 250 +bricks are attached to a single brick process and the feature +is not enabled, the brick process increases its RSS size unnecessarily. + +Solution: Create the inode_table only while the feature is enabled. +The patch reduces RSS size by 250M per brick process +if the trash feature is not enabled. + +> Change-Id: I11a6fd2b8419fe2988f398be6ec30fb4f3b99a5d +> Fixes: #1543 +> Signed-off-by: Mohit Agrawal +> (Cherry pick from commit 32f25e7b1b4b080ab2640e178b407c878e629376) +> (Reviewed on upstream link https://github.com/gluster/glusterfs/issues/1543) + +Change-Id: I11a6fd2b8419fe2988f398be6ec30fb4f3b99a5d +BUG: 1898781 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/221184 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/trash/src/trash.c | 47 +++++++++++++++++++++++++++++++++++--- + 1 file changed, 44 insertions(+), 3 deletions(-) + +diff --git a/xlators/features/trash/src/trash.c b/xlators/features/trash/src/trash.c +index f96ed73..93f020f 100644 +--- a/xlators/features/trash/src/trash.c ++++ b/xlators/features/trash/src/trash.c +@@ -2235,16 +2235,47 @@ reconfigure(xlator_t *this, dict_t *options) + char trash_dir[PATH_MAX] = { + 0, + }; ++ gf_boolean_t active_earlier = _gf_false; ++ gf_boolean_t active_now = _gf_false; + + priv = this->private; + + GF_VALIDATE_OR_GOTO("trash", priv, out); + ++ active_earlier = priv->state; ++ GF_OPTION_RECONF("trash", active_now, options, bool, out); ++ ++ /* Disable of trash feature is not allowed at this point until ++ we are not able to find an approach to cleanup resource ++ gracefully. Here to disable the feature need to destroy inode ++ table and currently it is difficult to ensure inode is not ++ being used ++ */ ++ if (active_earlier && !active_now) { ++ gf_log(this->name, GF_LOG_INFO, ++ "Disable of trash feature is not allowed " ++ "during graph reconfigure"); ++ ret = 0; ++ goto out; ++ } ++ ++ if (!active_earlier && active_now) { ++ if (!priv->trash_itable) { ++ priv->trash_itable = inode_table_new(0, this); ++ if (!priv->trash_itable) { ++ ret = -ENOMEM; ++ gf_log(this->name, GF_LOG_ERROR, ++ "failed to create trash inode_table" ++ " during graph reconfigure"); ++ goto out; ++ } ++ } ++ priv->state = active_now; ++ } ++ + GF_OPTION_RECONF("trash-internal-op", priv->internal, options, bool, out); + GF_OPTION_RECONF("trash-dir", tmp, options, str, out); + +- GF_OPTION_RECONF("trash", priv->state, options, bool, out); +- + if (priv->state) { + ret = create_or_rename_trash_directory(this); + +@@ -2501,7 +2532,17 @@ init(xlator_t *this) + goto out; + } + +- priv->trash_itable = inode_table_new(0, this); ++ if (priv->state) { ++ priv->trash_itable = inode_table_new(0, this); ++ if (!priv->trash_itable) { ++ ret = -ENOMEM; ++ priv->state = _gf_false; ++ gf_log(this->name, GF_LOG_ERROR, ++ "failed to create trash inode_table disable trash"); ++ goto out; ++ } ++ } ++ + gf_log(this->name, GF_LOG_DEBUG, "brick path is%s", priv->brick_path); + + this->private = (void *)priv; +-- +1.8.3.1 + diff --git a/SOURCES/0506-posix-Attach-a-posix_spawn_disk_thread-with-glusterf.patch b/SOURCES/0506-posix-Attach-a-posix_spawn_disk_thread-with-glusterf.patch new file mode 100644 index 0000000..cf978f5 --- /dev/null +++ b/SOURCES/0506-posix-Attach-a-posix_spawn_disk_thread-with-glusterf.patch @@ -0,0 +1,499 @@ +From 17a9ce965ef2fec9ee5c8e4b76981bb7cbcf1352 Mon Sep 17 00:00:00 2001 +From: mohit84 +Date: Mon, 9 Nov 2020 17:15:42 +0530 +Subject: [PATCH 506/511] posix: Attach a posix_spawn_disk_thread with + glusterfs_ctx (#1595) + +Currently the posix xlator spawns a posix_disk_space thread per brick; in a +brick_mux environment, while glusterd attaches bricks at the maximum +level (250) to a single brick process, 250 threads are +spawned for all bricks and the brick process memory size also increases.
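A minimal standalone sketch of the per-process checker pattern this patch adopts follows; the Solution paragraph below summarizes the real implementation. All names here (struct brick, disk_check_thread) are illustrative rather than the real glusterfs_ctx_t/posix_diskxl types, and the real code additionally marks entries with is_use/detach_notify flags so a brick can detach safely while the checker is running:

#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Illustrative registry of bricks -- not the real glusterfs types. */
struct brick {
    struct brick *next;
    const char *path;
};

static struct brick *registry;
static int nbricks;
static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;

/* One thread per process visits every registered brick every 5 seconds,
 * instead of one dedicated thread per brick. */
static void *disk_check_thread(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lk);
    while (nbricks > 0) {
        struct timespec until;

        for (struct brick *b = registry; b; b = b->next)
            printf("disk-space check for %s\n", b->path); /* statvfs() here */

        clock_gettime(CLOCK_REALTIME, &until);
        until.tv_sec += 5;
        /* Signalled early when the last brick deregisters, so the thread exits. */
        pthread_cond_timedwait(&cv, &lk, &until);
    }
    pthread_mutex_unlock(&lk);
    return NULL;
}

int main(void)
{
    struct brick b2 = {NULL, "/bricks/b2"}, b1 = {&b2, "/bricks/b1"};
    pthread_t t;

    registry = &b1;
    nbricks = 2;
    pthread_create(&t, NULL, disk_check_thread, NULL);

    sleep(6); /* let the checker make a pass or two */

    pthread_mutex_lock(&lk);
    registry = NULL;
    nbricks = 0;
    pthread_cond_signal(&cv);
    pthread_mutex_unlock(&lk);
    pthread_join(t, NULL);
    return 0;
}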
+ +Solution: Attach a posix_disk_space thread with glusterfs_ctx to + spawn a thread per process basis instead of spawning a per brick + +> Fixes: #1482 +> Change-Id: I8dd88f252a950495b71742e2a7588bd5bb019ec7 +> Cherry-picked from commit 3f93be77e1acf5baacafa97a320e91e6879d1c0e +> Reviewed on upstream link https://github.com/gluster/glusterfs/issues/1482 +> Signed-off-by: Mohit Agrawal + +Change-Id: I8dd88f252a950495b71742e2a7588bd5bb019ec7 +Bug: 1898776 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/220366 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + glusterfsd/src/glusterfsd.c | 4 + + libglusterfs/src/glusterfs/glusterfs.h | 6 ++ + xlators/storage/posix/src/posix-common.c | 68 +++++++++++-- + xlators/storage/posix/src/posix-handle.h | 3 +- + xlators/storage/posix/src/posix-helpers.c | 131 ++++++++++++++----------- + xlators/storage/posix/src/posix-inode-fd-ops.c | 3 +- + xlators/storage/posix/src/posix-mem-types.h | 1 + + xlators/storage/posix/src/posix.h | 12 ++- + 8 files changed, 160 insertions(+), 68 deletions(-) + +diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c +index 955bf1d..ac25255 100644 +--- a/glusterfsd/src/glusterfsd.c ++++ b/glusterfsd/src/glusterfsd.c +@@ -1840,9 +1840,13 @@ glusterfs_ctx_defaults_init(glusterfs_ctx_t *ctx) + INIT_LIST_HEAD(&cmd_args->xlator_options); + INIT_LIST_HEAD(&cmd_args->volfile_servers); + ctx->pxl_count = 0; ++ ctx->diskxl_count = 0; + pthread_mutex_init(&ctx->fd_lock, NULL); + pthread_cond_init(&ctx->fd_cond, NULL); + INIT_LIST_HEAD(&ctx->janitor_fds); ++ pthread_mutex_init(&ctx->xl_lock, NULL); ++ pthread_cond_init(&ctx->xl_cond, NULL); ++ INIT_LIST_HEAD(&ctx->diskth_xl); + + lim.rlim_cur = RLIM_INFINITY; + lim.rlim_max = RLIM_INFINITY; +diff --git a/libglusterfs/src/glusterfs/glusterfs.h b/libglusterfs/src/glusterfs/glusterfs.h +index bf6a987..d3400bf 100644 +--- a/libglusterfs/src/glusterfs/glusterfs.h ++++ b/libglusterfs/src/glusterfs/glusterfs.h +@@ -740,7 +740,13 @@ struct _glusterfs_ctx { + pthread_t janitor; + /* The variable is use to save total posix xlator count */ + uint32_t pxl_count; ++ uint32_t diskxl_count; + ++ /* List of posix xlator use by disk thread*/ ++ struct list_head diskth_xl; ++ pthread_mutex_t xl_lock; ++ pthread_cond_t xl_cond; ++ pthread_t disk_space_check; + char volume_id[GF_UUID_BUF_SIZE]; /* Used only in protocol/client */ + }; + typedef struct _glusterfs_ctx glusterfs_ctx_t; +diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c +index e5c6e62..2c9030b 100644 +--- a/xlators/storage/posix/src/posix-common.c ++++ b/xlators/storage/posix/src/posix-common.c +@@ -138,6 +138,36 @@ posix_inode(xlator_t *this) + return 0; + } + ++static void ++delete_posix_diskxl(xlator_t *this) ++{ ++ struct posix_private *priv = this->private; ++ struct posix_diskxl *pxl = priv->pxl; ++ glusterfs_ctx_t *ctx = this->ctx; ++ uint32_t count = 1; ++ ++ if (pxl) { ++ pthread_mutex_lock(&ctx->xl_lock); ++ { ++ pxl->detach_notify = _gf_true; ++ while (pxl->is_use) ++ pthread_cond_wait(&pxl->cond, &ctx->xl_lock); ++ list_del_init(&pxl->list); ++ priv->pxl = NULL; ++ count = --ctx->diskxl_count; ++ if (count == 0) ++ pthread_cond_signal(&ctx->xl_cond); ++ } ++ pthread_mutex_unlock(&ctx->xl_lock); ++ pthread_cond_destroy(&pxl->cond); ++ GF_FREE(pxl); ++ if (count == 0) { ++ pthread_join(ctx->disk_space_check, NULL); ++ ctx->disk_space_check = 0; ++ } ++ } ++} ++ + /** + * notify - when parent sends 
PARENT_UP, send CHILD_UP event from here + */ +@@ -194,6 +224,8 @@ posix_notify(xlator_t *this, int32_t event, void *data, ...) + } + pthread_mutex_unlock(&ctx->fd_lock); + ++ delete_posix_diskxl(this); ++ + gf_log(this->name, GF_LOG_INFO, "Sending CHILD_DOWN for brick %s", + victim->name); + default_notify(this->parents->xlator, GF_EVENT_CHILD_DOWN, data); +@@ -318,6 +350,7 @@ posix_reconfigure(xlator_t *this, dict_t *options) + int32_t force_directory_mode = -1; + int32_t create_mask = -1; + int32_t create_directory_mask = -1; ++ double old_disk_reserve = 0.0; + + priv = this->private; + +@@ -383,6 +416,7 @@ posix_reconfigure(xlator_t *this, dict_t *options) + " fallback to :"); + } + ++ old_disk_reserve = priv->disk_reserve; + GF_OPTION_RECONF("reserve", priv->disk_reserve, options, percent_or_size, + out); + /* option can be any one of percent or bytes */ +@@ -390,11 +424,19 @@ posix_reconfigure(xlator_t *this, dict_t *options) + if (priv->disk_reserve < 100.0) + priv->disk_unit = 'p'; + +- if (priv->disk_reserve) { ++ /* Delete a pxl object from a list of disk_reserve while something ++ is changed for reserve option during graph reconfigure ++ */ ++ if (old_disk_reserve != priv->disk_reserve) { ++ delete_posix_diskxl(this); ++ old_disk_reserve = 0; ++ } ++ ++ if (!old_disk_reserve && priv->disk_reserve) { + ret = posix_spawn_disk_space_check_thread(this); + if (ret) { + gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_DISK_SPACE_CHECK_FAILED, +- "Getting disk space check from thread failed"); ++ "Getting disk space check from thread failed "); + goto out; + } + } +@@ -1008,13 +1050,13 @@ posix_init(xlator_t *this) + " fallback to :"); + } + +- _private->disk_space_check_active = _gf_false; + _private->disk_space_full = 0; + + GF_OPTION_INIT("reserve", _private->disk_reserve, percent_or_size, out); + + /* option can be any one of percent or bytes */ + _private->disk_unit = 0; ++ pthread_cond_init(&_private->fd_cond, NULL); + if (_private->disk_reserve < 100.0) + _private->disk_unit = 'p'; + +@@ -1162,12 +1204,6 @@ posix_fini(xlator_t *this) + priv->health_check = 0; + } + +- if (priv->disk_space_check) { +- priv->disk_space_check_active = _gf_false; +- (void)gf_thread_cleanup_xint(priv->disk_space_check); +- priv->disk_space_check = 0; +- } +- + if (priv->janitor) { + /*TODO: Make sure the synctask is also complete */ + ret = gf_tw_del_timer(this->ctx->tw->timer_wheel, priv->janitor); +@@ -1192,10 +1228,24 @@ posix_fini(xlator_t *this) + pthread_join(ctx->janitor, NULL); + } + ++ pthread_mutex_lock(&ctx->xl_lock); ++ { ++ count = --ctx->diskxl_count; ++ if (count == 0) ++ pthread_cond_signal(&ctx->xl_cond); ++ } ++ pthread_mutex_unlock(&ctx->xl_lock); ++ ++ if (count == 0) { ++ pthread_join(ctx->disk_space_check, NULL); ++ ctx->disk_space_check = 0; ++ } ++ + if (priv->fsyncer) { + (void)gf_thread_cleanup_xint(priv->fsyncer); + priv->fsyncer = 0; + } ++ + /*unlock brick dir*/ + if (priv->mount_lock) + (void)sys_closedir(priv->mount_lock); +diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h +index c4d7cb1..8e4c719 100644 +--- a/xlators/storage/posix/src/posix-handle.h ++++ b/xlators/storage/posix/src/posix-handle.h +@@ -206,5 +206,6 @@ int + posix_check_internal_writes(xlator_t *this, fd_t *fd, int sysfd, dict_t *xdata); + + void +-posix_disk_space_check(xlator_t *this); ++posix_disk_space_check(struct posix_private* priv); ++ + #endif /* !_POSIX_HANDLE_H */ +diff --git a/xlators/storage/posix/src/posix-helpers.c 
b/xlators/storage/posix/src/posix-helpers.c +index ceac52a..110d383 100644 +--- a/xlators/storage/posix/src/posix-helpers.c ++++ b/xlators/storage/posix/src/posix-helpers.c +@@ -2284,9 +2284,8 @@ unlock: + } + + void +-posix_disk_space_check(xlator_t *this) ++posix_disk_space_check(struct posix_private *priv) + { +- struct posix_private *priv = NULL; + char *subvol_path = NULL; + int op_ret = 0; + double size = 0; +@@ -2295,16 +2294,14 @@ posix_disk_space_check(xlator_t *this) + double totsz = 0; + double freesz = 0; + +- GF_VALIDATE_OR_GOTO(this->name, this, out); +- priv = this->private; +- GF_VALIDATE_OR_GOTO(this->name, priv, out); ++ GF_VALIDATE_OR_GOTO("posix-helpers", priv, out); + + subvol_path = priv->base_path; + + op_ret = sys_statvfs(subvol_path, &buf); + + if (op_ret == -1) { +- gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_STATVFS_FAILED, ++ gf_msg("posix-disk", GF_LOG_ERROR, errno, P_MSG_STATVFS_FAILED, + "statvfs failed on %s", subvol_path); + goto out; + } +@@ -2328,78 +2325,102 @@ out: + } + + static void * +-posix_disk_space_check_thread_proc(void *data) ++posix_ctx_disk_thread_proc(void *data) + { +- xlator_t *this = NULL; + struct posix_private *priv = NULL; ++ glusterfs_ctx_t *ctx = NULL; + uint32_t interval = 0; +- int ret = -1; +- +- this = data; +- priv = this->private; ++ struct posix_diskxl *pthis = NULL; ++ xlator_t *this = NULL; ++ struct timespec sleep_till = { ++ 0, ++ }; + ++ ctx = data; + interval = 5; +- gf_msg_debug(this->name, 0, +- "disk-space thread started, " ++ ++ gf_msg_debug("glusterfs_ctx", 0, ++ "Ctx disk-space thread started, " + "interval = %d seconds", + interval); +- while (1) { +- /* aborting sleep() is a request to exit this thread, sleep() +- * will normally not return when cancelled */ +- ret = sleep(interval); +- if (ret > 0) +- break; +- /* prevent thread errors while doing the health-check(s) */ +- pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); +- +- /* Do the disk-check.*/ +- posix_disk_space_check(this); +- if (!priv->disk_space_check_active) +- goto out; +- pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); +- } + +-out: +- gf_msg_debug(this->name, 0, "disk space check thread exiting"); +- LOCK(&priv->lock); ++ pthread_mutex_lock(&ctx->xl_lock); + { +- priv->disk_space_check_active = _gf_false; ++ while (ctx->diskxl_count > 0) { ++ list_for_each_entry(pthis, &ctx->diskth_xl, list) ++ { ++ pthis->is_use = _gf_true; ++ pthread_mutex_unlock(&ctx->xl_lock); ++ ++ THIS = this = pthis->xl; ++ priv = this->private; ++ ++ posix_disk_space_check(priv); ++ ++ pthread_mutex_lock(&ctx->xl_lock); ++ pthis->is_use = _gf_false; ++ /* Send a signal to posix_notify function */ ++ if (pthis->detach_notify) ++ pthread_cond_signal(&pthis->cond); ++ } ++ ++ timespec_now_realtime(&sleep_till); ++ sleep_till.tv_sec += 5; ++ (void)pthread_cond_timedwait(&ctx->xl_cond, &ctx->xl_lock, ++ &sleep_till); ++ } + } +- UNLOCK(&priv->lock); ++ pthread_mutex_unlock(&ctx->xl_lock); + + return NULL; + } + + int +-posix_spawn_disk_space_check_thread(xlator_t *xl) ++posix_spawn_disk_space_check_thread(xlator_t *this) + { +- struct posix_private *priv = NULL; +- int ret = -1; ++ int ret = 0; ++ glusterfs_ctx_t *ctx = this->ctx; ++ struct posix_diskxl *pxl = NULL; ++ struct posix_private *priv = this->private; + +- priv = xl->private; ++ pxl = GF_CALLOC(1, sizeof(struct posix_diskxl), gf_posix_mt_diskxl_t); ++ if (!pxl) { ++ ret = -ENOMEM; ++ gf_log(this->name, GF_LOG_ERROR, ++ "Calloc is failed to allocate " ++ "memory for diskxl object"); ++ goto out; ++ } ++ 
pthread_cond_init(&pxl->cond, NULL); + +- LOCK(&priv->lock); ++ pthread_mutex_lock(&ctx->xl_lock); + { +- /* cancel the running thread */ +- if (priv->disk_space_check_active == _gf_true) { +- pthread_cancel(priv->disk_space_check); +- priv->disk_space_check_active = _gf_false; +- } ++ if (ctx->diskxl_count++ == 0) { ++ ret = gf_thread_create(&ctx->disk_space_check, NULL, ++ posix_ctx_disk_thread_proc, ctx, ++ "posixctxres"); + +- ret = gf_thread_create(&priv->disk_space_check, NULL, +- posix_disk_space_check_thread_proc, xl, +- "posix_reserve"); +- if (ret) { +- priv->disk_space_check_active = _gf_false; +- gf_msg(xl->name, GF_LOG_ERROR, errno, P_MSG_DISK_SPACE_CHECK_FAILED, +- "unable to setup disk space check thread"); +- goto unlock; ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_THREAD_FAILED, ++ "spawning disk space check thread failed"); ++ ctx->diskxl_count--; ++ pthread_mutex_unlock(&ctx->xl_lock); ++ goto out; ++ } + } ++ pxl->xl = this; ++ priv->pxl = (void *)pxl; ++ list_add_tail(&pxl->list, &ctx->diskth_xl); ++ } ++ pthread_mutex_unlock(&ctx->xl_lock); + +- priv->disk_space_check_active = _gf_true; ++out: ++ if (ret) { ++ if (pxl) { ++ pthread_cond_destroy(&pxl->cond); ++ GF_FREE(pxl); ++ } + } +-unlock: +- UNLOCK(&priv->lock); + return ret; + } + +diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c +index 1d37aed..761e018 100644 +--- a/xlators/storage/posix/src/posix-inode-fd-ops.c ++++ b/xlators/storage/posix/src/posix-inode-fd-ops.c +@@ -37,6 +37,7 @@ + #include + #endif /* HAVE_LINKAT */ + ++#include "posix-handle.h" + #include + #include + #include +@@ -713,7 +714,7 @@ posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + option behaviour + */ + if (priv->disk_reserve) +- posix_disk_space_check(this); ++ posix_disk_space_check(priv); + + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, ret, ret, unlock); + +diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h +index 2253f38..bb4c56d 100644 +--- a/xlators/storage/posix/src/posix-mem-types.h ++++ b/xlators/storage/posix/src/posix-mem-types.h +@@ -20,6 +20,7 @@ enum gf_posix_mem_types_ { + gf_posix_mt_paiocb, + gf_posix_mt_inode_ctx_t, + gf_posix_mt_mdata_attr, ++ gf_posix_mt_diskxl_t, + gf_posix_mt_end + }; + #endif +diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h +index 07f367b..4be979c 100644 +--- a/xlators/storage/posix/src/posix.h ++++ b/xlators/storage/posix/src/posix.h +@@ -36,7 +36,6 @@ + #include + #include + #include "posix-mem-types.h" +-#include "posix-handle.h" + #include + + #ifdef HAVE_LIBAIO +@@ -138,6 +137,14 @@ struct posix_fd { + char _pad[4]; /* manual padding */ + }; + ++struct posix_diskxl { ++ pthread_cond_t cond; ++ struct list_head list; ++ xlator_t *xl; ++ gf_boolean_t detach_notify; ++ gf_boolean_t is_use; ++}; ++ + struct posix_private { + char *base_path; + int32_t base_path_length; +@@ -207,6 +214,7 @@ struct posix_private { + pthread_mutex_t janitor_mutex; + pthread_cond_t janitor_cond; + pthread_cond_t fd_cond; ++ pthread_cond_t disk_cond; + int fsync_queue_count; + + enum { +@@ -233,7 +241,6 @@ struct posix_private { + char disk_unit; + uint32_t disk_space_full; + pthread_t disk_space_check; +- gf_boolean_t disk_space_check_active; + + #ifdef GF_DARWIN_HOST_OS + enum { +@@ -263,6 +270,7 @@ struct posix_private { + gf_boolean_t ctime; + gf_boolean_t janitor_task_stop; + uint32_t rel_fdcount; ++ void *pxl; + }; + 
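The pxl pointer added to struct posix_private above ties each brick to its entry in the per-process registry, and teardown depends on a small condition-variable handshake: delete_posix_diskxl() sets detach_notify and waits until the checker thread clears is_use before unlinking the entry. A compressed sketch of that handshake, with hypothetical simplified types:

#include <pthread.h>

/* Hypothetical, simplified registry entry. */
struct disk_entry {
    pthread_cond_t cond;
    int is_use;        /* set while the checker thread works on this entry */
    int detach_notify; /* set when the brick wants out of the registry */
};

/* Brick detach path; `lk` is the registry lock. */
static void detach_entry(struct disk_entry *e, pthread_mutex_t *lk)
{
    pthread_mutex_lock(lk);
    e->detach_notify = 1;
    while (e->is_use) /* wait for the checker to finish with this entry */
        pthread_cond_wait(&e->cond, lk);
    /* now safe to unlink the entry from the list and free it */
    pthread_mutex_unlock(lk);
}

/* Checker side: mark the entry busy, do the check unlocked, hand it back. */
static void check_entry(struct disk_entry *e, pthread_mutex_t *lk)
{
    pthread_mutex_lock(lk);
    e->is_use = 1;
    pthread_mutex_unlock(lk);

    /* ... statvfs() and the reserve calculation run here without the lock ... */

    pthread_mutex_lock(lk);
    e->is_use = 0;
    if (e->detach_notify)
        pthread_cond_signal(&e->cond); /* wake the detaching brick */
    pthread_mutex_unlock(lk);
}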
+ typedef struct { +-- +1.8.3.1 + diff --git a/SOURCES/0507-inode-make-critical-section-smaller.patch b/SOURCES/0507-inode-make-critical-section-smaller.patch new file mode 100644 index 0000000..3b1dac5 --- /dev/null +++ b/SOURCES/0507-inode-make-critical-section-smaller.patch @@ -0,0 +1,764 @@ +From b3a17b67a69142eef1b4adde3409d5e54dda1e0b Mon Sep 17 00:00:00 2001 +From: Amar Tumballi +Date: Sat, 9 Feb 2019 13:23:06 +0530 +Subject: [PATCH 507/511] inode: make critical section smaller + +do all the 'static' tasks outside of locked region. + +* hash_dentry() and hash_gfid() are now called outside locked region. +* remove extra __dentry_hash exported in libglusterfs.sym +* avoid checks in locked functions, if the check is done in calling + function. +* implement dentry_destroy(), which handles freeing of dentry separately, + from that of dentry_unset (which takes care of separating dentry from + inode, and table) + +> Updates: bz#1670031 +> Change-Id: I584213e0748464bb427fbdef3c4ab6615d7d5eb0 +> Signed-off-by: Amar Tumballi +> (Cherry pick from commit 8a90d346b9d3f69ff11241feb0011c90a8e57e30) +> (Review on upstream link https://review.gluster.org/#/c/glusterfs/+/22184/) + +Change-Id: I584213e0748464bb427fbdef3c4ab6615d7d5eb0 +BUG: 1898777 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/221189 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/inode.h | 3 - + libglusterfs/src/inode.c | 323 +++++++++++++------------------------ + libglusterfs/src/libglusterfs.sym | 1 - + 3 files changed, 111 insertions(+), 216 deletions(-) + +diff --git a/libglusterfs/src/glusterfs/inode.h b/libglusterfs/src/glusterfs/inode.h +index 4421c47..c875653 100644 +--- a/libglusterfs/src/glusterfs/inode.h ++++ b/libglusterfs/src/glusterfs/inode.h +@@ -167,9 +167,6 @@ inode_rename(inode_table_t *table, inode_t *olddir, const char *oldname, + inode_t *newdir, const char *newname, inode_t *inode, + struct iatt *stbuf); + +-dentry_t * +-__dentry_grep(inode_table_t *table, inode_t *parent, const char *name); +- + inode_t * + inode_grep(inode_table_t *table, inode_t *parent, const char *name); + +diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c +index 4c3c546..71b2d2a 100644 +--- a/libglusterfs/src/inode.c ++++ b/libglusterfs/src/inode.c +@@ -159,27 +159,15 @@ hash_dentry(inode_t *parent, const char *name, int mod) + static int + hash_gfid(uuid_t uuid, int mod) + { +- int ret = 0; +- +- ret = uuid[15] + (uuid[14] << 8); +- +- return ret; ++ return ((uuid[15] + (uuid[14] << 8)) % mod); + } + + static void +-__dentry_hash(dentry_t *dentry) ++__dentry_hash(dentry_t *dentry, const int hash) + { + inode_table_t *table = NULL; +- int hash = 0; +- +- if (!dentry) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_DENTRY_NOT_FOUND, +- "dentry not found"); +- return; +- } + + table = dentry->inode->table; +- hash = hash_dentry(dentry->parent, dentry->name, table->hashsize); + + list_del_init(&dentry->hash); + list_add(&dentry->hash, &table->name_hash[hash]); +@@ -188,49 +176,44 @@ __dentry_hash(dentry_t *dentry) + static int + __is_dentry_hashed(dentry_t *dentry) + { +- if (!dentry) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_DENTRY_NOT_FOUND, +- "dentry not found"); +- return 0; +- } +- + return !list_empty(&dentry->hash); + } + + static void + __dentry_unhash(dentry_t *dentry) + { +- if (!dentry) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_DENTRY_NOT_FOUND, +- "dentry not 
found"); +- return; +- } +- + list_del_init(&dentry->hash); + } + + static void +-__dentry_unset(dentry_t *dentry) ++dentry_destroy(dentry_t *dentry) + { +- if (!dentry) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_DENTRY_NOT_FOUND, +- "dentry not found"); ++ if (!dentry) + return; +- } ++ ++ GF_FREE(dentry->name); ++ dentry->name = NULL; ++ mem_put(dentry); ++ ++ return; ++} ++ ++static dentry_t * ++__dentry_unset(dentry_t *dentry) ++{ ++ if (!dentry) ++ return NULL; + + __dentry_unhash(dentry); + + list_del_init(&dentry->inode_list); + +- GF_FREE(dentry->name); +- dentry->name = NULL; +- + if (dentry->parent) { + __inode_unref(dentry->parent, false); + dentry->parent = NULL; + } + +- mem_put(dentry); ++ return dentry; + } + + static int +@@ -289,22 +272,14 @@ static int + __is_dentry_cyclic(dentry_t *dentry) + { + int ret = 0; +- inode_t *inode = NULL; +- char *name = ""; + + ret = __foreach_ancestor_dentry(dentry, __check_cycle, dentry->inode); + if (ret) { +- inode = dentry->inode; +- +- if (dentry->name) +- name = dentry->name; +- + gf_msg(dentry->inode->table->name, GF_LOG_CRITICAL, 0, + LG_MSG_DENTRY_CYCLIC_LOOP, +- "detected cyclic loop " +- "formation during inode linkage. inode (%s) linking " +- "under itself as %s", +- uuid_utoa(inode->gfid), name); ++ "detected cyclic loop formation during inode linkage. " ++ "inode (%s) linking under itself as %s", ++ uuid_utoa(dentry->inode->gfid), dentry->name); + } + + return ret; +@@ -313,41 +288,19 @@ __is_dentry_cyclic(dentry_t *dentry) + static void + __inode_unhash(inode_t *inode) + { +- if (!inode) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, +- "inode not found"); +- return; +- } +- + list_del_init(&inode->hash); + } + + static int + __is_inode_hashed(inode_t *inode) + { +- if (!inode) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, +- "inode not found"); +- return 0; +- } +- + return !list_empty(&inode->hash); + } + + static void +-__inode_hash(inode_t *inode) ++__inode_hash(inode_t *inode, const int hash) + { +- inode_table_t *table = NULL; +- int hash = 0; +- +- if (!inode) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, +- "inode not found"); +- return; +- } +- +- table = inode->table; +- hash = hash_gfid(inode->gfid, 65536); ++ inode_table_t *table = inode->table; + + list_del_init(&inode->hash); + list_add(&inode->hash, &table->inode_hash[hash]); +@@ -359,12 +312,6 @@ __dentry_search_for_inode(inode_t *inode, uuid_t pargfid, const char *name) + dentry_t *dentry = NULL; + dentry_t *tmp = NULL; + +- if (!inode || !name) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, EINVAL, LG_MSG_INVALID_ARG, +- "inode || name not found"); +- return NULL; +- } +- + /* earlier, just the ino was sent, which could have been 0, now + we deal with gfid, and if sent gfid is null or 0, no need to + continue with the check */ +@@ -390,12 +337,6 @@ __inode_ctx_free(inode_t *inode) + xlator_t *xl = NULL; + xlator_t *old_THIS = NULL; + +- if (!inode) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, +- "inode not found"); +- return; +- } +- + if (!inode->_ctx) { + gf_msg(THIS->name, GF_LOG_WARNING, 0, LG_MSG_CTX_NULL, + "_ctx not found"); +@@ -424,12 +365,6 @@ noctx: + static void + __inode_destroy(inode_t *inode) + { +- if (!inode) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, +- "inode not found"); +- return; +- } +- + __inode_ctx_free(inode); + + LOCK_DESTROY(&inode->lock); +@@ -472,9 
+407,6 @@ inode_ctx_merge(fd_t *fd, inode_t *inode, inode_t *linked_inode) + static void + __inode_activate(inode_t *inode) + { +- if (!inode) +- return; +- + list_move(&inode->list, &inode->table->active); + inode->table->active_size++; + } +@@ -485,19 +417,13 @@ __inode_passivate(inode_t *inode) + dentry_t *dentry = NULL; + dentry_t *t = NULL; + +- if (!inode) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, +- "inode not found"); +- return; +- } +- + list_move_tail(&inode->list, &inode->table->lru); + inode->table->lru_size++; + + list_for_each_entry_safe(dentry, t, &inode->dentry_list, inode_list) + { + if (!__is_dentry_hashed(dentry)) +- __dentry_unset(dentry); ++ dentry_destroy(__dentry_unset(dentry)); + } + } + +@@ -507,12 +433,6 @@ __inode_retire(inode_t *inode) + dentry_t *dentry = NULL; + dentry_t *t = NULL; + +- if (!inode) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, +- "inode not found"); +- return; +- } +- + list_move_tail(&inode->list, &inode->table->purge); + inode->table->purge_size++; + +@@ -520,7 +440,7 @@ __inode_retire(inode_t *inode) + + list_for_each_entry_safe(dentry, t, &inode->dentry_list, inode_list) + { +- __dentry_unset(dentry); ++ dentry_destroy(__dentry_unset(dentry)); + } + } + +@@ -547,9 +467,6 @@ __inode_unref(inode_t *inode, bool clear) + xlator_t *this = NULL; + uint64_t nlookup = 0; + +- if (!inode) +- return NULL; +- + /* + * Root inode should always be in active list of inode table. So unrefs + * on root inode are no-ops. +@@ -677,16 +594,10 @@ inode_ref(inode_t *inode) + } + + static dentry_t * +-__dentry_create(inode_t *inode, inode_t *parent, const char *name) ++dentry_create(inode_t *inode, inode_t *parent, const char *name) + { + dentry_t *newd = NULL; + +- if (!inode || !parent || !name) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, EINVAL, LG_MSG_INVALID_ARG, +- "inode || parent || name not found"); +- return NULL; +- } +- + newd = mem_get0(parent->table->dentry_pool); + if (newd == NULL) { + goto out; +@@ -702,10 +613,6 @@ __dentry_create(inode_t *inode, inode_t *parent, const char *name) + goto out; + } + +- if (parent) +- newd->parent = __inode_ref(parent, false); +- +- list_add(&newd->inode_list, &inode->dentry_list); + newd->inode = inode; + + out: +@@ -717,14 +624,6 @@ __inode_create(inode_table_t *table) + { + inode_t *newi = NULL; + +- if (!table) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, +- LG_MSG_INODE_TABLE_NOT_FOUND, +- "table not " +- "found"); +- return NULL; +- } +- + newi = mem_get0(table->inode_pool); + if (!newi) { + goto out; +@@ -795,9 +694,6 @@ __inode_ref_reduce_by_n(inode_t *inode, uint64_t nref) + { + uint64_t nlookup = 0; + +- if (!inode) +- return NULL; +- + GF_ASSERT(inode->ref >= nref); + + inode->ref -= nref; +@@ -837,17 +733,12 @@ inode_forget_atomic(inode_t *inode, uint64_t nlookup) + } + + dentry_t * +-__dentry_grep(inode_table_t *table, inode_t *parent, const char *name) ++__dentry_grep(inode_table_t *table, inode_t *parent, const char *name, ++ const int hash) + { +- int hash = 0; + dentry_t *dentry = NULL; + dentry_t *tmp = NULL; + +- if (!table || !name || !parent) +- return NULL; +- +- hash = hash_dentry(parent, name, table->hashsize); +- + list_for_each_entry(tmp, &table->name_hash[hash], hash) + { + if (tmp->parent == parent && !strcmp(tmp->name, name)) { +@@ -872,15 +763,16 @@ inode_grep(inode_table_t *table, inode_t *parent, const char *name) + return NULL; + } + ++ int hash = hash_dentry(parent, name, table->hashsize); ++ + 
pthread_mutex_lock(&table->lock); + { +- dentry = __dentry_grep(table, parent, name); +- +- if (dentry) ++ dentry = __dentry_grep(table, parent, name, hash); ++ if (dentry) { + inode = dentry->inode; +- +- if (inode) +- __inode_ref(inode, false); ++ if (inode) ++ __inode_ref(inode, false); ++ } + } + pthread_mutex_unlock(&table->lock); + +@@ -947,17 +839,18 @@ inode_grep_for_gfid(inode_table_t *table, inode_t *parent, const char *name, + return ret; + } + ++ int hash = hash_dentry(parent, name, table->hashsize); ++ + pthread_mutex_lock(&table->lock); + { +- dentry = __dentry_grep(table, parent, name); +- +- if (dentry) ++ dentry = __dentry_grep(table, parent, name, hash); ++ if (dentry) { + inode = dentry->inode; +- +- if (inode) { +- gf_uuid_copy(gfid, inode->gfid); +- *type = inode->ia_type; +- ret = 0; ++ if (inode) { ++ gf_uuid_copy(gfid, inode->gfid); ++ *type = inode->ia_type; ++ ret = 0; ++ } + } + } + pthread_mutex_unlock(&table->lock); +@@ -978,25 +871,14 @@ __is_root_gfid(uuid_t gfid) + } + + inode_t * +-__inode_find(inode_table_t *table, uuid_t gfid) ++__inode_find(inode_table_t *table, uuid_t gfid, const int hash) + { + inode_t *inode = NULL; + inode_t *tmp = NULL; +- int hash = 0; +- +- if (!table) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, +- LG_MSG_INODE_TABLE_NOT_FOUND, +- "table not " +- "found"); +- goto out; +- } + + if (__is_root_gfid(gfid)) + return table->root; + +- hash = hash_gfid(gfid, 65536); +- + list_for_each_entry(tmp, &table->inode_hash[hash], hash) + { + if (gf_uuid_compare(tmp->gfid, gfid) == 0) { +@@ -1005,7 +887,6 @@ __inode_find(inode_table_t *table, uuid_t gfid) + } + } + +-out: + return inode; + } + +@@ -1022,9 +903,11 @@ inode_find(inode_table_t *table, uuid_t gfid) + return NULL; + } + ++ int hash = hash_gfid(gfid, 65536); ++ + pthread_mutex_lock(&table->lock); + { +- inode = __inode_find(table, gfid); ++ inode = __inode_find(table, gfid, hash); + if (inode) + __inode_ref(inode, false); + } +@@ -1035,7 +918,7 @@ inode_find(inode_table_t *table, uuid_t gfid) + + static inode_t * + __inode_link(inode_t *inode, inode_t *parent, const char *name, +- struct iatt *iatt) ++ struct iatt *iatt, const int dhash) + { + dentry_t *dentry = NULL; + dentry_t *old_dentry = NULL; +@@ -1043,16 +926,7 @@ __inode_link(inode_t *inode, inode_t *parent, const char *name, + inode_table_t *table = NULL; + inode_t *link_inode = NULL; + +- if (!inode) { +- errno = EINVAL; +- return NULL; +- } +- + table = inode->table; +- if (!table) { +- errno = EINVAL; +- return NULL; +- } + + if (parent) { + /* We should prevent inode linking between different +@@ -1090,14 +964,16 @@ __inode_link(inode_t *inode, inode_t *parent, const char *name, + return NULL; + } + +- old_inode = __inode_find(table, iatt->ia_gfid); ++ int ihash = hash_gfid(iatt->ia_gfid, 65536); ++ ++ old_inode = __inode_find(table, iatt->ia_gfid, ihash); + + if (old_inode) { + link_inode = old_inode; + } else { + gf_uuid_copy(inode->gfid, iatt->ia_gfid); + inode->ia_type = iatt->ia_type; +- __inode_hash(inode); ++ __inode_hash(inode, ihash); + } + } else { + /* @old_inode serves another important purpose - it indicates +@@ -1112,22 +988,16 @@ __inode_link(inode_t *inode, inode_t *parent, const char *name, + old_inode = inode; + } + +- if (name) { +- if (!strcmp(name, ".") || !strcmp(name, "..")) +- return link_inode; +- +- if (strchr(name, '/')) { +- GF_ASSERT(!"inode link attempted with '/' in name"); +- return NULL; +- } ++ if (name && (!strcmp(name, ".") || !strcmp(name, ".."))) { ++ return link_inode; + } + 
+ /* use only link_inode beyond this point */ + if (parent) { +- old_dentry = __dentry_grep(table, parent, name); ++ old_dentry = __dentry_grep(table, parent, name, dhash); + + if (!old_dentry || old_dentry->inode != link_inode) { +- dentry = __dentry_create(link_inode, parent, name); ++ dentry = dentry_create(link_inode, parent, name); + if (!dentry) { + gf_msg_callingfn( + THIS->name, GF_LOG_ERROR, 0, LG_MSG_DENTRY_CREATE_FAILED, +@@ -1137,15 +1007,20 @@ __inode_link(inode_t *inode, inode_t *parent, const char *name, + errno = ENOMEM; + return NULL; + } ++ ++ /* dentry linking needs to happen inside lock */ ++ dentry->parent = __inode_ref(parent, false); ++ list_add(&dentry->inode_list, &link_inode->dentry_list); ++ + if (old_inode && __is_dentry_cyclic(dentry)) { + errno = ELOOP; +- __dentry_unset(dentry); ++ dentry_destroy(__dentry_unset(dentry)); + return NULL; + } +- __dentry_hash(dentry); ++ __dentry_hash(dentry, dhash); + + if (old_dentry) +- __dentry_unset(old_dentry); ++ dentry_destroy(__dentry_unset(old_dentry)); + } + } + +@@ -1155,6 +1030,7 @@ __inode_link(inode_t *inode, inode_t *parent, const char *name, + inode_t * + inode_link(inode_t *inode, inode_t *parent, const char *name, struct iatt *iatt) + { ++ int hash = 0; + inode_table_t *table = NULL; + inode_t *linked_inode = NULL; + +@@ -1166,10 +1042,18 @@ inode_link(inode_t *inode, inode_t *parent, const char *name, struct iatt *iatt) + + table = inode->table; + ++ if (parent && name) { ++ hash = hash_dentry(parent, name, table->hashsize); ++ } ++ ++ if (name && strchr(name, '/')) { ++ GF_ASSERT(!"inode link attempted with '/' in name"); ++ return NULL; ++ } ++ + pthread_mutex_lock(&table->lock); + { +- linked_inode = __inode_link(inode, parent, name, iatt); +- ++ linked_inode = __inode_link(inode, parent, name, iatt, hash); + if (linked_inode) + __inode_ref(linked_inode, false); + } +@@ -1312,48 +1196,47 @@ inode_invalidate(inode_t *inode) + return ret; + } + +-static void ++static dentry_t * + __inode_unlink(inode_t *inode, inode_t *parent, const char *name) + { + dentry_t *dentry = NULL; + char pgfid[64] = {0}; + char gfid[64] = {0}; + +- if (!inode || !parent || !name) +- return; +- + dentry = __dentry_search_for_inode(inode, parent->gfid, name); + + /* dentry NULL for corrupted backend */ + if (dentry) { +- __dentry_unset(dentry); ++ dentry = __dentry_unset(dentry); + } else { + gf_msg("inode", GF_LOG_WARNING, 0, LG_MSG_DENTRY_NOT_FOUND, + "%s/%s: dentry not found in %s", + uuid_utoa_r(parent->gfid, pgfid), name, + uuid_utoa_r(inode->gfid, gfid)); + } ++ ++ return dentry; + } + + void + inode_unlink(inode_t *inode, inode_t *parent, const char *name) + { +- inode_table_t *table = NULL; ++ inode_table_t *table; ++ dentry_t *dentry; + +- if (!inode) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, +- "inode not found"); ++ if (!inode || !parent || !name) + return; +- } + + table = inode->table; + + pthread_mutex_lock(&table->lock); + { +- __inode_unlink(inode, parent, name); ++ dentry = __inode_unlink(inode, parent, name); + } + pthread_mutex_unlock(&table->lock); + ++ dentry_destroy(dentry); ++ + inode_table_prune(table); + } + +@@ -1362,6 +1245,9 @@ inode_rename(inode_table_t *table, inode_t *srcdir, const char *srcname, + inode_t *dstdir, const char *dstname, inode_t *inode, + struct iatt *iatt) + { ++ int hash = 0; ++ dentry_t *dentry = NULL; ++ + if (!inode) { + gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, + "inode not found"); +@@ -1370,13 +1256,26 @@ 
inode_rename(inode_table_t *table, inode_t *srcdir, const char *srcname, + + table = inode->table; + ++ if (dstname && strchr(dstname, '/')) { ++ GF_ASSERT(!"inode link attempted with '/' in name"); ++ return -1; ++ } ++ ++ if (dstdir && dstname) { ++ hash = hash_dentry(dstdir, dstname, table->hashsize); ++ } ++ + pthread_mutex_lock(&table->lock); + { +- __inode_link(inode, dstdir, dstname, iatt); +- __inode_unlink(inode, srcdir, srcname); ++ __inode_link(inode, dstdir, dstname, iatt, hash); ++ /* pick the old dentry */ ++ dentry = __inode_unlink(inode, srcdir, srcname); + } + pthread_mutex_unlock(&table->lock); + ++ /* free the old dentry */ ++ dentry_destroy(dentry); ++ + inode_table_prune(table); + + return 0; +@@ -1447,12 +1346,6 @@ inode_parent(inode_t *inode, uuid_t pargfid, const char *name) + static int + __inode_has_dentry(inode_t *inode) + { +- if (!inode) { +- gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, +- "inode not found"); +- return 0; +- } +- + return !list_empty(&inode->dentry_list); + } + +@@ -1461,6 +1354,12 @@ inode_has_dentry(inode_t *inode) + { + int dentry_present = 0; + ++ if (!inode) { ++ gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND, ++ "inode not found"); ++ return 0; ++ } ++ + LOCK(&inode->lock); + { + dentry_present = __inode_has_dentry(inode); +@@ -1720,7 +1619,7 @@ __inode_table_init_root(inode_table_t *table) + iatt.ia_ino = 1; + iatt.ia_type = IA_IFDIR; + +- __inode_link(root, NULL, NULL, &iatt); ++ __inode_link(root, NULL, NULL, &iatt, 0); + table->root = root; + } + +diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym +index 5a721e0..d060292 100644 +--- a/libglusterfs/src/libglusterfs.sym ++++ b/libglusterfs/src/libglusterfs.sym +@@ -357,7 +357,6 @@ default_copy_file_range + default_copy_file_range_cbk + default_copy_file_range_failure_cbk + default_copy_file_range_resume +-__dentry_grep + dht_is_linkfile + dict_add + dict_addn +-- +1.8.3.1 + diff --git a/SOURCES/0508-fuse-fetch-arbitrary-number-of-groups-from-proc-pid-.patch b/SOURCES/0508-fuse-fetch-arbitrary-number-of-groups-from-proc-pid-.patch new file mode 100644 index 0000000..9ccc1b5 --- /dev/null +++ b/SOURCES/0508-fuse-fetch-arbitrary-number-of-groups-from-proc-pid-.patch @@ -0,0 +1,232 @@ +From 87b7689f7727a542c5afa22bdebd3781dd650a2f Mon Sep 17 00:00:00 2001 +From: Csaba Henk +Date: Fri, 17 Jul 2020 11:33:36 +0200 +Subject: [PATCH 508/511] fuse: fetch arbitrary number of groups from + /proc/[pid]/status + +Glusterfs so far constrained itself with an arbitrary limit (32) +for the number of groups read from /proc/[pid]/status (this was +the number of groups shown there prior to Linux commit +v3.7-9553-g8d238027b87e (v3.8-rc1~74^2~59); since this commit, all +groups are shown). + +With this change we'll read groups up to the number Glusterfs +supports in general (64k). + +Note: the actual number of groups that are made use of in a +regular Glusterfs setup shall still be capped at ~93 due to limitations +of the RPC transport. To be able to handle more groups than that, +brick side gid resolution (server.manage-gids option) can be used along +with NIS, LDAP or other such networked directory service (see +https://github.com/gluster/glusterdocs/blob/5ba15a2/docs/Administrator%20Guide/Handling-of-users-with-many-groups.md#limit-in-the-glusterfs-protocol +). + +Also adding some diagnostic messages to frame_fill_groups(). 
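As a reference for the strategy described above, here is a standalone sketch of the grow-and-rescan parse of /proc/[pid]/status; read_proc_groups and MAX_AUX_GROUPS are illustrative names, and only the approach (small first guess, grow to the counted size, rewind and rescan) mirrors the patch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define MAX_AUX_GROUPS 65536 /* overall cap, analogous to GF_MAX_AUX_GROUPS */

/* Read every gid on the "Groups:" line of /proc/<pid>/status. The first
 * pass uses a small buffer; if the line holds more gids than fit, grow
 * the buffer to the counted size, rewind and rescan. */
static int read_proc_groups(pid_t pid, gid_t **out, int *n_out)
{
    char path[32], line[4096];
    if ((size_t)snprintf(path, sizeof path, "/proc/%d/status", pid) >= sizeof path)
        return -1;

    FILE *fp = fopen(path, "r");
    if (!fp)
        return -1;

    int capacity = 32; /* the old fixed limit becomes just a first guess */
    gid_t *groups = NULL;
    int ret = -1;

    for (;;) {
        gid_t *tmp = realloc(groups, capacity * sizeof *tmp);
        if (!tmp)
            break;
        groups = tmp;

        int found = 0, idx = 0;
        while (fgets(line, sizeof line, fp)) {
            if (strncmp(line, "Groups:", 7) == 0) {
                found = 1;
                break;
            }
        }
        if (!found)
            break; /* no Groups: line at all */

        char *save = NULL;
        for (char *tok = strtok_r(line + 7, " \t\r\n", &save); tok;
             tok = strtok_r(NULL, " \t\r\n", &save)) {
            if (idx < capacity)
                groups[idx] = (gid_t)strtol(tok, NULL, 10);
            if (++idx == MAX_AUX_GROUPS)
                break;
        }

        if (idx > capacity) { /* buffer was too small: grow and rescan */
            capacity = idx;
            rewind(fp);
            continue;
        }

        *out = groups;
        *n_out = idx;
        ret = 0;
        break;
    }

    if (ret != 0)
        free(groups);
    fclose(fp);
    return ret;
}

int main(void)
{
    gid_t *groups = NULL;
    int n = 0;
    if (read_proc_groups(getpid(), &groups, &n) == 0) {
        printf("%d groups\n", n);
        free(groups);
    }
    return 0;
}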
+ +Upstream: +> Reviewed-on: https://review.gluster.org/c/glusterfs/+/24721 +> Change-Id: I271f3dc3e6d3c44d6d989c7a2073ea5f16c26ee0 +> fixes: #1075 +> Signed-off-by: Csaba Henk + +BUG: 1749304 +Change-Id: I80bf99d34087fb95768bf2259d8c4774d9f5d0c5 +Signed-off-by: Csaba Henk +Reviewed-on: https://code.engineering.redhat.com/gerrit/220920 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/stack.h | 7 ++++ + tests/bugs/fuse/many-groups-for-acl.t | 13 ++++++- + xlators/mount/fuse/src/fuse-helpers.c | 71 +++++++++++++++++++++++------------ + 3 files changed, 65 insertions(+), 26 deletions(-) + +diff --git a/libglusterfs/src/glusterfs/stack.h b/libglusterfs/src/glusterfs/stack.h +index 1758550..bd466d8 100644 +--- a/libglusterfs/src/glusterfs/stack.h ++++ b/libglusterfs/src/glusterfs/stack.h +@@ -429,6 +429,7 @@ call_stack_alloc_groups(call_stack_t *stack, int ngrps) + if (ngrps <= SMALL_GROUP_COUNT) { + stack->groups = stack->groups_small; + } else { ++ GF_FREE(stack->groups_large); + stack->groups_large = GF_CALLOC(ngrps, sizeof(gid_t), + gf_common_mt_groups_t); + if (!stack->groups_large) +@@ -442,6 +443,12 @@ call_stack_alloc_groups(call_stack_t *stack, int ngrps) + } + + static inline int ++call_stack_groups_capacity(call_stack_t *stack) ++{ ++ return max(stack->ngrps, SMALL_GROUP_COUNT); ++} ++ ++static inline int + call_frames_count(call_stack_t *call_stack) + { + call_frame_t *pos; +diff --git a/tests/bugs/fuse/many-groups-for-acl.t b/tests/bugs/fuse/many-groups-for-acl.t +index d959f75..a51b1bc 100755 +--- a/tests/bugs/fuse/many-groups-for-acl.t ++++ b/tests/bugs/fuse/many-groups-for-acl.t +@@ -38,6 +38,13 @@ do + done + TEST useradd -o -M -u ${NEW_UID} -g ${NEW_GID} -G ${NEW_USER}-${NEW_GIDS} ${NEW_USER} + ++# Linux < 3.8 exports only first 32 gids of pid to userspace ++kernel_exports_few_gids=0 ++if [ "$OSTYPE" = Linux ] && \ ++ su -m ${NEW_USER} -c "grep ^Groups: /proc/self/status | wc -w | xargs -I@ expr @ - 1 '<' $LAST_GID - $NEW_GID + 1" > /dev/null; then ++ kernel_exports_few_gids=1 ++fi ++ + # preparation done, start the tests + + TEST glusterd +@@ -48,6 +55,8 @@ TEST $CLI volume set $V0 nfs.disable off + TEST $CLI volume set ${V0} server.manage-gids off + TEST $CLI volume start ${V0} + ++# This is just a synchronization hack to make sure the bricks are ++# up before going on. + EXPECT_WITHIN ${NFS_EXPORT_TIMEOUT} "1" is_nfs_export_available + + # mount the volume with POSIX ACL support, without --resolve-gids +@@ -69,8 +78,8 @@ TEST [ $? -eq 0 ] + su -m ${NEW_USER} -c "touch ${M0}/first-32-gids-2/success > /dev/null" + TEST [ $? -eq 0 ] + +-su -m ${NEW_USER} -c "touch ${M0}/gid-64/failure > /dev/null" +-TEST [ $? -ne 0 ] ++su -m ${NEW_USER} -c "touch ${M0}/gid-64/success--if-all-gids-exported > /dev/null" ++TEST [ $? -eq $kernel_exports_few_gids ] + + su -m ${NEW_USER} -c "touch ${M0}/gid-120/failure > /dev/null" + TEST [ $? 
-ne 0 ] +diff --git a/xlators/mount/fuse/src/fuse-helpers.c b/xlators/mount/fuse/src/fuse-helpers.c +index 5bfc40c..6e04cd4 100644 +--- a/xlators/mount/fuse/src/fuse-helpers.c ++++ b/xlators/mount/fuse/src/fuse-helpers.c +@@ -139,8 +139,6 @@ get_fuse_state(xlator_t *this, fuse_in_header_t *finh) + return state; + } + +-#define FUSE_MAX_AUX_GROUPS \ +- 32 /* We can get only up to 32 aux groups from /proc */ + void + frame_fill_groups(call_frame_t *frame) + { +@@ -150,8 +148,6 @@ frame_fill_groups(call_frame_t *frame) + char filename[32]; + char line[4096]; + char *ptr = NULL; +- FILE *fp = NULL; +- int idx = 0; + long int id = 0; + char *saveptr = NULL; + char *endptr = NULL; +@@ -191,45 +187,72 @@ frame_fill_groups(call_frame_t *frame) + + call_stack_set_groups(frame->root, ngroups, &mygroups); + } else { ++ FILE *fp = NULL; ++ + ret = snprintf(filename, sizeof filename, "/proc/%d/status", + frame->root->pid); +- if (ret >= sizeof filename) ++ if (ret >= sizeof filename) { ++ gf_log(this->name, GF_LOG_ERROR, "procfs path exceeds buffer size"); + goto out; ++ } + + fp = fopen(filename, "r"); +- if (!fp) ++ if (!fp) { ++ gf_log(this->name, GF_LOG_ERROR, "failed to open %s: %s", filename, ++ strerror(errno)); + goto out; ++ } + +- if (call_stack_alloc_groups(frame->root, ngroups) != 0) +- goto out; ++ for (;;) { ++ gf_boolean_t found_groups = _gf_false; ++ int idx = 0; + +- while ((ptr = fgets(line, sizeof line, fp))) { +- if (strncmp(ptr, "Groups:", 7) != 0) +- continue; ++ if (call_stack_alloc_groups(frame->root, ngroups) != 0) { ++ gf_log(this->name, GF_LOG_ERROR, ++ "failed to allocate gid buffer"); ++ goto out; ++ } + ++ while ((ptr = fgets(line, sizeof line, fp))) { ++ if (strncmp(ptr, "Groups:", 7) == 0) { ++ found_groups = _gf_true; ++ break; ++ } ++ } ++ if (!found_groups) { ++ gf_log(this->name, GF_LOG_ERROR, "cannot find gid list in %s", ++ filename); ++ break; ++ } + ptr = line + 8; + + for (ptr = strtok_r(ptr, " \t\r\n", &saveptr); ptr; + ptr = strtok_r(NULL, " \t\r\n", &saveptr)) { + errno = 0; + id = strtol(ptr, &endptr, 0); +- if (errno == ERANGE) +- break; +- if (!endptr || *endptr) ++ if (errno == ERANGE || !endptr || *endptr) { ++ gf_log(this->name, GF_LOG_ERROR, "failed to parse %s", ++ filename); + break; +- frame->root->groups[idx++] = id; +- if (idx == FUSE_MAX_AUX_GROUPS) ++ } ++ if (idx < call_stack_groups_capacity(frame->root)) ++ frame->root->groups[idx] = id; ++ idx++; ++ if (idx == GF_MAX_AUX_GROUPS) + break; + } +- +- frame->root->ngrps = idx; +- break; ++ if (idx > call_stack_groups_capacity(frame->root)) { ++ ngroups = idx; ++ rewind(fp); ++ } else { ++ frame->root->ngrps = idx; ++ break; ++ } + } ++ out: ++ if (fp) ++ fclose(fp); + } +- +-out: +- if (fp) +- fclose(fp); + #elif defined(GF_SOLARIS_HOST_OS) + char filename[32]; + char scratch[128]; +@@ -245,7 +268,7 @@ out: + fp = fopen(filename, "r"); + if (fp != NULL) { + if (fgets(scratch, sizeof scratch, fp) != NULL) { +- ngrps = MIN(prcred->pr_ngroups, FUSE_MAX_AUX_GROUPS); ++ ngrps = MIN(prcred->pr_ngroups, GF_MAX_AUX_GROUPS); + if (call_stack_alloc_groups(frame->root, ngrps) != 0) { + fclose(fp); + return; +-- +1.8.3.1 + diff --git a/SOURCES/0509-core-configure-optimum-inode-table-hash_size-for-shd.patch b/SOURCES/0509-core-configure-optimum-inode-table-hash_size-for-shd.patch new file mode 100644 index 0000000..fdfc9bb --- /dev/null +++ b/SOURCES/0509-core-configure-optimum-inode-table-hash_size-for-shd.patch @@ -0,0 +1,407 @@ +From a18f03cbf2b5652f8617cb4dd236bb4ca9838d96 Mon Sep 17 00:00:00 2001 +From: 
Mohit Agrawal +Date: Tue, 6 Oct 2020 16:54:15 +0530 +Subject: [PATCH 509/511] core: configure optimum inode table hash_size for shd + +In a brick_mux environment a shd process consumes high memory. +After printing the statedump I found that it allocates 1M per afr xlator +for all bricks. In case 4k volumes are configured it consumes almost +6G RSS size in total, of which 4G is consumed by inode_tables + +[cluster/replicate.test1-replicate-0 - usage-type gf_common_mt_list_head memusage] +size=1273488 +num_allocs=2 +max_size=1273488 +max_num_allocs=2 +total_allocs=2 + +The inode_new_table function allocates memory (1M) for the lists of inode and dentry hash buckets. +For shd the lru_limit size is 1, so we don't need to create a big hash table; to reduce the +RSS size of the shd process, pass an optimum bucket count at the time of creating the inode_table. + +> Change-Id: I039716d42321a232fdee1ee8fd50295e638715bb +> Fixes: #1538 +> Signed-off-by: Mohit Agrawal +> (Cherry pick from commit ca6bbc486e76fdb9a8e07119bb10d7fa45b2e93b) +> (Reviewed on upstream link https://github.com/gluster/glusterfs/issues/1538) + +Change-Id: I039716d42321a232fdee1ee8fd50295e638715bb +BUG: 1898777 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/221191 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + api/src/glfs-master.c | 2 +- + libglusterfs/src/glusterfs/inode.h | 17 +++++---- + libglusterfs/src/inode.c | 53 +++++++++++++++++--------- + xlators/cluster/afr/src/afr.c | 10 ++++- + xlators/cluster/dht/src/dht-rebalance.c | 3 +- + xlators/cluster/ec/src/ec.c | 2 +- + xlators/features/bit-rot/src/bitd/bit-rot.c | 2 +- + xlators/features/quota/src/quotad-helpers.c | 2 +- + xlators/features/trash/src/trash.c | 4 +- + xlators/mount/fuse/src/fuse-bridge.c | 6 +-- + xlators/nfs/server/src/nfs.c | 2 +- + xlators/protocol/server/src/server-handshake.c | 3 +- + 12 files changed, 66 insertions(+), 40 deletions(-) + +diff --git a/api/src/glfs-master.c b/api/src/glfs-master.c +index b4473b1..9e604d3 100644 +--- a/api/src/glfs-master.c ++++ b/api/src/glfs-master.c +@@ -45,7 +45,7 @@ graph_setup(struct glfs *fs, glusterfs_graph_t *graph) + } + + if (!new_subvol->itable) { +- itable = inode_table_new(131072, new_subvol); ++ itable = inode_table_new(131072, new_subvol, 0, 0); + if (!itable) { + errno = ENOMEM; + ret = -1; +diff --git a/libglusterfs/src/glusterfs/inode.h b/libglusterfs/src/glusterfs/inode.h +index c875653..62c093d 100644 +--- a/libglusterfs/src/glusterfs/inode.h ++++ b/libglusterfs/src/glusterfs/inode.h +@@ -35,11 +35,12 @@ typedef struct _dentry dentry_t; + + struct _inode_table { + pthread_mutex_t lock; +- size_t hashsize; /* bucket size of inode hash and dentry hash */ +- char *name; /* name of the inode table, just for gf_log() */ +- inode_t *root; /* root directory inode, with number 1 */ +- xlator_t *xl; /* xlator to be called to do purge */ +- uint32_t lru_limit; /* maximum LRU cache size */ ++ size_t dentry_hashsize; /* Number of buckets for dentry hash*/ ++ size_t inode_hashsize; /* Size of inode hash table */ ++ char *name; /* name of the inode table, just for gf_log() */ ++ inode_t *root; /* root directory inode, with number 1 */ ++ xlator_t *xl; /* xlator to be called to do purge */ ++ uint32_t lru_limit; /* maximum LRU cache size */ + struct list_head *inode_hash; /* buckets for inode hash table */ + struct list_head *name_hash; /* buckets for dentry hash table */ + struct list_head active; /* list of inodes currently active (in an fop) */ +@@ -116,12 +117,14 @@ struct _inode { + 
#define GFID_STR_PFX_LEN (sizeof(GFID_STR_PFX) - 1) + + inode_table_t * +-inode_table_new(uint32_t lru_limit, xlator_t *xl); ++inode_table_new(uint32_t lru_limit, xlator_t *xl, uint32_t dhash_size, ++ uint32_t inodehash_size); + + inode_table_t * + inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl, + int32_t (*invalidator_fn)(xlator_t *, inode_t *), +- xlator_t *invalidator_xl); ++ xlator_t *invalidator_xl, uint32_t dentry_hashsize, ++ uint32_t inode_hashsize); + + void + inode_table_destroy_all(glusterfs_ctx_t *ctx); +diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c +index 71b2d2a..98f8ea6 100644 +--- a/libglusterfs/src/inode.c ++++ b/libglusterfs/src/inode.c +@@ -763,7 +763,7 @@ inode_grep(inode_table_t *table, inode_t *parent, const char *name) + return NULL; + } + +- int hash = hash_dentry(parent, name, table->hashsize); ++ int hash = hash_dentry(parent, name, table->dentry_hashsize); + + pthread_mutex_lock(&table->lock); + { +@@ -839,7 +839,7 @@ inode_grep_for_gfid(inode_table_t *table, inode_t *parent, const char *name, + return ret; + } + +- int hash = hash_dentry(parent, name, table->hashsize); ++ int hash = hash_dentry(parent, name, table->dentry_hashsize); + + pthread_mutex_lock(&table->lock); + { +@@ -903,7 +903,7 @@ inode_find(inode_table_t *table, uuid_t gfid) + return NULL; + } + +- int hash = hash_gfid(gfid, 65536); ++ int hash = hash_gfid(gfid, table->inode_hashsize); + + pthread_mutex_lock(&table->lock); + { +@@ -964,7 +964,7 @@ __inode_link(inode_t *inode, inode_t *parent, const char *name, + return NULL; + } + +- int ihash = hash_gfid(iatt->ia_gfid, 65536); ++ int ihash = hash_gfid(iatt->ia_gfid, table->inode_hashsize); + + old_inode = __inode_find(table, iatt->ia_gfid, ihash); + +@@ -1043,7 +1043,7 @@ inode_link(inode_t *inode, inode_t *parent, const char *name, struct iatt *iatt) + table = inode->table; + + if (parent && name) { +- hash = hash_dentry(parent, name, table->hashsize); ++ hash = hash_dentry(parent, name, table->dentry_hashsize); + } + + if (name && strchr(name, '/')) { +@@ -1262,7 +1262,7 @@ inode_rename(inode_table_t *table, inode_t *srcdir, const char *srcname, + } + + if (dstdir && dstname) { +- hash = hash_dentry(dstdir, dstname, table->hashsize); ++ hash = hash_dentry(dstdir, dstname, table->dentry_hashsize); + } + + pthread_mutex_lock(&table->lock); +@@ -1626,7 +1626,8 @@ __inode_table_init_root(inode_table_t *table) + inode_table_t * + inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl, + int32_t (*invalidator_fn)(xlator_t *, inode_t *), +- xlator_t *invalidator_xl) ++ xlator_t *invalidator_xl, uint32_t dentry_hashsize, ++ uint32_t inode_hashsize) + { + inode_table_t *new = NULL; + uint32_t mem_pool_size = lru_limit; +@@ -1644,7 +1645,19 @@ inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl, + new->invalidator_fn = invalidator_fn; + new->invalidator_xl = invalidator_xl; + +- new->hashsize = 14057; /* TODO: Random Number?? */ ++ if (dentry_hashsize == 0) { ++ /* Prime number for uniform distribution */ ++ new->dentry_hashsize = 14057; ++ } else { ++ new->dentry_hashsize = dentry_hashsize; ++ } ++ ++ if (inode_hashsize == 0) { ++ /* The size of hash table always should be power of 2 */ ++ new->inode_hashsize = 65536; ++ } else { ++ new->inode_hashsize = inode_hashsize; ++ } + + /* In case FUSE is initing the inode table. 
*/ + if (!mem_pool_size || (mem_pool_size > DEFAULT_INODE_MEMPOOL_ENTRIES)) +@@ -1658,13 +1671,13 @@ inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl, + if (!new->dentry_pool) + goto out; + +- new->inode_hash = (void *)GF_CALLOC(65536, sizeof(struct list_head), +- gf_common_mt_list_head); ++ new->inode_hash = (void *)GF_CALLOC( ++ new->inode_hashsize, sizeof(struct list_head), gf_common_mt_list_head); + if (!new->inode_hash) + goto out; + +- new->name_hash = (void *)GF_CALLOC(new->hashsize, sizeof(struct list_head), +- gf_common_mt_list_head); ++ new->name_hash = (void *)GF_CALLOC( ++ new->dentry_hashsize, sizeof(struct list_head), gf_common_mt_list_head); + if (!new->name_hash) + goto out; + +@@ -1675,11 +1688,11 @@ inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl, + if (!new->fd_mem_pool) + goto out; + +- for (i = 0; i < 65536; i++) { ++ for (i = 0; i < new->inode_hashsize; i++) { + INIT_LIST_HEAD(&new->inode_hash[i]); + } + +- for (i = 0; i < new->hashsize; i++) { ++ for (i = 0; i < new->dentry_hashsize; i++) { + INIT_LIST_HEAD(&new->name_hash[i]); + } + +@@ -1717,10 +1730,12 @@ out: + } + + inode_table_t * +-inode_table_new(uint32_t lru_limit, xlator_t *xl) ++inode_table_new(uint32_t lru_limit, xlator_t *xl, uint32_t dentry_hashsize, ++ uint32_t inode_hashsize) + { + /* Only fuse for now requires the inode table with invalidator */ +- return inode_table_with_invalidator(lru_limit, xl, NULL, NULL); ++ return inode_table_with_invalidator(lru_limit, xl, NULL, NULL, ++ dentry_hashsize, inode_hashsize); + } + + int +@@ -2439,8 +2454,10 @@ inode_table_dump(inode_table_t *itable, char *prefix) + return; + } + +- gf_proc_dump_build_key(key, prefix, "hashsize"); +- gf_proc_dump_write(key, "%" GF_PRI_SIZET, itable->hashsize); ++ gf_proc_dump_build_key(key, prefix, "dentry_hashsize"); ++ gf_proc_dump_write(key, "%" GF_PRI_SIZET, itable->dentry_hashsize); ++ gf_proc_dump_build_key(key, prefix, "inode_hashsize"); ++ gf_proc_dump_write(key, "%" GF_PRI_SIZET, itable->inode_hashsize); + gf_proc_dump_build_key(key, prefix, "name"); + gf_proc_dump_write(key, "%s", itable->name); + +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index 8f9e71f..bfa464f 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -594,7 +594,15 @@ init(xlator_t *this) + goto out; + } + +- this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this); ++ if (priv->shd.iamshd) { ++ /* Number of hash bucket should be prime number so declare 131 ++ total dentry hash buckets ++ */ ++ this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this, 131, 128); ++ } else { ++ this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this, 0, 0); ++ } ++ + if (!this->itable) { + ret = -ENOMEM; + goto out; +diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c +index 16ac16c..072896d 100644 +--- a/xlators/cluster/dht/src/dht-rebalance.c ++++ b/xlators/cluster/dht/src/dht-rebalance.c +@@ -1168,7 +1168,6 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + break; + } + +- + offset += ret; + total += ret; + +@@ -2467,7 +2466,7 @@ dht_build_root_inode(xlator_t *this, inode_t **inode) + 0, + }; + +- itable = inode_table_new(0, this); ++ itable = inode_table_new(0, this, 0, 0); + if (!itable) + return; + +diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c +index 3f31c74..4118c3b 100644 +--- a/xlators/cluster/ec/src/ec.c ++++ b/xlators/cluster/ec/src/ec.c +@@ -734,7 +734,7 @@ init(xlator_t *this) 
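For context, a minimal standalone sketch of the sizing convention the patch introduces; names, values and the masking assumption here are illustrative, not the real libglusterfs symbols. Callers pass 0 to keep the old defaults, the dentry hash stays prime-sized for modulo spreading, and the inode hash stays a power of two, presumably so a bucket index can be computed with a mask:

#include <stdint.h>
#include <stdio.h>

#define DEFAULT_DENTRY_BUCKETS 14057 /* prime: index via modulo */
#define DEFAULT_INODE_BUCKETS 65536  /* power of two: index via mask */

static uint32_t
pick_buckets(uint32_t requested, uint32_t fallback)
{
    /* 0 means "use the default", mirroring the patch's convention */
    return requested ? requested : fallback;
}

static uint32_t
bucket_index(uint32_t hash, uint32_t nbuckets)
{
    /* masking is valid only when nbuckets is a power of two */
    if ((nbuckets & (nbuckets - 1)) == 0)
        return hash & (nbuckets - 1);
    return hash % nbuckets;
}

int
main(void)
{
    /* the shd case from the patch: tiny tables instead of the defaults */
    uint32_t dentry_buckets = pick_buckets(131, DEFAULT_DENTRY_BUCKETS);
    uint32_t inode_buckets = pick_buckets(128, DEFAULT_INODE_BUCKETS);

    printf("shd buckets: dentry=%u inode=%u\n", dentry_buckets, inode_buckets);
    printf("bucket of 0xdeadbeef: inode=%u dentry=%u\n",
           bucket_index(0xdeadbeefu, inode_buckets),
           bucket_index(0xdeadbeefu, dentry_buckets));
    return 0;
}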
+ GF_OPTION_INIT("stripe-cache", ec->stripe_cache, uint32, failed); + GF_OPTION_INIT("quorum-count", ec->quorum_count, uint32, failed); + +- this->itable = inode_table_new(EC_SHD_INODE_LRU_LIMIT, this); ++ this->itable = inode_table_new(EC_SHD_INODE_LRU_LIMIT, this, 0, 0); + if (!this->itable) + goto failed; + +diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c +index 424c0d5..4e0e798 100644 +--- a/xlators/features/bit-rot/src/bitd/bit-rot.c ++++ b/xlators/features/bit-rot/src/bitd/bit-rot.c +@@ -1658,7 +1658,7 @@ notify(xlator_t *this, int32_t event, void *data, ...) + child->child_up = 1; + child->xl = subvol; + if (!child->table) +- child->table = inode_table_new(4096, subvol); ++ child->table = inode_table_new(4096, subvol, 0, 0); + + _br_qchild_event(this, child, br_brick_connect); + pthread_cond_signal(&priv->cond); +diff --git a/xlators/features/quota/src/quotad-helpers.c b/xlators/features/quota/src/quotad-helpers.c +index d9f0351..46ac116 100644 +--- a/xlators/features/quota/src/quotad-helpers.c ++++ b/xlators/features/quota/src/quotad-helpers.c +@@ -32,7 +32,7 @@ get_quotad_aggregator_state(xlator_t *this, rpcsvc_request_t *req) + UNLOCK(&priv->lock); + + if (active_subvol->itable == NULL) +- active_subvol->itable = inode_table_new(4096, active_subvol); ++ active_subvol->itable = inode_table_new(4096, active_subvol, 0, 0); + + state->itable = active_subvol->itable; + +diff --git a/xlators/features/trash/src/trash.c b/xlators/features/trash/src/trash.c +index 93f020f..099c887 100644 +--- a/xlators/features/trash/src/trash.c ++++ b/xlators/features/trash/src/trash.c +@@ -2261,7 +2261,7 @@ reconfigure(xlator_t *this, dict_t *options) + + if (!active_earlier && active_now) { + if (!priv->trash_itable) { +- priv->trash_itable = inode_table_new(0, this); ++ priv->trash_itable = inode_table_new(0, this, 0, 0); + if (!priv->trash_itable) { + ret = -ENOMEM; + gf_log(this->name, GF_LOG_ERROR, +@@ -2533,7 +2533,7 @@ init(xlator_t *this) + } + + if (priv->state) { +- priv->trash_itable = inode_table_new(0, this); ++ priv->trash_itable = inode_table_new(0, this, 0, 0); + if (!priv->trash_itable) { + ret = -ENOMEM; + priv->state = _gf_false; +diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c +index 1bddac2..919eea3 100644 +--- a/xlators/mount/fuse/src/fuse-bridge.c ++++ b/xlators/mount/fuse/src/fuse-bridge.c +@@ -6298,10 +6298,10 @@ fuse_graph_setup(xlator_t *this, glusterfs_graph_t *graph) + } + + #if FUSE_KERNEL_MINOR_VERSION >= 11 +- itable = inode_table_with_invalidator(priv->lru_limit, graph->top, +- fuse_inode_invalidate_fn, this); ++ itable = inode_table_with_invalidator( ++ priv->lru_limit, graph->top, fuse_inode_invalidate_fn, this, 0, 0); + #else +- itable = inode_table_new(0, graph->top); ++ itable = inode_table_new(0, graph->top, 0, 0); + #endif + if (!itable) { + ret = -1; +diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c +index ebded41..402be30 100644 +--- a/xlators/nfs/server/src/nfs.c ++++ b/xlators/nfs/server/src/nfs.c +@@ -564,7 +564,7 @@ nfs_init_subvolume(struct nfs_state *nfs, xlator_t *xl) + return -1; + + lrusize = nfs->memfactor * GF_NFS_INODE_LRU_MULT; +- xl->itable = inode_table_new(lrusize, xl); ++ xl->itable = inode_table_new(lrusize, xl, 0, 0); + if (!xl->itable) { + gf_msg(GF_NFS, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY, + "Failed to allocate inode table"); +diff --git a/xlators/protocol/server/src/server-handshake.c 
b/xlators/protocol/server/src/server-handshake.c
+index 1d1177d..eeca73c 100644
+--- a/xlators/protocol/server/src/server-handshake.c
++++ b/xlators/protocol/server/src/server-handshake.c
+@@ -36,7 +36,6 @@ gf_compare_client_version(rpcsvc_request_t *req, int fop_prognum,
+ return ret;
+ }
+
+-
+ int
+ server_getspec(rpcsvc_request_t *req)
+ {
+@@ -629,7 +628,7 @@ server_setvolume(rpcsvc_request_t *req)
+
+ /* TODO: what is this ? */
+ client->bound_xl->itable = inode_table_new(conf->inode_lru_limit,
+- client->bound_xl);
++ client->bound_xl, 0, 0);
+ }
+ }
+ UNLOCK(&conf->itable_lock);
+--
+1.8.3.1
+
diff --git a/SOURCES/0510-glusterd-brick_mux-Optimize-friend-handshake-code-to.patch b/SOURCES/0510-glusterd-brick_mux-Optimize-friend-handshake-code-to.patch
new file mode 100644
index 0000000..e8a4906
--- /dev/null
+++ b/SOURCES/0510-glusterd-brick_mux-Optimize-friend-handshake-code-to.patch
@@ -0,0 +1,784 @@
+From 5294c82e0528059b10cbaab7805b20e76ffdd66b Mon Sep 17 00:00:00 2001
+From: mohit84
+Date: Mon, 30 Nov 2020 17:39:53 +0530
+Subject: [PATCH 510/511] glusterd[brick_mux]: Optimize friend handshake code
+ to avoid call_bail (#1614)
+
+During the glusterd handshake, glusterd receives a volume dictionary
+from the peer end and compares it against its own volume dictionary
+data. If the options differ, it sets a key to record that volume
+options have changed and calls the import synctask to delete/start the
+volume. In a brick_mux environment with a high number of volumes (5k),
+the dict API in the function glusterd_compare_friend_volume takes time
+because the function glusterd_handle_friend_req saves all peer volume
+data in a single dictionary. Due to the time taken by
+glusterd_handle_friend_req, RPC requests receive a call_bail from the
+peer end and gluster (CLI) is not able to show volume status.
+
+Solution: To optimize the code, the following changes were made:
+1) Populate a new, specific dictionary to save the peer end's
+ version-specific data, so that the function can quickly decide
+ whether the peer end has any volume updates.
+2) If a volume's version differs, set a bit in status_arr instead of
+ saving a key in a dictionary, which makes the operation faster.
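As a rough, self-contained sketch of change (1) above: only keys whose tail matches one of a fixed set of suffixes are copied into the small version dictionary. The suffix list below matches the one the patch hard-codes (".ckusm" kept as-is, since that is the historical key name); everything else is simplified:

#include <stdio.h>
#include <string.h>

static int
key_has_suffix(const char *key, char **suffixes, int nsuffixes)
{
    size_t keylen = strlen(key);

    for (int i = 0; i < nsuffixes; i++) {
        size_t slen = strlen(suffixes[i]);
        /* compare only the tail of the key against the suffix */
        if (keylen > slen && strcmp(key + keylen - slen, suffixes[i]) == 0)
            return 1;
    }
    return 0;
}

int
main(void)
{
    char *suffixes[] = {".quota-cksum", ".ckusm", ".version",
                        ".quota-version", ".name"};
    const char *keys[] = {"volume12.version", "volume12.brick1.path",
                          "volume12.ckusm"};

    for (int i = 0; i < 3; i++)
        printf("%-22s -> %s\n", keys[i],
               key_has_suffix(keys[i], suffixes, 5) ? "version dict"
                                                    : "full dict only");
    return 0;
}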
+ +Note: To validate the changes followed below procedure +1) Setup 5100 distributed volumes 3x1 +2) Enable brick_mux +3) Start all the volumes +4) Kill all gluster processes on 3rd node +5) Run a loop to update volume option on a 1st node + for i in {1..5100}; do gluster v set vol$i performance.open-behind off; done +6) Start the glusterd process on the 3rd node +7) Wait to finish handshake and check there should not be any call_bail message + in the logs + +> Change-Id: Ibad7c23988539cc369ecc39dea2ea6985470bee1 +> Fixes: #1613 +> Signed-off-by: Mohit Agrawal +> (Cherry pick from commit 12545d91eed27ff9abb0505a12c7d4e75b45a53e) +> (Reviewed on upstream link https://github.com/gluster/glusterfs/issues/1613) + +Change-Id: Ibad7c23988539cc369ecc39dea2ea6985470bee1 +BUG: 1898784 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/221193 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/ctx.c | 4 + + libglusterfs/src/dict.c | 166 ++++++++++++++++++++++++++- + libglusterfs/src/globals.c | 2 - + libglusterfs/src/glusterfs/dict.h | 5 + + libglusterfs/src/glusterfs/globals.h | 2 + + libglusterfs/src/libglusterfs.sym | 1 + + xlators/mgmt/glusterd/src/glusterd-handler.c | 39 ++++--- + xlators/mgmt/glusterd/src/glusterd-sm.c | 6 +- + xlators/mgmt/glusterd/src/glusterd-sm.h | 1 + + xlators/mgmt/glusterd/src/glusterd-utils.c | 148 ++++++++++++++---------- + xlators/mgmt/glusterd/src/glusterd-utils.h | 2 +- + xlators/mgmt/glusterd/src/glusterd.h | 8 +- + 12 files changed, 301 insertions(+), 83 deletions(-) + +diff --git a/libglusterfs/src/ctx.c b/libglusterfs/src/ctx.c +index 4a001c2..ae1a77a 100644 +--- a/libglusterfs/src/ctx.c ++++ b/libglusterfs/src/ctx.c +@@ -14,6 +14,7 @@ + #include "glusterfs/glusterfs.h" + #include "timer-wheel.h" + ++glusterfs_ctx_t *global_ctx = NULL; + glusterfs_ctx_t * + glusterfs_ctx_new() + { +@@ -51,6 +52,9 @@ glusterfs_ctx_new() + GF_ATOMIC_INIT(ctx->stats.max_dict_pairs, 0); + GF_ATOMIC_INIT(ctx->stats.total_pairs_used, 0); + GF_ATOMIC_INIT(ctx->stats.total_dicts_used, 0); ++ ++ if (!global_ctx) ++ global_ctx = ctx; + out: + return ctx; + } +diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c +index d8cdda4..e5f619c 100644 +--- a/libglusterfs/src/dict.c ++++ b/libglusterfs/src/dict.c +@@ -56,7 +56,13 @@ struct dict_cmp { + static data_t * + get_new_data() + { +- data_t *data = mem_get(THIS->ctx->dict_data_pool); ++ data_t *data = NULL; ++ ++ if (global_ctx) { ++ data = mem_get(global_ctx->dict_data_pool); ++ } else { ++ data = mem_get(THIS->ctx->dict_data_pool); ++ } + + if (!data) + return NULL; +@@ -3503,3 +3509,161 @@ unlock: + UNLOCK(&dict->lock); + return 0; + } ++ ++/* Popluate specific dictionary on the basis of passed key array at the ++ time of unserialize buffer ++*/ ++int32_t ++dict_unserialize_specific_keys(char *orig_buf, int32_t size, dict_t **fill, ++ char **suffix_key_arr, dict_t **specific_dict, ++ int totkeycount) ++{ ++ char *buf = orig_buf; ++ int ret = -1; ++ int32_t count = 0; ++ int i = 0; ++ int j = 0; ++ ++ data_t *value = NULL; ++ char *key = NULL; ++ int32_t keylen = 0; ++ int32_t vallen = 0; ++ int32_t hostord = 0; ++ xlator_t *this = NULL; ++ int32_t keylenarr[totkeycount]; ++ ++ this = THIS; ++ GF_ASSERT(this); ++ ++ if (!buf) { ++ gf_msg_callingfn("dict", GF_LOG_WARNING, EINVAL, LG_MSG_INVALID_ARG, ++ "buf is null!"); ++ goto out; ++ } ++ ++ if (size == 0) { ++ gf_msg_callingfn("dict", GF_LOG_ERROR, EINVAL, LG_MSG_INVALID_ARG, ++ "size is 0!"); 
++ goto out; ++ } ++ ++ if (!fill) { ++ gf_msg_callingfn("dict", GF_LOG_ERROR, EINVAL, LG_MSG_INVALID_ARG, ++ "fill is null!"); ++ goto out; ++ } ++ ++ if (!*fill) { ++ gf_msg_callingfn("dict", GF_LOG_ERROR, EINVAL, LG_MSG_INVALID_ARG, ++ "*fill is null!"); ++ goto out; ++ } ++ ++ if ((buf + DICT_HDR_LEN) > (orig_buf + size)) { ++ gf_msg_callingfn("dict", GF_LOG_ERROR, 0, LG_MSG_UNDERSIZED_BUF, ++ "undersized buffer " ++ "passed. available (%lu) < required (%lu)", ++ (long)(orig_buf + size), (long)(buf + DICT_HDR_LEN)); ++ goto out; ++ } ++ ++ memcpy(&hostord, buf, sizeof(hostord)); ++ count = ntoh32(hostord); ++ buf += DICT_HDR_LEN; ++ ++ if (count < 0) { ++ gf_smsg("dict", GF_LOG_ERROR, 0, LG_MSG_COUNT_LESS_THAN_ZERO, ++ "count=%d", count, NULL); ++ goto out; ++ } ++ ++ /* Compute specific key length and save in array */ ++ for (i = 0; i < totkeycount; i++) { ++ keylenarr[i] = strlen(suffix_key_arr[i]); ++ } ++ ++ for (i = 0; i < count; i++) { ++ if ((buf + DICT_DATA_HDR_KEY_LEN) > (orig_buf + size)) { ++ gf_msg_callingfn("dict", GF_LOG_ERROR, 0, LG_MSG_UNDERSIZED_BUF, ++ "undersized " ++ "buffer passed. available (%lu) < " ++ "required (%lu)", ++ (long)(orig_buf + size), ++ (long)(buf + DICT_DATA_HDR_KEY_LEN)); ++ goto out; ++ } ++ memcpy(&hostord, buf, sizeof(hostord)); ++ keylen = ntoh32(hostord); ++ buf += DICT_DATA_HDR_KEY_LEN; ++ ++ if ((buf + DICT_DATA_HDR_VAL_LEN) > (orig_buf + size)) { ++ gf_msg_callingfn("dict", GF_LOG_ERROR, 0, LG_MSG_UNDERSIZED_BUF, ++ "undersized " ++ "buffer passed. available (%lu) < " ++ "required (%lu)", ++ (long)(orig_buf + size), ++ (long)(buf + DICT_DATA_HDR_VAL_LEN)); ++ goto out; ++ } ++ memcpy(&hostord, buf, sizeof(hostord)); ++ vallen = ntoh32(hostord); ++ buf += DICT_DATA_HDR_VAL_LEN; ++ ++ if ((keylen < 0) || (vallen < 0)) { ++ gf_msg_callingfn("dict", GF_LOG_ERROR, 0, LG_MSG_UNDERSIZED_BUF, ++ "undersized length passed " ++ "key:%d val:%d", ++ keylen, vallen); ++ goto out; ++ } ++ if ((buf + keylen) > (orig_buf + size)) { ++ gf_msg_callingfn("dict", GF_LOG_ERROR, 0, LG_MSG_UNDERSIZED_BUF, ++ "undersized buffer passed. " ++ "available (%lu) < required (%lu)", ++ (long)(orig_buf + size), (long)(buf + keylen)); ++ goto out; ++ } ++ key = buf; ++ buf += keylen + 1; /* for '\0' */ ++ ++ if ((buf + vallen) > (orig_buf + size)) { ++ gf_msg_callingfn("dict", GF_LOG_ERROR, 0, LG_MSG_UNDERSIZED_BUF, ++ "undersized buffer passed. " ++ "available (%lu) < required (%lu)", ++ (long)(orig_buf + size), (long)(buf + vallen)); ++ goto out; ++ } ++ value = get_new_data(); ++ ++ if (!value) { ++ ret = -1; ++ goto out; ++ } ++ value->len = vallen; ++ value->data = gf_memdup(buf, vallen); ++ value->data_type = GF_DATA_TYPE_STR_OLD; ++ value->is_static = _gf_false; ++ buf += vallen; ++ ++ ret = dict_addn(*fill, key, keylen, value); ++ if (ret < 0) { ++ data_destroy(value); ++ goto out; ++ } ++ for (j = 0; j < totkeycount; j++) { ++ if (keylen > keylenarr[j]) { ++ if (!strcmp(key + keylen - keylenarr[j], suffix_key_arr[j])) { ++ ret = dict_addn(*specific_dict, key, keylen, value); ++ break; ++ } ++ } ++ } ++ ++ if (ret < 0) ++ goto out; ++ } ++ ++ ret = 0; ++out: ++ return ret; ++} +diff --git a/libglusterfs/src/globals.c b/libglusterfs/src/globals.c +index e433ee8..30c15b6 100644 +--- a/libglusterfs/src/globals.c ++++ b/libglusterfs/src/globals.c +@@ -96,7 +96,6 @@ const char *gf_upcall_list[GF_UPCALL_FLAGS_MAXVALUE] = { + /* This global ctx is a bad hack to prevent some of the libgfapi crashes. 
+ * This should be removed once the patch on resource pool is accepted + */ +-glusterfs_ctx_t *global_ctx = NULL; + pthread_mutex_t global_ctx_mutex = PTHREAD_MUTEX_INITIALIZER; + xlator_t global_xlator; + static int gf_global_mem_acct_enable = 1; +@@ -236,7 +235,6 @@ __glusterfs_this_location() + if (*this_location == NULL) { + thread_xlator = &global_xlator; + } +- + return this_location; + } + +diff --git a/libglusterfs/src/glusterfs/dict.h b/libglusterfs/src/glusterfs/dict.h +index 8239c7a..6e469c7 100644 +--- a/libglusterfs/src/glusterfs/dict.h ++++ b/libglusterfs/src/glusterfs/dict.h +@@ -423,4 +423,9 @@ dict_has_key_from_array(dict_t *dict, char **strings, gf_boolean_t *result); + + int + dict_serialized_length_lk(dict_t *this); ++ ++int32_t ++dict_unserialize_specific_keys(char *orig_buf, int32_t size, dict_t **fill, ++ char **specific_key_arr, dict_t **specific_dict, ++ int totkeycount); + #endif +diff --git a/libglusterfs/src/glusterfs/globals.h b/libglusterfs/src/glusterfs/globals.h +index cc145cd..33fb023 100644 +--- a/libglusterfs/src/glusterfs/globals.h ++++ b/libglusterfs/src/glusterfs/globals.h +@@ -199,4 +199,6 @@ int + gf_global_mem_acct_enable_get(void); + int + gf_global_mem_acct_enable_set(int val); ++ ++extern glusterfs_ctx_t *global_ctx; + #endif /* !_GLOBALS_H */ +diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym +index d060292..bc770e2 100644 +--- a/libglusterfs/src/libglusterfs.sym ++++ b/libglusterfs/src/libglusterfs.sym +@@ -436,6 +436,7 @@ dict_clear_flag + dict_check_flag + dict_unref + dict_unserialize ++dict_unserialize_specific_keys + drop_token + eh_destroy + eh_dump +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index b8799ab..908361c 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -86,6 +86,9 @@ glusterd_big_locked_handler(rpcsvc_request_t *req, rpcsvc_actor actor_fn) + return ret; + } + ++static char *specific_key_suffix[] = {".quota-cksum", ".ckusm", ".version", ++ ".quota-version", ".name"}; ++ + static int + glusterd_handle_friend_req(rpcsvc_request_t *req, uuid_t uuid, char *hostname, + int port, gd1_mgmt_friend_req *friend_req) +@@ -97,6 +100,8 @@ glusterd_handle_friend_req(rpcsvc_request_t *req, uuid_t uuid, char *hostname, + char rhost[UNIX_PATH_MAX + 1] = {0}; + uuid_t friend_uuid = {0}; + dict_t *dict = NULL; ++ dict_t *peer_ver = NULL; ++ int totcount = sizeof(specific_key_suffix) / sizeof(specific_key_suffix[0]); + + gf_uuid_parse(uuid_utoa(uuid), friend_uuid); + if (!port) +@@ -104,8 +109,19 @@ glusterd_handle_friend_req(rpcsvc_request_t *req, uuid_t uuid, char *hostname, + + ret = glusterd_remote_hostname_get(req, rhost, sizeof(rhost)); + ++ ctx = GF_CALLOC(1, sizeof(*ctx), gf_gld_mt_friend_req_ctx_t); ++ dict = dict_new(); ++ peer_ver = dict_new(); ++ + RCU_READ_LOCK; + ++ if (!ctx || !dict || !peer_ver) { ++ gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY, ++ "Unable to allocate memory"); ++ ret = -1; ++ goto out; ++ } ++ + peerinfo = glusterd_peerinfo_find(uuid, rhost); + + if (peerinfo == NULL) { +@@ -130,28 +146,14 @@ glusterd_handle_friend_req(rpcsvc_request_t *req, uuid_t uuid, char *hostname, + event->peername = gf_strdup(peerinfo->hostname); + gf_uuid_copy(event->peerid, peerinfo->uuid); + +- ctx = GF_CALLOC(1, sizeof(*ctx), gf_gld_mt_friend_req_ctx_t); +- +- if (!ctx) { +- gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY, +- "Unable to allocate memory"); 
+- ret = -1; +- goto out; +- } +- + gf_uuid_copy(ctx->uuid, uuid); + if (hostname) + ctx->hostname = gf_strdup(hostname); + ctx->req = req; + +- dict = dict_new(); +- if (!dict) { +- ret = -1; +- goto out; +- } +- +- ret = dict_unserialize(friend_req->vols.vols_val, friend_req->vols.vols_len, +- &dict); ++ ret = dict_unserialize_specific_keys( ++ friend_req->vols.vols_val, friend_req->vols.vols_len, &dict, ++ specific_key_suffix, &peer_ver, totcount); + + if (ret) + goto out; +@@ -159,6 +161,7 @@ glusterd_handle_friend_req(rpcsvc_request_t *req, uuid_t uuid, char *hostname, + dict->extra_stdfree = friend_req->vols.vols_val; + + ctx->vols = dict; ++ ctx->peer_ver = peer_ver; + event->ctx = ctx; + + ret = glusterd_friend_sm_inject_event(event); +@@ -188,6 +191,8 @@ out: + } else { + free(friend_req->vols.vols_val); + } ++ if (peer_ver) ++ dict_unref(peer_ver); + if (event) + GF_FREE(event->peername); + GF_FREE(event); +diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.c b/xlators/mgmt/glusterd/src/glusterd-sm.c +index 044da3d..d10a792 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-sm.c ++++ b/xlators/mgmt/glusterd/src/glusterd-sm.c +@@ -106,6 +106,8 @@ glusterd_destroy_friend_req_ctx(glusterd_friend_req_ctx_t *ctx) + + if (ctx->vols) + dict_unref(ctx->vols); ++ if (ctx->peer_ver) ++ dict_unref(ctx->peer_ver); + GF_FREE(ctx->hostname); + GF_FREE(ctx); + } +@@ -936,8 +938,8 @@ glusterd_ac_handle_friend_add_req(glusterd_friend_sm_event_t *event, void *ctx) + // Build comparison logic here. + pthread_mutex_lock(&conf->import_volumes); + { +- ret = glusterd_compare_friend_data(ev_ctx->vols, &status, +- event->peername); ++ ret = glusterd_compare_friend_data(ev_ctx->vols, ev_ctx->peer_ver, ++ &status, event->peername); + if (ret) { + pthread_mutex_unlock(&conf->import_volumes); + goto out; +diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.h b/xlators/mgmt/glusterd/src/glusterd-sm.h +index ce008ac..efdf68e 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-sm.h ++++ b/xlators/mgmt/glusterd/src/glusterd-sm.h +@@ -174,6 +174,7 @@ typedef struct glusterd_friend_req_ctx_ { + rpcsvc_request_t *req; + int port; + dict_t *vols; ++ dict_t *peer_ver; // Dictionary to save peer ver data + } glusterd_friend_req_ctx_t; + + typedef struct glusterd_friend_update_ctx_ { +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index f7030fb..cf32bd9 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -3709,12 +3709,14 @@ out: + return ret; + } + +-int32_t +-glusterd_compare_friend_volume(dict_t *peer_data, int32_t count, +- int32_t *status, char *hostname) ++static int32_t ++glusterd_compare_friend_volume(dict_t *peer_data, ++ glusterd_friend_synctask_args_t *arg, ++ int32_t count, int32_t *status, char *hostname) + { + int32_t ret = -1; + char key[64] = ""; ++ char key_prefix[32]; + int keylen; + glusterd_volinfo_t *volinfo = NULL; + char *volname = NULL; +@@ -3726,15 +3728,20 @@ glusterd_compare_friend_volume(dict_t *peer_data, int32_t count, + xlator_t *this = NULL; + + GF_ASSERT(peer_data); ++ GF_ASSERT(arg); + GF_ASSERT(status); + + this = THIS; + GF_ASSERT(this); + +- keylen = snprintf(key, sizeof(key), "volume%d.name", count); +- ret = dict_get_strn(peer_data, key, keylen, &volname); +- if (ret) ++ snprintf(key_prefix, sizeof(key_prefix), "volume%d", count); ++ keylen = snprintf(key, sizeof(key), "%s.name", key_prefix); ++ ret = dict_get_strn(arg->peer_ver_data, key, keylen, &volname); ++ if (ret) { 
++ gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, ++ "Key=%s is NULL in peer_ver_data", key, NULL); + goto out; ++ } + + ret = glusterd_volinfo_find(volname, &volinfo); + if (ret) { +@@ -3750,10 +3757,13 @@ glusterd_compare_friend_volume(dict_t *peer_data, int32_t count, + goto out; + } + +- keylen = snprintf(key, sizeof(key), "volume%d.version", count); +- ret = dict_get_int32n(peer_data, key, keylen, &version); +- if (ret) ++ keylen = snprintf(key, sizeof(key), "%s.version", key_prefix); ++ ret = dict_get_int32n(arg->peer_ver_data, key, keylen, &version); ++ if (ret) { ++ gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, ++ "Key=%s is NULL in peer_ver_data", key, NULL); + goto out; ++ } + + if (version > volinfo->version) { + // Mismatch detected +@@ -3772,10 +3782,13 @@ glusterd_compare_friend_volume(dict_t *peer_data, int32_t count, + + // Now, versions are same, compare cksums. + // +- snprintf(key, sizeof(key), "volume%d.ckusm", count); +- ret = dict_get_uint32(peer_data, key, &cksum); +- if (ret) ++ snprintf(key, sizeof(key), "%s.ckusm", key_prefix); ++ ret = dict_get_uint32(arg->peer_ver_data, key, &cksum); ++ if (ret) { ++ gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, ++ "Key=%s is NULL in peer_ver_data", key, NULL); + goto out; ++ } + + if (cksum != volinfo->cksum) { + ret = 0; +@@ -3790,8 +3803,8 @@ glusterd_compare_friend_volume(dict_t *peer_data, int32_t count, + if (!dict_get_sizen(volinfo->dict, VKEY_FEATURES_QUOTA)) + goto skip_quota; + +- snprintf(key, sizeof(key), "volume%d.quota-version", count); +- ret = dict_get_uint32(peer_data, key, "a_version); ++ snprintf(key, sizeof(key), "%s.quota-version", key_prefix); ++ ret = dict_get_uint32(arg->peer_ver_data, key, "a_version); + if (ret) { + gf_msg_debug(this->name, 0, + "quota-version key absent for" +@@ -3809,6 +3822,7 @@ glusterd_compare_friend_volume(dict_t *peer_data, int32_t count, + "%d on peer %s", + volinfo->volname, volinfo->quota_conf_version, quota_version, + hostname); ++ GF_ATOMIC_INIT(volinfo->volpeerupdate, 1); + *status = GLUSTERD_VOL_COMP_UPDATE_REQ; + goto out; + } else if (quota_version < volinfo->quota_conf_version) { +@@ -3819,8 +3833,8 @@ glusterd_compare_friend_volume(dict_t *peer_data, int32_t count, + + // Now, versions are same, compare cksums. 
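To make the ordering of the checks above easier to follow, a standalone sketch of the per-volume decision; the status names are shortened, and the lower-version branch is an assumption based on the surrounding code:

#include <stdint.h>
#include <stdio.h>

enum vol_comp {
    COMP_IN_SYNC,     /* nothing to do */
    COMP_UPDATE_REQ,  /* peer copy is newer: import it locally */
    COMP_UPDATE_PEER, /* local copy is newer: peer must update */
    COMP_REJECT       /* same version, different contents */
};

static enum vol_comp
compare_volume(uint32_t my_ver, uint32_t my_cksum, uint32_t peer_ver,
               uint32_t peer_cksum)
{
    if (peer_ver > my_ver)
        return COMP_UPDATE_REQ;
    if (peer_ver < my_ver)
        return COMP_UPDATE_PEER;
    if (peer_cksum != my_cksum)
        return COMP_REJECT; /* versions agree but checksums do not */
    return COMP_IN_SYNC;
}

int
main(void)
{
    printf("%d\n", compare_volume(5, 0xabc, 6, 0xdef)); /* 1: update req */
    printf("%d\n", compare_volume(5, 0xabc, 5, 0xdef)); /* 3: reject */
    return 0;
}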
+ // +- snprintf(key, sizeof(key), "volume%d.quota-cksum", count); +- ret = dict_get_uint32(peer_data, key, "a_cksum); ++ snprintf(key, sizeof(key), "%s.quota-cksum", key_prefix); ++ ret = dict_get_uint32(arg->peer_ver_data, key, "a_cksum); + if (ret) { + gf_msg_debug(this->name, 0, + "quota checksum absent for " +@@ -3846,13 +3860,12 @@ skip_quota: + *status = GLUSTERD_VOL_COMP_SCS; + + out: +- keylen = snprintf(key, sizeof(key), "volume%d.update", count); +- + if (*status == GLUSTERD_VOL_COMP_UPDATE_REQ) { +- ret = dict_set_int32n(peer_data, key, keylen, 1); +- } else { +- ret = dict_set_int32n(peer_data, key, keylen, 0); ++ /*Set the status to ensure volume is updated on the peer ++ */ ++ arg->status_arr[(count / 64)] ^= 1UL << (count % 64); + } ++ + if (*status == GLUSTERD_VOL_COMP_RJT) { + gf_event(EVENT_COMPARE_FRIEND_VOLUME_FAILED, "volume=%s", + volinfo->volname); +@@ -4935,8 +4948,9 @@ out: + return ret; + } + +-int32_t +-glusterd_import_friend_volume(dict_t *peer_data, int count) ++static int32_t ++glusterd_import_friend_volume(dict_t *peer_data, int count, ++ glusterd_friend_synctask_args_t *arg) + { + int32_t ret = -1; + glusterd_conf_t *priv = NULL; +@@ -4954,10 +4968,27 @@ glusterd_import_friend_volume(dict_t *peer_data, int count) + priv = this->private; + GF_ASSERT(priv); + +- ret = snprintf(key, sizeof(key), "volume%d.update", count); +- ret = dict_get_int32n(peer_data, key, ret, &update); +- if (ret || !update) { ++ if (arg) { ++ /*Check if the volume options are updated on the other peers ++ */ ++ update = (1UL & (arg->status_arr[(count / 64)] >> (count % 64))); ++ } else { ++ ret = snprintf(key, sizeof(key), "volume%d.update", count); ++ ret = dict_get_int32n(peer_data, key, ret, &update); ++ if (ret) { ++ gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, ++ "Key=%s", key, NULL); ++ goto out; ++ } ++ } ++ ++ if (!update) { + /* if update is 0 that means the volume is not imported */ ++ gf_log(this->name, GF_LOG_DEBUG, ++ "The volume%d does" ++ " not have any peer change", ++ count); ++ ret = 0; + goto out; + } + +@@ -5045,6 +5076,8 @@ glusterd_import_friend_volumes_synctask(void *opaque) + glusterd_conf_t *conf = NULL; + dict_t *peer_data = NULL; + glusterd_friend_synctask_args_t *arg = NULL; ++ uint64_t bm = 0; ++ uint64_t mask = 0; + + this = THIS; + GF_ASSERT(this); +@@ -5056,17 +5089,7 @@ glusterd_import_friend_volumes_synctask(void *opaque) + if (!arg) + goto out; + +- peer_data = dict_new(); +- if (!peer_data) { +- goto out; +- } +- +- ret = dict_unserialize(arg->dict_buf, arg->dictlen, &peer_data); +- if (ret) { +- errno = ENOMEM; +- goto out; +- } +- ++ peer_data = arg->peer_data; + ret = dict_get_int32n(peer_data, "count", SLEN("count"), &count); + if (ret) + goto out; +@@ -5083,11 +5106,18 @@ glusterd_import_friend_volumes_synctask(void *opaque) + conf->restart_bricks = _gf_true; + + while (i <= count) { +- ret = glusterd_import_friend_volume(peer_data, i); +- if (ret) { +- break; ++ bm = arg->status_arr[i / 64]; ++ while (bm != 0) { ++ /* mask will contain the lowest bit set from bm. 
*/ ++ mask = bm & (-bm); ++ bm ^= mask; ++ ret = glusterd_import_friend_volume(peer_data, i + ffsll(mask) - 2, ++ arg); ++ if (ret < 0) { ++ break; ++ } + } +- i++; ++ i += 64; + } + if (i > count) { + glusterd_svcs_manager(NULL); +@@ -5095,11 +5125,9 @@ glusterd_import_friend_volumes_synctask(void *opaque) + conf->restart_bricks = _gf_false; + synccond_broadcast(&conf->cond_restart_bricks); + out: +- if (peer_data) +- dict_unref(peer_data); + if (arg) { +- if (arg->dict_buf) +- GF_FREE(arg->dict_buf); ++ dict_unref(arg->peer_data); ++ dict_unref(arg->peer_ver_data); + GF_FREE(arg); + } + +@@ -5121,7 +5149,7 @@ glusterd_import_friend_volumes(dict_t *peer_data) + goto out; + + while (i <= count) { +- ret = glusterd_import_friend_volume(peer_data, i); ++ ret = glusterd_import_friend_volume(peer_data, i, NULL); + if (ret) + goto out; + i++; +@@ -5260,7 +5288,8 @@ out: + } + + int32_t +-glusterd_compare_friend_data(dict_t *peer_data, int32_t *status, char *hostname) ++glusterd_compare_friend_data(dict_t *peer_data, dict_t *cmp, int32_t *status, ++ char *hostname) + { + int32_t ret = -1; + int32_t count = 0; +@@ -5289,8 +5318,19 @@ glusterd_compare_friend_data(dict_t *peer_data, int32_t *status, char *hostname) + if (ret) + goto out; + ++ arg = GF_CALLOC(1, sizeof(*arg) + sizeof(uint64_t) * (count / 64), ++ gf_common_mt_char); ++ if (!arg) { ++ ret = -1; ++ gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY, ++ "Out Of Memory"); ++ goto out; ++ } ++ arg->peer_data = dict_ref(peer_data); ++ arg->peer_ver_data = dict_ref(cmp); + while (i <= count) { +- ret = glusterd_compare_friend_volume(peer_data, i, status, hostname); ++ ret = glusterd_compare_friend_volume(peer_data, arg, i, status, ++ hostname); + if (ret) + goto out; + +@@ -5310,21 +5350,13 @@ glusterd_compare_friend_data(dict_t *peer_data, int32_t *status, char *hostname) + * first brick to come up before attaching the subsequent bricks + * in case brick multiplexing is enabled + */ +- arg = GF_CALLOC(1, sizeof(*arg), gf_common_mt_char); +- ret = dict_allocate_and_serialize(peer_data, &arg->dict_buf, +- &arg->dictlen); +- if (ret < 0) { +- gf_log(this->name, GF_LOG_ERROR, +- "dict_serialize failed while handling " +- " import friend volume request"); +- goto out; +- } +- + glusterd_launch_synctask(glusterd_import_friend_volumes_synctask, arg); + } + + out: + if (ret && arg) { ++ dict_unref(arg->peer_data); ++ dict_unref(arg->peer_ver_data); + GF_FREE(arg); + } + gf_msg_debug(this->name, 0, "Returning with ret: %d, status: %d", ret, +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h +index 5f5de82..02d85d2 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.h ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.h +@@ -231,7 +231,7 @@ glusterd_add_volumes_to_export_dict(dict_t *peer_data, char **buf, + u_int *length); + + int32_t +-glusterd_compare_friend_data(dict_t *peer_data, int32_t *status, ++glusterd_compare_friend_data(dict_t *peer_data, dict_t *cmp, int32_t *status, + char *hostname); + + int +diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h +index f739b5d..efe4d0e 100644 +--- a/xlators/mgmt/glusterd/src/glusterd.h ++++ b/xlators/mgmt/glusterd/src/glusterd.h +@@ -234,8 +234,12 @@ typedef struct glusterd_add_dict_args { + } glusterd_add_dict_args_t; + + typedef struct glusterd_friend_synctask_args { +- char *dict_buf; +- u_int dictlen; ++ dict_t *peer_data; ++ dict_t *peer_ver_data; // Dictionary to save peer version data ++ /* This 
status_arr[1] is not a real size, real size of the array ++ is dynamically allocated ++ */ ++ uint64_t status_arr[1]; + } glusterd_friend_synctask_args_t; + + typedef enum gf_brick_status { +-- +1.8.3.1 + diff --git a/SOURCES/0511-features-shard-Missing-format-specifier.patch b/SOURCES/0511-features-shard-Missing-format-specifier.patch new file mode 100644 index 0000000..baf6cf4 --- /dev/null +++ b/SOURCES/0511-features-shard-Missing-format-specifier.patch @@ -0,0 +1,39 @@ +From 868d346cc35c222d19b95bd9c367674c9ea859df Mon Sep 17 00:00:00 2001 +From: Vinayakswami Hariharmath +Date: Tue, 15 Dec 2020 16:23:49 +0530 +Subject: [PATCH 511/511] features/shard: Missing format specifier + +PRIu64 format specifier explicitly needs (percent sign) as +prefix and that was missing as part of the below commit on +downstream + +https://code.engineering.redhat.com/gerrit/#/c/221061/ + +BUG: 1752739 +Change-Id: I354de58796f350eb1aa42fcdf8092ca2e69ccbb6 + +Signed-off-by: Vinayakswami Hariharmath +Change-Id: I4598893e3fcca3a2b3e6e8ef9b64b3e5e98923e6 +Reviewed-on: https://code.engineering.redhat.com/gerrit/221217 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +--- + xlators/features/shard/src/shard.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index a967f35..099b062 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -1855,7 +1855,7 @@ int shard_truncate_last_shard(call_frame_t *frame, xlator_t *this, + */ + if (!inode) { + gf_msg_debug(this->name, 0, +- "Last shard to be truncated absent in backend: " PRIu64 ++ "Last shard to be truncated absent in backend:%" PRIu64 + " of gfid: %s. Directly proceeding to update file size", + local->first_block, uuid_utoa(local->loc.inode->gfid)); + shard_update_file_size(frame, this, NULL, &local->loc, +-- +1.8.3.1 + diff --git a/SOURCES/0512-glusterd-shared-storage-mount-fails-in-ipv6-environm.patch b/SOURCES/0512-glusterd-shared-storage-mount-fails-in-ipv6-environm.patch new file mode 100644 index 0000000..37de503 --- /dev/null +++ b/SOURCES/0512-glusterd-shared-storage-mount-fails-in-ipv6-environm.patch @@ -0,0 +1,105 @@ +From c963653a89c3f6466af9a3e8f19246a7907f7f8c Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Thu, 30 Jul 2020 13:04:52 +0530 +Subject: [PATCH 512/517] glusterd: shared storage mount fails in ipv6 + environment + +Issue: +In case of ipv6 environment, the mounting of glusterd_shared_storage +volume fails as it doesn't recognises the ipv6 enviornment. + +Fix: +In case of ipv6 environment, the address-family is passed +to the hooks script on creating shared-storage, then depending +upon the address-family --xlator-option=transport.address-family=inet6 +option is added to the mount command, and the mounting succeeds. 
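A self-contained sketch of how patch 0510's status_arr bitmap (the flexible-array-style member declared above) can be populated and drained with the lowest-set-bit trick; the patch's own index arithmetic differs slightly because its volume counts are 1-based:

#define _GNU_SOURCE /* ffsll */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

int
main(void)
{
    uint64_t status_arr[2];
    int changed[] = {3, 64, 70}; /* volumes flagged as updated on the peer */

    memset(status_arr, 0, sizeof(status_arr));
    for (int i = 0; i < 3; i++)
        status_arr[changed[i] / 64] |= 1ULL << (changed[i] % 64);

    for (int word = 0; word < 2; word++) {
        uint64_t bm = status_arr[word];
        while (bm != 0) {
            uint64_t mask = bm & (-bm); /* isolate the lowest set bit */
            bm ^= mask;                 /* clear it from the bitmap */
            /* ffsll() is 1-based, hence the -1 for the bit index */
            int vol = word * 64 + ffsll((long long)mask) - 1;
            printf("import volume %d\n", vol);
        }
    }
    return 0;
}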
+ +>Fixes: #1406 +> +>Change-Id: Ib1888c34d85e6c01618b0ba214cbe1f57576908d +>Signed-off-by: nik-redhat + +Upstream patch: https://review.gluster.org/c/glusterfs/+/24797 +BUG: 1856574 + +Change-Id: Ib1888c34d85e6c01618b0ba214cbe1f57576908d +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/221844 +Tested-by: RHGS Build Bot +Reviewed-by: Srijan Sivakumar +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + .../set/post/S32gluster_enable_shared_storage.sh | 11 +++++++++-- + xlators/mgmt/glusterd/src/glusterd-hooks.c | 19 +++++++++++++++++++ + 2 files changed, 28 insertions(+), 2 deletions(-) + +diff --git a/extras/hook-scripts/set/post/S32gluster_enable_shared_storage.sh b/extras/hook-scripts/set/post/S32gluster_enable_shared_storage.sh +index 3bae37c..9597503 100755 +--- a/extras/hook-scripts/set/post/S32gluster_enable_shared_storage.sh ++++ b/extras/hook-scripts/set/post/S32gluster_enable_shared_storage.sh +@@ -104,8 +104,15 @@ function check_volume_status() + echo $status + } + +-mount_cmd="mount -t glusterfs $local_node_hostname:/gluster_shared_storage \ +- /run/gluster/shared_storage" ++key=`echo $5 | cut -d '=' -f 1` ++val=`echo $5 | cut -d '=' -f 2` ++if [ "$key" == "transport.address-family" ]; then ++ mount_cmd="mount -t glusterfs -o xlator-option=transport.address-family=inet6 \ ++ $local_node_hostname:/gluster_shared_storage /var/run/gluster/shared_storage" ++else ++ mount_cmd="mount -t glusterfs $local_node_hostname:/gluster_shared_storage \ ++ /var/run/gluster/shared_storage" ++fi + + if [ "$option" == "enable" ]; then + retry=0; +diff --git a/xlators/mgmt/glusterd/src/glusterd-hooks.c b/xlators/mgmt/glusterd/src/glusterd-hooks.c +index 216cdf7..4f0d775 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-hooks.c ++++ b/xlators/mgmt/glusterd/src/glusterd-hooks.c +@@ -200,11 +200,16 @@ glusterd_hooks_set_volume_args(dict_t *dict, runner_t *runner) + int i = 0; + int count = 0; + int ret = -1; ++ int flag = 0; + char query[1024] = { + 0, + }; + char *key = NULL; + char *value = NULL; ++ char *inet_family = NULL; ++ xlator_t *this = NULL; ++ this = THIS; ++ GF_ASSERT(this); + + ret = dict_get_int32(dict, "count", &count); + if (ret) +@@ -228,9 +233,23 @@ glusterd_hooks_set_volume_args(dict_t *dict, runner_t *runner) + continue; + + runner_argprintf(runner, "%s=%s", key, value); ++ if ((strncmp(key, "cluster.enable-shared-storage", ++ SLEN("cluster.enable-shared-storage")) == 0 || ++ strncmp(key, "enable-shared-storage", ++ SLEN("enable-shared-storage")) == 0) && ++ strncmp(value, "enable", SLEN("enable")) == 0) ++ flag = 1; + } + + glusterd_hooks_add_custom_args(dict, runner); ++ if (flag == 1) { ++ ret = dict_get_str_sizen(this->options, "transport.address-family", ++ &inet_family); ++ if (!ret) { ++ runner_argprintf(runner, "transport.address-family=%s", ++ inet_family); ++ } ++ } + + ret = 0; + out: +-- +1.8.3.1 + diff --git a/SOURCES/0513-afr-mark-pending-xattrs-as-a-part-of-metadata-heal.patch b/SOURCES/0513-afr-mark-pending-xattrs-as-a-part-of-metadata-heal.patch new file mode 100644 index 0000000..ebd5609 --- /dev/null +++ b/SOURCES/0513-afr-mark-pending-xattrs-as-a-part-of-metadata-heal.patch @@ -0,0 +1,191 @@ +From 708c17a8a69b2657f384affaedfcf4ba0a123893 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Wed, 23 Dec 2020 14:45:07 +0530 +Subject: [PATCH 513/517] afr: mark pending xattrs as a part of metadata heal + +...if pending xattrs are zero for all children. 
+ +Problem: +If there are no pending xattrs and a metadata heal needs to be +performed, it can be possible that we end up with xattrs inadvertendly +deleted from all bricks, as explained in the BZ. + +Fix: +After picking one among the sources as the good copy, mark pending xattrs on +all sources to blame the sinks. Now even if this metadata heal fails midway, +a subsequent heal will still choose one of the valid sources that it +picked previously. + +Upstream patch details: +> Fixes: #1067 +> Change-Id: If1b050b70b0ad911e162c04db4d89b263e2b8d7b +> Signed-off-by: Ravishankar N +Upstream patch: https://review.gluster.org/#/c/glusterfs/+/21922/ + +BUG: 1640148 +Change-Id: If1b050b70b0ad911e162c04db4d89b263e2b8d7b +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/222073 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +--- + tests/bugs/replicate/mdata-heal-no-xattrs.t | 59 ++++++++++++++++++++++ + xlators/cluster/afr/src/afr-self-heal-metadata.c | 62 +++++++++++++++++++++++- + 2 files changed, 120 insertions(+), 1 deletion(-) + create mode 100644 tests/bugs/replicate/mdata-heal-no-xattrs.t + +diff --git a/tests/bugs/replicate/mdata-heal-no-xattrs.t b/tests/bugs/replicate/mdata-heal-no-xattrs.t +new file mode 100644 +index 0000000..d3b0c50 +--- /dev/null ++++ b/tests/bugs/replicate/mdata-heal-no-xattrs.t +@@ -0,0 +1,59 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}; ++TEST $CLI volume set $V0 cluster.self-heal-daemon off ++TEST $CLI volume start $V0 ++ ++TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2 ++echo "Data">$M0/FILE ++ret=$? ++TEST [ $ret -eq 0 ] ++ ++# Change permission on brick-0: simulates the case where there is metadata ++# mismatch but no pending xattrs. This brick will become the source for heal. ++TEST chmod +x $B0/$V0"0"/FILE ++ ++# Add gfid to xattrop ++xattrop_b0=$(afr_get_index_path $B0/$V0"0") ++base_entry_b0=`ls $xattrop_b0` ++gfid_str_FILE=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/FILE)) ++TEST ln $xattrop_b0/$base_entry_b0 $xattrop_b0/$gfid_str_FILE ++EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0 ++ ++TEST $CLI volume set $V0 cluster.self-heal-daemon on ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++ ++# Brick-0 should contain xattrs blaming other 2 bricks. ++# The values will be zero because heal is over. ++EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/FILE ++EXPECT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-2 $B0/${V0}0/FILE ++TEST ! getfattr -n trusted.afr.$V0-client-0 $B0/${V0}0/FILE ++ ++# Brick-1 and Brick-2 must not contain any afr xattrs. ++TEST ! getfattr -n trusted.afr.$V0-client-0 $B0/${V0}1/FILE ++TEST ! getfattr -n trusted.afr.$V0-client-1 $B0/${V0}1/FILE ++TEST ! getfattr -n trusted.afr.$V0-client-2 $B0/${V0}1/FILE ++TEST ! 
getfattr -n trusted.afr.$V0-client-0 $B0/${V0}2/FILE ++TEST ! getfattr -n trusted.afr.$V0-client-1 $B0/${V0}2/FILE ++TEST ! getfattr -n trusted.afr.$V0-client-2 $B0/${V0}2/FILE ++ ++# check permission bits. ++EXPECT '755' stat -c %a $B0/${V0}0/FILE ++EXPECT '755' stat -c %a $B0/${V0}1/FILE ++EXPECT '755' stat -c %a $B0/${V0}2/FILE ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++cleanup; +diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c +index f4e31b6..03f43ba 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c ++++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c +@@ -190,6 +190,59 @@ out: + return ret; + } + ++static int ++__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this, ++ inode_t *inode, ++ struct afr_reply *replies, ++ unsigned char *sources) ++{ ++ int ret = 0; ++ int i = 0; ++ int m_idx = 0; ++ afr_private_t *priv = NULL; ++ int raw[AFR_NUM_CHANGE_LOGS] = {0}; ++ dict_t *xattr = NULL; ++ ++ priv = this->private; ++ m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION); ++ raw[m_idx] = 1; ++ ++ xattr = dict_new(); ++ if (!xattr) ++ return -ENOMEM; ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (sources[i]) ++ continue; ++ ret = dict_set_static_bin(xattr, priv->pending_key[i], raw, ++ sizeof(int) * AFR_NUM_CHANGE_LOGS); ++ if (ret) { ++ ret = -1; ++ goto out; ++ } ++ } ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (!sources[i]) ++ continue; ++ ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO, ++ "Failed to set pending metadata xattr on child %d for %s", i, ++ uuid_utoa(inode->gfid)); ++ goto out; ++ } ++ } ++ ++ afr_replies_wipe(replies, priv->child_count); ++ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); ++ ++out: ++ if (xattr) ++ dict_unref(xattr); ++ return ret; ++} ++ + /* + * Look for mismatching uid/gid or mode or user xattrs even if + * AFR xattrs don't say so, and pick one arbitrarily as winner. */ +@@ -210,6 +263,7 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, + }; + int source = -1; + int sources_count = 0; ++ int ret = 0; + + priv = this->private; + +@@ -300,7 +354,13 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, + healed_sinks[i] = 1; + } + } +- ++ if ((sources_count == priv->child_count) && (source > -1) && ++ (AFR_COUNT(healed_sinks, priv->child_count) != 0)) { ++ ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode, ++ replies, sources); ++ if (ret < 0) ++ return ret; ++ } + out: + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + return source; +-- +1.8.3.1 + diff --git a/SOURCES/0514-afr-event-gen-changes.patch b/SOURCES/0514-afr-event-gen-changes.patch new file mode 100644 index 0000000..9f9562e --- /dev/null +++ b/SOURCES/0514-afr-event-gen-changes.patch @@ -0,0 +1,308 @@ +From 4c47d6dd7c5ddcaa2a1e159427c0f6713fd33907 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Wed, 23 Dec 2020 14:57:51 +0530 +Subject: [PATCH 514/517] afr: event gen changes + +The general idea of the changes is to prevent resetting event generation +to zero in the inode ctx, since event gen is something that should +follow 'causal order'. + +Change #1: +For a read txn, in inode refresh cbk, if event_generation is +found zero, we are failing the read fop. 
This is not needed +because change in event gen is only a marker for the next inode refresh to +happen and should not be taken into account by the current read txn. + +Change #2: +The event gen being zero above can happen if there is a racing lookup, +which resets even get (in afr_lookup_done) if there are non zero afr +xattrs. The resetting is done only to trigger an inode refresh and a +possible client side heal on the next lookup. That can be acheived by +setting the need_refresh flag in the inode ctx. So replaced all +occurences of resetting even gen to zero with a call to +afr_inode_need_refresh_set(). + +Change #3: +In both lookup and discover path, we are doing an inode refresh which is +not required since all 3 essentially do the same thing- update the inode +ctx with the good/bad copies from the brick replies. Inode refresh also +triggers background heals, but I think it is okay to do it when we call +refresh during the read and write txns and not in the lookup path. + +The .ts which relied on inode refresh in lookup path to trigger heals are +now changed to do read txn so that inode refresh and the heal happens. + +Upstream patch details: +> Change-Id: Iebf39a9be6ffd7ffd6e4046c96b0fa78ade6c5ec +> Fixes: #1179 +> Signed-off-by: Ravishankar N +> Reported-by: Erik Jacobson +Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24316/ + +BUG: 1640148 +Change-Id: Iebf39a9be6ffd7ffd6e4046c96b0fa78ade6c5ec +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/222074 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +--- + ...fid-mismatch-resolution-with-fav-child-policy.t | 8 +- + xlators/cluster/afr/src/afr-common.c | 92 +++++----------------- + xlators/cluster/afr/src/afr-dir-write.c | 6 +- + xlators/cluster/afr/src/afr.h | 5 +- + 4 files changed, 29 insertions(+), 82 deletions(-) + +diff --git a/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t b/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t +index f4aa351..12af0c8 100644 +--- a/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t ++++ b/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t +@@ -168,8 +168,8 @@ TEST [ "$gfid_1" != "$gfid_2" ] + #We know that second brick has the bigger size file + BIGGER_FILE_MD5=$(md5sum $B0/${V0}1/f3 | cut -d\ -f1) + +-TEST ls $M0/f3 +-TEST cat $M0/f3 ++TEST ls $M0 #Trigger entry heal via readdir inode refresh ++TEST cat $M0/f3 #Trigger data heal via readv inode refresh + EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + + #gfid split-brain should be resolved +@@ -215,8 +215,8 @@ TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 + EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2 + +-TEST ls $M0/f4 +-TEST cat $M0/f4 ++TEST ls $M0 #Trigger entry heal via readdir inode refresh ++TEST cat $M0/f4 #Trigger data heal via readv inode refresh + EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + + #gfid split-brain should be resolved +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index fca2cd5..90b4f14 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -284,7 +284,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, + metadatamap |= (1 << index); + } + if (metadatamap_old != metadatamap) { +- event = 0; ++ __afr_inode_need_refresh_set(inode, this); + } + break; + +@@ -297,7 +297,7 @@ 
__afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, + datamap |= (1 << index); + } + if (datamap_old != datamap) +- event = 0; ++ __afr_inode_need_refresh_set(inode, this); + break; + + default: +@@ -461,34 +461,6 @@ out: + } + + int +-__afr_inode_event_gen_reset_small(inode_t *inode, xlator_t *this) +-{ +- int ret = -1; +- uint16_t datamap = 0; +- uint16_t metadatamap = 0; +- uint32_t event = 0; +- uint64_t val = 0; +- afr_inode_ctx_t *ctx = NULL; +- +- ret = __afr_inode_ctx_get(this, inode, &ctx); +- if (ret) +- return ret; +- +- val = ctx->read_subvol; +- +- metadatamap = (val & 0x000000000000ffff) >> 0; +- datamap = (val & 0x00000000ffff0000) >> 16; +- event = 0; +- +- val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | +- (((uint64_t)event) << 32); +- +- ctx->read_subvol = val; +- +- return ret; +-} +- +-int + __afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) + { +@@ -559,22 +531,6 @@ out: + } + + int +-__afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) +-{ +- afr_private_t *priv = NULL; +- int ret = -1; +- +- priv = this->private; +- +- if (priv->child_count <= 16) +- ret = __afr_inode_event_gen_reset_small(inode, this); +- else +- ret = -1; +- +- return ret; +-} +- +-int + afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) + { +@@ -723,30 +679,22 @@ out: + return need_refresh; + } + +-static int +-afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) ++int ++__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) + { + int ret = -1; + afr_inode_ctx_t *ctx = NULL; + +- GF_VALIDATE_OR_GOTO(this->name, inode, out); +- +- LOCK(&inode->lock); +- { +- ret = __afr_inode_ctx_get(this, inode, &ctx); +- if (ret) +- goto unlock; +- ++ ret = __afr_inode_ctx_get(this, inode, &ctx); ++ if (ret == 0) { + ctx->need_refresh = _gf_true; + } +-unlock: +- UNLOCK(&inode->lock); +-out: ++ + return ret; + } + + int +-afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) ++afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) + { + int ret = -1; + +@@ -754,7 +702,7 @@ afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) + + LOCK(&inode->lock); + { +- ret = __afr_inode_event_gen_reset(inode, this); ++ ret = __afr_inode_need_refresh_set(inode, this); + } + UNLOCK(&inode->lock); + out: +@@ -1191,7 +1139,7 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) + ret = afr_inode_get_readable(frame, inode, this, local->readable, + &event_generation, local->transaction.type); + +- if (ret == -EIO || (local->is_read_txn && !event_generation)) { ++ if (ret == -EIO) { + /* No readable subvolume even after refresh ==> splitbrain.*/ + if (!priv->fav_child_policy) { + err = EIO; +@@ -2413,7 +2361,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) + if (read_subvol == -1) + goto cant_interpret; + if (ret) { +- afr_inode_event_gen_reset(local->inode, this); ++ afr_inode_need_refresh_set(local->inode, this); + dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY); + } + } else { +@@ -2971,6 +2919,7 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this) + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int read_subvol = -1; ++ int ret = 0; + unsigned char *data_readable = NULL; + unsigned char *success_replies = NULL; + +@@ -2992,7 +2941,10 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this) + if (!afr_has_quorum(success_replies, this, frame)) + goto unwind; + +- 
afr_replies_interpret(frame, this, local->inode, NULL); ++ ret = afr_replies_interpret(frame, this, local->inode, NULL); ++ if (ret) { ++ afr_inode_need_refresh_set(local->inode, this); ++ } + + read_subvol = afr_read_subvol_decide(local->inode, this, NULL, + data_readable); +@@ -3248,11 +3200,7 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) + afr_read_subvol_get(loc->inode, this, NULL, NULL, &event, + AFR_DATA_TRANSACTION, NULL); + +- if (afr_is_inode_refresh_reqd(loc->inode, this, event, +- local->event_generation)) +- afr_inode_refresh(frame, this, loc->inode, NULL, afr_discover_do); +- else +- afr_discover_do(frame, this, 0); ++ afr_discover_do(frame, this, 0); + + return 0; + out: +@@ -3393,11 +3341,7 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) + afr_read_subvol_get(loc->parent, this, NULL, NULL, &event, + AFR_DATA_TRANSACTION, NULL); + +- if (afr_is_inode_refresh_reqd(loc->inode, this, event, +- local->event_generation)) +- afr_inode_refresh(frame, this, loc->parent, NULL, afr_lookup_do); +- else +- afr_lookup_do(frame, this, 0); ++ afr_lookup_do(frame, this, 0); + + return 0; + out: +diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c +index 416c19d..d419bfc 100644 +--- a/xlators/cluster/afr/src/afr-dir-write.c ++++ b/xlators/cluster/afr/src/afr-dir-write.c +@@ -123,11 +123,11 @@ __afr_dir_write_finalize(call_frame_t *frame, xlator_t *this) + continue; + if (local->replies[i].op_ret < 0) { + if (local->inode) +- afr_inode_event_gen_reset(local->inode, this); ++ afr_inode_need_refresh_set(local->inode, this); + if (local->parent) +- afr_inode_event_gen_reset(local->parent, this); ++ afr_inode_need_refresh_set(local->parent, this); + if (local->parent2) +- afr_inode_event_gen_reset(local->parent2, this); ++ afr_inode_need_refresh_set(local->parent2, this); + continue; + } + +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index ed5096e..3a2b26d 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -948,7 +948,10 @@ afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, + int event_generation); + + int +-afr_inode_event_gen_reset(inode_t *inode, xlator_t *this); ++__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); ++ ++int ++afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); + + int + afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, +-- +1.8.3.1 + diff --git a/SOURCES/0515-cluster-afr-Heal-directory-rename-without-rmdir-mkdi.patch b/SOURCES/0515-cluster-afr-Heal-directory-rename-without-rmdir-mkdi.patch new file mode 100644 index 0000000..9c7693a --- /dev/null +++ b/SOURCES/0515-cluster-afr-Heal-directory-rename-without-rmdir-mkdi.patch @@ -0,0 +1,2155 @@ +From aab8a587360214432c4a2ab59134411f1d38c509 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Wed, 9 Dec 2020 10:46:31 +0530 +Subject: [PATCH 515/517] cluster/afr: Heal directory rename without + rmdir/mkdir + +Problem1: +When a directory is renamed while a brick +is down entry-heal always did an rm -rf on that directory on +the sink on old location and did mkdir and created the directory +hierarchy again in the new location. This is inefficient. + +Problem2: +Renamedir heal order may lead to a scenario where directory in +the new location could be created before deleting it from old +location leading to 2 directories with same gfid in posix. 
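A minimal sketch of the "park it by gfid" idea that the Fix below describes; the hidden directory name follows the test scripts in this patch, while the exact layout is a simplified assumption (requires libuuid, link with -luuid):

#include <stdio.h>
#include <uuid/uuid.h>

/* the real directory name carries a per-volume suffix */
#define ANON_DIR ".glusterfs-anonymous-inode"

static void
anon_path(const char *brick, const uuid_t gfid, char *buf, size_t len)
{
    char gfid_str[37]; /* 36 characters + NUL */

    uuid_unparse(gfid, gfid_str);
    snprintf(buf, len, "%s/%s/%s", brick, ANON_DIR, gfid_str);
}

int
main(void)
{
    uuid_t gfid;
    char path[4096];

    uuid_generate(gfid);
    anon_path("/bricks/brick0", gfid, path, sizeof(path));

    /* entry heal of the old name renames the directory here on the sink;
     * entry heal of the new name later renames it back into place,
     * instead of doing rmdir + mkdir of the whole tree */
    printf("parking spot: %s\n", path);
    return 0;
}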
+ +Fix: +As part of heal, if oldlocation is healed first and is not present in +source-brick always rename it into a hidden directory inside the +sink-brick so that when heal is triggered in new-location shd can +rename it from this hidden directory to the new-location. + +If new-location heal is triggered first and it detects that the +directory already exists in the brick, then it should skip healing the +directory until it appears in the hidden directory. + +Credits: Ravi for rename-data-loss.t script + +Upstream patch details: +> Fixes: #1211 +> Change-Id: I0cba2006f35cd03d314d18211ce0bd530e254843 +> Signed-off-by: Pranith Kumar K +Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24373/ + +BUG: 1640148 +Change-Id: I0cba2006f35cd03d314d18211ce0bd530e254843 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/220660 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +--- + tests/afr.rc | 16 + + tests/basic/afr/afr-anon-inode-no-quorum.t | 63 ++++ + tests/basic/afr/afr-anon-inode.t | 114 ++++++ + tests/basic/afr/entry-self-heal-anon-dir-off.t | 464 ++++++++++++++++++++++++ + tests/basic/afr/rename-data-loss.t | 72 ++++ + tests/bugs/replicate/bug-1744548-heal-timeout.t | 6 +- + tests/features/trash.t | 74 ++-- + xlators/cluster/afr/src/afr-common.c | 46 ++- + xlators/cluster/afr/src/afr-dir-read.c | 12 +- + xlators/cluster/afr/src/afr-self-heal-common.c | 182 ++++++++++ + xlators/cluster/afr/src/afr-self-heal-entry.c | 206 +++++++++-- + xlators/cluster/afr/src/afr-self-heal-name.c | 33 +- + xlators/cluster/afr/src/afr-self-heal.h | 5 + + xlators/cluster/afr/src/afr-self-heald.c | 178 ++++++++- + xlators/cluster/afr/src/afr-self-heald.h | 2 +- + xlators/cluster/afr/src/afr.c | 40 +- + xlators/cluster/afr/src/afr.h | 11 + + xlators/mgmt/glusterd/src/glusterd-volgen.c | 39 ++ + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 6 + + 19 files changed, 1442 insertions(+), 127 deletions(-) + create mode 100644 tests/basic/afr/afr-anon-inode-no-quorum.t + create mode 100644 tests/basic/afr/afr-anon-inode.t + create mode 100644 tests/basic/afr/entry-self-heal-anon-dir-off.t + create mode 100644 tests/basic/afr/rename-data-loss.t + +diff --git a/tests/afr.rc b/tests/afr.rc +index 35f352d..2417899 100644 +--- a/tests/afr.rc ++++ b/tests/afr.rc +@@ -105,3 +105,19 @@ function get_quorum_type() + local repl_id="$3" + cat $m/.meta/graphs/active/$v-replicate-$repl_id/private|grep quorum-type|awk '{print $3}' + } ++ ++function afr_private_key_value() ++{ ++ local v=$1 ++ local m=$2 ++ local replica_id=$3 ++ local key=$4 ++#xargs at the end will strip leading spaces ++ grep -E "^${key} = " $m/.meta/graphs/active/${v}-replicate-${replica_id}/private | cut -f2 -d'=' | xargs ++} ++ ++function afr_anon_entry_count() ++{ ++ local b=$1 ++ ls $b/.glusterfs-anonymous-inode* | wc -l ++} +diff --git a/tests/basic/afr/afr-anon-inode-no-quorum.t b/tests/basic/afr/afr-anon-inode-no-quorum.t +new file mode 100644 +index 0000000..896ba0c +--- /dev/null ++++ b/tests/basic/afr/afr-anon-inode-no-quorum.t +@@ -0,0 +1,63 @@ ++#!/bin/bash ++ ++#Test that anon-inode entry is not cleaned up as long as there exists at least ++#one valid entry ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. 
$(dirname $0)/../../afr.rc ++ ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} ++TEST $CLI volume heal $V0 disable ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.readdir-ahead off ++TEST $CLI volume set $V0 performance.open-behind off ++TEST $CLI volume set $V0 performance.stat-prefetch off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 cluster.entry-self-heal off ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 ++ ++TEST touch $M0/a $M0/b ++ ++gfid_a=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}0/a)) ++gfid_b=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}0/b)) ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++TEST mv $M0/a $M0/a-new ++TEST mv $M0/b $M0/b-new ++ ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++TEST ! ls $M0/a ++TEST ! ls $M0/b ++anon_inode_name=$(ls -a $B0/${V0}0 | grep glusterfs-anonymous-inode) ++TEST stat $B0/${V0}0/$anon_inode_name/$gfid_a ++TEST stat $B0/${V0}0/$anon_inode_name/$gfid_b ++#Make sure index heal doesn't happen after enabling heal ++TEST setfattr -x trusted.afr.$V0-client-0 $B0/${V0}1 ++TEST rm -f $B0/${V0}1/.glusterfs/indices/xattrop/* ++TEST $CLI volume heal $V0 enable ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++TEST $CLI volume heal $V0 ++#Allow time for a scan ++sleep 5 ++TEST stat $B0/${V0}0/$anon_inode_name/$gfid_a ++TEST stat $B0/${V0}0/$anon_inode_name/$gfid_b ++inum_b=$(STAT_INO $B0/${V0}0/$anon_inode_name/$gfid_b) ++TEST rm -f $M0/a-new ++TEST stat $M0/b-new ++ ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" afr_anon_entry_count $B0/${V0}0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" afr_anon_entry_count $B0/${V0}1 ++EXPECT "$inum_b" STAT_INO $B0/${V0}0/b-new ++ ++cleanup +diff --git a/tests/basic/afr/afr-anon-inode.t b/tests/basic/afr/afr-anon-inode.t +new file mode 100644 +index 0000000..f4cf37a +--- /dev/null ++++ b/tests/basic/afr/afr-anon-inode.t +@@ -0,0 +1,114 @@ ++#!/bin/bash ++#Tests that afr-anon-inode test cases work fine as expected ++#These are cases where in entry-heal/name-heal we dont know entry for an inode ++#so these inodes are kept in a special directory ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. 
$(dirname $0)/../../afr.rc ++ ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0..2} ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.stat-prefetch off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.open-behind off ++TEST $CLI volume start $V0 ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++EXPECT "^1$" afr_private_key_value $V0 $M0 0 "use-anonymous-inode" ++TEST $CLI volume set $V0 cluster.use-anonymous-inode no ++EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "^0$" afr_private_key_value $V0 $M0 0 "use-anonymous-inode" ++TEST $CLI volume set $V0 cluster.use-anonymous-inode yes ++EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "^1$" afr_private_key_value $V0 $M0 0 "use-anonymous-inode" ++TEST mkdir -p $M0/d1/b $M0/d2/a ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++TEST mv $M0/d2/a $M0/d1 ++TEST mv $M0/d1/b $M0/d2 ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++anon_inode_name=$(ls -a $B0/${V0}0 | grep glusterfs-anonymous-inode) ++TEST [[ -d $B0/${V0}1/$anon_inode_name ]] ++TEST [[ -d $B0/${V0}2/$anon_inode_name ]] ++anon_gfid=$(gf_get_gfid_xattr $B0/${V0}0/$anon_inode_name) ++EXPECT "$anon_gfid" gf_get_gfid_xattr $B0/${V0}1/$anon_inode_name ++EXPECT "$anon_gfid" gf_get_gfid_xattr $B0/${V0}2/$anon_inode_name ++ ++TEST ! ls $M0/$anon_inode_name ++EXPECT "^4$" echo $(ls -a $M0 | wc -l) ++ ++#Test purging code path by shd ++TEST $CLI volume heal $V0 disable ++TEST mkdir $M0/l0 $M0/l1 $M0/l2 ++TEST touch $M0/del-file $M0/del-file-nolink $M0/l0/file ++TEST ln $M0/del-file $M0/del-file-link ++TEST ln $M0/l0/file $M0/l1/file-link1 ++TEST ln $M0/l0/file $M0/l2/file-link2 ++TEST mkdir -p $M0/del-recursive-dir/d1 ++ ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++TEST rm -f $M0/del-file $M0/del-file-nolink ++TEST rm -rf $M0/del-recursive-dir ++TEST mv $M0/d1/a $M0/d2 ++TEST mv $M0/l0/file $M0/l0/renamed-file ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status $V0 0 ++ ++nolink_gfid=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}0/del-file-nolink)) ++link_gfid=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}0/del-file)) ++dir_gfid=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}0/del-recursive-dir)) ++rename_dir_gfid=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}0/d1/a)) ++rename_file_gfid=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}0/l0/file)) ++TEST ! stat $M0/del-file ++TEST stat $B0/${V0}0/$anon_inode_name/$link_gfid ++TEST ! stat $M0/del-file-nolink ++TEST ! stat $B0/${V0}0/$anon_inode_name/$nolink_gfid ++TEST ! stat $M0/del-recursive-dir ++TEST stat $B0/${V0}0/$anon_inode_name/$dir_gfid ++TEST ! stat $M0/d1/a ++TEST stat $B0/${V0}0/$anon_inode_name/$rename_dir_gfid ++TEST ! stat $M0/l0/file ++TEST stat $B0/${V0}0/$anon_inode_name/$rename_file_gfid ++ ++TEST kill_brick $V0 $H0 $B0/${V0}1 ++TEST mv $M0/l1/file-link1 $M0/l1/renamed-file-link1 ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status $V0 1 ++TEST ! stat $M0/l1/file-link1 ++TEST stat $B0/${V0}1/$anon_inode_name/$rename_file_gfid ++ ++TEST kill_brick $V0 $H0 $B0/${V0}2 ++TEST mv $M0/l2/file-link2 $M0/l2/renamed-file-link2 ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status $V0 2 ++TEST ! 
stat $M0/l2/file-link2 ++TEST stat $B0/${V0}2/$anon_inode_name/$rename_file_gfid ++ ++#Simulate only anon-inodes present in all bricks ++TEST rm -f $M0/l0/renamed-file $M0/l1/renamed-file-link1 $M0/l2/renamed-file-link2 ++ ++#Test that shd doesn't cleanup anon-inodes when some bricks are down ++TEST kill_brick $V0 $H0 $B0/${V0}1 ++TEST $CLI volume heal $V0 enable ++$CLI volume heal $V0 ++sleep 5 #Allow time for completion of one scan ++TEST stat $B0/${V0}0/$anon_inode_name/$link_gfid ++TEST stat $B0/${V0}0/$anon_inode_name/$rename_dir_gfid ++TEST stat $B0/${V0}0/$anon_inode_name/$dir_gfid ++rename_dir_inum=$(STAT_INO $B0/${V0}0/$anon_inode_name/$rename_dir_gfid) ++ ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status $V0 1 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" afr_anon_entry_count $B0/${V0}0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" afr_anon_entry_count $B0/${V0}1 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" afr_anon_entry_count $B0/${V0}2 ++ ++#Test that rename indeed happened instead of rmdir/mkdir ++renamed_dir_inum=$(STAT_INO $B0/${V0}0/d2/a) ++EXPECT "$rename_dir_inum" echo $renamed_dir_inum ++cleanup; +diff --git a/tests/basic/afr/entry-self-heal-anon-dir-off.t b/tests/basic/afr/entry-self-heal-anon-dir-off.t +new file mode 100644 +index 0000000..0803a08 +--- /dev/null ++++ b/tests/basic/afr/entry-self-heal-anon-dir-off.t +@@ -0,0 +1,464 @@ ++#!/bin/bash ++ ++#This file checks if missing entry self-heal and entry self-heal are working ++#as expected. ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../afr.rc ++ ++cleanup; ++ ++function get_file_type { ++ stat -c "%a:%F:%g:%t:%T:%u" $1 ++} ++ ++function diff_dirs { ++ diff <(ls $1 | sort) <(ls $2 | sort) ++} ++ ++function heal_status { ++ local f1_path="${1}/${3}" ++ local f2_path="${2}/${3}" ++ local insync="" ++ diff_dirs $f1_path $f2_path ++ if [ $? 
-eq 0 ]; ++ then ++ insync="Y" ++ else ++ insync="N" ++ fi ++ local xattr11=$(get_hex_xattr trusted.afr.$V0-client-0 $f1_path) ++ local xattr12=$(get_hex_xattr trusted.afr.$V0-client-1 $f1_path) ++ local xattr21=$(get_hex_xattr trusted.afr.$V0-client-0 $f2_path) ++ local xattr22=$(get_hex_xattr trusted.afr.$V0-client-1 $f2_path) ++ local dirty1=$(get_hex_xattr trusted.afr.dirty $f1_path) ++ local dirty2=$(get_hex_xattr trusted.afr.dirty $f2_path) ++ if [ -z $xattr11 ]; then xattr11="000000000000000000000000"; fi ++ if [ -z $xattr12 ]; then xattr12="000000000000000000000000"; fi ++ if [ -z $xattr21 ]; then xattr21="000000000000000000000000"; fi ++ if [ -z $xattr22 ]; then xattr22="000000000000000000000000"; fi ++ if [ -z $dirty1 ]; then dirty1="000000000000000000000000"; fi ++ if [ -z $dirty2 ]; then dirty2="000000000000000000000000"; fi ++ echo ${insync}${xattr11}${xattr12}${xattr21}${xattr22}${dirty1}${dirty2} ++} ++ ++function is_heal_done { ++ local zero_xattr="000000000000000000000000" ++ if [ "$(heal_status $@)" == "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" ]; ++ then ++ echo "Y" ++ else ++ echo "N" ++ fi ++} ++ ++function print_pending_heals { ++ local result=":" ++ for i in "$@"; ++ do ++ if [ "N" == $(is_heal_done $B0/${V0}0 $B0/${V0}1 $i) ]; ++ then ++ result="$result:$i" ++ fi ++ done ++#To prevent any match for EXPECT_WITHIN, print a char non-existent in file-names ++ if [ $result == ":" ]; then result="~"; fi ++ echo $result ++} ++ ++zero_xattr="000000000000000000000000" ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} ++TEST $CLI volume heal $V0 disable ++TEST $CLI volume set $V0 cluster.use-anonymous-inode off ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.readdir-ahead off ++TEST $CLI volume set $V0 performance.open-behind off ++TEST $CLI volume set $V0 performance.stat-prefetch off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 cluster.data-self-heal on ++TEST $CLI volume set $V0 cluster.metadata-self-heal on ++TEST $CLI volume set $V0 cluster.entry-self-heal on ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --use-readdirp=no $M0 ++cd $M0 ++#_me_ is dir on which missing entry self-heal happens, _heal is where dir self-heal happens ++#spb is split-brain, fool is all fool ++ ++#source_self_accusing means there exists source and a sink which self-accuses. ++#This simulates failures where fops failed on the bricks without it going down. 
++#Something like EACCESS/EDQUOT etc ++ ++TEST mkdir spb_heal spb spb_me_heal spb_me fool_heal fool_me v1_fool_heal v1_fool_me source_creations_heal source_deletions_heal source_creations_me source_deletions_me v1_dirty_me v1_dirty_heal source_self_accusing ++TEST mkfifo source_deletions_heal/fifo ++TEST mknod source_deletions_heal/block b 4 5 ++TEST mknod source_deletions_heal/char c 1 5 ++TEST touch source_deletions_heal/file ++TEST ln -s source_deletions_heal/file source_deletions_heal/slink ++TEST mkdir source_deletions_heal/dir1 ++TEST mkdir source_deletions_heal/dir1/dir2 ++ ++TEST mkfifo source_deletions_me/fifo ++TEST mknod source_deletions_me/block b 4 5 ++TEST mknod source_deletions_me/char c 1 5 ++TEST touch source_deletions_me/file ++TEST ln -s source_deletions_me/file source_deletions_me/slink ++TEST mkdir source_deletions_me/dir1 ++TEST mkdir source_deletions_me/dir1/dir2 ++ ++TEST mkfifo source_self_accusing/fifo ++TEST mknod source_self_accusing/block b 4 5 ++TEST mknod source_self_accusing/char c 1 5 ++TEST touch source_self_accusing/file ++TEST ln -s source_self_accusing/file source_self_accusing/slink ++TEST mkdir source_self_accusing/dir1 ++TEST mkdir source_self_accusing/dir1/dir2 ++ ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++ ++TEST touch spb_heal/0 spb/0 spb_me_heal/0 spb_me/0 fool_heal/0 fool_me/0 v1_fool_heal/0 v1_fool_me/0 v1_dirty_heal/0 v1_dirty_me/0 ++TEST rm -rf source_deletions_heal/fifo source_deletions_heal/block source_deletions_heal/char source_deletions_heal/file source_deletions_heal/slink source_deletions_heal/dir1 ++TEST rm -rf source_deletions_me/fifo source_deletions_me/block source_deletions_me/char source_deletions_me/file source_deletions_me/slink source_deletions_me/dir1 ++TEST rm -rf source_self_accusing/fifo source_self_accusing/block source_self_accusing/char source_self_accusing/file source_self_accusing/slink source_self_accusing/dir1 ++ ++#Test that the files are deleted ++TEST ! stat $B0/${V0}1/source_deletions_heal/fifo ++TEST ! stat $B0/${V0}1/source_deletions_heal/block ++TEST ! stat $B0/${V0}1/source_deletions_heal/char ++TEST ! stat $B0/${V0}1/source_deletions_heal/file ++TEST ! stat $B0/${V0}1/source_deletions_heal/slink ++TEST ! stat $B0/${V0}1/source_deletions_heal/dir1 ++TEST ! stat $B0/${V0}1/source_deletions_me/fifo ++TEST ! stat $B0/${V0}1/source_deletions_me/block ++TEST ! stat $B0/${V0}1/source_deletions_me/char ++TEST ! stat $B0/${V0}1/source_deletions_me/file ++TEST ! stat $B0/${V0}1/source_deletions_me/slink ++TEST ! stat $B0/${V0}1/source_deletions_me/dir1 ++TEST ! stat $B0/${V0}1/source_self_accusing/fifo ++TEST ! stat $B0/${V0}1/source_self_accusing/block ++TEST ! stat $B0/${V0}1/source_self_accusing/char ++TEST ! stat $B0/${V0}1/source_self_accusing/file ++TEST ! stat $B0/${V0}1/source_self_accusing/slink ++TEST ! 
stat $B0/${V0}1/source_self_accusing/dir1 ++ ++ ++TEST mkfifo source_creations_heal/fifo ++TEST mknod source_creations_heal/block b 4 5 ++TEST mknod source_creations_heal/char c 1 5 ++TEST touch source_creations_heal/file ++TEST ln -s source_creations_heal/file source_creations_heal/slink ++TEST mkdir source_creations_heal/dir1 ++TEST mkdir source_creations_heal/dir1/dir2 ++ ++TEST mkfifo source_creations_me/fifo ++TEST mknod source_creations_me/block b 4 5 ++TEST mknod source_creations_me/char c 1 5 ++TEST touch source_creations_me/file ++TEST ln -s source_creations_me/file source_creations_me/slink ++TEST mkdir source_creations_me/dir1 ++TEST mkdir source_creations_me/dir1/dir2 ++ ++$CLI volume stop $V0 ++ ++#simulate fool fool scenario for fool_* dirs ++setfattr -x trusted.afr.$V0-client-0 $B0/${V0}1/{fool_heal,fool_me} ++setfattr -n trusted.afr.dirty -v 0x000000000000000000000001 $B0/${V0}1/{fool_heal,fool_me} ++setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 $B0/${V0}1/{v1_fool_heal,v1_fool_me} ++ ++#Simulate v1-dirty(self-accusing but no pending ops on others) scenario for v1-dirty ++setfattr -x trusted.afr.$V0-client-0 $B0/${V0}1/v1_dirty_{heal,me} ++setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000001 $B0/${V0}1/v1_dirty_{heal,me} ++ ++$CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++TEST kill_brick $V0 $H0 $B0/${V0}1 ++ ++TEST touch spb_heal/1 spb/0 spb_me_heal/1 spb_me/0 fool_heal/1 fool_me/1 v1_fool_heal/1 v1_fool_me/1 ++ ++$CLI volume stop $V0 ++ ++#simulate fool fool scenario for fool_* dirs ++setfattr -x trusted.afr.$V0-client-1 $B0/${V0}0/{fool_heal,fool_me} ++setfattr -n trusted.afr.dirty -v 0x000000000000000000000001 $B0/${V0}1/{fool_heal,fool_me} ++setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000001 $B0/${V0}1/{v1_fool_heal,v1_fool_me} ++ ++#simulate self-accusing for source_self_accusing ++TEST setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000000000006 $B0/${V0}0/source_self_accusing ++ ++$CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++ ++# Check if conservative merges happened correctly on _me_ dirs ++TEST stat spb_me_heal/1 ++TEST stat $B0/${V0}0/spb_me_heal/1 ++TEST stat $B0/${V0}1/spb_me_heal/1 ++ ++TEST stat spb_me_heal/0 ++TEST stat $B0/${V0}0/spb_me_heal/0 ++TEST stat $B0/${V0}1/spb_me_heal/0 ++ ++TEST stat fool_me/1 ++TEST stat $B0/${V0}0/fool_me/1 ++TEST stat $B0/${V0}1/fool_me/1 ++ ++TEST stat fool_me/0 ++TEST stat $B0/${V0}0/fool_me/0 ++TEST stat $B0/${V0}1/fool_me/0 ++ ++TEST stat v1_fool_me/0 ++TEST stat $B0/${V0}0/v1_fool_me/0 ++TEST stat $B0/${V0}1/v1_fool_me/0 ++ ++TEST stat v1_fool_me/1 ++TEST stat $B0/${V0}0/v1_fool_me/1 ++TEST stat $B0/${V0}1/v1_fool_me/1 ++ ++TEST stat v1_dirty_me/0 ++TEST stat $B0/${V0}0/v1_dirty_me/0 ++TEST stat $B0/${V0}1/v1_dirty_me/0 ++ ++#Check if files that have gfid-mismatches in _me_ are giving EIO ++TEST ! stat spb_me/0 ++ ++#Check if stale files are deleted on access ++TEST ! stat source_deletions_me/fifo ++TEST ! stat $B0/${V0}0/source_deletions_me/fifo ++TEST ! stat $B0/${V0}1/source_deletions_me/fifo ++TEST ! stat source_deletions_me/block ++TEST ! stat $B0/${V0}0/source_deletions_me/block ++TEST ! stat $B0/${V0}1/source_deletions_me/block ++TEST ! stat source_deletions_me/char ++TEST ! stat $B0/${V0}0/source_deletions_me/char ++TEST ! 
stat $B0/${V0}1/source_deletions_me/char ++TEST ! stat source_deletions_me/file ++TEST ! stat $B0/${V0}0/source_deletions_me/file ++TEST ! stat $B0/${V0}1/source_deletions_me/file ++TEST ! stat source_deletions_me/file ++TEST ! stat $B0/${V0}0/source_deletions_me/file ++TEST ! stat $B0/${V0}1/source_deletions_me/file ++TEST ! stat source_deletions_me/dir1/dir2 ++TEST ! stat $B0/${V0}0/source_deletions_me/dir1/dir2 ++TEST ! stat $B0/${V0}1/source_deletions_me/dir1/dir2 ++TEST ! stat source_deletions_me/dir1 ++TEST ! stat $B0/${V0}0/source_deletions_me/dir1 ++TEST ! stat $B0/${V0}1/source_deletions_me/dir1 ++ ++#Test if the files created as part of access are healed correctly ++r=$(get_file_type source_creations_me/fifo) ++EXPECT "$r" get_file_type $B0/${V0}0/source_creations_me/fifo ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_me/fifo ++TEST [ -p source_creations_me/fifo ] ++ ++r=$(get_file_type source_creations_me/block) ++EXPECT "$r" get_file_type $B0/${V0}0/source_creations_me/block ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_me/block ++EXPECT "^4 5$" stat -c "%t %T" $B0/${V0}1/source_creations_me/block ++EXPECT "^4 5$" stat -c "%t %T" $B0/${V0}0/source_creations_me/block ++TEST [ -b source_creations_me/block ] ++ ++r=$(get_file_type source_creations_me/char) ++EXPECT "$r" get_file_type $B0/${V0}0/source_creations_me/char ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_me/char ++EXPECT "^1 5$" stat -c "%t %T" $B0/${V0}1/source_creations_me/char ++EXPECT "^1 5$" stat -c "%t %T" $B0/${V0}0/source_creations_me/char ++TEST [ -c source_creations_me/char ] ++ ++r=$(get_file_type source_creations_me/file) ++EXPECT "$r" get_file_type $B0/${V0}0/source_creations_me/file ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_me/file ++TEST [ -f source_creations_me/file ] ++ ++r=$(get_file_type source_creations_me/slink) ++EXPECT "$r" get_file_type $B0/${V0}0/source_creations_me/slink ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_me/slink ++TEST [ -h source_creations_me/slink ] ++ ++r=$(get_file_type source_creations_me/dir1/dir2) ++EXPECT "$r" get_file_type $B0/${V0}0/source_creations_me/dir1/dir2 ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_me/dir1/dir2 ++TEST [ -d source_creations_me/dir1/dir2 ] ++ ++r=$(get_file_type source_creations_me/dir1) ++EXPECT "$r" get_file_type $B0/${V0}0/source_creations_me/dir1 ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_me/dir1 ++TEST [ -d source_creations_me/dir1 ] ++ ++#Trigger heal and check _heal dirs are healed properly ++#Trigger change in event generation number. 
That way inodes would get refreshed during lookup ++TEST kill_brick $V0 $H0 $B0/${V0}1 ++$CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++ ++TEST stat spb_heal ++TEST stat spb_me_heal ++TEST stat fool_heal ++TEST stat fool_me ++TEST stat v1_fool_heal ++TEST stat v1_fool_me ++TEST stat source_deletions_heal ++TEST stat source_deletions_me ++TEST stat source_self_accusing ++TEST stat source_creations_heal ++TEST stat source_creations_me ++TEST stat v1_dirty_heal ++TEST stat v1_dirty_me ++TEST $CLI volume stop $V0 ++TEST rm -rf $B0/${V0}{0,1}/.glusterfs/indices/xattrop/* ++ ++$CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++ ++#Create base entry in indices/xattrop ++echo "Data" > $M0/FILE ++rm -f $M0/FILE ++EXPECT "1" count_index_entries $B0/${V0}0 ++EXPECT "1" count_index_entries $B0/${V0}1 ++ ++TEST $CLI volume stop $V0; ++ ++#Create entries for fool_heal and fool_me to ensure they are fully healed and dirty xattrs erased, before triggering index heal ++create_brick_xattrop_entry $B0/${V0}0 fool_heal fool_me source_creations_heal/dir1 ++ ++$CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++ ++$CLI volume heal $V0 enable ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++ ++TEST $CLI volume heal $V0; ++EXPECT_WITHIN $HEAL_TIMEOUT "~" print_pending_heals spb_heal spb_me_heal fool_heal fool_me v1_fool_heal v1_fool_me source_deletions_heal source_deletions_me source_creations_heal source_creations_me v1_dirty_heal v1_dirty_me source_self_accusing ++ ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 spb_heal ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 spb_me_heal ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 fool_heal ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 fool_me ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 v1_fool_heal ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 v1_fool_me ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 source_deletions_heal ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 source_deletions_me ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 source_self_accusing ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 source_creations_heal ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 source_creations_me ++EXPECT 
"Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 v1_dirty_heal ++EXPECT "Y${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}${zero_xattr}" heal_status $B0/${V0}0 $B0/${V0}1 v1_dirty_me ++ ++#Don't access the files/dirs from mount point as that may cause self-heals ++# Check if conservative merges happened correctly on heal dirs ++TEST stat $B0/${V0}0/spb_heal/1 ++TEST stat $B0/${V0}1/spb_heal/1 ++ ++TEST stat $B0/${V0}0/spb_heal/0 ++TEST stat $B0/${V0}1/spb_heal/0 ++ ++TEST stat $B0/${V0}0/fool_heal/1 ++TEST stat $B0/${V0}1/fool_heal/1 ++ ++TEST stat $B0/${V0}0/fool_heal/0 ++TEST stat $B0/${V0}1/fool_heal/0 ++ ++TEST stat $B0/${V0}0/v1_fool_heal/0 ++TEST stat $B0/${V0}1/v1_fool_heal/0 ++ ++TEST stat $B0/${V0}0/v1_fool_heal/1 ++TEST stat $B0/${V0}1/v1_fool_heal/1 ++ ++TEST stat $B0/${V0}0/v1_dirty_heal/0 ++TEST stat $B0/${V0}1/v1_dirty_heal/0 ++ ++#Check if files that have gfid-mismatches in spb are giving EIO ++TEST ! stat spb/0 ++ ++#Check if stale files are deleted on access ++TEST ! stat $B0/${V0}0/source_deletions_heal/fifo ++TEST ! stat $B0/${V0}1/source_deletions_heal/fifo ++TEST ! stat $B0/${V0}0/source_deletions_heal/block ++TEST ! stat $B0/${V0}1/source_deletions_heal/block ++TEST ! stat $B0/${V0}0/source_deletions_heal/char ++TEST ! stat $B0/${V0}1/source_deletions_heal/char ++TEST ! stat $B0/${V0}0/source_deletions_heal/file ++TEST ! stat $B0/${V0}1/source_deletions_heal/file ++TEST ! stat $B0/${V0}0/source_deletions_heal/file ++TEST ! stat $B0/${V0}1/source_deletions_heal/file ++TEST ! stat $B0/${V0}0/source_deletions_heal/dir1/dir2 ++TEST ! stat $B0/${V0}1/source_deletions_heal/dir1/dir2 ++TEST ! stat $B0/${V0}0/source_deletions_heal/dir1 ++TEST ! stat $B0/${V0}1/source_deletions_heal/dir1 ++ ++#Check if stale files are deleted on access ++TEST ! stat $B0/${V0}0/source_self_accusing/fifo ++TEST ! stat $B0/${V0}1/source_self_accusing/fifo ++TEST ! stat $B0/${V0}0/source_self_accusing/block ++TEST ! stat $B0/${V0}1/source_self_accusing/block ++TEST ! stat $B0/${V0}0/source_self_accusing/char ++TEST ! stat $B0/${V0}1/source_self_accusing/char ++TEST ! stat $B0/${V0}0/source_self_accusing/file ++TEST ! stat $B0/${V0}1/source_self_accusing/file ++TEST ! stat $B0/${V0}0/source_self_accusing/file ++TEST ! stat $B0/${V0}1/source_self_accusing/file ++TEST ! stat $B0/${V0}0/source_self_accusing/dir1/dir2 ++TEST ! stat $B0/${V0}1/source_self_accusing/dir1/dir2 ++TEST ! stat $B0/${V0}0/source_self_accusing/dir1 ++TEST ! 
stat $B0/${V0}1/source_self_accusing/dir1 ++ ++#Test if the files created as part of full self-heal correctly ++r=$(get_file_type $B0/${V0}0/source_creations_heal/fifo) ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_heal/fifo ++TEST [ -p $B0/${V0}0/source_creations_heal/fifo ] ++EXPECT "^4 5$" stat -c "%t %T" $B0/${V0}1/source_creations_heal/block ++EXPECT "^4 5$" stat -c "%t %T" $B0/${V0}0/source_creations_heal/block ++ ++r=$(get_file_type $B0/${V0}0/source_creations_heal/block) ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_heal/block ++ ++r=$(get_file_type $B0/${V0}0/source_creations_heal/char) ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_heal/char ++EXPECT "^1 5$" stat -c "%t %T" $B0/${V0}1/source_creations_heal/char ++EXPECT "^1 5$" stat -c "%t %T" $B0/${V0}0/source_creations_heal/char ++ ++r=$(get_file_type $B0/${V0}0/source_creations_heal/file) ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_heal/file ++TEST [ -f $B0/${V0}0/source_creations_heal/file ] ++ ++r=$(get_file_type source_creations_heal/file $B0/${V0}0/slink) ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_heal/file slink ++TEST [ -h $B0/${V0}0/source_creations_heal/slink ] ++ ++r=$(get_file_type $B0/${V0}0/source_creations_heal/dir1/dir2) ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_heal/dir1/dir2 ++TEST [ -d $B0/${V0}0/source_creations_heal/dir1/dir2 ] ++ ++r=$(get_file_type $B0/${V0}0/source_creations_heal/dir1) ++EXPECT "$r" get_file_type $B0/${V0}1/source_creations_heal/dir1 ++TEST [ -d $B0/${V0}0/source_creations_heal/dir1 ] ++ ++cd - ++ ++#Anonymous directory shouldn't be created ++TEST mkdir $M0/rename-dir ++before_rename=$(STAT_INO $B0/${V0}1/rename-dir) ++TEST kill_brick $V0 $H0 $B0/${V0}1 ++TEST mv $M0/rename-dir $M0/new-name ++TEST $CLI volume start $V0 force ++#Since features.ctime is not enabled by default in downstream, the below test ++#will fail. If ctime feature is enabled, there will be trusted.glusterfs.mdata ++#xattr set which will differ for the parent in the gfid split-brain scenario ++#and when lookup is triggered, the gfid gets added to indices/xattrop leading ++#the below test to pass in upstream. Hence commenting it here. ++#'spb' is in split-brain so pending-heal-count will be 2 ++#EXPECT_WITHIN $HEAL_TIMEOUT "^2$" get_pending_heal_count $V0 ++after_rename=$(STAT_INO $B0/${V0}1/new-name) ++EXPECT "0" echo $(ls -a $B0/${V0}0/ | grep anonymous-inode | wc -l) ++EXPECT "0" echo $(ls -a $B0/${V0}1/ | grep anonymous-inode | wc -l) ++EXPECT_NOT "$before_rename" echo $after_rename ++cleanup +diff --git a/tests/basic/afr/rename-data-loss.t b/tests/basic/afr/rename-data-loss.t +new file mode 100644 +index 0000000..256ee2a +--- /dev/null ++++ b/tests/basic/afr/rename-data-loss.t +@@ -0,0 +1,72 @@ ++#!/bin/bash ++#Self-heal tests ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. 
$(dirname $0)/../../afr.rc ++ ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1} ++TEST $CLI volume set $V0 write-behind off ++TEST $CLI volume set $V0 self-heal-daemon off ++TEST $CLI volume set $V0 data-self-heal off ++TEST $CLI volume set $V0 metadata-self-heal off ++TEST $CLI volume set $V0 entry-self-heal off ++TEST $CLI volume start $V0 ++EXPECT 'Started' volinfo_field $V0 'Status' ++TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0; ++ ++cd $M0 ++TEST `echo "line1" >> file1` ++TEST mkdir dir1 ++TEST mkdir dir2 ++TEST mkdir -p dir1/dira/dirb ++TEST `echo "line1">>dir1/dira/dirb/file1` ++TEST mkdir delete_me ++TEST `echo "line1" >> delete_me/file1` ++ ++#brick0 has witnessed the second write while brick1 is down. ++TEST kill_brick $V0 $H0 $B0/brick1 ++TEST `echo "line2" >> file1` ++TEST `echo "line2" >> dir1/dira/dirb/file1` ++TEST `echo "line2" >> delete_me/file1` ++ ++#Toggle the bricks that are up/down. ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1 ++TEST kill_brick $V0 $H0 $B0/brick0 ++ ++#Rename when the 'source' brick0 for data-selfheals is down. ++mv file1 file2 ++mv dir1/dira dir2 ++ ++#Delete a dir when brick0 is down. ++rm -rf delete_me ++cd - ++ ++#Bring everything up and trigger heal ++TEST $CLI volume set $V0 self-heal-daemon on ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" afr_anon_entry_count $B0/brick0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" afr_anon_entry_count $B0/brick1 ++ ++#Remount to avoid reading from caches ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0; ++EXPECT "line2" tail -1 $M0/file2 ++EXPECT "line2" tail -1 $M0/dir2/dira/dirb/file1 ++TEST ! stat $M0/delete_me/file1 ++TEST ! stat $M0/delete_me ++ ++anon_inode_name=$(ls -a $B0/brick0 | grep glusterfs-anonymous-inode) ++TEST [[ -d $B0/brick0/$anon_inode_name ]] ++TEST [[ -d $B0/brick1/$anon_inode_name ]] ++cleanup +diff --git a/tests/bugs/replicate/bug-1744548-heal-timeout.t b/tests/bugs/replicate/bug-1744548-heal-timeout.t +index c208112..0115350 100644 +--- a/tests/bugs/replicate/bug-1744548-heal-timeout.t ++++ b/tests/bugs/replicate/bug-1744548-heal-timeout.t +@@ -25,14 +25,14 @@ TEST ! $CLI volume heal $V0 + TEST $CLI volume profile $V0 start + TEST $CLI volume profile $V0 info clear + TEST $CLI volume heal $V0 enable +-# Each brick does 3 opendirs, corresponding to dirty, xattrop and entry-changes +-EXPECT_WITHIN $HEAL_TIMEOUT "^333$" get_cumulative_opendir_count ++# Each brick does 4 opendirs, corresponding to dirty, xattrop and entry-changes, anonymous-inode ++EXPECT_WITHIN 4 "^444$" get_cumulative_opendir_count + + # Check that a change in heal-timeout is honoured immediately. + TEST $CLI volume set $V0 cluster.heal-timeout 5 + sleep 10 + # Two crawls must have happened. +-EXPECT_WITHIN $HEAL_TIMEOUT "^999$" get_cumulative_opendir_count ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^121212$" get_cumulative_opendir_count + + # shd must not heal if it is disabled and heal-timeout is changed. 
+ TEST $CLI volume heal $V0 disable +diff --git a/tests/features/trash.t b/tests/features/trash.t +index 472e909..da5b50b 100755 +--- a/tests/features/trash.t ++++ b/tests/features/trash.t +@@ -94,105 +94,105 @@ wildcard_not_exists() { + if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi + } + +-# testing glusterd [1-3] ++# testing glusterd + TEST glusterd + TEST pidof glusterd + TEST $CLI volume info + +-# creating distributed volume [4] ++# creating distributed volume + TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2} + +-# checking volume status [5-7] ++# checking volume status + EXPECT "$V0" volinfo_field $V0 'Volume Name' + EXPECT 'Created' volinfo_field $V0 'Status' + EXPECT '2' brick_count $V0 + +-# test without enabling trash translator [8] ++# test without enabling trash translator + TEST start_vol $V0 $M0 + +-# test on enabling trash translator [9-10] ++# test on enabling trash translator + TEST $CLI volume set $V0 features.trash on + EXPECT 'on' volinfo_field $V0 'features.trash' + +-# files directly under mount point [11] ++# files directly under mount point + create_files $M0/file1 $M0/file2 + TEST file_exists $V0 file1 file2 + +-# perform unlink [12] ++# perform unlink + TEST unlink_op file1 + +-# perform truncate [13] ++# perform truncate + TEST truncate_op file2 4 + +-# create files directory hierarchy and check [14] ++# create files directory hierarchy and check + mkdir -p $M0/1/2/3 + create_files $M0/1/2/3/foo1 $M0/1/2/3/foo2 + TEST file_exists $V0 1/2/3/foo1 1/2/3/foo2 + +-# perform unlink [15] ++# perform unlink + TEST unlink_op 1/2/3/foo1 + +-# perform truncate [16] ++# perform truncate + TEST truncate_op 1/2/3/foo2 4 + + # create a directory for eliminate pattern + mkdir $M0/a + +-# set the eliminate pattern [17-18] ++# set the eliminate pattern + TEST $CLI volume set $V0 features.trash-eliminate-path /a + EXPECT '/a' volinfo_field $V0 'features.trash-eliminate-path' + +-# create two files and check [19] ++# create two files and check + create_files $M0/a/test1 $M0/a/test2 + TEST file_exists $V0 a/test1 a/test2 + +-# remove from eliminate pattern [20] ++# remove from eliminate pattern + rm -f $M0/a/test1 + EXPECT "Y" wildcard_not_exists $M0/.trashcan/a/test1* + +-# truncate from eliminate path [21-23] ++# truncate from eliminate path + truncate -s 2 $M0/a/test2 + TEST [ -e $M0/a/test2 ] + TEST [ `ls -l $M0/a/test2 | awk '{print $5}'` -eq 2 ] + EXPECT "Y" wildcard_not_exists $M0/.trashcan/a/test2* + +-# set internal op on [24-25] ++# set internal op on + TEST $CLI volume set $V0 features.trash-internal-op on + EXPECT 'on' volinfo_field $V0 'features.trash-internal-op' + +-# again create two files and check [26] ++# again create two files and check + create_files $M0/inop1 $M0/inop2 + TEST file_exists $V0 inop1 inop2 + +-# perform unlink [27] ++# perform unlink + TEST unlink_op inop1 + +-# perform truncate [28] ++# perform truncate + TEST truncate_op inop2 4 + +-# remove one brick and restart the volume [28-31] ++# remove one brick and restart the volume + TEST $CLI volume remove-brick $V0 $H0:$B0/${V0}2 force + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + TEST $CLI volume stop $V0 + TEST start_vol $V0 $M0 $M0/.trashcan + +-# again create two files and check [33] ++# again create two files and check + create_files $M0/rebal1 $M0/rebal2 + TEST file_exists $V0 rebal1 rebal2 + +-# add one brick [34-35] ++# add one brick + TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3 + TEST [ -d $B0/${V0}3 ] + + +-# perform rebalance [36] ++# perform rebalance + TEST $CLI volume 
rebalance $V0 start force + EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed + + #Find out which file was migrated to the new brick + file_name=$(ls $B0/${V0}3/rebal*| xargs basename) + +-# check whether rebalance was succesful [37-40] ++# check whether rebalance was succesful + EXPECT "Y" wildcard_exists $B0/${V0}3/$file_name* + EXPECT "Y" wildcard_exists $B0/${V0}1/.trashcan/internal_op/$file_name* + +@@ -201,52 +201,42 @@ EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + # force required in case rebalance is not over + TEST $CLI volume stop $V0 force + +-# create a replicated volume [41] ++# create a replicated volume + TEST $CLI volume create $V1 replica 2 $H0:$B0/${V1}{1,2} + +-# checking volume status [42-45] ++# checking volume status + EXPECT "$V1" volinfo_field $V1 'Volume Name' + EXPECT 'Replicate' volinfo_field $V1 'Type' + EXPECT 'Created' volinfo_field $V1 'Status' + EXPECT '2' brick_count $V1 + +-# enable trash with options and start the replicate volume by disabling automatic self-heal [46-50] ++# enable trash with options and start the replicate volume by disabling automatic self-heal + TEST $CLI volume set $V1 features.trash on + TEST $CLI volume set $V1 features.trash-internal-op on + EXPECT 'on' volinfo_field $V1 'features.trash' + EXPECT 'on' volinfo_field $V1 'features.trash-internal-op' + TEST start_vol $V1 $M1 $M1/.trashcan + +-# mount and check for trash directory [51] ++# mount and check for trash directory + TEST [ -d $M1/.trashcan/internal_op ] + +-# create a file and check [52] ++# create a file and check + touch $M1/self + TEST [ -e $B0/${V1}1/self -a -e $B0/${V1}2/self ] + +-# kill one brick and delete the file from mount point [53-54] ++# kill one brick and delete the file from mount point + kill_brick $V1 $H0 $B0/${V1}1 + EXPECT_WITHIN ${PROCESS_UP_TIMEOUT} "1" online_brick_count + rm -f $M1/self + EXPECT "Y" wildcard_exists $B0/${V1}2/.trashcan/self* + +-# force start the volume and trigger the self-heal manually [55-57] +-TEST $CLI volume start $V1 force +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" online_brick_count +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +-# Since we created the file under root of the volume, it will be +-# healed automatically +- +-# check for the removed file in trashcan [58] +-EXPECT_WITHIN $HEAL_TIMEOUT "Y" wildcard_exists $B0/${V1}1/.trashcan/internal_op/self* +- +-# check renaming of trash directory through cli [59-62] ++# check renaming of trash directory through cli + TEST $CLI volume set $V0 trash-dir abc + TEST start_vol $V0 $M0 $M0/abc + TEST [ -e $M0/abc -a ! 
-e $M0/.trashcan ] + EXPECT "Y" wildcard_exists $B0/${V0}1/abc/internal_op/rebal* + +-# ensure that rename and delete operation on trash directory fails [63-65] ++# ensure that rename and delete operation on trash directory fails + rm -rf $M0/abc/internal_op + TEST [ -e $M0/abc/internal_op ] + rm -rf $M0/abc/ +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 90b4f14..6f2da11 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -47,6 +47,41 @@ afr_quorum_errno(afr_private_t *priv) + return ENOTCONN; + } + ++gf_boolean_t ++afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, ++ pid_t pid) ++{ ++ if (!__is_root_gfid(pargfid)) { ++ return _gf_false; ++ } ++ ++ if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) { ++ /*For backward compatibility /.landfill is private*/ ++ return _gf_true; ++ } ++ ++ if (pid == GF_CLIENT_PID_GSYNCD) { ++ /*geo-rep needs to create/sync private directory on slave because ++ * it appears in changelog*/ ++ return _gf_false; ++ } ++ ++ if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) { ++ if (strcmp(name, priv->anon_inode_name) == 0) { ++ /* anonymous-inode dir is private*/ ++ return _gf_true; ++ } ++ } else { ++ if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) == ++ 0) { ++ /* anonymous-inode dir prefix is private for geo-rep to work*/ ++ return _gf_true; ++ } ++ } ++ ++ return _gf_false; ++} ++ + int + afr_fav_child_reset_sink_xattrs(void *opaque); + +@@ -3301,11 +3336,10 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) + return 0; + } + +- if (__is_root_gfid(loc->parent->gfid)) { +- if (!strcmp(loc->name, GF_REPLICATE_TRASH_DIR)) { +- op_errno = EPERM; +- goto out; +- } ++ if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name, ++ frame->root->pid)) { ++ op_errno = EPERM; ++ goto out; + } + + local = AFR_FRAME_INIT(frame, op_errno); +@@ -4832,6 +4866,7 @@ afr_priv_dump(xlator_t *this) + priv->background_self_heal_count); + gf_proc_dump_write("healers", "%d", priv->healers); + gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode); ++ gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode); + if (priv->quorum_count == AFR_QUORUM_AUTO) { + gf_proc_dump_write("quorum-type", "auto"); + } else if (priv->quorum_count == 0) { +@@ -5792,6 +5827,7 @@ afr_priv_destroy(afr_private_t *priv) + GF_FREE(priv->local); + GF_FREE(priv->pending_key); + GF_FREE(priv->children); ++ GF_FREE(priv->anon_inode); + GF_FREE(priv->child_up); + GF_FREE(priv->child_latency); + LOCK_DESTROY(&priv->lock); +diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c +index 6307b63..d64b6a9 100644 +--- a/xlators/cluster/afr/src/afr-dir-read.c ++++ b/xlators/cluster/afr/src/afr-dir-read.c +@@ -158,8 +158,8 @@ afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol) + } + + static void +-afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol, +- gf_dirent_t *entries, fd_t *fd) ++afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, ++ int subvol, gf_dirent_t *entries, fd_t *fd) + { + int ret = -1; + gf_dirent_t *entry = NULL; +@@ -177,8 +177,8 @@ afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol, + + list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list) + { +- if (__is_root_gfid(fd->inode->gfid) && +- !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) { ++ 
if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name, ++ frame->root->pid)) { + continue; + } + +@@ -222,8 +222,8 @@ afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + } + + if (op_ret >= 0) +- afr_readdir_transform_entries(subvol_entries, (long)cookie, &entries, +- local->fd); ++ afr_readdir_transform_entries(frame, subvol_entries, (long)cookie, ++ &entries, local->fd); + + AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata); + +diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c +index 9b6575f..0a8a7fd 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-common.c ++++ b/xlators/cluster/afr/src/afr-self-heal-common.c +@@ -2753,3 +2753,185 @@ afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources, + out: + return source; + } ++ ++static int ++afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ afr_local_t *local = frame->local; ++ int i = (long)cookie; ++ ++ local->replies[i].valid = 1; ++ local->replies[i].op_ret = op_ret; ++ local->replies[i].op_errno = op_errno; ++ if (op_ret == 0) { ++ local->op_ret = 0; ++ local->replies[i].poststat = *buf; ++ local->replies[i].preparent = *preparent; ++ local->replies[i].postparent = *postparent; ++ } ++ if (xdata) { ++ local->replies[i].xdata = dict_ref(xdata); ++ } ++ ++ syncbarrier_wake(&local->barrier); ++ return 0; ++} ++ ++int ++afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode) ++{ ++ call_frame_t *frame = NULL; ++ afr_local_t *local = NULL; ++ afr_private_t *priv = this->private; ++ unsigned char *mkdir_on = alloca0(priv->child_count); ++ unsigned char *lookup_on = alloca0(priv->child_count); ++ loc_t loc = {0}; ++ int32_t op_errno = 0; ++ int32_t child_op_errno = 0; ++ struct iatt iatt = {0}; ++ dict_t *xdata = NULL; ++ uuid_t anon_inode_gfid = {0}; ++ int mkdir_count = 0; ++ int i = 0; ++ ++ /*Try to mkdir everywhere and return success if the dir exists on 'child' ++ */ ++ ++ if (!priv->use_anon_inode) { ++ op_errno = EINVAL; ++ goto out; ++ } ++ ++ frame = afr_frame_create(this, &op_errno); ++ if (op_errno) { ++ goto out; ++ } ++ local = frame->local; ++ if (!local->child_up[child]) { ++ /*Other bricks may need mkdir so don't error out yet*/ ++ child_op_errno = ENOTCONN; ++ } ++ gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid); ++ for (i = 0; i < priv->child_count; i++) { ++ if (!local->child_up[i]) ++ continue; ++ ++ if (priv->anon_inode[i]) { ++ mkdir_on[i] = 0; ++ } else { ++ mkdir_on[i] = 1; ++ mkdir_count++; ++ } ++ } ++ ++ if (mkdir_count == 0) { ++ *linked_inode = inode_find(this->itable, anon_inode_gfid); ++ if (*linked_inode) { ++ op_errno = 0; ++ goto out; ++ } ++ } ++ ++ loc.parent = inode_ref(this->itable->root); ++ loc.name = priv->anon_inode_name; ++ loc.inode = inode_new(this->itable); ++ if (!loc.inode) { ++ op_errno = ENOMEM; ++ goto out; ++ } ++ ++ xdata = dict_new(); ++ if (!xdata) { ++ op_errno = ENOMEM; ++ goto out; ++ } ++ ++ op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true); ++ if (op_errno) { ++ goto out; ++ } ++ ++ if (mkdir_count == 0) { ++ memcpy(lookup_on, local->child_up, priv->child_count); ++ goto lookup; ++ } ++ ++ AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0, ++ xdata); ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (!mkdir_on[i]) { ++ 
continue; ++ } ++ ++ if (local->replies[i].op_ret == 0) { ++ priv->anon_inode[i] = 1; ++ iatt = local->replies[i].poststat; ++ } else if (local->replies[i].op_ret < 0 && ++ local->replies[i].op_errno == EEXIST) { ++ lookup_on[i] = 1; ++ } else if (i == child) { ++ child_op_errno = local->replies[i].op_errno; ++ } ++ } ++ ++ if (AFR_COUNT(lookup_on, priv->child_count) == 0) { ++ goto link; ++ } ++ ++lookup: ++ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, ++ xdata); ++ for (i = 0; i < priv->child_count; i++) { ++ if (!lookup_on[i]) { ++ continue; ++ } ++ ++ if (local->replies[i].op_ret == 0) { ++ if (gf_uuid_compare(anon_inode_gfid, ++ local->replies[i].poststat.ia_gfid) == 0) { ++ priv->anon_inode[i] = 1; ++ iatt = local->replies[i].poststat; ++ } else { ++ if (i == child) ++ child_op_errno = EINVAL; ++ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA, ++ "%s has gfid: %s", priv->anon_inode_name, ++ uuid_utoa(local->replies[i].poststat.ia_gfid)); ++ } ++ } else if (i == child) { ++ child_op_errno = local->replies[i].op_errno; ++ } ++ } ++link: ++ if (!gf_uuid_is_null(iatt.ia_gfid)) { ++ *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt); ++ if (*linked_inode) { ++ op_errno = 0; ++ inode_lookup(*linked_inode); ++ } else { ++ op_errno = ENOMEM; ++ } ++ goto out; ++ } ++ ++out: ++ if (xdata) ++ dict_unref(xdata); ++ loc_wipe(&loc); ++ /*child_op_errno takes precedence*/ ++ if (child_op_errno == 0) { ++ child_op_errno = op_errno; ++ } ++ ++ if (child_op_errno && *linked_inode) { ++ inode_unref(*linked_inode); ++ *linked_inode = NULL; ++ } ++ if (frame) ++ AFR_STACK_DESTROY(frame); ++ return -child_op_errno; ++} +diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c +index 00b5b2d..20b07dd 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-entry.c ++++ b/xlators/cluster/afr/src/afr-self-heal-entry.c +@@ -16,54 +16,170 @@ + #include + #include + +-static int +-afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, +- inode_t *inode, int child, struct afr_reply *replies) ++int ++afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name, ++ inode_t *inode, int child, ++ struct afr_reply *replies, ++ gf_boolean_t *anon_inode) + { + afr_private_t *priv = NULL; ++ afr_local_t *local = NULL; + xlator_t *subvol = NULL; + int ret = 0; ++ int i = 0; ++ char g[64] = {0}; ++ unsigned char *lookup_success = NULL; ++ call_frame_t *frame = NULL; ++ loc_t loc2 = { ++ 0, ++ }; + loc_t loc = { + 0, + }; +- char g[64]; + + priv = this->private; +- + subvol = priv->children[child]; ++ lookup_success = alloca0(priv->child_count); ++ uuid_utoa_r(replies[child].poststat.ia_gfid, g); ++ loc.inode = inode_new(inode->table); ++ if (!loc.inode) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ if (replies[child].poststat.ia_type == IA_IFDIR) { ++ /* This directory may have sub-directory hierarchy which may need to ++ * be preserved for subsequent heals. 
So unconditionally move the ++ * directory to anonymous-inode directory*/ ++ *anon_inode = _gf_true; ++ goto anon_inode; ++ } ++ ++ frame = afr_frame_create(this, &ret); ++ if (!frame) { ++ ret = -ret; ++ goto out; ++ } ++ local = frame->local; ++ gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid); ++ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, ++ NULL); ++ for (i = 0; i < priv->child_count; i++) { ++ if (local->replies[i].op_ret == 0) { ++ lookup_success[i] = 1; ++ } else if (local->replies[i].op_errno != ENOENT && ++ local->replies[i].op_errno != ESTALE) { ++ ret = -local->replies[i].op_errno; ++ } ++ } ++ ++ if (priv->quorum_count) { ++ if (afr_has_quorum(lookup_success, this, NULL)) { ++ *anon_inode = _gf_true; ++ } ++ } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) { ++ *anon_inode = _gf_true; ++ } else if (ret) { ++ goto out; ++ } ++ ++anon_inode: ++ if (!*anon_inode) { ++ ret = 0; ++ goto out; ++ } + + loc.parent = inode_ref(dir); + gf_uuid_copy(loc.pargfid, dir->gfid); + loc.name = name; +- loc.inode = inode_ref(inode); + +- if (replies[child].valid && replies[child].op_ret == 0) { +- switch (replies[child].poststat.ia_type) { +- case IA_IFDIR: +- gf_msg(this->name, GF_LOG_WARNING, 0, +- AFR_MSG_EXPUNGING_FILE_OR_DIR, +- "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), +- name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), +- subvol->name); +- ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); +- break; +- default: +- gf_msg(this->name, GF_LOG_WARNING, 0, +- AFR_MSG_EXPUNGING_FILE_OR_DIR, +- "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), +- name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), +- subvol->name); +- ret = syncop_unlink(subvol, &loc, NULL, NULL); +- break; +- } ++ ret = afr_anon_inode_create(this, child, &loc2.parent); ++ if (ret < 0) ++ goto out; ++ ++ loc2.name = g; ++ ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR, ++ "Rename to %s dir %s/%s (%s) on %s failed", ++ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, ++ subvol->name); ++ } else { ++ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, ++ "Rename to %s dir %s/%s (%s) on %s successful", ++ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, ++ subvol->name); + } + ++out: + loc_wipe(&loc); ++ loc_wipe(&loc2); ++ if (frame) { ++ AFR_STACK_DESTROY(frame); ++ } + + return ret; + } + + int ++afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, ++ inode_t *inode, int child, struct afr_reply *replies) ++{ ++ char g[64] = {0}; ++ afr_private_t *priv = NULL; ++ xlator_t *subvol = NULL; ++ int ret = 0; ++ loc_t loc = { ++ 0, ++ }; ++ gf_boolean_t anon_inode = _gf_false; ++ ++ priv = this->private; ++ subvol = priv->children[child]; ++ ++ if ((!replies[child].valid) || (replies[child].op_ret < 0)) { ++ /*Nothing to do*/ ++ ret = 0; ++ goto out; ++ } ++ ++ if (priv->use_anon_inode) { ++ ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child, ++ replies, &anon_inode); ++ if (ret < 0 || anon_inode) ++ goto out; ++ } ++ ++ loc.parent = inode_ref(dir); ++ loc.inode = inode_new(inode->table); ++ if (!loc.inode) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ loc.name = name; ++ switch (replies[child].poststat.ia_type) { ++ case IA_IFDIR: ++ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, ++ "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name, ++ 
uuid_utoa_r(replies[child].poststat.ia_gfid, g), ++ subvol->name); ++ ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); ++ break; ++ default: ++ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, ++ "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), ++ name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), ++ subvol->name); ++ ret = syncop_unlink(subvol, &loc, NULL, NULL); ++ break; ++ } ++ ++out: ++ loc_wipe(&loc); ++ return ret; ++} ++ ++int + afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, + unsigned char *sources, inode_t *dir, + const char *name, inode_t *inode, +@@ -76,6 +192,9 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, + loc_t srcloc = { + 0, + }; ++ loc_t anonloc = { ++ 0, ++ }; + xlator_t *this = frame->this; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; +@@ -86,15 +205,18 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, + 0, + }; + unsigned char *newentry = NULL; ++ char iatt_uuid_str[64] = {0}; ++ char dir_uuid_str[64] = {0}; + + priv = this->private; + iatt = &replies[source].poststat; ++ uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str); + if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED, + "Invalid ia_type (%d) or gfid(%s). source brick=%d, " + "pargfid=%s, name=%s", +- iatt->ia_type, uuid_utoa(iatt->ia_gfid), source, +- uuid_utoa(dir->gfid), name); ++ iatt->ia_type, iatt_uuid_str, source, ++ uuid_utoa_r(dir->gfid, dir_uuid_str), name); + ret = -EINVAL; + goto out; + } +@@ -119,14 +241,24 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, + + srcloc.inode = inode_ref(inode); + gf_uuid_copy(srcloc.gfid, iatt->ia_gfid); +- if (iatt->ia_type != IA_IFDIR) +- ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0); +- if (iatt->ia_type == IA_IFDIR || ret == -ENOENT || ret == -ESTALE) { ++ ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0); ++ if (ret == -ENOENT || ret == -ESTALE) { + newentry[dst] = 1; + ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies, + sources, newentry); + if (ret) + goto out; ++ } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) { ++ // Try rename from hidden directory ++ ret = afr_anon_inode_create(this, dst, &anonloc.parent); ++ if (ret < 0) ++ goto out; ++ anonloc.inode = inode_ref(inode); ++ anonloc.name = iatt_uuid_str; ++ ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL); ++ if (ret == -ENOENT || ret == -ESTALE) ++ ret = -1; /*This sets 'mismatch' to true*/ ++ goto out; + } + + mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type); +@@ -165,6 +297,7 @@ out: + GF_FREE(linkname); + loc_wipe(&loc); + loc_wipe(&srcloc); ++ loc_wipe(&anonloc); + return ret; + } + +@@ -580,6 +713,11 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + + priv = this->private; + ++ if (afr_is_private_directory(priv, fd->inode->gfid, name, ++ GF_CLIENT_PID_SELF_HEALD)) { ++ return 0; ++ } ++ + xattr = dict_new(); + if (!xattr) + return -ENOMEM; +@@ -628,7 +766,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + replies); + + if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) { +- ret = afr_shd_index_purge(subvol, parent_idx_inode, name, ++ ret = afr_shd_entry_purge(subvol, parent_idx_inode, name, + inode->ia_type); + /* Why is ret force-set to 0? 
We do not care about + * index purge failing for full heal as it is quite +@@ -758,10 +896,6 @@ afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd, + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + +- if (__is_root_gfid(fd->inode->gfid) && +- !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) +- continue; +- + ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name, + loc.inode, subvol, + local->need_full_crawl); +@@ -824,7 +958,7 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, + /* The name indices under the pgfid index dir are guaranteed + * to be regular files. Hence the hardcoding. + */ +- afr_shd_index_purge(subvol, parent->inode, entry->d_name, IA_IFREG); ++ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG); + ret = 0; + goto out; + } +diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c +index dace071..51e3d8c 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-name.c ++++ b/xlators/cluster/afr/src/afr-self-heal-name.c +@@ -98,21 +98,12 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid, + const char *bname, inode_t *inode, + struct afr_reply *replies) + { +- loc_t loc = { +- 0, +- }; + int i = 0; + afr_private_t *priv = NULL; +- char g[64]; + int ret = 0; + + priv = this->private; + +- loc.parent = inode_ref(parent); +- gf_uuid_copy(loc.pargfid, pargfid); +- loc.name = bname; +- loc.inode = inode_ref(inode); +- + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; +@@ -120,30 +111,10 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid, + if (replies[i].op_ret) + continue; + +- switch (replies[i].poststat.ia_type) { +- case IA_IFDIR: +- gf_msg(this->name, GF_LOG_WARNING, 0, +- AFR_MSG_EXPUNGING_FILE_OR_DIR, +- "expunging dir %s/%s (%s) on %s", uuid_utoa(pargfid), +- bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g), +- priv->children[i]->name); +- +- ret |= syncop_rmdir(priv->children[i], &loc, 1, NULL, NULL); +- break; +- default: +- gf_msg(this->name, GF_LOG_WARNING, 0, +- AFR_MSG_EXPUNGING_FILE_OR_DIR, +- "expunging file %s/%s (%s) on %s", uuid_utoa(pargfid), +- bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g), +- priv->children[i]->name); +- +- ret |= syncop_unlink(priv->children[i], &loc, NULL, NULL); +- break; +- } ++ ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i, ++ replies); + } + +- loc_wipe(&loc); +- + return ret; + } + +diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h +index 8f6fb00..c8dc384 100644 +--- a/xlators/cluster/afr/src/afr-self-heal.h ++++ b/xlators/cluster/afr/src/afr-self-heal.h +@@ -370,4 +370,9 @@ gf_boolean_t + afr_is_file_empty_on_all_children(afr_private_t *priv, + struct afr_reply *replies); + ++int ++afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, ++ inode_t *inode, int child, struct afr_reply *replies); ++int ++afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode); + #endif /* !_AFR_SELFHEAL_H */ +diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c +index 95ac5f2..939a135 100644 +--- a/xlators/cluster/afr/src/afr-self-heald.c ++++ b/xlators/cluster/afr/src/afr-self-heald.c +@@ -222,7 +222,7 @@ out: + } + + int +-afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name, ++afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, + ia_type_t 
type) + { + int ret = 0; +@@ -422,7 +422,7 @@ afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + ret = afr_shd_selfheal(healer, healer->subvol, gfid); + + if (ret == -ENOENT || ret == -ESTALE) +- afr_shd_index_purge(subvol, parent->inode, entry->d_name, val); ++ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val); + + if (ret == 2) + /* If bricks crashed in pre-op after creating indices/xattrop +@@ -798,6 +798,176 @@ afr_bricks_available_for_heal(afr_private_t *priv) + return _gf_true; + } + ++static int ++afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, ++ void *data) ++{ ++ struct subvol_healer *healer = data; ++ afr_private_t *priv = healer->this->private; ++ call_frame_t *frame = NULL; ++ afr_local_t *local = NULL; ++ int ret = 0; ++ loc_t loc = {0}; ++ int count = 0; ++ int i = 0; ++ int op_errno = 0; ++ struct iatt *iatt = NULL; ++ gf_boolean_t multiple_links = _gf_false; ++ unsigned char *gfid_present = alloca0(priv->child_count); ++ unsigned char *entry_present = alloca0(priv->child_count); ++ char *type = "file"; ++ ++ frame = afr_frame_create(healer->this, &ret); ++ if (!frame) { ++ ret = -ret; ++ goto out; ++ } ++ local = frame->local; ++ if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) { ++ gf_msg_debug(healer->this->name, 0, ++ "Not all bricks are up. Skipping " ++ "cleanup of %s on %s", ++ entry->d_name, subvol->name); ++ ret = 0; ++ goto out; ++ } ++ ++ loc.inode = inode_new(parent->inode->table); ++ if (!loc.inode) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ret = gf_uuid_parse(entry->d_name, loc.gfid); ++ if (ret) { ++ ret = 0; ++ goto out; ++ } ++ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, ++ NULL); ++ for (i = 0; i < priv->child_count; i++) { ++ if (local->replies[i].op_ret == 0) { ++ count++; ++ gfid_present[i] = 1; ++ iatt = &local->replies[i].poststat; ++ if (iatt->ia_type == IA_IFDIR) { ++ type = "dir"; ++ } ++ ++ if (i == healer->subvol) { ++ if (local->replies[i].poststat.ia_nlink > 1) { ++ multiple_links = _gf_true; ++ } ++ } ++ } else if (local->replies[i].op_errno != ENOENT && ++ local->replies[i].op_errno != ESTALE) { ++ /*We don't have complete view. Skip the entry*/ ++ gf_msg_debug(healer->this->name, local->replies[i].op_errno, ++ "Skipping cleanup of %s on %s", entry->d_name, ++ subvol->name); ++ ret = 0; ++ goto out; ++ } ++ } ++ ++ /*Inode is deleted from subvol*/ ++ if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) { ++ gf_msg(healer->this->name, GF_LOG_WARNING, 0, ++ AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type, ++ priv->anon_inode_name, entry->d_name, subvol->name); ++ ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name, ++ iatt->ia_type); ++ if (ret == -ENOENT || ret == -ESTALE) ++ ret = 0; ++ } else if (count > 1) { ++ loc_wipe(&loc); ++ loc.parent = inode_ref(parent->inode); ++ loc.name = entry->d_name; ++ loc.inode = inode_new(parent->inode->table); ++ if (!loc.inode) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, ++ &loc, NULL); ++ count = 0; ++ for (i = 0; i < priv->child_count; i++) { ++ if (local->replies[i].op_ret == 0) { ++ count++; ++ entry_present[i] = 1; ++ iatt = &local->replies[i].poststat; ++ } else if (local->replies[i].op_errno != ENOENT && ++ local->replies[i].op_errno != ESTALE) { ++ /*We don't have complete view. 
Skip the entry*/ ++ gf_msg_debug(healer->this->name, local->replies[i].op_errno, ++ "Skipping cleanup of %s on %s", entry->d_name, ++ subvol->name); ++ ret = 0; ++ goto out; ++ } ++ } ++ for (i = 0; i < priv->child_count; i++) { ++ if (gfid_present[i] && !entry_present[i]) { ++ /*Entry is not anonymous on at least one subvol*/ ++ gf_msg_debug(healer->this->name, 0, ++ "Valid entry present on %s " ++ "Skipping cleanup of %s on %s", ++ priv->children[i]->name, entry->d_name, ++ subvol->name); ++ ret = 0; ++ goto out; ++ } ++ } ++ ++ gf_msg(healer->this->name, GF_LOG_WARNING, 0, ++ AFR_MSG_EXPUNGING_FILE_OR_DIR, ++ "expunging %s %s/%s on all subvols", type, priv->anon_inode_name, ++ entry->d_name); ++ ret = 0; ++ for (i = 0; i < priv->child_count; i++) { ++ op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent, ++ entry->d_name, iatt->ia_type); ++ if (op_errno != ENOENT && op_errno != ESTALE) { ++ ret |= -op_errno; ++ } ++ } ++ } ++ ++out: ++ if (frame) ++ AFR_STACK_DESTROY(frame); ++ loc_wipe(&loc); ++ return ret; ++} ++ ++static void ++afr_cleanup_anon_inode_dir(struct subvol_healer *healer) ++{ ++ int ret = 0; ++ call_frame_t *frame = NULL; ++ afr_private_t *priv = healer->this->private; ++ loc_t loc = {0}; ++ ++ ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode); ++ if (ret) ++ goto out; ++ ++ frame = afr_frame_create(healer->this, &ret); ++ if (!frame) { ++ ret = -ret; ++ goto out; ++ } ++ ++ ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc, ++ GF_CLIENT_PID_SELF_HEALD, healer, ++ afr_shd_anon_inode_cleaner, NULL, ++ priv->shd.max_threads, priv->shd.wait_qlength); ++out: ++ if (frame) ++ AFR_STACK_DESTROY(frame); ++ loc_wipe(&loc); ++ return; ++} ++ + void * + afr_shd_index_healer(void *data) + { +@@ -854,6 +1024,10 @@ afr_shd_index_healer(void *data) + sleep(1); + } while (ret > 0); + ++ if (ret == 0) { ++ afr_cleanup_anon_inode_dir(healer); ++ } ++ + if (pre_crawl_xdata && !healer->crawl_event.heal_failed_count) { + afr_shd_ta_check_and_unset_xattrs(this, &loc, healer, + pre_crawl_xdata); +diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h +index 1990539..acd567e 100644 +--- a/xlators/cluster/afr/src/afr-self-heald.h ++++ b/xlators/cluster/afr/src/afr-self-heald.h +@@ -70,6 +70,6 @@ afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid, + char **path_p); + + int +-afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name, ++afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, + ia_type_t type); + #endif /* !_AFR_SELF_HEALD_H */ +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index bfa464f..33fe4d8 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -135,6 +135,27 @@ set_data_self_heal_algorithm(afr_private_t *priv, char *algo) + } + } + ++void ++afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options) ++{ ++ char *volfile_id_str = NULL; ++ uuid_t anon_inode_gfid = {0}; ++ ++ /*If volume id is not present don't enable anything*/ ++ if (dict_get_str(options, "volume-id", &volfile_id_str)) ++ return; ++ GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX); ++ /*anon_inode_name is not supposed to change once assigned*/ ++ if (!priv->anon_inode_name[0]) { ++ snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s", ++ AFR_ANON_DIR_PREFIX, volfile_id_str); ++ gf_uuid_parse(volfile_id_str, anon_inode_gfid); ++ /*Flip a bit to make sure volfile-id and anon-gfid 
are not same*/ ++ anon_inode_gfid[0] ^= 1; ++ uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str); ++ } ++} ++ + int + reconfigure(xlator_t *this, dict_t *options) + { +@@ -287,6 +308,10 @@ reconfigure(xlator_t *this, dict_t *options) + consistent_io = _gf_false; + priv->consistent_io = consistent_io; + ++ afr_handle_anon_inode_options(priv, options); ++ ++ GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool, ++ out); + if (priv->shd.enabled) { + if ((priv->shd.enabled != enabled_old) || + (timeout_old != priv->shd.timeout)) +@@ -535,7 +560,9 @@ init(xlator_t *this) + + GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out); + GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out); ++ afr_handle_anon_inode_options(priv, this->options); + ++ GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out); + if (priv->quorum_count != 0) + priv->consistent_io = _gf_false; + +@@ -547,13 +574,16 @@ init(xlator_t *this) + goto out; + } + ++ priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count, ++ gf_afr_mt_char); ++ + priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count, + gf_afr_mt_char); + + priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count, + gf_afr_mt_child_latency_t); + +- if (!priv->child_up || !priv->child_latency) { ++ if (!priv->child_up || !priv->child_latency || !priv->anon_inode) { + ret = -ENOMEM; + goto out; + } +@@ -1218,6 +1248,14 @@ struct volume_options options[] = { + .tags = {"replicate"}, + .description = "This option exists only for backward compatibility " + "and configuring it doesn't have any effect"}, ++ {.key = {"use-anonymous-inode"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .default_value = "no", ++ .op_version = {GD_OP_VERSION_7_0}, ++ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, ++ .tags = {"replicate"}, ++ .description = "Setting this option heals directory renames efficiently"}, ++ + {.key = {NULL}}, + }; + +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index 3a2b26d..6a9a763 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -40,6 +40,8 @@ + #define AFR_TA_DOM_MODIFY "afr.ta.dom-modify" + + #define AFR_HALO_MAX_LATENCY 99999 ++#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode" ++ + + #define PFLAG_PENDING (1 << 0) + #define PFLAG_SBRAIN (1 << 1) +@@ -155,6 +157,7 @@ typedef struct _afr_private { + struct list_head ta_waitq; + struct list_head ta_onwireq; + ++ unsigned char *anon_inode; + unsigned char *child_up; + int64_t *child_latency; + unsigned char *local; +@@ -240,6 +243,11 @@ typedef struct _afr_private { + gf_boolean_t esh_granular; + gf_boolean_t consistent_io; + gf_boolean_t data_self_heal; /* on/off */ ++ gf_boolean_t use_anon_inode; ++ ++ /*For anon-inode handling */ ++ char anon_inode_name[NAME_MAX + 1]; ++ char anon_gfid_str[UUID_SIZE + 1]; + } afr_private_t; + + typedef enum { +@@ -1341,4 +1349,7 @@ afr_selfheal_childup(xlator_t *this, afr_private_t *priv); + void + afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies); ++gf_boolean_t ++afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, ++ pid_t pid); + #endif /* __AFR_H__ */ +diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c +index 094a71f..1920284 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c +@@ -3867,6 +3867,38 @@ out: + } + + static int 
++set_volfile_id_option(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
++                      int clusters)
++{
++    xlator_t *xlator = NULL;
++    int i = 0;
++    int ret = -1;
++    glusterd_conf_t *conf = NULL;
++    xlator_t *this = NULL;
++
++    this = THIS;
++    GF_VALIDATE_OR_GOTO("glusterd", this, out);
++    conf = this->private;
++    GF_VALIDATE_OR_GOTO(this->name, conf, out);
++
++    if (conf->op_version < GD_OP_VERSION_7_1)
++        return 0;
++    xlator = first_of(graph);
++
++    for (i = 0; i < clusters; i++) {
++        ret = xlator_set_fixed_option(xlator, "volume-id",
++                                      uuid_utoa(volinfo->volume_id));
++        if (ret)
++            goto out;
++
++        xlator = xlator->next;
++    }
++
++out:
++    return ret;
++}
++
++static int
+ volgen_graph_build_afr_clusters(volgen_graph_t *graph,
+                                 glusterd_volinfo_t *volinfo)
+ {
+@@ -3906,6 +3938,13 @@ volgen_graph_build_afr_clusters(volgen_graph_t *graph,
+         clusters = -1;
+         goto out;
+     }
++
++    ret = set_volfile_id_option(graph, volinfo, clusters);
++    if (ret) {
++        clusters = -1;
++        goto out;
++    }
++
+     if (!volinfo->arbiter_count)
+         goto out;
+ 
+diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+index 62acadf..c1ca190 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+@@ -3789,4 +3789,10 @@ struct volopt_map_entry glusterd_volopt_map[] = {
+      .voltype = "features/cloudsync",
+      .op_version = GD_OP_VERSION_7_0,
+      .flags = VOLOPT_FLAG_CLIENT_OPT},
++
++    {.key = "cluster.use-anonymous-inode",
++     .voltype = "cluster/replicate",
++     .op_version = GD_OP_VERSION_7_1,
++     .value = "yes",
++     .flags = VOLOPT_FLAG_CLIENT_OPT},
+     {.key = NULL}};
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0516-afr-return-EIO-for-gfid-split-brains.patch b/SOURCES/0516-afr-return-EIO-for-gfid-split-brains.patch
new file mode 100644
index 0000000..0f6249e
--- /dev/null
+++ b/SOURCES/0516-afr-return-EIO-for-gfid-split-brains.patch
@@ -0,0 +1,338 @@
+From 8d24d891aade910b0bb86b27c25a8d2382e19ba0 Mon Sep 17 00:00:00 2001
+From: karthik-us
+Date: Tue, 15 Dec 2020 15:04:19 +0530
+Subject: [PATCH 516/517] afr: return -EIO for gfid split-brains.
+
+Problem:
+entry-self-heal-anon-dir-off.t was failing occasionally because
+afr_gfid_split_brain_source() returned -1 instead of -EIO for
+split-brains, causing the code to proceed to afr_lookup_done(), which
+in turn succeeded the lookup if there was a parallel client side heal
+going on.
+
+Fix:
+Return -EIO instead of -1 so that lookup fails.
+
+Also, afr_selfheal_name() was using the same dict to get and set values. This
+could be problematic if the caller passed local->xdata_req, since
+setting a response in a request dict can lead to bugs. So changed it to use
+separate request and response dicts.
+ +Upstream patch details: +> Fixes: #1739 +> Credits Pranith Karampuri +> Signed-off-by: Ravishankar N +>Change-Id: I5cb4c547fb25e6bfc8bec1740f7eb64e1a5ad443 +Upstream patch: https://github.com/gluster/glusterfs/pull/1819/ + +BUG: 1640148 +Signed-off-by: karthik-us +Change-Id: I5cb4c547fb25e6bfc8bec1740f7eb64e1a5ad443 +Reviewed-on: https://code.engineering.redhat.com/gerrit/221209 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +--- + xlators/cluster/afr/src/afr-common.c | 12 ++++++++---- + xlators/cluster/afr/src/afr-self-heal-common.c | 27 +++++++++++++------------- + xlators/cluster/afr/src/afr-self-heal-entry.c | 8 ++++---- + xlators/cluster/afr/src/afr-self-heal-name.c | 23 +++++++++++----------- + xlators/cluster/afr/src/afr-self-heal.h | 5 +++-- + xlators/cluster/afr/src/afr-self-heald.c | 2 +- + 6 files changed, 42 insertions(+), 35 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 6f2da11..416012c 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -2366,7 +2366,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) + /* If we were called from glfsheal and there is still a gfid + * mismatch, succeed the lookup and let glfsheal print the + * response via gfid-heal-msg.*/ +- if (!dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", ++ if (!dict_get_str_sizen(local->xattr_rsp, "gfid-heal-msg", + &gfid_heal_msg)) + goto cant_interpret; + +@@ -2421,7 +2421,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) + goto error; + } + +- ret = dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg); ++ ret = dict_get_str_sizen(local->xattr_rsp, "gfid-heal-msg", &gfid_heal_msg); + if (!ret) { + ret = dict_set_str_sizen(local->replies[read_subvol].xdata, + "gfid-heal-msg", gfid_heal_msg); +@@ -2768,9 +2768,12 @@ afr_lookup_selfheal_wrap(void *opaque) + local = frame->local; + this = frame->this; + loc_pargfid(&local->loc, pargfid); ++ if (!local->xattr_rsp) ++ local->xattr_rsp = dict_new(); + + ret = afr_selfheal_name(frame->this, pargfid, local->loc.name, +- &local->cont.lookup.gfid_req, local->xattr_req); ++ &local->cont.lookup.gfid_req, local->xattr_req, ++ local->xattr_rsp); + if (ret == -EIO) + goto unwind; + +@@ -2786,7 +2789,8 @@ afr_lookup_selfheal_wrap(void *opaque) + return 0; + + unwind: +- AFR_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); ++ AFR_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, local->xattr_rsp, ++ NULL); + return 0; + } + +diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c +index 0a8a7fd..0954d2c 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-common.c ++++ b/xlators/cluster/afr/src/afr-self-heal-common.c +@@ -245,7 +245,8 @@ int + afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + inode_t *inode, uuid_t pargfid, const char *bname, + int src_idx, int child_idx, +- unsigned char *locked_on, int *src, dict_t *xdata) ++ unsigned char *locked_on, int *src, dict_t *req, ++ dict_t *rsp) + { + afr_private_t *priv = NULL; + char g1[64] = { +@@ -266,8 +267,8 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "All the bricks should be up to resolve the gfid split " + "barin"); +- if (xdata) { +- ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", ++ if (rsp) { ++ ret = dict_set_sizen_str_sizen(rsp, "gfid-heal-msg", + 
SALL_BRICKS_UP_TO_RESOLVE); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, +@@ -277,8 +278,8 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + goto out; + } + +- if (xdata) { +- ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op); ++ if (req) { ++ ret = dict_get_int32_sizen(req, "heal-op", &heal_op); + if (ret) + goto fav_child; + } else { +@@ -292,8 +293,8 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SNO_BIGGER_FILE); +- if (xdata) { +- ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", ++ if (rsp) { ++ ret = dict_set_sizen_str_sizen(rsp, "gfid-heal-msg", + SNO_BIGGER_FILE); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, +@@ -310,8 +311,8 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SNO_DIFF_IN_MTIME); +- if (xdata) { +- ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", ++ if (rsp) { ++ ret = dict_set_sizen_str_sizen(rsp, "gfid-heal-msg", + SNO_DIFF_IN_MTIME); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, +@@ -323,7 +324,7 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + break; + + case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK: +- ret = dict_get_str_sizen(xdata, "child-name", &src_brick); ++ ret = dict_get_str_sizen(req, "child-name", &src_brick); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Error getting the source " +@@ -335,8 +336,8 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SERROR_GETTING_SRC_BRICK); +- if (xdata) { +- ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", ++ if (rsp) { ++ ret = dict_set_sizen_str_sizen(rsp, "gfid-heal-msg", + SERROR_GETTING_SRC_BRICK); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, +@@ -400,7 +401,7 @@ out: + uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx, + priv->children[src_idx]->name, src_idx, + uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2)); +- return -1; ++ return -EIO; + } + return 0; + } +diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c +index 20b07dd..a17dd93 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-entry.c ++++ b/xlators/cluster/afr/src/afr-self-heal-entry.c +@@ -399,7 +399,7 @@ afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this, + (ia_type == replies[i].poststat.ia_type)) { + ret = afr_gfid_split_brain_source(this, replies, inode, pargfid, + bname, src_idx, i, locked_on, src, +- NULL); ++ NULL, NULL); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Skipping conservative merge on the " +@@ -474,7 +474,7 @@ __afr_selfheal_merge_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + return ret; + + /* In case of type mismatch / unable to resolve gfid mismatch on the +- * entry, return -1.*/ ++ * entry, return -EIO.*/ + ret = afr_selfheal_detect_gfid_and_type_mismatch( + this, replies, inode, fd->inode->gfid, name, source, locked_on, &src); + +@@ -905,7 +905,7 @@ afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd, + break; + } + +- if (ret == -1) { ++ if (ret == -EIO) { + /* gfid or type mismatch. 
*/ + mismatch = _gf_true; + ret = 0; +@@ -1072,7 +1072,7 @@ afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + else + ret = afr_selfheal_entry_do_subvol(frame, this, fd, i); + +- if (ret == -1) { ++ if (ret == -EIO) { + /* gfid or type mismatch. */ + mismatch = _gf_true; + ret = 0; +diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c +index 51e3d8c..9ec2066 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-name.c ++++ b/xlators/cluster/afr/src/afr-self-heal-name.c +@@ -217,7 +217,8 @@ afr_selfheal_name_gfid_mismatch_check(xlator_t *this, struct afr_reply *replies, + int source, unsigned char *sources, + int *gfid_idx, uuid_t pargfid, + const char *bname, inode_t *inode, +- unsigned char *locked_on, dict_t *xdata) ++ unsigned char *locked_on, dict_t *req, ++ dict_t *rsp) + { + int i = 0; + int gfid_idx_iter = -1; +@@ -245,11 +246,11 @@ afr_selfheal_name_gfid_mismatch_check(xlator_t *this, struct afr_reply *replies, + if (sources[i] || source == -1) { + if ((sources[gfid_idx_iter] || source == -1) && + gf_uuid_compare(gfid, gfid1)) { +- ret = afr_gfid_split_brain_source(this, replies, inode, pargfid, +- bname, gfid_idx_iter, i, +- locked_on, gfid_idx, xdata); ++ ret = afr_gfid_split_brain_source( ++ this, replies, inode, pargfid, bname, gfid_idx_iter, i, ++ locked_on, gfid_idx, req, rsp); + if (!ret && *gfid_idx >= 0) { +- ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", ++ ret = dict_set_sizen_str_sizen(rsp, "gfid-heal-msg", + "GFID split-brain resolved"); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, +@@ -303,7 +304,7 @@ __afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, int source, + unsigned char *locked_on, struct afr_reply *replies, +- void *gfid_req, dict_t *xdata) ++ void *gfid_req, dict_t *req, dict_t *rsp) + { + int gfid_idx = -1; + int ret = -1; +@@ -333,7 +334,7 @@ __afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, + + ret = afr_selfheal_name_gfid_mismatch_check(this, replies, source, sources, + &gfid_idx, pargfid, bname, +- inode, locked_on, xdata); ++ inode, locked_on, req, rsp); + if (ret) + return ret; + +@@ -450,7 +451,7 @@ out: + int + afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, void *gfid_req, +- dict_t *xdata) ++ dict_t *req, dict_t *rsp) + { + afr_private_t *priv = NULL; + unsigned char *sources = NULL; +@@ -505,7 +506,7 @@ afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, + + ret = __afr_selfheal_name_do(frame, this, parent, pargfid, bname, inode, + sources, sinks, healed_sinks, source, +- locked_on, replies, gfid_req, xdata); ++ locked_on, replies, gfid_req, req, rsp); + } + unlock: + afr_selfheal_unentrylk(frame, this, parent, this->name, bname, locked_on, +@@ -578,7 +579,7 @@ afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this, + + int + afr_selfheal_name(xlator_t *this, uuid_t pargfid, const char *bname, +- void *gfid_req, dict_t *xdata) ++ void *gfid_req, dict_t *req, dict_t *rsp) + { + inode_t *parent = NULL; + call_frame_t *frame = NULL; +@@ -600,7 +601,7 @@ afr_selfheal_name(xlator_t *this, uuid_t pargfid, const char *bname, + + if (need_heal) { + ret = afr_selfheal_name_do(frame, this, parent, pargfid, bname, +- gfid_req, xdata); ++ gfid_req, req, rsp); + if (ret) + goto out; + } +diff --git 
a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h +index c8dc384..6b0bf69 100644 +--- a/xlators/cluster/afr/src/afr-self-heal.h ++++ b/xlators/cluster/afr/src/afr-self-heal.h +@@ -127,7 +127,7 @@ afr_throttled_selfheal(call_frame_t *frame, xlator_t *this); + + int + afr_selfheal_name(xlator_t *this, uuid_t gfid, const char *name, void *gfid_req, +- dict_t *xdata); ++ dict_t *req, dict_t *rsp); + + int + afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd); +@@ -357,7 +357,8 @@ int + afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + inode_t *inode, uuid_t pargfid, const char *bname, + int src_idx, int child_idx, +- unsigned char *locked_on, int *src, dict_t *xdata); ++ unsigned char *locked_on, int *src, dict_t *req, ++ dict_t *rsp); + int + afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources, + unsigned char *sinks, +diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c +index 939a135..18aed93 100644 +--- a/xlators/cluster/afr/src/afr-self-heald.c ++++ b/xlators/cluster/afr/src/afr-self-heald.c +@@ -295,7 +295,7 @@ afr_shd_selfheal_name(struct subvol_healer *healer, int child, uuid_t parent, + { + int ret = -1; + +- ret = afr_selfheal_name(THIS, parent, bname, NULL, NULL); ++ ret = afr_selfheal_name(THIS, parent, bname, NULL, NULL, NULL); + + return ret; + } +-- +1.8.3.1 + diff --git a/SOURCES/0517-gfapi-glfs_h_creat_open-new-API-to-create-handle-and.patch b/SOURCES/0517-gfapi-glfs_h_creat_open-new-API-to-create-handle-and.patch new file mode 100644 index 0000000..bc1b263 --- /dev/null +++ b/SOURCES/0517-gfapi-glfs_h_creat_open-new-API-to-create-handle-and.patch @@ -0,0 +1,388 @@ +From da75c2857fd8b173d47fb7fc3b925ffd14105f64 Mon Sep 17 00:00:00 2001 +From: "Kaleb S. KEITHLEY" +Date: Wed, 23 Dec 2020 07:39:13 -0500 +Subject: [PATCH 517/517] gfapi: 'glfs_h_creat_open' - new API to create handle + and open fd + +Right now we have two separate APIs, one +- 'glfs_h_creat_handle' to create handle & another +- 'glfs_h_open' to create a glfd to return to application + +Having two separate routines can result in access errors +while trying to create and write into a read-only file. + +Since a fd is opened even during file/directory creation, +introducing a new API to make these two operations atomic i.e, +which can create both handle & fd and pass them to application + +This is backport of below mainline patch - +- https://review.gluster.org/#/c/glusterfs/+/23448/ +- bz#1753569 + +> Signed-off-by: Soumya Koduri +> Change-Id: Ibf513fcfcdad175f4d7eb6fa7a61b8feec6d33b5 +> release-6: commit 5a2af2fd06356f6fc79d591c352caffd4c511c9e +> master: commit 41a0f2aa755ec7162facd30209f2fa3f40308766 + +BUG: 1910119 +Change-Id: Ib397dbe82a6928d8f24251809d30febddd007bfc +Signed-off-by: Kaleb S. 
KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/222083 +Reviewed-by: Soumya Koduri +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + api/src/gfapi.aliases | 1 + + api/src/gfapi.map | 5 ++ + api/src/glfs-handleops.c | 135 ++++++++++++++++++++++++++++++++++ + api/src/glfs-handles.h | 5 ++ + tests/basic/gfapi/glfs_h_creat_open.c | 118 +++++++++++++++++++++++++++++ + tests/basic/gfapi/glfs_h_creat_open.t | 27 +++++++ + 6 files changed, 291 insertions(+) + create mode 100644 tests/basic/gfapi/glfs_h_creat_open.c + create mode 100755 tests/basic/gfapi/glfs_h_creat_open.t + +diff --git a/api/src/gfapi.aliases b/api/src/gfapi.aliases +index 692ae13..3d3415c 100644 +--- a/api/src/gfapi.aliases ++++ b/api/src/gfapi.aliases +@@ -197,3 +197,4 @@ _pub_glfs_fsetattr _glfs_fsetattr$GFAPI_6.0 + _pub_glfs_setattr _glfs_setattr$GFAPI_6.0 + + _pub_glfs_set_statedump_path _glfs_set_statedump_path@GFAPI_6.4 ++_pub_glfs_h_creat_open _glfs_h_creat_open@GFAPI_6.6 +diff --git a/api/src/gfapi.map b/api/src/gfapi.map +index df65837..614f3f6 100644 +--- a/api/src/gfapi.map ++++ b/api/src/gfapi.map +@@ -276,3 +276,8 @@ GFAPI_6.4 { + global: + glfs_set_statedump_path; + } GFAPI_PRIVATE_6.1; ++ ++GFAPI_6.6 { ++ global: ++ glfs_h_creat_open; ++} GFAPI_6.4; +diff --git a/api/src/glfs-handleops.c b/api/src/glfs-handleops.c +index d4e1545..7b8ff14 100644 +--- a/api/src/glfs-handleops.c ++++ b/api/src/glfs-handleops.c +@@ -843,6 +843,141 @@ invalid_fs: + GFAPI_SYMVER_PUBLIC_DEFAULT(glfs_h_creat, 3.4.2); + + struct glfs_object * ++pub_glfs_h_creat_open(struct glfs *fs, struct glfs_object *parent, ++ const char *path, int flags, mode_t mode, ++ struct stat *stat, struct glfs_fd **out_fd) ++{ ++ int ret = -1; ++ struct glfs_fd *glfd = NULL; ++ xlator_t *subvol = NULL; ++ inode_t *inode = NULL; ++ loc_t loc = { ++ 0, ++ }; ++ struct iatt iatt = { ++ 0, ++ }; ++ uuid_t gfid; ++ dict_t *xattr_req = NULL; ++ struct glfs_object *object = NULL; ++ dict_t *fop_attr = NULL; ++ ++ /* validate in args */ ++ if ((fs == NULL) || (parent == NULL) || (path == NULL) || ++ (out_fd == NULL)) { ++ errno = EINVAL; ++ return NULL; ++ } ++ ++ DECLARE_OLD_THIS; ++ __GLFS_ENTRY_VALIDATE_FS(fs, invalid_fs); ++ ++ /* get the active volume */ ++ subvol = glfs_active_subvol(fs); ++ if (!subvol) { ++ ret = -1; ++ goto out; ++ } ++ ++ /* get/refresh the in arg objects inode in correlation to the xlator */ ++ inode = glfs_resolve_inode(fs, subvol, parent); ++ if (!inode) { ++ ret = -1; ++ goto out; ++ } ++ ++ xattr_req = dict_new(); ++ if (!xattr_req) { ++ ret = -1; ++ errno = ENOMEM; ++ goto out; ++ } ++ ++ gf_uuid_generate(gfid); ++ ret = dict_set_gfuuid(xattr_req, "gfid-req", gfid, true); ++ if (ret) { ++ ret = -1; ++ errno = ENOMEM; ++ goto out; ++ } ++ ++ GLFS_LOC_FILL_PINODE(inode, loc, ret, errno, out, path); ++ ++ glfd = glfs_fd_new(fs); ++ if (!glfd) { ++ ret = -1; ++ errno = ENOMEM; ++ goto out; ++ } ++ ++ glfd->fd = fd_create(loc.inode, getpid()); ++ if (!glfd->fd) { ++ ret = -1; ++ errno = ENOMEM; ++ goto out; ++ } ++ glfd->fd->flags = flags; ++ ++ ret = get_fop_attr_thrd_key(&fop_attr); ++ if (ret) ++ gf_msg_debug("gfapi", 0, "Getting leaseid from thread failed"); ++ ++ /* fop/op */ ++ ret = syncop_create(subvol, &loc, flags, mode, glfd->fd, &iatt, xattr_req, ++ NULL); ++ DECODE_SYNCOP_ERR(ret); ++ ++ /* populate out args */ ++ if (ret == 0) { ++ glfd->fd->flags = flags; ++ ++ ret = glfs_loc_link(&loc, &iatt); ++ if (ret != 0) { ++ goto out; ++ } ++ ++ if (stat) ++ glfs_iatt_to_stat(fs, &iatt, 
stat); ++ ++ ret = glfs_create_object(&loc, &object); ++ } ++ ++out: ++ if (ret && object != NULL) { ++ /* Release the held reference */ ++ glfs_h_close(object); ++ object = NULL; ++ } ++ ++ loc_wipe(&loc); ++ ++ if (inode) ++ inode_unref(inode); ++ ++ if (fop_attr) ++ dict_unref(fop_attr); ++ ++ if (xattr_req) ++ dict_unref(xattr_req); ++ ++ if (ret && glfd) { ++ GF_REF_PUT(glfd); ++ } else if (glfd) { ++ glfd_set_state_bind(glfd); ++ *out_fd = glfd; ++ } ++ ++ glfs_subvol_done(fs, subvol); ++ ++ __GLFS_EXIT_FS; ++ ++invalid_fs: ++ return object; ++} ++ ++GFAPI_SYMVER_PUBLIC_DEFAULT(glfs_h_creat_open, 6.6); ++ ++struct glfs_object * + pub_glfs_h_mkdir(struct glfs *fs, struct glfs_object *parent, const char *path, + mode_t mode, struct stat *stat) + { +diff --git a/api/src/glfs-handles.h b/api/src/glfs-handles.h +index f7e6a06..4d039b9 100644 +--- a/api/src/glfs-handles.h ++++ b/api/src/glfs-handles.h +@@ -250,6 +250,11 @@ int + glfs_h_access(glfs_t *fs, glfs_object_t *object, int mask) __THROW + GFAPI_PUBLIC(glfs_h_access, 3.6.0); + ++struct glfs_object * ++glfs_h_creat_open(struct glfs *fs, struct glfs_object *parent, const char *path, ++ int flags, mode_t mode, struct stat *stat, ++ struct glfs_fd **out_fd) __THROW ++ GFAPI_PUBLIC(glfs_h_creat_open, 6.6); + /* + SYNOPSIS + +diff --git a/tests/basic/gfapi/glfs_h_creat_open.c b/tests/basic/gfapi/glfs_h_creat_open.c +new file mode 100644 +index 0000000..7672561 +--- /dev/null ++++ b/tests/basic/gfapi/glfs_h_creat_open.c +@@ -0,0 +1,118 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define LOG_ERR(func, ret) \ ++ do { \ ++ if (ret != 0) { \ ++ fprintf(stderr, "%s : returned error ret(%d), errno(%d)\n", func, \ ++ ret, errno); \ ++ exit(1); \ ++ } else { \ ++ fprintf(stderr, "%s : returned %d\n", func, ret); \ ++ } \ ++ } while (0) ++#define LOG_IF_NO_ERR(func, ret) \ ++ do { \ ++ if (ret == 0) { \ ++ fprintf(stderr, "%s : hasn't returned error %d\n", func, ret); \ ++ exit(1); \ ++ } else { \ ++ fprintf(stderr, "%s : returned %d\n", func, ret); \ ++ } \ ++ } while (0) ++int ++main(int argc, char *argv[]) ++{ ++ glfs_t *fs = NULL; ++ int ret = 0; ++ struct glfs_object *root = NULL, *leaf = NULL; ++ glfs_fd_t *fd = NULL; ++ char *filename = "/ro-file"; ++ struct stat sb = { ++ 0, ++ }; ++ char *logfile = NULL; ++ char *volname = NULL; ++ char *hostname = NULL; ++ char buf[32] = "abcdefghijklmnopqrstuvwxyz012345"; ++ ++ fprintf(stderr, "Starting glfs_h_creat_open\n"); ++ ++ if (argc != 4) { ++ fprintf(stderr, "Invalid argument\n"); ++ exit(1); ++ } ++ ++ hostname = argv[1]; ++ volname = argv[2]; ++ logfile = argv[3]; ++ ++ fs = glfs_new(volname); ++ if (!fs) { ++ fprintf(stderr, "glfs_new: returned NULL\n"); ++ return 1; ++ } ++ ++ ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007); ++ LOG_ERR("glfs_set_volfile_server", ret); ++ ++ ret = glfs_set_logging(fs, logfile, 7); ++ LOG_ERR("glfs_set_logging", ret); ++ ++ ret = glfs_init(fs); ++ LOG_ERR("glfs_init", ret); ++ ++ sleep(2); ++ root = glfs_h_lookupat(fs, NULL, "/", &sb, 0); ++ if (!root) { ++ ret = -1; ++ LOG_ERR("glfs_h_lookupat root", ret); ++ } ++ leaf = glfs_h_lookupat(fs, root, filename, &sb, 0); ++ if (!leaf) { ++ ret = -1; ++ LOG_IF_NO_ERR("glfs_h_lookupat leaf", ret); ++ } ++ ++ leaf = glfs_h_creat_open(fs, root, filename, O_RDONLY, 00444, &sb, &fd); ++ if (!leaf || !fd) { ++ ret = -1; ++ LOG_ERR("glfs_h_creat leaf", ret); ++ } ++ fprintf(stderr, "glfs_h_create_open leaf - %p\n", leaf); ++ ++ ret = 
glfs_write(fd, buf, 32, 0); ++ if (ret < 0) { ++ fprintf(stderr, "glfs_write: error writing to file %s, %s\n", filename, ++ strerror(errno)); ++ goto out; ++ } ++ ++ ret = glfs_h_getattrs(fs, leaf, &sb); ++ LOG_ERR("glfs_h_getattrs", ret); ++ ++ if (sb.st_size != 32) { ++ fprintf(stderr, "glfs_write: post size mismatch\n"); ++ goto out; ++ } ++ ++ fprintf(stderr, "Successfully opened and written to a read-only file \n"); ++out: ++ if (fd) ++ glfs_close(fd); ++ ++ ret = glfs_fini(fs); ++ LOG_ERR("glfs_fini", ret); ++ ++ fprintf(stderr, "End of libgfapi_fini\n"); ++ ++ exit(0); ++} +diff --git a/tests/basic/gfapi/glfs_h_creat_open.t b/tests/basic/gfapi/glfs_h_creat_open.t +new file mode 100755 +index 0000000..f24ae73 +--- /dev/null ++++ b/tests/basic/gfapi/glfs_h_creat_open.t +@@ -0,0 +1,27 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup; ++ ++TEST glusterd ++ ++TEST $CLI volume create $V0 $H0:$B0/brick1; ++EXPECT 'Created' volinfo_field $V0 'Status'; ++ ++TEST $CLI volume start $V0; ++EXPECT 'Started' volinfo_field $V0 'Status'; ++ ++logdir=`gluster --print-logdir` ++ ++TEST build_tester $(dirname $0)/glfs_h_creat_open.c -lgfapi ++ ++TEST ./$(dirname $0)/glfs_h_creat_open $H0 $V0 $logdir/glfs.log ++ ++cleanup_tester $(dirname $0)/glfs_h_creat_open ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup; +-- +1.8.3.1 + diff --git a/SOURCES/0518-glusterd-Fix-for-shared-storage-in-ipv6-env.patch b/SOURCES/0518-glusterd-Fix-for-shared-storage-in-ipv6-env.patch new file mode 100644 index 0000000..00d29b9 --- /dev/null +++ b/SOURCES/0518-glusterd-Fix-for-shared-storage-in-ipv6-env.patch @@ -0,0 +1,41 @@ +From 818025e467ea98b32a855c92ba6aef6e172e029f Mon Sep 17 00:00:00 2001 +From: Nikhil Ladha +Date: Fri, 8 Jan 2021 13:12:46 +0530 +Subject: [PATCH 518/526] glusterd: Fix for shared storage in ipv6 env + +Issue: +Mounting shared storage volume was failing in ipv6 env if the hostnames were FQDNs. +The brickname for the volume was being cut off, as a result, volume creation was failing. 
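+
+For illustration only (not part of the original fix), here is how the
+one-character change behaves on a hypothetical peer-status line carrying
+an IPv6 address: the colon-delimited cut truncates the value, while the
+space-delimited cut keeps it intact:
+
+    $ line="Hostname: fd00::10"
+    $ echo "$line" | cut -d ':' -f 2 | xargs   # old code: prints "fd00"
+    $ echo "$line" | cut -d ' ' -f 2 | xargs   # fixed code: prints "fd00::10"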
+
+>Change-Id: Ib38993724c709b35b603f9ac666630c50c932c3e
+>Fixes: #1406
+>Signed-off-by: nik-redhat
+Upstream patch: https://github.com/gluster/glusterfs/pull/1972
+
+BUG: 1856574
+
+Change-Id: Ib38993724c709b35b603f9ac666630c50c932c3e
+Signed-off-by: nik-redhat
+Reviewed-on: https://code.engineering.redhat.com/gerrit/223248
+Tested-by: RHGS Build Bot
+Reviewed-by: Ravishankar Narayanankutty
+---
+ extras/hook-scripts/set/post/S32gluster_enable_shared_storage.sh | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/extras/hook-scripts/set/post/S32gluster_enable_shared_storage.sh b/extras/hook-scripts/set/post/S32gluster_enable_shared_storage.sh
+index 9597503..e9261af 100755
+--- a/extras/hook-scripts/set/post/S32gluster_enable_shared_storage.sh
++++ b/extras/hook-scripts/set/post/S32gluster_enable_shared_storage.sh
+@@ -46,7 +46,7 @@ do
+ 
+     key=`echo $line | cut -d ':' -f 1`
+     if [ "$key" == "Hostname" ]; then
+-        hostname=`echo $line | cut -d ':' -f 2 | xargs`
++        hostname=`echo $line | cut -d ' ' -f 2 | xargs`
+     fi
+ 
+     if [ "$key" == "State" ]; then
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0519-glusterfs-events-Fix-incorrect-attribute-access-2002.patch b/SOURCES/0519-glusterfs-events-Fix-incorrect-attribute-access-2002.patch
new file mode 100644
index 0000000..f37acfd
--- /dev/null
+++ b/SOURCES/0519-glusterfs-events-Fix-incorrect-attribute-access-2002.patch
@@ -0,0 +1,58 @@
+From 6ed227367b6eb7d6d7afde3859ad0a711a3adf36 Mon Sep 17 00:00:00 2001
+From: Leela Venkaiah G
+Date: Wed, 13 Jan 2021 16:02:25 +0530
+Subject: [PATCH 519/526] glusterfs-events: Fix incorrect attribute access
+ (#2002)
+
+Issue: When GlusterCmdException is raised, the current code tries to
+access a message attribute which doesn't exist, resulting in a
+malformed error string on failed operations
+
+Code Change: Replace `message` with `args[0]`
+
+>Fixes: #2001
+>Change-Id: I65c9f0ee79310937a384025b8d454acda154e4bb
+>Signed-off-by: Leela Venkaiah G
+Upstream patch: https://github.com/gluster/glusterfs/pull/2002
+
+BUG: 1600459
+Change-Id: I65c9f0ee79310937a384025b8d454acda154e4bb
+Signed-off-by: srijan-sivakumar
+Reviewed-on: https://code.engineering.redhat.com/gerrit/223584
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ events/src/peer_eventsapi.py | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/events/src/peer_eventsapi.py b/events/src/peer_eventsapi.py
+index 26b77a0..c388da4 100644
+--- a/events/src/peer_eventsapi.py
++++ b/events/src/peer_eventsapi.py
+@@ -174,9 +174,9 @@ def sync_to_peers(args):
+             sync_file_to_peers(WEBHOOKS_FILE_TO_SYNC)
+         except GlusterCmdException as e:
+             # Print stdout if stderr is empty
+-            errmsg = e.message[2] if e.message[2] else e.message[1]
++            errmsg = e.args[0][2] if e.args[0][2] else e.args[0][1]
+             handle_output_error("Failed to sync Webhooks file: [Error: {0}]"
+-                                "{1}".format(e.message[0], errmsg),
++                                "{1}".format(e.args[0][0], errmsg),
+                                 errcode=ERROR_WEBHOOK_SYNC_FAILED,
+                                 json_output=args.json)
+ 
+@@ -185,9 +185,9 @@ def sync_to_peers(args):
+             sync_file_to_peers(CUSTOM_CONFIG_FILE_TO_SYNC)
+         except GlusterCmdException as e:
+             # Print stdout if stderr is empty
+-            errmsg = e.message[2] if e.message[2] else e.message[1]
++            errmsg = e.args[0][2] if e.args[0][2] else e.args[0][1]
+             handle_output_error("Failed to sync Config file: [Error: {0}]"
+-                                "{1}".format(e.message[0], errmsg),
++                                "{1}".format(e.args[0][0], errmsg),
+                                 errcode=ERROR_CONFIG_SYNC_FAILED,
+                                 json_output=args.json)
+ 
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0520-performance-open-behind-seek-fop-should-open_and_res.patch b/SOURCES/0520-performance-open-behind-seek-fop-should-open_and_res.patch
new file mode 100644
index 0000000..c46a9ca
--- /dev/null
+++ b/SOURCES/0520-performance-open-behind-seek-fop-should-open_and_res.patch
@@ -0,0 +1,70 @@
+From a3fd2c9d85bbd23131c985599d9c9d74f66f32d2 Mon Sep 17 00:00:00 2001
+From: Pranith Kumar K
+Date: Thu, 10 Oct 2019 10:50:59 +0530
+Subject: [PATCH 520/526] performance/open-behind: seek fop should
+ open_and_resume
+
+Upstream patch:
+> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/23530
+> fixes: bz#1760187
+> Change-Id: I4c6ad13194d4fc5c7705e35bf9a27fce504b51f9
+> Signed-off-by: Pranith Kumar K
+
+BUG: 1830713
+Change-Id: I4c6ad13194d4fc5c7705e35bf9a27fce504b51f9
+Signed-off-by: Pranith Kumar K
+Reviewed-on: https://code.engineering.redhat.com/gerrit/224484
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/performance/open-behind/src/open-behind.c | 27 +++++++++++++++++++++++
+ 1 file changed, 27 insertions(+)
+
+diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c
+index 268c717..3ee3c40 100644
+--- a/xlators/performance/open-behind/src/open-behind.c
++++ b/xlators/performance/open-behind/src/open-behind.c
+@@ -711,6 +711,32 @@ err:
+ }
+ 
+ int
++ob_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
++        gf_seek_what_t what, dict_t *xdata)
++{
++    call_stub_t *stub = NULL;
++    fd_t *wind_fd = NULL;
++
++    wind_fd = ob_get_wind_fd(this, fd, NULL);
++
++    stub = fop_seek_stub(frame, default_seek_resume, wind_fd, offset, what,
++                         xdata);
++
++    fd_unref(wind_fd);
++
++    if (!stub)
++        goto err;
++
++    open_and_resume(this, wind_fd, stub);
++
++    return 0;
++err:
++    STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, 0, 0);
++
++    return 0;
++}
++
++int
+ ob_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+ {
+     call_stub_t *stub = NULL;
+@@ -1276,6 +1302,7 @@ struct xlator_fops fops = {
+     .flush = ob_flush,
+     .fsync = ob_fsync,
+     .fstat = ob_fstat,
++    .seek = ob_seek,
+     .ftruncate = ob_ftruncate,
+     .fsetxattr = ob_fsetxattr,
+     .setxattr = ob_setxattr,
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0521-open-behind-fix-missing-fd-reference.patch b/SOURCES/0521-open-behind-fix-missing-fd-reference.patch
new file mode 100644
index 0000000..8e18af8
--- /dev/null
+++ b/SOURCES/0521-open-behind-fix-missing-fd-reference.patch
@@ -0,0 +1,121 @@
+From 211d0f7dbb4991b2191925973222ebc79f010e84 Mon Sep 17 00:00:00 2001
+From: Xavi Hernandez
+Date: Sun, 8 Mar 2020 18:36:45 +0100
+Subject: [PATCH 521/526] open-behind: fix missing fd reference
+
+Open behind was not keeping any reference on fd's pending to be
+opened. This makes it possible that a concurrent close and an entry
+fop (unlink, rename, ...) caused destruction of the fd while it
+was still being used.
+ +Upstream patch: +> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24204 +> Change-Id: Ie9e992902cf2cd7be4af1f8b4e57af9bd6afd8e9 +> Fixes: bz#1810934 +> Signed-off-by: Xavi Hernandez + +Change-Id: Ie9e992902cf2cd7be4af1f8b4e57af9bd6afd8e9 +BUG: 1830713 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/224485 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/performance/open-behind/src/open-behind.c | 27 ++++++++++++++--------- + 1 file changed, 16 insertions(+), 11 deletions(-) + +diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c +index 3ee3c40..dd2f2fd 100644 +--- a/xlators/performance/open-behind/src/open-behind.c ++++ b/xlators/performance/open-behind/src/open-behind.c +@@ -206,8 +206,13 @@ ob_fd_free(ob_fd_t *ob_fd) + if (ob_fd->xdata) + dict_unref(ob_fd->xdata); + +- if (ob_fd->open_frame) ++ if (ob_fd->open_frame) { ++ /* If we sill have a frame it means that background open has never ++ * been triggered. We need to release the pending reference. */ ++ fd_unref(ob_fd->fd); ++ + STACK_DESTROY(ob_fd->open_frame->root); ++ } + + GF_FREE(ob_fd); + } +@@ -297,6 +302,7 @@ ob_wake_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + call_resume(stub); + } + ++ /* The background open is completed. We can release the 'fd' reference. */ + fd_unref(fd); + + STACK_DESTROY(frame->root); +@@ -331,7 +337,9 @@ ob_fd_wake(xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) + } + + if (frame) { +- frame->local = fd_ref(fd); ++ /* We don't need to take a reference here. We already have a reference ++ * while the open is pending. */ ++ frame->local = fd; + + STACK_WIND(frame, ob_wake_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, &ob_fd->loc, ob_fd->flags, fd, +@@ -345,15 +353,12 @@ void + ob_inode_wake(xlator_t *this, struct list_head *ob_fds) + { + ob_fd_t *ob_fd = NULL, *tmp = NULL; +- fd_t *fd = NULL; + + if (!list_empty(ob_fds)) { + list_for_each_entry_safe(ob_fd, tmp, ob_fds, ob_fds_on_inode) + { + ob_fd_wake(this, ob_fd->fd, ob_fd); +- fd = ob_fd->fd; + ob_fd_free(ob_fd); +- fd_unref(fd); + } + } + } +@@ -365,7 +370,7 @@ ob_fd_copy(ob_fd_t *src, ob_fd_t *dst) + if (!src || !dst) + goto out; + +- dst->fd = __fd_ref(src->fd); ++ dst->fd = src->fd; + dst->loc.inode = inode_ref(src->loc.inode); + gf_uuid_copy(dst->loc.gfid, src->loc.gfid); + dst->flags = src->flags; +@@ -509,7 +514,6 @@ ob_open_behind(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + + ob_fd->ob_inode = ob_inode; + +- /* don't do fd_ref, it'll cause leaks */ + ob_fd->fd = fd; + + ob_fd->open_frame = copy_frame(frame); +@@ -539,15 +543,16 @@ ob_open_behind(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + } + UNLOCK(&fd->inode->lock); + +- if (!open_in_progress && !unlinked) { +- fd_ref(fd); ++ /* We take a reference while the background open is pending or being ++ * processed. If we finally wind the request in the foreground, then ++ * ob_fd_free() will take care of this additional reference. 
*/ ++ fd_ref(fd); + ++ if (!open_in_progress && !unlinked) { + STACK_UNWIND_STRICT(open, frame, 0, 0, fd, xdata); + + if (!conf->lazy_open) + ob_fd_wake(this, fd, NULL); +- +- fd_unref(fd); + } else { + ob_fd_free(ob_fd); + STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this), +-- +1.8.3.1 + diff --git a/SOURCES/0522-lcov-improve-line-coverage.patch b/SOURCES/0522-lcov-improve-line-coverage.patch new file mode 100644 index 0000000..13ece12 --- /dev/null +++ b/SOURCES/0522-lcov-improve-line-coverage.patch @@ -0,0 +1,746 @@ +From 46e2bbd52d4427c1348fa38dcb5d2b5f125555f1 Mon Sep 17 00:00:00 2001 +From: Amar Tumballi +Date: Thu, 30 May 2019 15:25:01 +0530 +Subject: [PATCH 522/526] lcov: improve line coverage + +upcall: remove extra variable assignment and use just one + initialization. +open-behind: reduce the overall number of lines, in functions + not frequently called +selinux: reduce some lines in init failure cases + +Upstream patch: +> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/22789 +> updates: bz#1693692 +> Change-Id: I7c1de94f2ec76a5bfe1f48a9632879b18e5fbb95 +> Signed-off-by: Amar Tumballi + +BUG: 1830713 +Change-Id: I7c1de94f2ec76a5bfe1f48a9632879b18e5fbb95 +Signed-off-by: Amar Tumballi +Reviewed-on: https://code.engineering.redhat.com/gerrit/224486 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/selinux/src/selinux.c | 6 +- + xlators/features/upcall/src/upcall.c | 108 +++++++--------------- + xlators/performance/open-behind/src/open-behind.c | 58 ++++-------- + 3 files changed, 55 insertions(+), 117 deletions(-) + +diff --git a/xlators/features/selinux/src/selinux.c b/xlators/features/selinux/src/selinux.c +index 58b4c5d..e8e16cd 100644 +--- a/xlators/features/selinux/src/selinux.c ++++ b/xlators/features/selinux/src/selinux.c +@@ -234,7 +234,6 @@ init(xlator_t *this) + priv = GF_CALLOC(1, sizeof(*priv), gf_selinux_mt_selinux_priv_t); + if (!priv) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); +- ret = ENOMEM; + goto out; + } + +@@ -242,7 +241,6 @@ init(xlator_t *this) + + this->local_pool = mem_pool_new(selinux_priv_t, 64); + if (!this->local_pool) { +- ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SL_MSG_ENOMEM, + "Failed to create local_t's memory pool"); + goto out; +@@ -252,9 +250,7 @@ init(xlator_t *this) + ret = 0; + out: + if (ret) { +- if (priv) { +- GF_FREE(priv); +- } ++ GF_FREE(priv); + mem_pool_destroy(this->local_pool); + } + return ret; +diff --git a/xlators/features/upcall/src/upcall.c b/xlators/features/upcall/src/upcall.c +index 2583c50..0795f58 100644 +--- a/xlators/features/upcall/src/upcall.c ++++ b/xlators/features/upcall/src/upcall.c +@@ -57,14 +57,13 @@ static int32_t + up_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -111,14 +110,13 @@ up_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int count, off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -167,14 +165,13 @@ 
static int32_t + up_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -220,14 +217,13 @@ static int32_t + up_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -274,14 +270,13 @@ static int32_t + up_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -343,14 +338,13 @@ static int32_t + up_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -410,14 +404,13 @@ static int32_t + up_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, newloc, NULL, oldloc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -472,14 +465,13 @@ static int32_t + up_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, loc, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -531,14 +523,13 @@ static int32_t + up_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, newloc, NULL, oldloc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -592,14 +583,13 @@ static int32_t + up_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, loc, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -653,14 +643,13 @@ static int32_t + up_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *params) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ 
-717,15 +706,13 @@ static int32_t + up_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *params) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL); +- + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -774,14 +761,13 @@ out: + static int32_t + up_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -826,14 +812,13 @@ out: + static int32_t + up_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -852,14 +837,13 @@ err: + static int32_t + up_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -879,14 +863,13 @@ static int32_t + up_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -932,14 +915,13 @@ static int32_t + up_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -986,14 +968,13 @@ static int32_t + up_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1047,14 +1028,13 @@ static int32_t + up_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1110,14 +1090,13 @@ static int32_t + up_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ 
-1164,14 +1143,13 @@ static int32_t + up_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1216,14 +1194,13 @@ out: + static int32_t + up_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1270,14 +1247,13 @@ static int32_t + up_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1334,14 +1310,13 @@ static int32_t + up_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1361,14 +1336,13 @@ static int32_t + up_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1415,14 +1389,13 @@ static int32_t + up_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1470,14 +1443,13 @@ static int32_t + up_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1524,14 +1496,13 @@ static int + up_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1577,14 +1548,13 @@ static int32_t + up_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = 
ENOMEM; + goto err; + } + +@@ -1652,14 +1622,13 @@ static int32_t + up_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, loc, NULL, loc->inode, dict); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1727,14 +1696,13 @@ static int32_t + up_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, fd, fd->inode, dict); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1800,7 +1768,7 @@ static int32_t + up_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + dict_t *xattr = NULL; + +@@ -1808,13 +1776,11 @@ up_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + + xattr = dict_for_key_value(name, "", 1, _gf_true); + if (!xattr) { +- op_errno = ENOMEM; + goto err; + } + + local = upcall_local_init(frame, this, NULL, fd, fd->inode, xattr); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1885,7 +1851,7 @@ static int32_t + up_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + dict_t *xattr = NULL; + +@@ -1893,13 +1859,11 @@ up_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + + xattr = dict_for_key_value(name, "", 1, _gf_true); + if (!xattr) { +- op_errno = ENOMEM; + goto err; + } + + local = upcall_local_init(frame, this, loc, NULL, loc->inode, xattr); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -1950,14 +1914,13 @@ static int32_t + up_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +@@ -2000,14 +1963,13 @@ static int32_t + up_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) + { +- int32_t op_errno = -1; ++ int32_t op_errno = ENOMEM; + upcall_local_t *local = NULL; + + EXIT_IF_UPCALL_OFF(this, out); + + local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL); + if (!local) { +- op_errno = ENOMEM; + goto err; + } + +diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c +index dd2f2fd..cbe89ec 100644 +--- a/xlators/performance/open-behind/src/open-behind.c ++++ b/xlators/performance/open-behind/src/open-behind.c +@@ -581,7 +581,7 @@ ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + { + fd_t *old_fd = NULL; + int ret = -1; +- int op_errno = 0; ++ int op_errno = ENOMEM; + call_stub_t *stub = NULL; + + old_fd = fd_lookup(fd->inode, 0); +@@ -589,7 +589,6 @@ ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + /* open-behind only when this is the first FD */ + stub = fop_open_stub(frame, default_open_resume, loc, flags, fd, xdata); + if 
(!stub) { +- op_errno = ENOMEM; + fd_unref(old_fd); + goto err; + } +@@ -603,7 +602,6 @@ ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + + ret = ob_open_behind(frame, this, loc, flags, fd, xdata); + if (ret) { +- op_errno = ENOMEM; + goto err; + } + +@@ -900,18 +898,12 @@ int + ob_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int cmd, struct gf_flock *flock, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_finodelk_stub(frame, default_finodelk_resume, volume, fd, cmd, +- flock, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(finodelk, frame, -1, ENOMEM, 0); ++ call_stub_t *stub = fop_finodelk_stub(frame, default_finodelk_resume, ++ volume, fd, cmd, flock, xdata); ++ if (stub) ++ open_and_resume(this, fd, stub); ++ else ++ STACK_UNWIND_STRICT(finodelk, frame, -1, ENOMEM, 0); + + return 0; + } +@@ -921,18 +913,12 @@ ob_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fentrylk_stub(frame, default_fentrylk_resume, volume, fd, +- basename, cmd, type, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fentrylk, frame, -1, ENOMEM, 0); ++ call_stub_t *stub = fop_fentrylk_stub( ++ frame, default_fentrylk_resume, volume, fd, basename, cmd, type, xdata); ++ if (stub) ++ open_and_resume(this, fd, stub); ++ else ++ STACK_UNWIND_STRICT(fentrylk, frame, -1, ENOMEM, 0); + + return 0; + } +@@ -941,18 +927,12 @@ int + ob_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fxattrop_stub(frame, default_fxattrop_resume, fd, optype, xattr, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fxattrop, frame, -1, ENOMEM, 0, 0); ++ call_stub_t *stub = fop_fxattrop_stub(frame, default_fxattrop_resume, fd, ++ optype, xattr, xdata); ++ if (stub) ++ open_and_resume(this, fd, stub); ++ else ++ STACK_UNWIND_STRICT(fxattrop, frame, -1, ENOMEM, 0, 0); + + return 0; + } +-- +1.8.3.1 + diff --git a/SOURCES/0523-open-behind-rewrite-of-internal-logic.patch b/SOURCES/0523-open-behind-rewrite-of-internal-logic.patch new file mode 100644 index 0000000..621d5ae --- /dev/null +++ b/SOURCES/0523-open-behind-rewrite-of-internal-logic.patch @@ -0,0 +1,2720 @@ +From b924c8ca8a133fc9413c8ed1407e63f1658c7e79 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Tue, 12 May 2020 23:54:54 +0200 +Subject: [PATCH 523/526] open-behind: rewrite of internal logic + +There was a critical flaw in the previous implementation of open-behind. + +When an open is done in the background, it's necessary to take a +reference on the fd_t object because once we "fake" the open answer, +the fd could be destroyed. However as long as there's a reference, +the release function won't be called. So, if the application closes +the file descriptor without having actually opened it, there will +always remain at least 1 reference, causing a leak. + +To avoid this problem, the previous implementation didn't take a +reference on the fd_t, so there were races where the fd could be +destroyed while it was still in use. 
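+
+To illustrate the leak (a minimal standalone sketch, not part of this
+patch; fake_fd_t and its helpers are invented stand-ins for fd_t and
+its reference counting), note that a release callback driven by a
+reference counter only runs when the last reference is dropped, so the
+reference held for a never-triggered background open keeps the fd
+alive after the application has already closed it:
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    typedef struct {
+        int refs; /* stand-in for fd_t's reference counter */
+    } fake_fd_t;
+
+    static void fake_release(fake_fd_t *fd)
+    {
+        /* in GlusterFS this is where the release cbk would run */
+        printf("released\n");
+        free(fd);
+    }
+
+    static void fake_unref(fake_fd_t *fd)
+    {
+        if (--fd->refs == 0)
+            fake_release(fd); /* only called on the last unref */
+    }
+
+    int main(void)
+    {
+        fake_fd_t *fd = calloc(1, sizeof(*fd));
+        if (fd == NULL)
+            return 1;
+
+        fd->refs = 1;   /* the application's reference */
+        fd->refs++;     /* extra reference taken for the background open */
+
+        fake_unref(fd); /* application closes the file descriptor */
+
+        /* fake_release() never ran: the background-open reference is
+         * still held and, if the open never triggers, nothing will
+         * ever drop it. This is the leak described above. */
+        return 0;
+    }
+
+With the fdclose cbk introduced below, the xlator is notified at the
+moment the application closes the descriptor, so it can drop the
+pending-open state instead of leaking it.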
+ +To fix this, I've implemented a new xlator cbk that gets called from +fuse when the application closes a file descriptor. + +The whole logic of handling background opens has been simplified and +it's more efficient now. Only if the fop needs to be delayed until an +open completes, a stub is created. Otherwise no memory allocations are +needed. + +Correctly handling the close request while the open is still pending +has added a bit of complexity, but overall normal operation is simpler. + +Upstream patch: +> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/24451 +> Change-Id: I6376a5491368e0e1c283cc452849032636261592 +> Fixes: #1225 +> Signed-off-by: Xavi Hernandez + +BUG: 1830713 +Change-Id: I6376a5491368e0e1c283cc452849032636261592 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/224487 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/fd.c | 26 + + libglusterfs/src/glusterfs/fd.h | 3 + + libglusterfs/src/glusterfs/xlator.h | 4 + + libglusterfs/src/libglusterfs.sym | 1 + + tests/basic/open-behind/open-behind.t | 183 +++ + tests/basic/open-behind/tester-fd.c | 99 ++ + tests/basic/open-behind/tester.c | 444 +++++++ + tests/basic/open-behind/tester.h | 145 +++ + tests/bugs/glusterfs/bug-873962-spb.t | 1 + + xlators/mount/fuse/src/fuse-bridge.c | 2 + + .../open-behind/src/open-behind-messages.h | 6 +- + xlators/performance/open-behind/src/open-behind.c | 1302 ++++++++------------ + 12 files changed, 1393 insertions(+), 823 deletions(-) + create mode 100644 tests/basic/open-behind/open-behind.t + create mode 100644 tests/basic/open-behind/tester-fd.c + create mode 100644 tests/basic/open-behind/tester.c + create mode 100644 tests/basic/open-behind/tester.h + +diff --git a/libglusterfs/src/fd.c b/libglusterfs/src/fd.c +index 314546a..e4ec401 100644 +--- a/libglusterfs/src/fd.c ++++ b/libglusterfs/src/fd.c +@@ -501,6 +501,32 @@ out: + } + + void ++fd_close(fd_t *fd) ++{ ++ xlator_t *xl, *old_THIS; ++ ++ old_THIS = THIS; ++ ++ for (xl = fd->inode->table->xl->graph->first; xl != NULL; xl = xl->next) { ++ if (!xl->call_cleanup) { ++ THIS = xl; ++ ++ if (IA_ISDIR(fd->inode->ia_type)) { ++ if (xl->cbks->fdclosedir != NULL) { ++ xl->cbks->fdclosedir(xl, fd); ++ } ++ } else { ++ if (xl->cbks->fdclose != NULL) { ++ xl->cbks->fdclose(xl, fd); ++ } ++ } ++ } ++ } ++ ++ THIS = old_THIS; ++} ++ ++void + fd_unref(fd_t *fd) + { + int32_t refcount = 0; +diff --git a/libglusterfs/src/glusterfs/fd.h b/libglusterfs/src/glusterfs/fd.h +index cdbe289..4d157c4 100644 +--- a/libglusterfs/src/glusterfs/fd.h ++++ b/libglusterfs/src/glusterfs/fd.h +@@ -107,6 +107,9 @@ fd_ref(fd_t *fd); + void + fd_unref(fd_t *fd); + ++void ++fd_close(fd_t *fd); ++ + fd_t * + fd_create(struct _inode *inode, pid_t pid); + +diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h +index 8650ccc..273039a 100644 +--- a/libglusterfs/src/glusterfs/xlator.h ++++ b/libglusterfs/src/glusterfs/xlator.h +@@ -705,6 +705,8 @@ typedef size_t (*cbk_inodectx_size_t)(xlator_t *this, inode_t *inode); + + typedef size_t (*cbk_fdctx_size_t)(xlator_t *this, fd_t *fd); + ++typedef void (*cbk_fdclose_t)(xlator_t *this, fd_t *fd); ++ + struct xlator_cbks { + cbk_forget_t forget; + cbk_release_t release; +@@ -715,6 +717,8 @@ struct xlator_cbks { + cbk_ictxmerge_t ictxmerge; + cbk_inodectx_size_t ictxsize; + cbk_fdctx_size_t fdctxsize; ++ cbk_fdclose_t fdclose; ++ cbk_fdclose_t fdclosedir; + }; + + typedef int32_t 
(*dumpop_priv_t)(xlator_t *this); +diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym +index bc770e2..0a0862e 100644 +--- a/libglusterfs/src/libglusterfs.sym ++++ b/libglusterfs/src/libglusterfs.sym +@@ -456,6 +456,7 @@ event_unregister_close + fd_anonymous + fd_anonymous_with_flags + fd_bind ++fd_close + fd_create + fd_create_uint64 + __fd_ctx_del +diff --git a/tests/basic/open-behind/open-behind.t b/tests/basic/open-behind/open-behind.t +new file mode 100644 +index 0000000..5e865d6 +--- /dev/null ++++ b/tests/basic/open-behind/open-behind.t +@@ -0,0 +1,183 @@ ++#!/bin/bash ++ ++WD="$(dirname "${0}")" ++ ++. ${WD}/../../include.rc ++. ${WD}/../../volume.rc ++ ++function assign() { ++ local _assign_var="${1}" ++ local _assign_value="${2}" ++ ++ printf -v "${_assign_var}" "%s" "${_assign_value}" ++} ++ ++function pipe_create() { ++ local _pipe_create_var="${1}" ++ local _pipe_create_name ++ local _pipe_create_fd ++ ++ _pipe_create_name="$(mktemp -u)" ++ mkfifo "${_pipe_create_name}" ++ exec {_pipe_create_fd}<>"${_pipe_create_name}" ++ rm "${_pipe_create_name}" ++ ++ assign "${_pipe_create_var}" "${_pipe_create_fd}" ++} ++ ++function pipe_close() { ++ local _pipe_close_fd="${!1}" ++ ++ exec {_pipe_close_fd}>&- ++} ++ ++function tester_start() { ++ declare -ag tester ++ local tester_in ++ local tester_out ++ ++ pipe_create tester_in ++ pipe_create tester_out ++ ++ ${WD}/tester <&${tester_in} >&${tester_out} & ++ ++ tester=("$!" "${tester_in}" "${tester_out}") ++} ++ ++function tester_send() { ++ declare -ag tester ++ local tester_res ++ local tester_extra ++ ++ echo "${*}" >&${tester[1]} ++ ++ read -t 3 -u ${tester[2]} tester_res tester_extra ++ echo "${tester_res} ${tester_extra}" ++ if [[ "${tester_res}" == "OK" ]]; then ++ return 0 ++ fi ++ ++ return 1 ++} ++ ++function tester_stop() { ++ declare -ag tester ++ local tester_res ++ ++ tester_send "quit" ++ ++ tester_res=0 ++ if ! wait ${tester[0]}; then ++ tester_res=$? 
++ fi ++ ++ unset tester ++ ++ return ${tester_res} ++} ++ ++function count_open() { ++ local file="$(realpath "${B0}/${V0}/${1}")" ++ local count="0" ++ local inode ++ local ref ++ ++ inode="$(stat -c %i "${file}")" ++ ++ for fd in /proc/${BRICK_PID}/fd/*; do ++ ref="$(readlink "${fd}")" ++ if [[ "${ref}" == "${B0}/${V0}/"* ]]; then ++ if [[ "$(stat -c %i "${ref}")" == "${inode}" ]]; then ++ count="$((${count} + 1))" ++ fi ++ fi ++ done ++ ++ echo "${count}" ++} ++ ++cleanup ++ ++TEST build_tester ${WD}/tester.c ${WD}/tester-fd.c ++ ++TEST glusterd ++TEST pidof glusterd ++TEST ${CLI} volume create ${V0} ${H0}:${B0}/${V0} ++TEST ${CLI} volume set ${V0} flush-behind off ++TEST ${CLI} volume set ${V0} write-behind off ++TEST ${CLI} volume set ${V0} quick-read off ++TEST ${CLI} volume set ${V0} stat-prefetch on ++TEST ${CLI} volume set ${V0} io-cache off ++TEST ${CLI} volume set ${V0} open-behind on ++TEST ${CLI} volume set ${V0} lazy-open off ++TEST ${CLI} volume set ${V0} read-after-open off ++TEST ${CLI} volume start ${V0} ++ ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++BRICK_PID="$(get_brick_pid ${V0} ${H0} ${B0}/${V0})" ++ ++TEST touch "${M0}/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++TEST tester_start ++ ++TEST tester_send fd open 0 "${M0}/test" ++EXPECT_WITHIN 5 "1" count_open "/test" ++TEST tester_send fd close 0 ++EXPECT_WITHIN 5 "0" count_open "/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST ${CLI} volume set ${V0} lazy-open on ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++TEST tester_send fd open 0 "${M0}/test" ++sleep 2 ++EXPECT "0" count_open "/test" ++TEST tester_send fd write 0 "test" ++EXPECT "1" count_open "/test" ++TEST tester_send fd close 0 ++EXPECT_WITHIN 5 "0" count_open "/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++TEST tester_send fd open 0 "${M0}/test" ++EXPECT "0" count_open "/test" ++EXPECT "test" tester_send fd read 0 64 ++# Even though read-after-open is disabled, use-anonymous-fd is also disabled, ++# so reads need to open the file first. ++EXPECT "1" count_open "/test" ++TEST tester_send fd close 0 ++EXPECT "0" count_open "/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++TEST tester_send fd open 0 "${M0}/test" ++EXPECT "0" count_open "/test" ++TEST tester_send fd open 1 "${M0}/test" ++EXPECT "2" count_open "/test" ++TEST tester_send fd close 0 ++EXPECT_WITHIN 5 "1" count_open "/test" ++TEST tester_send fd close 1 ++EXPECT_WITHIN 5 "0" count_open "/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST ${CLI} volume set ${V0} read-after-open on ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++TEST tester_send fd open 0 "${M0}/test" ++EXPECT "0" count_open "/test" ++EXPECT "test" tester_send fd read 0 64 ++EXPECT "1" count_open "/test" ++TEST tester_send fd close 0 ++EXPECT_WITHIN 5 "0" count_open "/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++ ++TEST tester_stop ++ ++cleanup +diff --git a/tests/basic/open-behind/tester-fd.c b/tests/basic/open-behind/tester-fd.c +new file mode 100644 +index 0000000..00f02bc +--- /dev/null ++++ b/tests/basic/open-behind/tester-fd.c +@@ -0,0 +1,99 @@ ++/* ++ Copyright (c) 2020 Red Hat, Inc. ++ This file is part of GlusterFS. 
++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. ++*/ ++ ++#include "tester.h" ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <string.h> ++#include <errno.h> ++#include <unistd.h> ++#include <sys/types.h> ++#include <sys/stat.h> ++#include <fcntl.h> ++ ++static int32_t ++fd_open(context_t *ctx, command_t *cmd) ++{ ++ obj_t *obj; ++ int32_t fd; ++ ++ obj = cmd->args[0].obj.ref; ++ ++ fd = open(cmd->args[1].str.data, O_RDWR); ++ if (fd < 0) { ++ return error(errno, "open() failed"); ++ } ++ ++ obj->type = OBJ_TYPE_FD; ++ obj->fd = fd; ++ ++ out_ok("%d", fd); ++ ++ return 0; ++} ++ ++static int32_t ++fd_close(context_t *ctx, command_t *cmd) ++{ ++ obj_t *obj; ++ ++ obj = cmd->args[0].obj.ref; ++ obj->type = OBJ_TYPE_NONE; ++ ++ if (close(obj->fd) != 0) { ++ return error(errno, "close() failed"); ++ } ++ ++ out_ok(); ++ ++ return 0; ++} ++ ++static int32_t ++fd_write(context_t *ctx, command_t *cmd) ++{ ++ ssize_t len, ret; ++ ++ len = strlen(cmd->args[1].str.data); ++ ret = write(cmd->args[0].obj.ref->fd, cmd->args[1].str.data, len); ++ if (ret < 0) { ++ return error(errno, "write() failed"); ++ } ++ ++ out_ok("%zd", ret); ++ ++ return 0; ++} ++ ++static int32_t ++fd_read(context_t *ctx, command_t *cmd) ++{ ++ char data[cmd->args[1].num.value + 1]; ++ ssize_t ret; ++ ++ ret = read(cmd->args[0].obj.ref->fd, data, cmd->args[1].num.value); ++ if (ret < 0) { ++ return error(errno, "read() failed"); ++ } ++ ++ data[ret] = 0; ++ ++ out_ok("%zd %s", ret, data); ++ ++ return 0; ++} ++ ++command_t fd_commands[] = { ++ {"open", fd_open, CMD_ARGS(ARG_VAL(OBJ_TYPE_NONE), ARG_STR(1024))}, ++ {"close", fd_close, CMD_ARGS(ARG_VAL(OBJ_TYPE_FD))}, ++ {"write", fd_write, CMD_ARGS(ARG_VAL(OBJ_TYPE_FD), ARG_STR(1024))}, ++ {"read", fd_read, CMD_ARGS(ARG_VAL(OBJ_TYPE_FD), ARG_NUM(0, 1024))}, ++ CMD_END}; +diff --git a/tests/basic/open-behind/tester.c b/tests/basic/open-behind/tester.c +new file mode 100644 +index 0000000..b2da71c +--- /dev/null ++++ b/tests/basic/open-behind/tester.c +@@ -0,0 +1,444 @@ ++/* ++ Copyright (c) 2020 Red Hat, Inc. ++ This file is part of GlusterFS. 
++*/ ++ ++#include "tester.h" ++ ++#include <stdlib.h> ++#include <unistd.h> ++#include <ctype.h> ++#include <string.h> ++#include <errno.h> ++ ++static void * ++mem_alloc(size_t size) ++{ ++ void *ptr; ++ ++ ptr = malloc(size); ++ if (ptr == NULL) { ++ error(ENOMEM, "Failed to allocate memory (%zu bytes)", size); ++ } ++ ++ return ptr; ++} ++ ++static void ++mem_free(void *ptr) ++{ ++ free(ptr); ++} ++ ++static bool ++buffer_create(context_t *ctx, size_t size) ++{ ++ ctx->buffer.base = mem_alloc(size); ++ if (ctx->buffer.base == NULL) { ++ return false; ++ } ++ ++ ctx->buffer.size = size; ++ ctx->buffer.len = 0; ++ ctx->buffer.pos = 0; ++ ++ return true; ++} ++ ++static void ++buffer_destroy(context_t *ctx) ++{ ++ mem_free(ctx->buffer.base); ++ ctx->buffer.size = 0; ++ ctx->buffer.len = 0; ++} ++ ++static int32_t ++buffer_get(context_t *ctx) ++{ ++ ssize_t len; ++ ++ if (ctx->buffer.pos >= ctx->buffer.len) { ++ len = read(0, ctx->buffer.base, ctx->buffer.size); ++ if (len < 0) { ++ return error(errno, "read() failed"); ++ } ++ if (len == 0) { ++ return 0; ++ } ++ ++ ctx->buffer.len = len; ++ ctx->buffer.pos = 0; ++ } ++ ++ return ctx->buffer.base[ctx->buffer.pos++]; ++} ++ ++static int32_t ++str_skip_spaces(context_t *ctx, int32_t current) ++{ ++ while ((current > 0) && (current != '\n') && isspace(current)) { ++ current = buffer_get(ctx); ++ } ++ ++ return current; ++} ++ ++static int32_t ++str_token(context_t *ctx, char *buffer, uint32_t size, int32_t current) ++{ ++ uint32_t len; ++ ++ current = str_skip_spaces(ctx, current); ++ ++ len = 0; ++ while ((size > 0) && (current > 0) && (current != '\n') && ++ !isspace(current)) { ++ len++; ++ *buffer++ = current; ++ size--; ++ current = buffer_get(ctx); ++ } ++ ++ if (len == 0) { ++ return error(ENODATA, "Expecting a token"); ++ } ++ ++ if (size == 0) { ++ return error(ENOBUFS, "Token too long"); ++ } ++ ++ *buffer = 0; ++ ++ return current; ++} ++ ++static int32_t ++str_number(context_t *ctx, uint64_t min, uint64_t max, uint64_t *value, ++ int32_t current) ++{ ++ char text[32], *ptr; ++ uint64_t num; ++ ++ current = str_token(ctx, text, sizeof(text), current); ++ if (current > 0) { ++ num = strtoul(text, &ptr, 0); ++ if ((*ptr != 0) || (num < min) || (num > max)) { ++ return error(ERANGE, "Invalid number"); ++ } ++ *value = num; ++ } ++ ++ return current; ++} ++ ++static int32_t ++str_eol(context_t *ctx, int32_t current) ++{ ++ current = str_skip_spaces(ctx, current); ++ if (current != '\n') { ++ return error(EINVAL, "Expecting end of command"); ++ } ++ ++ return current; ++} ++ ++static void ++str_skip(context_t *ctx, int32_t current) ++{ ++ while ((current > 0) && (current != '\n')) { ++ current = buffer_get(ctx); ++ } ++} ++ ++static int32_t ++cmd_parse_obj(context_t *ctx, arg_t *arg, int32_t current) ++{ ++ obj_t *obj; ++ uint64_t id; ++ ++ current = str_number(ctx, 0, ctx->obj_count, &id, current); ++ if (current <= 0) { ++ return current; ++ } ++ ++ obj = &ctx->objs[id]; ++ if (obj->type != arg->obj.type) { ++ if (obj->type != OBJ_TYPE_NONE) { ++ return error(EBUSY, "Object is in use"); ++ } ++ return error(ENOENT, "Object is not defined"); ++ } ++ ++ arg->obj.ref = obj; ++ ++ return current; ++} ++ ++static int32_t ++cmd_parse_num(context_t *ctx, arg_t *arg, int32_t current) ++{ ++ return str_number(ctx, arg->num.min, arg->num.max, &arg->num.value, ++ current); ++} ++ ++static int32_t ++cmd_parse_str(context_t *ctx, arg_t *arg, int32_t current) ++{ ++ return str_token(ctx, arg->str.data, arg->str.size, current); ++} ++ ++static int32_t ++cmd_parse_args(context_t *ctx, command_t *cmd, 
int32_t current) ++{ ++ arg_t *arg; ++ ++ for (arg = cmd->args; arg->type != ARG_TYPE_NONE; arg++) { ++ switch (arg->type) { ++ case ARG_TYPE_OBJ: ++ current = cmd_parse_obj(ctx, arg, current); ++ break; ++ case ARG_TYPE_NUM: ++ current = cmd_parse_num(ctx, arg, current); ++ break; ++ case ARG_TYPE_STR: ++ current = cmd_parse_str(ctx, arg, current); ++ break; ++ default: ++ return error(EINVAL, "Unknown argument type"); ++ } ++ } ++ ++ if (current < 0) { ++ return current; ++ } ++ ++ current = str_eol(ctx, current); ++ if (current <= 0) { ++ return error(EINVAL, "Syntax error"); ++ } ++ ++ return cmd->handler(ctx, cmd); ++} ++ ++static int32_t ++cmd_parse(context_t *ctx, command_t *cmds) ++{ ++ char text[32]; ++ command_t *cmd; ++ int32_t current; ++ ++ cmd = cmds; ++ do { ++ current = str_token(ctx, text, sizeof(text), buffer_get(ctx)); ++ if (current <= 0) { ++ return current; ++ } ++ ++ while (cmd->name != NULL) { ++ if (strcmp(cmd->name, text) == 0) { ++ if (cmd->handler != NULL) { ++ return cmd_parse_args(ctx, cmd, current); ++ } ++ cmd = cmd->cmds; ++ break; ++ } ++ cmd++; ++ } ++ } while (cmd->name != NULL); ++ ++ str_skip(ctx, current); ++ ++ return error(ENOTSUP, "Unknown command"); ++} ++ ++static void ++cmd_fini(context_t *ctx, command_t *cmds) ++{ ++ command_t *cmd; ++ arg_t *arg; ++ ++ for (cmd = cmds; cmd->name != NULL; cmd++) { ++ if (cmd->handler == NULL) { ++ cmd_fini(ctx, cmd->cmds); ++ } else { ++ for (arg = cmd->args; arg->type != ARG_TYPE_NONE; arg++) { ++ switch (arg->type) { ++ case ARG_TYPE_STR: ++ mem_free(arg->str.data); ++ arg->str.data = NULL; ++ break; ++ default: ++ break; ++ } ++ } ++ } ++ } ++} ++ ++static bool ++cmd_init(context_t *ctx, command_t *cmds) ++{ ++ command_t *cmd; ++ arg_t *arg; ++ ++ for (cmd = cmds; cmd->name != NULL; cmd++) { ++ if (cmd->handler == NULL) { ++ if (!cmd_init(ctx, cmd->cmds)) { ++ return false; ++ } ++ } else { ++ for (arg = cmd->args; arg->type != ARG_TYPE_NONE; arg++) { ++ switch (arg->type) { ++ case ARG_TYPE_STR: ++ arg->str.data = mem_alloc(arg->str.size); ++ if (arg->str.data == NULL) { ++ return false; ++ } ++ break; ++ default: ++ break; ++ } ++ } ++ } ++ } ++ ++ return true; ++} ++ ++static bool ++objs_create(context_t *ctx, uint32_t count) ++{ ++ uint32_t i; ++ ++ ctx->objs = mem_alloc(sizeof(obj_t) * count); ++ if (ctx->objs == NULL) { ++ return false; ++ } ++ ctx->obj_count = count; ++ ++ for (i = 0; i < count; i++) { ++ ctx->objs[i].type = OBJ_TYPE_NONE; ++ } ++ ++ return true; ++} ++ ++static int32_t ++objs_destroy(context_t *ctx) ++{ ++ uint32_t i; ++ int32_t err; ++ ++ err = 0; ++ for (i = 0; i < ctx->obj_count; i++) { ++ if (ctx->objs[i].type != OBJ_TYPE_NONE) { ++ err = error(ENOTEMPTY, "Objects not destroyed"); ++ break; ++ } ++ } ++ ++ mem_free(ctx->objs); ++ ctx->objs = NULL; ++ ctx->obj_count = 0; ++ ++ return err; ++} ++ ++static context_t * ++init(size_t size, uint32_t objs, command_t *cmds) ++{ ++ context_t *ctx; ++ ++ ctx = mem_alloc(sizeof(context_t)); ++ if (ctx == NULL) { ++ goto failed; ++ } ++ ++ if (!buffer_create(ctx, size)) { ++ goto failed_ctx; ++ } ++ ++ if (!objs_create(ctx, objs)) { ++ goto failed_buffer; ++ } ++ ++ if (!cmd_init(ctx, cmds)) { ++ goto failed_objs; ++ } ++ ++ ctx->active = true; ++ ++ return ctx; ++ ++failed_objs: ++ cmd_fini(ctx, cmds); ++ objs_destroy(ctx); ++failed_buffer: ++ buffer_destroy(ctx); ++failed_ctx: ++ mem_free(ctx); ++failed: ++ return NULL; ++} ++ ++static int32_t ++fini(context_t *ctx, command_t *cmds) ++{ ++ int32_t ret; ++ ++ cmd_fini(ctx, cmds); ++ 
buffer_destroy(ctx); ++ ++ ret = objs_destroy(ctx); ++ ++ ctx->active = false; ++ ++ return ret; ++} ++ ++static int32_t ++exec_quit(context_t *ctx, command_t *cmd) ++{ ++ ctx->active = false; ++ ++ return 0; ++} ++ ++static command_t commands[] = {{"fd", NULL, CMD_SUB(fd_commands)}, ++ {"quit", exec_quit, CMD_ARGS()}, ++ CMD_END}; ++ ++int32_t ++main(int32_t argc, char *argv[]) ++{ ++ context_t *ctx; ++ int32_t res; ++ ++ ctx = init(1024, 16, commands); ++ if (ctx == NULL) { ++ return 1; ++ } ++ ++ do { ++ res = cmd_parse(ctx, commands); ++ if (res < 0) { ++ out_err(-res); ++ } ++ } while (ctx->active); ++ ++ res = fini(ctx, commands); ++ if (res >= 0) { ++ out_ok(); ++ return 0; ++ } ++ ++ out_err(-res); ++ ++ return 1; ++} +diff --git a/tests/basic/open-behind/tester.h b/tests/basic/open-behind/tester.h +new file mode 100644 +index 0000000..64e940c +--- /dev/null ++++ b/tests/basic/open-behind/tester.h +@@ -0,0 +1,145 @@ ++/* ++ Copyright (c) 2020 Red Hat, Inc. ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. ++*/ ++ ++#ifndef __TESTER_H__ ++#define __TESTER_H__ ++ ++#include <stdio.h> ++#include <stdint.h> ++#include <stdbool.h> ++ ++enum _obj_type; ++typedef enum _obj_type obj_type_t; ++ ++enum _arg_type; ++typedef enum _arg_type arg_type_t; ++ ++struct _buffer; ++typedef struct _buffer buffer_t; ++ ++struct _obj; ++typedef struct _obj obj_t; ++ ++struct _context; ++typedef struct _context context_t; ++ ++struct _arg; ++typedef struct _arg arg_t; ++ ++struct _command; ++typedef struct _command command_t; ++ ++enum _obj_type { OBJ_TYPE_NONE, OBJ_TYPE_FD }; ++ ++enum _arg_type { ARG_TYPE_NONE, ARG_TYPE_OBJ, ARG_TYPE_NUM, ARG_TYPE_STR }; ++ ++struct _buffer { ++ char *base; ++ uint32_t size; ++ uint32_t len; ++ uint32_t pos; ++}; ++ ++struct _obj { ++ obj_type_t type; ++ union { ++ int32_t fd; ++ }; ++}; ++ ++struct _context { ++ obj_t *objs; ++ buffer_t buffer; ++ uint32_t obj_count; ++ bool active; ++}; ++ ++struct _arg { ++ arg_type_t type; ++ union { ++ struct { ++ obj_type_t type; ++ obj_t *ref; ++ } obj; ++ struct { ++ uint64_t value; ++ uint64_t min; ++ uint64_t max; ++ } num; ++ struct { ++ uint32_t size; ++ char *data; ++ } str; ++ }; ++}; ++ ++struct _command { ++ const char *name; ++ int32_t (*handler)(context_t *ctx, command_t *cmd); ++ union { ++ arg_t *args; ++ command_t *cmds; ++ }; ++}; ++ ++#define msg(_stream, _fmt, _args...) \ ++ do { \ ++ fprintf(_stream, _fmt "\n", ##_args); \ ++ fflush(_stream); \ ++ } while (0) ++ ++#define msg_out(_fmt, _args...) msg(stdout, _fmt, ##_args) ++#define msg_err(_err, _fmt, _args...) \ ++ ({ \ ++ int32_t __msg_err = (_err); \ ++ msg(stderr, "[%4u:%-15s] " _fmt, __LINE__, __FUNCTION__, __msg_err, \ ++ ##_args); \ ++ -__msg_err; \ ++ }) ++ ++#define error(_err, _fmt, _args...) msg_err(_err, "E(%4d) " _fmt, ##_args) ++#define warn(_err, _fmt, _args...) msg_err(_err, "W(%4d) " _fmt, ##_args) ++#define info(_err, _fmt, _args...) msg_err(_err, "I(%4d) " _fmt, ##_args) ++ ++#define out_ok(_args...) msg_out("OK " _args) ++#define out_err(_err) msg_out("ERR %d", _err) ++ ++#define ARG_END \ ++ { \ ++ ARG_TYPE_NONE \ ++ } ++ ++#define CMD_ARGS1(_x, _args...) \ ++ .args = (arg_t[]) { _args } ++#define CMD_ARGS(_args...) 
CMD_ARGS1(, ##_args, ARG_END) ++ ++#define CMD_SUB(_cmds) .cmds = _cmds ++ ++#define CMD_END \ ++ { \ ++ NULL, NULL, CMD_SUB(NULL) \ ++ } ++ ++#define ARG_VAL(_type) \ ++ { \ ++ ARG_TYPE_OBJ, .obj = {.type = _type } \ ++ } ++#define ARG_NUM(_min, _max) \ ++ { \ ++ ARG_TYPE_NUM, .num = {.min = _min, .max = _max } \ ++ } ++#define ARG_STR(_size) \ ++ { \ ++ ARG_TYPE_STR, .str = {.size = _size } \ ++ } ++ ++extern command_t fd_commands[]; ++ ++#endif /* __TESTER_H__ */ +\ No newline at end of file +diff --git a/tests/bugs/glusterfs/bug-873962-spb.t b/tests/bugs/glusterfs/bug-873962-spb.t +index db84a22..db71cc0 100644 +--- a/tests/bugs/glusterfs/bug-873962-spb.t ++++ b/tests/bugs/glusterfs/bug-873962-spb.t +@@ -14,6 +14,7 @@ TEST $CLI volume set $V0 performance.io-cache off + TEST $CLI volume set $V0 performance.write-behind off + TEST $CLI volume set $V0 performance.stat-prefetch off + TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.open-behind off + TEST $CLI volume set $V0 cluster.background-self-heal-count 0 + TEST $CLI volume start $V0 + TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0 --direct-io-mode=enable +diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c +index 919eea3..76b5809 100644 +--- a/xlators/mount/fuse/src/fuse-bridge.c ++++ b/xlators/mount/fuse/src/fuse-bridge.c +@@ -3398,6 +3398,8 @@ fuse_release(xlator_t *this, fuse_in_header_t *finh, void *msg, + gf_log("glusterfs-fuse", GF_LOG_TRACE, + "finh->unique: %" PRIu64 ": RELEASE %p", finh->unique, state->fd); + ++ fd_close(state->fd); ++ + fuse_fd_ctx_destroy(this, state->fd); + fd_unref(fd); + +diff --git a/xlators/performance/open-behind/src/open-behind-messages.h b/xlators/performance/open-behind/src/open-behind-messages.h +index f250824..0e78917 100644 +--- a/xlators/performance/open-behind/src/open-behind-messages.h ++++ b/xlators/performance/open-behind/src/open-behind-messages.h +@@ -23,6 +23,10 @@ + */ + + GLFS_MSGID(OPEN_BEHIND, OPEN_BEHIND_MSG_XLATOR_CHILD_MISCONFIGURED, +- OPEN_BEHIND_MSG_VOL_MISCONFIGURED, OPEN_BEHIND_MSG_NO_MEMORY); ++ OPEN_BEHIND_MSG_VOL_MISCONFIGURED, OPEN_BEHIND_MSG_NO_MEMORY, ++ OPEN_BEHIND_MSG_FAILED, OPEN_BEHIND_MSG_BAD_STATE); ++ ++#define OPEN_BEHIND_MSG_FAILED_STR "Failed to submit fop" ++#define OPEN_BEHIND_MSG_BAD_STATE_STR "Unexpected state" + + #endif /* _OPEN_BEHIND_MESSAGES_H_ */ +diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c +index cbe89ec..e43fe73 100644 +--- a/xlators/performance/open-behind/src/open-behind.c ++++ b/xlators/performance/open-behind/src/open-behind.c +@@ -16,6 +16,18 @@ + #include "open-behind-messages.h" + #include + ++/* Note: The initial design of open-behind was made to cover the simple case ++ * of open, read, close for small files. This pattern combined with ++ * quick-read can do the whole operation without a single request to the ++ * bricks (except the initial lookup). ++ * ++ * The way to do this has been improved, but the logic remains the same. ++ * Basically, this means that any operation sent to the fd or the inode ++ * that is not a read causes the open request to be sent to the ++ * bricks, and all future operations will be executed synchronously, ++ * including opens (it's reset once all fd's are closed). 
++ */ ++ + typedef struct ob_conf { + gf_boolean_t use_anonymous_fd; /* use anonymous FDs wherever safe + e.g - fstat() readv() +@@ -32,1096 +44,754 @@ typedef struct ob_conf { + */ + } ob_conf_t; + +-typedef struct ob_inode { +- inode_t *inode; +- struct list_head resume_fops; +- struct list_head ob_fds; +- int count; +- int op_ret; +- int op_errno; +- gf_boolean_t open_in_progress; +- int unlinked; +-} ob_inode_t; ++/* A negative state represents an errno value negated. In this case the ++ * current operation cannot be processed. */ ++typedef enum _ob_state { ++ /* There are no opens on the inode or the first open is already ++ * completed. The current operation can be sent directly. */ ++ OB_STATE_READY = 0, + +-typedef struct ob_fd { +- call_frame_t *open_frame; +- loc_t loc; +- dict_t *xdata; +- int flags; +- int op_errno; +- ob_inode_t *ob_inode; +- fd_t *fd; +- gf_boolean_t opened; +- gf_boolean_t ob_inode_fops_waiting; +- struct list_head list; +- struct list_head ob_fds_on_inode; +-} ob_fd_t; ++ /* There's an open pending and it has been triggered. The current ++ * operation should be "stubbified" and processed with ++ * ob_stub_dispatch(). */ ++ OB_STATE_OPEN_TRIGGERED, + +-ob_inode_t * +-ob_inode_alloc(inode_t *inode) +-{ +- ob_inode_t *ob_inode = NULL; ++ /* There's an open pending but it has not been triggered. The current ++ * operation can be processed directly but using an anonymous fd. */ ++ OB_STATE_OPEN_PENDING, + +- ob_inode = GF_CALLOC(1, sizeof(*ob_inode), gf_ob_mt_inode_t); +- if (ob_inode == NULL) +- goto out; ++ /* The current operation is the first open on the inode. */ ++ OB_STATE_FIRST_OPEN ++} ob_state_t; + +- ob_inode->inode = inode; +- INIT_LIST_HEAD(&ob_inode->resume_fops); +- INIT_LIST_HEAD(&ob_inode->ob_fds); +-out: +- return ob_inode; +-} +- +-void +-ob_inode_free(ob_inode_t *ob_inode) +-{ +- if (ob_inode == NULL) +- goto out; ++typedef struct ob_inode { ++ /* List of stubs pending on the first open. Once the first open is ++ * complete, all these stubs will be resubmitted, and dependencies ++ * will be checked again. */ ++ struct list_head resume_fops; + +- list_del_init(&ob_inode->resume_fops); +- list_del_init(&ob_inode->ob_fds); ++ /* The inode this object references. */ ++ inode_t *inode; + +- GF_FREE(ob_inode); +-out: +- return; +-} ++ /* The fd from the first open sent to this inode. It will be set ++ * from the moment the open is processed until the open is fully ++ * executed or closed before actually opened. It's NULL in all ++ * other cases. */ ++ fd_t *first_fd; ++ ++ /* The stub from the first open operation. When open fop starts ++ * being processed, it's assigned the OB_OPEN_PREPARING value ++ * until the actual stub is created. This is necessary to avoid ++ * creating the stub inside a locked region. Once the stub is ++ * successfully created, it's assigned here. This value is set ++ * to NULL once the stub is resumed. */ ++ call_stub_t *first_open; ++ ++ /* The total number of currently open fd's on this inode. */ ++ int32_t open_count; ++ ++ /* This flag is set as soon as we know that the open will be ++ * sent to the bricks, even before the stub is ready. */ ++ bool triggered; ++} ob_inode_t; + +-ob_inode_t * +-ob_inode_get(xlator_t *this, inode_t *inode) ++/* Dummy pointer used temporarily while the actual open stub is being created */ ++#define OB_OPEN_PREPARING ((call_stub_t *)-1) ++ ++#define OB_POST_COMMON(_fop, _xl, _frame, _fd, _args...) 
\ ++ case OB_STATE_FIRST_OPEN: \ ++ gf_smsg((_xl)->name, GF_LOG_ERROR, EINVAL, OPEN_BEHIND_MSG_BAD_STATE, \ ++ "fop=%s", #_fop, "state=%d", __ob_state, NULL); \ ++ default_##_fop##_failure_cbk(_frame, EINVAL); \ ++ break; \ ++ case OB_STATE_READY: \ ++ default_##_fop(_frame, _xl, ##_args); \ ++ break; \ ++ case OB_STATE_OPEN_TRIGGERED: { \ ++ call_stub_t *__ob_stub = fop_##_fop##_stub(_frame, ob_##_fop, \ ++ ##_args); \ ++ if (__ob_stub != NULL) { \ ++ ob_stub_dispatch(_xl, __ob_inode, _fd, __ob_stub); \ ++ break; \ ++ } \ ++ __ob_state = -ENOMEM; \ ++ } \ ++ default: \ ++ gf_smsg((_xl)->name, GF_LOG_ERROR, -__ob_state, \ ++ OPEN_BEHIND_MSG_FAILED, "fop=%s", #_fop, NULL); \ ++ default_##_fop##_failure_cbk(_frame, -__ob_state) ++ ++#define OB_POST_FD(_fop, _xl, _frame, _fd, _trigger, _args...) \ ++ do { \ ++ ob_inode_t *__ob_inode; \ ++ fd_t *__first_fd; \ ++ ob_state_t __ob_state = ob_open_and_resume_fd( \ ++ _xl, _fd, 0, true, _trigger, &__ob_inode, &__first_fd); \ ++ switch (__ob_state) { \ ++ case OB_STATE_OPEN_PENDING: \ ++ if (!(_trigger)) { \ ++ fd_t *__ob_fd = fd_anonymous_with_flags((_fd)->inode, \ ++ (_fd)->flags); \ ++ if (__ob_fd != NULL) { \ ++ default_##_fop(_frame, _xl, ##_args); \ ++ fd_unref(__ob_fd); \ ++ break; \ ++ } \ ++ __ob_state = -ENOMEM; \ ++ } \ ++ OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args); \ ++ } \ ++ } while (0) ++ ++#define OB_POST_FLUSH(_xl, _frame, _fd, _args...) \ ++ do { \ ++ ob_inode_t *__ob_inode; \ ++ fd_t *__first_fd; \ ++ ob_state_t __ob_state = ob_open_and_resume_fd( \ ++ _xl, _fd, 0, true, false, &__ob_inode, &__first_fd); \ ++ switch (__ob_state) { \ ++ case OB_STATE_OPEN_PENDING: \ ++ default_flush_cbk(_frame, NULL, _xl, 0, 0, NULL); \ ++ break; \ ++ OB_POST_COMMON(flush, _xl, _frame, __first_fd, ##_args); \ ++ } \ ++ } while (0) ++ ++#define OB_POST_INODE(_fop, _xl, _frame, _inode, _trigger, _args...) 
\ ++ do { \ ++ ob_inode_t *__ob_inode; \ ++ fd_t *__first_fd; \ ++ ob_state_t __ob_state = ob_open_and_resume_inode( \ ++ _xl, _inode, NULL, 0, true, _trigger, &__ob_inode, &__first_fd); \ ++ switch (__ob_state) { \ ++ case OB_STATE_OPEN_PENDING: \ ++ OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args); \ ++ } \ ++ } while (0) ++ ++static ob_inode_t * ++ob_inode_get_locked(xlator_t *this, inode_t *inode) + { + ob_inode_t *ob_inode = NULL; + uint64_t value = 0; +- int ret = 0; + +- if (!inode) +- goto out; ++ if ((__inode_ctx_get(inode, this, &value) == 0) && (value != 0)) { ++ return (ob_inode_t *)(uintptr_t)value; ++ } + +- LOCK(&inode->lock); +- { +- __inode_ctx_get(inode, this, &value); +- if (value == 0) { +- ob_inode = ob_inode_alloc(inode); +- if (ob_inode == NULL) +- goto unlock; +- +- value = (uint64_t)(uintptr_t)ob_inode; +- ret = __inode_ctx_set(inode, this, &value); +- if (ret < 0) { +- ob_inode_free(ob_inode); +- ob_inode = NULL; +- } +- } else { +- ob_inode = (ob_inode_t *)(uintptr_t)value; ++ ob_inode = GF_CALLOC(1, sizeof(*ob_inode), gf_ob_mt_inode_t); ++ if (ob_inode != NULL) { ++ ob_inode->inode = inode; ++ INIT_LIST_HEAD(&ob_inode->resume_fops); ++ ++ value = (uint64_t)(uintptr_t)ob_inode; ++ if (__inode_ctx_set(inode, this, &value) < 0) { ++ GF_FREE(ob_inode); ++ ob_inode = NULL; + } + } +-unlock: +- UNLOCK(&inode->lock); + +-out: + return ob_inode; + } + +-ob_fd_t * +-__ob_fd_ctx_get(xlator_t *this, fd_t *fd) ++static ob_state_t ++ob_open_and_resume_inode(xlator_t *xl, inode_t *inode, fd_t *fd, ++ int32_t open_count, bool synchronous, bool trigger, ++ ob_inode_t **pob_inode, fd_t **pfd) + { +- uint64_t value = 0; +- int ret = -1; +- ob_fd_t *ob_fd = NULL; ++ ob_conf_t *conf; ++ ob_inode_t *ob_inode; ++ call_stub_t *open_stub; + +- ret = __fd_ctx_get(fd, this, &value); +- if (ret) +- return NULL; ++ if (inode == NULL) { ++ return OB_STATE_READY; ++ } + +- ob_fd = (void *)((long)value); ++ conf = xl->private; + +- return ob_fd; +-} ++ *pfd = NULL; + +-ob_fd_t * +-ob_fd_ctx_get(xlator_t *this, fd_t *fd) +-{ +- ob_fd_t *ob_fd = NULL; +- +- LOCK(&fd->lock); ++ LOCK(&inode->lock); + { +- ob_fd = __ob_fd_ctx_get(this, fd); +- } +- UNLOCK(&fd->lock); +- +- return ob_fd; +-} ++ ob_inode = ob_inode_get_locked(xl, inode); ++ if (ob_inode == NULL) { ++ UNLOCK(&inode->lock); + +-int +-__ob_fd_ctx_set(xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) +-{ +- uint64_t value = 0; +- int ret = -1; ++ return -ENOMEM; ++ } ++ *pob_inode = ob_inode; ++ ++ ob_inode->open_count += open_count; ++ ++ /* If first_fd is not NULL, it means that there's a previous open not ++ * yet completed. */ ++ if (ob_inode->first_fd != NULL) { ++ *pfd = ob_inode->first_fd; ++ /* If the current request doesn't trigger the open and it hasn't ++ * been triggered yet, we can continue without issuing the open ++ * only if the current request belongs to the same fd as the ++ * first one. */ ++ if (!trigger && !ob_inode->triggered && ++ (ob_inode->first_fd == fd)) { ++ UNLOCK(&inode->lock); ++ ++ return OB_STATE_OPEN_PENDING; ++ } + +- value = (long)((void *)ob_fd); ++ /* We need to issue the open. It could have already been triggered ++ * before. In this case open_stub will be NULL. Or the initial open ++ * may not be completely ready yet. In this case open_stub will be ++ * OB_OPEN_PREPARING. 
*/ ++ open_stub = ob_inode->first_open; ++ ob_inode->first_open = NULL; ++ ob_inode->triggered = true; + +- value = (long)((void *)ob_fd); ++ UNLOCK(&inode->lock); + +- ret = __fd_ctx_set(fd, this, value); ++ if ((open_stub != NULL) && (open_stub != OB_OPEN_PREPARING)) { ++ call_resume(open_stub); ++ } + +- return ret; +-} ++ return OB_STATE_OPEN_TRIGGERED; ++ } + +-int +-ob_fd_ctx_set(xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) +-{ +- int ret = -1; ++ /* There's no pending open. Only opens can be non-synchronous, so all ++ * regular fops will be processed directly. For non-synchronous opens, ++ * we'll still process them normally (i.e. synchronously) if there are ++ * more file descriptors open. */ ++ if (synchronous || (ob_inode->open_count > open_count)) { ++ UNLOCK(&inode->lock); + +- LOCK(&fd->lock); +- { +- ret = __ob_fd_ctx_set(this, fd, ob_fd); +- } +- UNLOCK(&fd->lock); ++ return OB_STATE_READY; ++ } + +- return ret; +-} ++ *pfd = fd; + +-ob_fd_t * +-ob_fd_new(void) +-{ +- ob_fd_t *ob_fd = NULL; ++ /* This is the first open. We keep a reference on the fd and set ++ * first_open stub to OB_OPEN_PREPARING until the actual stub can ++ * be assigned (we don't create the stub here to avoid doing memory ++ * allocations inside the mutex). */ ++ ob_inode->first_fd = __fd_ref(fd); ++ ob_inode->first_open = OB_OPEN_PREPARING; + +- ob_fd = GF_CALLOC(1, sizeof(*ob_fd), gf_ob_mt_fd_t); ++ /* If lazy_open is not set, we'll need to immediately send the open, ++ * so we set triggered right now. */ ++ ob_inode->triggered = !conf->lazy_open; ++ } ++ UNLOCK(&inode->lock); + +- INIT_LIST_HEAD(&ob_fd->list); +- INIT_LIST_HEAD(&ob_fd->ob_fds_on_inode); ++ return OB_STATE_FIRST_OPEN; + +- return ob_fd; + } + +-void +-ob_fd_free(ob_fd_t *ob_fd) ++static ob_state_t ++ob_open_and_resume_fd(xlator_t *xl, fd_t *fd, int32_t open_count, ++ bool synchronous, bool trigger, ob_inode_t **pob_inode, ++ fd_t **pfd) + { +- LOCK(&ob_fd->fd->inode->lock); +- { +- list_del_init(&ob_fd->ob_fds_on_inode); +- } +- UNLOCK(&ob_fd->fd->inode->lock); +- +- loc_wipe(&ob_fd->loc); +- +- if (ob_fd->xdata) +- dict_unref(ob_fd->xdata); ++ uint64_t err; + +- if (ob_fd->open_frame) { +- /* If we sill have a frame it means that background open has never +- * been triggered. We need to release the pending reference. */ +- fd_unref(ob_fd->fd); +- +- STACK_DESTROY(ob_fd->open_frame->root); ++ if ((fd_ctx_get(fd, xl, &err) == 0) && (err != 0)) { ++ return (ob_state_t)-err; + } + +- GF_FREE(ob_fd); ++ return ob_open_and_resume_inode(xl, fd->inode, fd, open_count, synchronous, ++ trigger, pob_inode, pfd); + } + +-int +-ob_wake_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, +- int op_errno, fd_t *fd_ret, dict_t *xdata) ++static ob_state_t ++ob_open_behind(xlator_t *xl, fd_t *fd, int32_t flags, ob_inode_t **pob_inode, ++ fd_t **pfd) + { +- fd_t *fd = NULL; +- int count = 0; +- int ob_inode_op_ret = 0; +- int ob_inode_op_errno = 0; +- ob_fd_t *ob_fd = NULL; +- call_stub_t *stub = NULL, *tmp = NULL; +- ob_inode_t *ob_inode = NULL; +- gf_boolean_t ob_inode_fops_waiting = _gf_false; +- struct list_head fops_waiting_on_fd, fops_waiting_on_inode; ++ bool synchronous; + +- fd = frame->local; +- frame->local = NULL; +- +- INIT_LIST_HEAD(&fops_waiting_on_fd); +- INIT_LIST_HEAD(&fops_waiting_on_inode); ++ /* TODO: If O_CREAT, O_APPEND, O_WRONLY or O_DIRECT are specified, shouldn't ++ * we also execute this open synchronously? 
*/ ++ synchronous = (flags & O_TRUNC) != 0; + +- ob_inode = ob_inode_get(this, fd->inode); ++ return ob_open_and_resume_fd(xl, fd, 1, synchronous, true, pob_inode, pfd); ++} + +- LOCK(&fd->lock); ++static int32_t ++ob_stub_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, ++ call_stub_t *stub) ++{ ++ LOCK(&ob_inode->inode->lock); + { +- ob_fd = __ob_fd_ctx_get(this, fd); +- ob_fd->opened = _gf_true; +- +- ob_inode_fops_waiting = ob_fd->ob_inode_fops_waiting; +- +- list_splice_init(&ob_fd->list, &fops_waiting_on_fd); +- +- if (op_ret < 0) { +- /* mark fd BAD for ever */ +- ob_fd->op_errno = op_errno; +- ob_fd = NULL; /*shouldn't be freed*/ +- } else { +- __fd_ctx_del(fd, this, NULL); +- } +- } +- UNLOCK(&fd->lock); +- +- if (ob_inode_fops_waiting) { +- LOCK(&fd->inode->lock); +- { +- count = --ob_inode->count; +- if (op_ret < 0) { +- /* TODO: when to reset the error? */ +- ob_inode->op_ret = -1; +- ob_inode->op_errno = op_errno; +- } +- +- if (count == 0) { +- ob_inode->open_in_progress = _gf_false; +- ob_inode_op_ret = ob_inode->op_ret; +- ob_inode_op_errno = ob_inode->op_errno; +- list_splice_init(&ob_inode->resume_fops, +- &fops_waiting_on_inode); +- } ++ /* We only queue a stub if the open has not been completed or ++ * cancelled. */ ++ if (ob_inode->first_fd == fd) { ++ list_add_tail(&stub->list, &ob_inode->resume_fops); ++ stub = NULL; + } +- UNLOCK(&fd->inode->lock); +- } +- +- if (ob_fd) +- ob_fd_free(ob_fd); +- +- list_for_each_entry_safe(stub, tmp, &fops_waiting_on_fd, list) +- { +- list_del_init(&stub->list); +- +- if (op_ret < 0) +- call_unwind_error(stub, -1, op_errno); +- else +- call_resume(stub); + } ++ UNLOCK(&ob_inode->inode->lock); + +- list_for_each_entry_safe(stub, tmp, &fops_waiting_on_inode, list) +- { +- list_del_init(&stub->list); +- +- if (ob_inode_op_ret < 0) +- call_unwind_error(stub, -1, ob_inode_op_errno); +- else +- call_resume(stub); ++ if (stub != NULL) { ++ call_resume(stub); + } + +- /* The background open is completed. We can release the 'fd' reference. */ +- fd_unref(fd); +- +- STACK_DESTROY(frame->root); +- + return 0; + } + +-int +-ob_fd_wake(xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) ++static int32_t ++ob_open_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, ++ call_stub_t *stub) + { +- call_frame_t *frame = NULL; +- +- if (ob_fd == NULL) { +- LOCK(&fd->lock); +- { +- ob_fd = __ob_fd_ctx_get(this, fd); +- if (!ob_fd) +- goto unlock; ++ bool closed; + +- frame = ob_fd->open_frame; +- ob_fd->open_frame = NULL; +- } +- unlock: +- UNLOCK(&fd->lock); +- } else { +- LOCK(&fd->lock); +- { +- frame = ob_fd->open_frame; +- ob_fd->open_frame = NULL; ++ LOCK(&ob_inode->inode->lock); ++ { ++ closed = ob_inode->first_fd != fd; ++ if (!closed) { ++ if (ob_inode->triggered) { ++ ob_inode->first_open = NULL; ++ } else { ++ ob_inode->first_open = stub; ++ stub = NULL; ++ } + } +- UNLOCK(&fd->lock); + } ++ UNLOCK(&ob_inode->inode->lock); + +- if (frame) { +- /* We don't need to take a reference here. We already have a reference +- * while the open is pending. 
*/ +- frame->local = fd; +- +- STACK_WIND(frame, ob_wake_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->open, &ob_fd->loc, ob_fd->flags, fd, +- ob_fd->xdata); ++ if (stub != NULL) { ++ if (closed) { ++ call_stub_destroy(stub); ++ fd_unref(fd); ++ } else { ++ call_resume(stub); ++ } + } + + return 0; + } + +-void +-ob_inode_wake(xlator_t *this, struct list_head *ob_fds) ++static void ++ob_resume_pending(struct list_head *list) + { +- ob_fd_t *ob_fd = NULL, *tmp = NULL; ++ call_stub_t *stub; + +- if (!list_empty(ob_fds)) { +- list_for_each_entry_safe(ob_fd, tmp, ob_fds, ob_fds_on_inode) +- { +- ob_fd_wake(this, ob_fd->fd, ob_fd); +- ob_fd_free(ob_fd); +- } +- } +-} ++ while (!list_empty(list)) { ++ stub = list_first_entry(list, call_stub_t, list); ++ list_del_init(&stub->list); + +-/* called holding inode->lock and fd->lock */ +-void +-ob_fd_copy(ob_fd_t *src, ob_fd_t *dst) +-{ +- if (!src || !dst) +- goto out; +- +- dst->fd = src->fd; +- dst->loc.inode = inode_ref(src->loc.inode); +- gf_uuid_copy(dst->loc.gfid, src->loc.gfid); +- dst->flags = src->flags; +- dst->xdata = dict_ref(src->xdata); +- dst->ob_inode = src->ob_inode; +-out: +- return; ++ call_resume(stub); ++ } + } + +-int +-open_all_pending_fds_and_resume(xlator_t *this, inode_t *inode, +- call_stub_t *stub) ++static void ++ob_open_completed(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, int32_t op_ret, ++ int32_t op_errno) + { +- ob_inode_t *ob_inode = NULL; +- ob_fd_t *ob_fd = NULL, *tmp = NULL; +- gf_boolean_t was_open_in_progress = _gf_false; +- gf_boolean_t wait_for_open = _gf_false; +- struct list_head ob_fds; ++ struct list_head list; + +- ob_inode = ob_inode_get(this, inode); +- if (ob_inode == NULL) +- goto out; ++ INIT_LIST_HEAD(&list); + +- INIT_LIST_HEAD(&ob_fds); ++ if (op_ret < 0) { ++ fd_ctx_set(fd, xl, op_errno <= 0 ? EIO : op_errno); ++ } + +- LOCK(&inode->lock); ++ LOCK(&ob_inode->inode->lock); + { +- was_open_in_progress = ob_inode->open_in_progress; +- ob_inode->unlinked = 1; +- +- if (was_open_in_progress) { +- list_add_tail(&stub->list, &ob_inode->resume_fops); +- goto inode_unlock; +- } +- +- list_for_each_entry(ob_fd, &ob_inode->ob_fds, ob_fds_on_inode) +- { +- LOCK(&ob_fd->fd->lock); +- { +- if (ob_fd->opened) +- goto fd_unlock; +- +- ob_inode->count++; +- ob_fd->ob_inode_fops_waiting = _gf_true; +- +- if (ob_fd->open_frame == NULL) { +- /* open in progress no need of wake */ +- } else { +- tmp = ob_fd_new(); +- tmp->open_frame = ob_fd->open_frame; +- ob_fd->open_frame = NULL; +- +- ob_fd_copy(ob_fd, tmp); +- list_add_tail(&tmp->ob_fds_on_inode, &ob_fds); +- } +- } +- fd_unlock: +- UNLOCK(&ob_fd->fd->lock); +- } +- +- if (ob_inode->count) { +- wait_for_open = ob_inode->open_in_progress = _gf_true; +- list_add_tail(&stub->list, &ob_inode->resume_fops); ++ /* Only update the fields if the file has not been closed before ++ * getting here. 
*/ ++ if (ob_inode->first_fd == fd) { ++ list_splice_init(&ob_inode->resume_fops, &list); ++ ob_inode->first_fd = NULL; ++ ob_inode->first_open = NULL; ++ ob_inode->triggered = false; + } + } +-inode_unlock: +- UNLOCK(&inode->lock); ++ UNLOCK(&ob_inode->inode->lock); + +-out: +- if (!was_open_in_progress) { +- if (!wait_for_open) { +- call_resume(stub); +- } else { +- ob_inode_wake(this, &ob_fds); +- } +- } ++ ob_resume_pending(&list); + +- return 0; ++ fd_unref(fd); + } + +-int +-open_and_resume(xlator_t *this, fd_t *fd, call_stub_t *stub) ++static int32_t ++ob_open_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, int32_t op_ret, ++ int32_t op_errno, fd_t *fd, dict_t *xdata) + { +- ob_fd_t *ob_fd = NULL; +- int op_errno = 0; +- +- if (!fd) +- goto nofd; +- +- LOCK(&fd->lock); +- { +- ob_fd = __ob_fd_ctx_get(this, fd); +- if (!ob_fd) +- goto unlock; ++ ob_inode_t *ob_inode; + +- if (ob_fd->op_errno) { +- op_errno = ob_fd->op_errno; +- goto unlock; +- } ++ ob_inode = frame->local; ++ frame->local = NULL; + +- list_add_tail(&stub->list, &ob_fd->list); +- } +-unlock: +- UNLOCK(&fd->lock); ++ ob_open_completed(xl, ob_inode, cookie, op_ret, op_errno); + +-nofd: +- if (op_errno) +- call_unwind_error(stub, -1, op_errno); +- else if (ob_fd) +- ob_fd_wake(this, fd, NULL); +- else +- call_resume(stub); ++ STACK_DESTROY(frame->root); + + return 0; + } + +-int +-ob_open_behind(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, ++static int32_t ++ob_open_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + fd_t *fd, dict_t *xdata) + { +- ob_fd_t *ob_fd = NULL; +- int ret = -1; +- ob_conf_t *conf = NULL; +- ob_inode_t *ob_inode = NULL; +- gf_boolean_t open_in_progress = _gf_false; +- int unlinked = 0; +- +- conf = this->private; +- +- if (flags & O_TRUNC) { +- STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); +- return 0; +- } +- +- ob_inode = ob_inode_get(this, fd->inode); +- +- ob_fd = ob_fd_new(); +- if (!ob_fd) +- goto enomem; +- +- ob_fd->ob_inode = ob_inode; +- +- ob_fd->fd = fd; +- +- ob_fd->open_frame = copy_frame(frame); +- if (!ob_fd->open_frame) +- goto enomem; +- ret = loc_copy(&ob_fd->loc, loc); +- if (ret) +- goto enomem; +- +- ob_fd->flags = flags; +- if (xdata) +- ob_fd->xdata = dict_ref(xdata); +- +- LOCK(&fd->inode->lock); +- { +- open_in_progress = ob_inode->open_in_progress; +- unlinked = ob_inode->unlinked; +- if (!open_in_progress && !unlinked) { +- ret = ob_fd_ctx_set(this, fd, ob_fd); +- if (ret) { +- UNLOCK(&fd->inode->lock); +- goto enomem; +- } +- +- list_add(&ob_fd->ob_fds_on_inode, &ob_inode->ob_fds); +- } +- } +- UNLOCK(&fd->inode->lock); +- +- /* We take a reference while the background open is pending or being +- * processed. If we finally wind the request in the foreground, then +- * ob_fd_free() will take care of this additional reference. 
*/ +- fd_ref(fd); +- +- if (!open_in_progress && !unlinked) { +- STACK_UNWIND_STRICT(open, frame, 0, 0, fd, xdata); +- +- if (!conf->lazy_open) +- ob_fd_wake(this, fd, NULL); +- } else { +- ob_fd_free(ob_fd); +- STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); +- } ++ STACK_WIND_COOKIE(frame, ob_open_cbk, fd, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + + return 0; +-enomem: +- if (ob_fd) { +- if (ob_fd->open_frame) +- STACK_DESTROY(ob_fd->open_frame->root); +- +- loc_wipe(&ob_fd->loc); +- if (ob_fd->xdata) +- dict_unref(ob_fd->xdata); +- +- GF_FREE(ob_fd); +- } +- +- return -1; + } + +-int ++static int32_t + ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + dict_t *xdata) + { +- fd_t *old_fd = NULL; +- int ret = -1; +- int op_errno = ENOMEM; +- call_stub_t *stub = NULL; +- +- old_fd = fd_lookup(fd->inode, 0); +- if (old_fd) { +- /* open-behind only when this is the first FD */ +- stub = fop_open_stub(frame, default_open_resume, loc, flags, fd, xdata); +- if (!stub) { +- fd_unref(old_fd); +- goto err; +- } +- +- open_and_resume(this, old_fd, stub); ++ ob_inode_t *ob_inode; ++ call_frame_t *open_frame; ++ call_stub_t *stub; ++ fd_t *first_fd; ++ ob_state_t state; ++ ++ state = ob_open_behind(this, fd, flags, &ob_inode, &first_fd); ++ if (state == OB_STATE_READY) { ++ /* There's no pending open, but there are other file descriptors opened ++ * or the current flags require a synchronous open. */ ++ return default_open(frame, this, loc, flags, fd, xdata); ++ } + +- fd_unref(old_fd); ++ if (state == OB_STATE_OPEN_TRIGGERED) { ++ /* The first open is in progress (either because it was already issued ++ * or because this request triggered it). We try to create a new stub ++ * to retry the operation once the initial open completes. */ ++ stub = fop_open_stub(frame, ob_open, loc, flags, fd, xdata); ++ if (stub != NULL) { ++ return ob_stub_dispatch(this, ob_inode, first_fd, stub); ++ } + +- return 0; ++ state = -ENOMEM; + } + +- ret = ob_open_behind(frame, this, loc, flags, fd, xdata); +- if (ret) { +- goto err; +- } ++ if (state == OB_STATE_FIRST_OPEN) { ++ /* We try to create a stub for the new open. A new frame needs to be ++ * used because the current one may be destroyed soon after sending ++ * the open's reply. */ ++ open_frame = copy_frame(frame); ++ if (open_frame != NULL) { ++ stub = fop_open_stub(open_frame, ob_open_resume, loc, flags, fd, ++ xdata); ++ if (stub != NULL) { ++ open_frame->local = ob_inode; + +- return 0; +-err: +- gf_msg(this->name, GF_LOG_ERROR, op_errno, OPEN_BEHIND_MSG_NO_MEMORY, "%s", +- loc->path); ++ /* TODO: Previous version passed xdata back to the caller, but ++ * probably this doesn't make sense since it won't contain ++ * any requested data. I think it would be better to pass ++ * NULL for xdata. */ ++ default_open_cbk(frame, NULL, this, 0, 0, fd, xdata); + +- STACK_UNWIND_STRICT(open, frame, -1, op_errno, 0, 0); ++ return ob_open_dispatch(this, ob_inode, first_fd, stub); ++ } + +- return 0; +-} ++ STACK_DESTROY(open_frame->root); ++ } + +-fd_t * +-ob_get_wind_fd(xlator_t *this, fd_t *fd, uint32_t *flag) +-{ +- fd_t *wind_fd = NULL; +- ob_fd_t *ob_fd = NULL; +- ob_conf_t *conf = NULL; ++ /* In case of error, simulate a regular completion but with an error ++ * code. 
*/ ++ ob_open_completed(this, ob_inode, first_fd, -1, ENOMEM); + +- conf = this->private; ++ state = -ENOMEM; ++ } + +- ob_fd = ob_fd_ctx_get(this, fd); ++ /* In case of failure we need to decrement the number of open files because ++ * ob_fdclose() won't be called. */ + +- if (ob_fd && ob_fd->open_frame && conf->use_anonymous_fd) { +- wind_fd = fd_anonymous(fd->inode); +- if ((ob_fd->flags & O_DIRECT) && (flag)) +- *flag = *flag | O_DIRECT; +- } else { +- wind_fd = fd_ref(fd); ++ LOCK(&fd->inode->lock); ++ { ++ ob_inode->open_count--; + } ++ UNLOCK(&fd->inode->lock); + +- return wind_fd; ++ gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s", ++ "open", "path=%s", loc->path, NULL); ++ ++ return default_open_failure_cbk(frame, -state); + } + +-int ++static int32_t + ob_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- fd_t *wind_fd = NULL; +- ob_conf_t *conf = NULL; ++ ob_conf_t *conf = this->private; ++ bool trigger = conf->read_after_open || !conf->use_anonymous_fd; + +- conf = this->private; +- +- if (!conf->read_after_open) +- wind_fd = ob_get_wind_fd(this, fd, &flags); +- else +- wind_fd = fd_ref(fd); +- +- stub = fop_readv_stub(frame, default_readv_resume, wind_fd, size, offset, +- flags, xdata); +- fd_unref(wind_fd); +- +- if (!stub) +- goto err; +- +- open_and_resume(this, wind_fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0); ++ OB_POST_FD(readv, this, frame, fd, trigger, fd, size, offset, flags, xdata); + + return 0; + } + +-int ++static int32_t + ob_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_writev_stub(frame, default_writev_resume, fd, iov, count, offset, +- flags, iobref, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(writev, frame, -1, ENOMEM, 0, 0, 0); ++ OB_POST_FD(writev, this, frame, fd, true, fd, iov, count, offset, flags, ++ iobref, xdata); + + return 0; + } + +-int ++static int32_t + ob_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- fd_t *wind_fd = NULL; +- +- wind_fd = ob_get_wind_fd(this, fd, NULL); +- +- stub = fop_fstat_stub(frame, default_fstat_resume, wind_fd, xdata); ++ ob_conf_t *conf = this->private; ++ bool trigger = !conf->use_anonymous_fd; + +- fd_unref(wind_fd); +- +- if (!stub) +- goto err; +- +- open_and_resume(this, wind_fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, 0, 0); ++ OB_POST_FD(fstat, this, frame, fd, trigger, fd, xdata); + + return 0; + } + +-int ++static int32_t + ob_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- fd_t *wind_fd = NULL; +- +- wind_fd = ob_get_wind_fd(this, fd, NULL); ++ ob_conf_t *conf = this->private; ++ bool trigger = !conf->use_anonymous_fd; + +- stub = fop_seek_stub(frame, default_seek_resume, wind_fd, offset, what, +- xdata); +- +- fd_unref(wind_fd); +- +- if (!stub) +- goto err; +- +- open_and_resume(this, wind_fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, 0, 0); ++ OB_POST_FD(seek, this, frame, fd, trigger, fd, offset, what, xdata); + + return 0; + } + +-int ++static int32_t + ob_flush(call_frame_t 
*frame, xlator_t *this, fd_t *fd, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- ob_fd_t *ob_fd = NULL; +- gf_boolean_t unwind = _gf_false; +- +- LOCK(&fd->lock); +- { +- ob_fd = __ob_fd_ctx_get(this, fd); +- if (ob_fd && ob_fd->open_frame) +- /* if open() was never wound to backend, +- no need to wind flush() either. +- */ +- unwind = _gf_true; +- } +- UNLOCK(&fd->lock); +- +- if (unwind) +- goto unwind; +- +- stub = fop_flush_stub(frame, default_flush_resume, fd, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, 0); +- +- return 0; +- +-unwind: +- STACK_UNWIND_STRICT(flush, frame, 0, 0, 0); ++ OB_POST_FLUSH(this, frame, fd, fd, xdata); + + return 0; + } + +-int ++static int32_t + ob_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int flag, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fsync_stub(frame, default_fsync_resume, fd, flag, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, 0, 0, 0); ++ OB_POST_FD(fsync, this, frame, fd, true, fd, flag, xdata); + + return 0; + } + +-int ++static int32_t + ob_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, + struct gf_flock *flock, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_lk_stub(frame, default_lk_resume, fd, cmd, flock, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(lk, frame, -1, ENOMEM, 0, 0); ++ OB_POST_FD(lk, this, frame, fd, true, fd, cmd, flock, xdata); + + return 0; + } + +-int ++static int32_t + ob_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_ftruncate_stub(frame, default_ftruncate_resume, fd, offset, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(ftruncate, frame, -1, ENOMEM, 0, 0, 0); ++ OB_POST_FD(ftruncate, this, frame, fd, true, fd, offset, xdata); + + return 0; + } + +-int ++static int32_t + ob_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fsetxattr_stub(frame, default_fsetxattr_resume, fd, xattr, flags, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fsetxattr, frame, -1, ENOMEM, 0); ++ OB_POST_FD(fsetxattr, this, frame, fd, true, fd, xattr, flags, xdata); + + return 0; + } + +-int ++static int32_t + ob_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fgetxattr_stub(frame, default_fgetxattr_resume, fd, name, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fgetxattr, frame, -1, ENOMEM, 0, 0); ++ OB_POST_FD(fgetxattr, this, frame, fd, true, fd, name, xdata); + + return 0; + } + +-int ++static int32_t + ob_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fremovexattr_stub(frame, default_fremovexattr_resume, fd, name, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fremovexattr, frame, -1, ENOMEM, 0); ++ OB_POST_FD(fremovexattr, this, frame, 
fd, true, fd, name, xdata); + + return 0; + } + +-int ++static int32_t + ob_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int cmd, struct gf_flock *flock, dict_t *xdata) + { +- call_stub_t *stub = fop_finodelk_stub(frame, default_finodelk_resume, +- volume, fd, cmd, flock, xdata); +- if (stub) +- open_and_resume(this, fd, stub); +- else +- STACK_UNWIND_STRICT(finodelk, frame, -1, ENOMEM, 0); ++ OB_POST_FD(finodelk, this, frame, fd, true, volume, fd, cmd, flock, xdata); + + return 0; + } + +-int ++static int32_t + ob_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) + { +- call_stub_t *stub = fop_fentrylk_stub( +- frame, default_fentrylk_resume, volume, fd, basename, cmd, type, xdata); +- if (stub) +- open_and_resume(this, fd, stub); +- else +- STACK_UNWIND_STRICT(fentrylk, frame, -1, ENOMEM, 0); ++ OB_POST_FD(fentrylk, this, frame, fd, true, volume, fd, basename, cmd, type, ++ xdata); + + return 0; + } + +-int ++static int32_t + ob_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) + { +- call_stub_t *stub = fop_fxattrop_stub(frame, default_fxattrop_resume, fd, +- optype, xattr, xdata); +- if (stub) +- open_and_resume(this, fd, stub); +- else +- STACK_UNWIND_STRICT(fxattrop, frame, -1, ENOMEM, 0, 0); ++ OB_POST_FD(fxattrop, this, frame, fd, true, fd, optype, xattr, xdata); + + return 0; + } + +-int ++static int32_t + ob_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *iatt, + int valid, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fsetattr_stub(frame, default_fsetattr_resume, fd, iatt, valid, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fsetattr, frame, -1, ENOMEM, 0, 0, 0); ++ OB_POST_FD(fsetattr, this, frame, fd, true, fd, iatt, valid, xdata); + + return 0; + } + +-int ++static int32_t + ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) + { +- call_stub_t *stub; +- +- stub = fop_fallocate_stub(frame, default_fallocate_resume, fd, mode, offset, +- len, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); ++ OB_POST_FD(fallocate, this, frame, fd, true, fd, mode, offset, len, xdata); + + return 0; +-err: +- STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); +- return 0; + } + +-int ++static int32_t + ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) + { +- call_stub_t *stub; +- +- stub = fop_discard_stub(frame, default_discard_resume, fd, offset, len, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); ++ OB_POST_FD(discard, this, frame, fd, true, fd, offset, len, xdata); + + return 0; +-err: +- STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL); +- return 0; + } + +-int ++static int32_t + ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) + { +- call_stub_t *stub; +- +- stub = fop_zerofill_stub(frame, default_zerofill_resume, fd, offset, len, +- xdata); +- if (!stub) +- goto err; ++ OB_POST_FD(zerofill, this, frame, fd, true, fd, offset, len, xdata); + +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } + +-int ++static int32_t + 
ob_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_unlink_stub(frame, default_unlink_resume, loc, xflags, xdata); +- if (!stub) +- goto err; +- +- open_all_pending_fds_and_resume(this, loc->inode, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(unlink, frame, -1, ENOMEM, 0, 0, 0); ++ OB_POST_INODE(unlink, this, frame, loc->inode, true, loc, xflags, xdata); + + return 0; + } + +-int ++static int32_t + ob_rename(call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_rename_stub(frame, default_rename_resume, src, dst, xdata); +- if (!stub) +- goto err; +- +- open_all_pending_fds_and_resume(this, dst->inode, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(rename, frame, -1, ENOMEM, 0, 0, 0, 0, 0, 0); ++ OB_POST_INODE(rename, this, frame, dst->inode, true, src, dst, xdata); + + return 0; + } + +-int32_t ++static int32_t + ob_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_setattr_stub(frame, default_setattr_resume, loc, stbuf, valid, +- xdata); +- if (!stub) +- goto err; ++ OB_POST_INODE(setattr, this, frame, loc->inode, true, loc, stbuf, valid, ++ xdata); + +- open_all_pending_fds_and_resume(this, loc->inode, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(setattr, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } + +-int32_t ++static int32_t + ob_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- gf_boolean_t access_xattr = _gf_false; +- + if (dict_get(dict, POSIX_ACL_DEFAULT_XATTR) || + dict_get(dict, POSIX_ACL_ACCESS_XATTR) || +- dict_get(dict, GF_SELINUX_XATTR_KEY)) +- access_xattr = _gf_true; +- +- if (!access_xattr) ++ dict_get(dict, GF_SELINUX_XATTR_KEY)) { + return default_setxattr(frame, this, loc, dict, flags, xdata); ++ } + +- stub = fop_setxattr_stub(frame, default_setxattr_resume, loc, dict, flags, +- xdata); +- if (!stub) +- goto err; +- +- open_all_pending_fds_and_resume(this, loc->inode, stub); ++ OB_POST_INODE(setxattr, this, frame, loc->inode, true, loc, dict, flags, ++ xdata); + + return 0; +-err: +- STACK_UNWIND_STRICT(setxattr, frame, -1, ENOMEM, NULL); +- return 0; + } + +-int +-ob_release(xlator_t *this, fd_t *fd) ++static void ++ob_fdclose(xlator_t *this, fd_t *fd) + { +- ob_fd_t *ob_fd = NULL; ++ struct list_head list; ++ ob_inode_t *ob_inode; ++ call_stub_t *stub; ++ ++ INIT_LIST_HEAD(&list); ++ stub = NULL; + +- ob_fd = ob_fd_ctx_get(this, fd); ++ LOCK(&fd->inode->lock); ++ { ++ ob_inode = ob_inode_get_locked(this, fd->inode); ++ if (ob_inode != NULL) { ++ ob_inode->open_count--; ++ ++ /* If this fd is the same as ob_inode->first_fd, it means that ++ * the initial open has not fully completed. We'll try to cancel ++ * it. */ ++ if (ob_inode->first_fd == fd) { ++ if (ob_inode->first_open == OB_OPEN_PREPARING) { ++ /* In this case ob_open_dispatch() has not been called yet. ++ * We clear first_fd and first_open to allow that function ++ * to know that the open is not really needed. This also ++ * allows other requests to work as expected if they ++ * arrive before the dispatch function is called. If there ++ * are pending fops, we can directly process them here. ++ * (note that there shouldn't be any fd related fops, but ++ * if there are, it's fine if they fail). 
*/ ++ ob_inode->first_fd = NULL; ++ ob_inode->first_open = NULL; ++ ob_inode->triggered = false; ++ list_splice_init(&ob_inode->resume_fops, &list); ++ } else if (!ob_inode->triggered) { ++ /* If the open has already been dispatched, we can only ++ * cancel it if it has not been triggered. Otherwise we ++ * simply wait until it completes. While it's not triggered, ++ * first_open must be a valid stub and there can't be any ++ * pending fops. */ ++ GF_ASSERT((ob_inode->first_open != NULL) && ++ list_empty(&ob_inode->resume_fops)); ++ ++ ob_inode->first_fd = NULL; ++ stub = ob_inode->first_open; ++ ob_inode->first_open = NULL; ++ } ++ } ++ } ++ } ++ UNLOCK(&fd->inode->lock); + +- ob_fd_free(ob_fd); ++ if (stub != NULL) { ++ call_stub_destroy(stub); ++ fd_unref(fd); ++ } + +- return 0; ++ ob_resume_pending(&list); + } + + int + ob_forget(xlator_t *this, inode_t *inode) + { +- ob_inode_t *ob_inode = NULL; ++ ob_inode_t *ob_inode; + uint64_t value = 0; + +- inode_ctx_del(inode, this, &value); +- +- if (value) { ++ if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0)) { + ob_inode = (ob_inode_t *)(uintptr_t)value; +- ob_inode_free(ob_inode); ++ GF_FREE(ob_inode); + } + + return 0; +@@ -1153,20 +823,18 @@ ob_priv_dump(xlator_t *this) + int + ob_fdctx_dump(xlator_t *this, fd_t *fd) + { +- ob_fd_t *ob_fd = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; +- int ret = 0; ++ uint64_t value = 0; ++ int ret = 0, error = 0; + + ret = TRY_LOCK(&fd->lock); + if (ret) + return 0; + +- ob_fd = __ob_fd_ctx_get(this, fd); +- if (!ob_fd) { +- UNLOCK(&fd->lock); +- return 0; ++ if ((__fd_ctx_get(fd, this, &value) == 0) && (value != 0)) { ++ error = (int32_t)value; + } + + gf_proc_dump_build_key(key_prefix, "xlator.performance.open-behind", +@@ -1175,17 +843,7 @@ ob_fdctx_dump(xlator_t *this, fd_t *fd) + + gf_proc_dump_write("fd", "%p", fd); + +- gf_proc_dump_write("open_frame", "%p", ob_fd->open_frame); +- +- if (ob_fd->open_frame) +- gf_proc_dump_write("open_frame.root.unique", "%" PRIu64, +- ob_fd->open_frame->root->unique); +- +- gf_proc_dump_write("loc.path", "%s", ob_fd->loc.path); +- +- gf_proc_dump_write("loc.ino", "%s", uuid_utoa(ob_fd->loc.gfid)); +- +- gf_proc_dump_write("flags", "%d", ob_fd->flags); ++ gf_proc_dump_write("error", "%d", error); + + UNLOCK(&fd->lock); + +@@ -1307,7 +965,7 @@ struct xlator_fops fops = { + }; + + struct xlator_cbks cbks = { +- .release = ob_release, ++ .fdclose = ob_fdclose, + .forget = ob_forget, + }; + +-- +1.8.3.1 + diff --git a/SOURCES/0524-open-behind-fix-call_frame-leak.patch b/SOURCES/0524-open-behind-fix-call_frame-leak.patch new file mode 100644 index 0000000..75a243d --- /dev/null +++ b/SOURCES/0524-open-behind-fix-call_frame-leak.patch @@ -0,0 +1,70 @@ +From 36dddf59a02d91d3db5b124be626ab6bc235ed5a Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Wed, 19 Aug 2020 23:27:38 +0200 +Subject: [PATCH 524/526] open-behind: fix call_frame leak + +When an open was delayed, a copy of the frame was created because the +current frame was used to unwind the "fake" open. When the open was +actually sent, the frame was correctly destroyed. However if the file +was closed before needing to send the open, the frame was not destroyed. + +This patch correctly destroys the frame in all cases. 
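+
+In short, the fix centralizes the teardown in one helper that both the
+dispatch path and ob_fdclose() now call, so every frame created with
+copy_frame() for a delayed open is released exactly once. A minimal
+sketch of its shape (matching the diff below):
+
+    static void
+    ob_open_destroy(call_stub_t *stub, fd_t *fd)
+    {
+        /* release the frame copied for the delayed open */
+        STACK_DESTROY(stub->frame->root);
+        call_stub_destroy(stub);
+        fd_unref(fd);
+    }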
+ +Upstream patch: +> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/24892 +> Change-Id: I8c00fc7f15545c240e8151305d9e4cf06d653926 +> Signed-off-by: Xavi Hernandez +> Fixes: #1440 + +BUG: 1830713 +Change-Id: I8c00fc7f15545c240e8151305d9e4cf06d653926 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/224488 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/performance/open-behind/src/open-behind.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c +index e43fe73..1ab635e 100644 +--- a/xlators/performance/open-behind/src/open-behind.c ++++ b/xlators/performance/open-behind/src/open-behind.c +@@ -333,6 +333,14 @@ ob_stub_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, + return 0; + } + ++static void ++ob_open_destroy(call_stub_t *stub, fd_t *fd) ++{ ++ STACK_DESTROY(stub->frame->root); ++ call_stub_destroy(stub); ++ fd_unref(fd); ++} ++ + static int32_t + ob_open_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, + call_stub_t *stub) +@@ -355,8 +363,7 @@ ob_open_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, + + if (stub != NULL) { + if (closed) { +- call_stub_destroy(stub); +- fd_unref(fd); ++ ob_open_destroy(stub, fd); + } else { + call_resume(stub); + } +@@ -776,8 +783,7 @@ ob_fdclose(xlator_t *this, fd_t *fd) + UNLOCK(&fd->inode->lock); + + if (stub != NULL) { +- call_stub_destroy(stub); +- fd_unref(fd); ++ ob_open_destroy(stub, fd); + } + + ob_resume_pending(&list); +-- +1.8.3.1 + diff --git a/SOURCES/0525-open-behind-implement-create-fop.patch b/SOURCES/0525-open-behind-implement-create-fop.patch new file mode 100644 index 0000000..c7a5329 --- /dev/null +++ b/SOURCES/0525-open-behind-implement-create-fop.patch @@ -0,0 +1,109 @@ +From 41aae052b5e3afe64d3e0668643726bab0e77265 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Fri, 4 Sep 2020 14:49:50 +0200 +Subject: [PATCH 525/526] open-behind: implement create fop + +Open behind didn't implement create fop. This caused that files created +were not accounted for the number of open fd's. This could cause future +opens to be delayed when they shouldn't. + +This patch implements the create fop. It also fixes a problem when +destroying the stack: when frame->local was not NULL, STACK_DESTROY() +tried to mem_put() it, which is not correct. 
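+
+A minimal sketch of the stack-destroy fix (it mirrors the one-line
+change in the diff below): frame->local carries the ob_inode_t, which
+is not pool-allocated, so it has to be cleared before the frame is
+destroyed:
+
+    static void
+    ob_open_destroy(call_stub_t *stub, fd_t *fd)
+    {
+        /* local is not mem_pool memory; clear it so STACK_DESTROY()
+         * does not try to mem_put() it */
+        stub->frame->local = NULL;
+        STACK_DESTROY(stub->frame->root);
+        call_stub_destroy(stub);
+        fd_unref(fd);
+    }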
+ +Upstream patch: +> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/24953 +> Fixes: #1440 +> Change-Id: Ic982bad07d4af30b915d7eb1fbcef7a847a45869 +> Signed-off-by: Xavi Hernandez + +BUG: 1830713 +Change-Id: Ic982bad07d4af30b915d7eb1fbcef7a847a45869 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/224489 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/performance/open-behind/src/open-behind.c | 52 +++++++++++++++++++++++ + 1 file changed, 52 insertions(+) + +diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c +index 1ab635e..600c3b6 100644 +--- a/xlators/performance/open-behind/src/open-behind.c ++++ b/xlators/performance/open-behind/src/open-behind.c +@@ -336,6 +336,7 @@ ob_stub_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, + static void + ob_open_destroy(call_stub_t *stub, fd_t *fd) + { ++ stub->frame->local = NULL; + STACK_DESTROY(stub->frame->root); + call_stub_destroy(stub); + fd_unref(fd); +@@ -516,6 +517,56 @@ ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + } + + static int32_t ++ob_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, ++ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) ++{ ++ ob_inode_t *ob_inode; ++ call_stub_t *stub; ++ fd_t *first_fd; ++ ob_state_t state; ++ ++ /* Create requests are never delayed. We always send them synchronously. */ ++ state = ob_open_and_resume_fd(this, fd, 1, true, true, &ob_inode, ++ &first_fd); ++ if (state == OB_STATE_READY) { ++ /* There's no pending open, but there are other file descriptors opened ++ * so we simply forward the request synchronously. */ ++ return default_create(frame, this, loc, flags, mode, umask, fd, xdata); ++ } ++ ++ if (state == OB_STATE_OPEN_TRIGGERED) { ++ /* The first open is in progress (either because it was already issued ++ * or because this request triggered it). We try to create a new stub ++ * to retry the operation once the initial open completes. */ ++ stub = fop_create_stub(frame, ob_create, loc, flags, mode, umask, fd, ++ xdata); ++ if (stub != NULL) { ++ return ob_stub_dispatch(this, ob_inode, first_fd, stub); ++ } ++ ++ state = -ENOMEM; ++ } ++ ++ /* Since we forced a synchronous request, OB_STATE_FIRST_OPEN will never ++ * be returned by ob_open_and_resume_fd(). If we are here it can only be ++ * because there has been a problem. */ ++ ++ /* In case of failure we need to decrement the number of open files because ++ * ob_fdclose() won't be called. 
*/ ++ ++ LOCK(&fd->inode->lock); ++ { ++ ob_inode->open_count--; ++ } ++ UNLOCK(&fd->inode->lock); ++ ++ gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s", ++ "create", "path=%s", loc->path, NULL); ++ ++ return default_create_failure_cbk(frame, -state); ++} ++ ++static int32_t + ob_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) + { +@@ -946,6 +997,7 @@ fini(xlator_t *this) + + struct xlator_fops fops = { + .open = ob_open, ++ .create = ob_create, + .readv = ob_readv, + .writev = ob_writev, + .flush = ob_flush, +-- +1.8.3.1 + diff --git a/SOURCES/0526-Quota-quota_fsck.py-converting-byte-string-to-string.patch b/SOURCES/0526-Quota-quota_fsck.py-converting-byte-string-to-string.patch new file mode 100644 index 0000000..fb74fd8 --- /dev/null +++ b/SOURCES/0526-Quota-quota_fsck.py-converting-byte-string-to-string.patch @@ -0,0 +1,44 @@ +From baeca3c9b70548463ceea0ae27e6f98cf06e96b7 Mon Sep 17 00:00:00 2001 +From: srijan-sivakumar +Date: Tue, 28 Jul 2020 22:27:34 +0530 +Subject: [PATCH 526/526] Quota quota_fsck.py, converting byte string to string + +Issue: The quota_fsck.py script throws an TypeError +due to the fact that the data is read as bytes and then +the string operations are applied on the. Now, in python3 +string is unicode and hence we get the type error. + +Code Changes: +Decoding the bytes value into utf-8 format. + +>Change-Id: Ia1ff52a821d664a371c8166692ff506ae39f6e40 +>Signed-off-by: srijan-sivakumar +>Fixes: #1401 +Upstream patch: https://review.gluster.org/c/glusterfs/+/24785 + +BUG: 1719171 +Change-Id: Ia1ff52a821d664a371c8166692ff506ae39f6e40 +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/224780 +Tested-by: RHGS Build Bot +Reviewed-by: Kshithij Iyer +Reviewed-by: Rinku Kothiya +--- + extras/quota/quota_fsck.py | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/extras/quota/quota_fsck.py b/extras/quota/quota_fsck.py +index 174f2a2..ea8d638 100755 +--- a/extras/quota/quota_fsck.py ++++ b/extras/quota/quota_fsck.py +@@ -157,6 +157,7 @@ def get_quota_xattr_brick(dpath): + xattr_dict['parents'] = {} + + for xattr in pairs: ++ xattr = xattr.decode("utf-8") + xattr_key = xattr.split("=")[0] + if re.search("# file:", xattr_key): + # skip the file comment +-- +1.8.3.1 + diff --git a/SOURCES/0527-Events-Socket-creation-after-getaddrinfo-and-IPv4-an.patch b/SOURCES/0527-Events-Socket-creation-after-getaddrinfo-and-IPv4-an.patch new file mode 100644 index 0000000..133a24e --- /dev/null +++ b/SOURCES/0527-Events-Socket-creation-after-getaddrinfo-and-IPv4-an.patch @@ -0,0 +1,200 @@ +From 4152c77defac24ace3b1b6b9cc81a4f614254e4f Mon Sep 17 00:00:00 2001 +From: srijan-sivakumar +Date: Sat, 18 Jul 2020 05:59:09 +0530 +Subject: [PATCH 527/532] Events: Socket creation after getaddrinfo and IPv4 + and IPv6 packet capture + +Issue: Currently, the socket creation is done +prior to getaddrinfo function being invoked. This +can cause mismatch in the protocol and address +families of the created socket and the result +of the getaddrinfo api. Also, the glustereventsd +UDP server by default only captures IPv4 packets +hence IPv6 packets are not even captured. + +Code Changes: +1. Modified the socket creation in such a way that +the parameters taken in are dependent upon the +result of the getaddrinfo function. +2. Created a subclass for adding address family +in glustereventsd.py for both AF_INET and AF_INET6. +3. 
Modified addresses in the eventsapiconf.py.in + +Reasoning behind the approach: +1. If we are using getaddrinfo function then +socket creation should happen only after we +check if we received back valid addresses. +Hence socket creation should come after the call +to getaddrinfo +2. The listening server which pushes the events +to the webhook has to listen for both IPv4 +and IPv6 messages as we would not be sure as to +what address family is picked in _gf_event. + +>Fixes: #1377 +>Change-Id: I568dcd1a977c8832f0fef981e1f81cac7043c760 +>Signed-off-by: srijan-sivakumar +Upstream patch: https://review.gluster.org/c/glusterfs/+/24722 + +BUG: 1814744 +Change-Id: I568dcd1a977c8832f0fef981e1f81cac7043c760 +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/225567 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +--- + events/src/eventsapiconf.py.in | 2 ++ + events/src/glustereventsd.py | 37 ++++++++++++++++++++++++++++++------- + libglusterfs/src/events.c | 27 +++++++++++++++++++-------- + 3 files changed, 51 insertions(+), 15 deletions(-) + +diff --git a/events/src/eventsapiconf.py.in b/events/src/eventsapiconf.py.in +index 76b5954..700093b 100644 +--- a/events/src/eventsapiconf.py.in ++++ b/events/src/eventsapiconf.py.in +@@ -28,6 +28,8 @@ def get_glusterd_workdir(): + return glusterd_workdir + + SERVER_ADDRESS = "0.0.0.0" ++SERVER_ADDRESSv4 = "0.0.0.0" ++SERVER_ADDRESSv6 = "::1" + DEFAULT_CONFIG_FILE = "@SYSCONF_DIR@/glusterfs/eventsconfig.json" + CUSTOM_CONFIG_FILE_TO_SYNC = "/events/config.json" + CUSTOM_CONFIG_FILE = get_glusterd_workdir() + CUSTOM_CONFIG_FILE_TO_SYNC +diff --git a/events/src/glustereventsd.py b/events/src/glustereventsd.py +index c4c7b65..341a3b6 100644 +--- a/events/src/glustereventsd.py ++++ b/events/src/glustereventsd.py +@@ -13,6 +13,7 @@ + from __future__ import print_function + import sys + import signal ++import threading + try: + import socketserver + except ImportError: +@@ -23,10 +24,17 @@ from argparse import ArgumentParser, RawDescriptionHelpFormatter + from eventtypes import all_events + import handlers + import utils +-from eventsapiconf import SERVER_ADDRESS, PID_FILE ++from eventsapiconf import SERVER_ADDRESSv4, SERVER_ADDRESSv6, PID_FILE + from eventsapiconf import AUTO_BOOL_ATTRIBUTES, AUTO_INT_ATTRIBUTES + from utils import logger, PidFile, PidFileLockFailed, boolify + ++# Subclass so that specifically IPv4 packets are captured ++class UDPServerv4(socketserver.ThreadingUDPServer): ++ address_family = socket.AF_INET ++ ++# Subclass so that specifically IPv6 packets are captured ++class UDPServerv6(socketserver.ThreadingUDPServer): ++ address_family = socket.AF_INET6 + + class GlusterEventsRequestHandler(socketserver.BaseRequestHandler): + +@@ -89,6 +97,10 @@ def signal_handler_sigusr2(sig, frame): + utils.restart_webhook_pool() + + ++def UDP_server_thread(sock): ++ sock.serve_forever() ++ ++ + def init_event_server(): + utils.setup_logger() + utils.load_all() +@@ -99,15 +111,26 @@ def init_event_server(): + sys.stderr.write("Unable to get Port details from Config\n") + sys.exit(1) + +- # Start the Eventing Server, UDP Server ++ # Creating the Eventing Server, UDP Server for IPv4 packets ++ try: ++ serverv4 = UDPServerv4((SERVER_ADDRESSv4, port), ++ GlusterEventsRequestHandler) ++ except socket.error as e: ++ sys.stderr.write("Failed to start Eventsd for IPv4: {0}\n".format(e)) ++ sys.exit(1) ++ # Creating the Eventing Server, UDP Server for IPv6 packets + try: +- server = 
socketserver.ThreadingUDPServer( +- (SERVER_ADDRESS, port), +- GlusterEventsRequestHandler) ++ serverv6 = UDPServerv6((SERVER_ADDRESSv6, port), ++ GlusterEventsRequestHandler) + except socket.error as e: +- sys.stderr.write("Failed to start Eventsd: {0}\n".format(e)) ++ sys.stderr.write("Failed to start Eventsd for IPv6: {0}\n".format(e)) + sys.exit(1) +- server.serve_forever() ++ server_thread1 = threading.Thread(target=UDP_server_thread, ++ args=(serverv4,)) ++ server_thread2 = threading.Thread(target=UDP_server_thread, ++ args=(serverv6,)) ++ server_thread1.start() ++ server_thread2.start() + + + def get_args(): +diff --git a/libglusterfs/src/events.c b/libglusterfs/src/events.c +index 6d1e383..4d720ca 100644 +--- a/libglusterfs/src/events.c ++++ b/libglusterfs/src/events.c +@@ -40,6 +40,7 @@ _gf_event(eventtypes_t event, const char *fmt, ...) + char *host = NULL; + struct addrinfo hints; + struct addrinfo *result = NULL; ++ struct addrinfo *iter_result_ptr = NULL; + xlator_t *this = THIS; + char *volfile_server_transport = NULL; + +@@ -51,13 +52,6 @@ _gf_event(eventtypes_t event, const char *fmt, ...) + goto out; + } + +- /* Initialize UDP socket */ +- sock = socket(AF_INET, SOCK_DGRAM, 0); +- if (sock < 0) { +- ret = EVENT_ERROR_SOCKET; +- goto out; +- } +- + if (ctx) { + volfile_server_transport = ctx->cmd_args.volfile_server_transport; + } +@@ -66,7 +60,6 @@ _gf_event(eventtypes_t event, const char *fmt, ...) + } + + /* host = NULL returns localhost */ +- host = NULL; + if (ctx && ctx->cmd_args.volfile_server && + (strcmp(volfile_server_transport, "unix"))) { + /* If it is client code then volfile_server is set +@@ -84,6 +77,24 @@ _gf_event(eventtypes_t event, const char *fmt, ...) + goto out; + } + ++ // iterate over the result and break when socket creation is success. ++ for (iter_result_ptr = result; iter_result_ptr != NULL; ++ iter_result_ptr = iter_result_ptr->ai_next) { ++ sock = socket(iter_result_ptr->ai_family, iter_result_ptr->ai_socktype, ++ iter_result_ptr->ai_protocol); ++ if (sock != -1) { ++ break; ++ } ++ } ++ /* ++ * If none of the addrinfo structures lead to a successful socket ++ * creation, socket creation has failed. ++ */ ++ if (sock < 0) { ++ ret = EVENT_ERROR_SOCKET; ++ goto out; ++ } ++ + va_start(arguments, fmt); + ret = gf_vasprintf(&msg, fmt, arguments); + va_end(arguments); +-- +1.8.3.1 + diff --git a/SOURCES/0528-Extras-Removing-xattr_analysis-script.patch b/SOURCES/0528-Extras-Removing-xattr_analysis-script.patch new file mode 100644 index 0000000..d04068d --- /dev/null +++ b/SOURCES/0528-Extras-Removing-xattr_analysis-script.patch @@ -0,0 +1,134 @@ +From 3fc74ce6c282f0f43fdcfeda47b71a1b19945b6d Mon Sep 17 00:00:00 2001 +From: srijan-sivakumar +Date: Wed, 3 Feb 2021 10:11:04 +0530 +Subject: [PATCH 528/532] Extras: Removing xattr_analysis script + +The xattr_analysis.py script is used rarely for +debugging and seeing that it has some dependencies, +removing it from the release. + +If need be, it would be directly shared with the cu. 
+ +Label: DOWNSTREAM ONLY +BUG: 1719171 + +Change-Id: I4bb0df3ebfa7e43e13858b4b6e3efbb02ea79d5f +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/226301 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/quota/Makefile.am | 4 +-- + extras/quota/xattr_analysis.py | 73 ------------------------------------------ + glusterfs.spec.in | 1 - + 3 files changed, 2 insertions(+), 76 deletions(-) + delete mode 100755 extras/quota/xattr_analysis.py + +diff --git a/extras/quota/Makefile.am b/extras/quota/Makefile.am +index cdb6be1..e4d9322 100644 +--- a/extras/quota/Makefile.am ++++ b/extras/quota/Makefile.am +@@ -2,7 +2,7 @@ scriptsdir = $(datadir)/glusterfs/scripts + scripts_SCRIPTS = log_accounting.sh + + if WITH_SERVER +-scripts_SCRIPTS += xattr_analysis.py quota_fsck.py ++scripts_SCRIPTS += quota_fsck.py + endif + +-EXTRA_DIST = log_accounting.sh xattr_analysis.py quota_fsck.py ++EXTRA_DIST = log_accounting.sh quota_fsck.py +diff --git a/extras/quota/xattr_analysis.py b/extras/quota/xattr_analysis.py +deleted file mode 100755 +index 7bd7d96..0000000 +--- a/extras/quota/xattr_analysis.py ++++ /dev/null +@@ -1,73 +0,0 @@ +-#!/usr/bin/python3 +-# Below script has two purposes +-# 1. Display xattr of entire FS tree in a human readable form +-# 2. Display all the directory where contri and size mismatch. +-# (If there are any directory with contri and size mismatch that are not dirty +-# then that highlights a propagation issue) +-# The script takes only one input LOG _FILE generated from the command, +-# find | xargs getfattr -d -m. -e hex > log_gluster_xattr +- +-from __future__ import print_function +-import re +-import subprocess +-import sys +-from hurry.filesize import size +- +-if len(sys.argv) < 2: +- sys.exit('Usage: %s log_gluster_xattr \n' +- 'to generate log_gluster_xattr use: \n' +- 'find | xargs getfattr -d -m. 
-e hex > log_gluster_xattr' +- % sys.argv[0]) +-LOG_FILE=sys.argv[1] +- +-def get_quota_xattr_brick(): +- out = subprocess.check_output (["/usr/bin/cat", LOG_FILE]) +- pairs = out.splitlines() +- +- xdict = {} +- mismatch_size = [('====contri_size===', '====size====')] +- for xattr in pairs: +- k = xattr.split("=")[0] +- if re.search("# file:", k): +- print(xdict) +- filename=k +- print("=====" + filename + "=======") +- xdict = {} +- elif k is "": +- pass +- else: +- print(xattr) +- v = xattr.split("=")[1] +- if re.search("contri", k): +- if len(v) == 34: +- # for files size is obtained in iatt, file count should be 1, dir count=0 +- xdict['contri_file_count'] = int(v[18:34], 16) +- xdict['contri_dir_count'] = 0 +- else: +- xdict['contri_size'] = size(int(v[2:18], 16)) +- xdict['contri_file_count'] = int(v[18:34], 16) +- xdict['contri_dir_count'] = int(v[34:], 16) +- elif re.search("size", k): +- xdict['size'] = size(int(v[2:18], 16)) +- xdict['file_count'] = int(v[18:34], 16) +- xdict['dir_count'] = int(v[34:], 16) +- elif re.search("dirty", k): +- if v == '0x3000': +- xdict['dirty'] = False +- elif v == '0x3100': +- xdict['dirty'] = True +- elif re.search("limit_objects", k): +- xdict['limit_objects'] = int(v[2:18], 16) +- elif re.search("limit_set", k): +- xdict['limit_set'] = size(int(v[2:18], 16)) +- +- if 'size' in xdict and 'contri_size' in xdict and xdict['size'] != xdict['contri_size']: +- mismatch_size.append((xdict['contri_size'], xdict['size'], filename)) +- +- for values in mismatch_size: +- print(values) +- +- +-if __name__ == '__main__': +- get_quota_xattr_brick() +- +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 30d7162..2be7677 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -1380,7 +1380,6 @@ exit 0 + %if ( 0%{!?_without_server:1} ) + %files server + %doc extras/clear_xattrs.sh +-%{_datadir}/glusterfs/scripts/xattr_analysis.py* + %{_datadir}/glusterfs/scripts/quota_fsck.py* + # sysconf + %config(noreplace) %{_sysconfdir}/glusterfs +-- +1.8.3.1 + diff --git a/SOURCES/0529-geo-rep-prompt-should-work-for-ignore_deletes.patch b/SOURCES/0529-geo-rep-prompt-should-work-for-ignore_deletes.patch new file mode 100644 index 0000000..671451d --- /dev/null +++ b/SOURCES/0529-geo-rep-prompt-should-work-for-ignore_deletes.patch @@ -0,0 +1,75 @@ +From 1c7e96e73273b7891ea6ef0d768c2bf7ff5de7b0 Mon Sep 17 00:00:00 2001 +From: Shwetha K Acharya +Date: Thu, 4 Feb 2021 16:29:39 +0530 +Subject: [PATCH 529/532] geo-rep: prompt should work for ignore_deletes + +The python cli is intelligent enough to parse both "-" and "_" alike: + +Example: +geo-replication config updated successfully +sync_job 4 +geo-replication config updated successfully +gluster volume geo-replication primary 127.0.0.1::secondary config | grep sync_jobs +sync_jobs:5 + +Thus the prompt which appears after ignore-deletes true should +work for both ignore-deletes and ignore_deletes. 
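+
+A sketch of the resulting check in the cli parser (slightly
+de-parenthesized here; the exact change is in the diff below):
+
+    if ((!strcmp((char *)words[wordcount - 2], "ignore_deletes") ||
+         !strcmp((char *)words[wordcount - 2], "ignore-deletes")) &&
+        !strcmp((char *)words[wordcount - 1], "true")) {
+        /* raise the ~15 seconds delay confirmation prompt */
+    }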
+ +Label: DOWNSTREAM ONLY + +BUG: 1224906 +Change-Id: I89f854200a604d07d3ac6c374fe6d445ce9f22ca +Signed-off-by: Shwetha K Acharya +Reviewed-on: https://code.engineering.redhat.com/gerrit/226599 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + cli/src/cli-cmd-parser.c | 5 +++-- + tests/00-geo-rep/bug-1708603.t | 12 ++++++++++-- + 2 files changed, 13 insertions(+), 4 deletions(-) + +diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c +index 34f17c9..dda8979 100644 +--- a/cli/src/cli-cmd-parser.c ++++ b/cli/src/cli-cmd-parser.c +@@ -3107,8 +3107,9 @@ cli_cmd_gsync_set_parse(struct cli_state *state, const char **words, + if (!ret) + ret = dict_set_int32(dict, "type", type); + if (!ret && type == GF_GSYNC_OPTION_TYPE_CONFIG) { +- if (!strcmp((char *)words[wordcount - 2], "ignore-deletes") && +- !strcmp((char *)words[wordcount - 1], "true")) { ++ if ((((!strcmp((char *)words[wordcount - 2], "ignore_deletes")) || ++ (!strcmp((char *)words[wordcount - 2], "ignore-deletes")))) && ++ ((!strcmp((char *)words[wordcount - 1], "true")))) { + question = + "There exists ~15 seconds delay for the option to take" + " effect from stime of the corresponding brick. Please" +diff --git a/tests/00-geo-rep/bug-1708603.t b/tests/00-geo-rep/bug-1708603.t +index 26913f1..edafb48 100644 +--- a/tests/00-geo-rep/bug-1708603.t ++++ b/tests/00-geo-rep/bug-1708603.t +@@ -44,11 +44,19 @@ TEST glusterfs -s $H0 --volfile-id $GSV0 $M1 + #Create geo-rep session + TEST create_georep_session $master $slave + +-echo n | $GEOREP_CLI $master $slave config ignore-deletes true >/dev/null 2>&1 +-EXPECT "false" echo $($GEOREP_CLI $master $slave config ignore-deletes) ++echo n | $GEOREP_CLI $master $slave config ignore_deletes true >/dev/null 2>&1 ++EXPECT "false" echo $($GEOREP_CLI $master $slave config ignore_deletes) ++ ++echo y | $GEOREP_CLI $master $slave config ignore_deletes true ++EXPECT "true" echo $($GEOREP_CLI $master $slave config ignore_deletes) ++ ++$GEOREP_CLI $master $slave config ignore_deletes false + echo y | $GEOREP_CLI $master $slave config ignore-deletes true + EXPECT "true" echo $($GEOREP_CLI $master $slave config ignore-deletes) + ++echo n | $GEOREP_CLI $master $slave config ignore-deletes true >/dev/null 2>&1 ++EXPECT "true" echo $($GEOREP_CLI $master $slave config ignore-deletes) ++ + #Stop Geo-rep + TEST $GEOREP_CLI $master $slave stop + +-- +1.8.3.1 + diff --git a/SOURCES/0530-gfapi-avoid-crash-while-logging-message.patch b/SOURCES/0530-gfapi-avoid-crash-while-logging-message.patch new file mode 100644 index 0000000..aec73b7 --- /dev/null +++ b/SOURCES/0530-gfapi-avoid-crash-while-logging-message.patch @@ -0,0 +1,41 @@ +From 5a7348a266587704dae4f1ddda16b7c95f547251 Mon Sep 17 00:00:00 2001 +From: Rinku Kothiya +Date: Sun, 7 Feb 2021 13:40:24 +0000 +Subject: [PATCH 530/532] gfapi: avoid crash while logging message. + +Breaking parameter into two different parameter +to avoid a crash. 
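+
+Concretely (both calls are taken from the diff below), the single
+format-string argument is split into a plain message plus a separate
+"key=format" pair:
+
+    /* before: message and format mixed in one argument */
+    gf_smsg(THIS->name, GF_LOG_ERROR, errno, API_MSG_INVALID_ARG,
+            "size >= %llu is not allowed", GF_UNIT_GB, NULL);
+
+    /* after: plain message, size detail passed separately */
+    gf_smsg(THIS->name, GF_LOG_ERROR, errno, API_MSG_INVALID_ARG,
+            "Data size too large", "size=%llu", GF_UNIT_GB, NULL);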
+ +Upstream: +> Reviewed-on: https://github.com/gluster/glusterfs/pull/2139 +> fixes: #2138 +> Change-Id: Idd5f3631488c1d892748f83e6847fb6fd2d0802a +> Signed-off-by: Rinku Kothiya + +BUG: 1691320 + +Change-Id: Ifd6a96982ffd4e5334f8be2297de2ad826f3145b +Signed-off-by: Rinku Kothiya +Reviewed-on: https://code.engineering.redhat.com/gerrit/226851 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + api/src/glfs-fops.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c +index 051541f..6dc3b66 100644 +--- a/api/src/glfs-fops.c ++++ b/api/src/glfs-fops.c +@@ -1529,7 +1529,7 @@ glfs_pwritev_common(struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt, + ret = -1; + errno = EINVAL; + gf_smsg(THIS->name, GF_LOG_ERROR, errno, API_MSG_INVALID_ARG, +- "size >= %llu is not allowed", GF_UNIT_GB, NULL); ++ "Data size too large", "size=%llu", GF_UNIT_GB, NULL); + goto out; + } + +-- +1.8.3.1 + diff --git a/SOURCES/0531-Glustereventsd-Default-port-change-2091.patch b/SOURCES/0531-Glustereventsd-Default-port-change-2091.patch new file mode 100644 index 0000000..8c2ecbf --- /dev/null +++ b/SOURCES/0531-Glustereventsd-Default-port-change-2091.patch @@ -0,0 +1,69 @@ +From 058a853a1438b2a62586c545f71150ade3de23b7 Mon Sep 17 00:00:00 2001 +From: schaffung +Date: Wed, 10 Feb 2021 13:43:48 +0530 +Subject: [PATCH 531/532] Glustereventsd Default port change (#2091) + +Issue : The default port of glustereventsd is currently 24009 +which is preventing glustereventsd from binding to the UDP port +due to selinux policies. + +Fix: Changing the default port to be bound by chanding it to something +in the ephemeral range. + +>Fixes: #2080 +>Change-Id: Ibdc87f83f82f69660dca95d6d14b226e10d8bd33 +>Signed-off-by: srijan-sivakumar +Upstream Patch : https://github.com/gluster/glusterfs/pull/2091 + +BUG: 1814744 +Change-Id: Ibdc87f83f82f69660dca95d6d14b226e10d8bd33 +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/227249 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + events/src/eventsconfig.json | 2 +- + extras/firewalld/glusterfs.xml | 2 +- + libglusterfs/src/events.c | 2 +- + 3 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/events/src/eventsconfig.json b/events/src/eventsconfig.json +index 89e5b9c..14d8f84 100644 +--- a/events/src/eventsconfig.json ++++ b/events/src/eventsconfig.json +@@ -1,5 +1,5 @@ + { + "log-level": "INFO", +- "port": 24009, ++ "port": 55555, + "disable-events-log": false + } +diff --git a/extras/firewalld/glusterfs.xml b/extras/firewalld/glusterfs.xml +index 7e17644..dc74b2e 100644 +--- a/extras/firewalld/glusterfs.xml ++++ b/extras/firewalld/glusterfs.xml +@@ -4,7 +4,7 @@ + Default ports for gluster-distributed storage + + +- ++ + + + +diff --git a/libglusterfs/src/events.c b/libglusterfs/src/events.c +index 4d720ca..3659606 100644 +--- a/libglusterfs/src/events.c ++++ b/libglusterfs/src/events.c +@@ -26,7 +26,7 @@ + #include "glusterfs/events.h" + + #define EVENT_HOST "127.0.0.1" +-#define EVENT_PORT 24009 ++#define EVENT_PORT 55555 + + int + _gf_event(eventtypes_t event, const char *fmt, ...) 
+-- +1.8.3.1 + diff --git a/SOURCES/0532-glusterd-fix-for-starting-brick-on-new-port.patch b/SOURCES/0532-glusterd-fix-for-starting-brick-on-new-port.patch new file mode 100644 index 0000000..97e5aa7 --- /dev/null +++ b/SOURCES/0532-glusterd-fix-for-starting-brick-on-new-port.patch @@ -0,0 +1,79 @@ +From 2dad17fdbaab2ab2cda6a05dec9dcd2d37ea32ff Mon Sep 17 00:00:00 2001 +From: Nikhil Ladha +Date: Wed, 10 Feb 2021 15:07:32 +0530 +Subject: [PATCH 532/532] glusterd: fix for starting brick on new port + +The Errno set by the runner code was not correct when the bind() fails +to assign an already occupied port in the __socket_server_bind(). + +Fix: +Updated the code to return the correct errno from the +__socket_server_bind() if the bind() fails due to EADDRINUSE error. And, +use the returned errno from runner_run() to retry allocating a new port +to the brick process. + +>Fixes: #1101 + +>Change-Id: If124337f41344a04f050754e402490529ef4ecdc +>Signed-off-by: nik-redhat nladha@redhat.com + +Upstream patch: https://github.com/gluster/glusterfs/pull/2090 + +BUG: 1865796 + +Change-Id: If124337f41344a04f050754e402490529ef4ecdc +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/227261 +Tested-by: RHGS Build Bot +Reviewed-by: Xavi Hernandez Juan +--- + rpc/rpc-transport/socket/src/socket.c | 3 +++ + xlators/mgmt/glusterd/src/glusterd-utils.c | 6 ++---- + 2 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c +index 1ee7320..96ed9f1 100644 +--- a/rpc/rpc-transport/socket/src/socket.c ++++ b/rpc/rpc-transport/socket/src/socket.c +@@ -973,8 +973,11 @@ __socket_server_bind(rpc_transport_t *this) + this->myinfo.identifier, strerror(errno)); + if (errno == EADDRINUSE) { + gf_log(this->name, GF_LOG_ERROR, "Port is already in use"); ++ ret = -EADDRINUSE; ++ goto out; + } + } ++ + if (AF_UNIX != SA(&this->myinfo.sockaddr)->sa_family) { + if (getsockname(priv->sock, SA(&this->myinfo.sockaddr), + &this->myinfo.sockaddr_len) != 0) { +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index cf32bd9..bc188a2 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -2151,6 +2151,7 @@ glusterd_volume_start_glusterfs(glusterd_volinfo_t *volinfo, + ret = -1; + goto out; + } ++ + /* Build the exp_path, before starting the glusterfsd even in + valgrind mode. Otherwise all the glusterfsd processes start + writing the valgrind log to the same file. 
+@@ -2289,13 +2290,10 @@ retry: + + if (wait) { + synclock_unlock(&priv->big_lock); +- errno = 0; + ret = runner_run(&runner); +- if (errno != 0) +- ret = errno; + synclock_lock(&priv->big_lock); + +- if (ret == EADDRINUSE) { ++ if (ret == -EADDRINUSE) { + /* retry after getting a new port */ + gf_msg(this->name, GF_LOG_WARNING, -ret, + GD_MSG_SRC_BRICK_PORT_UNAVAIL, +-- +1.8.3.1 + diff --git a/SOURCES/0533-glusterd-Rebalance-cli-is-not-showing-correct-status.patch b/SOURCES/0533-glusterd-Rebalance-cli-is-not-showing-correct-status.patch new file mode 100644 index 0000000..158b4b7 --- /dev/null +++ b/SOURCES/0533-glusterd-Rebalance-cli-is-not-showing-correct-status.patch @@ -0,0 +1,250 @@ +From 854ab79dbef449c39adf66e3faebb4681359fce4 Mon Sep 17 00:00:00 2001 +From: mohit84 +Date: Thu, 18 Feb 2021 09:40:44 +0530 +Subject: [PATCH 533/538] glusterd: Rebalance cli is not showing correct status + after reboot (#2172) + +Rebalance cli is not showing correct status after reboot. + +The CLI is not correct status because defrag object is not +valid at the time of creating a rpc connection to show the status. +The defrag object is not valid because at the time of start a glusterd +glusterd_restart_rebalance can be call almost at the same time by two +different synctask and glusterd got a disconnect on rpc object and it +cleanup the defrag object. + +Solution: To avoid the defrag object populate a reference count before + create a defrag rpc object. +>Fixes: #1339 +>Signed-off-by: Mohit Agrawal +>Change-Id: Ia284015d79beaa3d703ebabb92f26870a5aaafba +Upstream Patch : https://github.com/gluster/glusterfs/pull/2172 + +BUG: 1832306 +Change-Id: Ia284015d79beaa3d703ebabb92f26870a5aaafba +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/228249 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-rebalance.c | 35 ++++++++++----- + xlators/mgmt/glusterd/src/glusterd-syncop.c | 1 + + xlators/mgmt/glusterd/src/glusterd-utils.c | 59 +++++++++++++++++++++++++- + xlators/mgmt/glusterd/src/glusterd-utils.h | 5 +++ + xlators/mgmt/glusterd/src/glusterd.h | 1 + + 5 files changed, 90 insertions(+), 11 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c +index b419a89..fcd5318 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c ++++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c +@@ -86,6 +86,7 @@ __glusterd_defrag_notify(struct rpc_clnt *rpc, void *mydata, + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + int pid = -1; ++ int refcnt = 0; + + this = THIS; + if (!this) +@@ -125,11 +126,12 @@ __glusterd_defrag_notify(struct rpc_clnt *rpc, void *mydata, + } + + case RPC_CLNT_DISCONNECT: { +- if (!defrag->connected) +- return 0; +- + LOCK(&defrag->lock); + { ++ if (!defrag->connected) { ++ UNLOCK(&defrag->lock); ++ return 0; ++ } + defrag->connected = 0; + } + UNLOCK(&defrag->lock); +@@ -146,11 +148,11 @@ __glusterd_defrag_notify(struct rpc_clnt *rpc, void *mydata, + glusterd_defrag_rpc_put(defrag); + if (defrag->cbk_fn) + defrag->cbk_fn(volinfo, volinfo->rebal.defrag_status); +- +- GF_FREE(defrag); ++ refcnt = glusterd_defrag_unref(defrag); + gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_REBALANCE_DISCONNECTED, +- "Rebalance process for volume %s has disconnected.", +- volinfo->volname); ++ "Rebalance process for volume %s has disconnected" ++ " and defrag refcnt is %d.", ++ volinfo->volname, refcnt); + break; + } + case 
RPC_CLNT_DESTROY: +@@ -309,7 +311,11 @@ glusterd_handle_defrag_start(glusterd_volinfo_t *volinfo, char *op_errstr, + gf_msg_debug("glusterd", 0, "rebalance command failed"); + goto out; + } +- ++ /* Take reference before sleep to save defrag object cleanup while ++ glusterd_restart_rebalance call for other bricks by syncktask ++ at the time of restart a glusterd. ++ */ ++ glusterd_defrag_ref(defrag); + sleep(5); + + ret = glusterd_rebalance_rpc_create(volinfo); +@@ -372,6 +378,7 @@ glusterd_rebalance_rpc_create(glusterd_volinfo_t *volinfo) + GF_ASSERT(this); + priv = this->private; + GF_ASSERT(priv); ++ struct rpc_clnt *rpc = NULL; + + // rebalance process is not started + if (!defrag) +@@ -396,13 +403,21 @@ glusterd_rebalance_rpc_create(glusterd_volinfo_t *volinfo) + } + + glusterd_volinfo_ref(volinfo); +- ret = glusterd_rpc_create(&defrag->rpc, options, glusterd_defrag_notify, +- volinfo, _gf_true); ++ ret = glusterd_rpc_create(&rpc, options, glusterd_defrag_notify, volinfo, ++ _gf_false); + if (ret) { + gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_RPC_CREATE_FAIL, + "Glusterd RPC creation failed"); + goto out; + } ++ LOCK(&defrag->lock); ++ { ++ if (!defrag->rpc) ++ defrag->rpc = rpc; ++ else ++ rpc_clnt_unref(rpc); ++ } ++ UNLOCK(&defrag->lock); + ret = 0; + out: + if (options) +diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c +index df78fef..05c9e11 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-syncop.c ++++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c +@@ -1732,6 +1732,7 @@ gd_brick_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict, + if (!rpc) { + if (pending_node->type == GD_NODE_REBALANCE && pending_node->node) { + volinfo = pending_node->node; ++ glusterd_defrag_ref(volinfo->rebal.defrag); + ret = glusterd_rebalance_rpc_create(volinfo); + if (ret) { + ret = 0; +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index bc188a2..9fb8eab 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -93,6 +93,44 @@ + #define NLMV4_VERSION 4 + #define NLMV1_VERSION 1 + ++int ++glusterd_defrag_ref(glusterd_defrag_info_t *defrag) ++{ ++ int refcnt = 0; ++ ++ if (!defrag) ++ goto out; ++ ++ LOCK(&defrag->lock); ++ { ++ refcnt = ++defrag->refcnt; ++ } ++ UNLOCK(&defrag->lock); ++ ++out: ++ return refcnt; ++} ++ ++int ++glusterd_defrag_unref(glusterd_defrag_info_t *defrag) ++{ ++ int refcnt = -1; ++ ++ if (!defrag) ++ goto out; ++ ++ LOCK(&defrag->lock); ++ { ++ refcnt = --defrag->refcnt; ++ if (refcnt <= 0) ++ GF_FREE(defrag); ++ } ++ UNLOCK(&defrag->lock); ++ ++out: ++ return refcnt; ++} ++ + gf_boolean_t + is_brick_mx_enabled(void) + { +@@ -9370,6 +9408,7 @@ glusterd_volume_defrag_restart(glusterd_volinfo_t *volinfo, char *op_errstr, + char pidfile[PATH_MAX] = ""; + int ret = -1; + pid_t pid = 0; ++ int refcnt = 0; + + this = THIS; + GF_ASSERT(this); +@@ -9410,7 +9449,25 @@ glusterd_volume_defrag_restart(glusterd_volinfo_t *volinfo, char *op_errstr, + volinfo->volname); + goto out; + } +- ret = glusterd_rebalance_rpc_create(volinfo); ++ refcnt = glusterd_defrag_ref(volinfo->rebal.defrag); ++ /* If refcnt value is 1 it means either defrag object is ++ poulated by glusterd_rebalance_defrag_init or previous ++ rpc creation was failed.If it is not 1 it means it(defrag) ++ was populated at the time of start a rebalance daemon. 
++ We need to create a rpc object only while a previous ++ rpc connection was not established successfully at the ++ time of restart a rebalance daemon by ++ glusterd_handle_defrag_start otherwise rebalance cli ++ does not show correct status after just reboot a node and try ++ to print the rebalance status because defrag object has been ++ destroyed during handling of rpc disconnect. ++ */ ++ if (refcnt == 1) { ++ ret = glusterd_rebalance_rpc_create(volinfo); ++ } else { ++ ret = 0; ++ glusterd_defrag_unref(volinfo->rebal.defrag); ++ } + break; + } + case GF_DEFRAG_STATUS_NOT_STARTED: +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h +index 02d85d2..4541471 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.h ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.h +@@ -886,4 +886,9 @@ int32_t + glusterd_check_brick_order(dict_t *dict, char *err_str, int32_t type, + int32_t sub_count); + ++int ++glusterd_defrag_ref(glusterd_defrag_info_t *defrag); ++ ++int ++glusterd_defrag_unref(glusterd_defrag_info_t *defrag); + #endif +diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h +index efe4d0e..9de3f28 100644 +--- a/xlators/mgmt/glusterd/src/glusterd.h ++++ b/xlators/mgmt/glusterd/src/glusterd.h +@@ -321,6 +321,7 @@ struct glusterd_defrag_info_ { + uint64_t total_data; + uint64_t num_files_lookedup; + uint64_t total_failures; ++ int refcnt; + gf_lock_t lock; + int cmd; + pthread_t th; +-- +1.8.3.1 + diff --git a/SOURCES/0534-glusterd-Resolve-use-after-free-bug-2181.patch b/SOURCES/0534-glusterd-Resolve-use-after-free-bug-2181.patch new file mode 100644 index 0000000..2dc72c1 --- /dev/null +++ b/SOURCES/0534-glusterd-Resolve-use-after-free-bug-2181.patch @@ -0,0 +1,47 @@ +From b3647eb5415b2e3d9e1a11ad6c4689e520f17b39 Mon Sep 17 00:00:00 2001 +From: mohit84 +Date: Mon, 22 Feb 2021 10:09:34 +0530 +Subject: [PATCH 534/538] glusterd: Resolve use after free bug (#2181) + +In the commit 61ae58e67567ea4de8f8efc6b70a9b1f8e0f1bea +introduced a coverity bug use object after cleanup +the object. 
+ +Cleanup memory after comeout from a critical section +>Fixes: #2180 + +>Change-Id: Iee2050c4883a0dd44b8523bb822b664462ab6041 +>Signed-off-by: Mohit Agrawal +Upstream Patch : https://github.com/gluster/glusterfs/pull/2181 + +BUG: 1832306 +Change-Id: Iee2050c4883a0dd44b8523bb822b664462ab6041 +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/228578 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-utils.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 9fb8eab..6d40be5 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -122,11 +122,10 @@ glusterd_defrag_unref(glusterd_defrag_info_t *defrag) + LOCK(&defrag->lock); + { + refcnt = --defrag->refcnt; +- if (refcnt <= 0) +- GF_FREE(defrag); + } + UNLOCK(&defrag->lock); +- ++ if (refcnt <= 0) ++ GF_FREE(defrag); + out: + return refcnt; + } +-- +1.8.3.1 + diff --git a/SOURCES/0535-multiple-files-use-dict_allocate_and_serialize-where.patch b/SOURCES/0535-multiple-files-use-dict_allocate_and_serialize-where.patch new file mode 100644 index 0000000..e1622de --- /dev/null +++ b/SOURCES/0535-multiple-files-use-dict_allocate_and_serialize-where.patch @@ -0,0 +1,270 @@ +From 775d500cd136bd8c940faaeffde1217c25a87e3d Mon Sep 17 00:00:00 2001 +From: Yaniv Kaul +Date: Sun, 2 Jun 2019 21:14:18 +0300 +Subject: [PATCH 535/538] (multiple files) use dict_allocate_and_serialize() + where applicable. + +This function does length, allocation and serialization for you. + +Upstream patch: +> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/22800 +> Change-Id: I142a259952a2fe83dd719442afaefe4a43a8e55e +> updates: bz#1193929 +> Signed-off-by: Yaniv Kaul + +Change-Id: I142a259952a2fe83dd719442afaefe4a43a8e55e +BUG: 1911292 +Signed-off-by: Yaniv Kaul +Reviewed-on: https://code.engineering.redhat.com/gerrit/228611 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-inode-read.c | 34 +++++--------------------- + xlators/cluster/ec/src/ec-combine.c | 16 +++--------- + xlators/features/locks/src/posix.c | 23 +++-------------- + xlators/protocol/client/src/client-handshake.c | 14 +++-------- + xlators/protocol/server/src/server-handshake.c | 24 +++++++----------- + xlators/protocol/server/src/server-helpers.c | 27 +++----------------- + 6 files changed, 28 insertions(+), 110 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c +index 523a5b4..cf305af 100644 +--- a/xlators/cluster/afr/src/afr-inode-read.c ++++ b/xlators/cluster/afr/src/afr-inode-read.c +@@ -948,24 +948,13 @@ unlock: + goto unwind; + } + +- len = dict_serialized_length(local->dict); +- if (len <= 0) { +- goto unwind; +- } +- +- lockinfo_buf = GF_CALLOC(1, len, gf_common_mt_char); +- if (!lockinfo_buf) { ++ op_ret = dict_allocate_and_serialize( ++ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); ++ if (op_ret != 0) { + local->op_ret = -1; +- local->op_errno = ENOMEM; + goto unwind; + } + +- op_ret = dict_serialize(local->dict, lockinfo_buf); +- if (op_ret < 0) { +- local->op_ret = -1; +- local->op_errno = -op_ret; +- } +- + op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { +@@ -1064,24 +1053,13 @@ unlock: + goto unwind; + } + +- 
len = dict_serialized_length(local->dict); +- if (len <= 0) { +- goto unwind; +- } +- +- lockinfo_buf = GF_CALLOC(1, len, gf_common_mt_char); +- if (!lockinfo_buf) { ++ op_ret = dict_allocate_and_serialize( ++ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); ++ if (op_ret != 0) { + local->op_ret = -1; +- local->op_errno = ENOMEM; + goto unwind; + } + +- op_ret = dict_serialize(local->dict, lockinfo_buf); +- if (op_ret < 0) { +- local->op_ret = -1; +- local->op_errno = -op_ret; +- } +- + op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { +diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c +index 99e5534..9d712b3 100644 +--- a/xlators/cluster/ec/src/ec-combine.c ++++ b/xlators/cluster/ec/src/ec-combine.c +@@ -486,22 +486,12 @@ ec_dict_data_merge(ec_cbk_data_t *cbk, int32_t which, char *key) + + tmp = NULL; + +- len = dict_serialized_length(lockinfo); +- if (len < 0) { +- err = len; +- +- goto out; +- } +- ptr = GF_MALLOC(len, gf_common_mt_char); +- if (ptr == NULL) { +- err = -ENOMEM; +- +- goto out; +- } +- err = dict_serialize(lockinfo, ptr); ++ err = dict_allocate_and_serialize(lockinfo, (char **)&ptr, ++ (unsigned int *)&len); + if (err != 0) { + goto out; + } ++ + dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; + err = dict_set_dynptr(dict, key, ptr, len); + if (err != 0) { +diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c +index 5ae0125..cdd1ff7 100644 +--- a/xlators/features/locks/src/posix.c ++++ b/xlators/features/locks/src/posix.c +@@ -1547,8 +1547,9 @@ pl_fgetxattr_handle_lockinfo(xlator_t *this, fd_t *fd, dict_t *dict, + goto out; + } + +- len = dict_serialized_length(tmp); +- if (len < 0) { ++ op_ret = dict_allocate_and_serialize(tmp, (char **)&buf, ++ (unsigned int *)&len); ++ if (op_ret != 0) { + *op_errno = -op_ret; + op_ret = -1; + gf_log(this->name, GF_LOG_WARNING, +@@ -1558,24 +1559,6 @@ pl_fgetxattr_handle_lockinfo(xlator_t *this, fd_t *fd, dict_t *dict, + goto out; + } + +- buf = GF_CALLOC(1, len, gf_common_mt_char); +- if (buf == NULL) { +- op_ret = -1; +- *op_errno = ENOMEM; +- goto out; +- } +- +- op_ret = dict_serialize(tmp, buf); +- if (op_ret < 0) { +- *op_errno = -op_ret; +- op_ret = -1; +- gf_log(this->name, GF_LOG_WARNING, +- "dict_serialize failed (%s) while handling lockinfo " +- "for fd (ptr: %p inode-gfid:%s)", +- strerror(*op_errno), fd, uuid_utoa(fd->inode->gfid)); +- goto out; +- } +- + op_ret = dict_set_dynptr(dict, GF_XATTR_LOCKINFO_KEY, buf, len); + if (op_ret < 0) { + *op_errno = -op_ret; +diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c +index 0002361..6b20d92 100644 +--- a/xlators/protocol/client/src/client-handshake.c ++++ b/xlators/protocol/client/src/client-handshake.c +@@ -1286,18 +1286,10 @@ client_setvolume(xlator_t *this, struct rpc_clnt *rpc) + "Failed to set client opversion in handshake message"); + } + +- ret = dict_serialized_length(options); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, PC_MSG_DICT_ERROR, +- "failed to get serialized length of dict"); ++ ret = dict_allocate_and_serialize(options, (char **)&req.dict.dict_val, ++ &req.dict.dict_len); ++ if (ret != 0) { + ret = -1; +- goto fail; +- } +- req.dict.dict_len = ret; +- req.dict.dict_val = GF_CALLOC(1, req.dict.dict_len, +- gf_client_mt_clnt_req_buf_t); +- ret = dict_serialize(options, req.dict.dict_val); +- if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 
PC_MSG_DICT_SERIALIZE_FAIL, + "failed to serialize " + "dictionary"); +diff --git a/xlators/protocol/server/src/server-handshake.c b/xlators/protocol/server/src/server-handshake.c +index eeca73c..54dc030 100644 +--- a/xlators/protocol/server/src/server-handshake.c ++++ b/xlators/protocol/server/src/server-handshake.c +@@ -676,22 +676,16 @@ fail: + GF_ASSERT(rsp); + + rsp->op_ret = 0; +- ret = dict_serialized_length(reply); +- if (ret > 0) { +- rsp->dict.dict_len = ret; +- rsp->dict.dict_val = GF_CALLOC(1, rsp->dict.dict_len, +- gf_server_mt_rsp_buf_t); +- if (rsp->dict.dict_val) { +- ret = dict_serialize(reply, rsp->dict.dict_val); +- if (ret < 0) { +- gf_msg_debug("server-handshake", 0, +- "failed " +- "to serialize reply dict"); +- op_ret = -1; +- op_errno = -ret; +- } +- } ++ ++ ret = dict_allocate_and_serialize(reply, (char **)&rsp->dict.dict_val, ++ &rsp->dict.dict_len); ++ if (ret != 0) { ++ ret = -1; ++ gf_msg_debug("server-handshake", 0, "failed to serialize reply dict"); ++ op_ret = -1; ++ op_errno = -ret; + } ++ + rsp->op_ret = op_ret; + rsp->op_errno = gf_errno_to_error(op_errno); + +diff --git a/xlators/protocol/server/src/server-helpers.c b/xlators/protocol/server/src/server-helpers.c +index e74a24d..33959b5 100644 +--- a/xlators/protocol/server/src/server-helpers.c ++++ b/xlators/protocol/server/src/server-helpers.c +@@ -902,7 +902,6 @@ serialize_rsp_direntp(gf_dirent_t *entries, gfs3_readdirp_rsp *rsp) + gfs3_dirplist *trav = NULL; + gfs3_dirplist *prev = NULL; + int ret = -1; +- int temp = 0; + + GF_VALIDATE_OR_GOTO("server", entries, out); + GF_VALIDATE_OR_GOTO("server", rsp, out); +@@ -923,28 +922,10 @@ serialize_rsp_direntp(gf_dirent_t *entries, gfs3_readdirp_rsp *rsp) + + /* if 'dict' is present, pack it */ + if (entry->dict) { +- temp = dict_serialized_length(entry->dict); +- +- if (temp < 0) { +- gf_msg(THIS->name, GF_LOG_ERROR, EINVAL, PS_MSG_INVALID_ENTRY, +- "failed to get " +- "serialized length of reply dict"); +- errno = EINVAL; +- trav->dict.dict_len = 0; +- goto out; +- } +- trav->dict.dict_len = temp; +- +- trav->dict.dict_val = GF_CALLOC(1, trav->dict.dict_len, +- gf_server_mt_rsp_buf_t); +- if (!trav->dict.dict_val) { +- errno = ENOMEM; +- trav->dict.dict_len = 0; +- goto out; +- } +- +- ret = dict_serialize(entry->dict, trav->dict.dict_val); +- if (ret < 0) { ++ ret = dict_allocate_and_serialize(entry->dict, ++ (char **)&trav->dict.dict_val, ++ &trav->dict.dict_len); ++ if (ret != 0) { + gf_msg(THIS->name, GF_LOG_ERROR, 0, PS_MSG_DICT_SERIALIZE_FAIL, + "failed to serialize reply dict"); + errno = -ret; +-- +1.8.3.1 + diff --git a/SOURCES/0536-dht-Ongoing-IO-is-failed-during-volume-shrink-operat.patch b/SOURCES/0536-dht-Ongoing-IO-is-failed-during-volume-shrink-operat.patch new file mode 100644 index 0000000..94e0b64 --- /dev/null +++ b/SOURCES/0536-dht-Ongoing-IO-is-failed-during-volume-shrink-operat.patch @@ -0,0 +1,102 @@ +From 32281b4b5cf79d0ef6f0c65775bb81093e1ba479 Mon Sep 17 00:00:00 2001 +From: Mohit Agrawal +Date: Wed, 24 Feb 2021 18:44:12 +0530 +Subject: [PATCH 536/538] dht: Ongoing IO is failed during volume shrink + operation (#2188) + +In the commit (c878174) we have introduced a check +to avoid stale layout issue.To avoid a stale layout +issue dht has set a key along with layout at the time +of wind a create fop and posix validates the parent +layout based on the key value. 
If layout does not match,
+it throws an error. In case of a volume shrink, the layout has
+been changed by the rebalance daemon, and if the layout does not
+match, dht is not able to wind a create fop successfully.
+
+Solution: To avoid the issue, populate the key only when
+          dht winds the fop for the first time. After getting an
+          error on the 2nd attempt, dht takes a lock and then
+          reattempts to wind the fop again.
+
+> Fixes: #2187
+> Change-Id: Ie018386e7823a11eea415496bb226ca032453a55
+> Signed-off-by: Mohit Agrawal
+> (Cherry pick from commit da6ce622b722f7d12619c5860293faf03f7cd00c
+> Reviewed on upstream link https://github.com/gluster/glusterfs/pull/2188
+
+Bug: 1924044
+Change-Id: I7670dbe2d562b83db0af3753f994653ffdd49591
+Signed-off-by: Mohit Agrawal
+Reviewed-on: https://code.engineering.redhat.com/gerrit/228941
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/cluster/dht/src/dht-common.c | 41 ++++++++++++++++++++++++++----------
+ 1 file changed, 30 insertions(+), 11 deletions(-)
+
+diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
+index fe1d0ee..7425c1a 100644
+--- a/xlators/cluster/dht/src/dht-common.c
++++ b/xlators/cluster/dht/src/dht-common.c
+@@ -8526,15 +8526,32 @@ dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
+ {
+     dht_local_t *local = NULL;
+     xlator_t *avail_subvol = NULL;
++    int lk_count = 0;
+
+     local = frame->local;
+
+     if (!dht_is_subvol_filled(this, subvol)) {
+-        gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+-                     subvol->name);
+-
+-        dht_set_parent_layout_in_dict(loc, this, local);
+-
++        lk_count = local->lock[0].layout.parent_layout.lk_count;
++        gf_msg_debug(this->name, 0, "creating %s on %s with lock_count %d",
++                     loc->path, subvol->name, lk_count);
++        /*The function dht_set_parent_layout_in_dict sets the layout
++          in dictionary and posix_create validates a layout before
++          creating a file.In case if parent layout does not match
++          with disk layout posix xlator throw an error but in case
++          if volume is shrunk layout has been changed by rebalance daemon
++          so we need to call this function only while a function is calling
++          without taking any lock otherwise we would not able to populate a
++          layout on disk in case if layout has changed.
++ */ ++ if (!lk_count) { ++ dht_set_parent_layout_in_dict(loc, this, local); ++ } else { ++ /* Delete a key to avoid layout validate if it was set by ++ previous STACK_WIND attempt when a lock was not taken ++ by dht_create ++ */ ++ (void)dict_del_sizen(local->params, GF_PREOP_PARENT_KEY); ++ } + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, loc, flags, mode, umask, fd, + params); +@@ -8554,12 +8571,14 @@ dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this, + + goto out; + } +- +- gf_msg_debug(this->name, 0, "creating %s on %s", loc->path, +- subvol->name); +- +- dht_set_parent_layout_in_dict(loc, this, local); +- ++ lk_count = local->lock[0].layout.parent_layout.lk_count; ++ gf_msg_debug(this->name, 0, "creating %s on %s with lk_count %d", ++ loc->path, subvol->name, lk_count); ++ if (!lk_count) { ++ dht_set_parent_layout_in_dict(loc, this, local); ++ } else { ++ (void)dict_del_sizen(local->params, GF_PREOP_PARENT_KEY); ++ } + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, loc, flags, mode, umask, fd, + params); +-- +1.8.3.1 + diff --git a/SOURCES/0537-cluster-afr-Fix-race-in-lockinfo-f-getxattr.patch b/SOURCES/0537-cluster-afr-Fix-race-in-lockinfo-f-getxattr.patch new file mode 100644 index 0000000..dcf0940 --- /dev/null +++ b/SOURCES/0537-cluster-afr-Fix-race-in-lockinfo-f-getxattr.patch @@ -0,0 +1,387 @@ +From 7b7ec67680415c22773ebb2a5daacf298b6b1e06 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Sat, 13 Feb 2021 18:37:32 +0100 +Subject: [PATCH 537/538] cluster/afr: Fix race in lockinfo (f)getxattr + +A shared dictionary was updated outside the lock after having updated +the number of remaining answers. This means that one thread may be +processing the last answer and unwinding the request before another +thread completes updating the dict. + + Thread 1 Thread 2 + + LOCK() + call_cnt-- (=1) + UNLOCK() + LOCK() + call_cnt-- (=0) + UNLOCK() + update_dict(dict) + if (call_cnt == 0) { + STACK_UNWIND(dict); + } + update_dict(dict) + if (call_cnt == 0) { + STACK_UNWIND(dict); + } + +The updates from thread 1 are lost. + +This patch also reduces the work done inside the locked region and +reduces code duplication. 
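To see the fixed ordering in isolation, here is a minimal standalone sketch in C (gather_t and gather_answer are invented names; the real code merges GlusterFS dicts and uses uatomic counters): each answer publishes its contribution to the shared result before the pending counter is decremented, so the thread that observes zero is guaranteed to see every update.

    #include <pthread.h>

    typedef struct gather {
        pthread_mutex_t lock;
        int pending; /* answers still expected */
        int merged;  /* stands in for the shared reply dict */
    } gather_t;

    /* Returns 1 when the caller handled the last answer and must unwind. */
    int
    gather_answer(gather_t *g, int value)
    {
        int last;

        pthread_mutex_lock(&g->lock);
        g->merged += value;         /* publish the update first...   */
        last = (--g->pending == 0); /* ...then decrement the counter */
        pthread_mutex_unlock(&g->lock);

        return last; /* only the last caller may read g->merged */
    }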
+ +Upstream-patch: +> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2162 +> Fixes: #2161 +> Change-Id: Idc0d34ab19ea6031de0641f7b05c624d90fac8fa +> Signed-off-by: Xavi Hernandez + +BUG: 1911292 +Change-Id: Idc0d34ab19ea6031de0641f7b05c624d90fac8fa +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/228924 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-inode-read.c | 254 ++++++++++++++----------------- + 1 file changed, 112 insertions(+), 142 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c +index cf305af..98e195a 100644 +--- a/xlators/cluster/afr/src/afr-inode-read.c ++++ b/xlators/cluster/afr/src/afr-inode-read.c +@@ -15,6 +15,8 @@ + #include + #include + ++#include ++ + #include + #include "afr.h" + #include +@@ -868,188 +870,121 @@ afr_getxattr_quota_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + return 0; + } + +-int32_t +-afr_getxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *dict, +- dict_t *xdata) ++static int32_t ++afr_update_local_dicts(call_frame_t *frame, dict_t *dict, dict_t *xdata) + { +- int call_cnt = 0, len = 0; +- char *lockinfo_buf = NULL; +- dict_t *lockinfo = NULL, *newdict = NULL; +- afr_local_t *local = NULL; ++ afr_local_t *local; ++ dict_t *local_dict; ++ dict_t *local_xdata; ++ int32_t ret; + +- LOCK(&frame->lock); +- { +- local = frame->local; ++ local = frame->local; ++ local_dict = NULL; ++ local_xdata = NULL; + +- call_cnt = --local->call_count; ++ ret = -ENOMEM; + +- if ((op_ret < 0) || (!dict && !xdata)) { +- goto unlock; +- } +- +- if (xdata) { +- if (!local->xdata_rsp) { +- local->xdata_rsp = dict_new(); +- if (!local->xdata_rsp) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto unlock; +- } +- } ++ if ((dict != NULL) && (local->dict == NULL)) { ++ local_dict = dict_new(); ++ if (local_dict == NULL) { ++ goto done; + } ++ } + +- if (!dict) { +- goto unlock; ++ if ((xdata != NULL) && (local->xdata_rsp == NULL)) { ++ local_xdata = dict_new(); ++ if (local_xdata == NULL) { ++ goto done; + } ++ } + +- op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY, +- (void **)&lockinfo_buf, &len); ++ if ((local_dict != NULL) || (local_xdata != NULL)) { ++ /* TODO: Maybe it would be better to preallocate both dicts before ++ * sending the requests. This way we don't need to use a LOCK() ++ * here. 
*/ ++ LOCK(&frame->lock); + +- if (!lockinfo_buf) { +- goto unlock; ++ if ((local_dict != NULL) && (local->dict == NULL)) { ++ local->dict = local_dict; ++ local_dict = NULL; + } + +- if (!local->dict) { +- local->dict = dict_new(); +- if (!local->dict) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto unlock; +- } ++ if ((local_xdata != NULL) && (local->xdata_rsp == NULL)) { ++ local->xdata_rsp = local_xdata; ++ local_xdata = NULL; + } +- } +-unlock: +- UNLOCK(&frame->lock); + +- if (lockinfo_buf != NULL) { +- lockinfo = dict_new(); +- if (lockinfo == NULL) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- } else { +- op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); +- +- if (lockinfo && local->dict) { +- dict_copy(lockinfo, local->dict); +- } +- } +- } +- +- if (xdata && local->xdata_rsp) { +- dict_copy(xdata, local->xdata_rsp); ++ UNLOCK(&frame->lock); + } + +- if (!call_cnt) { +- newdict = dict_new(); +- if (!newdict) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto unwind; ++ if (dict != NULL) { ++ if (dict_copy(dict, local->dict) < 0) { ++ goto done; + } ++ } + +- op_ret = dict_allocate_and_serialize( +- local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); +- if (op_ret != 0) { +- local->op_ret = -1; +- goto unwind; ++ if (xdata != NULL) { ++ if (dict_copy(xdata, local->xdata_rsp) < 0) { ++ goto done; + } ++ } + +- op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, +- (void *)lockinfo_buf, len); +- if (op_ret < 0) { +- local->op_ret = -1; +- local->op_errno = -op_ret; +- goto unwind; +- } ++ ret = 0; + +- unwind: +- AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, newdict, +- local->xdata_rsp); ++done: ++ if (local_dict != NULL) { ++ dict_unref(local_dict); + } + +- dict_unref(lockinfo); ++ if (local_xdata != NULL) { ++ dict_unref(local_xdata); ++ } + +- return 0; ++ return ret; + } + +-int32_t +-afr_fgetxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *dict, +- dict_t *xdata) ++static void ++afr_getxattr_lockinfo_cbk_common(call_frame_t *frame, int32_t op_ret, ++ int32_t op_errno, dict_t *dict, dict_t *xdata, ++ bool is_fgetxattr) + { +- int call_cnt = 0, len = 0; ++ int len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; + +- LOCK(&frame->lock); +- { +- local = frame->local; +- +- call_cnt = --local->call_count; +- +- if ((op_ret < 0) || (!dict && !xdata)) { +- goto unlock; +- } +- +- if (xdata) { +- if (!local->xdata_rsp) { +- local->xdata_rsp = dict_new(); +- if (!local->xdata_rsp) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto unlock; +- } +- } +- } +- +- if (!dict) { +- goto unlock; +- } ++ local = frame->local; + ++ if ((op_ret >= 0) && (dict != NULL)) { + op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); +- +- if (!lockinfo_buf) { +- goto unlock; +- } +- +- if (!local->dict) { +- local->dict = dict_new(); +- if (!local->dict) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto unlock; ++ if (lockinfo_buf != NULL) { ++ lockinfo = dict_new(); ++ if (lockinfo == NULL) { ++ op_ret = -1; ++ } else { ++ op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); + } + } + } +-unlock: +- UNLOCK(&frame->lock); + +- if (lockinfo_buf != NULL) { +- lockinfo = dict_new(); +- if (lockinfo == NULL) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- } else { +- op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); +- +- if (lockinfo && 
local->dict) { +- dict_copy(lockinfo, local->dict); +- } ++ if ((op_ret >= 0) && ((lockinfo != NULL) || (xdata != NULL))) { ++ op_ret = afr_update_local_dicts(frame, lockinfo, xdata); ++ if (lockinfo != NULL) { ++ dict_unref(lockinfo); + } + } + +- if (xdata && local->xdata_rsp) { +- dict_copy(xdata, local->xdata_rsp); ++ if (op_ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; + } + +- if (!call_cnt) { ++ if (uatomic_sub_return(&local->call_count, 1) == 0) { + newdict = dict_new(); + if (!newdict) { + local->op_ret = -1; +- local->op_errno = ENOMEM; ++ local->op_errno = op_errno = ENOMEM; + goto unwind; + } + +@@ -1057,23 +992,58 @@ unlock: + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { + local->op_ret = -1; ++ local->op_errno = op_errno = ENOMEM; + goto unwind; + } + + op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { +- local->op_ret = -1; +- local->op_errno = -op_ret; ++ GF_FREE(lockinfo_buf); ++ local->op_ret = op_ret = -1; ++ local->op_errno = op_errno = -op_ret; + goto unwind; + } + + unwind: +- AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, newdict, +- local->xdata_rsp); ++ /* TODO: These unwinds use op_ret and op_errno instead of local->op_ret ++ * and local->op_errno. This doesn't seem right because any ++ * failure during processing of each answer could be silently ++ * ignored. This is kept this was the old behavior and because ++ * local->op_ret is initialized as -1 and local->op_errno is ++ * initialized as EUCLEAN, which makes these values useless. */ ++ if (is_fgetxattr) { ++ AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, newdict, ++ local->xdata_rsp); ++ } else { ++ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, newdict, ++ local->xdata_rsp); ++ } ++ ++ if (newdict != NULL) { ++ dict_unref(newdict); ++ } + } ++} ++ ++static int32_t ++afr_getxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *dict, ++ dict_t *xdata) ++{ ++ afr_getxattr_lockinfo_cbk_common(frame, op_ret, op_errno, dict, xdata, ++ false); + +- dict_unref(lockinfo); ++ return 0; ++} ++ ++static int32_t ++afr_fgetxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *dict, ++ dict_t *xdata) ++{ ++ afr_getxattr_lockinfo_cbk_common(frame, op_ret, op_errno, dict, xdata, ++ true); + + return 0; + } +-- +1.8.3.1 + diff --git a/SOURCES/0538-afr-fix-coverity-issue-introduced-by-90cefde.patch b/SOURCES/0538-afr-fix-coverity-issue-introduced-by-90cefde.patch new file mode 100644 index 0000000..de164a3 --- /dev/null +++ b/SOURCES/0538-afr-fix-coverity-issue-introduced-by-90cefde.patch @@ -0,0 +1,46 @@ +From 31cd7627ff329a39691239322df3bc88e962ad02 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Mon, 1 Mar 2021 05:19:39 +0100 +Subject: [PATCH 538/538] afr: fix coverity issue introduced by 90cefde + +Fixes coverity issues 1447029 and 1447028. 
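The pitfall behind these two coverity reports can be shown with a small hypothetical sketch in C (table_copy is invented; it stands in for any API that reports failure with a NULL pointer rather than a negative integer):

    #include <stddef.h>

    struct table;

    /* Hypothetical deep copy: returns dst on success, NULL on failure. */
    struct table *
    table_copy(const struct table *src, struct table *dst);

    int
    table_merge(const struct table *src, struct table *dst)
    {
        /* Broken check: a valid pointer is never negative, so
         *     if (table_copy(src, dst) < 0) { ... }
         * can never detect the failure. The correct check compares
         * against NULL, which is what this patch does: */
        if (table_copy(src, dst) == NULL)
            return -1;

        return 0;
    }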
+ +Backport of: +> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2201 +> Updates: #2161 +> Change-Id: I6a564231d6aeb76de20675b7ced5d45eed8c377f +> Signed-off-by: Xavi Hernandez + +BUG: 1911292 +Change-Id: I6a564231d6aeb76de20675b7ced5d45eed8c377f +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/229200 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-inode-read.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c +index 98e195a..d874172 100644 +--- a/xlators/cluster/afr/src/afr-inode-read.c ++++ b/xlators/cluster/afr/src/afr-inode-read.c +@@ -918,13 +918,13 @@ afr_update_local_dicts(call_frame_t *frame, dict_t *dict, dict_t *xdata) + } + + if (dict != NULL) { +- if (dict_copy(dict, local->dict) < 0) { ++ if (dict_copy(dict, local->dict) == NULL) { + goto done; + } + } + + if (xdata != NULL) { +- if (dict_copy(xdata, local->xdata_rsp) < 0) { ++ if (dict_copy(xdata, local->xdata_rsp) == NULL) { + goto done; + } + } +-- +1.8.3.1 + diff --git a/SOURCES/0539-extras-disable-lookup-optimize-in-virt-and-block-gro.patch b/SOURCES/0539-extras-disable-lookup-optimize-in-virt-and-block-gro.patch new file mode 100644 index 0000000..18f851f --- /dev/null +++ b/SOURCES/0539-extras-disable-lookup-optimize-in-virt-and-block-gro.patch @@ -0,0 +1,62 @@ +From 88523814fe296c9cc9f7619e06210830f59c5edf Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Fri, 12 Mar 2021 10:32:09 +0100 +Subject: [PATCH 539/539] extras: disable lookup-optimize in virt and block + groups + +lookup-optimize doesn't provide any benefit for virtualized +environments and gluster-block workloads, but it's known to cause +corruption in some cases when sharding is also enabled and the volume +is expanded or shrunk. + +For this reason, we disable lookup-optimize by default on those +environments. 
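As a usage note (an administrative assumption, not part of the patch): these group profiles only take effect for volumes configured with the corresponding group, so for an already existing virt or gluster-block volume the same setting would presumably be applied by hand with "gluster volume set <VOLNAME> cluster.lookup-optimize off", where <VOLNAME> is a placeholder.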
+ +Backport of: +> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2254 +> Fixes: #2253 +> Change-Id: I25861aa50b335556a995a9c33318dd3afb41bf71 +> Signed-off-by: Xavi Hernandez + +BUG: 1939372 +Change-Id: I25861aa50b335556a995a9c33318dd3afb41bf71 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/231173 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/group-distributed-virt | 1 + + extras/group-gluster-block | 1 + + extras/group-virt.example | 1 + + 3 files changed, 3 insertions(+) + +diff --git a/extras/group-distributed-virt b/extras/group-distributed-virt +index a960b76..6da3de0 100644 +--- a/extras/group-distributed-virt ++++ b/extras/group-distributed-virt +@@ -8,3 +8,4 @@ user.cifs=off + client.event-threads=4 + server.event-threads=4 + performance.client-io-threads=on ++cluster.lookup-optimize=off +diff --git a/extras/group-gluster-block b/extras/group-gluster-block +index 1e39801..b8d3e8d 100644 +--- a/extras/group-gluster-block ++++ b/extras/group-gluster-block +@@ -25,3 +25,4 @@ features.shard-block-size=64MB + user.cifs=off + server.allow-insecure=on + cluster.choose-local=off ++cluster.lookup-optimize=off +diff --git a/extras/group-virt.example b/extras/group-virt.example +index 3a441eb..155f5f5 100644 +--- a/extras/group-virt.example ++++ b/extras/group-virt.example +@@ -21,3 +21,4 @@ server.tcp-user-timeout=20 + server.keepalive-time=10 + server.keepalive-interval=2 + server.keepalive-count=5 ++cluster.lookup-optimize=off +-- +1.8.3.1 + diff --git a/SOURCES/0540-extras-Disable-write-behind-for-group-samba.patch b/SOURCES/0540-extras-Disable-write-behind-for-group-samba.patch new file mode 100644 index 0000000..0a89c64 --- /dev/null +++ b/SOURCES/0540-extras-Disable-write-behind-for-group-samba.patch @@ -0,0 +1,37 @@ +From 6895b6c67e9c29af3f966b4d9ee5cb40da763d24 Mon Sep 17 00:00:00 2001 +From: Mohit Agrawal +Date: Wed, 14 Apr 2021 12:38:45 +0530 +Subject: [PATCH 540/540] extras: Disable write-behind for group samba. + +when write-behind is enabled with Samba it could be a +source of data corruption. The translator, while +processing a write call, immediately returns success but continues +writing the data to the server in the background. This can cause data +corruption when two clients relying on Samba to provide data consistency +are operating on the same file. 
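Likewise, for a Samba-exported volume that was not created with this group profile, the equivalent manual step would presumably be "gluster volume set <VOLNAME> performance.write-behind off" (<VOLNAME> is a placeholder), trading some write performance for the consistency guarantee described above.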
+
+> fixes: https://github.com/gluster/glusterfs/issues/2329
+
+Change-Id: I5265056ff315a5f3cd97ea11b18db0831b1b901d
+Solution: Disable write-behind for samba group
+BUG: 1948547
+Signed-off-by: Mohit Agrawal
+Reviewed-on: https://code.engineering.redhat.com/gerrit/235876
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ extras/group-samba | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/extras/group-samba b/extras/group-samba
+index eeee6e0..9611a1f 100644
+--- a/extras/group-samba
++++ b/extras/group-samba
+@@ -9,3 +9,4 @@ performance.nl-cache=on
+ performance.nl-cache-timeout=600
+ performance.readdir-ahead=on
+ performance.parallel-readdir=on
++performance.write-behind=off
+--
+1.8.3.1
+
diff --git a/SOURCES/0541-glusterd-volgen-Add-functionality-to-accept-any-cust.patch b/SOURCES/0541-glusterd-volgen-Add-functionality-to-accept-any-cust.patch
new file mode 100644
index 0000000..29135df
--- /dev/null
+++ b/SOURCES/0541-glusterd-volgen-Add-functionality-to-accept-any-cust.patch
@@ -0,0 +1,545 @@
+From 23ab7175e64ab4d75fbcb6874008843cc78b65b8 Mon Sep 17 00:00:00 2001
+From: Ashish Pandey
+Date: Fri, 16 Apr 2021 18:48:56 +0530
+Subject: [PATCH 541/542] glusterd-volgen: Add functionality to accept any
+ custom xlator
+
+Add a new function that allows users to insert any custom xlator.
+This provides a way to add custom processing into file operations.
+
+Users can deploy the plugin (xlator shared object) and integrate it into glusterfsd.
+
+If users want to enable a custom xlator, do the following:
+
+1. put the xlator object (.so file) into "XLATOR_DIR/user/"
+2. set the option user.xlator.<name> to an existing xlator name to specify the position in the graph
+3. restart the gluster volume
+
+Options for a custom xlator can be set via "user.xlator.<name>.<option>".
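The key grammar used above can be summarized with a condensed C sketch of the classification the feature performs (simplified from the patch, error handling omitted): fnmatch() distinguishes position keys, user.xlator.<name>, from option keys, user.xlator.<name>.<option>.

    #include <fnmatch.h>

    /* Returns 1 for a position key (user.xlator.<name>), 0 otherwise,
     * including option keys (user.xlator.<name>.<option>). */
    static int
    is_user_xlator_position_key(const char *key)
    {
        if (fnmatch("user.xlator.*", key, 0) != 0)
            return 0; /* not a user-xlator key at all */

        if (fnmatch("user.xlator.*.*", key, 0) == 0)
            return 0; /* an option key, not a position key */

        return 1;
    }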
+ +Backport of : +>https://github.com/gluster/glusterfs/commit/ea86b664f3b1f54901ce1b7d7fba7d80456f2089 +>Fixes: https://github.com/gluster/glusterfs/issues/1943 +>Change-Id: Ife3ae1514ea474f5dae2897223012f9d04b64674 +>Signed-off-by:Ryo Furuhashi +>Co-authored-by: Yaniv Kaul +>Co-authored-by: Xavi Hernandez + +Change-Id: Ic8f28bfcfde67213eb1092b0ebf4822c874d37bb +BUG: 1927235 +Signed-off-by: Ashish Pandey +Reviewed-on: https://code.engineering.redhat.com/gerrit/236830 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +Reviewed-by: Xavi Hernandez Juan +--- + cli/src/cli-rpc-ops.c | 148 ++++++++++++++++++++------ + cli/src/cli.h | 2 - + tests/basic/user-xlator.t | 65 ++++++++++++ + tests/env.rc.in | 3 + + xlators/mgmt/glusterd/src/glusterd-volgen.c | 155 ++++++++++++++++++++++++++++ + 5 files changed, 342 insertions(+), 31 deletions(-) + create mode 100755 tests/basic/user-xlator.t + +diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c +index 4e91265..51b5447 100644 +--- a/cli/src/cli-rpc-ops.c ++++ b/cli/src/cli-rpc-ops.c +@@ -2269,49 +2269,131 @@ out: + return ret; + } + +-char * +-is_server_debug_xlator(void *myframe) ++/* ++ * returns ++ * 1 : is server debug xlator ++ * 0 : is not server debug xlator ++ * <0 : error ++ */ ++static int ++is_server_debug_xlator(char *key, char *value) ++{ ++ if (!key || !value) ++ return -1; ++ ++ if (strcmp("debug.trace", key) == 0 || ++ strcmp("debug.error-gen", key) == 0) { ++ if (strcmp("client", value) == 0) ++ return 0; ++ else ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/* ++ * returns ++ * 1 : is user xlator ++ * 0 : is not user xlator ++ * <0 : error ++ */ ++static int ++is_server_user_xlator(char *key, char *value) ++{ ++ int ret = 0; ++ ++ if (!key || !value) ++ return -1; ++ ++ ret = fnmatch("user.xlator.*", key, 0); ++ if (ret < 0) { ++ ret = -1; ++ goto out; ++ } else if (ret == FNM_NOMATCH) { ++ ret = 0; ++ goto out; ++ } ++ ++ ret = fnmatch("user.xlator.*.*", key, 0); ++ if (ret < 0) { ++ ret = -1; ++ goto out; ++ } else if (ret != FNM_NOMATCH) { // this is user xlator's option key ++ ret = 0; ++ goto out; ++ } ++ ++ ret = 1; ++ ++out: ++ return ret; ++} ++ ++static int ++added_server_xlator(void *myframe, char **added_xlator) + { + call_frame_t *frame = NULL; + cli_local_t *local = NULL; + char **words = NULL; + char *key = NULL; + char *value = NULL; +- char *debug_xlator = NULL; ++ int ret = 0; + + frame = myframe; + local = frame->local; + words = (char **)local->words; + + while (*words != NULL) { +- if (strstr(*words, "trace") == NULL && +- strstr(*words, "error-gen") == NULL) { +- words++; +- continue; +- } +- + key = *words; + words++; + value = *words; +- if (value == NULL) ++ ++ if (!value) { + break; +- if (strstr(value, "client")) { +- words++; +- continue; +- } else { +- if (!(strstr(value, "posix") || strstr(value, "acl") || +- strstr(value, "locks") || strstr(value, "io-threads") || +- strstr(value, "marker") || strstr(value, "index"))) { +- words++; +- continue; +- } else { +- debug_xlator = gf_strdup(key); +- break; ++ } ++ ++ ret = is_server_debug_xlator(key, value); ++ if (ret < 0) { ++ gf_log(((call_frame_t *)myframe)->this->name, GF_LOG_ERROR, ++ "failed to check that debug xlator was added"); ++ ret = -1; ++ goto out; ++ } ++ ++ if (ret) { ++ *added_xlator = gf_strdup(key); ++ if (!*added_xlator) { ++ gf_log(((call_frame_t *)myframe)->this->name, GF_LOG_ERROR, ++ "Out of memory"); ++ ret = -1; ++ goto out; ++ } ++ break; ++ } ++ ++ ret = is_server_user_xlator(key, value); ++ if (ret < 0) { ++ 
gf_log(((call_frame_t *)myframe)->this->name, GF_LOG_ERROR, ++ "failed to check that user xlator was added"); ++ ret = -1; ++ goto out; ++ } ++ ++ if (ret) { ++ *added_xlator = gf_strdup(key); ++ if (!*added_xlator) { ++ gf_log(((call_frame_t *)myframe)->this->name, GF_LOG_ERROR, ++ "Out of memory"); ++ ret = -1; ++ goto out; + } ++ break; + } + } + +- return debug_xlator; ++out: ++ return ret; + } + + int +@@ -2327,7 +2409,7 @@ gf_cli_set_volume_cbk(struct rpc_req *req, struct iovec *iov, int count, + char msg[1024] = { + 0, + }; +- char *debug_xlator = NULL; ++ char *added_xlator = NULL; + char tmp_str[512] = { + 0, + }; +@@ -2365,18 +2447,26 @@ gf_cli_set_volume_cbk(struct rpc_req *req, struct iovec *iov, int count, + * The process has to be restarted. So this is a check from the + * volume set option such that if debug xlators such as trace/errorgen + * are provided in the set command, warn the user. ++ * volume set option such that if user custom xlators or debug ++ * xlators such as trace/errorgen are provided in the set command, ++ * warn the user. + */ +- debug_xlator = is_server_debug_xlator(myframe); ++ ret = added_server_xlator(myframe, &added_xlator); ++ if (ret < 0) { ++ gf_log("cli", GF_LOG_ERROR, ++ "failed to check that server graph has been changed"); ++ goto out; ++ } + + if (dict_get_str(dict, "help-str", &help_str) && !msg[0]) + snprintf(msg, sizeof(msg), "Set volume %s", + (rsp.op_ret) ? "unsuccessful" : "successful"); +- if (rsp.op_ret == 0 && debug_xlator) { ++ if (rsp.op_ret == 0 && added_xlator) { + snprintf(tmp_str, sizeof(tmp_str), + "\n%s translator has been " + "added to the server volume file. Please restart the" + " volume for enabling the translator", +- debug_xlator); ++ added_xlator); + } + + if ((global_state->mode & GLUSTER_MODE_XML) && (help_str == NULL)) { +@@ -2394,7 +2484,7 @@ gf_cli_set_volume_cbk(struct rpc_req *req, struct iovec *iov, int count, + cli_err("volume set: failed"); + } else { + if (help_str == NULL) { +- if (debug_xlator == NULL) ++ if (added_xlator == NULL) + cli_out("volume set: success"); + else + cli_out("volume set: success%s", tmp_str); +@@ -2408,7 +2498,7 @@ gf_cli_set_volume_cbk(struct rpc_req *req, struct iovec *iov, int count, + out: + if (dict) + dict_unref(dict); +- GF_FREE(debug_xlator); ++ GF_FREE(added_xlator); + cli_cmd_broadcast_response(ret); + gf_free_xdr_cli_rsp(rsp); + return ret; +diff --git a/cli/src/cli.h b/cli/src/cli.h +index 7b4f446..b5b69ea 100644 +--- a/cli/src/cli.h ++++ b/cli/src/cli.h +@@ -502,8 +502,6 @@ cli_xml_output_snapshot(int cmd_type, dict_t *dict, int op_ret, int op_errno, + int + cli_xml_snapshot_status_single_snap(cli_local_t *local, dict_t *dict, + char *key); +-char * +-is_server_debug_xlator(void *myframe); + + int32_t + cli_cmd_snapshot_parse(const char **words, int wordcount, dict_t **options, +diff --git a/tests/basic/user-xlator.t b/tests/basic/user-xlator.t +new file mode 100755 +index 0000000..a711f9f +--- /dev/null ++++ b/tests/basic/user-xlator.t +@@ -0,0 +1,65 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../include.rc ++. 
$(dirname $0)/../volume.rc ++ ++#### patchy.dev.d-backends-patchy1.vol ++brick=${B0//\//-} ++SERVER_VOLFILE="/var/lib/glusterd/vols/${V0}/${V0}.${H0}.${brick:1}-${V0}1.vol" ++ ++cleanup; ++ ++TEST mkdir -p $B0/single-brick ++TEST mkdir -p ${GLUSTER_XLATOR_DIR}/user ++ ++## deploy dummy user xlator ++TEST cp ${GLUSTER_XLATOR_DIR}/playground/template.so ${GLUSTER_XLATOR_DIR}/user/hoge.so ++ ++TEST glusterd ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1,2,3,4,5,6}; ++TEST $CLI volume set $V0 user.xlator.hoge posix ++TEST grep -q 'user/hoge' ${SERVER_VOLFILE} ++ ++TEST $CLI volume set $V0 user.xlator.hoge.opt1 10 ++TEST grep -q '"option opt1 10"' ${SERVER_VOLFILE} ++TEST $CLI volume set $V0 user.xlator.hoge.opt2 hogehoge ++TEST grep -q '"option opt2 hogehoge"' ${SERVER_VOLFILE} ++TEST $CLI volume set $V0 user.xlator.hoge.opt3 true ++TEST grep -q '"option opt3 true"' ${SERVER_VOLFILE} ++ ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}3 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}4 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}5 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}6 ++ ++TEST $CLI volume set $V0 user.xlator.hoge trash ++TEST grep -q 'user/hoge' ${SERVER_VOLFILE} ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}3 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}4 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}5 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}6 ++ ++TEST ! $CLI volume set $V0 user.xlator.hoge unknown ++TEST grep -q 'user/hoge' ${SERVER_VOLFILE} # When the CLI fails, the volfile is not modified. 
++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}3 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}4 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}5 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}6 ++ ++#### teardown ++ ++TEST rm -f ${GLUSTER_XLATOR_DIR}/user/hoge.so ++cleanup; +diff --git a/tests/env.rc.in b/tests/env.rc.in +index c7472a7..1f0ca88 100644 +--- a/tests/env.rc.in ++++ b/tests/env.rc.in +@@ -40,3 +40,6 @@ export GLUSTER_LIBEXECDIR + + RUN_NFS_TESTS=@BUILD_GNFS@ + export RUN_NFS_TESTS ++ ++GLUSTER_XLATOR_DIR=@libdir@/glusterfs/@PACKAGE_VERSION@/xlator ++export GLUSTER_XLATOR_DIR +\ No newline at end of file +diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c +index 1920284..a242b5c 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c +@@ -45,6 +45,11 @@ struct gd_validate_reconf_opts { + + extern struct volopt_map_entry glusterd_volopt_map[]; + ++struct check_and_add_user_xlator_t { ++ volgen_graph_t *graph; ++ char *volname; ++}; ++ + #define RPC_SET_OPT(XL, CLI_OPT, XLATOR_OPT, ERROR_CMD) \ + do { \ + char *_value = NULL; \ +@@ -2822,6 +2827,145 @@ out: + return ret; + } + ++static gf_boolean_t ++check_user_xlator_position(dict_t *dict, char *key, data_t *value, ++ void *prev_xlname) ++{ ++ if (strncmp(key, "user.xlator.", SLEN("user.xlator.")) != 0) { ++ return false; ++ } ++ ++ if (fnmatch("user.xlator.*.*", key, 0) == 0) { ++ return false; ++ } ++ ++ char *value_str = data_to_str(value); ++ if (!value_str) { ++ return false; ++ } ++ ++ if (strcmp(value_str, prev_xlname) == 0) { ++ gf_log("glusterd", GF_LOG_INFO, ++ "found insert position of user-xlator(%s)", key); ++ return true; ++ } ++ ++ return false; ++} ++ ++static int ++set_user_xlator_option(dict_t *set_dict, char *key, data_t *value, void *data) ++{ ++ xlator_t *xl = data; ++ char *optname = strrchr(key, '.') + 1; ++ ++ gf_log("glusterd", GF_LOG_DEBUG, "set user xlator option %s = %s", key, ++ value->data); ++ ++ return xlator_set_option(xl, optname, strlen(optname), data_to_str(value)); ++} ++ ++static int ++insert_user_xlator_to_graph(dict_t *set_dict, char *key, data_t *value, ++ void *action_data) ++{ ++ int ret = -1; ++ ++ struct check_and_add_user_xlator_t *data = action_data; ++ ++ char *xlator_name = strrchr(key, '.') + 1; // user.xlator. 
++ char *xlator_option_matcher = NULL; ++ char *type = NULL; ++ xlator_t *xl = NULL; ++ ++ // convert optkey to xlator type ++ if (gf_asprintf(&type, "user/%s", xlator_name) < 0) { ++ gf_log("glusterd", GF_LOG_ERROR, "failed to generate user-xlator type"); ++ goto out; ++ } ++ ++ gf_log("glusterd", GF_LOG_INFO, "add user xlator=%s to graph", type); ++ ++ xl = volgen_graph_add(data->graph, type, data->volname); ++ if (!xl) { ++ goto out; ++ } ++ ++ ret = gf_asprintf(&xlator_option_matcher, "user.xlator.%s.*", xlator_name); ++ if (ret < 0) { ++ gf_log("glusterd", GF_LOG_ERROR, ++ "failed to generate user-xlator option matcher"); ++ goto out; ++ } ++ ++ dict_foreach_fnmatch(set_dict, xlator_option_matcher, ++ set_user_xlator_option, xl); ++ ++out: ++ if (type) ++ GF_FREE(type); ++ if (xlator_option_matcher) ++ GF_FREE(xlator_option_matcher); ++ ++ return ret; ++} ++ ++static int ++validate_user_xlator_position(dict_t *this, char *key, data_t *value, ++ void *unused) ++{ ++ int ret = -1; ++ int i = 0; ++ ++ if (!value) ++ goto out; ++ ++ if (fnmatch("user.xlator.*.*", key, 0) == 0) { ++ ret = 0; ++ goto out; ++ } ++ ++ char *value_str = data_to_str(value); ++ if (!value_str) ++ goto out; ++ ++ int num_xlators = sizeof(server_graph_table) / ++ sizeof(server_graph_table[0]); ++ for (i = 0; i < num_xlators; i++) { ++ if (server_graph_table[i].dbg_key && ++ strcmp(value_str, server_graph_table[i].dbg_key) == 0) { ++ ret = 0; ++ goto out; ++ } ++ } ++ ++out: ++ if (ret == -1) ++ gf_log("glusterd", GF_LOG_ERROR, "invalid user xlator position %s = %s", ++ key, value->data); ++ ++ return ret; ++} ++ ++static int ++check_and_add_user_xl(volgen_graph_t *graph, dict_t *set_dict, char *volname, ++ char *prev_xlname) ++{ ++ if (!prev_xlname) ++ goto out; ++ ++ struct check_and_add_user_xlator_t data = {.graph = graph, ++ .volname = volname}; ++ ++ if (dict_foreach_match(set_dict, check_user_xlator_position, prev_xlname, ++ insert_user_xlator_to_graph, &data) < 0) { ++ return -1; ++ } ++ ++out: ++ return 0; ++} ++ + static int + server_graph_builder(volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + dict_t *set_dict, void *param) +@@ -2831,6 +2975,12 @@ server_graph_builder(volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + char *loglevel = NULL; + int i = 0; + ++ if (dict_foreach_fnmatch(set_dict, "user.xlator.*", ++ validate_user_xlator_position, NULL) < 0) { ++ ret = -EINVAL; ++ goto out; ++ } ++ + i = sizeof(server_graph_table) / sizeof(server_graph_table[0]) - 1; + + while (i >= 0) { +@@ -2848,6 +2998,11 @@ server_graph_builder(volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + if (ret) + goto out; + ++ ret = check_and_add_user_xl(graph, set_dict, volinfo->volname, ++ server_graph_table[i].dbg_key); ++ if (ret) ++ goto out; ++ + i--; + } + +-- +1.8.3.1 + diff --git a/SOURCES/0542-xlaotrs-mgmt-Fixing-coverity-issue-1445996.patch b/SOURCES/0542-xlaotrs-mgmt-Fixing-coverity-issue-1445996.patch new file mode 100644 index 0000000..f6e0641 --- /dev/null +++ b/SOURCES/0542-xlaotrs-mgmt-Fixing-coverity-issue-1445996.patch @@ -0,0 +1,64 @@ +From f3db0c99faf813e0f2e9ffcf599416555a59df1f Mon Sep 17 00:00:00 2001 +From: Ashish Pandey +Date: Tue, 9 Feb 2021 16:43:35 +0530 +Subject: [PATCH 542/542] xlaotrs/mgmt: Fixing coverity issue 1445996 + +Backport of https://github.com/gluster/glusterfs/pull/2148/commits/9785e96e0bdf6e60896570fdf5e4a6976a6f60ba + +Fixing "Null pointer dereferences" + +BUG: 1927235 +Change-Id: Idbc014e1302d2450f97bccd028681198c0d97424 +Signed-off-by: Ashish Pandey +Signed-off-by: 
Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/237433 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-volgen.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c +index a242b5c..71aed08 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c +@@ -2916,21 +2916,23 @@ validate_user_xlator_position(dict_t *this, char *key, data_t *value, + { + int ret = -1; + int i = 0; ++ char *value_str = NULL; + + if (!value) + goto out; + ++ value_str = data_to_str(value); ++ if (!value_str) ++ goto out; ++ + if (fnmatch("user.xlator.*.*", key, 0) == 0) { + ret = 0; + goto out; + } + +- char *value_str = data_to_str(value); +- if (!value_str) +- goto out; +- + int num_xlators = sizeof(server_graph_table) / + sizeof(server_graph_table[0]); ++ + for (i = 0; i < num_xlators; i++) { + if (server_graph_table[i].dbg_key && + strcmp(value_str, server_graph_table[i].dbg_key) == 0) { +@@ -2942,7 +2944,7 @@ validate_user_xlator_position(dict_t *this, char *key, data_t *value, + out: + if (ret == -1) + gf_log("glusterd", GF_LOG_ERROR, "invalid user xlator position %s = %s", +- key, value->data); ++ key, value_str); + + return ret; + } +-- +1.8.3.1 + diff --git a/SOURCES/0543-glusterd-handle-custom-xlator-failure-cases.patch b/SOURCES/0543-glusterd-handle-custom-xlator-failure-cases.patch new file mode 100644 index 0000000..c6194c7 --- /dev/null +++ b/SOURCES/0543-glusterd-handle-custom-xlator-failure-cases.patch @@ -0,0 +1,162 @@ +From 71fc5b7949e00c4448f5ec1291e756b201a70082 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Thu, 29 Apr 2021 18:34:57 +0530 +Subject: [PATCH 543/543] glusterd: handle custom xlator failure cases + +Problem-1: +custom xlator insertion was failing for those xlators in the brick graph +whose dbg_key was NULL in the server_graph_table. Looking at the git log, +the dbg_key was added in commit d1397dbd7d6cdbd2d81d5d36d608b6175d449db4 +for inserting debug xlators. + +Fix: I think it is fine to define it for all brick xlators below server. + +Problem-2: +In the commit-op phase, glusterd_op_set_volume() updates the volinfo +dict with the key-value pairs and then proceeds to create the volfiles. +If any of the steps fail, the volinfo dict retains those key-values, +until glusterd is restarted or `gluster vol reset $VOLNAME` is issued. + +Fix: +Make a copy of the volinfo dict and if there are any failures in +proceeding with the set volume logic, restore the dict to its original +state. 
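The rollback in the fix for Problem-2 follows a snapshot-and-restore pattern; here is a minimal standalone sketch in C (dup_table, restore_table, free_table and apply_options are invented stand-ins for dict_new/dict_copy/dict_reset and the option-setting logic): the mutable state is copied up front and restored on any failure, so a rejected operation leaves no stale key-value pairs behind.

    #include <stdbool.h>
    #include <stddef.h>

    struct table;

    struct table *dup_table(const struct table *t); /* deep copy, NULL on OOM */
    void restore_table(struct table *t, const struct table *snap);
    void free_table(struct table *t);
    bool apply_options(struct table *t); /* may fail part-way through */

    int
    set_options(struct table *live)
    {
        int ret = -1;
        struct table *snap = dup_table(live); /* snapshot before mutating */

        if (snap == NULL)
            return -1;

        if (apply_options(live)) {
            ret = 0; /* success: keep the changes */
        } else {
            restore_table(live, snap); /* failure: roll back */
        }

        free_table(snap);
        return ret;
    }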
+ +Backport of: +> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2371 +> Change-Id: I9010dab33d0139b8e6d603308e331b6d220a4849 +> Updates: #2370 +> Signed-off-by: Ravishankar N + +Change-Id: I9010dab33d0139b8e6d603308e331b6d220a4849 +BUG: 1953901 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/239889 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/basic/user-xlator.t | 16 ++++++++++++++-- + xlators/mgmt/glusterd/src/glusterd-op-sm.c | 16 ++++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-volgen.c | 14 +++++++------- + 3 files changed, 37 insertions(+), 9 deletions(-) + +diff --git a/tests/basic/user-xlator.t b/tests/basic/user-xlator.t +index a711f9f..ed2d831 100755 +--- a/tests/basic/user-xlator.t ++++ b/tests/basic/user-xlator.t +@@ -35,8 +35,18 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}4 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}5 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}6 + +-TEST $CLI volume set $V0 user.xlator.hoge trash +-TEST grep -q 'user/hoge' ${SERVER_VOLFILE} ++# Test that the insertion at all positions between server and posix is successful. ++# It is not guaranteed that the brick process will start/work in all positions though. ++TESTS_EXPECTED_IN_LOOP=34 ++declare -a brick_side_xlators=("decompounder" "io-stats" "quota" "index" "barrier" ++ "marker" "selinux" "io-threads" "upcall" "leases" ++ "read-only" "worm" "locks" "access-control" ++ "bitrot-stub" "changelog" "trash") ++for xlator in "${brick_side_xlators[@]}" ++ do ++ TEST_IN_LOOP $CLI volume set $V0 user.xlator.hoge $xlator ++ TEST_IN_LOOP grep -q 'user/hoge' ${SERVER_VOLFILE} ++ done + + TEST $CLI volume stop $V0 + TEST $CLI volume start $V0 +@@ -49,6 +59,8 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}6 + + TEST ! $CLI volume set $V0 user.xlator.hoge unknown + TEST grep -q 'user/hoge' ${SERVER_VOLFILE} # When the CLI fails, the volfile is not modified. ++# User xlator insert failures must not prevent setting other volume options. 
++TEST $CLI volume set $V0 storage.reserve 10% + + TEST $CLI volume stop $V0 + TEST $CLI volume start $V0 +diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +index 1e84f5f..893af29 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c ++++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +@@ -2911,6 +2911,7 @@ glusterd_op_set_volume(dict_t *dict, char **errstr) + uint32_t new_op_version = 0; + gf_boolean_t quorum_action = _gf_false; + glusterd_svc_t *svc = NULL; ++ dict_t *volinfo_dict_orig = NULL; + + this = THIS; + GF_ASSERT(this); +@@ -2918,6 +2919,10 @@ glusterd_op_set_volume(dict_t *dict, char **errstr) + priv = this->private; + GF_ASSERT(priv); + ++ volinfo_dict_orig = dict_new(); ++ if (!volinfo_dict_orig) ++ goto out; ++ + ret = dict_get_int32n(dict, "count", SLEN("count"), &dict_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, +@@ -2949,6 +2954,11 @@ glusterd_op_set_volume(dict_t *dict, char **errstr) + goto out; + } + ++ if (dict_copy(volinfo->dict, volinfo_dict_orig) == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ + /* TODO: Remove this once v3.3 compatibility is not required */ + check_op_version = dict_get_str_boolean(dict, "check-op-version", + _gf_false); +@@ -3171,6 +3181,12 @@ out: + gf_msg_debug(this->name, 0, "returning %d", ret); + if (quorum_action) + glusterd_do_quorum_action(); ++ if (ret < 0 && count > 1) { ++ if (dict_reset(volinfo->dict) == 0) ++ dict_copy(volinfo_dict_orig, volinfo->dict); ++ } ++ if (volinfo_dict_orig) ++ dict_unref(volinfo_dict_orig); + return ret; + } + +diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c +index 71aed08..aa85bdb 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c +@@ -2706,24 +2706,24 @@ out: + static volgen_brick_xlator_t server_graph_table[] = { + {brick_graph_add_server, NULL}, + {brick_graph_add_decompounder, "decompounder"}, +- {brick_graph_add_io_stats, "NULL"}, ++ {brick_graph_add_io_stats, "io-stats"}, + {brick_graph_add_sdfs, "sdfs"}, + {brick_graph_add_namespace, "namespace"}, +- {brick_graph_add_cdc, NULL}, ++ {brick_graph_add_cdc, "cdc" }, + {brick_graph_add_quota, "quota"}, + {brick_graph_add_index, "index"}, +- {brick_graph_add_barrier, NULL}, ++ {brick_graph_add_barrier, "barrier" }, + {brick_graph_add_marker, "marker"}, + {brick_graph_add_selinux, "selinux"}, + {brick_graph_add_fdl, "fdl"}, + {brick_graph_add_iot, "io-threads"}, + {brick_graph_add_upcall, "upcall"}, + {brick_graph_add_leases, "leases"}, +- {brick_graph_add_pump, NULL}, +- {brick_graph_add_ro, NULL}, +- {brick_graph_add_worm, NULL}, ++ {brick_graph_add_pump, "pump" }, ++ {brick_graph_add_ro, "read-only" }, ++ {brick_graph_add_worm, "worm" }, + {brick_graph_add_locks, "locks"}, +- {brick_graph_add_acl, "acl"}, ++ {brick_graph_add_acl, "access-control"}, + {brick_graph_add_bitrot_stub, "bitrot-stub"}, + {brick_graph_add_changelog, "changelog"}, + #if USE_GFDB /* changetimerecorder depends on gfdb */ +-- +1.8.3.1 + diff --git a/SOURCES/0544-tests-avoid-empty-paths-in-environment-variables.patch b/SOURCES/0544-tests-avoid-empty-paths-in-environment-variables.patch new file mode 100644 index 0000000..cb5e80b --- /dev/null +++ b/SOURCES/0544-tests-avoid-empty-paths-in-environment-variables.patch @@ -0,0 +1,86 @@ +From 3eaf937e69fe4219738c93d39af1cc909b1ee3f8 Mon Sep 17 00:00:00 2001 +From: Rinku Kothiya +Date: Fri, 23 Apr 2021 09:30:35 +0000 +Subject: [PATCH 
544/584] tests: avoid empty paths in environment variables
+
+Many variables containing paths in env.rc.in are defined in a way
+that leaves a trailing ':' in the variable when the previous value
+was empty or undefined.
+
+In the particular case of the 'LD_PRELOAD_PATH' variable, this causes
+the system to look for dynamic libraries in the current working
+directory. When this directory is inside a Gluster mount point, a
+significant delay is caused each time a program is run (and the testing
+framework can run lots of programs for each test).
+
+This patch prevents variables containing paths from ending with
+a trailing ':'.
+
+Backport of :
+>Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2349
+>Fixes: #2348
+>Change-Id: I669f5a78e14f176c0a58824ba577330989d84769
+>Signed-off-by: Xavi Hernandez
+>Signed-off-by: Rinku Kothiya
+
+Change-Id: Ie903ca443aa4789553ac4687818a7f69c113af41
+Signed-off-by: Rinku Kothiya
+---
+ tests/env.rc.in | 17 +++++++----------
+ 1 file changed, 7 insertions(+), 10 deletions(-)
+
+diff --git a/tests/env.rc.in b/tests/env.rc.in
+index 1f0ca88..2d8ff0e 100644
+--- a/tests/env.rc.in
++++ b/tests/env.rc.in
+@@ -2,34 +2,31 @@ prefix=@prefix@
+ exec_prefix=@exec_prefix@
+ libdir=@libdir@
+
+-PATH=@sbindir@:$PATH
++PATH=@bindir@:@sbindir@${PATH:+:${PATH}}
+ export PATH
+
+ GLUSTERD_PIDFILEDIR=@localstatedir@/run/gluster
+ export GLUSTERD_PIDFILEDIR
+
+-LD_LIBRARY_PATH=@libdir@:$LD_LIBRARY_PATH
++LD_LIBRARY_PATH=@libdir@${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+ export LD_LIBRARY_PATH
+
+-LIBRARY_PATH=@libdir@:$LIBRARY_PATH
++LIBRARY_PATH=@libdir@${LIBRARY_PATH:+:${LIBRARY_PATH}}
+ export LIBRARY_PATH
+
+-CPATH=@includedir@:$CPATH
++CPATH=@includedir@${CPATH:+:${CPATH}}
+ export CPATH
+
+ GLUSTERD_WORKDIR=@GLUSTERD_WORKDIR@
+ export GLUSTERD_WORKDIR
+
+-PKG_CONFIG_PATH=@pkgconfigdir@:$PKG_CONFIG_PATH
++PKG_CONFIG_PATH=@pkgconfigdir@${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}}
+ export PKG_CONFIG_PATH
+
+-PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@:$PYTHON_PATH
+-export PYTHONPATH
+-
+ PYTHON=@PYTHON@
+ export PYTHON
+
+-PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@:$PYTHON_PATH
++PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@${PYTHONPATH:+:${PYTHONPATH}}
+ export PYTHONPATH
+
+ GLUSTER_CMD_DIR=@sbindir@
+@@ -42,4 +39,4 @@ RUN_NFS_TESTS=@BUILD_GNFS@
+ export RUN_NFS_TESTS
+
+ GLUSTER_XLATOR_DIR=@libdir@/glusterfs/@PACKAGE_VERSION@/xlator
+-export GLUSTER_XLATOR_DIR
+\ No newline at end of file
++export GLUSTER_XLATOR_DIR
+--
+1.8.3.1
+
diff --git a/SOURCES/0545-tests-Excluded-tests-for-unsupported-components.patch b/SOURCES/0545-tests-Excluded-tests-for-unsupported-components.patch
new file mode 100644
index 0000000..add8025
--- /dev/null
+++ b/SOURCES/0545-tests-Excluded-tests-for-unsupported-components.patch
@@ -0,0 +1,32 @@
+From 6b340470e01dc177767fae990cf19037202140b7 Mon Sep 17 00:00:00 2001
+From: Tamar Shacked
+Date: Mon, 31 May 2021 21:27:41 +0300
+Subject: [PATCH 545/584] tests: Excluded tests for unsupported components
+
+Quota and Tier are deprecated as of RHGS-3.5.5.
+Stop running regression tests for them.
+ +Label: DOWNSTREAM ONLY + +Signed-off-by: Tamar Shacked +Change-Id: I3ca1aacba9a31129f5e68fcffdd80e69e51f7bcc +--- + run-tests.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/run-tests.sh b/run-tests.sh +index c835d93..5cc18b0 100755 +--- a/run-tests.sh ++++ b/run-tests.sh +@@ -349,7 +349,7 @@ function run_tests() + fi + + for t in $(find ${regression_testsdir}/tests -name '*.t' \ +- | LC_COLLATE=C sort) ; do ++ | egrep -v "tier|quota" | LC_COLLATE=C sort) ; do + old_cores=$(ls /*-*.core 2> /dev/null | wc -l) + total_tests=$((total_tests+1)) + if match $t "$@" ; then +-- +1.8.3.1 + diff --git a/SOURCES/0546-Update-rfc.sh-to-rhgs-3.5.5.patch b/SOURCES/0546-Update-rfc.sh-to-rhgs-3.5.5.patch new file mode 100644 index 0000000..935f533 --- /dev/null +++ b/SOURCES/0546-Update-rfc.sh-to-rhgs-3.5.5.patch @@ -0,0 +1,36 @@ +From 6ff3314f24687c8224a5520f9c4d2b3c39e730b7 Mon Sep 17 00:00:00 2001 +From: Tamar Shacked +Date: Tue, 1 Jun 2021 13:02:24 +0300 +Subject: [PATCH 546/584] Update rfc.sh to rhgs-3.5.5 + +Signed-off-by: Tamar Shacked +Change-Id: Iff543dc77174f983dd39f9fb7cc5005b49594750 +--- + rfc.sh | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/rfc.sh b/rfc.sh +index c0559b9..daeff32 100755 +--- a/rfc.sh ++++ b/rfc.sh +@@ -18,7 +18,7 @@ done + shift $((OPTIND-1)) + + +-branch="rhgs-3.5.4"; ++branch="rhgs-3.5.5"; + + set_hooks_commit_msg() + { +@@ -315,7 +315,7 @@ main() + if [ -z "${reference}" ]; then + $drier git push $ORIGIN HEAD:refs/for/$branch/rfc; + else +- $drier git push $ORIGIN HEAD:refs/for/$branch/ref-${reference}; ++ $drier git push $ORIGIN HEAD:refs/for/$branch; + fi + } + +-- +1.8.3.1 + diff --git a/SOURCES/0547-perf-write-behind-Clear-frame-local-on-conflict-erro.patch b/SOURCES/0547-perf-write-behind-Clear-frame-local-on-conflict-erro.patch new file mode 100644 index 0000000..2bd8e28 --- /dev/null +++ b/SOURCES/0547-perf-write-behind-Clear-frame-local-on-conflict-erro.patch @@ -0,0 +1,47 @@ +From 08c57926118b1ab8fa1fcd5b16913ff22d97d065 Mon Sep 17 00:00:00 2001 +From: N Balachandran +Date: Wed, 25 Sep 2019 19:50:27 +0530 +Subject: [PATCH 547/584] perf/write-behind: Clear frame->local on conflict + error + +WB saves the wb_inode in frame->local for the truncate and +ftruncate fops. This value is not cleared in case of error +on a conflicting write request. FRAME_DESTROY finds a non-null +frame->local and tries to free it using mem_put. However, +wb_inode is allocated using GF_CALLOC, causing the +process to crash. 
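A standalone C sketch of the allocator mismatch described above (all names are invented): the generic teardown assumes that anything left in the local field came from its own pool allocator, so an error path that leaves behind a pointer obtained from a different allocator must clear the field before unwinding, exactly as this patch does for the truncate and ftruncate requests.

    #include <stdlib.h>

    struct frame {
        void *local; /* teardown assumes pool-allocated memory */
    };

    void pool_put(void *p); /* hypothetical pool free, not malloc's free() */

    void
    frame_destroy(struct frame *f)
    {
        if (f->local)
            pool_put(f->local); /* would crash if local was malloc()ed */
        free(f);
    }

    void
    fail_request(struct frame *f)
    {
        /* local held a borrowed, malloc()ed object: detach it before
         * the generic teardown so pool_put() is never applied to it. */
        f->local = NULL;
        frame_destroy(f);
    }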
+ +credit: vpolakis@gmail.com + +Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/23485/ +>Change-Id: I217f61470445775e05145aebe44c814731c1b8c5 +>Fixes: bz#1753592 +>Signed-off-by: N Balachandran + +BUG: 1917488 +Change-Id: I217f61470445775e05145aebe44c814731c1b8c5 +Signed-off-by: Sunil Kumar H G +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244277 +Tested-by: RHGS Build Bot +--- + xlators/performance/write-behind/src/write-behind.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c +index 90a0bcf..31ab723 100644 +--- a/xlators/performance/write-behind/src/write-behind.c ++++ b/xlators/performance/write-behind/src/write-behind.c +@@ -1523,6 +1523,10 @@ __wb_handle_failed_conflict(wb_request_t *req, wb_request_t *conflict, + */ + req->op_ret = -1; + req->op_errno = conflict->op_errno; ++ if ((req->stub->fop == GF_FOP_TRUNCATE) || ++ (req->stub->fop == GF_FOP_FTRUNCATE)) { ++ req->stub->frame->local = NULL; ++ } + + list_del_init(&req->todo); + list_add_tail(&req->winds, tasks); +-- +1.8.3.1 + diff --git a/SOURCES/0548-Add-tar-as-dependency-to-geo-rep-rpm-for-RHEL-8.3-an.patch b/SOURCES/0548-Add-tar-as-dependency-to-geo-rep-rpm-for-RHEL-8.3-an.patch new file mode 100644 index 0000000..aed347c --- /dev/null +++ b/SOURCES/0548-Add-tar-as-dependency-to-geo-rep-rpm-for-RHEL-8.3-an.patch @@ -0,0 +1,49 @@ +From cb7e72bce8b6a46605753b72919c1c839ecb4cc9 Mon Sep 17 00:00:00 2001 +From: root +Date: Thu, 3 Jun 2021 12:08:24 +0530 +Subject: [PATCH 548/584] Add tar as dependency to geo-rep rpm for RHEL 8.3 and + above + +Reason: from RHEL 8.3, tar is not bundled by default + +>Fixes: #1849 +>Signed-off-by: Shwetha K Acharya +>Change-Id: Ic1424e0550cef6a78e3e9e7b42665ab01016436f +Upstream Patch: https://github.com/gluster/glusterfs/pull/1850 + +BUG: 1901468 +Change-Id: Ic1424e0550cef6a78e3e9e7b42665ab01016436f +Signed-off-by: Shwetha K Acharya +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244896 +Tested-by: RHGS Build Bot +Reviewed-by: Srijan Sivakumar +--- + glusterfs.spec.in | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 2be7677..424f4ab 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -521,6 +521,9 @@ Requires: python%{_pythonver}-gluster = %{version}-%{release} + Requires: rsync + Requires: util-linux + Requires: %{name}-libs%{?_isa} = %{version}-%{release} ++%if ( 0%{?rhel} && ( ( 0%{?rhel} == 8 && 0%{?rhel_minor_version} >= 3 ) || 0%{?rhel} >= 9 ) ) ++Requires: tar ++%endif + # required for setting selinux bools + %if ( 0%{?rhel} && 0%{?rhel} >= 8 ) + Requires(post): policycoreutils-python-utils +@@ -1982,6 +1985,8 @@ fi + %endif + + %changelog ++* Thu Nov 26 2020 Shwetha K Acharya ++- Add tar as dependency to georeplication rpm for RHEL version >= 8.3 + + * Mon May 11 2020 Sunny Kumar + - added requires policycoreutils-python-utils on rhel8 for geo-replication +-- +1.8.3.1 + diff --git a/SOURCES/0549-geo-rep-Change-in-attribute-for-getting-function-nam.patch b/SOURCES/0549-geo-rep-Change-in-attribute-for-getting-function-nam.patch new file mode 100644 index 0000000..b61e5ea --- /dev/null +++ b/SOURCES/0549-geo-rep-Change-in-attribute-for-getting-function-nam.patch @@ -0,0 +1,45 @@ +From f90c13912a9c64e4479b55fee4ba4ac50e509302 Mon Sep 17 00:00:00 2001 +From: schaffung +Date: Sat, 9 Jan 2021 15:41:15 +0530 +Subject: [PATCH 549/584] geo-rep : Change in 
attribute for getting function
+ name in py 3 (#1900)
+
+Issue: The schedule_geo-rep script uses `func_name` to obtain
+the name of the function being referred to, but from Python 3
+onwards, the attribute has been renamed to `__name__`.
+
+Code Change:
+ Changing `func_name` to `__name__`.
+
+>Fixes: #1898
+>Signed-off-by: srijan-sivakumar
+>Change-Id: I4ed69a06cffed9db17c8f8949b8000c74be1d717
+Upstream Patch: https://github.com/gluster/glusterfs/pull/1900
+
+BUG: 1903911
+Change-Id: I4ed69a06cffed9db17c8f8949b8000c74be1d717
+Signed-off-by: srijan-sivakumar
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244570
+Tested-by: RHGS Build Bot
+Reviewed-by: Shwetha Acharya
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ extras/geo-rep/schedule_georep.py.in | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/extras/geo-rep/schedule_georep.py.in b/extras/geo-rep/schedule_georep.py.in
+index ac93716..9bb3df5 100644
+--- a/extras/geo-rep/schedule_georep.py.in
++++ b/extras/geo-rep/schedule_georep.py.in
+@@ -102,7 +102,7 @@ def cache_output_with_args(func):
+     """
+     def wrapper(*args, **kwargs):
+         global cache_data
+-        key = "_".join([func.func_name] + list(args))
++        key = "_".join([func.__name__] + list(args))
+         if cache_data.get(key, None) is None:
+             cache_data[key] = func(*args, **kwargs)
+
+--
+1.8.3.1
+
diff --git a/SOURCES/0550-common-ha-stability-fixes-for-ganesha_grace-and-gane.patch b/SOURCES/0550-common-ha-stability-fixes-for-ganesha_grace-and-gane.patch
new file mode 100644
index 0000000..8bc6694
--- /dev/null
+++ b/SOURCES/0550-common-ha-stability-fixes-for-ganesha_grace-and-gane.patch
@@ -0,0 +1,184 @@
+From 053bb9c7356eae82b1089582bb2844388ae4df57 Mon Sep 17 00:00:00 2001
+From: "Kaleb S. KEITHLEY"
+Date: Wed, 2 Jun 2021 07:49:12 -0400
+Subject: [PATCH 550/584] common-ha: stability fixes for ganesha_grace and
+ ganesha_mon RAs
+
+Include fixes suggested by ClusterHA devs.
+
+1) It turns out that crm_attribute attrs and attrd_updater attrs really
+are one and the same, despite what I was told years ago.
+
+attrs created with crm_attribute ... --lifetime=reboot ... or
+attrd_updater are one and the same. As per ClusterHA devs, having an attr
+created with crm_attribute ... --lifetime=forever and also
+creating/updating the same attr with attrd_updater is a recipe for
+weird things to happen that will be difficult to debug.
+
+2) Using hostname -s or hostname for node names in crm_attribute and
+attrd_updater could potentially use the wrong name if the host has
+been renamed; use ocf_local_nodename() (in ocf-shellfuncs) instead.
+
+https://github.com/gluster/glusterfs/issues/2276
+https://github.com/gluster/glusterfs/pull/2283
+commit 9bd2c697686ec40e2c4f711df961860c8a735baa
+
+Change-Id: If572d396fae9206628714fb2ce00f72e94f2258f
+BUG: 1945143
+Signed-off-by: Kaleb S.
KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244593 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/ganesha/ocf/ganesha_grace | 28 +++++++++--------------- + extras/ganesha/ocf/ganesha_mon | 47 ++++++++++++++-------------------------- + 2 files changed, 26 insertions(+), 49 deletions(-) + +diff --git a/extras/ganesha/ocf/ganesha_grace b/extras/ganesha/ocf/ganesha_grace +index 825f716..edc6fa2 100644 +--- a/extras/ganesha/ocf/ganesha_grace ++++ b/extras/ganesha/ocf/ganesha_grace +@@ -94,25 +94,21 @@ esac + ganesha_grace_start() + { + local rc=${OCF_ERR_GENERIC} +- local host=$(hostname -s) ++ local host=$(ocf_local_nodename) + +- ocf_log debug "ganesha_grace_start()" +- # give ganesha_mon RA a chance to set the crm_attr first ++ ocf_log debug "ganesha_grace_start ${host}" ++ # give ganesha_mon RA a chance to set the attr first + # I mislike the sleep, but it's not clear that looping + # with a small sleep is necessarily better + # start has a 40sec timeout, so a 5sec sleep here is okay + sleep 5 +- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) ++ attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) + if [ $? -ne 0 ]; then +- host=$(hostname) +- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null ) +- if [ $? -ne 0 ]; then +- ocf_log info "grace start: crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" +- fi ++ ocf_log info "grace start: attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" + fi + + # Three possibilities: +- # 1. There is no attribute at all and attr_updater returns ++ # 1. There is no attribute at all and attrd_updater returns + # a zero length string. This happens when + # ganesha_mon::monitor hasn't run at least once to set + # the attribute. The assumption here is that the system +@@ -164,17 +160,13 @@ ganesha_grace_notify() + + ganesha_grace_monitor() + { +- local host=$(hostname -s) ++ local host=$(ocf_local_nodename) + +- ocf_log debug "monitor" ++ ocf_log debug "ganesha_grace monitor ${host}" + +- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) ++ attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) + if [ $? -ne 0 ]; then +- host=$(hostname) +- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) +- if [ $? -ne 0 ]; then +- ocf_log info "crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" +- fi ++ ocf_log info "attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" + fi + + # if there is no attribute (yet), maybe it's because +diff --git a/extras/ganesha/ocf/ganesha_mon b/extras/ganesha/ocf/ganesha_mon +index 2b4a9d6..7fbbf70 100644 +--- a/extras/ganesha/ocf/ganesha_mon ++++ b/extras/ganesha/ocf/ganesha_mon +@@ -124,7 +124,6 @@ ganesha_mon_stop() + + ganesha_mon_monitor() + { +- local host=$(hostname -s) + local pid_file="/var/run/ganesha.pid" + local rhel6_pid_file="/var/run/ganesha.nfsd.pid" + local proc_pid="/proc/" +@@ -141,31 +140,27 @@ ganesha_mon_monitor() + + if [ "x${proc_pid}" != "x/proc/" -a -d ${proc_pid} ]; then + +- attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1 ++ attrd_updater --name ${OCF_RESKEY_ganesha_active} -v 1 + if [ $? 
-ne 0 ]; then +- ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1 failed" ++ ocf_log info "warning: attrd_updater --name ${OCF_RESKEY_ganesha_active} -v 1 failed" + fi + + # ganesha_grace (nfs-grace) RA follows grace-active attr + # w/ constraint location +- attrd_updater -n ${OCF_RESKEY_grace_active} -v 1 ++ attrd_updater --name ${OCF_RESKEY_grace_active} -v 1 + if [ $? -ne 0 ]; then +- ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_grace_active} -v 1 failed" ++ ocf_log info "warning: attrd_updater --name ${OCF_RESKEY_grace_active} -v 1 failed" + fi + + # ganesha_mon (nfs-mon) and ganesha_grace (nfs-grace) +- # track grace-active crm_attr (attr != crm_attr) +- # we can't just use the attr as there's no way to query +- # its value in RHEL6 pacemaker +- +- crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null +- if [ $? -ne 0 ]; then +- host=$(hostname) +- crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null +- if [ $? -ne 0 ]; then +- ocf_log info "mon monitor warning: crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 failed" +- fi +- fi ++ # track grace-active attr. ++ # ++ # Originally we were told that attrs set with attrd_updater ++ # are different/distinct than attrs set with crm_attribute. ++ # Now, years later, we are told that they are the same and ++ # that the values of attrs set with attrd_updater can be ++ # retrieved with crm_attribute. Or with attrd_updater -Q ++ # now that we no longer have to deal with rhel6. + + return ${OCF_SUCCESS} + fi +@@ -182,26 +177,16 @@ ganesha_mon_monitor() + # the remaining ganesha.nfsds into grace before + # initiating the VIP fail-over. + +- attrd_updater -D -n ${OCF_RESKEY_grace_active} +- if [ $? -ne 0 ]; then +- ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_grace_active} failed" +- fi +- +- host=$(hostname -s) +- crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null ++ attrd_updater --delete --name ${OCF_RESKEY_grace_active} + if [ $? -ne 0 ]; then +- host=$(hostname) +- crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null +- if [ $? -ne 0 ]; then +- ocf_log info "mon monitor warning: crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 failed" +- fi ++ ocf_log info "warning: attrd_updater --delete --name ${OCF_RESKEY_grace_active} failed" + fi + + sleep ${OCF_RESKEY_grace_delay} + +- attrd_updater -D -n ${OCF_RESKEY_ganesha_active} ++ attrd_updater --delete --name ${OCF_RESKEY_ganesha_active} + if [ $? -ne 0 ]; then +- ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_ganesha_active} failed" ++ ocf_log info "warning: attrd_updater --delete --name ${OCF_RESKEY_ganesha_active} failed" + fi + + return ${OCF_SUCCESS} +-- +1.8.3.1 + diff --git a/SOURCES/0551-common-ha-ensure-shared_storage-is-mounted-before-se.patch b/SOURCES/0551-common-ha-ensure-shared_storage-is-mounted-before-se.patch new file mode 100644 index 0000000..e3a107f --- /dev/null +++ b/SOURCES/0551-common-ha-ensure-shared_storage-is-mounted-before-se.patch @@ -0,0 +1,52 @@ +From fcfd40132624df5e888d53b4a8c4ce1cf7087413 Mon Sep 17 00:00:00 2001 +From: "Kaleb S. 
KEITHLEY" +Date: Wed, 2 Jun 2021 07:40:04 -0400 +Subject: [PATCH 551/584] common-ha: ensure shared_storage is mounted before + setup (#2296) + +If gluster shared-storage isn't mounted, ganesha will fail to start + +commit a249b9020d281d0482db0aeb52e8856acd931e02 +https://github.com/gluster/glusterfs/issues/2278 +https://github.com/gluster/glusterfs/pull/2296 + +Change-Id: I6ed7044ea6b6c61b013ebe17088bfde311b109b7 +BUG: 1918018 +Signed-off-by: Kaleb S. KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244592 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/ganesha/scripts/ganesha-ha.sh | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/extras/ganesha/scripts/ganesha-ha.sh b/extras/ganesha/scripts/ganesha-ha.sh +index 491c61d..012084f 100644 +--- a/extras/ganesha/scripts/ganesha-ha.sh ++++ b/extras/ganesha/scripts/ganesha-ha.sh +@@ -195,9 +195,22 @@ setup_cluster() + local servers=${3} + local unclean="" + local quorum_policy="stop" ++ local dfresult="" + + logger "setting up cluster ${name} with the following ${servers}" + ++ # check that shared_storage is mounted ++ dfresult=$(df -T ${HA_VOL_MNT}) ++ if [[ -z "${dfresult}" ]]; then ++ logger "gluster shared_storage is not mounted, exiting..." ++ exit 1 ++ fi ++ ++ if [[ "${dfresult}" != *"fuse.glusterfs"* ]]; then ++ logger "gluster shared_storage is not mounted, exiting..." ++ exit 1 ++ fi ++ + # pcs cluster setup --force ${PCS9OR10_PCS_CNAME_OPTION} ${name} ${servers} + pcs cluster setup --force ${PCS9OR10_PCS_CNAME_OPTION} ${name} --enable ${servers} + if [ $? -ne 0 ]; then +-- +1.8.3.1 + diff --git a/SOURCES/0552-cluster-afr-Change-default-self-heal-window-size-to-.patch b/SOURCES/0552-cluster-afr-Change-default-self-heal-window-size-to-.patch new file mode 100644 index 0000000..41b94cd --- /dev/null +++ b/SOURCES/0552-cluster-afr-Change-default-self-heal-window-size-to-.patch @@ -0,0 +1,67 @@ +From e9e1b0bc6e2deaf44190636ab6826065ed3c0392 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar Karampuri +Date: Wed, 3 Feb 2021 18:10:40 +0530 +Subject: [PATCH 552/584] cluster/afr: Change default self-heal-window-size to + 1MB (#2068) + +At the moment self-heal-window-size is 128KB. This leads to healing data +in 128KB chunks. With the growth of data and the avg file sizes +nowadays, 1MB seems like a better default. + +Upstream patch details: +> https://github.com/gluster/glusterfs/pull/2111 +> Change-Id: I70c42c83b16c7adb53d6b5762969e878477efb5c +> Fixes: #2067 +> Signed-off-by: Pranith Kumar K + +BUG: 1946171 +Change-Id: Icd6a5c02ca16a1a6095f7bc10feed8ddc2505f41 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244557 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-self-heal-data.c | 6 ++++++ + xlators/cluster/afr/src/afr.c | 6 +++--- + 2 files changed, 9 insertions(+), 3 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c +index b97c66b..156cb18 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-data.c ++++ b/xlators/cluster/afr/src/afr-self-heal-data.c +@@ -337,6 +337,12 @@ afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + } + + block = 128 * 1024 * priv->data_self_heal_window_size; ++ if (HAS_HOLES((&replies[source].poststat))) { ++ /*Reduce the possibility of data-block allocations in case of files ++ * with holes. 
Correct way to fix it would be to use seek fop while ++ * healing data*/ ++ block = 128 * 1024; ++ } + + type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies); + +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index 33fe4d8..0956e5a 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -910,12 +910,12 @@ struct volume_options options[] = { + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 1024, +- .default_value = "1", ++ .default_value = "8", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, +- .description = "Maximum number blocks per file for which self-heal " +- "process would be applied simultaneously."}, ++ .description = "Maximum number of 128KB blocks per file for which " ++ "self-heal process would be applied simultaneously."}, + {.key = {"metadata-self-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", +-- +1.8.3.1 + diff --git a/SOURCES/0553-cluster-ec-Change-self-heal-window-size-to-4MiB-by-d.patch b/SOURCES/0553-cluster-ec-Change-self-heal-window-size-to-4MiB-by-d.patch new file mode 100644 index 0000000..2144845 --- /dev/null +++ b/SOURCES/0553-cluster-ec-Change-self-heal-window-size-to-4MiB-by-d.patch @@ -0,0 +1,46 @@ +From 1fa01865eb9bf6a1113669c262fc526ef11f61f2 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Sat, 6 Feb 2021 01:53:28 +0100 +Subject: [PATCH 553/584] cluster/ec: Change self-heal-window-size to 4MiB by + default (#2071) + +The current block size used for self-heal by default is 128 KiB. This +requires a significant amount of management requests for a very small +portion of data healed. + +With this patch the block size is increased to 4 MiB. For a standard +EC volume configuration of 4+2, this means that each healed block of +a file will update 1 MiB on each brick. + +Upstream patch details: +> https://github.com/gluster/glusterfs/pull/2071 +> Change-Id: Ifeec4a2d54988017d038085720513c121b03445b +> Updates: #2067 +> Signed-off-by: Xavi Hernandez + +BUG: 1946171 +Change-Id: I9e3eed2d83c9de54242e6161b2e3951c2f6f8000 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244558 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/ec/src/ec.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c +index 4118c3b..a930089 100644 +--- a/xlators/cluster/ec/src/ec.c ++++ b/xlators/cluster/ec/src/ec.c +@@ -1644,7 +1644,7 @@ struct volume_options options[] = { + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 1024, +- .default_value = "1", ++ .default_value = "32", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"disperse"}, +-- +1.8.3.1 + diff --git a/SOURCES/0554-dht-fix-rebalance-of-sparse-files.patch b/SOURCES/0554-dht-fix-rebalance-of-sparse-files.patch new file mode 100644 index 0000000..935303b --- /dev/null +++ b/SOURCES/0554-dht-fix-rebalance-of-sparse-files.patch @@ -0,0 +1,245 @@ +From 2cb90b7798fa469f2d7d938ae88733eb1962d63d Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Fri, 9 Apr 2021 18:13:30 +0200 +Subject: [PATCH 554/584] dht: fix rebalance of sparse files + +Current implementation of rebalance for sparse files has a bug that, +in some cases, causes a read of 0 bytes from the source subvolume. 
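+
+For background, data segments in a sparse file can be located using the
+SEEK_DATA/SEEK_HOLE semantics of lseek(2). A minimal userspace sketch of
+that contract, with copy_segment() as a hypothetical placeholder for the
+migration loop (the patch itself uses gluster's syncop_seek()):
+
+    #define _GNU_SOURCE
+    #include <unistd.h>
+
+    int copy_segment(int fd, off_t start, off_t len); /* placeholder */
+
+    static int walk_data_segments(int fd)
+    {
+        off_t data, hole = 0;
+
+        /* Find the next data segment; lseek() fails with ENXIO at EOF. */
+        while ((data = lseek(fd, hole, SEEK_DATA)) >= 0) {
+            /* Every data segment is followed by a hole (EOF counts as
+             * one), so 'hole - data' is always strictly positive. */
+            hole = lseek(fd, data, SEEK_HOLE);
+            if (hole < 0)
+                return -1;
+            if (copy_segment(fd, data, hole - data) < 0)
+                return -1; /* never issues a 0-byte read */
+        }
+        return 0;
+    }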
+Posix xlator doesn't allow 0 byte reads and fails them with EINVAL, +which causes rebalance to abort the migration. + +This patch implements a more robust way of finding data segments in +a sparse file that avoids 0 byte reads, allowing the file to be +migrated successfully. + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2318 +> Fixes: #2317 +> Change-Id: Iff168dda2fb0f2edf716b21eb04cc2cc8ac3915c +> Signed-off-by: Xavi Hernandez + +BUG: 1957641 +Change-Id: Iff168dda2fb0f2edf716b21eb04cc2cc8ac3915c +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244551 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/distribute/issue-2317.t | 29 ++++++++ + tests/volume.rc | 4 ++ + xlators/cluster/dht/src/dht-rebalance.c | 116 +++++++++++++++++--------------- + 3 files changed, 93 insertions(+), 56 deletions(-) + create mode 100755 tests/bugs/distribute/issue-2317.t + +diff --git a/tests/bugs/distribute/issue-2317.t b/tests/bugs/distribute/issue-2317.t +new file mode 100755 +index 0000000..e29d003 +--- /dev/null ++++ b/tests/bugs/distribute/issue-2317.t +@@ -0,0 +1,29 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++TESTS_EXPECTED_IN_LOOP=126 ++ ++cleanup ++ ++TEST glusterd ++TEST ${CLI} volume create ${V0} replica 3 ${H0}:/$B0/${V0}_{0..2} ++TEST ${CLI} volume start ${V0} ++ ++TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0} ++ ++# Create several files to make sure that at least some of them should be ++# migrated by rebalance. ++for i in {0..63}; do ++ TEST dd if=/dev/urandom of=${M0}/file.${i} bs=4k count=1 ++ TEST dd if=/dev/urandom of=${M0}/file.${i} bs=4k count=1 seek=128 ++done ++ ++TEST ${CLI} volume add-brick ${V0} ${H0}:${B0}/${V0}_{3..5} ++TEST ${CLI} volume rebalance ${V0} start force ++EXPECT_WITHIN ${REBALANCE_TIMEOUT} "completed" rebalance_status_field "${V0}" ++ ++EXPECT "^0$" rebalance_failed_field "${V0}" ++ ++cleanup +diff --git a/tests/volume.rc b/tests/volume.rc +index 9a002d9..f5dd0b1 100644 +--- a/tests/volume.rc ++++ b/tests/volume.rc +@@ -75,6 +75,10 @@ function rebalance_status_field { + $CLI volume rebalance $1 status | awk '{print $7}' | sed -n 3p + } + ++function rebalance_failed_field { ++ $CLI volume rebalance $1 status | awk '{print $5}' | sed -n 3p ++} ++ + function fix-layout_status_field { + #The fix-layout status can be up to 3 words, (ex:'fix-layout in progress'), hence the awk-print $2 thru $4. + #But if the status is less than 3 words, it also prints the next field i.e the run_time_in_secs.(ex:'completed 3.00'). +diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c +index 072896d..eab7558 100644 +--- a/xlators/cluster/dht/src/dht-rebalance.c ++++ b/xlators/cluster/dht/src/dht-rebalance.c +@@ -1024,6 +1024,46 @@ out: + return ret; + } + ++static int32_t ++dht_rebalance_sparse_segment(xlator_t *subvol, fd_t *fd, off_t *offset, ++ size_t *size) ++{ ++ off_t hole; ++ int32_t ret; ++ ++ do { ++ ret = syncop_seek(subvol, fd, *offset, GF_SEEK_DATA, NULL, offset); ++ if (ret >= 0) { ++ /* Starting at the offset of the last data segment, find the ++ * next hole. After a data segment there should always be a ++ * hole, since EOF is considered a hole. */ ++ ret = syncop_seek(subvol, fd, *offset, GF_SEEK_HOLE, NULL, &hole); ++ } ++ ++ if (ret < 0) { ++ if (ret == -ENXIO) { ++ /* This can happen if there are no more data segments (i.e. 
++ * the offset is at EOF), or there was a data segment but the ++ * file has been truncated to a smaller size between both ++ * seek requests. In both cases we are done. The file doesn't ++ * contain more data. */ ++ ret = 0; ++ } ++ return ret; ++ } ++ ++ /* It could happen that at the same offset we detected data in the ++ * first seek, there could be a hole in the second seek if user is ++ * modifying the file concurrently. In this case we need to find a ++ * new data segment to migrate. */ ++ } while (hole <= *offset); ++ ++ /* Calculate the total size of the current data block */ ++ *size = hole - *offset; ++ ++ return 1; ++} ++ + static int + __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, +@@ -1032,8 +1072,6 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + int ret = 0; + int count = 0; + off_t offset = 0; +- off_t data_offset = 0; +- off_t hole_offset = 0; + struct iovec *vector = NULL; + struct iobref *iobref = NULL; + uint64_t total = 0; +@@ -1048,71 +1086,36 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + while (total < ia_size) { + /* This is a regular file - read it sequentially */ + if (!hole_exists) { +- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) +- ? DHT_REBALANCE_BLKSIZE +- : (ia_size - total)); ++ data_block_size = ia_size - total; + } else { + /* This is a sparse file - read only the data segments in the file + */ + + /* If the previous data block is fully copied, find the next data +- * segment +- * starting at the offset of the last read and written byte, */ ++ * segment starting at the offset of the last read and written ++ * byte. */ + if (data_block_size <= 0) { +- ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL, +- &data_offset); +- if (ret) { +- if (ret == -ENXIO) +- ret = 0; /* No more data segments */ +- else +- *fop_errno = -ret; /* Error occurred */ +- ++ ret = dht_rebalance_sparse_segment(from, src, &offset, ++ &data_block_size); ++ if (ret <= 0) { ++ *fop_errno = -ret; + break; + } +- +- /* If the position of the current data segment is greater than +- * the position of the next hole, find the next hole in order to +- * calculate the length of the new data segment */ +- if (data_offset > hole_offset) { +- /* Starting at the offset of the last data segment, find the +- * next hole */ +- ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE, +- NULL, &hole_offset); +- if (ret) { +- /* If an error occurred here it's a real error because +- * if the seek for a data segment was successful then +- * necessarily another hole must exist (EOF is a hole) +- */ +- *fop_errno = -ret; +- break; +- } +- +- /* Calculate the total size of the current data block */ +- data_block_size = hole_offset - data_offset; +- } +- } else { +- /* There is still data in the current segment, move the +- * data_offset to the position of the last written byte */ +- data_offset = offset; + } +- +- /* Calculate how much data needs to be read and written. If the data +- * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and +- * write DHT_REBALANCE_BLKSIZE data length and the rest in the +- * next iteration(s) */ +- read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE) +- ? 
DHT_REBALANCE_BLKSIZE +- : data_block_size); +- +- /* Calculate the remaining size of the data block - maybe there's no +- * need to seek for data in the next iteration */ +- data_block_size -= read_size; +- +- /* Set offset to the offset of the data segment so read and write +- * will have the correct position */ +- offset = data_offset; + } + ++ /* Calculate how much data needs to be read and written. If the data ++ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and ++ * write DHT_REBALANCE_BLKSIZE data length and the rest in the ++ * next iteration(s) */ ++ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE) ++ ? DHT_REBALANCE_BLKSIZE ++ : data_block_size); ++ ++ /* Calculate the remaining size of the data block - maybe there's no ++ * need to seek for data in the next iteration */ ++ data_block_size -= read_size; ++ + ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count, + &iobref, NULL, NULL, NULL); + +@@ -1177,6 +1180,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + iobref = NULL; + vector = NULL; + } ++ + if (iobref) + iobref_unref(iobref); + GF_FREE(vector); +-- +1.8.3.1 + diff --git a/SOURCES/0555-geo-rep-Improve-handling-of-gfid-mismatches.patch b/SOURCES/0555-geo-rep-Improve-handling-of-gfid-mismatches.patch new file mode 100644 index 0000000..85b19e0 --- /dev/null +++ b/SOURCES/0555-geo-rep-Improve-handling-of-gfid-mismatches.patch @@ -0,0 +1,79 @@ +From f2d3866e617d25ea62cda01afddc81ef0db3356e Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Tue, 4 May 2021 22:39:03 +0200 +Subject: [PATCH 555/584] geo-rep: Improve handling of gfid mismatches + +In some circumstances geo-replication can detect mismatching gfids +between primary and secondary. These entries are fixed in an iterative +way, assuming that after a fix, a previously failing entry could +succeed. + +Previous code was trying to fix them in a loop that can be executed +up to 10 times. If some entry cannot be fixed after 10 attempts, it's +discarded. These fixes are very slow, so trying to do them many times +causes geo-replication to get out of sync. + +To minimize the number of iterations done, this patch checks if the +number of entries and failures remains constant after each iteration. +If they are constant, it means that nothing else can be fixed, so it +makes no sense to do more iterations. This reduces the number of +iterations to 2 or 3 in most of the cases, improving geo-replication +performance. 
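+
+The stopping rule, as a sketch in C-like notation (retry_batch() is a
+hypothetical stand-in for the entry-op retry; MAX_OE_RETRIES is the
+existing retry cap):
+
+    int prev_entries = -1, prev_failures = -1;
+
+    while (num_failures > 0 && count++ < MAX_OE_RETRIES) {
+        retry_batch(&num_entries, &num_failures);
+        if (num_entries == prev_entries && num_failures == prev_failures)
+            break; /* no progress since last pass: nothing else is fixable */
+        prev_entries = num_entries;
+        prev_failures = num_failures;
+    }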
+ +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2389 +> Fixes: #2388 +> Change-Id: I6d9a623a60045694e1a832195e1dc1fb9e88ae54 +> Signed-off-by: Xavi Hernandez + +BUG: 1957191 +Change-Id: I6d9a623a60045694e1a832195e1dc1fb9e88ae54 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244550 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + geo-replication/syncdaemon/master.py | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py +index 98637e7..aef9373 100644 +--- a/geo-replication/syncdaemon/master.py ++++ b/geo-replication/syncdaemon/master.py +@@ -1224,9 +1224,11 @@ class GMasterChangelogMixin(GMasterCommon): + + if gconf.get("gfid-conflict-resolution"): + count = 0 ++ num_entries = len(entries) ++ num_failures = len(failures) + if failures: + logging.info(lf('Entry ops failed with gfid mismatch', +- count=len(failures))) ++ count=num_failures)) + while failures and count < self.MAX_OE_RETRIES: + count += 1 + self.handle_entry_failures(failures, entries) +@@ -1237,6 +1239,20 @@ class GMasterChangelogMixin(GMasterCommon): + "gfid mismatch") + break + ++ # If this iteration has not removed any entry or reduced ++ # the number of failures compared to the previous one, we ++ # don't need to keep iterating because we'll get the same ++ # result in all other attempts. ++ if ((num_entries == len(entries)) and ++ (num_failures == len(failures))): ++ logging.info(lf("No more gfid mismatches can be fixed", ++ entries=num_entries, ++ failures=num_failures)) ++ break ++ ++ num_entries = len(entries) ++ num_failures = len(failures) ++ + self.log_failures(failures, 'gfid', gauxpfx(), 'ENTRY') + self.status.dec_value("entry", len(entries)) + +-- +1.8.3.1 + diff --git a/SOURCES/0556-dht-don-t-ignore-xdata-in-fgetxattr.patch b/SOURCES/0556-dht-don-t-ignore-xdata-in-fgetxattr.patch new file mode 100644 index 0000000..0cf3545 --- /dev/null +++ b/SOURCES/0556-dht-don-t-ignore-xdata-in-fgetxattr.patch @@ -0,0 +1,52 @@ +From a7f6ad0c617a36414c8232cb692471703923b16d Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Tue, 19 Jan 2021 18:03:33 +0100 +Subject: [PATCH 556/584] dht: don't ignore xdata in fgetxattr + +DHT was passing NULL for xdata in fgetxattr() request, ignoring any +data sent by upper xlators. + +This patch fixes the issue by sending the received xdata to lower +xlators, as it's currently done for getxattr(). 
+ +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2020 +> Fixes: #1991 +> Change-Id: If3d3f1f2ce6215f3b1acc46480e133cb4294eaec +> Signed-off-by: Xavi Hernandez + +BUG: 1919132 +Change-Id: If3d3f1f2ce6215f3b1acc46480e133cb4294eaec +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244538 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-common.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index 7425c1a..0773092 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -5262,7 +5262,7 @@ dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + + if (!ret && key && local->mds_subvol && dht_match_xattr(key)) { + STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol, +- local->mds_subvol->fops->fgetxattr, fd, key, NULL); ++ local->mds_subvol->fops->fgetxattr, fd, key, xdata); + + return 0; + } +@@ -5274,7 +5274,7 @@ dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, fd, +- key, NULL); ++ key, xdata); + } + return 0; + +-- +1.8.3.1 + diff --git a/SOURCES/0557-cluster-dht-Fix-stack-overflow-in-readdir-p.patch b/SOURCES/0557-cluster-dht-Fix-stack-overflow-in-readdir-p.patch new file mode 100644 index 0000000..2add6cb --- /dev/null +++ b/SOURCES/0557-cluster-dht-Fix-stack-overflow-in-readdir-p.patch @@ -0,0 +1,306 @@ +From ba57b043db1e19196cf860baeeeb1acfc9985cd2 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Wed, 24 Feb 2021 15:04:23 +0100 +Subject: [PATCH 557/584] cluster/dht: Fix stack overflow in readdir(p) + +When parallel-readdir is enabled, readdir(p) requests sent by DHT can be +immediately processed and answered in the same thread before the call to +STACK_WIND_COOKIE() completes. + +This means that the readdir(p) cbk is processed synchronously. In some +cases it may decide to send another readdir(p) request, which causes a +recursive call. + +When some special conditions happen and the directories are big, it's +possible that the number of nested calls is so high that the process +crashes because of a stack overflow. + +This patch fixes this by not allowing nested readdir(p) calls. When a +nested call is detected, it's queued instead of sending it. The queued +request is processed when the current call finishes by the top level +stack function. 
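+
+Reduced to its core, the queueing pattern looks like this (a sketch, not
+the literal DHT code; wind_one() stands in for the STACK_WIND_COOKIE
+call, and the uatomic_* helpers are the liburcu ones used by the patch):
+
+    if (uatomic_add_return(&local->queue, 1) == 1) {
+        /* We are the only winder: anyone arriving while we are busy
+         * just bumps the counter, and we run their request here too. */
+        do {
+            wind_one(local); /* may complete synchronously and re-queue */
+        } while (uatomic_sub_return(&local->queue, 1) > 0);
+    }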
+ +Backport of 3 patches: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2170 +> Fixes: #2169 +> Change-Id: Id763a8a51fb3c3314588ec7c162f649babf33099 +> Signed-off-by: Xavi Hernandez + +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2202 +> Updates: #2169 +> Change-Id: I97e73c0aae74fc5d80c975f56f2f7a64e3e1ae95 +> Signed-off-by: Xavi Hernandez + +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2242 +> Fixes: #2239 +> Change-Id: I6b2e48e87c85de27fad67a12d97abd91fa27c0c1 +> Signed-off-by: Pranith Kumar K + +BUG: 1798897 +Change-Id: Id763a8a51fb3c3314588ec7c162f649babf33099 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244549 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/distribute/issue-2169.t | 33 +++++++++ + xlators/cluster/dht/src/dht-common.c | 134 ++++++++++++++++++++++++++++++++--- + xlators/cluster/dht/src/dht-common.h | 5 ++ + 3 files changed, 162 insertions(+), 10 deletions(-) + create mode 100755 tests/bugs/distribute/issue-2169.t + +diff --git a/tests/bugs/distribute/issue-2169.t b/tests/bugs/distribute/issue-2169.t +new file mode 100755 +index 0000000..91fa72a +--- /dev/null ++++ b/tests/bugs/distribute/issue-2169.t +@@ -0,0 +1,33 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup ++ ++TEST glusterd ++TEST ${CLI} volume create ${V0} ${H0}:/$B0/${V0}_0 ++TEST ${CLI} volume set ${V0} readdir-ahead on ++TEST ${CLI} volume set ${V0} parallel-readdir on ++TEST ${CLI} volume start ${V0} ++ ++TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0} ++ ++TEST mkdir -p ${M0}/d/d.{000..999} ++ ++EXPECT_WITHIN ${UMOUNT_TIMEOUT} "Y" force_umount ${M0} ++ ++TEST ${CLI} volume add-brick ${V0} ${H0}:${B0}/${V0}_{1..7} ++ ++TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0} ++ ++ls -l ${M0}/d/ | wc -l ++ ++EXPECT_WITHIN ${UMOUNT_TIMEOUT} "Y" force_umount ${M0} ++TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0} ++ ++ls -l ${M0}/d/ | wc -l ++ ++TEST ls ${M0}/d ++ ++cleanup +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index 0773092..ce0fbbf 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -24,8 +24,15 @@ + #include + #include + ++#include ++ + int run_defrag = 0; + ++static int ++dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int op_ret, int op_errno, gf_dirent_t *entries, ++ dict_t *xdata); ++ + int + dht_link2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); + +@@ -6681,6 +6688,94 @@ out: + return; + } + ++/* Execute a READDIR request if no other request is in progress. Otherwise ++ * queue it to be executed when the current one finishes. ++ * ++ * When parallel-readdir is enabled and directory contents are cached, the ++ * callback of a readdirp will be called before returning from STACK_WIND. ++ * If the returned contents are not useful for DHT, and the buffer is not ++ * yet full, a nested readdirp request will be sent. This means that there ++ * will be many recursive calls. In the worst case there might be a stack ++ * overflow. ++ * ++ * To avoid this, we only wind a request if no other request is being wound. ++ * If there's another request, we simple store the values for the next call. ++ * When the thread processing the current wind completes it, it will take ++ * the new arguments and send the request from the top level stack. 
*/ ++static void ++dht_queue_readdir(call_frame_t *frame, xlator_t *xl, off_t offset, ++ fop_readdir_cbk_t cbk) ++{ ++ dht_local_t *local; ++ int32_t queue; ++ xlator_t *this = NULL; ++ ++ local = frame->local; ++ this = frame->this; ++ ++ local->queue_xl = xl; ++ local->queue_offset = offset; ++ ++ if (uatomic_add_return(&local->queue, 1) == 1) { ++ /* If we are here it means that we are the first one to send a ++ * readdir request. Any attempt to send more readdir requests will ++ * find local->queue > 1, so it won't do anything. The needed data ++ * to send the request has been stored into local->queue_*. ++ * ++ * Note: this works because we will only have 1 additional request ++ * at most (the one called by the cbk function) while we are ++ * processing another readdir. */ ++ do { ++ STACK_WIND_COOKIE(frame, cbk, local->queue_xl, local->queue_xl, ++ local->queue_xl->fops->readdir, local->fd, ++ local->size, local->queue_offset, local->xattr); ++ ++ /* If a new readdirp request has been added before returning ++ * from winding, we process it. */ ++ } while ((queue = uatomic_sub_return(&local->queue, 1)) > 0); ++ ++ if (queue < 0) { ++ /* A negative value means that an unwind has been called before ++ * returning from the previous wind. This means that 'local' is ++ * not needed anymore and must be destroyed. */ ++ dht_local_wipe(this, local); ++ } ++ } ++} ++ ++/* Execute a READDIRP request if no other request is in progress. Otherwise ++ * queue it to be executed when the current one finishes. */ ++static void ++dht_queue_readdirp(call_frame_t *frame, xlator_t *xl, off_t offset, ++ fop_readdirp_cbk_t cbk) ++{ ++ dht_local_t *local; ++ int32_t queue; ++ xlator_t *this = NULL; ++ ++ local = frame->local; ++ this = frame->this; ++ ++ local->queue_xl = xl; ++ local->queue_offset = offset; ++ ++ /* Check dht_queue_readdir() comments for an explanation of this. */ ++ if (uatomic_add_return(&local->queue, 1) == 1) { ++ do { ++ STACK_WIND_COOKIE(frame, cbk, local->queue_xl, local->queue_xl, ++ local->queue_xl->fops->readdirp, local->fd, ++ local->size, local->queue_offset, local->xattr); ++ } while ((queue = uatomic_sub_return(&local->queue, 1)) > 0); ++ ++ if (queue < 0) { ++ /* A negative value means that an unwind has been called before ++ * returning from the previous wind. This means that 'local' is ++ * not needed anymore and must be destroyed. */ ++ dht_local_wipe(this, local); ++ } ++ } ++} ++ + /* Posix returns op_errno = ENOENT to indicate that there are no more + * entries + */ +@@ -6950,9 +7045,8 @@ done: + } + } + +- STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol, +- next_subvol->fops->readdirp, local->fd, local->size, +- next_offset, local->xattr); ++ dht_queue_readdirp(frame, next_subvol, next_offset, dht_readdirp_cbk); ++ + return 0; + } + +@@ -6970,6 +7064,17 @@ unwind: + if (prev != dht_last_up_subvol(this)) + op_errno = 0; + ++ /* If we are inside a recursive call (or not inside a recursive call but ++ * the cbk is completed before the wind returns), local->queue will be 1. ++ * In this case we cannot destroy 'local' because it will be needed by ++ * the caller of STACK_WIND. In this case, we decrease the value to let ++ * the caller know that the operation has terminated and it must destroy ++ * 'local'. If local->queue 0, we can destroy it here because there are ++ * no other users. 
*/ ++ if (uatomic_sub_return(&local->queue, 1) >= 0) { ++ frame->local = NULL; ++ } ++ + DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free(&entries); +@@ -7071,9 +7176,8 @@ done: + goto unwind; + } + +- STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol, +- next_subvol->fops->readdir, local->fd, local->size, +- next_offset, NULL); ++ dht_queue_readdir(frame, next_subvol, next_offset, dht_readdir_cbk); ++ + return 0; + } + +@@ -7089,6 +7193,17 @@ unwind: + if (prev != dht_last_up_subvol(this)) + op_errno = 0; + ++ /* If we are inside a recursive call (or not inside a recursive call but ++ * the cbk is completed before the wind returns), local->queue will be 1. ++ * In this case we cannot destroy 'local' because it will be needed by ++ * the caller of STACK_WIND. In this case, we decrease the value to let ++ * the caller know that the operation has terminated and it must destroy ++ * 'local'. If local->queue 0, we can destroy it here because there are ++ * no other users. */ ++ if (uatomic_sub_return(&local->queue, 1) >= 0) { ++ frame->local = NULL; ++ } ++ + if (!skip_hashed_check) { + DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL); + gf_dirent_free(&entries); +@@ -7096,6 +7211,7 @@ unwind: + } else { + DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL); + } ++ + return 0; + } + +@@ -7172,11 +7288,9 @@ dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + } + } + +- STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol, +- xvol->fops->readdirp, fd, size, yoff, local->xattr); ++ dht_queue_readdirp(frame, xvol, yoff, dht_readdirp_cbk); + } else { +- STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol, +- xvol->fops->readdir, fd, size, yoff, local->xattr); ++ dht_queue_readdir(frame, xvol, yoff, dht_readdir_cbk); + } + + return 0; +diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h +index 92f1b89..132b3b3 100644 +--- a/xlators/cluster/dht/src/dht-common.h ++++ b/xlators/cluster/dht/src/dht-common.h +@@ -369,6 +369,11 @@ struct dht_local { + + dht_dir_transaction_t lock[2], *current; + ++ /* for nested readdirs */ ++ xlator_t *queue_xl; ++ off_t queue_offset; ++ int32_t queue; ++ + /* inodelks during filerename for backward compatibility */ + dht_lock_t **rename_inodelk_backward_compatible; + int rename_inodelk_bc_count; +-- +1.8.3.1 + diff --git a/SOURCES/0558-afr-fix-directory-entry-count.patch b/SOURCES/0558-afr-fix-directory-entry-count.patch new file mode 100644 index 0000000..4134f77 --- /dev/null +++ b/SOURCES/0558-afr-fix-directory-entry-count.patch @@ -0,0 +1,238 @@ +From 9bf6986f8ea3edd9de3d2629404f7ab11c1597de Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Tue, 9 Mar 2021 00:24:07 +0100 +Subject: [PATCH 558/584] afr: fix directory entry count + +AFR may hide some existing entries from a directory when reading it +because they are generated internally for private management. However +the returned number of entries from readdir() function is not updated +accordingly. So it may return a number higher than the real entries +present in the gf_dirent list. + +This may cause unexpected behavior of clients, including gfapi which +incorrectly assumes that there was an entry when the list was actually +empty. + +This patch also makes the check in gfapi more robust to avoid similar +issues that could appear in the future. 
+ +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2233 +> Fixes: #2232 +> Change-Id: I81ba3699248a53ebb0ee4e6e6231a4301436f763 +> Signed-off-by: Xavi Hernandez + +BUG: 1927411 +Change-Id: I81ba3699248a53ebb0ee4e6e6231a4301436f763 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244535 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + api/src/glfs-fops.c | 3 +- + tests/bugs/replicate/issue-2232.c | 85 ++++++++++++++++++++++++++++++++++ + tests/bugs/replicate/issue-2232.t | 34 ++++++++++++++ + xlators/cluster/afr/src/afr-dir-read.c | 11 +++-- + 4 files changed, 129 insertions(+), 4 deletions(-) + create mode 100644 tests/bugs/replicate/issue-2232.c + create mode 100644 tests/bugs/replicate/issue-2232.t + +diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c +index 6dc3b66..821d250 100644 +--- a/api/src/glfs-fops.c ++++ b/api/src/glfs-fops.c +@@ -3748,8 +3748,9 @@ glfd_entry_refresh(struct glfs_fd *glfd, int plus) + errno = 0; + } + +- if (ret > 0) ++ if ((ret > 0) && !list_empty(&glfd->entries)) { + glfd->next = list_entry(glfd->entries.next, gf_dirent_t, list); ++ } + + gf_dirent_free(&old); + out: +diff --git a/tests/bugs/replicate/issue-2232.c b/tests/bugs/replicate/issue-2232.c +new file mode 100644 +index 0000000..df547c2 +--- /dev/null ++++ b/tests/bugs/replicate/issue-2232.c +@@ -0,0 +1,85 @@ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int main(int argc, char **argv) ++{ ++ char log[128]; ++ struct dirent entry; ++ struct dirent *ent; ++ glfs_xreaddirp_stat_t *xstat; ++ int ret, flags; ++ ++ if (argc != 3) { ++ fprintf(stderr, "Syntax: %s \n", argv[0]); ++ exit(1); ++ } ++ char *hostname = argv[1]; ++ char *volname = argv[2]; ++ ++ glfs_t *fs = glfs_new(volname); ++ if (!fs) { ++ fprintf(stderr, "glfs_new() failed\n"); ++ exit(1); ++ } ++ ++ ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007); ++ if (ret < 0) { ++ fprintf(stderr, "glfs_set_volfile_server() failed\n"); ++ return ret; ++ } ++ ++ sprintf(log, "/tmp/logs-%d.log", getpid()); ++ ret = glfs_set_logging(fs, log, 9); ++ if (ret < 0) { ++ fprintf(stderr, "glfs_set_logging() failed\n"); ++ return ret; ++ } ++ ++ ret = glfs_init(fs); ++ if (ret < 0) { ++ fprintf(stderr, "glfs_init() failed\n"); ++ return ret; ++ } ++ ++ glfs_fd_t *fd = glfs_opendir(fs, "/"); ++ if (fd == NULL) { ++ fprintf(stderr, "glfs_opendir() failed\n"); ++ return 1; ++ } ++ ++ flags = GFAPI_XREADDIRP_STAT | GFAPI_XREADDIRP_HANDLE; ++ xstat = NULL; ++ while ((ret = glfs_xreaddirplus_r(fd, flags, &xstat, &entry, &ent)) > 0) { ++ if (xstat != NULL) { ++ glfs_free(xstat); ++ } ++ if ((strcmp(ent->d_name, ".") == 0) || ++ (strcmp(ent->d_name, "..") == 0)) { ++ xstat = NULL; ++ continue; ++ } ++ if ((xstat == NULL) || ((ret & GFAPI_XREADDIRP_HANDLE) == 0)) { ++ fprintf(stderr, "glfs_xreaddirplus_r() failed: %s\n", ++ strerror(errno)); ++ return 1; ++ } ++ ++ xstat = NULL; ++ } ++ ++ if (ret < 0) { ++ fprintf(stderr, "glfs_xreaddirplus_r() failed\n"); ++ return ret; ++ } ++ ++ glfs_close(fd); ++ ++ glfs_fini(fs); ++ ++ return ret; ++} +diff --git a/tests/bugs/replicate/issue-2232.t b/tests/bugs/replicate/issue-2232.t +new file mode 100644 +index 0000000..66a41e0 +--- /dev/null ++++ b/tests/bugs/replicate/issue-2232.t +@@ -0,0 +1,34 @@ ++#!/bin/bash ++ ++. $(dirname "${0}")/../../include.rc ++. 
$(dirname "${0}")/../../volume.rc ++ ++cleanup; ++TEST gcc $(dirname "${0}")/issue-2232.c -o $(dirname "${0}")/issue-2232 -lgfapi ++TEST glusterd ++TEST pidof glusterd ++ ++TEST $CLI volume create ${V0} replica 3 ${H0}:${B0}/${V0}{0..2} ++ ++# Create a fake .glusterfs-anonymous-inode-... entry ++ANONINO=".glusterfs-anonymous-inode-aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" ++TEST mkdir ${B0}/${V0}{0..2}/${ANONINO} ++gfid="$(uuidgen)" ++hex="0x$(echo "${gfid}" | tr -d '-')" ++TEST assign_gfid "${hex}" "${B0}/${V0}0/${ANONINO}" ++TEST assign_gfid "${hex}" "${B0}/${V0}1/${ANONINO}" ++TEST assign_gfid "${hex}" "${B0}/${V0}2/${ANONINO}" ++TEST mkdir -p "${B0}/${V0}0/.glusterfs/${gfid:0:2}/${gfid:2:2}" ++TEST mkdir -p "${B0}/${V0}1/.glusterfs/${gfid:0:2}/${gfid:2:2}" ++TEST mkdir -p "${B0}/${V0}2/.glusterfs/${gfid:0:2}/${gfid:2:2}" ++TEST ln -s "../../00/00/00000000-0000-0000-0000-000000000001/${ANONINO}" "${B0}/${V0}0/.glusterfs/${gfid:0:2}/${gfid:2:2}/${gfid}" ++TEST ln -s "../../00/00/00000000-0000-0000-0000-000000000001/${ANONINO}" "${B0}/${V0}1/.glusterfs/${gfid:0:2}/${gfid:2:2}/${gfid}" ++TEST ln -s "../../00/00/00000000-0000-0000-0000-000000000001/${ANONINO}" "${B0}/${V0}2/.glusterfs/${gfid:0:2}/${gfid:2:2}/${gfid}" ++ ++TEST $CLI volume start ${V0} ++ ++TEST $(dirname "${0}")/issue-2232 ${H0} ${V0} ++ ++TEST rm -f $(dirname $0)/issue-2232 ++ ++cleanup +diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c +index d64b6a9..a98f8df 100644 +--- a/xlators/cluster/afr/src/afr-dir-read.c ++++ b/xlators/cluster/afr/src/afr-dir-read.c +@@ -157,7 +157,7 @@ afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol) + return 0; + } + +-static void ++static int32_t + afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + int subvol, gf_dirent_t *entries, fd_t *fd) + { +@@ -168,6 +168,7 @@ afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + afr_private_t *priv = NULL; + gf_boolean_t need_heal = _gf_false; + gf_boolean_t validate_subvol = _gf_false; ++ int32_t count = 0; + + this = THIS; + priv = this->private; +@@ -184,6 +185,7 @@ afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + + list_del_init(&entry->list); + list_add_tail(&entry->list, &entries->list); ++ count++; + + if (!validate_subvol) + continue; +@@ -197,6 +199,8 @@ afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + } + } + } ++ ++ return count; + } + + int32_t +@@ -222,8 +226,9 @@ afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + } + + if (op_ret >= 0) +- afr_readdir_transform_entries(frame, subvol_entries, (long)cookie, +- &entries, local->fd); ++ op_ret = afr_readdir_transform_entries(frame, subvol_entries, ++ (long)cookie, &entries, ++ local->fd); + + AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata); + +-- +1.8.3.1 + diff --git a/SOURCES/0559-afr-make-fsync-post-op-aware-of-inodelk-count-2273.patch b/SOURCES/0559-afr-make-fsync-post-op-aware-of-inodelk-count-2273.patch new file mode 100644 index 0000000..91add36 --- /dev/null +++ b/SOURCES/0559-afr-make-fsync-post-op-aware-of-inodelk-count-2273.patch @@ -0,0 +1,163 @@ +From 2b6e6c234dffa72c9f2af747908b1e1f29080698 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Thu, 25 Mar 2021 11:52:13 +0530 +Subject: [PATCH 559/584] afr: make fsync post-op aware of inodelk count + (#2273) + +Problem: +Since commit bd540db1e, eager-locking was enabled for fsync. 
But on +certain VM workloads wit sharding enabled, shard xlator keeps sending +fsync on the base shard. This can cause blocked inodelks from other +clients (including shd) to time out due to call bail. + +Fix: +Make afr fsync aware of inodelk count and not delay post-op + unlock +when inodelk count > 1, just like writev. + +Code is restructured so that any fd based AFR_DATA_TRANSACTION can be made +aware by setting GLUSTERFS_INODELK_DOM_COUNT in xdata request. + +Note: We do not know yet why VMs go in to paused state because of the +blocked inodelks but this patch should be a first step in reducing the +occurence. + +Upstream patch details: +> https://github.com/gluster/glusterfs/pull/2273/ +> Updates: #2198 +> Change-Id: Ib91ebdd3101d590c326e69c829cf9335003e260b +> Signed-off-by: Ravishankar N + +BUG: 1943467 +Change-Id: Id407ca54007e3bbb206a1d9431ebaf89a2167f74 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244516 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-inode-write.c | 40 ++++++++++++++++++------------- + xlators/features/locks/src/posix.c | 1 + + 2 files changed, 24 insertions(+), 17 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c +index df82b6e..962a7b1 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.c ++++ b/xlators/cluster/afr/src/afr-inode-write.c +@@ -42,6 +42,7 @@ __afr_inode_write_finalize(call_frame_t *frame, xlator_t *this) + struct iatt *stbuf = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; ++ afr_lock_t *lock = NULL; + afr_read_subvol_args_t args = { + 0, + }; +@@ -50,6 +51,12 @@ __afr_inode_write_finalize(call_frame_t *frame, xlator_t *this) + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, local->inode, out); + ++ if (local->update_num_inodelks && ++ local->transaction.type == AFR_DATA_TRANSACTION) { ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ lock->num_inodelks = local->num_inodelks; ++ } ++ + /*This code needs to stay till DHT sends fops on linked + * inodes*/ + if (!inode_is_linked(local->inode)) { +@@ -134,6 +141,7 @@ __afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; ++ int num_inodelks = 0; + + local = frame->local; + priv = this->private; +@@ -146,8 +154,16 @@ __afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; +- if (xdata) ++ if (xdata) { + local->replies[child_index].xdata = dict_ref(xdata); ++ if (dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT, ++ &num_inodelks) == 0) { ++ if (num_inodelks > local->num_inodelks) { ++ local->num_inodelks = num_inodelks; ++ local->update_num_inodelks = _gf_true; ++ } ++ } ++ } + + if (op_ret >= 0) { + if (prebuf) +@@ -284,7 +300,6 @@ afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + afr_local_t *local = frame->local; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; +- int32_t num_inodelks = 0; + + LOCK(&frame->lock); + { +@@ -306,15 +321,6 @@ afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; + } +- +- ret = dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT, +- &num_inodelks); +- if (ret < 0) +- goto unlock; +- if (num_inodelks 
> local->num_inodelks) { +- local->num_inodelks = num_inodelks; +- local->update_num_inodelks = _gf_true; +- } + } + unlock: + UNLOCK(&frame->lock); +@@ -324,7 +330,6 @@ void + afr_process_post_writev(call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; +- afr_lock_t *lock = NULL; + + local = frame->local; + +@@ -343,11 +348,6 @@ afr_process_post_writev(call_frame_t *frame, xlator_t *this) + + if (local->update_open_fd_count) + local->inode_ctx->open_fd_count = local->open_fd_count; +- if (local->update_num_inodelks && +- local->transaction.type == AFR_DATA_TRANSACTION) { +- lock = &local->inode_ctx->lock[local->transaction.type]; +- lock->num_inodelks = local->num_inodelks; +- } + } + + int +@@ -2516,6 +2516,12 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + if (!local->xdata_req) + goto out; + ++ if (dict_set_str_sizen(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT, ++ this->name)) { ++ op_errno = ENOMEM; ++ goto out; ++ } ++ + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) +diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c +index cdd1ff7..22ef5b8 100644 +--- a/xlators/features/locks/src/posix.c ++++ b/xlators/features/locks/src/posix.c +@@ -4943,6 +4943,7 @@ struct xlator_fops fops = { + .rchecksum = pl_rchecksum, + .statfs = pl_statfs, + .fsyncdir = pl_fsyncdir, ++ .fsync = pl_fsync, + .readdir = pl_readdir, + .symlink = pl_symlink, + .link = pl_link, +-- +1.8.3.1 + diff --git a/SOURCES/0560-posix-Avoid-dict_del-logs-in-posix_is_layout_stale-w.patch b/SOURCES/0560-posix-Avoid-dict_del-logs-in-posix_is_layout_stale-w.patch new file mode 100644 index 0000000..cccac36 --- /dev/null +++ b/SOURCES/0560-posix-Avoid-dict_del-logs-in-posix_is_layout_stale-w.patch @@ -0,0 +1,73 @@ +From e56605d5808b41335026a5470fa10f5e5b5389f3 Mon Sep 17 00:00:00 2001 +From: Mohit Agrawal +Date: Mon, 6 Apr 2020 21:58:03 +0530 +Subject: [PATCH 560/584] posix: Avoid dict_del logs in posix_is_layout_stale + while key is NULL + +Problem: The key "GF_PREOP_PARENT_KEY" has been populated by dht and + for non-distribute volume like 1x3 key is not populated so + posix_is_layout stale throw a message while a file is created + +Solution: To avoid a log put a condition before delete a key + +Upstream patch details: +> https://review.gluster.org/#/c/glusterfs/+/24297/ +> Change-Id: I813ee7960633e7f9f5e9ad2f42f288053d9eb71f +> Fixes: #1150 +> Signed-off-by: Mohit Agrawal + +BUG: 1942816 +Change-Id: I746a2619989265f3bc9bb648c4b8e4bbefaedc56 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244925 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/glusterd/brick-mux-validation.t | 4 ++-- + xlators/storage/posix/src/posix-helpers.c | 5 +++-- + 2 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/tests/bugs/glusterd/brick-mux-validation.t b/tests/bugs/glusterd/brick-mux-validation.t +index 03a4768..61b0455 100644 +--- a/tests/bugs/glusterd/brick-mux-validation.t ++++ b/tests/bugs/glusterd/brick-mux-validation.t +@@ -24,7 +24,7 @@ TEST $CLI volume create $V0 $H0:$B0/${V0}{1..3} + TEST $CLI volume start $V0 + + EXPECT 1 count_brick_processes +-EXPECT 1 count_brick_pids ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT 1 count_brick_pids + EXPECT_WITHIN $PROCESS_UP_TIMEOUT 3 online_brick_count + + pkill gluster +@@ -101,4 +101,4 @@ TEST $CLI_IGNORE_PARTITION volume reset-brick $V1 $H0:$B0/${V1}1 $H0:$B0/${V1}1 + 
EXPECT_WITHIN $PROCESS_UP_TIMEOUT 6 online_brick_count
+ EXPECT 1 count_brick_processes
+
+-cleanup;
+\ No newline at end of file
++cleanup;
+diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
+index 110d383..16351d8 100644
+--- a/xlators/storage/posix/src/posix-helpers.c
++++ b/xlators/storage/posix/src/posix-helpers.c
+@@ -3596,13 +3596,14 @@ posix_is_layout_stale(dict_t *xdata, char *par_path, xlator_t *this)
+ op_ret = dict_get_str_sizen(xdata, GF_PREOP_PARENT_KEY, &xattr_name);
+ if (xattr_name == NULL) {
+ op_ret = 0;
+- goto out;
++ return is_stale;
+ }
+
+ arg_data = dict_get(xdata, xattr_name);
+ if (!arg_data) {
+ op_ret = 0;
+- goto out;
++ dict_del_sizen(xdata, GF_PREOP_PARENT_KEY);
++ return is_stale;
+ }
+
+ size = sys_lgetxattr(par_path, xattr_name, value_buf,
+--
+1.8.3.1
+
diff --git a/SOURCES/0561-cluster-ec-Inform-failure-when-some-bricks-are-unava.patch b/SOURCES/0561-cluster-ec-Inform-failure-when-some-bricks-are-unava.patch
new file mode 100644
index 0000000..4f191cc
--- /dev/null
+++ b/SOURCES/0561-cluster-ec-Inform-failure-when-some-bricks-are-unava.patch
@@ -0,0 +1,202 @@
+From 488a5aa4932842334e2749224e9c39f8b6fd379c Mon Sep 17 00:00:00 2001
+From: Ashish Pandey
+Date: Wed, 20 May 2020 11:30:17 +0530
+Subject: [PATCH 561/584] cluster/ec: Inform failure when some bricks are
+ unavailable.
+
+Provide proper information about failure when a fop
+fails on some of the bricks.
+Also provide information about the parent fop and
+the map of the bricks on which it is failing.
+
+Upstream patch details:
+>Change-Id: If812739617df65cd146c8e667fbacff653717248
+>updates #1434
+>Signed-off-by: Ashish Pandey
+>https://review.gluster.org/#/c/glusterfs/+/24858/
+
+Change-Id: I3549d637e7345f05f21ac1c0e8106973c69d1be9
+BUG: 1908635
+Signed-off-by: Ashish Pandey
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244926
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/cluster/ec/src/ec-common.c | 76 +++++++++++++++++++++++---------------
+ xlators/cluster/ec/src/ec.c | 14 ++++++-
+ 2 files changed, 58 insertions(+), 32 deletions(-)
+
+diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
+index e3f8769..a9624d8 100644
+--- a/xlators/cluster/ec/src/ec-common.c
++++ b/xlators/cluster/ec/src/ec-common.c
+@@ -316,17 +316,19 @@ ec_check_status(ec_fop_data_t *fop)
+ }
+ }
+
+- gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
+- "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
+- "remaining=%s, good=%s, bad=%s, %s)",
+- gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
+- ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+- ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+- ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
+- ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
+- ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
+- ec->nodes),
+- ec_msg_str(fop));
++ gf_msg(
++ fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
++ "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
++ "remaining=%s, good=%s, bad=%s,"
++ "(Least significant bit represents first client/brick of subvol), %s)",
++ gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
++ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
++ ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
++ ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
++ ec_bin(str4, sizeof(str4), fop->good, 
ec->nodes), ++ ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), ++ ec->nodes), ++ ec_msg_str(fop)); + if (fop->use_fd) { + if (fop->fd != NULL) { + ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, +@@ -614,10 +616,10 @@ ec_msg_str(ec_fop_data_t *fop) + loc_t *loc2 = NULL; + char gfid1[64] = {0}; + char gfid2[64] = {0}; ++ ec_fop_data_t *parent = fop->parent; + + if (fop->errstr) + return fop->errstr; +- + if (!fop->use_fd) { + loc1 = &fop->loc[0]; + loc2 = &fop->loc[1]; +@@ -625,23 +627,45 @@ ec_msg_str(ec_fop_data_t *fop) + if (fop->id == GF_FOP_RENAME) { + gf_asprintf(&fop->errstr, + "FOP : '%s' failed on '%s' and '%s' with gfids " +- "%s and %s respectively", ++ "%s and %s respectively. Parent FOP: %s", + ec_fop_name(fop->id), loc1->path, loc2->path, + uuid_utoa_r(loc1->gfid, gfid1), +- uuid_utoa_r(loc2->gfid, gfid2)); ++ uuid_utoa_r(loc2->gfid, gfid2), ++ parent ? ec_fop_name(parent->id) : "No Parent"); + } else { +- gf_asprintf(&fop->errstr, "FOP : '%s' failed on '%s' with gfid %s", +- ec_fop_name(fop->id), loc1->path, +- uuid_utoa_r(loc1->gfid, gfid1)); ++ gf_asprintf( ++ &fop->errstr, ++ "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s", ++ ec_fop_name(fop->id), loc1->path, ++ uuid_utoa_r(loc1->gfid, gfid1), ++ parent ? ec_fop_name(parent->id) : "No Parent"); + } + } else { +- gf_asprintf(&fop->errstr, "FOP : '%s' failed on gfid %s", +- ec_fop_name(fop->id), +- uuid_utoa_r(fop->fd->inode->gfid, gfid1)); ++ gf_asprintf( ++ &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s", ++ ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1), ++ parent ? ec_fop_name(parent->id) : "No Parent"); + } + return fop->errstr; + } + ++static void ++ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need, ++ int32_t loglevel) ++{ ++ ec_t *ec = fop->xl->private; ++ char str1[32], str2[32], str3[32]; ++ ++ gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT, ++ "Insufficient available children for this request: " ++ "Have : %d, Need : %u : Child UP : %s " ++ "Mask: %s, Healing : %s : %s ", ++ have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), ++ ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), ++ ec_bin(str3, sizeof(str3), fop->healing, ec->nodes), ++ ec_msg_str(fop)); ++} ++ + static int32_t + ec_child_select(ec_fop_data_t *fop) + { +@@ -699,11 +723,7 @@ ec_child_select(ec_fop_data_t *fop) + ec_trace("SELECT", fop, ""); + + if ((num < fop->minimum) && (num < ec->fragments)) { +- gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT, +- "Insufficient available children " +- "for this request (have %d, need " +- "%d). %s", +- num, fop->minimum, ec_msg_str(fop)); ++ ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR); + return 0; + } + +@@ -711,11 +731,7 @@ ec_child_select(ec_fop_data_t *fop) + (fop->locks[0].update[EC_DATA_TXN] || + fop->locks[0].update[EC_METADATA_TXN])) { + if (ec->quorum_count && (num < ec->quorum_count)) { +- gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT, +- "Insufficient available children " +- "for this request (have %d, need " +- "%d). 
%s", +- num, ec->quorum_count, ec_msg_str(fop)); ++ ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR); + return 0; + } + } +diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c +index a930089..047cdd8 100644 +--- a/xlators/cluster/ec/src/ec.c ++++ b/xlators/cluster/ec/src/ec.c +@@ -325,13 +325,18 @@ ec_get_event_from_state(ec_t *ec) + void + ec_up(xlator_t *this, ec_t *ec) + { ++ char str1[32], str2[32]; ++ + if (ec->timer != NULL) { + gf_timer_call_cancel(this->ctx, ec->timer); + ec->timer = NULL; + } + + ec->up = 1; +- gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP"); ++ gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, ++ "Going UP : Child UP = %s Child Notify = %s", ++ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), ++ ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes)); + + gf_event(EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name); + } +@@ -339,13 +344,18 @@ ec_up(xlator_t *this, ec_t *ec) + void + ec_down(xlator_t *this, ec_t *ec) + { ++ char str1[32], str2[32]; ++ + if (ec->timer != NULL) { + gf_timer_call_cancel(this->ctx, ec->timer); + ec->timer = NULL; + } + + ec->up = 0; +- gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN"); ++ gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, ++ "Going DOWN : Child UP = %s Child Notify = %s", ++ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), ++ ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes)); + + gf_event(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name); + } +-- +1.8.3.1 + diff --git a/SOURCES/0562-shard.c-Fix-formatting.patch b/SOURCES/0562-shard.c-Fix-formatting.patch new file mode 100644 index 0000000..14fbed6 --- /dev/null +++ b/SOURCES/0562-shard.c-Fix-formatting.patch @@ -0,0 +1,12513 @@ +From ea96fcd832de0b49f0e050f535d22a500da1503a Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Thu, 3 Jun 2021 13:14:04 +0200 +Subject: [PATCH 562/584] shard.c: Fix formatting + +A previous downstream change [1] had changed the formatting of the +entire xlators/features/shard/src/shard.c. This patch reapplies the +correct formatting. No other changes have been made. 
+ +[1] https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/185716/ + +BUG: 1925425 +Change-Id: Ie655ddaaa26aa884878e66bc0d9ce1f021f6a85f +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244956 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/shard/src/shard.c | 11701 ++++++++++++++++++----------------- + 1 file changed, 6084 insertions(+), 5617 deletions(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 099b062..c5cc224 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -16,5813 +16,6226 @@ + #include + #include + +-static gf_boolean_t __is_shard_dir(uuid_t gfid) { +- shard_priv_t *priv = THIS->private; ++static gf_boolean_t ++__is_shard_dir(uuid_t gfid) ++{ ++ shard_priv_t *priv = THIS->private; + +- if (gf_uuid_compare(gfid, priv->dot_shard_gfid) == 0) +- return _gf_true; ++ if (gf_uuid_compare(gfid, priv->dot_shard_gfid) == 0) ++ return _gf_true; + +- return _gf_false; ++ return _gf_false; + } + +-static gf_boolean_t __is_gsyncd_on_shard_dir(call_frame_t *frame, loc_t *loc) { +- if (frame->root->pid == GF_CLIENT_PID_GSYNCD && +- (__is_shard_dir(loc->pargfid) || +- (loc->parent && __is_shard_dir(loc->parent->gfid)))) +- return _gf_true; ++static gf_boolean_t ++__is_gsyncd_on_shard_dir(call_frame_t *frame, loc_t *loc) ++{ ++ if (frame->root->pid == GF_CLIENT_PID_GSYNCD && ++ (__is_shard_dir(loc->pargfid) || ++ (loc->parent && __is_shard_dir(loc->parent->gfid)))) ++ return _gf_true; + +- return _gf_false; ++ return _gf_false; + } + +-void shard_make_block_bname(int block_num, uuid_t gfid, char *buf, size_t len) { +- char gfid_str[GF_UUID_BUF_SIZE] = { +- 0, +- }; ++void ++shard_make_block_bname(int block_num, uuid_t gfid, char *buf, size_t len) ++{ ++ char gfid_str[GF_UUID_BUF_SIZE] = { ++ 0, ++ }; + +- gf_uuid_unparse(gfid, gfid_str); +- snprintf(buf, len, "%s.%d", gfid_str, block_num); ++ gf_uuid_unparse(gfid, gfid_str); ++ snprintf(buf, len, "%s.%d", gfid_str, block_num); + } + +-void shard_make_block_abspath(int block_num, uuid_t gfid, char *filepath, +- size_t len) { +- char gfid_str[GF_UUID_BUF_SIZE] = { +- 0, +- }; ++void ++shard_make_block_abspath(int block_num, uuid_t gfid, char *filepath, size_t len) ++{ ++ char gfid_str[GF_UUID_BUF_SIZE] = { ++ 0, ++ }; + +- gf_uuid_unparse(gfid, gfid_str); +- snprintf(filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str, block_num); ++ gf_uuid_unparse(gfid, gfid_str); ++ snprintf(filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str, block_num); + } + +-int __shard_inode_ctx_get(inode_t *inode, xlator_t *this, +- shard_inode_ctx_t **ctx) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx_p = NULL; ++int ++__shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx_p = NULL; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret == 0) { +- *ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; +- return ret; +- } ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret == 0) { ++ *ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ return ret; ++ } + +- ctx_p = GF_CALLOC(1, sizeof(*ctx_p), gf_shard_mt_inode_ctx_t); +- if (!ctx_p) +- return ret; ++ ctx_p = GF_CALLOC(1, sizeof(*ctx_p), gf_shard_mt_inode_ctx_t); ++ if (!ctx_p) ++ return ret; + +- INIT_LIST_HEAD(&ctx_p->ilist); +- INIT_LIST_HEAD(&ctx_p->to_fsync_list); ++ 
INIT_LIST_HEAD(&ctx_p->ilist); ++ INIT_LIST_HEAD(&ctx_p->to_fsync_list); + +- ret = __inode_ctx_set(inode, this, (uint64_t *)&ctx_p); +- if (ret < 0) { +- GF_FREE(ctx_p); +- return ret; +- } ++ ret = __inode_ctx_set(inode, this, (uint64_t *)&ctx_p); ++ if (ret < 0) { ++ GF_FREE(ctx_p); ++ return ret; ++ } + +- *ctx = ctx_p; ++ *ctx = ctx_p; + +- return ret; ++ return ret; + } + +-int shard_inode_ctx_get(inode_t *inode, xlator_t *this, +- shard_inode_ctx_t **ctx) { +- int ret = 0; ++int ++shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) ++{ ++ int ret = 0; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_get(inode, this, ctx); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_get(inode, this, ctx); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, +- uint64_t block_size, int32_t valid) { +- int ret = -1; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, ++ uint64_t block_size, int32_t valid) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __shard_inode_ctx_get(inode, this, &ctx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(inode, this, &ctx); ++ if (ret) ++ return ret; + +- if (valid & SHARD_MASK_BLOCK_SIZE) +- ctx->block_size = block_size; ++ if (valid & SHARD_MASK_BLOCK_SIZE) ++ ctx->block_size = block_size; + +- if (valid & SHARD_MASK_PROT) +- ctx->stat.ia_prot = stbuf->ia_prot; ++ if (valid & SHARD_MASK_PROT) ++ ctx->stat.ia_prot = stbuf->ia_prot; + +- if (valid & SHARD_MASK_NLINK) +- ctx->stat.ia_nlink = stbuf->ia_nlink; ++ if (valid & SHARD_MASK_NLINK) ++ ctx->stat.ia_nlink = stbuf->ia_nlink; + +- if (valid & SHARD_MASK_UID) +- ctx->stat.ia_uid = stbuf->ia_uid; ++ if (valid & SHARD_MASK_UID) ++ ctx->stat.ia_uid = stbuf->ia_uid; + +- if (valid & SHARD_MASK_GID) +- ctx->stat.ia_gid = stbuf->ia_gid; ++ if (valid & SHARD_MASK_GID) ++ ctx->stat.ia_gid = stbuf->ia_gid; + +- if (valid & SHARD_MASK_SIZE) +- ctx->stat.ia_size = stbuf->ia_size; ++ if (valid & SHARD_MASK_SIZE) ++ ctx->stat.ia_size = stbuf->ia_size; + +- if (valid & SHARD_MASK_BLOCKS) +- ctx->stat.ia_blocks = stbuf->ia_blocks; ++ if (valid & SHARD_MASK_BLOCKS) ++ ctx->stat.ia_blocks = stbuf->ia_blocks; + +- if (valid & SHARD_MASK_TIMES) { +- SHARD_TIME_UPDATE(ctx->stat.ia_mtime, ctx->stat.ia_mtime_nsec, +- stbuf->ia_mtime, stbuf->ia_mtime_nsec); +- SHARD_TIME_UPDATE(ctx->stat.ia_ctime, ctx->stat.ia_ctime_nsec, +- stbuf->ia_ctime, stbuf->ia_ctime_nsec); +- SHARD_TIME_UPDATE(ctx->stat.ia_atime, ctx->stat.ia_atime_nsec, +- stbuf->ia_atime, stbuf->ia_atime_nsec); +- } ++ if (valid & SHARD_MASK_TIMES) { ++ SHARD_TIME_UPDATE(ctx->stat.ia_mtime, ctx->stat.ia_mtime_nsec, ++ stbuf->ia_mtime, stbuf->ia_mtime_nsec); ++ SHARD_TIME_UPDATE(ctx->stat.ia_ctime, ctx->stat.ia_ctime_nsec, ++ stbuf->ia_ctime, stbuf->ia_ctime_nsec); ++ SHARD_TIME_UPDATE(ctx->stat.ia_atime, ctx->stat.ia_atime_nsec, ++ stbuf->ia_atime, stbuf->ia_atime_nsec); ++ } + +- if (valid & SHARD_MASK_OTHERS) { +- ctx->stat.ia_ino = stbuf->ia_ino; +- gf_uuid_copy(ctx->stat.ia_gfid, stbuf->ia_gfid); +- ctx->stat.ia_dev = stbuf->ia_dev; +- ctx->stat.ia_type = stbuf->ia_type; +- ctx->stat.ia_rdev = stbuf->ia_rdev; +- ctx->stat.ia_blksize = stbuf->ia_blksize; +- } ++ if (valid & SHARD_MASK_OTHERS) { ++ ctx->stat.ia_ino = stbuf->ia_ino; ++ gf_uuid_copy(ctx->stat.ia_gfid, stbuf->ia_gfid); ++ ctx->stat.ia_dev = stbuf->ia_dev; ++ 
ctx->stat.ia_type = stbuf->ia_type; ++ ctx->stat.ia_rdev = stbuf->ia_rdev; ++ ctx->stat.ia_blksize = stbuf->ia_blksize; ++ } + +- if (valid & SHARD_MASK_REFRESH_RESET) +- ctx->refresh = _gf_false; ++ if (valid & SHARD_MASK_REFRESH_RESET) ++ ctx->refresh = _gf_false; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, +- uint64_t block_size, int32_t valid) { +- int ret = -1; ++int ++shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, ++ uint64_t block_size, int32_t valid) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_set(inode, this, stbuf, block_size, valid); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_set(inode, this, stbuf, block_size, valid); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) { +- int ret = -1; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __shard_inode_ctx_get(inode, this, &ctx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(inode, this, &ctx); ++ if (ret) ++ return ret; + +- ctx->refresh = _gf_true; ++ ctx->refresh = _gf_true; + +- return 0; ++ return 0; + } +-int shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) { +- int ret = -1; ++int ++shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_set_refresh_flag(inode, this); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_set_refresh_flag(inode, this); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) { +- int ret = -1; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __shard_inode_ctx_get(inode, this, &ctx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(inode, this, &ctx); ++ if (ret) ++ return ret; + +- ctx->refreshed = _gf_true; +- return 0; ++ ctx->refreshed = _gf_true; ++ return 0; + } + +-int shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) { +- int ret = -1; ++int ++shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_mark_dir_refreshed(inode, this); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_mark_dir_refreshed(inode, this); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, +- inode_t *shard_inode) { +- int ret = -1; +- shard_inode_ctx_t *base_ictx = NULL; +- shard_inode_ctx_t *shard_ictx = NULL; ++int ++__shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, ++ inode_t *shard_inode) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *base_ictx = NULL; ++ shard_inode_ctx_t *shard_ictx = NULL; + +- ret = __shard_inode_ctx_get(base_inode, this, &base_ictx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(base_inode, this, &base_ictx); ++ if (ret) ++ return ret; + +- ret = __shard_inode_ctx_get(shard_inode, this, &shard_ictx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(shard_inode, this, &shard_ictx); ++ if (ret) 
++ return ret; + +- if (shard_ictx->fsync_needed) { +- shard_ictx->fsync_needed++; +- return 1; +- } ++ if (shard_ictx->fsync_needed) { ++ shard_ictx->fsync_needed++; ++ return 1; ++ } + +- list_add_tail(&shard_ictx->to_fsync_list, &base_ictx->to_fsync_list); +- shard_ictx->inode = shard_inode; +- shard_ictx->fsync_needed++; +- base_ictx->fsync_count++; +- shard_ictx->base_inode = base_inode; ++ list_add_tail(&shard_ictx->to_fsync_list, &base_ictx->to_fsync_list); ++ shard_ictx->inode = shard_inode; ++ shard_ictx->fsync_needed++; ++ base_ictx->fsync_count++; ++ shard_ictx->base_inode = base_inode; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, +- inode_t *shard_inode) { +- int ret = -1; ++int ++shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, ++ inode_t *shard_inode) ++{ ++ int ret = -1; + +- /* This ref acts as a refkeepr on the base inode. We +- * need to keep this inode alive as it holds the head +- * of the to_fsync_list. +- */ +- inode_ref(base_inode); +- inode_ref(shard_inode); ++ /* This ref acts as a refkeepr on the base inode. We ++ * need to keep this inode alive as it holds the head ++ * of the to_fsync_list. ++ */ ++ inode_ref(base_inode); ++ inode_ref(shard_inode); + +- LOCK(&base_inode->lock); +- LOCK(&shard_inode->lock); +- { ret = __shard_inode_ctx_add_to_fsync_list(base_inode, this, shard_inode); } +- UNLOCK(&shard_inode->lock); +- UNLOCK(&base_inode->lock); ++ LOCK(&base_inode->lock); ++ LOCK(&shard_inode->lock); ++ { ++ ret = __shard_inode_ctx_add_to_fsync_list(base_inode, this, ++ shard_inode); ++ } ++ UNLOCK(&shard_inode->lock); ++ UNLOCK(&base_inode->lock); + +- /* Unref the base inode corresponding to the ref above, if the shard is +- * found to be already part of the fsync list. +- */ +- if (ret != 0) { +- inode_unref(base_inode); +- inode_unref(shard_inode); +- } +- return ret; ++ /* Unref the base inode corresponding to the ref above, if the shard is ++ * found to be already part of the fsync list. ++ */ ++ if (ret != 0) { ++ inode_unref(base_inode); ++ inode_unref(shard_inode); ++ } ++ return ret; + } + +-gf_boolean_t __shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) { +- int ret = -1; +- shard_inode_ctx_t *ctx = NULL; ++gf_boolean_t ++__shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __shard_inode_ctx_get(inode, this, &ctx); +- /* If inode ctx get fails, better to err on the side of caution and +- * try again? Unless the failure is due to mem-allocation. +- */ +- if (ret) +- return _gf_true; ++ ret = __shard_inode_ctx_get(inode, this, &ctx); ++ /* If inode ctx get fails, better to err on the side of caution and ++ * try again? Unless the failure is due to mem-allocation. 
++ */ ++ if (ret) ++ return _gf_true; + +- return !ctx->refreshed; ++ return !ctx->refreshed; + } + +-gf_boolean_t shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) { +- gf_boolean_t flag = _gf_false; ++gf_boolean_t ++shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) ++{ ++ gf_boolean_t flag = _gf_false; + +- LOCK(&inode->lock); +- { flag = __shard_inode_ctx_needs_lookup(inode, this); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ flag = __shard_inode_ctx_needs_lookup(inode, this); ++ } ++ UNLOCK(&inode->lock); + +- return flag; ++ return flag; + } +-int __shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, +- struct iatt *stbuf) { +- int ret = -1; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __shard_inode_ctx_get(inode, this, &ctx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(inode, this, &ctx); ++ if (ret) ++ return ret; + +- if ((stbuf->ia_size != ctx->stat.ia_size) || +- (stbuf->ia_blocks != ctx->stat.ia_blocks)) +- ctx->refresh = _gf_true; ++ if ((stbuf->ia_size != ctx->stat.ia_size) || ++ (stbuf->ia_blocks != ctx->stat.ia_blocks)) ++ ctx->refresh = _gf_true; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, +- struct iatt *stbuf) { +- int ret = -1; ++int ++shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_invalidate(inode, this, stbuf); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_invalidate(inode, this, stbuf); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, +- uint64_t *block_size) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, ++ uint64_t *block_size) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- *block_size = ctx->block_size; ++ *block_size = ctx->block_size; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, +- uint64_t *block_size) { +- int ret = -1; ++int ++shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, ++ uint64_t *block_size) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_get_block_size(inode, this, block_size); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_get_block_size(inode, this, block_size); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, +- int *fsync_count) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, ++ int *fsync_count) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- 
ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- *fsync_count = ctx->fsync_needed; ++ *fsync_count = ctx->fsync_needed; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, +- int *fsync_count) { +- int ret = -1; ++int ++shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, ++ int *fsync_count) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_get_fsync_count(inode, this, fsync_count); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_get_fsync_count(inode, this, fsync_count); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } +-int __shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, +- shard_inode_ctx_t *ctx_out) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, ++ shard_inode_ctx_t *ctx_out) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- memcpy(ctx_out, ctx, sizeof(shard_inode_ctx_t)); +- return 0; ++ memcpy(ctx_out, ctx, sizeof(shard_inode_ctx_t)); ++ return 0; + } + +-int shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, +- shard_inode_ctx_t *ctx_out) { +- int ret = -1; ++int ++shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, ++ shard_inode_ctx_t *ctx_out) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_get_all(inode, this, ctx_out); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_get_all(inode, this, ctx_out); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, +- struct iatt *buf, +- gf_boolean_t *need_refresh) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, ++ struct iatt *buf, ++ gf_boolean_t *need_refresh) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- if (ctx->refresh == _gf_false) +- *buf = ctx->stat; +- else +- *need_refresh = _gf_true; ++ if (ctx->refresh == _gf_false) ++ *buf = ctx->stat; ++ else ++ *need_refresh = _gf_true; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, +- struct iatt *buf, +- gf_boolean_t *need_refresh) { +- int ret = -1; ++int ++shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, ++ struct iatt *buf, ++ gf_boolean_t *need_refresh) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { +- ret = +- __shard_inode_ctx_fill_iatt_from_cache(inode, this, buf, need_refresh); +- } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_fill_iatt_from_cache(inode, this, buf, ++ need_refresh); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-void shard_local_wipe(shard_local_t *local) { +- 
int i = 0; +- int count = 0; ++void ++shard_local_wipe(shard_local_t *local) ++{ ++ int i = 0; ++ int count = 0; ++ ++ count = local->num_blocks; ++ ++ syncbarrier_destroy(&local->barrier); ++ loc_wipe(&local->loc); ++ loc_wipe(&local->dot_shard_loc); ++ loc_wipe(&local->dot_shard_rm_loc); ++ loc_wipe(&local->loc2); ++ loc_wipe(&local->tmp_loc); ++ loc_wipe(&local->int_inodelk.loc); ++ loc_wipe(&local->int_entrylk.loc); ++ loc_wipe(&local->newloc); ++ ++ if (local->name) ++ GF_FREE(local->name); ++ ++ if (local->int_entrylk.basename) ++ GF_FREE(local->int_entrylk.basename); ++ if (local->fd) ++ fd_unref(local->fd); + +- count = local->num_blocks; ++ if (local->xattr_req) ++ dict_unref(local->xattr_req); ++ if (local->xattr_rsp) ++ dict_unref(local->xattr_rsp); + +- syncbarrier_destroy(&local->barrier); +- loc_wipe(&local->loc); +- loc_wipe(&local->dot_shard_loc); +- loc_wipe(&local->dot_shard_rm_loc); +- loc_wipe(&local->loc2); +- loc_wipe(&local->tmp_loc); +- loc_wipe(&local->int_inodelk.loc); +- loc_wipe(&local->int_entrylk.loc); +- loc_wipe(&local->newloc); ++ for (i = 0; i < count; i++) { ++ if (!local->inode_list) ++ break; + +- if (local->name) +- GF_FREE(local->name); ++ if (local->inode_list[i]) ++ inode_unref(local->inode_list[i]); ++ } + +- if (local->int_entrylk.basename) +- GF_FREE(local->int_entrylk.basename); +- if (local->fd) +- fd_unref(local->fd); ++ GF_FREE(local->inode_list); + +- if (local->xattr_req) +- dict_unref(local->xattr_req); +- if (local->xattr_rsp) +- dict_unref(local->xattr_rsp); ++ GF_FREE(local->vector); ++ if (local->iobref) ++ iobref_unref(local->iobref); ++ if (local->list_inited) ++ gf_dirent_free(&local->entries_head); ++ if (local->inodelk_frame) ++ SHARD_STACK_DESTROY(local->inodelk_frame); ++ if (local->entrylk_frame) ++ SHARD_STACK_DESTROY(local->entrylk_frame); ++} + +- for (i = 0; i < count; i++) { +- if (!local->inode_list) +- break; +- +- if (local->inode_list[i]) +- inode_unref(local->inode_list[i]); +- } +- +- GF_FREE(local->inode_list); +- +- GF_FREE(local->vector); +- if (local->iobref) +- iobref_unref(local->iobref); +- if (local->list_inited) +- gf_dirent_free(&local->entries_head); +- if (local->inodelk_frame) +- SHARD_STACK_DESTROY(local->inodelk_frame); +- if (local->entrylk_frame) +- SHARD_STACK_DESTROY(local->entrylk_frame); +-} +- +-int shard_modify_size_and_block_count(struct iatt *stbuf, dict_t *dict) { +- int ret = -1; +- void *size_attr = NULL; +- uint64_t size_array[4]; +- +- ret = dict_get_ptr(dict, GF_XATTR_SHARD_FILE_SIZE, &size_attr); +- if (ret) { +- gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0, +- SHARD_MSG_INTERNAL_XATTR_MISSING, +- "Failed to " +- "get " GF_XATTR_SHARD_FILE_SIZE " for %s", +- uuid_utoa(stbuf->ia_gfid)); +- return ret; +- } ++int ++shard_modify_size_and_block_count(struct iatt *stbuf, dict_t *dict) ++{ ++ int ret = -1; ++ void *size_attr = NULL; ++ uint64_t size_array[4]; ++ ++ ret = dict_get_ptr(dict, GF_XATTR_SHARD_FILE_SIZE, &size_attr); ++ if (ret) { ++ gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0, ++ SHARD_MSG_INTERNAL_XATTR_MISSING, ++ "Failed to " ++ "get " GF_XATTR_SHARD_FILE_SIZE " for %s", ++ uuid_utoa(stbuf->ia_gfid)); ++ return ret; ++ } + +- memcpy(size_array, size_attr, sizeof(size_array)); ++ memcpy(size_array, size_attr, sizeof(size_array)); + +- stbuf->ia_size = ntoh64(size_array[0]); +- stbuf->ia_blocks = ntoh64(size_array[2]); ++ stbuf->ia_size = ntoh64(size_array[0]); ++ stbuf->ia_blocks = ntoh64(size_array[2]); + +- return 0; ++ return 0; + } + +-int 
shard_call_count_return(call_frame_t *frame) { +- int call_count = 0; +- shard_local_t *local = NULL; ++int ++shard_call_count_return(call_frame_t *frame) ++{ ++ int call_count = 0; ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- LOCK(&frame->lock); +- { call_count = --local->call_count; } +- UNLOCK(&frame->lock); ++ LOCK(&frame->lock); ++ { ++ call_count = --local->call_count; ++ } ++ UNLOCK(&frame->lock); + +- return call_count; ++ return call_count; + } + +-static char *shard_internal_dir_string(shard_internal_dir_type_t type) { +- char *str = NULL; +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- str = GF_SHARD_DIR; +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- str = GF_SHARD_REMOVE_ME_DIR; +- break; +- default: +- break; +- } +- return str; ++static char * ++shard_internal_dir_string(shard_internal_dir_type_t type) ++{ ++ char *str = NULL; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ str = GF_SHARD_DIR; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ str = GF_SHARD_REMOVE_ME_DIR; ++ break; ++ default: ++ break; ++ } ++ return str; + } + +-static int shard_init_internal_dir_loc(xlator_t *this, shard_local_t *local, +- shard_internal_dir_type_t type) { +- int ret = -1; +- char *bname = NULL; +- inode_t *parent = NULL; +- loc_t *internal_dir_loc = NULL; +- shard_priv_t *priv = NULL; ++static int ++shard_init_internal_dir_loc(xlator_t *this, shard_local_t *local, ++ shard_internal_dir_type_t type) ++{ ++ int ret = -1; ++ char *bname = NULL; ++ inode_t *parent = NULL; ++ loc_t *internal_dir_loc = NULL; ++ shard_priv_t *priv = NULL; + +- priv = this->private; +- if (!local) +- return -1; ++ priv = this->private; ++ if (!local) ++ return -1; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ internal_dir_loc = &local->dot_shard_loc; ++ bname = GF_SHARD_DIR; ++ parent = inode_ref(this->itable->root); ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ internal_dir_loc = &local->dot_shard_rm_loc; ++ bname = GF_SHARD_REMOVE_ME_DIR; ++ parent = inode_ref(priv->dot_shard_inode); ++ break; ++ default: ++ break; ++ } + +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- internal_dir_loc = &local->dot_shard_loc; +- bname = GF_SHARD_DIR; +- parent = inode_ref(this->itable->root); +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- internal_dir_loc = &local->dot_shard_rm_loc; +- bname = GF_SHARD_REMOVE_ME_DIR; +- parent = inode_ref(priv->dot_shard_inode); +- break; +- default: +- break; +- } +- +- internal_dir_loc->inode = inode_new(this->itable); +- internal_dir_loc->parent = parent; +- ret = inode_path(internal_dir_loc->parent, bname, +- (char **)&internal_dir_loc->path); +- if (ret < 0 || !(internal_dir_loc->inode)) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on %s", bname); +- goto out; +- } +- +- internal_dir_loc->name = strrchr(internal_dir_loc->path, '/'); +- if (internal_dir_loc->name) +- internal_dir_loc->name++; +- +- ret = 0; +-out: +- return ret; +-} +- +-inode_t *__shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this, +- inode_t *base_inode, int block_num, +- uuid_t gfid) { +- char block_bname[256] = { +- 0, +- }; +- inode_t *lru_inode = NULL; +- shard_priv_t *priv = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_inode_ctx_t *lru_inode_ctx = NULL; +- shard_inode_ctx_t *lru_base_inode_ctx = NULL; +- inode_t *fsync_inode = NULL; +- inode_t *lru_base_inode = NULL; +- gf_boolean_t do_fsync = 
_gf_false; +- +- priv = this->private; +- +- shard_inode_ctx_get(linked_inode, this, &ctx); +- +- if (list_empty(&ctx->ilist)) { +- if (priv->inode_count + 1 <= priv->lru_limit) { +- /* If this inode was linked here for the first time (indicated +- * by empty list), and if there is still space in the priv list, +- * add this ctx to the tail of the list. +- */ +- /* For as long as an inode is in lru list, we try to +- * keep it alive by holding a ref on it. +- */ +- inode_ref(linked_inode); +- if (base_inode) +- gf_uuid_copy(ctx->base_gfid, base_inode->gfid); +- else +- gf_uuid_copy(ctx->base_gfid, gfid); +- ctx->block_num = block_num; +- list_add_tail(&ctx->ilist, &priv->ilist_head); +- priv->inode_count++; +- ctx->base_inode = inode_ref(base_inode); +- } else { +- /*If on the other hand there is no available slot for this inode +- * in the list, delete the lru inode from the head of the list, +- * unlink it. And in its place add this new inode into the list. +- */ +- lru_inode_ctx = +- list_first_entry(&priv->ilist_head, shard_inode_ctx_t, ilist); +- GF_ASSERT(lru_inode_ctx->block_num > 0); +- lru_base_inode = lru_inode_ctx->base_inode; +- list_del_init(&lru_inode_ctx->ilist); +- lru_inode = inode_find(linked_inode->table, lru_inode_ctx->stat.ia_gfid); +- /* If the lru inode was part of the pending-fsync list, +- * the base inode needs to be unref'd, the lru inode +- * deleted from fsync list and fsync'd in a new frame, +- * and then unlinked in memory and forgotten. +- */ +- if (!lru_base_inode) +- goto after_fsync_check; +- LOCK(&lru_base_inode->lock); +- LOCK(&lru_inode->lock); +- { +- if (!list_empty(&lru_inode_ctx->to_fsync_list)) { +- list_del_init(&lru_inode_ctx->to_fsync_list); +- lru_inode_ctx->fsync_needed = 0; +- do_fsync = _gf_true; +- __shard_inode_ctx_get(lru_base_inode, this, &lru_base_inode_ctx); +- lru_base_inode_ctx->fsync_count--; +- } +- } +- UNLOCK(&lru_inode->lock); +- UNLOCK(&lru_base_inode->lock); +- +- after_fsync_check: +- if (!do_fsync) { +- shard_make_block_bname(lru_inode_ctx->block_num, +- lru_inode_ctx->base_gfid, block_bname, +- sizeof(block_bname)); +- /* The following unref corresponds to the ref held at +- * the time the shard was added to the lru list. +- */ +- inode_unref(lru_inode); +- inode_unlink(lru_inode, priv->dot_shard_inode, block_bname); +- inode_forget(lru_inode, 0); +- } else { +- /* The following unref corresponds to the ref +- * held when the shard was added to fsync list. +- */ +- inode_unref(lru_inode); +- fsync_inode = lru_inode; +- if (lru_base_inode) +- inode_unref(lru_base_inode); +- } +- /* The following unref corresponds to the ref +- * held by inode_find() above. +- */ +- inode_unref(lru_inode); +- +- /* The following unref corresponds to the ref held on the base shard +- * at the time of adding shard inode to lru list +- */ +- if (lru_base_inode) +- inode_unref(lru_base_inode); +- +- /* For as long as an inode is in lru list, we try to +- * keep it alive by holding a ref on it. +- */ +- inode_ref(linked_inode); +- if (base_inode) +- gf_uuid_copy(ctx->base_gfid, base_inode->gfid); +- else +- gf_uuid_copy(ctx->base_gfid, gfid); +- ctx->block_num = block_num; +- ctx->base_inode = inode_ref(base_inode); +- list_add_tail(&ctx->ilist, &priv->ilist_head); +- } +- } else { +- /* If this is not the first time this inode is being operated on, move +- * it to the most recently used end of the list. 
+- */ +- list_move_tail(&ctx->ilist, &priv->ilist_head); +- } +- return fsync_inode; +-} +- +-int shard_common_failure_unwind(glusterfs_fop_t fop, call_frame_t *frame, +- int32_t op_ret, int32_t op_errno) { +- switch (fop) { +- case GF_FOP_LOOKUP: +- SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, NULL, NULL, NULL, NULL); +- break; +- case GF_FOP_STAT: +- SHARD_STACK_UNWIND(stat, frame, op_ret, op_errno, NULL, NULL); +- break; +- case GF_FOP_FSTAT: +- SHARD_STACK_UNWIND(fstat, frame, op_ret, op_errno, NULL, NULL); +- break; +- case GF_FOP_TRUNCATE: +- SHARD_STACK_UNWIND(truncate, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_FTRUNCATE: +- SHARD_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_MKNOD: +- SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, +- NULL); +- break; +- case GF_FOP_LINK: +- SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, +- NULL); +- break; +- case GF_FOP_CREATE: +- SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, +- NULL, NULL); +- break; +- case GF_FOP_UNLINK: +- SHARD_STACK_UNWIND(unlink, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_RENAME: +- SHARD_STACK_UNWIND(rename, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, +- NULL, NULL); +- break; +- case GF_FOP_WRITE: +- SHARD_STACK_UNWIND(writev, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_FALLOCATE: +- SHARD_STACK_UNWIND(fallocate, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_ZEROFILL: +- SHARD_STACK_UNWIND(zerofill, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_DISCARD: +- SHARD_STACK_UNWIND(discard, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_READ: +- SHARD_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, -1, NULL, NULL, +- NULL); +- break; +- case GF_FOP_FSYNC: +- SHARD_STACK_UNWIND(fsync, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_REMOVEXATTR: +- SHARD_STACK_UNWIND(removexattr, frame, op_ret, op_errno, NULL); +- break; +- case GF_FOP_FREMOVEXATTR: +- SHARD_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, NULL); +- break; +- case GF_FOP_FGETXATTR: +- SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, NULL, NULL); +- break; +- case GF_FOP_GETXATTR: +- SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, NULL, NULL); +- break; +- case GF_FOP_FSETXATTR: +- SHARD_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL); +- break; +- case GF_FOP_SETXATTR: +- SHARD_STACK_UNWIND(setxattr, frame, op_ret, op_errno, NULL); +- break; +- case GF_FOP_SETATTR: +- SHARD_STACK_UNWIND(setattr, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_FSETATTR: +- SHARD_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_SEEK: +- SHARD_STACK_UNWIND(seek, frame, op_ret, op_errno, 0, NULL); +- break; +- default: +- gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "Invalid fop id = %d", fop); +- break; +- } +- return 0; +-} +- +-int shard_common_inode_write_success_unwind(glusterfs_fop_t fop, +- call_frame_t *frame, +- int32_t op_ret) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- switch (fop) { +- case GF_FOP_WRITE: +- SHARD_STACK_UNWIND(writev, frame, op_ret, 0, &local->prebuf, +- &local->postbuf, local->xattr_rsp); +- break; +- case GF_FOP_FALLOCATE: +- SHARD_STACK_UNWIND(fallocate, frame, op_ret, 0, &local->prebuf, +- &local->postbuf, 
local->xattr_rsp); +- break; +- case GF_FOP_ZEROFILL: +- SHARD_STACK_UNWIND(zerofill, frame, op_ret, 0, &local->prebuf, +- &local->postbuf, local->xattr_rsp); +- break; +- case GF_FOP_DISCARD: +- SHARD_STACK_UNWIND(discard, frame, op_ret, 0, &local->prebuf, +- &local->postbuf, local->xattr_rsp); +- break; +- default: +- gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "Invalid fop id = %d", fop); +- break; +- } +- return 0; +-} +- +-int shard_evicted_inode_fsync_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, struct iatt *prebuf, +- struct iatt *postbuf, dict_t *xdata) { +- char block_bname[256] = { +- 0, +- }; +- fd_t *anon_fd = cookie; +- inode_t *shard_inode = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- +- if (anon_fd == NULL || op_ret < 0) { +- gf_msg(this->name, GF_LOG_WARNING, op_errno, SHARD_MSG_MEMALLOC_FAILED, +- "fsync failed on shard"); +- goto out; +- } +- shard_inode = anon_fd->inode; +- +- LOCK(&priv->lock); +- LOCK(&shard_inode->lock); +- { +- __shard_inode_ctx_get(shard_inode, this, &ctx); +- if ((list_empty(&ctx->to_fsync_list)) && (list_empty(&ctx->ilist))) { +- shard_make_block_bname(ctx->block_num, shard_inode->gfid, block_bname, +- sizeof(block_bname)); +- inode_unlink(shard_inode, priv->dot_shard_inode, block_bname); +- /* The following unref corresponds to the ref held by +- * inode_link() at the time the shard was created or +- * looked up +- */ +- inode_unref(shard_inode); +- inode_forget(shard_inode, 0); +- } +- } +- UNLOCK(&shard_inode->lock); +- UNLOCK(&priv->lock); ++ internal_dir_loc->inode = inode_new(this->itable); ++ internal_dir_loc->parent = parent; ++ ret = inode_path(internal_dir_loc->parent, bname, ++ (char **)&internal_dir_loc->path); ++ if (ret < 0 || !(internal_dir_loc->inode)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", bname); ++ goto out; ++ } ++ ++ internal_dir_loc->name = strrchr(internal_dir_loc->path, '/'); ++ if (internal_dir_loc->name) ++ internal_dir_loc->name++; + ++ ret = 0; + out: +- if (anon_fd) +- fd_unref(anon_fd); +- STACK_DESTROY(frame->root); +- return 0; ++ return ret; + } + +-int shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode) { +- fd_t *anon_fd = NULL; +- call_frame_t *fsync_frame = NULL; +- +- fsync_frame = create_frame(this, this->ctx->pool); +- if (!fsync_frame) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create new frame " +- "to fsync shard"); +- return -1; +- } +- +- anon_fd = fd_anonymous(inode); +- if (!anon_fd) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create anon fd to" +- " fsync shard"); +- STACK_DESTROY(fsync_frame->root); +- return -1; +- } +- +- STACK_WIND_COOKIE(fsync_frame, shard_evicted_inode_fsync_cbk, anon_fd, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, anon_fd, +- 1, NULL); +- return 0; +-} +- +-int shard_common_resolve_shards( +- call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t post_res_handler) { +- int i = -1; +- uint32_t shard_idx_iter = 0; +- char path[PATH_MAX] = { +- 0, +- }; +- uuid_t gfid = { +- 0, +- }; +- inode_t *inode = NULL; +- inode_t *res_inode = NULL; +- inode_t *fsync_inode = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- +- priv = this->private; +- local = frame->local; +- local->call_count = 0; +- shard_idx_iter = local->first_block; +- res_inode = 
local->resolver_base_inode; +- if (res_inode) +- gf_uuid_copy(gfid, res_inode->gfid); +- else +- gf_uuid_copy(gfid, local->base_gfid); +- +- if ((local->op_ret < 0) || (local->resolve_not)) +- goto out; +- +- while (shard_idx_iter <= local->last_block) { +- i++; +- if (shard_idx_iter == 0) { +- local->inode_list[i] = inode_ref(res_inode); +- shard_idx_iter++; +- continue; +- } +- +- shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); +- +- inode = NULL; +- inode = inode_resolve(this->itable, path); +- if (inode) { +- gf_msg_debug(this->name, 0, "Shard %d already " +- "present. gfid=%s. Saving inode for future.", +- shard_idx_iter, uuid_utoa(inode->gfid)); +- local->inode_list[i] = inode; +- /* Let the ref on the inodes that are already present +- * in inode table still be held so that they don't get +- * forgotten by the time the fop reaches the actual +- * write stage. +- */ +- LOCK(&priv->lock); +- { +- fsync_inode = __shard_update_shards_inode_list(inode, this, res_inode, +- shard_idx_iter, gfid); +- } +- UNLOCK(&priv->lock); +- shard_idx_iter++; +- if (fsync_inode) +- shard_initiate_evicted_inode_fsync(this, fsync_inode); +- continue; ++inode_t * ++__shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this, ++ inode_t *base_inode, int block_num, ++ uuid_t gfid) ++{ ++ char block_bname[256] = { ++ 0, ++ }; ++ inode_t *lru_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *lru_inode_ctx = NULL; ++ shard_inode_ctx_t *lru_base_inode_ctx = NULL; ++ inode_t *fsync_inode = NULL; ++ inode_t *lru_base_inode = NULL; ++ gf_boolean_t do_fsync = _gf_false; ++ ++ priv = this->private; ++ ++ shard_inode_ctx_get(linked_inode, this, &ctx); ++ ++ if (list_empty(&ctx->ilist)) { ++ if (priv->inode_count + 1 <= priv->lru_limit) { ++ /* If this inode was linked here for the first time (indicated ++ * by empty list), and if there is still space in the priv list, ++ * add this ctx to the tail of the list. ++ */ ++ /* For as long as an inode is in lru list, we try to ++ * keep it alive by holding a ref on it. ++ */ ++ inode_ref(linked_inode); ++ if (base_inode) ++ gf_uuid_copy(ctx->base_gfid, base_inode->gfid); ++ else ++ gf_uuid_copy(ctx->base_gfid, gfid); ++ ctx->block_num = block_num; ++ list_add_tail(&ctx->ilist, &priv->ilist_head); ++ priv->inode_count++; ++ ctx->base_inode = inode_ref(base_inode); ++ } else { ++ /*If on the other hand there is no available slot for this inode ++ * in the list, delete the lru inode from the head of the list, ++ * unlink it. And in its place add this new inode into the list. ++ */ ++ lru_inode_ctx = list_first_entry(&priv->ilist_head, ++ shard_inode_ctx_t, ilist); ++ GF_ASSERT(lru_inode_ctx->block_num > 0); ++ lru_base_inode = lru_inode_ctx->base_inode; ++ list_del_init(&lru_inode_ctx->ilist); ++ lru_inode = inode_find(linked_inode->table, ++ lru_inode_ctx->stat.ia_gfid); ++ /* If the lru inode was part of the pending-fsync list, ++ * the base inode needs to be unref'd, the lru inode ++ * deleted from fsync list and fsync'd in a new frame, ++ * and then unlinked in memory and forgotten. 
++ */ ++ if (!lru_base_inode) ++ goto after_fsync_check; ++ LOCK(&lru_base_inode->lock); ++ LOCK(&lru_inode->lock); ++ { ++ if (!list_empty(&lru_inode_ctx->to_fsync_list)) { ++ list_del_init(&lru_inode_ctx->to_fsync_list); ++ lru_inode_ctx->fsync_needed = 0; ++ do_fsync = _gf_true; ++ __shard_inode_ctx_get(lru_base_inode, this, ++ &lru_base_inode_ctx); ++ lru_base_inode_ctx->fsync_count--; ++ } ++ } ++ UNLOCK(&lru_inode->lock); ++ UNLOCK(&lru_base_inode->lock); ++ ++ after_fsync_check: ++ if (!do_fsync) { ++ shard_make_block_bname(lru_inode_ctx->block_num, ++ lru_inode_ctx->base_gfid, block_bname, ++ sizeof(block_bname)); ++ /* The following unref corresponds to the ref held at ++ * the time the shard was added to the lru list. ++ */ ++ inode_unref(lru_inode); ++ inode_unlink(lru_inode, priv->dot_shard_inode, block_bname); ++ inode_forget(lru_inode, 0); ++ } else { ++ /* The following unref corresponds to the ref ++ * held when the shard was added to fsync list. ++ */ ++ inode_unref(lru_inode); ++ fsync_inode = lru_inode; ++ if (lru_base_inode) ++ inode_unref(lru_base_inode); ++ } ++ /* The following unref corresponds to the ref ++ * held by inode_find() above. ++ */ ++ inode_unref(lru_inode); ++ ++ /* The following unref corresponds to the ref held on the base shard ++ * at the time of adding shard inode to lru list ++ */ ++ if (lru_base_inode) ++ inode_unref(lru_base_inode); ++ ++ /* For as long as an inode is in lru list, we try to ++ * keep it alive by holding a ref on it. ++ */ ++ inode_ref(linked_inode); ++ if (base_inode) ++ gf_uuid_copy(ctx->base_gfid, base_inode->gfid); ++ else ++ gf_uuid_copy(ctx->base_gfid, gfid); ++ ctx->block_num = block_num; ++ ctx->base_inode = inode_ref(base_inode); ++ list_add_tail(&ctx->ilist, &priv->ilist_head); ++ } + } else { +- local->call_count++; +- shard_idx_iter++; ++ /* If this is not the first time this inode is being operated on, move ++ * it to the most recently used end of the list. 
++ */ ++ list_move_tail(&ctx->ilist, &priv->ilist_head); + } +- } +-out: +- post_res_handler(frame, this); +- return 0; ++ return fsync_inode; + } + +-int shard_update_file_size_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, int32_t op_errno, +- dict_t *dict, dict_t *xdata) { +- inode_t *inode = NULL; +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if ((local->fd) && (local->fd->inode)) +- inode = local->fd->inode; +- else if (local->loc.inode) +- inode = local->loc.inode; +- +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, +- SHARD_MSG_UPDATE_FILE_SIZE_FAILED, "Update to file size" +- " xattr failed on %s", +- uuid_utoa(inode->gfid)); +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto err; +- } +- +- if (shard_modify_size_and_block_count(&local->postbuf, dict)) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto err; +- } +-err: +- local->post_update_size_handler(frame, this); +- return 0; ++int ++shard_common_failure_unwind(glusterfs_fop_t fop, call_frame_t *frame, ++ int32_t op_ret, int32_t op_errno) ++{ ++ switch (fop) { ++ case GF_FOP_LOOKUP: ++ SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, NULL, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_STAT: ++ SHARD_STACK_UNWIND(stat, frame, op_ret, op_errno, NULL, NULL); ++ break; ++ case GF_FOP_FSTAT: ++ SHARD_STACK_UNWIND(fstat, frame, op_ret, op_errno, NULL, NULL); ++ break; ++ case GF_FOP_TRUNCATE: ++ SHARD_STACK_UNWIND(truncate, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_FTRUNCATE: ++ SHARD_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_MKNOD: ++ SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_LINK: ++ SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, NULL, NULL, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_CREATE: ++ SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, ++ NULL, NULL, NULL, NULL); ++ break; ++ case GF_FOP_UNLINK: ++ SHARD_STACK_UNWIND(unlink, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_RENAME: ++ SHARD_STACK_UNWIND(rename, frame, op_ret, op_errno, NULL, NULL, ++ NULL, NULL, NULL, NULL); ++ break; ++ case GF_FOP_WRITE: ++ SHARD_STACK_UNWIND(writev, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_FALLOCATE: ++ SHARD_STACK_UNWIND(fallocate, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_ZEROFILL: ++ SHARD_STACK_UNWIND(zerofill, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_DISCARD: ++ SHARD_STACK_UNWIND(discard, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_READ: ++ SHARD_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, -1, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_FSYNC: ++ SHARD_STACK_UNWIND(fsync, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_REMOVEXATTR: ++ SHARD_STACK_UNWIND(removexattr, frame, op_ret, op_errno, NULL); ++ break; ++ case GF_FOP_FREMOVEXATTR: ++ SHARD_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, NULL); ++ break; ++ case GF_FOP_FGETXATTR: ++ SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, NULL, NULL); ++ break; ++ case GF_FOP_GETXATTR: ++ SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, NULL, NULL); ++ break; ++ case GF_FOP_FSETXATTR: ++ SHARD_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL); ++ break; ++ case GF_FOP_SETXATTR: ++ SHARD_STACK_UNWIND(setxattr, frame, op_ret, 
op_errno, NULL); ++ break; ++ case GF_FOP_SETATTR: ++ SHARD_STACK_UNWIND(setattr, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_FSETATTR: ++ SHARD_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_SEEK: ++ SHARD_STACK_UNWIND(seek, frame, op_ret, op_errno, 0, NULL); ++ break; ++ default: ++ gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "Invalid fop id = %d", fop); ++ break; ++ } ++ return 0; + } + +-int shard_set_size_attrs(int64_t size, int64_t block_count, +- int64_t **size_attr_p) { +- int ret = -1; +- int64_t *size_attr = NULL; ++int ++shard_common_inode_write_success_unwind(glusterfs_fop_t fop, ++ call_frame_t *frame, int32_t op_ret) ++{ ++ shard_local_t *local = NULL; + +- if (!size_attr_p) +- goto out; ++ local = frame->local; + +- size_attr = GF_CALLOC(4, sizeof(int64_t), gf_shard_mt_int64_t); +- if (!size_attr) +- goto out; ++ switch (fop) { ++ case GF_FOP_WRITE: ++ SHARD_STACK_UNWIND(writev, frame, op_ret, 0, &local->prebuf, ++ &local->postbuf, local->xattr_rsp); ++ break; ++ case GF_FOP_FALLOCATE: ++ SHARD_STACK_UNWIND(fallocate, frame, op_ret, 0, &local->prebuf, ++ &local->postbuf, local->xattr_rsp); ++ break; ++ case GF_FOP_ZEROFILL: ++ SHARD_STACK_UNWIND(zerofill, frame, op_ret, 0, &local->prebuf, ++ &local->postbuf, local->xattr_rsp); ++ break; ++ case GF_FOP_DISCARD: ++ SHARD_STACK_UNWIND(discard, frame, op_ret, 0, &local->prebuf, ++ &local->postbuf, local->xattr_rsp); ++ break; ++ default: ++ gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "Invalid fop id = %d", fop); ++ break; ++ } ++ return 0; ++} + +- size_attr[0] = hton64(size); +- /* As sharding evolves, it _may_ be necessary to embed more pieces of +- * information within the same xattr. So allocating slots for them in +- * advance. For now, only bytes 0-63 and 128-191 which would make up the +- * current size and block count respectively of the file are valid. 
+- */ +- size_attr[2] = hton64(block_count); ++int ++shard_evicted_inode_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *prebuf, struct iatt *postbuf, ++ dict_t *xdata) ++{ ++ char block_bname[256] = { ++ 0, ++ }; ++ fd_t *anon_fd = cookie; ++ inode_t *shard_inode = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_priv_t *priv = NULL; + +- *size_attr_p = size_attr; ++ priv = this->private; + +- ret = 0; +-out: +- return ret; +-} ++ if (anon_fd == NULL || op_ret < 0) { ++ gf_msg(this->name, GF_LOG_WARNING, op_errno, SHARD_MSG_MEMALLOC_FAILED, ++ "fsync failed on shard"); ++ goto out; ++ } ++ shard_inode = anon_fd->inode; + +-int shard_update_file_size(call_frame_t *frame, xlator_t *this, fd_t *fd, +- loc_t *loc, +- shard_post_update_size_fop_handler_t handler) { +- int ret = -1; +- int64_t *size_attr = NULL; +- int64_t delta_blocks = 0; +- inode_t *inode = NULL; +- shard_local_t *local = NULL; +- dict_t *xattr_req = NULL; ++ LOCK(&priv->lock); ++ LOCK(&shard_inode->lock); ++ { ++ __shard_inode_ctx_get(shard_inode, this, &ctx); ++ if ((list_empty(&ctx->to_fsync_list)) && (list_empty(&ctx->ilist))) { ++ shard_make_block_bname(ctx->block_num, shard_inode->gfid, ++ block_bname, sizeof(block_bname)); ++ inode_unlink(shard_inode, priv->dot_shard_inode, block_bname); ++ /* The following unref corresponds to the ref held by ++ * inode_link() at the time the shard was created or ++ * looked up ++ */ ++ inode_unref(shard_inode); ++ inode_forget(shard_inode, 0); ++ } ++ } ++ UNLOCK(&shard_inode->lock); ++ UNLOCK(&priv->lock); + +- local = frame->local; +- local->post_update_size_handler = handler; ++out: ++ if (anon_fd) ++ fd_unref(anon_fd); ++ STACK_DESTROY(frame->root); ++ return 0; ++} + +- xattr_req = dict_new(); +- if (!xattr_req) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto out; +- } +- +- if (fd) +- inode = fd->inode; +- else +- inode = loc->inode; +- +- /* If both size and block count have not changed, then skip the xattrop. +- */ +- delta_blocks = GF_ATOMIC_GET(local->delta_blocks); +- if ((local->delta_size + local->hole_size == 0) && (delta_blocks == 0)) { +- goto out; +- } +- +- ret = shard_set_size_attrs(local->delta_size + local->hole_size, delta_blocks, +- &size_attr); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SIZE_SET_FAILED, +- "Failed to set size attrs for %s", uuid_utoa(inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto out; +- } +- +- ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_FILE_SIZE, size_attr, 8 * 4); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set key %s into dict. 
gfid=%s", GF_XATTR_SHARD_FILE_SIZE, +- uuid_utoa(inode->gfid)); +- GF_FREE(size_attr); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto out; +- } ++int ++shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode) ++{ ++ fd_t *anon_fd = NULL; ++ call_frame_t *fsync_frame = NULL; ++ ++ fsync_frame = create_frame(this, this->ctx->pool); ++ if (!fsync_frame) { ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create new frame " ++ "to fsync shard"); ++ return -1; ++ } + +- if (fd) +- STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fxattrop, fd, GF_XATTROP_ADD_ARRAY64, +- xattr_req, NULL); +- else +- STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->xattrop, loc, GF_XATTROP_ADD_ARRAY64, +- xattr_req, NULL); ++ anon_fd = fd_anonymous(inode); ++ if (!anon_fd) { ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create anon fd to" ++ " fsync shard"); ++ STACK_DESTROY(fsync_frame->root); ++ return -1; ++ } + +- dict_unref(xattr_req); +- return 0; ++ STACK_WIND_COOKIE(fsync_frame, shard_evicted_inode_fsync_cbk, anon_fd, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, ++ anon_fd, 1, NULL); ++ return 0; ++} + +-out: +- if (xattr_req) +- dict_unref(xattr_req); +- handler(frame, this); +- return 0; +-} +- +-static inode_t *shard_link_internal_dir_inode(shard_local_t *local, +- inode_t *inode, struct iatt *buf, +- shard_internal_dir_type_t type) { +- inode_t *linked_inode = NULL; +- shard_priv_t *priv = NULL; +- char *bname = NULL; +- inode_t **priv_inode = NULL; +- inode_t *parent = NULL; +- +- priv = THIS->private; +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- bname = GF_SHARD_DIR; +- priv_inode = &priv->dot_shard_inode; +- parent = inode->table->root; +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- bname = GF_SHARD_REMOVE_ME_DIR; +- priv_inode = &priv->dot_shard_rm_inode; +- parent = priv->dot_shard_inode; +- break; +- default: +- break; +- } +- +- linked_inode = inode_link(inode, parent, bname, buf); +- inode_lookup(linked_inode); +- *priv_inode = linked_inode; +- return linked_inode; +-} +- +-int shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) { +- shard_local_t *local = NULL; +- inode_t *linked_inode = NULL; +- shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; +- +- local = frame->local; +- +- if (op_ret) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto out; +- } +- +- /* To-Do: Fix refcount increment per call to +- * shard_link_internal_dir_inode(). 
+- */ +- linked_inode = shard_link_internal_dir_inode(local, inode, buf, type); +- shard_inode_ctx_mark_dir_refreshed(linked_inode, this); +-out: +- shard_common_resolve_shards(frame, this, local->post_res_handler); +- return 0; +-} +- +-int shard_refresh_internal_dir(call_frame_t *frame, xlator_t *this, +- shard_internal_dir_type_t type) { +- loc_t loc = { +- 0, +- }; +- inode_t *inode = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- uuid_t gfid = { +- 0, +- }; +- +- local = frame->local; +- priv = this->private; +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- gf_uuid_copy(gfid, priv->dot_shard_gfid); +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); +- break; +- default: +- break; +- } +- +- inode = inode_find(this->itable, gfid); +- +- if (!shard_inode_ctx_needs_lookup(inode, this)) { +- local->op_ret = 0; +- goto out; +- } ++int ++shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t post_res_handler) ++{ ++ int i = -1; ++ uint32_t shard_idx_iter = 0; ++ char path[PATH_MAX] = { ++ 0, ++ }; ++ uuid_t gfid = { ++ 0, ++ }; ++ inode_t *inode = NULL; ++ inode_t *res_inode = NULL; ++ inode_t *fsync_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; + +- /* Plain assignment because the ref is already taken above through +- * call to inode_find() +- */ +- loc.inode = inode; +- gf_uuid_copy(loc.gfid, gfid); ++ priv = this->private; ++ local = frame->local; ++ local->call_count = 0; ++ shard_idx_iter = local->first_block; ++ res_inode = local->resolver_base_inode; ++ if (res_inode) ++ gf_uuid_copy(gfid, res_inode->gfid); ++ else ++ gf_uuid_copy(gfid, local->base_gfid); + +- STACK_WIND_COOKIE(frame, shard_refresh_internal_dir_cbk, (void *)(long)type, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, &loc, +- NULL); +- loc_wipe(&loc); ++ if ((local->op_ret < 0) || (local->resolve_not)) ++ goto out; + +- return 0; ++ while (shard_idx_iter <= local->last_block) { ++ i++; ++ if (shard_idx_iter == 0) { ++ local->inode_list[i] = inode_ref(res_inode); ++ shard_idx_iter++; ++ continue; ++ } + ++ shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); ++ ++ inode = NULL; ++ inode = inode_resolve(this->itable, path); ++ if (inode) { ++ gf_msg_debug(this->name, 0, ++ "Shard %d already " ++ "present. gfid=%s. Saving inode for future.", ++ shard_idx_iter, uuid_utoa(inode->gfid)); ++ local->inode_list[i] = inode; ++ /* Let the ref on the inodes that are already present ++ * in inode table still be held so that they don't get ++ * forgotten by the time the fop reaches the actual ++ * write stage. 
++ */ ++ LOCK(&priv->lock); ++ { ++ fsync_inode = __shard_update_shards_inode_list( ++ inode, this, res_inode, shard_idx_iter, gfid); ++ } ++ UNLOCK(&priv->lock); ++ shard_idx_iter++; ++ if (fsync_inode) ++ shard_initiate_evicted_inode_fsync(this, fsync_inode); ++ continue; ++ } else { ++ local->call_count++; ++ shard_idx_iter++; ++ } ++ } + out: +- shard_common_resolve_shards(frame, this, local->post_res_handler); +- return 0; ++ post_res_handler(frame, this); ++ return 0; + } + +-int shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) { +- inode_t *link_inode = NULL; +- shard_local_t *local = NULL; +- shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; ++int ++shard_update_file_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *dict, ++ dict_t *xdata) ++{ ++ inode_t *inode = NULL; ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (op_ret) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto unwind; +- } +- +- if (!IA_ISDIR(buf->ia_type)) { +- gf_msg(this->name, GF_LOG_CRITICAL, 0, SHARD_MSG_DOT_SHARD_NODIR, +- "%s already exists and " +- "is not a directory. Please remove it from all bricks " +- "and try again", +- shard_internal_dir_string(type)); +- local->op_ret = -1; +- local->op_errno = EIO; +- goto unwind; +- } +- +- link_inode = shard_link_internal_dir_inode(local, inode, buf, type); +- if (link_inode != inode) { +- shard_refresh_internal_dir(frame, this, type); +- } else { +- shard_inode_ctx_mark_dir_refreshed(link_inode, this); +- shard_common_resolve_shards(frame, this, local->post_res_handler); +- } +- return 0; ++ if ((local->fd) && (local->fd->inode)) ++ inode = local->fd->inode; ++ else if (local->loc.inode) ++ inode = local->loc.inode; + +-unwind: +- local->post_res_handler(frame, this); +- return 0; +-} +- +-int shard_lookup_internal_dir(call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t post_res_handler, +- shard_internal_dir_type_t type) { +- int ret = -1; +- dict_t *xattr_req = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- uuid_t *gfid = NULL; +- loc_t *loc = NULL; +- gf_boolean_t free_gfid = _gf_true; +- +- local = frame->local; +- priv = this->private; +- local->post_res_handler = post_res_handler; +- +- gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); +- if (!gfid) +- goto err; +- +- xattr_req = dict_new(); +- if (!xattr_req) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto err; +- } +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- gf_uuid_copy(*gfid, priv->dot_shard_gfid); +- loc = &local->dot_shard_loc; +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); +- loc = &local->dot_shard_rm_loc; +- break; +- default: +- bzero(*gfid, sizeof(uuid_t)); +- break; +- } +- +- ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set gfid of %s into dict", +- shard_internal_dir_string(type)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto err; +- } else { +- free_gfid = _gf_false; +- } +- +- STACK_WIND_COOKIE(frame, shard_lookup_internal_dir_cbk, (void *)(long)type, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc, +- xattr_req); +- +- 
dict_unref(xattr_req); +- return 0; +- +-err: +- if (xattr_req) +- dict_unref(xattr_req); +- if (free_gfid) +- GF_FREE(gfid); +- post_res_handler(frame, this); +- return 0; +-} +- +-static void shard_inode_ctx_update(inode_t *inode, xlator_t *this, +- dict_t *xdata, struct iatt *buf) { +- int ret = 0; +- uint64_t size = 0; +- void *bsize = NULL; +- +- if (shard_inode_ctx_get_block_size(inode, this, &size)) { +- /* Fresh lookup */ +- ret = dict_get_ptr(xdata, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); +- if (!ret) +- size = ntoh64(*((uint64_t *)bsize)); +- /* If the file is sharded, set its block size, otherwise just +- * set 0. +- */ +- +- shard_inode_ctx_set(inode, this, buf, size, SHARD_MASK_BLOCK_SIZE); +- } +- /* If the file is sharded, also set the remaining attributes, +- * except for ia_size and ia_blocks. +- */ +- if (size) { +- shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); +- (void)shard_inode_ctx_invalidate(inode, this, buf); +- } +-} +- +-int shard_delete_shards(void *opaque); +- +-int shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data); +- +-int shard_start_background_deletion(xlator_t *this) { +- int ret = 0; +- gf_boolean_t i_cleanup = _gf_true; +- shard_priv_t *priv = NULL; +- call_frame_t *cleanup_frame = NULL; +- +- priv = this->private; +- +- LOCK(&priv->lock); +- { +- switch (priv->bg_del_state) { +- case SHARD_BG_DELETION_NONE: +- i_cleanup = _gf_true; +- priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; +- break; +- case SHARD_BG_DELETION_LAUNCHING: +- i_cleanup = _gf_false; +- break; +- case SHARD_BG_DELETION_IN_PROGRESS: +- priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; +- i_cleanup = _gf_false; +- break; +- default: +- break; +- } +- } +- UNLOCK(&priv->lock); +- if (!i_cleanup) +- return 0; +- +- cleanup_frame = create_frame(this, this->ctx->pool); +- if (!cleanup_frame) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create " +- "new frame to delete shards"); +- ret = -ENOMEM; +- goto err; +- } +- +- set_lk_owner_from_ptr(&cleanup_frame->root->lk_owner, cleanup_frame->root); +- +- ret = synctask_new(this->ctx->env, shard_delete_shards, +- shard_delete_shards_cbk, cleanup_frame, cleanup_frame); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_WARNING, errno, SHARD_MSG_SHARDS_DELETION_FAILED, +- "failed to create task to do background " +- "cleanup of shards"); +- STACK_DESTROY(cleanup_frame->root); +- goto err; +- } +- return 0; ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_UPDATE_FILE_SIZE_FAILED, ++ "Update to file size" ++ " xattr failed on %s", ++ uuid_utoa(inode->gfid)); ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto err; ++ } + ++ if (shard_modify_size_and_block_count(&local->postbuf, dict)) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto err; ++ } + err: +- LOCK(&priv->lock); +- { priv->bg_del_state = SHARD_BG_DELETION_NONE; } +- UNLOCK(&priv->lock); +- return ret; ++ local->post_update_size_handler(frame, this); ++ return 0; + } + +-int shard_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, struct iatt *postparent) { +- int ret = -1; +- shard_priv_t *priv = NULL; +- gf_boolean_t i_start_cleanup = _gf_false; +- +- priv = this->private; +- +- if (op_ret < 0) +- goto unwind; +- +- if (IA_ISDIR(buf->ia_type)) +- goto unwind; +- +- /* Also, if the file is sharded, get the file size and block cnt xattr, +- * and store them in the stbuf 
appropriately. +- */ +- +- if (dict_get(xdata, GF_XATTR_SHARD_FILE_SIZE) && +- frame->root->pid != GF_CLIENT_PID_GSYNCD) +- shard_modify_size_and_block_count(buf, xdata); +- +- /* If this was a fresh lookup, there are two possibilities: +- * 1) If the file is sharded (indicated by the presence of block size +- * xattr), store this block size, along with rdev and mode in its +- * inode ctx. +- * 2) If the file is not sharded, store size along with rdev and mode +- * (which are anyway don't cares) in inode ctx. Since @ctx_tmp is +- * already initialised to all zeroes, nothing more needs to be done. +- */ ++int ++shard_set_size_attrs(int64_t size, int64_t block_count, int64_t **size_attr_p) ++{ ++ int ret = -1; ++ int64_t *size_attr = NULL; + +- (void)shard_inode_ctx_update(inode, this, xdata, buf); ++ if (!size_attr_p) ++ goto out; + +- LOCK(&priv->lock); +- { +- if (priv->first_lookup_done == _gf_false) { +- priv->first_lookup_done = _gf_true; +- i_start_cleanup = _gf_true; +- } +- } +- UNLOCK(&priv->lock); ++ size_attr = GF_CALLOC(4, sizeof(int64_t), gf_shard_mt_int64_t); ++ if (!size_attr) ++ goto out; + +- if (!i_start_cleanup) +- goto unwind; ++ size_attr[0] = hton64(size); ++ /* As sharding evolves, it _may_ be necessary to embed more pieces of ++ * information within the same xattr. So allocating slots for them in ++ * advance. For now, only bytes 0-63 and 128-191 which would make up the ++ * current size and block count respectively of the file are valid. ++ */ ++ size_attr[2] = hton64(block_count); + +- ret = shard_start_background_deletion(this); +- if (ret < 0) { +- LOCK(&priv->lock); +- { priv->first_lookup_done = _gf_false; } +- UNLOCK(&priv->lock); +- } ++ *size_attr_p = size_attr; + +-unwind: +- SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata, +- postparent); +- return 0; ++ ret = 0; ++out: ++ return ret; + } + +-int shard_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, +- dict_t *xattr_req) { +- int ret = -1; +- int32_t op_errno = ENOMEM; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; +- +- this->itable = loc->inode->table; +- if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && +- (frame->root->pid != GF_CLIENT_PID_GLFS_HEAL)) { +- SHARD_ENTRY_FOP_CHECK(loc, op_errno, err); +- } ++int ++shard_update_file_size(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ loc_t *loc, shard_post_update_size_fop_handler_t handler) ++{ ++ int ret = -1; ++ int64_t *size_attr = NULL; ++ int64_t delta_blocks = 0; ++ inode_t *inode = NULL; ++ shard_local_t *local = NULL; ++ dict_t *xattr_req = NULL; + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ local = frame->local; ++ local->post_update_size_handler = handler; + +- frame->local = local; ++ xattr_req = dict_new(); ++ if (!xattr_req) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto out; ++ } + +- loc_copy(&local->loc, loc); ++ if (fd) ++ inode = fd->inode; ++ else ++ inode = loc->inode; + +- local->xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new(); +- if (!local->xattr_req) +- goto err; ++ /* If both size and block count have not changed, then skip the xattrop. 
++ */ ++ delta_blocks = GF_ATOMIC_GET(local->delta_blocks); ++ if ((local->delta_size + local->hole_size == 0) && (delta_blocks == 0)) { ++ goto out; ++ } + +- if (shard_inode_ctx_get_block_size(loc->inode, this, &block_size)) { +- ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); ++ ret = shard_set_size_attrs(local->delta_size + local->hole_size, ++ delta_blocks, &size_attr); + if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set dict" +- " value: key:%s for path %s", +- GF_XATTR_SHARD_BLOCK_SIZE, loc->path); +- goto err; ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SIZE_SET_FAILED, ++ "Failed to set size attrs for %s", uuid_utoa(inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto out; + } +- } + +- if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { +- ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); ++ ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_FILE_SIZE, size_attr, 8 * 4); + if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set dict value: key:%s for path %s.", +- GF_XATTR_SHARD_FILE_SIZE, loc->path); +- goto err; ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set key %s into dict. gfid=%s", ++ GF_XATTR_SHARD_FILE_SIZE, uuid_utoa(inode->gfid)); ++ GF_FREE(size_attr); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto out; + } +- } + +- if ((xattr_req) && (dict_get(xattr_req, GF_CONTENT_KEY))) +- dict_del(xattr_req, GF_CONTENT_KEY); ++ if (fd) ++ STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fxattrop, fd, ++ GF_XATTROP_ADD_ARRAY64, xattr_req, NULL); ++ else ++ STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->xattrop, loc, ++ GF_XATTROP_ADD_ARRAY64, xattr_req, NULL); + +- STACK_WIND(frame, shard_lookup_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, loc, local->xattr_req); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_LOOKUP, frame, -1, op_errno); +- return 0; +-} ++ dict_unref(xattr_req); ++ return 0; + +-int shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, int32_t op_errno, +- inode_t *inode, struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) { +- int ret = -1; +- int32_t mask = SHARD_INODE_WRITE_MASK; +- shard_local_t *local = NULL; +- shard_inode_ctx_t ctx = { +- 0, +- }; +- +- local = frame->local; +- +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, +- SHARD_MSG_BASE_FILE_LOOKUP_FAILED, "Lookup on base file" +- " failed : %s", +- loc_gfid_utoa(&(local->loc))); +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto unwind; +- } ++out: ++ if (xattr_req) ++ dict_unref(xattr_req); ++ handler(frame, this); ++ return 0; ++} + +- local->prebuf = *buf; +- if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { +- local->op_ret = -1; +- local->op_errno = EINVAL; +- goto unwind; +- } +- +- if (shard_inode_ctx_get_all(inode, this, &ctx)) +- mask = SHARD_ALL_MASK; +- +- ret = shard_inode_ctx_set(inode, this, &local->prebuf, 0, +- (mask | SHARD_MASK_REFRESH_RESET)); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, SHARD_MSG_INODE_CTX_SET_FAILED, 0, +- "Failed to set inode" +- " write params into inode ctx for %s", +- uuid_utoa(buf->ia_gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto unwind; +- } ++static inode_t * ++shard_link_internal_dir_inode(shard_local_t *local, inode_t 
*inode, ++ struct iatt *buf, shard_internal_dir_type_t type) ++{ ++ inode_t *linked_inode = NULL; ++ shard_priv_t *priv = NULL; ++ char *bname = NULL; ++ inode_t **priv_inode = NULL; ++ inode_t *parent = NULL; ++ ++ priv = THIS->private; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ bname = GF_SHARD_DIR; ++ priv_inode = &priv->dot_shard_inode; ++ parent = inode->table->root; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ bname = GF_SHARD_REMOVE_ME_DIR; ++ priv_inode = &priv->dot_shard_rm_inode; ++ parent = priv->dot_shard_inode; ++ break; ++ default: ++ break; ++ } + +-unwind: +- local->handler(frame, this); +- return 0; +-} +- +-int shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, +- shard_post_fop_handler_t handler) { +- int ret = -1; +- shard_local_t *local = NULL; +- dict_t *xattr_req = NULL; +- gf_boolean_t need_refresh = _gf_false; +- +- local = frame->local; +- local->handler = handler; +- +- ret = shard_inode_ctx_fill_iatt_from_cache(loc->inode, this, &local->prebuf, +- &need_refresh); +- /* By this time, inode ctx should have been created either in create, +- * mknod, readdirp or lookup. If not it is a bug! +- */ +- if ((ret == 0) && (need_refresh == _gf_false)) { +- gf_msg_debug(this->name, 0, "Skipping lookup on base file: %s" +- "Serving prebuf off the inode ctx cache", +- uuid_utoa(loc->gfid)); +- goto out; +- } +- +- xattr_req = dict_new(); +- if (!xattr_req) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto out; +- } ++ linked_inode = inode_link(inode, parent, bname, buf); ++ inode_lookup(linked_inode); ++ *priv_inode = linked_inode; ++ return linked_inode; ++} + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, loc->gfid, local, out); ++int ++shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, int32_t op_errno, ++ inode_t *inode, struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ shard_local_t *local = NULL; ++ inode_t *linked_inode = NULL; ++ shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + +- STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, loc, xattr_req); ++ local = frame->local; + +- dict_unref(xattr_req); +- return 0; ++ if (op_ret) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto out; ++ } + ++ /* To-Do: Fix refcount increment per call to ++ * shard_link_internal_dir_inode(). 
++ */ ++ linked_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ shard_inode_ctx_mark_dir_refreshed(linked_inode, this); + out: +- if (xattr_req) +- dict_unref(xattr_req); +- handler(frame, this); +- return 0; ++ shard_common_resolve_shards(frame, this, local->post_res_handler); ++ return 0; + } + +-int shard_post_fstat_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++int ++shard_refresh_internal_dir(call_frame_t *frame, xlator_t *this, ++ shard_internal_dir_type_t type) ++{ ++ loc_t loc = { ++ 0, ++ }; ++ inode_t *inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ uuid_t gfid = { ++ 0, ++ }; + +- local = frame->local; ++ local = frame->local; ++ priv = this->private; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ gf_uuid_copy(gfid, priv->dot_shard_gfid); ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); ++ break; ++ default: ++ break; ++ } + +- if (local->op_ret >= 0) +- shard_inode_ctx_set(local->fd->inode, this, &local->prebuf, 0, +- SHARD_LOOKUP_MASK); ++ inode = inode_find(this->itable, gfid); + +- SHARD_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, +- &local->prebuf, local->xattr_rsp); +- return 0; +-} ++ if (!shard_inode_ctx_needs_lookup(inode, this)) { ++ local->op_ret = 0; ++ goto out; ++ } + +-int shard_post_stat_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++ /* Plain assignment because the ref is already taken above through ++ * call to inode_find() ++ */ ++ loc.inode = inode; ++ gf_uuid_copy(loc.gfid, gfid); + +- local = frame->local; ++ STACK_WIND_COOKIE(frame, shard_refresh_internal_dir_cbk, (void *)(long)type, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, &loc, ++ NULL); ++ loc_wipe(&loc); + +- if (local->op_ret >= 0) +- shard_inode_ctx_set(local->loc.inode, this, &local->prebuf, 0, +- SHARD_LOOKUP_MASK); ++ return 0; + +- SHARD_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, +- &local->prebuf, local->xattr_rsp); +- return 0; ++out: ++ shard_common_resolve_shards(frame, this, local->post_res_handler); ++ return 0; + } + +-int shard_common_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *buf, +- dict_t *xdata) { +- inode_t *inode = NULL; +- shard_local_t *local = NULL; +- +- local = frame->local; ++int ++shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ inode_t *link_inode = NULL; ++ shard_local_t *local = NULL; ++ shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_STAT_FAILED, +- "stat failed: %s", local->fd ? 
uuid_utoa(local->fd->inode->gfid) +- : uuid_utoa((local->loc.inode)->gfid)); +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto unwind; +- } ++ local = frame->local; + +- local->prebuf = *buf; +- if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { +- local->op_ret = -1; +- local->op_errno = EINVAL; +- goto unwind; +- } +- local->xattr_rsp = dict_ref(xdata); ++ if (op_ret) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto unwind; ++ } + +- if (local->loc.inode) +- inode = local->loc.inode; +- else +- inode = local->fd->inode; ++ if (!IA_ISDIR(buf->ia_type)) { ++ gf_msg(this->name, GF_LOG_CRITICAL, 0, SHARD_MSG_DOT_SHARD_NODIR, ++ "%s already exists and " ++ "is not a directory. Please remove it from all bricks " ++ "and try again", ++ shard_internal_dir_string(type)); ++ local->op_ret = -1; ++ local->op_errno = EIO; ++ goto unwind; ++ } + +- shard_inode_ctx_invalidate(inode, this, &local->prebuf); ++ link_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ if (link_inode != inode) { ++ shard_refresh_internal_dir(frame, this, type); ++ } else { ++ shard_inode_ctx_mark_dir_refreshed(link_inode, this); ++ shard_common_resolve_shards(frame, this, local->post_res_handler); ++ } ++ return 0; + + unwind: +- local->handler(frame, this); +- return 0; ++ local->post_res_handler(frame, this); ++ return 0; + } + +-int shard_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++int ++shard_lookup_internal_dir(call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t post_res_handler, ++ shard_internal_dir_type_t type) ++{ ++ int ret = -1; ++ dict_t *xattr_req = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ uuid_t *gfid = NULL; ++ loc_t *loc = NULL; ++ gf_boolean_t free_gfid = _gf_true; + +- if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { +- STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->stat, loc, xdata); +- return 0; +- } ++ local = frame->local; ++ priv = this->private; ++ local->post_res_handler = post_res_handler; + +- ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(loc->inode->gfid)); +- goto err; +- } ++ gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); ++ if (!gfid) ++ goto err; + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->stat, loc, xdata); +- return 0; +- } ++ xattr_req = dict_new(); ++ if (!xattr_req) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto err; ++ } + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ gf_uuid_copy(*gfid, priv->dot_shard_gfid); ++ loc = &local->dot_shard_loc; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); ++ loc = &local->dot_shard_rm_loc; ++ break; ++ default: ++ bzero(*gfid, sizeof(uuid_t)); ++ break; ++ } + +- frame->local = local; ++ ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set gfid of %s into dict", ++ shard_internal_dir_string(type)); ++ local->op_ret = -1; ++ 
local->op_errno = ENOMEM; ++ goto err; ++ } else { ++ free_gfid = _gf_false; ++ } + +- local->handler = shard_post_stat_handler; +- loc_copy(&local->loc, loc); +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; ++ STACK_WIND_COOKIE(frame, shard_lookup_internal_dir_cbk, (void *)(long)type, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc, ++ xattr_req); + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, +- local, err); ++ dict_unref(xattr_req); ++ return 0; + +- STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->stat, loc, local->xattr_req); +- return 0; + err: +- shard_common_failure_unwind(GF_FOP_STAT, frame, -1, ENOMEM); +- return 0; ++ if (xattr_req) ++ dict_unref(xattr_req); ++ if (free_gfid) ++ GF_FREE(gfid); ++ post_res_handler(frame, this); ++ return 0; + } + +-int shard_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++static void ++shard_inode_ctx_update(inode_t *inode, xlator_t *this, dict_t *xdata, ++ struct iatt *buf) ++{ ++ int ret = 0; ++ uint64_t size = 0; ++ void *bsize = NULL; ++ ++ if (shard_inode_ctx_get_block_size(inode, this, &size)) { ++ /* Fresh lookup */ ++ ret = dict_get_ptr(xdata, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); ++ if (!ret) ++ size = ntoh64(*((uint64_t *)bsize)); ++ /* If the file is sharded, set its block size, otherwise just ++ * set 0. ++ */ ++ ++ shard_inode_ctx_set(inode, this, buf, size, SHARD_MASK_BLOCK_SIZE); ++ } ++ /* If the file is sharded, also set the remaining attributes, ++ * except for ia_size and ia_blocks. ++ */ ++ if (size) { ++ shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); ++ (void)shard_inode_ctx_invalidate(inode, this, buf); ++ } ++} + +- if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { +- STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fstat, fd, xdata); +- return 0; +- } ++int ++shard_delete_shards(void *opaque); + +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(fd->inode->gfid)); +- goto err; +- } ++int ++shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data); + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fstat, fd, xdata); +- return 0; +- } ++int ++shard_start_background_deletion(xlator_t *this) ++{ ++ int ret = 0; ++ gf_boolean_t i_cleanup = _gf_true; ++ shard_priv_t *priv = NULL; ++ call_frame_t *cleanup_frame = NULL; + +- if (!this->itable) +- this->itable = fd->inode->table; ++ priv = this->private; + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ LOCK(&priv->lock); ++ { ++ switch (priv->bg_del_state) { ++ case SHARD_BG_DELETION_NONE: ++ i_cleanup = _gf_true; ++ priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; ++ break; ++ case SHARD_BG_DELETION_LAUNCHING: ++ i_cleanup = _gf_false; ++ break; ++ case SHARD_BG_DELETION_IN_PROGRESS: ++ priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; ++ i_cleanup = _gf_false; ++ break; ++ default: ++ break; ++ } ++ } ++ UNLOCK(&priv->lock); ++ if (!i_cleanup) ++ return 0; + +- frame->local = local; ++ cleanup_frame = create_frame(this, this->ctx->pool); ++ if (!cleanup_frame) { ++ 
gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create " ++ "new frame to delete shards"); ++ ret = -ENOMEM; ++ goto err; ++ } + +- local->handler = shard_post_fstat_handler; +- local->fd = fd_ref(fd); +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; ++ set_lk_owner_from_ptr(&cleanup_frame->root->lk_owner, cleanup_frame->root); + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, +- local, err); ++ ret = synctask_new(this->ctx->env, shard_delete_shards, ++ shard_delete_shards_cbk, cleanup_frame, cleanup_frame); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_WARNING, errno, ++ SHARD_MSG_SHARDS_DELETION_FAILED, ++ "failed to create task to do background " ++ "cleanup of shards"); ++ STACK_DESTROY(cleanup_frame->root); ++ goto err; ++ } ++ return 0; + +- STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req); +- return 0; + err: +- shard_common_failure_unwind(GF_FOP_FSTAT, frame, -1, ENOMEM); +- return 0; ++ LOCK(&priv->lock); ++ { ++ priv->bg_del_state = SHARD_BG_DELETION_NONE; ++ } ++ UNLOCK(&priv->lock); ++ return ret; + } + +-int shard_post_update_size_truncate_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; ++int ++shard_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, struct iatt *postparent) ++{ ++ int ret = -1; ++ shard_priv_t *priv = NULL; ++ gf_boolean_t i_start_cleanup = _gf_false; + +- local = frame->local; ++ priv = this->private; + +- if (local->fop == GF_FOP_TRUNCATE) +- SHARD_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->postbuf, NULL); +- else +- SHARD_STACK_UNWIND(ftruncate, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->postbuf, NULL); +- return 0; +-} ++ if (op_ret < 0) ++ goto unwind; + +-int shard_truncate_last_shard_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, struct iatt *prebuf, +- struct iatt *postbuf, dict_t *xdata) { +- inode_t *inode = NULL; +- int64_t delta_blocks = 0; +- shard_local_t *local = NULL; ++ if (IA_ISDIR(buf->ia_type)) ++ goto unwind; + +- local = frame->local; ++ /* Also, if the file is sharded, get the file size and block cnt xattr, ++ * and store them in the stbuf appropriately. ++ */ + +- SHARD_UNSET_ROOT_FS_ID(frame, local); ++ if (dict_get(xdata, GF_XATTR_SHARD_FILE_SIZE) && ++ frame->root->pid != GF_CLIENT_PID_GSYNCD) ++ shard_modify_size_and_block_count(buf, xdata); ++ ++ /* If this was a fresh lookup, there are two possibilities: ++ * 1) If the file is sharded (indicated by the presence of block size ++ * xattr), store this block size, along with rdev and mode in its ++ * inode ctx. ++ * 2) If the file is not sharded, store size along with rdev and mode ++ * (which are anyway don't cares) in inode ctx. Since @ctx_tmp is ++ * already initialised to all zeroes, nothing more needs to be done. ++ */ + +- inode = (local->fop == GF_FOP_TRUNCATE) ? local->loc.inode : local->fd->inode; +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, +- SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED, "truncate on last" +- " shard failed : %s", +- uuid_utoa(inode->gfid)); +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto err; +- } +- +- local->postbuf.ia_size = local->offset; +- /* Let the delta be negative. 
We want xattrop to do subtraction */ +- local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; +- delta_blocks = GF_ATOMIC_ADD(local->delta_blocks, +- postbuf->ia_blocks - prebuf->ia_blocks); +- GF_ASSERT(delta_blocks <= 0); +- local->postbuf.ia_blocks += delta_blocks; +- local->hole_size = 0; +- +- shard_inode_ctx_set(inode, this, &local->postbuf, 0, SHARD_MASK_TIMES); +- shard_update_file_size(frame, this, NULL, &local->loc, +- shard_post_update_size_truncate_handler); +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +-} +- +-int shard_truncate_last_shard(call_frame_t *frame, xlator_t *this, +- inode_t *inode) { +- size_t last_shard_size_after = 0; +- loc_t loc = { +- 0, +- }; +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- /* A NULL inode could be due to the fact that the last shard which +- * needs to be truncated does not exist due to it lying in a hole +- * region. So the only thing left to do in that case would be an +- * update to file size xattr. +- */ +- if (!inode) { +- gf_msg_debug(this->name, 0, +- "Last shard to be truncated absent in backend:%" PRIu64 +- " of gfid: %s. Directly proceeding to update file size", +- local->first_block, uuid_utoa(local->loc.inode->gfid)); +- shard_update_file_size(frame, this, NULL, &local->loc, +- shard_post_update_size_truncate_handler); +- return 0; +- } ++ (void)shard_inode_ctx_update(inode, this, xdata, buf); + +- SHARD_SET_ROOT_FS_ID(frame, local); ++ LOCK(&priv->lock); ++ { ++ if (priv->first_lookup_done == _gf_false) { ++ priv->first_lookup_done = _gf_true; ++ i_start_cleanup = _gf_true; ++ } ++ } ++ UNLOCK(&priv->lock); + +- loc.inode = inode_ref(inode); +- gf_uuid_copy(loc.gfid, inode->gfid); ++ if (!i_start_cleanup) ++ goto unwind; + +- last_shard_size_after = (local->offset % local->block_size); ++ ret = shard_start_background_deletion(this); ++ if (ret < 0) { ++ LOCK(&priv->lock); ++ { ++ priv->first_lookup_done = _gf_false; ++ } ++ UNLOCK(&priv->lock); ++ } + +- STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->truncate, &loc, last_shard_size_after, +- NULL); +- loc_wipe(&loc); +- return 0; ++unwind: ++ SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata, ++ postparent); ++ return 0; + } + +-void shard_unlink_block_inode(shard_local_t *local, int shard_block_num); ++int ++shard_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) ++{ ++ int ret = -1; ++ int32_t op_errno = ENOMEM; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +-int shard_truncate_htol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, +- struct iatt *preparent, struct iatt *postparent, +- dict_t *xdata) { +- int ret = 0; +- int call_count = 0; +- int shard_block_num = (long)cookie; +- uint64_t block_count = 0; +- shard_local_t *local = NULL; ++ this->itable = loc->inode->table; ++ if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && ++ (frame->root->pid != GF_CLIENT_PID_GLFS_HEAL)) { ++ SHARD_ENTRY_FOP_CHECK(loc, op_errno, err); ++ } + +- local = frame->local; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto done; +- } +- ret = dict_get_uint64(xdata, GF_GET_FILE_BLOCK_COUNT, &block_count); +- if (!ret) { +- GF_ATOMIC_SUB(local->delta_blocks, block_count); +- } else { +- /* dict_get failed possibly due to a heterogeneous 
cluster? */ +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to get key %s from dict during truncate of gfid %s", +- GF_GET_FILE_BLOCK_COUNT, +- uuid_utoa(local->resolver_base_inode->gfid)); +- } +- +- shard_unlink_block_inode(local, shard_block_num); +-done: +- call_count = shard_call_count_return(frame); +- if (call_count == 0) { +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- shard_truncate_last_shard(frame, this, local->inode_list[0]); +- } +- return 0; +-} +- +-int shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) { +- int i = 1; +- int ret = -1; +- int call_count = 0; +- uint32_t cur_block = 0; +- uint32_t last_block = 0; +- char path[PATH_MAX] = { +- 0, +- }; +- char *bname = NULL; +- loc_t loc = { +- 0, +- }; +- gf_boolean_t wind_failed = _gf_false; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- dict_t *xdata_req = NULL; +- +- local = frame->local; +- priv = this->private; +- +- cur_block = local->first_block + 1; +- last_block = local->last_block; +- +- /* Determine call count */ +- for (i = 1; i < local->num_blocks; i++) { +- if (!local->inode_list[i]) +- continue; +- call_count++; +- } +- +- if (!call_count) { +- /* Call count = 0 implies that all of the shards that need to be +- * unlinked do not exist. So shard xlator would now proceed to +- * do the final truncate + size updates. +- */ +- gf_msg_debug(this->name, 0, "Shards to be unlinked as part of " +- "truncate absent in backend: %s. Directly " +- "proceeding to update file size", +- uuid_utoa(inode->gfid)); +- local->postbuf.ia_size = local->offset; +- local->postbuf.ia_blocks = local->prebuf.ia_blocks; +- local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- local->hole_size = 0; +- shard_update_file_size(frame, this, local->fd, &local->loc, +- shard_post_update_size_truncate_handler); +- return 0; +- } ++ frame->local = local; + +- local->call_count = call_count; +- i = 1; +- xdata_req = dict_new(); +- if (!xdata_req) { +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; +- } +- ret = dict_set_uint64(xdata_req, GF_GET_FILE_BLOCK_COUNT, 8 * 8); +- if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set key %s into dict during truncate of %s", +- GF_GET_FILE_BLOCK_COUNT, +- uuid_utoa(local->resolver_base_inode->gfid)); +- dict_unref(xdata_req); +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; +- } ++ loc_copy(&local->loc, loc); + +- SHARD_SET_ROOT_FS_ID(frame, local); +- while (cur_block <= last_block) { +- if (!local->inode_list[i]) { +- cur_block++; +- i++; +- continue; +- } +- if (wind_failed) { +- shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, ENOMEM, +- NULL, NULL, NULL); +- goto next; +- } ++ local->xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new(); ++ if (!local->xattr_req) ++ goto err; + +- shard_make_block_abspath(cur_block, inode->gfid, path, sizeof(path)); +- bname = strrchr(path, '/') + 1; +- loc.parent = inode_ref(priv->dot_shard_inode); +- ret = inode_path(loc.parent, bname, (char **)&(loc.path)); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s. 
Base file gfid = %s", +- bname, uuid_utoa(inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- loc_wipe(&loc); +- wind_failed = _gf_true; +- shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, ENOMEM, +- NULL, NULL, NULL); +- goto next; ++ if (shard_inode_ctx_get_block_size(loc->inode, this, &block_size)) { ++ ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set dict" ++ " value: key:%s for path %s", ++ GF_XATTR_SHARD_BLOCK_SIZE, loc->path); ++ goto err; ++ } + } +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- loc.inode = inode_ref(local->inode_list[i]); + +- STACK_WIND_COOKIE(frame, shard_truncate_htol_cbk, (void *)(long)cur_block, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, &loc, +- 0, xdata_req); +- loc_wipe(&loc); +- next: +- i++; +- cur_block++; +- if (!--call_count) +- break; +- } +- dict_unref(xdata_req); +- return 0; +-} +- +-int shard_truncate_do(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++ if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { ++ ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, ++ 8 * 4); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set dict value: key:%s for path %s.", ++ GF_XATTR_SHARD_FILE_SIZE, loc->path); ++ goto err; ++ } ++ } + +- local = frame->local; ++ if ((xattr_req) && (dict_get(xattr_req, GF_CONTENT_KEY))) ++ dict_del(xattr_req, GF_CONTENT_KEY); + +- if (local->num_blocks == 1) { +- /* This means that there are no shards to be unlinked. +- * The fop boils down to truncating the last shard, updating +- * the size and unwinding. +- */ +- shard_truncate_last_shard(frame, this, local->inode_list[0]); ++ STACK_WIND(frame, shard_lookup_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, local->xattr_req); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_LOOKUP, frame, -1, op_errno); + return 0; +- } else { +- shard_truncate_htol(frame, this, local->loc.inode); +- } +- return 0; + } + +-int shard_post_lookup_shards_truncate_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; ++int ++shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ int ret = -1; ++ int32_t mask = SHARD_INODE_WRITE_MASK; ++ shard_local_t *local = NULL; ++ shard_inode_ctx_t ctx = { ++ 0, ++ }; + +- local = frame->local; ++ local = frame->local; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +- } +- +- shard_truncate_do(frame, this); +- return 0; +-} +- +-void shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode, +- struct iatt *buf) { +- int list_index = 0; +- char block_bname[256] = { +- 0, +- }; +- uuid_t gfid = { +- 0, +- }; +- inode_t *linked_inode = NULL; +- xlator_t *this = NULL; +- inode_t *fsync_inode = NULL; +- shard_priv_t *priv = NULL; +- inode_t *base_inode = NULL; +- +- this = THIS; +- priv = this->private; +- if (local->loc.inode) { +- gf_uuid_copy(gfid, local->loc.inode->gfid); +- base_inode = local->loc.inode; +- } else if (local->resolver_base_inode) { +- gf_uuid_copy(gfid, local->resolver_base_inode->gfid); +- base_inode = local->resolver_base_inode; +- } else { +- gf_uuid_copy(gfid, local->base_gfid); +- 
} +- +- shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname)); +- +- shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); +- linked_inode = inode_link(inode, priv->dot_shard_inode, block_bname, buf); +- inode_lookup(linked_inode); +- list_index = block_num - local->first_block; +- local->inode_list[list_index] = linked_inode; +- +- LOCK(&priv->lock); +- { +- fsync_inode = __shard_update_shards_inode_list(linked_inode, this, +- base_inode, block_num, gfid); +- } +- UNLOCK(&priv->lock); +- if (fsync_inode) +- shard_initiate_evicted_inode_fsync(this, fsync_inode); +-} +- +-int shard_common_lookup_shards_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) { +- int call_count = 0; +- int shard_block_num = (long)cookie; +- uuid_t gfid = { +- 0, +- }; +- shard_local_t *local = NULL; +- +- local = frame->local; +- if (local->resolver_base_inode) +- gf_uuid_copy(gfid, local->resolver_base_inode->gfid); +- else +- gf_uuid_copy(gfid, local->base_gfid); +- +- if (op_ret < 0) { +- /* Ignore absence of shards in the backend in truncate fop. */ +- switch (local->fop) { +- case GF_FOP_TRUNCATE: +- case GF_FOP_FTRUNCATE: +- case GF_FOP_RENAME: +- case GF_FOP_UNLINK: +- if (op_errno == ENOENT) +- goto done; +- break; +- case GF_FOP_WRITE: +- case GF_FOP_READ: +- case GF_FOP_ZEROFILL: +- case GF_FOP_DISCARD: +- case GF_FOP_FALLOCATE: +- if ((!local->first_lookup_done) && (op_errno == ENOENT)) { +- LOCK(&frame->lock); +- { local->create_count++; } +- UNLOCK(&frame->lock); +- goto done; +- } +- break; +- default: +- break; ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_BASE_FILE_LOOKUP_FAILED, ++ "Lookup on base file" ++ " failed : %s", ++ loc_gfid_utoa(&(local->loc))); ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto unwind; + } + +- /* else */ +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_LOOKUP_SHARD_FAILED, +- "Lookup on shard %d " +- "failed. 
Base file gfid = %s", +- shard_block_num, uuid_utoa(gfid)); +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto done; +- } +- +- shard_link_block_inode(local, shard_block_num, inode, buf); +- +-done: +- if (local->lookup_shards_barriered) { +- syncbarrier_wake(&local->barrier); +- return 0; +- } else { +- call_count = shard_call_count_return(frame); +- if (call_count == 0) { +- if (!local->first_lookup_done) +- local->first_lookup_done = _gf_true; +- local->pls_fop_handler(frame, this); ++ local->prebuf = *buf; ++ if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { ++ local->op_ret = -1; ++ local->op_errno = EINVAL; ++ goto unwind; + } +- } +- return 0; +-} + +-dict_t *shard_create_gfid_dict(dict_t *dict) { +- int ret = 0; +- dict_t *new = NULL; +- unsigned char *gfid = NULL; ++ if (shard_inode_ctx_get_all(inode, this, &ctx)) ++ mask = SHARD_ALL_MASK; + +- new = dict_copy_with_ref(dict, NULL); +- if (!new) +- return NULL; ++ ret = shard_inode_ctx_set(inode, this, &local->prebuf, 0, ++ (mask | SHARD_MASK_REFRESH_RESET)); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, SHARD_MSG_INODE_CTX_SET_FAILED, 0, ++ "Failed to set inode" ++ " write params into inode ctx for %s", ++ uuid_utoa(buf->ia_gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto unwind; ++ } + +- gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char); +- if (!gfid) { +- ret = -1; +- goto out; +- } ++unwind: ++ local->handler(frame, this); ++ return 0; ++} + +- gf_uuid_generate(gfid); ++int ++shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, ++ shard_post_fop_handler_t handler) ++{ ++ int ret = -1; ++ shard_local_t *local = NULL; ++ dict_t *xattr_req = NULL; ++ gf_boolean_t need_refresh = _gf_false; + +- ret = dict_set_gfuuid(new, "gfid-req", gfid, false); ++ local = frame->local; ++ local->handler = handler; + +-out: +- if (ret) { +- dict_unref(new); +- new = NULL; +- GF_FREE(gfid); +- } +- +- return new; +-} +- +-int shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, +- inode_t *inode, +- shard_post_lookup_shards_fop_handler_t handler) { +- int i = 0; +- int ret = 0; +- int count = 0; +- int call_count = 0; +- int32_t shard_idx_iter = 0; +- int last_block = 0; +- char path[PATH_MAX] = { +- 0, +- }; +- char *bname = NULL; +- uuid_t gfid = { +- 0, +- }; +- loc_t loc = { +- 0, +- }; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- gf_boolean_t wind_failed = _gf_false; +- dict_t *xattr_req = NULL; +- +- priv = this->private; +- local = frame->local; +- count = call_count = local->call_count; +- shard_idx_iter = local->first_block; +- last_block = local->last_block; +- local->pls_fop_handler = handler; +- if (local->lookup_shards_barriered) +- local->barrier.waitfor = local->call_count; +- +- if (inode) +- gf_uuid_copy(gfid, inode->gfid); +- else +- gf_uuid_copy(gfid, local->base_gfid); +- +- while (shard_idx_iter <= last_block) { +- if (local->inode_list[i]) { +- i++; +- shard_idx_iter++; +- continue; +- } +- +- if (wind_failed) { +- shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, this, +- -1, ENOMEM, NULL, NULL, NULL, NULL); +- goto next; +- } +- +- shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); +- +- bname = strrchr(path, '/') + 1; +- loc.inode = inode_new(this->itable); +- loc.parent = inode_ref(priv->dot_shard_inode); +- gf_uuid_copy(loc.pargfid, priv->dot_shard_gfid); +- ret = inode_path(loc.parent, bname, (char **)&(loc.path)); +- if (ret < 0 || !(loc.inode)) { +- gf_msg(this->name, GF_LOG_ERROR, 
0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s, base file gfid = %s", +- bname, uuid_utoa(gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- loc_wipe(&loc); +- wind_failed = _gf_true; +- shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, this, +- -1, ENOMEM, NULL, NULL, NULL, NULL); +- goto next; ++ ret = shard_inode_ctx_fill_iatt_from_cache(loc->inode, this, &local->prebuf, ++ &need_refresh); ++ /* By this time, inode ctx should have been created either in create, ++ * mknod, readdirp or lookup. If not it is a bug! ++ */ ++ if ((ret == 0) && (need_refresh == _gf_false)) { ++ gf_msg_debug(this->name, 0, ++ "Skipping lookup on base file: %s" ++ "Serving prebuf off the inode ctx cache", ++ uuid_utoa(loc->gfid)); ++ goto out; + } + +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- +- xattr_req = shard_create_gfid_dict(local->xattr_req); ++ xattr_req = dict_new(); + if (!xattr_req) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- loc_wipe(&loc); +- shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, this, +- -1, ENOMEM, NULL, NULL, NULL, NULL); +- goto next; +- } +- +- STACK_WIND_COOKIE(frame, shard_common_lookup_shards_cbk, +- (void *)(long)shard_idx_iter, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, &loc, xattr_req); +- loc_wipe(&loc); +- dict_unref(xattr_req); +- next: +- shard_idx_iter++; +- i++; +- +- if (!--call_count) +- break; +- } +- if (local->lookup_shards_barriered) { +- syncbarrier_wait(&local->barrier, count); +- local->pls_fop_handler(frame, this); +- } +- return 0; +-} +- +-int shard_post_resolve_truncate_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if (local->op_ret < 0) { +- if (local->op_errno == ENOENT) { +- /* If lookup on /.shard fails with ENOENT, it means that +- * the file was 0-byte in size but truncated sometime in +- * the past to a higher size which is reflected in the +- * size xattr, and now being truncated to a lower size. +- * In this case, the only thing that needs to be done is +- * to update the size xattr of the file and unwind. +- */ +- local->first_block = local->last_block = 0; +- local->num_blocks = 1; +- local->call_count = 0; +- local->op_ret = 0; +- local->postbuf.ia_size = local->offset; +- shard_update_file_size(frame, this, local->fd, &local->loc, +- shard_post_update_size_truncate_handler); +- return 0; +- } else { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto out; + } +- } + +- if (!local->call_count) +- shard_truncate_do(frame, this); +- else +- shard_common_lookup_shards(frame, this, local->loc.inode, +- shard_post_lookup_shards_truncate_handler); +- +- return 0; +-} +- +-int shard_truncate_begin(call_frame_t *frame, xlator_t *this) { +- int ret = 0; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- local = frame->local; +- +- /* First participant block here is the lowest numbered block that would +- * hold the last byte of the file post successful truncation. +- * Last participant block is the block that contains the last byte in +- * the current state of the file. +- * If (first block == last_block): +- * then that means that the file only needs truncation of the +- * first (or last since both are same) block. 
+- * Else +- * if (new_size % block_size == 0) +- * then that means there is no truncate to be done with +- * only shards from first_block + 1 through the last +- * block needing to be unlinked. +- * else +- * both truncate of the first block and unlink of the +- * remaining shards until end of file is required. +- */ +- local->first_block = +- (local->offset == 0) ? 0 : get_lowest_block(local->offset - 1, +- local->block_size); +- local->last_block = +- get_highest_block(0, local->prebuf.ia_size, local->block_size); +- +- local->num_blocks = local->last_block - local->first_block + 1; +- GF_ASSERT(local->num_blocks > 0); +- local->resolver_base_inode = +- (local->fop == GF_FOP_TRUNCATE) ? local->loc.inode : local->fd->inode; +- +- if ((local->first_block == 0) && (local->num_blocks == 1)) { +- if (local->fop == GF_FOP_TRUNCATE) +- STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->truncate, &local->loc, local->offset, +- local->xattr_req); +- else +- STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->ftruncate, local->fd, local->offset, +- local->xattr_req); +- return 0; +- } ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, loc->gfid, local, out); + +- local->inode_list = +- GF_CALLOC(local->num_blocks, sizeof(inode_t *), gf_shard_mt_inode_list); +- if (!local->inode_list) +- goto err; ++ STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + +- local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); +- if (!local->dot_shard_loc.inode) { +- ret = +- shard_init_internal_dir_loc(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); +- if (ret) +- goto err; +- shard_lookup_internal_dir(frame, this, shard_post_resolve_truncate_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- } else { +- local->post_res_handler = shard_post_resolve_truncate_handler; +- shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); +- } +- return 0; ++ dict_unref(xattr_req); ++ return 0; + +-err: +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; ++out: ++ if (xattr_req) ++ dict_unref(xattr_req); ++ handler(frame, this); ++ return 0; + } + +-int shard_post_lookup_truncate_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- struct iatt tmp_stbuf = { +- 0, +- }; +- +- local = frame->local; ++int ++shard_post_fstat_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +- } ++ local = frame->local; + +- local->postbuf = tmp_stbuf = local->prebuf; ++ if (local->op_ret >= 0) ++ shard_inode_ctx_set(local->fd->inode, this, &local->prebuf, 0, ++ SHARD_LOOKUP_MASK); + +- if (local->prebuf.ia_size == local->offset) { +- /* If the file size is same as requested size, unwind the call +- * immediately. +- */ +- if (local->fop == GF_FOP_TRUNCATE) +- SHARD_STACK_UNWIND(truncate, frame, 0, 0, &local->prebuf, &local->postbuf, +- NULL); +- else +- SHARD_STACK_UNWIND(ftruncate, frame, 0, 0, &local->prebuf, +- &local->postbuf, NULL); +- } else if (local->offset > local->prebuf.ia_size) { +- /* If the truncate is from a lower to a higher size, set the +- * new size xattr and unwind. 
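 * [Editor's note, added for clarity; not part of the patch] In other
 * words, expanding truncates never allocate shards: the gap is
 * recorded via hole_size and the size xattr, and the missing ranges
 * are materialized only if later writes land on them.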
+- */ +- local->hole_size = local->offset - local->prebuf.ia_size; +- local->delta_size = 0; +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- local->postbuf.ia_size = local->offset; +- tmp_stbuf.ia_size = local->offset; +- shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, +- SHARD_INODE_WRITE_MASK); +- shard_update_file_size(frame, this, NULL, &local->loc, +- shard_post_update_size_truncate_handler); +- } else { +- /* ... else +- * i. unlink all shards that need to be unlinked. +- * ii. truncate the last of the shards. +- * iii. update the new size using setxattr. +- * and unwind the fop. +- */ +- local->hole_size = 0; +- local->delta_size = (local->offset - local->prebuf.ia_size); +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- tmp_stbuf.ia_size = local->offset; +- shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, +- SHARD_INODE_WRITE_MASK); +- shard_truncate_begin(frame, this); +- } +- return 0; ++ SHARD_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, ++ &local->prebuf, local->xattr_rsp); ++ return 0; + } + +-/* TO-DO: +- * Fix updates to size and block count with racing write(s) and truncate(s). +- */ ++int ++shard_post_stat_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +-int shard_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, +- off_t offset, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++ local = frame->local; + +- ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(loc->inode->gfid)); +- goto err; +- } ++ if (local->op_ret >= 0) ++ shard_inode_ctx_set(local->loc.inode, this, &local->prebuf, 0, ++ SHARD_LOOKUP_MASK); + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); ++ SHARD_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, ++ &local->prebuf, local->xattr_rsp); + return 0; +- } +- +- if (!this->itable) +- this->itable = loc->inode->table; ++} + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- +- ret = syncbarrier_init(&local->barrier); +- if (ret) +- goto err; +- loc_copy(&local->loc, loc); +- local->offset = offset; +- local->block_size = block_size; +- local->fop = GF_FOP_TRUNCATE; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- local->resolver_base_inode = loc->inode; +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_truncate_handler); +- return 0; ++int ++shard_common_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *buf, ++ dict_t *xdata) ++{ ++ inode_t *inode = NULL; ++ shard_local_t *local = NULL; + +-err: +- shard_common_failure_unwind(GF_FOP_TRUNCATE, frame, -1, ENOMEM); +- return 0; +-} +- +-int shard_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +- dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; +- +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(fd->inode->gfid)); +- goto err; +- } +- +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); +- return 0; +- } +- +- if (!this->itable) +- this->itable = fd->inode->table; +- +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- ret = syncbarrier_init(&local->barrier); +- if (ret) +- goto err; +- local->fd = fd_ref(fd); +- local->offset = offset; +- local->block_size = block_size; +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- local->fop = GF_FOP_FTRUNCATE; +- +- local->loc.inode = inode_ref(fd->inode); +- gf_uuid_copy(local->loc.gfid, fd->inode->gfid); +- local->resolver_base_inode = fd->inode; +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_truncate_handler); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_FTRUNCATE, frame, -1, ENOMEM); +- return 0; +-} ++ local = frame->local; + +-int shard_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- int ret = -1; +- shard_local_t *local = NULL; ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_STAT_FAILED, ++ "stat failed: %s", ++ local->fd ? 
uuid_utoa(local->fd->inode->gfid) ++ : uuid_utoa((local->loc.inode)->gfid)); ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto unwind; ++ } + +- local = frame->local; ++ local->prebuf = *buf; ++ if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { ++ local->op_ret = -1; ++ local->op_errno = EINVAL; ++ goto unwind; ++ } ++ local->xattr_rsp = dict_ref(xdata); + +- if (op_ret == -1) +- goto unwind; ++ if (local->loc.inode) ++ inode = local->loc.inode; ++ else ++ inode = local->fd->inode; + +- ret = +- shard_inode_ctx_set(inode, this, buf, local->block_size, SHARD_ALL_MASK); +- if (ret) +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, +- "Failed to set inode " +- "ctx for %s", +- uuid_utoa(inode->gfid)); ++ shard_inode_ctx_invalidate(inode, this, &local->prebuf); + + unwind: +- SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent, +- postparent, xdata); +- +- return 0; ++ local->handler(frame, this); ++ return 0; + } + +-int shard_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, +- dev_t rdev, mode_t umask, dict_t *xdata) { +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; ++int ++shard_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { ++ STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->stat, loc, xdata); ++ return 0; ++ } + +- priv = this->private; +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(loc->inode->gfid)); ++ goto err; ++ } + +- frame->local = local; +- local->block_size = priv->block_size; +- if (!__is_gsyncd_on_shard_dir(frame, loc)) { +- SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); +- } ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->stat, loc, xdata); ++ return 0; ++ } + +- STACK_WIND(frame, shard_mknod_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_MKNOD, frame, -1, ENOMEM); +- return 0; +-} ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +-int32_t shard_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- shard_local_t *local = NULL; ++ frame->local = local; + +- local = frame->local; +- if (op_ret < 0) +- goto err; ++ local->handler = shard_post_stat_handler; ++ loc_copy(&local->loc, loc); ++ local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; + +- shard_inode_ctx_set(inode, this, buf, 0, SHARD_MASK_NLINK | SHARD_MASK_TIMES); +- buf->ia_size = local->prebuf.ia_size; +- buf->ia_blocks = local->prebuf.ia_blocks; ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, ++ local, err); + +- SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent, +- postparent, xdata); +- return 0; ++ STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->stat, loc, local->xattr_req); ++ return 0; + err: +- shard_common_failure_unwind(GF_FOP_LINK, frame, op_ret, op_errno); +- return 0; ++ shard_common_failure_unwind(GF_FOP_STAT, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_post_lookup_link_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if (local->op_ret < 0) { +- SHARD_STACK_UNWIND(link, frame, local->op_ret, local->op_errno, NULL, NULL, +- NULL, NULL, NULL); +- return 0; +- } ++int ++shard_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +- STACK_WIND(frame, shard_link_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->link, &local->loc, &local->loc2, +- local->xattr_req); +- return 0; +-} ++ if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { ++ STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fstat, fd, xdata); ++ return 0; ++ } + +-int32_t shard_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, +- loc_t *newloc, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(fd->inode->gfid)); ++ goto err; ++ } + +- ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(oldloc->inode->gfid)); +- goto err; +- } ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fstat, fd, xdata); ++ return 0; ++ } + +- if (!block_size) { +- STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, +- oldloc, newloc, xdata); +- return 0; +- } ++ if (!this->itable) ++ this->itable = fd->inode->table; + +- if (!this->itable) +- this->itable = oldloc->inode->table; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ frame->local = local; + +- frame->local = local; ++ local->handler = shard_post_fstat_handler; ++ local->fd = fd_ref(fd); ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; + +- loc_copy(&local->loc, oldloc); +- loc_copy(&local->loc2, newloc); +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, ++ local, err); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_link_handler); +- return 0; ++ STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req); ++ return 0; + err: +- shard_common_failure_unwind(GF_FOP_LINK, frame, -1, ENOMEM); +- return 0; ++ shard_common_failure_unwind(GF_FOP_FSTAT, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode); +- +-int shard_post_lookup_shards_unlink_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; +- uuid_t gfid = { +- 0, +- }; ++int ++shard_post_update_size_truncate_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (local->resolver_base_inode) +- gf_uuid_copy(gfid, local->resolver_base_inode->gfid); ++ if (local->fop == GF_FOP_TRUNCATE) ++ SHARD_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->postbuf, NULL); + else +- gf_uuid_copy(gfid, local->base_gfid); +- +- if ((local->op_ret < 0) && (local->op_errno != ENOENT)) { +- gf_msg(this->name, GF_LOG_ERROR, local->op_errno, SHARD_MSG_FOP_FAILED, +- "failed to delete shards of %s", uuid_utoa(gfid)); ++ SHARD_STACK_UNWIND(ftruncate, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->postbuf, NULL); + return 0; +- } +- local->op_ret = 0; +- local->op_errno = 0; +- +- shard_unlink_shards_do(frame, this, local->resolver_base_inode); +- return 0; + } + +-int shard_post_resolve_unlink_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- local->lookup_shards_barriered = _gf_true; +- +- if (!local->call_count) +- shard_unlink_shards_do(frame, this, local->resolver_base_inode); +- else +- shard_common_lookup_shards(frame, this, local->resolver_base_inode, +- shard_post_lookup_shards_unlink_handler); +- return 0; +-} +- +-void shard_unlink_block_inode(shard_local_t *local, int shard_block_num) { +- char block_bname[256] = { +- 0, +- }; +- uuid_t gfid = { +- 0, +- }; +- inode_t *inode = NULL; +- inode_t *base_inode = NULL; +- xlator_t *this = NULL; +- shard_priv_t *priv = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_inode_ctx_t *base_ictx = NULL; +- int unref_base_inode = 0; +- int unref_shard_inode = 0; +- +- this = THIS; +- priv = this->private; +- +- inode = local->inode_list[shard_block_num - local->first_block]; +- shard_inode_ctx_get(inode, this, &ctx); +- base_inode = ctx->base_inode; +- if (base_inode) +- gf_uuid_copy(gfid, base_inode->gfid); +- else +- gf_uuid_copy(gfid, ctx->base_gfid); +- shard_make_block_bname(shard_block_num, gfid, block_bname, +- sizeof(block_bname)); +- +- LOCK(&priv->lock); +- if (base_inode) +- LOCK(&base_inode->lock); +- LOCK(&inode->lock); +- { +- __shard_inode_ctx_get(inode, this, &ctx); +- if (!list_empty(&ctx->ilist)) { +- list_del_init(&ctx->ilist); +- priv->inode_count--; +- unref_base_inode++; +- unref_shard_inode++; +- GF_ASSERT(priv->inode_count >= 0); +- } +- if (ctx->fsync_needed) { +- unref_base_inode++; +- unref_shard_inode++; +- list_del_init(&ctx->to_fsync_list); +- if (base_inode) { +- __shard_inode_ctx_get(base_inode, this, &base_ictx); +- base_ictx->fsync_count--; +- } +- } +- } +- UNLOCK(&inode->lock); +- if (base_inode) +- 
UNLOCK(&base_inode->lock); ++int ++shard_truncate_last_shard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *prebuf, struct iatt *postbuf, ++ dict_t *xdata) ++{ ++ inode_t *inode = NULL; ++ int64_t delta_blocks = 0; ++ shard_local_t *local = NULL; + +- inode_unlink(inode, priv->dot_shard_inode, block_bname); +- inode_ref_reduce_by_n(inode, unref_shard_inode); +- inode_forget(inode, 0); ++ local = frame->local; + +- if (base_inode && unref_base_inode) +- inode_ref_reduce_by_n(base_inode, unref_base_inode); +- UNLOCK(&priv->lock); +-} ++ SHARD_UNSET_ROOT_FS_ID(frame, local); + +-int shard_rename_cbk(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++ inode = (local->fop == GF_FOP_TRUNCATE) ? local->loc.inode ++ : local->fd->inode; ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED, ++ "truncate on last" ++ " shard failed : %s", ++ uuid_utoa(inode->gfid)); ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto err; ++ } + +- local = frame->local; ++ local->postbuf.ia_size = local->offset; ++ /* Let the delta be negative. We want xattrop to do subtraction */ ++ local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; ++ delta_blocks = GF_ATOMIC_ADD(local->delta_blocks, ++ postbuf->ia_blocks - prebuf->ia_blocks); ++ GF_ASSERT(delta_blocks <= 0); ++ local->postbuf.ia_blocks += delta_blocks; ++ local->hole_size = 0; + +- SHARD_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->preoldparent, +- &local->postoldparent, &local->prenewparent, +- &local->postnewparent, local->xattr_rsp); +- return 0; ++ shard_inode_ctx_set(inode, this, &local->postbuf, 0, SHARD_MASK_TIMES); ++ shard_update_file_size(frame, this, NULL, &local->loc, ++ shard_post_update_size_truncate_handler); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; + } + +-int32_t shard_unlink_cbk(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = frame->local; ++int ++shard_truncate_last_shard(call_frame_t *frame, xlator_t *this, inode_t *inode) ++{ ++ size_t last_shard_size_after = 0; ++ loc_t loc = { ++ 0, ++ }; ++ shard_local_t *local = NULL; + +- SHARD_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, +- &local->preoldparent, &local->postoldparent, +- local->xattr_rsp); +- return 0; +-} ++ local = frame->local; + +-int shard_unlink_shards_do_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, int32_t op_errno, +- struct iatt *preparent, struct iatt *postparent, +- dict_t *xdata) { +- int shard_block_num = (long)cookie; +- shard_local_t *local = NULL; ++ /* A NULL inode could be due to the fact that the last shard which ++ * needs to be truncated does not exist due to it lying in a hole ++ * region. So the only thing left to do in that case would be an ++ * update to file size xattr. ++ */ ++ if (!inode) { ++ gf_msg_debug(this->name, 0, ++ "Last shard to be truncated absent in backend:%" PRIu64 ++ " of gfid: %s. 
Directly proceeding to update file size", ++ local->first_block, uuid_utoa(local->loc.inode->gfid)); ++ shard_update_file_size(frame, this, NULL, &local->loc, ++ shard_post_update_size_truncate_handler); ++ return 0; ++ } + +- local = frame->local; ++ SHARD_SET_ROOT_FS_ID(frame, local); + +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto done; +- } ++ loc.inode = inode_ref(inode); ++ gf_uuid_copy(loc.gfid, inode->gfid); + +- shard_unlink_block_inode(local, shard_block_num); +-done: +- syncbarrier_wake(&local->barrier); +- return 0; +-} +- +-int shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, +- inode_t *inode) { +- int i = 0; +- int ret = -1; +- int count = 0; +- uint32_t cur_block = 0; +- uint32_t cur_block_idx = 0; /*this is idx into inode_list[] array */ +- char *bname = NULL; +- char path[PATH_MAX] = { +- 0, +- }; +- uuid_t gfid = { +- 0, +- }; +- loc_t loc = { +- 0, +- }; +- gf_boolean_t wind_failed = _gf_false; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- local = frame->local; +- +- if (inode) +- gf_uuid_copy(gfid, inode->gfid); +- else +- gf_uuid_copy(gfid, local->base_gfid); +- +- for (i = 0; i < local->num_blocks; i++) { +- if (!local->inode_list[i]) +- continue; +- count++; +- } +- +- if (!count) { +- /* callcount = 0 implies that all of the shards that need to be +- * unlinked are non-existent (in other words the file is full of +- * holes). +- */ +- gf_msg_debug(this->name, 0, "All shards that need to be " +- "unlinked are non-existent: %s", +- uuid_utoa(gfid)); ++ last_shard_size_after = (local->offset % local->block_size); ++ ++ STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->truncate, &loc, last_shard_size_after, ++ NULL); ++ loc_wipe(&loc); + return 0; +- } ++} + +- SHARD_SET_ROOT_FS_ID(frame, local); +- local->barrier.waitfor = count; +- cur_block = cur_block_idx + local->first_block; ++void ++shard_unlink_block_inode(shard_local_t *local, int shard_block_num); + +- while (cur_block_idx < local->num_blocks) { +- if (!local->inode_list[cur_block_idx]) +- goto next; ++int ++shard_truncate_htol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *preparent, struct iatt *postparent, ++ dict_t *xdata) ++{ ++ int ret = 0; ++ int call_count = 0; ++ int shard_block_num = (long)cookie; ++ uint64_t block_count = 0; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; + +- if (wind_failed) { +- shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, +- ENOMEM, NULL, NULL, NULL); +- goto next; ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto done; ++ } ++ ret = dict_get_uint64(xdata, GF_GET_FILE_BLOCK_COUNT, &block_count); ++ if (!ret) { ++ GF_ATOMIC_SUB(local->delta_blocks, block_count); ++ } else { ++ /* dict_get failed possibly due to a heterogeneous cluster? 
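         * [Editor's note, added; not part of the patch] That is, a
         * brick that does not honor the GF_GET_FILE_BLOCK_COUNT key set
         * in xdata_req by shard_truncate_htol(); when the key is absent
         * from the reply, delta_blocks simply loses that shard's
         * contribution to the block-count update.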
*/ ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to get key %s from dict during truncate of gfid %s", ++ GF_GET_FILE_BLOCK_COUNT, ++ uuid_utoa(local->resolver_base_inode->gfid)); + } + +- shard_make_block_abspath(cur_block, gfid, path, sizeof(path)); +- bname = strrchr(path, '/') + 1; +- loc.parent = inode_ref(priv->dot_shard_inode); +- ret = inode_path(loc.parent, bname, (char **)&(loc.path)); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s, base file gfid = %s", +- bname, uuid_utoa(gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- loc_wipe(&loc); +- wind_failed = _gf_true; +- shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, +- ENOMEM, NULL, NULL, NULL); +- goto next; ++ shard_unlink_block_inode(local, shard_block_num); ++done: ++ call_count = shard_call_count_return(frame); ++ if (call_count == 0) { ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ shard_truncate_last_shard(frame, this, local->inode_list[0]); + } ++ return 0; ++} + +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- loc.inode = inode_ref(local->inode_list[cur_block_idx]); ++int ++shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) ++{ ++ int i = 1; ++ int ret = -1; ++ int call_count = 0; ++ uint32_t cur_block = 0; ++ uint32_t last_block = 0; ++ char path[PATH_MAX] = { ++ 0, ++ }; ++ char *bname = NULL; ++ loc_t loc = { ++ 0, ++ }; ++ gf_boolean_t wind_failed = _gf_false; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ dict_t *xdata_req = NULL; + +- STACK_WIND_COOKIE(frame, shard_unlink_shards_do_cbk, +- (void *)(long)cur_block, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->unlink, &loc, local->xflag, +- local->xattr_req); +- loc_wipe(&loc); +- next: +- cur_block++; +- cur_block_idx++; +- } +- syncbarrier_wait(&local->barrier, count); +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- return 0; +-} +- +-int shard_regulated_shards_deletion(call_frame_t *cleanup_frame, xlator_t *this, +- int now, int first_block, +- gf_dirent_t *entry) { +- int i = 0; +- int ret = 0; +- shard_local_t *local = NULL; +- uuid_t gfid = { +- 0, +- }; +- +- local = cleanup_frame->local; +- +- local->inode_list = GF_CALLOC(now, sizeof(inode_t *), gf_shard_mt_inode_list); +- if (!local->inode_list) +- return -ENOMEM; +- +- local->first_block = first_block; +- local->last_block = first_block + now - 1; +- local->num_blocks = now; +- gf_uuid_parse(entry->d_name, gfid); +- gf_uuid_copy(local->base_gfid, gfid); +- local->resolver_base_inode = inode_find(this->itable, gfid); +- local->call_count = 0; +- ret = syncbarrier_init(&local->barrier); +- if (ret) { +- GF_FREE(local->inode_list); +- local->inode_list = NULL; +- inode_unref(local->resolver_base_inode); +- local->resolver_base_inode = NULL; +- return -errno; +- } +- shard_common_resolve_shards(cleanup_frame, this, +- shard_post_resolve_unlink_handler); +- +- for (i = 0; i < local->num_blocks; i++) { +- if (local->inode_list[i]) +- inode_unref(local->inode_list[i]); +- } +- GF_FREE(local->inode_list); +- local->inode_list = NULL; +- if (local->op_ret) +- ret = -local->op_errno; +- syncbarrier_destroy(&local->barrier); +- inode_unref(local->resolver_base_inode); +- local->resolver_base_inode = NULL; +- STACK_RESET(cleanup_frame->root); +- return ret; +-} +- +-int __shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, +- gf_dirent_t *entry, inode_t *inode) { +- int ret = 0; +- int shard_count = 0; 
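  /* [Editor's note, added; not part of the patch] This function reads
   * the block-size and file-size xattrs off the marker entry, derives
   * the shard count, and deletes the shards in batches of
   * local->deletion_rate. The count computed below excludes block 0
   * (the base file): size / block_size - 1, plus one more when the
   * size is not block-aligned; e.g. an 18 MiB file with 4 MiB shards
   * yields shards .1 through .4.
   */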
+- int first_block = 0; +- int now = 0; +- uint64_t size = 0; +- uint64_t block_size = 0; +- uint64_t size_array[4] = { +- 0, +- }; +- void *bsize = NULL; +- void *size_attr = NULL; +- dict_t *xattr_rsp = NULL; +- loc_t loc = { +- 0, +- }; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- local = cleanup_frame->local; +- ret = dict_reset(local->xattr_req); +- if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to reset dict"); +- ret = -ENOMEM; +- goto err; +- } +- +- ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); +- if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); +- ret = -ENOMEM; +- goto err; +- } +- +- ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); +- if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); +- ret = -ENOMEM; +- goto err; +- } +- +- loc.inode = inode_ref(inode); +- loc.parent = inode_ref(priv->dot_shard_rm_inode); +- ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on %s", entry->d_name); +- ret = -ENOMEM; +- goto err; +- } +- +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, local->xattr_req, +- &xattr_rsp); +- if (ret) +- goto err; +- +- ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to get dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); +- goto err; +- } +- block_size = ntoh64(*((uint64_t *)bsize)); +- +- ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_FILE_SIZE, &size_attr); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to get dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); +- goto err; +- } +- +- memcpy(size_array, size_attr, sizeof(size_array)); +- size = ntoh64(size_array[0]); +- +- shard_count = (size / block_size) - 1; +- if (shard_count < 0) { +- gf_msg_debug(this->name, 0, "Size of %s hasn't grown beyond " +- "its shard-block-size. Nothing to delete. " +- "Returning", +- entry->d_name); +- /* File size < shard-block-size, so nothing to delete */ +- ret = 0; +- goto delete_marker; +- } +- if ((size % block_size) > 0) +- shard_count++; +- +- if (shard_count == 0) { +- gf_msg_debug(this->name, 0, "Size of %s is exactly equal to " +- "its shard-block-size. Nothing to delete. " +- "Returning", +- entry->d_name); +- ret = 0; +- goto delete_marker; +- } +- gf_msg_debug(this->name, 0, +- "base file = %s, " +- "shard-block-size=%" PRIu64 ", file-size=%" PRIu64 ", " +- "shard_count=%d", +- entry->d_name, block_size, size, shard_count); +- +- /* Perform a gfid-based lookup to see if gfid corresponding to marker +- * file's base name exists. +- */ +- loc_wipe(&loc); +- loc.inode = inode_new(this->itable); +- if (!loc.inode) { +- ret = -ENOMEM; +- goto err; +- } +- gf_uuid_parse(entry->d_name, loc.gfid); +- ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); +- if (!ret) { +- gf_msg_debug(this->name, 0, "Base shard corresponding to gfid " +- "%s is present. Skipping shard deletion. 
" +- "Returning", +- entry->d_name); +- ret = 0; +- goto delete_marker; +- } ++ local = frame->local; ++ priv = this->private; + +- first_block = 1; ++ cur_block = local->first_block + 1; ++ last_block = local->last_block; + +- while (shard_count) { +- if (shard_count < local->deletion_rate) { +- now = shard_count; +- shard_count = 0; +- } else { +- now = local->deletion_rate; +- shard_count -= local->deletion_rate; ++ /* Determine call count */ ++ for (i = 1; i < local->num_blocks; i++) { ++ if (!local->inode_list[i]) ++ continue; ++ call_count++; + } + +- gf_msg_debug(this->name, 0, "deleting %d shards starting from " +- "block %d of gfid %s", +- now, first_block, entry->d_name); +- ret = shard_regulated_shards_deletion(cleanup_frame, this, now, first_block, +- entry); +- if (ret) +- goto err; +- first_block += now; +- } ++ if (!call_count) { ++ /* Call count = 0 implies that all of the shards that need to be ++ * unlinked do not exist. So shard xlator would now proceed to ++ * do the final truncate + size updates. ++ */ ++ gf_msg_debug(this->name, 0, ++ "Shards to be unlinked as part of " ++ "truncate absent in backend: %s. Directly " ++ "proceeding to update file size", ++ uuid_utoa(inode->gfid)); ++ local->postbuf.ia_size = local->offset; ++ local->postbuf.ia_blocks = local->prebuf.ia_blocks; ++ local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ local->hole_size = 0; ++ shard_update_file_size(frame, this, local->fd, &local->loc, ++ shard_post_update_size_truncate_handler); ++ return 0; ++ } + +-delete_marker: +- loc_wipe(&loc); +- loc.inode = inode_ref(inode); +- loc.parent = inode_ref(priv->dot_shard_rm_inode); +- ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on %s", entry->d_name); +- ret = -ENOMEM; +- goto err; +- } +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- ret = syncop_unlink(FIRST_CHILD(this), &loc, NULL, NULL); +- if (ret) +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SHARDS_DELETION_FAILED, +- "Failed to delete %s " +- "from /%s", +- entry->d_name, GF_SHARD_REMOVE_ME_DIR); +-err: +- if (xattr_rsp) +- dict_unref(xattr_rsp); +- loc_wipe(&loc); +- return ret; +-} +- +-int shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, +- gf_dirent_t *entry, inode_t *inode) { +- int ret = -1; +- loc_t loc = { +- 0, +- }; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- loc.inode = inode_ref(priv->dot_shard_rm_inode); +- +- ret = syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, +- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL, NULL); +- if (ret < 0) { +- if (ret == -EAGAIN) { +- ret = 0; +- } +- goto out; +- } +- { ret = __shard_delete_shards_of_entry(cleanup_frame, this, entry, inode); } +- syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, +- ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL, NULL); +-out: +- loc_wipe(&loc); +- return ret; +-} +- +-int shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data) { +- SHARD_STACK_DESTROY(frame); +- return 0; +-} +- +-int shard_resolve_internal_dir(xlator_t *this, shard_local_t *local, +- shard_internal_dir_type_t type) { +- int ret = 0; +- char *bname = NULL; +- loc_t *loc = NULL; +- shard_priv_t *priv = NULL; +- uuid_t gfid = { +- 0, +- }; +- struct iatt stbuf = { +- 0, +- }; +- +- priv = this->private; +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- loc = 
&local->dot_shard_loc; +- gf_uuid_copy(gfid, priv->dot_shard_gfid); +- bname = GF_SHARD_DIR; +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- loc = &local->dot_shard_rm_loc; +- gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); +- bname = GF_SHARD_REMOVE_ME_DIR; +- break; +- default: +- break; +- } +- +- loc->inode = inode_find(this->itable, gfid); +- if (!loc->inode) { +- ret = shard_init_internal_dir_loc(this, local, type); +- if (ret) +- goto err; +- ret = dict_reset(local->xattr_req); ++ local->call_count = call_count; ++ i = 1; ++ xdata_req = dict_new(); ++ if (!xdata_req) { ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; ++ } ++ ret = dict_set_uint64(xdata_req, GF_GET_FILE_BLOCK_COUNT, 8 * 8); + if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to reset " +- "dict"); +- ret = -ENOMEM; +- goto err; +- } +- ret = dict_set_gfuuid(local->xattr_req, "gfid-req", gfid, true); +- ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, local->xattr_req, +- NULL); +- if (ret < 0) { +- if (ret != -ENOENT) +- gf_msg(this->name, GF_LOG_ERROR, -ret, SHARD_MSG_SHARDS_DELETION_FAILED, +- "Lookup on %s failed, exiting", bname); +- goto err; +- } else { +- shard_link_internal_dir_inode(local, loc->inode, &stbuf, type); ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set key %s into dict during truncate of %s", ++ GF_GET_FILE_BLOCK_COUNT, ++ uuid_utoa(local->resolver_base_inode->gfid)); ++ dict_unref(xdata_req); ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; + } +- } +- ret = 0; +-err: +- return ret; +-} +- +-int shard_lookup_marker_entry(xlator_t *this, shard_local_t *local, +- gf_dirent_t *entry) { +- int ret = 0; +- loc_t loc = { +- 0, +- }; +- +- loc.inode = inode_new(this->itable); +- if (!loc.inode) { +- ret = -ENOMEM; +- goto err; +- } +- loc.parent = inode_ref(local->fd->inode); +- +- ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on %s", entry->d_name); +- ret = -ENOMEM; +- goto err; +- } +- +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- +- ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); +- if (ret < 0) { +- goto err; +- } +- entry->inode = inode_ref(loc.inode); +- ret = 0; +-err: +- loc_wipe(&loc); +- return ret; +-} +- +-int shard_delete_shards(void *opaque) { +- int ret = 0; +- off_t offset = 0; +- loc_t loc = { +- 0, +- }; +- inode_t *link_inode = NULL; +- xlator_t *this = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- gf_dirent_t entries; +- gf_dirent_t *entry = NULL; +- call_frame_t *cleanup_frame = NULL; +- gf_boolean_t done = _gf_false; +- +- this = THIS; +- priv = this->private; +- INIT_LIST_HEAD(&entries.list); +- +- cleanup_frame = opaque; +- +- local = mem_get0(this->local_pool); +- if (!local) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create local to " +- "delete shards"); +- ret = -ENOMEM; +- goto err; +- } +- cleanup_frame->local = local; +- local->fop = GF_FOP_UNLINK; +- +- local->xattr_req = dict_new(); +- if (!local->xattr_req) { +- ret = -ENOMEM; +- goto err; +- } +- local->deletion_rate = priv->deletion_rate; +- +- ret = shard_resolve_internal_dir(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); +- if (ret == -ENOENT) { +- gf_msg_debug(this->name, 0, ".shard absent. Nothing to" +- " delete. 
Exiting"); +- ret = 0; +- goto err; +- } else if (ret < 0) { +- goto err; +- } + +- ret = shard_resolve_internal_dir(this, local, +- SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); +- if (ret == -ENOENT) { +- gf_msg_debug(this->name, 0, ".remove_me absent. " +- "Nothing to delete. Exiting"); +- ret = 0; +- goto err; +- } else if (ret < 0) { +- goto err; +- } +- +- local->fd = fd_anonymous(local->dot_shard_rm_loc.inode); +- if (!local->fd) { +- ret = -ENOMEM; +- goto err; +- } +- +- for (;;) { +- offset = 0; ++ SHARD_SET_ROOT_FS_ID(frame, local); ++ while (cur_block <= last_block) { ++ if (!local->inode_list[i]) { ++ cur_block++; ++ i++; ++ continue; ++ } ++ if (wind_failed) { ++ shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } ++ ++ shard_make_block_abspath(cur_block, inode->gfid, path, sizeof(path)); ++ bname = strrchr(path, '/') + 1; ++ loc.parent = inode_ref(priv->dot_shard_inode); ++ ret = inode_path(loc.parent, bname, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed" ++ " on %s. Base file gfid = %s", ++ bname, uuid_utoa(inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ loc_wipe(&loc); ++ wind_failed = _gf_true; ++ shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ loc.inode = inode_ref(local->inode_list[i]); ++ ++ STACK_WIND_COOKIE(frame, shard_truncate_htol_cbk, ++ (void *)(long)cur_block, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, &loc, 0, xdata_req); ++ loc_wipe(&loc); ++ next: ++ i++; ++ cur_block++; ++ if (!--call_count) ++ break; ++ } ++ dict_unref(xdata_req); ++ return 0; ++} ++ ++int ++shard_truncate_do(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->num_blocks == 1) { ++ /* This means that there are no shards to be unlinked. ++ * The fop boils down to truncating the last shard, updating ++ * the size and unwinding. 
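         * [Editor's note, added; not part of the patch] inode_list[0]
         * below is the sole participant shard resolved earlier; it may
         * be NULL when that shard lies in a hole, a case
         * shard_truncate_last_shard() handles by skipping straight to
         * the file-size update.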
++ */ ++ shard_truncate_last_shard(frame, this, local->inode_list[0]); ++ return 0; ++ } else { ++ shard_truncate_htol(frame, this, local->loc.inode); ++ } ++ return 0; ++} ++ ++int ++shard_post_lookup_shards_truncate_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } ++ ++ shard_truncate_do(frame, this); ++ return 0; ++} ++ ++void ++shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode, ++ struct iatt *buf) ++{ ++ int list_index = 0; ++ char block_bname[256] = { ++ 0, ++ }; ++ uuid_t gfid = { ++ 0, ++ }; ++ inode_t *linked_inode = NULL; ++ xlator_t *this = NULL; ++ inode_t *fsync_inode = NULL; ++ shard_priv_t *priv = NULL; ++ inode_t *base_inode = NULL; ++ ++ this = THIS; ++ priv = this->private; ++ if (local->loc.inode) { ++ gf_uuid_copy(gfid, local->loc.inode->gfid); ++ base_inode = local->loc.inode; ++ } else if (local->resolver_base_inode) { ++ gf_uuid_copy(gfid, local->resolver_base_inode->gfid); ++ base_inode = local->resolver_base_inode; ++ } else { ++ gf_uuid_copy(gfid, local->base_gfid); ++ } ++ ++ shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname)); ++ ++ shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); ++ linked_inode = inode_link(inode, priv->dot_shard_inode, block_bname, buf); ++ inode_lookup(linked_inode); ++ list_index = block_num - local->first_block; ++ local->inode_list[list_index] = linked_inode; ++ + LOCK(&priv->lock); + { +- if (priv->bg_del_state == SHARD_BG_DELETION_LAUNCHING) { +- priv->bg_del_state = SHARD_BG_DELETION_IN_PROGRESS; +- } else if (priv->bg_del_state == SHARD_BG_DELETION_IN_PROGRESS) { +- priv->bg_del_state = SHARD_BG_DELETION_NONE; +- done = _gf_true; +- } ++ fsync_inode = __shard_update_shards_inode_list( ++ linked_inode, this, base_inode, block_num, gfid); + } + UNLOCK(&priv->lock); +- if (done) +- break; +- while ((ret = syncop_readdirp(FIRST_CHILD(this), local->fd, 131072, offset, +- &entries, local->xattr_req, NULL))) { +- if (ret > 0) +- ret = 0; +- list_for_each_entry(entry, &entries.list, list) { +- offset = entry->d_off; ++ if (fsync_inode) ++ shard_initiate_evicted_inode_fsync(this, fsync_inode); ++} ++ ++int ++shard_common_lookup_shards_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, int32_t op_errno, ++ inode_t *inode, struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ int call_count = 0; ++ int shard_block_num = (long)cookie; ++ uuid_t gfid = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ if (local->resolver_base_inode) ++ gf_uuid_copy(gfid, local->resolver_base_inode->gfid); ++ else ++ gf_uuid_copy(gfid, local->base_gfid); ++ ++ if (op_ret < 0) { ++ /* Ignore absence of shards in the backend in truncate fop. 
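         * [Editor's note, added; not part of the patch] Per the switch
         * below: for truncate, ftruncate, rename and unlink, ENOENT
         * just means the shard lies in a hole and can be skipped; on
         * the first lookup pass of write, read, zerofill, discard and
         * fallocate, the missing shard is instead counted in
         * create_count so that it can be created afterwards.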
*/ ++ switch (local->fop) { ++ case GF_FOP_TRUNCATE: ++ case GF_FOP_FTRUNCATE: ++ case GF_FOP_RENAME: ++ case GF_FOP_UNLINK: ++ if (op_errno == ENOENT) ++ goto done; ++ break; ++ case GF_FOP_WRITE: ++ case GF_FOP_READ: ++ case GF_FOP_ZEROFILL: ++ case GF_FOP_DISCARD: ++ case GF_FOP_FALLOCATE: ++ if ((!local->first_lookup_done) && (op_errno == ENOENT)) { ++ LOCK(&frame->lock); ++ { ++ local->create_count++; ++ } ++ UNLOCK(&frame->lock); ++ goto done; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ /* else */ ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_LOOKUP_SHARD_FAILED, ++ "Lookup on shard %d " ++ "failed. Base file gfid = %s", ++ shard_block_num, uuid_utoa(gfid)); ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto done; ++ } ++ ++ shard_link_block_inode(local, shard_block_num, inode, buf); ++ ++done: ++ if (local->lookup_shards_barriered) { ++ syncbarrier_wake(&local->barrier); ++ return 0; ++ } else { ++ call_count = shard_call_count_return(frame); ++ if (call_count == 0) { ++ if (!local->first_lookup_done) ++ local->first_lookup_done = _gf_true; ++ local->pls_fop_handler(frame, this); ++ } ++ } ++ return 0; ++} ++ ++dict_t * ++shard_create_gfid_dict(dict_t *dict) ++{ ++ int ret = 0; ++ dict_t *new = NULL; ++ unsigned char *gfid = NULL; ++ ++ new = dict_copy_with_ref(dict, NULL); ++ if (!new) ++ return NULL; ++ ++ gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char); ++ if (!gfid) { ++ ret = -1; ++ goto out; ++ } ++ ++ gf_uuid_generate(gfid); ++ ++ ret = dict_set_gfuuid(new, "gfid-req", gfid, false); + +- if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) +- continue; ++out: ++ if (ret) { ++ dict_unref(new); ++ new = NULL; ++ GF_FREE(gfid); ++ } ++ ++ return new; ++} ++ ++int ++shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, ++ shard_post_lookup_shards_fop_handler_t handler) ++{ ++ int i = 0; ++ int ret = 0; ++ int count = 0; ++ int call_count = 0; ++ int32_t shard_idx_iter = 0; ++ int last_block = 0; ++ char path[PATH_MAX] = { ++ 0, ++ }; ++ char *bname = NULL; ++ uuid_t gfid = { ++ 0, ++ }; ++ loc_t loc = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ gf_boolean_t wind_failed = _gf_false; ++ dict_t *xattr_req = NULL; + +- if (!entry->inode) { +- ret = shard_lookup_marker_entry(this, local, entry); +- if (ret < 0) ++ priv = this->private; ++ local = frame->local; ++ count = call_count = local->call_count; ++ shard_idx_iter = local->first_block; ++ last_block = local->last_block; ++ local->pls_fop_handler = handler; ++ if (local->lookup_shards_barriered) ++ local->barrier.waitfor = local->call_count; ++ ++ if (inode) ++ gf_uuid_copy(gfid, inode->gfid); ++ else ++ gf_uuid_copy(gfid, local->base_gfid); ++ ++ while (shard_idx_iter <= last_block) { ++ if (local->inode_list[i]) { ++ i++; ++ shard_idx_iter++; + continue; + } +- link_inode = inode_link(entry->inode, local->fd->inode, entry->d_name, +- &entry->d_stat); + +- gf_msg_debug(this->name, 0, "Initiating deletion of " +- "shards of gfid %s", +- entry->d_name); +- ret = shard_delete_shards_of_entry(cleanup_frame, this, entry, +- link_inode); +- inode_unlink(link_inode, local->fd->inode, entry->d_name); +- inode_unref(link_inode); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, -ret, +- SHARD_MSG_SHARDS_DELETION_FAILED, +- "Failed to clean up shards of gfid %s", entry->d_name); +- continue; ++ if (wind_failed) { ++ shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, ++ this, -1, ENOMEM, NULL, NULL, NULL, ++ NULL); 
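            /* [Editor's note, added; not part of the patch] Once
             * wind_failed is set by an earlier allocation failure, each
             * remaining shard is completed by invoking the lookup
             * callback directly with ENOMEM instead of winding a real
             * lookup to the child xlator, so the per-shard call
             * accounting still drains to zero. */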
++ goto next; + } +- gf_msg(this->name, GF_LOG_INFO, 0, SHARD_MSG_SHARD_DELETION_COMPLETED, +- "Deleted " +- "shards of gfid=%s from backend", +- entry->d_name); +- } +- gf_dirent_free(&entries); +- if (ret) +- break; +- } +- } +- ret = 0; +- loc_wipe(&loc); +- return ret; ++ ++ shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); ++ ++ bname = strrchr(path, '/') + 1; ++ loc.inode = inode_new(this->itable); ++ loc.parent = inode_ref(priv->dot_shard_inode); ++ gf_uuid_copy(loc.pargfid, priv->dot_shard_gfid); ++ ret = inode_path(loc.parent, bname, (char **)&(loc.path)); ++ if (ret < 0 || !(loc.inode)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed" ++ " on %s, base file gfid = %s", ++ bname, uuid_utoa(gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ loc_wipe(&loc); ++ wind_failed = _gf_true; ++ shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, ++ this, -1, ENOMEM, NULL, NULL, NULL, ++ NULL); ++ goto next; ++ } ++ ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ++ xattr_req = shard_create_gfid_dict(local->xattr_req); ++ if (!xattr_req) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ loc_wipe(&loc); ++ shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, ++ this, -1, ENOMEM, NULL, NULL, NULL, ++ NULL); ++ goto next; ++ } ++ ++ STACK_WIND_COOKIE(frame, shard_common_lookup_shards_cbk, ++ (void *)(long)shard_idx_iter, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, &loc, xattr_req); ++ loc_wipe(&loc); ++ dict_unref(xattr_req); ++ next: ++ shard_idx_iter++; ++ i++; ++ ++ if (!--call_count) ++ break; ++ } ++ if (local->lookup_shards_barriered) { ++ syncbarrier_wait(&local->barrier, count); ++ local->pls_fop_handler(frame, this); ++ } ++ return 0; ++} ++ ++int ++shard_post_resolve_truncate_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ if (local->op_errno == ENOENT) { ++ /* If lookup on /.shard fails with ENOENT, it means that ++ * the file was 0-byte in size but truncated sometime in ++ * the past to a higher size which is reflected in the ++ * size xattr, and now being truncated to a lower size. ++ * In this case, the only thing that needs to be done is ++ * to update the size xattr of the file and unwind. ++ */ ++ local->first_block = local->last_block = 0; ++ local->num_blocks = 1; ++ local->call_count = 0; ++ local->op_ret = 0; ++ local->postbuf.ia_size = local->offset; ++ shard_update_file_size(frame, this, local->fd, &local->loc, ++ shard_post_update_size_truncate_handler); ++ return 0; ++ } else { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } ++ } ++ ++ if (!local->call_count) ++ shard_truncate_do(frame, this); ++ else ++ shard_common_lookup_shards(frame, this, local->loc.inode, ++ shard_post_lookup_shards_truncate_handler); ++ ++ return 0; ++} ++ ++int ++shard_truncate_begin(call_frame_t *frame, xlator_t *this) ++{ ++ int ret = 0; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ /* First participant block here is the lowest numbered block that would ++ * hold the last byte of the file post successful truncation. ++ * Last participant block is the block that contains the last byte in ++ * the current state of the file. 
++ * If (first block == last_block): ++ * then that means that the file only needs truncation of the ++ * first (or last since both are same) block. ++ * Else ++ * if (new_size % block_size == 0) ++ * then that means there is no truncate to be done with ++ * only shards from first_block + 1 through the last ++ * block needing to be unlinked. ++ * else ++ * both truncate of the first block and unlink of the ++ * remaining shards until end of file is required. ++ */ ++ local->first_block = (local->offset == 0) ++ ? 0 ++ : get_lowest_block(local->offset - 1, ++ local->block_size); ++ local->last_block = get_highest_block(0, local->prebuf.ia_size, ++ local->block_size); ++ ++ local->num_blocks = local->last_block - local->first_block + 1; ++ GF_ASSERT(local->num_blocks > 0); ++ local->resolver_base_inode = (local->fop == GF_FOP_TRUNCATE) ++ ? local->loc.inode ++ : local->fd->inode; ++ ++ if ((local->first_block == 0) && (local->num_blocks == 1)) { ++ if (local->fop == GF_FOP_TRUNCATE) ++ STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->truncate, &local->loc, ++ local->offset, local->xattr_req); ++ else ++ STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->ftruncate, local->fd, ++ local->offset, local->xattr_req); ++ return 0; ++ } ++ ++ local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), ++ gf_shard_mt_inode_list); ++ if (!local->inode_list) ++ goto err; ++ ++ local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); ++ if (!local->dot_shard_loc.inode) { ++ ret = shard_init_internal_dir_loc(this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ if (ret) ++ goto err; ++ shard_lookup_internal_dir(frame, this, ++ shard_post_resolve_truncate_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ } else { ++ local->post_res_handler = shard_post_resolve_truncate_handler; ++ shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); ++ } ++ return 0; + + err: +- LOCK(&priv->lock); +- { priv->bg_del_state = SHARD_BG_DELETION_NONE; } +- UNLOCK(&priv->lock); +- loc_wipe(&loc); +- return ret; +-} +- +-int shard_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *xdata) { +- if (op_ret) +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, +- "Unlock failed. 
Please check brick logs for " +- "more details"); +- SHARD_STACK_DESTROY(frame); +- return 0; +-} +- +-int shard_unlock_inodelk(call_frame_t *frame, xlator_t *this) { +- loc_t *loc = NULL; +- call_frame_t *lk_frame = NULL; +- shard_local_t *local = NULL; +- shard_local_t *lk_local = NULL; +- shard_inodelk_t *lock = NULL; +- +- local = frame->local; +- lk_frame = local->inodelk_frame; +- lk_local = lk_frame->local; +- local->inodelk_frame = NULL; +- loc = &local->int_inodelk.loc; +- lock = &lk_local->int_inodelk; +- lock->flock.l_type = F_UNLCK; +- +- STACK_WIND(lk_frame, shard_unlock_inodelk_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->inodelk, lock->domain, loc, F_SETLK, +- &lock->flock, NULL); +- local->int_inodelk.acquired_lock = _gf_false; +- return 0; +-} +- +-int shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *buf, +- struct iatt *preoldparent, struct iatt *postoldparent, +- struct iatt *prenewparent, struct iatt *postnewparent, +- dict_t *xdata); +-int shard_rename_src_base_file(call_frame_t *frame, xlator_t *this) { +- int ret = 0; +- loc_t *dst_loc = NULL; +- loc_t tmp_loc = { +- 0, +- }; +- shard_local_t *local = frame->local; +- +- if (local->dst_block_size) { +- tmp_loc.parent = inode_ref(local->loc2.parent); +- ret = inode_path(tmp_loc.parent, local->loc2.name, (char **)&tmp_loc.path); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on pargfid=%s bname=%s", +- uuid_utoa(tmp_loc.parent->gfid), local->loc2.name); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto err; +- } +- +- tmp_loc.name = strrchr(tmp_loc.path, '/'); +- if (tmp_loc.name) +- tmp_loc.name++; +- dst_loc = &tmp_loc; +- } else { +- dst_loc = &local->loc2; +- } +- +- /* To-Do: Request open-fd count on dst base file */ +- STACK_WIND(frame, shard_rename_src_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->rename, &local->loc, dst_loc, +- local->xattr_req); +- loc_wipe(&tmp_loc); +- return 0; +-err: +- loc_wipe(&tmp_loc); +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +-} +- +-int shard_unlink_base_file(call_frame_t *frame, xlator_t *this); +- +-int shard_set_size_attrs_on_marker_file_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, dict_t *dict, +- dict_t *xdata) { +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- +- priv = this->private; +- local = frame->local; +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, +- "Xattrop on marker file failed " +- "while performing %s; entry gfid=%s", +- gf_fop_string(local->fop), local->newloc.name); +- goto err; +- } +- +- inode_unlink(local->newloc.inode, priv->dot_shard_rm_inode, +- local->newloc.name); +- +- if (local->fop == GF_FOP_UNLINK) +- shard_unlink_base_file(frame, this); +- else if (local->fop == GF_FOP_RENAME) +- shard_rename_src_base_file(frame, this); +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); +- return 0; +-} +- +-int shard_set_size_attrs_on_marker_file(call_frame_t *frame, xlator_t *this) { +- int op_errno = ENOMEM; +- uint64_t bs = 0; +- dict_t *xdata = NULL; +- shard_local_t *local = NULL; +- +- local = frame->local; +- xdata = dict_new(); +- if (!xdata) +- goto err; +- +- if (local->fop == GF_FOP_UNLINK) +- bs = local->block_size; +- else if (local->fop == GF_FOP_RENAME) +- bs = local->dst_block_size; +- 
SHARD_INODE_CREATE_INIT(this, bs, xdata, &local->newloc, +- local->prebuf.ia_size, 0, err); +- STACK_WIND(frame, shard_set_size_attrs_on_marker_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->xattrop, &local->newloc, +- GF_XATTROP_GET_AND_SET, xdata, NULL); +- dict_unref(xdata); +- return 0; +-err: +- if (xdata) +- dict_unref(xdata); +- shard_common_failure_unwind(local->fop, frame, -1, op_errno); +- return 0; ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_lookup_marker_file_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) { +- inode_t *linked_inode = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- +- local = frame->local; +- priv = this->private; +- +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, +- "Lookup on marker file failed " +- "while performing %s; entry gfid=%s", +- gf_fop_string(local->fop), local->newloc.name); +- goto err; +- } +- +- linked_inode = +- inode_link(inode, priv->dot_shard_rm_inode, local->newloc.name, buf); +- inode_unref(local->newloc.inode); +- local->newloc.inode = linked_inode; +- shard_set_size_attrs_on_marker_file(frame, this); +- return 0; ++int ++shard_post_lookup_truncate_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ struct iatt tmp_stbuf = { ++ 0, ++ }; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } ++ ++ local->postbuf = tmp_stbuf = local->prebuf; ++ ++ if (local->prebuf.ia_size == local->offset) { ++ /* If the file size is same as requested size, unwind the call ++ * immediately. ++ */ ++ if (local->fop == GF_FOP_TRUNCATE) ++ SHARD_STACK_UNWIND(truncate, frame, 0, 0, &local->prebuf, ++ &local->postbuf, NULL); ++ else ++ SHARD_STACK_UNWIND(ftruncate, frame, 0, 0, &local->prebuf, ++ &local->postbuf, NULL); ++ } else if (local->offset > local->prebuf.ia_size) { ++ /* If the truncate is from a lower to a higher size, set the ++ * new size xattr and unwind. ++ */ ++ local->hole_size = local->offset - local->prebuf.ia_size; ++ local->delta_size = 0; ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ local->postbuf.ia_size = local->offset; ++ tmp_stbuf.ia_size = local->offset; ++ shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, ++ SHARD_INODE_WRITE_MASK); ++ shard_update_file_size(frame, this, NULL, &local->loc, ++ shard_post_update_size_truncate_handler); ++ } else { ++ /* ... else ++ * i. unlink all shards that need to be unlinked. ++ * ii. truncate the last of the shards. ++ * iii. update the new size using setxattr. ++ * and unwind the fop. ++ */ ++ local->hole_size = 0; ++ local->delta_size = (local->offset - local->prebuf.ia_size); ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ tmp_stbuf.ia_size = local->offset; ++ shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, ++ SHARD_INODE_WRITE_MASK); ++ shard_truncate_begin(frame, this); ++ } ++ return 0; ++} ++ ++/* TO-DO: ++ * Fix updates to size and block count with racing write(s) and truncate(s). 
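 * [Editor's illustration, added; not part of the original comment]
 * Worked example of the participant-block arithmetic in
 * shard_truncate_begin() above: with a 4 MiB shard-block-size, an
 * 18 MiB file truncated to 6 MiB gives
 *     first_block = get_lowest_block(6 MiB - 1, 4 MiB) = 1
 *     last_block  = get_highest_block(0, 18 MiB, 4 MiB) = 4
 * so shards 2 through 4 are unlinked by shard_truncate_htol() and
 * shard 1 is truncated to 6 MiB % 4 MiB = 2 MiB by
 * shard_truncate_last_shard().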
++ */ ++ ++int ++shard_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(loc->inode->gfid)); ++ goto err; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); ++ return 0; ++ } ++ ++ if (!this->itable) ++ this->itable = loc->inode->table; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ++ ret = syncbarrier_init(&local->barrier); ++ if (ret) ++ goto err; ++ loc_copy(&local->loc, loc); ++ local->offset = offset; ++ local->block_size = block_size; ++ local->fop = GF_FOP_TRUNCATE; ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ local->resolver_base_inode = loc->inode; ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_post_lookup_truncate_handler); ++ return 0; ++ + err: +- shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); +- return 0; ++ shard_common_failure_unwind(GF_FOP_TRUNCATE, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_lookup_marker_file(call_frame_t *frame, xlator_t *this) { +- int op_errno = ENOMEM; +- dict_t *xattr_req = NULL; +- shard_local_t *local = NULL; ++int ++shard_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(fd->inode->gfid)); ++ goto err; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); ++ return 0; ++ } ++ ++ if (!this->itable) ++ this->itable = fd->inode->table; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ret = syncbarrier_init(&local->barrier); ++ if (ret) ++ goto err; ++ local->fd = fd_ref(fd); ++ local->offset = offset; ++ local->block_size = block_size; ++ local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ local->fop = GF_FOP_FTRUNCATE; ++ ++ local->loc.inode = inode_ref(fd->inode); ++ gf_uuid_copy(local->loc.gfid, fd->inode->gfid); ++ local->resolver_base_inode = fd->inode; ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_post_lookup_truncate_handler); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_FTRUNCATE, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ int ret = -1; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret == -1) ++ goto unwind; ++ ++ ret = shard_inode_ctx_set(inode, this, buf, local->block_size, ++ SHARD_ALL_MASK); ++ if (ret) ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, ++ "Failed to set inode " ++ "ctx for %s", ++ uuid_utoa(inode->gfid)); ++ ++unwind: ++ SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent, ++ postparent, xdata); ++ ++ return 0; ++} ++ ++int ++shard_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, ++ dev_t rdev, mode_t umask, dict_t *xdata) ++{ ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ priv = this->private; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ local->block_size = priv->block_size; ++ if (!__is_gsyncd_on_shard_dir(frame, loc)) { ++ SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); ++ } ++ ++ STACK_WIND(frame, shard_mknod_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_MKNOD, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int32_t ++shard_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ if (op_ret < 0) ++ goto err; ++ ++ shard_inode_ctx_set(inode, this, buf, 0, ++ SHARD_MASK_NLINK | SHARD_MASK_TIMES); ++ buf->ia_size = local->prebuf.ia_size; ++ buf->ia_blocks = local->prebuf.ia_blocks; ++ ++ SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent, ++ postparent, xdata); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_LINK, frame, op_ret, op_errno); ++ return 0; ++} ++ ++int ++shard_post_lookup_link_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ SHARD_STACK_UNWIND(link, frame, local->op_ret, local->op_errno, NULL, ++ NULL, NULL, NULL, NULL); ++ return 0; ++ } ++ ++ STACK_WIND(frame, shard_link_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->link, &local->loc, &local->loc2, ++ local->xattr_req); ++ return 0; ++} ++ ++int32_t ++shard_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(oldloc->inode->gfid)); ++ goto err; ++ } ++ ++ if 
(!block_size) { ++ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, ++ oldloc, newloc, xdata); ++ return 0; ++ } ++ ++ if (!this->itable) ++ this->itable = oldloc->inode->table; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ++ loc_copy(&local->loc, oldloc); ++ loc_copy(&local->loc2, newloc); ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_post_lookup_link_handler); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_LINK, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode); ++ ++int ++shard_post_lookup_shards_unlink_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ uuid_t gfid = { ++ 0, ++ }; ++ ++ local = frame->local; ++ ++ if (local->resolver_base_inode) ++ gf_uuid_copy(gfid, local->resolver_base_inode->gfid); ++ else ++ gf_uuid_copy(gfid, local->base_gfid); ++ ++ if ((local->op_ret < 0) && (local->op_errno != ENOENT)) { ++ gf_msg(this->name, GF_LOG_ERROR, local->op_errno, SHARD_MSG_FOP_FAILED, ++ "failed to delete shards of %s", uuid_utoa(gfid)); ++ return 0; ++ } ++ local->op_ret = 0; ++ local->op_errno = 0; ++ ++ shard_unlink_shards_do(frame, this, local->resolver_base_inode); ++ return 0; ++} ++ ++int ++shard_post_resolve_unlink_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ local->lookup_shards_barriered = _gf_true; ++ ++ if (!local->call_count) ++ shard_unlink_shards_do(frame, this, local->resolver_base_inode); ++ else ++ shard_common_lookup_shards(frame, this, local->resolver_base_inode, ++ shard_post_lookup_shards_unlink_handler); ++ return 0; ++} ++ ++void ++shard_unlink_block_inode(shard_local_t *local, int shard_block_num) ++{ ++ char block_bname[256] = { ++ 0, ++ }; ++ uuid_t gfid = { ++ 0, ++ }; ++ inode_t *inode = NULL; ++ inode_t *base_inode = NULL; ++ xlator_t *this = NULL; ++ shard_priv_t *priv = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *base_ictx = NULL; ++ int unref_base_inode = 0; ++ int unref_shard_inode = 0; ++ ++ this = THIS; ++ priv = this->private; ++ ++ inode = local->inode_list[shard_block_num - local->first_block]; ++ shard_inode_ctx_get(inode, this, &ctx); ++ base_inode = ctx->base_inode; ++ if (base_inode) ++ gf_uuid_copy(gfid, base_inode->gfid); ++ else ++ gf_uuid_copy(gfid, ctx->base_gfid); ++ shard_make_block_bname(shard_block_num, gfid, block_bname, ++ sizeof(block_bname)); ++ ++ LOCK(&priv->lock); ++ if (base_inode) ++ LOCK(&base_inode->lock); ++ LOCK(&inode->lock); ++ { ++ __shard_inode_ctx_get(inode, this, &ctx); ++ if (!list_empty(&ctx->ilist)) { ++ list_del_init(&ctx->ilist); ++ priv->inode_count--; ++ unref_base_inode++; ++ unref_shard_inode++; ++ GF_ASSERT(priv->inode_count >= 0); ++ } ++ if (ctx->fsync_needed) { ++ unref_base_inode++; ++ unref_shard_inode++; ++ list_del_init(&ctx->to_fsync_list); ++ if (base_inode) { ++ __shard_inode_ctx_get(base_inode, this, &base_ictx); ++ base_ictx->fsync_count--; ++ } ++ } ++ } ++ UNLOCK(&inode->lock); ++ if (base_inode) ++ UNLOCK(&base_inode->lock); ++ ++ inode_unlink(inode, priv->dot_shard_inode, block_bname); ++ inode_ref_reduce_by_n(inode, unref_shard_inode); ++ inode_forget(inode, 0); ++ ++ if (base_inode && unref_base_inode) ++ inode_ref_reduce_by_n(base_inode, unref_base_inode); ++ UNLOCK(&priv->lock); ++} ++ ++int 
++shard_rename_cbk(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ SHARD_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->preoldparent, ++ &local->postoldparent, &local->prenewparent, ++ &local->postnewparent, local->xattr_rsp); ++ return 0; ++} ++ ++int32_t ++shard_unlink_cbk(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = frame->local; ++ ++ SHARD_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, ++ &local->preoldparent, &local->postoldparent, ++ local->xattr_rsp); ++ return 0; ++} ++ ++int ++shard_unlink_shards_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *preparent, struct iatt *postparent, ++ dict_t *xdata) ++{ ++ int shard_block_num = (long)cookie; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto done; ++ } ++ ++ shard_unlink_block_inode(local, shard_block_num); ++done: ++ syncbarrier_wake(&local->barrier); ++ return 0; ++} ++ ++int ++shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) ++{ ++ int i = 0; ++ int ret = -1; ++ int count = 0; ++ uint32_t cur_block = 0; ++ uint32_t cur_block_idx = 0; /*this is idx into inode_list[] array */ ++ char *bname = NULL; ++ char path[PATH_MAX] = { ++ 0, ++ }; ++ uuid_t gfid = { ++ 0, ++ }; ++ loc_t loc = { ++ 0, ++ }; ++ gf_boolean_t wind_failed = _gf_false; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ if (inode) ++ gf_uuid_copy(gfid, inode->gfid); ++ else ++ gf_uuid_copy(gfid, local->base_gfid); ++ ++ for (i = 0; i < local->num_blocks; i++) { ++ if (!local->inode_list[i]) ++ continue; ++ count++; ++ } ++ ++ if (!count) { ++ /* callcount = 0 implies that all of the shards that need to be ++ * unlinked are non-existent (in other words the file is full of ++ * holes). 
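A zero count here short-circuits the whole unlink fan-out, as the comment above explains. A minimal sketch of that pre-count, assuming, as in the patch, that inode_list[] holds NULL for blocks that were never created (present_shards is an illustrative name, not a function in this patch):

/* Holes were never written, so they have no shard file to unlink. */
static int
present_shards(void *inode_list[], int num_blocks)
{
    int i;
    int count = 0;

    for (i = 0; i < num_blocks; i++)
        if (inode_list[i]) /* NULL entry == hole, nothing on disk */
            count++;
    return count; /* 0 => nothing to wind; return without unlinking */
}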
++ */ ++ gf_msg_debug(this->name, 0, ++ "All shards that need to be " ++ "unlinked are non-existent: %s", ++ uuid_utoa(gfid)); ++ return 0; ++ } ++ ++ SHARD_SET_ROOT_FS_ID(frame, local); ++ local->barrier.waitfor = count; ++ cur_block = cur_block_idx + local->first_block; ++ ++ while (cur_block_idx < local->num_blocks) { ++ if (!local->inode_list[cur_block_idx]) ++ goto next; ++ ++ if (wind_failed) { ++ shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } ++ ++ shard_make_block_abspath(cur_block, gfid, path, sizeof(path)); ++ bname = strrchr(path, '/') + 1; ++ loc.parent = inode_ref(priv->dot_shard_inode); ++ ret = inode_path(loc.parent, bname, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed" ++ " on %s, base file gfid = %s", ++ bname, uuid_utoa(gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ loc_wipe(&loc); ++ wind_failed = _gf_true; ++ shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } ++ ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ loc.inode = inode_ref(local->inode_list[cur_block_idx]); ++ ++ STACK_WIND_COOKIE(frame, shard_unlink_shards_do_cbk, ++ (void *)(long)cur_block, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, &loc, local->xflag, ++ local->xattr_req); ++ loc_wipe(&loc); ++ next: ++ cur_block++; ++ cur_block_idx++; ++ } ++ syncbarrier_wait(&local->barrier, count); ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ return 0; ++} ++ ++int ++shard_regulated_shards_deletion(call_frame_t *cleanup_frame, xlator_t *this, ++ int now, int first_block, gf_dirent_t *entry) ++{ ++ int i = 0; ++ int ret = 0; ++ shard_local_t *local = NULL; ++ uuid_t gfid = { ++ 0, ++ }; ++ ++ local = cleanup_frame->local; ++ ++ local->inode_list = GF_CALLOC(now, sizeof(inode_t *), ++ gf_shard_mt_inode_list); ++ if (!local->inode_list) ++ return -ENOMEM; ++ ++ local->first_block = first_block; ++ local->last_block = first_block + now - 1; ++ local->num_blocks = now; ++ gf_uuid_parse(entry->d_name, gfid); ++ gf_uuid_copy(local->base_gfid, gfid); ++ local->resolver_base_inode = inode_find(this->itable, gfid); ++ local->call_count = 0; ++ ret = syncbarrier_init(&local->barrier); ++ if (ret) { ++ GF_FREE(local->inode_list); ++ local->inode_list = NULL; ++ inode_unref(local->resolver_base_inode); ++ local->resolver_base_inode = NULL; ++ return -errno; ++ } ++ shard_common_resolve_shards(cleanup_frame, this, ++ shard_post_resolve_unlink_handler); ++ ++ for (i = 0; i < local->num_blocks; i++) { ++ if (local->inode_list[i]) ++ inode_unref(local->inode_list[i]); ++ } ++ GF_FREE(local->inode_list); ++ local->inode_list = NULL; ++ if (local->op_ret) ++ ret = -local->op_errno; ++ syncbarrier_destroy(&local->barrier); ++ inode_unref(local->resolver_base_inode); ++ local->resolver_base_inode = NULL; ++ STACK_RESET(cleanup_frame->root); ++ return ret; ++} ++ ++int ++__shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, ++ gf_dirent_t *entry, inode_t *inode) ++{ ++ int ret = 0; ++ int shard_count = 0; ++ int first_block = 0; ++ int now = 0; ++ uint64_t size = 0; ++ uint64_t block_size = 0; ++ uint64_t size_array[4] = { ++ 0, ++ }; ++ void *bsize = NULL; ++ void *size_attr = NULL; ++ dict_t *xattr_rsp = NULL; ++ loc_t loc = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ local = 
cleanup_frame->local; ++ ret = dict_reset(local->xattr_req); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to reset dict"); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ loc.inode = inode_ref(inode); ++ loc.parent = inode_ref(priv->dot_shard_rm_inode); ++ ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", entry->d_name); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, local->xattr_req, ++ &xattr_rsp); ++ if (ret) ++ goto err; ++ ++ ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to get dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); ++ goto err; ++ } ++ block_size = ntoh64(*((uint64_t *)bsize)); ++ ++ ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_FILE_SIZE, &size_attr); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to get dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); ++ goto err; ++ } ++ ++ memcpy(size_array, size_attr, sizeof(size_array)); ++ size = ntoh64(size_array[0]); ++ ++ shard_count = (size / block_size) - 1; ++ if (shard_count < 0) { ++ gf_msg_debug(this->name, 0, ++ "Size of %s hasn't grown beyond " ++ "its shard-block-size. Nothing to delete. " ++ "Returning", ++ entry->d_name); ++ /* File size < shard-block-size, so nothing to delete */ ++ ret = 0; ++ goto delete_marker; ++ } ++ if ((size % block_size) > 0) ++ shard_count++; ++ ++ if (shard_count == 0) { ++ gf_msg_debug(this->name, 0, ++ "Size of %s is exactly equal to " ++ "its shard-block-size. Nothing to delete. " ++ "Returning", ++ entry->d_name); ++ ret = 0; ++ goto delete_marker; ++ } ++ gf_msg_debug(this->name, 0, ++ "base file = %s, " ++ "shard-block-size=%" PRIu64 ", file-size=%" PRIu64 ++ ", " ++ "shard_count=%d", ++ entry->d_name, block_size, size, shard_count); ++ ++ /* Perform a gfid-based lookup to see if gfid corresponding to marker ++ * file's base name exists. ++ */ ++ loc_wipe(&loc); ++ loc.inode = inode_new(this->itable); ++ if (!loc.inode) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ gf_uuid_parse(entry->d_name, loc.gfid); ++ ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); ++ if (!ret) { ++ gf_msg_debug(this->name, 0, ++ "Base shard corresponding to gfid " ++ "%s is present. Skipping shard deletion. 
" ++ "Returning", ++ entry->d_name); ++ ret = 0; ++ goto delete_marker; ++ } ++ ++ first_block = 1; ++ ++ while (shard_count) { ++ if (shard_count < local->deletion_rate) { ++ now = shard_count; ++ shard_count = 0; ++ } else { ++ now = local->deletion_rate; ++ shard_count -= local->deletion_rate; ++ } ++ ++ gf_msg_debug(this->name, 0, ++ "deleting %d shards starting from " ++ "block %d of gfid %s", ++ now, first_block, entry->d_name); ++ ret = shard_regulated_shards_deletion(cleanup_frame, this, now, ++ first_block, entry); ++ if (ret) ++ goto err; ++ first_block += now; ++ } ++ ++delete_marker: ++ loc_wipe(&loc); ++ loc.inode = inode_ref(inode); ++ loc.parent = inode_ref(priv->dot_shard_rm_inode); ++ ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", entry->d_name); ++ ret = -ENOMEM; ++ goto err; ++ } ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ret = syncop_unlink(FIRST_CHILD(this), &loc, NULL, NULL); ++ if (ret) ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SHARDS_DELETION_FAILED, ++ "Failed to delete %s " ++ "from /%s", ++ entry->d_name, GF_SHARD_REMOVE_ME_DIR); ++err: ++ if (xattr_rsp) ++ dict_unref(xattr_rsp); ++ loc_wipe(&loc); ++ return ret; ++} ++ ++int ++shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, ++ gf_dirent_t *entry, inode_t *inode) ++{ ++ int ret = -1; ++ loc_t loc = { ++ 0, ++ }; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ loc.inode = inode_ref(priv->dot_shard_rm_inode); ++ ++ ret = syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, ++ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL, NULL); ++ if (ret < 0) { ++ if (ret == -EAGAIN) { ++ ret = 0; ++ } ++ goto out; ++ } ++ { ++ ret = __shard_delete_shards_of_entry(cleanup_frame, this, entry, inode); ++ } ++ syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, ++ ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL, NULL); ++out: ++ loc_wipe(&loc); ++ return ret; ++} ++ ++int ++shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data) ++{ ++ SHARD_STACK_DESTROY(frame); ++ return 0; ++} ++ ++int ++shard_resolve_internal_dir(xlator_t *this, shard_local_t *local, ++ shard_internal_dir_type_t type) ++{ ++ int ret = 0; ++ char *bname = NULL; ++ loc_t *loc = NULL; ++ shard_priv_t *priv = NULL; ++ uuid_t gfid = { ++ 0, ++ }; ++ struct iatt stbuf = { ++ 0, ++ }; ++ ++ priv = this->private; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ loc = &local->dot_shard_loc; ++ gf_uuid_copy(gfid, priv->dot_shard_gfid); ++ bname = GF_SHARD_DIR; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ loc = &local->dot_shard_rm_loc; ++ gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); ++ bname = GF_SHARD_REMOVE_ME_DIR; ++ break; ++ default: ++ break; ++ } ++ ++ loc->inode = inode_find(this->itable, gfid); ++ if (!loc->inode) { ++ ret = shard_init_internal_dir_loc(this, local, type); ++ if (ret) ++ goto err; ++ ret = dict_reset(local->xattr_req); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to reset " ++ "dict"); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ret = dict_set_gfuuid(local->xattr_req, "gfid-req", gfid, true); ++ ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, ++ local->xattr_req, NULL); ++ if (ret < 0) { ++ if (ret != -ENOENT) ++ gf_msg(this->name, GF_LOG_ERROR, -ret, ++ SHARD_MSG_SHARDS_DELETION_FAILED, ++ "Lookup on %s failed, exiting", bname); ++ goto 
err; ++ } else { ++ shard_link_internal_dir_inode(local, loc->inode, &stbuf, type); ++ } ++ } ++ ret = 0; ++err: ++ return ret; ++} ++ ++int ++shard_lookup_marker_entry(xlator_t *this, shard_local_t *local, ++ gf_dirent_t *entry) ++{ ++ int ret = 0; ++ loc_t loc = { ++ 0, ++ }; ++ ++ loc.inode = inode_new(this->itable); ++ if (!loc.inode) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ loc.parent = inode_ref(local->fd->inode); ++ ++ ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", entry->d_name); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ++ ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); ++ if (ret < 0) { ++ goto err; ++ } ++ entry->inode = inode_ref(loc.inode); ++ ret = 0; ++err: ++ loc_wipe(&loc); ++ return ret; ++} ++ ++int ++shard_delete_shards(void *opaque) ++{ ++ int ret = 0; ++ off_t offset = 0; ++ loc_t loc = { ++ 0, ++ }; ++ inode_t *link_inode = NULL; ++ xlator_t *this = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ gf_dirent_t entries; ++ gf_dirent_t *entry = NULL; ++ call_frame_t *cleanup_frame = NULL; ++ gf_boolean_t done = _gf_false; ++ ++ this = THIS; ++ priv = this->private; ++ INIT_LIST_HEAD(&entries.list); ++ ++ cleanup_frame = opaque; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) { ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create local to " ++ "delete shards"); ++ ret = -ENOMEM; ++ goto err; ++ } ++ cleanup_frame->local = local; ++ local->fop = GF_FOP_UNLINK; ++ ++ local->xattr_req = dict_new(); ++ if (!local->xattr_req) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ local->deletion_rate = priv->deletion_rate; ++ ++ ret = shard_resolve_internal_dir(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); ++ if (ret == -ENOENT) { ++ gf_msg_debug(this->name, 0, ++ ".shard absent. Nothing to" ++ " delete. Exiting"); ++ ret = 0; ++ goto err; ++ } else if (ret < 0) { ++ goto err; ++ } ++ ++ ret = shard_resolve_internal_dir(this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); ++ if (ret == -ENOENT) { ++ gf_msg_debug(this->name, 0, ++ ".remove_me absent. " ++ "Nothing to delete. 
Exiting"); ++ ret = 0; ++ goto err; ++ } else if (ret < 0) { ++ goto err; ++ } ++ ++ local->fd = fd_anonymous(local->dot_shard_rm_loc.inode); ++ if (!local->fd) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ for (;;) { ++ offset = 0; ++ LOCK(&priv->lock); ++ { ++ if (priv->bg_del_state == SHARD_BG_DELETION_LAUNCHING) { ++ priv->bg_del_state = SHARD_BG_DELETION_IN_PROGRESS; ++ } else if (priv->bg_del_state == SHARD_BG_DELETION_IN_PROGRESS) { ++ priv->bg_del_state = SHARD_BG_DELETION_NONE; ++ done = _gf_true; ++ } ++ } ++ UNLOCK(&priv->lock); ++ if (done) ++ break; ++ while ( ++ (ret = syncop_readdirp(FIRST_CHILD(this), local->fd, 131072, offset, ++ &entries, local->xattr_req, NULL))) { ++ if (ret > 0) ++ ret = 0; ++ list_for_each_entry(entry, &entries.list, list) ++ { ++ offset = entry->d_off; ++ ++ if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) ++ continue; ++ ++ if (!entry->inode) { ++ ret = shard_lookup_marker_entry(this, local, entry); ++ if (ret < 0) ++ continue; ++ } ++ link_inode = inode_link(entry->inode, local->fd->inode, ++ entry->d_name, &entry->d_stat); ++ ++ gf_msg_debug(this->name, 0, ++ "Initiating deletion of " ++ "shards of gfid %s", ++ entry->d_name); ++ ret = shard_delete_shards_of_entry(cleanup_frame, this, entry, ++ link_inode); ++ inode_unlink(link_inode, local->fd->inode, entry->d_name); ++ inode_unref(link_inode); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, -ret, ++ SHARD_MSG_SHARDS_DELETION_FAILED, ++ "Failed to clean up shards of gfid %s", ++ entry->d_name); ++ continue; ++ } ++ gf_msg(this->name, GF_LOG_INFO, 0, ++ SHARD_MSG_SHARD_DELETION_COMPLETED, ++ "Deleted " ++ "shards of gfid=%s from backend", ++ entry->d_name); ++ } ++ gf_dirent_free(&entries); ++ if (ret) ++ break; ++ } ++ } ++ ret = 0; ++ loc_wipe(&loc); ++ return ret; ++ ++err: ++ LOCK(&priv->lock); ++ { ++ priv->bg_del_state = SHARD_BG_DELETION_NONE; ++ } ++ UNLOCK(&priv->lock); ++ loc_wipe(&loc); ++ return ret; ++} ++ ++int ++shard_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ if (op_ret) ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Unlock failed. 
Please check brick logs for " ++ "more details"); ++ SHARD_STACK_DESTROY(frame); ++ return 0; ++} ++ ++int ++shard_unlock_inodelk(call_frame_t *frame, xlator_t *this) ++{ ++ loc_t *loc = NULL; ++ call_frame_t *lk_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *lk_local = NULL; ++ shard_inodelk_t *lock = NULL; ++ ++ local = frame->local; ++ lk_frame = local->inodelk_frame; ++ lk_local = lk_frame->local; ++ local->inodelk_frame = NULL; ++ loc = &local->int_inodelk.loc; ++ lock = &lk_local->int_inodelk; ++ lock->flock.l_type = F_UNLCK; ++ ++ STACK_WIND(lk_frame, shard_unlock_inodelk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->inodelk, lock->domain, loc, F_SETLK, ++ &lock->flock, NULL); ++ local->int_inodelk.acquired_lock = _gf_false; ++ return 0; ++} ++ ++int ++shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *buf, ++ struct iatt *preoldparent, struct iatt *postoldparent, ++ struct iatt *prenewparent, struct iatt *postnewparent, ++ dict_t *xdata); ++int ++shard_rename_src_base_file(call_frame_t *frame, xlator_t *this) ++{ ++ int ret = 0; ++ loc_t *dst_loc = NULL; ++ loc_t tmp_loc = { ++ 0, ++ }; ++ shard_local_t *local = frame->local; ++ ++ if (local->dst_block_size) { ++ tmp_loc.parent = inode_ref(local->loc2.parent); ++ ret = inode_path(tmp_loc.parent, local->loc2.name, ++ (char **)&tmp_loc.path); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed" ++ " on pargfid=%s bname=%s", ++ uuid_utoa(tmp_loc.parent->gfid), local->loc2.name); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto err; ++ } ++ ++ tmp_loc.name = strrchr(tmp_loc.path, '/'); ++ if (tmp_loc.name) ++ tmp_loc.name++; ++ dst_loc = &tmp_loc; ++ } else { ++ dst_loc = &local->loc2; ++ } ++ ++ /* To-Do: Request open-fd count on dst base file */ ++ STACK_WIND(frame, shard_rename_src_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->rename, &local->loc, dst_loc, ++ local->xattr_req); ++ loc_wipe(&tmp_loc); ++ return 0; ++err: ++ loc_wipe(&tmp_loc); ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++} ++ ++int ++shard_unlink_base_file(call_frame_t *frame, xlator_t *this); ++ ++int ++shard_set_size_attrs_on_marker_file_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, dict_t *dict, ++ dict_t *xdata) ++{ ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Xattrop on marker file failed " ++ "while performing %s; entry gfid=%s", ++ gf_fop_string(local->fop), local->newloc.name); ++ goto err; ++ } ++ ++ inode_unlink(local->newloc.inode, priv->dot_shard_rm_inode, ++ local->newloc.name); ++ ++ if (local->fop == GF_FOP_UNLINK) ++ shard_unlink_base_file(frame, this); ++ else if (local->fop == GF_FOP_RENAME) ++ shard_rename_src_base_file(frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); ++ return 0; ++} ++ ++int ++shard_set_size_attrs_on_marker_file(call_frame_t *frame, xlator_t *this) ++{ ++ int op_errno = ENOMEM; ++ uint64_t bs = 0; ++ dict_t *xdata = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ xdata = dict_new(); ++ if (!xdata) ++ goto err; ++ ++ if (local->fop == GF_FOP_UNLINK) ++ bs = local->block_size; ++ else if (local->fop == GF_FOP_RENAME) ++ bs = 
local->dst_block_size; ++ SHARD_INODE_CREATE_INIT(this, bs, xdata, &local->newloc, ++ local->prebuf.ia_size, 0, err); ++ STACK_WIND(frame, shard_set_size_attrs_on_marker_file_cbk, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop, ++ &local->newloc, GF_XATTROP_GET_AND_SET, xdata, NULL); ++ dict_unref(xdata); ++ return 0; ++err: ++ if (xdata) ++ dict_unref(xdata); ++ shard_common_failure_unwind(local->fop, frame, -1, op_errno); ++ return 0; ++} ++ ++int ++shard_lookup_marker_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ inode_t *linked_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ priv = this->private; ++ ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Lookup on marker file failed " ++ "while performing %s; entry gfid=%s", ++ gf_fop_string(local->fop), local->newloc.name); ++ goto err; ++ } ++ ++ linked_inode = inode_link(inode, priv->dot_shard_rm_inode, ++ local->newloc.name, buf); ++ inode_unref(local->newloc.inode); ++ local->newloc.inode = linked_inode; ++ shard_set_size_attrs_on_marker_file(frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); ++ return 0; ++} ++ ++int ++shard_lookup_marker_file(call_frame_t *frame, xlator_t *this) ++{ ++ int op_errno = ENOMEM; ++ dict_t *xattr_req = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ xattr_req = shard_create_gfid_dict(local->xattr_req); ++ if (!xattr_req) ++ goto err; ++ ++ STACK_WIND(frame, shard_lookup_marker_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, &local->newloc, xattr_req); ++ dict_unref(xattr_req); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, -1, op_errno); ++ return 0; ++} ++ ++int ++shard_create_marker_file_under_remove_me_cbk( ++ call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, ++ int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ inode_t *linked_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ priv = this->private; ++ ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ if (op_ret < 0) { ++ if ((op_errno != EEXIST) && (op_errno != ENODATA)) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Marker file creation " ++ "failed while performing %s; entry gfid=%s", ++ gf_fop_string(local->fop), local->newloc.name); ++ goto err; ++ } else { ++ shard_lookup_marker_file(frame, this); ++ return 0; ++ } ++ } ++ ++ linked_inode = inode_link(inode, priv->dot_shard_rm_inode, ++ local->newloc.name, buf); ++ inode_unref(local->newloc.inode); ++ local->newloc.inode = linked_inode; ++ ++ if (local->fop == GF_FOP_UNLINK) ++ shard_unlink_base_file(frame, this); ++ else if (local->fop == GF_FOP_RENAME) ++ shard_rename_src_base_file(frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); ++ return 0; ++} ++ ++int ++shard_create_marker_file_under_remove_me(call_frame_t *frame, xlator_t *this, ++ loc_t *loc) ++{ ++ int ret = 0; ++ int op_errno = ENOMEM; ++ uint64_t bs = 0; ++ char g1[64] = { ++ 0, ++ }; ++ char g2[64] = { ++ 0, ++ }; ++ dict_t *xattr_req = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = 
NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ SHARD_SET_ROOT_FS_ID(frame, local); ++ ++ xattr_req = shard_create_gfid_dict(local->xattr_req); ++ if (!xattr_req) ++ goto err; ++ ++ local->newloc.inode = inode_new(this->itable); ++ local->newloc.parent = inode_ref(priv->dot_shard_rm_inode); ++ ret = inode_path(local->newloc.parent, uuid_utoa(loc->inode->gfid), ++ (char **)&local->newloc.path); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on " ++ "pargfid=%s bname=%s", ++ uuid_utoa_r(priv->dot_shard_rm_gfid, g1), ++ uuid_utoa_r(loc->inode->gfid, g2)); ++ goto err; ++ } ++ local->newloc.name = strrchr(local->newloc.path, '/'); ++ if (local->newloc.name) ++ local->newloc.name++; ++ ++ if (local->fop == GF_FOP_UNLINK) ++ bs = local->block_size; ++ else if (local->fop == GF_FOP_RENAME) ++ bs = local->dst_block_size; ++ ++ SHARD_INODE_CREATE_INIT(this, bs, xattr_req, &local->newloc, ++ local->prebuf.ia_size, 0, err); ++ ++ STACK_WIND(frame, shard_create_marker_file_under_remove_me_cbk, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, ++ &local->newloc, 0, 0, 0644, xattr_req); ++ dict_unref(xattr_req); ++ return 0; ++ ++err: ++ if (xattr_req) ++ dict_unref(xattr_req); ++ shard_create_marker_file_under_remove_me_cbk(frame, 0, this, -1, op_errno, ++ NULL, NULL, NULL, NULL, NULL); ++ return 0; ++} ++ ++int ++shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); ++ ++int ++shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *preparent, struct iatt *postparent, ++ dict_t *xdata) ++{ ++ int ret = 0; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ } else { ++ shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); ++ local->preoldparent = *preparent; ++ local->postoldparent = *postparent; ++ if (xdata) ++ local->xattr_rsp = dict_ref(xdata); ++ if (local->cleanup_required) ++ shard_start_background_deletion(this); ++ } ++ ++ if (local->entrylk_frame) { ++ ret = shard_unlock_entrylk(frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ } ++ } ++ ++ ret = shard_unlock_inodelk(frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ } ++ ++ shard_unlink_cbk(frame, this); ++ return 0; ++} ++ ++int ++shard_unlink_base_file(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = frame->local; ++ ++ /* To-Do: Request open-fd count on base file */ ++ STACK_WIND(frame, shard_unlink_base_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, ++ local->xattr_req); ++ return 0; ++} ++ ++int ++shard_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ if (op_ret) ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Unlock failed. 
Please check brick logs for " ++ "more details"); ++ SHARD_STACK_DESTROY(frame); ++ return 0; ++} ++ ++int ++shard_unlock_entrylk(call_frame_t *frame, xlator_t *this) ++{ ++ loc_t *loc = NULL; ++ call_frame_t *lk_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *lk_local = NULL; ++ shard_entrylk_t *lock = NULL; ++ ++ local = frame->local; ++ lk_frame = local->entrylk_frame; ++ lk_local = lk_frame->local; ++ local->entrylk_frame = NULL; ++ lock = &lk_local->int_entrylk; ++ loc = &lock->loc; ++ ++ STACK_WIND(lk_frame, shard_unlock_entrylk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->entrylk, this->name, loc, ++ lk_local->int_entrylk.basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, ++ NULL); ++ local->int_entrylk.acquired_lock = _gf_false; ++ return 0; ++} ++ ++int ++shard_post_entrylk_fop_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ switch (local->fop) { ++ case GF_FOP_UNLINK: ++ case GF_FOP_RENAME: ++ shard_create_marker_file_under_remove_me(frame, this, ++ &local->int_inodelk.loc); ++ break; ++ default: ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "post-entrylk handler not defined. This case should not" ++ " be hit"); ++ break; ++ } ++ return 0; ++} ++ ++int ++shard_acquire_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ call_frame_t *main_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *main_local = NULL; ++ ++ local = frame->local; ++ main_frame = local->main_frame; ++ main_local = main_frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(main_local->fop, main_frame, op_ret, ++ op_errno); ++ return 0; ++ } ++ main_local->int_entrylk.acquired_lock = _gf_true; ++ shard_post_entrylk_fop_handler(main_frame, this); ++ return 0; ++} ++ ++int ++shard_acquire_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, ++ uuid_t gfid) ++{ ++ char gfid_str[GF_UUID_BUF_SIZE] = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ shard_local_t *entrylk_local = NULL; ++ shard_entrylk_t *int_entrylk = NULL; ++ call_frame_t *entrylk_frame = NULL; ++ ++ local = frame->local; ++ entrylk_frame = create_frame(this, this->ctx->pool); ++ if (!entrylk_frame) { ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create new frame " ++ "to lock marker file"); ++ goto err; ++ } ++ ++ entrylk_local = mem_get0(this->local_pool); ++ if (!entrylk_local) { ++ STACK_DESTROY(entrylk_frame->root); ++ goto err; ++ } ++ ++ entrylk_frame->local = entrylk_local; ++ entrylk_local->main_frame = frame; ++ int_entrylk = &entrylk_local->int_entrylk; ++ ++ int_entrylk->loc.inode = inode_ref(inode); ++ set_lk_owner_from_ptr(&entrylk_frame->root->lk_owner, entrylk_frame->root); ++ local->entrylk_frame = entrylk_frame; ++ gf_uuid_unparse(gfid, gfid_str); ++ int_entrylk->basename = gf_strdup(gfid_str); ++ ++ STACK_WIND(entrylk_frame, shard_acquire_entrylk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->entrylk, this->name, &int_entrylk->loc, ++ int_entrylk->basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_post_lookup_base_shard_rm_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, -1, 
local->op_errno); ++ return 0; ++ } ++ ++ if (local->prebuf.ia_nlink > 1) { ++ gf_msg_debug(this->name, 0, ++ "link count on %s > 1:%d, " ++ "performing rename()/unlink()", ++ local->int_inodelk.loc.path, local->prebuf.ia_nlink); ++ if (local->fop == GF_FOP_RENAME) ++ shard_rename_src_base_file(frame, this); ++ else if (local->fop == GF_FOP_UNLINK) ++ shard_unlink_base_file(frame, this); ++ } else { ++ gf_msg_debug(this->name, 0, ++ "link count on %s = 1, creating " ++ "file under .remove_me", ++ local->int_inodelk.loc.path); ++ local->cleanup_required = _gf_true; ++ shard_acquire_entrylk(frame, this, priv->dot_shard_rm_inode, ++ local->prebuf.ia_gfid); ++ } ++ return 0; ++} ++ ++int ++shard_post_inodelk_fop_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ switch (local->fop) { ++ case GF_FOP_UNLINK: ++ case GF_FOP_RENAME: ++ shard_lookup_base_file(frame, this, &local->int_inodelk.loc, ++ shard_post_lookup_base_shard_rm_handler); ++ break; ++ default: ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "post-inodelk handler not defined. This case should not" ++ " be hit"); ++ break; ++ } ++ return 0; ++} ++ ++int ++shard_acquire_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ call_frame_t *main_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *main_local = NULL; ++ ++ local = frame->local; ++ main_frame = local->main_frame; ++ main_local = main_frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(main_local->fop, main_frame, op_ret, ++ op_errno); ++ return 0; ++ } ++ main_local->int_inodelk.acquired_lock = _gf_true; ++ shard_post_inodelk_fop_handler(main_frame, this); ++ return 0; ++} ++ ++int ++shard_acquire_inodelk(call_frame_t *frame, xlator_t *this, loc_t *loc) ++{ ++ call_frame_t *lk_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *lk_local = NULL; ++ shard_inodelk_t *int_inodelk = NULL; ++ ++ local = frame->local; ++ lk_frame = create_frame(this, this->ctx->pool); ++ if (!lk_frame) { ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create new frame " ++ "to lock base shard"); ++ goto err; ++ } ++ lk_local = mem_get0(this->local_pool); ++ if (!lk_local) { ++ STACK_DESTROY(lk_frame->root); ++ goto err; ++ } ++ ++ lk_frame->local = lk_local; ++ lk_local->main_frame = frame; ++ int_inodelk = &lk_local->int_inodelk; ++ ++ int_inodelk->flock.l_len = 0; ++ int_inodelk->flock.l_start = 0; ++ int_inodelk->domain = this->name; ++ int_inodelk->flock.l_type = F_WRLCK; ++ loc_copy(&local->int_inodelk.loc, loc); ++ set_lk_owner_from_ptr(&lk_frame->root->lk_owner, lk_frame->root); ++ local->inodelk_frame = lk_frame; ++ ++ STACK_WIND(lk_frame, shard_acquire_inodelk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->inodelk, int_inodelk->domain, ++ &local->int_inodelk.loc, F_SETLKW, &int_inodelk->flock, NULL); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_post_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) ++{ ++ loc_t *loc = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); ++ return 0; ++ } ++ if (local->fop == GF_FOP_UNLINK) ++ loc = &local->loc; ++ else if (local->fop == GF_FOP_RENAME) ++ loc = &local->loc2; ++ shard_acquire_inodelk(frame, this, loc); ++ return 
0; ++} ++ ++int ++shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t handler, ++ shard_internal_dir_type_t type); ++int ++shard_pre_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); ++ return 0; ++ } ++ shard_mkdir_internal_dir(frame, this, shard_post_mkdir_rm_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); ++ return 0; ++} ++ ++void ++shard_begin_rm_resolution(call_frame_t *frame, xlator_t *this) ++{ ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ local->dot_shard_rm_loc.inode = inode_find(this->itable, ++ priv->dot_shard_rm_gfid); ++ if (!local->dot_shard_rm_loc.inode) { ++ local->dot_shard_loc.inode = inode_find(this->itable, ++ priv->dot_shard_gfid); ++ if (!local->dot_shard_loc.inode) { ++ shard_mkdir_internal_dir(frame, this, shard_pre_mkdir_rm_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ } else { ++ local->post_res_handler = shard_pre_mkdir_rm_handler; ++ shard_refresh_internal_dir(frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ } ++ } else { ++ local->post_res_handler = shard_post_mkdir_rm_handler; ++ shard_refresh_internal_dir(frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); ++ } ++} ++ ++int ++shard_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); ++ if ((ret) && (!IA_ISLNK(loc->inode->ia_type))) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(loc->inode->gfid)); ++ goto err; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); ++ return 0; ++ } ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ++ loc_copy(&local->loc, loc); ++ local->xflag = xflag; ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ local->block_size = block_size; ++ local->resolver_base_inode = loc->inode; ++ local->fop = GF_FOP_UNLINK; ++ if (!this->itable) ++ this->itable = (local->loc.inode)->table; ++ ++ local->resolve_not = _gf_true; ++ shard_begin_rm_resolution(frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_UNLINK, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_post_rename_lookup_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_rename_cbk(frame, this); ++ return 0; ++} ++ ++int ++shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *buf, ++ struct iatt *preoldparent, struct iatt *postoldparent, ++ struct iatt *prenewparent, struct iatt *postnewparent, ++ dict_t *xdata) ++{ ++ int ret = 0; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto err; ++ } ++ /* Set ctx->refresh to TRUE to force a lookup on disk when ++ * shard_lookup_base_file() is called next to refresh the hard link ++ * count in ctx. Note that this is applicable only to the case where ++ * the rename dst is already existent and sharded. 
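The hard-link count being refreshed here is what shard_post_lookup_base_shard_rm_handler() branched on earlier in this patch: more than one link means the inode survives the fop and the shards must stay, while exactly one link means a marker file is created under .shard/.remove_me so the shards can be deleted in the background. A minimal sketch of that predicate (needs_remove_me_marker is an illustrative name):

#include <stdbool.h>
#include <stdint.h>

/* True when this unlink/rename drops the last hard link on the base
 * file, i.e. when shard cleanup must be queued via .remove_me. */
static bool
needs_remove_me_marker(uint32_t ia_nlink)
{
    return ia_nlink <= 1;
}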
++ */ ++ if ((local->dst_block_size) && (!local->cleanup_required)) ++ shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); ++ ++ local->prebuf = *buf; ++ local->preoldparent = *preoldparent; ++ local->postoldparent = *postoldparent; ++ local->prenewparent = *prenewparent; ++ local->postnewparent = *postnewparent; ++ if (xdata) ++ local->xattr_rsp = dict_ref(xdata); ++ ++ if (local->dst_block_size) { ++ if (local->entrylk_frame) { ++ ret = shard_unlock_entrylk(frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ } ++ } ++ ++ ret = shard_unlock_inodelk(frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ goto err; ++ } ++ if (local->cleanup_required) ++ shard_start_background_deletion(this); ++ } ++ ++ /* Now the base file of src, if sharded, is looked up to gather ia_size ++ * and ia_blocks.*/ ++ if (local->block_size) { ++ local->tmp_loc.inode = inode_new(this->itable); ++ gf_uuid_copy(local->tmp_loc.gfid, (local->loc.inode)->gfid); ++ shard_lookup_base_file(frame, this, &local->tmp_loc, ++ shard_post_rename_lookup_handler); ++ } else { ++ shard_rename_cbk(frame, this); ++ } ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++} ++ ++int ++shard_post_lookup_dst_base_file_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } ++ ++ /* Save dst base file attributes into postbuf so the information is not ++ * lost when it is overwritten after lookup on base file of src in ++ * shard_lookup_base_file_cbk(). ++ */ ++ local->postbuf = local->prebuf; ++ shard_rename_src_base_file(frame, this); ++ return 0; ++} ++ ++int ++shard_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ uint64_t dst_block_size = 0; ++ shard_local_t *local = NULL; ++ ++ if (IA_ISDIR(oldloc->inode->ia_type)) { ++ STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); ++ return 0; ++ } ++ ++ ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); ++ if ((ret) && (!IA_ISLNK(oldloc->inode->ia_type))) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(oldloc->inode->gfid)); ++ goto err; ++ } ++ ++ if (newloc->inode) ++ ret = shard_inode_ctx_get_block_size(newloc->inode, this, ++ &dst_block_size); ++ ++ /* The following stack_wind covers the case where: ++ * a. the src file is not sharded and dst doesn't exist, OR ++ * b. the src and dst both exist but are not sharded. ++ */ ++ if (((!block_size) && (!dst_block_size)) || ++ frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); ++ return 0; ++ } ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ loc_copy(&local->loc, oldloc); ++ loc_copy(&local->loc2, newloc); ++ local->resolver_base_inode = newloc->inode; ++ local->fop = GF_FOP_RENAME; ++ local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ ++ local->block_size = block_size; ++ local->dst_block_size = dst_block_size; ++ if (!this->itable) ++ this->itable = (local->loc.inode)->table; ++ local->resolve_not = _gf_true; ++ ++ /* The following if-block covers the case where the dst file exists ++ * and is sharded. ++ */ ++ if (local->dst_block_size) { ++ shard_begin_rm_resolution(frame, this); ++ } else { ++ /* The following block covers the case where the dst either doesn't ++ * exist or is NOT sharded but the src is sharded. In this case, shard ++ * xlator would go ahead and rename src to dst. Once done, it would also ++ * lookup the base shard of src to get the ia_size and ia_blocks xattr ++ * values. ++ */ ++ shard_rename_src_base_file(frame, this); ++ } ++ return 0; ++ ++err: ++ shard_common_failure_unwind(GF_FOP_RENAME, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, ++ struct iatt *stbuf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ int ret = -1; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret == -1) ++ goto unwind; ++ ++ ret = shard_inode_ctx_set(inode, this, stbuf, local->block_size, ++ SHARD_ALL_MASK); ++ if (ret) ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, ++ "Failed to set inode " ++ "ctx for %s", ++ uuid_utoa(inode->gfid)); ++ ++unwind: ++ SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, ++ preparent, postparent, xdata); ++ return 0; ++} ++ ++int ++shard_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ++ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) ++{ ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ priv = this->private; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ local->block_size = priv->block_size; ++ ++ if (!__is_gsyncd_on_shard_dir(frame, loc)) { ++ SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); ++ } ++ ++ STACK_WIND(frame, shard_create_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, ++ xdata); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_CREATE, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) ++{ ++ /* To-Do: Handle open with O_TRUNC under locks */ ++ SHARD_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); ++ return 0; ++} ++ ++int ++shard_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ++ fd_t *fd, dict_t *xdata) ++{ ++ STACK_WIND(frame, shard_open_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); ++ return 0; ++} ++ ++int ++shard_readv_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iovec *vector, ++ int32_t count, struct iatt *stbuf, struct iobref *iobref, ++ dict_t *xdata) ++{ ++ int i = 0; ++ int call_count = 0; ++ void *address = NULL; ++ uint64_t block_num = 0; ++ off_t off = 0; ++ struct iovec vec = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ fd_t *anon_fd = cookie; ++ shard_inode_ctx_t *ctx = NULL; ++ ++ local = frame->local; ++ ++ /* If shard has already seen a failure here before, there is no point ++ * in aggregating subsequent reads, so just go to out. 
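That early return on a previously recorded failure is one half of the fan-in pattern these callbacks share: several STACK_WINDs feed a single local, the first error wins, and whichever reply arrives last unwinds. A minimal standalone sketch of the pattern, assuming a plain pthread mutex in place of the frame lock (fanin_reply and struct fanin are illustrative names):

#include <pthread.h>

struct fanin {
    pthread_mutex_t lock; /* caller initializes */
    int pending;          /* replies still expected */
    int op_ret;           /* first error wins; >= 0 accumulates bytes */
    int op_errno;
};

/* Record one reply; returns how many replies remain outstanding. */
static int
fanin_reply(struct fanin *f, int op_ret, int op_errno)
{
    int left;

    pthread_mutex_lock(&f->lock);
    if (op_ret < 0 && f->op_ret >= 0) {
        f->op_ret = op_ret; /* keep only the first failure */
        f->op_errno = op_errno;
    } else if (f->op_ret >= 0) {
        f->op_ret += op_ret; /* aggregate bytes read so far */
    }
    left = --f->pending;
    pthread_mutex_unlock(&f->lock);
    return left; /* 0 => this was the last reply: unwind now */
}

In the patch itself shard_call_count_return() plays the role of the decrement, and SHARD_UNSET_ROOT_FS_ID plus the unwind run only in the call_count == 0 arm.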
++ */ ++ if (local->op_ret < 0) ++ goto out; ++ ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto out; ++ } ++ ++ if (local->op_ret >= 0) ++ local->op_ret += op_ret; ++ ++ shard_inode_ctx_get(anon_fd->inode, this, &ctx); ++ block_num = ctx->block_num; ++ ++ if (block_num == local->first_block) { ++ address = local->iobuf->ptr; ++ } else { ++ /* else ++ * address to start writing to = beginning of buffer + ++ * number of bytes until end of first block + ++ * + block_size times number of blocks ++ * between the current block and the first ++ */ ++ address = (char *)local->iobuf->ptr + ++ (local->block_size - (local->offset % local->block_size)) + ++ ((block_num - local->first_block - 1) * local->block_size); ++ } ++ ++ for (i = 0; i < count; i++) { ++ address = (char *)address + off; ++ memcpy(address, vector[i].iov_base, vector[i].iov_len); ++ off += vector[i].iov_len; ++ } ++ ++out: ++ if (anon_fd) ++ fd_unref(anon_fd); ++ call_count = shard_call_count_return(frame); ++ if (call_count == 0) { ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); ++ } else { ++ if (xdata) ++ local->xattr_rsp = dict_ref(xdata); ++ vec.iov_base = local->iobuf->ptr; ++ if (local->offset + local->req_size > local->prebuf.ia_size) ++ local->total_size = local->prebuf.ia_size - local->offset; ++ vec.iov_len = local->total_size; ++ local->op_ret = local->total_size; ++ SHARD_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, ++ &vec, 1, &local->prebuf, local->iobref, ++ local->xattr_rsp); ++ return 0; ++ } ++ } ++ ++ return 0; ++} ++ ++int ++shard_readv_do(call_frame_t *frame, xlator_t *this) ++{ ++ int i = 0; ++ int call_count = 0; ++ int last_block = 0; ++ int cur_block = 0; ++ off_t orig_offset = 0; ++ off_t shard_offset = 0; ++ size_t read_size = 0; ++ size_t remaining_size = 0; ++ fd_t *fd = NULL; ++ fd_t *anon_fd = NULL; ++ shard_local_t *local = NULL; ++ gf_boolean_t wind_failed = _gf_false; ++ ++ local = frame->local; ++ fd = local->fd; ++ ++ orig_offset = local->offset; ++ cur_block = local->first_block; ++ last_block = local->last_block; ++ remaining_size = local->total_size; ++ local->call_count = call_count = local->num_blocks; ++ ++ SHARD_SET_ROOT_FS_ID(frame, local); ++ ++ if (fd->flags & O_DIRECT) ++ local->flags = O_DIRECT; ++ ++ while (cur_block <= last_block) { ++ if (wind_failed) { ++ shard_readv_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, NULL, ++ 0, NULL, NULL, NULL); ++ goto next; ++ } + +- local = frame->local; ++ shard_offset = orig_offset % local->block_size; ++ read_size = local->block_size - shard_offset; ++ if (read_size > remaining_size) ++ read_size = remaining_size; ++ ++ remaining_size -= read_size; ++ ++ if (cur_block == 0) { ++ anon_fd = fd_ref(fd); ++ } else { ++ anon_fd = fd_anonymous(local->inode_list[i]); ++ if (!anon_fd) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ shard_readv_do_cbk(frame, (void *)(long)anon_fd, this, -1, ++ ENOMEM, NULL, 0, NULL, NULL, NULL); ++ goto next; ++ } ++ } + +- xattr_req = shard_create_gfid_dict(local->xattr_req); +- if (!xattr_req) +- goto err; ++ STACK_WIND_COOKIE(frame, shard_readv_do_cbk, anon_fd, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->readv, anon_fd, read_size, ++ shard_offset, local->flags, local->xattr_req); + +- STACK_WIND(frame, shard_lookup_marker_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, &local->newloc, xattr_req); +- 
dict_unref(xattr_req); +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, -1, op_errno); +- return 0; ++ orig_offset += read_size; ++ next: ++ cur_block++; ++ i++; ++ call_count--; ++ } ++ return 0; + } + +-int shard_create_marker_file_under_remove_me_cbk( +- call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- inode_t *linked_inode = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- +- local = frame->local; +- priv = this->private; +- +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- if (op_ret < 0) { +- if ((op_errno != EEXIST) && (op_errno != ENODATA)) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, +- "Marker file creation " +- "failed while performing %s; entry gfid=%s", +- gf_fop_string(local->fop), local->newloc.name); +- goto err; +- } else { +- shard_lookup_marker_file(frame, this); +- return 0; ++int ++shard_common_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ int shard_block_num = (long)cookie; ++ int call_count = 0; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret < 0) { ++ if (op_errno == EEXIST) { ++ LOCK(&frame->lock); ++ { ++ local->eexist_count++; ++ } ++ UNLOCK(&frame->lock); ++ } else { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ } ++ gf_msg_debug(this->name, 0, ++ "mknod of shard %d " ++ "failed: %s", ++ shard_block_num, strerror(op_errno)); ++ goto done; + } +- } + +- linked_inode = +- inode_link(inode, priv->dot_shard_rm_inode, local->newloc.name, buf); +- inode_unref(local->newloc.inode); +- local->newloc.inode = linked_inode; ++ shard_link_block_inode(local, shard_block_num, inode, buf); + +- if (local->fop == GF_FOP_UNLINK) +- shard_unlink_base_file(frame, this); +- else if (local->fop == GF_FOP_RENAME) +- shard_rename_src_base_file(frame, this); +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); +- return 0; +-} +- +-int shard_create_marker_file_under_remove_me(call_frame_t *frame, +- xlator_t *this, loc_t *loc) { +- int ret = 0; +- int op_errno = ENOMEM; +- uint64_t bs = 0; +- char g1[64] = { +- 0, +- }; +- char g2[64] = { +- 0, +- }; +- dict_t *xattr_req = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- +- priv = this->private; +- local = frame->local; +- +- SHARD_SET_ROOT_FS_ID(frame, local); +- +- xattr_req = shard_create_gfid_dict(local->xattr_req); +- if (!xattr_req) +- goto err; +- +- local->newloc.inode = inode_new(this->itable); +- local->newloc.parent = inode_ref(priv->dot_shard_rm_inode); +- ret = inode_path(local->newloc.parent, uuid_utoa(loc->inode->gfid), +- (char **)&local->newloc.path); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on " +- "pargfid=%s bname=%s", +- uuid_utoa_r(priv->dot_shard_rm_gfid, g1), +- uuid_utoa_r(loc->inode->gfid, g2)); +- goto err; +- } +- local->newloc.name = strrchr(local->newloc.path, '/'); +- if (local->newloc.name) +- local->newloc.name++; +- +- if (local->fop == GF_FOP_UNLINK) +- bs = local->block_size; +- else if (local->fop == GF_FOP_RENAME) +- bs = local->dst_block_size; +- +- SHARD_INODE_CREATE_INIT(this, bs, xattr_req, &local->newloc, 
+- local->prebuf.ia_size, 0, err); +- +- STACK_WIND(frame, shard_create_marker_file_under_remove_me_cbk, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, &local->newloc, +- 0, 0, 0644, xattr_req); +- dict_unref(xattr_req); +- return 0; ++done: ++ call_count = shard_call_count_return(frame); ++ if (call_count == 0) { ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ local->create_count = 0; ++ local->post_mknod_handler(frame, this); ++ } + +-err: +- if (xattr_req) +- dict_unref(xattr_req); +- shard_create_marker_file_under_remove_me_cbk(frame, 0, this, -1, op_errno, +- NULL, NULL, NULL, NULL, NULL); +- return 0; ++ return 0; + } + +-int shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); +- +-int shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, int32_t op_errno, +- struct iatt *preparent, struct iatt *postparent, +- dict_t *xdata) { +- int ret = 0; +- shard_local_t *local = NULL; ++int ++shard_common_resume_mknod(call_frame_t *frame, xlator_t *this, ++ shard_post_mknod_fop_handler_t post_mknod_handler) ++{ ++ int i = 0; ++ int shard_idx_iter = 0; ++ int last_block = 0; ++ int ret = 0; ++ int call_count = 0; ++ char path[PATH_MAX] = { ++ 0, ++ }; ++ mode_t mode = 0; ++ char *bname = NULL; ++ shard_priv_t *priv = NULL; ++ shard_inode_ctx_t ctx_tmp = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ gf_boolean_t wind_failed = _gf_false; ++ fd_t *fd = NULL; ++ loc_t loc = { ++ 0, ++ }; ++ dict_t *xattr_req = NULL; + +- local = frame->local; ++ local = frame->local; ++ priv = this->private; ++ fd = local->fd; ++ shard_idx_iter = local->first_block; ++ last_block = local->last_block; ++ call_count = local->call_count = local->create_count; ++ local->post_mknod_handler = post_mknod_handler; + +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- } else { +- shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); +- local->preoldparent = *preparent; +- local->postoldparent = *postparent; +- if (xdata) +- local->xattr_rsp = dict_ref(xdata); +- if (local->cleanup_required) +- shard_start_background_deletion(this); +- } ++ SHARD_SET_ROOT_FS_ID(frame, local); + +- if (local->entrylk_frame) { +- ret = shard_unlock_entrylk(frame, this); +- if (ret < 0) { +- local->op_ret = -1; +- local->op_errno = -ret; ++ ret = shard_inode_ctx_get_all(fd->inode, this, &ctx_tmp); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get inode " ++ "ctx for %s", ++ uuid_utoa(fd->inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto err; + } +- } ++ mode = st_mode_from_ia(ctx_tmp.stat.ia_prot, ctx_tmp.stat.ia_type); + +- ret = shard_unlock_inodelk(frame, this); +- if (ret < 0) { +- local->op_ret = -1; +- local->op_errno = -ret; +- } +- +- shard_unlink_cbk(frame, this); +- return 0; +-} +- +-int shard_unlink_base_file(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = frame->local; +- +- /* To-Do: Request open-fd count on base file */ +- STACK_WIND(frame, shard_unlink_base_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, +- local->xattr_req); +- return 0; +-} +- +-int shard_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *xdata) { +- if (op_ret) +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, +- "Unlock failed. 
Please check brick logs for " +- "more details"); +- SHARD_STACK_DESTROY(frame); +- return 0; +-} +- +-int shard_unlock_entrylk(call_frame_t *frame, xlator_t *this) { +- loc_t *loc = NULL; +- call_frame_t *lk_frame = NULL; +- shard_local_t *local = NULL; +- shard_local_t *lk_local = NULL; +- shard_entrylk_t *lock = NULL; +- +- local = frame->local; +- lk_frame = local->entrylk_frame; +- lk_local = lk_frame->local; +- local->entrylk_frame = NULL; +- lock = &lk_local->int_entrylk; +- loc = &lock->loc; +- +- STACK_WIND(lk_frame, shard_unlock_entrylk_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->entrylk, this->name, loc, +- lk_local->int_entrylk.basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, +- NULL); +- local->int_entrylk.acquired_lock = _gf_false; +- return 0; +-} +- +-int shard_post_entrylk_fop_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- switch (local->fop) { +- case GF_FOP_UNLINK: +- case GF_FOP_RENAME: +- shard_create_marker_file_under_remove_me(frame, this, +- &local->int_inodelk.loc); +- break; +- default: +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "post-entrylk handler not defined. This case should not" +- " be hit"); +- break; +- } +- return 0; +-} +- +-int shard_acquire_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *xdata) { +- call_frame_t *main_frame = NULL; +- shard_local_t *local = NULL; +- shard_local_t *main_local = NULL; +- +- local = frame->local; +- main_frame = local->main_frame; +- main_local = main_frame->local; +- +- if (local->op_ret < 0) { +- shard_common_failure_unwind(main_local->fop, main_frame, op_ret, op_errno); +- return 0; +- } +- main_local->int_entrylk.acquired_lock = _gf_true; +- shard_post_entrylk_fop_handler(main_frame, this); +- return 0; +-} +- +-int shard_acquire_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, +- uuid_t gfid) { +- char gfid_str[GF_UUID_BUF_SIZE] = { +- 0, +- }; +- shard_local_t *local = NULL; +- shard_local_t *entrylk_local = NULL; +- shard_entrylk_t *int_entrylk = NULL; +- call_frame_t *entrylk_frame = NULL; +- +- local = frame->local; +- entrylk_frame = create_frame(this, this->ctx->pool); +- if (!entrylk_frame) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create new frame " +- "to lock marker file"); +- goto err; +- } +- +- entrylk_local = mem_get0(this->local_pool); +- if (!entrylk_local) { +- STACK_DESTROY(entrylk_frame->root); +- goto err; +- } +- +- entrylk_frame->local = entrylk_local; +- entrylk_local->main_frame = frame; +- int_entrylk = &entrylk_local->int_entrylk; +- +- int_entrylk->loc.inode = inode_ref(inode); +- set_lk_owner_from_ptr(&entrylk_frame->root->lk_owner, entrylk_frame->root); +- local->entrylk_frame = entrylk_frame; +- gf_uuid_unparse(gfid, gfid_str); +- int_entrylk->basename = gf_strdup(gfid_str); +- +- STACK_WIND(entrylk_frame, shard_acquire_entrylk_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->entrylk, this->name, &int_entrylk->loc, +- int_entrylk->basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; +-} ++ while (shard_idx_iter <= last_block) { ++ if (local->inode_list[i]) { ++ shard_idx_iter++; ++ i++; ++ continue; ++ } + +-int shard_post_lookup_base_shard_rm_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; ++ if (wind_failed) { ++ 
shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, ++ -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); ++ goto next; ++ } + +- priv = this->private; +- local = frame->local; ++ shard_make_block_abspath(shard_idx_iter, fd->inode->gfid, path, ++ sizeof(path)); ++ ++ xattr_req = shard_create_gfid_dict(local->xattr_req); ++ if (!xattr_req) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, ++ -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); ++ goto next; ++ } ++ ++ bname = strrchr(path, '/') + 1; ++ loc.inode = inode_new(this->itable); ++ loc.parent = inode_ref(priv->dot_shard_inode); ++ ret = inode_path(loc.parent, bname, (char **)&(loc.path)); ++ if (ret < 0 || !(loc.inode)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed" ++ "on %s, base file gfid = %s", ++ bname, uuid_utoa(fd->inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ loc_wipe(&loc); ++ dict_unref(xattr_req); ++ shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, ++ -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); ++ goto next; ++ } ++ ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ++ STACK_WIND_COOKIE(frame, shard_common_mknod_cbk, ++ (void *)(long)shard_idx_iter, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->mknod, &loc, mode, ++ ctx_tmp.stat.ia_rdev, 0, xattr_req); ++ loc_wipe(&loc); ++ dict_unref(xattr_req); ++ ++ next: ++ shard_idx_iter++; ++ i++; ++ if (!--call_count) ++ break; ++ } + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; +- } +- +- if (local->prebuf.ia_nlink > 1) { +- gf_msg_debug(this->name, 0, "link count on %s > 1:%d, " +- "performing rename()/unlink()", +- local->int_inodelk.loc.path, local->prebuf.ia_nlink); +- if (local->fop == GF_FOP_RENAME) +- shard_rename_src_base_file(frame, this); +- else if (local->fop == GF_FOP_UNLINK) +- shard_unlink_base_file(frame, this); +- } else { +- gf_msg_debug(this->name, 0, "link count on %s = 1, creating " +- "file under .remove_me", +- local->int_inodelk.loc.path); +- local->cleanup_required = _gf_true; +- shard_acquire_entrylk(frame, this, priv->dot_shard_rm_inode, +- local->prebuf.ia_gfid); +- } +- return 0; +-} +- +-int shard_post_inodelk_fop_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- switch (local->fop) { +- case GF_FOP_UNLINK: +- case GF_FOP_RENAME: +- shard_lookup_base_file(frame, this, &local->int_inodelk.loc, +- shard_post_lookup_base_shard_rm_handler); +- break; +- default: +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "post-inodelk handler not defined. 
This case should not" +- " be hit"); +- break; +- } +- return 0; +-} +- +-int shard_acquire_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *xdata) { +- call_frame_t *main_frame = NULL; +- shard_local_t *local = NULL; +- shard_local_t *main_local = NULL; +- +- local = frame->local; +- main_frame = local->main_frame; +- main_local = main_frame->local; +- +- if (local->op_ret < 0) { +- shard_common_failure_unwind(main_local->fop, main_frame, op_ret, op_errno); +- return 0; +- } +- main_local->int_inodelk.acquired_lock = _gf_true; +- shard_post_inodelk_fop_handler(main_frame, this); +- return 0; +-} +- +-int shard_acquire_inodelk(call_frame_t *frame, xlator_t *this, loc_t *loc) { +- call_frame_t *lk_frame = NULL; +- shard_local_t *local = NULL; +- shard_local_t *lk_local = NULL; +- shard_inodelk_t *int_inodelk = NULL; +- +- local = frame->local; +- lk_frame = create_frame(this, this->ctx->pool); +- if (!lk_frame) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create new frame " +- "to lock base shard"); +- goto err; +- } +- lk_local = mem_get0(this->local_pool); +- if (!lk_local) { +- STACK_DESTROY(lk_frame->root); +- goto err; +- } +- +- lk_frame->local = lk_local; +- lk_local->main_frame = frame; +- int_inodelk = &lk_local->int_inodelk; +- +- int_inodelk->flock.l_len = 0; +- int_inodelk->flock.l_start = 0; +- int_inodelk->domain = this->name; +- int_inodelk->flock.l_type = F_WRLCK; +- loc_copy(&local->int_inodelk.loc, loc); +- set_lk_owner_from_ptr(&lk_frame->root->lk_owner, lk_frame->root); +- local->inodelk_frame = lk_frame; +- +- STACK_WIND(lk_frame, shard_acquire_inodelk_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->inodelk, int_inodelk->domain, +- &local->int_inodelk.loc, F_SETLKW, &int_inodelk->flock, NULL); +- return 0; + err: +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; ++ /* ++ * This block is for handling failure in shard_inode_ctx_get_all(). ++ * Failures in the while-loop are handled within the loop. 
++ */ ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ post_mknod_handler(frame, this); ++ return 0; + } + +-int shard_post_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) { +- loc_t *loc = NULL; +- shard_local_t *local = NULL; ++int ++shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this); + +- local = frame->local; ++int ++shard_post_lookup_shards_readv_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); +- return 0; +- } +- if (local->fop == GF_FOP_UNLINK) +- loc = &local->loc; +- else if (local->fop == GF_FOP_RENAME) +- loc = &local->loc2; +- shard_acquire_inodelk(frame, this, loc); +- return 0; +-} ++ local = frame->local; + +-int shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t handler, +- shard_internal_dir_type_t type); +-int shard_pre_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +- local = frame->local; ++ if (local->create_count) { ++ shard_common_resume_mknod(frame, this, shard_post_mknod_readv_handler); ++ } else { ++ shard_readv_do(frame, this); ++ } + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; +- } +- shard_mkdir_internal_dir(frame, this, shard_post_mkdir_rm_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); +- return 0; + } + +-void shard_begin_rm_resolution(call_frame_t *frame, xlator_t *this) { +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; ++int ++shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- priv = this->private; +- local = frame->local; ++ local = frame->local; + +- local->dot_shard_rm_loc.inode = +- inode_find(this->itable, priv->dot_shard_rm_gfid); +- if (!local->dot_shard_rm_loc.inode) { +- local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); +- if (!local->dot_shard_loc.inode) { +- shard_mkdir_internal_dir(frame, this, shard_pre_mkdir_rm_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- } else { +- local->post_res_handler = shard_pre_mkdir_rm_handler; +- shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); +- } +- } else { +- local->post_res_handler = shard_post_mkdir_rm_handler; +- shard_refresh_internal_dir(frame, this, +- SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); +- } +-} +- +-int shard_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, +- dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; +- +- ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); +- if ((ret) && (!IA_ISLNK(loc->inode->ia_type))) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(loc->inode->gfid)); +- goto err; +- } +- +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); +- return 0; +- } +- +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- +- loc_copy(&local->loc, loc); +- local->xflag = xflag; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- local->block_size = block_size; +- local->resolver_base_inode = loc->inode; +- local->fop = GF_FOP_UNLINK; +- if (!this->itable) +- this->itable = (local->loc.inode)->table; +- +- local->resolve_not = _gf_true; +- shard_begin_rm_resolution(frame, this); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_UNLINK, frame, -1, ENOMEM); +- return 0; +-} ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +-int shard_post_rename_lookup_handler(call_frame_t *frame, xlator_t *this) { +- shard_rename_cbk(frame, this); +- return 0; ++ if (!local->eexist_count) { ++ shard_readv_do(frame, this); ++ } else { ++ local->call_count = local->eexist_count; ++ shard_common_lookup_shards(frame, this, local->loc.inode, ++ shard_post_lookup_shards_readv_handler); ++ } ++ return 0; + } + +-int shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *buf, +- struct iatt *preoldparent, struct iatt *postoldparent, +- struct iatt *prenewparent, struct iatt *postnewparent, +- dict_t *xdata) { +- int ret = 0; +- shard_local_t *local = NULL; +- +- local = frame->local; ++int ++shard_post_resolve_readv_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto err; +- } +- /* Set ctx->refresh to TRUE to force a lookup on disk when +- * shard_lookup_base_file() is called next to refresh the hard link +- * count in ctx. Note that this is applicable only to the case where +- * the rename dst is already existent and sharded. +- */ +- if ((local->dst_block_size) && (!local->cleanup_required)) +- shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); +- +- local->prebuf = *buf; +- local->preoldparent = *preoldparent; +- local->postoldparent = *postoldparent; +- local->prenewparent = *prenewparent; +- local->postnewparent = *postnewparent; +- if (xdata) +- local->xattr_rsp = dict_ref(xdata); ++ local = frame->local; + +- if (local->dst_block_size) { +- if (local->entrylk_frame) { +- ret = shard_unlock_entrylk(frame, this); +- if (ret < 0) { +- local->op_ret = -1; +- local->op_errno = -ret; +- } ++ if (local->op_ret < 0) { ++ if (local->op_errno != ENOENT) { ++ shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } else { ++ struct iovec vec = { ++ 0, ++ }; ++ ++ vec.iov_base = local->iobuf->ptr; ++ vec.iov_len = local->total_size; ++ local->op_ret = local->total_size; ++ SHARD_STACK_UNWIND(readv, frame, local->op_ret, 0, &vec, 1, ++ &local->prebuf, local->iobref, NULL); ++ return 0; ++ } + } + +- ret = shard_unlock_inodelk(frame, this); +- if (ret < 0) { +- local->op_ret = -1; +- local->op_errno = -ret; +- goto err; +- } +- if (local->cleanup_required) +- shard_start_background_deletion(this); +- } +- +- /* Now the base file of src, if sharded, is looked up to gather ia_size +- * and ia_blocks.*/ +- if (local->block_size) { +- local->tmp_loc.inode = inode_new(this->itable); +- gf_uuid_copy(local->tmp_loc.gfid, (local->loc.inode)->gfid); +- shard_lookup_base_file(frame, this, &local->tmp_loc, +- shard_post_rename_lookup_handler); +- } else { +- shard_rename_cbk(frame, this); +- } +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +-} +- +-int shard_post_lookup_dst_base_file_handler(call_frame_t *frame, +- 
xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; ++ if (local->call_count) { ++ shard_common_lookup_shards(frame, this, local->resolver_base_inode, ++ shard_post_lookup_shards_readv_handler); ++ } else { ++ shard_readv_do(frame, this); ++ } + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); + return 0; +- } +- +- /* Save dst base file attributes into postbuf so the information is not +- * lost when it is overwritten after lookup on base file of src in +- * shard_lookup_base_file_cbk(). +- */ +- local->postbuf = local->prebuf; +- shard_rename_src_base_file(frame, this); +- return 0; +-} +- +-int shard_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, +- loc_t *newloc, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- uint64_t dst_block_size = 0; +- shard_local_t *local = NULL; +- +- if (IA_ISDIR(oldloc->inode->ia_type)) { +- STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); +- return 0; +- } +- +- ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); +- if ((ret) && (!IA_ISLNK(oldloc->inode->ia_type))) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(oldloc->inode->gfid)); +- goto err; +- } +- +- if (newloc->inode) +- ret = shard_inode_ctx_get_block_size(newloc->inode, this, &dst_block_size); +- +- /* The following stack_wind covers the case where: +- * a. the src file is not sharded and dst doesn't exist, OR +- * b. the src and dst both exist but are not sharded. +- */ +- if (((!block_size) && (!dst_block_size)) || +- frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); +- return 0; +- } +- +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- loc_copy(&local->loc, oldloc); +- loc_copy(&local->loc2, newloc); +- local->resolver_base_inode = newloc->inode; +- local->fop = GF_FOP_RENAME; +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- +- local->block_size = block_size; +- local->dst_block_size = dst_block_size; +- if (!this->itable) +- this->itable = (local->loc.inode)->table; +- local->resolve_not = _gf_true; +- +- /* The following if-block covers the case where the dst file exists +- * and is sharded. +- */ +- if (local->dst_block_size) { +- shard_begin_rm_resolution(frame, this); +- } else { +- /* The following block covers the case where the dst either doesn't +- * exist or is NOT sharded but the src is sharded. In this case, shard +- * xlator would go ahead and rename src to dst. Once done, it would also +- * lookup the base shard of src to get the ia_size and ia_blocks xattr +- * values. 
+- */ +- shard_rename_src_base_file(frame, this); +- } +- return 0; +- +-err: +- shard_common_failure_unwind(GF_FOP_RENAME, frame, -1, ENOMEM); +- return 0; + } + +-int shard_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, +- struct iatt *stbuf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- int ret = -1; +- shard_local_t *local = NULL; ++int ++shard_post_lookup_readv_handler(call_frame_t *frame, xlator_t *this) ++{ ++ int ret = 0; ++ struct iobuf *iobuf = NULL; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; + +- local = frame->local; ++ priv = this->private; ++ local = frame->local; + +- if (op_ret == -1) +- goto unwind; ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +- ret = shard_inode_ctx_set(inode, this, stbuf, local->block_size, +- SHARD_ALL_MASK); +- if (ret) +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, +- "Failed to set inode " +- "ctx for %s", +- uuid_utoa(inode->gfid)); ++ if (local->offset >= local->prebuf.ia_size) { ++ /* If the read is being performed past the end of the file, ++ * unwind the FOP with 0 bytes read as status. ++ */ ++ struct iovec vec = { ++ 0, ++ }; + +-unwind: +- SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, +- preparent, postparent, xdata); +- return 0; +-} ++ iobuf = iobuf_get2(this->ctx->iobuf_pool, local->req_size); ++ if (!iobuf) ++ goto err; + +-int shard_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, +- mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; ++ vec.iov_base = iobuf->ptr; ++ vec.iov_len = 0; ++ local->iobref = iobref_new(); ++ iobref_add(local->iobref, iobuf); ++ iobuf_unref(iobuf); + +- priv = this->private; +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ SHARD_STACK_UNWIND(readv, frame, 0, 0, &vec, 1, &local->prebuf, ++ local->iobref, NULL); ++ return 0; ++ } + +- frame->local = local; +- local->block_size = priv->block_size; ++ local->first_block = get_lowest_block(local->offset, local->block_size); + +- if (!__is_gsyncd_on_shard_dir(frame, loc)) { +- SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); +- } ++ local->total_size = local->req_size; + +- STACK_WIND(frame, shard_create_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, +- xdata); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_CREATE, frame, -1, ENOMEM); +- return 0; +-} +- +-int shard_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { +- /* To-Do: Handle open with O_TRUNC under locks */ +- SHARD_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); +- return 0; +-} +- +-int shard_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, +- fd_t *fd, dict_t *xdata) { +- STACK_WIND(frame, shard_open_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); +- return 0; +-} +- +-int shard_readv_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iovec *vector, +- int32_t count, struct iatt *stbuf, struct iobref *iobref, +- dict_t *xdata) { +- int i = 0; +- int call_count = 0; +- void *address = NULL; +- uint64_t block_num = 0; +- off_t off = 0; +- struct iovec vec = { +- 0, +- }; +- 
shard_local_t *local = NULL; +- fd_t *anon_fd = cookie; +- shard_inode_ctx_t *ctx = NULL; +- +- local = frame->local; +- +- /* If shard has already seen a failure here before, there is no point +- * in aggregating subsequent reads, so just go to out. +- */ +- if (local->op_ret < 0) +- goto out; +- +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto out; +- } ++ local->last_block = get_highest_block(local->offset, local->total_size, ++ local->block_size); + +- if (local->op_ret >= 0) +- local->op_ret += op_ret; ++ local->num_blocks = local->last_block - local->first_block + 1; ++ GF_ASSERT(local->num_blocks > 0); ++ local->resolver_base_inode = local->loc.inode; + +- shard_inode_ctx_get(anon_fd->inode, this, &ctx); +- block_num = ctx->block_num; +- +- if (block_num == local->first_block) { +- address = local->iobuf->ptr; +- } else { +- /* else +- * address to start writing to = beginning of buffer + +- * number of bytes until end of first block + +- * + block_size times number of blocks +- * between the current block and the first +- */ +- address = (char *)local->iobuf->ptr + +- (local->block_size - (local->offset % local->block_size)) + +- ((block_num - local->first_block - 1) * local->block_size); +- } ++ local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), ++ gf_shard_mt_inode_list); ++ if (!local->inode_list) ++ goto err; + +- for (i = 0; i < count; i++) { +- address = (char *)address + off; +- memcpy(address, vector[i].iov_base, vector[i].iov_len); +- off += vector[i].iov_len; +- } ++ iobuf = iobuf_get2(this->ctx->iobuf_pool, local->total_size); ++ if (!iobuf) ++ goto err; + +-out: +- if (anon_fd) +- fd_unref(anon_fd); +- call_count = shard_call_count_return(frame); +- if (call_count == 0) { +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, +- local->op_errno); +- } else { +- if (xdata) +- local->xattr_rsp = dict_ref(xdata); +- vec.iov_base = local->iobuf->ptr; +- if (local->offset + local->req_size > local->prebuf.ia_size) +- local->total_size = local->prebuf.ia_size - local->offset; +- vec.iov_len = local->total_size; +- local->op_ret = local->total_size; +- SHARD_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, &vec, 1, +- &local->prebuf, local->iobref, local->xattr_rsp); +- return 0; +- } +- } +- +- return 0; +-} +- +-int shard_readv_do(call_frame_t *frame, xlator_t *this) { +- int i = 0; +- int call_count = 0; +- int last_block = 0; +- int cur_block = 0; +- off_t orig_offset = 0; +- off_t shard_offset = 0; +- size_t read_size = 0; +- size_t remaining_size = 0; +- fd_t *fd = NULL; +- fd_t *anon_fd = NULL; +- shard_local_t *local = NULL; +- gf_boolean_t wind_failed = _gf_false; +- +- local = frame->local; +- fd = local->fd; +- +- orig_offset = local->offset; +- cur_block = local->first_block; +- last_block = local->last_block; +- remaining_size = local->total_size; +- local->call_count = call_count = local->num_blocks; +- +- SHARD_SET_ROOT_FS_ID(frame, local); +- +- if (fd->flags & O_DIRECT) +- local->flags = O_DIRECT; +- +- while (cur_block <= last_block) { +- if (wind_failed) { +- shard_readv_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, NULL, 0, +- NULL, NULL, NULL); +- goto next; +- } +- +- shard_offset = orig_offset % local->block_size; +- read_size = local->block_size - shard_offset; +- if (read_size > remaining_size) +- read_size = remaining_size; +- +- remaining_size -= read_size; +- +- if (cur_block == 0) { +- anon_fd = 
fd_ref(fd); +- } else { +- anon_fd = fd_anonymous(local->inode_list[i]); +- if (!anon_fd) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- shard_readv_do_cbk(frame, (void *)(long)anon_fd, this, -1, ENOMEM, NULL, +- 0, NULL, NULL, NULL); +- goto next; +- } ++ local->iobref = iobref_new(); ++ if (!local->iobref) { ++ iobuf_unref(iobuf); ++ goto err; + } + +- STACK_WIND_COOKIE(frame, shard_readv_do_cbk, anon_fd, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readv, anon_fd, read_size, +- shard_offset, local->flags, local->xattr_req); ++ if (iobref_add(local->iobref, iobuf) != 0) { ++ iobuf_unref(iobuf); ++ goto err; ++ } + +- orig_offset += read_size; +- next: +- cur_block++; +- i++; +- call_count--; +- } +- return 0; +-} ++ memset(iobuf->ptr, 0, local->total_size); ++ iobuf_unref(iobuf); ++ local->iobuf = iobuf; + +-int shard_common_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- int shard_block_num = (long)cookie; +- int call_count = 0; +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if (op_ret < 0) { +- if (op_errno == EEXIST) { +- LOCK(&frame->lock); +- { local->eexist_count++; } +- UNLOCK(&frame->lock); ++ local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); ++ if (!local->dot_shard_loc.inode) { ++ ret = shard_init_internal_dir_loc(this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ if (ret) ++ goto err; ++ shard_lookup_internal_dir(frame, this, shard_post_resolve_readv_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } else { +- local->op_ret = op_ret; +- local->op_errno = op_errno; ++ local->post_res_handler = shard_post_resolve_readv_handler; ++ shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); + } +- gf_msg_debug(this->name, 0, "mknod of shard %d " +- "failed: %s", +- shard_block_num, strerror(op_errno)); +- goto done; +- } +- +- shard_link_block_inode(local, shard_block_num, inode, buf); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); ++ return 0; ++} + +-done: +- call_count = shard_call_count_return(frame); +- if (call_count == 0) { +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- local->create_count = 0; +- local->post_mknod_handler(frame, this); +- } +- +- return 0; +-} +- +-int shard_common_resume_mknod( +- call_frame_t *frame, xlator_t *this, +- shard_post_mknod_fop_handler_t post_mknod_handler) { +- int i = 0; +- int shard_idx_iter = 0; +- int last_block = 0; +- int ret = 0; +- int call_count = 0; +- char path[PATH_MAX] = { +- 0, +- }; +- mode_t mode = 0; +- char *bname = NULL; +- shard_priv_t *priv = NULL; +- shard_inode_ctx_t ctx_tmp = { +- 0, +- }; +- shard_local_t *local = NULL; +- gf_boolean_t wind_failed = _gf_false; +- fd_t *fd = NULL; +- loc_t loc = { +- 0, +- }; +- dict_t *xattr_req = NULL; +- +- local = frame->local; +- priv = this->private; +- fd = local->fd; +- shard_idx_iter = local->first_block; +- last_block = local->last_block; +- call_count = local->call_count = local->create_count; +- local->post_mknod_handler = post_mknod_handler; +- +- SHARD_SET_ROOT_FS_ID(frame, local); +- +- ret = shard_inode_ctx_get_all(fd->inode, this, &ctx_tmp); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get inode " +- "ctx for %s", +- uuid_utoa(fd->inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto err; +- } +- mode = 
st_mode_from_ia(ctx_tmp.stat.ia_prot, ctx_tmp.stat.ia_type); ++int ++shard_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t offset, uint32_t flags, dict_t *xdata) ++{ ++ int ret = 0; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +- while (shard_idx_iter <= last_block) { +- if (local->inode_list[i]) { +- shard_idx_iter++; +- i++; +- continue; ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size for %s from its inode ctx", ++ uuid_utoa(fd->inode->gfid)); ++ goto err; + } + +- if (wind_failed) { +- shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, -1, +- ENOMEM, NULL, NULL, NULL, NULL, NULL); +- goto next; ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ /* block_size = 0 means that the file was created before ++ * sharding was enabled on the volume. ++ */ ++ STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, ++ xdata); ++ return 0; + } + +- shard_make_block_abspath(shard_idx_iter, fd->inode->gfid, path, +- sizeof(path)); +- +- xattr_req = shard_create_gfid_dict(local->xattr_req); +- if (!xattr_req) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, -1, +- ENOMEM, NULL, NULL, NULL, NULL, NULL); +- goto next; +- } ++ if (!this->itable) ++ this->itable = fd->inode->table; + +- bname = strrchr(path, '/') + 1; +- loc.inode = inode_new(this->itable); +- loc.parent = inode_ref(priv->dot_shard_inode); +- ret = inode_path(loc.parent, bname, (char **)&(loc.path)); +- if (ret < 0 || !(loc.inode)) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- "on %s, base file gfid = %s", +- bname, uuid_utoa(fd->inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- loc_wipe(&loc); +- dict_unref(xattr_req); +- shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, -1, +- ENOMEM, NULL, NULL, NULL, NULL, NULL); +- goto next; +- } ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; ++ frame->local = local; + +- STACK_WIND_COOKIE(frame, shard_common_mknod_cbk, +- (void *)(long)shard_idx_iter, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->mknod, &loc, mode, +- ctx_tmp.stat.ia_rdev, 0, xattr_req); +- loc_wipe(&loc); +- dict_unref(xattr_req); ++ ret = syncbarrier_init(&local->barrier); ++ if (ret) ++ goto err; ++ local->fd = fd_ref(fd); ++ local->block_size = block_size; ++ local->offset = offset; ++ local->req_size = size; ++ local->flags = flags; ++ local->fop = GF_FOP_READ; ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; + +- next: +- shard_idx_iter++; +- i++; +- if (!--call_count) +- break; +- } ++ local->loc.inode = inode_ref(fd->inode); ++ gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + +- return 0; ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_post_lookup_readv_handler); ++ return 0; + err: +- /* +- * This block is for handling failure in shard_inode_ctx_get_all(). +- * Failures in the while-loop are handled within the loop. 
+- */ +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- post_mknod_handler(frame, this); +- return 0; ++ shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this); +- +-int shard_post_lookup_shards_readv_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; ++int ++shard_common_inode_write_post_update_size_handler(call_frame_t *frame, ++ xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, +- local->op_errno); ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ } else { ++ shard_common_inode_write_success_unwind(local->fop, frame, ++ local->written_size); ++ } + return 0; +- } +- +- if (local->create_count) { +- shard_common_resume_mknod(frame, this, shard_post_mknod_readv_handler); +- } else { +- shard_readv_do(frame, this); +- } +- +- return 0; + } + +-int shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++static gf_boolean_t ++shard_is_appending_write(shard_local_t *local) ++{ ++ if (local->fop != GF_FOP_WRITE) ++ return _gf_false; ++ if (local->flags & O_APPEND) ++ return _gf_true; ++ if (local->fd->flags & O_APPEND) ++ return _gf_true; ++ return _gf_false; ++} + +- local = frame->local; ++int ++__shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, ++ xlator_t *this) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, +- local->op_errno); +- return 0; +- } ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- if (!local->eexist_count) { +- shard_readv_do(frame, this); +- } else { +- local->call_count = local->eexist_count; +- shard_common_lookup_shards(frame, this, local->loc.inode, +- shard_post_lookup_shards_readv_handler); +- } +- return 0; +-} ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +-int shard_post_resolve_readv_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++ if (shard_is_appending_write(local)) { ++ local->delta_size = local->total_size; ++ } else if (local->offset + local->total_size > ctx->stat.ia_size) { ++ local->delta_size = (local->offset + local->total_size) - ++ ctx->stat.ia_size; ++ } else { ++ local->delta_size = 0; ++ } ++ ctx->stat.ia_size += (local->delta_size); ++ local->postbuf = ctx->stat; + +- local = frame->local; ++ return 0; ++} + +- if (local->op_ret < 0) { +- if (local->op_errno != ENOENT) { +- shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, +- local->op_errno); +- return 0; +- } else { +- struct iovec vec = { +- 0, +- }; ++int ++shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, ++ xlator_t *this) ++{ ++ int ret = -1; + +- vec.iov_base = local->iobuf->ptr; +- vec.iov_len = local->total_size; +- local->op_ret = local->total_size; +- SHARD_STACK_UNWIND(readv, frame, local->op_ret, 0, &vec, 1, +- &local->prebuf, local->iobref, NULL); +- return 0; ++ LOCK(&inode->lock); ++ { ++ ret = __shard_get_delta_size_from_inode_ctx(local, inode, this); + } +- } ++ UNLOCK(&inode->lock); + +- if (local->call_count) { +- shard_common_lookup_shards(frame, this, local->resolver_base_inode, +- shard_post_lookup_shards_readv_handler); +- } else 
{ +- shard_readv_do(frame, this); +- } +- +- return 0; ++ return ret; + } + +-int shard_post_lookup_readv_handler(call_frame_t *frame, xlator_t *this) { +- int ret = 0; +- struct iobuf *iobuf = NULL; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- local = frame->local; ++int ++shard_common_inode_write_do_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, struct iatt *pre, ++ struct iatt *post, dict_t *xdata) ++{ ++ int call_count = 0; ++ fd_t *anon_fd = cookie; ++ shard_local_t *local = NULL; ++ glusterfs_fop_t fop = 0; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, +- local->op_errno); +- return 0; +- } ++ local = frame->local; ++ fop = local->fop; + +- if (local->offset >= local->prebuf.ia_size) { +- /* If the read is being performed past the end of the file, +- * unwind the FOP with 0 bytes read as status. +- */ +- struct iovec vec = { +- 0, +- }; ++ LOCK(&frame->lock); ++ { ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ } else { ++ local->written_size += op_ret; ++ GF_ATOMIC_ADD(local->delta_blocks, ++ post->ia_blocks - pre->ia_blocks); ++ local->delta_size += (post->ia_size - pre->ia_size); ++ shard_inode_ctx_set(local->fd->inode, this, post, 0, ++ SHARD_MASK_TIMES); ++ if (local->fd->inode != anon_fd->inode) ++ shard_inode_ctx_add_to_fsync_list(local->fd->inode, this, ++ anon_fd->inode); ++ } ++ } ++ UNLOCK(&frame->lock); + +- iobuf = iobuf_get2(this->ctx->iobuf_pool, local->req_size); +- if (!iobuf) +- goto err; ++ if (anon_fd) ++ fd_unref(anon_fd); + +- vec.iov_base = iobuf->ptr; +- vec.iov_len = 0; +- local->iobref = iobref_new(); +- iobref_add(local->iobref, iobuf); +- iobuf_unref(iobuf); ++ call_count = shard_call_count_return(frame); ++ if (call_count == 0) { ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(fop, frame, local->op_ret, ++ local->op_errno); ++ } else { ++ shard_get_delta_size_from_inode_ctx(local, local->fd->inode, this); ++ local->hole_size = 0; ++ if (xdata) ++ local->xattr_rsp = dict_ref(xdata); ++ shard_update_file_size( ++ frame, this, local->fd, NULL, ++ shard_common_inode_write_post_update_size_handler); ++ } ++ } + +- SHARD_STACK_UNWIND(readv, frame, 0, 0, &vec, 1, &local->prebuf, +- local->iobref, NULL); + return 0; +- } ++} + +- local->first_block = get_lowest_block(local->offset, local->block_size); ++int ++shard_common_inode_write_wind(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ struct iovec *vec, int count, off_t shard_offset, ++ size_t size) ++{ ++ shard_local_t *local = NULL; + +- local->total_size = local->req_size; ++ local = frame->local; + +- local->last_block = +- get_highest_block(local->offset, local->total_size, local->block_size); ++ switch (local->fop) { ++ case GF_FOP_WRITE: ++ STACK_WIND_COOKIE( ++ frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->writev, fd, vec, count, shard_offset, ++ local->flags, local->iobref, local->xattr_req); ++ break; ++ case GF_FOP_FALLOCATE: ++ STACK_WIND_COOKIE( ++ frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fallocate, fd, local->flags, ++ shard_offset, size, local->xattr_req); ++ break; ++ case GF_FOP_ZEROFILL: ++ STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, ++ FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->zerofill, fd, ++ shard_offset, size, local->xattr_req); ++ break; ++ 
case GF_FOP_DISCARD: ++ STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, ++ FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->discard, fd, ++ shard_offset, size, local->xattr_req); ++ break; ++ default: ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "Invalid fop id = %d", local->fop); ++ break; ++ } ++ return 0; ++} + +- local->num_blocks = local->last_block - local->first_block + 1; +- GF_ASSERT(local->num_blocks > 0); +- local->resolver_base_inode = local->loc.inode; ++int ++shard_common_inode_write_do(call_frame_t *frame, xlator_t *this) ++{ ++ int i = 0; ++ int count = 0; ++ int call_count = 0; ++ int last_block = 0; ++ uint32_t cur_block = 0; ++ fd_t *fd = NULL; ++ fd_t *anon_fd = NULL; ++ shard_local_t *local = NULL; ++ struct iovec *vec = NULL; ++ gf_boolean_t wind_failed = _gf_false; ++ gf_boolean_t odirect = _gf_false; ++ off_t orig_offset = 0; ++ off_t shard_offset = 0; ++ off_t vec_offset = 0; ++ size_t remaining_size = 0; ++ size_t shard_write_size = 0; + +- local->inode_list = +- GF_CALLOC(local->num_blocks, sizeof(inode_t *), gf_shard_mt_inode_list); +- if (!local->inode_list) +- goto err; ++ local = frame->local; ++ fd = local->fd; ++ ++ orig_offset = local->offset; ++ remaining_size = local->total_size; ++ cur_block = local->first_block; ++ local->call_count = call_count = local->num_blocks; ++ last_block = local->last_block; ++ ++ SHARD_SET_ROOT_FS_ID(frame, local); ++ ++ if (dict_set_uint32(local->xattr_req, GLUSTERFS_WRITE_UPDATE_ATOMIC, 4)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set " GLUSTERFS_WRITE_UPDATE_ATOMIC ++ " into " ++ "dict: %s", ++ uuid_utoa(fd->inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ local->call_count = 1; ++ shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ return 0; ++ } + +- iobuf = iobuf_get2(this->ctx->iobuf_pool, local->total_size); +- if (!iobuf) +- goto err; ++ if ((fd->flags & O_DIRECT) && (local->fop == GF_FOP_WRITE)) ++ odirect = _gf_true; + +- local->iobref = iobref_new(); +- if (!local->iobref) { +- iobuf_unref(iobuf); +- goto err; +- } ++ while (cur_block <= last_block) { ++ if (wind_failed) { ++ shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } + +- if (iobref_add(local->iobref, iobuf) != 0) { +- iobuf_unref(iobuf); +- goto err; +- } ++ shard_offset = orig_offset % local->block_size; ++ shard_write_size = local->block_size - shard_offset; ++ if (shard_write_size > remaining_size) ++ shard_write_size = remaining_size; ++ ++ remaining_size -= shard_write_size; ++ ++ if (local->fop == GF_FOP_WRITE) { ++ count = iov_subset(local->vector, local->count, vec_offset, ++ vec_offset + shard_write_size, NULL); ++ ++ vec = GF_CALLOC(count, sizeof(struct iovec), gf_shard_mt_iovec); ++ if (!vec) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ GF_FREE(vec); ++ shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, ++ -1, ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } ++ count = iov_subset(local->vector, local->count, vec_offset, ++ vec_offset + shard_write_size, vec); ++ } + +- memset(iobuf->ptr, 0, local->total_size); +- iobuf_unref(iobuf); +- local->iobuf = iobuf; ++ if (cur_block == 0) { ++ anon_fd = fd_ref(fd); ++ } else { ++ anon_fd = fd_anonymous(local->inode_list[i]); ++ if (!anon_fd) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ GF_FREE(vec); ++ 
shard_common_inode_write_do_cbk(frame, (void *)(long)anon_fd, ++ this, -1, ENOMEM, NULL, NULL, ++ NULL); ++ goto next; ++ } ++ ++ if (local->fop == GF_FOP_WRITE) { ++ if (odirect) ++ local->flags = O_DIRECT; ++ else ++ local->flags = GF_ANON_FD_FLAGS; ++ } ++ } + +- local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); +- if (!local->dot_shard_loc.inode) { +- ret = +- shard_init_internal_dir_loc(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); +- if (ret) +- goto err; +- shard_lookup_internal_dir(frame, this, shard_post_resolve_readv_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- } else { +- local->post_res_handler = shard_post_resolve_readv_handler; +- shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); +- } +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); +- return 0; +-} +- +-int shard_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +- off_t offset, uint32_t flags, dict_t *xdata) { +- int ret = 0; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; +- +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size for %s from its inode ctx", +- uuid_utoa(fd->inode->gfid)); +- goto err; +- } +- +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- /* block_size = 0 means that the file was created before +- * sharding was enabled on the volume. +- */ +- STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); +- return 0; +- } +- +- if (!this->itable) +- this->itable = fd->inode->table; +- +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- +- ret = syncbarrier_init(&local->barrier); +- if (ret) +- goto err; +- local->fd = fd_ref(fd); +- local->block_size = block_size; +- local->offset = offset; +- local->req_size = size; +- local->flags = flags; +- local->fop = GF_FOP_READ; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- +- local->loc.inode = inode_ref(fd->inode); +- gf_uuid_copy(local->loc.gfid, fd->inode->gfid); +- +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_readv_handler); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); +- return 0; ++ shard_common_inode_write_wind(frame, this, anon_fd, vec, count, ++ shard_offset, shard_write_size); ++ if (vec) ++ vec_offset += shard_write_size; ++ orig_offset += shard_write_size; ++ GF_FREE(vec); ++ vec = NULL; ++ next: ++ cur_block++; ++ i++; ++ call_count--; ++ } ++ return 0; + } + +-int shard_common_inode_write_post_update_size_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; ++int ++shard_common_inode_write_post_mknod_handler(call_frame_t *frame, ++ xlator_t *this); + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- } else { +- shard_common_inode_write_success_unwind(local->fop, frame, +- local->written_size); +- } +- return 0; +-} ++int ++shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, ++ xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +-static gf_boolean_t shard_is_appending_write(shard_local_t *local) { +- if (local->fop != GF_FOP_WRITE) +- return _gf_false; +- if (local->flags & O_APPEND) +- return _gf_true; +- if (local->fd->flags & O_APPEND) +- return _gf_true; +- return _gf_false; +-} ++ local = frame->local; + +-int __shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, +- xlator_t *this) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ if (local->create_count) { ++ shard_common_resume_mknod(frame, this, ++ shard_common_inode_write_post_mknod_handler); ++ } else { ++ shard_common_inode_write_do(frame, this); ++ } + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ return 0; ++} + +- if (shard_is_appending_write(local)) { +- local->delta_size = local->total_size; +- } else if (local->offset + local->total_size > ctx->stat.ia_size) { +- local->delta_size = (local->offset + local->total_size) - ctx->stat.ia_size; +- } else { +- local->delta_size = 0; +- } +- ctx->stat.ia_size += (local->delta_size); +- local->postbuf = ctx->stat; ++int ++shard_common_inode_write_post_mknod_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- return 0; +-} ++ local = frame->local; + +-int shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, +- xlator_t *this) { +- int ret = -1; ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +- LOCK(&inode->lock); +- { ret = __shard_get_delta_size_from_inode_ctx(local, inode, this); } +- UNLOCK(&inode->lock); ++ if (!local->eexist_count) { ++ shard_common_inode_write_do(frame, this); ++ } else { ++ local->call_count = local->eexist_count; ++ shard_common_lookup_shards( ++ frame, this, local->loc.inode, ++ shard_common_inode_write_post_lookup_shards_handler); ++ } + +- return ret; ++ return 0; + } + +-int shard_common_inode_write_do_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, struct iatt *pre, +- 
struct iatt *post, dict_t *xdata) { +- int call_count = 0; +- fd_t *anon_fd = cookie; +- shard_local_t *local = NULL; +- glusterfs_fop_t fop = 0; ++int ++shard_common_inode_write_post_resolve_handler(call_frame_t *frame, ++ xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- local = frame->local; +- fop = local->fop; ++ local = frame->local; + +- LOCK(&frame->lock); +- { +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- } else { +- local->written_size += op_ret; +- GF_ATOMIC_ADD(local->delta_blocks, post->ia_blocks - pre->ia_blocks); +- local->delta_size += (post->ia_size - pre->ia_size); +- shard_inode_ctx_set(local->fd->inode, this, post, 0, SHARD_MASK_TIMES); +- if (local->fd->inode != anon_fd->inode) +- shard_inode_ctx_add_to_fsync_list(local->fd->inode, this, +- anon_fd->inode); +- } +- } +- UNLOCK(&frame->lock); +- +- if (anon_fd) +- fd_unref(anon_fd); +- +- call_count = shard_call_count_return(frame); +- if (call_count == 0) { +- SHARD_UNSET_ROOT_FS_ID(frame, local); + if (local->op_ret < 0) { +- shard_common_failure_unwind(fop, frame, local->op_ret, local->op_errno); ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } ++ ++ if (local->call_count) { ++ shard_common_lookup_shards( ++ frame, this, local->resolver_base_inode, ++ shard_common_inode_write_post_lookup_shards_handler); + } else { +- shard_get_delta_size_from_inode_ctx(local, local->fd->inode, this); +- local->hole_size = 0; +- if (xdata) +- local->xattr_rsp = dict_ref(xdata); +- shard_update_file_size(frame, this, local->fd, NULL, +- shard_common_inode_write_post_update_size_handler); ++ shard_common_inode_write_do(frame, this); + } +- } + +- return 0; ++ return 0; + } + +-int shard_common_inode_write_wind(call_frame_t *frame, xlator_t *this, fd_t *fd, +- struct iovec *vec, int count, +- off_t shard_offset, size_t size) { +- shard_local_t *local = NULL; ++int ++shard_common_inode_write_post_lookup_handler(call_frame_t *frame, ++ xlator_t *this) ++{ ++ shard_local_t *local = frame->local; ++ shard_priv_t *priv = this->private; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +- local = frame->local; ++ local->postbuf = local->prebuf; ++ ++ /*Adjust offset to EOF so that correct shard is chosen for append*/ ++ if (shard_is_appending_write(local)) ++ local->offset = local->prebuf.ia_size; ++ ++ local->first_block = get_lowest_block(local->offset, local->block_size); ++ local->last_block = get_highest_block(local->offset, local->total_size, ++ local->block_size); ++ local->num_blocks = local->last_block - local->first_block + 1; ++ GF_ASSERT(local->num_blocks > 0); ++ local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), ++ gf_shard_mt_inode_list); ++ if (!local->inode_list) { ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; ++ } + +- switch (local->fop) { +- case GF_FOP_WRITE: +- STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, fd, +- vec, count, shard_offset, local->flags, local->iobref, +- local->xattr_req); +- break; +- case GF_FOP_FALLOCATE: +- STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->fallocate, fd, +- local->flags, shard_offset, size, local->xattr_req); +- break; +- case GF_FOP_ZEROFILL: +- STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, +- FIRST_CHILD(this), 
FIRST_CHILD(this)->fops->zerofill, fd, +- shard_offset, size, local->xattr_req); +- break; +- case GF_FOP_DISCARD: +- STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard, fd, +- shard_offset, size, local->xattr_req); +- break; +- default: +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "Invalid fop id = %d", local->fop); +- break; +- } +- return 0; +-} +- +-int shard_common_inode_write_do(call_frame_t *frame, xlator_t *this) { +- int i = 0; +- int count = 0; +- int call_count = 0; +- int last_block = 0; +- uint32_t cur_block = 0; +- fd_t *fd = NULL; +- fd_t *anon_fd = NULL; +- shard_local_t *local = NULL; +- struct iovec *vec = NULL; +- gf_boolean_t wind_failed = _gf_false; +- gf_boolean_t odirect = _gf_false; +- off_t orig_offset = 0; +- off_t shard_offset = 0; +- off_t vec_offset = 0; +- size_t remaining_size = 0; +- size_t shard_write_size = 0; +- +- local = frame->local; +- fd = local->fd; +- +- orig_offset = local->offset; +- remaining_size = local->total_size; +- cur_block = local->first_block; +- local->call_count = call_count = local->num_blocks; +- last_block = local->last_block; +- +- SHARD_SET_ROOT_FS_ID(frame, local); +- +- if (dict_set_uint32(local->xattr_req, GLUSTERFS_WRITE_UPDATE_ATOMIC, 4)) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set " GLUSTERFS_WRITE_UPDATE_ATOMIC " into " +- "dict: %s", +- uuid_utoa(fd->inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- local->call_count = 1; +- shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, +- NULL, NULL, NULL); +- return 0; +- } ++ gf_msg_trace(this->name, 0, ++ "%s: gfid=%s first_block=%" PRIu64 ++ " " ++ "last_block=%" PRIu64 " num_blocks=%" PRIu64 " offset=%" PRId64 ++ " total_size=%zu flags=%" PRId32 "", ++ gf_fop_list[local->fop], ++ uuid_utoa(local->resolver_base_inode->gfid), ++ local->first_block, local->last_block, local->num_blocks, ++ local->offset, local->total_size, local->flags); + +- if ((fd->flags & O_DIRECT) && (local->fop == GF_FOP_WRITE)) +- odirect = _gf_true; ++ local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); + +- while (cur_block <= last_block) { +- if (wind_failed) { +- shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, +- NULL, NULL, NULL); +- goto next; ++ if (!local->dot_shard_loc.inode) { ++ /*change handler*/ ++ shard_mkdir_internal_dir(frame, this, ++ shard_common_inode_write_post_resolve_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ } else { ++ /*change handler*/ ++ local->post_res_handler = shard_common_inode_write_post_resolve_handler; ++ shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); + } ++ return 0; ++} + +- shard_offset = orig_offset % local->block_size; +- shard_write_size = local->block_size - shard_offset; +- if (shard_write_size > remaining_size) +- shard_write_size = remaining_size; ++int ++shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ inode_t *link_inode = NULL; ++ shard_local_t *local = NULL; ++ shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + +- remaining_size -= shard_write_size; ++ local = frame->local; + +- if (local->fop == GF_FOP_WRITE) { +- count = iov_subset(local->vector, local->count, vec_offset, +- vec_offset + shard_write_size, NULL); ++ 
SHARD_UNSET_ROOT_FS_ID(frame, local); + +- vec = GF_CALLOC(count, sizeof(struct iovec), gf_shard_mt_iovec); +- if (!vec) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- GF_FREE(vec); +- shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, +- ENOMEM, NULL, NULL, NULL); +- goto next; +- } +- count = iov_subset(local->vector, local->count, vec_offset, +- vec_offset + shard_write_size, vec); ++ if (op_ret == -1) { ++ if (op_errno != EEXIST) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto unwind; ++ } else { ++ gf_msg_debug(this->name, 0, ++ "mkdir on %s failed " ++ "with EEXIST. Attempting lookup now", ++ shard_internal_dir_string(type)); ++ shard_lookup_internal_dir(frame, this, local->post_res_handler, ++ type); ++ return 0; ++ } + } + +- if (cur_block == 0) { +- anon_fd = fd_ref(fd); ++ link_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ if (link_inode != inode) { ++ shard_refresh_internal_dir(frame, this, type); + } else { +- anon_fd = fd_anonymous(local->inode_list[i]); +- if (!anon_fd) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- GF_FREE(vec); +- shard_common_inode_write_do_cbk(frame, (void *)(long)anon_fd, this, -1, +- ENOMEM, NULL, NULL, NULL); +- goto next; +- } +- +- if (local->fop == GF_FOP_WRITE) { +- if (odirect) +- local->flags = O_DIRECT; +- else +- local->flags = GF_ANON_FD_FLAGS; +- } +- } +- +- shard_common_inode_write_wind(frame, this, anon_fd, vec, count, +- shard_offset, shard_write_size); +- if (vec) +- vec_offset += shard_write_size; +- orig_offset += shard_write_size; +- GF_FREE(vec); +- vec = NULL; +- next: +- cur_block++; +- i++; +- call_count--; +- } +- return 0; ++ shard_inode_ctx_mark_dir_refreshed(link_inode, this); ++ shard_common_resolve_shards(frame, this, local->post_res_handler); ++ } ++ return 0; ++unwind: ++ shard_common_resolve_shards(frame, this, local->post_res_handler); ++ return 0; + } + +-int shard_common_inode_write_post_mknod_handler(call_frame_t *frame, +- xlator_t *this); ++int ++shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t handler, ++ shard_internal_dir_type_t type) ++{ ++ int ret = -1; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ dict_t *xattr_req = NULL; ++ uuid_t *gfid = NULL; ++ loc_t *loc = NULL; ++ gf_boolean_t free_gfid = _gf_true; + +-int shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; ++ local = frame->local; ++ priv = this->private; + +- local = frame->local; ++ local->post_res_handler = handler; ++ gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); ++ if (!gfid) ++ goto err; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +- } ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ gf_uuid_copy(*gfid, priv->dot_shard_gfid); ++ loc = &local->dot_shard_loc; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); ++ loc = &local->dot_shard_rm_loc; ++ break; ++ default: ++ bzero(*gfid, sizeof(uuid_t)); ++ break; ++ } + +- if (local->create_count) { +- shard_common_resume_mknod(frame, this, +- shard_common_inode_write_post_mknod_handler); +- } else { +- shard_common_inode_write_do(frame, this); +- } ++ xattr_req = dict_new(); ++ if (!xattr_req) ++ goto err; + +- return 0; +-} ++ ret = shard_init_internal_dir_loc(this, local, 
type); ++ if (ret) ++ goto err; + +-int shard_common_inode_write_post_mknod_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; ++ ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set gfid-req for %s", ++ shard_internal_dir_string(type)); ++ goto err; ++ } else { ++ free_gfid = _gf_false; ++ } + +- local = frame->local; ++ SHARD_SET_ROOT_FS_ID(frame, local); + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); ++ STACK_WIND_COOKIE(frame, shard_mkdir_internal_dir_cbk, (void *)(long)type, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, ++ 0755, 0, xattr_req); ++ dict_unref(xattr_req); + return 0; +- } + +- if (!local->eexist_count) { +- shard_common_inode_write_do(frame, this); +- } else { +- local->call_count = local->eexist_count; +- shard_common_lookup_shards( +- frame, this, local->loc.inode, +- shard_common_inode_write_post_lookup_shards_handler); +- } +- +- return 0; ++err: ++ if (xattr_req) ++ dict_unref(xattr_req); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ if (free_gfid) ++ GF_FREE(gfid); ++ handler(frame, this); ++ return 0; + } + +-int shard_common_inode_write_post_resolve_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); ++int ++shard_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ /* To-Do: Wind flush on all shards of the file */ ++ SHARD_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata); + return 0; +- } +- +- if (local->call_count) { +- shard_common_lookup_shards( +- frame, this, local->resolver_base_inode, +- shard_common_inode_write_post_lookup_shards_handler); +- } else { +- shard_common_inode_write_do(frame, this); +- } +- +- return 0; + } + +-int shard_common_inode_write_post_lookup_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = frame->local; +- shard_priv_t *priv = this->private; +- +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); ++int ++shard_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) ++{ ++ STACK_WIND(frame, shard_flush_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; +- } +- +- local->postbuf = local->prebuf; ++} + +- /*Adjust offset to EOF so that correct shard is chosen for append*/ +- if (shard_is_appending_write(local)) +- local->offset = local->prebuf.ia_size; ++int ++__shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, ++ xlator_t *this) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- local->first_block = get_lowest_block(local->offset, local->block_size); +- local->last_block = +- get_highest_block(local->offset, local->total_size, local->block_size); +- local->num_blocks = local->last_block - local->first_block + 1; +- GF_ASSERT(local->num_blocks > 0); +- local->inode_list = +- GF_CALLOC(local->num_blocks, sizeof(inode_t *), gf_shard_mt_inode_list); +- if (!local->inode_list) { +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; +- } ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- gf_msg_trace( +- this->name, 0, "%s: gfid=%s 
first_block=%" PRIu64 " " +- "last_block=%" PRIu64 " num_blocks=%" PRIu64 +- " offset=%" PRId64 " total_size=%zu flags=%" PRId32 "", +- gf_fop_list[local->fop], uuid_utoa(local->resolver_base_inode->gfid), +- local->first_block, local->last_block, local->num_blocks, local->offset, +- local->total_size, local->flags); ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); ++ local->postbuf.ia_ctime = ctx->stat.ia_ctime; ++ local->postbuf.ia_ctime_nsec = ctx->stat.ia_ctime_nsec; ++ local->postbuf.ia_atime = ctx->stat.ia_atime; ++ local->postbuf.ia_atime_nsec = ctx->stat.ia_atime_nsec; ++ local->postbuf.ia_mtime = ctx->stat.ia_mtime; ++ local->postbuf.ia_mtime_nsec = ctx->stat.ia_mtime_nsec; + +- if (!local->dot_shard_loc.inode) { +- /*change handler*/ +- shard_mkdir_internal_dir(frame, this, +- shard_common_inode_write_post_resolve_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- } else { +- /*change handler*/ +- local->post_res_handler = shard_common_inode_write_post_resolve_handler; +- shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); +- } +- return 0; ++ return 0; + } + +-int shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- inode_t *link_inode = NULL; +- shard_local_t *local = NULL; +- shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; +- +- local = frame->local; +- +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- +- if (op_ret == -1) { +- if (op_errno != EEXIST) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto unwind; +- } else { +- gf_msg_debug(this->name, 0, "mkdir on %s failed " +- "with EEXIST. 
Attempting lookup now", +- shard_internal_dir_string(type)); +- shard_lookup_internal_dir(frame, this, local->post_res_handler, type); +- return 0; +- } +- } +- +- link_inode = shard_link_internal_dir_inode(local, inode, buf, type); +- if (link_inode != inode) { +- shard_refresh_internal_dir(frame, this, type); +- } else { +- shard_inode_ctx_mark_dir_refreshed(link_inode, this); +- shard_common_resolve_shards(frame, this, local->post_res_handler); +- } +- return 0; +-unwind: +- shard_common_resolve_shards(frame, this, local->post_res_handler); +- return 0; +-} +- +-int shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t handler, +- shard_internal_dir_type_t type) { +- int ret = -1; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- dict_t *xattr_req = NULL; +- uuid_t *gfid = NULL; +- loc_t *loc = NULL; +- gf_boolean_t free_gfid = _gf_true; +- +- local = frame->local; +- priv = this->private; +- +- local->post_res_handler = handler; +- gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); +- if (!gfid) +- goto err; +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- gf_uuid_copy(*gfid, priv->dot_shard_gfid); +- loc = &local->dot_shard_loc; +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); +- loc = &local->dot_shard_rm_loc; +- break; +- default: +- bzero(*gfid, sizeof(uuid_t)); +- break; +- } +- +- xattr_req = dict_new(); +- if (!xattr_req) +- goto err; +- +- ret = shard_init_internal_dir_loc(this, local, type); +- if (ret) +- goto err; +- +- ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set gfid-req for %s", shard_internal_dir_string(type)); +- goto err; +- } else { +- free_gfid = _gf_false; +- } +- +- SHARD_SET_ROOT_FS_ID(frame, local); +- +- STACK_WIND_COOKIE(frame, shard_mkdir_internal_dir_cbk, (void *)(long)type, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, +- 0755, 0, xattr_req); +- dict_unref(xattr_req); +- return 0; ++int ++shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, ++ xlator_t *this) ++{ ++ int ret = 0; + +-err: +- if (xattr_req) +- dict_unref(xattr_req); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- if (free_gfid) +- GF_FREE(gfid); +- handler(frame, this); +- return 0; +-} ++ LOCK(&inode->lock); ++ { ++ ret = __shard_get_timestamps_from_inode_ctx(local, inode, this); ++ } ++ UNLOCK(&inode->lock); + +-int shard_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *xdata) { +- /* To-Do: Wind flush on all shards of the file */ +- SHARD_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata); +- return 0; ++ return ret; + } + +-int shard_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { +- STACK_WIND(frame, shard_flush_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->flush, fd, xdata); +- return 0; +-} ++int ++shard_fsync_shards_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ int call_count = 0; ++ uint64_t fsync_count = 0; ++ fd_t *anon_fd = cookie; ++ shard_local_t *local = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *base_ictx = NULL; ++ inode_t *base_inode = NULL; ++ gf_boolean_t unref_shard_inode = _gf_false; ++ ++ local = frame->local; ++ base_inode = local->fd->inode; + +-int 
__shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, +- xlator_t *this) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++ if (local->op_ret < 0) ++ goto out; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ LOCK(&frame->lock); ++ { ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ UNLOCK(&frame->lock); ++ goto out; ++ } ++ shard_inode_ctx_set(local->fd->inode, this, postbuf, 0, ++ SHARD_MASK_TIMES); ++ } ++ UNLOCK(&frame->lock); ++ fd_ctx_get(anon_fd, this, &fsync_count); ++out: ++ if (anon_fd && (base_inode != anon_fd->inode)) { ++ LOCK(&base_inode->lock); ++ LOCK(&anon_fd->inode->lock); ++ { ++ __shard_inode_ctx_get(anon_fd->inode, this, &ctx); ++ __shard_inode_ctx_get(base_inode, this, &base_ictx); ++ if (op_ret == 0) ++ ctx->fsync_needed -= fsync_count; ++ GF_ASSERT(ctx->fsync_needed >= 0); ++ if (ctx->fsync_needed != 0) { ++ list_add_tail(&ctx->to_fsync_list, &base_ictx->to_fsync_list); ++ base_ictx->fsync_count++; ++ } else { ++ unref_shard_inode = _gf_true; ++ } ++ } ++ UNLOCK(&anon_fd->inode->lock); ++ UNLOCK(&base_inode->lock); ++ } + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ if (unref_shard_inode) ++ inode_unref(anon_fd->inode); ++ if (anon_fd) ++ fd_unref(anon_fd); + +- local->postbuf.ia_ctime = ctx->stat.ia_ctime; +- local->postbuf.ia_ctime_nsec = ctx->stat.ia_ctime_nsec; +- local->postbuf.ia_atime = ctx->stat.ia_atime; +- local->postbuf.ia_atime_nsec = ctx->stat.ia_atime_nsec; +- local->postbuf.ia_mtime = ctx->stat.ia_mtime; +- local->postbuf.ia_mtime_nsec = ctx->stat.ia_mtime_nsec; ++ call_count = shard_call_count_return(frame); ++ if (call_count != 0) ++ return 0; + +- return 0; ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, ++ local->op_errno); ++ } else { ++ shard_get_timestamps_from_inode_ctx(local, base_inode, this); ++ SHARD_STACK_UNWIND(fsync, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->postbuf, local->xattr_rsp); ++ } ++ return 0; + } + +-int shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, +- xlator_t *this) { +- int ret = 0; ++int ++shard_post_lookup_fsync_handler(call_frame_t *frame, xlator_t *this) ++{ ++ int ret = 0; ++ int call_count = 0; ++ int fsync_count = 0; ++ fd_t *anon_fd = NULL; ++ inode_t *base_inode = NULL; ++ shard_local_t *local = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *iter = NULL; ++ struct list_head copy = { ++ 0, ++ }; ++ shard_inode_ctx_t *tmp = NULL; + +- LOCK(&inode->lock); +- { ret = __shard_get_timestamps_from_inode_ctx(local, inode, this); } +- UNLOCK(&inode->lock); ++ local = frame->local; ++ base_inode = local->fd->inode; ++ local->postbuf = local->prebuf; ++ INIT_LIST_HEAD(©); + +- return ret; +-} ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +-int shard_fsync_shards_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, +- struct iatt *prebuf, struct iatt *postbuf, +- dict_t *xdata) { +- int call_count = 0; +- uint64_t fsync_count = 0; +- fd_t *anon_fd = cookie; +- shard_local_t *local = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_inode_ctx_t *base_ictx = NULL; +- inode_t *base_inode = NULL; +- gf_boolean_t unref_shard_inode = _gf_false; +- +- local = frame->local; +- base_inode = local->fd->inode; +- +- if (local->op_ret < 0) +- goto out; +- +- 
LOCK(&frame->lock); +- { +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- UNLOCK(&frame->lock); +- goto out; +- } +- shard_inode_ctx_set(local->fd->inode, this, postbuf, 0, SHARD_MASK_TIMES); +- } +- UNLOCK(&frame->lock); +- fd_ctx_get(anon_fd, this, &fsync_count); +-out: +- if (anon_fd && (base_inode != anon_fd->inode)) { + LOCK(&base_inode->lock); +- LOCK(&anon_fd->inode->lock); + { +- __shard_inode_ctx_get(anon_fd->inode, this, &ctx); +- __shard_inode_ctx_get(base_inode, this, &base_ictx); +- if (op_ret == 0) +- ctx->fsync_needed -= fsync_count; +- GF_ASSERT(ctx->fsync_needed >= 0); +- if (ctx->fsync_needed != 0) { +- list_add_tail(&ctx->to_fsync_list, &base_ictx->to_fsync_list); +- base_ictx->fsync_count++; +- } else { +- unref_shard_inode = _gf_true; +- } +- } +- UNLOCK(&anon_fd->inode->lock); ++ __shard_inode_ctx_get(base_inode, this, &ctx); ++ list_splice_init(&ctx->to_fsync_list, ©); ++ call_count = ctx->fsync_count; ++ ctx->fsync_count = 0; ++ } + UNLOCK(&base_inode->lock); +- } +- +- if (unref_shard_inode) +- inode_unref(anon_fd->inode); +- if (anon_fd) +- fd_unref(anon_fd); +- +- call_count = shard_call_count_return(frame); +- if (call_count != 0) +- return 0; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, +- local->op_errno); +- } else { +- shard_get_timestamps_from_inode_ctx(local, base_inode, this); +- SHARD_STACK_UNWIND(fsync, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->postbuf, local->xattr_rsp); +- } +- return 0; +-} +- +-int shard_post_lookup_fsync_handler(call_frame_t *frame, xlator_t *this) { +- int ret = 0; +- int call_count = 0; +- int fsync_count = 0; +- fd_t *anon_fd = NULL; +- inode_t *base_inode = NULL; +- shard_local_t *local = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_inode_ctx_t *iter = NULL; +- struct list_head copy = { +- 0, +- }; +- shard_inode_ctx_t *tmp = NULL; +- +- local = frame->local; +- base_inode = local->fd->inode; +- local->postbuf = local->prebuf; +- INIT_LIST_HEAD(©); +- +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, +- local->op_errno); +- return 0; +- } +- +- LOCK(&base_inode->lock); +- { +- __shard_inode_ctx_get(base_inode, this, &ctx); +- list_splice_init(&ctx->to_fsync_list, ©); +- call_count = ctx->fsync_count; +- ctx->fsync_count = 0; +- } +- UNLOCK(&base_inode->lock); +- +- local->call_count = ++call_count; +- +- /* Send fsync() on the base shard first */ +- anon_fd = fd_ref(local->fd); +- STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsync, anon_fd, local->datasync, +- local->xattr_req); +- call_count--; +- anon_fd = NULL; +- +- list_for_each_entry_safe(iter, tmp, ©, to_fsync_list) { +- list_del_init(&iter->to_fsync_list); +- fsync_count = 0; +- shard_inode_ctx_get_fsync_count(iter->inode, this, &fsync_count); +- GF_ASSERT(fsync_count > 0); +- anon_fd = fd_anonymous(iter->inode); +- if (!anon_fd) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create " +- "anon fd to fsync shard"); +- shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, ENOMEM, +- NULL, NULL, NULL); +- continue; +- } ++ local->call_count = ++call_count; + +- ret = fd_ctx_set(anon_fd, this, fsync_count); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_FD_CTX_SET_FAILED, +- "Failed to set fd " +- "ctx for shard inode gfid=%s", +- 
uuid_utoa(iter->inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, ENOMEM, +- NULL, NULL, NULL); +- continue; +- } ++ /* Send fsync() on the base shard first */ ++ anon_fd = fd_ref(local->fd); + STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, anon_fd, local->datasync, + local->xattr_req); + call_count--; +- } ++ anon_fd = NULL; + +- return 0; ++ list_for_each_entry_safe(iter, tmp, ©, to_fsync_list) ++ { ++ list_del_init(&iter->to_fsync_list); ++ fsync_count = 0; ++ shard_inode_ctx_get_fsync_count(iter->inode, this, &fsync_count); ++ GF_ASSERT(fsync_count > 0); ++ anon_fd = fd_anonymous(iter->inode); ++ if (!anon_fd) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create " ++ "anon fd to fsync shard"); ++ shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ continue; ++ } ++ ++ ret = fd_ctx_set(anon_fd, this, fsync_count); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_FD_CTX_SET_FAILED, ++ "Failed to set fd " ++ "ctx for shard inode gfid=%s", ++ uuid_utoa(iter->inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ continue; ++ } ++ STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, ++ anon_fd, local->datasync, local->xattr_req); ++ call_count--; ++ } ++ ++ return 0; + } + +-int shard_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, +- dict_t *xdata) { +- int ret = 0; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++int ++shard_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, ++ dict_t *xdata) ++{ ++ int ret = 0; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size for %s from its inode ctx", +- uuid_utoa(fd->inode->gfid)); +- goto err; +- } ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size for %s from its inode ctx", ++ uuid_utoa(fd->inode->gfid)); ++ goto err; ++ } + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); +- return 0; +- } ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); ++ return 0; ++ } + +- if (!this->itable) +- this->itable = fd->inode->table; ++ if (!this->itable) ++ this->itable = fd->inode->table; + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- frame->local = local; ++ frame->local = local; + +- local->fd = fd_ref(fd); +- local->fop = GF_FOP_FSYNC; +- local->datasync = datasync; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; ++ local->fd = fd_ref(fd); ++ local->fop = GF_FOP_FSYNC; ++ local->datasync = datasync; ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; + +- local->loc.inode = inode_ref(fd->inode); +- gf_uuid_copy(local->loc.gfid, fd->inode->gfid); ++ local->loc.inode = inode_ref(fd->inode); ++ gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_fsync_handler); +- return 0; ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_post_lookup_fsync_handler); ++ return 0; + err: +- shard_common_failure_unwind(GF_FOP_FSYNC, frame, -1, ENOMEM); +- return 0; ++ shard_common_failure_unwind(GF_FOP_FSYNC, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_readdir_past_dot_shard_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, +- gf_dirent_t *orig_entries, dict_t *xdata) { +- gf_dirent_t *entry = NULL; +- gf_dirent_t *tmp = NULL; +- shard_local_t *local = NULL; ++int ++shard_readdir_past_dot_shard_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, gf_dirent_t *orig_entries, ++ dict_t *xdata) ++{ ++ gf_dirent_t *entry = NULL; ++ gf_dirent_t *tmp = NULL; ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (op_ret < 0) +- goto unwind; ++ if (op_ret < 0) ++ goto unwind; + +- list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) { +- list_del_init(&entry->list); +- list_add_tail(&entry->list, &local->entries_head.list); ++ list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) ++ { ++ list_del_init(&entry->list); ++ list_add_tail(&entry->list, &local->entries_head.list); + +- if (!entry->dict) +- continue; ++ if (!entry->dict) ++ continue; + +- if (IA_ISDIR(entry->d_stat.ia_type)) +- continue; ++ if (IA_ISDIR(entry->d_stat.ia_type)) ++ continue; + +- if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE)) +- shard_modify_size_and_block_count(&entry->d_stat, entry->dict); +- if (!entry->inode) +- continue; ++ if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE)) ++ shard_modify_size_and_block_count(&entry->d_stat, entry->dict); ++ if (!entry->inode) ++ continue; + +- shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); +- } +- local->op_ret += op_ret; ++ shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); ++ } ++ local->op_ret += op_ret; + + unwind: +- if (local->fop == GF_FOP_READDIR) +- SHARD_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, +- &local->entries_head, xdata); +- else +- SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &local->entries_head, +- xdata); +- return 0; ++ if (local->fop == GF_FOP_READDIR) ++ SHARD_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, ++ &local->entries_head, xdata); ++ else ++ SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, ++ &local->entries_head, xdata); ++ return 0; + } + +-int32_t shard_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, +- gf_dirent_t *orig_entries, dict_t *xdata) { +- fd_t *fd = NULL; +- gf_dirent_t *entry = NULL; +- gf_dirent_t *tmp = NULL; +- shard_local_t *local = NULL; +- gf_boolean_t last_entry = _gf_false; ++int32_t ++shard_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries, ++ dict_t *xdata) ++{ ++ fd_t *fd = NULL; ++ 
gf_dirent_t *entry = NULL; ++ gf_dirent_t *tmp = NULL; ++ shard_local_t *local = NULL; ++ gf_boolean_t last_entry = _gf_false; + +- local = frame->local; +- fd = local->fd; ++ local = frame->local; ++ fd = local->fd; + +- if (op_ret < 0) +- goto unwind; ++ if (op_ret < 0) ++ goto unwind; + +- list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) { +- if (last_entry) +- last_entry = _gf_false; ++ list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) ++ { ++ if (last_entry) ++ last_entry = _gf_false; ++ ++ if (__is_root_gfid(fd->inode->gfid) && ++ !(strcmp(entry->d_name, GF_SHARD_DIR))) { ++ local->offset = entry->d_off; ++ op_ret--; ++ last_entry = _gf_true; ++ continue; ++ } + +- if (__is_root_gfid(fd->inode->gfid) && +- !(strcmp(entry->d_name, GF_SHARD_DIR))) { +- local->offset = entry->d_off; +- op_ret--; +- last_entry = _gf_true; +- continue; +- } ++ list_del_init(&entry->list); ++ list_add_tail(&entry->list, &local->entries_head.list); + +- list_del_init(&entry->list); +- list_add_tail(&entry->list, &local->entries_head.list); ++ if (!entry->dict) ++ continue; + +- if (!entry->dict) +- continue; ++ if (IA_ISDIR(entry->d_stat.ia_type)) ++ continue; + +- if (IA_ISDIR(entry->d_stat.ia_type)) +- continue; ++ if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE) && ++ frame->root->pid != GF_CLIENT_PID_GSYNCD) ++ shard_modify_size_and_block_count(&entry->d_stat, entry->dict); + +- if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE) && +- frame->root->pid != GF_CLIENT_PID_GSYNCD) +- shard_modify_size_and_block_count(&entry->d_stat, entry->dict); ++ if (!entry->inode) ++ continue; + +- if (!entry->inode) +- continue; ++ shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); ++ } + +- shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); +- } ++ local->op_ret = op_ret; + +- local->op_ret = op_ret; ++ if (last_entry) { ++ if (local->fop == GF_FOP_READDIR) ++ STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, ++ local->fd, local->readdir_size, local->offset, ++ local->xattr_req); ++ else ++ STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, ++ local->fd, local->readdir_size, local->offset, ++ local->xattr_req); ++ return 0; ++ } + +- if (last_entry) { ++unwind: + if (local->fop == GF_FOP_READDIR) +- STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readdir, local->fd, +- local->readdir_size, local->offset, local->xattr_req); ++ SHARD_STACK_UNWIND(readdir, frame, op_ret, op_errno, ++ &local->entries_head, xdata); + else +- STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readdirp, local->fd, +- local->readdir_size, local->offset, local->xattr_req); ++ SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, ++ &local->entries_head, xdata); + return 0; +- } ++} + +-unwind: +- if (local->fop == GF_FOP_READDIR) +- SHARD_STACK_UNWIND(readdir, frame, op_ret, op_errno, &local->entries_head, +- xdata); +- else +- SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &local->entries_head, +- xdata); +- return 0; +-} +- +-int shard_readdir_do(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +- off_t offset, int whichop, dict_t *xdata) { +- int ret = 0; +- shard_local_t *local = NULL; +- +- local = mem_get0(this->local_pool); +- if (!local) { +- goto err; +- } +- +- frame->local = local; +- +- local->fd = fd_ref(fd); +- local->fop 
= whichop; +- local->readdir_size = size; +- INIT_LIST_HEAD(&local->entries_head.list); +- local->list_inited = _gf_true; +- +- if (whichop == GF_FOP_READDIR) { +- STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); +- } else { +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, +- local, err); +- ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); +- if (ret) { +- gf_log(this->name, GF_LOG_WARNING, +- "Failed to set " +- "dict value: key:%s, directory gfid=%s", +- GF_XATTR_SHARD_BLOCK_SIZE, uuid_utoa(fd->inode->gfid)); +- goto err; ++int ++shard_readdir_do(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t offset, int whichop, dict_t *xdata) ++{ ++ int ret = 0; ++ shard_local_t *local = NULL; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) { ++ goto err; + } + +- STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readdirp, fd, size, offset, +- local->xattr_req); +- } ++ frame->local = local; ++ ++ local->fd = fd_ref(fd); ++ local->fop = whichop; ++ local->readdir_size = size; ++ INIT_LIST_HEAD(&local->entries_head.list); ++ local->list_inited = _gf_true; ++ ++ if (whichop == GF_FOP_READDIR) { ++ STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); ++ } else { ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, ++ local, err); ++ ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); ++ if (ret) { ++ gf_log(this->name, GF_LOG_WARNING, ++ "Failed to set " ++ "dict value: key:%s, directory gfid=%s", ++ GF_XATTR_SHARD_BLOCK_SIZE, uuid_utoa(fd->inode->gfid)); ++ goto err; ++ } ++ ++ STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->readdirp, fd, size, offset, ++ local->xattr_req); ++ } + +- return 0; ++ return 0; + + err: +- STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); +- return 0; ++ STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); ++ return 0; + } + +-int32_t shard_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, +- size_t size, off_t offset, dict_t *xdata) { +- shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); +- return 0; ++int32_t ++shard_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t offset, dict_t *xdata) ++{ ++ shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); ++ return 0; + } + +-int32_t shard_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, +- size_t size, off_t offset, dict_t *xdata) { +- shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIRP, xdata); +- return 0; ++int32_t ++shard_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t offset, dict_t *xdata) ++{ ++ shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIRP, xdata); ++ return 0; + } + + int32_t +@@ -6037,77 +6450,86 @@ shard_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + return 0; + } + +-int32_t shard_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *dict, +- dict_t *xdata) { +- if (op_ret < 0) +- goto unwind; ++int32_t ++shard_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *dict, ++ dict_t *xdata) ++{ ++ 
if (op_ret < 0) ++ goto unwind; + +- if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { +- dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); +- dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); +- } ++ if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { ++ dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); ++ dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); ++ } + + unwind: +- SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata); +- return 0; ++ SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata); ++ return 0; + } + +-int32_t shard_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, +- const char *name, dict_t *xdata) { +- int op_errno = EINVAL; ++int32_t ++shard_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, ++ dict_t *xdata) ++{ ++ int op_errno = EINVAL; + +- if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && +- (!strncmp(name, SHARD_XATTR_PREFIX, SLEN(SHARD_XATTR_PREFIX)))) { +- op_errno = ENODATA; +- goto out; +- } ++ if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && ++ (!strncmp(name, SHARD_XATTR_PREFIX, SLEN(SHARD_XATTR_PREFIX)))) { ++ op_errno = ENODATA; ++ goto out; ++ } + +- STACK_WIND(frame, shard_fgetxattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); +- return 0; ++ STACK_WIND(frame, shard_fgetxattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); ++ return 0; + out: +- shard_common_failure_unwind(GF_FOP_FGETXATTR, frame, -1, op_errno); +- return 0; ++ shard_common_failure_unwind(GF_FOP_FGETXATTR, frame, -1, op_errno); ++ return 0; + } + +-int32_t shard_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *dict, +- dict_t *xdata) { +- if (op_ret < 0) +- goto unwind; ++int32_t ++shard_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *dict, ++ dict_t *xdata) ++{ ++ if (op_ret < 0) ++ goto unwind; + +- if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { +- dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); +- dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); +- } ++ if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { ++ dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); ++ dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); ++ } + + unwind: +- SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); +- return 0; ++ SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); ++ return 0; + } + +-int32_t shard_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, +- const char *name, dict_t *xdata) { +- int op_errno = EINVAL; ++int32_t ++shard_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, ++ const char *name, dict_t *xdata) ++{ ++ int op_errno = EINVAL; + +- if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && +- (!strncmp(name, SHARD_XATTR_PREFIX, sizeof(SHARD_XATTR_PREFIX) - 1))) { +- op_errno = ENODATA; +- goto out; +- } ++ if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && ++ (!strncmp(name, SHARD_XATTR_PREFIX, sizeof(SHARD_XATTR_PREFIX) - 1))) { ++ op_errno = ENODATA; ++ goto out; ++ } + +- STACK_WIND(frame, shard_getxattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); +- return 0; ++ STACK_WIND(frame, shard_getxattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); ++ return 0; + out: +- shard_common_failure_unwind(GF_FOP_GETXATTR, frame, -1, op_errno); +- return 0; ++ shard_common_failure_unwind(GF_FOP_GETXATTR, frame, -1, op_errno); ++ 
return 0; + } + +-int32_t shard_common_set_xattr_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, dict_t *xdata) { ++int32_t ++shard_common_set_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ + int ret = -1; + shard_local_t *local = NULL; + +@@ -6141,8 +6563,9 @@ err: + return 0; + } + +-int32_t shard_post_lookup_set_xattr_handler(call_frame_t *frame, +- xlator_t *this) { ++int32_t ++shard_post_lookup_set_xattr_handler(call_frame_t *frame, xlator_t *this) ++{ + shard_local_t *local = NULL; + + local = frame->local; +@@ -6164,9 +6587,11 @@ int32_t shard_post_lookup_set_xattr_handler(call_frame_t *frame, + return 0; + } + +-int32_t shard_common_set_xattr(call_frame_t *frame, xlator_t *this, +- glusterfs_fop_t fop, loc_t *loc, fd_t *fd, +- dict_t *dict, int32_t flags, dict_t *xdata) { ++int32_t ++shard_common_set_xattr(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, ++ loc_t *loc, fd_t *fd, dict_t *dict, int32_t flags, ++ dict_t *xdata) ++{ + int ret = -1; + int op_errno = ENOMEM; + uint64_t block_size = 0; +@@ -6249,489 +6674,531 @@ err: + return 0; + } + +-int32_t shard_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, +- dict_t *dict, int32_t flags, dict_t *xdata) { ++int32_t ++shard_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, ++ int32_t flags, dict_t *xdata) ++{ + shard_common_set_xattr(frame, this, GF_FOP_FSETXATTR, NULL, fd, dict, flags, + xdata); + return 0; + } + +-int32_t shard_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, +- dict_t *dict, int32_t flags, dict_t *xdata) { ++int32_t ++shard_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, ++ int32_t flags, dict_t *xdata) ++{ + shard_common_set_xattr(frame, this, GF_FOP_SETXATTR, loc, NULL, dict, flags, + xdata); + return 0; + } + +-int shard_post_setattr_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++int ++shard_post_setattr_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (local->fop == GF_FOP_SETATTR) { +- if (local->op_ret >= 0) +- shard_inode_ctx_set(local->loc.inode, this, &local->postbuf, 0, +- SHARD_LOOKUP_MASK); +- SHARD_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->postbuf, local->xattr_rsp); +- } else if (local->fop == GF_FOP_FSETATTR) { +- if (local->op_ret >= 0) +- shard_inode_ctx_set(local->fd->inode, this, &local->postbuf, 0, +- SHARD_LOOKUP_MASK); +- SHARD_STACK_UNWIND(fsetattr, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->postbuf, local->xattr_rsp); +- } ++ if (local->fop == GF_FOP_SETATTR) { ++ if (local->op_ret >= 0) ++ shard_inode_ctx_set(local->loc.inode, this, &local->postbuf, 0, ++ SHARD_LOOKUP_MASK); ++ SHARD_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->postbuf, local->xattr_rsp); ++ } else if (local->fop == GF_FOP_FSETATTR) { ++ if (local->op_ret >= 0) ++ shard_inode_ctx_set(local->fd->inode, this, &local->postbuf, 0, ++ SHARD_LOOKUP_MASK); ++ SHARD_STACK_UNWIND(fsetattr, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->postbuf, local->xattr_rsp); ++ } + +- return 0; ++ return 0; + } + +-int shard_common_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, +- struct iatt *prebuf, struct iatt *postbuf, +- dict_t *xdata) { +- 
shard_local_t *local = NULL; ++int ++shard_common_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto unwind; +- } ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto unwind; ++ } + +- local->prebuf = *prebuf; +- if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { +- local->op_ret = -1; +- local->op_errno = EINVAL; +- goto unwind; +- } +- if (xdata) +- local->xattr_rsp = dict_ref(xdata); +- local->postbuf = *postbuf; +- local->postbuf.ia_size = local->prebuf.ia_size; +- local->postbuf.ia_blocks = local->prebuf.ia_blocks; ++ local->prebuf = *prebuf; ++ if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { ++ local->op_ret = -1; ++ local->op_errno = EINVAL; ++ goto unwind; ++ } ++ if (xdata) ++ local->xattr_rsp = dict_ref(xdata); ++ local->postbuf = *postbuf; ++ local->postbuf.ia_size = local->prebuf.ia_size; ++ local->postbuf.ia_blocks = local->prebuf.ia_blocks; + + unwind: +- local->handler(frame, this); +- return 0; ++ local->handler(frame, this); ++ return 0; + } + +-int shard_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, +- struct iatt *stbuf, int32_t valid, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++int ++shard_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, ++ struct iatt *stbuf, int32_t valid, dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +- if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { +- STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); +- return 0; +- } ++ if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { ++ STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); ++ return 0; ++ } + +- ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block size from inode ctx of %s", +- uuid_utoa(loc->inode->gfid)); +- goto err; +- } ++ ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block size from inode ctx of %s", ++ uuid_utoa(loc->inode->gfid)); ++ goto err; ++ } + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); +- return 0; +- } ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); ++ return 0; ++ } + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- frame->local = local; ++ frame->local = local; + +- local->handler = shard_post_setattr_handler; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- local->fop = GF_FOP_SETATTR; +- loc_copy(&local->loc, loc); ++ local->handler = shard_post_setattr_handler; ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ local->fop = GF_FOP_SETATTR; ++ loc_copy(&local->loc, loc); + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, +- local, err); ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, ++ local, err); + +- STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, +- local->xattr_req); +- return 0; ++ STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, ++ local->xattr_req); ++ return 0; + err: +- shard_common_failure_unwind(GF_FOP_SETATTR, frame, -1, ENOMEM); +- return 0; ++ shard_common_failure_unwind(GF_FOP_SETATTR, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, +- struct iatt *stbuf, int32_t valid, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++int ++shard_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ struct iatt *stbuf, int32_t valid, dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +- if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { +- STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); +- return 0; +- } ++ if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { ++ STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); ++ return 0; ++ } + +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block size from inode ctx of %s", +- uuid_utoa(fd->inode->gfid)); +- goto err; +- } ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block size from inode ctx of %s", ++ uuid_utoa(fd->inode->gfid)); ++ goto err; ++ } + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); +- return 0; +- } ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); ++ return 0; ++ } + +- if (!this->itable) +- this->itable = fd->inode->table; ++ if (!this->itable) ++ this->itable = fd->inode->table; + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- frame->local = local; ++ frame->local = local; + +- local->handler = shard_post_setattr_handler; +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- local->fop = GF_FOP_FSETATTR; +- local->fd = fd_ref(fd); ++ local->handler = shard_post_setattr_handler; ++ local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ local->fop = GF_FOP_FSETATTR; ++ local->fd = fd_ref(fd); + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, +- local, err); ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, ++ local, err); + +- STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, +- local->xattr_req); +- return 0; ++ STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, ++ local->xattr_req); ++ return 0; + err: +- shard_common_failure_unwind(GF_FOP_FSETATTR, frame, -1, ENOMEM); +- return 0; +-} +- +-int shard_common_inode_write_begin(call_frame_t *frame, xlator_t *this, +- glusterfs_fop_t fop, fd_t *fd, +- struct iovec *vector, int32_t count, +- off_t offset, uint32_t flags, size_t len, +- struct iobref *iobref, dict_t *xdata) { +- int ret = 0; +- int i = 0; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; +- +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size for %s from its inode ctx", +- uuid_utoa(fd->inode->gfid)); +- goto out; +- } +- +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- /* block_size = 0 means that the file was created before +- * sharding was enabled on the volume. +- */ +- switch (fop) { +- case GF_FOP_WRITE: +- STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, +- fd, vector, count, offset, flags, iobref, xdata); +- break; +- case GF_FOP_FALLOCATE: +- STACK_WIND_TAIL(frame, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fallocate, fd, flags, offset, +- len, xdata); +- break; +- case GF_FOP_ZEROFILL: +- STACK_WIND_TAIL(frame, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->zerofill, fd, offset, len, +- xdata); +- break; +- case GF_FOP_DISCARD: +- STACK_WIND_TAIL(frame, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); +- break; +- default: +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "Invalid fop id = %d", fop); +- break; +- } +- return 0; +- } +- +- if (!this->itable) +- this->itable = fd->inode->table; +- +- local = mem_get0(this->local_pool); +- if (!local) +- goto out; +- +- frame->local = local; +- +- ret = syncbarrier_init(&local->barrier); +- if (ret) +- goto out; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto out; +- +- if (vector) { +- local->vector = iov_dup(vector, count); +- if (!local->vector) +- goto out; +- for (i = 0; i < count; i++) +- local->total_size += vector[i].iov_len; +- local->count = count; +- } else { +- local->total_size = len; +- } +- +- local->fop = fop; +- local->offset = offset; +- local->flags = flags; +- if (iobref) +- local->iobref = iobref_ref(iobref); +- local->fd = fd_ref(fd); +- local->block_size = block_size; +- local->resolver_base_inode = local->fd->inode; +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- +- local->loc.inode = inode_ref(fd->inode); +- gf_uuid_copy(local->loc.gfid, fd->inode->gfid); +- +- shard_lookup_base_file(frame, this, &local->loc, +- shard_common_inode_write_post_lookup_handler); +- return 0; ++ shard_common_failure_unwind(GF_FOP_FSETATTR, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_common_inode_write_begin(call_frame_t *frame, xlator_t *this, ++ glusterfs_fop_t fop, fd_t *fd, ++ struct iovec *vector, int32_t count, ++ off_t offset, uint32_t flags, size_t len, ++ struct iobref *iobref, dict_t *xdata) ++{ ++ int ret = 0; ++ int i = 0; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size for %s from its inode ctx", ++ uuid_utoa(fd->inode->gfid)); ++ goto out; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ /* block_size = 0 means that the file was created before ++ * sharding was enabled on the volume. ++ */ ++ switch (fop) { ++ case GF_FOP_WRITE: ++ STACK_WIND_TAIL(frame, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->writev, fd, vector, ++ count, offset, flags, iobref, xdata); ++ break; ++ case GF_FOP_FALLOCATE: ++ STACK_WIND_TAIL(frame, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fallocate, fd, flags, ++ offset, len, xdata); ++ break; ++ case GF_FOP_ZEROFILL: ++ STACK_WIND_TAIL(frame, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->zerofill, fd, offset, ++ len, xdata); ++ break; ++ case GF_FOP_DISCARD: ++ STACK_WIND_TAIL(frame, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->discard, fd, offset, ++ len, xdata); ++ break; ++ default: ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "Invalid fop id = %d", fop); ++ break; ++ } ++ return 0; ++ } ++ ++ if (!this->itable) ++ this->itable = fd->inode->table; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto out; ++ ++ frame->local = local; ++ ++ ret = syncbarrier_init(&local->barrier); ++ if (ret) ++ goto out; ++ local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto out; ++ ++ if (vector) { ++ local->vector = iov_dup(vector, count); ++ if (!local->vector) ++ goto out; ++ for (i = 0; i < count; i++) ++ local->total_size += vector[i].iov_len; ++ local->count = count; ++ } else { ++ local->total_size = len; ++ } ++ ++ local->fop = fop; ++ local->offset = offset; ++ local->flags = flags; ++ if (iobref) ++ local->iobref = iobref_ref(iobref); ++ local->fd = fd_ref(fd); ++ local->block_size = block_size; ++ local->resolver_base_inode = local->fd->inode; ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ ++ local->loc.inode = inode_ref(fd->inode); ++ gf_uuid_copy(local->loc.gfid, fd->inode->gfid); ++ ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_common_inode_write_post_lookup_handler); ++ return 0; + out: +- shard_common_failure_unwind(fop, frame, -1, ENOMEM); +- return 0; ++ shard_common_failure_unwind(fop, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, +- struct iovec *vector, int32_t count, off_t offset, +- uint32_t flags, struct iobref *iobref, dict_t *xdata) { +- shard_common_inode_write_begin(frame, this, GF_FOP_WRITE, fd, vector, count, +- offset, flags, 0, iobref, xdata); +- return 0; ++int ++shard_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ struct iovec *vector, int32_t count, off_t offset, uint32_t flags, ++ struct iobref *iobref, dict_t *xdata) ++{ ++ shard_common_inode_write_begin(frame, this, GF_FOP_WRITE, fd, vector, count, ++ offset, flags, 0, iobref, xdata); ++ return 0; + } + +-int shard_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, +- int32_t keep_size, off_t offset, size_t len, +- dict_t *xdata) { +- if ((keep_size != 0) && (keep_size != FALLOC_FL_ZERO_RANGE) && +- (keep_size != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))) +- goto out; ++int ++shard_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ int32_t keep_size, off_t offset, size_t len, dict_t *xdata) ++{ ++ if ((keep_size != 0) && (keep_size != FALLOC_FL_ZERO_RANGE) && ++ (keep_size != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))) ++ goto out; + +- shard_common_inode_write_begin(frame, this, GF_FOP_FALLOCATE, fd, NULL, 0, +- offset, keep_size, len, NULL, xdata); +- return 0; ++ shard_common_inode_write_begin(frame, this, GF_FOP_FALLOCATE, fd, NULL, 0, ++ offset, keep_size, len, NULL, xdata); ++ return 0; + out: +- shard_common_failure_unwind(GF_FOP_FALLOCATE, frame, -1, ENOTSUP); +- return 0; ++ shard_common_failure_unwind(GF_FOP_FALLOCATE, frame, -1, ENOTSUP); ++ return 0; + } + +-int shard_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +- off_t len, dict_t *xdata) { +- shard_common_inode_write_begin(frame, this, GF_FOP_ZEROFILL, fd, NULL, 0, +- offset, 0, len, NULL, xdata); +- return 0; ++int ++shard_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ off_t len, dict_t *xdata) ++{ ++ shard_common_inode_write_begin(frame, this, GF_FOP_ZEROFILL, fd, NULL, 0, ++ offset, 0, len, NULL, xdata); ++ return 0; + } + +-int shard_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +- size_t len, dict_t *xdata) { +- shard_common_inode_write_begin(frame, this, GF_FOP_DISCARD, fd, NULL, 0, +- offset, 0, len, NULL, xdata); +- return 0; ++int ++shard_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ size_t len, dict_t *xdata) ++{ ++ shard_common_inode_write_begin(frame, this, GF_FOP_DISCARD, fd, NULL, 0, ++ offset, 0, len, NULL, xdata); ++ return 0; 
+ } + +-int32_t shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +- gf_seek_what_t what, dict_t *xdata) { +- /* TBD */ +- gf_msg(this->name, GF_LOG_INFO, ENOTSUP, SHARD_MSG_FOP_NOT_SUPPORTED, +- "seek called on %s.", uuid_utoa(fd->inode->gfid)); +- shard_common_failure_unwind(GF_FOP_SEEK, frame, -1, ENOTSUP); +- return 0; ++int32_t ++shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ gf_seek_what_t what, dict_t *xdata) ++{ ++ /* TBD */ ++ gf_msg(this->name, GF_LOG_INFO, ENOTSUP, SHARD_MSG_FOP_NOT_SUPPORTED, ++ "seek called on %s.", uuid_utoa(fd->inode->gfid)); ++ shard_common_failure_unwind(GF_FOP_SEEK, frame, -1, ENOTSUP); ++ return 0; + } + +-int32_t mem_acct_init(xlator_t *this) { +- int ret = -1; ++int32_t ++mem_acct_init(xlator_t *this) ++{ ++ int ret = -1; + +- if (!this) +- return ret; ++ if (!this) ++ return ret; + +- ret = xlator_mem_acct_init(this, gf_shard_mt_end + 1); ++ ret = xlator_mem_acct_init(this, gf_shard_mt_end + 1); + +- if (ret != 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_MEM_ACCT_INIT_FAILED, +- "Memory accounting init" +- "failed"); +- return ret; +- } ++ if (ret != 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_MEM_ACCT_INIT_FAILED, ++ "Memory accounting init" ++ "failed"); ++ return ret; ++ } + +- return ret; ++ return ret; + } + +-int init(xlator_t *this) { +- int ret = -1; +- shard_priv_t *priv = NULL; ++int ++init(xlator_t *this) ++{ ++ int ret = -1; ++ shard_priv_t *priv = NULL; ++ ++ if (!this) { ++ gf_msg("shard", GF_LOG_ERROR, 0, SHARD_MSG_NULL_THIS, ++ "this is NULL. init() failed"); ++ return -1; ++ } + +- if (!this) { +- gf_msg("shard", GF_LOG_ERROR, 0, SHARD_MSG_NULL_THIS, +- "this is NULL. init() failed"); +- return -1; +- } +- +- if (!this->parents) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, +- "Dangling volume. Check volfile"); +- goto out; +- } +- +- if (!this->children || this->children->next) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, +- "shard not configured with exactly one sub-volume. " +- "Check volfile"); +- goto out; +- } +- +- priv = GF_CALLOC(1, sizeof(shard_priv_t), gf_shard_mt_priv_t); +- if (!priv) +- goto out; +- +- GF_OPTION_INIT("shard-block-size", priv->block_size, size_uint64, out); +- +- GF_OPTION_INIT("shard-deletion-rate", priv->deletion_rate, uint32, out); +- +- GF_OPTION_INIT("shard-lru-limit", priv->lru_limit, uint64, out); +- +- this->local_pool = mem_pool_new(shard_local_t, 128); +- if (!this->local_pool) { +- ret = -1; +- goto out; +- } +- gf_uuid_parse(SHARD_ROOT_GFID, priv->dot_shard_gfid); +- gf_uuid_parse(DOT_SHARD_REMOVE_ME_GFID, priv->dot_shard_rm_gfid); +- +- this->private = priv; +- LOCK_INIT(&priv->lock); +- INIT_LIST_HEAD(&priv->ilist_head); +- ret = 0; ++ if (!this->parents) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, ++ "Dangling volume. Check volfile"); ++ goto out; ++ } ++ ++ if (!this->children || this->children->next) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, ++ "shard not configured with exactly one sub-volume. 
" ++ "Check volfile"); ++ goto out; ++ } ++ ++ priv = GF_CALLOC(1, sizeof(shard_priv_t), gf_shard_mt_priv_t); ++ if (!priv) ++ goto out; ++ ++ GF_OPTION_INIT("shard-block-size", priv->block_size, size_uint64, out); ++ ++ GF_OPTION_INIT("shard-deletion-rate", priv->deletion_rate, uint32, out); ++ ++ GF_OPTION_INIT("shard-lru-limit", priv->lru_limit, uint64, out); ++ ++ this->local_pool = mem_pool_new(shard_local_t, 128); ++ if (!this->local_pool) { ++ ret = -1; ++ goto out; ++ } ++ gf_uuid_parse(SHARD_ROOT_GFID, priv->dot_shard_gfid); ++ gf_uuid_parse(DOT_SHARD_REMOVE_ME_GFID, priv->dot_shard_rm_gfid); ++ ++ this->private = priv; ++ LOCK_INIT(&priv->lock); ++ INIT_LIST_HEAD(&priv->ilist_head); ++ ret = 0; + out: +- if (ret) { +- GF_FREE(priv); +- mem_pool_destroy(this->local_pool); +- } ++ if (ret) { ++ GF_FREE(priv); ++ mem_pool_destroy(this->local_pool); ++ } + +- return ret; ++ return ret; + } + +-void fini(xlator_t *this) { +- shard_priv_t *priv = NULL; ++void ++fini(xlator_t *this) ++{ ++ shard_priv_t *priv = NULL; + +- GF_VALIDATE_OR_GOTO("shard", this, out); ++ GF_VALIDATE_OR_GOTO("shard", this, out); + +- mem_pool_destroy(this->local_pool); +- this->local_pool = NULL; ++ mem_pool_destroy(this->local_pool); ++ this->local_pool = NULL; + +- priv = this->private; +- if (!priv) +- goto out; ++ priv = this->private; ++ if (!priv) ++ goto out; + +- this->private = NULL; +- LOCK_DESTROY(&priv->lock); +- GF_FREE(priv); ++ this->private = NULL; ++ LOCK_DESTROY(&priv->lock); ++ GF_FREE(priv); + + out: +- return; ++ return; + } + +-int reconfigure(xlator_t *this, dict_t *options) { +- int ret = -1; +- shard_priv_t *priv = NULL; ++int ++reconfigure(xlator_t *this, dict_t *options) ++{ ++ int ret = -1; ++ shard_priv_t *priv = NULL; + +- priv = this->private; ++ priv = this->private; + +- GF_OPTION_RECONF("shard-block-size", priv->block_size, options, size, out); ++ GF_OPTION_RECONF("shard-block-size", priv->block_size, options, size, out); + +- GF_OPTION_RECONF("shard-deletion-rate", priv->deletion_rate, options, uint32, +- out); +- ret = 0; ++ GF_OPTION_RECONF("shard-deletion-rate", priv->deletion_rate, options, ++ uint32, out); ++ ret = 0; + + out: +- return ret; ++ return ret; + } + +-int shard_forget(xlator_t *this, inode_t *inode) { +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; +- shard_priv_t *priv = NULL; ++int ++shard_forget(xlator_t *this, inode_t *inode) ++{ ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_priv_t *priv = NULL; + +- priv = this->private; +- if (!priv) +- return 0; ++ priv = this->private; ++ if (!priv) ++ return 0; + +- inode_ctx_del(inode, this, &ctx_uint); +- if (!ctx_uint) +- return 0; ++ inode_ctx_del(inode, this, &ctx_uint); ++ if (!ctx_uint) ++ return 0; + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- /* When LRU limit reaches inode will be forcefully removed from the +- * table, inode needs to be removed from LRU of shard as well. +- */ +- if (!list_empty(&ctx->ilist)) { +- LOCK(&priv->lock); +- { +- list_del_init(&ctx->ilist); +- priv->inode_count--; ++ /* When LRU limit reaches inode will be forcefully removed from the ++ * table, inode needs to be removed from LRU of shard as well. 
++ */ ++ if (!list_empty(&ctx->ilist)) { ++ LOCK(&priv->lock); ++ { ++ list_del_init(&ctx->ilist); ++ priv->inode_count--; ++ } ++ UNLOCK(&priv->lock); + } +- UNLOCK(&priv->lock); +- } +- GF_FREE(ctx); ++ GF_FREE(ctx); + +- return 0; ++ return 0; + } + +-int shard_release(xlator_t *this, fd_t *fd) { +- /* TBD */ +- return 0; ++int ++shard_release(xlator_t *this, fd_t *fd) ++{ ++ /* TBD */ ++ return 0; + } + +-int shard_priv_dump(xlator_t *this) { +- shard_priv_t *priv = NULL; +- char key_prefix[GF_DUMP_MAX_BUF_LEN] = { +- 0, +- }; +- char *str = NULL; ++int ++shard_priv_dump(xlator_t *this) ++{ ++ shard_priv_t *priv = NULL; ++ char key_prefix[GF_DUMP_MAX_BUF_LEN] = { ++ 0, ++ }; ++ char *str = NULL; + +- priv = this->private; ++ priv = this->private; + +- snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); +- gf_proc_dump_add_section("%s", key_prefix); +- str = gf_uint64_2human_readable(priv->block_size); +- gf_proc_dump_write("shard-block-size", "%s", str); +- gf_proc_dump_write("inode-count", "%d", priv->inode_count); +- gf_proc_dump_write("ilist_head", "%p", &priv->ilist_head); +- gf_proc_dump_write("lru-max-limit", "%" PRIu64, priv->lru_limit); ++ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); ++ gf_proc_dump_add_section("%s", key_prefix); ++ str = gf_uint64_2human_readable(priv->block_size); ++ gf_proc_dump_write("shard-block-size", "%s", str); ++ gf_proc_dump_write("inode-count", "%d", priv->inode_count); ++ gf_proc_dump_write("ilist_head", "%p", &priv->ilist_head); ++ gf_proc_dump_write("lru-max-limit", "%" PRIu64, priv->lru_limit); + +- GF_FREE(str); ++ GF_FREE(str); + +- return 0; ++ return 0; + } + +-int shard_releasedir(xlator_t *this, fd_t *fd) { return 0; } ++int ++shard_releasedir(xlator_t *this, fd_t *fd) ++{ ++ return 0; ++} + + struct xlator_fops fops = { + .lookup = shard_lookup, +-- +1.8.3.1 + diff --git a/SOURCES/0563-features-shard-Use-fd-lookup-post-file-open.patch b/SOURCES/0563-features-shard-Use-fd-lookup-post-file-open.patch new file mode 100644 index 0000000..c680f92 --- /dev/null +++ b/SOURCES/0563-features-shard-Use-fd-lookup-post-file-open.patch @@ -0,0 +1,318 @@ +From a19fa252942938a308ffa655fca3814d0660c6e2 Mon Sep 17 00:00:00 2001 +From: Vinayakswami Hariharmath +Date: Wed, 3 Jun 2020 18:58:56 +0530 +Subject: [PATCH 563/584] features/shard: Use fd lookup post file open + +Issue: +When a process has the open fd and the same file is +unlinked in middle of the operations, then file based +lookup fails with ENOENT or stale file + +Solution: +When the file already open and fd is available, use fstat +to get the file attributes + +Backport of: +> Upstream-patch: https://review.gluster.org/#/c/glusterfs/+/24528/ +> Change-Id: I0e83aee9f11b616dcfe13769ebfcda6742e4e0f4 +> Fixes: #1281 +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1925425 +Change-Id: I0e83aee9f11b616dcfe13769ebfcda6742e4e0f4 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244957 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/issue-1281.t | 34 +++++++++++ + xlators/features/shard/src/shard.c | 119 +++++++++++++++++++++++-------------- + 2 files changed, 110 insertions(+), 43 deletions(-) + create mode 100644 tests/bugs/shard/issue-1281.t + +diff --git a/tests/bugs/shard/issue-1281.t b/tests/bugs/shard/issue-1281.t +new file mode 100644 +index 0000000..9704caa +--- /dev/null ++++ b/tests/bugs/shard/issue-1281.t +@@ -0,0 +1,34 
@@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++ ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++#Open a file and store descriptor in fd = 5 ++exec 5>$M0/foo ++ ++#Unlink the same file which is opened in prev step ++TEST unlink $M0/foo ++ ++#Write something on the file using the open fd = 5 ++echo "issue-1281" >&5 ++ ++#Write on the descriptor should be succesful ++EXPECT 0 echo $? ++ ++#Close the fd = 5 ++exec 5>&- ++ ++cleanup +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index c5cc224..2ba4528 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -1653,26 +1653,24 @@ err: + } + + int +-shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) ++shard_set_iattr_invoke_post_handler(call_frame_t *frame, xlator_t *this, ++ inode_t *inode, int32_t op_ret, ++ int32_t op_errno, struct iatt *buf, ++ dict_t *xdata) + { + int ret = -1; + int32_t mask = SHARD_INODE_WRITE_MASK; +- shard_local_t *local = NULL; ++ shard_local_t *local = frame->local; + shard_inode_ctx_t ctx = { + 0, + }; + +- local = frame->local; +- + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_BASE_FILE_LOOKUP_FAILED, + "Lookup on base file" + " failed : %s", +- loc_gfid_utoa(&(local->loc))); ++ uuid_utoa(inode->gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; +@@ -1706,18 +1704,57 @@ unwind: + } + + int +-shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, +- shard_post_fop_handler_t handler) ++shard_fstat_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *buf, ++ dict_t *xdata) ++{ ++ shard_local_t *local = frame->local; ++ ++ shard_set_iattr_invoke_post_handler(frame, this, local->fd->inode, op_ret, ++ op_errno, buf, xdata); ++ return 0; ++} ++ ++int ++shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ /* In case of op_ret < 0, inode passed to this function will be NULL ++ ex: in case of op_errno = ENOENT. So refer prefilled inode data ++ which is part of local. ++ Note: Reassigning/overriding the inode passed to this cbk with inode ++ which is part of *struct shard_local_t* won't cause any issue as ++ both inodes have same reference/address as of the inode passed */ ++ inode = ((shard_local_t *)frame->local)->loc.inode; ++ ++ shard_set_iattr_invoke_post_handler(frame, this, inode, op_ret, op_errno, ++ buf, xdata); ++ return 0; ++} ++ ++/* This function decides whether to make file based lookup or ++ * fd based lookup (fstat) depending on the 3rd and 4th arg. ++ * If fd != NULL and loc == NULL then call is for fstat ++ * If fd == NULL and loc != NULL then call is for file based ++ * lookup. Please pass args based on the requirement. 
++ */ ++int ++shard_refresh_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, ++ fd_t *fd, shard_post_fop_handler_t handler) + { + int ret = -1; ++ inode_t *inode = NULL; + shard_local_t *local = NULL; + dict_t *xattr_req = NULL; + gf_boolean_t need_refresh = _gf_false; + + local = frame->local; + local->handler = handler; ++ inode = fd ? fd->inode : loc->inode; + +- ret = shard_inode_ctx_fill_iatt_from_cache(loc->inode, this, &local->prebuf, ++ ret = shard_inode_ctx_fill_iatt_from_cache(inode, this, &local->prebuf, + &need_refresh); + /* By this time, inode ctx should have been created either in create, + * mknod, readdirp or lookup. If not it is a bug! +@@ -1726,7 +1763,7 @@ shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_msg_debug(this->name, 0, + "Skipping lookup on base file: %s" + "Serving prebuf off the inode ctx cache", +- uuid_utoa(loc->gfid)); ++ uuid_utoa(inode->gfid)); + goto out; + } + +@@ -1737,10 +1774,14 @@ shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, + goto out; + } + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, loc->gfid, local, out); ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, inode->gfid, local, out); + +- STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, loc, xattr_req); ++ if (fd) ++ STACK_WIND(frame, shard_fstat_base_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fstat, fd, xattr_req); ++ else ++ STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + + dict_unref(xattr_req); + return 0; +@@ -2718,8 +2759,8 @@ shard_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + local->resolver_base_inode = loc->inode; + GF_ATOMIC_INIT(local->delta_blocks, 0); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_truncate_handler); ++ shard_refresh_base_file(frame, this, &local->loc, NULL, ++ shard_post_lookup_truncate_handler); + return 0; + + err: +@@ -2774,8 +2815,8 @@ shard_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + local->resolver_base_inode = fd->inode; + GF_ATOMIC_INIT(local->delta_blocks, 0); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_truncate_handler); ++ shard_refresh_base_file(frame, this, NULL, fd, ++ shard_post_lookup_truncate_handler); + return 0; + err: + shard_common_failure_unwind(GF_FOP_FTRUNCATE, frame, -1, ENOMEM); +@@ -2919,8 +2960,8 @@ shard_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + if (!local->xattr_req) + goto err; + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_link_handler); ++ shard_refresh_base_file(frame, this, &local->loc, NULL, ++ shard_post_lookup_link_handler); + return 0; + err: + shard_common_failure_unwind(GF_FOP_LINK, frame, -1, ENOMEM); +@@ -4249,8 +4290,8 @@ shard_post_inodelk_fop_handler(call_frame_t *frame, xlator_t *this) + switch (local->fop) { + case GF_FOP_UNLINK: + case GF_FOP_RENAME: +- shard_lookup_base_file(frame, this, &local->int_inodelk.loc, +- shard_post_lookup_base_shard_rm_handler); ++ shard_refresh_base_file(frame, this, &local->int_inodelk.loc, NULL, ++ shard_post_lookup_base_shard_rm_handler); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +@@ -4505,8 +4546,8 @@ shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + if (local->block_size) { + local->tmp_loc.inode = inode_new(this->itable); + 
gf_uuid_copy(local->tmp_loc.gfid, (local->loc.inode)->gfid); +- shard_lookup_base_file(frame, this, &local->tmp_loc, +- shard_post_rename_lookup_handler); ++ shard_refresh_base_file(frame, this, &local->tmp_loc, NULL, ++ shard_post_rename_lookup_handler); + } else { + shard_rename_cbk(frame, this); + } +@@ -5242,8 +5283,8 @@ shard_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_readv_handler); ++ shard_refresh_base_file(frame, this, NULL, fd, ++ shard_post_lookup_readv_handler); + return 0; + err: + shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); +@@ -6046,8 +6087,8 @@ shard_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_fsync_handler); ++ shard_refresh_base_file(frame, this, NULL, fd, ++ shard_post_lookup_fsync_handler); + return 0; + err: + shard_common_failure_unwind(GF_FOP_FSYNC, frame, -1, ENOMEM); +@@ -6420,12 +6461,8 @@ shard_common_remove_xattr(call_frame_t *frame, xlator_t *this, + if (xdata) + local->xattr_req = dict_ref(xdata); + +- /* To-Do: Switch from LOOKUP which is path-based, to FSTAT if the fop is +- * on an fd. This comes under a generic class of bugs in shard tracked by +- * bz #1782428. +- */ +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_remove_xattr_handler); ++ shard_refresh_base_file(frame, this, loc, fd, ++ shard_post_lookup_remove_xattr_handler); + return 0; + err: + shard_common_failure_unwind(fop, frame, -1, op_errno); +@@ -6662,12 +6699,8 @@ shard_common_set_xattr(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + if (xdata) + local->xattr_rsp = dict_ref(xdata); + +- /* To-Do: Switch from LOOKUP which is path-based, to FSTAT if the fop is +- * on an fd. This comes under a generic class of bugs in shard tracked by +- * bz #1782428. +- */ +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_set_xattr_handler); ++ shard_refresh_base_file(frame, this, loc, fd, ++ shard_post_lookup_set_xattr_handler); + return 0; + err: + shard_common_failure_unwind(fop, frame, -1, op_errno); +@@ -6951,8 +6984,8 @@ shard_common_inode_write_begin(call_frame_t *frame, xlator_t *this, + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_common_inode_write_post_lookup_handler); ++ shard_refresh_base_file(frame, this, NULL, fd, ++ shard_common_inode_write_post_lookup_handler); + return 0; + out: + shard_common_failure_unwind(fop, frame, -1, ENOMEM); +-- +1.8.3.1 + diff --git a/SOURCES/0564-store.c-glusterd-store.c-remove-sys_stat-calls.patch b/SOURCES/0564-store.c-glusterd-store.c-remove-sys_stat-calls.patch new file mode 100644 index 0000000..35cda2e --- /dev/null +++ b/SOURCES/0564-store.c-glusterd-store.c-remove-sys_stat-calls.patch @@ -0,0 +1,215 @@ +From a7a56c079df2eb0253efdd53e1538656c0ce9095 Mon Sep 17 00:00:00 2001 +From: Yaniv Kaul +Date: Mon, 25 Nov 2019 15:37:46 +0200 +Subject: [PATCH 564/584] store.c/glusterd-store.c: remove sys_stat calls + +Instead of querying for the file size and allocating a char array +according to its size, let's just use a fixed size. 
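+
+A rough sketch of the change (simplified from the
+gf_store_read_and_tokenize() callers below; error handling omitted):
+
+    /* before: stat the file and allocate a buffer on every call */
+    struct stat st;
+    char *scan_str = NULL;
+
+    sys_fstat(handle->fd, &st);
+    scan_str = GF_CALLOC(1, st.st_size + 1, gf_common_mt_char);
+    fgets(scan_str, st.st_size + 1, file);
+    GF_FREE(scan_str);
+
+    /* after: a fixed-size stack buffer, no stat and no allocation */
+    char str[8192];
+
+    fgets(str, 8192, file);
+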
+Those calls are not really needed, and are either expensive or +cached anyway. Since we do dynamic allocation/free, let's just use +a fixed array instead. + +I'll see if there are other sys_stat() calls that are not really +useful and try to eliminate them in separate patches. + +Backport of: +> Upstream-patch: https://review.gluster.org/#/c/glusterfs/+/23752/ +> Change-Id: I76b40e78a52ab38f613fc0cdef4be60e6253bf20 +> updates: bz#1193929 +> Signed-off-by: Yaniv Kaul + +BUG: 1925425 +Change-Id: I76b40e78a52ab38f613fc0cdef4be60e6253bf20 +Signed-off-by: Yaniv Kaul +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244958 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/store.h | 4 +- + libglusterfs/src/store.c | 71 ++++-------------------------- + xlators/mgmt/glusterd/src/glusterd-store.c | 5 +-- + 3 files changed, 12 insertions(+), 68 deletions(-) + +diff --git a/libglusterfs/src/glusterfs/store.h b/libglusterfs/src/glusterfs/store.h +index 3b3a24c..f63bd05 100644 +--- a/libglusterfs/src/glusterfs/store.h ++++ b/libglusterfs/src/glusterfs/store.h +@@ -59,8 +59,8 @@ int32_t + gf_store_unlink_tmppath(gf_store_handle_t *shandle); + + int +-gf_store_read_and_tokenize(FILE *file, char *str, int size, char **iter_key, +- char **iter_val, gf_store_op_errno_t *store_errno); ++gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, ++ gf_store_op_errno_t *store_errno); + + int32_t + gf_store_retrieve_value(gf_store_handle_t *handle, char *key, char **value); +diff --git a/libglusterfs/src/store.c b/libglusterfs/src/store.c +index cdf0aea..fa3649b 100644 +--- a/libglusterfs/src/store.c ++++ b/libglusterfs/src/store.c +@@ -184,8 +184,8 @@ out: + } + + int +-gf_store_read_and_tokenize(FILE *file, char *str, int size, char **iter_key, +- char **iter_val, gf_store_op_errno_t *store_errno) ++gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, ++ gf_store_op_errno_t *store_errno) + { + int32_t ret = -1; + char *savetok = NULL; +@@ -193,15 +193,15 @@ gf_store_read_and_tokenize(FILE *file, char *str, int size, char **iter_key, + char *value = NULL; + char *temp = NULL; + size_t str_len = 0; ++ char str[8192]; + + GF_ASSERT(file); +- GF_ASSERT(str); + GF_ASSERT(iter_key); + GF_ASSERT(iter_val); + GF_ASSERT(store_errno); + + retry: +- temp = fgets(str, size, file); ++ temp = fgets(str, 8192, file); + if (temp == NULL || feof(file)) { + ret = -1; + *store_errno = GD_STORE_EOF; +@@ -241,13 +241,8 @@ int32_t + gf_store_retrieve_value(gf_store_handle_t *handle, char *key, char **value) + { + int32_t ret = -1; +- char *scan_str = NULL; + char *iter_key = NULL; + char *iter_val = NULL; +- char *free_str = NULL; +- struct stat st = { +- 0, +- }; + gf_store_op_errno_t store_errno = GD_STORE_SUCCESS; + + GF_ASSERT(handle); +@@ -279,32 +274,9 @@ gf_store_retrieve_value(gf_store_handle_t *handle, char *key, char **value) + } else { + fseek(handle->read, 0, SEEK_SET); + } +- ret = sys_fstat(handle->fd, &st); +- if (ret < 0) { +- gf_msg("", GF_LOG_WARNING, errno, LG_MSG_FILE_OP_FAILED, +- "stat on file %s failed", handle->path); +- ret = -1; +- store_errno = GD_STORE_STAT_FAILED; +- goto out; +- } +- +- /* "st.st_size + 1" is used as we are fetching each +- * line of a file using fgets, fgets will append "\0" +- * to the end of the string +- */ +- scan_str = GF_CALLOC(1, st.st_size + 1, gf_common_mt_char); +- +- if (scan_str == NULL) { +- ret = -1; +- store_errno = GD_STORE_ENOMEM; +- goto out; +- } 
+- +- free_str = scan_str; +- + do { +- ret = gf_store_read_and_tokenize(handle->read, scan_str, st.st_size + 1, +- &iter_key, &iter_val, &store_errno); ++ ret = gf_store_read_and_tokenize(handle->read, &iter_key, &iter_val, ++ &store_errno); + if (ret < 0) { + gf_msg_trace("", 0, + "error while reading key '%s': " +@@ -334,8 +306,6 @@ out: + sys_close(handle->fd); + } + +- GF_FREE(free_str); +- + return ret; + } + +@@ -561,40 +531,16 @@ gf_store_iter_get_next(gf_store_iter_t *iter, char **key, char **value, + gf_store_op_errno_t *op_errno) + { + int32_t ret = -1; +- char *scan_str = NULL; + char *iter_key = NULL; + char *iter_val = NULL; +- struct stat st = { +- 0, +- }; + gf_store_op_errno_t store_errno = GD_STORE_SUCCESS; + + GF_ASSERT(iter); + GF_ASSERT(key); + GF_ASSERT(value); + +- ret = sys_stat(iter->filepath, &st); +- if (ret < 0) { +- gf_msg("", GF_LOG_WARNING, errno, LG_MSG_FILE_OP_FAILED, +- "stat on file failed"); +- ret = -1; +- store_errno = GD_STORE_STAT_FAILED; +- goto out; +- } +- +- /* "st.st_size + 1" is used as we are fetching each +- * line of a file using fgets, fgets will append "\0" +- * to the end of the string +- */ +- scan_str = GF_CALLOC(1, st.st_size + 1, gf_common_mt_char); +- if (!scan_str) { +- ret = -1; +- store_errno = GD_STORE_ENOMEM; +- goto out; +- } +- +- ret = gf_store_read_and_tokenize(iter->file, scan_str, st.st_size + 1, +- &iter_key, &iter_val, &store_errno); ++ ret = gf_store_read_and_tokenize(iter->file, &iter_key, &iter_val, ++ &store_errno); + if (ret < 0) { + goto out; + } +@@ -619,7 +565,6 @@ gf_store_iter_get_next(gf_store_iter_t *iter, char **key, char **value, + ret = 0; + + out: +- GF_FREE(scan_str); + if (ret) { + GF_FREE(*key); + GF_FREE(*value); +diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c +index 4fa8116..da63c03 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-store.c ++++ b/xlators/mgmt/glusterd/src/glusterd-store.c +@@ -4092,7 +4092,6 @@ out: + int32_t + glusterd_store_retrieve_missed_snaps_list(xlator_t *this) + { +- char buf[PATH_MAX] = ""; + char path[PATH_MAX] = ""; + char *snap_vol_id = NULL; + char *missed_node_info = NULL; +@@ -4129,8 +4128,8 @@ glusterd_store_retrieve_missed_snaps_list(xlator_t *this) + } + + do { +- ret = gf_store_read_and_tokenize( +- fp, buf, sizeof(buf), &missed_node_info, &value, &store_errno); ++ ret = gf_store_read_and_tokenize(fp, &missed_node_info, &value, ++ &store_errno); + if (ret) { + if (store_errno == GD_STORE_EOF) { + gf_msg_debug(this->name, 0, "EOF for missed_snap_list"); +-- +1.8.3.1 + diff --git a/SOURCES/0565-libglusterfs-coverity-pointer-to-local-outside-the-s.patch b/SOURCES/0565-libglusterfs-coverity-pointer-to-local-outside-the-s.patch new file mode 100644 index 0000000..5e91703 --- /dev/null +++ b/SOURCES/0565-libglusterfs-coverity-pointer-to-local-outside-the-s.patch @@ -0,0 +1,124 @@ +From d491843640658e91a77f15647cefd1c00422c731 Mon Sep 17 00:00:00 2001 +From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com> +Date: Wed, 21 Oct 2020 16:14:29 +0530 +Subject: [PATCH 565/584] libglusterfs/coverity: pointer to local outside the + scope + +issue: gf_store_read_and_tokenize() returns the address +of the locally referred string. + +fix: pass the buf to gf_store_read_and_tokenize() and +use it for tokenize. 
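+
+The shape of the bug, reduced to a minimal (hypothetical) example:
+
+    static int
+    get_key(char **iter_key)
+    {
+        char str[8192];          /* storage local to this frame */
+        char *savetok = NULL;
+
+        fgets(str, sizeof(str), stdin);
+        /* pointer into str escapes the scope here */
+        *iter_key = strtok_r(str, "=", &savetok);
+        return 0;
+    }
+
+After the fix the caller owns the buffer and passes it down, so the
+key/value tokens handed back still point into live storage.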
+ +CID: 1430143 + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/1675 +> Updates: #1060 +> Change-Id: Ifc346540c263f58f4014ba2ba8c1d491c20ac609 +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1925425 +Change-Id: Ifc346540c263f58f4014ba2ba8c1d491c20ac609 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244959 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/store.h | 3 ++- + libglusterfs/src/store.c | 13 ++++++++----- + xlators/mgmt/glusterd/src/glusterd-store.c | 3 ++- + 3 files changed, 12 insertions(+), 7 deletions(-) + +diff --git a/libglusterfs/src/glusterfs/store.h b/libglusterfs/src/glusterfs/store.h +index f63bd05..68a20ad 100644 +--- a/libglusterfs/src/glusterfs/store.h ++++ b/libglusterfs/src/glusterfs/store.h +@@ -60,7 +60,8 @@ gf_store_unlink_tmppath(gf_store_handle_t *shandle); + + int + gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, +- gf_store_op_errno_t *store_errno); ++ gf_store_op_errno_t *store_errno, char *str, ++ size_t buf_size); + + int32_t + gf_store_retrieve_value(gf_store_handle_t *handle, char *key, char **value); +diff --git a/libglusterfs/src/store.c b/libglusterfs/src/store.c +index fa3649b..3af627a 100644 +--- a/libglusterfs/src/store.c ++++ b/libglusterfs/src/store.c +@@ -185,7 +185,8 @@ out: + + int + gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, +- gf_store_op_errno_t *store_errno) ++ gf_store_op_errno_t *store_errno, char *str, ++ size_t buf_size) + { + int32_t ret = -1; + char *savetok = NULL; +@@ -193,7 +194,6 @@ gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, + char *value = NULL; + char *temp = NULL; + size_t str_len = 0; +- char str[8192]; + + GF_ASSERT(file); + GF_ASSERT(iter_key); +@@ -201,7 +201,7 @@ gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, + GF_ASSERT(store_errno); + + retry: +- temp = fgets(str, 8192, file); ++ temp = fgets(str, buf_size, file); + if (temp == NULL || feof(file)) { + ret = -1; + *store_errno = GD_STORE_EOF; +@@ -275,8 +275,9 @@ gf_store_retrieve_value(gf_store_handle_t *handle, char *key, char **value) + fseek(handle->read, 0, SEEK_SET); + } + do { ++ char buf[8192]; + ret = gf_store_read_and_tokenize(handle->read, &iter_key, &iter_val, +- &store_errno); ++ &store_errno, buf, 8192); + if (ret < 0) { + gf_msg_trace("", 0, + "error while reading key '%s': " +@@ -533,6 +534,8 @@ gf_store_iter_get_next(gf_store_iter_t *iter, char **key, char **value, + int32_t ret = -1; + char *iter_key = NULL; + char *iter_val = NULL; ++ char buf[8192]; ++ + gf_store_op_errno_t store_errno = GD_STORE_SUCCESS; + + GF_ASSERT(iter); +@@ -540,7 +543,7 @@ gf_store_iter_get_next(gf_store_iter_t *iter, char **key, char **value, + GF_ASSERT(value); + + ret = gf_store_read_and_tokenize(iter->file, &iter_key, &iter_val, +- &store_errno); ++ &store_errno, buf, 8192); + if (ret < 0) { + goto out; + } +diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c +index da63c03..a8651d8 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-store.c ++++ b/xlators/mgmt/glusterd/src/glusterd-store.c +@@ -4128,8 +4128,9 @@ glusterd_store_retrieve_missed_snaps_list(xlator_t *this) + } + + do { ++ char buf[8192]; + ret = gf_store_read_and_tokenize(fp, &missed_node_info, &value, +- &store_errno); ++ &store_errno, buf, 8192); + if (ret) { + if (store_errno == GD_STORE_EOF) 
{ + gf_msg_debug(this->name, 0, "EOF for missed_snap_list"); +-- +1.8.3.1 + diff --git a/SOURCES/0566-enahancement-debug-Option-to-generate-core-dump-with.patch b/SOURCES/0566-enahancement-debug-Option-to-generate-core-dump-with.patch new file mode 100644 index 0000000..548271e --- /dev/null +++ b/SOURCES/0566-enahancement-debug-Option-to-generate-core-dump-with.patch @@ -0,0 +1,236 @@ +From e66ab728426e147bf4fc594109137ebfb1f2dda6 Mon Sep 17 00:00:00 2001 +From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com> +Date: Mon, 23 Nov 2020 08:09:44 +0530 +Subject: [PATCH 566/584] enahancement/debug: Option to generate core dump + without killing the process + +Comments and idea proposed by: Xavi Hernandez(jahernan@redhat.com): + +On production systems sometimes we see a log message saying that an assertion +has failed. But it's hard to track why it failed without additional information +(on debug builds, a GF_ASSERT() generates a core dump and kills the process, +so it can be used to debug the issue, but many times we are only able to +reproduce assertion failures on production systems, where GF_ASSERT() only logs +a message and continues). + +In other cases we may have a core dump caused by a bug, but the core dump doesn't +necessarily happen when the bug has happened. Sometimes the crash happens so much +later that the causes that triggered the bug are lost. In these cases we can add +more assertions to the places that touch the potential candidates to cause the bug, +but the only thing we'll get is a log message, which may not be enough. + +One solution would be to always generate a core dump in case of assertion failure, +but this was already discussed and it was decided that it was too drastic. If a +core dump was really needed, a new macro was created to do so: GF_ABORT(), +but GF_ASSERT() would continue to not kill the process on production systems. + +I'm proposing to modify GF_ASSERT() on production builds so that it conditionally +triggers a signal when a debugger is attached. When this happens, the debugger +will generate a core dump and continue the process as if nothing had happened. +If there's no debugger attached, GF_ASSERT() will behave as always. + +The idea I have is to use SIGCONT to do that. This signal is harmless, so we can +unmask it (we currently mask all unneeded signals) and raise it inside a GF_ASSERT() +when some global variable is set to true. + +To produce the core dump, run the script under extras/debug/gfcore.py on other +terminal. gdb breaks and produces coredump when GF_ASSERT is hit. 
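+
+The runtime side of this is deliberately tiny (sketch of the helper
+added below; the full patch also unmasks SIGCONT in the thread signal
+mask and calls the helper from the GF_ASSERT() macro):
+
+    gf_boolean_t gf_signal_on_assert = false;
+
+    void
+    gf_assert(void)
+    {
+        /* no-op unless a debugger flipped the flag to true */
+        if (gf_signal_on_assert) {
+            raise(SIGCONT);
+        }
+    }
+
+gfcore.py then just attaches gdb, sets gf_signal_on_assert = 1, waits
+for SIGCONT, runs gcore to write the core file and lets the process
+continue.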
+ +The script is copied from #1810 which is written by Xavi Hernandez(jahernan@redhat.com) + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/1814 +> Fixes: #1810 +> Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53 +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1927640 +Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244960 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/debug/gfcore.py | 77 +++++++++++++++++++++++++++++++ + libglusterfs/src/common-utils.c | 11 +++++ + libglusterfs/src/glusterfs/common-utils.h | 10 +++- + libglusterfs/src/libglusterfs.sym | 16 +++++++ + 4 files changed, 112 insertions(+), 2 deletions(-) + create mode 100755 extras/debug/gfcore.py + +diff --git a/extras/debug/gfcore.py b/extras/debug/gfcore.py +new file mode 100755 +index 0000000..9f097f0 +--- /dev/null ++++ b/extras/debug/gfcore.py +@@ -0,0 +1,77 @@ ++#!/usr/bin/env python3 ++ ++def launch(): ++ if len(sys.argv) < 3: ++ sys.stderr.write("Syntax: {} []\n".format(os.path.basename(sys.argv[0]))) ++ sys.exit(1) ++ ++ pid = int(sys.argv[1]) ++ count = int(sys.argv[2]) ++ base = os.getcwd() ++ if len(sys.argv) > 3: ++ base = sys.argv[3] ++ base = os.path.realpath(base) ++ ++ subprocess.run([ ++ "gdb", "-batch", ++ "-p", str(pid), ++ "-ex", "py arg_count = {}".format(count), ++ "-ex", "py arg_dir = '{}'".format(base), ++ "-x", __file__ ++ ]) ++ ++class GFCore(object): ++ def __init__(self, count, base): ++ self.count = count ++ self.base = base ++ gdb.execute('set pagination off') ++ gdb.execute('set gf_signal_on_assert = 1') ++ gdb.events.stop.connect(self.gf_stop) ++ ++ self.cont() ++ ++ def cont(self, quit = False): ++ if not(quit) and (self.count > 0): ++ gdb.execute('continue') ++ else: ++ gdb.execute('set gf_signal_on_assert = 0') ++ gdb.execute('quit') ++ ++ def gf_stop(self, event): ++ quit = False ++ ++ if isinstance(event, gdb.SignalEvent): ++ if event.stop_signal == 'SIGCONT': ++ now = datetime.utcnow().isoformat() ++ pid = gdb.selected_inferior().pid ++ name = "{}/gfcore.{}.{}".format(self.base, pid, now) ++ print("Generating coredump '{}'".format(name)) ++ gdb.execute('gcore {}'.format(name)) ++ self.count -= 1 ++ ++ elif event.stop_signal == 'SIGINT': ++ print("SIGINT received. Exiting") ++ quit = True ++ ++ else: ++ print("Ignoring signal {}".format(event.stop_signal)) ++ else: ++ print("Unexpected event {}".format(type(event))) ++ ++ self.cont(quit) ++ ++# Module 'gdb' is not available when running outside gdb. 
++try: ++ import gdb ++ from datetime import datetime ++ ++ GFCore(arg_count, arg_dir) ++except ModuleNotFoundError: ++ import sys ++ import os ++ import subprocess ++ ++ try: ++ launch() ++ except KeyboardInterrupt: ++ pass +diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c +index 70d5d21..d351b93 100644 +--- a/libglusterfs/src/common-utils.c ++++ b/libglusterfs/src/common-utils.c +@@ -77,9 +77,19 @@ char *vol_type_str[] = { + "Distributed-Disperse", + }; + ++gf_boolean_t gf_signal_on_assert = false; ++ + typedef int32_t (*rw_op_t)(int32_t fd, char *buf, int32_t size); + typedef int32_t (*rwv_op_t)(int32_t fd, const struct iovec *buf, int32_t size); + ++void gf_assert(void) ++{ ++ if (gf_signal_on_assert) { ++ raise(SIGCONT); ++ } ++ ++} ++ + void + gf_xxh64_wrapper(const unsigned char *data, size_t const len, + unsigned long long const seed, char *xxh64) +@@ -4021,6 +4031,7 @@ gf_thread_vcreate(pthread_t *thread, const pthread_attr_t *attr, + sigdelset(&set, SIGSYS); + sigdelset(&set, SIGFPE); + sigdelset(&set, SIGABRT); ++ sigdelset(&set, SIGCONT); + + pthread_sigmask(SIG_BLOCK, &set, &old); + +diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h +index f0a0a41..604afd0 100644 +--- a/libglusterfs/src/glusterfs/common-utils.h ++++ b/libglusterfs/src/glusterfs/common-utils.h +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #ifndef ffsll + #define ffsll(x) __builtin_ffsll(x) +@@ -431,14 +432,19 @@ BIT_VALUE(unsigned char *array, unsigned int index) + #define GF_FILE_CONTENT_REQUESTED(_xattr_req, _content_limit) \ + (dict_get_uint64(_xattr_req, "glusterfs.content", _content_limit) == 0) + ++void gf_assert(void); ++ + #ifdef DEBUG + #define GF_ASSERT(x) assert(x); + #else + #define GF_ASSERT(x) \ + do { \ +- if (!(x)) { \ ++ if (caa_unlikely(!(x))) { \ ++ gf_assert(); \ + gf_msg_callingfn("", GF_LOG_ERROR, 0, LG_MSG_ASSERTION_FAILED, \ +- "Assertion failed: " #x); \ ++ "Assertion failed: To attach gdb and coredump," \ ++ " Run the script under " \ ++ "\"glusterfs/extras/debug/gfcore.py\""); \ + } \ + } while (0) + #endif +diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym +index 0a0862e..9072afa 100644 +--- a/libglusterfs/src/libglusterfs.sym ++++ b/libglusterfs/src/libglusterfs.sym +@@ -1167,3 +1167,19 @@ gf_changelog_register_generic + gf_gfid_generate_from_xxh64 + find_xlator_option_in_cmd_args_t + gf_d_type_from_ia_type ++glusterfs_graph_fini ++glusterfs_process_svc_attach_volfp ++glusterfs_mux_volfile_reconfigure ++glusterfs_process_svc_detach ++mgmt_is_multiplexed_daemon ++xlator_is_cleanup_starting ++gf_nanosleep ++gf_syncfs ++graph_total_client_xlator ++get_xattrs_to_heal ++gf_latency_statedump_and_reset ++gf_latency_new ++gf_latency_reset ++gf_latency_update ++gf_frame_latency_update ++gf_assert +-- +1.8.3.1 + diff --git a/SOURCES/0567-inode-create-inode-outside-locked-region.patch b/SOURCES/0567-inode-create-inode-outside-locked-region.patch new file mode 100644 index 0000000..23d51c4 --- /dev/null +++ b/SOURCES/0567-inode-create-inode-outside-locked-region.patch @@ -0,0 +1,86 @@ +From 5c81d813c8b1f494d31d54c1ab09a3f0153ebfd4 Mon Sep 17 00:00:00 2001 +From: Amar Tumballi +Date: Sat, 9 Feb 2019 13:13:47 +0530 +Subject: [PATCH 567/584] inode: create inode outside locked region + +Only linking of inode to the table, and inserting it in +a list needs to be in locked region. 
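+
+Schematically (condensed from the inode_new() change below):
+
+    /* before: allocation happened inside the critical section */
+    pthread_mutex_lock(&table->lock);
+    inode = __inode_create(table);   /* allocates and links */
+    if (inode)
+        __inode_ref(inode, false);
+    pthread_mutex_unlock(&table->lock);
+
+    /* after: allocate first, take the lock only to link and ref */
+    inode = inode_create(table);     /* no lock held */
+    if (inode) {
+        pthread_mutex_lock(&table->lock);
+        list_add(&inode->list, &table->lru);
+        table->lru_size++;
+        __inode_ref(inode, false);
+        pthread_mutex_unlock(&table->lock);
+    }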
+ +Backport of: +> Upstream-patch: https://review.gluster.org/#/c/glusterfs/+/22183/ +> Updates: bz#1670031 +> Change-Id: I6ea7e956b80cf2765c2233d761909c4bf9c7253c +> Signed-off-by: Amar Tumballi + +BUG: 1927640 +Change-Id: I6ea7e956b80cf2765c2233d761909c4bf9c7253c +Signed-off-by: Amar Tumballi +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244961 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/inode.c | 23 ++++++++++++----------- + 1 file changed, 12 insertions(+), 11 deletions(-) + +diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c +index 98f8ea6..46db04f 100644 +--- a/libglusterfs/src/inode.c ++++ b/libglusterfs/src/inode.c +@@ -620,7 +620,7 @@ out: + } + + static inode_t * +-__inode_create(inode_table_t *table) ++inode_create(inode_table_t *table) + { + inode_t *newi = NULL; + +@@ -647,11 +647,7 @@ __inode_create(inode_table_t *table) + goto out; + } + +- list_add(&newi->list, &table->lru); +- table->lru_size++; +- + out: +- + return newi; + } + +@@ -668,14 +664,16 @@ inode_new(inode_table_t *table) + return NULL; + } + +- pthread_mutex_lock(&table->lock); +- { +- inode = __inode_create(table); +- if (inode != NULL) { ++ inode = inode_create(table); ++ if (inode) { ++ pthread_mutex_lock(&table->lock); ++ { ++ list_add(&inode->list, &table->lru); ++ table->lru_size++; + __inode_ref(inode, false); + } ++ pthread_mutex_unlock(&table->lock); + } +- pthread_mutex_unlock(&table->lock); + + return inode; + } +@@ -1613,7 +1611,10 @@ __inode_table_init_root(inode_table_t *table) + if (!table) + return; + +- root = __inode_create(table); ++ root = inode_create(table); ++ ++ list_add(&root->list, &table->lru); ++ table->lru_size++; + + iatt.ia_gfid[15] = 1; + iatt.ia_ino = 1; +-- +1.8.3.1 + diff --git a/SOURCES/0568-core-tcmu-runner-process-continuous-growing-logs-lru.patch b/SOURCES/0568-core-tcmu-runner-process-continuous-growing-logs-lru.patch new file mode 100644 index 0000000..22c6790 --- /dev/null +++ b/SOURCES/0568-core-tcmu-runner-process-continuous-growing-logs-lru.patch @@ -0,0 +1,131 @@ +From 2640ee56201d320b838909f95608abe07e3ff9b0 Mon Sep 17 00:00:00 2001 +From: mohit84 +Date: Tue, 24 Nov 2020 15:29:58 +0530 +Subject: [PATCH 568/584] core: tcmu-runner process continuous growing logs + lru_size showing -1 + +* core: tcmu-runner process continuous growing logs lru_size showing -1 + +At the time of calling inode_table_prune it checks if current lru_size +is greater than lru_limit but lru_list is empty it throws a log message +"Empty inode lru list found but with (%d) lru_size".As per code reading +it seems lru_size is out of sync with the actual number of inodes in +lru_list. Due to throwing continuous error messages entire disk is +getting full and the user has to restart the tcmu-runner process to use +the volumes.The log message was introduce by a patch +https://review.gluster.org/#/c/glusterfs/+/15087/. + +Solution: Introduce a flag in_lru_list to take decision about inode is + being part of lru_list or not. 
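+
+The flag keeps the counter and the actual list membership in
+lock-step (sketch; the patch applies this pairing at every lru add
+and remove site in inode.c):
+
+    /* adding an inode to the lru list */
+    GF_ASSERT(!inode->in_lru_list);
+    list_add(&inode->list, &table->lru);
+    table->lru_size++;
+    inode->in_lru_list = _gf_true;
+
+    /* removing it again */
+    GF_ASSERT(table->lru_size > 0);
+    GF_ASSERT(inode->in_lru_list);
+    table->lru_size--;
+    inode->in_lru_list = _gf_false;
+
+A path that decrements lru_size for an inode that was never on the
+list now trips a GF_ASSERT instead of silently driving the counter
+negative.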
+ +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/1776 +> Fixes: #1775 +> Change-Id: I4b836bebf4b5db65fbf88ff41c6c88f4a7ac55c1 +> Signed-off-by: Mohit Agrawal + +BUG: 1927640 +Change-Id: I4b836bebf4b5db65fbf88ff41c6c88f4a7ac55c1 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244962 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/inode.h | 1 + + libglusterfs/src/inode.c | 14 ++++++++++++++ + 2 files changed, 15 insertions(+) + +diff --git a/libglusterfs/src/glusterfs/inode.h b/libglusterfs/src/glusterfs/inode.h +index 62c093d..17d0340 100644 +--- a/libglusterfs/src/glusterfs/inode.h ++++ b/libglusterfs/src/glusterfs/inode.h +@@ -110,6 +110,7 @@ struct _inode { + struct _inode_ctx *_ctx; /* replacement for dict_t *(inode->ctx) */ + bool in_invalidate_list; /* Set if inode is in table invalidate list */ + bool invalidate_sent; /* Set it if invalidator_fn is called for inode */ ++ bool in_lru_list; /* Set if inode is in table lru list */ + }; + + #define UUID0_STR "00000000-0000-0000-0000-000000000000" +diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c +index 46db04f..8e91197 100644 +--- a/libglusterfs/src/inode.c ++++ b/libglusterfs/src/inode.c +@@ -417,8 +417,10 @@ __inode_passivate(inode_t *inode) + dentry_t *dentry = NULL; + dentry_t *t = NULL; + ++ GF_ASSERT(!inode->in_lru_list); + list_move_tail(&inode->list, &inode->table->lru); + inode->table->lru_size++; ++ inode->in_lru_list = _gf_true; + + list_for_each_entry_safe(dentry, t, &inode->dentry_list, inode_list) + { +@@ -531,7 +533,10 @@ __inode_ref(inode_t *inode, bool is_invalidate) + inode->in_invalidate_list = false; + inode->table->invalidate_size--; + } else { ++ GF_ASSERT(inode->table->lru_size > 0); ++ GF_ASSERT(inode->in_lru_list); + inode->table->lru_size--; ++ inode->in_lru_list = _gf_false; + } + if (is_invalidate) { + inode->in_invalidate_list = true; +@@ -670,6 +675,8 @@ inode_new(inode_table_t *table) + { + list_add(&inode->list, &table->lru); + table->lru_size++; ++ GF_ASSERT(!inode->in_lru_list); ++ inode->in_lru_list = _gf_true; + __inode_ref(inode, false); + } + pthread_mutex_unlock(&table->lock); +@@ -1533,6 +1540,7 @@ inode_table_prune(inode_table_t *table) + lru_size = table->lru_size; + while (lru_size > (table->lru_limit)) { + if (list_empty(&table->lru)) { ++ GF_ASSERT(0); + gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, + LG_MSG_INVALID_INODE_LIST, + "Empty inode lru list found" +@@ -1543,6 +1551,7 @@ inode_table_prune(inode_table_t *table) + + lru_size--; + entry = list_entry(table->lru.next, inode_t, list); ++ GF_ASSERT(entry->in_lru_list); + /* The logic of invalidation is required only if invalidator_fn + is present */ + if (table->invalidator_fn) { +@@ -1560,6 +1569,7 @@ inode_table_prune(inode_table_t *table) + } + + table->lru_size--; ++ entry->in_lru_list = _gf_false; + __inode_retire(entry); + ret++; + } +@@ -1615,6 +1625,7 @@ __inode_table_init_root(inode_table_t *table) + + list_add(&root->list, &table->lru); + table->lru_size++; ++ root->in_lru_list = _gf_true; + + iatt.ia_gfid[15] = 1; + iatt.ia_ino = 1; +@@ -1873,8 +1884,11 @@ inode_table_destroy(inode_table_t *inode_table) + while (!list_empty(&inode_table->lru)) { + trav = list_first_entry(&inode_table->lru, inode_t, list); + inode_forget_atomic(trav, 0); ++ GF_ASSERT(inode_table->lru_size > 0); ++ GF_ASSERT(trav->in_lru_list); + __inode_retire(trav); + inode_table->lru_size--; ++ 
trav->in_lru_list = _gf_false; + } + + /* Same logic for invalidate list */ +-- +1.8.3.1 + diff --git a/SOURCES/0569-features-shard-optimization-over-shard-lookup-in-cas.patch b/SOURCES/0569-features-shard-optimization-over-shard-lookup-in-cas.patch new file mode 100644 index 0000000..fff8223 --- /dev/null +++ b/SOURCES/0569-features-shard-optimization-over-shard-lookup-in-cas.patch @@ -0,0 +1,200 @@ +From 1b86a4bda540ff4cf307c7f38d3041318636ecb7 Mon Sep 17 00:00:00 2001 +From: Vinayakswami Hariharmath +Date: Thu, 6 Aug 2020 14:39:59 +0530 +Subject: [PATCH 569/584] features/shard: optimization over shard lookup in + case of prealloc + +Assume that we are preallocating a VM of size 1TB with a shard +block size of 64MB then there will be ~16k shards. + +This creation happens in 2 steps shard_fallocate() path i.e + +1. lookup for the shards if any already present and +2. mknod over those shards do not exist. + +But in case of fresh creation, we dont have to lookup for all +shards which are not present as the the file size will be 0. +Through this, we can save lookup on all shards which are not +present. This optimization is quite useful in the case of +preallocating big vm. + +Also if the file is already present and the call is to +extend it to bigger size then we need not to lookup for non- +existent shards. Just lookup preexisting shards, populate +the inodes and issue mknod on extended size. + +Backport of: +> Upstream-patch: https://review.gluster.org/#/c/glusterfs/+/24813/ +> Fixes: #1425 +> Change-Id: I60036fe8302c696e0ca80ff11ab0ef5bcdbd7880 +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1925425 +Change-Id: I60036fe8302c696e0ca80ff11ab0ef5bcdbd7880 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244963 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/issue-1425.t | 45 +++++++++++++++++++++++++++++++++++++ + xlators/features/shard/src/shard.c | 46 ++++++++++++++++++++++++++++++++------ + 2 files changed, 84 insertions(+), 7 deletions(-) + create mode 100644 tests/bugs/shard/issue-1425.t + +diff --git a/tests/bugs/shard/issue-1425.t b/tests/bugs/shard/issue-1425.t +new file mode 100644 +index 0000000..bbe82c0 +--- /dev/null ++++ b/tests/bugs/shard/issue-1425.t +@@ -0,0 +1,45 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup; ++ ++FILE_COUNT_TIME=5 ++ ++function get_file_count { ++ ls $1* | wc -l ++} ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/${V0}0 ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume start $V0 ++TEST $CLI volume profile $V0 start ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++TEST fallocate -l 20M $M0/foo ++gfid_new=$(get_gfid_string $M0/foo) ++ ++# Check for the base shard ++TEST stat $M0/foo ++TEST stat $B0/${V0}0/foo ++ ++# There should be 4 associated shards ++EXPECT_WITHIN $FILE_COUNT_TIME 4 get_file_count $B0/${V0}0/.shard/$gfid_new ++ ++# There should be 1+4 shards and we expect 4 lookups less than on the build without this patch ++EXPECT "21" echo `$CLI volume profile $V0 info incremental | grep -w LOOKUP | awk '{print $8}'` ++ ++# Delete the base shard and check shards get cleaned up ++TEST unlink $M0/foo ++ ++TEST ! stat $M0/foo ++TEST ! 
stat $B0/${V0}0/foo ++ ++# There should be no shards now ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_new ++cleanup +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 2ba4528..a6ad1b8 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -995,6 +995,10 @@ shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode) + } + + int ++shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, ++ xlator_t *this); ++ ++int + shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t post_res_handler) + { +@@ -1011,21 +1015,47 @@ shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + inode_t *fsync_inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; ++ uint64_t resolve_count = 0; + + priv = this->private; + local = frame->local; + local->call_count = 0; + shard_idx_iter = local->first_block; + res_inode = local->resolver_base_inode; ++ ++ if ((local->op_ret < 0) || (local->resolve_not)) ++ goto out; ++ ++ /* If this prealloc FOP is for fresh file creation, then the size of the ++ * file will be 0. Then there will be no shards associated with this file. ++ * So we can skip the lookup process for the shards which do not exists ++ * and directly issue mknod to crete shards. ++ * ++ * In case the prealloc fop is to extend the preallocated file to bigger ++ * size then just lookup and populate inodes of existing shards and ++ * update the create count ++ */ ++ if (local->fop == GF_FOP_FALLOCATE) { ++ if (!local->prebuf.ia_size) { ++ local->inode_list[0] = inode_ref(res_inode); ++ local->create_count = local->last_block; ++ shard_common_inode_write_post_lookup_shards_handler(frame, this); ++ return 0; ++ } ++ if (local->prebuf.ia_size < local->total_size) ++ local->create_count = local->last_block - ++ ((local->prebuf.ia_size - 1) / ++ local->block_size); ++ } ++ ++ resolve_count = local->last_block - local->create_count; ++ + if (res_inode) + gf_uuid_copy(gfid, res_inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + +- if ((local->op_ret < 0) || (local->resolve_not)) +- goto out; +- +- while (shard_idx_iter <= local->last_block) { ++ while (shard_idx_iter <= resolve_count) { + i++; + if (shard_idx_iter == 0) { + local->inode_list[i] = inode_ref(res_inode); +@@ -2434,7 +2464,7 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + int count = 0; + int call_count = 0; + int32_t shard_idx_iter = 0; +- int last_block = 0; ++ int lookup_count = 0; + char path[PATH_MAX] = { + 0, + }; +@@ -2454,7 +2484,7 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + local = frame->local; + count = call_count = local->call_count; + shard_idx_iter = local->first_block; +- last_block = local->last_block; ++ lookup_count = local->last_block - local->create_count; + local->pls_fop_handler = handler; + if (local->lookup_shards_barriered) + local->barrier.waitfor = local->call_count; +@@ -2464,7 +2494,7 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + else + gf_uuid_copy(gfid, local->base_gfid); + +- while (shard_idx_iter <= last_block) { ++ while (shard_idx_iter <= lookup_count) { + if (local->inode_list[i]) { + i++; + shard_idx_iter++; +@@ -5651,6 +5681,8 @@ shard_common_inode_write_post_resolve_handler(call_frame_t *frame, + shard_common_lookup_shards( + frame, this, local->resolver_base_inode, + 
shard_common_inode_write_post_lookup_shards_handler); ++ } else if (local->create_count) { ++ shard_common_inode_write_post_lookup_shards_handler(frame, this); + } else { + shard_common_inode_write_do(frame, this); + } +-- +1.8.3.1 + diff --git a/SOURCES/0570-features-shard-avoid-repeatative-calls-to-gf_uuid_un.patch b/SOURCES/0570-features-shard-avoid-repeatative-calls-to-gf_uuid_un.patch new file mode 100644 index 0000000..4d87bcb --- /dev/null +++ b/SOURCES/0570-features-shard-avoid-repeatative-calls-to-gf_uuid_un.patch @@ -0,0 +1,340 @@ +From 1a8b001a121ada4d3d338b52b312896f1790f2bb Mon Sep 17 00:00:00 2001 +From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com> +Date: Mon, 11 Jan 2021 12:34:55 +0530 +Subject: [PATCH 570/584] features/shard: avoid repeatative calls to + gf_uuid_unparse() + +The issue is shard_make_block_abspath() calls gf_uuid_unparse() +every time while constructing shard path. The gfid can be parsed +and saved once and passed while constructing the path. Thus +we can avoid calling gf_uuid_unparse(). + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/1689 +> Fixes: #1423 +> Change-Id: Ia26fbd5f09e812bbad9e5715242f14143c013c9c +> Signed-off-by: Vinayakswami Hariharmath vharihar@redhat.com + +BUG: 1925425 +Change-Id: Ia26fbd5f09e812bbad9e5715242f14143c013c9c +Signed-off-by: Vinayakswami Hariharmath vharihar@redhat.com +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244964 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/issue-1425.t | 9 ++- + xlators/features/shard/src/shard.c | 119 ++++++++++++++++++------------------- + 2 files changed, 65 insertions(+), 63 deletions(-) + +diff --git a/tests/bugs/shard/issue-1425.t b/tests/bugs/shard/issue-1425.t +index bbe82c0..8b77705 100644 +--- a/tests/bugs/shard/issue-1425.t ++++ b/tests/bugs/shard/issue-1425.t +@@ -21,7 +21,13 @@ TEST $CLI volume profile $V0 start + + TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 + ++$CLI volume profile $V0 info clear ++ + TEST fallocate -l 20M $M0/foo ++ ++# There should be 1+4 shards and we expect 4 lookups less than on the build without this patch ++EXPECT "5" echo `$CLI volume profile $V0 info incremental | grep -w LOOKUP | awk '{print $8}'` ++ + gfid_new=$(get_gfid_string $M0/foo) + + # Check for the base shard +@@ -31,9 +37,6 @@ TEST stat $B0/${V0}0/foo + # There should be 4 associated shards + EXPECT_WITHIN $FILE_COUNT_TIME 4 get_file_count $B0/${V0}0/.shard/$gfid_new + +-# There should be 1+4 shards and we expect 4 lookups less than on the build without this patch +-EXPECT "21" echo `$CLI volume profile $V0 info incremental | grep -w LOOKUP | awk '{print $8}'` +- + # Delete the base shard and check shards get cleaned up + TEST unlink $M0/foo + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index a6ad1b8..d1d7d7a 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -16,6 +16,8 @@ + #include + #include + ++#define SHARD_PATH_MAX (sizeof(GF_SHARD_DIR) + GF_UUID_BUF_SIZE + 16) ++ + static gf_boolean_t + __is_shard_dir(uuid_t gfid) + { +@@ -49,15 +51,19 @@ shard_make_block_bname(int block_num, uuid_t gfid, char *buf, size_t len) + snprintf(buf, len, "%s.%d", gfid_str, block_num); + } + +-void +-shard_make_block_abspath(int block_num, uuid_t gfid, char *filepath, size_t len) ++static int ++shard_make_base_path(char *path, uuid_t gfid) + { +- char gfid_str[GF_UUID_BUF_SIZE] = { +- 0, 
+- }; ++ strcpy(path, "/" GF_SHARD_DIR "/"); ++ uuid_utoa_r(gfid, path + sizeof(GF_SHARD_DIR) + 1); ++ return (sizeof(GF_SHARD_DIR) + GF_UUID_BUF_SIZE); ++} + +- gf_uuid_unparse(gfid, gfid_str); +- snprintf(filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str, block_num); ++static inline void ++shard_append_index(char *path, int path_size, int prefix_len, ++ int shard_idx_iter) ++{ ++ snprintf(path + prefix_len, path_size - prefix_len, ".%d", shard_idx_iter); + } + + int +@@ -1004,9 +1010,8 @@ shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + { + int i = -1; + uint32_t shard_idx_iter = 0; +- char path[PATH_MAX] = { +- 0, +- }; ++ int prefix_len = 0; ++ char path[SHARD_PATH_MAX]; + uuid_t gfid = { + 0, + }; +@@ -1055,6 +1060,9 @@ shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + else + gf_uuid_copy(gfid, local->base_gfid); + ++ /* Build base shard path before appending index of the shard */ ++ prefix_len = shard_make_base_path(path, gfid); ++ + while (shard_idx_iter <= resolve_count) { + i++; + if (shard_idx_iter == 0) { +@@ -1062,16 +1070,13 @@ shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + shard_idx_iter++; + continue; + } +- +- shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); +- ++ shard_append_index(path, SHARD_PATH_MAX, prefix_len, shard_idx_iter); + inode = NULL; + inode = inode_resolve(this->itable, path); + if (inode) { + gf_msg_debug(this->name, 0, +- "Shard %d already " +- "present. gfid=%s. Saving inode for future.", +- shard_idx_iter, uuid_utoa(inode->gfid)); ++ "Shard %s already present. Saving inode for future.", ++ path); + local->inode_list[i] = inode; + /* Let the ref on the inodes that are already present + * in inode table still be held so that they don't get +@@ -2153,9 +2158,8 @@ shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) + int call_count = 0; + uint32_t cur_block = 0; + uint32_t last_block = 0; +- char path[PATH_MAX] = { +- 0, +- }; ++ int prefix_len = 0; ++ char path[SHARD_PATH_MAX]; + char *bname = NULL; + loc_t loc = { + 0, +@@ -2216,6 +2220,10 @@ shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) + return 0; + } + ++ /* Build base shard path before appending index of the shard */ ++ prefix_len = shard_make_base_path(path, inode->gfid); ++ bname = path + sizeof(GF_SHARD_DIR) + 1; ++ + SHARD_SET_ROOT_FS_ID(frame, local); + while (cur_block <= last_block) { + if (!local->inode_list[i]) { +@@ -2229,15 +2237,12 @@ shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) + goto next; + } + +- shard_make_block_abspath(cur_block, inode->gfid, path, sizeof(path)); +- bname = strrchr(path, '/') + 1; ++ shard_append_index(path, SHARD_PATH_MAX, prefix_len, cur_block); + loc.parent = inode_ref(priv->dot_shard_inode); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s. 
Base file gfid = %s", +- bname, uuid_utoa(inode->gfid)); ++ "Inode path failed on %s.", bname); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); +@@ -2465,13 +2470,8 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + int call_count = 0; + int32_t shard_idx_iter = 0; + int lookup_count = 0; +- char path[PATH_MAX] = { +- 0, +- }; ++ char path[SHARD_PATH_MAX]; + char *bname = NULL; +- uuid_t gfid = { +- 0, +- }; + loc_t loc = { + 0, + }; +@@ -2489,10 +2489,16 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + if (local->lookup_shards_barriered) + local->barrier.waitfor = local->call_count; + ++ /* Build base shard path before appending index of the shard */ ++ strcpy(path, "/" GF_SHARD_DIR "/"); ++ + if (inode) +- gf_uuid_copy(gfid, inode->gfid); ++ uuid_utoa_r(inode->gfid, path + sizeof(GF_SHARD_DIR) + 1); + else +- gf_uuid_copy(gfid, local->base_gfid); ++ uuid_utoa_r(local->base_gfid, path + sizeof(GF_SHARD_DIR) + 1); ++ ++ int prefix_len = sizeof(GF_SHARD_DIR) + GF_UUID_BUF_SIZE; ++ bname = path + sizeof(GF_SHARD_DIR) + 1; + + while (shard_idx_iter <= lookup_count) { + if (local->inode_list[i]) { +@@ -2508,18 +2514,14 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + goto next; + } + +- shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); +- +- bname = strrchr(path, '/') + 1; ++ shard_append_index(path, SHARD_PATH_MAX, prefix_len, shard_idx_iter); + loc.inode = inode_new(this->itable); + loc.parent = inode_ref(priv->dot_shard_inode); + gf_uuid_copy(loc.pargfid, priv->dot_shard_gfid); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0 || !(loc.inode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s, base file gfid = %s", +- bname, uuid_utoa(gfid)); ++ "Inode path failed on %s", bname); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); +@@ -3168,12 +3170,7 @@ shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) + uint32_t cur_block = 0; + uint32_t cur_block_idx = 0; /*this is idx into inode_list[] array */ + char *bname = NULL; +- char path[PATH_MAX] = { +- 0, +- }; +- uuid_t gfid = { +- 0, +- }; ++ char path[SHARD_PATH_MAX]; + loc_t loc = { + 0, + }; +@@ -3184,10 +3181,16 @@ shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) + priv = this->private; + local = frame->local; + ++ /* Build base shard path before appending index of the shard */ ++ strcpy(path, "/" GF_SHARD_DIR "/"); ++ + if (inode) +- gf_uuid_copy(gfid, inode->gfid); ++ uuid_utoa_r(inode->gfid, path + sizeof(GF_SHARD_DIR) + 1); + else +- gf_uuid_copy(gfid, local->base_gfid); ++ uuid_utoa_r(local->base_gfid, path + sizeof(GF_SHARD_DIR) + 1); ++ ++ int prefix_len = sizeof(GF_SHARD_DIR) + GF_UUID_BUF_SIZE; ++ bname = path + sizeof(GF_SHARD_DIR) + 1; + + for (i = 0; i < local->num_blocks; i++) { + if (!local->inode_list[i]) +@@ -3203,7 +3206,7 @@ shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) + gf_msg_debug(this->name, 0, + "All shards that need to be " + "unlinked are non-existent: %s", +- uuid_utoa(gfid)); ++ path); + return 0; + } + +@@ -3221,15 +3224,12 @@ shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) + goto next; + } + +- shard_make_block_abspath(cur_block, gfid, path, sizeof(path)); +- bname = strrchr(path, '/') + 1; ++ shard_append_index(path, SHARD_PATH_MAX, prefix_len, cur_block); + 
loc.parent = inode_ref(priv->dot_shard_inode);
+ ret = inode_path(loc.parent, bname, (char **)&(loc.path));
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+- "Inode path failed"
+- " on %s, base file gfid = %s",
+- bname, uuid_utoa(gfid));
++ "Inode path failed on %s", bname);
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ loc_wipe(&loc);
+@@ -4971,9 +4971,8 @@ shard_common_resume_mknod(call_frame_t *frame, xlator_t *this,
+ int last_block = 0;
+ int ret = 0;
+ int call_count = 0;
+- char path[PATH_MAX] = {
+- 0,
+- };
++ int prefix_len = 0;
++ char path[SHARD_PATH_MAX];
+ mode_t mode = 0;
+ char *bname = NULL;
+ shard_priv_t *priv = NULL;
+@@ -4996,6 +4995,10 @@ shard_common_resume_mknod(call_frame_t *frame, xlator_t *this,
+ call_count = local->call_count = local->create_count;
+ local->post_mknod_handler = post_mknod_handler;
+
++ /* Build base shard path before appending index of the shard */
++ prefix_len = shard_make_base_path(path, fd->inode->gfid);
++ bname = path + sizeof(GF_SHARD_DIR) + 1;
++
+ SHARD_SET_ROOT_FS_ID(frame, local);
+
+ ret = shard_inode_ctx_get_all(fd->inode, this, &ctx_tmp);
+@@ -5022,10 +5025,7 @@ shard_common_resume_mknod(call_frame_t *frame, xlator_t *this,
+ -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+ goto next;
+ }
+-
+- shard_make_block_abspath(shard_idx_iter, fd->inode->gfid, path,
+- sizeof(path));
+-
++ shard_append_index(path, SHARD_PATH_MAX, prefix_len, shard_idx_iter);
+ xattr_req = shard_create_gfid_dict(local->xattr_req);
+ if (!xattr_req) {
+ local->op_ret = -1;
+@@ -5036,7 +5036,6 @@ shard_common_resume_mknod(call_frame_t *frame, xlator_t *this,
+ goto next;
+ }
+
+- bname = strrchr(path, '/') + 1;
+ loc.inode = inode_new(this->itable);
+ loc.parent = inode_ref(priv->dot_shard_inode);
+ ret = inode_path(loc.parent, bname, (char **)&(loc.path));
+--
+1.8.3.1
+
diff --git a/SOURCES/0571-NetBSD-build-fixes.patch b/SOURCES/0571-NetBSD-build-fixes.patch
new file mode 100644
index 0000000..8a6d4a4
--- /dev/null
+++ b/SOURCES/0571-NetBSD-build-fixes.patch
@@ -0,0 +1,98 @@
+From 2c0d11bb406e50fb515abf0c5a4006e1b362ac8e Mon Sep 17 00:00:00 2001
+From: Emmanuel Dreyfus
+Date: Tue, 30 Jun 2020 16:42:36 +0200
+Subject: [PATCH 571/584] NetBSD build fixes
+
+- Make sure -largp is used at link time
+- PTHREAD_MUTEX_ADAPTIVE_NP is not available, use PTHREAD_MUTEX_DEFAULT instead
+- Avoid non-POSIX [[ ]] in scripts
+- Do not check whether lock.spinlock is NULL, since it is not a pointer
+ (it is not a pointer on Linux either)
+
+Backport of:
+> Upstream-patch: https://review.gluster.org/#/c/glusterfs/+/24648/
+> Change-Id: I5e04a7c552d24f8a473c2b837828d1bddfa7e128
+> Fixes: #1347
+> Type: Bug
+> Signed-off-by: Emmanuel Dreyfus
+
+BUG: 1925425
+Change-Id: I5e04a7c552d24f8a473c2b837828d1bddfa7e128
+Signed-off-by: Emmanuel Dreyfus
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245040
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ configure.ac | 3 +++
+ rpc/rpc-lib/src/rpcsvc.c | 4 ++++
+ tools/gfind_missing_files/gfind_missing_files.sh | 2 +-
+ xlators/performance/write-behind/src/write-behind.c | 4 ++--
+ 4 files changed, 10 insertions(+), 3 deletions(-)
+
+diff --git a/configure.ac b/configure.ac
+index 327733e..6138a59 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -998,6 +998,9 @@ case $host_os in
+ CFLAGS="${CFLAGS} -isystem /usr/local/include"
+ ARGP_LDADD=-largp
+ ;;
++ *netbsd*)
++ ARGP_LDADD=-largp
++ ;;
+ esac
+ dnl argp-standalone does not provide a pkg-config file
+ AC_CHECK_HEADER([argp.h], AC_DEFINE(HAVE_ARGP, 1, [have argp]))
+diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c
+index 3f184bf..b031d93 100644
+--- a/rpc/rpc-lib/src/rpcsvc.c
++++ b/rpc/rpc-lib/src/rpcsvc.c
+@@ -46,6 +46,10 @@
+ #include "xdr-rpcclnt.h"
+ #include
+
++#ifndef PTHREAD_MUTEX_ADAPTIVE_NP
++#define PTHREAD_MUTEX_ADAPTIVE_NP PTHREAD_MUTEX_DEFAULT
++#endif
++
+ struct rpcsvc_program gluster_dump_prog;
+
+ #define rpcsvc_alloc_request(svc, request) \
+diff --git a/tools/gfind_missing_files/gfind_missing_files.sh b/tools/gfind_missing_files/gfind_missing_files.sh
+index f42fe7b..e7aaa0b 100644
+--- a/tools/gfind_missing_files/gfind_missing_files.sh
++++ b/tools/gfind_missing_files/gfind_missing_files.sh
+@@ -61,7 +61,7 @@ mount_slave()
+
+ parse_cli()
+ {
+- if [[ $# -ne 4 ]]; then
++ if [ "$#" -ne 4 ]; then
+ echo "Usage: gfind_missing_files "
+ exit 1
+ else
+diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
+index 31ab723..76d257f 100644
+--- a/xlators/performance/write-behind/src/write-behind.c
++++ b/xlators/performance/write-behind/src/write-behind.c
+@@ -2490,7 +2490,7 @@ wb_mark_readdirp_start(xlator_t *this, inode_t *directory)
+
+ wb_directory_inode = wb_inode_create(this, directory);
+
+- if (!wb_directory_inode || !wb_directory_inode->lock.spinlock)
++ if (!wb_directory_inode)
+ return;
+
+ LOCK(&wb_directory_inode->lock);
+@@ -2510,7 +2510,7 @@ wb_mark_readdirp_end(xlator_t *this, inode_t *directory)
+
+ wb_directory_inode = wb_inode_ctx_get(this, directory);
+
+- if (!wb_directory_inode || !wb_directory_inode->lock.spinlock)
++ if (!wb_directory_inode)
+ return;
+
+ LOCK(&wb_directory_inode->lock);
+--
+1.8.3.1
+
diff --git a/SOURCES/0572-locks-remove-unused-conditional-switch-to-spin_lock-.patch b/SOURCES/0572-locks-remove-unused-conditional-switch-to-spin_lock-.patch
new file mode 100644
index 0000000..1447916
--- /dev/null
+++ b/SOURCES/0572-locks-remove-unused-conditional-switch-to-spin_lock-.patch
@@ -0,0 +1,183 @@
+From 1491b33007e84be0a0a74354e89deca8a21ed198 Mon Sep 17 00:00:00 2001
+From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com>
+Date: Tue, 19 Jan 2021 15:39:35 +0530
+Subject: [PATCH 572/584] locks: remove unused conditional switch to spin_lock
+ code
+
+The use of spinlocks depends on the variable use_spinlocks,
+but that variable was left commented out in the code base by
+https://review.gluster.org/#/c/glusterfs/+/14763/. So there is
+no use in having conditional switching between spin_lock and
+mutex. 
Removing the dead code as part of the patch + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2007 +> Fixes: #1996 +> Change-Id: Ib005dd86969ce33d3409164ef3e1011bb3169129 +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1925425 +Change-Id: Ib005dd86969ce33d3409164ef3e1011bb3169129 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244965 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + configure.ac | 7 ----- + libglusterfs/src/Makefile.am | 2 +- + libglusterfs/src/common-utils.c | 5 ---- + libglusterfs/src/glusterfs/locking.h | 51 ------------------------------------ + libglusterfs/src/locking.c | 27 ------------------- + 5 files changed, 1 insertion(+), 91 deletions(-) + delete mode 100644 libglusterfs/src/locking.c + +diff --git a/configure.ac b/configure.ac +index 6138a59..3d99f6a 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -585,13 +585,6 @@ AC_CHECK_HEADERS([linux/falloc.h]) + + AC_CHECK_HEADERS([linux/oom.h], AC_DEFINE(HAVE_LINUX_OOM_H, 1, [have linux/oom.h])) + +-dnl Mac OS X does not have spinlocks +-AC_CHECK_FUNC([pthread_spin_init], [have_spinlock=yes]) +-if test "x${have_spinlock}" = "xyes"; then +- AC_DEFINE(HAVE_SPINLOCK, 1, [define if found spinlock]) +-fi +-AC_SUBST(HAVE_SPINLOCK) +- + dnl some os may not have GNU defined strnlen function + AC_CHECK_FUNC([strnlen], [have_strnlen=yes]) + if test "x${have_strnlen}" = "xyes"; then +diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am +index 970f4b7..830a0c3 100644 +--- a/libglusterfs/src/Makefile.am ++++ b/libglusterfs/src/Makefile.am +@@ -35,7 +35,7 @@ libglusterfs_la_SOURCES = dict.c xlator.c logging.c \ + strfd.c parse-utils.c $(CONTRIBDIR)/mount/mntent.c \ + $(CONTRIBDIR)/libexecinfo/execinfo.c quota-common-utils.c rot-buffs.c \ + $(CONTRIBDIR)/timer-wheel/timer-wheel.c \ +- $(CONTRIBDIR)/timer-wheel/find_last_bit.c default-args.c locking.c \ ++ $(CONTRIBDIR)/timer-wheel/find_last_bit.c default-args.c \ + $(CONTRIBDIR)/xxhash/xxhash.c \ + compound-fop-utils.c throttle-tbf.c monitoring.c + +diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c +index d351b93..c2dfe28 100644 +--- a/libglusterfs/src/common-utils.c ++++ b/libglusterfs/src/common-utils.c +@@ -860,11 +860,6 @@ gf_dump_config_flags() + gf_msg_plain_nomem(GF_LOG_ALERT, "setfsid 1"); + #endif + +-/* define if found spinlock */ +-#ifdef HAVE_SPINLOCK +- gf_msg_plain_nomem(GF_LOG_ALERT, "spinlock 1"); +-#endif +- + /* Define to 1 if you have the header file. */ + #ifdef HAVE_SYS_EPOLL_H + gf_msg_plain_nomem(GF_LOG_ALERT, "epoll.h 1"); +diff --git a/libglusterfs/src/glusterfs/locking.h b/libglusterfs/src/glusterfs/locking.h +index 43cc877..63097bb 100644 +--- a/libglusterfs/src/glusterfs/locking.h ++++ b/libglusterfs/src/glusterfs/locking.h +@@ -22,55 +22,6 @@ + #define pthread_spin_init(l, v) (*l = v) + #endif + +-#if defined(HAVE_SPINLOCK) +- +-typedef union { +- pthread_spinlock_t spinlock; +- pthread_mutex_t mutex; +-} gf_lock_t; +- +-#if !defined(LOCKING_IMPL) +-extern int use_spinlocks; +- +-/* +- * Using a dispatch table would be unpleasant because we're dealing with two +- * different types. If the dispatch contains direct pointers to pthread_xx +- * or mutex_xxx then we have to hope that every possible union alternative +- * starts at the same address as the union itself. 
I'm old enough to remember +- * compilers where this was not the case (for alignment reasons) so I'm a bit +- * paranoid about that. Also, I don't like casting arguments through "void *" +- * which we'd also have to do to avoid type errors. The other alternative would +- * be to define actual functions which pick out the right union member, and put +- * those in the dispatch tables. Now we have a pointer dereference through the +- * dispatch table plus a function call, which is likely to be worse than the +- * branching here from the ?: construct. If it were a clear win it might be +- * worth the extra complexity, but for now this way seems preferable. +- */ +- +-#define LOCK_INIT(x) \ +- (use_spinlocks ? pthread_spin_init(&((x)->spinlock), 0) \ +- : pthread_mutex_init(&((x)->mutex), 0)) +- +-#define LOCK(x) \ +- (use_spinlocks ? pthread_spin_lock(&((x)->spinlock)) \ +- : pthread_mutex_lock(&((x)->mutex))) +- +-#define TRY_LOCK(x) \ +- (use_spinlocks ? pthread_spin_trylock(&((x)->spinlock)) \ +- : pthread_mutex_trylock(&((x)->mutex))) +- +-#define UNLOCK(x) \ +- (use_spinlocks ? pthread_spin_unlock(&((x)->spinlock)) \ +- : pthread_mutex_unlock(&((x)->mutex))) +- +-#define LOCK_DESTROY(x) \ +- (use_spinlocks ? pthread_spin_destroy(&((x)->spinlock)) \ +- : pthread_mutex_destroy(&((x)->mutex))) +- +-#endif +- +-#else +- + typedef pthread_mutex_t gf_lock_t; + + #define LOCK_INIT(x) pthread_mutex_init(x, 0) +@@ -79,6 +30,4 @@ typedef pthread_mutex_t gf_lock_t; + #define UNLOCK(x) pthread_mutex_unlock(x) + #define LOCK_DESTROY(x) pthread_mutex_destroy(x) + +-#endif /* HAVE_SPINLOCK */ +- + #endif /* _LOCKING_H */ +diff --git a/libglusterfs/src/locking.c b/libglusterfs/src/locking.c +deleted file mode 100644 +index 7577054..0000000 +--- a/libglusterfs/src/locking.c ++++ /dev/null +@@ -1,27 +0,0 @@ +-/* +- Copyright (c) 2015 Red Hat, Inc. +- This file is part of GlusterFS. +- +- This file is licensed to you under your choice of the GNU Lesser +- General Public License, version 3 or any later version (LGPLv3 or +- later), or the GNU General Public License, version 2 (GPLv2), in all +- cases as published by the Free Software Foundation. +-*/ +- +-#if defined(HAVE_SPINLOCK) +-/* None of this matters otherwise. */ +- +-#include +-#include +- +-#define LOCKING_IMPL +-#include "glusterfs/locking.h" +- +-int use_spinlocks = 0; +- +-static void __attribute__((constructor)) gf_lock_setup(void) +-{ +- // use_spinlocks = (sysconf(_SC_NPROCESSORS_ONLN) > 1); +-} +- +-#endif +-- +1.8.3.1 + diff --git a/SOURCES/0573-features-shard-unlink-fails-due-to-nospace-to-mknod-.patch b/SOURCES/0573-features-shard-unlink-fails-due-to-nospace-to-mknod-.patch new file mode 100644 index 0000000..3033727 --- /dev/null +++ b/SOURCES/0573-features-shard-unlink-fails-due-to-nospace-to-mknod-.patch @@ -0,0 +1,148 @@ +From 0e453ede1f248a004965d0d368e2c4beb83f2ce1 Mon Sep 17 00:00:00 2001 +From: Vinayakswami Hariharmath +Date: Mon, 25 Jan 2021 17:32:14 +0530 +Subject: [PATCH 573/584] features/shard: unlink fails due to nospace to mknod + marker file + +When we hit the max capacity of the storage space, shard_unlink() +starts failing if there is no space left on the brick to create a +marker file. + +shard_unlink() happens in below steps: + +1. create a marker file in the name of gfid of the base file under +BRICK_PATH/.shard/.remove_me +2. unlink the base file +3. 
shard_delete_shards() deletes the shards in background by +picking the entries in BRICK_PATH/.shard/.remove_me + +If a marker file creation fails then we can't really delete the +shards which eventually a problem for user who is looking to make +space by deleting unwanted data. + +Solution: +Create the marker file by marking xdata = GLUSTERFS_INTERNAL_FOP_KEY +which is considered to be internal op and allowed to create under +reserved space. + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2057 +> Fixes: #2038 +> Change-Id: I7facebab940f9aeee81d489df429e00ef4fb7c5d +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1891403 +Change-Id: I7facebab940f9aeee81d489df429e00ef4fb7c5d +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244966 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/issue-2038.t | 56 ++++++++++++++++++++++++++++++++++++++ + xlators/features/shard/src/shard.c | 20 ++++++++++++++ + 2 files changed, 76 insertions(+) + create mode 100644 tests/bugs/shard/issue-2038.t + +diff --git a/tests/bugs/shard/issue-2038.t b/tests/bugs/shard/issue-2038.t +new file mode 100644 +index 0000000..fc3e7f9 +--- /dev/null ++++ b/tests/bugs/shard/issue-2038.t +@@ -0,0 +1,56 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../snapshot.rc ++ ++cleanup ++ ++FILE_COUNT_TIME=5 ++ ++function get_file_count { ++ ls $1* | wc -l ++} ++ ++TEST verify_lvm_version ++TEST glusterd ++TEST pidof glusterd ++TEST init_n_bricks 1 ++TEST setup_lvm 1 ++ ++TEST $CLI volume create $V0 $H0:$L1 ++TEST $CLI volume start $V0 ++ ++$CLI volume info ++ ++TEST $CLI volume set $V0 features.shard on ++TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 ++ ++#Setting the size in percentage ++TEST $CLI volume set $V0 storage.reserve 40 ++ ++#wait 5s to reset disk_space_full flag ++sleep 5 ++ ++TEST touch $M0/test ++TEST unlink $M0/test ++ ++TEST dd if=/dev/zero of=$M0/a bs=80M count=1 ++TEST dd if=/dev/zero of=$M0/b bs=10M count=1 ++ ++gfid_new=$(get_gfid_string $M0/a) ++ ++# Wait 5s to update disk_space_full flag because thread check disk space ++# after every 5s ++ ++sleep 5 ++# setup_lvm create lvm partition of 150M and 40M are reserve so after ++# consuming more than 110M next unlink should not fail ++# Delete the base shard and check shards get cleaned up ++TEST unlink $M0/a ++TEST ! 
stat $M0/a ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index d1d7d7a..8d4a970 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -4078,6 +4078,16 @@ shard_create_marker_file_under_remove_me(call_frame_t *frame, xlator_t *this, + SHARD_INODE_CREATE_INIT(this, bs, xattr_req, &local->newloc, + local->prebuf.ia_size, 0, err); + ++ /* Mark this as an internal operation, so that in case of disk full, ++ * the marker file will be created as part of reserve space */ ++ ret = dict_set_int32_sizen(xattr_req, GLUSTERFS_INTERNAL_FOP_KEY, 1); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set key: %s on path %s", GLUSTERFS_INTERNAL_FOP_KEY, ++ local->newloc.path); ++ goto err; ++ } ++ + STACK_WIND(frame, shard_create_marker_file_under_remove_me_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, + &local->newloc, 0, 0, 0644, xattr_req); +@@ -5843,6 +5853,16 @@ shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, + + SHARD_SET_ROOT_FS_ID(frame, local); + ++ /* Mark this as an internal operation, so that in case of disk full ++ * the internal dir will be created as part of reserve space */ ++ ret = dict_set_int32_sizen(xattr_req, GLUSTERFS_INTERNAL_FOP_KEY, 1); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set key: %s on path %s", GLUSTERFS_INTERNAL_FOP_KEY, ++ loc->path); ++ goto err; ++ } ++ + STACK_WIND_COOKIE(frame, shard_mkdir_internal_dir_cbk, (void *)(long)type, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, + 0755, 0, xattr_req); +-- +1.8.3.1 + diff --git a/SOURCES/0574-features-shard-delay-unlink-of-a-file-that-has-fd_co.patch b/SOURCES/0574-features-shard-delay-unlink-of-a-file-that-has-fd_co.patch new file mode 100644 index 0000000..810abd4 --- /dev/null +++ b/SOURCES/0574-features-shard-delay-unlink-of-a-file-that-has-fd_co.patch @@ -0,0 +1,712 @@ +From cb0d240004e6d40f8d7f30d177d5970ebc8e25fb Mon Sep 17 00:00:00 2001 +From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com> +Date: Wed, 3 Feb 2021 17:04:25 +0530 +Subject: [PATCH 574/584] features/shard: delay unlink of a file that has + fd_count > 0 + +When there are multiple processes working on a file and if any +process unlinks that file then unlink operation shouldn't harm +other processes working on it. This is a posix a compliant +behavior and this should be supported when shard feature is +enabled also. + +Problem description: +Let's consider 2 clients C1 and C2 working on a file F1 with 5 +shards on gluster mount and gluster server has 4 bricks +B1, B2, B3, B4. + +Assume that base file/shard is present on B1, 1st, 2nd shards +on B2, 3rd and 4th shards on B3 and 5th shard falls on B4 C1 +has opened the F1 in append mode and is writing to it. The +write FOP goes to 5th shard in this case. So the +inode->fd_count = 1 on B1(base file) and B4 (5th shard). + +C2 at the same time issued unlink to F1. On the server, the +base file has fd_count = 1 (since C1 has opened the file), +the base file is renamed under .glusterfs/unlink and +returned to C2. Then unlink will be sent to shards on all +bricks and shards on B2 and B3 will be deleted which have +no open reference yet. C1 starts getting errors while +accessing the remaining shards though it has open references +for the file. + +This is one such undefined behavior. 
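+
+As a rough sketch of the POSIX expectation at play here (a single
+process standing in for both C1 and C2; the mount path and file name
+are hypothetical):
+
+    #include <fcntl.h>
+    #include <stdio.h>
+    #include <unistd.h>
+
+    int main(void)
+    {
+        /* C1: open the sharded file and keep the fd */
+        int fd = open("/mnt/glusterfs/F1", O_WRONLY | O_APPEND);
+        if (fd < 0)
+            return 1;
+        /* C2: unlink the same file while the fd is still open */
+        unlink("/mnt/glusterfs/F1");
+        /* POSIX: the open fd must remain usable until close() */
+        if (write(fd, "data", 4) < 0)
+            perror("write after unlink");
+        return close(fd);
+    }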
+Likewise, we will encounter many such undefined behaviors, as we
+don't have one global lock to access all shards as one. Of course,
+having such a global lock would lead to a performance hit, as it
+reduces the window for parallel access to the shards.
+
+Solution:
+The above undefined behavior can be addressed by delaying the
+unlink of a file when there are open references on it.
+File unlink happens in 2 steps.
+Step 1: the client creates a marker file under .shard/.remove_me and
+sends the unlink on the base file to the server.
+Step 2: on return from the server, the associated shards are
+cleaned up and finally the marker file is removed.
+
+In step 2, the background deletion process does a nameless
+lookup using the marker file name (the marker file is named after
+the gfid of the base file) in the .glusterfs/unlink dir. If the
+nameless lookup is successful, that means the gfid still has open
+fds and the deletion of the shards has to be delayed. If the nameless
+lookup fails, that indicates the gfid is unlinked and there are no
+open fds on that file (the gfid path is unlinked during the final
+close on the file). The shards on which deletion is delayed
+are unlinked once all open fds are closed, and this is
+done through a thread which wakes up every 10 mins.
+
+Also removed active_fd_count from the inode structure, instead
+referring to fd_count wherever active_fd_count was used.
+
+Backport of:
+> Upstream-patch: https://github.com/gluster/glusterfs/pull/1563
+> Fixes: #1358
+> Change-Id: I8985093386e26215e0b0dce294c534a66f6ca11c
+> Signed-off-by: Vinayakswami Hariharmath
+
+BUG: 1782428
+Change-Id: I8985093386e26215e0b0dce294c534a66f6ca11c
+Signed-off-by: Vinayakswami Hariharmath
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244967
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ libglusterfs/src/glusterfs/glusterfs.h | 1 +
+ tests/bugs/shard/issue-1358.t | 100 +++++++++++++
+ tests/bugs/shard/unlinks-and-renames.t | 5 +
+ xlators/features/shard/src/shard.c | 199 ++++++++++++++++++++++++-
+ xlators/features/shard/src/shard.h | 11 ++
+ xlators/storage/posix/src/posix-entry-ops.c | 36 +++++
+ xlators/storage/posix/src/posix-inode-fd-ops.c | 64 +++++---
+ 7 files changed, 391 insertions(+), 25 deletions(-)
+ create mode 100644 tests/bugs/shard/issue-1358.t
+
+diff --git a/libglusterfs/src/glusterfs/glusterfs.h b/libglusterfs/src/glusterfs/glusterfs.h
+index d3400bf..4401cf6 100644
+--- a/libglusterfs/src/glusterfs/glusterfs.h
++++ b/libglusterfs/src/glusterfs/glusterfs.h
+@@ -261,6 +261,7 @@ enum gf_internal_fop_indicator {
+ #define GF_XATTROP_PURGE_INDEX "glusterfs.xattrop-purge-index"
+
+ #define GF_GFIDLESS_LOOKUP "gfidless-lookup"
++#define GF_UNLINKED_LOOKUP "unlinked-lookup"
+ /* replace-brick and pump related internal xattrs */
+ #define RB_PUMP_CMD_START "glusterfs.pump.start"
+ #define RB_PUMP_CMD_PAUSE "glusterfs.pump.pause"
+diff --git a/tests/bugs/shard/issue-1358.t b/tests/bugs/shard/issue-1358.t
+new file mode 100644
+index 0000000..1838e06
+--- /dev/null
++++ b/tests/bugs/shard/issue-1358.t
+@@ -0,0 +1,100 @@
++#!/bin/bash
++
++. $(dirname $0)/../../include.rc
++. 
$(dirname $0)/../../volume.rc ++ ++cleanup; ++ ++FILE_COUNT_TIME=5 ++ ++function get_file_count { ++ ls $1* | wc -l ++} ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++TEST mkdir $M0/dir ++TEST dd if=/dev/urandom of=$M0/dir/foo bs=4M count=5 ++gfid_new=$(get_gfid_string $M0/dir/foo) ++ ++# Ensure its shards dir is created now. ++TEST stat $B0/${V0}0/.shard/$gfid_new.1 ++TEST stat $B0/${V0}1/.shard/$gfid_new.1 ++TEST stat $B0/${V0}0/.shard/$gfid_new.2 ++TEST stat $B0/${V0}1/.shard/$gfid_new.2 ++ ++# Open a file and store descriptor in fd = 5 ++exec 5>$M0/dir/foo ++ ++# Write something on the file using the open fd = 5 ++echo "issue-1358" >&5 ++ ++# Write on the descriptor should be succesful ++EXPECT 0 echo $? ++ ++# Unlink the same file which is opened in prev step ++TEST unlink $M0/dir/foo ++ ++# Check the base file ++TEST ! stat $M0/dir/foo ++TEST ! stat $B0/${V0}0/foo ++TEST ! stat $B0/${V0}1/foo ++ ++# Write something on the file using the open fd = 5 ++echo "issue-1281" >&5 ++ ++# Write on the descriptor should be succesful ++EXPECT 0 echo $? ++ ++# Check ".shard/.remove_me" ++EXPECT_WITHIN $FILE_COUNT_TIME 1 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_new ++EXPECT_WITHIN $FILE_COUNT_TIME 1 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_new ++ ++# Close the fd = 5 ++exec 5>&- ++ ++###### To see the shards deleted, wait for 10 mins or repeat the same steps i.e open a file ##### ++###### write something to it, unlink it and close it. This will wake up the thread that is ###### ++###### responsible to delete the shards ++ ++TEST touch $M0/dir/new ++exec 6>$M0/dir/new ++echo "issue-1358" >&6 ++EXPECT 0 echo $? ++TEST unlink $M0/dir/new ++exec 6>&- ++ ++# Now check the ".shard/remove_me" and the gfid will not be there ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_new ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_new ++ ++# check for the absence of shards ++TEST ! stat $B0/${V0}0/.shard/$gfid_new.1 ++TEST ! stat $B0/${V0}1/.shard/$gfid_new.1 ++TEST ! stat $B0/${V0}0/.shard/$gfid_new.2 ++TEST ! stat $B0/${V0}1/.shard/$gfid_new.2 ++ ++#### Create the file with same name and check creation and deletion works fine ###### ++TEST dd if=/dev/urandom of=$M0/dir/foo bs=4M count=5 ++gfid_new=$(get_gfid_string $M0/dir/foo) ++ ++# Ensure its shards dir is created now. 
++TEST stat $B0/${V0}0/.shard/$gfid_new.1 ++TEST stat $B0/${V0}1/.shard/$gfid_new.1 ++TEST stat $B0/${V0}0/.shard/$gfid_new.2 ++TEST stat $B0/${V0}1/.shard/$gfid_new.2 ++ ++TEST unlink $M0/dir/foo ++cleanup ++ +diff --git a/tests/bugs/shard/unlinks-and-renames.t b/tests/bugs/shard/unlinks-and-renames.t +index 990ca69..3280fcb 100644 +--- a/tests/bugs/shard/unlinks-and-renames.t ++++ b/tests/bugs/shard/unlinks-and-renames.t +@@ -24,6 +24,11 @@ TEST pidof glusterd + TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} + TEST $CLI volume set $V0 features.shard on + TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.write-behind off ++ + TEST $CLI volume start $V0 + TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 8d4a970..b828ff9 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -1242,7 +1242,8 @@ out: + + static inode_t * + shard_link_internal_dir_inode(shard_local_t *local, inode_t *inode, +- struct iatt *buf, shard_internal_dir_type_t type) ++ xlator_t *this, struct iatt *buf, ++ shard_internal_dir_type_t type) + { + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; +@@ -1250,7 +1251,7 @@ shard_link_internal_dir_inode(shard_local_t *local, inode_t *inode, + inode_t **priv_inode = NULL; + inode_t *parent = NULL; + +- priv = THIS->private; ++ priv = this->private; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: +@@ -1294,7 +1295,7 @@ shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie, + /* To-Do: Fix refcount increment per call to + * shard_link_internal_dir_inode(). 
+ */ +- linked_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ linked_inode = shard_link_internal_dir_inode(local, inode, this, buf, type); + shard_inode_ctx_mark_dir_refreshed(linked_inode, this); + out: + shard_common_resolve_shards(frame, this, local->post_res_handler); +@@ -1383,7 +1384,7 @@ shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + goto unwind; + } + +- link_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ link_inode = shard_link_internal_dir_inode(local, inode, this, buf, type); + if (link_inode != inode) { + shard_refresh_internal_dir(frame, this, type); + } else { +@@ -3586,7 +3587,8 @@ shard_resolve_internal_dir(xlator_t *this, shard_local_t *local, + "Lookup on %s failed, exiting", bname); + goto err; + } else { +- shard_link_internal_dir_inode(local, loc->inode, &stbuf, type); ++ shard_link_internal_dir_inode(local, loc->inode, this, &stbuf, ++ type); + } + } + ret = 0; +@@ -3633,6 +3635,45 @@ err: + return ret; + } + ++static int ++shard_nameless_lookup_base_file(xlator_t *this, char *gfid) ++{ ++ int ret = 0; ++ loc_t loc = { ++ 0, ++ }; ++ dict_t *xattr_req = dict_new(); ++ if (!xattr_req) { ++ ret = -1; ++ goto out; ++ } ++ ++ loc.inode = inode_new(this->itable); ++ if (loc.inode == NULL) { ++ ret = -1; ++ goto out; ++ } ++ ++ ret = gf_uuid_parse(gfid, loc.gfid); ++ if (ret < 0) ++ goto out; ++ ++ ret = dict_set_uint32(xattr_req, GF_UNLINKED_LOOKUP, 1); ++ if (ret < 0) ++ goto out; ++ ++ ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, xattr_req, NULL); ++ if (ret < 0) ++ goto out; ++ ++out: ++ if (xattr_req) ++ dict_unref(xattr_req); ++ loc_wipe(&loc); ++ ++ return ret; ++} ++ + int + shard_delete_shards(void *opaque) + { +@@ -3734,6 +3775,11 @@ shard_delete_shards(void *opaque) + if (ret < 0) + continue; + } ++ ++ ret = shard_nameless_lookup_base_file(this, entry->d_name); ++ if (!ret) ++ continue; ++ + link_inode = inode_link(entry->inode, local->fd->inode, + entry->d_name, &entry->d_stat); + +@@ -4105,6 +4151,9 @@ err: + int + shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); + ++static int ++shard_unlink_handler_spawn(xlator_t *this); ++ + int + shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, +@@ -4126,7 +4175,7 @@ shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + if (xdata) + local->xattr_rsp = dict_ref(xdata); + if (local->cleanup_required) +- shard_start_background_deletion(this); ++ shard_unlink_handler_spawn(this); + } + + if (local->entrylk_frame) { +@@ -5785,7 +5834,7 @@ shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + } + } + +- link_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ link_inode = shard_link_internal_dir_inode(local, inode, this, buf, type); + if (link_inode != inode) { + shard_refresh_internal_dir(frame, this, type); + } else { +@@ -7098,6 +7147,132 @@ shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + return 0; + } + ++static void ++shard_unlink_wait(shard_unlink_thread_t *ti) ++{ ++ struct timespec wait_till = { ++ 0, ++ }; ++ ++ pthread_mutex_lock(&ti->mutex); ++ { ++ /* shard_unlink_handler() runs every 10 mins of interval */ ++ wait_till.tv_sec = time(NULL) + 600; ++ ++ while (!ti->rerun) { ++ if (pthread_cond_timedwait(&ti->cond, &ti->mutex, &wait_till) == ++ ETIMEDOUT) ++ break; ++ } ++ ti->rerun = _gf_false; ++ } ++ pthread_mutex_unlock(&ti->mutex); ++} ++ ++static void * 
++shard_unlink_handler(void *data) ++{ ++ shard_unlink_thread_t *ti = data; ++ xlator_t *this = ti->this; ++ ++ THIS = this; ++ ++ while (!ti->stop) { ++ shard_start_background_deletion(this); ++ shard_unlink_wait(ti); ++ } ++ return NULL; ++} ++ ++static int ++shard_unlink_handler_spawn(xlator_t *this) ++{ ++ int ret = 0; ++ shard_priv_t *priv = this->private; ++ shard_unlink_thread_t *ti = &priv->thread_info; ++ ++ ti->this = this; ++ ++ pthread_mutex_lock(&ti->mutex); ++ { ++ if (ti->running) { ++ pthread_cond_signal(&ti->cond); ++ } else { ++ ret = gf_thread_create(&ti->thread, NULL, shard_unlink_handler, ti, ++ "shard_unlink"); ++ if (ret < 0) { ++ gf_log(this->name, GF_LOG_ERROR, ++ "Failed to create \"shard_unlink\" thread"); ++ goto unlock; ++ } ++ ti->running = _gf_true; ++ } ++ ++ ti->rerun = _gf_true; ++ } ++unlock: ++ pthread_mutex_unlock(&ti->mutex); ++ return ret; ++} ++ ++static int ++shard_unlink_handler_init(shard_unlink_thread_t *ti) ++{ ++ int ret = 0; ++ xlator_t *this = THIS; ++ ++ ret = pthread_mutex_init(&ti->mutex, NULL); ++ if (ret) { ++ gf_log(this->name, GF_LOG_ERROR, ++ "Failed to init mutex for \"shard_unlink\" thread"); ++ goto out; ++ } ++ ++ ret = pthread_cond_init(&ti->cond, NULL); ++ if (ret) { ++ gf_log(this->name, GF_LOG_ERROR, ++ "Failed to init cond var for \"shard_unlink\" thread"); ++ pthread_mutex_destroy(&ti->mutex); ++ goto out; ++ } ++ ++ ti->running = _gf_false; ++ ti->rerun = _gf_false; ++ ti->stop = _gf_false; ++ ++out: ++ return -ret; ++} ++ ++static void ++shard_unlink_handler_fini(shard_unlink_thread_t *ti) ++{ ++ int ret = 0; ++ xlator_t *this = THIS; ++ if (!ti) ++ return; ++ ++ pthread_mutex_lock(&ti->mutex); ++ if (ti->running) { ++ ti->rerun = _gf_true; ++ ti->stop = _gf_true; ++ pthread_cond_signal(&ti->cond); ++ } ++ pthread_mutex_unlock(&ti->mutex); ++ ++ if (ti->running) { ++ ret = pthread_join(ti->thread, NULL); ++ if (ret) ++ gf_msg(this->name, GF_LOG_WARNING, 0, 0, ++ "Failed to clean up shard unlink thread."); ++ ti->running = _gf_false; ++ } ++ ti->thread = 0; ++ ++ pthread_cond_destroy(&ti->cond); ++ pthread_mutex_destroy(&ti->mutex); ++} ++ + int32_t + mem_acct_init(xlator_t *this) + { +@@ -7164,6 +7339,14 @@ init(xlator_t *this) + this->private = priv; + LOCK_INIT(&priv->lock); + INIT_LIST_HEAD(&priv->ilist_head); ++ ++ ret = shard_unlink_handler_init(&priv->thread_info); ++ if (ret) { ++ gf_log(this->name, GF_LOG_ERROR, ++ "Failed to initialize resources for \"shard_unlink\" thread"); ++ goto out; ++ } ++ + ret = 0; + out: + if (ret) { +@@ -7188,6 +7371,8 @@ fini(xlator_t *this) + if (!priv) + goto out; + ++ shard_unlink_handler_fini(&priv->thread_info); ++ + this->private = NULL; + LOCK_DESTROY(&priv->lock); + GF_FREE(priv); +diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h +index 4fe181b..3dcb112 100644 +--- a/xlators/features/shard/src/shard.h ++++ b/xlators/features/shard/src/shard.h +@@ -207,6 +207,16 @@ typedef enum { + + /* rm = "remove me" */ + ++typedef struct shard_unlink_thread { ++ pthread_mutex_t mutex; ++ pthread_cond_t cond; ++ pthread_t thread; ++ gf_boolean_t running; ++ gf_boolean_t rerun; ++ gf_boolean_t stop; ++ xlator_t *this; ++} shard_unlink_thread_t; ++ + typedef struct shard_priv { + uint64_t block_size; + uuid_t dot_shard_gfid; +@@ -220,6 +230,7 @@ typedef struct shard_priv { + shard_bg_deletion_state_t bg_del_state; + gf_boolean_t first_lookup_done; + uint64_t lru_limit; ++ shard_unlink_thread_t thread_info; + } shard_priv_t; + + typedef struct { +diff --git 
a/xlators/storage/posix/src/posix-entry-ops.c b/xlators/storage/posix/src/posix-entry-ops.c +index b3a5381..1511e68 100644 +--- a/xlators/storage/posix/src/posix-entry-ops.c ++++ b/xlators/storage/posix/src/posix-entry-ops.c +@@ -183,6 +183,11 @@ posix_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) + struct posix_private *priv = NULL; + posix_inode_ctx_t *ctx = NULL; + int ret = 0; ++ uint32_t lookup_unlink_dir = 0; ++ char *unlink_path = NULL; ++ struct stat lstatbuf = { ++ 0, ++ }; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); +@@ -208,7 +213,36 @@ posix_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) + op_ret = -1; + if (gf_uuid_is_null(loc->pargfid) || (loc->name == NULL)) { + /* nameless lookup */ ++ op_ret = op_errno = errno = 0; + MAKE_INODE_HANDLE(real_path, this, loc, &buf); ++ ++ /* The gfid will be renamed to ".glusterfs/unlink" in case ++ * there are any open fds on the file in posix_unlink path. ++ * So client can request server to do nameless lookup with ++ * xdata = GF_UNLINKED_LOOKUP in ".glusterfs/unlink" ++ * dir if a client wants to know the status of the all open fds ++ * on the unlinked file. If the file still present in the ++ * ".glusterfs/unlink" dir then it indicates there still ++ * open fds present on the file and the file is still under ++ * unlink process */ ++ if (op_ret < 0 && errno == ENOENT) { ++ ret = dict_get_uint32(xdata, GF_UNLINKED_LOOKUP, ++ &lookup_unlink_dir); ++ if (!ret && lookup_unlink_dir) { ++ op_ret = op_errno = errno = 0; ++ POSIX_GET_FILE_UNLINK_PATH(priv->base_path, loc->gfid, ++ unlink_path); ++ ret = sys_lstat(unlink_path, &lstatbuf); ++ if (ret) { ++ op_ret = -1; ++ op_errno = errno; ++ } else { ++ iatt_from_stat(&buf, &lstatbuf); ++ buf.ia_nlink = 0; ++ } ++ goto nameless_lookup_unlink_dir_out; ++ } ++ } + } else { + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, &buf); + if (!real_path || !par_path) { +@@ -328,6 +362,8 @@ out: + + if (op_ret == 0) + op_errno = 0; ++ ++nameless_lookup_unlink_dir_out: + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, + (loc) ? 
loc->inode : NULL, &buf, xattr, &postparent); + +diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c +index 761e018..4c2983a 100644 +--- a/xlators/storage/posix/src/posix-inode-fd-ops.c ++++ b/xlators/storage/posix/src/posix-inode-fd-ops.c +@@ -2504,6 +2504,39 @@ out: + return 0; + } + ++static int ++posix_unlink_renamed_file(xlator_t *this, inode_t *inode) ++{ ++ int ret = 0; ++ char *unlink_path = NULL; ++ uint64_t ctx_uint = 0; ++ posix_inode_ctx_t *ctx = NULL; ++ struct posix_private *priv = this->private; ++ ++ ret = inode_ctx_get(inode, this, &ctx_uint); ++ ++ if (ret < 0) ++ goto out; ++ ++ ctx = (posix_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ++ if (ctx->unlink_flag == GF_UNLINK_TRUE) { ++ POSIX_GET_FILE_UNLINK_PATH(priv->base_path, inode->gfid, unlink_path); ++ if (!unlink_path) { ++ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_UNLINK_FAILED, ++ "Failed to remove gfid :%s", uuid_utoa(inode->gfid)); ++ ret = -1; ++ } else { ++ ret = sys_unlink(unlink_path); ++ if (!ret) ++ ctx->unlink_flag = GF_UNLINK_FALSE; ++ } ++ } ++ ++out: ++ return ret; ++} ++ + int32_t + posix_release(xlator_t *this, fd_t *fd) + { +@@ -2514,6 +2547,9 @@ posix_release(xlator_t *this, fd_t *fd) + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + ++ if (fd->inode->active_fd_count == 0) ++ posix_unlink_renamed_file(this, fd->inode); ++ + ret = fd_ctx_del(fd, this, &tmp_pfd); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_PFD_NULL, +@@ -5881,41 +5917,33 @@ posix_forget(xlator_t *this, inode_t *inode) + uint64_t ctx_uint1 = 0; + uint64_t ctx_uint2 = 0; + posix_inode_ctx_t *ctx = NULL; +- posix_mdata_t *mdata = NULL; +- struct posix_private *priv_posix = NULL; +- +- priv_posix = (struct posix_private *)this->private; +- if (!priv_posix) +- return 0; ++ struct posix_private *priv = this->private; + + ret = inode_ctx_del2(inode, this, &ctx_uint1, &ctx_uint2); ++ ++ if (ctx_uint2) ++ GF_FREE((posix_mdata_t *)(uintptr_t)ctx_uint2); ++ + if (!ctx_uint1) +- goto check_ctx2; ++ return 0; + + ctx = (posix_inode_ctx_t *)(uintptr_t)ctx_uint1; + + if (ctx->unlink_flag == GF_UNLINK_TRUE) { +- POSIX_GET_FILE_UNLINK_PATH(priv_posix->base_path, inode->gfid, +- unlink_path); ++ POSIX_GET_FILE_UNLINK_PATH(priv->base_path, inode->gfid, unlink_path); + if (!unlink_path) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_UNLINK_FAILED, + "Failed to remove gfid :%s", uuid_utoa(inode->gfid)); + ret = -1; +- goto ctx_free; ++ } else { ++ ret = sys_unlink(unlink_path); + } +- ret = sys_unlink(unlink_path); + } +-ctx_free: ++ + pthread_mutex_destroy(&ctx->xattrop_lock); + pthread_mutex_destroy(&ctx->write_atomic_lock); + pthread_mutex_destroy(&ctx->pgfid_lock); + GF_FREE(ctx); + +-check_ctx2: +- if (ctx_uint2) { +- mdata = (posix_mdata_t *)(uintptr_t)ctx_uint2; +- } +- +- GF_FREE(mdata); + return ret; + } +-- +1.8.3.1 + diff --git a/SOURCES/0575-libglusterfs-add-functions-to-calculate-time-differe.patch b/SOURCES/0575-libglusterfs-add-functions-to-calculate-time-differe.patch new file mode 100644 index 0000000..98ffc3c --- /dev/null +++ b/SOURCES/0575-libglusterfs-add-functions-to-calculate-time-differe.patch @@ -0,0 +1,160 @@ +From 59e69ae1c7ccda74a8cbf8c9b2ae37bc74cbf612 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Fri, 4 Jun 2021 10:55:37 +0530 +Subject: [PATCH 575/584] libglusterfs: add functions to calculate time + difference + +Add gf_tvdiff() and gf_tsdiff() to calculate the difference +between 'struct timeval' and 'struct timespec' values, use 
+them where appropriate. + +Upstream patch details: +> https://github.com/gluster/glusterfs/commit/ba7f24b1cedf2549394c21b3f0df1661227cefae +> Change-Id: I172be06ee84e99a1da76847c15e5ea3fbc059338 +> Signed-off-by: Dmitry Antipov +> Updates: #1002 + +BUG: 1928676 +Change-Id: I723ab9555b0f8caef108742acc2cb63d6a32eb96 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245294 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + glusterfsd/src/glusterfsd-mgmt.c | 4 ++-- + libglusterfs/src/glusterfs/common-utils.h | 32 +++++++++++++++++++++++++++++++ + libglusterfs/src/latency.c | 3 +-- + xlators/cluster/dht/src/dht-rebalance.c | 6 ++---- + xlators/debug/io-stats/src/io-stats.c | 8 ++------ + 5 files changed, 39 insertions(+), 14 deletions(-) + +diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c +index 61d1b21..a51dd9e 100644 +--- a/glusterfsd/src/glusterfsd-mgmt.c ++++ b/glusterfsd/src/glusterfsd-mgmt.c +@@ -534,7 +534,7 @@ glusterfs_volume_top_write_perf(uint32_t blk_size, uint32_t blk_count, + } + + gettimeofday(&end, NULL); +- *time = (end.tv_sec - begin.tv_sec) * 1e6 + (end.tv_usec - begin.tv_usec); ++ *time = gf_tvdiff(&begin, &end); + *throughput = total_blks / *time; + gf_log("glusterd", GF_LOG_INFO, + "Throughput %.2f Mbps time %.2f secs " +@@ -653,7 +653,7 @@ glusterfs_volume_top_read_perf(uint32_t blk_size, uint32_t blk_count, + } + + gettimeofday(&end, NULL); +- *time = (end.tv_sec - begin.tv_sec) * 1e6 + (end.tv_usec - begin.tv_usec); ++ *time = gf_tvdiff(&begin, &end); + *throughput = total_blks / *time; + gf_log("glusterd", GF_LOG_INFO, + "Throughput %.2f Mbps time %.2f secs " +diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h +index 604afd0..bd48b6f 100644 +--- a/libglusterfs/src/glusterfs/common-utils.h ++++ b/libglusterfs/src/glusterfs/common-utils.h +@@ -1090,4 +1090,36 @@ find_xlator_option_in_cmd_args_t(const char *option_name, cmd_args_t *args); + int + gf_d_type_from_ia_type(ia_type_t type); + ++/* Return delta value in microseconds. */ ++ ++static inline double ++gf_tvdiff(struct timeval *start, struct timeval *end) ++{ ++ struct timeval t; ++ ++ if (start->tv_usec > end->tv_usec) ++ t.tv_sec = end->tv_sec - 1, t.tv_usec = end->tv_usec + 1000000; ++ else ++ t.tv_sec = end->tv_sec, t.tv_usec = end->tv_usec; ++ ++ return (double)(t.tv_sec - start->tv_sec) * 1e6 + ++ (double)(t.tv_usec - start->tv_usec); ++} ++ ++/* Return delta value in nanoseconds. 
*/
++
++static inline double
++gf_tsdiff(struct timespec *start, struct timespec *end)
++{
++ struct timespec t;
++
++ if (start->tv_nsec > end->tv_nsec)
++ t.tv_sec = end->tv_sec - 1, t.tv_nsec = end->tv_nsec + 1000000000;
++ else
++ t.tv_sec = end->tv_sec, t.tv_nsec = end->tv_nsec;
++
++ return (double)(t.tv_sec - start->tv_sec) * 1e9 +
++ (double)(t.tv_nsec - start->tv_nsec);
++}
++
+ #endif /* _COMMON_UTILS_H */
+diff --git a/libglusterfs/src/latency.c b/libglusterfs/src/latency.c
+index e1e6de7..ce61399 100644
+--- a/libglusterfs/src/latency.c
++++ b/libglusterfs/src/latency.c
+@@ -33,8 +33,7 @@ gf_update_latency(call_frame_t *frame)
+ if (!(begin->tv_sec && end->tv_sec))
+ goto out;
+
+- elapsed = (end->tv_sec - begin->tv_sec) * 1e9 +
+- (end->tv_nsec - begin->tv_nsec);
++ elapsed = gf_tsdiff(begin, end);
+
+ if (frame->op < 0 || frame->op >= GF_FOP_MAXVALUE) {
+ gf_log("[core]", GF_LOG_WARNING, "Invalid frame op value: %d",
+diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
+index eab7558..e07dec0 100644
+--- a/xlators/cluster/dht/src/dht-rebalance.c
++++ b/xlators/cluster/dht/src/dht-rebalance.c
+@@ -2927,8 +2927,7 @@ gf_defrag_migrate_single_file(void *opaque)
+
+ if (defrag->stats == _gf_true) {
+ gettimeofday(&end, NULL);
+- elapsed = (end.tv_sec - start.tv_sec) * 1e6 +
+- (end.tv_usec - start.tv_usec);
++ elapsed = gf_tvdiff(&start, &end);
+ gf_log(this->name, GF_LOG_INFO,
+ "Migration of "
+ "file:%s size:%" PRIu64
+@@ -3529,8 +3528,7 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
+ }
+
+ gettimeofday(&end, NULL);
+- elapsed = (end.tv_sec - dir_start.tv_sec) * 1e6 +
+- (end.tv_usec - dir_start.tv_usec);
++ elapsed = gf_tvdiff(&dir_start, &end);
+ gf_log(this->name, GF_LOG_INFO,
+ "Migration operation on dir %s took "
+ "%.2f secs",
+diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
+index 9b34895..8ad96fb 100644
+--- a/xlators/debug/io-stats/src/io-stats.c
++++ b/xlators/debug/io-stats/src/io-stats.c
+@@ -281,9 +281,7 @@ is_fop_latency_started(call_frame_t *frame)
+ begin = &frame->begin; \
+ end = &frame->end; \
+ \
+- elapsed = ((end->tv_sec - begin->tv_sec) * 1e9 + \
+- (end->tv_nsec - begin->tv_nsec)) / \
+- 1000; \
++ elapsed = gf_tsdiff(begin, end) / 1000.0; \
+ throughput = op_ret / elapsed; \
+ \
+ conf = this->private; \
+@@ -1774,9 +1772,7 @@ update_ios_latency(struct ios_conf *conf, call_frame_t *frame,
+ begin = &frame->begin;
+ end = &frame->end;
+
+- elapsed = ((end->tv_sec - begin->tv_sec) * 1e9 +
+- (end->tv_nsec - begin->tv_nsec)) /
+- 1000;
++ elapsed = gf_tsdiff(begin, end) / 1000.0;
+
+ update_ios_latency_stats(&conf->cumulative, elapsed, op);
+ update_ios_latency_stats(&conf->incremental, elapsed, op);
+--
+1.8.3.1
+
diff --git a/SOURCES/0576-rpcsvc-Add-latency-tracking-for-rpc-programs.patch b/SOURCES/0576-rpcsvc-Add-latency-tracking-for-rpc-programs.patch
new file mode 100644
index 0000000..6883559
--- /dev/null
+++ b/SOURCES/0576-rpcsvc-Add-latency-tracking-for-rpc-programs.patch
@@ -0,0 +1,573 @@
+From f2b9d3a089cc9ff9910da0075defe306851aca5c Mon Sep 17 00:00:00 2001
+From: Ravishankar N
+Date: Fri, 4 Jun 2021 12:27:57 +0530
+Subject: [PATCH 576/584] rpcsvc: Add latency tracking for rpc programs
+
+Added latency tracking of rpc-handling code. With this change we
+should be able to monitor the amount of time the rpc-handling code
+is consuming for each rpc call.
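+
+As a minimal sketch of the accounting this amounts to (it assumes the
+gf_latency_t helpers introduced by this series; clock_gettime() stands
+in for the internal timespec_now() wrapper used in rpcsvc.c):
+
+    #include <time.h>
+
+    static void handle_one_rpc(gf_latency_t *lat)
+    {
+        struct timespec begin, end;
+
+        clock_gettime(CLOCK_MONOTONIC, &begin);
+        /* ... the actual rpc actor would run here ... */
+        clock_gettime(CLOCK_MONOTONIC, &end);
+        /* fold the nanosecond delta into min/max/total/count */
+        gf_latency_update(lat, &begin, &end);
+    }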
+ +Upstream patch details: +> https://review.gluster.org/#/c/glusterfs/+/24955/ +> fixes: #1466 +> Change-Id: I04fc7f3b12bfa5053c0fc36885f271cb78f581cd +> Signed-off-by: Pranith Kumar K + +BUG: 1928676 +Change-Id: Ibcedddb5db3ff4906607050cf9f7ea3ebb266cc5 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245295 +Tested-by: RHGS Build Bot +Reviewed-by: Xavi Hernandez Juan +Reviewed-by: Ashish Pandey +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/latency.h | 22 +++++--- + libglusterfs/src/glusterfs/mem-types.h | 1 + + libglusterfs/src/glusterfs/stack.h | 7 +-- + libglusterfs/src/glusterfs/statedump.h | 2 + + libglusterfs/src/glusterfs/xlator.h | 2 +- + libglusterfs/src/latency.c | 93 +++++++++++++++------------------- + libglusterfs/src/libglusterfs.sym | 5 ++ + libglusterfs/src/monitoring.c | 8 +-- + libglusterfs/src/statedump.c | 38 +++++++++++++- + libglusterfs/src/xlator.c | 5 ++ + rpc/rpc-lib/src/libgfrpc.sym | 1 + + rpc/rpc-lib/src/rpcsvc.c | 72 +++++++++++++++++++++++++- + rpc/rpc-lib/src/rpcsvc.h | 5 ++ + xlators/protocol/server/src/server.c | 2 + + 14 files changed, 193 insertions(+), 70 deletions(-) + +diff --git a/libglusterfs/src/glusterfs/latency.h b/libglusterfs/src/glusterfs/latency.h +index ed47b1f..4d601bb 100644 +--- a/libglusterfs/src/glusterfs/latency.h ++++ b/libglusterfs/src/glusterfs/latency.h +@@ -11,13 +11,23 @@ + #ifndef __LATENCY_H__ + #define __LATENCY_H__ + +-#include "glusterfs/glusterfs.h" ++#include ++#include + +-typedef struct fop_latency { +- double min; /* min time for the call (microseconds) */ +- double max; /* max time for the call (microseconds) */ +- double total; /* total time (microseconds) */ ++typedef struct _gf_latency { ++ uint64_t min; /* min time for the call (nanoseconds) */ ++ uint64_t max; /* max time for the call (nanoseconds) */ ++ uint64_t total; /* total time (nanoseconds) */ + uint64_t count; +-} fop_latency_t; ++} gf_latency_t; + ++gf_latency_t * ++gf_latency_new(size_t n); ++ ++void ++gf_latency_reset(gf_latency_t *lat); ++ ++void ++gf_latency_update(gf_latency_t *lat, struct timespec *begin, ++ struct timespec *end); + #endif /* __LATENCY_H__ */ +diff --git a/libglusterfs/src/glusterfs/mem-types.h b/libglusterfs/src/glusterfs/mem-types.h +index 92730a9..970b9ff 100644 +--- a/libglusterfs/src/glusterfs/mem-types.h ++++ b/libglusterfs/src/glusterfs/mem-types.h +@@ -139,6 +139,7 @@ enum gf_common_mem_types_ { + gf_common_mt_mgmt_v3_lock_timer_t, /* used only in one location */ + gf_common_mt_server_cmdline_t, /* used only in one location */ + gf_mt_gfdb_query_record_t, ++ gf_common_mt_latency_t, + gf_common_mt_end + }; + #endif +diff --git a/libglusterfs/src/glusterfs/stack.h b/libglusterfs/src/glusterfs/stack.h +index bd466d8..536a330 100644 +--- a/libglusterfs/src/glusterfs/stack.h ++++ b/libglusterfs/src/glusterfs/stack.h +@@ -45,6 +45,9 @@ typedef int32_t (*ret_fn_t)(call_frame_t *frame, call_frame_t *prev_frame, + xlator_t *this, int32_t op_ret, int32_t op_errno, + ...); + ++void ++gf_frame_latency_update(call_frame_t *frame); ++ + struct call_pool { + union { + struct list_head all_frames; +@@ -149,8 +152,6 @@ struct _call_stack { + } while (0); + + struct xlator_fops; +-void +-gf_update_latency(call_frame_t *frame); + + static inline void + FRAME_DESTROY(call_frame_t *frame) +@@ -158,7 +159,7 @@ FRAME_DESTROY(call_frame_t *frame) + void *local = NULL; + + if (frame->root->ctx->measure_latency) +- gf_update_latency(frame); ++ 
gf_frame_latency_update(frame); + + list_del_init(&frame->frames); + if (frame->local) { +diff --git a/libglusterfs/src/glusterfs/statedump.h b/libglusterfs/src/glusterfs/statedump.h +index 89d04f9..ce08270 100644 +--- a/libglusterfs/src/glusterfs/statedump.h ++++ b/libglusterfs/src/glusterfs/statedump.h +@@ -127,4 +127,6 @@ gf_proc_dump_xlator_meminfo(xlator_t *this, strfd_t *strfd); + void + gf_proc_dump_xlator_profile(xlator_t *this, strfd_t *strfd); + ++void ++gf_latency_statedump_and_reset(char *key, gf_latency_t *lat); + #endif /* STATEDUMP_H */ +diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h +index 273039a..ecb9fa4 100644 +--- a/libglusterfs/src/glusterfs/xlator.h ++++ b/libglusterfs/src/glusterfs/xlator.h +@@ -808,7 +808,7 @@ struct _xlator { + + struct { + /* for latency measurement */ +- fop_latency_t latencies[GF_FOP_MAXVALUE]; ++ gf_latency_t latencies[GF_FOP_MAXVALUE]; + /* for latency measurement */ + fop_metrics_t metrics[GF_FOP_MAXVALUE]; + +diff --git a/libglusterfs/src/latency.c b/libglusterfs/src/latency.c +index ce61399..ce4b0e8 100644 +--- a/libglusterfs/src/latency.c ++++ b/libglusterfs/src/latency.c +@@ -14,39 +14,34 @@ + */ + + #include "glusterfs/glusterfs.h" +-#include "glusterfs/xlator.h" +-#include "glusterfs/common-utils.h" + #include "glusterfs/statedump.h" +-#include "glusterfs/libglusterfs-messages.h" + +-void +-gf_update_latency(call_frame_t *frame) ++gf_latency_t * ++gf_latency_new(size_t n) + { +- double elapsed; +- struct timespec *begin, *end; +- +- fop_latency_t *lat; +- +- begin = &frame->begin; +- end = &frame->end; ++ int i = 0; ++ gf_latency_t *lat = NULL; + +- if (!(begin->tv_sec && end->tv_sec)) +- goto out; ++ lat = GF_MALLOC(n * sizeof(*lat), gf_common_mt_latency_t); ++ if (!lat) ++ return NULL; + +- elapsed = gf_tsdiff(begin, end); ++ for (i = 0; i < n; i++) { ++ gf_latency_reset(lat + i); ++ } ++ return lat; ++} + +- if (frame->op < 0 || frame->op >= GF_FOP_MAXVALUE) { +- gf_log("[core]", GF_LOG_WARNING, "Invalid frame op value: %d", +- frame->op); ++void ++gf_latency_update(gf_latency_t *lat, struct timespec *begin, ++ struct timespec *end) ++{ ++ if (!(begin->tv_sec && end->tv_sec)) { ++ /*Measure latency might have been enabled/disabled during the op*/ + return; + } + +- /* Can happen mostly at initiator xlator, as STACK_WIND/UNWIND macros +- set it right anyways for those frames */ +- if (!frame->op) +- frame->op = frame->root->op; +- +- lat = &frame->this->stats.interval.latencies[frame->op]; ++ double elapsed = gf_tsdiff(begin, end); + + if (lat->max < elapsed) + lat->max = elapsed; +@@ -56,40 +51,34 @@ gf_update_latency(call_frame_t *frame) + + lat->total += elapsed; + lat->count++; +-out: +- return; + } + + void +-gf_proc_dump_latency_info(xlator_t *xl) ++gf_latency_reset(gf_latency_t *lat) + { +- char key_prefix[GF_DUMP_MAX_BUF_LEN]; +- char key[GF_DUMP_MAX_BUF_LEN]; +- int i; +- +- snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.latency", xl->name); +- gf_proc_dump_add_section("%s", key_prefix); +- +- for (i = 0; i < GF_FOP_MAXVALUE; i++) { +- gf_proc_dump_build_key(key, key_prefix, "%s", (char *)gf_fop_list[i]); +- +- fop_latency_t *lat = &xl->stats.interval.latencies[i]; ++ if (!lat) ++ return; ++ memset(lat, 0, sizeof(*lat)); ++ lat->min = ULLONG_MAX; ++ /* make sure 'min' is set to high value, so it would be ++ properly set later */ ++} + +- /* Doesn't make sense to continue if there are no fops +- came in the given interval */ +- if (!lat->count) +- continue; ++void 
++gf_frame_latency_update(call_frame_t *frame) ++{ ++ gf_latency_t *lat; ++ /* Can happen mostly at initiator xlator, as STACK_WIND/UNWIND macros ++ set it right anyways for those frames */ ++ if (!frame->op) ++ frame->op = frame->root->op; + +- gf_proc_dump_write(key, "%.03f,%" PRId64 ",%.03f", +- (lat->total / lat->count), lat->count, lat->total); ++ if (frame->op < 0 || frame->op >= GF_FOP_MAXVALUE) { ++ gf_log("[core]", GF_LOG_WARNING, "Invalid frame op value: %d", ++ frame->op); ++ return; + } + +- memset(xl->stats.interval.latencies, 0, +- sizeof(xl->stats.interval.latencies)); +- +- /* make sure 'min' is set to high value, so it would be +- properly set later */ +- for (i = 0; i < GF_FOP_MAXVALUE; i++) { +- xl->stats.interval.latencies[i].min = 0xffffffff; +- } ++ lat = &frame->this->stats.interval.latencies[frame->op]; ++ gf_latency_update(lat, &frame->begin, &frame->end); + } +diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym +index 9072afa..4f968e1 100644 +--- a/libglusterfs/src/libglusterfs.sym ++++ b/libglusterfs/src/libglusterfs.sym +@@ -1183,3 +1183,8 @@ gf_latency_reset + gf_latency_update + gf_frame_latency_update + gf_assert ++gf_latency_statedump_and_reset ++gf_latency_new ++gf_latency_reset ++gf_latency_update ++gf_frame_latency_update +diff --git a/libglusterfs/src/monitoring.c b/libglusterfs/src/monitoring.c +index 6d9bfb1..20b7f52 100644 +--- a/libglusterfs/src/monitoring.c ++++ b/libglusterfs/src/monitoring.c +@@ -113,15 +113,15 @@ dump_latency_and_count(xlator_t *xl, int fd) + dprintf(fd, "%s.interval.%s.fail_count %" PRIu64 "\n", xl->name, + gf_fop_list[index], cbk); + } +- if (xl->stats.interval.latencies[index].count != 0.0) { ++ if (xl->stats.interval.latencies[index].count != 0) { + dprintf(fd, "%s.interval.%s.latency %lf\n", xl->name, + gf_fop_list[index], +- (xl->stats.interval.latencies[index].total / ++ (((double)xl->stats.interval.latencies[index].total) / + xl->stats.interval.latencies[index].count)); +- dprintf(fd, "%s.interval.%s.max %lf\n", xl->name, ++ dprintf(fd, "%s.interval.%s.max %" PRIu64 "\n", xl->name, + gf_fop_list[index], + xl->stats.interval.latencies[index].max); +- dprintf(fd, "%s.interval.%s.min %lf\n", xl->name, ++ dprintf(fd, "%s.interval.%s.min %" PRIu64 "\n", xl->name, + gf_fop_list[index], + xl->stats.interval.latencies[index].min); + } +diff --git a/libglusterfs/src/statedump.c b/libglusterfs/src/statedump.c +index d18b50f..4bf4cc2 100644 +--- a/libglusterfs/src/statedump.c ++++ b/libglusterfs/src/statedump.c +@@ -201,6 +201,40 @@ gf_proc_dump_write(char *key, char *value, ...) 
+ return ret; + } + ++void ++gf_latency_statedump_and_reset(char *key, gf_latency_t *lat) ++{ ++ /* Doesn't make sense to continue if there are no fops ++ came in the given interval */ ++ if (!lat || !lat->count) ++ return; ++ gf_proc_dump_write(key, ++ "AVG:%lf CNT:%" PRIu64 " TOTAL:%" PRIu64 " MIN:%" PRIu64 ++ " MAX:%" PRIu64, ++ (((double)lat->total) / lat->count), lat->count, ++ lat->total, lat->min, lat->max); ++ gf_latency_reset(lat); ++} ++ ++void ++gf_proc_dump_xl_latency_info(xlator_t *xl) ++{ ++ char key_prefix[GF_DUMP_MAX_BUF_LEN]; ++ char key[GF_DUMP_MAX_BUF_LEN]; ++ int i; ++ ++ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.latency", xl->name); ++ gf_proc_dump_add_section("%s", key_prefix); ++ ++ for (i = 0; i < GF_FOP_MAXVALUE; i++) { ++ gf_proc_dump_build_key(key, key_prefix, "%s", (char *)gf_fop_list[i]); ++ ++ gf_latency_t *lat = &xl->stats.interval.latencies[i]; ++ ++ gf_latency_statedump_and_reset(key, lat); ++ } ++} ++ + static void + gf_proc_dump_xlator_mem_info(xlator_t *xl) + { +@@ -487,7 +521,7 @@ gf_proc_dump_single_xlator_info(xlator_t *trav) + return; + + if (ctx->measure_latency) +- gf_proc_dump_latency_info(trav); ++ gf_proc_dump_xl_latency_info(trav); + + gf_proc_dump_xlator_mem_info(trav); + +@@ -1024,7 +1058,7 @@ gf_proc_dump_xlator_profile(xlator_t *this, strfd_t *strfd) + { + gf_dump_strfd = strfd; + +- gf_proc_dump_latency_info(this); ++ gf_proc_dump_xl_latency_info(this); + + gf_dump_strfd = NULL; + } +diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c +index 36cc32c..b9ad411 100644 +--- a/libglusterfs/src/xlator.c ++++ b/libglusterfs/src/xlator.c +@@ -246,6 +246,7 @@ xlator_dynload_apis(xlator_t *xl) + void *handle = NULL; + volume_opt_list_t *vol_opt = NULL; + xlator_api_t *xlapi = NULL; ++ int i = 0; + + handle = xl->dlhandle; + +@@ -343,6 +344,10 @@ xlator_dynload_apis(xlator_t *xl) + memcpy(xl->op_version, xlapi->op_version, + sizeof(uint32_t) * GF_MAX_RELEASES); + ++ for (i = 0; i < GF_FOP_MAXVALUE; i++) { ++ gf_latency_reset(&xl->stats.interval.latencies[i]); ++ } ++ + ret = 0; + out: + return ret; +diff --git a/rpc/rpc-lib/src/libgfrpc.sym b/rpc/rpc-lib/src/libgfrpc.sym +index f3544e3..a1757cc 100644 +--- a/rpc/rpc-lib/src/libgfrpc.sym ++++ b/rpc/rpc-lib/src/libgfrpc.sym +@@ -66,3 +66,4 @@ rpc_transport_unix_options_build + rpc_transport_unref + rpc_clnt_mgmt_pmap_signout + rpcsvc_autoscale_threads ++rpcsvc_statedump +diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c +index b031d93..855b512 100644 +--- a/rpc/rpc-lib/src/rpcsvc.c ++++ b/rpc/rpc-lib/src/rpcsvc.c +@@ -25,6 +25,7 @@ + #include + #include "rpc-drc.h" + #include "protocol-common.h" ++#include + + #include + #include +@@ -377,6 +378,10 @@ rpcsvc_program_actor(rpcsvc_request_t *req) + goto err; + } + ++ if (svc->xl->ctx->measure_latency) { ++ timespec_now(&req->begin); ++ } ++ + req->ownthread = program->ownthread; + req->synctask = program->synctask; + +@@ -1526,10 +1531,18 @@ rpcsvc_submit_generic(rpcsvc_request_t *req, struct iovec *proghdr, + size_t hdrlen = 0; + char new_iobref = 0; + rpcsvc_drc_globals_t *drc = NULL; ++ gf_latency_t *lat = NULL; + + if ((!req) || (!req->trans)) + return -1; + ++ if (req->prog && req->begin.tv_sec) { ++ if ((req->procnum >= 0) && (req->procnum < req->prog->numactors)) { ++ timespec_now(&req->end); ++ lat = &req->prog->latencies[req->procnum]; ++ gf_latency_update(lat, &req->begin, &req->end); ++ } ++ } + trans = req->trans; + + for (i = 0; i < hdrcount; i++) { +@@ -1860,6 +1873,15 @@ 
rpcsvc_submit_message(rpcsvc_request_t *req, struct iovec *proghdr, + iobref); + } + ++void ++rpcsvc_program_destroy(rpcsvc_program_t *program) ++{ ++ if (program) { ++ GF_FREE(program->latencies); ++ GF_FREE(program); ++ } ++} ++ + int + rpcsvc_program_unregister(rpcsvc_t *svc, rpcsvc_program_t *program) + { +@@ -1917,8 +1939,7 @@ rpcsvc_program_unregister(rpcsvc_t *svc, rpcsvc_program_t *program) + + ret = 0; + out: +- if (prog) +- GF_FREE(prog); ++ rpcsvc_program_destroy(prog); + + if (ret == -1) { + if (program) { +@@ -2303,6 +2324,11 @@ rpcsvc_program_register(rpcsvc_t *svc, rpcsvc_program_t *program, + } + + memcpy(newprog, program, sizeof(*program)); ++ newprog->latencies = gf_latency_new(program->numactors); ++ if (!newprog->latencies) { ++ rpcsvc_program_destroy(newprog); ++ goto out; ++ } + + INIT_LIST_HEAD(&newprog->program); + pthread_mutexattr_init(&thr_attr); +@@ -3240,6 +3266,48 @@ out: + return ret; + } + ++void ++rpcsvc_program_dump(rpcsvc_program_t *prog) ++{ ++ char key_prefix[GF_DUMP_MAX_BUF_LEN]; ++ char key[GF_DUMP_MAX_BUF_LEN]; ++ int i; ++ ++ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s", prog->progname); ++ gf_proc_dump_add_section("%s", key_prefix); ++ ++ gf_proc_dump_build_key(key, key_prefix, "program-number"); ++ gf_proc_dump_write(key, "%d", prog->prognum); ++ ++ gf_proc_dump_build_key(key, key_prefix, "program-version"); ++ gf_proc_dump_write(key, "%d", prog->progver); ++ ++ strncat(key_prefix, ".latency", ++ sizeof(key_prefix) - strlen(key_prefix) - 1); ++ ++ for (i = 0; i < prog->numactors; i++) { ++ gf_proc_dump_build_key(key, key_prefix, "%s", prog->actors[i].procname); ++ gf_latency_statedump_and_reset(key, &prog->latencies[i]); ++ } ++} ++ ++void ++rpcsvc_statedump(rpcsvc_t *svc) ++{ ++ rpcsvc_program_t *prog = NULL; ++ int ret = 0; ++ ret = pthread_rwlock_tryrdlock(&svc->rpclock); ++ if (ret) ++ return; ++ { ++ list_for_each_entry(prog, &svc->programs, program) ++ { ++ rpcsvc_program_dump(prog); ++ } ++ } ++ pthread_rwlock_unlock(&svc->rpclock); ++} ++ + rpcsvc_actor_t gluster_dump_actors[GF_DUMP_MAXVALUE] = { + [GF_DUMP_NULL] = {"NULL", GF_DUMP_NULL, NULL, NULL, 0, DRC_NA}, + [GF_DUMP_DUMP] = {"DUMP", GF_DUMP_DUMP, rpcsvc_dump, NULL, 0, DRC_NA}, +diff --git a/rpc/rpc-lib/src/rpcsvc.h b/rpc/rpc-lib/src/rpcsvc.h +index a51edc7..e336d00 100644 +--- a/rpc/rpc-lib/src/rpcsvc.h ++++ b/rpc/rpc-lib/src/rpcsvc.h +@@ -275,6 +275,8 @@ struct rpcsvc_request { + gf_boolean_t ownthread; + + gf_boolean_t synctask; ++ struct timespec begin; /*req handling start time*/ ++ struct timespec end; /*req handling end time*/ + }; + + #define rpcsvc_request_program(req) ((rpcsvc_program_t *)((req)->prog)) +@@ -431,6 +433,7 @@ struct rpcsvc_program { + + /* Program specific state handed to actors */ + void *private; ++ gf_latency_t *latencies; /*Tracks latency statistics for the rpc call*/ + + /* This upcall is provided by the program during registration. 
+ * It is used to notify the program about events like connection being
+@@ -696,4 +699,6 @@ rpcsvc_autoscale_threads(glusterfs_ctx_t *ctx, rpcsvc_t *rpc, int incr);
+ 
+ extern int
+ rpcsvc_destroy(rpcsvc_t *svc);
++void
++rpcsvc_statedump(rpcsvc_t *svc);
+ #endif
+diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c
+index 54d9c0f..90eb3ff 100644
+--- a/xlators/protocol/server/src/server.c
++++ b/xlators/protocol/server/src/server.c
+@@ -267,6 +267,8 @@ server_priv(xlator_t *this)
+     gf_proc_dump_build_key(key, "server", "total-bytes-write");
+     gf_proc_dump_write(key, "%" PRIu64, total_write);
+ 
++    rpcsvc_statedump(conf->rpc);
++
+     ret = 0;
+ out:
+     if (ret)
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0577-protocol-client-don-t-reopen-fds-on-which-POSIX-lock.patch b/SOURCES/0577-protocol-client-don-t-reopen-fds-on-which-POSIX-lock.patch
new file mode 100644
index 0000000..1a5d0ea
--- /dev/null
+++ b/SOURCES/0577-protocol-client-don-t-reopen-fds-on-which-POSIX-lock.patch
@@ -0,0 +1,472 @@
+From d7665cf3249310c5faf87368f395b4e25cb86b48 Mon Sep 17 00:00:00 2001
+From: karthik-us
+Date: Thu, 15 Apr 2021 10:29:06 +0530
+Subject: [PATCH 577/584] protocol/client: don't reopen fds on which POSIX
+ locks are held after a reconnect
+
+XXXXXXXXXXXXXXXXXXX
+ IMPORTANT:
+XXXXXXXXXXXXXXXXXXX
+As a best practice, with this patch we are bumping up the op-version
+from GD_OP_VERSION_7_1 to GD_OP_VERSION_7_2 since it introduces a
+new volume option. The new option takes effect only
+after all the servers and clients are upgraded to this version.
+----------------------------------------------------------------------
+
+Bricks clean up any granted locks after a client disconnects, and
+currently these locks are not healed after a reconnect. This means
+post reconnect a competing process could be granted a lock even though
+the first process which was granted locks has not unlocked them. By not
+re-opening fds, subsequent operations on such fds will fail, forcing
+the application to close the current fd and open a new one. This way
+we prevent any silent corruption.
+
+A new option "client.strict-locks" is introduced to control this
+behaviour. This option is set to "off" by default.
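+
+A minimal usage sketch, assuming a volume named demo-vol mounted at
+/mnt/demo-vol (the volume name, mount point and fd number are
+hypothetical examples, not part of this patch):
+
+    # Enable the option on an existing volume:
+    gluster volume set demo-vol client.strict-locks on
+
+    # On a client, hold a POSIX lock on an open fd:
+    exec 5<>/mnt/demo-vol/file   # open fd 5 read-write
+    flock -x 5                   # take an exclusive lock on fd 5
+
+    # If a brick disconnects and reconnects while the lock is held,
+    # fd 5 is not re-opened and subsequent writes on it fail, so the
+    # application must close the fd and open the file again:
+    echo "data" >&5 || { exec 5>&-; exec 5<>/mnt/demo-vol/file; }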
+ +> Upstream patch: https://review.gluster.org/#/c/glusterfs/+/22712/ +> Change-Id: Ieed545efea466cb5e8f5a36199aa26380c301b9e +> Signed-off-by: Raghavendra G +> updates: bz#1694920 + +BUG: 1689375 +Change-Id: Ieed545efea466cb5e8f5a36199aa26380c301b9e +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244909 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Reviewed-by: Ravishankar Narayanankutty +--- + libglusterfs/src/glusterfs/globals.h | 4 +- + tests/bugs/bug-1694920.t | 63 ++++++++++++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 14 ++++++ + xlators/protocol/client/src/client-handshake.c | 3 +- + xlators/protocol/client/src/client-helpers.c | 5 +- + xlators/protocol/client/src/client-lk.c | 2 +- + xlators/protocol/client/src/client-rpc-fops.c | 45 ++++++++++++++++- + xlators/protocol/client/src/client-rpc-fops_v2.c | 32 +++++++++++- + xlators/protocol/client/src/client.c | 13 +++++ + xlators/protocol/client/src/client.h | 16 ++++++ + 10 files changed, 190 insertions(+), 7 deletions(-) + create mode 100644 tests/bugs/bug-1694920.t + +diff --git a/libglusterfs/src/glusterfs/globals.h b/libglusterfs/src/glusterfs/globals.h +index 33fb023..ce2d110 100644 +--- a/libglusterfs/src/glusterfs/globals.h ++++ b/libglusterfs/src/glusterfs/globals.h +@@ -50,7 +50,7 @@ + 1 /* MIN is the fresh start op-version, mostly \ + should not change */ + #define GD_OP_VERSION_MAX \ +- GD_OP_VERSION_7_1 /* MAX VERSION is the maximum \ ++ GD_OP_VERSION_7_2 /* MAX VERSION is the maximum \ + count in VME table, should \ + keep changing with \ + introduction of newer \ +@@ -140,6 +140,8 @@ + + #define GD_OP_VERSION_7_1 70100 /* Op-version for GlusterFS 7.1 */ + ++#define GD_OP_VERSION_7_2 70200 /* Op-version for GlusterFS 7.2 */ ++ + #include "glusterfs/xlator.h" + #include "glusterfs/options.h" + +diff --git a/tests/bugs/bug-1694920.t b/tests/bugs/bug-1694920.t +new file mode 100644 +index 0000000..5bf93c9 +--- /dev/null ++++ b/tests/bugs/bug-1694920.t +@@ -0,0 +1,63 @@ ++#!/bin/bash ++ ++SCRIPT_TIMEOUT=300 ++ ++. $(dirname $0)/../include.rc ++. $(dirname $0)/../volume.rc ++. $(dirname $0)/../fileio.rc ++cleanup; ++ ++TEST glusterd; ++TEST pidof glusterd ++ ++TEST $CLI volume create $V0 $H0:$B0/${V0}; ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.open-behind off ++TEST $CLI volume set $V0 performance.stat-prefetch off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume start $V0 ++TEST $GFS -s $H0 --volfile-id=$V0 $M0; ++ ++TEST touch $M0/a ++ ++#When all bricks are up, lock and unlock should succeed ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'w' $M0/a ++TEST flock -x $fd1 ++TEST fd_close $fd1 ++ ++#When all bricks are down, lock/unlock should fail ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'w' $M0/a ++TEST $CLI volume stop $V0 ++TEST ! 
flock -x $fd1
++TEST $CLI volume start $V0
++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" client_connected_status_meta $M0 $V0-client-0
++TEST fd_close $fd1
++
++#When a brick goes down and comes back up operations on fd which had locks on it should succeed by default
++TEST fd1=`fd_available`
++TEST fd_open $fd1 'w' $M0/a
++TEST flock -x $fd1
++TEST $CLI volume stop $V0
++sleep 2
++TEST $CLI volume start $V0
++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" client_connected_status_meta $M0 $V0-client-0
++TEST fd_write $fd1 "data"
++TEST fd_close $fd1
++
++#When a brick goes down and comes back up operations on fd which had locks on it should fail when client.strict-locks is on
++TEST $CLI volume set $V0 client.strict-locks on
++TEST fd1=`fd_available`
++TEST fd_open $fd1 'w' $M0/a
++TEST flock -x $fd1
++TEST $CLI volume stop $V0
++sleep 2
++TEST $CLI volume start $V0
++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" client_connected_status_meta $M0 $V0-client-0
++TEST ! fd_write $fd1 "data"
++TEST fd_close $fd1
++
++cleanup
+diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+index c1ca190..01f3912 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+@@ -2022,6 +2022,20 @@ struct volopt_map_entry glusterd_volopt_map[] = {
+      .value = "9",
+      .flags = VOLOPT_FLAG_CLIENT_OPT},
+ 
++    {.key = "client.strict-locks",
++     .voltype = "protocol/client",
++     .option = "strict-locks",
++     .value = "off",
++     .op_version = GD_OP_VERSION_7_2,
++     .validate_fn = validate_boolean,
++     .type = GLOBAL_DOC,
++     .description = "When set, doesn't reopen saved fds after reconnect "
++                    "if POSIX locks are held on them. Hence subsequent "
++                    "operations on these fds will fail. This is "
++                    "necessary for stricter lock compliance as bricks "
++                    "clean up any granted locks when a client "
++                    "disconnects."},
++
+     /* Server xlator options */
+     {.key = "network.tcp-window-size",
+      .voltype = "protocol/server",
+diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c
+index 6b20d92..a12472b 100644
+--- a/xlators/protocol/client/src/client-handshake.c
++++ b/xlators/protocol/client/src/client-handshake.c
+@@ -910,7 +910,8 @@ client_post_handshake(call_frame_t *frame, xlator_t *this)
+     {
+         list_for_each_entry_safe(fdctx, tmp, &conf->saved_fds, sfd_pos)
+         {
+-            if (fdctx->remote_fd != -1)
++            if (fdctx->remote_fd != -1 ||
++                (!list_empty(&fdctx->lock_list) && conf->strict_locks))
+                 continue;
+ 
+             fdctx->reopen_done = client_child_up_reopen_done;
+diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c
+index 53b4484..6543100 100644
+--- a/xlators/protocol/client/src/client-helpers.c
++++ b/xlators/protocol/client/src/client-helpers.c
+@@ -410,6 +410,7 @@ client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd)
+ {
+     clnt_fd_ctx_t *fdctx = NULL;
+     clnt_conf_t *conf = NULL;
++    gf_boolean_t locks_held = _gf_false;
+ 
+     GF_VALIDATE_OR_GOTO(this->name, fd, out);
+     GF_VALIDATE_OR_GOTO(this->name, remote_fd, out);
+@@ -431,11 +432,13 @@ client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd)
+             *remote_fd = -1;
+         else
+             *remote_fd = fdctx->remote_fd;
++
++        locks_held = !list_empty(&fdctx->lock_list);
+     }
+     }
+     pthread_spin_unlock(&conf->fd_lock);
+ 
+-    if ((flags & FALLBACK_TO_ANON_FD) && (*remote_fd == -1))
++    if ((flags & FALLBACK_TO_ANON_FD) && (*remote_fd == -1) && (!locks_held))
+         *remote_fd = 
GF_ANON_FD_NO; + + return 0; +diff --git a/xlators/protocol/client/src/client-lk.c b/xlators/protocol/client/src/client-lk.c +index 679e198..c1fb055 100644 +--- a/xlators/protocol/client/src/client-lk.c ++++ b/xlators/protocol/client/src/client-lk.c +@@ -351,7 +351,7 @@ delete_granted_locks_owner(fd_t *fd, gf_lkowner_t *owner) + + list_for_each_entry_safe(lock, tmp, &fdctx->lock_list, list) + { +- if (!is_same_lkowner(&lock->owner, owner)) { ++ if (is_same_lkowner(&lock->owner, owner)) { + list_del_init(&lock->list); + list_add_tail(&lock->list, &delete_list); + count++; +diff --git a/xlators/protocol/client/src/client-rpc-fops.c b/xlators/protocol/client/src/client-rpc-fops.c +index 1c8b31b..3110c78 100644 +--- a/xlators/protocol/client/src/client-rpc-fops.c ++++ b/xlators/protocol/client/src/client-rpc-fops.c +@@ -22,8 +22,18 @@ int32_t + client3_getspec(call_frame_t *frame, xlator_t *this, void *data); + rpc_clnt_prog_t clnt3_3_fop_prog; + +-/* CBK */ ++int ++client_is_setlk(int32_t cmd) ++{ ++ if ((cmd == F_SETLK) || (cmd == F_SETLK64) || (cmd == F_SETLKW) || ++ (cmd == F_SETLKW64)) { ++ return 1; ++ } + ++ return 0; ++} ++ ++/* CBK */ + int + client3_3_symlink_cbk(struct rpc_req *req, struct iovec *iov, int count, + void *myframe) +@@ -816,7 +826,8 @@ client3_3_flush_cbk(struct rpc_req *req, struct iovec *iov, int count, + goto out; + } + +- if (rsp.op_ret >= 0 && !fd_is_anonymous(local->fd)) { ++ if ((rsp.op_ret >= 0 || (rsp.op_errno == ENOTCONN)) && ++ !fd_is_anonymous(local->fd)) { + /* Delete all saved locks of the owner issuing flush */ + ret = delete_granted_locks_owner(local->fd, &local->owner); + gf_msg_trace(this->name, 0, "deleting locks of owner (%s) returned %d", +@@ -2388,10 +2399,12 @@ client3_3_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + int ret = 0; + xlator_t *this = NULL; + dict_t *xdata = NULL; ++ clnt_local_t *local = NULL; + + this = THIS; + + frame = myframe; ++ local = frame->local; + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; +@@ -2412,6 +2425,18 @@ client3_3_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + ret = client_post_lk(this, &rsp, &lock, &xdata); + if (ret < 0) + goto out; ++ ++ /* Save the lock to the client lock cache to be able ++ to recover in the case of server reboot.*/ ++ ++ if (client_is_setlk(local->cmd)) { ++ ret = client_add_lock_for_recovery(local->fd, &lock, &local->owner, ++ local->cmd); ++ if (ret < 0) { ++ rsp.op_ret = -1; ++ rsp.op_errno = -ret; ++ } ++ } + } + + out: +@@ -4263,8 +4288,16 @@ client3_3_flush(call_frame_t *frame, xlator_t *this, void *data) + ret = client_pre_flush(this, &req, args->fd, args->xdata); + if (ret) { + op_errno = -ret; ++ if (op_errno == EBADF) { ++ ret = delete_granted_locks_owner(local->fd, &local->owner); ++ gf_msg_trace(this->name, 0, ++ "deleting locks of owner (%s) returned %d", ++ lkowner_utoa(&local->owner), ret); ++ } ++ + goto unwind; + } ++ + ret = client_submit_request(this, &req, frame, conf->fops, GFS3_OP_FLUSH, + client3_3_flush_cbk, NULL, + (xdrproc_t)xdr_gfs3_flush_req); +@@ -5199,8 +5232,16 @@ client3_3_lk(call_frame_t *frame, xlator_t *this, void *data) + args->xdata); + if (ret) { + op_errno = -ret; ++ ++ if ((op_errno == EBADF) && (args->flock->l_type == F_UNLCK) && ++ client_is_setlk(local->cmd)) { ++ client_add_lock_for_recovery(local->fd, args->flock, &local->owner, ++ local->cmd); ++ } ++ + goto unwind; + } ++ + ret = client_submit_request(this, &req, frame, conf->fops, GFS3_OP_LK, + client3_3_lk_cbk, NULL, + (xdrproc_t)xdr_gfs3_lk_req); +diff --git 
a/xlators/protocol/client/src/client-rpc-fops_v2.c b/xlators/protocol/client/src/client-rpc-fops_v2.c +index 613dda8..954fc58 100644 +--- a/xlators/protocol/client/src/client-rpc-fops_v2.c ++++ b/xlators/protocol/client/src/client-rpc-fops_v2.c +@@ -723,7 +723,8 @@ client4_0_flush_cbk(struct rpc_req *req, struct iovec *iov, int count, + goto out; + } + +- if (rsp.op_ret >= 0 && !fd_is_anonymous(local->fd)) { ++ if ((rsp.op_ret >= 0 || (rsp.op_errno == ENOTCONN)) && ++ !fd_is_anonymous(local->fd)) { + /* Delete all saved locks of the owner issuing flush */ + ret = delete_granted_locks_owner(local->fd, &local->owner); + gf_msg_trace(this->name, 0, "deleting locks of owner (%s) returned %d", +@@ -2193,10 +2194,12 @@ client4_0_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + int ret = 0; + xlator_t *this = NULL; + dict_t *xdata = NULL; ++ clnt_local_t *local = NULL; + + this = THIS; + + frame = myframe; ++ local = frame->local; + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; +@@ -2217,6 +2220,18 @@ client4_0_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + ret = client_post_lk_v2(this, &rsp, &lock, &xdata); + if (ret < 0) + goto out; ++ ++ /* Save the lock to the client lock cache to be able ++ to recover in the case of server reboot.*/ ++ ++ if (client_is_setlk(local->cmd)) { ++ ret = client_add_lock_for_recovery(local->fd, &lock, &local->owner, ++ local->cmd); ++ if (ret < 0) { ++ rsp.op_ret = -1; ++ rsp.op_errno = -ret; ++ } ++ } + } + + out: +@@ -3998,6 +4013,13 @@ client4_0_flush(call_frame_t *frame, xlator_t *this, void *data) + ret = client_pre_flush_v2(this, &req, args->fd, args->xdata); + if (ret) { + op_errno = -ret; ++ if (op_errno == EBADF) { ++ ret = delete_granted_locks_owner(local->fd, &local->owner); ++ gf_msg_trace(this->name, 0, ++ "deleting locks of owner (%s) returned %d", ++ lkowner_utoa(&local->owner), ret); ++ } ++ + goto unwind; + } + ret = client_submit_request(this, &req, frame, conf->fops, GFS3_OP_FLUSH, +@@ -4771,8 +4793,16 @@ client4_0_lk(call_frame_t *frame, xlator_t *this, void *data) + args->xdata); + if (ret) { + op_errno = -ret; ++ ++ if ((op_errno == EBADF) && (args->flock->l_type == F_UNLCK) && ++ client_is_setlk(local->cmd)) { ++ client_add_lock_for_recovery(local->fd, args->flock, &local->owner, ++ local->cmd); ++ } ++ + goto unwind; + } ++ + ret = client_submit_request(this, &req, frame, conf->fops, GFS3_OP_LK, + client4_0_lk_cbk, NULL, + (xdrproc_t)xdr_gfx_lk_req); +diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c +index ed855ca..63c90ea 100644 +--- a/xlators/protocol/client/src/client.c ++++ b/xlators/protocol/client/src/client.c +@@ -2491,6 +2491,7 @@ build_client_config(xlator_t *this, clnt_conf_t *conf) + GF_OPTION_INIT("filter-O_DIRECT", conf->filter_o_direct, bool, out); + + GF_OPTION_INIT("send-gids", conf->send_gids, bool, out); ++ GF_OPTION_INIT("strict-locks", conf->strict_locks, bool, out); + + conf->client_id = glusterfs_leaf_position(this); + +@@ -2676,6 +2677,7 @@ reconfigure(xlator_t *this, dict_t *options) + out); + + GF_OPTION_RECONF("send-gids", conf->send_gids, options, bool, out); ++ GF_OPTION_RECONF("strict-locks", conf->strict_locks, options, bool, out); + + ret = 0; + out: +@@ -3032,6 +3034,17 @@ struct volume_options options[] = { + " power. 
Range 1-32 threads.",
+     .op_version = {GD_OP_VERSION_RHS_3_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
++    {.key = {"strict-locks"},
++     .type = GF_OPTION_TYPE_BOOL,
++     .default_value = "off",
++     .op_version = {GD_OP_VERSION_7_2},
++     .flags = OPT_FLAG_SETTABLE,
++     .description = "When set, doesn't reopen saved fds after reconnect "
++                    "if POSIX locks are held on them. Hence subsequent "
++                    "operations on these fds will fail. This is "
++                    "necessary for stricter lock compliance as bricks "
++                    "clean up any granted locks when a client "
++                    "disconnects."},
+     {.key = {NULL}},
+ };
+ 
+diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h
+index f12fa61..bde3d1a 100644
+--- a/xlators/protocol/client/src/client.h
++++ b/xlators/protocol/client/src/client.h
+@@ -235,6 +235,15 @@ typedef struct clnt_conf {
+                                 * up, disconnects can be
+                                 * logged
+                                 */
++
++    gf_boolean_t strict_locks; /* When set, doesn't reopen saved fds after
++                                  reconnect if POSIX locks are held on them.
++                                  Hence subsequent operations on these fds will
++                                  fail. This is necessary for stricter lock
++                                  compliance as bricks clean up any granted
++                                  locks when a client disconnects.
++                               */
++
+ } clnt_conf_t;
+ 
+ typedef struct _client_fd_ctx {
+@@ -513,4 +522,11 @@ compound_request_cleanup_v2(gfx_compound_req *req);
+ void
+ client_compound_rsp_cleanup_v2(gfx_compound_rsp *rsp, int len);
+ 
++int
++client_add_lock_for_recovery(fd_t *fd, struct gf_flock *flock,
++                             gf_lkowner_t *owner, int32_t cmd);
++
++int
++client_is_setlk(int32_t cmd);
++
+ #endif /* !_CLIENT_H */
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0578-protocol-client-fallback-to-anonymous-fd-for-fsync.patch b/SOURCES/0578-protocol-client-fallback-to-anonymous-fd-for-fsync.patch
new file mode 100644
index 0000000..d5df9e2
--- /dev/null
+++ b/SOURCES/0578-protocol-client-fallback-to-anonymous-fd-for-fsync.patch
@@ -0,0 +1,46 @@
+From ffb4085b3e04878e85bf505a541203aa2ee71e9c Mon Sep 17 00:00:00 2001
+From: l17zhou
+Date: Fri, 6 Mar 2020 03:54:02 +0200
+Subject: [PATCH 578/584] protocol/client: fallback to anonymous fd for fsync
+
+> Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24203/
+> Change-Id: I32f801206ce7fbd05aa693f44c2f140304f2e275
+> Fixes: bz#1810842
+
+BUG: 1689375
+Change-Id: I32f801206ce7fbd05aa693f44c2f140304f2e275
+Signed-off-by: karthik-us
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245538
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/protocol/client/src/client-common.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/xlators/protocol/client/src/client-common.c b/xlators/protocol/client/src/client-common.c
+index 64db98d..1417a60 100644
+--- a/xlators/protocol/client/src/client-common.c
++++ b/xlators/protocol/client/src/client-common.c
+@@ -449,7 +449,8 @@ client_pre_fsync(xlator_t *this, gfs3_fsync_req *req, fd_t *fd, int32_t flags,
+     int64_t remote_fd = -1;
+     int op_errno = 0;
+ 
+-    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
++    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
++                         out);
+ 
+     req->fd = remote_fd;
+     req->data = flags;
+@@ -2641,7 +2642,8 @@ client_pre_fsync_v2(xlator_t *this, gfx_fsync_req *req, fd_t *fd, int32_t flags,
+     int64_t remote_fd = -1;
+     int op_errno = 0;
+ 
+-    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
++    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
++                         out);
+ 
+     req->fd = 
remote_fd;
+     req->data = flags;
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0579-cli-changing-rebal-task-ID-to-None-in-case-status-is.patch b/SOURCES/0579-cli-changing-rebal-task-ID-to-None-in-case-status-is.patch
new file mode 100644
index 0000000..d568966
--- /dev/null
+++ b/SOURCES/0579-cli-changing-rebal-task-ID-to-None-in-case-status-is.patch
@@ -0,0 +1,168 @@
+From 96c4c3c47c914aced8864e7d178a4d57f7fced05 Mon Sep 17 00:00:00 2001
+From: Tamar Shacked
+Date: Sun, 6 Jun 2021 14:26:18 +0300
+Subject: [PATCH 579/584] cli: changing rebal task ID to "None" in case status
+ is being reset
+
+Rebalance status is being reset during replace/reset-brick operations.
+This causes 'volume status' to show rebalance as "not started".
+
+Fix:
+change rebalance-status to "reset due to (replace|reset)-brick"
+
+Backport of:
+> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/1869
+> Change-Id: Ia73a8bea3dcd8e51acf4faa6434c3cb0d09856d0
+> Signed-off-by: Tamar Shacked
+> Fixes: #1717
+
+BUG: 1889966
+
+Signed-off-by: Tamar Shacked
+Change-Id: Ia73a8bea3dcd8e51acf4faa6434c3cb0d09856d0
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245402
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ cli/src/cli-rpc-ops.c                              | 15 ++++++-
+ rpc/xdr/src/cli1-xdr.x                             |  2 +
+ tests/bugs/glusterd/reset-rebalance-state.t        | 46 ++++++++++++++++++++++
+ xlators/mgmt/glusterd/src/glusterd-replace-brick.c |  4 +-
+ xlators/mgmt/glusterd/src/glusterd-reset-brick.c   |  3 +-
+ 5 files changed, 65 insertions(+), 5 deletions(-)
+ create mode 100644 tests/bugs/glusterd/reset-rebalance-state.t
+
+diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c
+index 51b5447..4167c68 100644
+--- a/cli/src/cli-rpc-ops.c
++++ b/cli/src/cli-rpc-ops.c
+@@ -72,6 +72,8 @@ char *cli_vol_task_status_str[] = {"not started",
+                                    "fix-layout stopped",
+                                    "fix-layout completed",
+                                    "fix-layout failed",
++                                   "reset due to replace-brick",
++                                   "reset due to reset-brick",
+                                    "unknown"};
+ 
+ int32_t
+@@ -8357,12 +8359,21 @@ cli_print_volume_status_tasks(dict_t *dict)
+         ret = dict_get_str(dict, key, &task_id_str);
+         if (ret)
+             return;
+-        cli_out("%-20s : %-20s", "ID", task_id_str);
+ 
+         snprintf(key, sizeof(key), "task%d.status", i);
+         ret = dict_get_int32(dict, key, &status);
+-        if (ret)
++        if (ret) {
++            cli_out("%-20s : %-20s", "ID", task_id_str);
+             return;
++        }
++
++        if (!strcmp(op, "Rebalance") &&
++            (status == GF_DEFRAG_STATUS_RESET_DUE_REPLACE_BRC ||
++             status == GF_DEFRAG_STATUS_RESET_DUE_RESET_BRC)) {
++            task_id_str = "None";
++        }
++
++        cli_out("%-20s : %-20s", "ID", task_id_str);
+ 
+         snprintf(task, sizeof(task), "task%d", i);
+ 
+diff --git a/rpc/xdr/src/cli1-xdr.x b/rpc/xdr/src/cli1-xdr.x
+index 777cb00..17d96f1 100644
+--- a/rpc/xdr/src/cli1-xdr.x
++++ b/rpc/xdr/src/cli1-xdr.x
+@@ -45,6 +45,8 @@
+     GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED,
+     GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE,
+     GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED,
++    GF_DEFRAG_STATUS_RESET_DUE_REPLACE_BRC,
++    GF_DEFRAG_STATUS_RESET_DUE_RESET_BRC,
+     GF_DEFRAG_STATUS_MAX
+ };
+ 
+diff --git a/tests/bugs/glusterd/reset-rebalance-state.t b/tests/bugs/glusterd/reset-rebalance-state.t
+new file mode 100644
+index 0000000..829d2b1
+--- /dev/null
++++ b/tests/bugs/glusterd/reset-rebalance-state.t
+@@ -0,0 +1,46 @@
++#!/bin/bash
++
++. $(dirname $0)/../../include.rc
++. $(dirname $0)/../../cluster.rc
++. 
$(dirname $0)/../../volume.rc
++
++
++get_rebalance_status() {
++    $CLI volume status $V0 | egrep ^"Status " | awk '{print $3}'
++}
++
++run_rebal_check_status() {
++    TEST $CLI volume rebalance $V0 start
++    EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0
++    REBAL_STATE=$(get_rebalance_status)
++    TEST [ $REBAL_STATE == "completed" ]
++}
++
++replace_brick_check_status() {
++    TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}1_replace commit force
++    REBAL_STATE=$(get_rebalance_status)
++    TEST [ $REBAL_STATE == "reset" ]
++}
++
++reset_brick_check_status() {
++    TEST $CLI volume reset-brick $V0 $H0:$B0/${V0}2 start
++    TEST $CLI volume reset-brick $V0 $H0:$B0/${V0}2 $H0:$B0/${V0}2 commit force
++    REBAL_STATE=$(get_rebalance_status)
++    TEST [ $REBAL_STATE == "reset" ]
++}
++
++cleanup;
++
++TEST glusterd;
++TEST pidof glusterd;
++
++TEST $CLI volume info;
++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..6} force;
++TEST $CLI volume start $V0;
++
++run_rebal_check_status;
++replace_brick_check_status;
++reset_brick_check_status;
++
++cleanup;
++
+diff --git a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
+index 0615081..80b80e4 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
++++ b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
+@@ -548,8 +548,8 @@ glusterd_op_replace_brick(dict_t *dict, dict_t *rsp_dict)
+         (void)glusterd_svcs_manager(volinfo);
+         goto out;
+     }
+-
+-    volinfo->rebal.defrag_status = 0;
++    if (volinfo->rebal.defrag_status != GF_DEFRAG_STATUS_NOT_STARTED)
++        volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_RESET_DUE_REPLACE_BRC;
+ 
+     ret = glusterd_svcs_manager(volinfo);
+     if (ret) {
+diff --git a/xlators/mgmt/glusterd/src/glusterd-reset-brick.c b/xlators/mgmt/glusterd/src/glusterd-reset-brick.c
+index cf04ce8..19d7549 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-reset-brick.c
++++ b/xlators/mgmt/glusterd/src/glusterd-reset-brick.c
+@@ -342,7 +342,8 @@ glusterd_op_reset_brick(dict_t *dict, dict_t *rsp_dict)
+         goto out;
+     }
+ 
+-    volinfo->rebal.defrag_status = 0;
++    if (volinfo->rebal.defrag_status != GF_DEFRAG_STATUS_NOT_STARTED)
++        volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_RESET_DUE_RESET_BRC;
+ 
+     ret = glusterd_svcs_manager(volinfo);
+     if (ret) {
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0580-cluster-dht-suppress-file-migration-error-for-node-n.patch b/SOURCES/0580-cluster-dht-suppress-file-migration-error-for-node-n.patch
new file mode 100644
index 0000000..06befeb
--- /dev/null
+++ b/SOURCES/0580-cluster-dht-suppress-file-migration-error-for-node-n.patch
@@ -0,0 +1,138 @@
+From a5da8bb830e86b6dd77a06cd59d220052e80b21c Mon Sep 17 00:00:00 2001
+From: Tamar Shacked
+Date: Sun, 6 Jun 2021 11:57:06 +0300
+Subject: [PATCH 580/584] cluster/dht: suppress file migration error for node
+ not supposed to migrate file
+
+A rebalance process does a lookup for every file in the dir it is processing
+before checking if it is supposed to migrate the file.
+In this issue there are two rebalance processes running on a replica subvol:
+R1 is migrating the FILE.
+R2 is not supposed to migrate the FILE, but it does a lookup and
+   finds a stale linkfile which is mostly due to a stale layout.
+   Then, it tries to unlink the stale linkfile and gets EBUSY
+   as the linkfile fd is open due to R1's migration.
+   As a result, a misleading error message about FILE migration failure
+   due to EBUSY is logged in R2's logfile.
+
+Fix:
+suppress the error in case it occurred in a node that
+is not supposed to migrate the file.
+
+Backport of:
+> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/24712/
+> fixes: #1371
+> Change-Id: I37832b404e2b0cc40ac5caf45f14c32c891e71f3
+> Signed-off-by: Tamar Shacked
+
+BUG: 1815462
+Signed-off-by: Tamar Shacked
+Change-Id: I915ee8e7470d85a849b198bfa7d58d368a246aae
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245401
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/cluster/dht/src/dht-rebalance.c | 38 ++++++++++++++++++++++-----------
+ 1 file changed, 25 insertions(+), 13 deletions(-)
+
+diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
+index e07dec0..cc0f2c9 100644
+--- a/xlators/cluster/dht/src/dht-rebalance.c
++++ b/xlators/cluster/dht/src/dht-rebalance.c
+@@ -2604,10 +2604,10 @@ out:
+  * all hardlinks.
+  */
+ 
+-int
++gf_boolean_t
+ gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid)
+ {
+-    int ret = 0;
++    gf_boolean_t ret = _gf_false;
+     int i = local_subvol_index;
+     char *str = NULL;
+     uint32_t hashval = 0;
+@@ -2629,12 +2629,11 @@ gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid)
+     }
+ 
+     str = uuid_utoa_r(gfid, buf);
+-    ret = dht_hash_compute(this, 0, str, &hashval);
+-    if (ret == 0) {
++    if (dht_hash_compute(this, 0, str, &hashval) == 0) {
+         index = (hashval % entry->count);
+         if (entry->elements[index].info == REBAL_NODEUUID_MINE) {
+             /* Index matches this node's nodeuuid.*/
+-            ret = 1;
++            ret = _gf_true;
+             goto out;
+         }
+ 
+@@ -2647,12 +2646,12 @@ gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid)
+             /* None of the bricks in the subvol are up.
+              * CHILD_DOWN will kill the process soon */
+ 
+-            return 0;
++            return _gf_false;
+         }
+ 
+         if (entry->elements[index].info == REBAL_NODEUUID_MINE) {
+             /* Index matches this node's nodeuuid.*/
+-            ret = 1;
++            ret = _gf_true;
+             goto out;
+         }
+     }
+@@ -2701,6 +2700,7 @@ gf_defrag_migrate_single_file(void *opaque)
+     struct iatt *iatt_ptr = NULL;
+     gf_boolean_t update_skippedcount = _gf_true;
+     int i = 0;
++    gf_boolean_t should_i_migrate = _gf_false;
+ 
+     rebal_entry = (struct dht_container *)opaque;
+     if (!rebal_entry) {
+@@ -2754,11 +2754,29 @@ gf_defrag_migrate_single_file(void *opaque)
+         goto out;
+     }
+ 
++    should_i_migrate = gf_defrag_should_i_migrate(
++        this, rebal_entry->local_subvol_index, entry->d_stat.ia_gfid);
++
+     gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid);
+ 
+     gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+ 
+     ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
++
++    if (!should_i_migrate) {
++        /* this node isn't supposed to migrate the file. 
Suppressing any
++         * potential error from lookup as this file is under migration by
++         * another node */
++        if (ret) {
++            gf_msg_debug(this->name, -ret,
++                         "Ignoring lookup failure: node isn't migrating %s",
++                         entry_loc.path);
++            ret = 0;
++        }
++        gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path);
++        goto out;
++    }
++
+     if (ret) {
+         gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+                "Migrate file failed: %s lookup failed", entry_loc.path);
+@@ -2779,12 +2797,6 @@ gf_defrag_migrate_single_file(void *opaque)
+         goto out;
+     }
+ 
+-    if (!gf_defrag_should_i_migrate(this, rebal_entry->local_subvol_index,
+-                                    entry->d_stat.ia_gfid)) {
+-        gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path);
+-        goto out;
+-    }
+-
+     iatt_ptr = &iatt;
+ 
+     hashed_subvol = dht_subvol_get_hashed(this, &entry_loc);
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0581-afr-don-t-reopen-fds-on-which-POSIX-locks-are-held.patch b/SOURCES/0581-afr-don-t-reopen-fds-on-which-POSIX-locks-are-held.patch
new file mode 100644
index 0000000..1267608
--- /dev/null
+++ b/SOURCES/0581-afr-don-t-reopen-fds-on-which-POSIX-locks-are-held.patch
@@ -0,0 +1,1431 @@
+From 57c794e31c0333f508ada740227c9afa1889f8ae Mon Sep 17 00:00:00 2001
+From: karthik-us
+Date: Thu, 15 Apr 2021 11:27:57 +0530
+Subject: [PATCH 581/584] afr: don't reopen fds on which POSIX locks are held
+
+When client.strict-locks is enabled on a volume and POSIX locks are
+held on the files, do not re-open such fds after a disconnect and
+reconnection of the clients, since re-opening them might lead to
+multiple clients acquiring the locks and cause data corruption.
+
+> Upstream patch: https://github.com/gluster/glusterfs/pull/1980/commits/56bde56c2741c5eac59937a6cf951a14f2878460
+> Change-Id: I8777ffbc2cc8d15ab57b58b72b56eb67521787c5
+> Fixes: #1977
+> Signed-off-by: karthik-us
+
+BUG: 1689375
+Change-Id: I8777ffbc2cc8d15ab57b58b72b56eb67521787c5
+Signed-off-by: karthik-us
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245414
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+Reviewed-by: Ravishankar Narayanankutty
+---
+ rpc/rpc-lib/src/protocol-common.h                |   6 +
+ tests/bugs/replicate/do-not-reopen-fd.t          | 206 +++++++++++++++++
+ xlators/cluster/afr/src/afr-common.c             |  15 +-
+ xlators/cluster/afr/src/afr-open.c               | 280 +++++++++++++++++++----
+ xlators/cluster/afr/src/afr.h                    |   3 +
+ xlators/protocol/client/src/client-common.c      | 148 ++++++++----
+ xlators/protocol/client/src/client-common.h     |   4 +
+ xlators/protocol/client/src/client-helpers.c     |  22 +-
+ xlators/protocol/client/src/client-rpc-fops.c    |  23 +-
+ xlators/protocol/client/src/client-rpc-fops_v2.c |  25 +-
+ xlators/protocol/client/src/client.c             |  21 +-
+ xlators/protocol/client/src/client.h             |   8 +-
+ 12 files changed, 654 insertions(+), 107 deletions(-)
+ create mode 100644 tests/bugs/replicate/do-not-reopen-fd.t
+
+diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h
+index 779878f..f56aaaa 100644
+--- a/rpc/rpc-lib/src/protocol-common.h
++++ b/rpc/rpc-lib/src/protocol-common.h
+@@ -312,6 +312,12 @@ enum glusterd_mgmt_v3_procnum {
+     GLUSTERD_MGMT_V3_MAXVALUE,
+ };
+ 
++enum gf_fd_reopen_status {
++    FD_REOPEN_ALLOWED = 0,
++    FD_REOPEN_NOT_ALLOWED,
++    FD_BAD,
++};
++
+ typedef struct gf_gsync_detailed_status_ gf_gsync_status_t;
+ 
+ enum gf_get_volume_info_type {
+diff --git a/tests/bugs/replicate/do-not-reopen-fd.t b/tests/bugs/replicate/do-not-reopen-fd.t
+new file mode 100644
+index 0000000..76d8e70
+--- /dev/null
++++ 
b/tests/bugs/replicate/do-not-reopen-fd.t +@@ -0,0 +1,206 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../fileio.rc ++ ++cleanup; ++ ++TEST glusterd; ++TEST pidof glusterd ++ ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.open-behind off ++TEST $CLI volume set $V0 client.strict-locks on ++TEST $CLI volume heal $V0 disable ++TEST $CLI volume start $V0 ++EXPECT 'Started' volinfo_field $V0 'Status'; ++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 ++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M1 ++ ++TEST touch $M0/a ++ ++# Kill one brick and take lock on the fd and do a write. ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'rw' $M0/a ++ ++TEST flock -x $fd1 ++TEST fd_write $fd1 "data-1" ++ ++# Restart the brick and then write. Now fd should not get re-opened but write ++# should still succeed as there were no quorum disconnects. ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST fd_write $fd1 "data-2" ++EXPECT "" cat $B0/${V0}0/a ++EXPECT "data-2" cat $B0/${V0}1/a ++EXPECT "data-2" cat $B0/${V0}2/a ++ ++# Check there is no fd opened on the 1st brick by checking for the gfid inside ++# /proc/pid-of-brick/fd/ directory ++gfid_a=$(gf_get_gfid_xattr $B0/${V0}0/a) ++gfid_str_a=$(gf_gfid_xattr_to_str $gfid_a) ++ ++EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++TEST fd2=`fd_available` ++TEST fd_open $fd2 'rw' $M1/a ++ ++# Kill 2nd brick and try writing to the file. The write should fail due to ++# quorum failure. ++TEST kill_brick $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++TEST ! fd_write $fd1 "data-3" ++TEST ! fd_cat $fd1 ++ ++# Restart the bricks and try writing to the file. This should fail as two bricks ++# which were down previously, will return EBADFD now. ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++TEST ! fd_write $fd1 "data-4" ++TEST ! fd_cat $fd1 ++ ++# Enable heal and check the files will have same content on all the bricks after ++# the heal is completed. ++EXPECT_WITHIN $HEAL_TIMEOUT "^2$" get_pending_heal_count $V0 ++TEST $CLI volume heal $V0 enable ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++ ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++EXPECT "data-4" cat $B0/${V0}0/a ++EXPECT "data-4" cat $B0/${V0}1/a ++EXPECT "data-4" cat $B0/${V0}2/a ++TEST $CLI volume heal $V0 disable ++ ++# Try writing to the file again on the same fd, which should fail again, since ++# it is not yet re-opened. ++TEST ! 
fd_write $fd1 "data-5" ++ ++# At this point only one brick will have the lock. Try taking the lock again on ++# the bad fd, which should also fail with EBADFD. ++TEST ! flock -x $fd1 ++ ++# Kill the only brick that is having lock and try taking lock on another client ++# which should succeed. ++TEST kill_brick $V0 $H0 $B0/${V0}2 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++TEST flock -x $fd2 ++TEST fd_write $fd2 "data-6" ++ ++# Bring the brick up and try writing & reading on the old fd, which should still ++# fail and operations on the 2nd fd should succeed. ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++TEST ! fd_write $fd1 "data-7" ++ ++TEST ! fd_cat $fd1 ++TEST fd_cat $fd2 ++ ++# Close both the fds which will release the locks and then re-open and take lock ++# on the old fd. Operations on that fd should succeed afterwards. ++TEST fd_close $fd1 ++TEST fd_close $fd2 ++ ++TEST ! ls /proc/$$/fd/$fd1 ++TEST ! ls /proc/$$/fd/$fd2 ++EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'rw' $M0/a ++EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++TEST flock -x $fd1 ++TEST fd_write $fd1 "data-8" ++TEST fd_cat $fd1 ++ ++EXPECT "data-8" head -n 1 $B0/${V0}0/a ++EXPECT "data-8" head -n 1 $B0/${V0}1/a ++EXPECT "data-8" head -n 1 $B0/${V0}2/a ++ ++TEST fd_close $fd1 ++ ++# Heal the volume ++TEST $CLI volume heal $V0 enable ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++ ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++TEST $CLI volume heal $V0 disable ++ ++# Kill one brick and open a fd. ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'rw' $M0/a ++ ++EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++# Restart the brick and then write. Now fd should get re-opened and write should ++# succeed on the previously down brick as well since there are no locks held on ++# any of the bricks. 
++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST fd_write $fd1 "data-10" ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++ ++EXPECT "data-10" head -n 1 $B0/${V0}0/a ++EXPECT "data-10" head -n 1 $B0/${V0}1/a ++EXPECT "data-10" head -n 1 $B0/${V0}2/a ++TEST fd_close $fd1 ++ ++# Kill one brick, open and take lock on a fd. ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'rw' $M0/a ++TEST flock -x $fd1 ++ ++EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++# Kill & restart another brick so that it will return EBADFD ++TEST kill_brick $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1 ++ ++# Restart the bricks and then write. Now fd should not get re-opened since lock ++# is still held on one brick and write should also fail as there is no quorum. ++ ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++TEST ! fd_write $fd1 "data-11" ++EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++EXPECT "data-10" head -n 1 $B0/${V0}0/a ++EXPECT "data-10" head -n 1 $B0/${V0}1/a ++EXPECT "data-11" head -n 1 $B0/${V0}2/a ++ ++TEST fd_close $fd1 ++cleanup +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 416012c..bd46e59 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -2067,6 +2067,8 @@ afr_local_cleanup(afr_local_t *local, xlator_t *this) + dict_unref(local->cont.entrylk.xdata); + } + ++ GF_FREE(local->need_open); ++ + if (local->xdata_req) + dict_unref(local->xdata_req); + +@@ -5689,6 +5691,14 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) + } + local->is_new_entry = _gf_false; + ++ local->need_open = GF_CALLOC(priv->child_count, sizeof(*local->need_open), ++ gf_afr_mt_char); ++ if (!local->need_open) { ++ if (op_errno) ++ *op_errno = ENOMEM; ++ goto out; ++ } ++ + INIT_LIST_HEAD(&local->healer); + return 0; + out: +@@ -6124,9 +6134,8 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc) + char *substr = NULL; + char *status = NULL; + +- ret = afr_lockless_inspect(frame, this, loc->gfid, &inode, +- &entry_selfheal, &data_selfheal, +- &metadata_selfheal, &pending); ++ ret = afr_lockless_inspect(frame, this, loc->gfid, &inode, &entry_selfheal, ++ &data_selfheal, &metadata_selfheal, &pending); + + if (ret == -ENOMEM) { + ret = -1; +diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c +index ff72c73..73c1552 100644 +--- a/xlators/cluster/afr/src/afr-open.c ++++ b/xlators/cluster/afr/src/afr-open.c +@@ -35,6 +35,8 @@ + #include "afr-dir-read.h" + 
#include "afr-dir-write.h" + #include "afr-transaction.h" ++#include "afr-self-heal.h" ++#include "protocol-common.h" + + gf_boolean_t + afr_is_fd_fixable(fd_t *fd) +@@ -239,8 +241,32 @@ afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + return 0; + } + ++static void ++afr_fd_ctx_reset_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) ++{ ++ afr_fd_ctx_t *fd_ctx = NULL; ++ afr_private_t *priv = NULL; ++ int i = 0; ++ ++ priv = this->private; ++ fd_ctx = afr_fd_ctx_get(fd, this); ++ if (!fd_ctx) ++ return; ++ ++ LOCK(&fd->lock); ++ { ++ for (i = 0; i < priv->child_count; i++) { ++ if (fd_ctx->opened_on[i] == AFR_FD_OPENING && need_open[i]) { ++ fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; ++ need_open[i] = 0; ++ } ++ } ++ } ++ UNLOCK(&fd->lock); ++} ++ + static int +-afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) ++afr_fd_ctx_set_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) + { + afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; +@@ -248,7 +274,6 @@ afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) + int count = 0; + + priv = this->private; +- + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) + return 0; +@@ -271,21 +296,217 @@ afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) + return count; + } + ++static int ++afr_do_fix_open(call_frame_t *frame, xlator_t *this) ++{ ++ afr_local_t *local = frame->local; ++ afr_private_t *priv = NULL; ++ int i = 0; ++ int need_open_count = 0; ++ ++ priv = this->private; ++ ++ need_open_count = AFR_COUNT(local->need_open, priv->child_count); ++ if (!need_open_count) { ++ goto out; ++ } ++ gf_msg_debug(this->name, 0, "need open count: %d", need_open_count); ++ local->call_count = need_open_count; ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (!local->need_open[i]) ++ continue; ++ ++ if (IA_IFDIR == local->fd->inode->ia_type) { ++ gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s", ++ local->loc.path, priv->children[i]->name); ++ STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, ++ priv->children[i], ++ priv->children[i]->fops->opendir, &local->loc, ++ local->fd, NULL); ++ } else { ++ gf_msg_debug(this->name, 0, ++ "opening fd for file %s on subvolume %s", ++ local->loc.path, priv->children[i]->name); ++ ++ STACK_WIND_COOKIE( ++ frame, afr_openfd_fix_open_cbk, (void *)(long)i, ++ priv->children[i], priv->children[i]->fops->open, &local->loc, ++ local->fd_ctx->flags & ~(O_CREAT | O_EXCL | O_TRUNC), local->fd, ++ NULL); ++ } ++ if (!--need_open_count) ++ break; ++ } ++ return 0; ++ ++out: ++ afr_fd_ctx_reset_need_open(local->fd, this, local->need_open); ++ AFR_STACK_DESTROY(frame); ++ return 0; ++} ++ ++static int ++afr_is_reopen_allowed_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct gf_flock *lock, dict_t *xdata) ++{ ++ afr_local_t *local = frame->local; ++ afr_private_t *priv = NULL; ++ int ret = -1; ++ int call_count = 0; ++ int i = (long)cookie; ++ int32_t fd_reopen_status = -1; ++ int32_t final_reopen_status = -1; ++ ++ priv = this->private; ++ local->replies[i].valid = 1; ++ local->replies[i].op_ret = op_ret; ++ local->replies[i].op_errno = op_errno; ++ if (op_ret != 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_DICT_GET_FAILED, ++ "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid)); ++ } ++ ++ if (xdata) ++ local->replies[i].xdata = dict_ref(xdata); ++ ++ call_count = afr_frame_return(frame); ++ ++ if 
(call_count) ++ return 0; ++ ++ /* Currently we get 3 values from the lower layer (protocol/client) in the ++ * getlk_cbk. ++ * FD_REOPEN_ALLOWED : No conflicting locks are held and reopen is allowed ++ * FD_REOPEN_NOT_ALLOWED : Conflicting locks are held and reopen is not ++ * allowed ++ * FD_BAD : FD is not valid ++ * ++ * - If we get FD_REOPEN_NOT_ALLOWED from any of the bricks, will block the ++ * reopen taking this as high priority. ++ * - If we get FD_BAD from all the replies, we will not reopen since we do ++ * not know the correct status. ++ * - If we get FD_BAD from few brick and FD_REOPEN_NOT_ALLOWED from one or ++ * more bricks, then we will block reopen. ++ * - If we get FD_BAD from few bricks and FD_REOPEN_ALLOWED from one or ++ * more bricks, then we will allow the reopen. ++ * ++ * We will update the final_reopen_status only when the value returned ++ * from lower layer is >= FD_REOPEN_ALLOWED and < FD_BAD. We will not set ++ * FD_BAD in final_reopen_status, since it can lead to unexpected ++ * behaviours. ++ * ++ * At the end of this loop, if we still have final_reopen_status as -1 ++ * i.e., the init value, it means we failed to get the fd status from any ++ * of the bricks or we do not have a valid fd on any of the bricks. We ++ * will not reopen the fd in this case as well. ++ */ ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (final_reopen_status != FD_REOPEN_NOT_ALLOWED && ++ local->replies[i].xdata) { ++ ret = dict_get_int32(xdata, "fd-reopen-status", &fd_reopen_status); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, ++ "Failed to get whether reopen is allowed or not on fd " ++ "for file %s on subvolume %s.", ++ local->loc.path, priv->children[i]->name); ++ } else if (fd_reopen_status >= FD_REOPEN_ALLOWED && ++ fd_reopen_status < FD_BAD) { ++ final_reopen_status = fd_reopen_status; ++ } ++ } ++ ++ if (final_reopen_status == FD_REOPEN_NOT_ALLOWED) ++ break; ++ } ++ ++ if (final_reopen_status == FD_REOPEN_NOT_ALLOWED) { ++ gf_log(this->name, GF_LOG_INFO, ++ "Conflicting locks held on file %s. FD reopen is not allowed.", ++ local->loc.path); ++ } else if (final_reopen_status == -1) { ++ gf_log(this->name, GF_LOG_INFO, ++ "Failed to get the lock information " ++ "on file %s. 
FD reopen is not allowed.", ++ local->loc.path); ++ } else { ++ afr_local_replies_wipe(local, priv); ++ afr_do_fix_open(frame, this); ++ return 0; ++ } ++ ++ afr_fd_ctx_reset_need_open(local->fd, this, local->need_open); ++ AFR_STACK_DESTROY(frame); ++ return 0; ++} ++ + void +-afr_fix_open(fd_t *fd, xlator_t *this) ++afr_is_reopen_allowed(xlator_t *this, call_frame_t *frame) + { + afr_private_t *priv = NULL; ++ afr_local_t *local = NULL; ++ dict_t *xdata = NULL; + int i = 0; ++ int call_count = 0; ++ struct gf_flock flock = { ++ 0, ++ }; ++ ++ local = frame->local; ++ priv = this->private; ++ ++ flock.l_type = F_WRLCK; ++ afr_set_lk_owner(frame, this, frame->root); ++ lk_owner_copy(&flock.l_owner, &frame->root->lk_owner); ++ ++ call_count = AFR_COUNT(local->child_up, priv->child_count); ++ if (!call_count) ++ goto out; ++ local->call_count = call_count; ++ ++ xdata = dict_new(); ++ if (xdata == NULL) ++ goto out; ++ ++ if (dict_set_int32(xdata, "fd-reopen-status", -1)) ++ goto out; ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (local->child_up[i]) { ++ STACK_WIND_COOKIE(frame, afr_is_reopen_allowed_cbk, (void *)(long)i, ++ priv->children[i], priv->children[i]->fops->lk, ++ local->fd, F_GETLK, &flock, xdata); ++ } else { ++ continue; ++ } ++ ++ if (!--call_count) ++ break; ++ } ++ ++ dict_unref(xdata); ++ return; ++ ++out: ++ if (xdata) ++ dict_unref(xdata); ++ afr_fd_ctx_reset_need_open(local->fd, this, local->need_open); ++ AFR_STACK_DESTROY(frame); ++ return; ++} ++ ++void ++afr_fix_open(fd_t *fd, xlator_t *this) ++{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; +- unsigned char *need_open = NULL; + int call_count = 0; + +- priv = this->private; +- + if (!afr_is_fd_fixable(fd)) + goto out; + +@@ -293,12 +514,6 @@ afr_fix_open(fd_t *fd, xlator_t *this) + if (!fd_ctx) + goto out; + +- need_open = alloca0(priv->child_count); +- +- call_count = afr_fd_ctx_need_open(fd, this, need_open); +- if (!call_count) +- goto out; +- + frame = create_frame(this, this->ctx->pool); + if (!frame) + goto out; +@@ -307,47 +522,24 @@ afr_fix_open(fd_t *fd, xlator_t *this) + if (!local) + goto out; + ++ call_count = afr_fd_ctx_set_need_open(fd, this, local->need_open); ++ if (!call_count) ++ goto out; ++ + local->loc.inode = inode_ref(fd->inode); + ret = loc_path(&local->loc, NULL); + if (ret < 0) + goto out; +- + local->fd = fd_ref(fd); + local->fd_ctx = fd_ctx; + +- local->call_count = call_count; +- +- gf_msg_debug(this->name, 0, "need open count: %d", call_count); +- +- for (i = 0; i < priv->child_count; i++) { +- if (!need_open[i]) +- continue; +- +- if (IA_IFDIR == fd->inode->ia_type) { +- gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s", +- local->loc.path, priv->children[i]->name); +- +- STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, +- priv->children[i], +- priv->children[i]->fops->opendir, &local->loc, +- local->fd, NULL); +- } else { +- gf_msg_debug(this->name, 0, +- "opening fd for file %s on subvolume %s", +- local->loc.path, priv->children[i]->name); +- +- STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, +- priv->children[i], priv->children[i]->fops->open, +- &local->loc, fd_ctx->flags & (~O_TRUNC), +- local->fd, NULL); +- } +- +- if (!--call_count) +- break; +- } +- ++ afr_is_reopen_allowed(this, frame); + return; ++ + out: ++ if (call_count) ++ afr_fd_ctx_reset_need_open(fd, this, local->need_open); + if (frame) + AFR_STACK_DESTROY(frame); ++ 
return; + } +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index 6a9a763..ffc7317 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -895,6 +895,9 @@ typedef struct _afr_local { + afr_ta_fop_state_t fop_state; + int ta_failed_subvol; + gf_boolean_t is_new_entry; ++ ++ /* For fix_open */ ++ unsigned char *need_open; + } afr_local_t; + + typedef struct afr_spbc_timeout { +diff --git a/xlators/protocol/client/src/client-common.c b/xlators/protocol/client/src/client-common.c +index 1417a60..92cda12 100644 +--- a/xlators/protocol/client/src/client-common.c ++++ b/xlators/protocol/client/src/client-common.c +@@ -343,7 +343,7 @@ client_pre_readv(xlator_t *this, gfs3_read_req *req, fd_t *fd, size_t size, + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_READ, out); + + req->size = size; + req->offset = offset; +@@ -368,7 +368,7 @@ client_pre_writev(xlator_t *this, gfs3_write_req *req, fd_t *fd, size_t size, + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_WRITE, out); + + req->size = size; + req->offset = offset; +@@ -429,7 +429,8 @@ client_pre_flush(xlator_t *this, gfs3_flush_req *req, fd_t *fd, dict_t *xdata) + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FLUSH, out); + + req->fd = remote_fd; + memcpy(req->gfid, fd->inode->gfid, 16); +@@ -450,7 +451,7 @@ client_pre_fsync(xlator_t *this, gfs3_fsync_req *req, fd_t *fd, int32_t flags, + int op_errno = 0; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FSYNC, out); + + req->fd = remote_fd; + req->data = flags; +@@ -591,7 +592,8 @@ client_pre_fsyncdir(xlator_t *this, gfs3_fsyncdir_req *req, fd_t *fd, + int32_t op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSYNCDIR, out); + + req->fd = remote_fd; + req->data = flags; +@@ -668,7 +670,8 @@ client_pre_ftruncate(xlator_t *this, gfs3_ftruncate_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = EINVAL; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FTRUNCATE, out); + + req->offset = offset; + req->fd = remote_fd; +@@ -687,7 +690,8 @@ client_pre_fstat(xlator_t *this, gfs3_fstat_req *req, fd_t *fd, dict_t *xdata) + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSTAT, out); + + req->fd = remote_fd; + memcpy(req->gfid, fd->inode->gfid, 16); +@@ -710,7 +714,8 @@ client_pre_lk(xlator_t *this, gfs3_lk_req *req, int32_t cmd, + int32_t gf_type = 0; + int ret = 0; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_LK, out); + + ret = client_cmd_to_gf_cmd(cmd, &gf_cmd); + if (ret) { +@@ -787,7 +792,8 @@ client_pre_readdir(xlator_t *this, gfs3_readdir_req *req, fd_t *fd, size_t size, + int64_t remote_fd = -1; + int op_errno = 
ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_READDIR, out); + + req->size = size; + req->offset = offset; +@@ -869,7 +875,7 @@ client_pre_finodelk(xlator_t *this, gfs3_finodelk_req *req, fd_t *fd, int cmd, + int32_t gf_cmd = 0; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FINODELK, out); + + if (cmd == F_GETLK || cmd == F_GETLK64) + gf_cmd = GF_LK_GETLK; +@@ -952,7 +958,8 @@ client_pre_fentrylk(xlator_t *this, gfs3_fentrylk_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FENTRYLK, out); + + req->fd = remote_fd; + req->cmd = cmd_entrylk; +@@ -1013,7 +1020,7 @@ client_pre_fxattrop(xlator_t *this, gfs3_fxattrop_req *req, fd_t *fd, + int64_t remote_fd = -1; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FXATTROP, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -1039,7 +1046,8 @@ client_pre_fgetxattr(xlator_t *this, gfs3_fgetxattr_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FGETXATTR, out); + + req->namelen = 1; /* Use it as a flag */ + req->fd = remote_fd; +@@ -1065,7 +1073,8 @@ client_pre_fsetxattr(xlator_t *this, gfs3_fsetxattr_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSETXATTR, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -1091,7 +1100,8 @@ client_pre_rchecksum(xlator_t *this, gfs3_rchecksum_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_RCHECKSUM, out); + + req->len = len; + req->offset = offset; +@@ -1141,7 +1151,8 @@ client_pre_fsetattr(xlator_t *this, gfs3_fsetattr_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSETATTR, out); + + req->fd = remote_fd; + req->valid = valid; +@@ -1161,7 +1172,8 @@ client_pre_readdirp(xlator_t *this, gfs3_readdirp_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_READDIRP, out); + + req->size = size; + req->offset = offset; +@@ -1187,7 +1199,8 @@ client_pre_fremovexattr(xlator_t *this, gfs3_fremovexattr_req *req, fd_t *fd, + if (!(fd && fd->inode)) + goto out; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FREMOVEXATTR, out); + + memcpy(req->gfid, fd->inode->gfid, 16); + req->name = (char *)name; +@@ -1208,7 +1221,8 @@ client_pre_fallocate(xlator_t *this, 
gfs3_fallocate_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FALLOCATE, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -1230,7 +1244,8 @@ client_pre_discard(xlator_t *this, gfs3_discard_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_DISCARD, out); + + req->fd = remote_fd; + req->offset = offset; +@@ -1251,7 +1266,8 @@ client_pre_zerofill(xlator_t *this, gfs3_zerofill_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_ZEROFILL, out); + + req->fd = remote_fd; + req->offset = offset; +@@ -1286,7 +1302,8 @@ client_pre_seek(xlator_t *this, gfs3_seek_req *req, fd_t *fd, off_t offset, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_SEEK, out); + + memcpy(req->gfid, fd->inode->gfid, 16); + req->fd = remote_fd; +@@ -2508,7 +2525,7 @@ client_pre_readv_v2(xlator_t *this, gfx_read_req *req, fd_t *fd, size_t size, + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_READ, out); + + req->size = size; + req->offset = offset; +@@ -2532,7 +2549,7 @@ client_pre_writev_v2(xlator_t *this, gfx_write_req *req, fd_t *fd, size_t size, + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_WRITE, out); + + req->size = size; + req->offset = offset; +@@ -2567,10 +2584,10 @@ client_pre_copy_file_range_v2(xlator_t *this, gfx_copy_file_range_req *req, + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd_in, FALLBACK_TO_ANON_FD, remote_fd_in, +- op_errno, out); ++ op_errno, GFS3_OP_COPY_FILE_RANGE, out); + + CLIENT_GET_REMOTE_FD(this, fd_out, FALLBACK_TO_ANON_FD, remote_fd_out, +- op_errno, out); ++ op_errno, GFS3_OP_COPY_FILE_RANGE, out); + req->size = size; + req->off_in = off_in; + req->off_out = off_out; +@@ -2623,7 +2640,8 @@ client_pre_flush_v2(xlator_t *this, gfx_flush_req *req, fd_t *fd, dict_t *xdata) + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FLUSH, out); + + req->fd = remote_fd; + memcpy(req->gfid, fd->inode->gfid, 16); +@@ -2643,7 +2661,7 @@ client_pre_fsync_v2(xlator_t *this, gfx_fsync_req *req, fd_t *fd, int32_t flags, + int op_errno = 0; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FSYNC, out); + + req->fd = remote_fd; + req->data = flags; +@@ -2778,7 +2796,8 @@ client_pre_fsyncdir_v2(xlator_t *this, gfx_fsyncdir_req *req, fd_t *fd, + int32_t op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSYNCDIR, out); + + req->fd = remote_fd; + req->data = 
flags; +@@ -2852,7 +2871,8 @@ client_pre_ftruncate_v2(xlator_t *this, gfx_ftruncate_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = EINVAL; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FTRUNCATE, out); + + req->offset = offset; + req->fd = remote_fd; +@@ -2870,7 +2890,8 @@ client_pre_fstat_v2(xlator_t *this, gfx_fstat_req *req, fd_t *fd, dict_t *xdata) + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSTAT, out); + + req->fd = remote_fd; + memcpy(req->gfid, fd->inode->gfid, 16); +@@ -2892,7 +2913,8 @@ client_pre_lk_v2(xlator_t *this, gfx_lk_req *req, int32_t cmd, + int32_t gf_type = 0; + int ret = 0; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_LK, out); + + ret = client_cmd_to_gf_cmd(cmd, &gf_cmd); + if (ret) { +@@ -2967,7 +2989,8 @@ client_pre_readdir_v2(xlator_t *this, gfx_readdir_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_READDIR, out); + + req->size = size; + req->offset = offset; +@@ -3048,7 +3071,7 @@ client_pre_finodelk_v2(xlator_t *this, gfx_finodelk_req *req, fd_t *fd, int cmd, + int32_t gf_cmd = 0; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FINODELK, out); + + if (cmd == F_GETLK || cmd == F_GETLK64) + gf_cmd = GF_LK_GETLK; +@@ -3129,7 +3152,8 @@ client_pre_fentrylk_v2(xlator_t *this, gfx_fentrylk_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FENTRYLK, out); + + req->fd = remote_fd; + req->cmd = cmd_entrylk; +@@ -3185,7 +3209,7 @@ client_pre_fxattrop_v2(xlator_t *this, gfx_fxattrop_req *req, fd_t *fd, + int64_t remote_fd = -1; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FXATTROP, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -3207,7 +3231,8 @@ client_pre_fgetxattr_v2(xlator_t *this, gfx_fgetxattr_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FGETXATTR, out); + + req->namelen = 1; /* Use it as a flag */ + req->fd = remote_fd; +@@ -3232,7 +3257,8 @@ client_pre_fsetxattr_v2(xlator_t *this, gfx_fsetxattr_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSETXATTR, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -3256,7 +3282,8 @@ client_pre_rchecksum_v2(xlator_t *this, gfx_rchecksum_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, 
remote_fd, op_errno, ++ GFS3_OP_RCHECKSUM, out); + + req->len = len; + req->offset = offset; +@@ -3304,7 +3331,8 @@ client_pre_fsetattr_v2(xlator_t *this, gfx_fsetattr_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSETATTR, out); + + memcpy(req->gfid, fd->inode->gfid, 16); + req->fd = remote_fd; +@@ -3324,7 +3352,8 @@ client_pre_readdirp_v2(xlator_t *this, gfx_readdirp_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_READDIRP, out); + + req->size = size; + req->offset = offset; +@@ -3349,7 +3378,8 @@ client_pre_fremovexattr_v2(xlator_t *this, gfx_fremovexattr_req *req, fd_t *fd, + if (!(fd && fd->inode)) + goto out; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FREMOVEXATTR, out); + + memcpy(req->gfid, fd->inode->gfid, 16); + req->name = (char *)name; +@@ -3369,7 +3399,8 @@ client_pre_fallocate_v2(xlator_t *this, gfx_fallocate_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FALLOCATE, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -3390,7 +3421,8 @@ client_pre_discard_v2(xlator_t *this, gfx_discard_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_DISCARD, out); + + req->fd = remote_fd; + req->offset = offset; +@@ -3410,7 +3442,8 @@ client_pre_zerofill_v2(xlator_t *this, gfx_zerofill_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_ZEROFILL, out); + + req->fd = remote_fd; + req->offset = offset; +@@ -3439,7 +3472,8 @@ client_pre_seek_v2(xlator_t *this, gfx_seek_req *req, fd_t *fd, off_t offset, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_SEEK, out); + + memcpy(req->gfid, fd->inode->gfid, 16); + req->fd = remote_fd; +@@ -3587,3 +3621,25 @@ client_post_rename_v2(xlator_t *this, gfx_rename_rsp *rsp, struct iatt *stbuf, + + return xdr_to_dict(&rsp->xdata, xdata); + } ++ ++void ++set_fd_reopen_status(xlator_t *this, dict_t *xdata, ++ enum gf_fd_reopen_status fd_reopen_status) ++{ ++ clnt_conf_t *conf = NULL; ++ ++ conf = this->private; ++ if (!conf) { ++ gf_msg_debug(this->name, ENOMEM, "Failed to get client conf"); ++ return; ++ } ++ ++ if (!conf->strict_locks) ++ fd_reopen_status = FD_REOPEN_ALLOWED; ++ ++ if (dict_set_int32(xdata, "fd-reopen-status", fd_reopen_status)) ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, PC_MSG_DICT_SET_FAILED, ++ NULL); ++ ++ return; ++} +diff --git a/xlators/protocol/client/src/client-common.h 
b/xlators/protocol/client/src/client-common.h +index a2043d8..16fb167 100644 +--- a/xlators/protocol/client/src/client-common.h ++++ b/xlators/protocol/client/src/client-common.h +@@ -627,4 +627,8 @@ client_pre_copy_file_range_v2(xlator_t *this, gfx_copy_file_range_req *req, + off64_t off_out, size_t size, int32_t flags, + dict_t **xdata); + ++void ++set_fd_reopen_status(xlator_t *this, dict_t *xdata, ++ enum gf_fd_reopen_status fd_reopen_allowed); ++ + #endif /* __CLIENT_COMMON_H__ */ +diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c +index 6543100..48b6448 100644 +--- a/xlators/protocol/client/src/client-helpers.c ++++ b/xlators/protocol/client/src/client-helpers.c +@@ -406,11 +406,12 @@ clnt_readdir_rsp_cleanup_v2(gfx_readdir_rsp *rsp) + } + + int +-client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd) ++client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd, ++ enum gf_fop_procnum fop) + { + clnt_fd_ctx_t *fdctx = NULL; + clnt_conf_t *conf = NULL; +- gf_boolean_t locks_held = _gf_false; ++ gf_boolean_t locks_involved = _gf_false; + + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, remote_fd, out); +@@ -423,23 +424,32 @@ client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd) + if (fd->anonymous) { + *remote_fd = GF_ANON_FD_NO; + } else { ++ if (conf->strict_locks && ++ (fop == GFS3_OP_WRITE || fop == GFS3_OP_FTRUNCATE || ++ fop == GFS3_OP_FALLOCATE || fop == GFS3_OP_ZEROFILL || ++ fop == GFS3_OP_DISCARD)) { ++ locks_involved = _gf_true; ++ } + *remote_fd = -1; + gf_msg_debug(this->name, EBADF, "not a valid fd for gfid: %s", + uuid_utoa(fd->inode->gfid)); + } + } else { +- if (__is_fd_reopen_in_progress(fdctx)) ++ if (__is_fd_reopen_in_progress(fdctx)) { + *remote_fd = -1; +- else ++ } else { + *remote_fd = fdctx->remote_fd; ++ } + +- locks_held = !list_empty(&fdctx->lock_list); ++ locks_involved = !list_empty(&fdctx->lock_list); + } + } + pthread_spin_unlock(&conf->fd_lock); + +- if ((flags & FALLBACK_TO_ANON_FD) && (*remote_fd == -1) && (!locks_held)) ++ if ((flags & FALLBACK_TO_ANON_FD) && (*remote_fd == -1) && ++ (!locks_involved)) { + *remote_fd = GF_ANON_FD_NO; ++ } + + return 0; + out: +diff --git a/xlators/protocol/client/src/client-rpc-fops.c b/xlators/protocol/client/src/client-rpc-fops.c +index 3110c78..46ac544 100644 +--- a/xlators/protocol/client/src/client-rpc-fops.c ++++ b/xlators/protocol/client/src/client-rpc-fops.c +@@ -2439,6 +2439,13 @@ client3_3_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + } + } + ++ if (local->check_reopen) { ++ if (lock.l_type == F_WRLCK) ++ set_fd_reopen_status(this, xdata, FD_REOPEN_NOT_ALLOWED); ++ else ++ set_fd_reopen_status(this, xdata, FD_REOPEN_ALLOWED); ++ } ++ + out: + if ((rsp.op_ret == -1) && (EAGAIN != gf_error_to_errno(rsp.op_errno))) { + gf_msg(this->name, GF_LOG_WARNING, gf_error_to_errno(rsp.op_errno), +@@ -5198,6 +5205,7 @@ client3_3_lk(call_frame_t *frame, xlator_t *this, void *data) + 0, + }, + }; ++ dict_t *xdata = NULL; + int32_t gf_cmd = 0; + clnt_local_t *local = NULL; + clnt_conf_t *conf = NULL; +@@ -5224,6 +5232,10 @@ client3_3_lk(call_frame_t *frame, xlator_t *this, void *data) + goto unwind; + } + ++ ret = dict_get_int32(args->xdata, "fd-reopen-status", &local->check_reopen); ++ if (ret) ++ local->check_reopen = 0; ++ + local->owner = frame->root->lk_owner; + local->cmd = args->cmd; + local->fd = fd_ref(args->fd); +@@ -5237,6 +5249,13 @@ 
client3_3_lk(call_frame_t *frame, xlator_t *this, void *data) + client_is_setlk(local->cmd)) { + client_add_lock_for_recovery(local->fd, args->flock, &local->owner, + local->cmd); ++ } else if (local->check_reopen) { ++ xdata = dict_new(); ++ if (xdata == NULL) { ++ op_errno = ENOMEM; ++ goto unwind; ++ } ++ set_fd_reopen_status(this, xdata, FD_BAD); + } + + goto unwind; +@@ -5254,8 +5273,10 @@ client3_3_lk(call_frame_t *frame, xlator_t *this, void *data) + + return 0; + unwind: +- CLIENT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); ++ CLIENT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, xdata); + GF_FREE(req.xdata.xdata_val); ++ if (xdata) ++ dict_unref(xdata); + + return 0; + } +diff --git a/xlators/protocol/client/src/client-rpc-fops_v2.c b/xlators/protocol/client/src/client-rpc-fops_v2.c +index 954fc58..d0055e9 100644 +--- a/xlators/protocol/client/src/client-rpc-fops_v2.c ++++ b/xlators/protocol/client/src/client-rpc-fops_v2.c +@@ -2234,6 +2234,13 @@ client4_0_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + } + } + ++ if (local->check_reopen) { ++ if (lock.l_type == F_WRLCK) ++ set_fd_reopen_status(this, xdata, FD_REOPEN_NOT_ALLOWED); ++ else ++ set_fd_reopen_status(this, xdata, FD_REOPEN_ALLOWED); ++ } ++ + out: + if ((rsp.op_ret == -1) && (EAGAIN != gf_error_to_errno(rsp.op_errno))) { + gf_msg(this->name, GF_LOG_WARNING, gf_error_to_errno(rsp.op_errno), +@@ -4759,6 +4766,7 @@ client4_0_lk(call_frame_t *frame, xlator_t *this, void *data) + 0, + }, + }; ++ dict_t *xdata = NULL; + int32_t gf_cmd = 0; + clnt_local_t *local = NULL; + clnt_conf_t *conf = NULL; +@@ -4785,6 +4793,10 @@ client4_0_lk(call_frame_t *frame, xlator_t *this, void *data) + goto unwind; + } + ++ ret = dict_get_int32(args->xdata, "fd-reopen-status", &local->check_reopen); ++ if (ret) ++ local->check_reopen = 0; ++ + local->owner = frame->root->lk_owner; + local->cmd = args->cmd; + local->fd = fd_ref(args->fd); +@@ -4798,6 +4810,13 @@ client4_0_lk(call_frame_t *frame, xlator_t *this, void *data) + client_is_setlk(local->cmd)) { + client_add_lock_for_recovery(local->fd, args->flock, &local->owner, + local->cmd); ++ } else if (local->check_reopen) { ++ xdata = dict_new(); ++ if (xdata == NULL) { ++ op_errno = ENOMEM; ++ goto unwind; ++ } ++ set_fd_reopen_status(this, xdata, FD_BAD); + } + + goto unwind; +@@ -4815,8 +4834,10 @@ client4_0_lk(call_frame_t *frame, xlator_t *this, void *data) + + return 0; + unwind: +- CLIENT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); ++ CLIENT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, xdata); + GF_FREE(req.xdata.pairs.pairs_val); ++ if (xdata) ++ dict_unref(xdata); + + return 0; + } +@@ -6094,7 +6115,7 @@ client4_0_rchecksum(call_frame_t *frame, xlator_t *this, void *data) + conf = this->private; + + CLIENT_GET_REMOTE_FD(this, args->fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, +- unwind); ++ GFS3_OP_RCHECKSUM, unwind); + + req.len = args->len; + req.offset = args->offset; +diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c +index 63c90ea..35a5340 100644 +--- a/xlators/protocol/client/src/client.c ++++ b/xlators/protocol/client/src/client.c +@@ -864,9 +864,11 @@ int32_t + client_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) + { +- int ret = -1; ++ int ret = 0; ++ int op_errno = ENOTCONN; + clnt_conf_t *conf = NULL; + rpc_clnt_procedure_t *proc = NULL; ++ clnt_fd_ctx_t *fdctx = NULL; + clnt_args_t args = { + 0, + }; +@@ -875,6 +877,21 @@ client_open(call_frame_t *frame, xlator_t 
*this, loc_t *loc, int32_t flags, + if (!conf || !conf->fops) + goto out; + ++ if (conf->strict_locks) { ++ pthread_spin_lock(&conf->fd_lock); ++ { ++ fdctx = this_fd_get_ctx(fd, this); ++ if (fdctx && !list_empty(&fdctx->lock_list)) { ++ ret = -1; ++ op_errno = EBADFD; ++ } ++ } ++ pthread_spin_unlock(&conf->fd_lock); ++ ++ if (ret) ++ goto out; ++ } ++ + args.loc = loc; + args.fd = fd; + args.xdata = xdata; +@@ -888,7 +905,7 @@ client_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + + out: + if (ret) +- STACK_UNWIND_STRICT(open, frame, -1, ENOTCONN, NULL, NULL); ++ STACK_UNWIND_STRICT(open, frame, -1, op_errno, NULL, NULL); + + return 0; + } +diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h +index bde3d1a..2a50625 100644 +--- a/xlators/protocol/client/src/client.h ++++ b/xlators/protocol/client/src/client.h +@@ -98,10 +98,10 @@ typedef enum { + free(_this_rsp->xdata.xdata_val); \ + } while (0) + +-#define CLIENT_GET_REMOTE_FD(xl, fd, flags, remote_fd, op_errno, label) \ ++#define CLIENT_GET_REMOTE_FD(xl, fd, flags, remote_fd, op_errno, fop, label) \ + do { \ + int _ret = 0; \ +- _ret = client_get_remote_fd(xl, fd, flags, &remote_fd); \ ++ _ret = client_get_remote_fd(xl, fd, flags, &remote_fd, fop); \ + if (_ret < 0) { \ + op_errno = errno; \ + goto label; \ +@@ -286,6 +286,7 @@ typedef struct client_local { + client_posix_lock_t *client_lock; + gf_lkowner_t owner; + int32_t cmd; ++ int32_t check_reopen; + struct list_head lock_list; + pthread_mutex_t mutex; + char *name; +@@ -435,7 +436,8 @@ client_default_reopen_done(clnt_fd_ctx_t *fdctx, int64_t rfd, xlator_t *this); + void + client_attempt_reopen(fd_t *fd, xlator_t *this); + int +-client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd); ++client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd, ++ enum gf_fop_procnum fop); + int + client_fd_fop_prepare_local(call_frame_t *frame, fd_t *fd, int64_t remote_fd); + gf_boolean_t +-- +1.8.3.1 + diff --git a/SOURCES/0582-protocol-client-Fix-lock-memory-leak.patch b/SOURCES/0582-protocol-client-Fix-lock-memory-leak.patch new file mode 100644 index 0000000..3fd1dae --- /dev/null +++ b/SOURCES/0582-protocol-client-Fix-lock-memory-leak.patch @@ -0,0 +1,501 @@ +From adeec3d5d85baad8b50d203f34a47ad5360d7cd7 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Mon, 7 Jun 2021 18:36:11 +0530 +Subject: [PATCH 582/584] protocol/client: Fix lock memory leak + +Problem-1: +When an overlapping lock is issued the merged lock is not assigned the +owner. When flush is issued on the fd, this particular lock is not freed +leading to memory leak + +Fix-1: +Assign the owner while merging the locks. + +Problem-2: +On fd-destroy lock structs could be present in fdctx. For some reason +with flock -x command and closing of the bash fd, it leads to this code +path. Which leaks the lock structs. + +Fix-2: +When fdctx is being destroyed in client, make sure to cleanup any lock +structs. 
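+
+Illustration (not part of the change itself): a minimal sketch of the
+merge path described in Problem-1, using simplified stand-in types
+rather than the actual client_posix_lock_t/gf_lkowner_t structures.
+The bug class is dropping the owner while building the merged lock,
+so the per-owner cleanup on flush can never match and free it:
+
+    #include <stdlib.h>
+    #include <sys/types.h>
+
+    typedef struct lock_rec {
+        off_t start;
+        off_t end;
+        pid_t owner;   /* stand-in for the real lk-owner type */
+    } lock_rec_t;
+
+    /* Merge two overlapping byte-range locks held by one owner. */
+    static lock_rec_t *
+    merge_locks(const lock_rec_t *a, const lock_rec_t *b)
+    {
+        lock_rec_t *sum = calloc(1, sizeof(*sum));
+
+        if (!sum)
+            return NULL;
+        sum->start = (a->start < b->start) ? a->start : b->start;
+        sum->end = (a->end > b->end) ? a->end : b->end;
+        sum->owner = a->owner;   /* the step Fix-1 adds: without it the
+                                    merged lock is ownerless and leaks */
+        return sum;
+    }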
+ +> Upstream patch: https://github.com/gluster/glusterfs/pull/2338/commits/926402f639471d2664bf00c6692221ba297c525f +> fixes: gluster#2337 +> Change-Id: I298124213ce5a1cf2b1f1756d5e8a9745d9c0a1c +> Signed-off-by: Pranith Kumar K + +BUG: 1689375 +Change-Id: I298124213ce5a1cf2b1f1756d5e8a9745d9c0a1c +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245603 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/client/issue-2337-lock-mem-leak.c | 52 ++++++++++++++++++ + tests/bugs/client/issue-2337-lock-mem-leak.t | 42 ++++++++++++++ + tests/bugs/replicate/do-not-reopen-fd.t | 65 ++++++++++++++-------- + tests/volume.rc | 8 +++ + xlators/protocol/client/src/client-helpers.c | 10 ++++ + xlators/protocol/client/src/client-lk.c | 82 ++++++++++++++++++---------- + xlators/protocol/client/src/client.h | 8 ++- + 7 files changed, 213 insertions(+), 54 deletions(-) + create mode 100644 tests/bugs/client/issue-2337-lock-mem-leak.c + create mode 100644 tests/bugs/client/issue-2337-lock-mem-leak.t + +diff --git a/tests/bugs/client/issue-2337-lock-mem-leak.c b/tests/bugs/client/issue-2337-lock-mem-leak.c +new file mode 100644 +index 0000000..d4e02a7 +--- /dev/null ++++ b/tests/bugs/client/issue-2337-lock-mem-leak.c +@@ -0,0 +1,52 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int ++main(int argc, char *argv[]) ++{ ++ int fd = -1; ++ char *filename = NULL; ++ struct flock lock = { ++ 0, ++ }; ++ int i = 0; ++ int ret = -1; ++ ++ if (argc != 2) { ++ fprintf(stderr, "Usage: %s ", argv[0]); ++ goto out; ++ } ++ ++ filename = argv[1]; ++ ++ fd = open(filename, O_RDWR | O_CREAT, 0); ++ if (fd < 0) { ++ fprintf(stderr, "open (%s) failed (%s)\n", filename, strerror(errno)); ++ goto out; ++ } ++ ++ lock.l_type = F_WRLCK; ++ lock.l_whence = SEEK_SET; ++ lock.l_len = 2; ++ ++ while (i < 100) { ++ lock.l_start = i; ++ ret = fcntl(fd, F_SETLK, &lock); ++ if (ret < 0) { ++ fprintf(stderr, "fcntl setlk failed (%s)\n", strerror(errno)); ++ goto out; ++ } ++ ++ i++; ++ } ++ ++ ret = 0; ++ ++out: ++ return ret; ++} +diff --git a/tests/bugs/client/issue-2337-lock-mem-leak.t b/tests/bugs/client/issue-2337-lock-mem-leak.t +new file mode 100644 +index 0000000..64132a2 +--- /dev/null ++++ b/tests/bugs/client/issue-2337-lock-mem-leak.t +@@ -0,0 +1,42 @@ ++#!/bin/bash ++ ++#Test that lock fop is not leaking any memory for overlapping regions ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../fileio.rc ++ ++cleanup; ++ ++LOCK_TEST=$(dirname $0)/issue-2337-lock-mem-leak ++build_tester $(dirname $0)/issue-2337-lock-mem-leak.c -o ${LOCK_TEST} ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/${V0}1 ++#Guard against flush-behind ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume start $V0 ++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 ++ ++TEST touch $M0/a ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'w' $M0/a ++TEST flock -x $fd1 ++statedump=$(generate_mount_statedump $V0 $M0) ++EXPECT_NOT "^nostatedump$" echo $statedump ++#Making sure no one changes this mem-tracker name ++TEST grep gf_client_mt_clnt_lock_t $statedump ++TEST fd_close $fd1 ++ ++statedump=$(generate_mount_statedump $V0 $M0) ++EXPECT_NOT "^nostatedump$" echo $statedump ++TEST ! 
grep gf_client_mt_clnt_lock_t $statedump ++ ++TEST ${LOCK_TEST} $M0/a ++ ++statedump=$(generate_mount_statedump $V0 $M0) ++EXPECT_NOT "^nostatedump$" echo $statedump ++TEST ! grep gf_client_mt_clnt_lock_t $statedump ++TEST cleanup_mount_statedump $V0 ++TEST rm ${LOCK_TEST} ++cleanup +diff --git a/tests/bugs/replicate/do-not-reopen-fd.t b/tests/bugs/replicate/do-not-reopen-fd.t +index 76d8e70..13b5218 100644 +--- a/tests/bugs/replicate/do-not-reopen-fd.t ++++ b/tests/bugs/replicate/do-not-reopen-fd.t +@@ -45,13 +45,17 @@ EXPECT "data-2" cat $B0/${V0}2/a + gfid_a=$(gf_get_gfid_xattr $B0/${V0}0/a) + gfid_str_a=$(gf_gfid_xattr_to_str $gfid_a) + +-EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + TEST fd2=`fd_available` + TEST fd_open $fd2 'rw' $M1/a + ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^2$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^2$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ + # Kill 2nd brick and try writing to the file. The write should fail due to + # quorum failure. + TEST kill_brick $V0 $H0 $B0/${V0}1 +@@ -66,6 +70,9 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 + TEST ! fd_write $fd1 "data-4" + TEST ! fd_cat $fd1 ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^2$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + # Enable heal and check the files will have same content on all the bricks after + # the heal is completed. +@@ -89,7 +96,9 @@ TEST ! fd_write $fd1 "data-5" + + # At this point only one brick will have the lock. Try taking the lock again on + # the bad fd, which should also fail with EBADFD. +-TEST ! flock -x $fd1 ++# TODO: At the moment quorum failure in lk leads to unlock on the bricks where ++# lock succeeds. This will change lock state on 3rd brick, commenting for now ++#TEST ! flock -x $fd1 + + # Kill the only brick that is having lock and try taking lock on another client + # which should succeed. +@@ -97,15 +106,25 @@ TEST kill_brick $V0 $H0 $B0/${V0}2 + EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 2 + TEST flock -x $fd2 + TEST fd_write $fd2 "data-6" ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++ + + # Bring the brick up and try writing & reading on the old fd, which should still + # fail and operations on the 2nd fd should succeed. 
+ TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}2 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M1 $V0-replicate-0 2 ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + TEST ! fd_write $fd1 "data-7" + + TEST ! fd_cat $fd1 ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + TEST fd_cat $fd2 + + # Close both the fds which will release the locks and then re-open and take lock +@@ -113,17 +132,15 @@ TEST fd_cat $fd2 + TEST fd_close $fd1 + TEST fd_close $fd2 + +-TEST ! ls /proc/$$/fd/$fd1 +-TEST ! ls /proc/$$/fd/$fd2 +-EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a +-EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + TEST flock -x $fd1 + TEST fd_write $fd1 "data-8" +@@ -134,6 +151,10 @@ EXPECT "data-8" head -n 1 $B0/${V0}1/a + EXPECT "data-8" head -n 1 $B0/${V0}2/a + + TEST fd_close $fd1 ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ + + # Heal the volume + TEST $CLI volume heal $V0 enable +@@ -152,9 +173,9 @@ EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replica + TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a + +-EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + # Restart the brick and then write. 
Now fd should get re-opened and write should + # succeed on the previously down brick as well since there are no locks held on +@@ -163,7 +184,7 @@ TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd_write $fd1 "data-10" +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + + EXPECT "data-10" head -n 1 $B0/${V0}0/a + EXPECT "data-10" head -n 1 $B0/${V0}1/a +@@ -177,9 +198,9 @@ TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a + TEST flock -x $fd1 + +-EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + # Kill & restart another brick so that it will return EBADFD + TEST kill_brick $V0 $H0 $B0/${V0}1 +@@ -194,9 +215,9 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 + TEST ! fd_write $fd1 "data-11" +-EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + EXPECT "data-10" head -n 1 $B0/${V0}0/a + EXPECT "data-10" head -n 1 $B0/${V0}1/a +diff --git a/tests/volume.rc b/tests/volume.rc +index f5dd0b1..17c3835 100644 +--- a/tests/volume.rc ++++ b/tests/volume.rc +@@ -407,6 +407,14 @@ function gf_check_file_opened_in_brick { + fi + } + ++function gf_open_file_count_in_brick { ++ vol=$1 ++ host=$2 ++ brick=$3 ++ realpath=$4 ++ ls -l /proc/$(get_brick_pid $vol $host $brick)/fd | grep "${realpath}$" | wc -l ++} ++ + function gf_get_gfid_backend_file_path { + brickpath=$1 + filepath_in_brick=$2 +diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c +index 48b6448..a80f303 100644 +--- a/xlators/protocol/client/src/client-helpers.c ++++ b/xlators/protocol/client/src/client-helpers.c +@@ -3156,11 +3156,14 @@ client_fdctx_destroy(xlator_t *this, clnt_fd_ctx_t *fdctx) + int32_t ret = -1; + char parent_down = 0; + fd_lk_ctx_t *lk_ctx = NULL; ++ gf_lkowner_t null_owner = {0}; ++ struct list_head deleted_list; + + GF_VALIDATE_OR_GOTO("client", this, out); + GF_VALIDATE_OR_GOTO(this->name, fdctx, out); + + conf = (clnt_conf_t *)this->private; ++ INIT_LIST_HEAD(&deleted_list); + + if (fdctx->remote_fd == -1) { + gf_msg_debug(this->name, 0, "not a valid fd"); +@@ -3174,6 +3177,13 @@ client_fdctx_destroy(xlator_t *this, clnt_fd_ctx_t *fdctx) + pthread_mutex_unlock(&conf->lock); + lk_ctx = fdctx->lk_ctx; + fdctx->lk_ctx = NULL; ++ pthread_spin_lock(&conf->fd_lock); ++ { ++ __delete_granted_locks_owner_from_fdctx(fdctx, &null_owner, ++ 
&deleted_list); ++ } ++ pthread_spin_unlock(&conf->fd_lock); ++ destroy_client_locks_from_list(&deleted_list); + + if (lk_ctx) + fd_lk_ctx_unref(lk_ctx); +diff --git a/xlators/protocol/client/src/client-lk.c b/xlators/protocol/client/src/client-lk.c +index c1fb055..cb4e894 100644 +--- a/xlators/protocol/client/src/client-lk.c ++++ b/xlators/protocol/client/src/client-lk.c +@@ -253,6 +253,7 @@ __insert_and_merge(clnt_fd_ctx_t *fdctx, client_posix_lock_t *lock) + sum = add_locks(lock, conf); + + sum->fd = lock->fd; ++ sum->owner = conf->owner; + + __delete_client_lock(conf); + __destroy_client_lock(conf); +@@ -320,56 +321,77 @@ destroy_client_lock(client_posix_lock_t *lock) + GF_FREE(lock); + } + +-int32_t +-delete_granted_locks_owner(fd_t *fd, gf_lkowner_t *owner) ++void ++destroy_client_locks_from_list(struct list_head *deleted) + { +- clnt_fd_ctx_t *fdctx = NULL; + client_posix_lock_t *lock = NULL; + client_posix_lock_t *tmp = NULL; +- xlator_t *this = NULL; +- clnt_conf_t *conf = NULL; +- +- struct list_head delete_list; +- int ret = 0; ++ xlator_t *this = THIS; + int count = 0; + +- INIT_LIST_HEAD(&delete_list); +- this = THIS; +- conf = this->private; ++ list_for_each_entry_safe(lock, tmp, deleted, list) ++ { ++ list_del_init(&lock->list); ++ destroy_client_lock(lock); ++ count++; ++ } + +- pthread_spin_lock(&conf->fd_lock); ++ /* FIXME: Need to actually print the locks instead of count */ ++ gf_msg_trace(this->name, 0, "Number of locks cleared=%d", count); ++} + +- fdctx = this_fd_get_ctx(fd, this); +- if (!fdctx) { +- pthread_spin_unlock(&conf->fd_lock); ++void ++__delete_granted_locks_owner_from_fdctx(clnt_fd_ctx_t *fdctx, ++ gf_lkowner_t *owner, ++ struct list_head *deleted) ++{ ++ client_posix_lock_t *lock = NULL; ++ client_posix_lock_t *tmp = NULL; + +- gf_msg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_FD_CTX_INVALID, +- "fdctx not valid"); +- ret = -1; +- goto out; ++ gf_boolean_t is_null_lkowner = _gf_false; ++ ++ if (is_lk_owner_null(owner)) { ++ is_null_lkowner = _gf_true; + } + + list_for_each_entry_safe(lock, tmp, &fdctx->lock_list, list) + { +- if (is_same_lkowner(&lock->owner, owner)) { ++ if (is_null_lkowner || is_same_lkowner(&lock->owner, owner)) { + list_del_init(&lock->list); +- list_add_tail(&lock->list, &delete_list); +- count++; ++ list_add_tail(&lock->list, deleted); + } + } ++} + +- pthread_spin_unlock(&conf->fd_lock); ++int32_t ++delete_granted_locks_owner(fd_t *fd, gf_lkowner_t *owner) ++{ ++ clnt_fd_ctx_t *fdctx = NULL; ++ xlator_t *this = NULL; ++ clnt_conf_t *conf = NULL; ++ int ret = 0; ++ struct list_head deleted_locks; + +- if (!list_empty(&delete_list)) { +- list_for_each_entry_safe(lock, tmp, &delete_list, list) +- { +- list_del_init(&lock->list); +- destroy_client_lock(lock); ++ this = THIS; ++ conf = this->private; ++ INIT_LIST_HEAD(&deleted_locks); ++ ++ pthread_spin_lock(&conf->fd_lock); ++ { ++ fdctx = this_fd_get_ctx(fd, this); ++ if (!fdctx) { ++ pthread_spin_unlock(&conf->fd_lock); ++ ++ gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_FD_CTX_INVALID, ++ NULL); ++ ret = -1; ++ goto out; + } ++ __delete_granted_locks_owner_from_fdctx(fdctx, owner, &deleted_locks); + } ++ pthread_spin_unlock(&conf->fd_lock); + +- /* FIXME: Need to actually print the locks instead of count */ +- gf_msg_trace(this->name, 0, "Number of locks cleared=%d", count); ++ destroy_client_locks_from_list(&deleted_locks); + + out: + return ret; +diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h +index 2a50625..f952aea 100644 +--- 
a/xlators/protocol/client/src/client.h ++++ b/xlators/protocol/client/src/client.h +@@ -406,8 +406,12 @@ int + client_attempt_lock_recovery(xlator_t *this, clnt_fd_ctx_t *fdctx); + int32_t + delete_granted_locks_owner(fd_t *fd, gf_lkowner_t *owner); +-int32_t +-delete_granted_locks_fd(clnt_fd_ctx_t *fdctx); ++void ++__delete_granted_locks_owner_from_fdctx(clnt_fd_ctx_t *fdctx, ++ gf_lkowner_t *owner, ++ struct list_head *deleted); ++void ++destroy_client_locks_from_list(struct list_head *deleted); + int32_t + client_cmd_to_gf_cmd(int32_t cmd, int32_t *gf_cmd); + void +-- +1.8.3.1 + diff --git a/SOURCES/0583-protocol-client-Initialize-list-head-to-prevent-NULL.patch b/SOURCES/0583-protocol-client-Initialize-list-head-to-prevent-NULL.patch new file mode 100644 index 0000000..1ac1777 --- /dev/null +++ b/SOURCES/0583-protocol-client-Initialize-list-head-to-prevent-NULL.patch @@ -0,0 +1,138 @@ +From f114ba25fab57d1ab9a51fc1f101f2b5571f167a Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Mon, 7 Jun 2021 19:24:55 +0530 +Subject: [PATCH 583/584] protocol/client: Initialize list head to prevent NULL + de-reference + +> Upstream patch: https://github.com/gluster/glusterfs/pull/2456/commits/00761df0cd14833ff256b69dba7cf8e2b699554c +> fixes: #2443 +> Change-Id: I86ef0270d41d6fb924db97fde3196d7c98c8b564 +> Signed-off-by: Pranith Kumar K + +BUG: 1689375 +Change-Id: I86ef0270d41d6fb924db97fde3196d7c98c8b564 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245613 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/locks/issue-2443-crash.c | 67 +++++++++++++++++++++++++++++++++ + tests/bugs/locks/issue-2443-crash.t | 18 +++++++++ + xlators/protocol/client/src/client-lk.c | 1 + + 3 files changed, 86 insertions(+) + create mode 100644 tests/bugs/locks/issue-2443-crash.c + create mode 100644 tests/bugs/locks/issue-2443-crash.t + +diff --git a/tests/bugs/locks/issue-2443-crash.c b/tests/bugs/locks/issue-2443-crash.c +new file mode 100644 +index 0000000..5f580bf +--- /dev/null ++++ b/tests/bugs/locks/issue-2443-crash.c +@@ -0,0 +1,67 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int ++main(int argc, char *argv[]) ++{ ++ int fd = -1; ++ char *filename = NULL; ++ struct flock lock = { ++ 0, ++ }; ++ int i = 0; ++ int ret = -1; ++ ++ if (argc != 2) { ++ fprintf(stderr, "Usage: %s ", argv[0]); ++ goto out; ++ } ++ ++ filename = argv[1]; ++ ++ fd = open(filename, O_RDWR | O_CREAT, 0); ++ if (fd < 0) { ++ fprintf(stderr, "open (%s) failed (%s)\n", filename, strerror(errno)); ++ goto out; ++ } ++ ++ lock.l_start = 0; ++ lock.l_type = F_RDLCK; ++ lock.l_whence = SEEK_SET; ++ lock.l_len = 2; ++ ++ ret = fcntl(fd, F_SETLK, &lock); ++ if (ret < 0) { ++ fprintf(stderr, "fcntl setlk failed (%s)\n", strerror(errno)); ++ goto out; ++ } ++ ++ lock.l_start = 2; ++ lock.l_type = F_WRLCK; ++ lock.l_whence = SEEK_SET; ++ lock.l_len = 2; ++ ++ ret = fcntl(fd, F_SETLK, &lock); ++ if (ret < 0) { ++ fprintf(stderr, "fcntl setlk failed (%s)\n", strerror(errno)); ++ goto out; ++ } ++ ++ lock.l_start = 0; ++ lock.l_type = F_RDLCK; ++ lock.l_whence = SEEK_SET; ++ lock.l_len = 4; ++ ++ ret = fcntl(fd, F_SETLK, &lock); ++ if (ret < 0) { ++ fprintf(stderr, "fcntl setlk failed (%s)\n", strerror(errno)); ++ goto out; ++ } ++out: ++ return ret; ++} +diff --git a/tests/bugs/locks/issue-2443-crash.t b/tests/bugs/locks/issue-2443-crash.t +new file mode 100644 +index 
0000000..162a4d7 +--- /dev/null ++++ b/tests/bugs/locks/issue-2443-crash.t +@@ -0,0 +1,18 @@ ++#!/bin/bash ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/brick0 ++TEST $CLI volume start $V0 ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++ ++build_tester $(dirname $0)/issue-2443-crash.c ++TEST mv $(dirname $0)/issue-2443-crash $M0 ++cd $M0 ++TEST ./issue-2443-crash a ++ ++cd - ++cleanup; +diff --git a/xlators/protocol/client/src/client-lk.c b/xlators/protocol/client/src/client-lk.c +index cb4e894..37c1d35 100644 +--- a/xlators/protocol/client/src/client-lk.c ++++ b/xlators/protocol/client/src/client-lk.c +@@ -101,6 +101,7 @@ add_locks(client_posix_lock_t *l1, client_posix_lock_t *l2) + sum = GF_CALLOC(1, sizeof(*sum), gf_client_mt_clnt_lock_t); + if (!sum) + return NULL; ++ INIT_LIST_HEAD(&sum->list); + + sum->fl_start = min(l1->fl_start, l2->fl_start); + sum->fl_end = max(l1->fl_end, l2->fl_end); +-- +1.8.3.1 + diff --git a/SOURCES/0584-dht-fixing-xattr-inconsistency.patch b/SOURCES/0584-dht-fixing-xattr-inconsistency.patch new file mode 100644 index 0000000..bf2c6b9 --- /dev/null +++ b/SOURCES/0584-dht-fixing-xattr-inconsistency.patch @@ -0,0 +1,429 @@ +From 2c6c4ad77ba5511a62846af932840deb5bc389ae Mon Sep 17 00:00:00 2001 +From: Tamar Shacked +Date: Mon, 7 Jun 2021 12:25:57 +0300 +Subject: [PATCH 584/584] dht - fixing xattr inconsistency + +The scenario of setting an xattr to a dir, killing one of the bricks, +removing the xattr, bringing back the brick results in xattr +inconsistency - The downed brick will still have the xattr, but the rest +won't. +This patch add a mechanism that will remove the extra xattrs during +lookup. + +Backport of: +> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/24687/ +> fixes: #1324 +> Change-Id: Ifec0b7aea6cd40daa8b0319b881191cf83e031d1 +> Signed-off-by: Barak Sason Rofman + +BUG: 1600379 +Change-Id: I588f69b283e5354cd362d74486d6ec6d226ecc96 +Signed-off-by: Tamar Shacked +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245560 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/common-utils.c | 20 +++++++- + libglusterfs/src/glusterfs/common-utils.h | 6 +++ + tests/bugs/distribute/bug-1600379.t | 54 ++++++++++++++++++++ + xlators/cluster/dht/src/dht-common.c | 14 ++---- + xlators/cluster/dht/src/dht-common.h | 4 -- + xlators/cluster/dht/src/dht-helper.c | 4 ++ + xlators/cluster/dht/src/dht-selfheal.c | 11 ++++ + xlators/storage/posix/src/posix-helpers.c | 19 +++++++ + xlators/storage/posix/src/posix-inode-fd-ops.c | 69 ++++++++++++++++++++++++++ + xlators/storage/posix/src/posix.h | 3 ++ + 10 files changed, 189 insertions(+), 15 deletions(-) + create mode 100644 tests/bugs/distribute/bug-1600379.t + +diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c +index c2dfe28..d8b7c6e 100644 +--- a/libglusterfs/src/common-utils.c ++++ b/libglusterfs/src/common-utils.c +@@ -54,6 +54,7 @@ + #include "xxhash.h" + #include + #include "glusterfs/libglusterfs-messages.h" ++#include "glusterfs/glusterfs-acl.h" + #include "protocol-common.h" + #ifdef __FreeBSD__ + #include +@@ -82,12 +83,21 @@ gf_boolean_t gf_signal_on_assert = false; + typedef int32_t (*rw_op_t)(int32_t fd, char *buf, int32_t size); + typedef int32_t (*rwv_op_t)(int32_t fd, const struct iovec *buf, int32_t size); + +-void gf_assert(void) 
++char *xattrs_to_heal[] = {"user.", ++ POSIX_ACL_ACCESS_XATTR, ++ POSIX_ACL_DEFAULT_XATTR, ++ QUOTA_LIMIT_KEY, ++ QUOTA_LIMIT_OBJECTS_KEY, ++ GF_SELINUX_XATTR_KEY, ++ GF_XATTR_MDATA_KEY, ++ NULL}; ++ ++void ++gf_assert(void) + { + if (gf_signal_on_assert) { + raise(SIGCONT); + } +- + } + + void +@@ -5430,3 +5440,9 @@ gf_d_type_from_ia_type(ia_type_t type) + return DT_UNKNOWN; + } + } ++ ++char ** ++get_xattrs_to_heal() ++{ ++ return xattrs_to_heal; ++} +diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h +index bd48b6f..8439bb6 100644 +--- a/libglusterfs/src/glusterfs/common-utils.h ++++ b/libglusterfs/src/glusterfs/common-utils.h +@@ -183,6 +183,12 @@ enum _gf_xlator_ipc_targets { + typedef enum _gf_special_pid gf_special_pid_t; + typedef enum _gf_xlator_ipc_targets _gf_xlator_ipc_targets_t; + ++/* Array to hold custom xattr keys */ ++extern char *xattrs_to_heal[]; ++ ++char ** ++get_xattrs_to_heal(); ++ + /* The DHT file rename operation is not a straightforward rename. + * It involves creating linkto and linkfiles, and can unlink or rename the + * source file depending on the hashed and cached subvols for the source +diff --git a/tests/bugs/distribute/bug-1600379.t b/tests/bugs/distribute/bug-1600379.t +new file mode 100644 +index 0000000..8d2f615 +--- /dev/null ++++ b/tests/bugs/distribute/bug-1600379.t +@@ -0,0 +1,54 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++# Initialize ++#------------------------------------------------------------ ++cleanup; ++ ++# Start glusterd ++TEST glusterd; ++TEST pidof glusterd; ++TEST $CLI volume info; ++ ++# Create a volume ++TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2} ++ ++# Verify volume creation ++EXPECT "$V0" volinfo_field $V0 'Volume Name'; ++EXPECT 'Created' volinfo_field $V0 'Status'; ++ ++# Start volume and verify successful start ++TEST $CLI volume start $V0; ++EXPECT 'Started' volinfo_field $V0 'Status'; ++TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0; ++#------------------------------------------------------------ ++ ++# Test case - Remove xattr from killed brick on lookup ++#------------------------------------------------------------ ++# Create a dir and set custom xattr ++TEST mkdir $M0/testdir ++TEST setfattr -n user.attr -v val $M0/testdir ++xattr_val=`getfattr -d $B0/${V0}2/testdir | awk '{print $1}'`; ++TEST ${xattr_val}='user.attr="val"'; ++ ++# Kill 2nd brick process ++TEST kill_brick $V0 $H0 $B0/${V0}2 ++EXPECT_WITHIN ${PROCESS_UP_TIMEOUT} "1" online_brick_count ++ ++# Remove custom xattr ++TEST setfattr -x user.attr $M0/testdir ++ ++# Bring up the killed brick process ++TEST $CLI volume start $V0 force ++ ++# Perform lookup ++sleep 5 ++TEST ls $M0/testdir ++ ++# Check brick xattrs ++xattr_val_2=`getfattr -d $B0/${V0}2/testdir`; ++TEST [ ${xattr_val_2} = ''] ; ++ ++cleanup; +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index ce0fbbf..edfc6e7 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -127,15 +128,6 @@ dht_read_iatt_from_xdata(xlator_t *this, dict_t *xdata, struct iatt *stbuf) + int + dht_rmdir_unlock(call_frame_t *frame, xlator_t *this); + +-char *xattrs_to_heal[] = {"user.", +- POSIX_ACL_ACCESS_XATTR, +- POSIX_ACL_DEFAULT_XATTR, +- QUOTA_LIMIT_KEY, +- QUOTA_LIMIT_OBJECTS_KEY, +- GF_SELINUX_XATTR_KEY, +- 
GF_XATTR_MDATA_KEY, +- NULL}; +- + char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL}; + + /* Return true if key exists in array +@@ -143,6 +135,8 @@ char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL}; + static gf_boolean_t + dht_match_xattr(const char *key) + { ++ char **xattrs_to_heal = get_xattrs_to_heal(); ++ + return gf_get_index_by_elem(xattrs_to_heal, (char *)key) >= 0; + } + +@@ -5399,11 +5393,13 @@ dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + int call_cnt = 0; + dht_local_t *local = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; ++ char **xattrs_to_heal; + + conf = this->private; + local = frame->local; + call_cnt = conf->subvolume_cnt; + local->flags = flags; ++ xattrs_to_heal = get_xattrs_to_heal(); + + if (!gf_uuid_is_null(local->gfid)) { + gf_uuid_unparse(local->gfid, gfid_local); +diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h +index 132b3b3..b856c68 100644 +--- a/xlators/cluster/dht/src/dht-common.h ++++ b/xlators/cluster/dht/src/dht-common.h +@@ -54,10 +54,6 @@ + #define DHT_DBG_HASHED_SUBVOL_PATTERN "dht.file.hashed-subvol.*" + #define DHT_DBG_HASHED_SUBVOL_KEY "dht.file.hashed-subvol." + +-/* Array to hold custom xattr keys +- */ +-extern char *xattrs_to_heal[]; +- + /* Rebalance nodeuuid flags */ + #define REBAL_NODEUUID_MINE 0x01 + +diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c +index 4f7370d..4c3940a 100644 +--- a/xlators/cluster/dht/src/dht-helper.c ++++ b/xlators/cluster/dht/src/dht-helper.c +@@ -2289,6 +2289,7 @@ dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + int luret = -1; + int luflag = -1; + int i = 0; ++ char **xattrs_to_heal; + + if (!src || !dst) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED, +@@ -2305,6 +2306,9 @@ dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + and set it to dst dict, here index start from 1 because + user xattr already checked in previous statement + */ ++ ++ xattrs_to_heal = get_xattrs_to_heal(); ++ + for (i = 1; xattrs_to_heal[i]; i++) { + keyval = dict_get(src, xattrs_to_heal[i]); + if (keyval) { +diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c +index f4e17d1..8af7301 100644 +--- a/xlators/cluster/dht/src/dht-selfheal.c ++++ b/xlators/cluster/dht/src/dht-selfheal.c +@@ -2315,6 +2315,15 @@ dht_dir_heal_xattrs(void *data) + if (subvol == mds_subvol) + continue; + if (uret || uflag) { ++ /* Custom xattr heal is required - let posix handle it */ ++ ret = dict_set_int8(xdata, "sync_backend_xattrs", _gf_true); ++ if (ret) { ++ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, ++ "path=%s", local->loc.path, "key=%s", ++ "sync_backend_xattrs", NULL); ++ goto out; ++ } ++ + ret = syncop_setxattr(subvol, &local->loc, user_xattr, 0, xdata, + NULL); + if (ret) { +@@ -2325,6 +2334,8 @@ dht_dir_heal_xattrs(void *data) + "user xattr on path %s on " + "subvol %s, gfid = %s ", + local->loc.path, subvol->name, gfid); ++ } else { ++ dict_del(xdata, "sync_backend_xattrs"); + } + } + } +diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c +index 16351d8..40a9ee4 100644 +--- a/xlators/storage/posix/src/posix-helpers.c ++++ b/xlators/storage/posix/src/posix-helpers.c +@@ -3656,3 +3656,22 @@ out: + + return is_stale; + } ++ ++/* Delete user xattr from the file at the file-path specified by data and from ++ * dict */ ++int 
++posix_delete_user_xattr(dict_t *dict, char *k, data_t *v, void *data) ++{ ++ int ret; ++ char *real_path = data; ++ ++ ret = sys_lremovexattr(real_path, k); ++ if (ret) { ++ gf_msg("posix-helpers", GF_LOG_ERROR, P_MSG_XATTR_NOT_REMOVED, errno, ++ "removexattr failed. key %s path %s", k, real_path); ++ } ++ ++ dict_del(dict, k); ++ ++ return ret; ++} +diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c +index 4c2983a..be22c5e 100644 +--- a/xlators/storage/posix/src/posix-inode-fd-ops.c ++++ b/xlators/storage/posix/src/posix-inode-fd-ops.c +@@ -62,6 +62,7 @@ + #include + #include "posix-gfid-path.h" + #include ++#include + + extern char *marker_xattrs[]; + #define ALIGN_SIZE 4096 +@@ -2733,6 +2734,7 @@ posix_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t ret = 0; + ssize_t acl_size = 0; + dict_t *xattr = NULL; ++ dict_t *subvol_xattrs = NULL; + posix_xattr_filler_t filler = { + 0, + }; +@@ -2748,6 +2750,10 @@ posix_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + struct mdata_iatt mdata_iatt = { + 0, + }; ++ int8_t sync_backend_xattrs = _gf_false; ++ data_pair_t *custom_xattrs; ++ data_t *keyval = NULL; ++ char **xattrs_to_heal = get_xattrs_to_heal(); + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID(frame->root->uid, frame->root->gid); +@@ -2930,6 +2936,66 @@ posix_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + goto out; + } + ++ ret = dict_get_int8(xdata, "sync_backend_xattrs", &sync_backend_xattrs); ++ if (ret) { ++ gf_msg_debug(this->name, -ret, "Unable to get sync_backend_xattrs"); ++ } ++ ++ if (sync_backend_xattrs) { ++ /* List all custom xattrs */ ++ subvol_xattrs = dict_new(); ++ if (!subvol_xattrs) ++ goto out; ++ ++ ret = dict_set_int32_sizen(xdata, "list-xattr", 1); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, ++ "Unable to set list-xattr in dict "); ++ goto out; ++ } ++ ++ subvol_xattrs = posix_xattr_fill(this, real_path, loc, NULL, -1, xdata, ++ NULL); ++ ++ /* Remove all user xattrs from the file */ ++ dict_foreach_fnmatch(subvol_xattrs, "user.*", posix_delete_user_xattr, ++ real_path); ++ ++ /* Remove all custom xattrs from the file */ ++ for (i = 1; xattrs_to_heal[i]; i++) { ++ keyval = dict_get(subvol_xattrs, xattrs_to_heal[i]); ++ if (keyval) { ++ ret = sys_lremovexattr(real_path, xattrs_to_heal[i]); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, P_MSG_XATTR_NOT_REMOVED, ++ errno, "removexattr failed. 
key %s path %s", ++ xattrs_to_heal[i], loc->path); ++ goto out; ++ } ++ ++ dict_del(subvol_xattrs, xattrs_to_heal[i]); ++ keyval = NULL; ++ } ++ } ++ ++ /* Set custom xattrs based on info provided by DHT */ ++ custom_xattrs = dict->members_list; ++ ++ while (custom_xattrs != NULL) { ++ ret = sys_lsetxattr(real_path, custom_xattrs->key, ++ custom_xattrs->value->data, ++ custom_xattrs->value->len, flags); ++ if (ret) { ++ op_errno = errno; ++ gf_log(this->name, GF_LOG_ERROR, "setxattr failed - %s %d", ++ custom_xattrs->key, ret); ++ goto out; ++ } ++ ++ custom_xattrs = custom_xattrs->next; ++ } ++ } ++ + xattr = dict_new(); + if (!xattr) + goto out; +@@ -3037,6 +3103,9 @@ out: + if (xattr) + dict_unref(xattr); + ++ if (subvol_xattrs) ++ dict_unref(subvol_xattrs); ++ + return 0; + } + +diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h +index 4be979c..b357d34 100644 +--- a/xlators/storage/posix/src/posix.h ++++ b/xlators/storage/posix/src/posix.h +@@ -686,4 +686,7 @@ posix_update_iatt_buf(struct iatt *buf, int fd, char *loc, dict_t *xdata); + gf_boolean_t + posix_is_layout_stale(dict_t *xdata, char *par_path, xlator_t *this); + ++int ++posix_delete_user_xattr(dict_t *dict, char *k, data_t *v, void *data); ++ + #endif /* _POSIX_H */ +-- +1.8.3.1 + diff --git a/SOURCES/0585-ganesha_ha-ganesha_grace-RA-fails-in-start-and-or-fa.patch b/SOURCES/0585-ganesha_ha-ganesha_grace-RA-fails-in-start-and-or-fa.patch new file mode 100644 index 0000000..e3fa401 --- /dev/null +++ b/SOURCES/0585-ganesha_ha-ganesha_grace-RA-fails-in-start-and-or-fa.patch @@ -0,0 +1,77 @@ +From ba399a083a56963bb7414535ede6eff6afcd1a0a Mon Sep 17 00:00:00 2001 +From: "Kaleb S. KEITHLEY" +Date: Mon, 14 Jun 2021 12:32:06 -0400 +Subject: [PATCH 585/585] ganesha_ha: ganesha_grace RA fails in start() and/or + fails in monitor () (#2523) + +shell [[ ]] string compare fails to match returned attr to the +pattern and subsequently returns status of "not running", resulting +in dependencies such as the IPaddr (cluster_ip) RA not starting + +Change-Id: I2c8d6f5c4cf0480672d52d8aa0d9226950441dc9 +commit 8ec66a43eedd505ec0b40f55c05f13a77fe8074e +PR: https://github.com/gluster/glusterfs/pull/2523 +issue: https://github.com/gluster/glusterfs/issues/2522 +BUG: 1945143 +Signed-off-by: Kaleb S. 
KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/247613 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/ganesha/ocf/ganesha_grace | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/extras/ganesha/ocf/ganesha_grace b/extras/ganesha/ocf/ganesha_grace +index edc6fa2..ca219af 100644 +--- a/extras/ganesha/ocf/ganesha_grace ++++ b/extras/ganesha/ocf/ganesha_grace +@@ -122,15 +122,18 @@ ganesha_grace_start() + + # case 1 + if [[ -z "${attr}" ]]; then ++ ocf_log debug "grace start: returning success case 1" + return ${OCF_SUCCESS} + fi + + # case 2 +- if [[ "${attr}" = *"value=1" ]]; then ++ if [[ "${attr}" = *"host=\"${host}\" value=\"1\"" ]]; then ++ ocf_log debug "grace start: returning success case 2" + return ${OCF_SUCCESS} + fi + + # case 3 ++ ocf_log info "grace start returning: not running case 3 (${attr})" + return ${OCF_NOT_RUNNING} + } + +@@ -162,7 +165,7 @@ ganesha_grace_monitor() + { + local host=$(ocf_local_nodename) + +- ocf_log debug "ganesha_grace monitor ${host}" ++ ocf_log debug "ganesha_grace_monitor ${host}" + + attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) + if [ $? -ne 0 ]; then +@@ -174,13 +177,16 @@ ganesha_grace_monitor() + # chance to create it. In which case we'll pretend + # everything is okay this time around + if [[ -z "${attr}" ]]; then ++ ocf_log debug "grace monitor: returning success case 1" + return ${OCF_SUCCESS} + fi + +- if [[ "${attr}" = *"value=1" ]]; then ++ if [[ "${attr}" = *"host=\"${host}\" value=\"1\"" ]]; then ++ ocf_log debug "grace monitor: returning success case 2" + return ${OCF_SUCCESS} + fi + ++ ocf_log info "grace monitor: returning not running case 3 (${attr})" + return ${OCF_NOT_RUNNING} + } + +-- +1.8.3.1 + diff --git a/SOURCES/0586-protocol-client-Do-not-reopen-fd-post-handshake-if-p.patch b/SOURCES/0586-protocol-client-Do-not-reopen-fd-post-handshake-if-p.patch new file mode 100644 index 0000000..62c574d --- /dev/null +++ b/SOURCES/0586-protocol-client-Do-not-reopen-fd-post-handshake-if-p.patch @@ -0,0 +1,298 @@ +From e431321f1348b5d51733a6b6c5e046fd8c6e28cc Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Mon, 5 Jul 2021 10:52:10 +0530 +Subject: [PATCH 586/586] protocol/client: Do not reopen fd post handshake if + posix lock is held + +Problem: +With client.strict-locks enabled, in some cases where the posix lock is +taken after a brick gets disconnected, the fd is getting reopened when +the brick gets reconnected to the client as part of client_post_handshake. +In such cases the saved fdctx's lock_list may not have the latest +information. + +Fix: +Check the lock information in the fdctx->lk_ctx as well post handshake +which will have the latest information on the locks. +Also check for this field in other places as well to prevent writes +happening with anonymous fd even without re-opening the fd on the +restarted brick. 
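+
+A minimal sketch of the combined check this patch introduces (condensed
+from the client-helpers.c diff below):
+
+    gf_boolean_t
+    fdctx_lock_lists_empty(clnt_fd_ctx_t *fdctx)
+    {
+        /* The fd is lock-free only when both the saved posix lock
+         * list and the lk_ctx carry no locks. */
+        if (list_empty(&fdctx->lock_list) && fd_lk_ctx_empty(fdctx->lk_ctx))
+            return _gf_true;
+
+        return _gf_false;
+    }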
+ +> Upstream patch: https://github.com/gluster/glusterfs/pull/2582 +> Fixes: #2581 +> Change-Id: I7a0799e242ce188c6597dec0a65b4dae7dcd815b +> Signed-off-by: karthik-us ksubrahm@redhat.com + +BUG: 1689375 +Change-Id: I7a0799e242ce188c6597dec0a65b4dae7dcd815b +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/252588 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/replicate/do-not-reopen-fd.t | 76 ++++++++++++++++++-------- + xlators/protocol/client/src/client-handshake.c | 2 +- + xlators/protocol/client/src/client-helpers.c | 11 +++- + xlators/protocol/client/src/client.c | 2 +- + xlators/protocol/client/src/client.h | 3 + + 5 files changed, 67 insertions(+), 27 deletions(-) + +diff --git a/tests/bugs/replicate/do-not-reopen-fd.t b/tests/bugs/replicate/do-not-reopen-fd.t +index 13b5218..f346709 100644 +--- a/tests/bugs/replicate/do-not-reopen-fd.t ++++ b/tests/bugs/replicate/do-not-reopen-fd.t +@@ -20,10 +20,41 @@ TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 + TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M1 + + TEST touch $M0/a ++gfid_a=$(gf_get_gfid_xattr $B0/${V0}0/a) ++gfid_str_a=$(gf_gfid_xattr_to_str $gfid_a) ++ ++ ++# Open fd from a client, check for open fd on all the bricks. ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'rw' $M0/a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++# Kill a brick and take lock on the fd ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST flock -x $fd1 ++ ++# Restart the brick and check for no open fd on the restarted brick. ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++# Write on the fd. It should fail on the restarted brick. ++TEST fd_write $fd1 "data-0" ++EXPECT "" cat $B0/${V0}0/a ++EXPECT "data-0" cat $B0/${V0}1/a ++EXPECT "data-0" cat $B0/${V0}2/a ++ ++TEST fd_close $fd1 + + # Kill one brick and take lock on the fd and do a write. + TEST kill_brick $V0 $H0 $B0/${V0}0 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a + +@@ -34,7 +65,7 @@ TEST fd_write $fd1 "data-1" + # should still succeed as there were no quorum disconnects. 
+ TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd_write $fd1 "data-2" + EXPECT "" cat $B0/${V0}0/a + EXPECT "data-2" cat $B0/${V0}1/a +@@ -42,9 +73,6 @@ EXPECT "data-2" cat $B0/${V0}2/a + + # Check there is no fd opened on the 1st brick by checking for the gfid inside + # /proc/pid-of-brick/fd/ directory +-gfid_a=$(gf_get_gfid_xattr $B0/${V0}0/a) +-gfid_str_a=$(gf_gfid_xattr_to_str $gfid_a) +- + EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a +@@ -59,7 +87,7 @@ EXPECT "^2$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + # Kill 2nd brick and try writing to the file. The write should fail due to + # quorum failure. + TEST kill_brick $V0 $H0 $B0/${V0}1 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 1 + TEST ! fd_write $fd1 "data-3" + TEST ! fd_cat $fd1 + +@@ -67,7 +95,7 @@ TEST ! fd_cat $fd1 + # which were down previously, will return EBADFD now. + TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 1 + TEST ! fd_write $fd1 "data-4" + TEST ! fd_cat $fd1 + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +@@ -79,9 +107,9 @@ EXPECT "^2$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + EXPECT_WITHIN $HEAL_TIMEOUT "^2$" get_pending_heal_count $V0 + TEST $CLI volume heal $V0 enable + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2 + + TEST $CLI volume heal $V0 + EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 +@@ -103,7 +131,7 @@ TEST ! fd_write $fd1 "data-5" + # Kill the only brick that is having lock and try taking lock on another client + # which should succeed. + TEST kill_brick $V0 $H0 $B0/${V0}2 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 2 + TEST flock -x $fd2 + TEST fd_write $fd2 "data-6" + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +@@ -114,17 +142,17 @@ EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a + # fail and operations on the 2nd fd should succeed. 
+ TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}2 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 2 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M1 $V0-replicate-0 2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M1 $V0-replicate-0 2 + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + TEST ! fd_write $fd1 "data-7" + + TEST ! fd_cat $fd1 + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + TEST fd_cat $fd2 + + # Close both the fds which will release the locks and then re-open and take lock +@@ -159,9 +187,9 @@ EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0 + # Heal the volume + TEST $CLI volume heal $V0 enable + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2 + + TEST $CLI volume heal $V0 + EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 +@@ -169,7 +197,7 @@ TEST $CLI volume heal $V0 disable + + # Kill one brick and open a fd. + TEST kill_brick $V0 $H0 $B0/${V0}0 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a + +@@ -182,7 +210,7 @@ EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + # any of the bricks. + TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd_write $fd1 "data-10" + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + +@@ -193,7 +221,7 @@ TEST fd_close $fd1 + + # Kill one brick, open and take lock on a fd. 
+ TEST kill_brick $V0 $H0 $B0/${V0}0 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a + TEST flock -x $fd1 +@@ -204,7 +232,7 @@ EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + # Kill & restart another brick so that it will return EBADFD + TEST kill_brick $V0 $H0 $B0/${V0}1 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" brick_up_status $V0 $H0 $B0/${V0}1 + + # Restart the bricks and then write. Now fd should not get re-opened since lock + # is still held on one brick and write should also fail as there is no quorum. +@@ -212,8 +240,8 @@ EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1 + TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 1 + TEST ! fd_write $fd1 "data-11" + EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c +index a12472b..20e03d8 100644 +--- a/xlators/protocol/client/src/client-handshake.c ++++ b/xlators/protocol/client/src/client-handshake.c +@@ -911,7 +911,7 @@ client_post_handshake(call_frame_t *frame, xlator_t *this) + list_for_each_entry_safe(fdctx, tmp, &conf->saved_fds, sfd_pos) + { + if (fdctx->remote_fd != -1 || +- (!list_empty(&fdctx->lock_list) && conf->strict_locks)) ++ (!fdctx_lock_lists_empty(fdctx) && conf->strict_locks)) + continue; + + fdctx->reopen_done = client_child_up_reopen_done; +diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c +index a80f303..b4a7294 100644 +--- a/xlators/protocol/client/src/client-helpers.c ++++ b/xlators/protocol/client/src/client-helpers.c +@@ -15,6 +15,15 @@ + #include + #include + ++gf_boolean_t ++fdctx_lock_lists_empty(clnt_fd_ctx_t *fdctx) ++{ ++ if (list_empty(&fdctx->lock_list) && fd_lk_ctx_empty(fdctx->lk_ctx)) ++ return _gf_true; ++ ++ return _gf_false; ++} ++ + int + client_fd_lk_list_empty(fd_lk_ctx_t *lk_ctx, gf_boolean_t try_lock) + { +@@ -441,7 +450,7 @@ client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd, + *remote_fd = fdctx->remote_fd; + } + +- locks_involved = !list_empty(&fdctx->lock_list); ++ locks_involved = !fdctx_lock_lists_empty(fdctx); + } + } + pthread_spin_unlock(&conf->fd_lock); +diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c +index 35a5340..6df2ed1 100644 +--- a/xlators/protocol/client/src/client.c ++++ b/xlators/protocol/client/src/client.c +@@ -881,7 +881,7 @@ client_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + pthread_spin_lock(&conf->fd_lock); + { + fdctx = this_fd_get_ctx(fd, this); +- if (fdctx && !list_empty(&fdctx->lock_list)) { ++ if (fdctx && 
!fdctx_lock_lists_empty(fdctx)) { + ret = -1; + op_errno = EBADFD; + } +diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h +index f952aea..799fe6e 100644 +--- a/xlators/protocol/client/src/client.h ++++ b/xlators/protocol/client/src/client.h +@@ -535,4 +535,7 @@ client_add_lock_for_recovery(fd_t *fd, struct gf_flock *flock, + int + client_is_setlk(int32_t cmd); + ++gf_boolean_t ++fdctx_lock_lists_empty(clnt_fd_ctx_t *fdctx); ++ + #endif /* !_CLIENT_H */ +-- +1.8.3.1 + diff --git a/SOURCES/0587-Update-rfc.sh-to-rhgs-3.5.6.patch b/SOURCES/0587-Update-rfc.sh-to-rhgs-3.5.6.patch new file mode 100644 index 0000000..420a4cf --- /dev/null +++ b/SOURCES/0587-Update-rfc.sh-to-rhgs-3.5.6.patch @@ -0,0 +1,26 @@ +From f72780b560ea8efe1508aa9ddc574e6dc066bf9a Mon Sep 17 00:00:00 2001 +From: Csaba Henk +Date: Wed, 29 Sep 2021 10:44:37 +0200 +Subject: [PATCH 587/610] Update rfc.sh to rhgs-3.5.6 + +Signed-off-by: Csaba Henk +--- + rfc.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/rfc.sh b/rfc.sh +index daeff32..67798cb 100755 +--- a/rfc.sh ++++ b/rfc.sh +@@ -18,7 +18,7 @@ done + shift $((OPTIND-1)) + + +-branch="rhgs-3.5.5"; ++branch="rhgs-3.5.6"; + + set_hooks_commit_msg() + { +-- +1.8.3.1 + diff --git a/SOURCES/0588-locks-Fix-null-gfid-in-lock-contention-notifications.patch b/SOURCES/0588-locks-Fix-null-gfid-in-lock-contention-notifications.patch new file mode 100644 index 0000000..1e6c488 --- /dev/null +++ b/SOURCES/0588-locks-Fix-null-gfid-in-lock-contention-notifications.patch @@ -0,0 +1,388 @@ +From e3813685237dbdf8dc7cf28726fff2caf2288706 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Mon, 19 Jul 2021 15:37:02 +0200 +Subject: [PATCH 588/610] locks: Fix null gfid in lock contention notifications + +This patch fixes 3 problems: + +First problem: + +After commit c0bd592e, the pl_inode_t object was also created in the +cbk of lookup requests. Lookup requests are a bit different than any +other request because the inode received may not be completely +initialized. In particular, inode->gfid may be null. + +This caused that the gfid stored in the pl_inode_t object was null in +some cases. This gfid is used mostly for logs, but also to send lock +contention notifications. This meant that some notifications could be +sent with a null gfid, making impossible for the client xlator to +correctly identify the contending inode, so the lock was not released +immediately when eager-lock was also enabled. + +Second problem: + +The feature introduced by c0bd592e needed to track the number of +hardlinks of each inode to detect when it was deleted. However it +was done using the 'get-link-count' special xattr on lookup, while +posix only implements it for unlink and rename. + +Also, the number of hardlinks was not incremented for mkdir, mknod, +rename, ..., so it didn't work correctly for directories. + +Third problem: + +When the last hardlink of an open file is deleted, all locks will be +denied with ESTALE error, but that's not correct. Access to the open +fd must succeed. + +The first problem is fixed by avoiding creating pl_inode_t objects +during lookup. Second and third problems are fixed by completely +ignoring if the file has been deleted or not. Even if we grant a +lock on a non-existing file, the next operation done by the client +inside the lock will return the correct error, which should be enough. 
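+
+As a hypothetical client sequence, the behaviour required by the third
+problem looks like this (plain POSIX calls, not glusterfs code):
+
+    fd = open("/mnt/vol/file", O_RDWR);  /* fd stays valid after unlink */
+    unlink("/mnt/vol/file");             /* last hardlink gone, fd open */
+    flock(fd, LOCK_EX);                  /* must be granted, not ESTALE */
+    write(fd, buf, len);                 /* must succeed on the open fd */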
+ +Upstream patch: +> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2553 +> Fixes: #2551 +> Change-Id: Ic73e82f6b725b838c1600b6a128ea36a75f13253 +> Signed-off-by: Xavi Hernandez + +BUG: 1962972 +Change-Id: Ic73e82f6b725b838c1600b6a128ea36a75f13253 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279192 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/locks/issue-2551.t | 58 ++++++++++++++++++ + xlators/features/locks/src/common.c | 31 +++------- + xlators/features/locks/src/locks.h | 2 - + xlators/features/locks/src/posix.c | 118 +++--------------------------------- + 4 files changed, 74 insertions(+), 135 deletions(-) + create mode 100644 tests/bugs/locks/issue-2551.t + +diff --git a/tests/bugs/locks/issue-2551.t b/tests/bugs/locks/issue-2551.t +new file mode 100644 +index 0000000..a32af02 +--- /dev/null ++++ b/tests/bugs/locks/issue-2551.t +@@ -0,0 +1,58 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++function check_time() { ++ local max="${1}" ++ local start="$(date +"%s")" ++ ++ shift ++ ++ if "${@}"; then ++ if [[ $(($(date +"%s") - ${start})) -lt ${max} ]]; then ++ return 0 ++ fi ++ fi ++ ++ return 1 ++} ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/brick{0..2} ++TEST $CLI volume set $V0 disperse.eager-lock on ++TEST $CLI volume set $V0 disperse.eager-lock-timeout 30 ++TEST $CLI volume set $V0 features.locks-notify-contention on ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.open-behind off ++TEST $CLI volume set $V0 performance.quick-read off ++ ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick2 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0 ++ ++TEST mkdir $M0/dir ++TEST dd if=/dev/zero of=$M0/dir/test bs=4k count=1 ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick2 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M1 ++ ++TEST dd if=/dev/zero of=$M0/dir/test bs=4k count=1 conv=notrunc ++TEST check_time 5 dd if=/dev/zero of=$M1/dir/test bs=4k count=1 conv=notrunc +diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c +index cddbfa6..5403086 100644 +--- a/xlators/features/locks/src/common.c ++++ b/xlators/features/locks/src/common.c +@@ -468,9 +468,7 @@ pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local) + pl_inode->check_mlock_info = _gf_true; + pl_inode->mlock_enforced = _gf_false; + +- /* -2 means never looked up. -1 means something went wrong and link +- * tracking is disabled. 
*/ +- pl_inode->links = -2; ++ pl_inode->remove_running = 0; + + ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode)); + if (ret) { +@@ -1403,11 +1401,6 @@ pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, + + pthread_mutex_lock(&pl_inode->mutex); + +- if (pl_inode->removed) { +- error = ESTALE; +- goto unlock; +- } +- + if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) { + error = -1; + /* We skip the unlock here because the caller must create a stub when +@@ -1420,7 +1413,6 @@ pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, + pl_inode->is_locked = _gf_true; + pl_inode->remove_running++; + +-unlock: + pthread_mutex_unlock(&pl_inode->mutex); + + done: +@@ -1490,20 +1482,18 @@ pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error) + + pthread_mutex_lock(&pl_inode->mutex); + +- if (error == 0) { +- if (pl_inode->links >= 0) { +- pl_inode->links--; +- } +- if (pl_inode->links == 0) { +- pl_inode->removed = _gf_true; +- } +- } +- + pl_inode->remove_running--; + + if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) { + pl_inode->is_locked = _gf_false; + ++ /* At this point it's possible that the inode has been deleted, but ++ * there could be open fd's still referencing it, so we can't prevent ++ * pending locks from being granted. If the file has really been ++ * deleted, whatever the client does once the lock is granted will ++ * fail with the appropriate error, so we don't need to worry about ++ * it here. */ ++ + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now, +@@ -1555,11 +1545,6 @@ pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock) + pl_dom_list_t *dom; + pl_inode_lock_t *ilock; + +- /* If the inode has been deleted, we won't allow any lock. */ +- if (pl_inode->removed) { +- return -ESTALE; +- } +- + /* We only synchronize with locks made for regular operations coming from + * the user. Locks done for internal purposes are hard to control and could + * lead to long delays or deadlocks quite easily. */ +diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h +index 6666feb..2406dcd 100644 +--- a/xlators/features/locks/src/locks.h ++++ b/xlators/features/locks/src/locks.h +@@ -202,10 +202,8 @@ struct __pl_inode { + int fop_wind_count; + pthread_cond_t check_fop_wind_count; + +- int32_t links; /* Number of hard links the inode has. */ + uint32_t remove_running; /* Number of remove operations running. */ + gf_boolean_t is_locked; /* Regular locks will be blocked. */ +- gf_boolean_t removed; /* The inode has been deleted. 
*/ + }; + typedef struct __pl_inode pl_inode_t; + +diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c +index 22ef5b8..d5effef 100644 +--- a/xlators/features/locks/src/posix.c ++++ b/xlators/features/locks/src/posix.c +@@ -2975,104 +2975,24 @@ out: + return ret; + } + +-static int32_t +-pl_request_link_count(dict_t **pxdata) +-{ +- dict_t *xdata; +- +- xdata = *pxdata; +- if (xdata == NULL) { +- xdata = dict_new(); +- if (xdata == NULL) { +- return ENOMEM; +- } +- } else { +- dict_ref(xdata); +- } +- +- if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) { +- dict_unref(xdata); +- return ENOMEM; +- } +- +- *pxdata = xdata; +- +- return 0; +-} +- +-static int32_t +-pl_check_link_count(dict_t *xdata) +-{ +- int32_t count; +- +- /* In case we are unable to read the link count from xdata, we take a +- * conservative approach and return -2, which will prevent the inode from +- * being considered deleted. In fact it will cause link tracking for this +- * inode to be disabled completely to avoid races. */ +- +- if (xdata == NULL) { +- return -2; +- } +- +- if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) { +- return -2; +- } +- +- return count; +-} +- + int32_t + pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) + { +- pl_inode_t *pl_inode; +- +- if (op_ret >= 0) { +- pl_inode = pl_inode_get(this, inode, NULL); +- if (pl_inode == NULL) { +- PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL, +- NULL); +- return 0; +- } +- +- pthread_mutex_lock(&pl_inode->mutex); +- +- /* We only update the link count if we previously didn't know it. +- * Doing it always can lead to races since lookup is not executed +- * atomically most of the times. */ +- if (pl_inode->links == -2) { +- pl_inode->links = pl_check_link_count(xdata); +- if (buf->ia_type == IA_IFDIR) { +- /* Directories have at least 2 links. To avoid special handling +- * for directories, we simply decrement the value here to make +- * them equivalent to regular files. 
*/ +- pl_inode->links--; +- } +- } +- +- pthread_mutex_unlock(&pl_inode->mutex); +- } +- + PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata, + postparent); ++ + return 0; + } + + int32_t + pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) + { +- int32_t error; ++ PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); ++ STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, xdata); + +- error = pl_request_link_count(&xdata); +- if (error == 0) { +- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); +- STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, loc, xdata); +- dict_unref(xdata); +- } else { +- STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL); +- } + return 0; + } + +@@ -3881,9 +3801,7 @@ unlock: + __dump_posixlks(pl_inode); + } + +- gf_proc_dump_write("links", "%d", pl_inode->links); + gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running); +- gf_proc_dump_write("removed", "%u", pl_inode->removed); + } + pthread_mutex_unlock(&pl_inode->mutex); + +@@ -4508,21 +4426,9 @@ pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) + { +- pl_inode_t *pl_inode = (pl_inode_t *)cookie; +- +- if (op_ret >= 0) { +- pthread_mutex_lock(&pl_inode->mutex); +- +- /* TODO: can happen pl_inode->links == 0 ? */ +- if (pl_inode->links >= 0) { +- pl_inode->links++; +- } +- +- pthread_mutex_unlock(&pl_inode->mutex); +- } +- + PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); ++ + return 0; + } + +@@ -4530,18 +4436,10 @@ int + pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) + { +- pl_inode_t *pl_inode; +- +- pl_inode = pl_inode_get(this, oldloc->inode, NULL); +- if (pl_inode == NULL) { +- STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, +- NULL); +- return 0; +- } +- + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc); +- STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); ++ STACK_WIND(frame, pl_link_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); ++ + return 0; + } + +-- +1.8.3.1 + diff --git a/SOURCES/0589-extras-fix-for-postscript-failure-on-logrotation-of-.patch b/SOURCES/0589-extras-fix-for-postscript-failure-on-logrotation-of-.patch new file mode 100644 index 0000000..861791f --- /dev/null +++ b/SOURCES/0589-extras-fix-for-postscript-failure-on-logrotation-of-.patch @@ -0,0 +1,63 @@ +From 0bb71e1492b1ad442758399eb8dcb5f087d77f12 Mon Sep 17 00:00:00 2001 +From: Nikhil Ladha +Date: Wed, 28 Apr 2021 02:14:27 +0530 +Subject: [PATCH 589/610] extras: fix for postscript failure on logrotation of + snapd logs (#2310) + +Issue: +On executing the logrotate command, the postscript runs as a separate process, +and when we do a grep for the snapd process it returns the PID of that +short-term process as well, and executing a kill on that throws the error. +To check a similar error could be seen if we replace the killall for bricks +log rotation with a for loop on PIDs. + +Fix: +Use the killall command on the list of snapd processes instead of +using the kill command to individually kill them. 
+ +>Fixes: #2360 +>Change-Id: I1ad6e3e4d74128706e71900d02e715635294ff72 +>Signed-off-by: nik-redhat + +Upstream patch: https://github.com/gluster/glusterfs/pull/2310 +BUG: 1668303 + +Change-Id: I59910fc3660e11e131b1aa813848c2e19cbffefd +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279533 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/glusterfs-logrotate | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/extras/glusterfs-logrotate b/extras/glusterfs-logrotate +index 75f700e..2b9028b 100644 +--- a/extras/glusterfs-logrotate ++++ b/extras/glusterfs-logrotate +@@ -45,3 +45,22 @@ + compress + delaycompress + } ++ ++# Rotate snapd log ++/var/log/glusterfs/snaps/*/*.log { ++ sharedscripts ++ weekly ++ maxsize 10M ++ minsize 100k ++ ++ # 6 months of logs are good enough ++ rotate 26 ++ ++ missingok ++ compress ++ delaycompress ++ notifempty ++ postrotate ++ /usr/bin/killall -HUP `pgrep -f "glusterfs.*snapd"` > /dev/null 2>&1 || true ++ endscript ++} +-- +1.8.3.1 + diff --git a/SOURCES/0590-cluster-afr-Don-t-check-for-stale-entry-index.patch b/SOURCES/0590-cluster-afr-Don-t-check-for-stale-entry-index.patch new file mode 100644 index 0000000..c7ff40a --- /dev/null +++ b/SOURCES/0590-cluster-afr-Don-t-check-for-stale-entry-index.patch @@ -0,0 +1,128 @@ +From 87138f86b8cb98d1c9d1a4c9a2393e7978d20b1d Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Tue, 5 Oct 2021 12:33:01 +0530 +Subject: [PATCH 590/610] cluster/afr: Don't check for stale entry-index + +Problem: +In every entry index heal there is a check to see if the +index is stale or not. + 1. If a file is created when the brick is down this +will lead to an extra index lookup because the name is not stale. + 2. If a file is deleted when the brick is down this will also lead to + and extra index lookup because the name is not stale. + 3. If a file is created and deleted when the brick is down then the + index is stale and this will save entry-heal i.e. 2 entrylks and 2 lookups + +Since 1, 2 happen significantly more than 3, this is a bad tradeoff. + +Fix: +Let stale index be removed as part of normal entry heal detecting 'the +name is already deleted' code path. + +> Upstream patch: https://github.com/gluster/glusterfs/pull/2612 +> fixes: gluster#2611 +> Change-Id: I29bcc07f2480877a83b30dbd7e2e5631a74df8e8 +> Signed-off-by: Pranith Kumar K + +BUG: 1994593 +Change-Id: I29bcc07f2480877a83b30dbd7e2e5631a74df8e8 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279606 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-self-heal-entry.c | 46 +++++++-------------------- + 1 file changed, 11 insertions(+), 35 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c +index a17dd93..14b7417 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-entry.c ++++ b/xlators/cluster/afr/src/afr-self-heal-entry.c +@@ -933,37 +933,8 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, + loc_t *parent, void *data) + { + int ret = 0; +- loc_t loc = { +- 0, +- }; +- struct iatt iatt = { +- 0, +- }; + afr_granular_esh_args_t *args = data; + +- /* Look up the actual inode associated with entry. If the lookup returns +- * ESTALE or ENOENT, then it means we have a stale index. Remove it. 
+- * This is analogous to the check in afr_shd_index_heal() except that +- * here it is achieved through LOOKUP and in afr_shd_index_heal() through +- * a GETXATTR. +- */ +- +- loc.inode = inode_new(args->xl->itable); +- loc.parent = inode_ref(args->heal_fd->inode); +- gf_uuid_copy(loc.pargfid, loc.parent->gfid); +- loc.name = entry->d_name; +- +- ret = syncop_lookup(args->xl, &loc, &iatt, NULL, NULL, NULL); +- if ((ret == -ENOENT) || (ret == -ESTALE)) { +- /* The name indices under the pgfid index dir are guaranteed +- * to be regular files. Hence the hardcoding. +- */ +- afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG); +- ret = 0; +- goto out; +- } +- /* TBD: afr_shd_zero_xattrop? */ +- + ret = afr_selfheal_entry_dirent(args->frame, args->xl, args->heal_fd, + entry->d_name, parent->inode, subvol, + _gf_false); +@@ -974,8 +945,6 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, + if (ret == -1) + args->mismatch = _gf_true; + +-out: +- loc_wipe(&loc); + return ret; + } + +@@ -1050,7 +1019,9 @@ afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + local = frame->local; + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, +- "performing entry selfheal on %s", uuid_utoa(fd->inode->gfid)); ++ "performing %s entry selfheal on %s", ++ (local->need_full_crawl ? "full" : "granular"), ++ uuid_utoa(fd->inode->gfid)); + + for (i = 0; i < priv->child_count; i++) { + /* Expunge */ +@@ -1112,6 +1083,7 @@ __afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd, + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t did_sh = _gf_true; ++ char *heal_type = "granular entry"; + + priv = this->private; + local = frame->local; +@@ -1194,11 +1166,15 @@ postop_unlock: + afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, + postop_lock, NULL); + out: +- if (did_sh) +- afr_log_selfheal(fd->inode->gfid, this, ret, "entry", source, sources, ++ if (did_sh) { ++ if (local->need_full_crawl) { ++ heal_type = "full entry"; ++ } ++ afr_log_selfheal(fd->inode->gfid, this, ret, heal_type, source, sources, + healed_sinks); +- else ++ } else { + ret = 1; ++ } + + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); +-- +1.8.3.1 + diff --git a/SOURCES/0591-afr-check-for-valid-iatt.patch b/SOURCES/0591-afr-check-for-valid-iatt.patch new file mode 100644 index 0000000..8f1e48e --- /dev/null +++ b/SOURCES/0591-afr-check-for-valid-iatt.patch @@ -0,0 +1,44 @@ +From 19460ebc988795eeabaeb8e25d6eba9a3cf2864b Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Mon, 4 Oct 2021 12:44:21 +0530 +Subject: [PATCH 591/610] afr: check for valid iatt + +Problem: +If the entry being processed by afr_shd_anon_inode_cleaner() is no +longer present, gfid lookup fails with ENOENT on all bricks and iatt +will never be assigned, causing a crash due to null dereference. + +Fix: +Add a null-check for iatt. 
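+
+Sketch of the failure mode (simplified; the one-line diff below adds
+the guard):
+
+    struct iatt *iatt = NULL;
+    /* gfid lookup returned ENOENT on every brick, so iatt was never
+     * assigned; when count != 1 the dereference below crashes shd */
+    if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links))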
+ +> Upstream patch: https://github.com/gluster/glusterfs/pull/2660 +> Fixes: gluster#2659 +> Change-Id: I6abfc8063677861ce9388ca4efdf491ec956dc74 +> Signed-off-by: Ravishankar N + +BUG: 1995029 +Change-Id: I6abfc8063677861ce9388ca4efdf491ec956dc74 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279529 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-self-heald.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c +index 18aed93..bc720cf 100644 +--- a/xlators/cluster/afr/src/afr-self-heald.c ++++ b/xlators/cluster/afr/src/afr-self-heald.c +@@ -870,7 +870,7 @@ afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + } + + /*Inode is deleted from subvol*/ +- if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) { ++ if (count == 1 || (iatt && iatt->ia_type != IA_IFDIR && multiple_links)) { + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type, + priv->anon_inode_name, entry->d_name, subvol->name); +-- +1.8.3.1 + diff --git a/SOURCES/0592-md-cache-fix-integer-signedness-mismatch.patch b/SOURCES/0592-md-cache-fix-integer-signedness-mismatch.patch new file mode 100644 index 0000000..94cfe88 --- /dev/null +++ b/SOURCES/0592-md-cache-fix-integer-signedness-mismatch.patch @@ -0,0 +1,119 @@ +From be3448ed5d9d59752cff4df8325ee67eb7d41531 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Mon, 19 Jul 2021 06:56:18 +0200 +Subject: [PATCH 592/610] md-cache: fix integer signedness mismatch + +md-cache uses a mechanism based on a generation number to detect +modifications made by other clients to the entries and invalidate +the cached data. + +This generation number is a 32 bit integer. When it overflows, +special management is done to avoid problems. This overflow condition +is tracked with a single bit. + +For many fops, when they are received, the overflow bit and the +current generation number are recorded in a single 64-bit value +which is used later in the cbk. + +This is the problematic function: + + uint64_t + __mdc_get_generation(xlator_t *this, struct md_cache *mdc) + { + uint64_t gen = 0, rollover; + struct mdc_conf *conf = NULL; + + conf = this->private; + + gen = GF_ATOMIC_INC(conf->generation); + if (gen == 0) { + gf_log("MDC", GF_LOG_NOTICE, "%p Reset 1", mdc); + mdc->gen_rollover = !mdc->gen_rollover; + gen = GF_ATOMIC_INC(conf->generation); + mdc->ia_time = 0; + mdc->generation = 0; + mdc->invalidation_time = gen - 1; + } + + rollover = mdc->gen_rollover; + gen |= (rollover << 32); + return gen; + } + +'conf->generation' is declared as an atomic signed 32-bit integer, +and 'gen' is an unsigned 64-bit value. When 'gen' is assigned from +a signed int, the sign bit is extended to fill the high 32 bits of +'gen'. If the counter has overflown the maximum signed positive +value, it will become negative (sign bit = 1). + +In this case, when 'rollover' is later combined with 'gen', all the +high bits remain at '1'. + +This value is used later in 'mdc_inode_iatt_set_validate' during +callback processing. The overflow condition and generation numbers +from when the operation was received are recovered this way: + + rollover = incident_time >> 32; + incident_time = (incident_time & 0xffffffff); + +('incident_time' is the saved value from '__mdc_get_generation'). 
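+
+The sign extension can be reproduced in isolation (standalone C99
+snippet, not part of the patch):
+
+    #include <inttypes.h>
+    #include <stdint.h>
+    #include <stdio.h>
+
+    int main(void)
+    {
+        int32_t signed_gen = INT32_MIN;  /* overflowed signed counter */
+        uint64_t gen = signed_gen;   /* sign-extends to 0xffffffff80000000 */
+        printf("rollover=%" PRIx64 "\n", gen >> 32);  /* prints ffffffff */
+        return 0;
+    }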
+ +So here rollover will be 0xffffffff, when it's expected to be 0 +or 1 only. When this is compared later with the cached overflow +bit, it doesn't match, which prevents updating the cached info. + +This is bad in general, but it's even worse when an entry is not +cached and 'rollover' is 0xffffffff the first time. When md-cache +doesn't have cached data it assumes it's everything 0. This causes +a mismatch, which sends an invalidation request to the kernel, but +since the 'rollover' doesn't match, the cached data is not updated. +So the next time the cached data is checked, it will also send an +invalidation to the kernel, indefinitely. + +This patch fixes two things: + +1. The 'generation' field is made unsigned to avoid sign extension. +2. Invalidation requests are only sent if we already had valid cached + data. Otherwise it doesn't make sense to send an invalidation. + +Upstream patch: +> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2619 +> Fixes: #2617 +> Change-Id: Ie40e68288cf143e1bc1a40f46da98f51bb2d6864 +> Signed-off-by: Xavi Hernandez + +BUG: 1904137 +Change-Id: Ie40e68288cf143e1bc1a40f46da98f51bb2d6864 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279188 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/performance/md-cache/src/md-cache.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c +index bbbee3b..e0256d6 100644 +--- a/xlators/performance/md-cache/src/md-cache.c ++++ b/xlators/performance/md-cache/src/md-cache.c +@@ -79,7 +79,7 @@ struct mdc_conf { + gf_boolean_t cache_statfs; + struct mdc_statfs_cache statfs_cache; + char *mdc_xattr_str; +- gf_atomic_int32_t generation; ++ gf_atomic_uint32_t generation; + }; + + struct mdc_local; +@@ -537,7 +537,7 @@ mdc_inode_iatt_set_validate(xlator_t *this, inode_t *inode, struct iatt *prebuf, + (iatt->ia_mtime_nsec != mdc->md_mtime_nsec) || + (iatt->ia_ctime != mdc->md_ctime) || + (iatt->ia_ctime_nsec != mdc->md_ctime_nsec)) { +- if (conf->global_invalidation && ++ if (conf->global_invalidation && mdc->valid && + (!prebuf || (prebuf->ia_mtime != mdc->md_mtime) || + (prebuf->ia_mtime_nsec != mdc->md_mtime_nsec) || + (prebuf->ia_ctime != mdc->md_ctime) || +-- +1.8.3.1 + diff --git a/SOURCES/0593-dht-explicit-null-dereference.patch b/SOURCES/0593-dht-explicit-null-dereference.patch new file mode 100644 index 0000000..4ad9eea --- /dev/null +++ b/SOURCES/0593-dht-explicit-null-dereference.patch @@ -0,0 +1,58 @@ +From 76c9faf5c750428e5eb69462b82ee0c12cbdabc0 Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Fri, 25 Sep 2020 18:39:51 +0530 +Subject: [PATCH 593/610] dht: explicit null dereference + +Added a null check for uuid_list_copy, to avoid +null dereference in strtok_r() in case of strdup() +failure. 
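+
+The guarded pattern in general form (sketch; names as in the
+dht-common.c diff below):
+
+    uuid_list_copy = gf_strdup(uuid_list);
+    if (!uuid_list_copy)
+        goto unlock;   /* gf_strdup() returns NULL on allocation
+                        * failure; never hand NULL to strtok_r() */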
+ +CID: 1325612 +CID: 1274223 + +>Updates: #1060 + +>Change-Id: I641a5068cd76d7b2ed92eccf39e7f97d6f7b2480 +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/25046 +BUG: 1997447 + +Change-Id: I576b4ce610948bdb84eb30377a684c54df718bdc +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280063 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-common.c | 2 ++ + xlators/cluster/dht/src/dht-shared.c | 2 ++ + 2 files changed, 4 insertions(+) + +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index edfc6e7..e6a16ff 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -4296,6 +4296,8 @@ dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + index = conf->local_subvols_cnt; + + uuid_list_copy = gf_strdup(uuid_list); ++ if (!uuid_list_copy) ++ goto unlock; + + for (uuid_str = strtok_r(uuid_list, " ", &saveptr); uuid_str; + uuid_str = next_uuid_str) { +diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c +index 58e3339..cca272a 100644 +--- a/xlators/cluster/dht/src/dht-shared.c ++++ b/xlators/cluster/dht/src/dht-shared.c +@@ -567,6 +567,8 @@ gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag, + pattern_str = strtok_r(data, ",", &tmp_str); + while (pattern_str) { + dup_str = gf_strdup(pattern_str); ++ if (!dup_str) ++ goto out; + pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1); + if (!pattern_list) { + goto out; +-- +1.8.3.1 + diff --git a/SOURCES/0594-glusterd-resource-leaks.patch b/SOURCES/0594-glusterd-resource-leaks.patch new file mode 100644 index 0000000..ccc2f3b --- /dev/null +++ b/SOURCES/0594-glusterd-resource-leaks.patch @@ -0,0 +1,52 @@ +From 663df92f9b4b9f35ae10f84487494829987e2f58 Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Fri, 25 Sep 2020 17:56:19 +0530 +Subject: [PATCH 594/610] glusterd: resource leaks + +Issue: +iobref was not freed before exiting the function. + +Fix: +Modified the code to free iobref before exiting. 
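+
+Condensed sketch of the corrected control flow (labels as in the diff
+below):
+
+    ret = rpc_clnt_submit(rpc, &gd_brick_prog, op, cbkfn, &iov, 1, NULL,
+                          0, iobref, frame, NULL, 0, NULL, 0, NULL);
+    /* no early "return ret" here: fall through so the local iobref
+     * reference is released on success and failure alike */
+    free_iobref:
+        iobref_unref(iobref);
+    ...
+    err:
+        return ret;   /* instead of a hard-coded -1 */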
+ +CID: 1430107 +>Updates: #1060 + +>Change-Id: I89351b3aa645792eb8dda6292d1e559057b02d8b +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/25042 +BUG: 1997447 + +Change-Id: Iea56afca015a7c0f15ab32f490ea27f5ea323a07 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280066 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-utils.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 6d40be5..c037933 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -6042,7 +6042,6 @@ send_attach_req(xlator_t *this, struct rpc_clnt *rpc, char *path, + GF_ATOMIC_INC(conf->blockers); + ret = rpc_clnt_submit(rpc, &gd_brick_prog, op, cbkfn, &iov, 1, NULL, 0, + iobref, frame, NULL, 0, NULL, 0, NULL); +- return ret; + + free_iobref: + iobref_unref(iobref); +@@ -6051,7 +6050,7 @@ maybe_free_iobuf: + iobuf_unref(iobuf); + } + err: +- return -1; ++ return ret; + } + + extern size_t +-- +1.8.3.1 + diff --git a/SOURCES/0595-glusterd-use-after-free-coverity-issue.patch b/SOURCES/0595-glusterd-use-after-free-coverity-issue.patch new file mode 100644 index 0000000..7430838 --- /dev/null +++ b/SOURCES/0595-glusterd-use-after-free-coverity-issue.patch @@ -0,0 +1,51 @@ +From 025718f1734655c411475ea338cee1659d96763e Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Thu, 3 Sep 2020 15:42:45 +0530 +Subject: [PATCH 595/610] glusterd: use after free (coverity issue) + +Issue: +dict_unref is called on the same dict again, +in the out label of the code, which causes the +use after free issue. + +Fix: +Set the dict to NULL after unref, to avoid +use after free issue. + +CID: 1430127 + +>Updates: #1060 + +>Change-Id: Ide9a5cbc5f496705c671e72b0260da6d4c06f16d +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/24946 +BUG: 1997447 + +Change-Id: Id1e58cd6226b9329ad49bd5b75ee96a3a5ec5ab7 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280067 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c +index 386eed2..b0fa490 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c +@@ -2039,8 +2039,9 @@ glusterd_update_snaps_synctask(void *opaque) + "Failed to remove snap %s", snap->snapname); + goto out; + } +- if (dict) +- dict_unref(dict); ++ ++ dict_unref(dict); ++ dict = NULL; + } + snprintf(buf, sizeof(buf), "%s.accept_peer_data", prefix); + ret = dict_get_int32(peer_data, buf, &val); +-- +1.8.3.1 + diff --git a/SOURCES/0596-locks-null-dereference.patch b/SOURCES/0596-locks-null-dereference.patch new file mode 100644 index 0000000..4ad016f --- /dev/null +++ b/SOURCES/0596-locks-null-dereference.patch @@ -0,0 +1,43 @@ +From 099fcac6fecef6fc367d8fcae8442195f3f174db Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Fri, 25 Sep 2020 18:19:39 +0530 +Subject: [PATCH 596/610] locks: null dereference + +Added a null check before executing the strtok_r() +to avoid null dereference in case of strdup() failure. 
+
+CID: 1407938
+>Updates: #1060
+
+>Change-Id: Iec6e72ae8cb54f6d0a287615c43756325b2026ec
+>Signed-off-by: nik-redhat
+
+Upstream link: https://review.gluster.org/c/glusterfs/+/25045
+BUG: 1997447
+
+Change-Id: I47e6e2402badaf4103607b4164f19142a99a2f71
+Signed-off-by: nik-redhat
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280065
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/features/locks/src/posix.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
+index d5effef..03c4907 100644
+--- a/xlators/features/locks/src/posix.c
++++ b/xlators/features/locks/src/posix.c
+@@ -494,6 +494,9 @@ pl_inodelk_xattr_fill_multiple(dict_t *this, char *key, data_t *value,
+ char *save_ptr = NULL;
+
+ tmp_key = gf_strdup(key);
++ if (!tmp_key)
++ return -1;
++
+ strtok_r(tmp_key, ":", &save_ptr);
+ if (!*save_ptr) {
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, EINVAL,
+--
+1.8.3.1
+
diff --git a/SOURCES/0597-glusterd-memory-deallocated-twice.patch b/SOURCES/0597-glusterd-memory-deallocated-twice.patch
new file mode 100644
index 0000000..7e2c49f
--- /dev/null
+++ b/SOURCES/0597-glusterd-memory-deallocated-twice.patch
@@ -0,0 +1,163 @@
+From 59c05230c0df58765e30553c66bbcc0c9965d362 Mon Sep 17 00:00:00 2001
+From: nik-redhat
+Date: Tue, 11 Aug 2020 23:12:26 +0530
+Subject: [PATCH 597/610] glusterd: memory deallocated twice
+
+Issue:
+If the pointer tmpiter is destroyed in the function
+body, the out label still checks for the same pointer
+and tries to destroy it again.
+
+Fix:
+Instead of passing the ptr by value, we pass it by
+reference; setting the ptr to NULL inside the function
+then persists in the calling function, and the next
+time gf_store_iter_destroy() is called it won't try
+to free the ptr again.
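+
+The idiom in isolation (generic C sketch, not glusterfs code):
+
+    void destroy(char **pp)
+    {
+        free(*pp);
+        *pp = NULL;    /* clears the caller's copy of the pointer */
+    }
+
+    destroy(&p);
+    destroy(&p);       /* second call is a harmless free(NULL),
+                        * not a double free */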
+ +CID: 1430122 + +>Updates: #1060 + +>Change-Id: I019cea8e301c7cc87be792c03b58722fc96f04ef +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/24855 +BUG: 1997447 + +Change-Id: Ib403efd08d47a69d25f291ae61c9cbfcaaa05da8 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280076 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/store.h | 2 +- + libglusterfs/src/store.c | 12 +++++++----- + xlators/mgmt/glusterd/src/glusterd-store.c | 16 ++++++++-------- + 3 files changed, 16 insertions(+), 14 deletions(-) + +diff --git a/libglusterfs/src/glusterfs/store.h b/libglusterfs/src/glusterfs/store.h +index 68a20ad..76af2df 100644 +--- a/libglusterfs/src/glusterfs/store.h ++++ b/libglusterfs/src/glusterfs/store.h +@@ -93,7 +93,7 @@ int32_t + gf_store_iter_get_matching(gf_store_iter_t *iter, char *key, char **value); + + int32_t +-gf_store_iter_destroy(gf_store_iter_t *iter); ++gf_store_iter_destroy(gf_store_iter_t **iter); + + char * + gf_store_strerror(gf_store_op_errno_t op_errno); +diff --git a/libglusterfs/src/store.c b/libglusterfs/src/store.c +index 3af627a..e4931bf 100644 +--- a/libglusterfs/src/store.c ++++ b/libglusterfs/src/store.c +@@ -606,23 +606,25 @@ out: + } + + int32_t +-gf_store_iter_destroy(gf_store_iter_t *iter) ++gf_store_iter_destroy(gf_store_iter_t **iter) + { + int32_t ret = -1; + +- if (!iter) ++ if (!(*iter)) + return 0; + + /* gf_store_iter_new will not return a valid iter object with iter->file + * being NULL*/ +- ret = fclose(iter->file); ++ ret = fclose((*iter)->file); + if (ret) + gf_msg("", GF_LOG_ERROR, errno, LG_MSG_FILE_OP_FAILED, + "Unable" + " to close file: %s, ret: %d", +- iter->filepath, ret); ++ (*iter)->filepath, ret); ++ ++ GF_FREE(*iter); ++ *iter = NULL; + +- GF_FREE(iter); + return ret; + } + +diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c +index a8651d8..e027575 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-store.c ++++ b/xlators/mgmt/glusterd/src/glusterd-store.c +@@ -2576,7 +2576,7 @@ glusterd_store_retrieve_snapd(glusterd_volinfo_t *volinfo) + ret = 0; + + out: +- if (gf_store_iter_destroy(iter)) { ++ if (gf_store_iter_destroy(&iter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; +@@ -2895,13 +2895,13 @@ glusterd_store_retrieve_bricks(glusterd_volinfo_t *volinfo) + ret = 0; + + out: +- if (gf_store_iter_destroy(tmpiter)) { ++ if (gf_store_iter_destroy(&tmpiter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; + } + +- if (gf_store_iter_destroy(iter)) { ++ if (gf_store_iter_destroy(&iter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; +@@ -3067,7 +3067,7 @@ glusterd_store_retrieve_node_state(glusterd_volinfo_t *volinfo) + ret = 0; + + out: +- if (gf_store_iter_destroy(iter)) { ++ if (gf_store_iter_destroy(&iter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; +@@ -3379,7 +3379,7 @@ glusterd_store_update_volinfo(glusterd_volinfo_t *volinfo) + ret = 0; + + out: +- if (gf_store_iter_destroy(iter)) { ++ if (gf_store_iter_destroy(&iter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; +@@ -3574,7 +3574,7 @@ 
glusterd_store_retrieve_options(xlator_t *this)
+         goto out;
+     ret = 0;
+ out:
+-    (void)gf_store_iter_destroy(iter);
++    (void)gf_store_iter_destroy(&iter);
+     gf_store_handle_destroy(shandle);
+     return ret;
+ }
+@@ -4026,7 +4026,7 @@ glusterd_store_update_snap(glusterd_snap_t *snap)
+     ret = 0;
+ 
+ out:
+-    if (gf_store_iter_destroy(iter)) {
++    if (gf_store_iter_destroy(&iter)) {
+         gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL,
+                "Failed to destroy store iter");
+         ret = -1;
+@@ -4774,7 +4774,7 @@ glusterd_store_retrieve_peers(xlator_t *this)
+     is_ok = _gf_true;
+ 
+ next:
+-    (void)gf_store_iter_destroy(iter);
++    (void)gf_store_iter_destroy(&iter);
+ 
+     if (!is_ok) {
+         gf_log(this->name, GF_LOG_WARNING,
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0598-glusterd-null-dereference.patch b/SOURCES/0598-glusterd-null-dereference.patch
new file mode 100644
index 0000000..fac1b8f
--- /dev/null
+++ b/SOURCES/0598-glusterd-null-dereference.patch
@@ -0,0 +1,51 @@
+From 84aaaded4e958a10c7492233c053e3c681f2d575 Mon Sep 17 00:00:00 2001
+From: nik-redhat
+Date: Thu, 2 Jul 2020 18:10:32 +0530
+Subject: [PATCH 598/610] glusterd: null dereference
+
+Issue:
+There has been either an explicit null
+dereference or a dereference after a null
+check in some cases.
+
+Fix:
+Added the proper condition for the null check
+and fixed the null dereferencing.
+
+CID: 1430106 : Dereference after null check
+CID: 1430120 : Explicit null dereferenced
+CID: 1430132 : Dereference after null check
+CID: 1430134 : Dereference after null check
+
+>Change-Id: I7e795cf9f7146a633097c26a766f16b159881fa3
+>Updates: #1060
+>Signed-off-by: nik-redhat
+
+Upstream link: https://review.gluster.org/c/glusterfs/+/24664
+BUG: 1997447
+
+Change-Id: I2b2632c93094d0e7b9fbd65a2ca2b0eaf6212d79
+Signed-off-by: nik-redhat
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280083
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/mgmt/glusterd/src/glusterd-syncop.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c
+index 05c9e11..f1807cd 100644
+--- a/xlators/mgmt/glusterd/src/glusterd-syncop.c
++++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c
+@@ -1797,7 +1797,7 @@ gd_brick_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+     pending_node = NULL;
+     ret = 0;
+ out:
+-    if (pending_node)
++    if (pending_node && pending_node->node)
+         glusterd_pending_node_put_rpc(pending_node);
+ 
+     if (rsp_dict)
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0599-afr-null-dereference-nagative-value.patch b/SOURCES/0599-afr-null-dereference-nagative-value.patch
new file mode 100644
index 0000000..7d59cc7
--- /dev/null
+++ b/SOURCES/0599-afr-null-dereference-nagative-value.patch
@@ -0,0 +1,59 @@
+From 4186f81596a481a5c0c5a707fc9b2358ee8f49f0 Mon Sep 17 00:00:00 2001
+From: nik-redhat
+Date: Fri, 3 Jul 2020 17:18:33 +0530
+Subject: [PATCH 599/610] afr: null dereference & negative value
+
+Added a check for NULL before dereferencing
+the object as it may be NULL in a few cases
+inside the function. Also, added a check for
+the negative value of gfid_idx.
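+
+For illustration, the defect class and the guards being added, reduced
+to a standalone C sketch (hypothetical names, not the actual afr code):
+
+    struct heal_ctx {
+        int healed[16];
+    };
+
+    static int
+    record_heal(struct heal_ctx *ctx, int gfid_idx)
+    {
+        /* ctx may legitimately be NULL on some call paths, so check
+         * it before any member access. */
+        if (!ctx)
+            return -1;
+        /* gfid_idx can come back negative on failure; using it to
+         * subscript an array would be undefined behaviour. */
+        if (gfid_idx < 0 || gfid_idx >= 16)
+            return -1;
+        ctx->healed[gfid_idx]++;
+        return 0;
+    }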
+ +CID: 1430140 +CID: 1430145 + +>Change-Id: Ib7d23459b48bbc471dbcccab6d20572261882d11 +>Updates: #1060 +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/24671 +BUG: 1997447 + +Change-Id: I7e705a106d97001b67f5cde8589413c0c24ee507 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280085 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-self-heal-common.c | 2 +- + xlators/cluster/afr/src/afr-self-heal-name.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c +index 0954d2c..cbd5117 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-common.c ++++ b/xlators/cluster/afr/src/afr-self-heal-common.c +@@ -140,7 +140,7 @@ heal: + } + } + out: +- if (gfid_idx && (*gfid_idx == -1) && (ret == 0)) { ++ if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) { + ret = -afr_final_errno(local, priv); + } + loc_wipe(&loc); +diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c +index 9ec2066..c5ab8d7 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-name.c ++++ b/xlators/cluster/afr/src/afr-self-heal-name.c +@@ -353,7 +353,7 @@ __afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, + ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode, + replies, gfid, locked_on, source, sources, + is_gfid_absent, &gfid_idx); +- if (ret) ++ if (ret || (gfid_idx < 0)) + return ret; + + ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname, +-- +1.8.3.1 + diff --git a/SOURCES/0600-dht-xlator-integer-handling-issue.patch b/SOURCES/0600-dht-xlator-integer-handling-issue.patch new file mode 100644 index 0000000..c3970ac --- /dev/null +++ b/SOURCES/0600-dht-xlator-integer-handling-issue.patch @@ -0,0 +1,161 @@ +From 1cd16553d436fa703f5e18d71c35108d0e179e8b Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Thu, 9 Apr 2020 11:36:34 +0530 +Subject: [PATCH 600/610] dht xlator: integer handling issue + +Issue: The ret value is passed to the function +instead of the proper errno value + +Fix: Passing the errno generated to +the log function + +CID: 1415824 : Improper use of negative value +CID: 1420205 : Improper use of negative value +>Change-Id: Iaa7407ebd03eda46a2c027695e6bf0f598b371b2 +>Updates: #1060 +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/24314 +BUG: 1997447 + +Change-Id: Ibb7f432dbcc9ffd8dff6be6f984a6705894d6bef +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280086 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-common.c | 12 ++++++++---- + xlators/cluster/dht/src/dht-common.h | 2 +- + xlators/cluster/dht/src/dht-helper.c | 9 ++++++--- + xlators/cluster/dht/src/dht-selfheal.c | 8 +++++--- + 4 files changed, 20 insertions(+), 11 deletions(-) + +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index e6a16ff..5eaaa1e 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -672,13 +672,14 @@ dht_discover_complete(xlator_t *this, call_frame_t *discover_frame) + + if (local->need_xattr_heal && !heal_path) { + local->need_xattr_heal = 0; +- ret = dht_dir_xattr_heal(this, local); +- if (ret) +- gf_msg(this->name, 
GF_LOG_ERROR, ret, ++ ret = dht_dir_xattr_heal(this, local, &op_errno); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "xattr heal failed for " + "directory gfid is %s ", + gfid_local); ++ } + } + } + +@@ -1205,7 +1206,7 @@ dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size, + to non hashed subvol + */ + int +-dht_dir_xattr_heal(xlator_t *this, dht_local_t *local) ++dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno) + { + dht_local_t *copy_local = NULL; + call_frame_t *copy = NULL; +@@ -1217,6 +1218,7 @@ dht_dir_xattr_heal(xlator_t *this, dht_local_t *local) + "No gfid exists for path %s " + "so healing xattr is not possible", + local->loc.path); ++ *op_errno = EIO; + goto out; + } + +@@ -1230,6 +1232,7 @@ dht_dir_xattr_heal(xlator_t *this, dht_local_t *local) + "Memory allocation failed " + "for path %s gfid %s ", + local->loc.path, gfid_local); ++ *op_errno = ENOMEM; + DHT_STACK_DESTROY(copy); + } else { + copy_local->stbuf = local->stbuf; +@@ -1244,6 +1247,7 @@ dht_dir_xattr_heal(xlator_t *this, dht_local_t *local) + "Synctask creation failed to heal xattr " + "for path %s gfid %s ", + local->loc.path, gfid_local); ++ *op_errno = ENOMEM; + DHT_STACK_DESTROY(copy); + } + } +diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h +index b856c68..1cb1c0c 100644 +--- a/xlators/cluster/dht/src/dht-common.h ++++ b/xlators/cluster/dht/src/dht-common.h +@@ -1493,7 +1493,7 @@ dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + dict_t *src, int *uret, int *uflag); + + int +-dht_dir_xattr_heal(xlator_t *this, dht_local_t *local); ++dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno); + + int32_t + dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size, +diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c +index 4c3940a..d3444b3 100644 +--- a/xlators/cluster/dht/src/dht-helper.c ++++ b/xlators/cluster/dht/src/dht-helper.c +@@ -2105,6 +2105,7 @@ dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data) + dht_local_t *local = NULL; + xlator_t *this = NULL; + int ret = -1; ++ int op_errno = 0; + + local = heal_frame->local; + main_frame = local->main_frame; +@@ -2114,10 +2115,12 @@ dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data) + dht_set_fixed_dir_stat(&local->postparent); + if (local->need_xattr_heal) { + local->need_xattr_heal = 0; +- ret = dht_dir_xattr_heal(this, local); +- if (ret) +- gf_msg(this->name, GF_LOG_ERROR, ret, DHT_MSG_DIR_XATTR_HEAL_FAILED, ++ ret = dht_dir_xattr_heal(this, local, &op_errno); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, ++ DHT_MSG_DIR_XATTR_HEAL_FAILED, + "xattr heal failed for directory %s ", local->loc.path); ++ } + } + + DHT_STACK_UNWIND(lookup, main_frame, 0, 0, local->inode, &local->stbuf, +diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c +index 8af7301..2da9817 100644 +--- a/xlators/cluster/dht/src/dht-selfheal.c ++++ b/xlators/cluster/dht/src/dht-selfheal.c +@@ -1471,6 +1471,7 @@ dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout, + { + int missing_dirs = 0; + int i = 0; ++ int op_errno = 0; + int ret = -1; + dht_local_t *local = NULL; + xlator_t *this = NULL; +@@ -1493,13 +1494,14 @@ dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout, + if (!__is_root_gfid(local->stbuf.ia_gfid)) { + if 
(local->need_xattr_heal) { + local->need_xattr_heal = 0; +- ret = dht_dir_xattr_heal(this, local); +- if (ret) +- gf_msg(this->name, GF_LOG_ERROR, ret, ++ ret = dht_dir_xattr_heal(this, local, &op_errno); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "%s:xattr heal failed for " + "directory (gfid = %s)", + local->loc.path, local->gfid); ++ } + } else { + if (!gf_uuid_is_null(local->gfid)) + gf_uuid_copy(loc->gfid, local->gfid); +-- +1.8.3.1 + diff --git a/SOURCES/0601-coverity-resource-leak-2321.patch b/SOURCES/0601-coverity-resource-leak-2321.patch new file mode 100644 index 0000000..35dc964 --- /dev/null +++ b/SOURCES/0601-coverity-resource-leak-2321.patch @@ -0,0 +1,99 @@ +From 6d7049a19029331266f70f68d860bbccef01a35d Mon Sep 17 00:00:00 2001 +From: Nikhil Ladha +Date: Thu, 8 Jul 2021 11:26:54 +0530 +Subject: [PATCH 601/610] coverity: resource leak (#2321) + +Issue: +Variable `arg` is not freed before the function exits, +and leads to resource leak. + +Fix: +Free the arg variable if the status of function call +`glusterd_compare_friend_volume` is +`GLUSTERD_VOL_COMP_UPDATE_REQ`, or if the `glusterd_launch_synctask` +fails to start the process. + +And, added a check for return value on calling +`glusterd_launch_synctask` function and exit if the +thread creation fails. + +CID: 1401716 +>Updates: #1060 + +>Change-Id: I4abd621771f88853d8d01e9039cdee2f3d862c4f +>Signed-off-by: nik-redhat + +Upstream link: https://github.com/gluster/glusterfs/pull/2321 +BUG: 1997447 + +Change-Id: Ida81dfcd58c5ef45d3ae036d6bd6b36dc6693538 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280090 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-utils.c | 10 +++++++--- + xlators/mgmt/glusterd/src/glusterd-utils.h | 2 +- + 2 files changed, 8 insertions(+), 4 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index c037933..cec9c20 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -5371,6 +5371,7 @@ glusterd_compare_friend_data(dict_t *peer_data, dict_t *cmp, int32_t *status, + + if (GLUSTERD_VOL_COMP_RJT == *status) { + ret = 0; ++ update = _gf_false; + goto out; + } + if (GLUSTERD_VOL_COMP_UPDATE_REQ == *status) { +@@ -5385,11 +5386,12 @@ glusterd_compare_friend_data(dict_t *peer_data, dict_t *cmp, int32_t *status, + * first brick to come up before attaching the subsequent bricks + * in case brick multiplexing is enabled + */ +- glusterd_launch_synctask(glusterd_import_friend_volumes_synctask, arg); ++ ret = glusterd_launch_synctask(glusterd_import_friend_volumes_synctask, ++ arg); + } + + out: +- if (ret && arg) { ++ if ((ret || !update) && arg) { + dict_unref(arg->peer_data); + dict_unref(arg->peer_ver_data); + GF_FREE(arg); +@@ -13115,7 +13117,7 @@ gd_default_synctask_cbk(int ret, call_frame_t *frame, void *opaque) + return ret; + } + +-void ++int + glusterd_launch_synctask(synctask_fn_t fn, void *opaque) + { + xlator_t *this = NULL; +@@ -13131,6 +13133,8 @@ glusterd_launch_synctask(synctask_fn_t fn, void *opaque) + gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_SPAWN_SVCS_FAIL, + "Failed to spawn bricks" + " and other volume related services"); ++ ++ return ret; + } + + /* +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h +index 4541471..3f4f3b8 100644 +--- 
a/xlators/mgmt/glusterd/src/glusterd-utils.h ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.h +@@ -681,7 +681,7 @@ int32_t + glusterd_take_lvm_snapshot(glusterd_brickinfo_t *brickinfo, + char *origin_brick_path); + +-void ++int + glusterd_launch_synctask(synctask_fn_t fn, void *opaque); + + int +-- +1.8.3.1 + diff --git a/SOURCES/0602-coverity-null-dereference-2395.patch b/SOURCES/0602-coverity-null-dereference-2395.patch new file mode 100644 index 0000000..6edc3aa --- /dev/null +++ b/SOURCES/0602-coverity-null-dereference-2395.patch @@ -0,0 +1,87 @@ +From 2ff83650a5f05e3f06853df6d79d3b18f88dfb23 Mon Sep 17 00:00:00 2001 +From: Nikhil Ladha +Date: Thu, 6 May 2021 10:45:46 +0530 +Subject: [PATCH 602/610] coverity: null dereference (#2395) + +Fix: +Updated the code to make it more readable and fixed +the NULL dereferencing. + +CID: 1234622 +>Updates: #1060 + +>Change-Id: I05bd203bc46fe84be86398bd664a3485409c3bfe +>Signed-off-by: nik-redhat + +Upstream link: https://github.com/gluster/glusterfs/pull/2395 +BUG: 1997447 + +Change-Id: If39cc85115de673a83b6c97137ea8d1f0f825245 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280093 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-lock.c | 32 +++++++++++++++----------------- + 1 file changed, 15 insertions(+), 17 deletions(-) + +diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c +index f9bac4f..6474dfa 100644 +--- a/xlators/cluster/dht/src/dht-lock.c ++++ b/xlators/cluster/dht/src/dht-lock.c +@@ -914,37 +914,35 @@ dht_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + dht_local_t *local = NULL; + int lk_index = 0, call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; ++ dht_ilock_wrap_t *my_layout; + + local = frame->local; + lk_index = (long)cookie; + ++ my_layout = &(local->lock[0].layout.my_layout); ++ + if (op_ret == -1) { +- local->lock[0].layout.my_layout.op_ret = -1; +- local->lock[0].layout.my_layout.op_errno = op_errno; +- +- if (local && local->lock[0].layout.my_layout.locks[lk_index]) { +- uuid_utoa_r(local->lock[0] +- .layout.my_layout.locks[lk_index] +- ->loc.inode->gfid, +- gfid); +- +- gf_msg_debug( +- this->name, op_errno, +- "inodelk failed on gfid: %s " +- "subvolume: %s", +- gfid, +- local->lock[0].layout.my_layout.locks[lk_index]->xl->name); ++ my_layout->op_ret = -1; ++ my_layout->op_errno = op_errno; ++ ++ if (my_layout->locks[lk_index]) { ++ uuid_utoa_r(my_layout->locks[lk_index]->loc.inode->gfid, gfid); ++ ++ gf_msg_debug(this->name, op_errno, ++ "inodelk failed on gfid: %s " ++ "subvolume: %s", ++ gfid, my_layout->locks[lk_index]->xl->name); + } + + goto out; + } + +- local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true; ++ my_layout->locks[lk_index]->locked = _gf_true; + + out: + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { +- if (local->lock[0].layout.my_layout.op_ret < 0) { ++ if (my_layout->op_ret < 0) { + dht_inodelk_cleanup(frame); + return 0; + } +-- +1.8.3.1 + diff --git a/SOURCES/0603-Coverity-Resource-leak-fix-CID-1356547.patch b/SOURCES/0603-Coverity-Resource-leak-fix-CID-1356547.patch new file mode 100644 index 0000000..8c6b53b --- /dev/null +++ b/SOURCES/0603-Coverity-Resource-leak-fix-CID-1356547.patch @@ -0,0 +1,51 @@ +From 015e6cac71b0a0c330f1e4792f9d60214b191f45 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Thu, 7 Oct 2021 21:07:46 +0530 +Subject: [PATCH 603/610] Coverity: Resource leak fix (CID: 
1356547)
+
+Issue:
+In function gf_svc_readdirp() there is a chance that memory will be
+allocated for 'local' but not released in the failure path.
+
+Fix:
+Assign 'local' to 'frame->local' immediately after the successful allocation, so
+it will be released by the existing failure path code itself.
+
+> Upstream patch: https://github.com/gluster/glusterfs/pull/2362/
+> Change-Id: I4474dc4d4be5432d169cb7d434728f211054997e
+> Signed-off-by: karthik-us
+> Updates: gluster#1060
+
+BUG: 1997447
+Change-Id: I4474dc4d4be5432d169cb7d434728f211054997e
+Signed-off-by: karthik-us
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280100
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/features/snapview-client/src/snapview-client.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xlators/features/snapview-client/src/snapview-client.c b/xlators/features/snapview-client/src/snapview-client.c
+index 9c789ae..e97db89 100644
+--- a/xlators/features/snapview-client/src/snapview-client.c
++++ b/xlators/features/snapview-client/src/snapview-client.c
+@@ -2156,6 +2156,7 @@ gf_svc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                "failed to allocate local");
+         goto out;
+     }
++    frame->local = local;
+ 
+     /*
+      * This is mainly for samba shares (or windows clients). As part of
+@@ -2184,7 +2185,6 @@ gf_svc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ 
+     local->subvolume = subvolume;
+     local->fd = fd_ref(fd);
+-    frame->local = local;
+ 
+     STACK_WIND(frame, gf_svc_readdirp_cbk, subvolume, subvolume->fops->readdirp,
+                fd, size, off, xdata);
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0604-Coverity-Fix-dereference-before-null-check-CID-13914.patch b/SOURCES/0604-Coverity-Fix-dereference-before-null-check-CID-13914.patch
new file mode 100644
index 0000000..a680327
--- /dev/null
+++ b/SOURCES/0604-Coverity-Fix-dereference-before-null-check-CID-13914.patch
@@ -0,0 +1,50 @@
+From dee1c932df22ee12fe4568b40e58a475309e62fd Mon Sep 17 00:00:00 2001
+From: karthik-us
+Date: Thu, 7 Oct 2021 21:18:49 +0530
+Subject: [PATCH 604/610] Coverity: Fix dereference before null check (CID:
+ 1391415)
+
+Problem:
+In function gf_client_dump_inodes_to_dict() there is a null check for
+a variable which is already dereferenced in the previous line. This
+means that this variable could be null, but it is not being validated
+for null before being dereferenced in the first place.
+
+Fix:
+Added a null check before dereferencing the variable in the first place.
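+
+The shape of the defect and of the fix, as a standalone C sketch
+(hypothetical struct; the real code checks client->bound_xl):
+
+    #include <string.h>
+
+    struct xl {
+        const char *name;
+    };
+
+    /* Buggy ordering reads p->name before p is known to be valid:
+     *     if (!strcmp(p->name, name)) { if (p && ...) { ... } }
+     * The fixed ordering validates first, then dereferences: */
+    static int
+    name_matches(struct xl *p, const char *name)
+    {
+        return p && p->name && strcmp(p->name, name) == 0;
+    }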
+
+> Upstream patch: https://github.com/gluster/glusterfs/pull/2369/
+> Change-Id: I988b0e93542782353a8059e33db1522b6a5e55f8
+> Signed-off-by: karthik-us
+> Updates: gluster#1060
+
+BUG: 1997447
+Change-Id: I988b0e93542782353a8059e33db1522b6a5e55f8
+Signed-off-by: karthik-us
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280103
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ libglusterfs/src/client_t.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/libglusterfs/src/client_t.c b/libglusterfs/src/client_t.c
+index e875c8b..216900a 100644
+--- a/libglusterfs/src/client_t.c
++++ b/libglusterfs/src/client_t.c
+@@ -828,8 +828,9 @@ gf_client_dump_inodes_to_dict(xlator_t *this, dict_t *dict)
+                 clienttable->cliententries[count].next_free)
+                 continue;
+             client = clienttable->cliententries[count].client;
+-            if (!strcmp(client->bound_xl->name, this->name)) {
+-                if (client->bound_xl && client->bound_xl->itable) {
++            if (client->bound_xl &&
++                !strcmp(client->bound_xl->name, this->name)) {
++                if (client->bound_xl->itable) {
+                     /* Presently every brick contains only
+                      * one bound_xl for all connections.
+                      * This will lead to duplicating of
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0605-Coverity-Fix-copy-into-fixed-size-buffer-CID-1325542.patch b/SOURCES/0605-Coverity-Fix-copy-into-fixed-size-buffer-CID-1325542.patch
new file mode 100644
index 0000000..849c959
--- /dev/null
+++ b/SOURCES/0605-Coverity-Fix-copy-into-fixed-size-buffer-CID-1325542.patch
@@ -0,0 +1,53 @@
+From 25fc2530f7ee6d7267e2ccc1b75a47a3ae539dff Mon Sep 17 00:00:00 2001
+From: karthik-us
+Date: Thu, 7 Oct 2021 21:29:27 +0530
+Subject: [PATCH 605/610] Coverity: Fix copy into fixed size buffer (CID:
+ 1325542)
+
+Problem:
+In __mnt3_fresh_lookup() mres->resolveloc.path is being copied into
+a fixed-size string mres->remainingdir, using strncpy without checking
+the size of the source string. This could lead to a string overflow.
+
+Fix:
+Copy only up to the destination buffer size and check whether the
+source string overflows it. If so, log an error message and return.
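+
+In isolation, the bounded-copy idiom the fix adopts looks like this
+(standalone C sketch; the helper name is hypothetical):
+
+    #include <stdio.h>
+
+    static int
+    copy_path(char *dst, size_t dst_size, const char *src)
+    {
+        /* snprintf always NUL-terminates and returns the length the
+         * full string would need, so truncation is detectable.
+         * strncpy(dst, src, strlen(src)) bounds the copy by the
+         * source length and does not terminate the destination. */
+        if ((size_t)snprintf(dst, dst_size, "%s", src) >= dst_size)
+            return -1; /* source did not fit in the fixed buffer */
+        return 0;
+    }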
+
+> Upstream patch: https://github.com/gluster/glusterfs/pull/2474/
+> Change-Id: I26dd0653d2636c667ad4e356d12d3d51956c77c3
+> Signed-off-by: karthik-us
+> Updates: gluster#1060
+
+BUG: 1997447
+Change-Id: I26dd0653d2636c667ad4e356d12d3d51956c77c3
+Signed-off-by: karthik-us
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280106
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/nfs/server/src/mount3.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c
+index 734453c..3951b9e 100644
+--- a/xlators/nfs/server/src/mount3.c
++++ b/xlators/nfs/server/src/mount3.c
+@@ -1104,8 +1104,13 @@ __mnt3_fresh_lookup(mnt3_resolve_t *mres)
+ {
+     inode_unlink(mres->resolveloc.inode, mres->resolveloc.parent,
+                  mres->resolveloc.name);
+-    strncpy(mres->remainingdir, mres->resolveloc.path,
+-            strlen(mres->resolveloc.path));
++    if (snprintf(mres->remainingdir, sizeof(mres->remainingdir), "%s",
++                 mres->resolveloc.path) >= sizeof(mres->remainingdir)) {
++        gf_msg(GF_MNT, GF_LOG_ERROR, EFAULT, NFS_MSG_RESOLVE_INODE_FAIL,
++               "Failed to copy resolve path: %s", mres->resolveloc.path);
++        nfs_loc_wipe(&mres->resolveloc);
++        return -EFAULT;
++    }
+     nfs_loc_wipe(&mres->resolveloc);
+     return __mnt3_resolve_subdir(mres);
+ }
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0606-dht-handle-DHT_SUBVOL_STATUS_KEY-in-dht_pt_getxattr-.patch b/SOURCES/0606-dht-handle-DHT_SUBVOL_STATUS_KEY-in-dht_pt_getxattr-.patch
new file mode 100644
index 0000000..05ca17b
--- /dev/null
+++ b/SOURCES/0606-dht-handle-DHT_SUBVOL_STATUS_KEY-in-dht_pt_getxattr-.patch
@@ -0,0 +1,69 @@
+From a6ba95b73469ad81d8c5a27293f8d09cc26928a3 Mon Sep 17 00:00:00 2001
+From: Ravishankar N
+Date: Fri, 18 Dec 2020 16:28:29 +0530
+Subject: [PATCH 606/610] dht: handle DHT_SUBVOL_STATUS_KEY in dht_pt_getxattr
+ (#1934)
+
+In non-distribute volumes (plain replicate, ec), DHT uses pass-through
+FOPs (dht_pt_getxattr) instead of the usual FOPs (dht_getxattr). The
+pass-through FOP was not handling the DHT_SUBVOL_STATUS_KEY virtual
+xattr, because of which the geo-rep session was going into a faulty
+state. Fixing it now.
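+
+The dispatch pattern being restored, as a generic C sketch (the key
+string and callback types are stand-ins, not the real
+DHT_SUBVOL_STATUS_KEY value or gluster APIs):
+
+    #include <string.h>
+
+    #define VIRT_KEY "example.virtual-status"
+
+    typedef int (*xattr_fn_t)(const char *key);
+
+    static int
+    getxattr_pass_through(const char *key, xattr_fn_t serve_virtual,
+                          xattr_fn_t wind_to_child)
+    {
+        /* Virtual xattrs are owned by this translator and must be
+         * answered locally even on the pass-through path... */
+        if (key && strncmp(key, VIRT_KEY, strlen(VIRT_KEY)) == 0)
+            return serve_virtual(key);
+        /* ...while real keys keep being forwarded to the child. */
+        return wind_to_child(key);
+    }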
+
+> updates: #1925
+> Change-Id: I766b5b5c047c954a9957ab78aca680eedef1ff1f
+> Signed-off-by: Ravishankar N
+
+Upstream patch: https://github.com/gluster/glusterfs/pull/1934
+
+BUG: 2006205
+Change-Id: I766b5b5c047c954a9957ab78aca680eedef1ff1f
+Signed-off-by: Shwetha K Acharya
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280112
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/cluster/dht/src/dht-common.c | 24 ++++++++++++++++++++++++
+ 1 file changed, 24 insertions(+)
+
+diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
+index 5eaaa1e..c8980e5 100644
+--- a/xlators/cluster/dht/src/dht-common.c
++++ b/xlators/cluster/dht/src/dht-common.c
+@@ -11584,9 +11584,33 @@ int
+ dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 const char *key, dict_t *xdata)
+ {
++    int op_errno = -1;
++    dht_local_t *local = NULL;
++
++    VALIDATE_OR_GOTO(frame, err);
++    VALIDATE_OR_GOTO(this, err);
++    VALIDATE_OR_GOTO(loc, err);
++    VALIDATE_OR_GOTO(loc->inode, err);
++    VALIDATE_OR_GOTO(this->private, err);
++
++    local = dht_local_init(frame, loc, NULL, GF_FOP_GETXATTR);
++    if (!local) {
++        op_errno = ENOMEM;
++        goto err;
++    }
++
++    if (key &&
++        strncmp(key, DHT_SUBVOL_STATUS_KEY, SLEN(DHT_SUBVOL_STATUS_KEY)) == 0) {
++        dht_vgetxattr_subvol_status(frame, this, key);
++        return 0;
++    }
++
+     STACK_WIND(frame, dht_pt_getxattr_cbk, FIRST_CHILD(this),
+                FIRST_CHILD(this)->fops->getxattr, loc, key, xdata);
+     return 0;
++err:
++    DHT_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
++    return 0;
+ }
+ 
+ static int
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0607-SELinux-Fix-boolean-management.patch b/SOURCES/0607-SELinux-Fix-boolean-management.patch
new file mode 100644
index 0000000..4a62b03
--- /dev/null
+++ b/SOURCES/0607-SELinux-Fix-boolean-management.patch
@@ -0,0 +1,121 @@
+From 4b65ff0d1a3d70fcf3cfa8ab769135ae12f529d8 Mon Sep 17 00:00:00 2001
+From: nik-redhat
+Date: Thu, 7 Oct 2021 22:02:32 +0530
+Subject: [PATCH 607/610] SELinux: Fix boolean management
+
+Remove %triggerun ganesha
+This trigger shouldn't be needed to begin with since removing
+selinux-policy-targeted means that the user is switching SELinux off, or is
+switching the policy (to "mls" or "minimum"). In either case the
+current boolean setting is not going to be used any more. The last
+option, removal of glusterfs-ganesha, is covered by '%postun ganesha'.
+But more importantly, the trigger is called every time
+selinux-policy-targeted is updated (which can be avoided).
+%triggerun is executed after %triggerin -
+https://docs.fedoraproject.org/en-US/packaging-guidelines/Scriptlets/#ordering
+So when selinux-policy-targeted is updated, the new version is installed
+first, triggering `semanage boolean -m ganesha_use_fusefs --on`,
+and then the old version is uninstalled, triggering
+`semanage boolean -m ganesha_use_fusefs --off`.
+
+* use selinux_[un]set_booleans instead of "semanage boolean"
+  The macro pair properly manages SELinux stores and doesn't disable the
+  boolean in case it was enabled before ${name}-ganesha was installed.
+
+* Only change booleans when the package is first installed or
+  uninstalled
+Updating ${name}-ganesha would disable the boolean because %postun is
+called after %post (same issue as with the triggers).
+
+Signed-off-by: Vit Mojzis
+Signed-off-by: Kaleb S.
KEITHLEY +Change-Id: Ibb926ffbe00c9f000bd740708c0a4b3435ee7871 +PR: https://github.com/gluster/glusterfs/pull/2833 +Issue: https://github.com/gluster/glusterfs/issues/2522 +Resolves: rhbz#1973566 +Resolves: rhbz#1975400 + +BUG: 1973566 +Change-Id: Idef6cbd6bce35151518d6f76e5b74774e5756fc9 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280114 +Tested-by: RHGS Build Bot +Reviewed-by: Kaleb Keithley +--- + glusterfs.spec.in | 34 +++++++++++++++++++++------------- + 1 file changed, 21 insertions(+), 13 deletions(-) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 424f4ab..a9a83b1 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -954,7 +954,10 @@ exit 0 + %if ( 0%{!?_without_server:1} ) + %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) + %post ganesha +-semanage boolean -m ganesha_use_fusefs --on ++# first install ++if [ $1 -eq 1 ]; then ++ %selinux_set_booleans ganesha_use_fusefs=1 ++fi + exit 0 + %endif + %endif +@@ -962,7 +965,9 @@ exit 0 + %if ( 0%{!?_without_georeplication:1} ) + %post geo-replication + %if ( 0%{?rhel} && 0%{?rhel} >= 8 ) +-%selinux_set_booleans %{selinuxbooleans} ++if [ $1 -eq 1 ]; then ++ %selinux_set_booleans %{selinuxbooleans} ++fi + %endif + if [ $1 -ge 1 ]; then + %systemd_postun_with_restart glusterd +@@ -1089,29 +1094,32 @@ exit 0 + %if ( 0%{!?_without_server:1} ) + %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) + %postun ganesha +-semanage boolean -m ganesha_use_fusefs --off ++if [ $1 -eq 0 ]; then ++ # use the value of ganesha_use_fusefs from before glusterfs-ganesha was installed ++ %selinux_unset_booleans ganesha_use_fusefs=1 ++fi + exit 0 + %endif + %endif + +-##----------------------------------------------------------------------------- +-## All %%trigger should be placed here and keep them sorted +-## +-%if ( 0%{!?_without_server:1} ) +-%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) +-%trigger ganesha -- selinux-policy-targeted +-semanage boolean -m ganesha_use_fusefs --on ++%if ( 0%{!?_without_georeplication:1} ) ++%postun geo-replication ++%if ( 0%{?rhel} && 0%{?rhel} >= 8 ) ++if [ $1 -eq 0 ]; then ++ %selinux_unset_booleans %{selinuxbooleans} ++fi + exit 0 + %endif + %endif + + ##----------------------------------------------------------------------------- +-## All %%triggerun should be placed here and keep them sorted ++## All %%trigger should be placed here and keep them sorted + ## + %if ( 0%{!?_without_server:1} ) + %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) +-%triggerun ganesha -- selinux-policy-targeted +-semanage boolean -m ganesha_use_fusefs --off ++# ensure ganesha_use_fusefs is on in case of policy mode switch (eg. mls->targeted) ++%triggerin ganesha -- selinux-policy-targeted ++semanage boolean -m ganesha_use_fusefs --on -S targeted + exit 0 + %endif + %endif +-- +1.8.3.1 + diff --git a/SOURCES/0608-cluster-ec-Track-heal-statistics-in-shd.patch b/SOURCES/0608-cluster-ec-Track-heal-statistics-in-shd.patch new file mode 100644 index 0000000..b08d7a9 --- /dev/null +++ b/SOURCES/0608-cluster-ec-Track-heal-statistics-in-shd.patch @@ -0,0 +1,143 @@ +From d806760f1d4c78a2519b01f1c2d07aba0c533755 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Fri, 28 Aug 2020 16:03:54 +0530 +Subject: [PATCH 608/610] cluster/ec: Track heal statistics in shd + +With this change we should be able to inspect number of heals +attempted and completed by each shd. 
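+
+The counting scheme, restated with plain C11 atomics for illustration
+(the patch itself uses gluster's GF_ATOMIC_* wrappers):
+
+    #include <stdatomic.h>
+    #include <stdbool.h>
+
+    static atomic_ulong heals_attempted;
+    static atomic_ulong heals_completed;
+
+    static void
+    record_heal_result(bool completed)
+    {
+        /* Every heal attempt bumps the first counter; only heals
+         * whose status reports zero bad bricks bump the second. */
+        atomic_fetch_add(&heals_attempted, 1);
+        if (completed)
+            atomic_fetch_add(&heals_completed, 1);
+    }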
+ +> Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24926/ +> fixes: #1453 +> Change-Id: I10f5d86efcc0a8e4d648da808751d37725682c39 +> Signed-off-by: Pranith Kumar K + +BUG: 1853631 +Change-Id: I10f5d86efcc0a8e4d648da808751d37725682c39 +Signed-off-by: Sheetal Pamecha +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280208 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/ec/src/ec-heald.c | 49 ++++++++++++++++++++++++++++++++++++++- + xlators/cluster/ec/src/ec-types.h | 5 ++++ + xlators/cluster/ec/src/ec.c | 6 +++++ + 3 files changed, 59 insertions(+), 1 deletion(-) + +diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c +index 4f4b6aa..cd4d3ad 100644 +--- a/xlators/cluster/ec/src/ec-heald.c ++++ b/xlators/cluster/ec/src/ec-heald.c +@@ -152,15 +152,58 @@ ec_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name) + return ret; + } + ++static gf_boolean_t ++ec_is_heal_completed(char *status) ++{ ++ char *bad_pos = NULL; ++ char *zero_pos = NULL; ++ ++ if (!status) { ++ return _gf_false; ++ } ++ ++ /*Logic: ++ * Status will be of the form Good: , Bad: ++ * If heal completes, if we do strchr for '0' it should be present after ++ * 'Bad:' i.e. strRchr for ':' ++ * */ ++ ++ zero_pos = strchr(status, '0'); ++ bad_pos = strrchr(status, ':'); ++ if (!zero_pos || !bad_pos) { ++ /*malformed status*/ ++ return _gf_false; ++ } ++ ++ if (zero_pos > bad_pos) { ++ return _gf_true; ++ } ++ ++ return _gf_false; ++} ++ + int + ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc, + gf_boolean_t full) + { + dict_t *xdata = NULL; ++ dict_t *dict = NULL; + uint32_t count; + int32_t ret; ++ char *heal_status = NULL; ++ ec_t *ec = healer->this->private; ++ ++ GF_ATOMIC_INC(ec->stats.shd.attempted); ++ ret = syncop_getxattr(healer->this, loc, &dict, EC_XATTR_HEAL, NULL, ++ &xdata); ++ if (ret == 0) { ++ if (dict && (dict_get_str(dict, EC_XATTR_HEAL, &heal_status) == 0)) { ++ if (ec_is_heal_completed(heal_status)) { ++ GF_ATOMIC_INC(ec->stats.shd.completed); ++ } ++ } ++ } + +- ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata); + if (!full && (loc->inode->ia_type == IA_IFDIR)) { + /* If we have just healed a directory, it's possible that + * other index entries have appeared to be healed. */ +@@ -179,6 +222,10 @@ ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc, + dict_unref(xdata); + } + ++ if (dict) { ++ dict_unref(dict); ++ } ++ + return ret; + } + +diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h +index 700dc39..ef7a7fe 100644 +--- a/xlators/cluster/ec/src/ec-types.h ++++ b/xlators/cluster/ec/src/ec-types.h +@@ -626,6 +626,11 @@ struct _ec_statistics { + requests. (Basically memory allocation + errors). 
*/
+     } stripe_cache;
++    struct {
++        gf_atomic_t attempted; /*Number of heals attempted on
++                                 files/directories*/
++        gf_atomic_t completed; /*Number of heals completed on files/directories*/
++    } shd;
+ };
+ 
+ struct _ec {
+diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
+index 047cdd8..24de9e8 100644
+--- a/xlators/cluster/ec/src/ec.c
++++ b/xlators/cluster/ec/src/ec.c
+@@ -649,6 +649,8 @@ ec_statistics_init(ec_t *ec)
+     GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0);
+     GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0);
+     GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0);
++    GF_ATOMIC_INIT(ec->stats.shd.attempted, 0);
++    GF_ATOMIC_INIT(ec->stats.shd.completed, 0);
+ }
+ 
+ int32_t
+@@ -1445,6 +1447,10 @@ ec_dump_private(xlator_t *this)
+                        GF_ATOMIC_GET(ec->stats.stripe_cache.allocs));
+     gf_proc_dump_write("errors", "%" GF_PRI_ATOMIC,
+                        GF_ATOMIC_GET(ec->stats.stripe_cache.errors));
++    gf_proc_dump_write("heals-attempted", "%" GF_PRI_ATOMIC,
++                       GF_ATOMIC_GET(ec->stats.shd.attempted));
++    gf_proc_dump_write("heals-completed", "%" GF_PRI_ATOMIC,
++                       GF_ATOMIC_GET(ec->stats.shd.completed));
+ 
+     return 0;
+ }
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0609-feature-shard-wrong-dname-results-in-dentry-not-foun.patch b/SOURCES/0609-feature-shard-wrong-dname-results-in-dentry-not-foun.patch
new file mode 100644
index 0000000..a3290cb
--- /dev/null
+++ b/SOURCES/0609-feature-shard-wrong-dname-results-in-dentry-not-foun.patch
@@ -0,0 +1,43 @@
+From 89cdfb40264c12105a1b4990fa9b45290aa6cef0 Mon Sep 17 00:00:00 2001
+From: Vinayakswami Hariharmath
+Date: Fri, 8 Oct 2021 09:40:41 +0530
+Subject: [PATCH 609/610] feature/shard: wrong dname results in dentry not
+ found error
+
+A wrong dname passed to inode_unlink() in
+shard_evicted_inode_fsync_cbk() results in a "dentry not found"
+error.
+
+This patch addresses the issue.
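+
+Roughly, a shard block's dentry name is derived from the base file's
+gfid plus the block number, along these lines (simplified C sketch;
+gfid formatting reduced to a plain string, assuming the usual
+"<gfid>.<block>" layout under /.shard):
+
+    #include <stdio.h>
+
+    static void
+    block_bname(char *buf, size_t len, const char *base_gfid_str,
+                int block_num)
+    {
+        /* The dentry under /.shard was linked using the base file's
+         * gfid, so the same gfid must be used to unlink it; a name
+         * built from the shard inode's own gfid was never linked,
+         * hence "dentry not found". */
+        snprintf(buf, len, "%s.%d", base_gfid_str, block_num);
+    }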
+
+> upstream patch: https://github.com/gluster/glusterfs/pull/2475
+> Fixes: #2470
+> Change-Id: I6c479980ae3fa7ba558327055a9e5e5c2d2a850f
+> Signed-off-by: Vinayakswami Hariharmath vharihar@redhat.com
+
+BUG: 1911665
+Change-Id: I96aa5f57303b69a08990de039ddeecad7e7ae6af
+Signed-off-by: Vinayakswami Hariharmath
+Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280202
+Tested-by: RHGS Build Bot
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
+---
+ xlators/features/shard/src/shard.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
+index b828ff9..882373f 100644
+--- a/xlators/features/shard/src/shard.c
++++ b/xlators/features/shard/src/shard.c
+@@ -950,7 +950,7 @@ shard_evicted_inode_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ {
+         __shard_inode_ctx_get(shard_inode, this, &ctx);
+         if ((list_empty(&ctx->to_fsync_list)) && (list_empty(&ctx->ilist))) {
+-            shard_make_block_bname(ctx->block_num, shard_inode->gfid,
++            shard_make_block_bname(ctx->block_num, ctx->base_gfid,
+                                    block_bname, sizeof(block_bname));
+             inode_unlink(shard_inode, priv->dot_shard_inode, block_bname);
+             /* The following unref corresponds to the ref held by
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0610-glusterfs.spec.in-remove-condtionals-from-tar-depend.patch b/SOURCES/0610-glusterfs.spec.in-remove-condtionals-from-tar-depend.patch
new file mode 100644
index 0000000..132da9c
--- /dev/null
+++ b/SOURCES/0610-glusterfs.spec.in-remove-condtionals-from-tar-depend.patch
@@ -0,0 +1,51 @@
+From b3e86a66de224107f6760157a7cb692227e42954 Mon Sep 17 00:00:00 2001
+From: Shwetha Acharya
+Date: Mon, 30 Aug 2021 18:54:15 +0530
+Subject: [PATCH 610/610] glusterfs.spec.in: remove conditionals from tar
+ dependency (#2734)
+
+* glusterfs.spec.in: remove conditionals from tar dependency
+
+The conditional on the rhel minor version fails, so tar is not
+marked as required.
+
+As there is no universal macro to specify the
+minor release, remove the conditionals above the
+"Requires: tar" statement.
+
+With this change, tar will be marked as required
+for geo-rep irrespective of the rhel minor version.
+ +> Change-Id: Id1e3320a0b1a245fc9cd8c7acb09cc119fca18b8 +> Signed-off-by: Shwetha K Acharya + +Upstream patch: https://github.com/gluster/glusterfs/pull/2734 + +BUG: 1901468 +Change-Id: Id1e3320a0b1a245fc9cd8c7acb09cc119fca18b8 +Signed-off-by: Shwetha K Acharya +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280116 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Tested-by: RHGS Build Bot +--- + glusterfs.spec.in | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index a9a83b1..8b6646f 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -521,9 +521,8 @@ Requires: python%{_pythonver}-gluster = %{version}-%{release} + Requires: rsync + Requires: util-linux + Requires: %{name}-libs%{?_isa} = %{version}-%{release} +-%if ( 0%{?rhel} && ( ( 0%{?rhel} == 8 && 0%{?rhel_minor_version} >= 3 ) || 0%{?rhel} >= 9 ) ) + Requires: tar +-%endif ++ + # required for setting selinux bools + %if ( 0%{?rhel} && 0%{?rhel} >= 8 ) + Requires(post): policycoreutils-python-utils +-- +1.8.3.1 + diff --git a/SOURCES/0611-SELinux-Fix-boolean-management-again.patch b/SOURCES/0611-SELinux-Fix-boolean-management-again.patch new file mode 100644 index 0000000..a5b2612 --- /dev/null +++ b/SOURCES/0611-SELinux-Fix-boolean-management-again.patch @@ -0,0 +1,54 @@ +From 5ad4711f40c0e8ab7c196ac1c9025bf78b8b94e0 Mon Sep 17 00:00:00 2001 +From: "Kaleb S. KEITHLEY" +Date: Thu, 18 Nov 2021 09:21:56 -0500 +Subject: [PATCH 611/611] SELinux: Fix boolean management, again + +When upgrading from a version of the package that does not include +the previous fix this means the flawed scriptlet is still executed, +undoing the setting of the boolean. + +In order to work the boolean needs to be set in %posttrans. This is +a temporary change that can (or should) be removed in the next version +of RHGS, i.e. 3.5.7. + +Issue: https://github.com/gluster/glusterfs/issues/2522 +Resolves: rhbz#1973566 +Resolves: rhbz#1975400 + +Label: DOWNSTREAM ONLY + +BUG: 1973566 +Change-Id: Ida39a3ee5e6b4b0d3255bfef95601890afd80709 +Signed-off-by: Kaleb S. KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/292189 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + glusterfs.spec.in | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 8b6646f..87176c9 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -1123,6 +1123,17 @@ exit 0 + %endif + %endif + ++%if ( 0%{!?_without_server:1} ) ++%if ( ( 0%{?fedora} && 0%{?fedora} > 25 ) || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) ++# temporary fix to be removed in the next version (i.e. RHGS 3.5.7). This ++# is only needed when upgrading from the flawed versions (e.g. RHGS 3.5.5 ++# and earlier.) 
++%posttrans ganesha ++semanage boolean -m ganesha_use_fusefs --on -S targeted ++exit 0 ++%endif ++%endif ++ + ##----------------------------------------------------------------------------- + ## All %%files should be placed here and keep them grouped + ## +-- +1.8.3.1 + diff --git a/SPECS/glusterfs.spec b/SPECS/glusterfs.spec index c4f7f83..8a7701b 100644 --- a/SPECS/glusterfs.spec +++ b/SPECS/glusterfs.spec @@ -79,6 +79,11 @@ # rpmbuild -ta glusterfs-6.0.tar.gz --without rdma %{?_without_rdma:%global _without_rdma --disable-ibverbs} +# No RDMA Support on 32-bit ARM +%ifarch armv7hl +%global _without_rdma --disable-ibverbs +%endif + # server # if you wish to build rpms without server components, compile like this # rpmbuild -ta glusterfs-6.0.tar.gz --without server @@ -232,7 +237,8 @@ Release: 0.1%{?prereltag:.%{prereltag}}%{?dist} %else Name: glusterfs Version: 6.0 -Release: 49.1%{?dist} +Release: 61%{?dist} +ExcludeArch: i686 %endif License: GPLv2 or LGPLv3+ URL: http://docs.gluster.org/ @@ -789,7 +795,137 @@ Patch0477: 0477-glusterd-snapshot-Snapshot-prevalidation-failure-not.patch Patch0478: 0478-DHT-Fixing-rebalance-failure-on-issuing-stop-command.patch Patch0479: 0479-ganesha-ha-revised-regex-exprs-for-status.patch Patch0480: 0480-DHT-Rebalance-Ensure-Rebalance-reports-status-only-o.patch -Patch0481: 0481-RHGS-3.5.3-rebuild-to-ship-with-RHEL.patch +Patch0481: 0481-Update-rfc.sh-to-rhgs-3.5.4.patch +Patch0482: 0482-logger-Always-print-errors-in-english.patch +Patch0483: 0483-afr-more-quorum-checks-in-lookup-and-new-entry-marki.patch +Patch0484: 0484-glusterd-rebalance-status-displays-stats-as-0-after-.patch +Patch0485: 0485-cli-rpc-conditional-init-of-global-quota-rpc-1578.patch +Patch0486: 0486-glusterd-brick-sock-file-deleted-log-error-1560.patch +Patch0487: 0487-Events-Log-file-not-re-opened-after-logrotate.patch +Patch0488: 0488-glusterd-afr-enable-granular-entry-heal-by-default.patch +Patch0489: 0489-glusterd-fix-bug-in-enabling-granular-entry-heal.patch +Patch0490: 0490-Segmentation-fault-occurs-during-truncate.patch +Patch0491: 0491-glusterd-mount-directory-getting-truncated-on-mounti.patch +Patch0492: 0492-afr-lookup-Pass-xattr_req-in-while-doing-a-selfheal-.patch +Patch0493: 0493-geo-rep-Note-section-is-required-for-ignore_deletes.patch +Patch0494: 0494-glusterd-start-the-brick-on-a-different-port.patch +Patch0495: 0495-geo-rep-descriptive-message-when-worker-crashes-due-.patch +Patch0496: 0496-posix-Use-MALLOC-instead-of-alloca-to-allocate-memor.patch +Patch0497: 0497-socket-Use-AES128-cipher-in-SSL-if-AES-is-supported-.patch +Patch0498: 0498-geo-rep-Fix-corner-case-in-rename-on-mkdir-during-hy.patch +Patch0499: 0499-gfapi-give-appropriate-error-when-size-exceeds.patch +Patch0500: 0500-features-shard-Convert-shard-block-indices-to-uint64.patch +Patch0501: 0501-Cli-Removing-old-syntax-of-tier-cmds-from-help-menu.patch +Patch0502: 0502-dht-fixing-a-permission-update-issue.patch +Patch0503: 0503-gfapi-Suspend-synctasks-instead-of-blocking-them.patch +Patch0504: 0504-io-stats-Configure-ios_sample_buf_size-based-on-samp.patch +Patch0505: 0505-trash-Create-inode_table-only-while-feature-is-enabl.patch +Patch0506: 0506-posix-Attach-a-posix_spawn_disk_thread-with-glusterf.patch +Patch0507: 0507-inode-make-critical-section-smaller.patch +Patch0508: 0508-fuse-fetch-arbitrary-number-of-groups-from-proc-pid-.patch +Patch0509: 0509-core-configure-optimum-inode-table-hash_size-for-shd.patch +Patch0510: 0510-glusterd-brick_mux-Optimize-friend-handshake-code-to.patch +Patch0511: 
0511-features-shard-Missing-format-specifier.patch +Patch0512: 0512-glusterd-shared-storage-mount-fails-in-ipv6-environm.patch +Patch0513: 0513-afr-mark-pending-xattrs-as-a-part-of-metadata-heal.patch +Patch0514: 0514-afr-event-gen-changes.patch +Patch0515: 0515-cluster-afr-Heal-directory-rename-without-rmdir-mkdi.patch +Patch0516: 0516-afr-return-EIO-for-gfid-split-brains.patch +Patch0517: 0517-gfapi-glfs_h_creat_open-new-API-to-create-handle-and.patch +Patch0518: 0518-glusterd-Fix-for-shared-storage-in-ipv6-env.patch +Patch0519: 0519-glusterfs-events-Fix-incorrect-attribute-access-2002.patch +Patch0520: 0520-performance-open-behind-seek-fop-should-open_and_res.patch +Patch0521: 0521-open-behind-fix-missing-fd-reference.patch +Patch0522: 0522-lcov-improve-line-coverage.patch +Patch0523: 0523-open-behind-rewrite-of-internal-logic.patch +Patch0524: 0524-open-behind-fix-call_frame-leak.patch +Patch0525: 0525-open-behind-implement-create-fop.patch +Patch0526: 0526-Quota-quota_fsck.py-converting-byte-string-to-string.patch +Patch0527: 0527-Events-Socket-creation-after-getaddrinfo-and-IPv4-an.patch +Patch0528: 0528-Extras-Removing-xattr_analysis-script.patch +Patch0529: 0529-geo-rep-prompt-should-work-for-ignore_deletes.patch +Patch0530: 0530-gfapi-avoid-crash-while-logging-message.patch +Patch0531: 0531-Glustereventsd-Default-port-change-2091.patch +Patch0532: 0532-glusterd-fix-for-starting-brick-on-new-port.patch +Patch0533: 0533-glusterd-Rebalance-cli-is-not-showing-correct-status.patch +Patch0534: 0534-glusterd-Resolve-use-after-free-bug-2181.patch +Patch0535: 0535-multiple-files-use-dict_allocate_and_serialize-where.patch +Patch0536: 0536-dht-Ongoing-IO-is-failed-during-volume-shrink-operat.patch +Patch0537: 0537-cluster-afr-Fix-race-in-lockinfo-f-getxattr.patch +Patch0538: 0538-afr-fix-coverity-issue-introduced-by-90cefde.patch +Patch0539: 0539-extras-disable-lookup-optimize-in-virt-and-block-gro.patch +Patch0540: 0540-extras-Disable-write-behind-for-group-samba.patch +Patch0541: 0541-glusterd-volgen-Add-functionality-to-accept-any-cust.patch +Patch0542: 0542-xlaotrs-mgmt-Fixing-coverity-issue-1445996.patch +Patch0543: 0543-glusterd-handle-custom-xlator-failure-cases.patch +Patch0544: 0544-tests-avoid-empty-paths-in-environment-variables.patch +Patch0545: 0545-tests-Excluded-tests-for-unsupported-components.patch +Patch0546: 0546-Update-rfc.sh-to-rhgs-3.5.5.patch +Patch0547: 0547-perf-write-behind-Clear-frame-local-on-conflict-erro.patch +Patch0548: 0548-Add-tar-as-dependency-to-geo-rep-rpm-for-RHEL-8.3-an.patch +Patch0549: 0549-geo-rep-Change-in-attribute-for-getting-function-nam.patch +Patch0550: 0550-common-ha-stability-fixes-for-ganesha_grace-and-gane.patch +Patch0551: 0551-common-ha-ensure-shared_storage-is-mounted-before-se.patch +Patch0552: 0552-cluster-afr-Change-default-self-heal-window-size-to-.patch +Patch0553: 0553-cluster-ec-Change-self-heal-window-size-to-4MiB-by-d.patch +Patch0554: 0554-dht-fix-rebalance-of-sparse-files.patch +Patch0555: 0555-geo-rep-Improve-handling-of-gfid-mismatches.patch +Patch0556: 0556-dht-don-t-ignore-xdata-in-fgetxattr.patch +Patch0557: 0557-cluster-dht-Fix-stack-overflow-in-readdir-p.patch +Patch0558: 0558-afr-fix-directory-entry-count.patch +Patch0559: 0559-afr-make-fsync-post-op-aware-of-inodelk-count-2273.patch +Patch0560: 0560-posix-Avoid-dict_del-logs-in-posix_is_layout_stale-w.patch +Patch0561: 0561-cluster-ec-Inform-failure-when-some-bricks-are-unava.patch +Patch0562: 0562-shard.c-Fix-formatting.patch +Patch0563: 
0563-features-shard-Use-fd-lookup-post-file-open.patch +Patch0564: 0564-store.c-glusterd-store.c-remove-sys_stat-calls.patch +Patch0565: 0565-libglusterfs-coverity-pointer-to-local-outside-the-s.patch +Patch0566: 0566-enahancement-debug-Option-to-generate-core-dump-with.patch +Patch0567: 0567-inode-create-inode-outside-locked-region.patch +Patch0568: 0568-core-tcmu-runner-process-continuous-growing-logs-lru.patch +Patch0569: 0569-features-shard-optimization-over-shard-lookup-in-cas.patch +Patch0570: 0570-features-shard-avoid-repeatative-calls-to-gf_uuid_un.patch +Patch0571: 0571-NetBSD-build-fixes.patch +Patch0572: 0572-locks-remove-unused-conditional-switch-to-spin_lock-.patch +Patch0573: 0573-features-shard-unlink-fails-due-to-nospace-to-mknod-.patch +Patch0574: 0574-features-shard-delay-unlink-of-a-file-that-has-fd_co.patch +Patch0575: 0575-libglusterfs-add-functions-to-calculate-time-differe.patch +Patch0576: 0576-rpcsvc-Add-latency-tracking-for-rpc-programs.patch +Patch0577: 0577-protocol-client-don-t-reopen-fds-on-which-POSIX-lock.patch +Patch0578: 0578-protocol-client-fallback-to-anonymous-fd-for-fsync.patch +Patch0579: 0579-cli-changing-rebal-task-ID-to-None-in-case-status-is.patch +Patch0580: 0580-cluster-dht-suppress-file-migration-error-for-node-n.patch +Patch0581: 0581-afr-don-t-reopen-fds-on-which-POSIX-locks-are-held.patch +Patch0582: 0582-protocol-client-Fix-lock-memory-leak.patch +Patch0583: 0583-protocol-client-Initialize-list-head-to-prevent-NULL.patch +Patch0584: 0584-dht-fixing-xattr-inconsistency.patch +Patch0585: 0585-ganesha_ha-ganesha_grace-RA-fails-in-start-and-or-fa.patch +Patch0586: 0586-protocol-client-Do-not-reopen-fd-post-handshake-if-p.patch +Patch0587: 0587-Update-rfc.sh-to-rhgs-3.5.6.patch +Patch0588: 0588-locks-Fix-null-gfid-in-lock-contention-notifications.patch +Patch0589: 0589-extras-fix-for-postscript-failure-on-logrotation-of-.patch +Patch0590: 0590-cluster-afr-Don-t-check-for-stale-entry-index.patch +Patch0591: 0591-afr-check-for-valid-iatt.patch +Patch0592: 0592-md-cache-fix-integer-signedness-mismatch.patch +Patch0593: 0593-dht-explicit-null-dereference.patch +Patch0594: 0594-glusterd-resource-leaks.patch +Patch0595: 0595-glusterd-use-after-free-coverity-issue.patch +Patch0596: 0596-locks-null-dereference.patch +Patch0597: 0597-glusterd-memory-deallocated-twice.patch +Patch0598: 0598-glusterd-null-dereference.patch +Patch0599: 0599-afr-null-dereference-nagative-value.patch +Patch0600: 0600-dht-xlator-integer-handling-issue.patch +Patch0601: 0601-coverity-resource-leak-2321.patch +Patch0602: 0602-coverity-null-dereference-2395.patch +Patch0603: 0603-Coverity-Resource-leak-fix-CID-1356547.patch +Patch0604: 0604-Coverity-Fix-dereference-before-null-check-CID-13914.patch +Patch0605: 0605-Coverity-Fix-copy-into-fixed-size-buffer-CID-1325542.patch +Patch0606: 0606-dht-handle-DHT_SUBVOL_STATUS_KEY-in-dht_pt_getxattr-.patch +Patch0607: 0607-SELinux-Fix-boolean-management.patch +Patch0608: 0608-cluster-ec-Track-heal-statistics-in-shd.patch +Patch0609: 0609-feature-shard-wrong-dname-results-in-dentry-not-foun.patch +Patch0610: 0610-glusterfs.spec.in-remove-condtionals-from-tar-depend.patch +Patch0611: 0611-SELinux-Fix-boolean-management-again.patch %description GlusterFS is a distributed file-system capable of scaling to several @@ -998,6 +1134,8 @@ Requires: python%{_pythonver}-gluster = %{version}-%{release} Requires: rsync Requires: util-linux Requires: %{name}-libs%{?_isa} = %{version}-%{release} +Requires: tar + # required for setting selinux bools %if 
( 0%{?rhel} && 0%{?rhel} >= 8 ) Requires(post): policycoreutils-python-utils @@ -1501,7 +1639,10 @@ exit 0 %if ( 0%{!?_without_server:1} ) %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) %post ganesha -semanage boolean -m ganesha_use_fusefs --on +# first install +if [ $1 -eq 1 ]; then + %selinux_set_booleans ganesha_use_fusefs=1 +fi exit 0 %endif %endif @@ -1509,7 +1650,9 @@ exit 0 %if ( 0%{!?_without_georeplication:1} ) %post geo-replication %if ( 0%{?rhel} && 0%{?rhel} >= 8 ) -%selinux_set_booleans %{selinuxbooleans} +if [ $1 -eq 1 ]; then + %selinux_set_booleans %{selinuxbooleans} +fi %endif if [ $1 -ge 1 ]; then %systemd_postun_with_restart glusterd @@ -1636,7 +1779,20 @@ exit 0 %if ( 0%{!?_without_server:1} ) %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) %postun ganesha -semanage boolean -m ganesha_use_fusefs --off +if [ $1 -eq 0 ]; then + # use the value of ganesha_use_fusefs from before glusterfs-ganesha was installed + %selinux_unset_booleans ganesha_use_fusefs=1 +fi +exit 0 +%endif +%endif + +%if ( 0%{!?_without_georeplication:1} ) +%postun geo-replication +%if ( 0%{?rhel} && 0%{?rhel} >= 8 ) +if [ $1 -eq 0 ]; then + %selinux_unset_booleans %{selinuxbooleans} +fi exit 0 %endif %endif @@ -1646,19 +1802,20 @@ exit 0 ## %if ( 0%{!?_without_server:1} ) %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) -%trigger ganesha -- selinux-policy-targeted -semanage boolean -m ganesha_use_fusefs --on +# ensure ganesha_use_fusefs is on in case of policy mode switch (eg. mls->targeted) +%triggerin ganesha -- selinux-policy-targeted +semanage boolean -m ganesha_use_fusefs --on -S targeted exit 0 %endif %endif -##----------------------------------------------------------------------------- -## All %%triggerun should be placed here and keep them sorted -## %if ( 0%{!?_without_server:1} ) -%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) -%triggerun ganesha -- selinux-policy-targeted -semanage boolean -m ganesha_use_fusefs --off +%if ( ( 0%{?fedora} && 0%{?fedora} > 25 ) || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) +# temporary fix to be removed in the next version (i.e. RHGS 3.5.7). This +# is only needed when upgrading from the flawed versions (e.g. RHGS 3.5.5 +# and earlier.) 
+%posttrans ganesha +semanage boolean -m ganesha_use_fusefs --on -S targeted exit 0 %endif %endif @@ -1930,7 +2087,6 @@ exit 0 %if ( 0%{!?_without_server:1} ) %files server %doc extras/clear_xattrs.sh -%{_datadir}/glusterfs/scripts/xattr_analysis.py* %{_datadir}/glusterfs/scripts/quota_fsck.py* # sysconf %config(noreplace) %{_sysconfdir}/glusterfs @@ -2533,11 +2689,55 @@ fi %endif %changelog -* Tue Mar 16 2021 CentOS Sources - 6.0-49.1.el7.centos -- remove vendor and/or packager lines +* Mon Nov 29 2021 Gluster Jenkins - 6.0-61 +- fixes bugs bz#1973566 + +* Mon Oct 11 2021 Gluster Jenkins - 6.0-60 +- fixes bugs bz#1668303 bz#1853631 bz#1901468 bz#1904137 bz#1911665 + bz#1962972 bz#1973566 bz#1994593 bz#1995029 bz#1997447 bz#2006205 + +* Tue Jul 06 2021 Gluster Jenkins - 6.0-59 +- fixes bugs bz#1689375 + +* Wed Jun 16 2021 Gluster Jenkins - 6.0-58 +- fixes bugs bz#1945143 + +* Tue Jun 08 2021 Gluster Jenkins - 6.0-57 +- fixes bugs bz#1600379 bz#1689375 bz#1782428 bz#1798897 bz#1815462 + bz#1889966 bz#1891403 bz#1901468 bz#1903911 bz#1908635 bz#1917488 bz#1918018 + bz#1919132 bz#1925425 bz#1927411 bz#1927640 bz#1928676 bz#1942816 bz#1943467 + bz#1945143 bz#1946171 bz#1957191 bz#1957641 + +* Thu May 06 2021 Gluster Jenkins - 6.0-56.2 +- fixes bugs bz#1953901 + +* Thu Apr 22 2021 Gluster Jenkins - 6.0-56.1 +- fixes bugs bz#1927235 + +* Wed Apr 14 2021 Gluster Jenkins - 6.0-56 +- fixes bugs bz#1948547 + +* Fri Mar 19 2021 Gluster Jenkins - 6.0-55 +- fixes bugs bz#1939372 + +* Wed Mar 03 2021 Gluster Jenkins - 6.0-54 +- fixes bugs bz#1832306 bz#1911292 bz#1924044 + +* Thu Feb 11 2021 Gluster Jenkins - 6.0-53 +- fixes bugs bz#1224906 bz#1691320 bz#1719171 bz#1814744 bz#1865796 + +* Thu Jan 28 2021 Gluster Jenkins - 6.0-52 +- fixes bugs bz#1600459 bz#1719171 bz#1830713 bz#1856574 + +* Mon Dec 28 2020 Gluster Jenkins - 6.0-51 +- fixes bugs bz#1640148 bz#1856574 bz#1910119 -* Fri Feb 19 2021 Gluster Jenkins - 6.0-49.1 -- fixes bugs bz#1930561 +* Tue Dec 15 2020 Gluster Jenkins - 6.0-50 +- fixes bugs bz#1224906 bz#1412494 bz#1612973 bz#1663821 bz#1691320 + bz#1726673 bz#1749304 bz#1752739 bz#1779238 bz#1813866 bz#1814744 bz#1821599 + bz#1832306 bz#1835229 bz#1842449 bz#1865796 bz#1878077 bz#1882923 bz#1885966 + bz#1890506 bz#1896425 bz#1898776 bz#1898777 bz#1898778 bz#1898781 bz#1898784 + bz#1903468 * Wed Nov 25 2020 Gluster Jenkins - 6.0-49 - fixes bugs bz#1286171