diff --git a/SOURCES/0544-RHGS-3.5.4-rebuild-to-ship-with-RHEL-8.5.patch b/SOURCES/0544-RHGS-3.5.4-rebuild-to-ship-with-RHEL-8.5.patch deleted file mode 100644 index 171ed10..0000000 --- a/SOURCES/0544-RHGS-3.5.4-rebuild-to-ship-with-RHEL-8.5.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 840f437d232fbafac9f4448b0f8d0e9976ea1e1d Mon Sep 17 00:00:00 2001 -From: Tamar Shacked -Date: Mon, 23 Aug 2021 20:46:13 +0300 -Subject: [PATCH 544/544] RHGS-3.5.4: rebuild to ship with RHEL-8.5 - -Label: DOWNSTREAM ONLY -BUG: 1996984 - -Signed-off-by: Tamar Shacked -Change-Id: Idafc64b8ee5da165c87428b8a5166cf319ef7660 -Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/267350 -Tested-by: RHGS Build Bot -Reviewed-by: Sunil Kumar Heggodu Gopala Acharya ---- - glusterfs.spec.in | 2 ++ - rfc.sh | 2 +- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/glusterfs.spec.in b/glusterfs.spec.in -index 2be7677..4511979 100644 ---- a/glusterfs.spec.in -+++ b/glusterfs.spec.in -@@ -1982,6 +1982,8 @@ fi - %endif - - %changelog -+* Tue Aug 24 2021 Tamar Shacked -+- build RGHS client for RHEL-8.5 (#1996984) - - * Mon May 11 2020 Sunny Kumar - - added requires policycoreutils-python-utils on rhel8 for geo-replication -diff --git a/rfc.sh b/rfc.sh -index c0559b9..b1153be 100755 ---- a/rfc.sh -+++ b/rfc.sh -@@ -315,7 +315,7 @@ main() - if [ -z "${reference}" ]; then - $drier git push $ORIGIN HEAD:refs/for/$branch/rfc; - else -- $drier git push $ORIGIN HEAD:refs/for/$branch/ref-${reference}; -+ $drier git push $ORIGIN HEAD:refs/for/$branch; - fi - } - --- -1.8.3.1 - diff --git a/SOURCES/0544-tests-avoid-empty-paths-in-environment-variables.patch b/SOURCES/0544-tests-avoid-empty-paths-in-environment-variables.patch new file mode 100644 index 0000000..cb5e80b --- /dev/null +++ b/SOURCES/0544-tests-avoid-empty-paths-in-environment-variables.patch @@ -0,0 +1,86 @@ +From 3eaf937e69fe4219738c93d39af1cc909b1ee3f8 Mon Sep 17 00:00:00 2001 +From: Rinku Kothiya +Date: Fri, 23 Apr 2021 09:30:35 +0000 +Subject: [PATCH 544/584] tests: avoid empty paths in environment variables + +Many variables containing paths in env.rc.in are defined in a way +that leave a trailing ':' in the variable when the previous value +was empty or undefined. + +In the particular case of 'LD_PRELOAD_PATH' variable, this causes +that the system looks for dynamic libraries in the current working +directory. When this directory is inside a Gluster mount point, a +significant delay is caused each time a program is run (and testing +framework can run lots of programs for each test). + +This patch prevents that variables containing paths could end with +a trailing ':'. 
+ +Backport of : +>Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2349 +>Fixes: #2348 +>Change-Id: I669f5a78e14f176c0a58824ba577330989d84769 +>Signed-off-by: Xavi Hernandez +>Signed-off-by: Rinku Kothiya + +Change-Id: Ie903ca443aa4789553ac4687818a7f69c113af41 +Signed-off-by: Rinku Kothiya +--- + tests/env.rc.in | 17 +++++++---------- + 1 file changed, 7 insertions(+), 10 deletions(-) + +diff --git a/tests/env.rc.in b/tests/env.rc.in +index 1f0ca88..2d8ff0e 100644 +--- a/tests/env.rc.in ++++ b/tests/env.rc.in +@@ -2,34 +2,31 @@ prefix=@prefix@ + exec_prefix=@exec_prefix@ + libdir=@libdir@ + +-PATH=@sbindir@:$PATH ++PATH=@bindir@:@sbindir@${PATH:+:${PATH}} + export PATH + + GLUSTERD_PIDFILEDIR=@localstatedir@/run/gluster + export GLUSTERD_PIDFILEDIR + +-LD_LIBRARY_PATH=@libdir@:$LD_LIBRARY_PATH ++LD_LIBRARY_PATH=@libdir@${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + export LD_LIBRARY_PATH + +-LIBRARY_PATH=@libdir@:$LIBRARY_PATH ++LIBRARY_PATH=@libdir@${LIBRARY_PATH:+:${LIBRARY_PATH}} + export LIBRARY_PATH + +-CPATH=@includedir@:$CPATH ++CPATH=@includedir@${CPATH:+:${CPATH}} + export CPATH + + GLUSTERD_WORKDIR=@GLUSTERD_WORKDIR@ + export GLUSTERD_WORKDIR + +-PKG_CONFIG_PATH=@pkgconfigdir@:$PKG_CONFIG_PATH ++PKG_CONFIG_PATH=@pkgconfigdir@${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}} + export PKG_CONFIG_PATH + +-PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@:$PYTHON_PATH +-export PYTHONPATH +- + PYTHON=@PYTHON@ + export PYTHON + +-PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@:$PYTHON_PATH ++PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@${PYTHONPATH:+:${PYTHONPATH}} + export PYTHONPATH + + GLUSTER_CMD_DIR=@sbindir@ +@@ -42,4 +39,4 @@ RUN_NFS_TESTS=@BUILD_GNFS@ + export RUN_NFS_TESTS + + GLUSTER_XLATOR_DIR=@libdir@/glusterfs/@PACKAGE_VERSION@/xlator +-export GLUSTER_XLATOR_DIR +\ No newline at end of file ++export GLUSTER_XLATOR_DIR +-- +1.8.3.1 + diff --git a/SOURCES/0545-tests-Excluded-tests-for-unsupported-components.patch b/SOURCES/0545-tests-Excluded-tests-for-unsupported-components.patch new file mode 100644 index 0000000..add8025 --- /dev/null +++ b/SOURCES/0545-tests-Excluded-tests-for-unsupported-components.patch @@ -0,0 +1,32 @@ +From 6b340470e01dc177767fae990cf19037202140b7 Mon Sep 17 00:00:00 2001 +From: Tamar Shacked +Date: Mon, 31 May 2021 21:27:41 +0300 +Subject: [PATCH 545/584] tests: Excluded tests for unsupported components + +Quota and Tier are depricated from RHGS-3.5.5. +Stop running regression tests for them. 
+ +Label: DOWNSTREAM ONLY + +Signed-off-by: Tamar Shacked +Change-Id: I3ca1aacba9a31129f5e68fcffdd80e69e51f7bcc +--- + run-tests.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/run-tests.sh b/run-tests.sh +index c835d93..5cc18b0 100755 +--- a/run-tests.sh ++++ b/run-tests.sh +@@ -349,7 +349,7 @@ function run_tests() + fi + + for t in $(find ${regression_testsdir}/tests -name '*.t' \ +- | LC_COLLATE=C sort) ; do ++ | egrep -v "tier|quota" | LC_COLLATE=C sort) ; do + old_cores=$(ls /*-*.core 2> /dev/null | wc -l) + total_tests=$((total_tests+1)) + if match $t "$@" ; then +-- +1.8.3.1 + diff --git a/SOURCES/0546-Update-rfc.sh-to-rhgs-3.5.5.patch b/SOURCES/0546-Update-rfc.sh-to-rhgs-3.5.5.patch new file mode 100644 index 0000000..935f533 --- /dev/null +++ b/SOURCES/0546-Update-rfc.sh-to-rhgs-3.5.5.patch @@ -0,0 +1,36 @@ +From 6ff3314f24687c8224a5520f9c4d2b3c39e730b7 Mon Sep 17 00:00:00 2001 +From: Tamar Shacked +Date: Tue, 1 Jun 2021 13:02:24 +0300 +Subject: [PATCH 546/584] Update rfc.sh to rhgs-3.5.5 + +Signed-off-by: Tamar Shacked +Change-Id: Iff543dc77174f983dd39f9fb7cc5005b49594750 +--- + rfc.sh | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/rfc.sh b/rfc.sh +index c0559b9..daeff32 100755 +--- a/rfc.sh ++++ b/rfc.sh +@@ -18,7 +18,7 @@ done + shift $((OPTIND-1)) + + +-branch="rhgs-3.5.4"; ++branch="rhgs-3.5.5"; + + set_hooks_commit_msg() + { +@@ -315,7 +315,7 @@ main() + if [ -z "${reference}" ]; then + $drier git push $ORIGIN HEAD:refs/for/$branch/rfc; + else +- $drier git push $ORIGIN HEAD:refs/for/$branch/ref-${reference}; ++ $drier git push $ORIGIN HEAD:refs/for/$branch; + fi + } + +-- +1.8.3.1 + diff --git a/SOURCES/0547-perf-write-behind-Clear-frame-local-on-conflict-erro.patch b/SOURCES/0547-perf-write-behind-Clear-frame-local-on-conflict-erro.patch new file mode 100644 index 0000000..2bd8e28 --- /dev/null +++ b/SOURCES/0547-perf-write-behind-Clear-frame-local-on-conflict-erro.patch @@ -0,0 +1,47 @@ +From 08c57926118b1ab8fa1fcd5b16913ff22d97d065 Mon Sep 17 00:00:00 2001 +From: N Balachandran +Date: Wed, 25 Sep 2019 19:50:27 +0530 +Subject: [PATCH 547/584] perf/write-behind: Clear frame->local on conflict + error + +WB saves the wb_inode in frame->local for the truncate and +ftruncate fops. This value is not cleared in case of error +on a conflicting write request. FRAME_DESTROY finds a non-null +frame->local and tries to free it using mem_put. However, +wb_inode is allocated using GF_CALLOC, causing the +process to crash. 
+ +credit: vpolakis@gmail.com + +Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/23485/ +>Change-Id: I217f61470445775e05145aebe44c814731c1b8c5 +>Fixes: bz#1753592 +>Signed-off-by: N Balachandran + +BUG: 1917488 +Change-Id: I217f61470445775e05145aebe44c814731c1b8c5 +Signed-off-by: Sunil Kumar H G +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244277 +Tested-by: RHGS Build Bot +--- + xlators/performance/write-behind/src/write-behind.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c +index 90a0bcf..31ab723 100644 +--- a/xlators/performance/write-behind/src/write-behind.c ++++ b/xlators/performance/write-behind/src/write-behind.c +@@ -1523,6 +1523,10 @@ __wb_handle_failed_conflict(wb_request_t *req, wb_request_t *conflict, + */ + req->op_ret = -1; + req->op_errno = conflict->op_errno; ++ if ((req->stub->fop == GF_FOP_TRUNCATE) || ++ (req->stub->fop == GF_FOP_FTRUNCATE)) { ++ req->stub->frame->local = NULL; ++ } + + list_del_init(&req->todo); + list_add_tail(&req->winds, tasks); +-- +1.8.3.1 + diff --git a/SOURCES/0548-Add-tar-as-dependency-to-geo-rep-rpm-for-RHEL-8.3-an.patch b/SOURCES/0548-Add-tar-as-dependency-to-geo-rep-rpm-for-RHEL-8.3-an.patch new file mode 100644 index 0000000..aed347c --- /dev/null +++ b/SOURCES/0548-Add-tar-as-dependency-to-geo-rep-rpm-for-RHEL-8.3-an.patch @@ -0,0 +1,49 @@ +From cb7e72bce8b6a46605753b72919c1c839ecb4cc9 Mon Sep 17 00:00:00 2001 +From: root +Date: Thu, 3 Jun 2021 12:08:24 +0530 +Subject: [PATCH 548/584] Add tar as dependency to geo-rep rpm for RHEL 8.3 and + above + +Reason: from RHEL 8.3, tar is not bundled by default + +>Fixes: #1849 +>Signed-off-by: Shwetha K Acharya +>Change-Id: Ic1424e0550cef6a78e3e9e7b42665ab01016436f +Upstream Patch: https://github.com/gluster/glusterfs/pull/1850 + +BUG: 1901468 +Change-Id: Ic1424e0550cef6a78e3e9e7b42665ab01016436f +Signed-off-by: Shwetha K Acharya +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244896 +Tested-by: RHGS Build Bot +Reviewed-by: Srijan Sivakumar +--- + glusterfs.spec.in | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 2be7677..424f4ab 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -521,6 +521,9 @@ Requires: python%{_pythonver}-gluster = %{version}-%{release} + Requires: rsync + Requires: util-linux + Requires: %{name}-libs%{?_isa} = %{version}-%{release} ++%if ( 0%{?rhel} && ( ( 0%{?rhel} == 8 && 0%{?rhel_minor_version} >= 3 ) || 0%{?rhel} >= 9 ) ) ++Requires: tar ++%endif + # required for setting selinux bools + %if ( 0%{?rhel} && 0%{?rhel} >= 8 ) + Requires(post): policycoreutils-python-utils +@@ -1982,6 +1985,8 @@ fi + %endif + + %changelog ++* Thu Nov 26 2020 Shwetha K Acharya ++- Add tar as dependency to georeplication rpm for RHEL version >= 8.3 + + * Mon May 11 2020 Sunny Kumar + - added requires policycoreutils-python-utils on rhel8 for geo-replication +-- +1.8.3.1 + diff --git a/SOURCES/0549-geo-rep-Change-in-attribute-for-getting-function-nam.patch b/SOURCES/0549-geo-rep-Change-in-attribute-for-getting-function-nam.patch new file mode 100644 index 0000000..b61e5ea --- /dev/null +++ b/SOURCES/0549-geo-rep-Change-in-attribute-for-getting-function-nam.patch @@ -0,0 +1,45 @@ +From f90c13912a9c64e4479b55fee4ba4ac50e509302 Mon Sep 17 00:00:00 2001 +From: schaffung +Date: Sat, 9 Jan 2021 15:41:15 +0530 +Subject: [PATCH 549/584] geo-rep : Change in 
attribute for getting function + name in py 3 (#1900) + +Issue: The schedule_geo-rep script uses `func_name` to obtain +the name of the function being referred to but from python3 +onwards, the attribute has been changed to `__name__`. + +Code Change: + Changing `func_name` to `__name__`. + +>Fixes: #1898 +>Signed-off-by: srijan-sivakumar +>Change-Id: I4ed69a06cffed9db17c8f8949b8000c74be1d717 +Upstream Patch : https://github.com/gluster/glusterfs/pull/1900 + +BUG: 1903911 +Change-Id: I4ed69a06cffed9db17c8f8949b8000c74be1d717 +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244570 +Tested-by: RHGS Build Bot +Reviewed-by: Shwetha Acharya +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/geo-rep/schedule_georep.py.in | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/extras/geo-rep/schedule_georep.py.in b/extras/geo-rep/schedule_georep.py.in +index ac93716..9bb3df5 100644 +--- a/extras/geo-rep/schedule_georep.py.in ++++ b/extras/geo-rep/schedule_georep.py.in +@@ -102,7 +102,7 @@ def cache_output_with_args(func): + """ + def wrapper(*args, **kwargs): + global cache_data +- key = "_".join([func.func_name] + list(args)) ++ key = "_".join([func.__name__] + list(args)) + if cache_data.get(key, None) is None: + cache_data[key] = func(*args, **kwargs) + +-- +1.8.3.1 + diff --git a/SOURCES/0550-common-ha-stability-fixes-for-ganesha_grace-and-gane.patch b/SOURCES/0550-common-ha-stability-fixes-for-ganesha_grace-and-gane.patch new file mode 100644 index 0000000..8bc6694 --- /dev/null +++ b/SOURCES/0550-common-ha-stability-fixes-for-ganesha_grace-and-gane.patch @@ -0,0 +1,184 @@ +From 053bb9c7356eae82b1089582bb2844388ae4df57 Mon Sep 17 00:00:00 2001 +From: "Kaleb S. KEITHLEY" +Date: Wed, 2 Jun 2021 07:49:12 -0400 +Subject: [PATCH 550/584] common-ha: stability fixes for ganesha_grace and + ganesha_mon RAs + +Include fixes suggested by ClusterHA devs. + +1) It turns out that crm_attribute attrs and attrd_updater attrs really +are one and the same, despite what I was told years ago. + +attrs created with crm_attribute ... --lifetime=reboot ... or +attrd_updater are one and same. As per ClusterHA devs having an attr +created with crm_attribute ... --lifetime=forever and also +creating/updating the same attr with attrd_updater is a recipe for +weird things to happen that will be difficult to debug. + +2) using hostname -s or hostname for node names in crm_attribute and +attrd_updater potentially could use the wrong name if the host has +been renamed; use ocf_local_nodename() (in ocf-shellfuncs) instead. + +https://github.com/gluster/glusterfs/issues/2276 +https://github.com/gluster/glusterfs/pull/2283 +commit 9bd2c697686ec40e2c4f711df961860c8a735baa + +Change-Id:If572d396fae9206628714fb2ce00f72e94f2258f +BUG: 1945143 +Signed-off-by: Kaleb S. 
KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244593 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/ganesha/ocf/ganesha_grace | 28 +++++++++--------------- + extras/ganesha/ocf/ganesha_mon | 47 ++++++++++++++-------------------------- + 2 files changed, 26 insertions(+), 49 deletions(-) + +diff --git a/extras/ganesha/ocf/ganesha_grace b/extras/ganesha/ocf/ganesha_grace +index 825f716..edc6fa2 100644 +--- a/extras/ganesha/ocf/ganesha_grace ++++ b/extras/ganesha/ocf/ganesha_grace +@@ -94,25 +94,21 @@ esac + ganesha_grace_start() + { + local rc=${OCF_ERR_GENERIC} +- local host=$(hostname -s) ++ local host=$(ocf_local_nodename) + +- ocf_log debug "ganesha_grace_start()" +- # give ganesha_mon RA a chance to set the crm_attr first ++ ocf_log debug "ganesha_grace_start ${host}" ++ # give ganesha_mon RA a chance to set the attr first + # I mislike the sleep, but it's not clear that looping + # with a small sleep is necessarily better + # start has a 40sec timeout, so a 5sec sleep here is okay + sleep 5 +- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) ++ attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) + if [ $? -ne 0 ]; then +- host=$(hostname) +- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null ) +- if [ $? -ne 0 ]; then +- ocf_log info "grace start: crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" +- fi ++ ocf_log info "grace start: attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" + fi + + # Three possibilities: +- # 1. There is no attribute at all and attr_updater returns ++ # 1. There is no attribute at all and attrd_updater returns + # a zero length string. This happens when + # ganesha_mon::monitor hasn't run at least once to set + # the attribute. The assumption here is that the system +@@ -164,17 +160,13 @@ ganesha_grace_notify() + + ganesha_grace_monitor() + { +- local host=$(hostname -s) ++ local host=$(ocf_local_nodename) + +- ocf_log debug "monitor" ++ ocf_log debug "ganesha_grace monitor ${host}" + +- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) ++ attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) + if [ $? -ne 0 ]; then +- host=$(hostname) +- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) +- if [ $? -ne 0 ]; then +- ocf_log info "crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" +- fi ++ ocf_log info "attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" + fi + + # if there is no attribute (yet), maybe it's because +diff --git a/extras/ganesha/ocf/ganesha_mon b/extras/ganesha/ocf/ganesha_mon +index 2b4a9d6..7fbbf70 100644 +--- a/extras/ganesha/ocf/ganesha_mon ++++ b/extras/ganesha/ocf/ganesha_mon +@@ -124,7 +124,6 @@ ganesha_mon_stop() + + ganesha_mon_monitor() + { +- local host=$(hostname -s) + local pid_file="/var/run/ganesha.pid" + local rhel6_pid_file="/var/run/ganesha.nfsd.pid" + local proc_pid="/proc/" +@@ -141,31 +140,27 @@ ganesha_mon_monitor() + + if [ "x${proc_pid}" != "x/proc/" -a -d ${proc_pid} ]; then + +- attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1 ++ attrd_updater --name ${OCF_RESKEY_ganesha_active} -v 1 + if [ $? 
-ne 0 ]; then +- ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1 failed" ++ ocf_log info "warning: attrd_updater --name ${OCF_RESKEY_ganesha_active} -v 1 failed" + fi + + # ganesha_grace (nfs-grace) RA follows grace-active attr + # w/ constraint location +- attrd_updater -n ${OCF_RESKEY_grace_active} -v 1 ++ attrd_updater --name ${OCF_RESKEY_grace_active} -v 1 + if [ $? -ne 0 ]; then +- ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_grace_active} -v 1 failed" ++ ocf_log info "warning: attrd_updater --name ${OCF_RESKEY_grace_active} -v 1 failed" + fi + + # ganesha_mon (nfs-mon) and ganesha_grace (nfs-grace) +- # track grace-active crm_attr (attr != crm_attr) +- # we can't just use the attr as there's no way to query +- # its value in RHEL6 pacemaker +- +- crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null +- if [ $? -ne 0 ]; then +- host=$(hostname) +- crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null +- if [ $? -ne 0 ]; then +- ocf_log info "mon monitor warning: crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 failed" +- fi +- fi ++ # track grace-active attr. ++ # ++ # Originally we were told that attrs set with attrd_updater ++ # are different/distinct than attrs set with crm_attribute. ++ # Now, years later, we are told that they are the same and ++ # that the values of attrs set with attrd_updater can be ++ # retrieved with crm_attribute. Or with attrd_updater -Q ++ # now that we no longer have to deal with rhel6. + + return ${OCF_SUCCESS} + fi +@@ -182,26 +177,16 @@ ganesha_mon_monitor() + # the remaining ganesha.nfsds into grace before + # initiating the VIP fail-over. + +- attrd_updater -D -n ${OCF_RESKEY_grace_active} +- if [ $? -ne 0 ]; then +- ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_grace_active} failed" +- fi +- +- host=$(hostname -s) +- crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null ++ attrd_updater --delete --name ${OCF_RESKEY_grace_active} + if [ $? -ne 0 ]; then +- host=$(hostname) +- crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null +- if [ $? -ne 0 ]; then +- ocf_log info "mon monitor warning: crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 failed" +- fi ++ ocf_log info "warning: attrd_updater --delete --name ${OCF_RESKEY_grace_active} failed" + fi + + sleep ${OCF_RESKEY_grace_delay} + +- attrd_updater -D -n ${OCF_RESKEY_ganesha_active} ++ attrd_updater --delete --name ${OCF_RESKEY_ganesha_active} + if [ $? -ne 0 ]; then +- ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_ganesha_active} failed" ++ ocf_log info "warning: attrd_updater --delete --name ${OCF_RESKEY_ganesha_active} failed" + fi + + return ${OCF_SUCCESS} +-- +1.8.3.1 + diff --git a/SOURCES/0551-common-ha-ensure-shared_storage-is-mounted-before-se.patch b/SOURCES/0551-common-ha-ensure-shared_storage-is-mounted-before-se.patch new file mode 100644 index 0000000..e3a107f --- /dev/null +++ b/SOURCES/0551-common-ha-ensure-shared_storage-is-mounted-before-se.patch @@ -0,0 +1,52 @@ +From fcfd40132624df5e888d53b4a8c4ce1cf7087413 Mon Sep 17 00:00:00 2001 +From: "Kaleb S. 
KEITHLEY" +Date: Wed, 2 Jun 2021 07:40:04 -0400 +Subject: [PATCH 551/584] common-ha: ensure shared_storage is mounted before + setup (#2296) + +If gluster shared-storage isn't mounted, ganesha will fail to start + +commit a249b9020d281d0482db0aeb52e8856acd931e02 +https://github.com/gluster/glusterfs/issues/2278 +https://github.com/gluster/glusterfs/pull/2296 + +Change-Id: I6ed7044ea6b6c61b013ebe17088bfde311b109b7 +BUG: 1918018 +Signed-off-by: Kaleb S. KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244592 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/ganesha/scripts/ganesha-ha.sh | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/extras/ganesha/scripts/ganesha-ha.sh b/extras/ganesha/scripts/ganesha-ha.sh +index 491c61d..012084f 100644 +--- a/extras/ganesha/scripts/ganesha-ha.sh ++++ b/extras/ganesha/scripts/ganesha-ha.sh +@@ -195,9 +195,22 @@ setup_cluster() + local servers=${3} + local unclean="" + local quorum_policy="stop" ++ local dfresult="" + + logger "setting up cluster ${name} with the following ${servers}" + ++ # check that shared_storage is mounted ++ dfresult=$(df -T ${HA_VOL_MNT}) ++ if [[ -z "${dfresult}" ]]; then ++ logger "gluster shared_storage is not mounted, exiting..." ++ exit 1 ++ fi ++ ++ if [[ "${dfresult}" != *"fuse.glusterfs"* ]]; then ++ logger "gluster shared_storage is not mounted, exiting..." ++ exit 1 ++ fi ++ + # pcs cluster setup --force ${PCS9OR10_PCS_CNAME_OPTION} ${name} ${servers} + pcs cluster setup --force ${PCS9OR10_PCS_CNAME_OPTION} ${name} --enable ${servers} + if [ $? -ne 0 ]; then +-- +1.8.3.1 + diff --git a/SOURCES/0552-cluster-afr-Change-default-self-heal-window-size-to-.patch b/SOURCES/0552-cluster-afr-Change-default-self-heal-window-size-to-.patch new file mode 100644 index 0000000..41b94cd --- /dev/null +++ b/SOURCES/0552-cluster-afr-Change-default-self-heal-window-size-to-.patch @@ -0,0 +1,67 @@ +From e9e1b0bc6e2deaf44190636ab6826065ed3c0392 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar Karampuri +Date: Wed, 3 Feb 2021 18:10:40 +0530 +Subject: [PATCH 552/584] cluster/afr: Change default self-heal-window-size to + 1MB (#2068) + +At the moment self-heal-window-size is 128KB. This leads to healing data +in 128KB chunks. With the growth of data and the avg file sizes +nowadays, 1MB seems like a better default. + +Upstream patch details: +> https://github.com/gluster/glusterfs/pull/2111 +> Change-Id: I70c42c83b16c7adb53d6b5762969e878477efb5c +> Fixes: #2067 +> Signed-off-by: Pranith Kumar K + +BUG: 1946171 +Change-Id: Icd6a5c02ca16a1a6095f7bc10feed8ddc2505f41 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244557 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-self-heal-data.c | 6 ++++++ + xlators/cluster/afr/src/afr.c | 6 +++--- + 2 files changed, 9 insertions(+), 3 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c +index b97c66b..156cb18 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-data.c ++++ b/xlators/cluster/afr/src/afr-self-heal-data.c +@@ -337,6 +337,12 @@ afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + } + + block = 128 * 1024 * priv->data_self_heal_window_size; ++ if (HAS_HOLES((&replies[source].poststat))) { ++ /*Reduce the possibility of data-block allocations in case of files ++ * with holes. 
Correct way to fix it would be to use seek fop while ++ * healing data*/ ++ block = 128 * 1024; ++ } + + type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies); + +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index 33fe4d8..0956e5a 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -910,12 +910,12 @@ struct volume_options options[] = { + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 1024, +- .default_value = "1", ++ .default_value = "8", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, +- .description = "Maximum number blocks per file for which self-heal " +- "process would be applied simultaneously."}, ++ .description = "Maximum number of 128KB blocks per file for which " ++ "self-heal process would be applied simultaneously."}, + {.key = {"metadata-self-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", +-- +1.8.3.1 + diff --git a/SOURCES/0553-cluster-ec-Change-self-heal-window-size-to-4MiB-by-d.patch b/SOURCES/0553-cluster-ec-Change-self-heal-window-size-to-4MiB-by-d.patch new file mode 100644 index 0000000..2144845 --- /dev/null +++ b/SOURCES/0553-cluster-ec-Change-self-heal-window-size-to-4MiB-by-d.patch @@ -0,0 +1,46 @@ +From 1fa01865eb9bf6a1113669c262fc526ef11f61f2 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Sat, 6 Feb 2021 01:53:28 +0100 +Subject: [PATCH 553/584] cluster/ec: Change self-heal-window-size to 4MiB by + default (#2071) + +The current block size used for self-heal by default is 128 KiB. This +requires a significant amount of management requests for a very small +portion of data healed. + +With this patch the block size is increased to 4 MiB. For a standard +EC volume configuration of 4+2, this means that each healed block of +a file will update 1 MiB on each brick. + +Upstream patch details: +> https://github.com/gluster/glusterfs/pull/2071 +> Change-Id: Ifeec4a2d54988017d038085720513c121b03445b +> Updates: #2067 +> Signed-off-by: Xavi Hernandez + +BUG: 1946171 +Change-Id: I9e3eed2d83c9de54242e6161b2e3951c2f6f8000 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244558 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/ec/src/ec.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c +index 4118c3b..a930089 100644 +--- a/xlators/cluster/ec/src/ec.c ++++ b/xlators/cluster/ec/src/ec.c +@@ -1644,7 +1644,7 @@ struct volume_options options[] = { + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 1024, +- .default_value = "1", ++ .default_value = "32", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"disperse"}, +-- +1.8.3.1 + diff --git a/SOURCES/0554-dht-fix-rebalance-of-sparse-files.patch b/SOURCES/0554-dht-fix-rebalance-of-sparse-files.patch new file mode 100644 index 0000000..935303b --- /dev/null +++ b/SOURCES/0554-dht-fix-rebalance-of-sparse-files.patch @@ -0,0 +1,245 @@ +From 2cb90b7798fa469f2d7d938ae88733eb1962d63d Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Fri, 9 Apr 2021 18:13:30 +0200 +Subject: [PATCH 554/584] dht: fix rebalance of sparse files + +Current implementation of rebalance for sparse files has a bug that, +in some cases, causes a read of 0 bytes from the source subvolume. 
+Posix xlator doesn't allow 0 byte reads and fails them with EINVAL, +which causes rebalance to abort the migration. + +This patch implements a more robust way of finding data segments in +a sparse file that avoids 0 byte reads, allowing the file to be +migrated successfully. + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2318 +> Fixes: #2317 +> Change-Id: Iff168dda2fb0f2edf716b21eb04cc2cc8ac3915c +> Signed-off-by: Xavi Hernandez + +BUG: 1957641 +Change-Id: Iff168dda2fb0f2edf716b21eb04cc2cc8ac3915c +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244551 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/distribute/issue-2317.t | 29 ++++++++ + tests/volume.rc | 4 ++ + xlators/cluster/dht/src/dht-rebalance.c | 116 +++++++++++++++++--------------- + 3 files changed, 93 insertions(+), 56 deletions(-) + create mode 100755 tests/bugs/distribute/issue-2317.t + +diff --git a/tests/bugs/distribute/issue-2317.t b/tests/bugs/distribute/issue-2317.t +new file mode 100755 +index 0000000..e29d003 +--- /dev/null ++++ b/tests/bugs/distribute/issue-2317.t +@@ -0,0 +1,29 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++TESTS_EXPECTED_IN_LOOP=126 ++ ++cleanup ++ ++TEST glusterd ++TEST ${CLI} volume create ${V0} replica 3 ${H0}:/$B0/${V0}_{0..2} ++TEST ${CLI} volume start ${V0} ++ ++TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0} ++ ++# Create several files to make sure that at least some of them should be ++# migrated by rebalance. ++for i in {0..63}; do ++ TEST dd if=/dev/urandom of=${M0}/file.${i} bs=4k count=1 ++ TEST dd if=/dev/urandom of=${M0}/file.${i} bs=4k count=1 seek=128 ++done ++ ++TEST ${CLI} volume add-brick ${V0} ${H0}:${B0}/${V0}_{3..5} ++TEST ${CLI} volume rebalance ${V0} start force ++EXPECT_WITHIN ${REBALANCE_TIMEOUT} "completed" rebalance_status_field "${V0}" ++ ++EXPECT "^0$" rebalance_failed_field "${V0}" ++ ++cleanup +diff --git a/tests/volume.rc b/tests/volume.rc +index 9a002d9..f5dd0b1 100644 +--- a/tests/volume.rc ++++ b/tests/volume.rc +@@ -75,6 +75,10 @@ function rebalance_status_field { + $CLI volume rebalance $1 status | awk '{print $7}' | sed -n 3p + } + ++function rebalance_failed_field { ++ $CLI volume rebalance $1 status | awk '{print $5}' | sed -n 3p ++} ++ + function fix-layout_status_field { + #The fix-layout status can be up to 3 words, (ex:'fix-layout in progress'), hence the awk-print $2 thru $4. + #But if the status is less than 3 words, it also prints the next field i.e the run_time_in_secs.(ex:'completed 3.00'). +diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c +index 072896d..eab7558 100644 +--- a/xlators/cluster/dht/src/dht-rebalance.c ++++ b/xlators/cluster/dht/src/dht-rebalance.c +@@ -1024,6 +1024,46 @@ out: + return ret; + } + ++static int32_t ++dht_rebalance_sparse_segment(xlator_t *subvol, fd_t *fd, off_t *offset, ++ size_t *size) ++{ ++ off_t hole; ++ int32_t ret; ++ ++ do { ++ ret = syncop_seek(subvol, fd, *offset, GF_SEEK_DATA, NULL, offset); ++ if (ret >= 0) { ++ /* Starting at the offset of the last data segment, find the ++ * next hole. After a data segment there should always be a ++ * hole, since EOF is considered a hole. */ ++ ret = syncop_seek(subvol, fd, *offset, GF_SEEK_HOLE, NULL, &hole); ++ } ++ ++ if (ret < 0) { ++ if (ret == -ENXIO) { ++ /* This can happen if there are no more data segments (i.e. 
++ * the offset is at EOF), or there was a data segment but the ++ * file has been truncated to a smaller size between both ++ * seek requests. In both cases we are done. The file doesn't ++ * contain more data. */ ++ ret = 0; ++ } ++ return ret; ++ } ++ ++ /* It could happen that at the same offset we detected data in the ++ * first seek, there could be a hole in the second seek if user is ++ * modifying the file concurrently. In this case we need to find a ++ * new data segment to migrate. */ ++ } while (hole <= *offset); ++ ++ /* Calculate the total size of the current data block */ ++ *size = hole - *offset; ++ ++ return 1; ++} ++ + static int + __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, +@@ -1032,8 +1072,6 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + int ret = 0; + int count = 0; + off_t offset = 0; +- off_t data_offset = 0; +- off_t hole_offset = 0; + struct iovec *vector = NULL; + struct iobref *iobref = NULL; + uint64_t total = 0; +@@ -1048,71 +1086,36 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + while (total < ia_size) { + /* This is a regular file - read it sequentially */ + if (!hole_exists) { +- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) +- ? DHT_REBALANCE_BLKSIZE +- : (ia_size - total)); ++ data_block_size = ia_size - total; + } else { + /* This is a sparse file - read only the data segments in the file + */ + + /* If the previous data block is fully copied, find the next data +- * segment +- * starting at the offset of the last read and written byte, */ ++ * segment starting at the offset of the last read and written ++ * byte. */ + if (data_block_size <= 0) { +- ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL, +- &data_offset); +- if (ret) { +- if (ret == -ENXIO) +- ret = 0; /* No more data segments */ +- else +- *fop_errno = -ret; /* Error occurred */ +- ++ ret = dht_rebalance_sparse_segment(from, src, &offset, ++ &data_block_size); ++ if (ret <= 0) { ++ *fop_errno = -ret; + break; + } +- +- /* If the position of the current data segment is greater than +- * the position of the next hole, find the next hole in order to +- * calculate the length of the new data segment */ +- if (data_offset > hole_offset) { +- /* Starting at the offset of the last data segment, find the +- * next hole */ +- ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE, +- NULL, &hole_offset); +- if (ret) { +- /* If an error occurred here it's a real error because +- * if the seek for a data segment was successful then +- * necessarily another hole must exist (EOF is a hole) +- */ +- *fop_errno = -ret; +- break; +- } +- +- /* Calculate the total size of the current data block */ +- data_block_size = hole_offset - data_offset; +- } +- } else { +- /* There is still data in the current segment, move the +- * data_offset to the position of the last written byte */ +- data_offset = offset; + } +- +- /* Calculate how much data needs to be read and written. If the data +- * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and +- * write DHT_REBALANCE_BLKSIZE data length and the rest in the +- * next iteration(s) */ +- read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE) +- ? 
DHT_REBALANCE_BLKSIZE +- : data_block_size); +- +- /* Calculate the remaining size of the data block - maybe there's no +- * need to seek for data in the next iteration */ +- data_block_size -= read_size; +- +- /* Set offset to the offset of the data segment so read and write +- * will have the correct position */ +- offset = data_offset; + } + ++ /* Calculate how much data needs to be read and written. If the data ++ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and ++ * write DHT_REBALANCE_BLKSIZE data length and the rest in the ++ * next iteration(s) */ ++ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE) ++ ? DHT_REBALANCE_BLKSIZE ++ : data_block_size); ++ ++ /* Calculate the remaining size of the data block - maybe there's no ++ * need to seek for data in the next iteration */ ++ data_block_size -= read_size; ++ + ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count, + &iobref, NULL, NULL, NULL); + +@@ -1177,6 +1180,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + iobref = NULL; + vector = NULL; + } ++ + if (iobref) + iobref_unref(iobref); + GF_FREE(vector); +-- +1.8.3.1 + diff --git a/SOURCES/0555-geo-rep-Improve-handling-of-gfid-mismatches.patch b/SOURCES/0555-geo-rep-Improve-handling-of-gfid-mismatches.patch new file mode 100644 index 0000000..85b19e0 --- /dev/null +++ b/SOURCES/0555-geo-rep-Improve-handling-of-gfid-mismatches.patch @@ -0,0 +1,79 @@ +From f2d3866e617d25ea62cda01afddc81ef0db3356e Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Tue, 4 May 2021 22:39:03 +0200 +Subject: [PATCH 555/584] geo-rep: Improve handling of gfid mismatches + +In some circumstances geo-replication can detect mismatching gfids +between primary and secondary. These entries are fixed in an iterative +way, assuming that after a fix, a previously failing entry could +succeed. + +Previous code was trying to fix them in a loop that can be executed +up to 10 times. If some entry cannot be fixed after 10 attempts, it's +discarded. These fixes are very slow, so trying to do them many times +causes geo-replication to get out of sync. + +To minimize the number of iterations done, this patch checks if the +number of entries and failures remains constant after each iteration. +If they are constant, it means that nothing else can be fixed, so it +makes no sense to do more iterations. This reduces the number of +iterations to 2 or 3 in most of the cases, improving geo-replication +performance. 
+ +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2389 +> Fixes: #2388 +> Change-Id: I6d9a623a60045694e1a832195e1dc1fb9e88ae54 +> Signed-off-by: Xavi Hernandez + +BUG: 1957191 +Change-Id: I6d9a623a60045694e1a832195e1dc1fb9e88ae54 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244550 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + geo-replication/syncdaemon/master.py | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py +index 98637e7..aef9373 100644 +--- a/geo-replication/syncdaemon/master.py ++++ b/geo-replication/syncdaemon/master.py +@@ -1224,9 +1224,11 @@ class GMasterChangelogMixin(GMasterCommon): + + if gconf.get("gfid-conflict-resolution"): + count = 0 ++ num_entries = len(entries) ++ num_failures = len(failures) + if failures: + logging.info(lf('Entry ops failed with gfid mismatch', +- count=len(failures))) ++ count=num_failures)) + while failures and count < self.MAX_OE_RETRIES: + count += 1 + self.handle_entry_failures(failures, entries) +@@ -1237,6 +1239,20 @@ class GMasterChangelogMixin(GMasterCommon): + "gfid mismatch") + break + ++ # If this iteration has not removed any entry or reduced ++ # the number of failures compared to the previous one, we ++ # don't need to keep iterating because we'll get the same ++ # result in all other attempts. ++ if ((num_entries == len(entries)) and ++ (num_failures == len(failures))): ++ logging.info(lf("No more gfid mismatches can be fixed", ++ entries=num_entries, ++ failures=num_failures)) ++ break ++ ++ num_entries = len(entries) ++ num_failures = len(failures) ++ + self.log_failures(failures, 'gfid', gauxpfx(), 'ENTRY') + self.status.dec_value("entry", len(entries)) + +-- +1.8.3.1 + diff --git a/SOURCES/0556-dht-don-t-ignore-xdata-in-fgetxattr.patch b/SOURCES/0556-dht-don-t-ignore-xdata-in-fgetxattr.patch new file mode 100644 index 0000000..0cf3545 --- /dev/null +++ b/SOURCES/0556-dht-don-t-ignore-xdata-in-fgetxattr.patch @@ -0,0 +1,52 @@ +From a7f6ad0c617a36414c8232cb692471703923b16d Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Tue, 19 Jan 2021 18:03:33 +0100 +Subject: [PATCH 556/584] dht: don't ignore xdata in fgetxattr + +DHT was passing NULL for xdata in fgetxattr() request, ignoring any +data sent by upper xlators. + +This patch fixes the issue by sending the received xdata to lower +xlators, as it's currently done for getxattr(). 
+ +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2020 +> Fixes: #1991 +> Change-Id: If3d3f1f2ce6215f3b1acc46480e133cb4294eaec +> Signed-off-by: Xavi Hernandez + +BUG: 1919132 +Change-Id: If3d3f1f2ce6215f3b1acc46480e133cb4294eaec +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244538 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-common.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index 7425c1a..0773092 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -5262,7 +5262,7 @@ dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + + if (!ret && key && local->mds_subvol && dht_match_xattr(key)) { + STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol, +- local->mds_subvol->fops->fgetxattr, fd, key, NULL); ++ local->mds_subvol->fops->fgetxattr, fd, key, xdata); + + return 0; + } +@@ -5274,7 +5274,7 @@ dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, fd, +- key, NULL); ++ key, xdata); + } + return 0; + +-- +1.8.3.1 + diff --git a/SOURCES/0557-cluster-dht-Fix-stack-overflow-in-readdir-p.patch b/SOURCES/0557-cluster-dht-Fix-stack-overflow-in-readdir-p.patch new file mode 100644 index 0000000..2add6cb --- /dev/null +++ b/SOURCES/0557-cluster-dht-Fix-stack-overflow-in-readdir-p.patch @@ -0,0 +1,306 @@ +From ba57b043db1e19196cf860baeeeb1acfc9985cd2 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Wed, 24 Feb 2021 15:04:23 +0100 +Subject: [PATCH 557/584] cluster/dht: Fix stack overflow in readdir(p) + +When parallel-readdir is enabled, readdir(p) requests sent by DHT can be +immediately processed and answered in the same thread before the call to +STACK_WIND_COOKIE() completes. + +This means that the readdir(p) cbk is processed synchronously. In some +cases it may decide to send another readdir(p) request, which causes a +recursive call. + +When some special conditions happen and the directories are big, it's +possible that the number of nested calls is so high that the process +crashes because of a stack overflow. + +This patch fixes this by not allowing nested readdir(p) calls. When a +nested call is detected, it's queued instead of sending it. The queued +request is processed when the current call finishes by the top level +stack function. 
+ +Backport of 3 patches: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2170 +> Fixes: #2169 +> Change-Id: Id763a8a51fb3c3314588ec7c162f649babf33099 +> Signed-off-by: Xavi Hernandez + +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2202 +> Updates: #2169 +> Change-Id: I97e73c0aae74fc5d80c975f56f2f7a64e3e1ae95 +> Signed-off-by: Xavi Hernandez + +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2242 +> Fixes: #2239 +> Change-Id: I6b2e48e87c85de27fad67a12d97abd91fa27c0c1 +> Signed-off-by: Pranith Kumar K + +BUG: 1798897 +Change-Id: Id763a8a51fb3c3314588ec7c162f649babf33099 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244549 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/distribute/issue-2169.t | 33 +++++++++ + xlators/cluster/dht/src/dht-common.c | 134 ++++++++++++++++++++++++++++++++--- + xlators/cluster/dht/src/dht-common.h | 5 ++ + 3 files changed, 162 insertions(+), 10 deletions(-) + create mode 100755 tests/bugs/distribute/issue-2169.t + +diff --git a/tests/bugs/distribute/issue-2169.t b/tests/bugs/distribute/issue-2169.t +new file mode 100755 +index 0000000..91fa72a +--- /dev/null ++++ b/tests/bugs/distribute/issue-2169.t +@@ -0,0 +1,33 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup ++ ++TEST glusterd ++TEST ${CLI} volume create ${V0} ${H0}:/$B0/${V0}_0 ++TEST ${CLI} volume set ${V0} readdir-ahead on ++TEST ${CLI} volume set ${V0} parallel-readdir on ++TEST ${CLI} volume start ${V0} ++ ++TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0} ++ ++TEST mkdir -p ${M0}/d/d.{000..999} ++ ++EXPECT_WITHIN ${UMOUNT_TIMEOUT} "Y" force_umount ${M0} ++ ++TEST ${CLI} volume add-brick ${V0} ${H0}:${B0}/${V0}_{1..7} ++ ++TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0} ++ ++ls -l ${M0}/d/ | wc -l ++ ++EXPECT_WITHIN ${UMOUNT_TIMEOUT} "Y" force_umount ${M0} ++TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0} ++ ++ls -l ${M0}/d/ | wc -l ++ ++TEST ls ${M0}/d ++ ++cleanup +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index 0773092..ce0fbbf 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -24,8 +24,15 @@ + #include + #include + ++#include ++ + int run_defrag = 0; + ++static int ++dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int op_ret, int op_errno, gf_dirent_t *entries, ++ dict_t *xdata); ++ + int + dht_link2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); + +@@ -6681,6 +6688,94 @@ out: + return; + } + ++/* Execute a READDIR request if no other request is in progress. Otherwise ++ * queue it to be executed when the current one finishes. ++ * ++ * When parallel-readdir is enabled and directory contents are cached, the ++ * callback of a readdirp will be called before returning from STACK_WIND. ++ * If the returned contents are not useful for DHT, and the buffer is not ++ * yet full, a nested readdirp request will be sent. This means that there ++ * will be many recursive calls. In the worst case there might be a stack ++ * overflow. ++ * ++ * To avoid this, we only wind a request if no other request is being wound. ++ * If there's another request, we simple store the values for the next call. ++ * When the thread processing the current wind completes it, it will take ++ * the new arguments and send the request from the top level stack. 
*/ ++static void ++dht_queue_readdir(call_frame_t *frame, xlator_t *xl, off_t offset, ++ fop_readdir_cbk_t cbk) ++{ ++ dht_local_t *local; ++ int32_t queue; ++ xlator_t *this = NULL; ++ ++ local = frame->local; ++ this = frame->this; ++ ++ local->queue_xl = xl; ++ local->queue_offset = offset; ++ ++ if (uatomic_add_return(&local->queue, 1) == 1) { ++ /* If we are here it means that we are the first one to send a ++ * readdir request. Any attempt to send more readdir requests will ++ * find local->queue > 1, so it won't do anything. The needed data ++ * to send the request has been stored into local->queue_*. ++ * ++ * Note: this works because we will only have 1 additional request ++ * at most (the one called by the cbk function) while we are ++ * processing another readdir. */ ++ do { ++ STACK_WIND_COOKIE(frame, cbk, local->queue_xl, local->queue_xl, ++ local->queue_xl->fops->readdir, local->fd, ++ local->size, local->queue_offset, local->xattr); ++ ++ /* If a new readdirp request has been added before returning ++ * from winding, we process it. */ ++ } while ((queue = uatomic_sub_return(&local->queue, 1)) > 0); ++ ++ if (queue < 0) { ++ /* A negative value means that an unwind has been called before ++ * returning from the previous wind. This means that 'local' is ++ * not needed anymore and must be destroyed. */ ++ dht_local_wipe(this, local); ++ } ++ } ++} ++ ++/* Execute a READDIRP request if no other request is in progress. Otherwise ++ * queue it to be executed when the current one finishes. */ ++static void ++dht_queue_readdirp(call_frame_t *frame, xlator_t *xl, off_t offset, ++ fop_readdirp_cbk_t cbk) ++{ ++ dht_local_t *local; ++ int32_t queue; ++ xlator_t *this = NULL; ++ ++ local = frame->local; ++ this = frame->this; ++ ++ local->queue_xl = xl; ++ local->queue_offset = offset; ++ ++ /* Check dht_queue_readdir() comments for an explanation of this. */ ++ if (uatomic_add_return(&local->queue, 1) == 1) { ++ do { ++ STACK_WIND_COOKIE(frame, cbk, local->queue_xl, local->queue_xl, ++ local->queue_xl->fops->readdirp, local->fd, ++ local->size, local->queue_offset, local->xattr); ++ } while ((queue = uatomic_sub_return(&local->queue, 1)) > 0); ++ ++ if (queue < 0) { ++ /* A negative value means that an unwind has been called before ++ * returning from the previous wind. This means that 'local' is ++ * not needed anymore and must be destroyed. */ ++ dht_local_wipe(this, local); ++ } ++ } ++} ++ + /* Posix returns op_errno = ENOENT to indicate that there are no more + * entries + */ +@@ -6950,9 +7045,8 @@ done: + } + } + +- STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol, +- next_subvol->fops->readdirp, local->fd, local->size, +- next_offset, local->xattr); ++ dht_queue_readdirp(frame, next_subvol, next_offset, dht_readdirp_cbk); ++ + return 0; + } + +@@ -6970,6 +7064,17 @@ unwind: + if (prev != dht_last_up_subvol(this)) + op_errno = 0; + ++ /* If we are inside a recursive call (or not inside a recursive call but ++ * the cbk is completed before the wind returns), local->queue will be 1. ++ * In this case we cannot destroy 'local' because it will be needed by ++ * the caller of STACK_WIND. In this case, we decrease the value to let ++ * the caller know that the operation has terminated and it must destroy ++ * 'local'. If local->queue 0, we can destroy it here because there are ++ * no other users. 
*/ ++ if (uatomic_sub_return(&local->queue, 1) >= 0) { ++ frame->local = NULL; ++ } ++ + DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free(&entries); +@@ -7071,9 +7176,8 @@ done: + goto unwind; + } + +- STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol, +- next_subvol->fops->readdir, local->fd, local->size, +- next_offset, NULL); ++ dht_queue_readdir(frame, next_subvol, next_offset, dht_readdir_cbk); ++ + return 0; + } + +@@ -7089,6 +7193,17 @@ unwind: + if (prev != dht_last_up_subvol(this)) + op_errno = 0; + ++ /* If we are inside a recursive call (or not inside a recursive call but ++ * the cbk is completed before the wind returns), local->queue will be 1. ++ * In this case we cannot destroy 'local' because it will be needed by ++ * the caller of STACK_WIND. In this case, we decrease the value to let ++ * the caller know that the operation has terminated and it must destroy ++ * 'local'. If local->queue 0, we can destroy it here because there are ++ * no other users. */ ++ if (uatomic_sub_return(&local->queue, 1) >= 0) { ++ frame->local = NULL; ++ } ++ + if (!skip_hashed_check) { + DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL); + gf_dirent_free(&entries); +@@ -7096,6 +7211,7 @@ unwind: + } else { + DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL); + } ++ + return 0; + } + +@@ -7172,11 +7288,9 @@ dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + } + } + +- STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol, +- xvol->fops->readdirp, fd, size, yoff, local->xattr); ++ dht_queue_readdirp(frame, xvol, yoff, dht_readdirp_cbk); + } else { +- STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol, +- xvol->fops->readdir, fd, size, yoff, local->xattr); ++ dht_queue_readdir(frame, xvol, yoff, dht_readdir_cbk); + } + + return 0; +diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h +index 92f1b89..132b3b3 100644 +--- a/xlators/cluster/dht/src/dht-common.h ++++ b/xlators/cluster/dht/src/dht-common.h +@@ -369,6 +369,11 @@ struct dht_local { + + dht_dir_transaction_t lock[2], *current; + ++ /* for nested readdirs */ ++ xlator_t *queue_xl; ++ off_t queue_offset; ++ int32_t queue; ++ + /* inodelks during filerename for backward compatibility */ + dht_lock_t **rename_inodelk_backward_compatible; + int rename_inodelk_bc_count; +-- +1.8.3.1 + diff --git a/SOURCES/0558-afr-fix-directory-entry-count.patch b/SOURCES/0558-afr-fix-directory-entry-count.patch new file mode 100644 index 0000000..4134f77 --- /dev/null +++ b/SOURCES/0558-afr-fix-directory-entry-count.patch @@ -0,0 +1,238 @@ +From 9bf6986f8ea3edd9de3d2629404f7ab11c1597de Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Tue, 9 Mar 2021 00:24:07 +0100 +Subject: [PATCH 558/584] afr: fix directory entry count + +AFR may hide some existing entries from a directory when reading it +because they are generated internally for private management. However +the returned number of entries from readdir() function is not updated +accordingly. So it may return a number higher than the real entries +present in the gf_dirent list. + +This may cause unexpected behavior of clients, including gfapi which +incorrectly assumes that there was an entry when the list was actually +empty. + +This patch also makes the check in gfapi more robust to avoid similar +issues that could appear in the future. 
+ +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2233 +> Fixes: #2232 +> Change-Id: I81ba3699248a53ebb0ee4e6e6231a4301436f763 +> Signed-off-by: Xavi Hernandez + +BUG: 1927411 +Change-Id: I81ba3699248a53ebb0ee4e6e6231a4301436f763 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244535 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + api/src/glfs-fops.c | 3 +- + tests/bugs/replicate/issue-2232.c | 85 ++++++++++++++++++++++++++++++++++ + tests/bugs/replicate/issue-2232.t | 34 ++++++++++++++ + xlators/cluster/afr/src/afr-dir-read.c | 11 +++-- + 4 files changed, 129 insertions(+), 4 deletions(-) + create mode 100644 tests/bugs/replicate/issue-2232.c + create mode 100644 tests/bugs/replicate/issue-2232.t + +diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c +index 6dc3b66..821d250 100644 +--- a/api/src/glfs-fops.c ++++ b/api/src/glfs-fops.c +@@ -3748,8 +3748,9 @@ glfd_entry_refresh(struct glfs_fd *glfd, int plus) + errno = 0; + } + +- if (ret > 0) ++ if ((ret > 0) && !list_empty(&glfd->entries)) { + glfd->next = list_entry(glfd->entries.next, gf_dirent_t, list); ++ } + + gf_dirent_free(&old); + out: +diff --git a/tests/bugs/replicate/issue-2232.c b/tests/bugs/replicate/issue-2232.c +new file mode 100644 +index 0000000..df547c2 +--- /dev/null ++++ b/tests/bugs/replicate/issue-2232.c +@@ -0,0 +1,85 @@ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int main(int argc, char **argv) ++{ ++ char log[128]; ++ struct dirent entry; ++ struct dirent *ent; ++ glfs_xreaddirp_stat_t *xstat; ++ int ret, flags; ++ ++ if (argc != 3) { ++ fprintf(stderr, "Syntax: %s \n", argv[0]); ++ exit(1); ++ } ++ char *hostname = argv[1]; ++ char *volname = argv[2]; ++ ++ glfs_t *fs = glfs_new(volname); ++ if (!fs) { ++ fprintf(stderr, "glfs_new() failed\n"); ++ exit(1); ++ } ++ ++ ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007); ++ if (ret < 0) { ++ fprintf(stderr, "glfs_set_volfile_server() failed\n"); ++ return ret; ++ } ++ ++ sprintf(log, "/tmp/logs-%d.log", getpid()); ++ ret = glfs_set_logging(fs, log, 9); ++ if (ret < 0) { ++ fprintf(stderr, "glfs_set_logging() failed\n"); ++ return ret; ++ } ++ ++ ret = glfs_init(fs); ++ if (ret < 0) { ++ fprintf(stderr, "glfs_init() failed\n"); ++ return ret; ++ } ++ ++ glfs_fd_t *fd = glfs_opendir(fs, "/"); ++ if (fd == NULL) { ++ fprintf(stderr, "glfs_opendir() failed\n"); ++ return 1; ++ } ++ ++ flags = GFAPI_XREADDIRP_STAT | GFAPI_XREADDIRP_HANDLE; ++ xstat = NULL; ++ while ((ret = glfs_xreaddirplus_r(fd, flags, &xstat, &entry, &ent)) > 0) { ++ if (xstat != NULL) { ++ glfs_free(xstat); ++ } ++ if ((strcmp(ent->d_name, ".") == 0) || ++ (strcmp(ent->d_name, "..") == 0)) { ++ xstat = NULL; ++ continue; ++ } ++ if ((xstat == NULL) || ((ret & GFAPI_XREADDIRP_HANDLE) == 0)) { ++ fprintf(stderr, "glfs_xreaddirplus_r() failed: %s\n", ++ strerror(errno)); ++ return 1; ++ } ++ ++ xstat = NULL; ++ } ++ ++ if (ret < 0) { ++ fprintf(stderr, "glfs_xreaddirplus_r() failed\n"); ++ return ret; ++ } ++ ++ glfs_close(fd); ++ ++ glfs_fini(fs); ++ ++ return ret; ++} +diff --git a/tests/bugs/replicate/issue-2232.t b/tests/bugs/replicate/issue-2232.t +new file mode 100644 +index 0000000..66a41e0 +--- /dev/null ++++ b/tests/bugs/replicate/issue-2232.t +@@ -0,0 +1,34 @@ ++#!/bin/bash ++ ++. $(dirname "${0}")/../../include.rc ++. 
$(dirname "${0}")/../../volume.rc ++ ++cleanup; ++TEST gcc $(dirname "${0}")/issue-2232.c -o $(dirname "${0}")/issue-2232 -lgfapi ++TEST glusterd ++TEST pidof glusterd ++ ++TEST $CLI volume create ${V0} replica 3 ${H0}:${B0}/${V0}{0..2} ++ ++# Create a fake .glusterfs-anonymous-inode-... entry ++ANONINO=".glusterfs-anonymous-inode-aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" ++TEST mkdir ${B0}/${V0}{0..2}/${ANONINO} ++gfid="$(uuidgen)" ++hex="0x$(echo "${gfid}" | tr -d '-')" ++TEST assign_gfid "${hex}" "${B0}/${V0}0/${ANONINO}" ++TEST assign_gfid "${hex}" "${B0}/${V0}1/${ANONINO}" ++TEST assign_gfid "${hex}" "${B0}/${V0}2/${ANONINO}" ++TEST mkdir -p "${B0}/${V0}0/.glusterfs/${gfid:0:2}/${gfid:2:2}" ++TEST mkdir -p "${B0}/${V0}1/.glusterfs/${gfid:0:2}/${gfid:2:2}" ++TEST mkdir -p "${B0}/${V0}2/.glusterfs/${gfid:0:2}/${gfid:2:2}" ++TEST ln -s "../../00/00/00000000-0000-0000-0000-000000000001/${ANONINO}" "${B0}/${V0}0/.glusterfs/${gfid:0:2}/${gfid:2:2}/${gfid}" ++TEST ln -s "../../00/00/00000000-0000-0000-0000-000000000001/${ANONINO}" "${B0}/${V0}1/.glusterfs/${gfid:0:2}/${gfid:2:2}/${gfid}" ++TEST ln -s "../../00/00/00000000-0000-0000-0000-000000000001/${ANONINO}" "${B0}/${V0}2/.glusterfs/${gfid:0:2}/${gfid:2:2}/${gfid}" ++ ++TEST $CLI volume start ${V0} ++ ++TEST $(dirname "${0}")/issue-2232 ${H0} ${V0} ++ ++TEST rm -f $(dirname $0)/issue-2232 ++ ++cleanup +diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c +index d64b6a9..a98f8df 100644 +--- a/xlators/cluster/afr/src/afr-dir-read.c ++++ b/xlators/cluster/afr/src/afr-dir-read.c +@@ -157,7 +157,7 @@ afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol) + return 0; + } + +-static void ++static int32_t + afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + int subvol, gf_dirent_t *entries, fd_t *fd) + { +@@ -168,6 +168,7 @@ afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + afr_private_t *priv = NULL; + gf_boolean_t need_heal = _gf_false; + gf_boolean_t validate_subvol = _gf_false; ++ int32_t count = 0; + + this = THIS; + priv = this->private; +@@ -184,6 +185,7 @@ afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + + list_del_init(&entry->list); + list_add_tail(&entry->list, &entries->list); ++ count++; + + if (!validate_subvol) + continue; +@@ -197,6 +199,8 @@ afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + } + } + } ++ ++ return count; + } + + int32_t +@@ -222,8 +226,9 @@ afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + } + + if (op_ret >= 0) +- afr_readdir_transform_entries(frame, subvol_entries, (long)cookie, +- &entries, local->fd); ++ op_ret = afr_readdir_transform_entries(frame, subvol_entries, ++ (long)cookie, &entries, ++ local->fd); + + AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata); + +-- +1.8.3.1 + diff --git a/SOURCES/0559-afr-make-fsync-post-op-aware-of-inodelk-count-2273.patch b/SOURCES/0559-afr-make-fsync-post-op-aware-of-inodelk-count-2273.patch new file mode 100644 index 0000000..91add36 --- /dev/null +++ b/SOURCES/0559-afr-make-fsync-post-op-aware-of-inodelk-count-2273.patch @@ -0,0 +1,163 @@ +From 2b6e6c234dffa72c9f2af747908b1e1f29080698 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Thu, 25 Mar 2021 11:52:13 +0530 +Subject: [PATCH 559/584] afr: make fsync post-op aware of inodelk count + (#2273) + +Problem: +Since commit bd540db1e, eager-locking was enabled for fsync. 
But on +certain VM workloads wit sharding enabled, shard xlator keeps sending +fsync on the base shard. This can cause blocked inodelks from other +clients (including shd) to time out due to call bail. + +Fix: +Make afr fsync aware of inodelk count and not delay post-op + unlock +when inodelk count > 1, just like writev. + +Code is restructured so that any fd based AFR_DATA_TRANSACTION can be made +aware by setting GLUSTERFS_INODELK_DOM_COUNT in xdata request. + +Note: We do not know yet why VMs go in to paused state because of the +blocked inodelks but this patch should be a first step in reducing the +occurence. + +Upstream patch details: +> https://github.com/gluster/glusterfs/pull/2273/ +> Updates: #2198 +> Change-Id: Ib91ebdd3101d590c326e69c829cf9335003e260b +> Signed-off-by: Ravishankar N + +BUG: 1943467 +Change-Id: Id407ca54007e3bbb206a1d9431ebaf89a2167f74 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244516 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-inode-write.c | 40 ++++++++++++++++++------------- + xlators/features/locks/src/posix.c | 1 + + 2 files changed, 24 insertions(+), 17 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c +index df82b6e..962a7b1 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.c ++++ b/xlators/cluster/afr/src/afr-inode-write.c +@@ -42,6 +42,7 @@ __afr_inode_write_finalize(call_frame_t *frame, xlator_t *this) + struct iatt *stbuf = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; ++ afr_lock_t *lock = NULL; + afr_read_subvol_args_t args = { + 0, + }; +@@ -50,6 +51,12 @@ __afr_inode_write_finalize(call_frame_t *frame, xlator_t *this) + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, local->inode, out); + ++ if (local->update_num_inodelks && ++ local->transaction.type == AFR_DATA_TRANSACTION) { ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ lock->num_inodelks = local->num_inodelks; ++ } ++ + /*This code needs to stay till DHT sends fops on linked + * inodes*/ + if (!inode_is_linked(local->inode)) { +@@ -134,6 +141,7 @@ __afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; ++ int num_inodelks = 0; + + local = frame->local; + priv = this->private; +@@ -146,8 +154,16 @@ __afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; +- if (xdata) ++ if (xdata) { + local->replies[child_index].xdata = dict_ref(xdata); ++ if (dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT, ++ &num_inodelks) == 0) { ++ if (num_inodelks > local->num_inodelks) { ++ local->num_inodelks = num_inodelks; ++ local->update_num_inodelks = _gf_true; ++ } ++ } ++ } + + if (op_ret >= 0) { + if (prebuf) +@@ -284,7 +300,6 @@ afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + afr_local_t *local = frame->local; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; +- int32_t num_inodelks = 0; + + LOCK(&frame->lock); + { +@@ -306,15 +321,6 @@ afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; + } +- +- ret = dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT, +- &num_inodelks); +- if (ret < 0) +- goto unlock; +- if (num_inodelks 
> local->num_inodelks) { +- local->num_inodelks = num_inodelks; +- local->update_num_inodelks = _gf_true; +- } + } + unlock: + UNLOCK(&frame->lock); +@@ -324,7 +330,6 @@ void + afr_process_post_writev(call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; +- afr_lock_t *lock = NULL; + + local = frame->local; + +@@ -343,11 +348,6 @@ afr_process_post_writev(call_frame_t *frame, xlator_t *this) + + if (local->update_open_fd_count) + local->inode_ctx->open_fd_count = local->open_fd_count; +- if (local->update_num_inodelks && +- local->transaction.type == AFR_DATA_TRANSACTION) { +- lock = &local->inode_ctx->lock[local->transaction.type]; +- lock->num_inodelks = local->num_inodelks; +- } + } + + int +@@ -2516,6 +2516,12 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + if (!local->xdata_req) + goto out; + ++ if (dict_set_str_sizen(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT, ++ this->name)) { ++ op_errno = ENOMEM; ++ goto out; ++ } ++ + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) +diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c +index cdd1ff7..22ef5b8 100644 +--- a/xlators/features/locks/src/posix.c ++++ b/xlators/features/locks/src/posix.c +@@ -4943,6 +4943,7 @@ struct xlator_fops fops = { + .rchecksum = pl_rchecksum, + .statfs = pl_statfs, + .fsyncdir = pl_fsyncdir, ++ .fsync = pl_fsync, + .readdir = pl_readdir, + .symlink = pl_symlink, + .link = pl_link, +-- +1.8.3.1 + diff --git a/SOURCES/0560-posix-Avoid-dict_del-logs-in-posix_is_layout_stale-w.patch b/SOURCES/0560-posix-Avoid-dict_del-logs-in-posix_is_layout_stale-w.patch new file mode 100644 index 0000000..cccac36 --- /dev/null +++ b/SOURCES/0560-posix-Avoid-dict_del-logs-in-posix_is_layout_stale-w.patch @@ -0,0 +1,73 @@ +From e56605d5808b41335026a5470fa10f5e5b5389f3 Mon Sep 17 00:00:00 2001 +From: Mohit Agrawal +Date: Mon, 6 Apr 2020 21:58:03 +0530 +Subject: [PATCH 560/584] posix: Avoid dict_del logs in posix_is_layout_stale + while key is NULL + +Problem: The key "GF_PREOP_PARENT_KEY" has been populated by dht and + for non-distribute volume like 1x3 key is not populated so + posix_is_layout stale throw a message while a file is created + +Solution: To avoid a log put a condition before delete a key + +Upstream patch details: +> https://review.gluster.org/#/c/glusterfs/+/24297/ +> Change-Id: I813ee7960633e7f9f5e9ad2f42f288053d9eb71f +> Fixes: #1150 +> Signed-off-by: Mohit Agrawal + +BUG: 1942816 +Change-Id: I746a2619989265f3bc9bb648c4b8e4bbefaedc56 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244925 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/glusterd/brick-mux-validation.t | 4 ++-- + xlators/storage/posix/src/posix-helpers.c | 5 +++-- + 2 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/tests/bugs/glusterd/brick-mux-validation.t b/tests/bugs/glusterd/brick-mux-validation.t +index 03a4768..61b0455 100644 +--- a/tests/bugs/glusterd/brick-mux-validation.t ++++ b/tests/bugs/glusterd/brick-mux-validation.t +@@ -24,7 +24,7 @@ TEST $CLI volume create $V0 $H0:$B0/${V0}{1..3} + TEST $CLI volume start $V0 + + EXPECT 1 count_brick_processes +-EXPECT 1 count_brick_pids ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT 1 count_brick_pids + EXPECT_WITHIN $PROCESS_UP_TIMEOUT 3 online_brick_count + + pkill gluster +@@ -101,4 +101,4 @@ TEST $CLI_IGNORE_PARTITION volume reset-brick $V1 $H0:$B0/${V1}1 $H0:$B0/${V1}1 + 
EXPECT_WITHIN $PROCESS_UP_TIMEOUT 6 online_brick_count + EXPECT 1 count_brick_processes + +-cleanup; +\ No newline at end of file ++cleanup; +diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c +index 110d383..16351d8 100644 +--- a/xlators/storage/posix/src/posix-helpers.c ++++ b/xlators/storage/posix/src/posix-helpers.c +@@ -3596,13 +3596,14 @@ posix_is_layout_stale(dict_t *xdata, char *par_path, xlator_t *this) + op_ret = dict_get_str_sizen(xdata, GF_PREOP_PARENT_KEY, &xattr_name); + if (xattr_name == NULL) { + op_ret = 0; +- goto out; ++ return is_stale; + } + + arg_data = dict_get(xdata, xattr_name); + if (!arg_data) { + op_ret = 0; +- goto out; ++ dict_del_sizen(xdata, GF_PREOP_PARENT_KEY); ++ return is_stale; + } + + size = sys_lgetxattr(par_path, xattr_name, value_buf, +-- +1.8.3.1 + diff --git a/SOURCES/0561-cluster-ec-Inform-failure-when-some-bricks-are-unava.patch b/SOURCES/0561-cluster-ec-Inform-failure-when-some-bricks-are-unava.patch new file mode 100644 index 0000000..4f191cc --- /dev/null +++ b/SOURCES/0561-cluster-ec-Inform-failure-when-some-bricks-are-unava.patch @@ -0,0 +1,202 @@ +From 488a5aa4932842334e2749224e9c39f8b6fd379c Mon Sep 17 00:00:00 2001 +From: Ashish Pandey +Date: Wed, 20 May 2020 11:30:17 +0530 +Subject: [PATCH 561/584] cluster/ec: Inform failure when some bricks are + unavailable. + +Provide proper information about failure when a fop +fails on some of the brick. +Also provide information about parent fop and +the map of the bricks on which it is failing. + +Upstream patch details: +>Change-Id: If812739617df65cd146c8e667fbacff653717248 +>updates #1434 +>Signed-off-by: Ashish Pandey +>https://review.gluster.org/#/c/glusterfs/+/24858/ + +Change-Id: I3549d637e7345f05f21ac1c0e8106973c69d1be9 +BUG: 1908635 +Signed-off-by: Ashish Pandey +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244926 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/ec/src/ec-common.c | 76 +++++++++++++++++++++++--------------- + xlators/cluster/ec/src/ec.c | 14 ++++++- + 2 files changed, 58 insertions(+), 32 deletions(-) + +diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c +index e3f8769..a9624d8 100644 +--- a/xlators/cluster/ec/src/ec-common.c ++++ b/xlators/cluster/ec/src/ec-common.c +@@ -316,17 +316,19 @@ ec_check_status(ec_fop_data_t *fop) + } + } + +- gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, +- "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " +- "remaining=%s, good=%s, bad=%s, %s)", +- gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, +- ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), +- ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), +- ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), +- ec_bin(str4, sizeof(str4), fop->good, ec->nodes), +- ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), +- ec->nodes), +- ec_msg_str(fop)); ++ gf_msg( ++ fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, ++ "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " ++ "remaining=%s, good=%s, bad=%s," ++ "(Least significant bit represents first client/brick of subvol), %s)", ++ gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, ++ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), ++ ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), ++ ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), ++ ec_bin(str4, sizeof(str4), fop->good, 
ec->nodes), ++ ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), ++ ec->nodes), ++ ec_msg_str(fop)); + if (fop->use_fd) { + if (fop->fd != NULL) { + ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, +@@ -614,10 +616,10 @@ ec_msg_str(ec_fop_data_t *fop) + loc_t *loc2 = NULL; + char gfid1[64] = {0}; + char gfid2[64] = {0}; ++ ec_fop_data_t *parent = fop->parent; + + if (fop->errstr) + return fop->errstr; +- + if (!fop->use_fd) { + loc1 = &fop->loc[0]; + loc2 = &fop->loc[1]; +@@ -625,23 +627,45 @@ ec_msg_str(ec_fop_data_t *fop) + if (fop->id == GF_FOP_RENAME) { + gf_asprintf(&fop->errstr, + "FOP : '%s' failed on '%s' and '%s' with gfids " +- "%s and %s respectively", ++ "%s and %s respectively. Parent FOP: %s", + ec_fop_name(fop->id), loc1->path, loc2->path, + uuid_utoa_r(loc1->gfid, gfid1), +- uuid_utoa_r(loc2->gfid, gfid2)); ++ uuid_utoa_r(loc2->gfid, gfid2), ++ parent ? ec_fop_name(parent->id) : "No Parent"); + } else { +- gf_asprintf(&fop->errstr, "FOP : '%s' failed on '%s' with gfid %s", +- ec_fop_name(fop->id), loc1->path, +- uuid_utoa_r(loc1->gfid, gfid1)); ++ gf_asprintf( ++ &fop->errstr, ++ "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s", ++ ec_fop_name(fop->id), loc1->path, ++ uuid_utoa_r(loc1->gfid, gfid1), ++ parent ? ec_fop_name(parent->id) : "No Parent"); + } + } else { +- gf_asprintf(&fop->errstr, "FOP : '%s' failed on gfid %s", +- ec_fop_name(fop->id), +- uuid_utoa_r(fop->fd->inode->gfid, gfid1)); ++ gf_asprintf( ++ &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s", ++ ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1), ++ parent ? ec_fop_name(parent->id) : "No Parent"); + } + return fop->errstr; + } + ++static void ++ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need, ++ int32_t loglevel) ++{ ++ ec_t *ec = fop->xl->private; ++ char str1[32], str2[32], str3[32]; ++ ++ gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT, ++ "Insufficient available children for this request: " ++ "Have : %d, Need : %u : Child UP : %s " ++ "Mask: %s, Healing : %s : %s ", ++ have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), ++ ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), ++ ec_bin(str3, sizeof(str3), fop->healing, ec->nodes), ++ ec_msg_str(fop)); ++} ++ + static int32_t + ec_child_select(ec_fop_data_t *fop) + { +@@ -699,11 +723,7 @@ ec_child_select(ec_fop_data_t *fop) + ec_trace("SELECT", fop, ""); + + if ((num < fop->minimum) && (num < ec->fragments)) { +- gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT, +- "Insufficient available children " +- "for this request (have %d, need " +- "%d). %s", +- num, fop->minimum, ec_msg_str(fop)); ++ ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR); + return 0; + } + +@@ -711,11 +731,7 @@ ec_child_select(ec_fop_data_t *fop) + (fop->locks[0].update[EC_DATA_TXN] || + fop->locks[0].update[EC_METADATA_TXN])) { + if (ec->quorum_count && (num < ec->quorum_count)) { +- gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT, +- "Insufficient available children " +- "for this request (have %d, need " +- "%d). 
%s", +- num, ec->quorum_count, ec_msg_str(fop)); ++ ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR); + return 0; + } + } +diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c +index a930089..047cdd8 100644 +--- a/xlators/cluster/ec/src/ec.c ++++ b/xlators/cluster/ec/src/ec.c +@@ -325,13 +325,18 @@ ec_get_event_from_state(ec_t *ec) + void + ec_up(xlator_t *this, ec_t *ec) + { ++ char str1[32], str2[32]; ++ + if (ec->timer != NULL) { + gf_timer_call_cancel(this->ctx, ec->timer); + ec->timer = NULL; + } + + ec->up = 1; +- gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP"); ++ gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, ++ "Going UP : Child UP = %s Child Notify = %s", ++ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), ++ ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes)); + + gf_event(EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name); + } +@@ -339,13 +344,18 @@ ec_up(xlator_t *this, ec_t *ec) + void + ec_down(xlator_t *this, ec_t *ec) + { ++ char str1[32], str2[32]; ++ + if (ec->timer != NULL) { + gf_timer_call_cancel(this->ctx, ec->timer); + ec->timer = NULL; + } + + ec->up = 0; +- gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN"); ++ gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, ++ "Going DOWN : Child UP = %s Child Notify = %s", ++ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), ++ ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes)); + + gf_event(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name); + } +-- +1.8.3.1 + diff --git a/SOURCES/0562-shard.c-Fix-formatting.patch b/SOURCES/0562-shard.c-Fix-formatting.patch new file mode 100644 index 0000000..14fbed6 --- /dev/null +++ b/SOURCES/0562-shard.c-Fix-formatting.patch @@ -0,0 +1,12513 @@ +From ea96fcd832de0b49f0e050f535d22a500da1503a Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Thu, 3 Jun 2021 13:14:04 +0200 +Subject: [PATCH 562/584] shard.c: Fix formatting + +A previous downstream change [1] had changed the formatting of the +entire xlators/features/shard/src/shard.c. This patch reapplies the +correct formatting. No other changes have been made. 
+ +[1] https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/185716/ + +BUG: 1925425 +Change-Id: Ie655ddaaa26aa884878e66bc0d9ce1f021f6a85f +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244956 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/shard/src/shard.c | 11701 ++++++++++++++++++----------------- + 1 file changed, 6084 insertions(+), 5617 deletions(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 099b062..c5cc224 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -16,5813 +16,6226 @@ + #include + #include + +-static gf_boolean_t __is_shard_dir(uuid_t gfid) { +- shard_priv_t *priv = THIS->private; ++static gf_boolean_t ++__is_shard_dir(uuid_t gfid) ++{ ++ shard_priv_t *priv = THIS->private; + +- if (gf_uuid_compare(gfid, priv->dot_shard_gfid) == 0) +- return _gf_true; ++ if (gf_uuid_compare(gfid, priv->dot_shard_gfid) == 0) ++ return _gf_true; + +- return _gf_false; ++ return _gf_false; + } + +-static gf_boolean_t __is_gsyncd_on_shard_dir(call_frame_t *frame, loc_t *loc) { +- if (frame->root->pid == GF_CLIENT_PID_GSYNCD && +- (__is_shard_dir(loc->pargfid) || +- (loc->parent && __is_shard_dir(loc->parent->gfid)))) +- return _gf_true; ++static gf_boolean_t ++__is_gsyncd_on_shard_dir(call_frame_t *frame, loc_t *loc) ++{ ++ if (frame->root->pid == GF_CLIENT_PID_GSYNCD && ++ (__is_shard_dir(loc->pargfid) || ++ (loc->parent && __is_shard_dir(loc->parent->gfid)))) ++ return _gf_true; + +- return _gf_false; ++ return _gf_false; + } + +-void shard_make_block_bname(int block_num, uuid_t gfid, char *buf, size_t len) { +- char gfid_str[GF_UUID_BUF_SIZE] = { +- 0, +- }; ++void ++shard_make_block_bname(int block_num, uuid_t gfid, char *buf, size_t len) ++{ ++ char gfid_str[GF_UUID_BUF_SIZE] = { ++ 0, ++ }; + +- gf_uuid_unparse(gfid, gfid_str); +- snprintf(buf, len, "%s.%d", gfid_str, block_num); ++ gf_uuid_unparse(gfid, gfid_str); ++ snprintf(buf, len, "%s.%d", gfid_str, block_num); + } + +-void shard_make_block_abspath(int block_num, uuid_t gfid, char *filepath, +- size_t len) { +- char gfid_str[GF_UUID_BUF_SIZE] = { +- 0, +- }; ++void ++shard_make_block_abspath(int block_num, uuid_t gfid, char *filepath, size_t len) ++{ ++ char gfid_str[GF_UUID_BUF_SIZE] = { ++ 0, ++ }; + +- gf_uuid_unparse(gfid, gfid_str); +- snprintf(filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str, block_num); ++ gf_uuid_unparse(gfid, gfid_str); ++ snprintf(filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str, block_num); + } + +-int __shard_inode_ctx_get(inode_t *inode, xlator_t *this, +- shard_inode_ctx_t **ctx) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx_p = NULL; ++int ++__shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx_p = NULL; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret == 0) { +- *ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; +- return ret; +- } ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret == 0) { ++ *ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ return ret; ++ } + +- ctx_p = GF_CALLOC(1, sizeof(*ctx_p), gf_shard_mt_inode_ctx_t); +- if (!ctx_p) +- return ret; ++ ctx_p = GF_CALLOC(1, sizeof(*ctx_p), gf_shard_mt_inode_ctx_t); ++ if (!ctx_p) ++ return ret; + +- INIT_LIST_HEAD(&ctx_p->ilist); +- INIT_LIST_HEAD(&ctx_p->to_fsync_list); ++ 
INIT_LIST_HEAD(&ctx_p->ilist); ++ INIT_LIST_HEAD(&ctx_p->to_fsync_list); + +- ret = __inode_ctx_set(inode, this, (uint64_t *)&ctx_p); +- if (ret < 0) { +- GF_FREE(ctx_p); +- return ret; +- } ++ ret = __inode_ctx_set(inode, this, (uint64_t *)&ctx_p); ++ if (ret < 0) { ++ GF_FREE(ctx_p); ++ return ret; ++ } + +- *ctx = ctx_p; ++ *ctx = ctx_p; + +- return ret; ++ return ret; + } + +-int shard_inode_ctx_get(inode_t *inode, xlator_t *this, +- shard_inode_ctx_t **ctx) { +- int ret = 0; ++int ++shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) ++{ ++ int ret = 0; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_get(inode, this, ctx); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_get(inode, this, ctx); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, +- uint64_t block_size, int32_t valid) { +- int ret = -1; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, ++ uint64_t block_size, int32_t valid) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __shard_inode_ctx_get(inode, this, &ctx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(inode, this, &ctx); ++ if (ret) ++ return ret; + +- if (valid & SHARD_MASK_BLOCK_SIZE) +- ctx->block_size = block_size; ++ if (valid & SHARD_MASK_BLOCK_SIZE) ++ ctx->block_size = block_size; + +- if (valid & SHARD_MASK_PROT) +- ctx->stat.ia_prot = stbuf->ia_prot; ++ if (valid & SHARD_MASK_PROT) ++ ctx->stat.ia_prot = stbuf->ia_prot; + +- if (valid & SHARD_MASK_NLINK) +- ctx->stat.ia_nlink = stbuf->ia_nlink; ++ if (valid & SHARD_MASK_NLINK) ++ ctx->stat.ia_nlink = stbuf->ia_nlink; + +- if (valid & SHARD_MASK_UID) +- ctx->stat.ia_uid = stbuf->ia_uid; ++ if (valid & SHARD_MASK_UID) ++ ctx->stat.ia_uid = stbuf->ia_uid; + +- if (valid & SHARD_MASK_GID) +- ctx->stat.ia_gid = stbuf->ia_gid; ++ if (valid & SHARD_MASK_GID) ++ ctx->stat.ia_gid = stbuf->ia_gid; + +- if (valid & SHARD_MASK_SIZE) +- ctx->stat.ia_size = stbuf->ia_size; ++ if (valid & SHARD_MASK_SIZE) ++ ctx->stat.ia_size = stbuf->ia_size; + +- if (valid & SHARD_MASK_BLOCKS) +- ctx->stat.ia_blocks = stbuf->ia_blocks; ++ if (valid & SHARD_MASK_BLOCKS) ++ ctx->stat.ia_blocks = stbuf->ia_blocks; + +- if (valid & SHARD_MASK_TIMES) { +- SHARD_TIME_UPDATE(ctx->stat.ia_mtime, ctx->stat.ia_mtime_nsec, +- stbuf->ia_mtime, stbuf->ia_mtime_nsec); +- SHARD_TIME_UPDATE(ctx->stat.ia_ctime, ctx->stat.ia_ctime_nsec, +- stbuf->ia_ctime, stbuf->ia_ctime_nsec); +- SHARD_TIME_UPDATE(ctx->stat.ia_atime, ctx->stat.ia_atime_nsec, +- stbuf->ia_atime, stbuf->ia_atime_nsec); +- } ++ if (valid & SHARD_MASK_TIMES) { ++ SHARD_TIME_UPDATE(ctx->stat.ia_mtime, ctx->stat.ia_mtime_nsec, ++ stbuf->ia_mtime, stbuf->ia_mtime_nsec); ++ SHARD_TIME_UPDATE(ctx->stat.ia_ctime, ctx->stat.ia_ctime_nsec, ++ stbuf->ia_ctime, stbuf->ia_ctime_nsec); ++ SHARD_TIME_UPDATE(ctx->stat.ia_atime, ctx->stat.ia_atime_nsec, ++ stbuf->ia_atime, stbuf->ia_atime_nsec); ++ } + +- if (valid & SHARD_MASK_OTHERS) { +- ctx->stat.ia_ino = stbuf->ia_ino; +- gf_uuid_copy(ctx->stat.ia_gfid, stbuf->ia_gfid); +- ctx->stat.ia_dev = stbuf->ia_dev; +- ctx->stat.ia_type = stbuf->ia_type; +- ctx->stat.ia_rdev = stbuf->ia_rdev; +- ctx->stat.ia_blksize = stbuf->ia_blksize; +- } ++ if (valid & SHARD_MASK_OTHERS) { ++ ctx->stat.ia_ino = stbuf->ia_ino; ++ gf_uuid_copy(ctx->stat.ia_gfid, stbuf->ia_gfid); ++ ctx->stat.ia_dev = stbuf->ia_dev; ++ 
ctx->stat.ia_type = stbuf->ia_type; ++ ctx->stat.ia_rdev = stbuf->ia_rdev; ++ ctx->stat.ia_blksize = stbuf->ia_blksize; ++ } + +- if (valid & SHARD_MASK_REFRESH_RESET) +- ctx->refresh = _gf_false; ++ if (valid & SHARD_MASK_REFRESH_RESET) ++ ctx->refresh = _gf_false; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, +- uint64_t block_size, int32_t valid) { +- int ret = -1; ++int ++shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, ++ uint64_t block_size, int32_t valid) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_set(inode, this, stbuf, block_size, valid); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_set(inode, this, stbuf, block_size, valid); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) { +- int ret = -1; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __shard_inode_ctx_get(inode, this, &ctx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(inode, this, &ctx); ++ if (ret) ++ return ret; + +- ctx->refresh = _gf_true; ++ ctx->refresh = _gf_true; + +- return 0; ++ return 0; + } +-int shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) { +- int ret = -1; ++int ++shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_set_refresh_flag(inode, this); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_set_refresh_flag(inode, this); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) { +- int ret = -1; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __shard_inode_ctx_get(inode, this, &ctx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(inode, this, &ctx); ++ if (ret) ++ return ret; + +- ctx->refreshed = _gf_true; +- return 0; ++ ctx->refreshed = _gf_true; ++ return 0; + } + +-int shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) { +- int ret = -1; ++int ++shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_mark_dir_refreshed(inode, this); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_mark_dir_refreshed(inode, this); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, +- inode_t *shard_inode) { +- int ret = -1; +- shard_inode_ctx_t *base_ictx = NULL; +- shard_inode_ctx_t *shard_ictx = NULL; ++int ++__shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, ++ inode_t *shard_inode) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *base_ictx = NULL; ++ shard_inode_ctx_t *shard_ictx = NULL; + +- ret = __shard_inode_ctx_get(base_inode, this, &base_ictx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(base_inode, this, &base_ictx); ++ if (ret) ++ return ret; + +- ret = __shard_inode_ctx_get(shard_inode, this, &shard_ictx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(shard_inode, this, &shard_ictx); ++ if (ret) 
++ return ret; + +- if (shard_ictx->fsync_needed) { +- shard_ictx->fsync_needed++; +- return 1; +- } ++ if (shard_ictx->fsync_needed) { ++ shard_ictx->fsync_needed++; ++ return 1; ++ } + +- list_add_tail(&shard_ictx->to_fsync_list, &base_ictx->to_fsync_list); +- shard_ictx->inode = shard_inode; +- shard_ictx->fsync_needed++; +- base_ictx->fsync_count++; +- shard_ictx->base_inode = base_inode; ++ list_add_tail(&shard_ictx->to_fsync_list, &base_ictx->to_fsync_list); ++ shard_ictx->inode = shard_inode; ++ shard_ictx->fsync_needed++; ++ base_ictx->fsync_count++; ++ shard_ictx->base_inode = base_inode; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, +- inode_t *shard_inode) { +- int ret = -1; ++int ++shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, ++ inode_t *shard_inode) ++{ ++ int ret = -1; + +- /* This ref acts as a refkeepr on the base inode. We +- * need to keep this inode alive as it holds the head +- * of the to_fsync_list. +- */ +- inode_ref(base_inode); +- inode_ref(shard_inode); ++ /* This ref acts as a refkeepr on the base inode. We ++ * need to keep this inode alive as it holds the head ++ * of the to_fsync_list. ++ */ ++ inode_ref(base_inode); ++ inode_ref(shard_inode); + +- LOCK(&base_inode->lock); +- LOCK(&shard_inode->lock); +- { ret = __shard_inode_ctx_add_to_fsync_list(base_inode, this, shard_inode); } +- UNLOCK(&shard_inode->lock); +- UNLOCK(&base_inode->lock); ++ LOCK(&base_inode->lock); ++ LOCK(&shard_inode->lock); ++ { ++ ret = __shard_inode_ctx_add_to_fsync_list(base_inode, this, ++ shard_inode); ++ } ++ UNLOCK(&shard_inode->lock); ++ UNLOCK(&base_inode->lock); + +- /* Unref the base inode corresponding to the ref above, if the shard is +- * found to be already part of the fsync list. +- */ +- if (ret != 0) { +- inode_unref(base_inode); +- inode_unref(shard_inode); +- } +- return ret; ++ /* Unref the base inode corresponding to the ref above, if the shard is ++ * found to be already part of the fsync list. ++ */ ++ if (ret != 0) { ++ inode_unref(base_inode); ++ inode_unref(shard_inode); ++ } ++ return ret; + } + +-gf_boolean_t __shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) { +- int ret = -1; +- shard_inode_ctx_t *ctx = NULL; ++gf_boolean_t ++__shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __shard_inode_ctx_get(inode, this, &ctx); +- /* If inode ctx get fails, better to err on the side of caution and +- * try again? Unless the failure is due to mem-allocation. +- */ +- if (ret) +- return _gf_true; ++ ret = __shard_inode_ctx_get(inode, this, &ctx); ++ /* If inode ctx get fails, better to err on the side of caution and ++ * try again? Unless the failure is due to mem-allocation. 
++ */ ++ if (ret) ++ return _gf_true; + +- return !ctx->refreshed; ++ return !ctx->refreshed; + } + +-gf_boolean_t shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) { +- gf_boolean_t flag = _gf_false; ++gf_boolean_t ++shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) ++{ ++ gf_boolean_t flag = _gf_false; + +- LOCK(&inode->lock); +- { flag = __shard_inode_ctx_needs_lookup(inode, this); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ flag = __shard_inode_ctx_needs_lookup(inode, this); ++ } ++ UNLOCK(&inode->lock); + +- return flag; ++ return flag; + } +-int __shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, +- struct iatt *stbuf) { +- int ret = -1; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __shard_inode_ctx_get(inode, this, &ctx); +- if (ret) +- return ret; ++ ret = __shard_inode_ctx_get(inode, this, &ctx); ++ if (ret) ++ return ret; + +- if ((stbuf->ia_size != ctx->stat.ia_size) || +- (stbuf->ia_blocks != ctx->stat.ia_blocks)) +- ctx->refresh = _gf_true; ++ if ((stbuf->ia_size != ctx->stat.ia_size) || ++ (stbuf->ia_blocks != ctx->stat.ia_blocks)) ++ ctx->refresh = _gf_true; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, +- struct iatt *stbuf) { +- int ret = -1; ++int ++shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_invalidate(inode, this, stbuf); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_invalidate(inode, this, stbuf); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, +- uint64_t *block_size) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, ++ uint64_t *block_size) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- *block_size = ctx->block_size; ++ *block_size = ctx->block_size; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, +- uint64_t *block_size) { +- int ret = -1; ++int ++shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, ++ uint64_t *block_size) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_get_block_size(inode, this, block_size); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_get_block_size(inode, this, block_size); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, +- int *fsync_count) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, ++ int *fsync_count) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- 
ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- *fsync_count = ctx->fsync_needed; ++ *fsync_count = ctx->fsync_needed; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, +- int *fsync_count) { +- int ret = -1; ++int ++shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, ++ int *fsync_count) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_get_fsync_count(inode, this, fsync_count); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_get_fsync_count(inode, this, fsync_count); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } +-int __shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, +- shard_inode_ctx_t *ctx_out) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, ++ shard_inode_ctx_t *ctx_out) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- memcpy(ctx_out, ctx, sizeof(shard_inode_ctx_t)); +- return 0; ++ memcpy(ctx_out, ctx, sizeof(shard_inode_ctx_t)); ++ return 0; + } + +-int shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, +- shard_inode_ctx_t *ctx_out) { +- int ret = -1; ++int ++shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, ++ shard_inode_ctx_t *ctx_out) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { ret = __shard_inode_ctx_get_all(inode, this, ctx_out); } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_get_all(inode, this, ctx_out); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-int __shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, +- struct iatt *buf, +- gf_boolean_t *need_refresh) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++int ++__shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, ++ struct iatt *buf, ++ gf_boolean_t *need_refresh) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- if (ctx->refresh == _gf_false) +- *buf = ctx->stat; +- else +- *need_refresh = _gf_true; ++ if (ctx->refresh == _gf_false) ++ *buf = ctx->stat; ++ else ++ *need_refresh = _gf_true; + +- return 0; ++ return 0; + } + +-int shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, +- struct iatt *buf, +- gf_boolean_t *need_refresh) { +- int ret = -1; ++int ++shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, ++ struct iatt *buf, ++ gf_boolean_t *need_refresh) ++{ ++ int ret = -1; + +- LOCK(&inode->lock); +- { +- ret = +- __shard_inode_ctx_fill_iatt_from_cache(inode, this, buf, need_refresh); +- } +- UNLOCK(&inode->lock); ++ LOCK(&inode->lock); ++ { ++ ret = __shard_inode_ctx_fill_iatt_from_cache(inode, this, buf, ++ need_refresh); ++ } ++ UNLOCK(&inode->lock); + +- return ret; ++ return ret; + } + +-void shard_local_wipe(shard_local_t *local) { +- 
int i = 0; +- int count = 0; ++void ++shard_local_wipe(shard_local_t *local) ++{ ++ int i = 0; ++ int count = 0; ++ ++ count = local->num_blocks; ++ ++ syncbarrier_destroy(&local->barrier); ++ loc_wipe(&local->loc); ++ loc_wipe(&local->dot_shard_loc); ++ loc_wipe(&local->dot_shard_rm_loc); ++ loc_wipe(&local->loc2); ++ loc_wipe(&local->tmp_loc); ++ loc_wipe(&local->int_inodelk.loc); ++ loc_wipe(&local->int_entrylk.loc); ++ loc_wipe(&local->newloc); ++ ++ if (local->name) ++ GF_FREE(local->name); ++ ++ if (local->int_entrylk.basename) ++ GF_FREE(local->int_entrylk.basename); ++ if (local->fd) ++ fd_unref(local->fd); + +- count = local->num_blocks; ++ if (local->xattr_req) ++ dict_unref(local->xattr_req); ++ if (local->xattr_rsp) ++ dict_unref(local->xattr_rsp); + +- syncbarrier_destroy(&local->barrier); +- loc_wipe(&local->loc); +- loc_wipe(&local->dot_shard_loc); +- loc_wipe(&local->dot_shard_rm_loc); +- loc_wipe(&local->loc2); +- loc_wipe(&local->tmp_loc); +- loc_wipe(&local->int_inodelk.loc); +- loc_wipe(&local->int_entrylk.loc); +- loc_wipe(&local->newloc); ++ for (i = 0; i < count; i++) { ++ if (!local->inode_list) ++ break; + +- if (local->name) +- GF_FREE(local->name); ++ if (local->inode_list[i]) ++ inode_unref(local->inode_list[i]); ++ } + +- if (local->int_entrylk.basename) +- GF_FREE(local->int_entrylk.basename); +- if (local->fd) +- fd_unref(local->fd); ++ GF_FREE(local->inode_list); + +- if (local->xattr_req) +- dict_unref(local->xattr_req); +- if (local->xattr_rsp) +- dict_unref(local->xattr_rsp); ++ GF_FREE(local->vector); ++ if (local->iobref) ++ iobref_unref(local->iobref); ++ if (local->list_inited) ++ gf_dirent_free(&local->entries_head); ++ if (local->inodelk_frame) ++ SHARD_STACK_DESTROY(local->inodelk_frame); ++ if (local->entrylk_frame) ++ SHARD_STACK_DESTROY(local->entrylk_frame); ++} + +- for (i = 0; i < count; i++) { +- if (!local->inode_list) +- break; +- +- if (local->inode_list[i]) +- inode_unref(local->inode_list[i]); +- } +- +- GF_FREE(local->inode_list); +- +- GF_FREE(local->vector); +- if (local->iobref) +- iobref_unref(local->iobref); +- if (local->list_inited) +- gf_dirent_free(&local->entries_head); +- if (local->inodelk_frame) +- SHARD_STACK_DESTROY(local->inodelk_frame); +- if (local->entrylk_frame) +- SHARD_STACK_DESTROY(local->entrylk_frame); +-} +- +-int shard_modify_size_and_block_count(struct iatt *stbuf, dict_t *dict) { +- int ret = -1; +- void *size_attr = NULL; +- uint64_t size_array[4]; +- +- ret = dict_get_ptr(dict, GF_XATTR_SHARD_FILE_SIZE, &size_attr); +- if (ret) { +- gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0, +- SHARD_MSG_INTERNAL_XATTR_MISSING, +- "Failed to " +- "get " GF_XATTR_SHARD_FILE_SIZE " for %s", +- uuid_utoa(stbuf->ia_gfid)); +- return ret; +- } ++int ++shard_modify_size_and_block_count(struct iatt *stbuf, dict_t *dict) ++{ ++ int ret = -1; ++ void *size_attr = NULL; ++ uint64_t size_array[4]; ++ ++ ret = dict_get_ptr(dict, GF_XATTR_SHARD_FILE_SIZE, &size_attr); ++ if (ret) { ++ gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0, ++ SHARD_MSG_INTERNAL_XATTR_MISSING, ++ "Failed to " ++ "get " GF_XATTR_SHARD_FILE_SIZE " for %s", ++ uuid_utoa(stbuf->ia_gfid)); ++ return ret; ++ } + +- memcpy(size_array, size_attr, sizeof(size_array)); ++ memcpy(size_array, size_attr, sizeof(size_array)); + +- stbuf->ia_size = ntoh64(size_array[0]); +- stbuf->ia_blocks = ntoh64(size_array[2]); ++ stbuf->ia_size = ntoh64(size_array[0]); ++ stbuf->ia_blocks = ntoh64(size_array[2]); + +- return 0; ++ return 0; + } + +-int 
shard_call_count_return(call_frame_t *frame) { +- int call_count = 0; +- shard_local_t *local = NULL; ++int ++shard_call_count_return(call_frame_t *frame) ++{ ++ int call_count = 0; ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- LOCK(&frame->lock); +- { call_count = --local->call_count; } +- UNLOCK(&frame->lock); ++ LOCK(&frame->lock); ++ { ++ call_count = --local->call_count; ++ } ++ UNLOCK(&frame->lock); + +- return call_count; ++ return call_count; + } + +-static char *shard_internal_dir_string(shard_internal_dir_type_t type) { +- char *str = NULL; +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- str = GF_SHARD_DIR; +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- str = GF_SHARD_REMOVE_ME_DIR; +- break; +- default: +- break; +- } +- return str; ++static char * ++shard_internal_dir_string(shard_internal_dir_type_t type) ++{ ++ char *str = NULL; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ str = GF_SHARD_DIR; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ str = GF_SHARD_REMOVE_ME_DIR; ++ break; ++ default: ++ break; ++ } ++ return str; + } + +-static int shard_init_internal_dir_loc(xlator_t *this, shard_local_t *local, +- shard_internal_dir_type_t type) { +- int ret = -1; +- char *bname = NULL; +- inode_t *parent = NULL; +- loc_t *internal_dir_loc = NULL; +- shard_priv_t *priv = NULL; ++static int ++shard_init_internal_dir_loc(xlator_t *this, shard_local_t *local, ++ shard_internal_dir_type_t type) ++{ ++ int ret = -1; ++ char *bname = NULL; ++ inode_t *parent = NULL; ++ loc_t *internal_dir_loc = NULL; ++ shard_priv_t *priv = NULL; + +- priv = this->private; +- if (!local) +- return -1; ++ priv = this->private; ++ if (!local) ++ return -1; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ internal_dir_loc = &local->dot_shard_loc; ++ bname = GF_SHARD_DIR; ++ parent = inode_ref(this->itable->root); ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ internal_dir_loc = &local->dot_shard_rm_loc; ++ bname = GF_SHARD_REMOVE_ME_DIR; ++ parent = inode_ref(priv->dot_shard_inode); ++ break; ++ default: ++ break; ++ } + +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- internal_dir_loc = &local->dot_shard_loc; +- bname = GF_SHARD_DIR; +- parent = inode_ref(this->itable->root); +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- internal_dir_loc = &local->dot_shard_rm_loc; +- bname = GF_SHARD_REMOVE_ME_DIR; +- parent = inode_ref(priv->dot_shard_inode); +- break; +- default: +- break; +- } +- +- internal_dir_loc->inode = inode_new(this->itable); +- internal_dir_loc->parent = parent; +- ret = inode_path(internal_dir_loc->parent, bname, +- (char **)&internal_dir_loc->path); +- if (ret < 0 || !(internal_dir_loc->inode)) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on %s", bname); +- goto out; +- } +- +- internal_dir_loc->name = strrchr(internal_dir_loc->path, '/'); +- if (internal_dir_loc->name) +- internal_dir_loc->name++; +- +- ret = 0; +-out: +- return ret; +-} +- +-inode_t *__shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this, +- inode_t *base_inode, int block_num, +- uuid_t gfid) { +- char block_bname[256] = { +- 0, +- }; +- inode_t *lru_inode = NULL; +- shard_priv_t *priv = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_inode_ctx_t *lru_inode_ctx = NULL; +- shard_inode_ctx_t *lru_base_inode_ctx = NULL; +- inode_t *fsync_inode = NULL; +- inode_t *lru_base_inode = NULL; +- gf_boolean_t do_fsync = 
_gf_false; +- +- priv = this->private; +- +- shard_inode_ctx_get(linked_inode, this, &ctx); +- +- if (list_empty(&ctx->ilist)) { +- if (priv->inode_count + 1 <= priv->lru_limit) { +- /* If this inode was linked here for the first time (indicated +- * by empty list), and if there is still space in the priv list, +- * add this ctx to the tail of the list. +- */ +- /* For as long as an inode is in lru list, we try to +- * keep it alive by holding a ref on it. +- */ +- inode_ref(linked_inode); +- if (base_inode) +- gf_uuid_copy(ctx->base_gfid, base_inode->gfid); +- else +- gf_uuid_copy(ctx->base_gfid, gfid); +- ctx->block_num = block_num; +- list_add_tail(&ctx->ilist, &priv->ilist_head); +- priv->inode_count++; +- ctx->base_inode = inode_ref(base_inode); +- } else { +- /*If on the other hand there is no available slot for this inode +- * in the list, delete the lru inode from the head of the list, +- * unlink it. And in its place add this new inode into the list. +- */ +- lru_inode_ctx = +- list_first_entry(&priv->ilist_head, shard_inode_ctx_t, ilist); +- GF_ASSERT(lru_inode_ctx->block_num > 0); +- lru_base_inode = lru_inode_ctx->base_inode; +- list_del_init(&lru_inode_ctx->ilist); +- lru_inode = inode_find(linked_inode->table, lru_inode_ctx->stat.ia_gfid); +- /* If the lru inode was part of the pending-fsync list, +- * the base inode needs to be unref'd, the lru inode +- * deleted from fsync list and fsync'd in a new frame, +- * and then unlinked in memory and forgotten. +- */ +- if (!lru_base_inode) +- goto after_fsync_check; +- LOCK(&lru_base_inode->lock); +- LOCK(&lru_inode->lock); +- { +- if (!list_empty(&lru_inode_ctx->to_fsync_list)) { +- list_del_init(&lru_inode_ctx->to_fsync_list); +- lru_inode_ctx->fsync_needed = 0; +- do_fsync = _gf_true; +- __shard_inode_ctx_get(lru_base_inode, this, &lru_base_inode_ctx); +- lru_base_inode_ctx->fsync_count--; +- } +- } +- UNLOCK(&lru_inode->lock); +- UNLOCK(&lru_base_inode->lock); +- +- after_fsync_check: +- if (!do_fsync) { +- shard_make_block_bname(lru_inode_ctx->block_num, +- lru_inode_ctx->base_gfid, block_bname, +- sizeof(block_bname)); +- /* The following unref corresponds to the ref held at +- * the time the shard was added to the lru list. +- */ +- inode_unref(lru_inode); +- inode_unlink(lru_inode, priv->dot_shard_inode, block_bname); +- inode_forget(lru_inode, 0); +- } else { +- /* The following unref corresponds to the ref +- * held when the shard was added to fsync list. +- */ +- inode_unref(lru_inode); +- fsync_inode = lru_inode; +- if (lru_base_inode) +- inode_unref(lru_base_inode); +- } +- /* The following unref corresponds to the ref +- * held by inode_find() above. +- */ +- inode_unref(lru_inode); +- +- /* The following unref corresponds to the ref held on the base shard +- * at the time of adding shard inode to lru list +- */ +- if (lru_base_inode) +- inode_unref(lru_base_inode); +- +- /* For as long as an inode is in lru list, we try to +- * keep it alive by holding a ref on it. +- */ +- inode_ref(linked_inode); +- if (base_inode) +- gf_uuid_copy(ctx->base_gfid, base_inode->gfid); +- else +- gf_uuid_copy(ctx->base_gfid, gfid); +- ctx->block_num = block_num; +- ctx->base_inode = inode_ref(base_inode); +- list_add_tail(&ctx->ilist, &priv->ilist_head); +- } +- } else { +- /* If this is not the first time this inode is being operated on, move +- * it to the most recently used end of the list. 
+- */ +- list_move_tail(&ctx->ilist, &priv->ilist_head); +- } +- return fsync_inode; +-} +- +-int shard_common_failure_unwind(glusterfs_fop_t fop, call_frame_t *frame, +- int32_t op_ret, int32_t op_errno) { +- switch (fop) { +- case GF_FOP_LOOKUP: +- SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, NULL, NULL, NULL, NULL); +- break; +- case GF_FOP_STAT: +- SHARD_STACK_UNWIND(stat, frame, op_ret, op_errno, NULL, NULL); +- break; +- case GF_FOP_FSTAT: +- SHARD_STACK_UNWIND(fstat, frame, op_ret, op_errno, NULL, NULL); +- break; +- case GF_FOP_TRUNCATE: +- SHARD_STACK_UNWIND(truncate, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_FTRUNCATE: +- SHARD_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_MKNOD: +- SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, +- NULL); +- break; +- case GF_FOP_LINK: +- SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, +- NULL); +- break; +- case GF_FOP_CREATE: +- SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, +- NULL, NULL); +- break; +- case GF_FOP_UNLINK: +- SHARD_STACK_UNWIND(unlink, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_RENAME: +- SHARD_STACK_UNWIND(rename, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, +- NULL, NULL); +- break; +- case GF_FOP_WRITE: +- SHARD_STACK_UNWIND(writev, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_FALLOCATE: +- SHARD_STACK_UNWIND(fallocate, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_ZEROFILL: +- SHARD_STACK_UNWIND(zerofill, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_DISCARD: +- SHARD_STACK_UNWIND(discard, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_READ: +- SHARD_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, -1, NULL, NULL, +- NULL); +- break; +- case GF_FOP_FSYNC: +- SHARD_STACK_UNWIND(fsync, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_REMOVEXATTR: +- SHARD_STACK_UNWIND(removexattr, frame, op_ret, op_errno, NULL); +- break; +- case GF_FOP_FREMOVEXATTR: +- SHARD_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, NULL); +- break; +- case GF_FOP_FGETXATTR: +- SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, NULL, NULL); +- break; +- case GF_FOP_GETXATTR: +- SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, NULL, NULL); +- break; +- case GF_FOP_FSETXATTR: +- SHARD_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL); +- break; +- case GF_FOP_SETXATTR: +- SHARD_STACK_UNWIND(setxattr, frame, op_ret, op_errno, NULL); +- break; +- case GF_FOP_SETATTR: +- SHARD_STACK_UNWIND(setattr, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_FSETATTR: +- SHARD_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, NULL, NULL, NULL); +- break; +- case GF_FOP_SEEK: +- SHARD_STACK_UNWIND(seek, frame, op_ret, op_errno, 0, NULL); +- break; +- default: +- gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "Invalid fop id = %d", fop); +- break; +- } +- return 0; +-} +- +-int shard_common_inode_write_success_unwind(glusterfs_fop_t fop, +- call_frame_t *frame, +- int32_t op_ret) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- switch (fop) { +- case GF_FOP_WRITE: +- SHARD_STACK_UNWIND(writev, frame, op_ret, 0, &local->prebuf, +- &local->postbuf, local->xattr_rsp); +- break; +- case GF_FOP_FALLOCATE: +- SHARD_STACK_UNWIND(fallocate, frame, op_ret, 0, &local->prebuf, +- &local->postbuf, 
local->xattr_rsp); +- break; +- case GF_FOP_ZEROFILL: +- SHARD_STACK_UNWIND(zerofill, frame, op_ret, 0, &local->prebuf, +- &local->postbuf, local->xattr_rsp); +- break; +- case GF_FOP_DISCARD: +- SHARD_STACK_UNWIND(discard, frame, op_ret, 0, &local->prebuf, +- &local->postbuf, local->xattr_rsp); +- break; +- default: +- gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "Invalid fop id = %d", fop); +- break; +- } +- return 0; +-} +- +-int shard_evicted_inode_fsync_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, struct iatt *prebuf, +- struct iatt *postbuf, dict_t *xdata) { +- char block_bname[256] = { +- 0, +- }; +- fd_t *anon_fd = cookie; +- inode_t *shard_inode = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- +- if (anon_fd == NULL || op_ret < 0) { +- gf_msg(this->name, GF_LOG_WARNING, op_errno, SHARD_MSG_MEMALLOC_FAILED, +- "fsync failed on shard"); +- goto out; +- } +- shard_inode = anon_fd->inode; +- +- LOCK(&priv->lock); +- LOCK(&shard_inode->lock); +- { +- __shard_inode_ctx_get(shard_inode, this, &ctx); +- if ((list_empty(&ctx->to_fsync_list)) && (list_empty(&ctx->ilist))) { +- shard_make_block_bname(ctx->block_num, shard_inode->gfid, block_bname, +- sizeof(block_bname)); +- inode_unlink(shard_inode, priv->dot_shard_inode, block_bname); +- /* The following unref corresponds to the ref held by +- * inode_link() at the time the shard was created or +- * looked up +- */ +- inode_unref(shard_inode); +- inode_forget(shard_inode, 0); +- } +- } +- UNLOCK(&shard_inode->lock); +- UNLOCK(&priv->lock); ++ internal_dir_loc->inode = inode_new(this->itable); ++ internal_dir_loc->parent = parent; ++ ret = inode_path(internal_dir_loc->parent, bname, ++ (char **)&internal_dir_loc->path); ++ if (ret < 0 || !(internal_dir_loc->inode)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", bname); ++ goto out; ++ } ++ ++ internal_dir_loc->name = strrchr(internal_dir_loc->path, '/'); ++ if (internal_dir_loc->name) ++ internal_dir_loc->name++; + ++ ret = 0; + out: +- if (anon_fd) +- fd_unref(anon_fd); +- STACK_DESTROY(frame->root); +- return 0; ++ return ret; + } + +-int shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode) { +- fd_t *anon_fd = NULL; +- call_frame_t *fsync_frame = NULL; +- +- fsync_frame = create_frame(this, this->ctx->pool); +- if (!fsync_frame) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create new frame " +- "to fsync shard"); +- return -1; +- } +- +- anon_fd = fd_anonymous(inode); +- if (!anon_fd) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create anon fd to" +- " fsync shard"); +- STACK_DESTROY(fsync_frame->root); +- return -1; +- } +- +- STACK_WIND_COOKIE(fsync_frame, shard_evicted_inode_fsync_cbk, anon_fd, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, anon_fd, +- 1, NULL); +- return 0; +-} +- +-int shard_common_resolve_shards( +- call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t post_res_handler) { +- int i = -1; +- uint32_t shard_idx_iter = 0; +- char path[PATH_MAX] = { +- 0, +- }; +- uuid_t gfid = { +- 0, +- }; +- inode_t *inode = NULL; +- inode_t *res_inode = NULL; +- inode_t *fsync_inode = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- +- priv = this->private; +- local = frame->local; +- local->call_count = 0; +- shard_idx_iter = local->first_block; +- res_inode = 
local->resolver_base_inode; +- if (res_inode) +- gf_uuid_copy(gfid, res_inode->gfid); +- else +- gf_uuid_copy(gfid, local->base_gfid); +- +- if ((local->op_ret < 0) || (local->resolve_not)) +- goto out; +- +- while (shard_idx_iter <= local->last_block) { +- i++; +- if (shard_idx_iter == 0) { +- local->inode_list[i] = inode_ref(res_inode); +- shard_idx_iter++; +- continue; +- } +- +- shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); +- +- inode = NULL; +- inode = inode_resolve(this->itable, path); +- if (inode) { +- gf_msg_debug(this->name, 0, "Shard %d already " +- "present. gfid=%s. Saving inode for future.", +- shard_idx_iter, uuid_utoa(inode->gfid)); +- local->inode_list[i] = inode; +- /* Let the ref on the inodes that are already present +- * in inode table still be held so that they don't get +- * forgotten by the time the fop reaches the actual +- * write stage. +- */ +- LOCK(&priv->lock); +- { +- fsync_inode = __shard_update_shards_inode_list(inode, this, res_inode, +- shard_idx_iter, gfid); +- } +- UNLOCK(&priv->lock); +- shard_idx_iter++; +- if (fsync_inode) +- shard_initiate_evicted_inode_fsync(this, fsync_inode); +- continue; ++inode_t * ++__shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this, ++ inode_t *base_inode, int block_num, ++ uuid_t gfid) ++{ ++ char block_bname[256] = { ++ 0, ++ }; ++ inode_t *lru_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *lru_inode_ctx = NULL; ++ shard_inode_ctx_t *lru_base_inode_ctx = NULL; ++ inode_t *fsync_inode = NULL; ++ inode_t *lru_base_inode = NULL; ++ gf_boolean_t do_fsync = _gf_false; ++ ++ priv = this->private; ++ ++ shard_inode_ctx_get(linked_inode, this, &ctx); ++ ++ if (list_empty(&ctx->ilist)) { ++ if (priv->inode_count + 1 <= priv->lru_limit) { ++ /* If this inode was linked here for the first time (indicated ++ * by empty list), and if there is still space in the priv list, ++ * add this ctx to the tail of the list. ++ */ ++ /* For as long as an inode is in lru list, we try to ++ * keep it alive by holding a ref on it. ++ */ ++ inode_ref(linked_inode); ++ if (base_inode) ++ gf_uuid_copy(ctx->base_gfid, base_inode->gfid); ++ else ++ gf_uuid_copy(ctx->base_gfid, gfid); ++ ctx->block_num = block_num; ++ list_add_tail(&ctx->ilist, &priv->ilist_head); ++ priv->inode_count++; ++ ctx->base_inode = inode_ref(base_inode); ++ } else { ++ /*If on the other hand there is no available slot for this inode ++ * in the list, delete the lru inode from the head of the list, ++ * unlink it. And in its place add this new inode into the list. ++ */ ++ lru_inode_ctx = list_first_entry(&priv->ilist_head, ++ shard_inode_ctx_t, ilist); ++ GF_ASSERT(lru_inode_ctx->block_num > 0); ++ lru_base_inode = lru_inode_ctx->base_inode; ++ list_del_init(&lru_inode_ctx->ilist); ++ lru_inode = inode_find(linked_inode->table, ++ lru_inode_ctx->stat.ia_gfid); ++ /* If the lru inode was part of the pending-fsync list, ++ * the base inode needs to be unref'd, the lru inode ++ * deleted from fsync list and fsync'd in a new frame, ++ * and then unlinked in memory and forgotten. 
++ */ ++ if (!lru_base_inode) ++ goto after_fsync_check; ++ LOCK(&lru_base_inode->lock); ++ LOCK(&lru_inode->lock); ++ { ++ if (!list_empty(&lru_inode_ctx->to_fsync_list)) { ++ list_del_init(&lru_inode_ctx->to_fsync_list); ++ lru_inode_ctx->fsync_needed = 0; ++ do_fsync = _gf_true; ++ __shard_inode_ctx_get(lru_base_inode, this, ++ &lru_base_inode_ctx); ++ lru_base_inode_ctx->fsync_count--; ++ } ++ } ++ UNLOCK(&lru_inode->lock); ++ UNLOCK(&lru_base_inode->lock); ++ ++ after_fsync_check: ++ if (!do_fsync) { ++ shard_make_block_bname(lru_inode_ctx->block_num, ++ lru_inode_ctx->base_gfid, block_bname, ++ sizeof(block_bname)); ++ /* The following unref corresponds to the ref held at ++ * the time the shard was added to the lru list. ++ */ ++ inode_unref(lru_inode); ++ inode_unlink(lru_inode, priv->dot_shard_inode, block_bname); ++ inode_forget(lru_inode, 0); ++ } else { ++ /* The following unref corresponds to the ref ++ * held when the shard was added to fsync list. ++ */ ++ inode_unref(lru_inode); ++ fsync_inode = lru_inode; ++ if (lru_base_inode) ++ inode_unref(lru_base_inode); ++ } ++ /* The following unref corresponds to the ref ++ * held by inode_find() above. ++ */ ++ inode_unref(lru_inode); ++ ++ /* The following unref corresponds to the ref held on the base shard ++ * at the time of adding shard inode to lru list ++ */ ++ if (lru_base_inode) ++ inode_unref(lru_base_inode); ++ ++ /* For as long as an inode is in lru list, we try to ++ * keep it alive by holding a ref on it. ++ */ ++ inode_ref(linked_inode); ++ if (base_inode) ++ gf_uuid_copy(ctx->base_gfid, base_inode->gfid); ++ else ++ gf_uuid_copy(ctx->base_gfid, gfid); ++ ctx->block_num = block_num; ++ ctx->base_inode = inode_ref(base_inode); ++ list_add_tail(&ctx->ilist, &priv->ilist_head); ++ } + } else { +- local->call_count++; +- shard_idx_iter++; ++ /* If this is not the first time this inode is being operated on, move ++ * it to the most recently used end of the list. 
++ */ ++ list_move_tail(&ctx->ilist, &priv->ilist_head); + } +- } +-out: +- post_res_handler(frame, this); +- return 0; ++ return fsync_inode; + } + +-int shard_update_file_size_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, int32_t op_errno, +- dict_t *dict, dict_t *xdata) { +- inode_t *inode = NULL; +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if ((local->fd) && (local->fd->inode)) +- inode = local->fd->inode; +- else if (local->loc.inode) +- inode = local->loc.inode; +- +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, +- SHARD_MSG_UPDATE_FILE_SIZE_FAILED, "Update to file size" +- " xattr failed on %s", +- uuid_utoa(inode->gfid)); +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto err; +- } +- +- if (shard_modify_size_and_block_count(&local->postbuf, dict)) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto err; +- } +-err: +- local->post_update_size_handler(frame, this); +- return 0; ++int ++shard_common_failure_unwind(glusterfs_fop_t fop, call_frame_t *frame, ++ int32_t op_ret, int32_t op_errno) ++{ ++ switch (fop) { ++ case GF_FOP_LOOKUP: ++ SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, NULL, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_STAT: ++ SHARD_STACK_UNWIND(stat, frame, op_ret, op_errno, NULL, NULL); ++ break; ++ case GF_FOP_FSTAT: ++ SHARD_STACK_UNWIND(fstat, frame, op_ret, op_errno, NULL, NULL); ++ break; ++ case GF_FOP_TRUNCATE: ++ SHARD_STACK_UNWIND(truncate, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_FTRUNCATE: ++ SHARD_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_MKNOD: ++ SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_LINK: ++ SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, NULL, NULL, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_CREATE: ++ SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, ++ NULL, NULL, NULL, NULL); ++ break; ++ case GF_FOP_UNLINK: ++ SHARD_STACK_UNWIND(unlink, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_RENAME: ++ SHARD_STACK_UNWIND(rename, frame, op_ret, op_errno, NULL, NULL, ++ NULL, NULL, NULL, NULL); ++ break; ++ case GF_FOP_WRITE: ++ SHARD_STACK_UNWIND(writev, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_FALLOCATE: ++ SHARD_STACK_UNWIND(fallocate, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_ZEROFILL: ++ SHARD_STACK_UNWIND(zerofill, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_DISCARD: ++ SHARD_STACK_UNWIND(discard, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_READ: ++ SHARD_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, -1, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_FSYNC: ++ SHARD_STACK_UNWIND(fsync, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_REMOVEXATTR: ++ SHARD_STACK_UNWIND(removexattr, frame, op_ret, op_errno, NULL); ++ break; ++ case GF_FOP_FREMOVEXATTR: ++ SHARD_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, NULL); ++ break; ++ case GF_FOP_FGETXATTR: ++ SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, NULL, NULL); ++ break; ++ case GF_FOP_GETXATTR: ++ SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, NULL, NULL); ++ break; ++ case GF_FOP_FSETXATTR: ++ SHARD_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL); ++ break; ++ case GF_FOP_SETXATTR: ++ SHARD_STACK_UNWIND(setxattr, frame, op_ret, 
op_errno, NULL); ++ break; ++ case GF_FOP_SETATTR: ++ SHARD_STACK_UNWIND(setattr, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_FSETATTR: ++ SHARD_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_SEEK: ++ SHARD_STACK_UNWIND(seek, frame, op_ret, op_errno, 0, NULL); ++ break; ++ default: ++ gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "Invalid fop id = %d", fop); ++ break; ++ } ++ return 0; + } + +-int shard_set_size_attrs(int64_t size, int64_t block_count, +- int64_t **size_attr_p) { +- int ret = -1; +- int64_t *size_attr = NULL; ++int ++shard_common_inode_write_success_unwind(glusterfs_fop_t fop, ++ call_frame_t *frame, int32_t op_ret) ++{ ++ shard_local_t *local = NULL; + +- if (!size_attr_p) +- goto out; ++ local = frame->local; + +- size_attr = GF_CALLOC(4, sizeof(int64_t), gf_shard_mt_int64_t); +- if (!size_attr) +- goto out; ++ switch (fop) { ++ case GF_FOP_WRITE: ++ SHARD_STACK_UNWIND(writev, frame, op_ret, 0, &local->prebuf, ++ &local->postbuf, local->xattr_rsp); ++ break; ++ case GF_FOP_FALLOCATE: ++ SHARD_STACK_UNWIND(fallocate, frame, op_ret, 0, &local->prebuf, ++ &local->postbuf, local->xattr_rsp); ++ break; ++ case GF_FOP_ZEROFILL: ++ SHARD_STACK_UNWIND(zerofill, frame, op_ret, 0, &local->prebuf, ++ &local->postbuf, local->xattr_rsp); ++ break; ++ case GF_FOP_DISCARD: ++ SHARD_STACK_UNWIND(discard, frame, op_ret, 0, &local->prebuf, ++ &local->postbuf, local->xattr_rsp); ++ break; ++ default: ++ gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "Invalid fop id = %d", fop); ++ break; ++ } ++ return 0; ++} + +- size_attr[0] = hton64(size); +- /* As sharding evolves, it _may_ be necessary to embed more pieces of +- * information within the same xattr. So allocating slots for them in +- * advance. For now, only bytes 0-63 and 128-191 which would make up the +- * current size and block count respectively of the file are valid. 
+- */ +- size_attr[2] = hton64(block_count); ++int ++shard_evicted_inode_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *prebuf, struct iatt *postbuf, ++ dict_t *xdata) ++{ ++ char block_bname[256] = { ++ 0, ++ }; ++ fd_t *anon_fd = cookie; ++ inode_t *shard_inode = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_priv_t *priv = NULL; + +- *size_attr_p = size_attr; ++ priv = this->private; + +- ret = 0; +-out: +- return ret; +-} ++ if (anon_fd == NULL || op_ret < 0) { ++ gf_msg(this->name, GF_LOG_WARNING, op_errno, SHARD_MSG_MEMALLOC_FAILED, ++ "fsync failed on shard"); ++ goto out; ++ } ++ shard_inode = anon_fd->inode; + +-int shard_update_file_size(call_frame_t *frame, xlator_t *this, fd_t *fd, +- loc_t *loc, +- shard_post_update_size_fop_handler_t handler) { +- int ret = -1; +- int64_t *size_attr = NULL; +- int64_t delta_blocks = 0; +- inode_t *inode = NULL; +- shard_local_t *local = NULL; +- dict_t *xattr_req = NULL; ++ LOCK(&priv->lock); ++ LOCK(&shard_inode->lock); ++ { ++ __shard_inode_ctx_get(shard_inode, this, &ctx); ++ if ((list_empty(&ctx->to_fsync_list)) && (list_empty(&ctx->ilist))) { ++ shard_make_block_bname(ctx->block_num, shard_inode->gfid, ++ block_bname, sizeof(block_bname)); ++ inode_unlink(shard_inode, priv->dot_shard_inode, block_bname); ++ /* The following unref corresponds to the ref held by ++ * inode_link() at the time the shard was created or ++ * looked up ++ */ ++ inode_unref(shard_inode); ++ inode_forget(shard_inode, 0); ++ } ++ } ++ UNLOCK(&shard_inode->lock); ++ UNLOCK(&priv->lock); + +- local = frame->local; +- local->post_update_size_handler = handler; ++out: ++ if (anon_fd) ++ fd_unref(anon_fd); ++ STACK_DESTROY(frame->root); ++ return 0; ++} + +- xattr_req = dict_new(); +- if (!xattr_req) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto out; +- } +- +- if (fd) +- inode = fd->inode; +- else +- inode = loc->inode; +- +- /* If both size and block count have not changed, then skip the xattrop. +- */ +- delta_blocks = GF_ATOMIC_GET(local->delta_blocks); +- if ((local->delta_size + local->hole_size == 0) && (delta_blocks == 0)) { +- goto out; +- } +- +- ret = shard_set_size_attrs(local->delta_size + local->hole_size, delta_blocks, +- &size_attr); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SIZE_SET_FAILED, +- "Failed to set size attrs for %s", uuid_utoa(inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto out; +- } +- +- ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_FILE_SIZE, size_attr, 8 * 4); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set key %s into dict. 
gfid=%s", GF_XATTR_SHARD_FILE_SIZE, +- uuid_utoa(inode->gfid)); +- GF_FREE(size_attr); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto out; +- } ++int ++shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode) ++{ ++ fd_t *anon_fd = NULL; ++ call_frame_t *fsync_frame = NULL; ++ ++ fsync_frame = create_frame(this, this->ctx->pool); ++ if (!fsync_frame) { ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create new frame " ++ "to fsync shard"); ++ return -1; ++ } + +- if (fd) +- STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fxattrop, fd, GF_XATTROP_ADD_ARRAY64, +- xattr_req, NULL); +- else +- STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->xattrop, loc, GF_XATTROP_ADD_ARRAY64, +- xattr_req, NULL); ++ anon_fd = fd_anonymous(inode); ++ if (!anon_fd) { ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create anon fd to" ++ " fsync shard"); ++ STACK_DESTROY(fsync_frame->root); ++ return -1; ++ } + +- dict_unref(xattr_req); +- return 0; ++ STACK_WIND_COOKIE(fsync_frame, shard_evicted_inode_fsync_cbk, anon_fd, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, ++ anon_fd, 1, NULL); ++ return 0; ++} + +-out: +- if (xattr_req) +- dict_unref(xattr_req); +- handler(frame, this); +- return 0; +-} +- +-static inode_t *shard_link_internal_dir_inode(shard_local_t *local, +- inode_t *inode, struct iatt *buf, +- shard_internal_dir_type_t type) { +- inode_t *linked_inode = NULL; +- shard_priv_t *priv = NULL; +- char *bname = NULL; +- inode_t **priv_inode = NULL; +- inode_t *parent = NULL; +- +- priv = THIS->private; +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- bname = GF_SHARD_DIR; +- priv_inode = &priv->dot_shard_inode; +- parent = inode->table->root; +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- bname = GF_SHARD_REMOVE_ME_DIR; +- priv_inode = &priv->dot_shard_rm_inode; +- parent = priv->dot_shard_inode; +- break; +- default: +- break; +- } +- +- linked_inode = inode_link(inode, parent, bname, buf); +- inode_lookup(linked_inode); +- *priv_inode = linked_inode; +- return linked_inode; +-} +- +-int shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) { +- shard_local_t *local = NULL; +- inode_t *linked_inode = NULL; +- shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; +- +- local = frame->local; +- +- if (op_ret) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto out; +- } +- +- /* To-Do: Fix refcount increment per call to +- * shard_link_internal_dir_inode(). 
+- */ +- linked_inode = shard_link_internal_dir_inode(local, inode, buf, type); +- shard_inode_ctx_mark_dir_refreshed(linked_inode, this); +-out: +- shard_common_resolve_shards(frame, this, local->post_res_handler); +- return 0; +-} +- +-int shard_refresh_internal_dir(call_frame_t *frame, xlator_t *this, +- shard_internal_dir_type_t type) { +- loc_t loc = { +- 0, +- }; +- inode_t *inode = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- uuid_t gfid = { +- 0, +- }; +- +- local = frame->local; +- priv = this->private; +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- gf_uuid_copy(gfid, priv->dot_shard_gfid); +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); +- break; +- default: +- break; +- } +- +- inode = inode_find(this->itable, gfid); +- +- if (!shard_inode_ctx_needs_lookup(inode, this)) { +- local->op_ret = 0; +- goto out; +- } ++int ++shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t post_res_handler) ++{ ++ int i = -1; ++ uint32_t shard_idx_iter = 0; ++ char path[PATH_MAX] = { ++ 0, ++ }; ++ uuid_t gfid = { ++ 0, ++ }; ++ inode_t *inode = NULL; ++ inode_t *res_inode = NULL; ++ inode_t *fsync_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; + +- /* Plain assignment because the ref is already taken above through +- * call to inode_find() +- */ +- loc.inode = inode; +- gf_uuid_copy(loc.gfid, gfid); ++ priv = this->private; ++ local = frame->local; ++ local->call_count = 0; ++ shard_idx_iter = local->first_block; ++ res_inode = local->resolver_base_inode; ++ if (res_inode) ++ gf_uuid_copy(gfid, res_inode->gfid); ++ else ++ gf_uuid_copy(gfid, local->base_gfid); + +- STACK_WIND_COOKIE(frame, shard_refresh_internal_dir_cbk, (void *)(long)type, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, &loc, +- NULL); +- loc_wipe(&loc); ++ if ((local->op_ret < 0) || (local->resolve_not)) ++ goto out; + +- return 0; ++ while (shard_idx_iter <= local->last_block) { ++ i++; ++ if (shard_idx_iter == 0) { ++ local->inode_list[i] = inode_ref(res_inode); ++ shard_idx_iter++; ++ continue; ++ } + ++ shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); ++ ++ inode = NULL; ++ inode = inode_resolve(this->itable, path); ++ if (inode) { ++ gf_msg_debug(this->name, 0, ++ "Shard %d already " ++ "present. gfid=%s. Saving inode for future.", ++ shard_idx_iter, uuid_utoa(inode->gfid)); ++ local->inode_list[i] = inode; ++ /* Let the ref on the inodes that are already present ++ * in inode table still be held so that they don't get ++ * forgotten by the time the fop reaches the actual ++ * write stage. 
++ */ ++ LOCK(&priv->lock); ++ { ++ fsync_inode = __shard_update_shards_inode_list( ++ inode, this, res_inode, shard_idx_iter, gfid); ++ } ++ UNLOCK(&priv->lock); ++ shard_idx_iter++; ++ if (fsync_inode) ++ shard_initiate_evicted_inode_fsync(this, fsync_inode); ++ continue; ++ } else { ++ local->call_count++; ++ shard_idx_iter++; ++ } ++ } + out: +- shard_common_resolve_shards(frame, this, local->post_res_handler); +- return 0; ++ post_res_handler(frame, this); ++ return 0; + } + +-int shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) { +- inode_t *link_inode = NULL; +- shard_local_t *local = NULL; +- shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; ++int ++shard_update_file_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *dict, ++ dict_t *xdata) ++{ ++ inode_t *inode = NULL; ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (op_ret) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto unwind; +- } +- +- if (!IA_ISDIR(buf->ia_type)) { +- gf_msg(this->name, GF_LOG_CRITICAL, 0, SHARD_MSG_DOT_SHARD_NODIR, +- "%s already exists and " +- "is not a directory. Please remove it from all bricks " +- "and try again", +- shard_internal_dir_string(type)); +- local->op_ret = -1; +- local->op_errno = EIO; +- goto unwind; +- } +- +- link_inode = shard_link_internal_dir_inode(local, inode, buf, type); +- if (link_inode != inode) { +- shard_refresh_internal_dir(frame, this, type); +- } else { +- shard_inode_ctx_mark_dir_refreshed(link_inode, this); +- shard_common_resolve_shards(frame, this, local->post_res_handler); +- } +- return 0; ++ if ((local->fd) && (local->fd->inode)) ++ inode = local->fd->inode; ++ else if (local->loc.inode) ++ inode = local->loc.inode; + +-unwind: +- local->post_res_handler(frame, this); +- return 0; +-} +- +-int shard_lookup_internal_dir(call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t post_res_handler, +- shard_internal_dir_type_t type) { +- int ret = -1; +- dict_t *xattr_req = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- uuid_t *gfid = NULL; +- loc_t *loc = NULL; +- gf_boolean_t free_gfid = _gf_true; +- +- local = frame->local; +- priv = this->private; +- local->post_res_handler = post_res_handler; +- +- gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); +- if (!gfid) +- goto err; +- +- xattr_req = dict_new(); +- if (!xattr_req) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto err; +- } +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- gf_uuid_copy(*gfid, priv->dot_shard_gfid); +- loc = &local->dot_shard_loc; +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); +- loc = &local->dot_shard_rm_loc; +- break; +- default: +- bzero(*gfid, sizeof(uuid_t)); +- break; +- } +- +- ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set gfid of %s into dict", +- shard_internal_dir_string(type)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto err; +- } else { +- free_gfid = _gf_false; +- } +- +- STACK_WIND_COOKIE(frame, shard_lookup_internal_dir_cbk, (void *)(long)type, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc, +- xattr_req); +- +- 
dict_unref(xattr_req); +- return 0; +- +-err: +- if (xattr_req) +- dict_unref(xattr_req); +- if (free_gfid) +- GF_FREE(gfid); +- post_res_handler(frame, this); +- return 0; +-} +- +-static void shard_inode_ctx_update(inode_t *inode, xlator_t *this, +- dict_t *xdata, struct iatt *buf) { +- int ret = 0; +- uint64_t size = 0; +- void *bsize = NULL; +- +- if (shard_inode_ctx_get_block_size(inode, this, &size)) { +- /* Fresh lookup */ +- ret = dict_get_ptr(xdata, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); +- if (!ret) +- size = ntoh64(*((uint64_t *)bsize)); +- /* If the file is sharded, set its block size, otherwise just +- * set 0. +- */ +- +- shard_inode_ctx_set(inode, this, buf, size, SHARD_MASK_BLOCK_SIZE); +- } +- /* If the file is sharded, also set the remaining attributes, +- * except for ia_size and ia_blocks. +- */ +- if (size) { +- shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); +- (void)shard_inode_ctx_invalidate(inode, this, buf); +- } +-} +- +-int shard_delete_shards(void *opaque); +- +-int shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data); +- +-int shard_start_background_deletion(xlator_t *this) { +- int ret = 0; +- gf_boolean_t i_cleanup = _gf_true; +- shard_priv_t *priv = NULL; +- call_frame_t *cleanup_frame = NULL; +- +- priv = this->private; +- +- LOCK(&priv->lock); +- { +- switch (priv->bg_del_state) { +- case SHARD_BG_DELETION_NONE: +- i_cleanup = _gf_true; +- priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; +- break; +- case SHARD_BG_DELETION_LAUNCHING: +- i_cleanup = _gf_false; +- break; +- case SHARD_BG_DELETION_IN_PROGRESS: +- priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; +- i_cleanup = _gf_false; +- break; +- default: +- break; +- } +- } +- UNLOCK(&priv->lock); +- if (!i_cleanup) +- return 0; +- +- cleanup_frame = create_frame(this, this->ctx->pool); +- if (!cleanup_frame) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create " +- "new frame to delete shards"); +- ret = -ENOMEM; +- goto err; +- } +- +- set_lk_owner_from_ptr(&cleanup_frame->root->lk_owner, cleanup_frame->root); +- +- ret = synctask_new(this->ctx->env, shard_delete_shards, +- shard_delete_shards_cbk, cleanup_frame, cleanup_frame); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_WARNING, errno, SHARD_MSG_SHARDS_DELETION_FAILED, +- "failed to create task to do background " +- "cleanup of shards"); +- STACK_DESTROY(cleanup_frame->root); +- goto err; +- } +- return 0; ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_UPDATE_FILE_SIZE_FAILED, ++ "Update to file size" ++ " xattr failed on %s", ++ uuid_utoa(inode->gfid)); ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto err; ++ } + ++ if (shard_modify_size_and_block_count(&local->postbuf, dict)) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto err; ++ } + err: +- LOCK(&priv->lock); +- { priv->bg_del_state = SHARD_BG_DELETION_NONE; } +- UNLOCK(&priv->lock); +- return ret; ++ local->post_update_size_handler(frame, this); ++ return 0; + } + +-int shard_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, struct iatt *postparent) { +- int ret = -1; +- shard_priv_t *priv = NULL; +- gf_boolean_t i_start_cleanup = _gf_false; +- +- priv = this->private; +- +- if (op_ret < 0) +- goto unwind; +- +- if (IA_ISDIR(buf->ia_type)) +- goto unwind; +- +- /* Also, if the file is sharded, get the file size and block cnt xattr, +- * and store them in the stbuf 
appropriately. +- */ +- +- if (dict_get(xdata, GF_XATTR_SHARD_FILE_SIZE) && +- frame->root->pid != GF_CLIENT_PID_GSYNCD) +- shard_modify_size_and_block_count(buf, xdata); +- +- /* If this was a fresh lookup, there are two possibilities: +- * 1) If the file is sharded (indicated by the presence of block size +- * xattr), store this block size, along with rdev and mode in its +- * inode ctx. +- * 2) If the file is not sharded, store size along with rdev and mode +- * (which are anyway don't cares) in inode ctx. Since @ctx_tmp is +- * already initialised to all zeroes, nothing more needs to be done. +- */ ++int ++shard_set_size_attrs(int64_t size, int64_t block_count, int64_t **size_attr_p) ++{ ++ int ret = -1; ++ int64_t *size_attr = NULL; + +- (void)shard_inode_ctx_update(inode, this, xdata, buf); ++ if (!size_attr_p) ++ goto out; + +- LOCK(&priv->lock); +- { +- if (priv->first_lookup_done == _gf_false) { +- priv->first_lookup_done = _gf_true; +- i_start_cleanup = _gf_true; +- } +- } +- UNLOCK(&priv->lock); ++ size_attr = GF_CALLOC(4, sizeof(int64_t), gf_shard_mt_int64_t); ++ if (!size_attr) ++ goto out; + +- if (!i_start_cleanup) +- goto unwind; ++ size_attr[0] = hton64(size); ++ /* As sharding evolves, it _may_ be necessary to embed more pieces of ++ * information within the same xattr. So allocating slots for them in ++ * advance. For now, only bytes 0-63 and 128-191 which would make up the ++ * current size and block count respectively of the file are valid. ++ */ ++ size_attr[2] = hton64(block_count); + +- ret = shard_start_background_deletion(this); +- if (ret < 0) { +- LOCK(&priv->lock); +- { priv->first_lookup_done = _gf_false; } +- UNLOCK(&priv->lock); +- } ++ *size_attr_p = size_attr; + +-unwind: +- SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata, +- postparent); +- return 0; ++ ret = 0; ++out: ++ return ret; + } + +-int shard_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, +- dict_t *xattr_req) { +- int ret = -1; +- int32_t op_errno = ENOMEM; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; +- +- this->itable = loc->inode->table; +- if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && +- (frame->root->pid != GF_CLIENT_PID_GLFS_HEAL)) { +- SHARD_ENTRY_FOP_CHECK(loc, op_errno, err); +- } ++int ++shard_update_file_size(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ loc_t *loc, shard_post_update_size_fop_handler_t handler) ++{ ++ int ret = -1; ++ int64_t *size_attr = NULL; ++ int64_t delta_blocks = 0; ++ inode_t *inode = NULL; ++ shard_local_t *local = NULL; ++ dict_t *xattr_req = NULL; + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ local = frame->local; ++ local->post_update_size_handler = handler; + +- frame->local = local; ++ xattr_req = dict_new(); ++ if (!xattr_req) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto out; ++ } + +- loc_copy(&local->loc, loc); ++ if (fd) ++ inode = fd->inode; ++ else ++ inode = loc->inode; + +- local->xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new(); +- if (!local->xattr_req) +- goto err; ++ /* If both size and block count have not changed, then skip the xattrop. 
++ */ ++ delta_blocks = GF_ATOMIC_GET(local->delta_blocks); ++ if ((local->delta_size + local->hole_size == 0) && (delta_blocks == 0)) { ++ goto out; ++ } + +- if (shard_inode_ctx_get_block_size(loc->inode, this, &block_size)) { +- ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); ++ ret = shard_set_size_attrs(local->delta_size + local->hole_size, ++ delta_blocks, &size_attr); + if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set dict" +- " value: key:%s for path %s", +- GF_XATTR_SHARD_BLOCK_SIZE, loc->path); +- goto err; ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SIZE_SET_FAILED, ++ "Failed to set size attrs for %s", uuid_utoa(inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto out; + } +- } + +- if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { +- ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); ++ ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_FILE_SIZE, size_attr, 8 * 4); + if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set dict value: key:%s for path %s.", +- GF_XATTR_SHARD_FILE_SIZE, loc->path); +- goto err; ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set key %s into dict. gfid=%s", ++ GF_XATTR_SHARD_FILE_SIZE, uuid_utoa(inode->gfid)); ++ GF_FREE(size_attr); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto out; + } +- } + +- if ((xattr_req) && (dict_get(xattr_req, GF_CONTENT_KEY))) +- dict_del(xattr_req, GF_CONTENT_KEY); ++ if (fd) ++ STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fxattrop, fd, ++ GF_XATTROP_ADD_ARRAY64, xattr_req, NULL); ++ else ++ STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->xattrop, loc, ++ GF_XATTROP_ADD_ARRAY64, xattr_req, NULL); + +- STACK_WIND(frame, shard_lookup_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, loc, local->xattr_req); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_LOOKUP, frame, -1, op_errno); +- return 0; +-} ++ dict_unref(xattr_req); ++ return 0; + +-int shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, int32_t op_errno, +- inode_t *inode, struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) { +- int ret = -1; +- int32_t mask = SHARD_INODE_WRITE_MASK; +- shard_local_t *local = NULL; +- shard_inode_ctx_t ctx = { +- 0, +- }; +- +- local = frame->local; +- +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, +- SHARD_MSG_BASE_FILE_LOOKUP_FAILED, "Lookup on base file" +- " failed : %s", +- loc_gfid_utoa(&(local->loc))); +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto unwind; +- } ++out: ++ if (xattr_req) ++ dict_unref(xattr_req); ++ handler(frame, this); ++ return 0; ++} + +- local->prebuf = *buf; +- if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { +- local->op_ret = -1; +- local->op_errno = EINVAL; +- goto unwind; +- } +- +- if (shard_inode_ctx_get_all(inode, this, &ctx)) +- mask = SHARD_ALL_MASK; +- +- ret = shard_inode_ctx_set(inode, this, &local->prebuf, 0, +- (mask | SHARD_MASK_REFRESH_RESET)); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, SHARD_MSG_INODE_CTX_SET_FAILED, 0, +- "Failed to set inode" +- " write params into inode ctx for %s", +- uuid_utoa(buf->ia_gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto unwind; +- } ++static inode_t * ++shard_link_internal_dir_inode(shard_local_t *local, inode_t 
*inode, ++ struct iatt *buf, shard_internal_dir_type_t type) ++{ ++ inode_t *linked_inode = NULL; ++ shard_priv_t *priv = NULL; ++ char *bname = NULL; ++ inode_t **priv_inode = NULL; ++ inode_t *parent = NULL; ++ ++ priv = THIS->private; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ bname = GF_SHARD_DIR; ++ priv_inode = &priv->dot_shard_inode; ++ parent = inode->table->root; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ bname = GF_SHARD_REMOVE_ME_DIR; ++ priv_inode = &priv->dot_shard_rm_inode; ++ parent = priv->dot_shard_inode; ++ break; ++ default: ++ break; ++ } + +-unwind: +- local->handler(frame, this); +- return 0; +-} +- +-int shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, +- shard_post_fop_handler_t handler) { +- int ret = -1; +- shard_local_t *local = NULL; +- dict_t *xattr_req = NULL; +- gf_boolean_t need_refresh = _gf_false; +- +- local = frame->local; +- local->handler = handler; +- +- ret = shard_inode_ctx_fill_iatt_from_cache(loc->inode, this, &local->prebuf, +- &need_refresh); +- /* By this time, inode ctx should have been created either in create, +- * mknod, readdirp or lookup. If not it is a bug! +- */ +- if ((ret == 0) && (need_refresh == _gf_false)) { +- gf_msg_debug(this->name, 0, "Skipping lookup on base file: %s" +- "Serving prebuf off the inode ctx cache", +- uuid_utoa(loc->gfid)); +- goto out; +- } +- +- xattr_req = dict_new(); +- if (!xattr_req) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto out; +- } ++ linked_inode = inode_link(inode, parent, bname, buf); ++ inode_lookup(linked_inode); ++ *priv_inode = linked_inode; ++ return linked_inode; ++} + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, loc->gfid, local, out); ++int ++shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, int32_t op_errno, ++ inode_t *inode, struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ shard_local_t *local = NULL; ++ inode_t *linked_inode = NULL; ++ shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + +- STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, loc, xattr_req); ++ local = frame->local; + +- dict_unref(xattr_req); +- return 0; ++ if (op_ret) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto out; ++ } + ++ /* To-Do: Fix refcount increment per call to ++ * shard_link_internal_dir_inode(). 
++ */ ++ linked_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ shard_inode_ctx_mark_dir_refreshed(linked_inode, this); + out: +- if (xattr_req) +- dict_unref(xattr_req); +- handler(frame, this); +- return 0; ++ shard_common_resolve_shards(frame, this, local->post_res_handler); ++ return 0; + } + +-int shard_post_fstat_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++int ++shard_refresh_internal_dir(call_frame_t *frame, xlator_t *this, ++ shard_internal_dir_type_t type) ++{ ++ loc_t loc = { ++ 0, ++ }; ++ inode_t *inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ uuid_t gfid = { ++ 0, ++ }; + +- local = frame->local; ++ local = frame->local; ++ priv = this->private; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ gf_uuid_copy(gfid, priv->dot_shard_gfid); ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); ++ break; ++ default: ++ break; ++ } + +- if (local->op_ret >= 0) +- shard_inode_ctx_set(local->fd->inode, this, &local->prebuf, 0, +- SHARD_LOOKUP_MASK); ++ inode = inode_find(this->itable, gfid); + +- SHARD_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, +- &local->prebuf, local->xattr_rsp); +- return 0; +-} ++ if (!shard_inode_ctx_needs_lookup(inode, this)) { ++ local->op_ret = 0; ++ goto out; ++ } + +-int shard_post_stat_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++ /* Plain assignment because the ref is already taken above through ++ * call to inode_find() ++ */ ++ loc.inode = inode; ++ gf_uuid_copy(loc.gfid, gfid); + +- local = frame->local; ++ STACK_WIND_COOKIE(frame, shard_refresh_internal_dir_cbk, (void *)(long)type, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, &loc, ++ NULL); ++ loc_wipe(&loc); + +- if (local->op_ret >= 0) +- shard_inode_ctx_set(local->loc.inode, this, &local->prebuf, 0, +- SHARD_LOOKUP_MASK); ++ return 0; + +- SHARD_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, +- &local->prebuf, local->xattr_rsp); +- return 0; ++out: ++ shard_common_resolve_shards(frame, this, local->post_res_handler); ++ return 0; + } + +-int shard_common_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *buf, +- dict_t *xdata) { +- inode_t *inode = NULL; +- shard_local_t *local = NULL; +- +- local = frame->local; ++int ++shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ inode_t *link_inode = NULL; ++ shard_local_t *local = NULL; ++ shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_STAT_FAILED, +- "stat failed: %s", local->fd ? 
uuid_utoa(local->fd->inode->gfid) +- : uuid_utoa((local->loc.inode)->gfid)); +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto unwind; +- } ++ local = frame->local; + +- local->prebuf = *buf; +- if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { +- local->op_ret = -1; +- local->op_errno = EINVAL; +- goto unwind; +- } +- local->xattr_rsp = dict_ref(xdata); ++ if (op_ret) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto unwind; ++ } + +- if (local->loc.inode) +- inode = local->loc.inode; +- else +- inode = local->fd->inode; ++ if (!IA_ISDIR(buf->ia_type)) { ++ gf_msg(this->name, GF_LOG_CRITICAL, 0, SHARD_MSG_DOT_SHARD_NODIR, ++ "%s already exists and " ++ "is not a directory. Please remove it from all bricks " ++ "and try again", ++ shard_internal_dir_string(type)); ++ local->op_ret = -1; ++ local->op_errno = EIO; ++ goto unwind; ++ } + +- shard_inode_ctx_invalidate(inode, this, &local->prebuf); ++ link_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ if (link_inode != inode) { ++ shard_refresh_internal_dir(frame, this, type); ++ } else { ++ shard_inode_ctx_mark_dir_refreshed(link_inode, this); ++ shard_common_resolve_shards(frame, this, local->post_res_handler); ++ } ++ return 0; + + unwind: +- local->handler(frame, this); +- return 0; ++ local->post_res_handler(frame, this); ++ return 0; + } + +-int shard_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++int ++shard_lookup_internal_dir(call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t post_res_handler, ++ shard_internal_dir_type_t type) ++{ ++ int ret = -1; ++ dict_t *xattr_req = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ uuid_t *gfid = NULL; ++ loc_t *loc = NULL; ++ gf_boolean_t free_gfid = _gf_true; + +- if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { +- STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->stat, loc, xdata); +- return 0; +- } ++ local = frame->local; ++ priv = this->private; ++ local->post_res_handler = post_res_handler; + +- ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(loc->inode->gfid)); +- goto err; +- } ++ gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); ++ if (!gfid) ++ goto err; + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->stat, loc, xdata); +- return 0; +- } ++ xattr_req = dict_new(); ++ if (!xattr_req) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto err; ++ } + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ gf_uuid_copy(*gfid, priv->dot_shard_gfid); ++ loc = &local->dot_shard_loc; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); ++ loc = &local->dot_shard_rm_loc; ++ break; ++ default: ++ bzero(*gfid, sizeof(uuid_t)); ++ break; ++ } + +- frame->local = local; ++ ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set gfid of %s into dict", ++ shard_internal_dir_string(type)); ++ local->op_ret = -1; ++ 
local->op_errno = ENOMEM; ++ goto err; ++ } else { ++ free_gfid = _gf_false; ++ } + +- local->handler = shard_post_stat_handler; +- loc_copy(&local->loc, loc); +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; ++ STACK_WIND_COOKIE(frame, shard_lookup_internal_dir_cbk, (void *)(long)type, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc, ++ xattr_req); + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, +- local, err); ++ dict_unref(xattr_req); ++ return 0; + +- STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->stat, loc, local->xattr_req); +- return 0; + err: +- shard_common_failure_unwind(GF_FOP_STAT, frame, -1, ENOMEM); +- return 0; ++ if (xattr_req) ++ dict_unref(xattr_req); ++ if (free_gfid) ++ GF_FREE(gfid); ++ post_res_handler(frame, this); ++ return 0; + } + +-int shard_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++static void ++shard_inode_ctx_update(inode_t *inode, xlator_t *this, dict_t *xdata, ++ struct iatt *buf) ++{ ++ int ret = 0; ++ uint64_t size = 0; ++ void *bsize = NULL; ++ ++ if (shard_inode_ctx_get_block_size(inode, this, &size)) { ++ /* Fresh lookup */ ++ ret = dict_get_ptr(xdata, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); ++ if (!ret) ++ size = ntoh64(*((uint64_t *)bsize)); ++ /* If the file is sharded, set its block size, otherwise just ++ * set 0. ++ */ ++ ++ shard_inode_ctx_set(inode, this, buf, size, SHARD_MASK_BLOCK_SIZE); ++ } ++ /* If the file is sharded, also set the remaining attributes, ++ * except for ia_size and ia_blocks. ++ */ ++ if (size) { ++ shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); ++ (void)shard_inode_ctx_invalidate(inode, this, buf); ++ } ++} + +- if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { +- STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fstat, fd, xdata); +- return 0; +- } ++int ++shard_delete_shards(void *opaque); + +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(fd->inode->gfid)); +- goto err; +- } ++int ++shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data); + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fstat, fd, xdata); +- return 0; +- } ++int ++shard_start_background_deletion(xlator_t *this) ++{ ++ int ret = 0; ++ gf_boolean_t i_cleanup = _gf_true; ++ shard_priv_t *priv = NULL; ++ call_frame_t *cleanup_frame = NULL; + +- if (!this->itable) +- this->itable = fd->inode->table; ++ priv = this->private; + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ LOCK(&priv->lock); ++ { ++ switch (priv->bg_del_state) { ++ case SHARD_BG_DELETION_NONE: ++ i_cleanup = _gf_true; ++ priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; ++ break; ++ case SHARD_BG_DELETION_LAUNCHING: ++ i_cleanup = _gf_false; ++ break; ++ case SHARD_BG_DELETION_IN_PROGRESS: ++ priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; ++ i_cleanup = _gf_false; ++ break; ++ default: ++ break; ++ } ++ } ++ UNLOCK(&priv->lock); ++ if (!i_cleanup) ++ return 0; + +- frame->local = local; ++ cleanup_frame = create_frame(this, this->ctx->pool); ++ if (!cleanup_frame) { ++ 
gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create " ++ "new frame to delete shards"); ++ ret = -ENOMEM; ++ goto err; ++ } + +- local->handler = shard_post_fstat_handler; +- local->fd = fd_ref(fd); +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; ++ set_lk_owner_from_ptr(&cleanup_frame->root->lk_owner, cleanup_frame->root); + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, +- local, err); ++ ret = synctask_new(this->ctx->env, shard_delete_shards, ++ shard_delete_shards_cbk, cleanup_frame, cleanup_frame); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_WARNING, errno, ++ SHARD_MSG_SHARDS_DELETION_FAILED, ++ "failed to create task to do background " ++ "cleanup of shards"); ++ STACK_DESTROY(cleanup_frame->root); ++ goto err; ++ } ++ return 0; + +- STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req); +- return 0; + err: +- shard_common_failure_unwind(GF_FOP_FSTAT, frame, -1, ENOMEM); +- return 0; ++ LOCK(&priv->lock); ++ { ++ priv->bg_del_state = SHARD_BG_DELETION_NONE; ++ } ++ UNLOCK(&priv->lock); ++ return ret; + } + +-int shard_post_update_size_truncate_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; ++int ++shard_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, struct iatt *postparent) ++{ ++ int ret = -1; ++ shard_priv_t *priv = NULL; ++ gf_boolean_t i_start_cleanup = _gf_false; + +- local = frame->local; ++ priv = this->private; + +- if (local->fop == GF_FOP_TRUNCATE) +- SHARD_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->postbuf, NULL); +- else +- SHARD_STACK_UNWIND(ftruncate, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->postbuf, NULL); +- return 0; +-} ++ if (op_ret < 0) ++ goto unwind; + +-int shard_truncate_last_shard_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, struct iatt *prebuf, +- struct iatt *postbuf, dict_t *xdata) { +- inode_t *inode = NULL; +- int64_t delta_blocks = 0; +- shard_local_t *local = NULL; ++ if (IA_ISDIR(buf->ia_type)) ++ goto unwind; + +- local = frame->local; ++ /* Also, if the file is sharded, get the file size and block cnt xattr, ++ * and store them in the stbuf appropriately. ++ */ + +- SHARD_UNSET_ROOT_FS_ID(frame, local); ++ if (dict_get(xdata, GF_XATTR_SHARD_FILE_SIZE) && ++ frame->root->pid != GF_CLIENT_PID_GSYNCD) ++ shard_modify_size_and_block_count(buf, xdata); ++ ++ /* If this was a fresh lookup, there are two possibilities: ++ * 1) If the file is sharded (indicated by the presence of block size ++ * xattr), store this block size, along with rdev and mode in its ++ * inode ctx. ++ * 2) If the file is not sharded, store size along with rdev and mode ++ * (which are anyway don't cares) in inode ctx. Since @ctx_tmp is ++ * already initialised to all zeroes, nothing more needs to be done. ++ */ + +- inode = (local->fop == GF_FOP_TRUNCATE) ? local->loc.inode : local->fd->inode; +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, +- SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED, "truncate on last" +- " shard failed : %s", +- uuid_utoa(inode->gfid)); +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto err; +- } +- +- local->postbuf.ia_size = local->offset; +- /* Let the delta be negative. 
We want xattrop to do subtraction */ +- local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; +- delta_blocks = GF_ATOMIC_ADD(local->delta_blocks, +- postbuf->ia_blocks - prebuf->ia_blocks); +- GF_ASSERT(delta_blocks <= 0); +- local->postbuf.ia_blocks += delta_blocks; +- local->hole_size = 0; +- +- shard_inode_ctx_set(inode, this, &local->postbuf, 0, SHARD_MASK_TIMES); +- shard_update_file_size(frame, this, NULL, &local->loc, +- shard_post_update_size_truncate_handler); +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +-} +- +-int shard_truncate_last_shard(call_frame_t *frame, xlator_t *this, +- inode_t *inode) { +- size_t last_shard_size_after = 0; +- loc_t loc = { +- 0, +- }; +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- /* A NULL inode could be due to the fact that the last shard which +- * needs to be truncated does not exist due to it lying in a hole +- * region. So the only thing left to do in that case would be an +- * update to file size xattr. +- */ +- if (!inode) { +- gf_msg_debug(this->name, 0, +- "Last shard to be truncated absent in backend:%" PRIu64 +- " of gfid: %s. Directly proceeding to update file size", +- local->first_block, uuid_utoa(local->loc.inode->gfid)); +- shard_update_file_size(frame, this, NULL, &local->loc, +- shard_post_update_size_truncate_handler); +- return 0; +- } ++ (void)shard_inode_ctx_update(inode, this, xdata, buf); + +- SHARD_SET_ROOT_FS_ID(frame, local); ++ LOCK(&priv->lock); ++ { ++ if (priv->first_lookup_done == _gf_false) { ++ priv->first_lookup_done = _gf_true; ++ i_start_cleanup = _gf_true; ++ } ++ } ++ UNLOCK(&priv->lock); + +- loc.inode = inode_ref(inode); +- gf_uuid_copy(loc.gfid, inode->gfid); ++ if (!i_start_cleanup) ++ goto unwind; + +- last_shard_size_after = (local->offset % local->block_size); ++ ret = shard_start_background_deletion(this); ++ if (ret < 0) { ++ LOCK(&priv->lock); ++ { ++ priv->first_lookup_done = _gf_false; ++ } ++ UNLOCK(&priv->lock); ++ } + +- STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->truncate, &loc, last_shard_size_after, +- NULL); +- loc_wipe(&loc); +- return 0; ++unwind: ++ SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata, ++ postparent); ++ return 0; + } + +-void shard_unlink_block_inode(shard_local_t *local, int shard_block_num); ++int ++shard_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) ++{ ++ int ret = -1; ++ int32_t op_errno = ENOMEM; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +-int shard_truncate_htol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, +- struct iatt *preparent, struct iatt *postparent, +- dict_t *xdata) { +- int ret = 0; +- int call_count = 0; +- int shard_block_num = (long)cookie; +- uint64_t block_count = 0; +- shard_local_t *local = NULL; ++ this->itable = loc->inode->table; ++ if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && ++ (frame->root->pid != GF_CLIENT_PID_GLFS_HEAL)) { ++ SHARD_ENTRY_FOP_CHECK(loc, op_errno, err); ++ } + +- local = frame->local; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto done; +- } +- ret = dict_get_uint64(xdata, GF_GET_FILE_BLOCK_COUNT, &block_count); +- if (!ret) { +- GF_ATOMIC_SUB(local->delta_blocks, block_count); +- } else { +- /* dict_get failed possibly due to a heterogeneous 
cluster? */ +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to get key %s from dict during truncate of gfid %s", +- GF_GET_FILE_BLOCK_COUNT, +- uuid_utoa(local->resolver_base_inode->gfid)); +- } +- +- shard_unlink_block_inode(local, shard_block_num); +-done: +- call_count = shard_call_count_return(frame); +- if (call_count == 0) { +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- shard_truncate_last_shard(frame, this, local->inode_list[0]); +- } +- return 0; +-} +- +-int shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) { +- int i = 1; +- int ret = -1; +- int call_count = 0; +- uint32_t cur_block = 0; +- uint32_t last_block = 0; +- char path[PATH_MAX] = { +- 0, +- }; +- char *bname = NULL; +- loc_t loc = { +- 0, +- }; +- gf_boolean_t wind_failed = _gf_false; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- dict_t *xdata_req = NULL; +- +- local = frame->local; +- priv = this->private; +- +- cur_block = local->first_block + 1; +- last_block = local->last_block; +- +- /* Determine call count */ +- for (i = 1; i < local->num_blocks; i++) { +- if (!local->inode_list[i]) +- continue; +- call_count++; +- } +- +- if (!call_count) { +- /* Call count = 0 implies that all of the shards that need to be +- * unlinked do not exist. So shard xlator would now proceed to +- * do the final truncate + size updates. +- */ +- gf_msg_debug(this->name, 0, "Shards to be unlinked as part of " +- "truncate absent in backend: %s. Directly " +- "proceeding to update file size", +- uuid_utoa(inode->gfid)); +- local->postbuf.ia_size = local->offset; +- local->postbuf.ia_blocks = local->prebuf.ia_blocks; +- local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- local->hole_size = 0; +- shard_update_file_size(frame, this, local->fd, &local->loc, +- shard_post_update_size_truncate_handler); +- return 0; +- } ++ frame->local = local; + +- local->call_count = call_count; +- i = 1; +- xdata_req = dict_new(); +- if (!xdata_req) { +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; +- } +- ret = dict_set_uint64(xdata_req, GF_GET_FILE_BLOCK_COUNT, 8 * 8); +- if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set key %s into dict during truncate of %s", +- GF_GET_FILE_BLOCK_COUNT, +- uuid_utoa(local->resolver_base_inode->gfid)); +- dict_unref(xdata_req); +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; +- } ++ loc_copy(&local->loc, loc); + +- SHARD_SET_ROOT_FS_ID(frame, local); +- while (cur_block <= last_block) { +- if (!local->inode_list[i]) { +- cur_block++; +- i++; +- continue; +- } +- if (wind_failed) { +- shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, ENOMEM, +- NULL, NULL, NULL); +- goto next; +- } ++ local->xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new(); ++ if (!local->xattr_req) ++ goto err; + +- shard_make_block_abspath(cur_block, inode->gfid, path, sizeof(path)); +- bname = strrchr(path, '/') + 1; +- loc.parent = inode_ref(priv->dot_shard_inode); +- ret = inode_path(loc.parent, bname, (char **)&(loc.path)); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s. 
Base file gfid = %s", +- bname, uuid_utoa(inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- loc_wipe(&loc); +- wind_failed = _gf_true; +- shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, ENOMEM, +- NULL, NULL, NULL); +- goto next; ++ if (shard_inode_ctx_get_block_size(loc->inode, this, &block_size)) { ++ ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set dict" ++ " value: key:%s for path %s", ++ GF_XATTR_SHARD_BLOCK_SIZE, loc->path); ++ goto err; ++ } + } +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- loc.inode = inode_ref(local->inode_list[i]); + +- STACK_WIND_COOKIE(frame, shard_truncate_htol_cbk, (void *)(long)cur_block, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, &loc, +- 0, xdata_req); +- loc_wipe(&loc); +- next: +- i++; +- cur_block++; +- if (!--call_count) +- break; +- } +- dict_unref(xdata_req); +- return 0; +-} +- +-int shard_truncate_do(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++ if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { ++ ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, ++ 8 * 4); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set dict value: key:%s for path %s.", ++ GF_XATTR_SHARD_FILE_SIZE, loc->path); ++ goto err; ++ } ++ } + +- local = frame->local; ++ if ((xattr_req) && (dict_get(xattr_req, GF_CONTENT_KEY))) ++ dict_del(xattr_req, GF_CONTENT_KEY); + +- if (local->num_blocks == 1) { +- /* This means that there are no shards to be unlinked. +- * The fop boils down to truncating the last shard, updating +- * the size and unwinding. +- */ +- shard_truncate_last_shard(frame, this, local->inode_list[0]); ++ STACK_WIND(frame, shard_lookup_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, local->xattr_req); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_LOOKUP, frame, -1, op_errno); + return 0; +- } else { +- shard_truncate_htol(frame, this, local->loc.inode); +- } +- return 0; + } + +-int shard_post_lookup_shards_truncate_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; ++int ++shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ int ret = -1; ++ int32_t mask = SHARD_INODE_WRITE_MASK; ++ shard_local_t *local = NULL; ++ shard_inode_ctx_t ctx = { ++ 0, ++ }; + +- local = frame->local; ++ local = frame->local; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +- } +- +- shard_truncate_do(frame, this); +- return 0; +-} +- +-void shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode, +- struct iatt *buf) { +- int list_index = 0; +- char block_bname[256] = { +- 0, +- }; +- uuid_t gfid = { +- 0, +- }; +- inode_t *linked_inode = NULL; +- xlator_t *this = NULL; +- inode_t *fsync_inode = NULL; +- shard_priv_t *priv = NULL; +- inode_t *base_inode = NULL; +- +- this = THIS; +- priv = this->private; +- if (local->loc.inode) { +- gf_uuid_copy(gfid, local->loc.inode->gfid); +- base_inode = local->loc.inode; +- } else if (local->resolver_base_inode) { +- gf_uuid_copy(gfid, local->resolver_base_inode->gfid); +- base_inode = local->resolver_base_inode; +- } else { +- gf_uuid_copy(gfid, local->base_gfid); +- 
} +- +- shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname)); +- +- shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); +- linked_inode = inode_link(inode, priv->dot_shard_inode, block_bname, buf); +- inode_lookup(linked_inode); +- list_index = block_num - local->first_block; +- local->inode_list[list_index] = linked_inode; +- +- LOCK(&priv->lock); +- { +- fsync_inode = __shard_update_shards_inode_list(linked_inode, this, +- base_inode, block_num, gfid); +- } +- UNLOCK(&priv->lock); +- if (fsync_inode) +- shard_initiate_evicted_inode_fsync(this, fsync_inode); +-} +- +-int shard_common_lookup_shards_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) { +- int call_count = 0; +- int shard_block_num = (long)cookie; +- uuid_t gfid = { +- 0, +- }; +- shard_local_t *local = NULL; +- +- local = frame->local; +- if (local->resolver_base_inode) +- gf_uuid_copy(gfid, local->resolver_base_inode->gfid); +- else +- gf_uuid_copy(gfid, local->base_gfid); +- +- if (op_ret < 0) { +- /* Ignore absence of shards in the backend in truncate fop. */ +- switch (local->fop) { +- case GF_FOP_TRUNCATE: +- case GF_FOP_FTRUNCATE: +- case GF_FOP_RENAME: +- case GF_FOP_UNLINK: +- if (op_errno == ENOENT) +- goto done; +- break; +- case GF_FOP_WRITE: +- case GF_FOP_READ: +- case GF_FOP_ZEROFILL: +- case GF_FOP_DISCARD: +- case GF_FOP_FALLOCATE: +- if ((!local->first_lookup_done) && (op_errno == ENOENT)) { +- LOCK(&frame->lock); +- { local->create_count++; } +- UNLOCK(&frame->lock); +- goto done; +- } +- break; +- default: +- break; ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_BASE_FILE_LOOKUP_FAILED, ++ "Lookup on base file" ++ " failed : %s", ++ loc_gfid_utoa(&(local->loc))); ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto unwind; + } + +- /* else */ +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_LOOKUP_SHARD_FAILED, +- "Lookup on shard %d " +- "failed. 
Base file gfid = %s", +- shard_block_num, uuid_utoa(gfid)); +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto done; +- } +- +- shard_link_block_inode(local, shard_block_num, inode, buf); +- +-done: +- if (local->lookup_shards_barriered) { +- syncbarrier_wake(&local->barrier); +- return 0; +- } else { +- call_count = shard_call_count_return(frame); +- if (call_count == 0) { +- if (!local->first_lookup_done) +- local->first_lookup_done = _gf_true; +- local->pls_fop_handler(frame, this); ++ local->prebuf = *buf; ++ if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { ++ local->op_ret = -1; ++ local->op_errno = EINVAL; ++ goto unwind; + } +- } +- return 0; +-} + +-dict_t *shard_create_gfid_dict(dict_t *dict) { +- int ret = 0; +- dict_t *new = NULL; +- unsigned char *gfid = NULL; ++ if (shard_inode_ctx_get_all(inode, this, &ctx)) ++ mask = SHARD_ALL_MASK; + +- new = dict_copy_with_ref(dict, NULL); +- if (!new) +- return NULL; ++ ret = shard_inode_ctx_set(inode, this, &local->prebuf, 0, ++ (mask | SHARD_MASK_REFRESH_RESET)); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, SHARD_MSG_INODE_CTX_SET_FAILED, 0, ++ "Failed to set inode" ++ " write params into inode ctx for %s", ++ uuid_utoa(buf->ia_gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto unwind; ++ } + +- gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char); +- if (!gfid) { +- ret = -1; +- goto out; +- } ++unwind: ++ local->handler(frame, this); ++ return 0; ++} + +- gf_uuid_generate(gfid); ++int ++shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, ++ shard_post_fop_handler_t handler) ++{ ++ int ret = -1; ++ shard_local_t *local = NULL; ++ dict_t *xattr_req = NULL; ++ gf_boolean_t need_refresh = _gf_false; + +- ret = dict_set_gfuuid(new, "gfid-req", gfid, false); ++ local = frame->local; ++ local->handler = handler; + +-out: +- if (ret) { +- dict_unref(new); +- new = NULL; +- GF_FREE(gfid); +- } +- +- return new; +-} +- +-int shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, +- inode_t *inode, +- shard_post_lookup_shards_fop_handler_t handler) { +- int i = 0; +- int ret = 0; +- int count = 0; +- int call_count = 0; +- int32_t shard_idx_iter = 0; +- int last_block = 0; +- char path[PATH_MAX] = { +- 0, +- }; +- char *bname = NULL; +- uuid_t gfid = { +- 0, +- }; +- loc_t loc = { +- 0, +- }; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- gf_boolean_t wind_failed = _gf_false; +- dict_t *xattr_req = NULL; +- +- priv = this->private; +- local = frame->local; +- count = call_count = local->call_count; +- shard_idx_iter = local->first_block; +- last_block = local->last_block; +- local->pls_fop_handler = handler; +- if (local->lookup_shards_barriered) +- local->barrier.waitfor = local->call_count; +- +- if (inode) +- gf_uuid_copy(gfid, inode->gfid); +- else +- gf_uuid_copy(gfid, local->base_gfid); +- +- while (shard_idx_iter <= last_block) { +- if (local->inode_list[i]) { +- i++; +- shard_idx_iter++; +- continue; +- } +- +- if (wind_failed) { +- shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, this, +- -1, ENOMEM, NULL, NULL, NULL, NULL); +- goto next; +- } +- +- shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); +- +- bname = strrchr(path, '/') + 1; +- loc.inode = inode_new(this->itable); +- loc.parent = inode_ref(priv->dot_shard_inode); +- gf_uuid_copy(loc.pargfid, priv->dot_shard_gfid); +- ret = inode_path(loc.parent, bname, (char **)&(loc.path)); +- if (ret < 0 || !(loc.inode)) { +- gf_msg(this->name, GF_LOG_ERROR, 
0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s, base file gfid = %s", +- bname, uuid_utoa(gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- loc_wipe(&loc); +- wind_failed = _gf_true; +- shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, this, +- -1, ENOMEM, NULL, NULL, NULL, NULL); +- goto next; ++ ret = shard_inode_ctx_fill_iatt_from_cache(loc->inode, this, &local->prebuf, ++ &need_refresh); ++ /* By this time, inode ctx should have been created either in create, ++ * mknod, readdirp or lookup. If not it is a bug! ++ */ ++ if ((ret == 0) && (need_refresh == _gf_false)) { ++ gf_msg_debug(this->name, 0, ++ "Skipping lookup on base file: %s" ++ "Serving prebuf off the inode ctx cache", ++ uuid_utoa(loc->gfid)); ++ goto out; + } + +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- +- xattr_req = shard_create_gfid_dict(local->xattr_req); ++ xattr_req = dict_new(); + if (!xattr_req) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- loc_wipe(&loc); +- shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, this, +- -1, ENOMEM, NULL, NULL, NULL, NULL); +- goto next; +- } +- +- STACK_WIND_COOKIE(frame, shard_common_lookup_shards_cbk, +- (void *)(long)shard_idx_iter, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, &loc, xattr_req); +- loc_wipe(&loc); +- dict_unref(xattr_req); +- next: +- shard_idx_iter++; +- i++; +- +- if (!--call_count) +- break; +- } +- if (local->lookup_shards_barriered) { +- syncbarrier_wait(&local->barrier, count); +- local->pls_fop_handler(frame, this); +- } +- return 0; +-} +- +-int shard_post_resolve_truncate_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if (local->op_ret < 0) { +- if (local->op_errno == ENOENT) { +- /* If lookup on /.shard fails with ENOENT, it means that +- * the file was 0-byte in size but truncated sometime in +- * the past to a higher size which is reflected in the +- * size xattr, and now being truncated to a lower size. +- * In this case, the only thing that needs to be done is +- * to update the size xattr of the file and unwind. +- */ +- local->first_block = local->last_block = 0; +- local->num_blocks = 1; +- local->call_count = 0; +- local->op_ret = 0; +- local->postbuf.ia_size = local->offset; +- shard_update_file_size(frame, this, local->fd, &local->loc, +- shard_post_update_size_truncate_handler); +- return 0; +- } else { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto out; + } +- } + +- if (!local->call_count) +- shard_truncate_do(frame, this); +- else +- shard_common_lookup_shards(frame, this, local->loc.inode, +- shard_post_lookup_shards_truncate_handler); +- +- return 0; +-} +- +-int shard_truncate_begin(call_frame_t *frame, xlator_t *this) { +- int ret = 0; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- local = frame->local; +- +- /* First participant block here is the lowest numbered block that would +- * hold the last byte of the file post successful truncation. +- * Last participant block is the block that contains the last byte in +- * the current state of the file. +- * If (first block == last_block): +- * then that means that the file only needs truncation of the +- * first (or last since both are same) block. 
+- * Else +- * if (new_size % block_size == 0) +- * then that means there is no truncate to be done with +- * only shards from first_block + 1 through the last +- * block needing to be unlinked. +- * else +- * both truncate of the first block and unlink of the +- * remaining shards until end of file is required. +- */ +- local->first_block = +- (local->offset == 0) ? 0 : get_lowest_block(local->offset - 1, +- local->block_size); +- local->last_block = +- get_highest_block(0, local->prebuf.ia_size, local->block_size); +- +- local->num_blocks = local->last_block - local->first_block + 1; +- GF_ASSERT(local->num_blocks > 0); +- local->resolver_base_inode = +- (local->fop == GF_FOP_TRUNCATE) ? local->loc.inode : local->fd->inode; +- +- if ((local->first_block == 0) && (local->num_blocks == 1)) { +- if (local->fop == GF_FOP_TRUNCATE) +- STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->truncate, &local->loc, local->offset, +- local->xattr_req); +- else +- STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->ftruncate, local->fd, local->offset, +- local->xattr_req); +- return 0; +- } ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, loc->gfid, local, out); + +- local->inode_list = +- GF_CALLOC(local->num_blocks, sizeof(inode_t *), gf_shard_mt_inode_list); +- if (!local->inode_list) +- goto err; ++ STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + +- local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); +- if (!local->dot_shard_loc.inode) { +- ret = +- shard_init_internal_dir_loc(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); +- if (ret) +- goto err; +- shard_lookup_internal_dir(frame, this, shard_post_resolve_truncate_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- } else { +- local->post_res_handler = shard_post_resolve_truncate_handler; +- shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); +- } +- return 0; ++ dict_unref(xattr_req); ++ return 0; + +-err: +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; ++out: ++ if (xattr_req) ++ dict_unref(xattr_req); ++ handler(frame, this); ++ return 0; + } + +-int shard_post_lookup_truncate_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- struct iatt tmp_stbuf = { +- 0, +- }; +- +- local = frame->local; ++int ++shard_post_fstat_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +- } ++ local = frame->local; + +- local->postbuf = tmp_stbuf = local->prebuf; ++ if (local->op_ret >= 0) ++ shard_inode_ctx_set(local->fd->inode, this, &local->prebuf, 0, ++ SHARD_LOOKUP_MASK); + +- if (local->prebuf.ia_size == local->offset) { +- /* If the file size is same as requested size, unwind the call +- * immediately. +- */ +- if (local->fop == GF_FOP_TRUNCATE) +- SHARD_STACK_UNWIND(truncate, frame, 0, 0, &local->prebuf, &local->postbuf, +- NULL); +- else +- SHARD_STACK_UNWIND(ftruncate, frame, 0, 0, &local->prebuf, +- &local->postbuf, NULL); +- } else if (local->offset > local->prebuf.ia_size) { +- /* If the truncate is from a lower to a higher size, set the +- * new size xattr and unwind. 
+- */ +- local->hole_size = local->offset - local->prebuf.ia_size; +- local->delta_size = 0; +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- local->postbuf.ia_size = local->offset; +- tmp_stbuf.ia_size = local->offset; +- shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, +- SHARD_INODE_WRITE_MASK); +- shard_update_file_size(frame, this, NULL, &local->loc, +- shard_post_update_size_truncate_handler); +- } else { +- /* ... else +- * i. unlink all shards that need to be unlinked. +- * ii. truncate the last of the shards. +- * iii. update the new size using setxattr. +- * and unwind the fop. +- */ +- local->hole_size = 0; +- local->delta_size = (local->offset - local->prebuf.ia_size); +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- tmp_stbuf.ia_size = local->offset; +- shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, +- SHARD_INODE_WRITE_MASK); +- shard_truncate_begin(frame, this); +- } +- return 0; ++ SHARD_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, ++ &local->prebuf, local->xattr_rsp); ++ return 0; + } + +-/* TO-DO: +- * Fix updates to size and block count with racing write(s) and truncate(s). +- */ ++int ++shard_post_stat_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +-int shard_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, +- off_t offset, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++ local = frame->local; + +- ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(loc->inode->gfid)); +- goto err; +- } ++ if (local->op_ret >= 0) ++ shard_inode_ctx_set(local->loc.inode, this, &local->prebuf, 0, ++ SHARD_LOOKUP_MASK); + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); ++ SHARD_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, ++ &local->prebuf, local->xattr_rsp); + return 0; +- } +- +- if (!this->itable) +- this->itable = loc->inode->table; ++} + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- +- ret = syncbarrier_init(&local->barrier); +- if (ret) +- goto err; +- loc_copy(&local->loc, loc); +- local->offset = offset; +- local->block_size = block_size; +- local->fop = GF_FOP_TRUNCATE; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- local->resolver_base_inode = loc->inode; +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_truncate_handler); +- return 0; ++int ++shard_common_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *buf, ++ dict_t *xdata) ++{ ++ inode_t *inode = NULL; ++ shard_local_t *local = NULL; + +-err: +- shard_common_failure_unwind(GF_FOP_TRUNCATE, frame, -1, ENOMEM); +- return 0; +-} +- +-int shard_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +- dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; +- +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(fd->inode->gfid)); +- goto err; +- } +- +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); +- return 0; +- } +- +- if (!this->itable) +- this->itable = fd->inode->table; +- +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- ret = syncbarrier_init(&local->barrier); +- if (ret) +- goto err; +- local->fd = fd_ref(fd); +- local->offset = offset; +- local->block_size = block_size; +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- local->fop = GF_FOP_FTRUNCATE; +- +- local->loc.inode = inode_ref(fd->inode); +- gf_uuid_copy(local->loc.gfid, fd->inode->gfid); +- local->resolver_base_inode = fd->inode; +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_truncate_handler); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_FTRUNCATE, frame, -1, ENOMEM); +- return 0; +-} ++ local = frame->local; + +-int shard_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- int ret = -1; +- shard_local_t *local = NULL; ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_STAT_FAILED, ++ "stat failed: %s", ++ local->fd ? 
uuid_utoa(local->fd->inode->gfid) ++ : uuid_utoa((local->loc.inode)->gfid)); ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto unwind; ++ } + +- local = frame->local; ++ local->prebuf = *buf; ++ if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { ++ local->op_ret = -1; ++ local->op_errno = EINVAL; ++ goto unwind; ++ } ++ local->xattr_rsp = dict_ref(xdata); + +- if (op_ret == -1) +- goto unwind; ++ if (local->loc.inode) ++ inode = local->loc.inode; ++ else ++ inode = local->fd->inode; + +- ret = +- shard_inode_ctx_set(inode, this, buf, local->block_size, SHARD_ALL_MASK); +- if (ret) +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, +- "Failed to set inode " +- "ctx for %s", +- uuid_utoa(inode->gfid)); ++ shard_inode_ctx_invalidate(inode, this, &local->prebuf); + + unwind: +- SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent, +- postparent, xdata); +- +- return 0; ++ local->handler(frame, this); ++ return 0; + } + +-int shard_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, +- dev_t rdev, mode_t umask, dict_t *xdata) { +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; ++int ++shard_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { ++ STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->stat, loc, xdata); ++ return 0; ++ } + +- priv = this->private; +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(loc->inode->gfid)); ++ goto err; ++ } + +- frame->local = local; +- local->block_size = priv->block_size; +- if (!__is_gsyncd_on_shard_dir(frame, loc)) { +- SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); +- } ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->stat, loc, xdata); ++ return 0; ++ } + +- STACK_WIND(frame, shard_mknod_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_MKNOD, frame, -1, ENOMEM); +- return 0; +-} ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +-int32_t shard_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- shard_local_t *local = NULL; ++ frame->local = local; + +- local = frame->local; +- if (op_ret < 0) +- goto err; ++ local->handler = shard_post_stat_handler; ++ loc_copy(&local->loc, loc); ++ local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; + +- shard_inode_ctx_set(inode, this, buf, 0, SHARD_MASK_NLINK | SHARD_MASK_TIMES); +- buf->ia_size = local->prebuf.ia_size; +- buf->ia_blocks = local->prebuf.ia_blocks; ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, ++ local, err); + +- SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent, +- postparent, xdata); +- return 0; ++ STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->stat, loc, local->xattr_req); ++ return 0; + err: +- shard_common_failure_unwind(GF_FOP_LINK, frame, op_ret, op_errno); +- return 0; ++ shard_common_failure_unwind(GF_FOP_STAT, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_post_lookup_link_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if (local->op_ret < 0) { +- SHARD_STACK_UNWIND(link, frame, local->op_ret, local->op_errno, NULL, NULL, +- NULL, NULL, NULL); +- return 0; +- } ++int ++shard_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +- STACK_WIND(frame, shard_link_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->link, &local->loc, &local->loc2, +- local->xattr_req); +- return 0; +-} ++ if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { ++ STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fstat, fd, xdata); ++ return 0; ++ } + +-int32_t shard_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, +- loc_t *newloc, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(fd->inode->gfid)); ++ goto err; ++ } + +- ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(oldloc->inode->gfid)); +- goto err; +- } ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fstat, fd, xdata); ++ return 0; ++ } + +- if (!block_size) { +- STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, +- oldloc, newloc, xdata); +- return 0; +- } ++ if (!this->itable) ++ this->itable = fd->inode->table; + +- if (!this->itable) +- this->itable = oldloc->inode->table; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ frame->local = local; + +- frame->local = local; ++ local->handler = shard_post_fstat_handler; ++ local->fd = fd_ref(fd); ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; + +- loc_copy(&local->loc, oldloc); +- loc_copy(&local->loc2, newloc); +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, ++ local, err); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_link_handler); +- return 0; ++ STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req); ++ return 0; + err: +- shard_common_failure_unwind(GF_FOP_LINK, frame, -1, ENOMEM); +- return 0; ++ shard_common_failure_unwind(GF_FOP_FSTAT, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode); +- +-int shard_post_lookup_shards_unlink_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; +- uuid_t gfid = { +- 0, +- }; ++int ++shard_post_update_size_truncate_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (local->resolver_base_inode) +- gf_uuid_copy(gfid, local->resolver_base_inode->gfid); ++ if (local->fop == GF_FOP_TRUNCATE) ++ SHARD_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->postbuf, NULL); + else +- gf_uuid_copy(gfid, local->base_gfid); +- +- if ((local->op_ret < 0) && (local->op_errno != ENOENT)) { +- gf_msg(this->name, GF_LOG_ERROR, local->op_errno, SHARD_MSG_FOP_FAILED, +- "failed to delete shards of %s", uuid_utoa(gfid)); ++ SHARD_STACK_UNWIND(ftruncate, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->postbuf, NULL); + return 0; +- } +- local->op_ret = 0; +- local->op_errno = 0; +- +- shard_unlink_shards_do(frame, this, local->resolver_base_inode); +- return 0; + } + +-int shard_post_resolve_unlink_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- local->lookup_shards_barriered = _gf_true; +- +- if (!local->call_count) +- shard_unlink_shards_do(frame, this, local->resolver_base_inode); +- else +- shard_common_lookup_shards(frame, this, local->resolver_base_inode, +- shard_post_lookup_shards_unlink_handler); +- return 0; +-} +- +-void shard_unlink_block_inode(shard_local_t *local, int shard_block_num) { +- char block_bname[256] = { +- 0, +- }; +- uuid_t gfid = { +- 0, +- }; +- inode_t *inode = NULL; +- inode_t *base_inode = NULL; +- xlator_t *this = NULL; +- shard_priv_t *priv = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_inode_ctx_t *base_ictx = NULL; +- int unref_base_inode = 0; +- int unref_shard_inode = 0; +- +- this = THIS; +- priv = this->private; +- +- inode = local->inode_list[shard_block_num - local->first_block]; +- shard_inode_ctx_get(inode, this, &ctx); +- base_inode = ctx->base_inode; +- if (base_inode) +- gf_uuid_copy(gfid, base_inode->gfid); +- else +- gf_uuid_copy(gfid, ctx->base_gfid); +- shard_make_block_bname(shard_block_num, gfid, block_bname, +- sizeof(block_bname)); +- +- LOCK(&priv->lock); +- if (base_inode) +- LOCK(&base_inode->lock); +- LOCK(&inode->lock); +- { +- __shard_inode_ctx_get(inode, this, &ctx); +- if (!list_empty(&ctx->ilist)) { +- list_del_init(&ctx->ilist); +- priv->inode_count--; +- unref_base_inode++; +- unref_shard_inode++; +- GF_ASSERT(priv->inode_count >= 0); +- } +- if (ctx->fsync_needed) { +- unref_base_inode++; +- unref_shard_inode++; +- list_del_init(&ctx->to_fsync_list); +- if (base_inode) { +- __shard_inode_ctx_get(base_inode, this, &base_ictx); +- base_ictx->fsync_count--; +- } +- } +- } +- UNLOCK(&inode->lock); +- if (base_inode) +- 
UNLOCK(&base_inode->lock); ++int ++shard_truncate_last_shard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *prebuf, struct iatt *postbuf, ++ dict_t *xdata) ++{ ++ inode_t *inode = NULL; ++ int64_t delta_blocks = 0; ++ shard_local_t *local = NULL; + +- inode_unlink(inode, priv->dot_shard_inode, block_bname); +- inode_ref_reduce_by_n(inode, unref_shard_inode); +- inode_forget(inode, 0); ++ local = frame->local; + +- if (base_inode && unref_base_inode) +- inode_ref_reduce_by_n(base_inode, unref_base_inode); +- UNLOCK(&priv->lock); +-} ++ SHARD_UNSET_ROOT_FS_ID(frame, local); + +-int shard_rename_cbk(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++ inode = (local->fop == GF_FOP_TRUNCATE) ? local->loc.inode ++ : local->fd->inode; ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED, ++ "truncate on last" ++ " shard failed : %s", ++ uuid_utoa(inode->gfid)); ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto err; ++ } + +- local = frame->local; ++ local->postbuf.ia_size = local->offset; ++ /* Let the delta be negative. We want xattrop to do subtraction */ ++ local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; ++ delta_blocks = GF_ATOMIC_ADD(local->delta_blocks, ++ postbuf->ia_blocks - prebuf->ia_blocks); ++ GF_ASSERT(delta_blocks <= 0); ++ local->postbuf.ia_blocks += delta_blocks; ++ local->hole_size = 0; + +- SHARD_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->preoldparent, +- &local->postoldparent, &local->prenewparent, +- &local->postnewparent, local->xattr_rsp); +- return 0; ++ shard_inode_ctx_set(inode, this, &local->postbuf, 0, SHARD_MASK_TIMES); ++ shard_update_file_size(frame, this, NULL, &local->loc, ++ shard_post_update_size_truncate_handler); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; + } + +-int32_t shard_unlink_cbk(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = frame->local; ++int ++shard_truncate_last_shard(call_frame_t *frame, xlator_t *this, inode_t *inode) ++{ ++ size_t last_shard_size_after = 0; ++ loc_t loc = { ++ 0, ++ }; ++ shard_local_t *local = NULL; + +- SHARD_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, +- &local->preoldparent, &local->postoldparent, +- local->xattr_rsp); +- return 0; +-} ++ local = frame->local; + +-int shard_unlink_shards_do_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, int32_t op_errno, +- struct iatt *preparent, struct iatt *postparent, +- dict_t *xdata) { +- int shard_block_num = (long)cookie; +- shard_local_t *local = NULL; ++ /* A NULL inode could be due to the fact that the last shard which ++ * needs to be truncated does not exist due to it lying in a hole ++ * region. So the only thing left to do in that case would be an ++ * update to file size xattr. ++ */ ++ if (!inode) { ++ gf_msg_debug(this->name, 0, ++ "Last shard to be truncated absent in backend:%" PRIu64 ++ " of gfid: %s. 
Directly proceeding to update file size", ++ local->first_block, uuid_utoa(local->loc.inode->gfid)); ++ shard_update_file_size(frame, this, NULL, &local->loc, ++ shard_post_update_size_truncate_handler); ++ return 0; ++ } + +- local = frame->local; ++ SHARD_SET_ROOT_FS_ID(frame, local); + +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto done; +- } ++ loc.inode = inode_ref(inode); ++ gf_uuid_copy(loc.gfid, inode->gfid); + +- shard_unlink_block_inode(local, shard_block_num); +-done: +- syncbarrier_wake(&local->barrier); +- return 0; +-} +- +-int shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, +- inode_t *inode) { +- int i = 0; +- int ret = -1; +- int count = 0; +- uint32_t cur_block = 0; +- uint32_t cur_block_idx = 0; /*this is idx into inode_list[] array */ +- char *bname = NULL; +- char path[PATH_MAX] = { +- 0, +- }; +- uuid_t gfid = { +- 0, +- }; +- loc_t loc = { +- 0, +- }; +- gf_boolean_t wind_failed = _gf_false; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- local = frame->local; +- +- if (inode) +- gf_uuid_copy(gfid, inode->gfid); +- else +- gf_uuid_copy(gfid, local->base_gfid); +- +- for (i = 0; i < local->num_blocks; i++) { +- if (!local->inode_list[i]) +- continue; +- count++; +- } +- +- if (!count) { +- /* callcount = 0 implies that all of the shards that need to be +- * unlinked are non-existent (in other words the file is full of +- * holes). +- */ +- gf_msg_debug(this->name, 0, "All shards that need to be " +- "unlinked are non-existent: %s", +- uuid_utoa(gfid)); ++ last_shard_size_after = (local->offset % local->block_size); ++ ++ STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->truncate, &loc, last_shard_size_after, ++ NULL); ++ loc_wipe(&loc); + return 0; +- } ++} + +- SHARD_SET_ROOT_FS_ID(frame, local); +- local->barrier.waitfor = count; +- cur_block = cur_block_idx + local->first_block; ++void ++shard_unlink_block_inode(shard_local_t *local, int shard_block_num); + +- while (cur_block_idx < local->num_blocks) { +- if (!local->inode_list[cur_block_idx]) +- goto next; ++int ++shard_truncate_htol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *preparent, struct iatt *postparent, ++ dict_t *xdata) ++{ ++ int ret = 0; ++ int call_count = 0; ++ int shard_block_num = (long)cookie; ++ uint64_t block_count = 0; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; + +- if (wind_failed) { +- shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, +- ENOMEM, NULL, NULL, NULL); +- goto next; ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto done; ++ } ++ ret = dict_get_uint64(xdata, GF_GET_FILE_BLOCK_COUNT, &block_count); ++ if (!ret) { ++ GF_ATOMIC_SUB(local->delta_blocks, block_count); ++ } else { ++ /* dict_get failed possibly due to a heterogeneous cluster? 
*/ ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to get key %s from dict during truncate of gfid %s", ++ GF_GET_FILE_BLOCK_COUNT, ++ uuid_utoa(local->resolver_base_inode->gfid)); + } + +- shard_make_block_abspath(cur_block, gfid, path, sizeof(path)); +- bname = strrchr(path, '/') + 1; +- loc.parent = inode_ref(priv->dot_shard_inode); +- ret = inode_path(loc.parent, bname, (char **)&(loc.path)); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s, base file gfid = %s", +- bname, uuid_utoa(gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- loc_wipe(&loc); +- wind_failed = _gf_true; +- shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, +- ENOMEM, NULL, NULL, NULL); +- goto next; ++ shard_unlink_block_inode(local, shard_block_num); ++done: ++ call_count = shard_call_count_return(frame); ++ if (call_count == 0) { ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ shard_truncate_last_shard(frame, this, local->inode_list[0]); + } ++ return 0; ++} + +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- loc.inode = inode_ref(local->inode_list[cur_block_idx]); ++int ++shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) ++{ ++ int i = 1; ++ int ret = -1; ++ int call_count = 0; ++ uint32_t cur_block = 0; ++ uint32_t last_block = 0; ++ char path[PATH_MAX] = { ++ 0, ++ }; ++ char *bname = NULL; ++ loc_t loc = { ++ 0, ++ }; ++ gf_boolean_t wind_failed = _gf_false; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ dict_t *xdata_req = NULL; + +- STACK_WIND_COOKIE(frame, shard_unlink_shards_do_cbk, +- (void *)(long)cur_block, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->unlink, &loc, local->xflag, +- local->xattr_req); +- loc_wipe(&loc); +- next: +- cur_block++; +- cur_block_idx++; +- } +- syncbarrier_wait(&local->barrier, count); +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- return 0; +-} +- +-int shard_regulated_shards_deletion(call_frame_t *cleanup_frame, xlator_t *this, +- int now, int first_block, +- gf_dirent_t *entry) { +- int i = 0; +- int ret = 0; +- shard_local_t *local = NULL; +- uuid_t gfid = { +- 0, +- }; +- +- local = cleanup_frame->local; +- +- local->inode_list = GF_CALLOC(now, sizeof(inode_t *), gf_shard_mt_inode_list); +- if (!local->inode_list) +- return -ENOMEM; +- +- local->first_block = first_block; +- local->last_block = first_block + now - 1; +- local->num_blocks = now; +- gf_uuid_parse(entry->d_name, gfid); +- gf_uuid_copy(local->base_gfid, gfid); +- local->resolver_base_inode = inode_find(this->itable, gfid); +- local->call_count = 0; +- ret = syncbarrier_init(&local->barrier); +- if (ret) { +- GF_FREE(local->inode_list); +- local->inode_list = NULL; +- inode_unref(local->resolver_base_inode); +- local->resolver_base_inode = NULL; +- return -errno; +- } +- shard_common_resolve_shards(cleanup_frame, this, +- shard_post_resolve_unlink_handler); +- +- for (i = 0; i < local->num_blocks; i++) { +- if (local->inode_list[i]) +- inode_unref(local->inode_list[i]); +- } +- GF_FREE(local->inode_list); +- local->inode_list = NULL; +- if (local->op_ret) +- ret = -local->op_errno; +- syncbarrier_destroy(&local->barrier); +- inode_unref(local->resolver_base_inode); +- local->resolver_base_inode = NULL; +- STACK_RESET(cleanup_frame->root); +- return ret; +-} +- +-int __shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, +- gf_dirent_t *entry, inode_t *inode) { +- int ret = 0; +- int shard_count = 0; 
+- int first_block = 0; +- int now = 0; +- uint64_t size = 0; +- uint64_t block_size = 0; +- uint64_t size_array[4] = { +- 0, +- }; +- void *bsize = NULL; +- void *size_attr = NULL; +- dict_t *xattr_rsp = NULL; +- loc_t loc = { +- 0, +- }; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- local = cleanup_frame->local; +- ret = dict_reset(local->xattr_req); +- if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to reset dict"); +- ret = -ENOMEM; +- goto err; +- } +- +- ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); +- if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); +- ret = -ENOMEM; +- goto err; +- } +- +- ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); +- if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); +- ret = -ENOMEM; +- goto err; +- } +- +- loc.inode = inode_ref(inode); +- loc.parent = inode_ref(priv->dot_shard_rm_inode); +- ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on %s", entry->d_name); +- ret = -ENOMEM; +- goto err; +- } +- +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, local->xattr_req, +- &xattr_rsp); +- if (ret) +- goto err; +- +- ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to get dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); +- goto err; +- } +- block_size = ntoh64(*((uint64_t *)bsize)); +- +- ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_FILE_SIZE, &size_attr); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to get dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); +- goto err; +- } +- +- memcpy(size_array, size_attr, sizeof(size_array)); +- size = ntoh64(size_array[0]); +- +- shard_count = (size / block_size) - 1; +- if (shard_count < 0) { +- gf_msg_debug(this->name, 0, "Size of %s hasn't grown beyond " +- "its shard-block-size. Nothing to delete. " +- "Returning", +- entry->d_name); +- /* File size < shard-block-size, so nothing to delete */ +- ret = 0; +- goto delete_marker; +- } +- if ((size % block_size) > 0) +- shard_count++; +- +- if (shard_count == 0) { +- gf_msg_debug(this->name, 0, "Size of %s is exactly equal to " +- "its shard-block-size. Nothing to delete. " +- "Returning", +- entry->d_name); +- ret = 0; +- goto delete_marker; +- } +- gf_msg_debug(this->name, 0, +- "base file = %s, " +- "shard-block-size=%" PRIu64 ", file-size=%" PRIu64 ", " +- "shard_count=%d", +- entry->d_name, block_size, size, shard_count); +- +- /* Perform a gfid-based lookup to see if gfid corresponding to marker +- * file's base name exists. +- */ +- loc_wipe(&loc); +- loc.inode = inode_new(this->itable); +- if (!loc.inode) { +- ret = -ENOMEM; +- goto err; +- } +- gf_uuid_parse(entry->d_name, loc.gfid); +- ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); +- if (!ret) { +- gf_msg_debug(this->name, 0, "Base shard corresponding to gfid " +- "%s is present. Skipping shard deletion. 
" +- "Returning", +- entry->d_name); +- ret = 0; +- goto delete_marker; +- } ++ local = frame->local; ++ priv = this->private; + +- first_block = 1; ++ cur_block = local->first_block + 1; ++ last_block = local->last_block; + +- while (shard_count) { +- if (shard_count < local->deletion_rate) { +- now = shard_count; +- shard_count = 0; +- } else { +- now = local->deletion_rate; +- shard_count -= local->deletion_rate; ++ /* Determine call count */ ++ for (i = 1; i < local->num_blocks; i++) { ++ if (!local->inode_list[i]) ++ continue; ++ call_count++; + } + +- gf_msg_debug(this->name, 0, "deleting %d shards starting from " +- "block %d of gfid %s", +- now, first_block, entry->d_name); +- ret = shard_regulated_shards_deletion(cleanup_frame, this, now, first_block, +- entry); +- if (ret) +- goto err; +- first_block += now; +- } ++ if (!call_count) { ++ /* Call count = 0 implies that all of the shards that need to be ++ * unlinked do not exist. So shard xlator would now proceed to ++ * do the final truncate + size updates. ++ */ ++ gf_msg_debug(this->name, 0, ++ "Shards to be unlinked as part of " ++ "truncate absent in backend: %s. Directly " ++ "proceeding to update file size", ++ uuid_utoa(inode->gfid)); ++ local->postbuf.ia_size = local->offset; ++ local->postbuf.ia_blocks = local->prebuf.ia_blocks; ++ local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ local->hole_size = 0; ++ shard_update_file_size(frame, this, local->fd, &local->loc, ++ shard_post_update_size_truncate_handler); ++ return 0; ++ } + +-delete_marker: +- loc_wipe(&loc); +- loc.inode = inode_ref(inode); +- loc.parent = inode_ref(priv->dot_shard_rm_inode); +- ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on %s", entry->d_name); +- ret = -ENOMEM; +- goto err; +- } +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- ret = syncop_unlink(FIRST_CHILD(this), &loc, NULL, NULL); +- if (ret) +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SHARDS_DELETION_FAILED, +- "Failed to delete %s " +- "from /%s", +- entry->d_name, GF_SHARD_REMOVE_ME_DIR); +-err: +- if (xattr_rsp) +- dict_unref(xattr_rsp); +- loc_wipe(&loc); +- return ret; +-} +- +-int shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, +- gf_dirent_t *entry, inode_t *inode) { +- int ret = -1; +- loc_t loc = { +- 0, +- }; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- loc.inode = inode_ref(priv->dot_shard_rm_inode); +- +- ret = syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, +- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL, NULL); +- if (ret < 0) { +- if (ret == -EAGAIN) { +- ret = 0; +- } +- goto out; +- } +- { ret = __shard_delete_shards_of_entry(cleanup_frame, this, entry, inode); } +- syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, +- ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL, NULL); +-out: +- loc_wipe(&loc); +- return ret; +-} +- +-int shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data) { +- SHARD_STACK_DESTROY(frame); +- return 0; +-} +- +-int shard_resolve_internal_dir(xlator_t *this, shard_local_t *local, +- shard_internal_dir_type_t type) { +- int ret = 0; +- char *bname = NULL; +- loc_t *loc = NULL; +- shard_priv_t *priv = NULL; +- uuid_t gfid = { +- 0, +- }; +- struct iatt stbuf = { +- 0, +- }; +- +- priv = this->private; +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- loc = 
&local->dot_shard_loc; +- gf_uuid_copy(gfid, priv->dot_shard_gfid); +- bname = GF_SHARD_DIR; +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- loc = &local->dot_shard_rm_loc; +- gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); +- bname = GF_SHARD_REMOVE_ME_DIR; +- break; +- default: +- break; +- } +- +- loc->inode = inode_find(this->itable, gfid); +- if (!loc->inode) { +- ret = shard_init_internal_dir_loc(this, local, type); +- if (ret) +- goto err; +- ret = dict_reset(local->xattr_req); ++ local->call_count = call_count; ++ i = 1; ++ xdata_req = dict_new(); ++ if (!xdata_req) { ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; ++ } ++ ret = dict_set_uint64(xdata_req, GF_GET_FILE_BLOCK_COUNT, 8 * 8); + if (ret) { +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to reset " +- "dict"); +- ret = -ENOMEM; +- goto err; +- } +- ret = dict_set_gfuuid(local->xattr_req, "gfid-req", gfid, true); +- ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, local->xattr_req, +- NULL); +- if (ret < 0) { +- if (ret != -ENOENT) +- gf_msg(this->name, GF_LOG_ERROR, -ret, SHARD_MSG_SHARDS_DELETION_FAILED, +- "Lookup on %s failed, exiting", bname); +- goto err; +- } else { +- shard_link_internal_dir_inode(local, loc->inode, &stbuf, type); ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set key %s into dict during truncate of %s", ++ GF_GET_FILE_BLOCK_COUNT, ++ uuid_utoa(local->resolver_base_inode->gfid)); ++ dict_unref(xdata_req); ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; + } +- } +- ret = 0; +-err: +- return ret; +-} +- +-int shard_lookup_marker_entry(xlator_t *this, shard_local_t *local, +- gf_dirent_t *entry) { +- int ret = 0; +- loc_t loc = { +- 0, +- }; +- +- loc.inode = inode_new(this->itable); +- if (!loc.inode) { +- ret = -ENOMEM; +- goto err; +- } +- loc.parent = inode_ref(local->fd->inode); +- +- ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on %s", entry->d_name); +- ret = -ENOMEM; +- goto err; +- } +- +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; +- +- ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); +- if (ret < 0) { +- goto err; +- } +- entry->inode = inode_ref(loc.inode); +- ret = 0; +-err: +- loc_wipe(&loc); +- return ret; +-} +- +-int shard_delete_shards(void *opaque) { +- int ret = 0; +- off_t offset = 0; +- loc_t loc = { +- 0, +- }; +- inode_t *link_inode = NULL; +- xlator_t *this = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- gf_dirent_t entries; +- gf_dirent_t *entry = NULL; +- call_frame_t *cleanup_frame = NULL; +- gf_boolean_t done = _gf_false; +- +- this = THIS; +- priv = this->private; +- INIT_LIST_HEAD(&entries.list); +- +- cleanup_frame = opaque; +- +- local = mem_get0(this->local_pool); +- if (!local) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create local to " +- "delete shards"); +- ret = -ENOMEM; +- goto err; +- } +- cleanup_frame->local = local; +- local->fop = GF_FOP_UNLINK; +- +- local->xattr_req = dict_new(); +- if (!local->xattr_req) { +- ret = -ENOMEM; +- goto err; +- } +- local->deletion_rate = priv->deletion_rate; +- +- ret = shard_resolve_internal_dir(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); +- if (ret == -ENOENT) { +- gf_msg_debug(this->name, 0, ".shard absent. Nothing to" +- " delete. 
Exiting"); +- ret = 0; +- goto err; +- } else if (ret < 0) { +- goto err; +- } + +- ret = shard_resolve_internal_dir(this, local, +- SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); +- if (ret == -ENOENT) { +- gf_msg_debug(this->name, 0, ".remove_me absent. " +- "Nothing to delete. Exiting"); +- ret = 0; +- goto err; +- } else if (ret < 0) { +- goto err; +- } +- +- local->fd = fd_anonymous(local->dot_shard_rm_loc.inode); +- if (!local->fd) { +- ret = -ENOMEM; +- goto err; +- } +- +- for (;;) { +- offset = 0; ++ SHARD_SET_ROOT_FS_ID(frame, local); ++ while (cur_block <= last_block) { ++ if (!local->inode_list[i]) { ++ cur_block++; ++ i++; ++ continue; ++ } ++ if (wind_failed) { ++ shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } ++ ++ shard_make_block_abspath(cur_block, inode->gfid, path, sizeof(path)); ++ bname = strrchr(path, '/') + 1; ++ loc.parent = inode_ref(priv->dot_shard_inode); ++ ret = inode_path(loc.parent, bname, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed" ++ " on %s. Base file gfid = %s", ++ bname, uuid_utoa(inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ loc_wipe(&loc); ++ wind_failed = _gf_true; ++ shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ loc.inode = inode_ref(local->inode_list[i]); ++ ++ STACK_WIND_COOKIE(frame, shard_truncate_htol_cbk, ++ (void *)(long)cur_block, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, &loc, 0, xdata_req); ++ loc_wipe(&loc); ++ next: ++ i++; ++ cur_block++; ++ if (!--call_count) ++ break; ++ } ++ dict_unref(xdata_req); ++ return 0; ++} ++ ++int ++shard_truncate_do(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->num_blocks == 1) { ++ /* This means that there are no shards to be unlinked. ++ * The fop boils down to truncating the last shard, updating ++ * the size and unwinding. 
++ */ ++ shard_truncate_last_shard(frame, this, local->inode_list[0]); ++ return 0; ++ } else { ++ shard_truncate_htol(frame, this, local->loc.inode); ++ } ++ return 0; ++} ++ ++int ++shard_post_lookup_shards_truncate_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } ++ ++ shard_truncate_do(frame, this); ++ return 0; ++} ++ ++void ++shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode, ++ struct iatt *buf) ++{ ++ int list_index = 0; ++ char block_bname[256] = { ++ 0, ++ }; ++ uuid_t gfid = { ++ 0, ++ }; ++ inode_t *linked_inode = NULL; ++ xlator_t *this = NULL; ++ inode_t *fsync_inode = NULL; ++ shard_priv_t *priv = NULL; ++ inode_t *base_inode = NULL; ++ ++ this = THIS; ++ priv = this->private; ++ if (local->loc.inode) { ++ gf_uuid_copy(gfid, local->loc.inode->gfid); ++ base_inode = local->loc.inode; ++ } else if (local->resolver_base_inode) { ++ gf_uuid_copy(gfid, local->resolver_base_inode->gfid); ++ base_inode = local->resolver_base_inode; ++ } else { ++ gf_uuid_copy(gfid, local->base_gfid); ++ } ++ ++ shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname)); ++ ++ shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); ++ linked_inode = inode_link(inode, priv->dot_shard_inode, block_bname, buf); ++ inode_lookup(linked_inode); ++ list_index = block_num - local->first_block; ++ local->inode_list[list_index] = linked_inode; ++ + LOCK(&priv->lock); + { +- if (priv->bg_del_state == SHARD_BG_DELETION_LAUNCHING) { +- priv->bg_del_state = SHARD_BG_DELETION_IN_PROGRESS; +- } else if (priv->bg_del_state == SHARD_BG_DELETION_IN_PROGRESS) { +- priv->bg_del_state = SHARD_BG_DELETION_NONE; +- done = _gf_true; +- } ++ fsync_inode = __shard_update_shards_inode_list( ++ linked_inode, this, base_inode, block_num, gfid); + } + UNLOCK(&priv->lock); +- if (done) +- break; +- while ((ret = syncop_readdirp(FIRST_CHILD(this), local->fd, 131072, offset, +- &entries, local->xattr_req, NULL))) { +- if (ret > 0) +- ret = 0; +- list_for_each_entry(entry, &entries.list, list) { +- offset = entry->d_off; ++ if (fsync_inode) ++ shard_initiate_evicted_inode_fsync(this, fsync_inode); ++} ++ ++int ++shard_common_lookup_shards_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, int32_t op_errno, ++ inode_t *inode, struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ int call_count = 0; ++ int shard_block_num = (long)cookie; ++ uuid_t gfid = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ if (local->resolver_base_inode) ++ gf_uuid_copy(gfid, local->resolver_base_inode->gfid); ++ else ++ gf_uuid_copy(gfid, local->base_gfid); ++ ++ if (op_ret < 0) { ++ /* Ignore absence of shards in the backend in truncate fop. 
*/ ++ switch (local->fop) { ++ case GF_FOP_TRUNCATE: ++ case GF_FOP_FTRUNCATE: ++ case GF_FOP_RENAME: ++ case GF_FOP_UNLINK: ++ if (op_errno == ENOENT) ++ goto done; ++ break; ++ case GF_FOP_WRITE: ++ case GF_FOP_READ: ++ case GF_FOP_ZEROFILL: ++ case GF_FOP_DISCARD: ++ case GF_FOP_FALLOCATE: ++ if ((!local->first_lookup_done) && (op_errno == ENOENT)) { ++ LOCK(&frame->lock); ++ { ++ local->create_count++; ++ } ++ UNLOCK(&frame->lock); ++ goto done; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ /* else */ ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_LOOKUP_SHARD_FAILED, ++ "Lookup on shard %d " ++ "failed. Base file gfid = %s", ++ shard_block_num, uuid_utoa(gfid)); ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto done; ++ } ++ ++ shard_link_block_inode(local, shard_block_num, inode, buf); ++ ++done: ++ if (local->lookup_shards_barriered) { ++ syncbarrier_wake(&local->barrier); ++ return 0; ++ } else { ++ call_count = shard_call_count_return(frame); ++ if (call_count == 0) { ++ if (!local->first_lookup_done) ++ local->first_lookup_done = _gf_true; ++ local->pls_fop_handler(frame, this); ++ } ++ } ++ return 0; ++} ++ ++dict_t * ++shard_create_gfid_dict(dict_t *dict) ++{ ++ int ret = 0; ++ dict_t *new = NULL; ++ unsigned char *gfid = NULL; ++ ++ new = dict_copy_with_ref(dict, NULL); ++ if (!new) ++ return NULL; ++ ++ gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char); ++ if (!gfid) { ++ ret = -1; ++ goto out; ++ } ++ ++ gf_uuid_generate(gfid); ++ ++ ret = dict_set_gfuuid(new, "gfid-req", gfid, false); + +- if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) +- continue; ++out: ++ if (ret) { ++ dict_unref(new); ++ new = NULL; ++ GF_FREE(gfid); ++ } ++ ++ return new; ++} ++ ++int ++shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, ++ shard_post_lookup_shards_fop_handler_t handler) ++{ ++ int i = 0; ++ int ret = 0; ++ int count = 0; ++ int call_count = 0; ++ int32_t shard_idx_iter = 0; ++ int last_block = 0; ++ char path[PATH_MAX] = { ++ 0, ++ }; ++ char *bname = NULL; ++ uuid_t gfid = { ++ 0, ++ }; ++ loc_t loc = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ gf_boolean_t wind_failed = _gf_false; ++ dict_t *xattr_req = NULL; + +- if (!entry->inode) { +- ret = shard_lookup_marker_entry(this, local, entry); +- if (ret < 0) ++ priv = this->private; ++ local = frame->local; ++ count = call_count = local->call_count; ++ shard_idx_iter = local->first_block; ++ last_block = local->last_block; ++ local->pls_fop_handler = handler; ++ if (local->lookup_shards_barriered) ++ local->barrier.waitfor = local->call_count; ++ ++ if (inode) ++ gf_uuid_copy(gfid, inode->gfid); ++ else ++ gf_uuid_copy(gfid, local->base_gfid); ++ ++ while (shard_idx_iter <= last_block) { ++ if (local->inode_list[i]) { ++ i++; ++ shard_idx_iter++; + continue; + } +- link_inode = inode_link(entry->inode, local->fd->inode, entry->d_name, +- &entry->d_stat); + +- gf_msg_debug(this->name, 0, "Initiating deletion of " +- "shards of gfid %s", +- entry->d_name); +- ret = shard_delete_shards_of_entry(cleanup_frame, this, entry, +- link_inode); +- inode_unlink(link_inode, local->fd->inode, entry->d_name); +- inode_unref(link_inode); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, -ret, +- SHARD_MSG_SHARDS_DELETION_FAILED, +- "Failed to clean up shards of gfid %s", entry->d_name); +- continue; ++ if (wind_failed) { ++ shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, ++ this, -1, ENOMEM, NULL, NULL, NULL, ++ NULL); 
++ goto next; + } +- gf_msg(this->name, GF_LOG_INFO, 0, SHARD_MSG_SHARD_DELETION_COMPLETED, +- "Deleted " +- "shards of gfid=%s from backend", +- entry->d_name); +- } +- gf_dirent_free(&entries); +- if (ret) +- break; +- } +- } +- ret = 0; +- loc_wipe(&loc); +- return ret; ++ ++ shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); ++ ++ bname = strrchr(path, '/') + 1; ++ loc.inode = inode_new(this->itable); ++ loc.parent = inode_ref(priv->dot_shard_inode); ++ gf_uuid_copy(loc.pargfid, priv->dot_shard_gfid); ++ ret = inode_path(loc.parent, bname, (char **)&(loc.path)); ++ if (ret < 0 || !(loc.inode)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed" ++ " on %s, base file gfid = %s", ++ bname, uuid_utoa(gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ loc_wipe(&loc); ++ wind_failed = _gf_true; ++ shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, ++ this, -1, ENOMEM, NULL, NULL, NULL, ++ NULL); ++ goto next; ++ } ++ ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ++ xattr_req = shard_create_gfid_dict(local->xattr_req); ++ if (!xattr_req) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ loc_wipe(&loc); ++ shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, ++ this, -1, ENOMEM, NULL, NULL, NULL, ++ NULL); ++ goto next; ++ } ++ ++ STACK_WIND_COOKIE(frame, shard_common_lookup_shards_cbk, ++ (void *)(long)shard_idx_iter, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, &loc, xattr_req); ++ loc_wipe(&loc); ++ dict_unref(xattr_req); ++ next: ++ shard_idx_iter++; ++ i++; ++ ++ if (!--call_count) ++ break; ++ } ++ if (local->lookup_shards_barriered) { ++ syncbarrier_wait(&local->barrier, count); ++ local->pls_fop_handler(frame, this); ++ } ++ return 0; ++} ++ ++int ++shard_post_resolve_truncate_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ if (local->op_errno == ENOENT) { ++ /* If lookup on /.shard fails with ENOENT, it means that ++ * the file was 0-byte in size but truncated sometime in ++ * the past to a higher size which is reflected in the ++ * size xattr, and now being truncated to a lower size. ++ * In this case, the only thing that needs to be done is ++ * to update the size xattr of the file and unwind. ++ */ ++ local->first_block = local->last_block = 0; ++ local->num_blocks = 1; ++ local->call_count = 0; ++ local->op_ret = 0; ++ local->postbuf.ia_size = local->offset; ++ shard_update_file_size(frame, this, local->fd, &local->loc, ++ shard_post_update_size_truncate_handler); ++ return 0; ++ } else { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } ++ } ++ ++ if (!local->call_count) ++ shard_truncate_do(frame, this); ++ else ++ shard_common_lookup_shards(frame, this, local->loc.inode, ++ shard_post_lookup_shards_truncate_handler); ++ ++ return 0; ++} ++ ++int ++shard_truncate_begin(call_frame_t *frame, xlator_t *this) ++{ ++ int ret = 0; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ /* First participant block here is the lowest numbered block that would ++ * hold the last byte of the file post successful truncation. ++ * Last participant block is the block that contains the last byte in ++ * the current state of the file. 
++ * If (first block == last_block): ++ * then that means that the file only needs truncation of the ++ * first (or last since both are same) block. ++ * Else ++ * if (new_size % block_size == 0) ++ * then that means there is no truncate to be done with ++ * only shards from first_block + 1 through the last ++ * block needing to be unlinked. ++ * else ++ * both truncate of the first block and unlink of the ++ * remaining shards until end of file is required. ++ */ ++ local->first_block = (local->offset == 0) ++ ? 0 ++ : get_lowest_block(local->offset - 1, ++ local->block_size); ++ local->last_block = get_highest_block(0, local->prebuf.ia_size, ++ local->block_size); ++ ++ local->num_blocks = local->last_block - local->first_block + 1; ++ GF_ASSERT(local->num_blocks > 0); ++ local->resolver_base_inode = (local->fop == GF_FOP_TRUNCATE) ++ ? local->loc.inode ++ : local->fd->inode; ++ ++ if ((local->first_block == 0) && (local->num_blocks == 1)) { ++ if (local->fop == GF_FOP_TRUNCATE) ++ STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->truncate, &local->loc, ++ local->offset, local->xattr_req); ++ else ++ STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->ftruncate, local->fd, ++ local->offset, local->xattr_req); ++ return 0; ++ } ++ ++ local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), ++ gf_shard_mt_inode_list); ++ if (!local->inode_list) ++ goto err; ++ ++ local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); ++ if (!local->dot_shard_loc.inode) { ++ ret = shard_init_internal_dir_loc(this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ if (ret) ++ goto err; ++ shard_lookup_internal_dir(frame, this, ++ shard_post_resolve_truncate_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ } else { ++ local->post_res_handler = shard_post_resolve_truncate_handler; ++ shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); ++ } ++ return 0; + + err: +- LOCK(&priv->lock); +- { priv->bg_del_state = SHARD_BG_DELETION_NONE; } +- UNLOCK(&priv->lock); +- loc_wipe(&loc); +- return ret; +-} +- +-int shard_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *xdata) { +- if (op_ret) +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, +- "Unlock failed. 
Please check brick logs for " +- "more details"); +- SHARD_STACK_DESTROY(frame); +- return 0; +-} +- +-int shard_unlock_inodelk(call_frame_t *frame, xlator_t *this) { +- loc_t *loc = NULL; +- call_frame_t *lk_frame = NULL; +- shard_local_t *local = NULL; +- shard_local_t *lk_local = NULL; +- shard_inodelk_t *lock = NULL; +- +- local = frame->local; +- lk_frame = local->inodelk_frame; +- lk_local = lk_frame->local; +- local->inodelk_frame = NULL; +- loc = &local->int_inodelk.loc; +- lock = &lk_local->int_inodelk; +- lock->flock.l_type = F_UNLCK; +- +- STACK_WIND(lk_frame, shard_unlock_inodelk_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->inodelk, lock->domain, loc, F_SETLK, +- &lock->flock, NULL); +- local->int_inodelk.acquired_lock = _gf_false; +- return 0; +-} +- +-int shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *buf, +- struct iatt *preoldparent, struct iatt *postoldparent, +- struct iatt *prenewparent, struct iatt *postnewparent, +- dict_t *xdata); +-int shard_rename_src_base_file(call_frame_t *frame, xlator_t *this) { +- int ret = 0; +- loc_t *dst_loc = NULL; +- loc_t tmp_loc = { +- 0, +- }; +- shard_local_t *local = frame->local; +- +- if (local->dst_block_size) { +- tmp_loc.parent = inode_ref(local->loc2.parent); +- ret = inode_path(tmp_loc.parent, local->loc2.name, (char **)&tmp_loc.path); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on pargfid=%s bname=%s", +- uuid_utoa(tmp_loc.parent->gfid), local->loc2.name); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto err; +- } +- +- tmp_loc.name = strrchr(tmp_loc.path, '/'); +- if (tmp_loc.name) +- tmp_loc.name++; +- dst_loc = &tmp_loc; +- } else { +- dst_loc = &local->loc2; +- } +- +- /* To-Do: Request open-fd count on dst base file */ +- STACK_WIND(frame, shard_rename_src_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->rename, &local->loc, dst_loc, +- local->xattr_req); +- loc_wipe(&tmp_loc); +- return 0; +-err: +- loc_wipe(&tmp_loc); +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +-} +- +-int shard_unlink_base_file(call_frame_t *frame, xlator_t *this); +- +-int shard_set_size_attrs_on_marker_file_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, dict_t *dict, +- dict_t *xdata) { +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- +- priv = this->private; +- local = frame->local; +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, +- "Xattrop on marker file failed " +- "while performing %s; entry gfid=%s", +- gf_fop_string(local->fop), local->newloc.name); +- goto err; +- } +- +- inode_unlink(local->newloc.inode, priv->dot_shard_rm_inode, +- local->newloc.name); +- +- if (local->fop == GF_FOP_UNLINK) +- shard_unlink_base_file(frame, this); +- else if (local->fop == GF_FOP_RENAME) +- shard_rename_src_base_file(frame, this); +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); +- return 0; +-} +- +-int shard_set_size_attrs_on_marker_file(call_frame_t *frame, xlator_t *this) { +- int op_errno = ENOMEM; +- uint64_t bs = 0; +- dict_t *xdata = NULL; +- shard_local_t *local = NULL; +- +- local = frame->local; +- xdata = dict_new(); +- if (!xdata) +- goto err; +- +- if (local->fop == GF_FOP_UNLINK) +- bs = local->block_size; +- else if (local->fop == GF_FOP_RENAME) +- bs = local->dst_block_size; +- 
SHARD_INODE_CREATE_INIT(this, bs, xdata, &local->newloc, +- local->prebuf.ia_size, 0, err); +- STACK_WIND(frame, shard_set_size_attrs_on_marker_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->xattrop, &local->newloc, +- GF_XATTROP_GET_AND_SET, xdata, NULL); +- dict_unref(xdata); +- return 0; +-err: +- if (xdata) +- dict_unref(xdata); +- shard_common_failure_unwind(local->fop, frame, -1, op_errno); +- return 0; ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_lookup_marker_file_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) { +- inode_t *linked_inode = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- +- local = frame->local; +- priv = this->private; +- +- if (op_ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, +- "Lookup on marker file failed " +- "while performing %s; entry gfid=%s", +- gf_fop_string(local->fop), local->newloc.name); +- goto err; +- } +- +- linked_inode = +- inode_link(inode, priv->dot_shard_rm_inode, local->newloc.name, buf); +- inode_unref(local->newloc.inode); +- local->newloc.inode = linked_inode; +- shard_set_size_attrs_on_marker_file(frame, this); +- return 0; ++int ++shard_post_lookup_truncate_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ struct iatt tmp_stbuf = { ++ 0, ++ }; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } ++ ++ local->postbuf = tmp_stbuf = local->prebuf; ++ ++ if (local->prebuf.ia_size == local->offset) { ++ /* If the file size is same as requested size, unwind the call ++ * immediately. ++ */ ++ if (local->fop == GF_FOP_TRUNCATE) ++ SHARD_STACK_UNWIND(truncate, frame, 0, 0, &local->prebuf, ++ &local->postbuf, NULL); ++ else ++ SHARD_STACK_UNWIND(ftruncate, frame, 0, 0, &local->prebuf, ++ &local->postbuf, NULL); ++ } else if (local->offset > local->prebuf.ia_size) { ++ /* If the truncate is from a lower to a higher size, set the ++ * new size xattr and unwind. ++ */ ++ local->hole_size = local->offset - local->prebuf.ia_size; ++ local->delta_size = 0; ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ local->postbuf.ia_size = local->offset; ++ tmp_stbuf.ia_size = local->offset; ++ shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, ++ SHARD_INODE_WRITE_MASK); ++ shard_update_file_size(frame, this, NULL, &local->loc, ++ shard_post_update_size_truncate_handler); ++ } else { ++ /* ... else ++ * i. unlink all shards that need to be unlinked. ++ * ii. truncate the last of the shards. ++ * iii. update the new size using setxattr. ++ * and unwind the fop. ++ */ ++ local->hole_size = 0; ++ local->delta_size = (local->offset - local->prebuf.ia_size); ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ tmp_stbuf.ia_size = local->offset; ++ shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, ++ SHARD_INODE_WRITE_MASK); ++ shard_truncate_begin(frame, this); ++ } ++ return 0; ++} ++ ++/* TO-DO: ++ * Fix updates to size and block count with racing write(s) and truncate(s). 
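As an editorial aside (not part of the patch): the shrinking-truncate path above boils down to mapping the old and new file sizes onto shard block indices. The standalone sketch below assumes that get_lowest_block() and get_highest_block() — defined elsewhere in shard.h and not visible in this hunk — behave as plain floor divisions over the byte range, and walks one example with the default 64MB shard block size.

    #include <inttypes.h>
    #include <stdio.h>

    /* Assumed behaviour of the helpers used by shard_truncate_begin(). */
    static uint64_t
    lowest_block(uint64_t offset, uint64_t block_size)
    {
        return offset / block_size; /* block holding the byte at 'offset' */
    }

    static uint64_t
    highest_block(uint64_t offset, uint64_t size, uint64_t block_size)
    {
        if (size == 0)
            return 0;
        return (offset + size - 1) / block_size; /* block holding the last byte */
    }

    int
    main(void)
    {
        uint64_t block_size = 64ULL * 1024 * 1024; /* default shard-block-size */
        uint64_t cur_size = 200ULL * 1024 * 1024;  /* current file size */
        uint64_t new_size = 70ULL * 1024 * 1024;   /* truncating down to 70MB */

        uint64_t first = new_size ? lowest_block(new_size - 1, block_size) : 0;
        uint64_t last = highest_block(0, cur_size, block_size);

        /* first = 1, last = 3: shard 1 is truncated to hold the new last byte,
         * shards 2 and 3 are unlinked, then the size xattr is updated. */
        printf("first=%" PRIu64 " last=%" PRIu64 " participants=%" PRIu64 "\n",
               first, last, last - first + 1);
        return 0;
    }

With these numbers the base file keeps blocks 0 and 1, which matches the comment in shard_truncate_begin() about truncating the first participant block and unlinking the rest.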
++ */ ++ ++int ++shard_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(loc->inode->gfid)); ++ goto err; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); ++ return 0; ++ } ++ ++ if (!this->itable) ++ this->itable = loc->inode->table; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ++ ret = syncbarrier_init(&local->barrier); ++ if (ret) ++ goto err; ++ loc_copy(&local->loc, loc); ++ local->offset = offset; ++ local->block_size = block_size; ++ local->fop = GF_FOP_TRUNCATE; ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ local->resolver_base_inode = loc->inode; ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_post_lookup_truncate_handler); ++ return 0; ++ + err: +- shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); +- return 0; ++ shard_common_failure_unwind(GF_FOP_TRUNCATE, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_lookup_marker_file(call_frame_t *frame, xlator_t *this) { +- int op_errno = ENOMEM; +- dict_t *xattr_req = NULL; +- shard_local_t *local = NULL; ++int ++shard_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(fd->inode->gfid)); ++ goto err; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); ++ return 0; ++ } ++ ++ if (!this->itable) ++ this->itable = fd->inode->table; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ret = syncbarrier_init(&local->barrier); ++ if (ret) ++ goto err; ++ local->fd = fd_ref(fd); ++ local->offset = offset; ++ local->block_size = block_size; ++ local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ local->fop = GF_FOP_FTRUNCATE; ++ ++ local->loc.inode = inode_ref(fd->inode); ++ gf_uuid_copy(local->loc.gfid, fd->inode->gfid); ++ local->resolver_base_inode = fd->inode; ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_post_lookup_truncate_handler); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_FTRUNCATE, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ int ret = -1; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret == -1) ++ goto unwind; ++ ++ ret = shard_inode_ctx_set(inode, this, buf, local->block_size, ++ SHARD_ALL_MASK); ++ if (ret) ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, ++ "Failed to set inode " ++ "ctx for %s", ++ uuid_utoa(inode->gfid)); ++ ++unwind: ++ SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent, ++ postparent, xdata); ++ ++ return 0; ++} ++ ++int ++shard_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, ++ dev_t rdev, mode_t umask, dict_t *xdata) ++{ ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ priv = this->private; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ local->block_size = priv->block_size; ++ if (!__is_gsyncd_on_shard_dir(frame, loc)) { ++ SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); ++ } ++ ++ STACK_WIND(frame, shard_mknod_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_MKNOD, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int32_t ++shard_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ if (op_ret < 0) ++ goto err; ++ ++ shard_inode_ctx_set(inode, this, buf, 0, ++ SHARD_MASK_NLINK | SHARD_MASK_TIMES); ++ buf->ia_size = local->prebuf.ia_size; ++ buf->ia_blocks = local->prebuf.ia_blocks; ++ ++ SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent, ++ postparent, xdata); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_LINK, frame, op_ret, op_errno); ++ return 0; ++} ++ ++int ++shard_post_lookup_link_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ SHARD_STACK_UNWIND(link, frame, local->op_ret, local->op_errno, NULL, ++ NULL, NULL, NULL, NULL); ++ return 0; ++ } ++ ++ STACK_WIND(frame, shard_link_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->link, &local->loc, &local->loc2, ++ local->xattr_req); ++ return 0; ++} ++ ++int32_t ++shard_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(oldloc->inode->gfid)); ++ goto err; ++ } ++ ++ if 
(!block_size) { ++ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, ++ oldloc, newloc, xdata); ++ return 0; ++ } ++ ++ if (!this->itable) ++ this->itable = oldloc->inode->table; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ++ loc_copy(&local->loc, oldloc); ++ loc_copy(&local->loc2, newloc); ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_post_lookup_link_handler); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_LINK, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode); ++ ++int ++shard_post_lookup_shards_unlink_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ uuid_t gfid = { ++ 0, ++ }; ++ ++ local = frame->local; ++ ++ if (local->resolver_base_inode) ++ gf_uuid_copy(gfid, local->resolver_base_inode->gfid); ++ else ++ gf_uuid_copy(gfid, local->base_gfid); ++ ++ if ((local->op_ret < 0) && (local->op_errno != ENOENT)) { ++ gf_msg(this->name, GF_LOG_ERROR, local->op_errno, SHARD_MSG_FOP_FAILED, ++ "failed to delete shards of %s", uuid_utoa(gfid)); ++ return 0; ++ } ++ local->op_ret = 0; ++ local->op_errno = 0; ++ ++ shard_unlink_shards_do(frame, this, local->resolver_base_inode); ++ return 0; ++} ++ ++int ++shard_post_resolve_unlink_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ local->lookup_shards_barriered = _gf_true; ++ ++ if (!local->call_count) ++ shard_unlink_shards_do(frame, this, local->resolver_base_inode); ++ else ++ shard_common_lookup_shards(frame, this, local->resolver_base_inode, ++ shard_post_lookup_shards_unlink_handler); ++ return 0; ++} ++ ++void ++shard_unlink_block_inode(shard_local_t *local, int shard_block_num) ++{ ++ char block_bname[256] = { ++ 0, ++ }; ++ uuid_t gfid = { ++ 0, ++ }; ++ inode_t *inode = NULL; ++ inode_t *base_inode = NULL; ++ xlator_t *this = NULL; ++ shard_priv_t *priv = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *base_ictx = NULL; ++ int unref_base_inode = 0; ++ int unref_shard_inode = 0; ++ ++ this = THIS; ++ priv = this->private; ++ ++ inode = local->inode_list[shard_block_num - local->first_block]; ++ shard_inode_ctx_get(inode, this, &ctx); ++ base_inode = ctx->base_inode; ++ if (base_inode) ++ gf_uuid_copy(gfid, base_inode->gfid); ++ else ++ gf_uuid_copy(gfid, ctx->base_gfid); ++ shard_make_block_bname(shard_block_num, gfid, block_bname, ++ sizeof(block_bname)); ++ ++ LOCK(&priv->lock); ++ if (base_inode) ++ LOCK(&base_inode->lock); ++ LOCK(&inode->lock); ++ { ++ __shard_inode_ctx_get(inode, this, &ctx); ++ if (!list_empty(&ctx->ilist)) { ++ list_del_init(&ctx->ilist); ++ priv->inode_count--; ++ unref_base_inode++; ++ unref_shard_inode++; ++ GF_ASSERT(priv->inode_count >= 0); ++ } ++ if (ctx->fsync_needed) { ++ unref_base_inode++; ++ unref_shard_inode++; ++ list_del_init(&ctx->to_fsync_list); ++ if (base_inode) { ++ __shard_inode_ctx_get(base_inode, this, &base_ictx); ++ base_ictx->fsync_count--; ++ } ++ } ++ } ++ UNLOCK(&inode->lock); ++ if (base_inode) ++ UNLOCK(&base_inode->lock); ++ ++ inode_unlink(inode, priv->dot_shard_inode, block_bname); ++ inode_ref_reduce_by_n(inode, unref_shard_inode); ++ inode_forget(inode, 0); ++ ++ if (base_inode && unref_base_inode) ++ inode_ref_reduce_by_n(base_inode, unref_base_inode); ++ UNLOCK(&priv->lock); ++} ++ ++int 
++shard_rename_cbk(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ SHARD_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->preoldparent, ++ &local->postoldparent, &local->prenewparent, ++ &local->postnewparent, local->xattr_rsp); ++ return 0; ++} ++ ++int32_t ++shard_unlink_cbk(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = frame->local; ++ ++ SHARD_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, ++ &local->preoldparent, &local->postoldparent, ++ local->xattr_rsp); ++ return 0; ++} ++ ++int ++shard_unlink_shards_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *preparent, struct iatt *postparent, ++ dict_t *xdata) ++{ ++ int shard_block_num = (long)cookie; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto done; ++ } ++ ++ shard_unlink_block_inode(local, shard_block_num); ++done: ++ syncbarrier_wake(&local->barrier); ++ return 0; ++} ++ ++int ++shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) ++{ ++ int i = 0; ++ int ret = -1; ++ int count = 0; ++ uint32_t cur_block = 0; ++ uint32_t cur_block_idx = 0; /*this is idx into inode_list[] array */ ++ char *bname = NULL; ++ char path[PATH_MAX] = { ++ 0, ++ }; ++ uuid_t gfid = { ++ 0, ++ }; ++ loc_t loc = { ++ 0, ++ }; ++ gf_boolean_t wind_failed = _gf_false; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ if (inode) ++ gf_uuid_copy(gfid, inode->gfid); ++ else ++ gf_uuid_copy(gfid, local->base_gfid); ++ ++ for (i = 0; i < local->num_blocks; i++) { ++ if (!local->inode_list[i]) ++ continue; ++ count++; ++ } ++ ++ if (!count) { ++ /* callcount = 0 implies that all of the shards that need to be ++ * unlinked are non-existent (in other words the file is full of ++ * holes). 
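As an editorial aside (not part of the patch): the unlink loop below composes each shard's name from the base file's gfid and the block number through shard_make_block_abspath()/shard_make_block_bname(), whose definitions are not in this hunk. The sketch assumes the conventional "<gfid>.<block>" naming under /.shard; the gfid string used here is made up for illustration.

    #include <stdio.h>

    /* Assumed shard naming: "<base-gfid>.<block-number>" under /.shard. */
    static void
    make_block_bname(char *buf, size_t len, const char *gfid_str, int block_num)
    {
        snprintf(buf, len, "%s.%d", gfid_str, block_num);
    }

    int
    main(void)
    {
        char bname[128];
        char path[256];

        make_block_bname(bname, sizeof(bname),
                         "0f2d1c3a-9b7e-4d10-a1f2-33445566aabb", 4);
        snprintf(path, sizeof(path), "/.shard/%s", bname);
        printf("unlink target: %s\n", path); /* /.shard/<gfid>.4 */
        return 0;
    }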
++ */ ++ gf_msg_debug(this->name, 0, ++ "All shards that need to be " ++ "unlinked are non-existent: %s", ++ uuid_utoa(gfid)); ++ return 0; ++ } ++ ++ SHARD_SET_ROOT_FS_ID(frame, local); ++ local->barrier.waitfor = count; ++ cur_block = cur_block_idx + local->first_block; ++ ++ while (cur_block_idx < local->num_blocks) { ++ if (!local->inode_list[cur_block_idx]) ++ goto next; ++ ++ if (wind_failed) { ++ shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } ++ ++ shard_make_block_abspath(cur_block, gfid, path, sizeof(path)); ++ bname = strrchr(path, '/') + 1; ++ loc.parent = inode_ref(priv->dot_shard_inode); ++ ret = inode_path(loc.parent, bname, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed" ++ " on %s, base file gfid = %s", ++ bname, uuid_utoa(gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ loc_wipe(&loc); ++ wind_failed = _gf_true; ++ shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } ++ ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ loc.inode = inode_ref(local->inode_list[cur_block_idx]); ++ ++ STACK_WIND_COOKIE(frame, shard_unlink_shards_do_cbk, ++ (void *)(long)cur_block, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, &loc, local->xflag, ++ local->xattr_req); ++ loc_wipe(&loc); ++ next: ++ cur_block++; ++ cur_block_idx++; ++ } ++ syncbarrier_wait(&local->barrier, count); ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ return 0; ++} ++ ++int ++shard_regulated_shards_deletion(call_frame_t *cleanup_frame, xlator_t *this, ++ int now, int first_block, gf_dirent_t *entry) ++{ ++ int i = 0; ++ int ret = 0; ++ shard_local_t *local = NULL; ++ uuid_t gfid = { ++ 0, ++ }; ++ ++ local = cleanup_frame->local; ++ ++ local->inode_list = GF_CALLOC(now, sizeof(inode_t *), ++ gf_shard_mt_inode_list); ++ if (!local->inode_list) ++ return -ENOMEM; ++ ++ local->first_block = first_block; ++ local->last_block = first_block + now - 1; ++ local->num_blocks = now; ++ gf_uuid_parse(entry->d_name, gfid); ++ gf_uuid_copy(local->base_gfid, gfid); ++ local->resolver_base_inode = inode_find(this->itable, gfid); ++ local->call_count = 0; ++ ret = syncbarrier_init(&local->barrier); ++ if (ret) { ++ GF_FREE(local->inode_list); ++ local->inode_list = NULL; ++ inode_unref(local->resolver_base_inode); ++ local->resolver_base_inode = NULL; ++ return -errno; ++ } ++ shard_common_resolve_shards(cleanup_frame, this, ++ shard_post_resolve_unlink_handler); ++ ++ for (i = 0; i < local->num_blocks; i++) { ++ if (local->inode_list[i]) ++ inode_unref(local->inode_list[i]); ++ } ++ GF_FREE(local->inode_list); ++ local->inode_list = NULL; ++ if (local->op_ret) ++ ret = -local->op_errno; ++ syncbarrier_destroy(&local->barrier); ++ inode_unref(local->resolver_base_inode); ++ local->resolver_base_inode = NULL; ++ STACK_RESET(cleanup_frame->root); ++ return ret; ++} ++ ++int ++__shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, ++ gf_dirent_t *entry, inode_t *inode) ++{ ++ int ret = 0; ++ int shard_count = 0; ++ int first_block = 0; ++ int now = 0; ++ uint64_t size = 0; ++ uint64_t block_size = 0; ++ uint64_t size_array[4] = { ++ 0, ++ }; ++ void *bsize = NULL; ++ void *size_attr = NULL; ++ dict_t *xattr_rsp = NULL; ++ loc_t loc = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ local = 
cleanup_frame->local; ++ ret = dict_reset(local->xattr_req); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to reset dict"); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ loc.inode = inode_ref(inode); ++ loc.parent = inode_ref(priv->dot_shard_rm_inode); ++ ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", entry->d_name); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, local->xattr_req, ++ &xattr_rsp); ++ if (ret) ++ goto err; ++ ++ ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to get dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); ++ goto err; ++ } ++ block_size = ntoh64(*((uint64_t *)bsize)); ++ ++ ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_FILE_SIZE, &size_attr); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to get dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); ++ goto err; ++ } ++ ++ memcpy(size_array, size_attr, sizeof(size_array)); ++ size = ntoh64(size_array[0]); ++ ++ shard_count = (size / block_size) - 1; ++ if (shard_count < 0) { ++ gf_msg_debug(this->name, 0, ++ "Size of %s hasn't grown beyond " ++ "its shard-block-size. Nothing to delete. " ++ "Returning", ++ entry->d_name); ++ /* File size < shard-block-size, so nothing to delete */ ++ ret = 0; ++ goto delete_marker; ++ } ++ if ((size % block_size) > 0) ++ shard_count++; ++ ++ if (shard_count == 0) { ++ gf_msg_debug(this->name, 0, ++ "Size of %s is exactly equal to " ++ "its shard-block-size. Nothing to delete. " ++ "Returning", ++ entry->d_name); ++ ret = 0; ++ goto delete_marker; ++ } ++ gf_msg_debug(this->name, 0, ++ "base file = %s, " ++ "shard-block-size=%" PRIu64 ", file-size=%" PRIu64 ++ ", " ++ "shard_count=%d", ++ entry->d_name, block_size, size, shard_count); ++ ++ /* Perform a gfid-based lookup to see if gfid corresponding to marker ++ * file's base name exists. ++ */ ++ loc_wipe(&loc); ++ loc.inode = inode_new(this->itable); ++ if (!loc.inode) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ gf_uuid_parse(entry->d_name, loc.gfid); ++ ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); ++ if (!ret) { ++ gf_msg_debug(this->name, 0, ++ "Base shard corresponding to gfid " ++ "%s is present. Skipping shard deletion. 
" ++ "Returning", ++ entry->d_name); ++ ret = 0; ++ goto delete_marker; ++ } ++ ++ first_block = 1; ++ ++ while (shard_count) { ++ if (shard_count < local->deletion_rate) { ++ now = shard_count; ++ shard_count = 0; ++ } else { ++ now = local->deletion_rate; ++ shard_count -= local->deletion_rate; ++ } ++ ++ gf_msg_debug(this->name, 0, ++ "deleting %d shards starting from " ++ "block %d of gfid %s", ++ now, first_block, entry->d_name); ++ ret = shard_regulated_shards_deletion(cleanup_frame, this, now, ++ first_block, entry); ++ if (ret) ++ goto err; ++ first_block += now; ++ } ++ ++delete_marker: ++ loc_wipe(&loc); ++ loc.inode = inode_ref(inode); ++ loc.parent = inode_ref(priv->dot_shard_rm_inode); ++ ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", entry->d_name); ++ ret = -ENOMEM; ++ goto err; ++ } ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ret = syncop_unlink(FIRST_CHILD(this), &loc, NULL, NULL); ++ if (ret) ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SHARDS_DELETION_FAILED, ++ "Failed to delete %s " ++ "from /%s", ++ entry->d_name, GF_SHARD_REMOVE_ME_DIR); ++err: ++ if (xattr_rsp) ++ dict_unref(xattr_rsp); ++ loc_wipe(&loc); ++ return ret; ++} ++ ++int ++shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, ++ gf_dirent_t *entry, inode_t *inode) ++{ ++ int ret = -1; ++ loc_t loc = { ++ 0, ++ }; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ loc.inode = inode_ref(priv->dot_shard_rm_inode); ++ ++ ret = syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, ++ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL, NULL); ++ if (ret < 0) { ++ if (ret == -EAGAIN) { ++ ret = 0; ++ } ++ goto out; ++ } ++ { ++ ret = __shard_delete_shards_of_entry(cleanup_frame, this, entry, inode); ++ } ++ syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, ++ ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL, NULL); ++out: ++ loc_wipe(&loc); ++ return ret; ++} ++ ++int ++shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data) ++{ ++ SHARD_STACK_DESTROY(frame); ++ return 0; ++} ++ ++int ++shard_resolve_internal_dir(xlator_t *this, shard_local_t *local, ++ shard_internal_dir_type_t type) ++{ ++ int ret = 0; ++ char *bname = NULL; ++ loc_t *loc = NULL; ++ shard_priv_t *priv = NULL; ++ uuid_t gfid = { ++ 0, ++ }; ++ struct iatt stbuf = { ++ 0, ++ }; ++ ++ priv = this->private; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ loc = &local->dot_shard_loc; ++ gf_uuid_copy(gfid, priv->dot_shard_gfid); ++ bname = GF_SHARD_DIR; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ loc = &local->dot_shard_rm_loc; ++ gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); ++ bname = GF_SHARD_REMOVE_ME_DIR; ++ break; ++ default: ++ break; ++ } ++ ++ loc->inode = inode_find(this->itable, gfid); ++ if (!loc->inode) { ++ ret = shard_init_internal_dir_loc(this, local, type); ++ if (ret) ++ goto err; ++ ret = dict_reset(local->xattr_req); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to reset " ++ "dict"); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ret = dict_set_gfuuid(local->xattr_req, "gfid-req", gfid, true); ++ ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, ++ local->xattr_req, NULL); ++ if (ret < 0) { ++ if (ret != -ENOENT) ++ gf_msg(this->name, GF_LOG_ERROR, -ret, ++ SHARD_MSG_SHARDS_DELETION_FAILED, ++ "Lookup on %s failed, exiting", bname); ++ goto 
err; ++ } else { ++ shard_link_internal_dir_inode(local, loc->inode, &stbuf, type); ++ } ++ } ++ ret = 0; ++err: ++ return ret; ++} ++ ++int ++shard_lookup_marker_entry(xlator_t *this, shard_local_t *local, ++ gf_dirent_t *entry) ++{ ++ int ret = 0; ++ loc_t loc = { ++ 0, ++ }; ++ ++ loc.inode = inode_new(this->itable); ++ if (!loc.inode) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ loc.parent = inode_ref(local->fd->inode); ++ ++ ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", entry->d_name); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ++ ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); ++ if (ret < 0) { ++ goto err; ++ } ++ entry->inode = inode_ref(loc.inode); ++ ret = 0; ++err: ++ loc_wipe(&loc); ++ return ret; ++} ++ ++int ++shard_delete_shards(void *opaque) ++{ ++ int ret = 0; ++ off_t offset = 0; ++ loc_t loc = { ++ 0, ++ }; ++ inode_t *link_inode = NULL; ++ xlator_t *this = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ gf_dirent_t entries; ++ gf_dirent_t *entry = NULL; ++ call_frame_t *cleanup_frame = NULL; ++ gf_boolean_t done = _gf_false; ++ ++ this = THIS; ++ priv = this->private; ++ INIT_LIST_HEAD(&entries.list); ++ ++ cleanup_frame = opaque; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) { ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create local to " ++ "delete shards"); ++ ret = -ENOMEM; ++ goto err; ++ } ++ cleanup_frame->local = local; ++ local->fop = GF_FOP_UNLINK; ++ ++ local->xattr_req = dict_new(); ++ if (!local->xattr_req) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ local->deletion_rate = priv->deletion_rate; ++ ++ ret = shard_resolve_internal_dir(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); ++ if (ret == -ENOENT) { ++ gf_msg_debug(this->name, 0, ++ ".shard absent. Nothing to" ++ " delete. Exiting"); ++ ret = 0; ++ goto err; ++ } else if (ret < 0) { ++ goto err; ++ } ++ ++ ret = shard_resolve_internal_dir(this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); ++ if (ret == -ENOENT) { ++ gf_msg_debug(this->name, 0, ++ ".remove_me absent. " ++ "Nothing to delete. 
Exiting"); ++ ret = 0; ++ goto err; ++ } else if (ret < 0) { ++ goto err; ++ } ++ ++ local->fd = fd_anonymous(local->dot_shard_rm_loc.inode); ++ if (!local->fd) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ for (;;) { ++ offset = 0; ++ LOCK(&priv->lock); ++ { ++ if (priv->bg_del_state == SHARD_BG_DELETION_LAUNCHING) { ++ priv->bg_del_state = SHARD_BG_DELETION_IN_PROGRESS; ++ } else if (priv->bg_del_state == SHARD_BG_DELETION_IN_PROGRESS) { ++ priv->bg_del_state = SHARD_BG_DELETION_NONE; ++ done = _gf_true; ++ } ++ } ++ UNLOCK(&priv->lock); ++ if (done) ++ break; ++ while ( ++ (ret = syncop_readdirp(FIRST_CHILD(this), local->fd, 131072, offset, ++ &entries, local->xattr_req, NULL))) { ++ if (ret > 0) ++ ret = 0; ++ list_for_each_entry(entry, &entries.list, list) ++ { ++ offset = entry->d_off; ++ ++ if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) ++ continue; ++ ++ if (!entry->inode) { ++ ret = shard_lookup_marker_entry(this, local, entry); ++ if (ret < 0) ++ continue; ++ } ++ link_inode = inode_link(entry->inode, local->fd->inode, ++ entry->d_name, &entry->d_stat); ++ ++ gf_msg_debug(this->name, 0, ++ "Initiating deletion of " ++ "shards of gfid %s", ++ entry->d_name); ++ ret = shard_delete_shards_of_entry(cleanup_frame, this, entry, ++ link_inode); ++ inode_unlink(link_inode, local->fd->inode, entry->d_name); ++ inode_unref(link_inode); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, -ret, ++ SHARD_MSG_SHARDS_DELETION_FAILED, ++ "Failed to clean up shards of gfid %s", ++ entry->d_name); ++ continue; ++ } ++ gf_msg(this->name, GF_LOG_INFO, 0, ++ SHARD_MSG_SHARD_DELETION_COMPLETED, ++ "Deleted " ++ "shards of gfid=%s from backend", ++ entry->d_name); ++ } ++ gf_dirent_free(&entries); ++ if (ret) ++ break; ++ } ++ } ++ ret = 0; ++ loc_wipe(&loc); ++ return ret; ++ ++err: ++ LOCK(&priv->lock); ++ { ++ priv->bg_del_state = SHARD_BG_DELETION_NONE; ++ } ++ UNLOCK(&priv->lock); ++ loc_wipe(&loc); ++ return ret; ++} ++ ++int ++shard_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ if (op_ret) ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Unlock failed. 
Please check brick logs for " ++ "more details"); ++ SHARD_STACK_DESTROY(frame); ++ return 0; ++} ++ ++int ++shard_unlock_inodelk(call_frame_t *frame, xlator_t *this) ++{ ++ loc_t *loc = NULL; ++ call_frame_t *lk_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *lk_local = NULL; ++ shard_inodelk_t *lock = NULL; ++ ++ local = frame->local; ++ lk_frame = local->inodelk_frame; ++ lk_local = lk_frame->local; ++ local->inodelk_frame = NULL; ++ loc = &local->int_inodelk.loc; ++ lock = &lk_local->int_inodelk; ++ lock->flock.l_type = F_UNLCK; ++ ++ STACK_WIND(lk_frame, shard_unlock_inodelk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->inodelk, lock->domain, loc, F_SETLK, ++ &lock->flock, NULL); ++ local->int_inodelk.acquired_lock = _gf_false; ++ return 0; ++} ++ ++int ++shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *buf, ++ struct iatt *preoldparent, struct iatt *postoldparent, ++ struct iatt *prenewparent, struct iatt *postnewparent, ++ dict_t *xdata); ++int ++shard_rename_src_base_file(call_frame_t *frame, xlator_t *this) ++{ ++ int ret = 0; ++ loc_t *dst_loc = NULL; ++ loc_t tmp_loc = { ++ 0, ++ }; ++ shard_local_t *local = frame->local; ++ ++ if (local->dst_block_size) { ++ tmp_loc.parent = inode_ref(local->loc2.parent); ++ ret = inode_path(tmp_loc.parent, local->loc2.name, ++ (char **)&tmp_loc.path); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed" ++ " on pargfid=%s bname=%s", ++ uuid_utoa(tmp_loc.parent->gfid), local->loc2.name); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto err; ++ } ++ ++ tmp_loc.name = strrchr(tmp_loc.path, '/'); ++ if (tmp_loc.name) ++ tmp_loc.name++; ++ dst_loc = &tmp_loc; ++ } else { ++ dst_loc = &local->loc2; ++ } ++ ++ /* To-Do: Request open-fd count on dst base file */ ++ STACK_WIND(frame, shard_rename_src_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->rename, &local->loc, dst_loc, ++ local->xattr_req); ++ loc_wipe(&tmp_loc); ++ return 0; ++err: ++ loc_wipe(&tmp_loc); ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++} ++ ++int ++shard_unlink_base_file(call_frame_t *frame, xlator_t *this); ++ ++int ++shard_set_size_attrs_on_marker_file_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, dict_t *dict, ++ dict_t *xdata) ++{ ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Xattrop on marker file failed " ++ "while performing %s; entry gfid=%s", ++ gf_fop_string(local->fop), local->newloc.name); ++ goto err; ++ } ++ ++ inode_unlink(local->newloc.inode, priv->dot_shard_rm_inode, ++ local->newloc.name); ++ ++ if (local->fop == GF_FOP_UNLINK) ++ shard_unlink_base_file(frame, this); ++ else if (local->fop == GF_FOP_RENAME) ++ shard_rename_src_base_file(frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); ++ return 0; ++} ++ ++int ++shard_set_size_attrs_on_marker_file(call_frame_t *frame, xlator_t *this) ++{ ++ int op_errno = ENOMEM; ++ uint64_t bs = 0; ++ dict_t *xdata = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ xdata = dict_new(); ++ if (!xdata) ++ goto err; ++ ++ if (local->fop == GF_FOP_UNLINK) ++ bs = local->block_size; ++ else if (local->fop == GF_FOP_RENAME) ++ bs = 
local->dst_block_size; ++ SHARD_INODE_CREATE_INIT(this, bs, xdata, &local->newloc, ++ local->prebuf.ia_size, 0, err); ++ STACK_WIND(frame, shard_set_size_attrs_on_marker_file_cbk, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop, ++ &local->newloc, GF_XATTROP_GET_AND_SET, xdata, NULL); ++ dict_unref(xdata); ++ return 0; ++err: ++ if (xdata) ++ dict_unref(xdata); ++ shard_common_failure_unwind(local->fop, frame, -1, op_errno); ++ return 0; ++} ++ ++int ++shard_lookup_marker_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ inode_t *linked_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ priv = this->private; ++ ++ if (op_ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Lookup on marker file failed " ++ "while performing %s; entry gfid=%s", ++ gf_fop_string(local->fop), local->newloc.name); ++ goto err; ++ } ++ ++ linked_inode = inode_link(inode, priv->dot_shard_rm_inode, ++ local->newloc.name, buf); ++ inode_unref(local->newloc.inode); ++ local->newloc.inode = linked_inode; ++ shard_set_size_attrs_on_marker_file(frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); ++ return 0; ++} ++ ++int ++shard_lookup_marker_file(call_frame_t *frame, xlator_t *this) ++{ ++ int op_errno = ENOMEM; ++ dict_t *xattr_req = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ xattr_req = shard_create_gfid_dict(local->xattr_req); ++ if (!xattr_req) ++ goto err; ++ ++ STACK_WIND(frame, shard_lookup_marker_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, &local->newloc, xattr_req); ++ dict_unref(xattr_req); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, -1, op_errno); ++ return 0; ++} ++ ++int ++shard_create_marker_file_under_remove_me_cbk( ++ call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, ++ int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ inode_t *linked_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ priv = this->private; ++ ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ if (op_ret < 0) { ++ if ((op_errno != EEXIST) && (op_errno != ENODATA)) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Marker file creation " ++ "failed while performing %s; entry gfid=%s", ++ gf_fop_string(local->fop), local->newloc.name); ++ goto err; ++ } else { ++ shard_lookup_marker_file(frame, this); ++ return 0; ++ } ++ } ++ ++ linked_inode = inode_link(inode, priv->dot_shard_rm_inode, ++ local->newloc.name, buf); ++ inode_unref(local->newloc.inode); ++ local->newloc.inode = linked_inode; ++ ++ if (local->fop == GF_FOP_UNLINK) ++ shard_unlink_base_file(frame, this); ++ else if (local->fop == GF_FOP_RENAME) ++ shard_rename_src_base_file(frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); ++ return 0; ++} ++ ++int ++shard_create_marker_file_under_remove_me(call_frame_t *frame, xlator_t *this, ++ loc_t *loc) ++{ ++ int ret = 0; ++ int op_errno = ENOMEM; ++ uint64_t bs = 0; ++ char g1[64] = { ++ 0, ++ }; ++ char g2[64] = { ++ 0, ++ }; ++ dict_t *xattr_req = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = 
NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ SHARD_SET_ROOT_FS_ID(frame, local); ++ ++ xattr_req = shard_create_gfid_dict(local->xattr_req); ++ if (!xattr_req) ++ goto err; ++ ++ local->newloc.inode = inode_new(this->itable); ++ local->newloc.parent = inode_ref(priv->dot_shard_rm_inode); ++ ret = inode_path(local->newloc.parent, uuid_utoa(loc->inode->gfid), ++ (char **)&local->newloc.path); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on " ++ "pargfid=%s bname=%s", ++ uuid_utoa_r(priv->dot_shard_rm_gfid, g1), ++ uuid_utoa_r(loc->inode->gfid, g2)); ++ goto err; ++ } ++ local->newloc.name = strrchr(local->newloc.path, '/'); ++ if (local->newloc.name) ++ local->newloc.name++; ++ ++ if (local->fop == GF_FOP_UNLINK) ++ bs = local->block_size; ++ else if (local->fop == GF_FOP_RENAME) ++ bs = local->dst_block_size; ++ ++ SHARD_INODE_CREATE_INIT(this, bs, xattr_req, &local->newloc, ++ local->prebuf.ia_size, 0, err); ++ ++ STACK_WIND(frame, shard_create_marker_file_under_remove_me_cbk, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, ++ &local->newloc, 0, 0, 0644, xattr_req); ++ dict_unref(xattr_req); ++ return 0; ++ ++err: ++ if (xattr_req) ++ dict_unref(xattr_req); ++ shard_create_marker_file_under_remove_me_cbk(frame, 0, this, -1, op_errno, ++ NULL, NULL, NULL, NULL, NULL); ++ return 0; ++} ++ ++int ++shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); ++ ++int ++shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *preparent, struct iatt *postparent, ++ dict_t *xdata) ++{ ++ int ret = 0; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ } else { ++ shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); ++ local->preoldparent = *preparent; ++ local->postoldparent = *postparent; ++ if (xdata) ++ local->xattr_rsp = dict_ref(xdata); ++ if (local->cleanup_required) ++ shard_start_background_deletion(this); ++ } ++ ++ if (local->entrylk_frame) { ++ ret = shard_unlock_entrylk(frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ } ++ } ++ ++ ret = shard_unlock_inodelk(frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ } ++ ++ shard_unlink_cbk(frame, this); ++ return 0; ++} ++ ++int ++shard_unlink_base_file(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = frame->local; ++ ++ /* To-Do: Request open-fd count on base file */ ++ STACK_WIND(frame, shard_unlink_base_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, ++ local->xattr_req); ++ return 0; ++} ++ ++int ++shard_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ if (op_ret) ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Unlock failed. 
Please check brick logs for " ++ "more details"); ++ SHARD_STACK_DESTROY(frame); ++ return 0; ++} ++ ++int ++shard_unlock_entrylk(call_frame_t *frame, xlator_t *this) ++{ ++ loc_t *loc = NULL; ++ call_frame_t *lk_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *lk_local = NULL; ++ shard_entrylk_t *lock = NULL; ++ ++ local = frame->local; ++ lk_frame = local->entrylk_frame; ++ lk_local = lk_frame->local; ++ local->entrylk_frame = NULL; ++ lock = &lk_local->int_entrylk; ++ loc = &lock->loc; ++ ++ STACK_WIND(lk_frame, shard_unlock_entrylk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->entrylk, this->name, loc, ++ lk_local->int_entrylk.basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, ++ NULL); ++ local->int_entrylk.acquired_lock = _gf_false; ++ return 0; ++} ++ ++int ++shard_post_entrylk_fop_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ switch (local->fop) { ++ case GF_FOP_UNLINK: ++ case GF_FOP_RENAME: ++ shard_create_marker_file_under_remove_me(frame, this, ++ &local->int_inodelk.loc); ++ break; ++ default: ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "post-entrylk handler not defined. This case should not" ++ " be hit"); ++ break; ++ } ++ return 0; ++} ++ ++int ++shard_acquire_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ call_frame_t *main_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *main_local = NULL; ++ ++ local = frame->local; ++ main_frame = local->main_frame; ++ main_local = main_frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(main_local->fop, main_frame, op_ret, ++ op_errno); ++ return 0; ++ } ++ main_local->int_entrylk.acquired_lock = _gf_true; ++ shard_post_entrylk_fop_handler(main_frame, this); ++ return 0; ++} ++ ++int ++shard_acquire_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, ++ uuid_t gfid) ++{ ++ char gfid_str[GF_UUID_BUF_SIZE] = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ shard_local_t *entrylk_local = NULL; ++ shard_entrylk_t *int_entrylk = NULL; ++ call_frame_t *entrylk_frame = NULL; ++ ++ local = frame->local; ++ entrylk_frame = create_frame(this, this->ctx->pool); ++ if (!entrylk_frame) { ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create new frame " ++ "to lock marker file"); ++ goto err; ++ } ++ ++ entrylk_local = mem_get0(this->local_pool); ++ if (!entrylk_local) { ++ STACK_DESTROY(entrylk_frame->root); ++ goto err; ++ } ++ ++ entrylk_frame->local = entrylk_local; ++ entrylk_local->main_frame = frame; ++ int_entrylk = &entrylk_local->int_entrylk; ++ ++ int_entrylk->loc.inode = inode_ref(inode); ++ set_lk_owner_from_ptr(&entrylk_frame->root->lk_owner, entrylk_frame->root); ++ local->entrylk_frame = entrylk_frame; ++ gf_uuid_unparse(gfid, gfid_str); ++ int_entrylk->basename = gf_strdup(gfid_str); ++ ++ STACK_WIND(entrylk_frame, shard_acquire_entrylk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->entrylk, this->name, &int_entrylk->loc, ++ int_entrylk->basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_post_lookup_base_shard_rm_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, -1, 
local->op_errno); ++ return 0; ++ } ++ ++ if (local->prebuf.ia_nlink > 1) { ++ gf_msg_debug(this->name, 0, ++ "link count on %s > 1:%d, " ++ "performing rename()/unlink()", ++ local->int_inodelk.loc.path, local->prebuf.ia_nlink); ++ if (local->fop == GF_FOP_RENAME) ++ shard_rename_src_base_file(frame, this); ++ else if (local->fop == GF_FOP_UNLINK) ++ shard_unlink_base_file(frame, this); ++ } else { ++ gf_msg_debug(this->name, 0, ++ "link count on %s = 1, creating " ++ "file under .remove_me", ++ local->int_inodelk.loc.path); ++ local->cleanup_required = _gf_true; ++ shard_acquire_entrylk(frame, this, priv->dot_shard_rm_inode, ++ local->prebuf.ia_gfid); ++ } ++ return 0; ++} ++ ++int ++shard_post_inodelk_fop_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ switch (local->fop) { ++ case GF_FOP_UNLINK: ++ case GF_FOP_RENAME: ++ shard_lookup_base_file(frame, this, &local->int_inodelk.loc, ++ shard_post_lookup_base_shard_rm_handler); ++ break; ++ default: ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "post-inodelk handler not defined. This case should not" ++ " be hit"); ++ break; ++ } ++ return 0; ++} ++ ++int ++shard_acquire_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ call_frame_t *main_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *main_local = NULL; ++ ++ local = frame->local; ++ main_frame = local->main_frame; ++ main_local = main_frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(main_local->fop, main_frame, op_ret, ++ op_errno); ++ return 0; ++ } ++ main_local->int_inodelk.acquired_lock = _gf_true; ++ shard_post_inodelk_fop_handler(main_frame, this); ++ return 0; ++} ++ ++int ++shard_acquire_inodelk(call_frame_t *frame, xlator_t *this, loc_t *loc) ++{ ++ call_frame_t *lk_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *lk_local = NULL; ++ shard_inodelk_t *int_inodelk = NULL; ++ ++ local = frame->local; ++ lk_frame = create_frame(this, this->ctx->pool); ++ if (!lk_frame) { ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create new frame " ++ "to lock base shard"); ++ goto err; ++ } ++ lk_local = mem_get0(this->local_pool); ++ if (!lk_local) { ++ STACK_DESTROY(lk_frame->root); ++ goto err; ++ } ++ ++ lk_frame->local = lk_local; ++ lk_local->main_frame = frame; ++ int_inodelk = &lk_local->int_inodelk; ++ ++ int_inodelk->flock.l_len = 0; ++ int_inodelk->flock.l_start = 0; ++ int_inodelk->domain = this->name; ++ int_inodelk->flock.l_type = F_WRLCK; ++ loc_copy(&local->int_inodelk.loc, loc); ++ set_lk_owner_from_ptr(&lk_frame->root->lk_owner, lk_frame->root); ++ local->inodelk_frame = lk_frame; ++ ++ STACK_WIND(lk_frame, shard_acquire_inodelk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->inodelk, int_inodelk->domain, ++ &local->int_inodelk.loc, F_SETLKW, &int_inodelk->flock, NULL); ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_post_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) ++{ ++ loc_t *loc = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); ++ return 0; ++ } ++ if (local->fop == GF_FOP_UNLINK) ++ loc = &local->loc; ++ else if (local->fop == GF_FOP_RENAME) ++ loc = &local->loc2; ++ shard_acquire_inodelk(frame, this, loc); ++ return 
0; ++} ++ ++int ++shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t handler, ++ shard_internal_dir_type_t type); ++int ++shard_pre_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); ++ return 0; ++ } ++ shard_mkdir_internal_dir(frame, this, shard_post_mkdir_rm_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); ++ return 0; ++} ++ ++void ++shard_begin_rm_resolution(call_frame_t *frame, xlator_t *this) ++{ ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ local->dot_shard_rm_loc.inode = inode_find(this->itable, ++ priv->dot_shard_rm_gfid); ++ if (!local->dot_shard_rm_loc.inode) { ++ local->dot_shard_loc.inode = inode_find(this->itable, ++ priv->dot_shard_gfid); ++ if (!local->dot_shard_loc.inode) { ++ shard_mkdir_internal_dir(frame, this, shard_pre_mkdir_rm_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ } else { ++ local->post_res_handler = shard_pre_mkdir_rm_handler; ++ shard_refresh_internal_dir(frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ } ++ } else { ++ local->post_res_handler = shard_post_mkdir_rm_handler; ++ shard_refresh_internal_dir(frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); ++ } ++} ++ ++int ++shard_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); ++ if ((ret) && (!IA_ISLNK(loc->inode->ia_type))) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(loc->inode->gfid)); ++ goto err; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); ++ return 0; ++ } ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ++ loc_copy(&local->loc, loc); ++ local->xflag = xflag; ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ local->block_size = block_size; ++ local->resolver_base_inode = loc->inode; ++ local->fop = GF_FOP_UNLINK; ++ if (!this->itable) ++ this->itable = (local->loc.inode)->table; ++ ++ local->resolve_not = _gf_true; ++ shard_begin_rm_resolution(frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_UNLINK, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_post_rename_lookup_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_rename_cbk(frame, this); ++ return 0; ++} ++ ++int ++shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *buf, ++ struct iatt *preoldparent, struct iatt *postoldparent, ++ struct iatt *prenewparent, struct iatt *postnewparent, ++ dict_t *xdata) ++{ ++ int ret = 0; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto err; ++ } ++ /* Set ctx->refresh to TRUE to force a lookup on disk when ++ * shard_lookup_base_file() is called next to refresh the hard link ++ * count in ctx. Note that this is applicable only to the case where ++ * the rename dst is already existent and sharded. 
++ */ ++ if ((local->dst_block_size) && (!local->cleanup_required)) ++ shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); ++ ++ local->prebuf = *buf; ++ local->preoldparent = *preoldparent; ++ local->postoldparent = *postoldparent; ++ local->prenewparent = *prenewparent; ++ local->postnewparent = *postnewparent; ++ if (xdata) ++ local->xattr_rsp = dict_ref(xdata); ++ ++ if (local->dst_block_size) { ++ if (local->entrylk_frame) { ++ ret = shard_unlock_entrylk(frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ } ++ } ++ ++ ret = shard_unlock_inodelk(frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ goto err; ++ } ++ if (local->cleanup_required) ++ shard_start_background_deletion(this); ++ } ++ ++ /* Now the base file of src, if sharded, is looked up to gather ia_size ++ * and ia_blocks.*/ ++ if (local->block_size) { ++ local->tmp_loc.inode = inode_new(this->itable); ++ gf_uuid_copy(local->tmp_loc.gfid, (local->loc.inode)->gfid); ++ shard_lookup_base_file(frame, this, &local->tmp_loc, ++ shard_post_rename_lookup_handler); ++ } else { ++ shard_rename_cbk(frame, this); ++ } ++ return 0; ++err: ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++} ++ ++int ++shard_post_lookup_dst_base_file_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } ++ ++ /* Save dst base file attributes into postbuf so the information is not ++ * lost when it is overwritten after lookup on base file of src in ++ * shard_lookup_base_file_cbk(). ++ */ ++ local->postbuf = local->prebuf; ++ shard_rename_src_base_file(frame, this); ++ return 0; ++} ++ ++int ++shard_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ uint64_t dst_block_size = 0; ++ shard_local_t *local = NULL; ++ ++ if (IA_ISDIR(oldloc->inode->ia_type)) { ++ STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); ++ return 0; ++ } ++ ++ ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); ++ if ((ret) && (!IA_ISLNK(oldloc->inode->ia_type))) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa(oldloc->inode->gfid)); ++ goto err; ++ } ++ ++ if (newloc->inode) ++ ret = shard_inode_ctx_get_block_size(newloc->inode, this, ++ &dst_block_size); ++ ++ /* The following stack_wind covers the case where: ++ * a. the src file is not sharded and dst doesn't exist, OR ++ * b. the src and dst both exist but are not sharded. ++ */ ++ if (((!block_size) && (!dst_block_size)) || ++ frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); ++ return 0; ++ } ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ loc_copy(&local->loc, oldloc); ++ loc_copy(&local->loc2, newloc); ++ local->resolver_base_inode = newloc->inode; ++ local->fop = GF_FOP_RENAME; ++ local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ ++ local->block_size = block_size; ++ local->dst_block_size = dst_block_size; ++ if (!this->itable) ++ this->itable = (local->loc.inode)->table; ++ local->resolve_not = _gf_true; ++ ++ /* The following if-block covers the case where the dst file exists ++ * and is sharded. ++ */ ++ if (local->dst_block_size) { ++ shard_begin_rm_resolution(frame, this); ++ } else { ++ /* The following block covers the case where the dst either doesn't ++ * exist or is NOT sharded but the src is sharded. In this case, shard ++ * xlator would go ahead and rename src to dst. Once done, it would also ++ * lookup the base shard of src to get the ia_size and ia_blocks xattr ++ * values. ++ */ ++ shard_rename_src_base_file(frame, this); ++ } ++ return 0; ++ ++err: ++ shard_common_failure_unwind(GF_FOP_RENAME, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, ++ struct iatt *stbuf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ int ret = -1; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret == -1) ++ goto unwind; ++ ++ ret = shard_inode_ctx_set(inode, this, stbuf, local->block_size, ++ SHARD_ALL_MASK); ++ if (ret) ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, ++ "Failed to set inode " ++ "ctx for %s", ++ uuid_utoa(inode->gfid)); ++ ++unwind: ++ SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, ++ preparent, postparent, xdata); ++ return 0; ++} ++ ++int ++shard_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ++ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) ++{ ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ priv = this->private; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ local->block_size = priv->block_size; ++ ++ if (!__is_gsyncd_on_shard_dir(frame, loc)) { ++ SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); ++ } ++ ++ STACK_WIND(frame, shard_create_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, ++ xdata); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_CREATE, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) ++{ ++ /* To-Do: Handle open with O_TRUNC under locks */ ++ SHARD_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); ++ return 0; ++} ++ ++int ++shard_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ++ fd_t *fd, dict_t *xdata) ++{ ++ STACK_WIND(frame, shard_open_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); ++ return 0; ++} ++ ++int ++shard_readv_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iovec *vector, ++ int32_t count, struct iatt *stbuf, struct iobref *iobref, ++ dict_t *xdata) ++{ ++ int i = 0; ++ int call_count = 0; ++ void *address = NULL; ++ uint64_t block_num = 0; ++ off_t off = 0; ++ struct iovec vec = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ fd_t *anon_fd = cookie; ++ shard_inode_ctx_t *ctx = NULL; ++ ++ local = frame->local; ++ ++ /* If shard has already seen a failure here before, there is no point ++ * in aggregating subsequent reads, so just go to out. 
++ */ ++ if (local->op_ret < 0) ++ goto out; ++ ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto out; ++ } ++ ++ if (local->op_ret >= 0) ++ local->op_ret += op_ret; ++ ++ shard_inode_ctx_get(anon_fd->inode, this, &ctx); ++ block_num = ctx->block_num; ++ ++ if (block_num == local->first_block) { ++ address = local->iobuf->ptr; ++ } else { ++ /* else ++ * address to start writing to = beginning of buffer + ++ * number of bytes until end of first block + ++ * + block_size times number of blocks ++ * between the current block and the first ++ */ ++ address = (char *)local->iobuf->ptr + ++ (local->block_size - (local->offset % local->block_size)) + ++ ((block_num - local->first_block - 1) * local->block_size); ++ } ++ ++ for (i = 0; i < count; i++) { ++ address = (char *)address + off; ++ memcpy(address, vector[i].iov_base, vector[i].iov_len); ++ off += vector[i].iov_len; ++ } ++ ++out: ++ if (anon_fd) ++ fd_unref(anon_fd); ++ call_count = shard_call_count_return(frame); ++ if (call_count == 0) { ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); ++ } else { ++ if (xdata) ++ local->xattr_rsp = dict_ref(xdata); ++ vec.iov_base = local->iobuf->ptr; ++ if (local->offset + local->req_size > local->prebuf.ia_size) ++ local->total_size = local->prebuf.ia_size - local->offset; ++ vec.iov_len = local->total_size; ++ local->op_ret = local->total_size; ++ SHARD_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, ++ &vec, 1, &local->prebuf, local->iobref, ++ local->xattr_rsp); ++ return 0; ++ } ++ } ++ ++ return 0; ++} ++ ++int ++shard_readv_do(call_frame_t *frame, xlator_t *this) ++{ ++ int i = 0; ++ int call_count = 0; ++ int last_block = 0; ++ int cur_block = 0; ++ off_t orig_offset = 0; ++ off_t shard_offset = 0; ++ size_t read_size = 0; ++ size_t remaining_size = 0; ++ fd_t *fd = NULL; ++ fd_t *anon_fd = NULL; ++ shard_local_t *local = NULL; ++ gf_boolean_t wind_failed = _gf_false; ++ ++ local = frame->local; ++ fd = local->fd; ++ ++ orig_offset = local->offset; ++ cur_block = local->first_block; ++ last_block = local->last_block; ++ remaining_size = local->total_size; ++ local->call_count = call_count = local->num_blocks; ++ ++ SHARD_SET_ROOT_FS_ID(frame, local); ++ ++ if (fd->flags & O_DIRECT) ++ local->flags = O_DIRECT; ++ ++ while (cur_block <= last_block) { ++ if (wind_failed) { ++ shard_readv_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, NULL, ++ 0, NULL, NULL, NULL); ++ goto next; ++ } + +- local = frame->local; ++ shard_offset = orig_offset % local->block_size; ++ read_size = local->block_size - shard_offset; ++ if (read_size > remaining_size) ++ read_size = remaining_size; ++ ++ remaining_size -= read_size; ++ ++ if (cur_block == 0) { ++ anon_fd = fd_ref(fd); ++ } else { ++ anon_fd = fd_anonymous(local->inode_list[i]); ++ if (!anon_fd) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ shard_readv_do_cbk(frame, (void *)(long)anon_fd, this, -1, ++ ENOMEM, NULL, 0, NULL, NULL, NULL); ++ goto next; ++ } ++ } + +- xattr_req = shard_create_gfid_dict(local->xattr_req); +- if (!xattr_req) +- goto err; ++ STACK_WIND_COOKIE(frame, shard_readv_do_cbk, anon_fd, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->readv, anon_fd, read_size, ++ shard_offset, local->flags, local->xattr_req); + +- STACK_WIND(frame, shard_lookup_marker_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, &local->newloc, xattr_req); +- 
dict_unref(xattr_req); +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, -1, op_errno); +- return 0; ++ orig_offset += read_size; ++ next: ++ cur_block++; ++ i++; ++ call_count--; ++ } ++ return 0; + } + +-int shard_create_marker_file_under_remove_me_cbk( +- call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- inode_t *linked_inode = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- +- local = frame->local; +- priv = this->private; +- +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- if (op_ret < 0) { +- if ((op_errno != EEXIST) && (op_errno != ENODATA)) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, +- "Marker file creation " +- "failed while performing %s; entry gfid=%s", +- gf_fop_string(local->fop), local->newloc.name); +- goto err; +- } else { +- shard_lookup_marker_file(frame, this); +- return 0; ++int ++shard_common_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ int shard_block_num = (long)cookie; ++ int call_count = 0; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret < 0) { ++ if (op_errno == EEXIST) { ++ LOCK(&frame->lock); ++ { ++ local->eexist_count++; ++ } ++ UNLOCK(&frame->lock); ++ } else { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ } ++ gf_msg_debug(this->name, 0, ++ "mknod of shard %d " ++ "failed: %s", ++ shard_block_num, strerror(op_errno)); ++ goto done; + } +- } + +- linked_inode = +- inode_link(inode, priv->dot_shard_rm_inode, local->newloc.name, buf); +- inode_unref(local->newloc.inode); +- local->newloc.inode = linked_inode; ++ shard_link_block_inode(local, shard_block_num, inode, buf); + +- if (local->fop == GF_FOP_UNLINK) +- shard_unlink_base_file(frame, this); +- else if (local->fop == GF_FOP_RENAME) +- shard_rename_src_base_file(frame, this); +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); +- return 0; +-} +- +-int shard_create_marker_file_under_remove_me(call_frame_t *frame, +- xlator_t *this, loc_t *loc) { +- int ret = 0; +- int op_errno = ENOMEM; +- uint64_t bs = 0; +- char g1[64] = { +- 0, +- }; +- char g2[64] = { +- 0, +- }; +- dict_t *xattr_req = NULL; +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; +- +- priv = this->private; +- local = frame->local; +- +- SHARD_SET_ROOT_FS_ID(frame, local); +- +- xattr_req = shard_create_gfid_dict(local->xattr_req); +- if (!xattr_req) +- goto err; +- +- local->newloc.inode = inode_new(this->itable); +- local->newloc.parent = inode_ref(priv->dot_shard_rm_inode); +- ret = inode_path(local->newloc.parent, uuid_utoa(loc->inode->gfid), +- (char **)&local->newloc.path); +- if (ret < 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on " +- "pargfid=%s bname=%s", +- uuid_utoa_r(priv->dot_shard_rm_gfid, g1), +- uuid_utoa_r(loc->inode->gfid, g2)); +- goto err; +- } +- local->newloc.name = strrchr(local->newloc.path, '/'); +- if (local->newloc.name) +- local->newloc.name++; +- +- if (local->fop == GF_FOP_UNLINK) +- bs = local->block_size; +- else if (local->fop == GF_FOP_RENAME) +- bs = local->dst_block_size; +- +- SHARD_INODE_CREATE_INIT(this, bs, xattr_req, &local->newloc, 
+- local->prebuf.ia_size, 0, err); +- +- STACK_WIND(frame, shard_create_marker_file_under_remove_me_cbk, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, &local->newloc, +- 0, 0, 0644, xattr_req); +- dict_unref(xattr_req); +- return 0; ++done: ++ call_count = shard_call_count_return(frame); ++ if (call_count == 0) { ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ local->create_count = 0; ++ local->post_mknod_handler(frame, this); ++ } + +-err: +- if (xattr_req) +- dict_unref(xattr_req); +- shard_create_marker_file_under_remove_me_cbk(frame, 0, this, -1, op_errno, +- NULL, NULL, NULL, NULL, NULL); +- return 0; ++ return 0; + } + +-int shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); +- +-int shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, int32_t op_errno, +- struct iatt *preparent, struct iatt *postparent, +- dict_t *xdata) { +- int ret = 0; +- shard_local_t *local = NULL; ++int ++shard_common_resume_mknod(call_frame_t *frame, xlator_t *this, ++ shard_post_mknod_fop_handler_t post_mknod_handler) ++{ ++ int i = 0; ++ int shard_idx_iter = 0; ++ int last_block = 0; ++ int ret = 0; ++ int call_count = 0; ++ char path[PATH_MAX] = { ++ 0, ++ }; ++ mode_t mode = 0; ++ char *bname = NULL; ++ shard_priv_t *priv = NULL; ++ shard_inode_ctx_t ctx_tmp = { ++ 0, ++ }; ++ shard_local_t *local = NULL; ++ gf_boolean_t wind_failed = _gf_false; ++ fd_t *fd = NULL; ++ loc_t loc = { ++ 0, ++ }; ++ dict_t *xattr_req = NULL; + +- local = frame->local; ++ local = frame->local; ++ priv = this->private; ++ fd = local->fd; ++ shard_idx_iter = local->first_block; ++ last_block = local->last_block; ++ call_count = local->call_count = local->create_count; ++ local->post_mknod_handler = post_mknod_handler; + +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- } else { +- shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); +- local->preoldparent = *preparent; +- local->postoldparent = *postparent; +- if (xdata) +- local->xattr_rsp = dict_ref(xdata); +- if (local->cleanup_required) +- shard_start_background_deletion(this); +- } ++ SHARD_SET_ROOT_FS_ID(frame, local); + +- if (local->entrylk_frame) { +- ret = shard_unlock_entrylk(frame, this); +- if (ret < 0) { +- local->op_ret = -1; +- local->op_errno = -ret; ++ ret = shard_inode_ctx_get_all(fd->inode, this, &ctx_tmp); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get inode " ++ "ctx for %s", ++ uuid_utoa(fd->inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto err; + } +- } ++ mode = st_mode_from_ia(ctx_tmp.stat.ia_prot, ctx_tmp.stat.ia_type); + +- ret = shard_unlock_inodelk(frame, this); +- if (ret < 0) { +- local->op_ret = -1; +- local->op_errno = -ret; +- } +- +- shard_unlink_cbk(frame, this); +- return 0; +-} +- +-int shard_unlink_base_file(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = frame->local; +- +- /* To-Do: Request open-fd count on base file */ +- STACK_WIND(frame, shard_unlink_base_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, +- local->xattr_req); +- return 0; +-} +- +-int shard_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *xdata) { +- if (op_ret) +- gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, +- "Unlock failed. 
Please check brick logs for " +- "more details"); +- SHARD_STACK_DESTROY(frame); +- return 0; +-} +- +-int shard_unlock_entrylk(call_frame_t *frame, xlator_t *this) { +- loc_t *loc = NULL; +- call_frame_t *lk_frame = NULL; +- shard_local_t *local = NULL; +- shard_local_t *lk_local = NULL; +- shard_entrylk_t *lock = NULL; +- +- local = frame->local; +- lk_frame = local->entrylk_frame; +- lk_local = lk_frame->local; +- local->entrylk_frame = NULL; +- lock = &lk_local->int_entrylk; +- loc = &lock->loc; +- +- STACK_WIND(lk_frame, shard_unlock_entrylk_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->entrylk, this->name, loc, +- lk_local->int_entrylk.basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, +- NULL); +- local->int_entrylk.acquired_lock = _gf_false; +- return 0; +-} +- +-int shard_post_entrylk_fop_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- switch (local->fop) { +- case GF_FOP_UNLINK: +- case GF_FOP_RENAME: +- shard_create_marker_file_under_remove_me(frame, this, +- &local->int_inodelk.loc); +- break; +- default: +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "post-entrylk handler not defined. This case should not" +- " be hit"); +- break; +- } +- return 0; +-} +- +-int shard_acquire_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *xdata) { +- call_frame_t *main_frame = NULL; +- shard_local_t *local = NULL; +- shard_local_t *main_local = NULL; +- +- local = frame->local; +- main_frame = local->main_frame; +- main_local = main_frame->local; +- +- if (local->op_ret < 0) { +- shard_common_failure_unwind(main_local->fop, main_frame, op_ret, op_errno); +- return 0; +- } +- main_local->int_entrylk.acquired_lock = _gf_true; +- shard_post_entrylk_fop_handler(main_frame, this); +- return 0; +-} +- +-int shard_acquire_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, +- uuid_t gfid) { +- char gfid_str[GF_UUID_BUF_SIZE] = { +- 0, +- }; +- shard_local_t *local = NULL; +- shard_local_t *entrylk_local = NULL; +- shard_entrylk_t *int_entrylk = NULL; +- call_frame_t *entrylk_frame = NULL; +- +- local = frame->local; +- entrylk_frame = create_frame(this, this->ctx->pool); +- if (!entrylk_frame) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create new frame " +- "to lock marker file"); +- goto err; +- } +- +- entrylk_local = mem_get0(this->local_pool); +- if (!entrylk_local) { +- STACK_DESTROY(entrylk_frame->root); +- goto err; +- } +- +- entrylk_frame->local = entrylk_local; +- entrylk_local->main_frame = frame; +- int_entrylk = &entrylk_local->int_entrylk; +- +- int_entrylk->loc.inode = inode_ref(inode); +- set_lk_owner_from_ptr(&entrylk_frame->root->lk_owner, entrylk_frame->root); +- local->entrylk_frame = entrylk_frame; +- gf_uuid_unparse(gfid, gfid_str); +- int_entrylk->basename = gf_strdup(gfid_str); +- +- STACK_WIND(entrylk_frame, shard_acquire_entrylk_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->entrylk, this->name, &int_entrylk->loc, +- int_entrylk->basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; +-} ++ while (shard_idx_iter <= last_block) { ++ if (local->inode_list[i]) { ++ shard_idx_iter++; ++ i++; ++ continue; ++ } + +-int shard_post_lookup_base_shard_rm_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; ++ if (wind_failed) { ++ 
shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, ++ -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); ++ goto next; ++ } + +- priv = this->private; +- local = frame->local; ++ shard_make_block_abspath(shard_idx_iter, fd->inode->gfid, path, ++ sizeof(path)); ++ ++ xattr_req = shard_create_gfid_dict(local->xattr_req); ++ if (!xattr_req) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, ++ -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); ++ goto next; ++ } ++ ++ bname = strrchr(path, '/') + 1; ++ loc.inode = inode_new(this->itable); ++ loc.parent = inode_ref(priv->dot_shard_inode); ++ ret = inode_path(loc.parent, bname, (char **)&(loc.path)); ++ if (ret < 0 || !(loc.inode)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed" ++ "on %s, base file gfid = %s", ++ bname, uuid_utoa(fd->inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ loc_wipe(&loc); ++ dict_unref(xattr_req); ++ shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, ++ -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); ++ goto next; ++ } ++ ++ loc.name = strrchr(loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ++ STACK_WIND_COOKIE(frame, shard_common_mknod_cbk, ++ (void *)(long)shard_idx_iter, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->mknod, &loc, mode, ++ ctx_tmp.stat.ia_rdev, 0, xattr_req); ++ loc_wipe(&loc); ++ dict_unref(xattr_req); ++ ++ next: ++ shard_idx_iter++; ++ i++; ++ if (!--call_count) ++ break; ++ } + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; +- } +- +- if (local->prebuf.ia_nlink > 1) { +- gf_msg_debug(this->name, 0, "link count on %s > 1:%d, " +- "performing rename()/unlink()", +- local->int_inodelk.loc.path, local->prebuf.ia_nlink); +- if (local->fop == GF_FOP_RENAME) +- shard_rename_src_base_file(frame, this); +- else if (local->fop == GF_FOP_UNLINK) +- shard_unlink_base_file(frame, this); +- } else { +- gf_msg_debug(this->name, 0, "link count on %s = 1, creating " +- "file under .remove_me", +- local->int_inodelk.loc.path); +- local->cleanup_required = _gf_true; +- shard_acquire_entrylk(frame, this, priv->dot_shard_rm_inode, +- local->prebuf.ia_gfid); +- } +- return 0; +-} +- +-int shard_post_inodelk_fop_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- switch (local->fop) { +- case GF_FOP_UNLINK: +- case GF_FOP_RENAME: +- shard_lookup_base_file(frame, this, &local->int_inodelk.loc, +- shard_post_lookup_base_shard_rm_handler); +- break; +- default: +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "post-inodelk handler not defined. 
This case should not" +- " be hit"); +- break; +- } +- return 0; +-} +- +-int shard_acquire_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *xdata) { +- call_frame_t *main_frame = NULL; +- shard_local_t *local = NULL; +- shard_local_t *main_local = NULL; +- +- local = frame->local; +- main_frame = local->main_frame; +- main_local = main_frame->local; +- +- if (local->op_ret < 0) { +- shard_common_failure_unwind(main_local->fop, main_frame, op_ret, op_errno); +- return 0; +- } +- main_local->int_inodelk.acquired_lock = _gf_true; +- shard_post_inodelk_fop_handler(main_frame, this); +- return 0; +-} +- +-int shard_acquire_inodelk(call_frame_t *frame, xlator_t *this, loc_t *loc) { +- call_frame_t *lk_frame = NULL; +- shard_local_t *local = NULL; +- shard_local_t *lk_local = NULL; +- shard_inodelk_t *int_inodelk = NULL; +- +- local = frame->local; +- lk_frame = create_frame(this, this->ctx->pool); +- if (!lk_frame) { +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create new frame " +- "to lock base shard"); +- goto err; +- } +- lk_local = mem_get0(this->local_pool); +- if (!lk_local) { +- STACK_DESTROY(lk_frame->root); +- goto err; +- } +- +- lk_frame->local = lk_local; +- lk_local->main_frame = frame; +- int_inodelk = &lk_local->int_inodelk; +- +- int_inodelk->flock.l_len = 0; +- int_inodelk->flock.l_start = 0; +- int_inodelk->domain = this->name; +- int_inodelk->flock.l_type = F_WRLCK; +- loc_copy(&local->int_inodelk.loc, loc); +- set_lk_owner_from_ptr(&lk_frame->root->lk_owner, lk_frame->root); +- local->inodelk_frame = lk_frame; +- +- STACK_WIND(lk_frame, shard_acquire_inodelk_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->inodelk, int_inodelk->domain, +- &local->int_inodelk.loc, F_SETLKW, &int_inodelk->flock, NULL); +- return 0; + err: +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; ++ /* ++ * This block is for handling failure in shard_inode_ctx_get_all(). ++ * Failures in the while-loop are handled within the loop. 
++ */ ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ post_mknod_handler(frame, this); ++ return 0; + } + +-int shard_post_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) { +- loc_t *loc = NULL; +- shard_local_t *local = NULL; ++int ++shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this); + +- local = frame->local; ++int ++shard_post_lookup_shards_readv_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); +- return 0; +- } +- if (local->fop == GF_FOP_UNLINK) +- loc = &local->loc; +- else if (local->fop == GF_FOP_RENAME) +- loc = &local->loc2; +- shard_acquire_inodelk(frame, this, loc); +- return 0; +-} ++ local = frame->local; + +-int shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t handler, +- shard_internal_dir_type_t type); +-int shard_pre_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +- local = frame->local; ++ if (local->create_count) { ++ shard_common_resume_mknod(frame, this, shard_post_mknod_readv_handler); ++ } else { ++ shard_readv_do(frame, this); ++ } + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; +- } +- shard_mkdir_internal_dir(frame, this, shard_post_mkdir_rm_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); +- return 0; + } + +-void shard_begin_rm_resolution(call_frame_t *frame, xlator_t *this) { +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; ++int ++shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- priv = this->private; +- local = frame->local; ++ local = frame->local; + +- local->dot_shard_rm_loc.inode = +- inode_find(this->itable, priv->dot_shard_rm_gfid); +- if (!local->dot_shard_rm_loc.inode) { +- local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); +- if (!local->dot_shard_loc.inode) { +- shard_mkdir_internal_dir(frame, this, shard_pre_mkdir_rm_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- } else { +- local->post_res_handler = shard_pre_mkdir_rm_handler; +- shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); +- } +- } else { +- local->post_res_handler = shard_post_mkdir_rm_handler; +- shard_refresh_internal_dir(frame, this, +- SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); +- } +-} +- +-int shard_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, +- dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; +- +- ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); +- if ((ret) && (!IA_ISLNK(loc->inode->ia_type))) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(loc->inode->gfid)); +- goto err; +- } +- +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); +- return 0; +- } +- +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- +- loc_copy(&local->loc, loc); +- local->xflag = xflag; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- local->block_size = block_size; +- local->resolver_base_inode = loc->inode; +- local->fop = GF_FOP_UNLINK; +- if (!this->itable) +- this->itable = (local->loc.inode)->table; +- +- local->resolve_not = _gf_true; +- shard_begin_rm_resolution(frame, this); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_UNLINK, frame, -1, ENOMEM); +- return 0; +-} ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +-int shard_post_rename_lookup_handler(call_frame_t *frame, xlator_t *this) { +- shard_rename_cbk(frame, this); +- return 0; ++ if (!local->eexist_count) { ++ shard_readv_do(frame, this); ++ } else { ++ local->call_count = local->eexist_count; ++ shard_common_lookup_shards(frame, this, local->loc.inode, ++ shard_post_lookup_shards_readv_handler); ++ } ++ return 0; + } + +-int shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *buf, +- struct iatt *preoldparent, struct iatt *postoldparent, +- struct iatt *prenewparent, struct iatt *postnewparent, +- dict_t *xdata) { +- int ret = 0; +- shard_local_t *local = NULL; +- +- local = frame->local; ++int ++shard_post_resolve_readv_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto err; +- } +- /* Set ctx->refresh to TRUE to force a lookup on disk when +- * shard_lookup_base_file() is called next to refresh the hard link +- * count in ctx. Note that this is applicable only to the case where +- * the rename dst is already existent and sharded. +- */ +- if ((local->dst_block_size) && (!local->cleanup_required)) +- shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); +- +- local->prebuf = *buf; +- local->preoldparent = *preoldparent; +- local->postoldparent = *postoldparent; +- local->prenewparent = *prenewparent; +- local->postnewparent = *postnewparent; +- if (xdata) +- local->xattr_rsp = dict_ref(xdata); ++ local = frame->local; + +- if (local->dst_block_size) { +- if (local->entrylk_frame) { +- ret = shard_unlock_entrylk(frame, this); +- if (ret < 0) { +- local->op_ret = -1; +- local->op_errno = -ret; +- } ++ if (local->op_ret < 0) { ++ if (local->op_errno != ENOENT) { ++ shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } else { ++ struct iovec vec = { ++ 0, ++ }; ++ ++ vec.iov_base = local->iobuf->ptr; ++ vec.iov_len = local->total_size; ++ local->op_ret = local->total_size; ++ SHARD_STACK_UNWIND(readv, frame, local->op_ret, 0, &vec, 1, ++ &local->prebuf, local->iobref, NULL); ++ return 0; ++ } + } + +- ret = shard_unlock_inodelk(frame, this); +- if (ret < 0) { +- local->op_ret = -1; +- local->op_errno = -ret; +- goto err; +- } +- if (local->cleanup_required) +- shard_start_background_deletion(this); +- } +- +- /* Now the base file of src, if sharded, is looked up to gather ia_size +- * and ia_blocks.*/ +- if (local->block_size) { +- local->tmp_loc.inode = inode_new(this->itable); +- gf_uuid_copy(local->tmp_loc.gfid, (local->loc.inode)->gfid); +- shard_lookup_base_file(frame, this, &local->tmp_loc, +- shard_post_rename_lookup_handler); +- } else { +- shard_rename_cbk(frame, this); +- } +- return 0; +-err: +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +-} +- +-int shard_post_lookup_dst_base_file_handler(call_frame_t *frame, +- 
xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; ++ if (local->call_count) { ++ shard_common_lookup_shards(frame, this, local->resolver_base_inode, ++ shard_post_lookup_shards_readv_handler); ++ } else { ++ shard_readv_do(frame, this); ++ } + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); + return 0; +- } +- +- /* Save dst base file attributes into postbuf so the information is not +- * lost when it is overwritten after lookup on base file of src in +- * shard_lookup_base_file_cbk(). +- */ +- local->postbuf = local->prebuf; +- shard_rename_src_base_file(frame, this); +- return 0; +-} +- +-int shard_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, +- loc_t *newloc, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- uint64_t dst_block_size = 0; +- shard_local_t *local = NULL; +- +- if (IA_ISDIR(oldloc->inode->ia_type)) { +- STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); +- return 0; +- } +- +- ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); +- if ((ret) && (!IA_ISLNK(oldloc->inode->ia_type))) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa(oldloc->inode->gfid)); +- goto err; +- } +- +- if (newloc->inode) +- ret = shard_inode_ctx_get_block_size(newloc->inode, this, &dst_block_size); +- +- /* The following stack_wind covers the case where: +- * a. the src file is not sharded and dst doesn't exist, OR +- * b. the src and dst both exist but are not sharded. +- */ +- if (((!block_size) && (!dst_block_size)) || +- frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); +- return 0; +- } +- +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- loc_copy(&local->loc, oldloc); +- loc_copy(&local->loc2, newloc); +- local->resolver_base_inode = newloc->inode; +- local->fop = GF_FOP_RENAME; +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- +- local->block_size = block_size; +- local->dst_block_size = dst_block_size; +- if (!this->itable) +- this->itable = (local->loc.inode)->table; +- local->resolve_not = _gf_true; +- +- /* The following if-block covers the case where the dst file exists +- * and is sharded. +- */ +- if (local->dst_block_size) { +- shard_begin_rm_resolution(frame, this); +- } else { +- /* The following block covers the case where the dst either doesn't +- * exist or is NOT sharded but the src is sharded. In this case, shard +- * xlator would go ahead and rename src to dst. Once done, it would also +- * lookup the base shard of src to get the ia_size and ia_blocks xattr +- * values. 
+- */ +- shard_rename_src_base_file(frame, this); +- } +- return 0; +- +-err: +- shard_common_failure_unwind(GF_FOP_RENAME, frame, -1, ENOMEM); +- return 0; + } + +-int shard_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, +- struct iatt *stbuf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- int ret = -1; +- shard_local_t *local = NULL; ++int ++shard_post_lookup_readv_handler(call_frame_t *frame, xlator_t *this) ++{ ++ int ret = 0; ++ struct iobuf *iobuf = NULL; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; + +- local = frame->local; ++ priv = this->private; ++ local = frame->local; + +- if (op_ret == -1) +- goto unwind; ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +- ret = shard_inode_ctx_set(inode, this, stbuf, local->block_size, +- SHARD_ALL_MASK); +- if (ret) +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, +- "Failed to set inode " +- "ctx for %s", +- uuid_utoa(inode->gfid)); ++ if (local->offset >= local->prebuf.ia_size) { ++ /* If the read is being performed past the end of the file, ++ * unwind the FOP with 0 bytes read as status. ++ */ ++ struct iovec vec = { ++ 0, ++ }; + +-unwind: +- SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, +- preparent, postparent, xdata); +- return 0; +-} ++ iobuf = iobuf_get2(this->ctx->iobuf_pool, local->req_size); ++ if (!iobuf) ++ goto err; + +-int shard_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, +- mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { +- shard_priv_t *priv = NULL; +- shard_local_t *local = NULL; ++ vec.iov_base = iobuf->ptr; ++ vec.iov_len = 0; ++ local->iobref = iobref_new(); ++ iobref_add(local->iobref, iobuf); ++ iobuf_unref(iobuf); + +- priv = this->private; +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ SHARD_STACK_UNWIND(readv, frame, 0, 0, &vec, 1, &local->prebuf, ++ local->iobref, NULL); ++ return 0; ++ } + +- frame->local = local; +- local->block_size = priv->block_size; ++ local->first_block = get_lowest_block(local->offset, local->block_size); + +- if (!__is_gsyncd_on_shard_dir(frame, loc)) { +- SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); +- } ++ local->total_size = local->req_size; + +- STACK_WIND(frame, shard_create_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, +- xdata); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_CREATE, frame, -1, ENOMEM); +- return 0; +-} +- +-int shard_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { +- /* To-Do: Handle open with O_TRUNC under locks */ +- SHARD_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); +- return 0; +-} +- +-int shard_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, +- fd_t *fd, dict_t *xdata) { +- STACK_WIND(frame, shard_open_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); +- return 0; +-} +- +-int shard_readv_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iovec *vector, +- int32_t count, struct iatt *stbuf, struct iobref *iobref, +- dict_t *xdata) { +- int i = 0; +- int call_count = 0; +- void *address = NULL; +- uint64_t block_num = 0; +- off_t off = 0; +- struct iovec vec = { +- 0, +- }; +- 
shard_local_t *local = NULL; +- fd_t *anon_fd = cookie; +- shard_inode_ctx_t *ctx = NULL; +- +- local = frame->local; +- +- /* If shard has already seen a failure here before, there is no point +- * in aggregating subsequent reads, so just go to out. +- */ +- if (local->op_ret < 0) +- goto out; +- +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto out; +- } ++ local->last_block = get_highest_block(local->offset, local->total_size, ++ local->block_size); + +- if (local->op_ret >= 0) +- local->op_ret += op_ret; ++ local->num_blocks = local->last_block - local->first_block + 1; ++ GF_ASSERT(local->num_blocks > 0); ++ local->resolver_base_inode = local->loc.inode; + +- shard_inode_ctx_get(anon_fd->inode, this, &ctx); +- block_num = ctx->block_num; +- +- if (block_num == local->first_block) { +- address = local->iobuf->ptr; +- } else { +- /* else +- * address to start writing to = beginning of buffer + +- * number of bytes until end of first block + +- * + block_size times number of blocks +- * between the current block and the first +- */ +- address = (char *)local->iobuf->ptr + +- (local->block_size - (local->offset % local->block_size)) + +- ((block_num - local->first_block - 1) * local->block_size); +- } ++ local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), ++ gf_shard_mt_inode_list); ++ if (!local->inode_list) ++ goto err; + +- for (i = 0; i < count; i++) { +- address = (char *)address + off; +- memcpy(address, vector[i].iov_base, vector[i].iov_len); +- off += vector[i].iov_len; +- } ++ iobuf = iobuf_get2(this->ctx->iobuf_pool, local->total_size); ++ if (!iobuf) ++ goto err; + +-out: +- if (anon_fd) +- fd_unref(anon_fd); +- call_count = shard_call_count_return(frame); +- if (call_count == 0) { +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, +- local->op_errno); +- } else { +- if (xdata) +- local->xattr_rsp = dict_ref(xdata); +- vec.iov_base = local->iobuf->ptr; +- if (local->offset + local->req_size > local->prebuf.ia_size) +- local->total_size = local->prebuf.ia_size - local->offset; +- vec.iov_len = local->total_size; +- local->op_ret = local->total_size; +- SHARD_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, &vec, 1, +- &local->prebuf, local->iobref, local->xattr_rsp); +- return 0; +- } +- } +- +- return 0; +-} +- +-int shard_readv_do(call_frame_t *frame, xlator_t *this) { +- int i = 0; +- int call_count = 0; +- int last_block = 0; +- int cur_block = 0; +- off_t orig_offset = 0; +- off_t shard_offset = 0; +- size_t read_size = 0; +- size_t remaining_size = 0; +- fd_t *fd = NULL; +- fd_t *anon_fd = NULL; +- shard_local_t *local = NULL; +- gf_boolean_t wind_failed = _gf_false; +- +- local = frame->local; +- fd = local->fd; +- +- orig_offset = local->offset; +- cur_block = local->first_block; +- last_block = local->last_block; +- remaining_size = local->total_size; +- local->call_count = call_count = local->num_blocks; +- +- SHARD_SET_ROOT_FS_ID(frame, local); +- +- if (fd->flags & O_DIRECT) +- local->flags = O_DIRECT; +- +- while (cur_block <= last_block) { +- if (wind_failed) { +- shard_readv_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, NULL, 0, +- NULL, NULL, NULL); +- goto next; +- } +- +- shard_offset = orig_offset % local->block_size; +- read_size = local->block_size - shard_offset; +- if (read_size > remaining_size) +- read_size = remaining_size; +- +- remaining_size -= read_size; +- +- if (cur_block == 0) { +- anon_fd = 
fd_ref(fd); +- } else { +- anon_fd = fd_anonymous(local->inode_list[i]); +- if (!anon_fd) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- shard_readv_do_cbk(frame, (void *)(long)anon_fd, this, -1, ENOMEM, NULL, +- 0, NULL, NULL, NULL); +- goto next; +- } ++ local->iobref = iobref_new(); ++ if (!local->iobref) { ++ iobuf_unref(iobuf); ++ goto err; + } + +- STACK_WIND_COOKIE(frame, shard_readv_do_cbk, anon_fd, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readv, anon_fd, read_size, +- shard_offset, local->flags, local->xattr_req); ++ if (iobref_add(local->iobref, iobuf) != 0) { ++ iobuf_unref(iobuf); ++ goto err; ++ } + +- orig_offset += read_size; +- next: +- cur_block++; +- i++; +- call_count--; +- } +- return 0; +-} ++ memset(iobuf->ptr, 0, local->total_size); ++ iobuf_unref(iobuf); ++ local->iobuf = iobuf; + +-int shard_common_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- int shard_block_num = (long)cookie; +- int call_count = 0; +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if (op_ret < 0) { +- if (op_errno == EEXIST) { +- LOCK(&frame->lock); +- { local->eexist_count++; } +- UNLOCK(&frame->lock); ++ local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); ++ if (!local->dot_shard_loc.inode) { ++ ret = shard_init_internal_dir_loc(this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ if (ret) ++ goto err; ++ shard_lookup_internal_dir(frame, this, shard_post_resolve_readv_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } else { +- local->op_ret = op_ret; +- local->op_errno = op_errno; ++ local->post_res_handler = shard_post_resolve_readv_handler; ++ shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); + } +- gf_msg_debug(this->name, 0, "mknod of shard %d " +- "failed: %s", +- shard_block_num, strerror(op_errno)); +- goto done; +- } +- +- shard_link_block_inode(local, shard_block_num, inode, buf); ++ return 0; ++err: ++ shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); ++ return 0; ++} + +-done: +- call_count = shard_call_count_return(frame); +- if (call_count == 0) { +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- local->create_count = 0; +- local->post_mknod_handler(frame, this); +- } +- +- return 0; +-} +- +-int shard_common_resume_mknod( +- call_frame_t *frame, xlator_t *this, +- shard_post_mknod_fop_handler_t post_mknod_handler) { +- int i = 0; +- int shard_idx_iter = 0; +- int last_block = 0; +- int ret = 0; +- int call_count = 0; +- char path[PATH_MAX] = { +- 0, +- }; +- mode_t mode = 0; +- char *bname = NULL; +- shard_priv_t *priv = NULL; +- shard_inode_ctx_t ctx_tmp = { +- 0, +- }; +- shard_local_t *local = NULL; +- gf_boolean_t wind_failed = _gf_false; +- fd_t *fd = NULL; +- loc_t loc = { +- 0, +- }; +- dict_t *xattr_req = NULL; +- +- local = frame->local; +- priv = this->private; +- fd = local->fd; +- shard_idx_iter = local->first_block; +- last_block = local->last_block; +- call_count = local->call_count = local->create_count; +- local->post_mknod_handler = post_mknod_handler; +- +- SHARD_SET_ROOT_FS_ID(frame, local); +- +- ret = shard_inode_ctx_get_all(fd->inode, this, &ctx_tmp); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get inode " +- "ctx for %s", +- uuid_utoa(fd->inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- goto err; +- } +- mode = 
st_mode_from_ia(ctx_tmp.stat.ia_prot, ctx_tmp.stat.ia_type); ++int ++shard_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t offset, uint32_t flags, dict_t *xdata) ++{ ++ int ret = 0; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +- while (shard_idx_iter <= last_block) { +- if (local->inode_list[i]) { +- shard_idx_iter++; +- i++; +- continue; ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size for %s from its inode ctx", ++ uuid_utoa(fd->inode->gfid)); ++ goto err; + } + +- if (wind_failed) { +- shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, -1, +- ENOMEM, NULL, NULL, NULL, NULL, NULL); +- goto next; ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ /* block_size = 0 means that the file was created before ++ * sharding was enabled on the volume. ++ */ ++ STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, ++ xdata); ++ return 0; + } + +- shard_make_block_abspath(shard_idx_iter, fd->inode->gfid, path, +- sizeof(path)); +- +- xattr_req = shard_create_gfid_dict(local->xattr_req); +- if (!xattr_req) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, -1, +- ENOMEM, NULL, NULL, NULL, NULL, NULL); +- goto next; +- } ++ if (!this->itable) ++ this->itable = fd->inode->table; + +- bname = strrchr(path, '/') + 1; +- loc.inode = inode_new(this->itable); +- loc.parent = inode_ref(priv->dot_shard_inode); +- ret = inode_path(loc.parent, bname, (char **)&(loc.path)); +- if (ret < 0 || !(loc.inode)) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- "on %s, base file gfid = %s", +- bname, uuid_utoa(fd->inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- loc_wipe(&loc); +- dict_unref(xattr_req); +- shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, -1, +- ENOMEM, NULL, NULL, NULL, NULL, NULL); +- goto next; +- } ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- loc.name = strrchr(loc.path, '/'); +- if (loc.name) +- loc.name++; ++ frame->local = local; + +- STACK_WIND_COOKIE(frame, shard_common_mknod_cbk, +- (void *)(long)shard_idx_iter, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->mknod, &loc, mode, +- ctx_tmp.stat.ia_rdev, 0, xattr_req); +- loc_wipe(&loc); +- dict_unref(xattr_req); ++ ret = syncbarrier_init(&local->barrier); ++ if (ret) ++ goto err; ++ local->fd = fd_ref(fd); ++ local->block_size = block_size; ++ local->offset = offset; ++ local->req_size = size; ++ local->flags = flags; ++ local->fop = GF_FOP_READ; ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; + +- next: +- shard_idx_iter++; +- i++; +- if (!--call_count) +- break; +- } ++ local->loc.inode = inode_ref(fd->inode); ++ gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + +- return 0; ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_post_lookup_readv_handler); ++ return 0; + err: +- /* +- * This block is for handling failure in shard_inode_ctx_get_all(). +- * Failures in the while-loop are handled within the loop. 
+- */ +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- post_mknod_handler(frame, this); +- return 0; ++ shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this); +- +-int shard_post_lookup_shards_readv_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; ++int ++shard_common_inode_write_post_update_size_handler(call_frame_t *frame, ++ xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, +- local->op_errno); ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ } else { ++ shard_common_inode_write_success_unwind(local->fop, frame, ++ local->written_size); ++ } + return 0; +- } +- +- if (local->create_count) { +- shard_common_resume_mknod(frame, this, shard_post_mknod_readv_handler); +- } else { +- shard_readv_do(frame, this); +- } +- +- return 0; + } + +-int shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++static gf_boolean_t ++shard_is_appending_write(shard_local_t *local) ++{ ++ if (local->fop != GF_FOP_WRITE) ++ return _gf_false; ++ if (local->flags & O_APPEND) ++ return _gf_true; ++ if (local->fd->flags & O_APPEND) ++ return _gf_true; ++ return _gf_false; ++} + +- local = frame->local; ++int ++__shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, ++ xlator_t *this) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, +- local->op_errno); +- return 0; +- } ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- if (!local->eexist_count) { +- shard_readv_do(frame, this); +- } else { +- local->call_count = local->eexist_count; +- shard_common_lookup_shards(frame, this, local->loc.inode, +- shard_post_lookup_shards_readv_handler); +- } +- return 0; +-} ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +-int shard_post_resolve_readv_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++ if (shard_is_appending_write(local)) { ++ local->delta_size = local->total_size; ++ } else if (local->offset + local->total_size > ctx->stat.ia_size) { ++ local->delta_size = (local->offset + local->total_size) - ++ ctx->stat.ia_size; ++ } else { ++ local->delta_size = 0; ++ } ++ ctx->stat.ia_size += (local->delta_size); ++ local->postbuf = ctx->stat; + +- local = frame->local; ++ return 0; ++} + +- if (local->op_ret < 0) { +- if (local->op_errno != ENOENT) { +- shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, +- local->op_errno); +- return 0; +- } else { +- struct iovec vec = { +- 0, +- }; ++int ++shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, ++ xlator_t *this) ++{ ++ int ret = -1; + +- vec.iov_base = local->iobuf->ptr; +- vec.iov_len = local->total_size; +- local->op_ret = local->total_size; +- SHARD_STACK_UNWIND(readv, frame, local->op_ret, 0, &vec, 1, +- &local->prebuf, local->iobref, NULL); +- return 0; ++ LOCK(&inode->lock); ++ { ++ ret = __shard_get_delta_size_from_inode_ctx(local, inode, this); + } +- } ++ UNLOCK(&inode->lock); + +- if (local->call_count) { +- shard_common_lookup_shards(frame, this, local->resolver_base_inode, +- shard_post_lookup_shards_readv_handler); +- } else 
{ +- shard_readv_do(frame, this); +- } +- +- return 0; ++ return ret; + } + +-int shard_post_lookup_readv_handler(call_frame_t *frame, xlator_t *this) { +- int ret = 0; +- struct iobuf *iobuf = NULL; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- +- priv = this->private; +- local = frame->local; ++int ++shard_common_inode_write_do_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, struct iatt *pre, ++ struct iatt *post, dict_t *xdata) ++{ ++ int call_count = 0; ++ fd_t *anon_fd = cookie; ++ shard_local_t *local = NULL; ++ glusterfs_fop_t fop = 0; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, +- local->op_errno); +- return 0; +- } ++ local = frame->local; ++ fop = local->fop; + +- if (local->offset >= local->prebuf.ia_size) { +- /* If the read is being performed past the end of the file, +- * unwind the FOP with 0 bytes read as status. +- */ +- struct iovec vec = { +- 0, +- }; ++ LOCK(&frame->lock); ++ { ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ } else { ++ local->written_size += op_ret; ++ GF_ATOMIC_ADD(local->delta_blocks, ++ post->ia_blocks - pre->ia_blocks); ++ local->delta_size += (post->ia_size - pre->ia_size); ++ shard_inode_ctx_set(local->fd->inode, this, post, 0, ++ SHARD_MASK_TIMES); ++ if (local->fd->inode != anon_fd->inode) ++ shard_inode_ctx_add_to_fsync_list(local->fd->inode, this, ++ anon_fd->inode); ++ } ++ } ++ UNLOCK(&frame->lock); + +- iobuf = iobuf_get2(this->ctx->iobuf_pool, local->req_size); +- if (!iobuf) +- goto err; ++ if (anon_fd) ++ fd_unref(anon_fd); + +- vec.iov_base = iobuf->ptr; +- vec.iov_len = 0; +- local->iobref = iobref_new(); +- iobref_add(local->iobref, iobuf); +- iobuf_unref(iobuf); ++ call_count = shard_call_count_return(frame); ++ if (call_count == 0) { ++ SHARD_UNSET_ROOT_FS_ID(frame, local); ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(fop, frame, local->op_ret, ++ local->op_errno); ++ } else { ++ shard_get_delta_size_from_inode_ctx(local, local->fd->inode, this); ++ local->hole_size = 0; ++ if (xdata) ++ local->xattr_rsp = dict_ref(xdata); ++ shard_update_file_size( ++ frame, this, local->fd, NULL, ++ shard_common_inode_write_post_update_size_handler); ++ } ++ } + +- SHARD_STACK_UNWIND(readv, frame, 0, 0, &vec, 1, &local->prebuf, +- local->iobref, NULL); + return 0; +- } ++} + +- local->first_block = get_lowest_block(local->offset, local->block_size); ++int ++shard_common_inode_write_wind(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ struct iovec *vec, int count, off_t shard_offset, ++ size_t size) ++{ ++ shard_local_t *local = NULL; + +- local->total_size = local->req_size; ++ local = frame->local; + +- local->last_block = +- get_highest_block(local->offset, local->total_size, local->block_size); ++ switch (local->fop) { ++ case GF_FOP_WRITE: ++ STACK_WIND_COOKIE( ++ frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->writev, fd, vec, count, shard_offset, ++ local->flags, local->iobref, local->xattr_req); ++ break; ++ case GF_FOP_FALLOCATE: ++ STACK_WIND_COOKIE( ++ frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fallocate, fd, local->flags, ++ shard_offset, size, local->xattr_req); ++ break; ++ case GF_FOP_ZEROFILL: ++ STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, ++ FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->zerofill, fd, ++ shard_offset, size, local->xattr_req); ++ break; ++ 
case GF_FOP_DISCARD: ++ STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, ++ FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->discard, fd, ++ shard_offset, size, local->xattr_req); ++ break; ++ default: ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "Invalid fop id = %d", local->fop); ++ break; ++ } ++ return 0; ++} + +- local->num_blocks = local->last_block - local->first_block + 1; +- GF_ASSERT(local->num_blocks > 0); +- local->resolver_base_inode = local->loc.inode; ++int ++shard_common_inode_write_do(call_frame_t *frame, xlator_t *this) ++{ ++ int i = 0; ++ int count = 0; ++ int call_count = 0; ++ int last_block = 0; ++ uint32_t cur_block = 0; ++ fd_t *fd = NULL; ++ fd_t *anon_fd = NULL; ++ shard_local_t *local = NULL; ++ struct iovec *vec = NULL; ++ gf_boolean_t wind_failed = _gf_false; ++ gf_boolean_t odirect = _gf_false; ++ off_t orig_offset = 0; ++ off_t shard_offset = 0; ++ off_t vec_offset = 0; ++ size_t remaining_size = 0; ++ size_t shard_write_size = 0; + +- local->inode_list = +- GF_CALLOC(local->num_blocks, sizeof(inode_t *), gf_shard_mt_inode_list); +- if (!local->inode_list) +- goto err; ++ local = frame->local; ++ fd = local->fd; ++ ++ orig_offset = local->offset; ++ remaining_size = local->total_size; ++ cur_block = local->first_block; ++ local->call_count = call_count = local->num_blocks; ++ last_block = local->last_block; ++ ++ SHARD_SET_ROOT_FS_ID(frame, local); ++ ++ if (dict_set_uint32(local->xattr_req, GLUSTERFS_WRITE_UPDATE_ATOMIC, 4)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set " GLUSTERFS_WRITE_UPDATE_ATOMIC ++ " into " ++ "dict: %s", ++ uuid_utoa(fd->inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ local->call_count = 1; ++ shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ return 0; ++ } + +- iobuf = iobuf_get2(this->ctx->iobuf_pool, local->total_size); +- if (!iobuf) +- goto err; ++ if ((fd->flags & O_DIRECT) && (local->fop == GF_FOP_WRITE)) ++ odirect = _gf_true; + +- local->iobref = iobref_new(); +- if (!local->iobref) { +- iobuf_unref(iobuf); +- goto err; +- } ++ while (cur_block <= last_block) { ++ if (wind_failed) { ++ shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } + +- if (iobref_add(local->iobref, iobuf) != 0) { +- iobuf_unref(iobuf); +- goto err; +- } ++ shard_offset = orig_offset % local->block_size; ++ shard_write_size = local->block_size - shard_offset; ++ if (shard_write_size > remaining_size) ++ shard_write_size = remaining_size; ++ ++ remaining_size -= shard_write_size; ++ ++ if (local->fop == GF_FOP_WRITE) { ++ count = iov_subset(local->vector, local->count, vec_offset, ++ vec_offset + shard_write_size, NULL); ++ ++ vec = GF_CALLOC(count, sizeof(struct iovec), gf_shard_mt_iovec); ++ if (!vec) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ GF_FREE(vec); ++ shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, ++ -1, ENOMEM, NULL, NULL, NULL); ++ goto next; ++ } ++ count = iov_subset(local->vector, local->count, vec_offset, ++ vec_offset + shard_write_size, vec); ++ } + +- memset(iobuf->ptr, 0, local->total_size); +- iobuf_unref(iobuf); +- local->iobuf = iobuf; ++ if (cur_block == 0) { ++ anon_fd = fd_ref(fd); ++ } else { ++ anon_fd = fd_anonymous(local->inode_list[i]); ++ if (!anon_fd) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ wind_failed = _gf_true; ++ GF_FREE(vec); ++ 
shard_common_inode_write_do_cbk(frame, (void *)(long)anon_fd, ++ this, -1, ENOMEM, NULL, NULL, ++ NULL); ++ goto next; ++ } ++ ++ if (local->fop == GF_FOP_WRITE) { ++ if (odirect) ++ local->flags = O_DIRECT; ++ else ++ local->flags = GF_ANON_FD_FLAGS; ++ } ++ } + +- local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); +- if (!local->dot_shard_loc.inode) { +- ret = +- shard_init_internal_dir_loc(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); +- if (ret) +- goto err; +- shard_lookup_internal_dir(frame, this, shard_post_resolve_readv_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- } else { +- local->post_res_handler = shard_post_resolve_readv_handler; +- shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); +- } +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); +- return 0; +-} +- +-int shard_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +- off_t offset, uint32_t flags, dict_t *xdata) { +- int ret = 0; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; +- +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size for %s from its inode ctx", +- uuid_utoa(fd->inode->gfid)); +- goto err; +- } +- +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- /* block_size = 0 means that the file was created before +- * sharding was enabled on the volume. +- */ +- STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); +- return 0; +- } +- +- if (!this->itable) +- this->itable = fd->inode->table; +- +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- +- ret = syncbarrier_init(&local->barrier); +- if (ret) +- goto err; +- local->fd = fd_ref(fd); +- local->block_size = block_size; +- local->offset = offset; +- local->req_size = size; +- local->flags = flags; +- local->fop = GF_FOP_READ; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- +- local->loc.inode = inode_ref(fd->inode); +- gf_uuid_copy(local->loc.gfid, fd->inode->gfid); +- +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_readv_handler); +- return 0; +-err: +- shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); +- return 0; ++ shard_common_inode_write_wind(frame, this, anon_fd, vec, count, ++ shard_offset, shard_write_size); ++ if (vec) ++ vec_offset += shard_write_size; ++ orig_offset += shard_write_size; ++ GF_FREE(vec); ++ vec = NULL; ++ next: ++ cur_block++; ++ i++; ++ call_count--; ++ } ++ return 0; + } + +-int shard_common_inode_write_post_update_size_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; ++int ++shard_common_inode_write_post_mknod_handler(call_frame_t *frame, ++ xlator_t *this); + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- } else { +- shard_common_inode_write_success_unwind(local->fop, frame, +- local->written_size); +- } +- return 0; +-} ++int ++shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, ++ xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +-static gf_boolean_t shard_is_appending_write(shard_local_t *local) { +- if (local->fop != GF_FOP_WRITE) +- return _gf_false; +- if (local->flags & O_APPEND) +- return _gf_true; +- if (local->fd->flags & O_APPEND) +- return _gf_true; +- return _gf_false; +-} ++ local = frame->local; + +-int __shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, +- xlator_t *this) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ if (local->create_count) { ++ shard_common_resume_mknod(frame, this, ++ shard_common_inode_write_post_mknod_handler); ++ } else { ++ shard_common_inode_write_do(frame, this); ++ } + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ return 0; ++} + +- if (shard_is_appending_write(local)) { +- local->delta_size = local->total_size; +- } else if (local->offset + local->total_size > ctx->stat.ia_size) { +- local->delta_size = (local->offset + local->total_size) - ctx->stat.ia_size; +- } else { +- local->delta_size = 0; +- } +- ctx->stat.ia_size += (local->delta_size); +- local->postbuf = ctx->stat; ++int ++shard_common_inode_write_post_mknod_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- return 0; +-} ++ local = frame->local; + +-int shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, +- xlator_t *this) { +- int ret = -1; ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +- LOCK(&inode->lock); +- { ret = __shard_get_delta_size_from_inode_ctx(local, inode, this); } +- UNLOCK(&inode->lock); ++ if (!local->eexist_count) { ++ shard_common_inode_write_do(frame, this); ++ } else { ++ local->call_count = local->eexist_count; ++ shard_common_lookup_shards( ++ frame, this, local->loc.inode, ++ shard_common_inode_write_post_lookup_shards_handler); ++ } + +- return ret; ++ return 0; + } + +-int shard_common_inode_write_do_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, struct iatt *pre, +- 
struct iatt *post, dict_t *xdata) { +- int call_count = 0; +- fd_t *anon_fd = cookie; +- shard_local_t *local = NULL; +- glusterfs_fop_t fop = 0; ++int ++shard_common_inode_write_post_resolve_handler(call_frame_t *frame, ++ xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- local = frame->local; +- fop = local->fop; ++ local = frame->local; + +- LOCK(&frame->lock); +- { +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- } else { +- local->written_size += op_ret; +- GF_ATOMIC_ADD(local->delta_blocks, post->ia_blocks - pre->ia_blocks); +- local->delta_size += (post->ia_size - pre->ia_size); +- shard_inode_ctx_set(local->fd->inode, this, post, 0, SHARD_MASK_TIMES); +- if (local->fd->inode != anon_fd->inode) +- shard_inode_ctx_add_to_fsync_list(local->fd->inode, this, +- anon_fd->inode); +- } +- } +- UNLOCK(&frame->lock); +- +- if (anon_fd) +- fd_unref(anon_fd); +- +- call_count = shard_call_count_return(frame); +- if (call_count == 0) { +- SHARD_UNSET_ROOT_FS_ID(frame, local); + if (local->op_ret < 0) { +- shard_common_failure_unwind(fop, frame, local->op_ret, local->op_errno); ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } ++ ++ if (local->call_count) { ++ shard_common_lookup_shards( ++ frame, this, local->resolver_base_inode, ++ shard_common_inode_write_post_lookup_shards_handler); + } else { +- shard_get_delta_size_from_inode_ctx(local, local->fd->inode, this); +- local->hole_size = 0; +- if (xdata) +- local->xattr_rsp = dict_ref(xdata); +- shard_update_file_size(frame, this, local->fd, NULL, +- shard_common_inode_write_post_update_size_handler); ++ shard_common_inode_write_do(frame, this); + } +- } + +- return 0; ++ return 0; + } + +-int shard_common_inode_write_wind(call_frame_t *frame, xlator_t *this, fd_t *fd, +- struct iovec *vec, int count, +- off_t shard_offset, size_t size) { +- shard_local_t *local = NULL; ++int ++shard_common_inode_write_post_lookup_handler(call_frame_t *frame, ++ xlator_t *this) ++{ ++ shard_local_t *local = frame->local; ++ shard_priv_t *priv = this->private; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(local->fop, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +- local = frame->local; ++ local->postbuf = local->prebuf; ++ ++ /*Adjust offset to EOF so that correct shard is chosen for append*/ ++ if (shard_is_appending_write(local)) ++ local->offset = local->prebuf.ia_size; ++ ++ local->first_block = get_lowest_block(local->offset, local->block_size); ++ local->last_block = get_highest_block(local->offset, local->total_size, ++ local->block_size); ++ local->num_blocks = local->last_block - local->first_block + 1; ++ GF_ASSERT(local->num_blocks > 0); ++ local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), ++ gf_shard_mt_inode_list); ++ if (!local->inode_list) { ++ shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); ++ return 0; ++ } + +- switch (local->fop) { +- case GF_FOP_WRITE: +- STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, fd, +- vec, count, shard_offset, local->flags, local->iobref, +- local->xattr_req); +- break; +- case GF_FOP_FALLOCATE: +- STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->fallocate, fd, +- local->flags, shard_offset, size, local->xattr_req); +- break; +- case GF_FOP_ZEROFILL: +- STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, +- FIRST_CHILD(this), 
FIRST_CHILD(this)->fops->zerofill, fd, +- shard_offset, size, local->xattr_req); +- break; +- case GF_FOP_DISCARD: +- STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard, fd, +- shard_offset, size, local->xattr_req); +- break; +- default: +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "Invalid fop id = %d", local->fop); +- break; +- } +- return 0; +-} +- +-int shard_common_inode_write_do(call_frame_t *frame, xlator_t *this) { +- int i = 0; +- int count = 0; +- int call_count = 0; +- int last_block = 0; +- uint32_t cur_block = 0; +- fd_t *fd = NULL; +- fd_t *anon_fd = NULL; +- shard_local_t *local = NULL; +- struct iovec *vec = NULL; +- gf_boolean_t wind_failed = _gf_false; +- gf_boolean_t odirect = _gf_false; +- off_t orig_offset = 0; +- off_t shard_offset = 0; +- off_t vec_offset = 0; +- size_t remaining_size = 0; +- size_t shard_write_size = 0; +- +- local = frame->local; +- fd = local->fd; +- +- orig_offset = local->offset; +- remaining_size = local->total_size; +- cur_block = local->first_block; +- local->call_count = call_count = local->num_blocks; +- last_block = local->last_block; +- +- SHARD_SET_ROOT_FS_ID(frame, local); +- +- if (dict_set_uint32(local->xattr_req, GLUSTERFS_WRITE_UPDATE_ATOMIC, 4)) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set " GLUSTERFS_WRITE_UPDATE_ATOMIC " into " +- "dict: %s", +- uuid_utoa(fd->inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- local->call_count = 1; +- shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, +- NULL, NULL, NULL); +- return 0; +- } ++ gf_msg_trace(this->name, 0, ++ "%s: gfid=%s first_block=%" PRIu64 ++ " " ++ "last_block=%" PRIu64 " num_blocks=%" PRIu64 " offset=%" PRId64 ++ " total_size=%zu flags=%" PRId32 "", ++ gf_fop_list[local->fop], ++ uuid_utoa(local->resolver_base_inode->gfid), ++ local->first_block, local->last_block, local->num_blocks, ++ local->offset, local->total_size, local->flags); + +- if ((fd->flags & O_DIRECT) && (local->fop == GF_FOP_WRITE)) +- odirect = _gf_true; ++ local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); + +- while (cur_block <= last_block) { +- if (wind_failed) { +- shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, +- NULL, NULL, NULL); +- goto next; ++ if (!local->dot_shard_loc.inode) { ++ /*change handler*/ ++ shard_mkdir_internal_dir(frame, this, ++ shard_common_inode_write_post_resolve_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ } else { ++ /*change handler*/ ++ local->post_res_handler = shard_common_inode_write_post_resolve_handler; ++ shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); + } ++ return 0; ++} + +- shard_offset = orig_offset % local->block_size; +- shard_write_size = local->block_size - shard_offset; +- if (shard_write_size > remaining_size) +- shard_write_size = remaining_size; ++int ++shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ inode_t *link_inode = NULL; ++ shard_local_t *local = NULL; ++ shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + +- remaining_size -= shard_write_size; ++ local = frame->local; + +- if (local->fop == GF_FOP_WRITE) { +- count = iov_subset(local->vector, local->count, vec_offset, +- vec_offset + shard_write_size, NULL); ++ 
SHARD_UNSET_ROOT_FS_ID(frame, local); + +- vec = GF_CALLOC(count, sizeof(struct iovec), gf_shard_mt_iovec); +- if (!vec) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- GF_FREE(vec); +- shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, +- ENOMEM, NULL, NULL, NULL); +- goto next; +- } +- count = iov_subset(local->vector, local->count, vec_offset, +- vec_offset + shard_write_size, vec); ++ if (op_ret == -1) { ++ if (op_errno != EEXIST) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto unwind; ++ } else { ++ gf_msg_debug(this->name, 0, ++ "mkdir on %s failed " ++ "with EEXIST. Attempting lookup now", ++ shard_internal_dir_string(type)); ++ shard_lookup_internal_dir(frame, this, local->post_res_handler, ++ type); ++ return 0; ++ } + } + +- if (cur_block == 0) { +- anon_fd = fd_ref(fd); ++ link_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ if (link_inode != inode) { ++ shard_refresh_internal_dir(frame, this, type); + } else { +- anon_fd = fd_anonymous(local->inode_list[i]); +- if (!anon_fd) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- GF_FREE(vec); +- shard_common_inode_write_do_cbk(frame, (void *)(long)anon_fd, this, -1, +- ENOMEM, NULL, NULL, NULL); +- goto next; +- } +- +- if (local->fop == GF_FOP_WRITE) { +- if (odirect) +- local->flags = O_DIRECT; +- else +- local->flags = GF_ANON_FD_FLAGS; +- } +- } +- +- shard_common_inode_write_wind(frame, this, anon_fd, vec, count, +- shard_offset, shard_write_size); +- if (vec) +- vec_offset += shard_write_size; +- orig_offset += shard_write_size; +- GF_FREE(vec); +- vec = NULL; +- next: +- cur_block++; +- i++; +- call_count--; +- } +- return 0; ++ shard_inode_ctx_mark_dir_refreshed(link_inode, this); ++ shard_common_resolve_shards(frame, this, local->post_res_handler); ++ } ++ return 0; ++unwind: ++ shard_common_resolve_shards(frame, this, local->post_res_handler); ++ return 0; + } + +-int shard_common_inode_write_post_mknod_handler(call_frame_t *frame, +- xlator_t *this); ++int ++shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t handler, ++ shard_internal_dir_type_t type) ++{ ++ int ret = -1; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ dict_t *xattr_req = NULL; ++ uuid_t *gfid = NULL; ++ loc_t *loc = NULL; ++ gf_boolean_t free_gfid = _gf_true; + +-int shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; ++ local = frame->local; ++ priv = this->private; + +- local = frame->local; ++ local->post_res_handler = handler; ++ gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); ++ if (!gfid) ++ goto err; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); +- return 0; +- } ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ gf_uuid_copy(*gfid, priv->dot_shard_gfid); ++ loc = &local->dot_shard_loc; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); ++ loc = &local->dot_shard_rm_loc; ++ break; ++ default: ++ bzero(*gfid, sizeof(uuid_t)); ++ break; ++ } + +- if (local->create_count) { +- shard_common_resume_mknod(frame, this, +- shard_common_inode_write_post_mknod_handler); +- } else { +- shard_common_inode_write_do(frame, this); +- } ++ xattr_req = dict_new(); ++ if (!xattr_req) ++ goto err; + +- return 0; +-} ++ ret = shard_init_internal_dir_loc(this, local, 
type); ++ if (ret) ++ goto err; + +-int shard_common_inode_write_post_mknod_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; ++ ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set gfid-req for %s", ++ shard_internal_dir_string(type)); ++ goto err; ++ } else { ++ free_gfid = _gf_false; ++ } + +- local = frame->local; ++ SHARD_SET_ROOT_FS_ID(frame, local); + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); ++ STACK_WIND_COOKIE(frame, shard_mkdir_internal_dir_cbk, (void *)(long)type, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, ++ 0755, 0, xattr_req); ++ dict_unref(xattr_req); + return 0; +- } + +- if (!local->eexist_count) { +- shard_common_inode_write_do(frame, this); +- } else { +- local->call_count = local->eexist_count; +- shard_common_lookup_shards( +- frame, this, local->loc.inode, +- shard_common_inode_write_post_lookup_shards_handler); +- } +- +- return 0; ++err: ++ if (xattr_req) ++ dict_unref(xattr_req); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ if (free_gfid) ++ GF_FREE(gfid); ++ handler(frame, this); ++ return 0; + } + +-int shard_common_inode_write_post_resolve_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); ++int ++shard_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ /* To-Do: Wind flush on all shards of the file */ ++ SHARD_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata); + return 0; +- } +- +- if (local->call_count) { +- shard_common_lookup_shards( +- frame, this, local->resolver_base_inode, +- shard_common_inode_write_post_lookup_shards_handler); +- } else { +- shard_common_inode_write_do(frame, this); +- } +- +- return 0; + } + +-int shard_common_inode_write_post_lookup_handler(call_frame_t *frame, +- xlator_t *this) { +- shard_local_t *local = frame->local; +- shard_priv_t *priv = this->private; +- +- if (local->op_ret < 0) { +- shard_common_failure_unwind(local->fop, frame, local->op_ret, +- local->op_errno); ++int ++shard_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) ++{ ++ STACK_WIND(frame, shard_flush_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; +- } +- +- local->postbuf = local->prebuf; ++} + +- /*Adjust offset to EOF so that correct shard is chosen for append*/ +- if (shard_is_appending_write(local)) +- local->offset = local->prebuf.ia_size; ++int ++__shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, ++ xlator_t *this) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; + +- local->first_block = get_lowest_block(local->offset, local->block_size); +- local->last_block = +- get_highest_block(local->offset, local->total_size, local->block_size); +- local->num_blocks = local->last_block - local->first_block + 1; +- GF_ASSERT(local->num_blocks > 0); +- local->inode_list = +- GF_CALLOC(local->num_blocks, sizeof(inode_t *), gf_shard_mt_inode_list); +- if (!local->inode_list) { +- shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); +- return 0; +- } ++ ret = __inode_ctx_get(inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; + +- gf_msg_trace( +- this->name, 0, "%s: gfid=%s 
first_block=%" PRIu64 " " +- "last_block=%" PRIu64 " num_blocks=%" PRIu64 +- " offset=%" PRId64 " total_size=%zu flags=%" PRId32 "", +- gf_fop_list[local->fop], uuid_utoa(local->resolver_base_inode->gfid), +- local->first_block, local->last_block, local->num_blocks, local->offset, +- local->total_size, local->flags); ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); ++ local->postbuf.ia_ctime = ctx->stat.ia_ctime; ++ local->postbuf.ia_ctime_nsec = ctx->stat.ia_ctime_nsec; ++ local->postbuf.ia_atime = ctx->stat.ia_atime; ++ local->postbuf.ia_atime_nsec = ctx->stat.ia_atime_nsec; ++ local->postbuf.ia_mtime = ctx->stat.ia_mtime; ++ local->postbuf.ia_mtime_nsec = ctx->stat.ia_mtime_nsec; + +- if (!local->dot_shard_loc.inode) { +- /*change handler*/ +- shard_mkdir_internal_dir(frame, this, +- shard_common_inode_write_post_resolve_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- } else { +- /*change handler*/ +- local->post_res_handler = shard_common_inode_write_post_resolve_handler; +- shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); +- } +- return 0; ++ return 0; + } + +-int shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) { +- inode_t *link_inode = NULL; +- shard_local_t *local = NULL; +- shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; +- +- local = frame->local; +- +- SHARD_UNSET_ROOT_FS_ID(frame, local); +- +- if (op_ret == -1) { +- if (op_errno != EEXIST) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto unwind; +- } else { +- gf_msg_debug(this->name, 0, "mkdir on %s failed " +- "with EEXIST. 
Attempting lookup now", +- shard_internal_dir_string(type)); +- shard_lookup_internal_dir(frame, this, local->post_res_handler, type); +- return 0; +- } +- } +- +- link_inode = shard_link_internal_dir_inode(local, inode, buf, type); +- if (link_inode != inode) { +- shard_refresh_internal_dir(frame, this, type); +- } else { +- shard_inode_ctx_mark_dir_refreshed(link_inode, this); +- shard_common_resolve_shards(frame, this, local->post_res_handler); +- } +- return 0; +-unwind: +- shard_common_resolve_shards(frame, this, local->post_res_handler); +- return 0; +-} +- +-int shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t handler, +- shard_internal_dir_type_t type) { +- int ret = -1; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- dict_t *xattr_req = NULL; +- uuid_t *gfid = NULL; +- loc_t *loc = NULL; +- gf_boolean_t free_gfid = _gf_true; +- +- local = frame->local; +- priv = this->private; +- +- local->post_res_handler = handler; +- gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); +- if (!gfid) +- goto err; +- +- switch (type) { +- case SHARD_INTERNAL_DIR_DOT_SHARD: +- gf_uuid_copy(*gfid, priv->dot_shard_gfid); +- loc = &local->dot_shard_loc; +- break; +- case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: +- gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); +- loc = &local->dot_shard_rm_loc; +- break; +- default: +- bzero(*gfid, sizeof(uuid_t)); +- break; +- } +- +- xattr_req = dict_new(); +- if (!xattr_req) +- goto err; +- +- ret = shard_init_internal_dir_loc(this, local, type); +- if (ret) +- goto err; +- +- ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, +- "Failed to set gfid-req for %s", shard_internal_dir_string(type)); +- goto err; +- } else { +- free_gfid = _gf_false; +- } +- +- SHARD_SET_ROOT_FS_ID(frame, local); +- +- STACK_WIND_COOKIE(frame, shard_mkdir_internal_dir_cbk, (void *)(long)type, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, +- 0755, 0, xattr_req); +- dict_unref(xattr_req); +- return 0; ++int ++shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, ++ xlator_t *this) ++{ ++ int ret = 0; + +-err: +- if (xattr_req) +- dict_unref(xattr_req); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- if (free_gfid) +- GF_FREE(gfid); +- handler(frame, this); +- return 0; +-} ++ LOCK(&inode->lock); ++ { ++ ret = __shard_get_timestamps_from_inode_ctx(local, inode, this); ++ } ++ UNLOCK(&inode->lock); + +-int shard_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *xdata) { +- /* To-Do: Wind flush on all shards of the file */ +- SHARD_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata); +- return 0; ++ return ret; + } + +-int shard_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { +- STACK_WIND(frame, shard_flush_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->flush, fd, xdata); +- return 0; +-} ++int ++shard_fsync_shards_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ int call_count = 0; ++ uint64_t fsync_count = 0; ++ fd_t *anon_fd = cookie; ++ shard_local_t *local = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *base_ictx = NULL; ++ inode_t *base_inode = NULL; ++ gf_boolean_t unref_shard_inode = _gf_false; ++ ++ local = frame->local; ++ base_inode = local->fd->inode; + +-int 
__shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, +- xlator_t *this) { +- int ret = -1; +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; ++ if (local->op_ret < 0) ++ goto out; + +- ret = __inode_ctx_get(inode, this, &ctx_uint); +- if (ret < 0) +- return ret; ++ LOCK(&frame->lock); ++ { ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ UNLOCK(&frame->lock); ++ goto out; ++ } ++ shard_inode_ctx_set(local->fd->inode, this, postbuf, 0, ++ SHARD_MASK_TIMES); ++ } ++ UNLOCK(&frame->lock); ++ fd_ctx_get(anon_fd, this, &fsync_count); ++out: ++ if (anon_fd && (base_inode != anon_fd->inode)) { ++ LOCK(&base_inode->lock); ++ LOCK(&anon_fd->inode->lock); ++ { ++ __shard_inode_ctx_get(anon_fd->inode, this, &ctx); ++ __shard_inode_ctx_get(base_inode, this, &base_ictx); ++ if (op_ret == 0) ++ ctx->fsync_needed -= fsync_count; ++ GF_ASSERT(ctx->fsync_needed >= 0); ++ if (ctx->fsync_needed != 0) { ++ list_add_tail(&ctx->to_fsync_list, &base_ictx->to_fsync_list); ++ base_ictx->fsync_count++; ++ } else { ++ unref_shard_inode = _gf_true; ++ } ++ } ++ UNLOCK(&anon_fd->inode->lock); ++ UNLOCK(&base_inode->lock); ++ } + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ if (unref_shard_inode) ++ inode_unref(anon_fd->inode); ++ if (anon_fd) ++ fd_unref(anon_fd); + +- local->postbuf.ia_ctime = ctx->stat.ia_ctime; +- local->postbuf.ia_ctime_nsec = ctx->stat.ia_ctime_nsec; +- local->postbuf.ia_atime = ctx->stat.ia_atime; +- local->postbuf.ia_atime_nsec = ctx->stat.ia_atime_nsec; +- local->postbuf.ia_mtime = ctx->stat.ia_mtime; +- local->postbuf.ia_mtime_nsec = ctx->stat.ia_mtime_nsec; ++ call_count = shard_call_count_return(frame); ++ if (call_count != 0) ++ return 0; + +- return 0; ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, ++ local->op_errno); ++ } else { ++ shard_get_timestamps_from_inode_ctx(local, base_inode, this); ++ SHARD_STACK_UNWIND(fsync, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->postbuf, local->xattr_rsp); ++ } ++ return 0; + } + +-int shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, +- xlator_t *this) { +- int ret = 0; ++int ++shard_post_lookup_fsync_handler(call_frame_t *frame, xlator_t *this) ++{ ++ int ret = 0; ++ int call_count = 0; ++ int fsync_count = 0; ++ fd_t *anon_fd = NULL; ++ inode_t *base_inode = NULL; ++ shard_local_t *local = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *iter = NULL; ++ struct list_head copy = { ++ 0, ++ }; ++ shard_inode_ctx_t *tmp = NULL; + +- LOCK(&inode->lock); +- { ret = __shard_get_timestamps_from_inode_ctx(local, inode, this); } +- UNLOCK(&inode->lock); ++ local = frame->local; ++ base_inode = local->fd->inode; ++ local->postbuf = local->prebuf; ++ INIT_LIST_HEAD(©); + +- return ret; +-} ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, ++ local->op_errno); ++ return 0; ++ } + +-int shard_fsync_shards_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, +- struct iatt *prebuf, struct iatt *postbuf, +- dict_t *xdata) { +- int call_count = 0; +- uint64_t fsync_count = 0; +- fd_t *anon_fd = cookie; +- shard_local_t *local = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_inode_ctx_t *base_ictx = NULL; +- inode_t *base_inode = NULL; +- gf_boolean_t unref_shard_inode = _gf_false; +- +- local = frame->local; +- base_inode = local->fd->inode; +- +- if (local->op_ret < 0) +- goto out; +- +- 
LOCK(&frame->lock); +- { +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- UNLOCK(&frame->lock); +- goto out; +- } +- shard_inode_ctx_set(local->fd->inode, this, postbuf, 0, SHARD_MASK_TIMES); +- } +- UNLOCK(&frame->lock); +- fd_ctx_get(anon_fd, this, &fsync_count); +-out: +- if (anon_fd && (base_inode != anon_fd->inode)) { + LOCK(&base_inode->lock); +- LOCK(&anon_fd->inode->lock); + { +- __shard_inode_ctx_get(anon_fd->inode, this, &ctx); +- __shard_inode_ctx_get(base_inode, this, &base_ictx); +- if (op_ret == 0) +- ctx->fsync_needed -= fsync_count; +- GF_ASSERT(ctx->fsync_needed >= 0); +- if (ctx->fsync_needed != 0) { +- list_add_tail(&ctx->to_fsync_list, &base_ictx->to_fsync_list); +- base_ictx->fsync_count++; +- } else { +- unref_shard_inode = _gf_true; +- } +- } +- UNLOCK(&anon_fd->inode->lock); ++ __shard_inode_ctx_get(base_inode, this, &ctx); ++ list_splice_init(&ctx->to_fsync_list, ©); ++ call_count = ctx->fsync_count; ++ ctx->fsync_count = 0; ++ } + UNLOCK(&base_inode->lock); +- } +- +- if (unref_shard_inode) +- inode_unref(anon_fd->inode); +- if (anon_fd) +- fd_unref(anon_fd); +- +- call_count = shard_call_count_return(frame); +- if (call_count != 0) +- return 0; + +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, +- local->op_errno); +- } else { +- shard_get_timestamps_from_inode_ctx(local, base_inode, this); +- SHARD_STACK_UNWIND(fsync, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->postbuf, local->xattr_rsp); +- } +- return 0; +-} +- +-int shard_post_lookup_fsync_handler(call_frame_t *frame, xlator_t *this) { +- int ret = 0; +- int call_count = 0; +- int fsync_count = 0; +- fd_t *anon_fd = NULL; +- inode_t *base_inode = NULL; +- shard_local_t *local = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_inode_ctx_t *iter = NULL; +- struct list_head copy = { +- 0, +- }; +- shard_inode_ctx_t *tmp = NULL; +- +- local = frame->local; +- base_inode = local->fd->inode; +- local->postbuf = local->prebuf; +- INIT_LIST_HEAD(©); +- +- if (local->op_ret < 0) { +- shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, +- local->op_errno); +- return 0; +- } +- +- LOCK(&base_inode->lock); +- { +- __shard_inode_ctx_get(base_inode, this, &ctx); +- list_splice_init(&ctx->to_fsync_list, ©); +- call_count = ctx->fsync_count; +- ctx->fsync_count = 0; +- } +- UNLOCK(&base_inode->lock); +- +- local->call_count = ++call_count; +- +- /* Send fsync() on the base shard first */ +- anon_fd = fd_ref(local->fd); +- STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsync, anon_fd, local->datasync, +- local->xattr_req); +- call_count--; +- anon_fd = NULL; +- +- list_for_each_entry_safe(iter, tmp, ©, to_fsync_list) { +- list_del_init(&iter->to_fsync_list); +- fsync_count = 0; +- shard_inode_ctx_get_fsync_count(iter->inode, this, &fsync_count); +- GF_ASSERT(fsync_count > 0); +- anon_fd = fd_anonymous(iter->inode); +- if (!anon_fd) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, +- "Failed to create " +- "anon fd to fsync shard"); +- shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, ENOMEM, +- NULL, NULL, NULL); +- continue; +- } ++ local->call_count = ++call_count; + +- ret = fd_ctx_set(anon_fd, this, fsync_count); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_FD_CTX_SET_FAILED, +- "Failed to set fd " +- "ctx for shard inode gfid=%s", +- 
uuid_utoa(iter->inode->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, ENOMEM, +- NULL, NULL, NULL); +- continue; +- } ++ /* Send fsync() on the base shard first */ ++ anon_fd = fd_ref(local->fd); + STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, anon_fd, local->datasync, + local->xattr_req); + call_count--; +- } ++ anon_fd = NULL; + +- return 0; ++ list_for_each_entry_safe(iter, tmp, ©, to_fsync_list) ++ { ++ list_del_init(&iter->to_fsync_list); ++ fsync_count = 0; ++ shard_inode_ctx_get_fsync_count(iter->inode, this, &fsync_count); ++ GF_ASSERT(fsync_count > 0); ++ anon_fd = fd_anonymous(iter->inode); ++ if (!anon_fd) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, ++ "Failed to create " ++ "anon fd to fsync shard"); ++ shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ continue; ++ } ++ ++ ret = fd_ctx_set(anon_fd, this, fsync_count); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_FD_CTX_SET_FAILED, ++ "Failed to set fd " ++ "ctx for shard inode gfid=%s", ++ uuid_utoa(iter->inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, ++ ENOMEM, NULL, NULL, NULL); ++ continue; ++ } ++ STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, ++ anon_fd, local->datasync, local->xattr_req); ++ call_count--; ++ } ++ ++ return 0; + } + +-int shard_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, +- dict_t *xdata) { +- int ret = 0; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++int ++shard_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, ++ dict_t *xdata) ++{ ++ int ret = 0; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size for %s from its inode ctx", +- uuid_utoa(fd->inode->gfid)); +- goto err; +- } ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size for %s from its inode ctx", ++ uuid_utoa(fd->inode->gfid)); ++ goto err; ++ } + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); +- return 0; +- } ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); ++ return 0; ++ } + +- if (!this->itable) +- this->itable = fd->inode->table; ++ if (!this->itable) ++ this->itable = fd->inode->table; + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- frame->local = local; ++ frame->local = local; + +- local->fd = fd_ref(fd); +- local->fop = GF_FOP_FSYNC; +- local->datasync = datasync; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; ++ local->fd = fd_ref(fd); ++ local->fop = GF_FOP_FSYNC; ++ local->datasync = datasync; ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; + +- local->loc.inode = inode_ref(fd->inode); +- gf_uuid_copy(local->loc.gfid, fd->inode->gfid); ++ local->loc.inode = inode_ref(fd->inode); ++ gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_fsync_handler); +- return 0; ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_post_lookup_fsync_handler); ++ return 0; + err: +- shard_common_failure_unwind(GF_FOP_FSYNC, frame, -1, ENOMEM); +- return 0; ++ shard_common_failure_unwind(GF_FOP_FSYNC, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_readdir_past_dot_shard_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, +- gf_dirent_t *orig_entries, dict_t *xdata) { +- gf_dirent_t *entry = NULL; +- gf_dirent_t *tmp = NULL; +- shard_local_t *local = NULL; ++int ++shard_readdir_past_dot_shard_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, gf_dirent_t *orig_entries, ++ dict_t *xdata) ++{ ++ gf_dirent_t *entry = NULL; ++ gf_dirent_t *tmp = NULL; ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (op_ret < 0) +- goto unwind; ++ if (op_ret < 0) ++ goto unwind; + +- list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) { +- list_del_init(&entry->list); +- list_add_tail(&entry->list, &local->entries_head.list); ++ list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) ++ { ++ list_del_init(&entry->list); ++ list_add_tail(&entry->list, &local->entries_head.list); + +- if (!entry->dict) +- continue; ++ if (!entry->dict) ++ continue; + +- if (IA_ISDIR(entry->d_stat.ia_type)) +- continue; ++ if (IA_ISDIR(entry->d_stat.ia_type)) ++ continue; + +- if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE)) +- shard_modify_size_and_block_count(&entry->d_stat, entry->dict); +- if (!entry->inode) +- continue; ++ if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE)) ++ shard_modify_size_and_block_count(&entry->d_stat, entry->dict); ++ if (!entry->inode) ++ continue; + +- shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); +- } +- local->op_ret += op_ret; ++ shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); ++ } ++ local->op_ret += op_ret; + + unwind: +- if (local->fop == GF_FOP_READDIR) +- SHARD_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, +- &local->entries_head, xdata); +- else +- SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &local->entries_head, +- xdata); +- return 0; ++ if (local->fop == GF_FOP_READDIR) ++ SHARD_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, ++ &local->entries_head, xdata); ++ else ++ SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, ++ &local->entries_head, xdata); ++ return 0; + } + +-int32_t shard_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, +- gf_dirent_t *orig_entries, dict_t *xdata) { +- fd_t *fd = NULL; +- gf_dirent_t *entry = NULL; +- gf_dirent_t *tmp = NULL; +- shard_local_t *local = NULL; +- gf_boolean_t last_entry = _gf_false; ++int32_t ++shard_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries, ++ dict_t *xdata) ++{ ++ fd_t *fd = NULL; ++ 
gf_dirent_t *entry = NULL; ++ gf_dirent_t *tmp = NULL; ++ shard_local_t *local = NULL; ++ gf_boolean_t last_entry = _gf_false; + +- local = frame->local; +- fd = local->fd; ++ local = frame->local; ++ fd = local->fd; + +- if (op_ret < 0) +- goto unwind; ++ if (op_ret < 0) ++ goto unwind; + +- list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) { +- if (last_entry) +- last_entry = _gf_false; ++ list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) ++ { ++ if (last_entry) ++ last_entry = _gf_false; ++ ++ if (__is_root_gfid(fd->inode->gfid) && ++ !(strcmp(entry->d_name, GF_SHARD_DIR))) { ++ local->offset = entry->d_off; ++ op_ret--; ++ last_entry = _gf_true; ++ continue; ++ } + +- if (__is_root_gfid(fd->inode->gfid) && +- !(strcmp(entry->d_name, GF_SHARD_DIR))) { +- local->offset = entry->d_off; +- op_ret--; +- last_entry = _gf_true; +- continue; +- } ++ list_del_init(&entry->list); ++ list_add_tail(&entry->list, &local->entries_head.list); + +- list_del_init(&entry->list); +- list_add_tail(&entry->list, &local->entries_head.list); ++ if (!entry->dict) ++ continue; + +- if (!entry->dict) +- continue; ++ if (IA_ISDIR(entry->d_stat.ia_type)) ++ continue; + +- if (IA_ISDIR(entry->d_stat.ia_type)) +- continue; ++ if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE) && ++ frame->root->pid != GF_CLIENT_PID_GSYNCD) ++ shard_modify_size_and_block_count(&entry->d_stat, entry->dict); + +- if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE) && +- frame->root->pid != GF_CLIENT_PID_GSYNCD) +- shard_modify_size_and_block_count(&entry->d_stat, entry->dict); ++ if (!entry->inode) ++ continue; + +- if (!entry->inode) +- continue; ++ shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); ++ } + +- shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); +- } ++ local->op_ret = op_ret; + +- local->op_ret = op_ret; ++ if (last_entry) { ++ if (local->fop == GF_FOP_READDIR) ++ STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, ++ local->fd, local->readdir_size, local->offset, ++ local->xattr_req); ++ else ++ STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, ++ local->fd, local->readdir_size, local->offset, ++ local->xattr_req); ++ return 0; ++ } + +- if (last_entry) { ++unwind: + if (local->fop == GF_FOP_READDIR) +- STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readdir, local->fd, +- local->readdir_size, local->offset, local->xattr_req); ++ SHARD_STACK_UNWIND(readdir, frame, op_ret, op_errno, ++ &local->entries_head, xdata); + else +- STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readdirp, local->fd, +- local->readdir_size, local->offset, local->xattr_req); ++ SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, ++ &local->entries_head, xdata); + return 0; +- } ++} + +-unwind: +- if (local->fop == GF_FOP_READDIR) +- SHARD_STACK_UNWIND(readdir, frame, op_ret, op_errno, &local->entries_head, +- xdata); +- else +- SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &local->entries_head, +- xdata); +- return 0; +-} +- +-int shard_readdir_do(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +- off_t offset, int whichop, dict_t *xdata) { +- int ret = 0; +- shard_local_t *local = NULL; +- +- local = mem_get0(this->local_pool); +- if (!local) { +- goto err; +- } +- +- frame->local = local; +- +- local->fd = fd_ref(fd); +- local->fop 
= whichop; +- local->readdir_size = size; +- INIT_LIST_HEAD(&local->entries_head.list); +- local->list_inited = _gf_true; +- +- if (whichop == GF_FOP_READDIR) { +- STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); +- } else { +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, +- local, err); +- ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); +- if (ret) { +- gf_log(this->name, GF_LOG_WARNING, +- "Failed to set " +- "dict value: key:%s, directory gfid=%s", +- GF_XATTR_SHARD_BLOCK_SIZE, uuid_utoa(fd->inode->gfid)); +- goto err; ++int ++shard_readdir_do(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t offset, int whichop, dict_t *xdata) ++{ ++ int ret = 0; ++ shard_local_t *local = NULL; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) { ++ goto err; + } + +- STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->readdirp, fd, size, offset, +- local->xattr_req); +- } ++ frame->local = local; ++ ++ local->fd = fd_ref(fd); ++ local->fop = whichop; ++ local->readdir_size = size; ++ INIT_LIST_HEAD(&local->entries_head.list); ++ local->list_inited = _gf_true; ++ ++ if (whichop == GF_FOP_READDIR) { ++ STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); ++ } else { ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, ++ local, err); ++ ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); ++ if (ret) { ++ gf_log(this->name, GF_LOG_WARNING, ++ "Failed to set " ++ "dict value: key:%s, directory gfid=%s", ++ GF_XATTR_SHARD_BLOCK_SIZE, uuid_utoa(fd->inode->gfid)); ++ goto err; ++ } ++ ++ STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->readdirp, fd, size, offset, ++ local->xattr_req); ++ } + +- return 0; ++ return 0; + + err: +- STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); +- return 0; ++ STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); ++ return 0; + } + +-int32_t shard_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, +- size_t size, off_t offset, dict_t *xdata) { +- shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); +- return 0; ++int32_t ++shard_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t offset, dict_t *xdata) ++{ ++ shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); ++ return 0; + } + +-int32_t shard_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, +- size_t size, off_t offset, dict_t *xdata) { +- shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIRP, xdata); +- return 0; ++int32_t ++shard_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t offset, dict_t *xdata) ++{ ++ shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIRP, xdata); ++ return 0; + } + + int32_t +@@ -6037,77 +6450,86 @@ shard_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + return 0; + } + +-int32_t shard_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *dict, +- dict_t *xdata) { +- if (op_ret < 0) +- goto unwind; ++int32_t ++shard_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *dict, ++ dict_t *xdata) ++{ ++ 
if (op_ret < 0) ++ goto unwind; + +- if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { +- dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); +- dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); +- } ++ if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { ++ dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); ++ dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); ++ } + + unwind: +- SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata); +- return 0; ++ SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata); ++ return 0; + } + +-int32_t shard_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, +- const char *name, dict_t *xdata) { +- int op_errno = EINVAL; ++int32_t ++shard_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, ++ dict_t *xdata) ++{ ++ int op_errno = EINVAL; + +- if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && +- (!strncmp(name, SHARD_XATTR_PREFIX, SLEN(SHARD_XATTR_PREFIX)))) { +- op_errno = ENODATA; +- goto out; +- } ++ if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && ++ (!strncmp(name, SHARD_XATTR_PREFIX, SLEN(SHARD_XATTR_PREFIX)))) { ++ op_errno = ENODATA; ++ goto out; ++ } + +- STACK_WIND(frame, shard_fgetxattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); +- return 0; ++ STACK_WIND(frame, shard_fgetxattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); ++ return 0; + out: +- shard_common_failure_unwind(GF_FOP_FGETXATTR, frame, -1, op_errno); +- return 0; ++ shard_common_failure_unwind(GF_FOP_FGETXATTR, frame, -1, op_errno); ++ return 0; + } + +-int32_t shard_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, dict_t *dict, +- dict_t *xdata) { +- if (op_ret < 0) +- goto unwind; ++int32_t ++shard_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *dict, ++ dict_t *xdata) ++{ ++ if (op_ret < 0) ++ goto unwind; + +- if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { +- dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); +- dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); +- } ++ if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { ++ dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); ++ dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); ++ } + + unwind: +- SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); +- return 0; ++ SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); ++ return 0; + } + +-int32_t shard_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, +- const char *name, dict_t *xdata) { +- int op_errno = EINVAL; ++int32_t ++shard_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, ++ const char *name, dict_t *xdata) ++{ ++ int op_errno = EINVAL; + +- if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && +- (!strncmp(name, SHARD_XATTR_PREFIX, sizeof(SHARD_XATTR_PREFIX) - 1))) { +- op_errno = ENODATA; +- goto out; +- } ++ if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && ++ (!strncmp(name, SHARD_XATTR_PREFIX, sizeof(SHARD_XATTR_PREFIX) - 1))) { ++ op_errno = ENODATA; ++ goto out; ++ } + +- STACK_WIND(frame, shard_getxattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); +- return 0; ++ STACK_WIND(frame, shard_getxattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); ++ return 0; + out: +- shard_common_failure_unwind(GF_FOP_GETXATTR, frame, -1, op_errno); +- return 0; ++ shard_common_failure_unwind(GF_FOP_GETXATTR, frame, -1, op_errno); ++ 
return 0; + } + +-int32_t shard_common_set_xattr_cbk(call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, dict_t *xdata) { ++int32_t ++shard_common_set_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ + int ret = -1; + shard_local_t *local = NULL; + +@@ -6141,8 +6563,9 @@ err: + return 0; + } + +-int32_t shard_post_lookup_set_xattr_handler(call_frame_t *frame, +- xlator_t *this) { ++int32_t ++shard_post_lookup_set_xattr_handler(call_frame_t *frame, xlator_t *this) ++{ + shard_local_t *local = NULL; + + local = frame->local; +@@ -6164,9 +6587,11 @@ int32_t shard_post_lookup_set_xattr_handler(call_frame_t *frame, + return 0; + } + +-int32_t shard_common_set_xattr(call_frame_t *frame, xlator_t *this, +- glusterfs_fop_t fop, loc_t *loc, fd_t *fd, +- dict_t *dict, int32_t flags, dict_t *xdata) { ++int32_t ++shard_common_set_xattr(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, ++ loc_t *loc, fd_t *fd, dict_t *dict, int32_t flags, ++ dict_t *xdata) ++{ + int ret = -1; + int op_errno = ENOMEM; + uint64_t block_size = 0; +@@ -6249,489 +6674,531 @@ err: + return 0; + } + +-int32_t shard_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, +- dict_t *dict, int32_t flags, dict_t *xdata) { ++int32_t ++shard_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, ++ int32_t flags, dict_t *xdata) ++{ + shard_common_set_xattr(frame, this, GF_FOP_FSETXATTR, NULL, fd, dict, flags, + xdata); + return 0; + } + +-int32_t shard_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, +- dict_t *dict, int32_t flags, dict_t *xdata) { ++int32_t ++shard_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, ++ int32_t flags, dict_t *xdata) ++{ + shard_common_set_xattr(frame, this, GF_FOP_SETXATTR, loc, NULL, dict, flags, + xdata); + return 0; + } + +-int shard_post_setattr_handler(call_frame_t *frame, xlator_t *this) { +- shard_local_t *local = NULL; ++int ++shard_post_setattr_handler(call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (local->fop == GF_FOP_SETATTR) { +- if (local->op_ret >= 0) +- shard_inode_ctx_set(local->loc.inode, this, &local->postbuf, 0, +- SHARD_LOOKUP_MASK); +- SHARD_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->postbuf, local->xattr_rsp); +- } else if (local->fop == GF_FOP_FSETATTR) { +- if (local->op_ret >= 0) +- shard_inode_ctx_set(local->fd->inode, this, &local->postbuf, 0, +- SHARD_LOOKUP_MASK); +- SHARD_STACK_UNWIND(fsetattr, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->postbuf, local->xattr_rsp); +- } ++ if (local->fop == GF_FOP_SETATTR) { ++ if (local->op_ret >= 0) ++ shard_inode_ctx_set(local->loc.inode, this, &local->postbuf, 0, ++ SHARD_LOOKUP_MASK); ++ SHARD_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->postbuf, local->xattr_rsp); ++ } else if (local->fop == GF_FOP_FSETATTR) { ++ if (local->op_ret >= 0) ++ shard_inode_ctx_set(local->fd->inode, this, &local->postbuf, 0, ++ SHARD_LOOKUP_MASK); ++ SHARD_STACK_UNWIND(fsetattr, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->postbuf, local->xattr_rsp); ++ } + +- return 0; ++ return 0; + } + +-int shard_common_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, +- struct iatt *prebuf, struct iatt *postbuf, +- dict_t *xdata) { +- 
shard_local_t *local = NULL; ++int ++shard_common_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ shard_local_t *local = NULL; + +- local = frame->local; ++ local = frame->local; + +- if (op_ret < 0) { +- local->op_ret = op_ret; +- local->op_errno = op_errno; +- goto unwind; +- } ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto unwind; ++ } + +- local->prebuf = *prebuf; +- if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { +- local->op_ret = -1; +- local->op_errno = EINVAL; +- goto unwind; +- } +- if (xdata) +- local->xattr_rsp = dict_ref(xdata); +- local->postbuf = *postbuf; +- local->postbuf.ia_size = local->prebuf.ia_size; +- local->postbuf.ia_blocks = local->prebuf.ia_blocks; ++ local->prebuf = *prebuf; ++ if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { ++ local->op_ret = -1; ++ local->op_errno = EINVAL; ++ goto unwind; ++ } ++ if (xdata) ++ local->xattr_rsp = dict_ref(xdata); ++ local->postbuf = *postbuf; ++ local->postbuf.ia_size = local->prebuf.ia_size; ++ local->postbuf.ia_blocks = local->prebuf.ia_blocks; + + unwind: +- local->handler(frame, this); +- return 0; ++ local->handler(frame, this); ++ return 0; + } + +-int shard_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, +- struct iatt *stbuf, int32_t valid, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++int ++shard_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, ++ struct iatt *stbuf, int32_t valid, dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +- if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { +- STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); +- return 0; +- } ++ if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { ++ STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); ++ return 0; ++ } + +- ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block size from inode ctx of %s", +- uuid_utoa(loc->inode->gfid)); +- goto err; +- } ++ ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block size from inode ctx of %s", ++ uuid_utoa(loc->inode->gfid)); ++ goto err; ++ } + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); +- return 0; +- } ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); ++ return 0; ++ } + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- frame->local = local; ++ frame->local = local; + +- local->handler = shard_post_setattr_handler; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- local->fop = GF_FOP_SETATTR; +- loc_copy(&local->loc, loc); ++ local->handler = shard_post_setattr_handler; ++ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ local->fop = GF_FOP_SETATTR; ++ loc_copy(&local->loc, loc); + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, +- local, err); ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, ++ local, err); + +- STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, +- local->xattr_req); +- return 0; ++ STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, ++ local->xattr_req); ++ return 0; + err: +- shard_common_failure_unwind(GF_FOP_SETATTR, frame, -1, ENOMEM); +- return 0; ++ shard_common_failure_unwind(GF_FOP_SETATTR, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, +- struct iatt *stbuf, int32_t valid, dict_t *xdata) { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++int ++shard_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ struct iatt *stbuf, int32_t valid, dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; + +- if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { +- STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); +- return 0; +- } ++ if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { ++ STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); ++ return 0; ++ } + +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block size from inode ctx of %s", +- uuid_utoa(fd->inode->gfid)); +- goto err; +- } ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block size from inode ctx of %s", ++ uuid_utoa(fd->inode->gfid)); ++ goto err; ++ } + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); +- return 0; +- } ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); ++ return 0; ++ } + +- if (!this->itable) +- this->itable = fd->inode->table; ++ if (!this->itable) ++ this->itable = fd->inode->table; + +- local = mem_get0(this->local_pool); +- if (!local) +- goto err; ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto err; + +- frame->local = local; ++ frame->local = local; + +- local->handler = shard_post_setattr_handler; +- local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto err; +- local->fop = GF_FOP_FSETATTR; +- local->fd = fd_ref(fd); ++ local->handler = shard_post_setattr_handler; ++ local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto err; ++ local->fop = GF_FOP_FSETATTR; ++ local->fd = fd_ref(fd); + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, +- local, err); ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, ++ local, err); + +- STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, +- local->xattr_req); +- return 0; ++ STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, ++ local->xattr_req); ++ return 0; + err: +- shard_common_failure_unwind(GF_FOP_FSETATTR, frame, -1, ENOMEM); +- return 0; +-} +- +-int shard_common_inode_write_begin(call_frame_t *frame, xlator_t *this, +- glusterfs_fop_t fop, fd_t *fd, +- struct iovec *vector, int32_t count, +- off_t offset, uint32_t flags, size_t len, +- struct iobref *iobref, dict_t *xdata) { +- int ret = 0; +- int i = 0; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; +- +- ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); +- if (ret) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, +- "Failed to get block " +- "size for %s from its inode ctx", +- uuid_utoa(fd->inode->gfid)); +- goto out; +- } +- +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- /* block_size = 0 means that the file was created before +- * sharding was enabled on the volume. +- */ +- switch (fop) { +- case GF_FOP_WRITE: +- STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, +- fd, vector, count, offset, flags, iobref, xdata); +- break; +- case GF_FOP_FALLOCATE: +- STACK_WIND_TAIL(frame, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fallocate, fd, flags, offset, +- len, xdata); +- break; +- case GF_FOP_ZEROFILL: +- STACK_WIND_TAIL(frame, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->zerofill, fd, offset, len, +- xdata); +- break; +- case GF_FOP_DISCARD: +- STACK_WIND_TAIL(frame, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); +- break; +- default: +- gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +- "Invalid fop id = %d", fop); +- break; +- } +- return 0; +- } +- +- if (!this->itable) +- this->itable = fd->inode->table; +- +- local = mem_get0(this->local_pool); +- if (!local) +- goto out; +- +- frame->local = local; +- +- ret = syncbarrier_init(&local->barrier); +- if (ret) +- goto out; +- local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); +- if (!local->xattr_req) +- goto out; +- +- if (vector) { +- local->vector = iov_dup(vector, count); +- if (!local->vector) +- goto out; +- for (i = 0; i < count; i++) +- local->total_size += vector[i].iov_len; +- local->count = count; +- } else { +- local->total_size = len; +- } +- +- local->fop = fop; +- local->offset = offset; +- local->flags = flags; +- if (iobref) +- local->iobref = iobref_ref(iobref); +- local->fd = fd_ref(fd); +- local->block_size = block_size; +- local->resolver_base_inode = local->fd->inode; +- GF_ATOMIC_INIT(local->delta_blocks, 0); +- +- local->loc.inode = inode_ref(fd->inode); +- gf_uuid_copy(local->loc.gfid, fd->inode->gfid); +- +- shard_lookup_base_file(frame, this, &local->loc, +- shard_common_inode_write_post_lookup_handler); +- return 0; ++ shard_common_failure_unwind(GF_FOP_FSETATTR, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_common_inode_write_begin(call_frame_t *frame, xlator_t *this, ++ glusterfs_fop_t fop, fd_t *fd, ++ struct iovec *vector, int32_t count, ++ off_t offset, uint32_t flags, size_t len, ++ struct iobref *iobref, dict_t *xdata) ++{ ++ int ret = 0; ++ int i = 0; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, ++ "Failed to get block " ++ "size for %s from its inode ctx", ++ uuid_utoa(fd->inode->gfid)); ++ goto out; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ /* block_size = 0 means that the file was created before ++ * sharding was enabled on the volume. ++ */ ++ switch (fop) { ++ case GF_FOP_WRITE: ++ STACK_WIND_TAIL(frame, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->writev, fd, vector, ++ count, offset, flags, iobref, xdata); ++ break; ++ case GF_FOP_FALLOCATE: ++ STACK_WIND_TAIL(frame, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fallocate, fd, flags, ++ offset, len, xdata); ++ break; ++ case GF_FOP_ZEROFILL: ++ STACK_WIND_TAIL(frame, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->zerofill, fd, offset, ++ len, xdata); ++ break; ++ case GF_FOP_DISCARD: ++ STACK_WIND_TAIL(frame, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->discard, fd, offset, ++ len, xdata); ++ break; ++ default: ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "Invalid fop id = %d", fop); ++ break; ++ } ++ return 0; ++ } ++ ++ if (!this->itable) ++ this->itable = fd->inode->table; ++ ++ local = mem_get0(this->local_pool); ++ if (!local) ++ goto out; ++ ++ frame->local = local; ++ ++ ret = syncbarrier_init(&local->barrier); ++ if (ret) ++ goto out; ++ local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); ++ if (!local->xattr_req) ++ goto out; ++ ++ if (vector) { ++ local->vector = iov_dup(vector, count); ++ if (!local->vector) ++ goto out; ++ for (i = 0; i < count; i++) ++ local->total_size += vector[i].iov_len; ++ local->count = count; ++ } else { ++ local->total_size = len; ++ } ++ ++ local->fop = fop; ++ local->offset = offset; ++ local->flags = flags; ++ if (iobref) ++ local->iobref = iobref_ref(iobref); ++ local->fd = fd_ref(fd); ++ local->block_size = block_size; ++ local->resolver_base_inode = local->fd->inode; ++ GF_ATOMIC_INIT(local->delta_blocks, 0); ++ ++ local->loc.inode = inode_ref(fd->inode); ++ gf_uuid_copy(local->loc.gfid, fd->inode->gfid); ++ ++ shard_lookup_base_file(frame, this, &local->loc, ++ shard_common_inode_write_post_lookup_handler); ++ return 0; + out: +- shard_common_failure_unwind(fop, frame, -1, ENOMEM); +- return 0; ++ shard_common_failure_unwind(fop, frame, -1, ENOMEM); ++ return 0; + } + +-int shard_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, +- struct iovec *vector, int32_t count, off_t offset, +- uint32_t flags, struct iobref *iobref, dict_t *xdata) { +- shard_common_inode_write_begin(frame, this, GF_FOP_WRITE, fd, vector, count, +- offset, flags, 0, iobref, xdata); +- return 0; ++int ++shard_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ struct iovec *vector, int32_t count, off_t offset, uint32_t flags, ++ struct iobref *iobref, dict_t *xdata) ++{ ++ shard_common_inode_write_begin(frame, this, GF_FOP_WRITE, fd, vector, count, ++ offset, flags, 0, iobref, xdata); ++ return 0; + } + +-int shard_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, +- int32_t keep_size, off_t offset, size_t len, +- dict_t *xdata) { +- if ((keep_size != 0) && (keep_size != FALLOC_FL_ZERO_RANGE) && +- (keep_size != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))) +- goto out; ++int ++shard_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ int32_t keep_size, off_t offset, size_t len, dict_t *xdata) ++{ ++ if ((keep_size != 0) && (keep_size != FALLOC_FL_ZERO_RANGE) && ++ (keep_size != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))) ++ goto out; + +- shard_common_inode_write_begin(frame, this, GF_FOP_FALLOCATE, fd, NULL, 0, +- offset, keep_size, len, NULL, xdata); +- return 0; ++ shard_common_inode_write_begin(frame, this, GF_FOP_FALLOCATE, fd, NULL, 0, ++ offset, keep_size, len, NULL, xdata); ++ return 0; + out: +- shard_common_failure_unwind(GF_FOP_FALLOCATE, frame, -1, ENOTSUP); +- return 0; ++ shard_common_failure_unwind(GF_FOP_FALLOCATE, frame, -1, ENOTSUP); ++ return 0; + } + +-int shard_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +- off_t len, dict_t *xdata) { +- shard_common_inode_write_begin(frame, this, GF_FOP_ZEROFILL, fd, NULL, 0, +- offset, 0, len, NULL, xdata); +- return 0; ++int ++shard_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ off_t len, dict_t *xdata) ++{ ++ shard_common_inode_write_begin(frame, this, GF_FOP_ZEROFILL, fd, NULL, 0, ++ offset, 0, len, NULL, xdata); ++ return 0; + } + +-int shard_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +- size_t len, dict_t *xdata) { +- shard_common_inode_write_begin(frame, this, GF_FOP_DISCARD, fd, NULL, 0, +- offset, 0, len, NULL, xdata); +- return 0; ++int ++shard_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ size_t len, dict_t *xdata) ++{ ++ shard_common_inode_write_begin(frame, this, GF_FOP_DISCARD, fd, NULL, 0, ++ offset, 0, len, NULL, xdata); ++ return 0; 
+ } + +-int32_t shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +- gf_seek_what_t what, dict_t *xdata) { +- /* TBD */ +- gf_msg(this->name, GF_LOG_INFO, ENOTSUP, SHARD_MSG_FOP_NOT_SUPPORTED, +- "seek called on %s.", uuid_utoa(fd->inode->gfid)); +- shard_common_failure_unwind(GF_FOP_SEEK, frame, -1, ENOTSUP); +- return 0; ++int32_t ++shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ gf_seek_what_t what, dict_t *xdata) ++{ ++ /* TBD */ ++ gf_msg(this->name, GF_LOG_INFO, ENOTSUP, SHARD_MSG_FOP_NOT_SUPPORTED, ++ "seek called on %s.", uuid_utoa(fd->inode->gfid)); ++ shard_common_failure_unwind(GF_FOP_SEEK, frame, -1, ENOTSUP); ++ return 0; + } + +-int32_t mem_acct_init(xlator_t *this) { +- int ret = -1; ++int32_t ++mem_acct_init(xlator_t *this) ++{ ++ int ret = -1; + +- if (!this) +- return ret; ++ if (!this) ++ return ret; + +- ret = xlator_mem_acct_init(this, gf_shard_mt_end + 1); ++ ret = xlator_mem_acct_init(this, gf_shard_mt_end + 1); + +- if (ret != 0) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_MEM_ACCT_INIT_FAILED, +- "Memory accounting init" +- "failed"); +- return ret; +- } ++ if (ret != 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_MEM_ACCT_INIT_FAILED, ++ "Memory accounting init" ++ "failed"); ++ return ret; ++ } + +- return ret; ++ return ret; + } + +-int init(xlator_t *this) { +- int ret = -1; +- shard_priv_t *priv = NULL; ++int ++init(xlator_t *this) ++{ ++ int ret = -1; ++ shard_priv_t *priv = NULL; ++ ++ if (!this) { ++ gf_msg("shard", GF_LOG_ERROR, 0, SHARD_MSG_NULL_THIS, ++ "this is NULL. init() failed"); ++ return -1; ++ } + +- if (!this) { +- gf_msg("shard", GF_LOG_ERROR, 0, SHARD_MSG_NULL_THIS, +- "this is NULL. init() failed"); +- return -1; +- } +- +- if (!this->parents) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, +- "Dangling volume. Check volfile"); +- goto out; +- } +- +- if (!this->children || this->children->next) { +- gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, +- "shard not configured with exactly one sub-volume. " +- "Check volfile"); +- goto out; +- } +- +- priv = GF_CALLOC(1, sizeof(shard_priv_t), gf_shard_mt_priv_t); +- if (!priv) +- goto out; +- +- GF_OPTION_INIT("shard-block-size", priv->block_size, size_uint64, out); +- +- GF_OPTION_INIT("shard-deletion-rate", priv->deletion_rate, uint32, out); +- +- GF_OPTION_INIT("shard-lru-limit", priv->lru_limit, uint64, out); +- +- this->local_pool = mem_pool_new(shard_local_t, 128); +- if (!this->local_pool) { +- ret = -1; +- goto out; +- } +- gf_uuid_parse(SHARD_ROOT_GFID, priv->dot_shard_gfid); +- gf_uuid_parse(DOT_SHARD_REMOVE_ME_GFID, priv->dot_shard_rm_gfid); +- +- this->private = priv; +- LOCK_INIT(&priv->lock); +- INIT_LIST_HEAD(&priv->ilist_head); +- ret = 0; ++ if (!this->parents) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, ++ "Dangling volume. Check volfile"); ++ goto out; ++ } ++ ++ if (!this->children || this->children->next) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, ++ "shard not configured with exactly one sub-volume. 
" ++ "Check volfile"); ++ goto out; ++ } ++ ++ priv = GF_CALLOC(1, sizeof(shard_priv_t), gf_shard_mt_priv_t); ++ if (!priv) ++ goto out; ++ ++ GF_OPTION_INIT("shard-block-size", priv->block_size, size_uint64, out); ++ ++ GF_OPTION_INIT("shard-deletion-rate", priv->deletion_rate, uint32, out); ++ ++ GF_OPTION_INIT("shard-lru-limit", priv->lru_limit, uint64, out); ++ ++ this->local_pool = mem_pool_new(shard_local_t, 128); ++ if (!this->local_pool) { ++ ret = -1; ++ goto out; ++ } ++ gf_uuid_parse(SHARD_ROOT_GFID, priv->dot_shard_gfid); ++ gf_uuid_parse(DOT_SHARD_REMOVE_ME_GFID, priv->dot_shard_rm_gfid); ++ ++ this->private = priv; ++ LOCK_INIT(&priv->lock); ++ INIT_LIST_HEAD(&priv->ilist_head); ++ ret = 0; + out: +- if (ret) { +- GF_FREE(priv); +- mem_pool_destroy(this->local_pool); +- } ++ if (ret) { ++ GF_FREE(priv); ++ mem_pool_destroy(this->local_pool); ++ } + +- return ret; ++ return ret; + } + +-void fini(xlator_t *this) { +- shard_priv_t *priv = NULL; ++void ++fini(xlator_t *this) ++{ ++ shard_priv_t *priv = NULL; + +- GF_VALIDATE_OR_GOTO("shard", this, out); ++ GF_VALIDATE_OR_GOTO("shard", this, out); + +- mem_pool_destroy(this->local_pool); +- this->local_pool = NULL; ++ mem_pool_destroy(this->local_pool); ++ this->local_pool = NULL; + +- priv = this->private; +- if (!priv) +- goto out; ++ priv = this->private; ++ if (!priv) ++ goto out; + +- this->private = NULL; +- LOCK_DESTROY(&priv->lock); +- GF_FREE(priv); ++ this->private = NULL; ++ LOCK_DESTROY(&priv->lock); ++ GF_FREE(priv); + + out: +- return; ++ return; + } + +-int reconfigure(xlator_t *this, dict_t *options) { +- int ret = -1; +- shard_priv_t *priv = NULL; ++int ++reconfigure(xlator_t *this, dict_t *options) ++{ ++ int ret = -1; ++ shard_priv_t *priv = NULL; + +- priv = this->private; ++ priv = this->private; + +- GF_OPTION_RECONF("shard-block-size", priv->block_size, options, size, out); ++ GF_OPTION_RECONF("shard-block-size", priv->block_size, options, size, out); + +- GF_OPTION_RECONF("shard-deletion-rate", priv->deletion_rate, options, uint32, +- out); +- ret = 0; ++ GF_OPTION_RECONF("shard-deletion-rate", priv->deletion_rate, options, ++ uint32, out); ++ ret = 0; + + out: +- return ret; ++ return ret; + } + +-int shard_forget(xlator_t *this, inode_t *inode) { +- uint64_t ctx_uint = 0; +- shard_inode_ctx_t *ctx = NULL; +- shard_priv_t *priv = NULL; ++int ++shard_forget(xlator_t *this, inode_t *inode) ++{ ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_priv_t *priv = NULL; + +- priv = this->private; +- if (!priv) +- return 0; ++ priv = this->private; ++ if (!priv) ++ return 0; + +- inode_ctx_del(inode, this, &ctx_uint); +- if (!ctx_uint) +- return 0; ++ inode_ctx_del(inode, this, &ctx_uint); ++ if (!ctx_uint) ++ return 0; + +- ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + +- /* When LRU limit reaches inode will be forcefully removed from the +- * table, inode needs to be removed from LRU of shard as well. +- */ +- if (!list_empty(&ctx->ilist)) { +- LOCK(&priv->lock); +- { +- list_del_init(&ctx->ilist); +- priv->inode_count--; ++ /* When LRU limit reaches inode will be forcefully removed from the ++ * table, inode needs to be removed from LRU of shard as well. 
++ */ ++ if (!list_empty(&ctx->ilist)) { ++ LOCK(&priv->lock); ++ { ++ list_del_init(&ctx->ilist); ++ priv->inode_count--; ++ } ++ UNLOCK(&priv->lock); + } +- UNLOCK(&priv->lock); +- } +- GF_FREE(ctx); ++ GF_FREE(ctx); + +- return 0; ++ return 0; + } + +-int shard_release(xlator_t *this, fd_t *fd) { +- /* TBD */ +- return 0; ++int ++shard_release(xlator_t *this, fd_t *fd) ++{ ++ /* TBD */ ++ return 0; + } + +-int shard_priv_dump(xlator_t *this) { +- shard_priv_t *priv = NULL; +- char key_prefix[GF_DUMP_MAX_BUF_LEN] = { +- 0, +- }; +- char *str = NULL; ++int ++shard_priv_dump(xlator_t *this) ++{ ++ shard_priv_t *priv = NULL; ++ char key_prefix[GF_DUMP_MAX_BUF_LEN] = { ++ 0, ++ }; ++ char *str = NULL; + +- priv = this->private; ++ priv = this->private; + +- snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); +- gf_proc_dump_add_section("%s", key_prefix); +- str = gf_uint64_2human_readable(priv->block_size); +- gf_proc_dump_write("shard-block-size", "%s", str); +- gf_proc_dump_write("inode-count", "%d", priv->inode_count); +- gf_proc_dump_write("ilist_head", "%p", &priv->ilist_head); +- gf_proc_dump_write("lru-max-limit", "%" PRIu64, priv->lru_limit); ++ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); ++ gf_proc_dump_add_section("%s", key_prefix); ++ str = gf_uint64_2human_readable(priv->block_size); ++ gf_proc_dump_write("shard-block-size", "%s", str); ++ gf_proc_dump_write("inode-count", "%d", priv->inode_count); ++ gf_proc_dump_write("ilist_head", "%p", &priv->ilist_head); ++ gf_proc_dump_write("lru-max-limit", "%" PRIu64, priv->lru_limit); + +- GF_FREE(str); ++ GF_FREE(str); + +- return 0; ++ return 0; + } + +-int shard_releasedir(xlator_t *this, fd_t *fd) { return 0; } ++int ++shard_releasedir(xlator_t *this, fd_t *fd) ++{ ++ return 0; ++} + + struct xlator_fops fops = { + .lookup = shard_lookup, +-- +1.8.3.1 + diff --git a/SOURCES/0563-features-shard-Use-fd-lookup-post-file-open.patch b/SOURCES/0563-features-shard-Use-fd-lookup-post-file-open.patch new file mode 100644 index 0000000..c680f92 --- /dev/null +++ b/SOURCES/0563-features-shard-Use-fd-lookup-post-file-open.patch @@ -0,0 +1,318 @@ +From a19fa252942938a308ffa655fca3814d0660c6e2 Mon Sep 17 00:00:00 2001 +From: Vinayakswami Hariharmath +Date: Wed, 3 Jun 2020 18:58:56 +0530 +Subject: [PATCH 563/584] features/shard: Use fd lookup post file open + +Issue: +When a process has the open fd and the same file is +unlinked in middle of the operations, then file based +lookup fails with ENOENT or stale file + +Solution: +When the file already open and fd is available, use fstat +to get the file attributes + +Backport of: +> Upstream-patch: https://review.gluster.org/#/c/glusterfs/+/24528/ +> Change-Id: I0e83aee9f11b616dcfe13769ebfcda6742e4e0f4 +> Fixes: #1281 +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1925425 +Change-Id: I0e83aee9f11b616dcfe13769ebfcda6742e4e0f4 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244957 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/issue-1281.t | 34 +++++++++++ + xlators/features/shard/src/shard.c | 119 +++++++++++++++++++++++-------------- + 2 files changed, 110 insertions(+), 43 deletions(-) + create mode 100644 tests/bugs/shard/issue-1281.t + +diff --git a/tests/bugs/shard/issue-1281.t b/tests/bugs/shard/issue-1281.t +new file mode 100644 +index 0000000..9704caa +--- /dev/null ++++ b/tests/bugs/shard/issue-1281.t +@@ -0,0 +1,34 
@@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++ ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++#Open a file and store descriptor in fd = 5 ++exec 5>$M0/foo ++ ++#Unlink the same file which is opened in prev step ++TEST unlink $M0/foo ++ ++#Write something on the file using the open fd = 5 ++echo "issue-1281" >&5 ++ ++#Write on the descriptor should be succesful ++EXPECT 0 echo $? ++ ++#Close the fd = 5 ++exec 5>&- ++ ++cleanup +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index c5cc224..2ba4528 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -1653,26 +1653,24 @@ err: + } + + int +-shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) ++shard_set_iattr_invoke_post_handler(call_frame_t *frame, xlator_t *this, ++ inode_t *inode, int32_t op_ret, ++ int32_t op_errno, struct iatt *buf, ++ dict_t *xdata) + { + int ret = -1; + int32_t mask = SHARD_INODE_WRITE_MASK; +- shard_local_t *local = NULL; ++ shard_local_t *local = frame->local; + shard_inode_ctx_t ctx = { + 0, + }; + +- local = frame->local; +- + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_BASE_FILE_LOOKUP_FAILED, + "Lookup on base file" + " failed : %s", +- loc_gfid_utoa(&(local->loc))); ++ uuid_utoa(inode->gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; +@@ -1706,18 +1704,57 @@ unwind: + } + + int +-shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, +- shard_post_fop_handler_t handler) ++shard_fstat_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *buf, ++ dict_t *xdata) ++{ ++ shard_local_t *local = frame->local; ++ ++ shard_set_iattr_invoke_post_handler(frame, this, local->fd->inode, op_ret, ++ op_errno, buf, xdata); ++ return 0; ++} ++ ++int ++shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ /* In case of op_ret < 0, inode passed to this function will be NULL ++ ex: in case of op_errno = ENOENT. So refer prefilled inode data ++ which is part of local. ++ Note: Reassigning/overriding the inode passed to this cbk with inode ++ which is part of *struct shard_local_t* won't cause any issue as ++ both inodes have same reference/address as of the inode passed */ ++ inode = ((shard_local_t *)frame->local)->loc.inode; ++ ++ shard_set_iattr_invoke_post_handler(frame, this, inode, op_ret, op_errno, ++ buf, xdata); ++ return 0; ++} ++ ++/* This function decides whether to make file based lookup or ++ * fd based lookup (fstat) depending on the 3rd and 4th arg. ++ * If fd != NULL and loc == NULL then call is for fstat ++ * If fd == NULL and loc != NULL then call is for file based ++ * lookup. Please pass args based on the requirement. 
++ */ ++int ++shard_refresh_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, ++ fd_t *fd, shard_post_fop_handler_t handler) + { + int ret = -1; ++ inode_t *inode = NULL; + shard_local_t *local = NULL; + dict_t *xattr_req = NULL; + gf_boolean_t need_refresh = _gf_false; + + local = frame->local; + local->handler = handler; ++ inode = fd ? fd->inode : loc->inode; + +- ret = shard_inode_ctx_fill_iatt_from_cache(loc->inode, this, &local->prebuf, ++ ret = shard_inode_ctx_fill_iatt_from_cache(inode, this, &local->prebuf, + &need_refresh); + /* By this time, inode ctx should have been created either in create, + * mknod, readdirp or lookup. If not it is a bug! +@@ -1726,7 +1763,7 @@ shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_msg_debug(this->name, 0, + "Skipping lookup on base file: %s" + "Serving prebuf off the inode ctx cache", +- uuid_utoa(loc->gfid)); ++ uuid_utoa(inode->gfid)); + goto out; + } + +@@ -1737,10 +1774,14 @@ shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, + goto out; + } + +- SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, loc->gfid, local, out); ++ SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, inode->gfid, local, out); + +- STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, loc, xattr_req); ++ if (fd) ++ STACK_WIND(frame, shard_fstat_base_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fstat, fd, xattr_req); ++ else ++ STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + + dict_unref(xattr_req); + return 0; +@@ -2718,8 +2759,8 @@ shard_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + local->resolver_base_inode = loc->inode; + GF_ATOMIC_INIT(local->delta_blocks, 0); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_truncate_handler); ++ shard_refresh_base_file(frame, this, &local->loc, NULL, ++ shard_post_lookup_truncate_handler); + return 0; + + err: +@@ -2774,8 +2815,8 @@ shard_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + local->resolver_base_inode = fd->inode; + GF_ATOMIC_INIT(local->delta_blocks, 0); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_truncate_handler); ++ shard_refresh_base_file(frame, this, NULL, fd, ++ shard_post_lookup_truncate_handler); + return 0; + err: + shard_common_failure_unwind(GF_FOP_FTRUNCATE, frame, -1, ENOMEM); +@@ -2919,8 +2960,8 @@ shard_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + if (!local->xattr_req) + goto err; + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_link_handler); ++ shard_refresh_base_file(frame, this, &local->loc, NULL, ++ shard_post_lookup_link_handler); + return 0; + err: + shard_common_failure_unwind(GF_FOP_LINK, frame, -1, ENOMEM); +@@ -4249,8 +4290,8 @@ shard_post_inodelk_fop_handler(call_frame_t *frame, xlator_t *this) + switch (local->fop) { + case GF_FOP_UNLINK: + case GF_FOP_RENAME: +- shard_lookup_base_file(frame, this, &local->int_inodelk.loc, +- shard_post_lookup_base_shard_rm_handler); ++ shard_refresh_base_file(frame, this, &local->int_inodelk.loc, NULL, ++ shard_post_lookup_base_shard_rm_handler); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, +@@ -4505,8 +4546,8 @@ shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + if (local->block_size) { + local->tmp_loc.inode = inode_new(this->itable); + 
gf_uuid_copy(local->tmp_loc.gfid, (local->loc.inode)->gfid); +- shard_lookup_base_file(frame, this, &local->tmp_loc, +- shard_post_rename_lookup_handler); ++ shard_refresh_base_file(frame, this, &local->tmp_loc, NULL, ++ shard_post_rename_lookup_handler); + } else { + shard_rename_cbk(frame, this); + } +@@ -5242,8 +5283,8 @@ shard_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_readv_handler); ++ shard_refresh_base_file(frame, this, NULL, fd, ++ shard_post_lookup_readv_handler); + return 0; + err: + shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); +@@ -6046,8 +6087,8 @@ shard_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_fsync_handler); ++ shard_refresh_base_file(frame, this, NULL, fd, ++ shard_post_lookup_fsync_handler); + return 0; + err: + shard_common_failure_unwind(GF_FOP_FSYNC, frame, -1, ENOMEM); +@@ -6420,12 +6461,8 @@ shard_common_remove_xattr(call_frame_t *frame, xlator_t *this, + if (xdata) + local->xattr_req = dict_ref(xdata); + +- /* To-Do: Switch from LOOKUP which is path-based, to FSTAT if the fop is +- * on an fd. This comes under a generic class of bugs in shard tracked by +- * bz #1782428. +- */ +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_remove_xattr_handler); ++ shard_refresh_base_file(frame, this, loc, fd, ++ shard_post_lookup_remove_xattr_handler); + return 0; + err: + shard_common_failure_unwind(fop, frame, -1, op_errno); +@@ -6662,12 +6699,8 @@ shard_common_set_xattr(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + if (xdata) + local->xattr_rsp = dict_ref(xdata); + +- /* To-Do: Switch from LOOKUP which is path-based, to FSTAT if the fop is +- * on an fd. This comes under a generic class of bugs in shard tracked by +- * bz #1782428. +- */ +- shard_lookup_base_file(frame, this, &local->loc, +- shard_post_lookup_set_xattr_handler); ++ shard_refresh_base_file(frame, this, loc, fd, ++ shard_post_lookup_set_xattr_handler); + return 0; + err: + shard_common_failure_unwind(fop, frame, -1, op_errno); +@@ -6951,8 +6984,8 @@ shard_common_inode_write_begin(call_frame_t *frame, xlator_t *this, + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + +- shard_lookup_base_file(frame, this, &local->loc, +- shard_common_inode_write_post_lookup_handler); ++ shard_refresh_base_file(frame, this, NULL, fd, ++ shard_common_inode_write_post_lookup_handler); + return 0; + out: + shard_common_failure_unwind(fop, frame, -1, ENOMEM); +-- +1.8.3.1 + diff --git a/SOURCES/0564-store.c-glusterd-store.c-remove-sys_stat-calls.patch b/SOURCES/0564-store.c-glusterd-store.c-remove-sys_stat-calls.patch new file mode 100644 index 0000000..35cda2e --- /dev/null +++ b/SOURCES/0564-store.c-glusterd-store.c-remove-sys_stat-calls.patch @@ -0,0 +1,215 @@ +From a7a56c079df2eb0253efdd53e1538656c0ce9095 Mon Sep 17 00:00:00 2001 +From: Yaniv Kaul +Date: Mon, 25 Nov 2019 15:37:46 +0200 +Subject: [PATCH 564/584] store.c/glusterd-store.c: remove sys_stat calls + +Instead of querying for the file size and allocating a char array +according to its size, let's just use a fixed size. 
+Those calls are not really needed, and are either expensive or +cached anyway. Since we do dynamic allocation/free, let's just use +a fixed array instead. + +I'll see if there are other sys_stat() calls that are not really +useful and try to eliminate them in separate patches. + +Backport of: +> Upstream-patch: https://review.gluster.org/#/c/glusterfs/+/23752/ +> Change-Id: I76b40e78a52ab38f613fc0cdef4be60e6253bf20 +> updates: bz#1193929 +> Signed-off-by: Yaniv Kaul + +BUG: 1925425 +Change-Id: I76b40e78a52ab38f613fc0cdef4be60e6253bf20 +Signed-off-by: Yaniv Kaul +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244958 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/store.h | 4 +- + libglusterfs/src/store.c | 71 ++++-------------------------- + xlators/mgmt/glusterd/src/glusterd-store.c | 5 +-- + 3 files changed, 12 insertions(+), 68 deletions(-) + +diff --git a/libglusterfs/src/glusterfs/store.h b/libglusterfs/src/glusterfs/store.h +index 3b3a24c..f63bd05 100644 +--- a/libglusterfs/src/glusterfs/store.h ++++ b/libglusterfs/src/glusterfs/store.h +@@ -59,8 +59,8 @@ int32_t + gf_store_unlink_tmppath(gf_store_handle_t *shandle); + + int +-gf_store_read_and_tokenize(FILE *file, char *str, int size, char **iter_key, +- char **iter_val, gf_store_op_errno_t *store_errno); ++gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, ++ gf_store_op_errno_t *store_errno); + + int32_t + gf_store_retrieve_value(gf_store_handle_t *handle, char *key, char **value); +diff --git a/libglusterfs/src/store.c b/libglusterfs/src/store.c +index cdf0aea..fa3649b 100644 +--- a/libglusterfs/src/store.c ++++ b/libglusterfs/src/store.c +@@ -184,8 +184,8 @@ out: + } + + int +-gf_store_read_and_tokenize(FILE *file, char *str, int size, char **iter_key, +- char **iter_val, gf_store_op_errno_t *store_errno) ++gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, ++ gf_store_op_errno_t *store_errno) + { + int32_t ret = -1; + char *savetok = NULL; +@@ -193,15 +193,15 @@ gf_store_read_and_tokenize(FILE *file, char *str, int size, char **iter_key, + char *value = NULL; + char *temp = NULL; + size_t str_len = 0; ++ char str[8192]; + + GF_ASSERT(file); +- GF_ASSERT(str); + GF_ASSERT(iter_key); + GF_ASSERT(iter_val); + GF_ASSERT(store_errno); + + retry: +- temp = fgets(str, size, file); ++ temp = fgets(str, 8192, file); + if (temp == NULL || feof(file)) { + ret = -1; + *store_errno = GD_STORE_EOF; +@@ -241,13 +241,8 @@ int32_t + gf_store_retrieve_value(gf_store_handle_t *handle, char *key, char **value) + { + int32_t ret = -1; +- char *scan_str = NULL; + char *iter_key = NULL; + char *iter_val = NULL; +- char *free_str = NULL; +- struct stat st = { +- 0, +- }; + gf_store_op_errno_t store_errno = GD_STORE_SUCCESS; + + GF_ASSERT(handle); +@@ -279,32 +274,9 @@ gf_store_retrieve_value(gf_store_handle_t *handle, char *key, char **value) + } else { + fseek(handle->read, 0, SEEK_SET); + } +- ret = sys_fstat(handle->fd, &st); +- if (ret < 0) { +- gf_msg("", GF_LOG_WARNING, errno, LG_MSG_FILE_OP_FAILED, +- "stat on file %s failed", handle->path); +- ret = -1; +- store_errno = GD_STORE_STAT_FAILED; +- goto out; +- } +- +- /* "st.st_size + 1" is used as we are fetching each +- * line of a file using fgets, fgets will append "\0" +- * to the end of the string +- */ +- scan_str = GF_CALLOC(1, st.st_size + 1, gf_common_mt_char); +- +- if (scan_str == NULL) { +- ret = -1; +- store_errno = GD_STORE_ENOMEM; +- goto out; +- } 
+- +- free_str = scan_str; +- + do { +- ret = gf_store_read_and_tokenize(handle->read, scan_str, st.st_size + 1, +- &iter_key, &iter_val, &store_errno); ++ ret = gf_store_read_and_tokenize(handle->read, &iter_key, &iter_val, ++ &store_errno); + if (ret < 0) { + gf_msg_trace("", 0, + "error while reading key '%s': " +@@ -334,8 +306,6 @@ out: + sys_close(handle->fd); + } + +- GF_FREE(free_str); +- + return ret; + } + +@@ -561,40 +531,16 @@ gf_store_iter_get_next(gf_store_iter_t *iter, char **key, char **value, + gf_store_op_errno_t *op_errno) + { + int32_t ret = -1; +- char *scan_str = NULL; + char *iter_key = NULL; + char *iter_val = NULL; +- struct stat st = { +- 0, +- }; + gf_store_op_errno_t store_errno = GD_STORE_SUCCESS; + + GF_ASSERT(iter); + GF_ASSERT(key); + GF_ASSERT(value); + +- ret = sys_stat(iter->filepath, &st); +- if (ret < 0) { +- gf_msg("", GF_LOG_WARNING, errno, LG_MSG_FILE_OP_FAILED, +- "stat on file failed"); +- ret = -1; +- store_errno = GD_STORE_STAT_FAILED; +- goto out; +- } +- +- /* "st.st_size + 1" is used as we are fetching each +- * line of a file using fgets, fgets will append "\0" +- * to the end of the string +- */ +- scan_str = GF_CALLOC(1, st.st_size + 1, gf_common_mt_char); +- if (!scan_str) { +- ret = -1; +- store_errno = GD_STORE_ENOMEM; +- goto out; +- } +- +- ret = gf_store_read_and_tokenize(iter->file, scan_str, st.st_size + 1, +- &iter_key, &iter_val, &store_errno); ++ ret = gf_store_read_and_tokenize(iter->file, &iter_key, &iter_val, ++ &store_errno); + if (ret < 0) { + goto out; + } +@@ -619,7 +565,6 @@ gf_store_iter_get_next(gf_store_iter_t *iter, char **key, char **value, + ret = 0; + + out: +- GF_FREE(scan_str); + if (ret) { + GF_FREE(*key); + GF_FREE(*value); +diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c +index 4fa8116..da63c03 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-store.c ++++ b/xlators/mgmt/glusterd/src/glusterd-store.c +@@ -4092,7 +4092,6 @@ out: + int32_t + glusterd_store_retrieve_missed_snaps_list(xlator_t *this) + { +- char buf[PATH_MAX] = ""; + char path[PATH_MAX] = ""; + char *snap_vol_id = NULL; + char *missed_node_info = NULL; +@@ -4129,8 +4128,8 @@ glusterd_store_retrieve_missed_snaps_list(xlator_t *this) + } + + do { +- ret = gf_store_read_and_tokenize( +- fp, buf, sizeof(buf), &missed_node_info, &value, &store_errno); ++ ret = gf_store_read_and_tokenize(fp, &missed_node_info, &value, ++ &store_errno); + if (ret) { + if (store_errno == GD_STORE_EOF) { + gf_msg_debug(this->name, 0, "EOF for missed_snap_list"); +-- +1.8.3.1 + diff --git a/SOURCES/0565-libglusterfs-coverity-pointer-to-local-outside-the-s.patch b/SOURCES/0565-libglusterfs-coverity-pointer-to-local-outside-the-s.patch new file mode 100644 index 0000000..5e91703 --- /dev/null +++ b/SOURCES/0565-libglusterfs-coverity-pointer-to-local-outside-the-s.patch @@ -0,0 +1,124 @@ +From d491843640658e91a77f15647cefd1c00422c731 Mon Sep 17 00:00:00 2001 +From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com> +Date: Wed, 21 Oct 2020 16:14:29 +0530 +Subject: [PATCH 565/584] libglusterfs/coverity: pointer to local outside the + scope + +issue: gf_store_read_and_tokenize() returns the address +of the locally referred string. + +fix: pass the buf to gf_store_read_and_tokenize() and +use it for tokenize. 
+ +CID: 1430143 + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/1675 +> Updates: #1060 +> Change-Id: Ifc346540c263f58f4014ba2ba8c1d491c20ac609 +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1925425 +Change-Id: Ifc346540c263f58f4014ba2ba8c1d491c20ac609 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244959 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/store.h | 3 ++- + libglusterfs/src/store.c | 13 ++++++++----- + xlators/mgmt/glusterd/src/glusterd-store.c | 3 ++- + 3 files changed, 12 insertions(+), 7 deletions(-) + +diff --git a/libglusterfs/src/glusterfs/store.h b/libglusterfs/src/glusterfs/store.h +index f63bd05..68a20ad 100644 +--- a/libglusterfs/src/glusterfs/store.h ++++ b/libglusterfs/src/glusterfs/store.h +@@ -60,7 +60,8 @@ gf_store_unlink_tmppath(gf_store_handle_t *shandle); + + int + gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, +- gf_store_op_errno_t *store_errno); ++ gf_store_op_errno_t *store_errno, char *str, ++ size_t buf_size); + + int32_t + gf_store_retrieve_value(gf_store_handle_t *handle, char *key, char **value); +diff --git a/libglusterfs/src/store.c b/libglusterfs/src/store.c +index fa3649b..3af627a 100644 +--- a/libglusterfs/src/store.c ++++ b/libglusterfs/src/store.c +@@ -185,7 +185,8 @@ out: + + int + gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, +- gf_store_op_errno_t *store_errno) ++ gf_store_op_errno_t *store_errno, char *str, ++ size_t buf_size) + { + int32_t ret = -1; + char *savetok = NULL; +@@ -193,7 +194,6 @@ gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, + char *value = NULL; + char *temp = NULL; + size_t str_len = 0; +- char str[8192]; + + GF_ASSERT(file); + GF_ASSERT(iter_key); +@@ -201,7 +201,7 @@ gf_store_read_and_tokenize(FILE *file, char **iter_key, char **iter_val, + GF_ASSERT(store_errno); + + retry: +- temp = fgets(str, 8192, file); ++ temp = fgets(str, buf_size, file); + if (temp == NULL || feof(file)) { + ret = -1; + *store_errno = GD_STORE_EOF; +@@ -275,8 +275,9 @@ gf_store_retrieve_value(gf_store_handle_t *handle, char *key, char **value) + fseek(handle->read, 0, SEEK_SET); + } + do { ++ char buf[8192]; + ret = gf_store_read_and_tokenize(handle->read, &iter_key, &iter_val, +- &store_errno); ++ &store_errno, buf, 8192); + if (ret < 0) { + gf_msg_trace("", 0, + "error while reading key '%s': " +@@ -533,6 +534,8 @@ gf_store_iter_get_next(gf_store_iter_t *iter, char **key, char **value, + int32_t ret = -1; + char *iter_key = NULL; + char *iter_val = NULL; ++ char buf[8192]; ++ + gf_store_op_errno_t store_errno = GD_STORE_SUCCESS; + + GF_ASSERT(iter); +@@ -540,7 +543,7 @@ gf_store_iter_get_next(gf_store_iter_t *iter, char **key, char **value, + GF_ASSERT(value); + + ret = gf_store_read_and_tokenize(iter->file, &iter_key, &iter_val, +- &store_errno); ++ &store_errno, buf, 8192); + if (ret < 0) { + goto out; + } +diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c +index da63c03..a8651d8 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-store.c ++++ b/xlators/mgmt/glusterd/src/glusterd-store.c +@@ -4128,8 +4128,9 @@ glusterd_store_retrieve_missed_snaps_list(xlator_t *this) + } + + do { ++ char buf[8192]; + ret = gf_store_read_and_tokenize(fp, &missed_node_info, &value, +- &store_errno); ++ &store_errno, buf, 8192); + if (ret) { + if (store_errno == GD_STORE_EOF) 
{ + gf_msg_debug(this->name, 0, "EOF for missed_snap_list"); +-- +1.8.3.1 + diff --git a/SOURCES/0566-enahancement-debug-Option-to-generate-core-dump-with.patch b/SOURCES/0566-enahancement-debug-Option-to-generate-core-dump-with.patch new file mode 100644 index 0000000..548271e --- /dev/null +++ b/SOURCES/0566-enahancement-debug-Option-to-generate-core-dump-with.patch @@ -0,0 +1,236 @@ +From e66ab728426e147bf4fc594109137ebfb1f2dda6 Mon Sep 17 00:00:00 2001 +From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com> +Date: Mon, 23 Nov 2020 08:09:44 +0530 +Subject: [PATCH 566/584] enahancement/debug: Option to generate core dump + without killing the process + +Comments and idea proposed by: Xavi Hernandez(jahernan@redhat.com): + +On production systems sometimes we see a log message saying that an assertion +has failed. But it's hard to track why it failed without additional information +(on debug builds, a GF_ASSERT() generates a core dump and kills the process, +so it can be used to debug the issue, but many times we are only able to +reproduce assertion failures on production systems, where GF_ASSERT() only logs +a message and continues). + +In other cases we may have a core dump caused by a bug, but the core dump doesn't +necessarily happen when the bug has happened. Sometimes the crash happens so much +later that the causes that triggered the bug are lost. In these cases we can add +more assertions to the places that touch the potential candidates to cause the bug, +but the only thing we'll get is a log message, which may not be enough. + +One solution would be to always generate a core dump in case of assertion failure, +but this was already discussed and it was decided that it was too drastic. If a +core dump was really needed, a new macro was created to do so: GF_ABORT(), +but GF_ASSERT() would continue to not kill the process on production systems. + +I'm proposing to modify GF_ASSERT() on production builds so that it conditionally +triggers a signal when a debugger is attached. When this happens, the debugger +will generate a core dump and continue the process as if nothing had happened. +If there's no debugger attached, GF_ASSERT() will behave as always. + +The idea I have is to use SIGCONT to do that. This signal is harmless, so we can +unmask it (we currently mask all unneeded signals) and raise it inside a GF_ASSERT() +when some global variable is set to true. + +To produce the core dump, run the script under extras/debug/gfcore.py on other +terminal. gdb breaks and produces coredump when GF_ASSERT is hit. 
+ +The script is copied from #1810 which is written by Xavi Hernandez(jahernan@redhat.com) + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/1814 +> Fixes: #1810 +> Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53 +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1927640 +Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244960 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/debug/gfcore.py | 77 +++++++++++++++++++++++++++++++ + libglusterfs/src/common-utils.c | 11 +++++ + libglusterfs/src/glusterfs/common-utils.h | 10 +++- + libglusterfs/src/libglusterfs.sym | 16 +++++++ + 4 files changed, 112 insertions(+), 2 deletions(-) + create mode 100755 extras/debug/gfcore.py + +diff --git a/extras/debug/gfcore.py b/extras/debug/gfcore.py +new file mode 100755 +index 0000000..9f097f0 +--- /dev/null ++++ b/extras/debug/gfcore.py +@@ -0,0 +1,77 @@ ++#!/usr/bin/env python3 ++ ++def launch(): ++ if len(sys.argv) < 3: ++ sys.stderr.write("Syntax: {} []\n".format(os.path.basename(sys.argv[0]))) ++ sys.exit(1) ++ ++ pid = int(sys.argv[1]) ++ count = int(sys.argv[2]) ++ base = os.getcwd() ++ if len(sys.argv) > 3: ++ base = sys.argv[3] ++ base = os.path.realpath(base) ++ ++ subprocess.run([ ++ "gdb", "-batch", ++ "-p", str(pid), ++ "-ex", "py arg_count = {}".format(count), ++ "-ex", "py arg_dir = '{}'".format(base), ++ "-x", __file__ ++ ]) ++ ++class GFCore(object): ++ def __init__(self, count, base): ++ self.count = count ++ self.base = base ++ gdb.execute('set pagination off') ++ gdb.execute('set gf_signal_on_assert = 1') ++ gdb.events.stop.connect(self.gf_stop) ++ ++ self.cont() ++ ++ def cont(self, quit = False): ++ if not(quit) and (self.count > 0): ++ gdb.execute('continue') ++ else: ++ gdb.execute('set gf_signal_on_assert = 0') ++ gdb.execute('quit') ++ ++ def gf_stop(self, event): ++ quit = False ++ ++ if isinstance(event, gdb.SignalEvent): ++ if event.stop_signal == 'SIGCONT': ++ now = datetime.utcnow().isoformat() ++ pid = gdb.selected_inferior().pid ++ name = "{}/gfcore.{}.{}".format(self.base, pid, now) ++ print("Generating coredump '{}'".format(name)) ++ gdb.execute('gcore {}'.format(name)) ++ self.count -= 1 ++ ++ elif event.stop_signal == 'SIGINT': ++ print("SIGINT received. Exiting") ++ quit = True ++ ++ else: ++ print("Ignoring signal {}".format(event.stop_signal)) ++ else: ++ print("Unexpected event {}".format(type(event))) ++ ++ self.cont(quit) ++ ++# Module 'gdb' is not available when running outside gdb. 
++try: ++ import gdb ++ from datetime import datetime ++ ++ GFCore(arg_count, arg_dir) ++except ModuleNotFoundError: ++ import sys ++ import os ++ import subprocess ++ ++ try: ++ launch() ++ except KeyboardInterrupt: ++ pass +diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c +index 70d5d21..d351b93 100644 +--- a/libglusterfs/src/common-utils.c ++++ b/libglusterfs/src/common-utils.c +@@ -77,9 +77,19 @@ char *vol_type_str[] = { + "Distributed-Disperse", + }; + ++gf_boolean_t gf_signal_on_assert = false; ++ + typedef int32_t (*rw_op_t)(int32_t fd, char *buf, int32_t size); + typedef int32_t (*rwv_op_t)(int32_t fd, const struct iovec *buf, int32_t size); + ++void gf_assert(void) ++{ ++ if (gf_signal_on_assert) { ++ raise(SIGCONT); ++ } ++ ++} ++ + void + gf_xxh64_wrapper(const unsigned char *data, size_t const len, + unsigned long long const seed, char *xxh64) +@@ -4021,6 +4031,7 @@ gf_thread_vcreate(pthread_t *thread, const pthread_attr_t *attr, + sigdelset(&set, SIGSYS); + sigdelset(&set, SIGFPE); + sigdelset(&set, SIGABRT); ++ sigdelset(&set, SIGCONT); + + pthread_sigmask(SIG_BLOCK, &set, &old); + +diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h +index f0a0a41..604afd0 100644 +--- a/libglusterfs/src/glusterfs/common-utils.h ++++ b/libglusterfs/src/glusterfs/common-utils.h +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #ifndef ffsll + #define ffsll(x) __builtin_ffsll(x) +@@ -431,14 +432,19 @@ BIT_VALUE(unsigned char *array, unsigned int index) + #define GF_FILE_CONTENT_REQUESTED(_xattr_req, _content_limit) \ + (dict_get_uint64(_xattr_req, "glusterfs.content", _content_limit) == 0) + ++void gf_assert(void); ++ + #ifdef DEBUG + #define GF_ASSERT(x) assert(x); + #else + #define GF_ASSERT(x) \ + do { \ +- if (!(x)) { \ ++ if (caa_unlikely(!(x))) { \ ++ gf_assert(); \ + gf_msg_callingfn("", GF_LOG_ERROR, 0, LG_MSG_ASSERTION_FAILED, \ +- "Assertion failed: " #x); \ ++ "Assertion failed: To attach gdb and coredump," \ ++ " Run the script under " \ ++ "\"glusterfs/extras/debug/gfcore.py\""); \ + } \ + } while (0) + #endif +diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym +index 0a0862e..9072afa 100644 +--- a/libglusterfs/src/libglusterfs.sym ++++ b/libglusterfs/src/libglusterfs.sym +@@ -1167,3 +1167,19 @@ gf_changelog_register_generic + gf_gfid_generate_from_xxh64 + find_xlator_option_in_cmd_args_t + gf_d_type_from_ia_type ++glusterfs_graph_fini ++glusterfs_process_svc_attach_volfp ++glusterfs_mux_volfile_reconfigure ++glusterfs_process_svc_detach ++mgmt_is_multiplexed_daemon ++xlator_is_cleanup_starting ++gf_nanosleep ++gf_syncfs ++graph_total_client_xlator ++get_xattrs_to_heal ++gf_latency_statedump_and_reset ++gf_latency_new ++gf_latency_reset ++gf_latency_update ++gf_frame_latency_update ++gf_assert +-- +1.8.3.1 + diff --git a/SOURCES/0567-inode-create-inode-outside-locked-region.patch b/SOURCES/0567-inode-create-inode-outside-locked-region.patch new file mode 100644 index 0000000..23d51c4 --- /dev/null +++ b/SOURCES/0567-inode-create-inode-outside-locked-region.patch @@ -0,0 +1,86 @@ +From 5c81d813c8b1f494d31d54c1ab09a3f0153ebfd4 Mon Sep 17 00:00:00 2001 +From: Amar Tumballi +Date: Sat, 9 Feb 2019 13:13:47 +0530 +Subject: [PATCH 567/584] inode: create inode outside locked region + +Only linking of inode to the table, and inserting it in +a list needs to be in locked region. 
+ +Backport of: +> Upstream-patch: https://review.gluster.org/#/c/glusterfs/+/22183/ +> Updates: bz#1670031 +> Change-Id: I6ea7e956b80cf2765c2233d761909c4bf9c7253c +> Signed-off-by: Amar Tumballi + +BUG: 1927640 +Change-Id: I6ea7e956b80cf2765c2233d761909c4bf9c7253c +Signed-off-by: Amar Tumballi +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244961 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/inode.c | 23 ++++++++++++----------- + 1 file changed, 12 insertions(+), 11 deletions(-) + +diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c +index 98f8ea6..46db04f 100644 +--- a/libglusterfs/src/inode.c ++++ b/libglusterfs/src/inode.c +@@ -620,7 +620,7 @@ out: + } + + static inode_t * +-__inode_create(inode_table_t *table) ++inode_create(inode_table_t *table) + { + inode_t *newi = NULL; + +@@ -647,11 +647,7 @@ __inode_create(inode_table_t *table) + goto out; + } + +- list_add(&newi->list, &table->lru); +- table->lru_size++; +- + out: +- + return newi; + } + +@@ -668,14 +664,16 @@ inode_new(inode_table_t *table) + return NULL; + } + +- pthread_mutex_lock(&table->lock); +- { +- inode = __inode_create(table); +- if (inode != NULL) { ++ inode = inode_create(table); ++ if (inode) { ++ pthread_mutex_lock(&table->lock); ++ { ++ list_add(&inode->list, &table->lru); ++ table->lru_size++; + __inode_ref(inode, false); + } ++ pthread_mutex_unlock(&table->lock); + } +- pthread_mutex_unlock(&table->lock); + + return inode; + } +@@ -1613,7 +1611,10 @@ __inode_table_init_root(inode_table_t *table) + if (!table) + return; + +- root = __inode_create(table); ++ root = inode_create(table); ++ ++ list_add(&root->list, &table->lru); ++ table->lru_size++; + + iatt.ia_gfid[15] = 1; + iatt.ia_ino = 1; +-- +1.8.3.1 + diff --git a/SOURCES/0568-core-tcmu-runner-process-continuous-growing-logs-lru.patch b/SOURCES/0568-core-tcmu-runner-process-continuous-growing-logs-lru.patch new file mode 100644 index 0000000..22c6790 --- /dev/null +++ b/SOURCES/0568-core-tcmu-runner-process-continuous-growing-logs-lru.patch @@ -0,0 +1,131 @@ +From 2640ee56201d320b838909f95608abe07e3ff9b0 Mon Sep 17 00:00:00 2001 +From: mohit84 +Date: Tue, 24 Nov 2020 15:29:58 +0530 +Subject: [PATCH 568/584] core: tcmu-runner process continuous growing logs + lru_size showing -1 + +* core: tcmu-runner process continuous growing logs lru_size showing -1 + +At the time of calling inode_table_prune it checks if current lru_size +is greater than lru_limit but lru_list is empty it throws a log message +"Empty inode lru list found but with (%d) lru_size".As per code reading +it seems lru_size is out of sync with the actual number of inodes in +lru_list. Due to throwing continuous error messages entire disk is +getting full and the user has to restart the tcmu-runner process to use +the volumes.The log message was introduce by a patch +https://review.gluster.org/#/c/glusterfs/+/15087/. + +Solution: Introduce a flag in_lru_list to take decision about inode is + being part of lru_list or not. 
+ +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/1776 +> Fixes: #1775 +> Change-Id: I4b836bebf4b5db65fbf88ff41c6c88f4a7ac55c1 +> Signed-off-by: Mohit Agrawal + +BUG: 1927640 +Change-Id: I4b836bebf4b5db65fbf88ff41c6c88f4a7ac55c1 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244962 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/inode.h | 1 + + libglusterfs/src/inode.c | 14 ++++++++++++++ + 2 files changed, 15 insertions(+) + +diff --git a/libglusterfs/src/glusterfs/inode.h b/libglusterfs/src/glusterfs/inode.h +index 62c093d..17d0340 100644 +--- a/libglusterfs/src/glusterfs/inode.h ++++ b/libglusterfs/src/glusterfs/inode.h +@@ -110,6 +110,7 @@ struct _inode { + struct _inode_ctx *_ctx; /* replacement for dict_t *(inode->ctx) */ + bool in_invalidate_list; /* Set if inode is in table invalidate list */ + bool invalidate_sent; /* Set it if invalidator_fn is called for inode */ ++ bool in_lru_list; /* Set if inode is in table lru list */ + }; + + #define UUID0_STR "00000000-0000-0000-0000-000000000000" +diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c +index 46db04f..8e91197 100644 +--- a/libglusterfs/src/inode.c ++++ b/libglusterfs/src/inode.c +@@ -417,8 +417,10 @@ __inode_passivate(inode_t *inode) + dentry_t *dentry = NULL; + dentry_t *t = NULL; + ++ GF_ASSERT(!inode->in_lru_list); + list_move_tail(&inode->list, &inode->table->lru); + inode->table->lru_size++; ++ inode->in_lru_list = _gf_true; + + list_for_each_entry_safe(dentry, t, &inode->dentry_list, inode_list) + { +@@ -531,7 +533,10 @@ __inode_ref(inode_t *inode, bool is_invalidate) + inode->in_invalidate_list = false; + inode->table->invalidate_size--; + } else { ++ GF_ASSERT(inode->table->lru_size > 0); ++ GF_ASSERT(inode->in_lru_list); + inode->table->lru_size--; ++ inode->in_lru_list = _gf_false; + } + if (is_invalidate) { + inode->in_invalidate_list = true; +@@ -670,6 +675,8 @@ inode_new(inode_table_t *table) + { + list_add(&inode->list, &table->lru); + table->lru_size++; ++ GF_ASSERT(!inode->in_lru_list); ++ inode->in_lru_list = _gf_true; + __inode_ref(inode, false); + } + pthread_mutex_unlock(&table->lock); +@@ -1533,6 +1540,7 @@ inode_table_prune(inode_table_t *table) + lru_size = table->lru_size; + while (lru_size > (table->lru_limit)) { + if (list_empty(&table->lru)) { ++ GF_ASSERT(0); + gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, + LG_MSG_INVALID_INODE_LIST, + "Empty inode lru list found" +@@ -1543,6 +1551,7 @@ inode_table_prune(inode_table_t *table) + + lru_size--; + entry = list_entry(table->lru.next, inode_t, list); ++ GF_ASSERT(entry->in_lru_list); + /* The logic of invalidation is required only if invalidator_fn + is present */ + if (table->invalidator_fn) { +@@ -1560,6 +1569,7 @@ inode_table_prune(inode_table_t *table) + } + + table->lru_size--; ++ entry->in_lru_list = _gf_false; + __inode_retire(entry); + ret++; + } +@@ -1615,6 +1625,7 @@ __inode_table_init_root(inode_table_t *table) + + list_add(&root->list, &table->lru); + table->lru_size++; ++ root->in_lru_list = _gf_true; + + iatt.ia_gfid[15] = 1; + iatt.ia_ino = 1; +@@ -1873,8 +1884,11 @@ inode_table_destroy(inode_table_t *inode_table) + while (!list_empty(&inode_table->lru)) { + trav = list_first_entry(&inode_table->lru, inode_t, list); + inode_forget_atomic(trav, 0); ++ GF_ASSERT(inode_table->lru_size > 0); ++ GF_ASSERT(trav->in_lru_list); + __inode_retire(trav); + inode_table->lru_size--; ++ 
trav->in_lru_list = _gf_false; + } + + /* Same logic for invalidate list */ +-- +1.8.3.1 + diff --git a/SOURCES/0569-features-shard-optimization-over-shard-lookup-in-cas.patch b/SOURCES/0569-features-shard-optimization-over-shard-lookup-in-cas.patch new file mode 100644 index 0000000..fff8223 --- /dev/null +++ b/SOURCES/0569-features-shard-optimization-over-shard-lookup-in-cas.patch @@ -0,0 +1,200 @@ +From 1b86a4bda540ff4cf307c7f38d3041318636ecb7 Mon Sep 17 00:00:00 2001 +From: Vinayakswami Hariharmath +Date: Thu, 6 Aug 2020 14:39:59 +0530 +Subject: [PATCH 569/584] features/shard: optimization over shard lookup in + case of prealloc + +Assume that we are preallocating a VM of size 1TB with a shard +block size of 64MB then there will be ~16k shards. + +This creation happens in 2 steps shard_fallocate() path i.e + +1. lookup for the shards if any already present and +2. mknod over those shards do not exist. + +But in case of fresh creation, we dont have to lookup for all +shards which are not present as the the file size will be 0. +Through this, we can save lookup on all shards which are not +present. This optimization is quite useful in the case of +preallocating big vm. + +Also if the file is already present and the call is to +extend it to bigger size then we need not to lookup for non- +existent shards. Just lookup preexisting shards, populate +the inodes and issue mknod on extended size. + +Backport of: +> Upstream-patch: https://review.gluster.org/#/c/glusterfs/+/24813/ +> Fixes: #1425 +> Change-Id: I60036fe8302c696e0ca80ff11ab0ef5bcdbd7880 +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1925425 +Change-Id: I60036fe8302c696e0ca80ff11ab0ef5bcdbd7880 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244963 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/issue-1425.t | 45 +++++++++++++++++++++++++++++++++++++ + xlators/features/shard/src/shard.c | 46 ++++++++++++++++++++++++++++++++------ + 2 files changed, 84 insertions(+), 7 deletions(-) + create mode 100644 tests/bugs/shard/issue-1425.t + +diff --git a/tests/bugs/shard/issue-1425.t b/tests/bugs/shard/issue-1425.t +new file mode 100644 +index 0000000..bbe82c0 +--- /dev/null ++++ b/tests/bugs/shard/issue-1425.t +@@ -0,0 +1,45 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup; ++ ++FILE_COUNT_TIME=5 ++ ++function get_file_count { ++ ls $1* | wc -l ++} ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/${V0}0 ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume start $V0 ++TEST $CLI volume profile $V0 start ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++TEST fallocate -l 20M $M0/foo ++gfid_new=$(get_gfid_string $M0/foo) ++ ++# Check for the base shard ++TEST stat $M0/foo ++TEST stat $B0/${V0}0/foo ++ ++# There should be 4 associated shards ++EXPECT_WITHIN $FILE_COUNT_TIME 4 get_file_count $B0/${V0}0/.shard/$gfid_new ++ ++# There should be 1+4 shards and we expect 4 lookups less than on the build without this patch ++EXPECT "21" echo `$CLI volume profile $V0 info incremental | grep -w LOOKUP | awk '{print $8}'` ++ ++# Delete the base shard and check shards get cleaned up ++TEST unlink $M0/foo ++ ++TEST ! stat $M0/foo ++TEST ! 
stat $B0/${V0}0/foo ++ ++# There should be no shards now ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_new ++cleanup +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 2ba4528..a6ad1b8 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -995,6 +995,10 @@ shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode) + } + + int ++shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, ++ xlator_t *this); ++ ++int + shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t post_res_handler) + { +@@ -1011,21 +1015,47 @@ shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + inode_t *fsync_inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; ++ uint64_t resolve_count = 0; + + priv = this->private; + local = frame->local; + local->call_count = 0; + shard_idx_iter = local->first_block; + res_inode = local->resolver_base_inode; ++ ++ if ((local->op_ret < 0) || (local->resolve_not)) ++ goto out; ++ ++ /* If this prealloc FOP is for fresh file creation, then the size of the ++ * file will be 0. Then there will be no shards associated with this file. ++ * So we can skip the lookup process for the shards which do not exists ++ * and directly issue mknod to crete shards. ++ * ++ * In case the prealloc fop is to extend the preallocated file to bigger ++ * size then just lookup and populate inodes of existing shards and ++ * update the create count ++ */ ++ if (local->fop == GF_FOP_FALLOCATE) { ++ if (!local->prebuf.ia_size) { ++ local->inode_list[0] = inode_ref(res_inode); ++ local->create_count = local->last_block; ++ shard_common_inode_write_post_lookup_shards_handler(frame, this); ++ return 0; ++ } ++ if (local->prebuf.ia_size < local->total_size) ++ local->create_count = local->last_block - ++ ((local->prebuf.ia_size - 1) / ++ local->block_size); ++ } ++ ++ resolve_count = local->last_block - local->create_count; ++ + if (res_inode) + gf_uuid_copy(gfid, res_inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + +- if ((local->op_ret < 0) || (local->resolve_not)) +- goto out; +- +- while (shard_idx_iter <= local->last_block) { ++ while (shard_idx_iter <= resolve_count) { + i++; + if (shard_idx_iter == 0) { + local->inode_list[i] = inode_ref(res_inode); +@@ -2434,7 +2464,7 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + int count = 0; + int call_count = 0; + int32_t shard_idx_iter = 0; +- int last_block = 0; ++ int lookup_count = 0; + char path[PATH_MAX] = { + 0, + }; +@@ -2454,7 +2484,7 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + local = frame->local; + count = call_count = local->call_count; + shard_idx_iter = local->first_block; +- last_block = local->last_block; ++ lookup_count = local->last_block - local->create_count; + local->pls_fop_handler = handler; + if (local->lookup_shards_barriered) + local->barrier.waitfor = local->call_count; +@@ -2464,7 +2494,7 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + else + gf_uuid_copy(gfid, local->base_gfid); + +- while (shard_idx_iter <= last_block) { ++ while (shard_idx_iter <= lookup_count) { + if (local->inode_list[i]) { + i++; + shard_idx_iter++; +@@ -5651,6 +5681,8 @@ shard_common_inode_write_post_resolve_handler(call_frame_t *frame, + shard_common_lookup_shards( + frame, this, local->resolver_base_inode, + 
shard_common_inode_write_post_lookup_shards_handler); ++ } else if (local->create_count) { ++ shard_common_inode_write_post_lookup_shards_handler(frame, this); + } else { + shard_common_inode_write_do(frame, this); + } +-- +1.8.3.1 + diff --git a/SOURCES/0570-features-shard-avoid-repeatative-calls-to-gf_uuid_un.patch b/SOURCES/0570-features-shard-avoid-repeatative-calls-to-gf_uuid_un.patch new file mode 100644 index 0000000..4d87bcb --- /dev/null +++ b/SOURCES/0570-features-shard-avoid-repeatative-calls-to-gf_uuid_un.patch @@ -0,0 +1,340 @@ +From 1a8b001a121ada4d3d338b52b312896f1790f2bb Mon Sep 17 00:00:00 2001 +From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com> +Date: Mon, 11 Jan 2021 12:34:55 +0530 +Subject: [PATCH 570/584] features/shard: avoid repeatative calls to + gf_uuid_unparse() + +The issue is shard_make_block_abspath() calls gf_uuid_unparse() +every time while constructing shard path. The gfid can be parsed +and saved once and passed while constructing the path. Thus +we can avoid calling gf_uuid_unparse(). + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/1689 +> Fixes: #1423 +> Change-Id: Ia26fbd5f09e812bbad9e5715242f14143c013c9c +> Signed-off-by: Vinayakswami Hariharmath vharihar@redhat.com + +BUG: 1925425 +Change-Id: Ia26fbd5f09e812bbad9e5715242f14143c013c9c +Signed-off-by: Vinayakswami Hariharmath vharihar@redhat.com +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244964 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/issue-1425.t | 9 ++- + xlators/features/shard/src/shard.c | 119 ++++++++++++++++++------------------- + 2 files changed, 65 insertions(+), 63 deletions(-) + +diff --git a/tests/bugs/shard/issue-1425.t b/tests/bugs/shard/issue-1425.t +index bbe82c0..8b77705 100644 +--- a/tests/bugs/shard/issue-1425.t ++++ b/tests/bugs/shard/issue-1425.t +@@ -21,7 +21,13 @@ TEST $CLI volume profile $V0 start + + TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 + ++$CLI volume profile $V0 info clear ++ + TEST fallocate -l 20M $M0/foo ++ ++# There should be 1+4 shards and we expect 4 lookups less than on the build without this patch ++EXPECT "5" echo `$CLI volume profile $V0 info incremental | grep -w LOOKUP | awk '{print $8}'` ++ + gfid_new=$(get_gfid_string $M0/foo) + + # Check for the base shard +@@ -31,9 +37,6 @@ TEST stat $B0/${V0}0/foo + # There should be 4 associated shards + EXPECT_WITHIN $FILE_COUNT_TIME 4 get_file_count $B0/${V0}0/.shard/$gfid_new + +-# There should be 1+4 shards and we expect 4 lookups less than on the build without this patch +-EXPECT "21" echo `$CLI volume profile $V0 info incremental | grep -w LOOKUP | awk '{print $8}'` +- + # Delete the base shard and check shards get cleaned up + TEST unlink $M0/foo + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index a6ad1b8..d1d7d7a 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -16,6 +16,8 @@ + #include + #include + ++#define SHARD_PATH_MAX (sizeof(GF_SHARD_DIR) + GF_UUID_BUF_SIZE + 16) ++ + static gf_boolean_t + __is_shard_dir(uuid_t gfid) + { +@@ -49,15 +51,19 @@ shard_make_block_bname(int block_num, uuid_t gfid, char *buf, size_t len) + snprintf(buf, len, "%s.%d", gfid_str, block_num); + } + +-void +-shard_make_block_abspath(int block_num, uuid_t gfid, char *filepath, size_t len) ++static int ++shard_make_base_path(char *path, uuid_t gfid) + { +- char gfid_str[GF_UUID_BUF_SIZE] = { +- 0, 
+- }; ++ strcpy(path, "/" GF_SHARD_DIR "/"); ++ uuid_utoa_r(gfid, path + sizeof(GF_SHARD_DIR) + 1); ++ return (sizeof(GF_SHARD_DIR) + GF_UUID_BUF_SIZE); ++} + +- gf_uuid_unparse(gfid, gfid_str); +- snprintf(filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str, block_num); ++static inline void ++shard_append_index(char *path, int path_size, int prefix_len, ++ int shard_idx_iter) ++{ ++ snprintf(path + prefix_len, path_size - prefix_len, ".%d", shard_idx_iter); + } + + int +@@ -1004,9 +1010,8 @@ shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + { + int i = -1; + uint32_t shard_idx_iter = 0; +- char path[PATH_MAX] = { +- 0, +- }; ++ int prefix_len = 0; ++ char path[SHARD_PATH_MAX]; + uuid_t gfid = { + 0, + }; +@@ -1055,6 +1060,9 @@ shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + else + gf_uuid_copy(gfid, local->base_gfid); + ++ /* Build base shard path before appending index of the shard */ ++ prefix_len = shard_make_base_path(path, gfid); ++ + while (shard_idx_iter <= resolve_count) { + i++; + if (shard_idx_iter == 0) { +@@ -1062,16 +1070,13 @@ shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + shard_idx_iter++; + continue; + } +- +- shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); +- ++ shard_append_index(path, SHARD_PATH_MAX, prefix_len, shard_idx_iter); + inode = NULL; + inode = inode_resolve(this->itable, path); + if (inode) { + gf_msg_debug(this->name, 0, +- "Shard %d already " +- "present. gfid=%s. Saving inode for future.", +- shard_idx_iter, uuid_utoa(inode->gfid)); ++ "Shard %s already present. Saving inode for future.", ++ path); + local->inode_list[i] = inode; + /* Let the ref on the inodes that are already present + * in inode table still be held so that they don't get +@@ -2153,9 +2158,8 @@ shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) + int call_count = 0; + uint32_t cur_block = 0; + uint32_t last_block = 0; +- char path[PATH_MAX] = { +- 0, +- }; ++ int prefix_len = 0; ++ char path[SHARD_PATH_MAX]; + char *bname = NULL; + loc_t loc = { + 0, +@@ -2216,6 +2220,10 @@ shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) + return 0; + } + ++ /* Build base shard path before appending index of the shard */ ++ prefix_len = shard_make_base_path(path, inode->gfid); ++ bname = path + sizeof(GF_SHARD_DIR) + 1; ++ + SHARD_SET_ROOT_FS_ID(frame, local); + while (cur_block <= last_block) { + if (!local->inode_list[i]) { +@@ -2229,15 +2237,12 @@ shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) + goto next; + } + +- shard_make_block_abspath(cur_block, inode->gfid, path, sizeof(path)); +- bname = strrchr(path, '/') + 1; ++ shard_append_index(path, SHARD_PATH_MAX, prefix_len, cur_block); + loc.parent = inode_ref(priv->dot_shard_inode); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s. 
Base file gfid = %s", +- bname, uuid_utoa(inode->gfid)); ++ "Inode path failed on %s.", bname); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); +@@ -2465,13 +2470,8 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + int call_count = 0; + int32_t shard_idx_iter = 0; + int lookup_count = 0; +- char path[PATH_MAX] = { +- 0, +- }; ++ char path[SHARD_PATH_MAX]; + char *bname = NULL; +- uuid_t gfid = { +- 0, +- }; + loc_t loc = { + 0, + }; +@@ -2489,10 +2489,16 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + if (local->lookup_shards_barriered) + local->barrier.waitfor = local->call_count; + ++ /* Build base shard path before appending index of the shard */ ++ strcpy(path, "/" GF_SHARD_DIR "/"); ++ + if (inode) +- gf_uuid_copy(gfid, inode->gfid); ++ uuid_utoa_r(inode->gfid, path + sizeof(GF_SHARD_DIR) + 1); + else +- gf_uuid_copy(gfid, local->base_gfid); ++ uuid_utoa_r(local->base_gfid, path + sizeof(GF_SHARD_DIR) + 1); ++ ++ int prefix_len = sizeof(GF_SHARD_DIR) + GF_UUID_BUF_SIZE; ++ bname = path + sizeof(GF_SHARD_DIR) + 1; + + while (shard_idx_iter <= lookup_count) { + if (local->inode_list[i]) { +@@ -2508,18 +2514,14 @@ shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + goto next; + } + +- shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); +- +- bname = strrchr(path, '/') + 1; ++ shard_append_index(path, SHARD_PATH_MAX, prefix_len, shard_idx_iter); + loc.inode = inode_new(this->itable); + loc.parent = inode_ref(priv->dot_shard_inode); + gf_uuid_copy(loc.pargfid, priv->dot_shard_gfid); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0 || !(loc.inode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s, base file gfid = %s", +- bname, uuid_utoa(gfid)); ++ "Inode path failed on %s", bname); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); +@@ -3168,12 +3170,7 @@ shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) + uint32_t cur_block = 0; + uint32_t cur_block_idx = 0; /*this is idx into inode_list[] array */ + char *bname = NULL; +- char path[PATH_MAX] = { +- 0, +- }; +- uuid_t gfid = { +- 0, +- }; ++ char path[SHARD_PATH_MAX]; + loc_t loc = { + 0, + }; +@@ -3184,10 +3181,16 @@ shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) + priv = this->private; + local = frame->local; + ++ /* Build base shard path before appending index of the shard */ ++ strcpy(path, "/" GF_SHARD_DIR "/"); ++ + if (inode) +- gf_uuid_copy(gfid, inode->gfid); ++ uuid_utoa_r(inode->gfid, path + sizeof(GF_SHARD_DIR) + 1); + else +- gf_uuid_copy(gfid, local->base_gfid); ++ uuid_utoa_r(local->base_gfid, path + sizeof(GF_SHARD_DIR) + 1); ++ ++ int prefix_len = sizeof(GF_SHARD_DIR) + GF_UUID_BUF_SIZE; ++ bname = path + sizeof(GF_SHARD_DIR) + 1; + + for (i = 0; i < local->num_blocks; i++) { + if (!local->inode_list[i]) +@@ -3203,7 +3206,7 @@ shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) + gf_msg_debug(this->name, 0, + "All shards that need to be " + "unlinked are non-existent: %s", +- uuid_utoa(gfid)); ++ path); + return 0; + } + +@@ -3221,15 +3224,12 @@ shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) + goto next; + } + +- shard_make_block_abspath(cur_block, gfid, path, sizeof(path)); +- bname = strrchr(path, '/') + 1; ++ shard_append_index(path, SHARD_PATH_MAX, prefix_len, cur_block); + 
loc.parent = inode_ref(priv->dot_shard_inode); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed" +- " on %s, base file gfid = %s", +- bname, uuid_utoa(gfid)); ++ "Inode path failed on %s", bname); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); +@@ -4971,9 +4971,8 @@ shard_common_resume_mknod(call_frame_t *frame, xlator_t *this, + int last_block = 0; + int ret = 0; + int call_count = 0; +- char path[PATH_MAX] = { +- 0, +- }; ++ int prefix_len = 0; ++ char path[SHARD_PATH_MAX]; + mode_t mode = 0; + char *bname = NULL; + shard_priv_t *priv = NULL; +@@ -4996,6 +4995,10 @@ shard_common_resume_mknod(call_frame_t *frame, xlator_t *this, + call_count = local->call_count = local->create_count; + local->post_mknod_handler = post_mknod_handler; + ++ /* Build base shard path before appending index of the shard */ ++ prefix_len = shard_make_base_path(path, fd->inode->gfid); ++ bname = path + sizeof(GF_SHARD_DIR) + 1; ++ + SHARD_SET_ROOT_FS_ID(frame, local); + + ret = shard_inode_ctx_get_all(fd->inode, this, &ctx_tmp); +@@ -5022,10 +5025,7 @@ shard_common_resume_mknod(call_frame_t *frame, xlator_t *this, + -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); + goto next; + } +- +- shard_make_block_abspath(shard_idx_iter, fd->inode->gfid, path, +- sizeof(path)); +- ++ shard_append_index(path, SHARD_PATH_MAX, prefix_len, shard_idx_iter); + xattr_req = shard_create_gfid_dict(local->xattr_req); + if (!xattr_req) { + local->op_ret = -1; +@@ -5036,7 +5036,6 @@ shard_common_resume_mknod(call_frame_t *frame, xlator_t *this, + goto next; + } + +- bname = strrchr(path, '/') + 1; + loc.inode = inode_new(this->itable); + loc.parent = inode_ref(priv->dot_shard_inode); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); +-- +1.8.3.1 + diff --git a/SOURCES/0571-NetBSD-build-fixes.patch b/SOURCES/0571-NetBSD-build-fixes.patch new file mode 100644 index 0000000..8a6d4a4 --- /dev/null +++ b/SOURCES/0571-NetBSD-build-fixes.patch @@ -0,0 +1,98 @@ +From 2c0d11bb406e50fb515abf0c5a4006e1b362ac8e Mon Sep 17 00:00:00 2001 +From: Emmanuel Dreyfus +Date: Tue, 30 Jun 2020 16:42:36 +0200 +Subject: [PATCH 571/584] NetBSD build fixes + +- Make sure -largp is used at link time +- PTHREAD_MUTEX_ADAPTIVE_NP is not available, use PTHREAD_MUTEX_DEFAULT instead +- Avoid non POSIX [[ ]] in scripts +- Do not check of lock.spinlock is NULL since it is not a pointer + (it is not a pointer on Linux either) + +Backport of: +> Upstream-patch: https://review.gluster.org/#/c/glusterfs/+/24648/ +> Change-Id: I5e04a7c552d24f8a473c2b837828d1bddfa7e128 +> Fixes: #1347 +> Type: Bug +> Signed-off-by: Emmanuel Dreyfus + +BUG: 1925425 +Change-Id: I5e04a7c552d24f8a473c2b837828d1bddfa7e128 +Signed-off-by: Emmanuel Dreyfus +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245040 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + configure.ac | 3 +++ + rpc/rpc-lib/src/rpcsvc.c | 4 ++++ + tools/gfind_missing_files/gfind_missing_files.sh | 2 +- + xlators/performance/write-behind/src/write-behind.c | 4 ++-- + 4 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/configure.ac b/configure.ac +index 327733e..6138a59 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -998,6 +998,9 @@ case $host_os in + CFLAGS="${CFLAGS} -isystem /usr/local/include" + ARGP_LDADD=-largp + ;; ++ *netbsd*) ++ ARGP_LDADD=-largp ++ ;; + esac + dnl argp-standalone does not provide a 
pkg-config file + AC_CHECK_HEADER([argp.h], AC_DEFINE(HAVE_ARGP, 1, [have argp])) +diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c +index 3f184bf..b031d93 100644 +--- a/rpc/rpc-lib/src/rpcsvc.c ++++ b/rpc/rpc-lib/src/rpcsvc.c +@@ -46,6 +46,10 @@ + #include "xdr-rpcclnt.h" + #include + ++#ifndef PTHREAD_MUTEX_ADAPTIVE_NP ++#define PTHREAD_MUTEX_ADAPTIVE_NP PTHREAD_MUTEX_DEFAULT ++#endif ++ + struct rpcsvc_program gluster_dump_prog; + + #define rpcsvc_alloc_request(svc, request) \ +diff --git a/tools/gfind_missing_files/gfind_missing_files.sh b/tools/gfind_missing_files/gfind_missing_files.sh +index f42fe7b..e7aaa0b 100644 +--- a/tools/gfind_missing_files/gfind_missing_files.sh ++++ b/tools/gfind_missing_files/gfind_missing_files.sh +@@ -61,7 +61,7 @@ mount_slave() + + parse_cli() + { +- if [[ $# -ne 4 ]]; then ++ if [ "$#" -ne 4 ]; then + echo "Usage: gfind_missing_files " + exit 1 + else +diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c +index 31ab723..76d257f 100644 +--- a/xlators/performance/write-behind/src/write-behind.c ++++ b/xlators/performance/write-behind/src/write-behind.c +@@ -2490,7 +2490,7 @@ wb_mark_readdirp_start(xlator_t *this, inode_t *directory) + + wb_directory_inode = wb_inode_create(this, directory); + +- if (!wb_directory_inode || !wb_directory_inode->lock.spinlock) ++ if (!wb_directory_inode) + return; + + LOCK(&wb_directory_inode->lock); +@@ -2510,7 +2510,7 @@ wb_mark_readdirp_end(xlator_t *this, inode_t *directory) + + wb_directory_inode = wb_inode_ctx_get(this, directory); + +- if (!wb_directory_inode || !wb_directory_inode->lock.spinlock) ++ if (!wb_directory_inode) + return; + + LOCK(&wb_directory_inode->lock); +-- +1.8.3.1 + diff --git a/SOURCES/0572-locks-remove-unused-conditional-switch-to-spin_lock-.patch b/SOURCES/0572-locks-remove-unused-conditional-switch-to-spin_lock-.patch new file mode 100644 index 0000000..1447916 --- /dev/null +++ b/SOURCES/0572-locks-remove-unused-conditional-switch-to-spin_lock-.patch @@ -0,0 +1,183 @@ +From 1491b33007e84be0a0a74354e89deca8a21ed198 Mon Sep 17 00:00:00 2001 +From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com> +Date: Tue, 19 Jan 2021 15:39:35 +0530 +Subject: [PATCH 572/584] locks: remove unused conditional switch to spin_lock + code + +use of spin_locks is depend on the variable use_spinlocks +but the same is commented in the current code base through +https://review.gluster.org/#/c/glusterfs/+/14763/. So it is +of no use to have conditional switching to spin_lock or +mutex. 
Removing the dead code as part of the patch + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2007 +> Fixes: #1996 +> Change-Id: Ib005dd86969ce33d3409164ef3e1011bb3169129 +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1925425 +Change-Id: Ib005dd86969ce33d3409164ef3e1011bb3169129 +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244965 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + configure.ac | 7 ----- + libglusterfs/src/Makefile.am | 2 +- + libglusterfs/src/common-utils.c | 5 ---- + libglusterfs/src/glusterfs/locking.h | 51 ------------------------------------ + libglusterfs/src/locking.c | 27 ------------------- + 5 files changed, 1 insertion(+), 91 deletions(-) + delete mode 100644 libglusterfs/src/locking.c + +diff --git a/configure.ac b/configure.ac +index 6138a59..3d99f6a 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -585,13 +585,6 @@ AC_CHECK_HEADERS([linux/falloc.h]) + + AC_CHECK_HEADERS([linux/oom.h], AC_DEFINE(HAVE_LINUX_OOM_H, 1, [have linux/oom.h])) + +-dnl Mac OS X does not have spinlocks +-AC_CHECK_FUNC([pthread_spin_init], [have_spinlock=yes]) +-if test "x${have_spinlock}" = "xyes"; then +- AC_DEFINE(HAVE_SPINLOCK, 1, [define if found spinlock]) +-fi +-AC_SUBST(HAVE_SPINLOCK) +- + dnl some os may not have GNU defined strnlen function + AC_CHECK_FUNC([strnlen], [have_strnlen=yes]) + if test "x${have_strnlen}" = "xyes"; then +diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am +index 970f4b7..830a0c3 100644 +--- a/libglusterfs/src/Makefile.am ++++ b/libglusterfs/src/Makefile.am +@@ -35,7 +35,7 @@ libglusterfs_la_SOURCES = dict.c xlator.c logging.c \ + strfd.c parse-utils.c $(CONTRIBDIR)/mount/mntent.c \ + $(CONTRIBDIR)/libexecinfo/execinfo.c quota-common-utils.c rot-buffs.c \ + $(CONTRIBDIR)/timer-wheel/timer-wheel.c \ +- $(CONTRIBDIR)/timer-wheel/find_last_bit.c default-args.c locking.c \ ++ $(CONTRIBDIR)/timer-wheel/find_last_bit.c default-args.c \ + $(CONTRIBDIR)/xxhash/xxhash.c \ + compound-fop-utils.c throttle-tbf.c monitoring.c + +diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c +index d351b93..c2dfe28 100644 +--- a/libglusterfs/src/common-utils.c ++++ b/libglusterfs/src/common-utils.c +@@ -860,11 +860,6 @@ gf_dump_config_flags() + gf_msg_plain_nomem(GF_LOG_ALERT, "setfsid 1"); + #endif + +-/* define if found spinlock */ +-#ifdef HAVE_SPINLOCK +- gf_msg_plain_nomem(GF_LOG_ALERT, "spinlock 1"); +-#endif +- + /* Define to 1 if you have the header file. */ + #ifdef HAVE_SYS_EPOLL_H + gf_msg_plain_nomem(GF_LOG_ALERT, "epoll.h 1"); +diff --git a/libglusterfs/src/glusterfs/locking.h b/libglusterfs/src/glusterfs/locking.h +index 43cc877..63097bb 100644 +--- a/libglusterfs/src/glusterfs/locking.h ++++ b/libglusterfs/src/glusterfs/locking.h +@@ -22,55 +22,6 @@ + #define pthread_spin_init(l, v) (*l = v) + #endif + +-#if defined(HAVE_SPINLOCK) +- +-typedef union { +- pthread_spinlock_t spinlock; +- pthread_mutex_t mutex; +-} gf_lock_t; +- +-#if !defined(LOCKING_IMPL) +-extern int use_spinlocks; +- +-/* +- * Using a dispatch table would be unpleasant because we're dealing with two +- * different types. If the dispatch contains direct pointers to pthread_xx +- * or mutex_xxx then we have to hope that every possible union alternative +- * starts at the same address as the union itself. 
I'm old enough to remember +- * compilers where this was not the case (for alignment reasons) so I'm a bit +- * paranoid about that. Also, I don't like casting arguments through "void *" +- * which we'd also have to do to avoid type errors. The other alternative would +- * be to define actual functions which pick out the right union member, and put +- * those in the dispatch tables. Now we have a pointer dereference through the +- * dispatch table plus a function call, which is likely to be worse than the +- * branching here from the ?: construct. If it were a clear win it might be +- * worth the extra complexity, but for now this way seems preferable. +- */ +- +-#define LOCK_INIT(x) \ +- (use_spinlocks ? pthread_spin_init(&((x)->spinlock), 0) \ +- : pthread_mutex_init(&((x)->mutex), 0)) +- +-#define LOCK(x) \ +- (use_spinlocks ? pthread_spin_lock(&((x)->spinlock)) \ +- : pthread_mutex_lock(&((x)->mutex))) +- +-#define TRY_LOCK(x) \ +- (use_spinlocks ? pthread_spin_trylock(&((x)->spinlock)) \ +- : pthread_mutex_trylock(&((x)->mutex))) +- +-#define UNLOCK(x) \ +- (use_spinlocks ? pthread_spin_unlock(&((x)->spinlock)) \ +- : pthread_mutex_unlock(&((x)->mutex))) +- +-#define LOCK_DESTROY(x) \ +- (use_spinlocks ? pthread_spin_destroy(&((x)->spinlock)) \ +- : pthread_mutex_destroy(&((x)->mutex))) +- +-#endif +- +-#else +- + typedef pthread_mutex_t gf_lock_t; + + #define LOCK_INIT(x) pthread_mutex_init(x, 0) +@@ -79,6 +30,4 @@ typedef pthread_mutex_t gf_lock_t; + #define UNLOCK(x) pthread_mutex_unlock(x) + #define LOCK_DESTROY(x) pthread_mutex_destroy(x) + +-#endif /* HAVE_SPINLOCK */ +- + #endif /* _LOCKING_H */ +diff --git a/libglusterfs/src/locking.c b/libglusterfs/src/locking.c +deleted file mode 100644 +index 7577054..0000000 +--- a/libglusterfs/src/locking.c ++++ /dev/null +@@ -1,27 +0,0 @@ +-/* +- Copyright (c) 2015 Red Hat, Inc. +- This file is part of GlusterFS. +- +- This file is licensed to you under your choice of the GNU Lesser +- General Public License, version 3 or any later version (LGPLv3 or +- later), or the GNU General Public License, version 2 (GPLv2), in all +- cases as published by the Free Software Foundation. +-*/ +- +-#if defined(HAVE_SPINLOCK) +-/* None of this matters otherwise. */ +- +-#include +-#include +- +-#define LOCKING_IMPL +-#include "glusterfs/locking.h" +- +-int use_spinlocks = 0; +- +-static void __attribute__((constructor)) gf_lock_setup(void) +-{ +- // use_spinlocks = (sysconf(_SC_NPROCESSORS_ONLN) > 1); +-} +- +-#endif +-- +1.8.3.1 + diff --git a/SOURCES/0573-features-shard-unlink-fails-due-to-nospace-to-mknod-.patch b/SOURCES/0573-features-shard-unlink-fails-due-to-nospace-to-mknod-.patch new file mode 100644 index 0000000..3033727 --- /dev/null +++ b/SOURCES/0573-features-shard-unlink-fails-due-to-nospace-to-mknod-.patch @@ -0,0 +1,148 @@ +From 0e453ede1f248a004965d0d368e2c4beb83f2ce1 Mon Sep 17 00:00:00 2001 +From: Vinayakswami Hariharmath +Date: Mon, 25 Jan 2021 17:32:14 +0530 +Subject: [PATCH 573/584] features/shard: unlink fails due to nospace to mknod + marker file + +When we hit the max capacity of the storage space, shard_unlink() +starts failing if there is no space left on the brick to create a +marker file. + +shard_unlink() happens in below steps: + +1. create a marker file in the name of gfid of the base file under +BRICK_PATH/.shard/.remove_me +2. unlink the base file +3. 
shard_delete_shards() deletes the shards in background by +picking the entries in BRICK_PATH/.shard/.remove_me + +If a marker file creation fails then we can't really delete the +shards which eventually a problem for user who is looking to make +space by deleting unwanted data. + +Solution: +Create the marker file by marking xdata = GLUSTERFS_INTERNAL_FOP_KEY +which is considered to be internal op and allowed to create under +reserved space. + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/2057 +> Fixes: #2038 +> Change-Id: I7facebab940f9aeee81d489df429e00ef4fb7c5d +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1891403 +Change-Id: I7facebab940f9aeee81d489df429e00ef4fb7c5d +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244966 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/issue-2038.t | 56 ++++++++++++++++++++++++++++++++++++++ + xlators/features/shard/src/shard.c | 20 ++++++++++++++ + 2 files changed, 76 insertions(+) + create mode 100644 tests/bugs/shard/issue-2038.t + +diff --git a/tests/bugs/shard/issue-2038.t b/tests/bugs/shard/issue-2038.t +new file mode 100644 +index 0000000..fc3e7f9 +--- /dev/null ++++ b/tests/bugs/shard/issue-2038.t +@@ -0,0 +1,56 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../snapshot.rc ++ ++cleanup ++ ++FILE_COUNT_TIME=5 ++ ++function get_file_count { ++ ls $1* | wc -l ++} ++ ++TEST verify_lvm_version ++TEST glusterd ++TEST pidof glusterd ++TEST init_n_bricks 1 ++TEST setup_lvm 1 ++ ++TEST $CLI volume create $V0 $H0:$L1 ++TEST $CLI volume start $V0 ++ ++$CLI volume info ++ ++TEST $CLI volume set $V0 features.shard on ++TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 ++ ++#Setting the size in percentage ++TEST $CLI volume set $V0 storage.reserve 40 ++ ++#wait 5s to reset disk_space_full flag ++sleep 5 ++ ++TEST touch $M0/test ++TEST unlink $M0/test ++ ++TEST dd if=/dev/zero of=$M0/a bs=80M count=1 ++TEST dd if=/dev/zero of=$M0/b bs=10M count=1 ++ ++gfid_new=$(get_gfid_string $M0/a) ++ ++# Wait 5s to update disk_space_full flag because thread check disk space ++# after every 5s ++ ++sleep 5 ++# setup_lvm create lvm partition of 150M and 40M are reserve so after ++# consuming more than 110M next unlink should not fail ++# Delete the base shard and check shards get cleaned up ++TEST unlink $M0/a ++TEST ! 
stat $M0/a ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index d1d7d7a..8d4a970 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -4078,6 +4078,16 @@ shard_create_marker_file_under_remove_me(call_frame_t *frame, xlator_t *this, + SHARD_INODE_CREATE_INIT(this, bs, xattr_req, &local->newloc, + local->prebuf.ia_size, 0, err); + ++ /* Mark this as an internal operation, so that in case of disk full, ++ * the marker file will be created as part of reserve space */ ++ ret = dict_set_int32_sizen(xattr_req, GLUSTERFS_INTERNAL_FOP_KEY, 1); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set key: %s on path %s", GLUSTERFS_INTERNAL_FOP_KEY, ++ local->newloc.path); ++ goto err; ++ } ++ + STACK_WIND(frame, shard_create_marker_file_under_remove_me_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, + &local->newloc, 0, 0, 0644, xattr_req); +@@ -5843,6 +5853,16 @@ shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, + + SHARD_SET_ROOT_FS_ID(frame, local); + ++ /* Mark this as an internal operation, so that in case of disk full ++ * the internal dir will be created as part of reserve space */ ++ ret = dict_set_int32_sizen(xattr_req, GLUSTERFS_INTERNAL_FOP_KEY, 1); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set key: %s on path %s", GLUSTERFS_INTERNAL_FOP_KEY, ++ loc->path); ++ goto err; ++ } ++ + STACK_WIND_COOKIE(frame, shard_mkdir_internal_dir_cbk, (void *)(long)type, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, + 0755, 0, xattr_req); +-- +1.8.3.1 + diff --git a/SOURCES/0574-features-shard-delay-unlink-of-a-file-that-has-fd_co.patch b/SOURCES/0574-features-shard-delay-unlink-of-a-file-that-has-fd_co.patch new file mode 100644 index 0000000..810abd4 --- /dev/null +++ b/SOURCES/0574-features-shard-delay-unlink-of-a-file-that-has-fd_co.patch @@ -0,0 +1,712 @@ +From cb0d240004e6d40f8d7f30d177d5970ebc8e25fb Mon Sep 17 00:00:00 2001 +From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com> +Date: Wed, 3 Feb 2021 17:04:25 +0530 +Subject: [PATCH 574/584] features/shard: delay unlink of a file that has + fd_count > 0 + +When there are multiple processes working on a file and if any +process unlinks that file then unlink operation shouldn't harm +other processes working on it. This is a posix a compliant +behavior and this should be supported when shard feature is +enabled also. + +Problem description: +Let's consider 2 clients C1 and C2 working on a file F1 with 5 +shards on gluster mount and gluster server has 4 bricks +B1, B2, B3, B4. + +Assume that base file/shard is present on B1, 1st, 2nd shards +on B2, 3rd and 4th shards on B3 and 5th shard falls on B4 C1 +has opened the F1 in append mode and is writing to it. The +write FOP goes to 5th shard in this case. So the +inode->fd_count = 1 on B1(base file) and B4 (5th shard). + +C2 at the same time issued unlink to F1. On the server, the +base file has fd_count = 1 (since C1 has opened the file), +the base file is renamed under .glusterfs/unlink and +returned to C2. Then unlink will be sent to shards on all +bricks and shards on B2 and B3 will be deleted which have +no open reference yet. C1 starts getting errors while +accessing the remaining shards though it has open references +for the file. + +This is one such undefined behavior. 
Likewise we will +encounter many such undefined behaviors as we dont have one +global lock to access all shards as one. Of Course having such +global lock will lead to performance hit as it reduces window +for parallel access of shards. + +Solution: +The above undefined behavior can be addressed by delaying the +unlink of a file when there are open references on it. +File unlink happens in 2 steps. +step 1: client creates marker file under .shard/remove_me and +sends unlink on base file to the server +step 2: on return from the server, the associated shards will +be cleaned up and finally marker file will be removed. + +In step 2, the back ground deletion process does nameless +lookup using marker file name (marker file is named after the +gfid of the base file) in glusterfs/unlink dir. If the nameless +look up is successful then that means the gfid still has open +fds and deletion of shards has to be delayed. If nameless +lookup fails then that indicates the gfid is unlinked and no +open fds on that file (the gfid path is unlinked during final +close on the file). The shards on which deletion is delayed +are unlinked one the all open fds are closed and this is +done through a thread which wakes up every 10 mins. + +Also removed active_fd_count from inode structure and +referring fd_count wherever active_fd_count was used. + +Backport of: +> Upstream-patch: https://github.com/gluster/glusterfs/pull/1563 +> Fixes: #1358 +> Change-Id: I8985093386e26215e0b0dce294c534a66f6ca11c +> Signed-off-by: Vinayakswami Hariharmath + +BUG: 1782428 +Change-Id: I8985093386e26215e0b0dce294c534a66f6ca11c +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244967 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/glusterfs.h | 1 + + tests/bugs/shard/issue-1358.t | 100 +++++++++++++ + tests/bugs/shard/unlinks-and-renames.t | 5 + + xlators/features/shard/src/shard.c | 199 ++++++++++++++++++++++++- + xlators/features/shard/src/shard.h | 11 ++ + xlators/storage/posix/src/posix-entry-ops.c | 36 +++++ + xlators/storage/posix/src/posix-inode-fd-ops.c | 64 +++++--- + 7 files changed, 391 insertions(+), 25 deletions(-) + create mode 100644 tests/bugs/shard/issue-1358.t + +diff --git a/libglusterfs/src/glusterfs/glusterfs.h b/libglusterfs/src/glusterfs/glusterfs.h +index d3400bf..4401cf6 100644 +--- a/libglusterfs/src/glusterfs/glusterfs.h ++++ b/libglusterfs/src/glusterfs/glusterfs.h +@@ -261,6 +261,7 @@ enum gf_internal_fop_indicator { + #define GF_XATTROP_PURGE_INDEX "glusterfs.xattrop-purge-index" + + #define GF_GFIDLESS_LOOKUP "gfidless-lookup" ++#define GF_UNLINKED_LOOKUP "unlinked-lookup" + /* replace-brick and pump related internal xattrs */ + #define RB_PUMP_CMD_START "glusterfs.pump.start" + #define RB_PUMP_CMD_PAUSE "glusterfs.pump.pause" +diff --git a/tests/bugs/shard/issue-1358.t b/tests/bugs/shard/issue-1358.t +new file mode 100644 +index 0000000..1838e06 +--- /dev/null ++++ b/tests/bugs/shard/issue-1358.t +@@ -0,0 +1,100 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. 
$(dirname $0)/../../volume.rc ++ ++cleanup; ++ ++FILE_COUNT_TIME=5 ++ ++function get_file_count { ++ ls $1* | wc -l ++} ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++TEST mkdir $M0/dir ++TEST dd if=/dev/urandom of=$M0/dir/foo bs=4M count=5 ++gfid_new=$(get_gfid_string $M0/dir/foo) ++ ++# Ensure its shards dir is created now. ++TEST stat $B0/${V0}0/.shard/$gfid_new.1 ++TEST stat $B0/${V0}1/.shard/$gfid_new.1 ++TEST stat $B0/${V0}0/.shard/$gfid_new.2 ++TEST stat $B0/${V0}1/.shard/$gfid_new.2 ++ ++# Open a file and store descriptor in fd = 5 ++exec 5>$M0/dir/foo ++ ++# Write something on the file using the open fd = 5 ++echo "issue-1358" >&5 ++ ++# Write on the descriptor should be succesful ++EXPECT 0 echo $? ++ ++# Unlink the same file which is opened in prev step ++TEST unlink $M0/dir/foo ++ ++# Check the base file ++TEST ! stat $M0/dir/foo ++TEST ! stat $B0/${V0}0/foo ++TEST ! stat $B0/${V0}1/foo ++ ++# Write something on the file using the open fd = 5 ++echo "issue-1281" >&5 ++ ++# Write on the descriptor should be succesful ++EXPECT 0 echo $? ++ ++# Check ".shard/.remove_me" ++EXPECT_WITHIN $FILE_COUNT_TIME 1 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_new ++EXPECT_WITHIN $FILE_COUNT_TIME 1 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_new ++ ++# Close the fd = 5 ++exec 5>&- ++ ++###### To see the shards deleted, wait for 10 mins or repeat the same steps i.e open a file ##### ++###### write something to it, unlink it and close it. This will wake up the thread that is ###### ++###### responsible to delete the shards ++ ++TEST touch $M0/dir/new ++exec 6>$M0/dir/new ++echo "issue-1358" >&6 ++EXPECT 0 echo $? ++TEST unlink $M0/dir/new ++exec 6>&- ++ ++# Now check the ".shard/remove_me" and the gfid will not be there ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_new ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_new ++ ++# check for the absence of shards ++TEST ! stat $B0/${V0}0/.shard/$gfid_new.1 ++TEST ! stat $B0/${V0}1/.shard/$gfid_new.1 ++TEST ! stat $B0/${V0}0/.shard/$gfid_new.2 ++TEST ! stat $B0/${V0}1/.shard/$gfid_new.2 ++ ++#### Create the file with same name and check creation and deletion works fine ###### ++TEST dd if=/dev/urandom of=$M0/dir/foo bs=4M count=5 ++gfid_new=$(get_gfid_string $M0/dir/foo) ++ ++# Ensure its shards dir is created now. 
++TEST stat $B0/${V0}0/.shard/$gfid_new.1 ++TEST stat $B0/${V0}1/.shard/$gfid_new.1 ++TEST stat $B0/${V0}0/.shard/$gfid_new.2 ++TEST stat $B0/${V0}1/.shard/$gfid_new.2 ++ ++TEST unlink $M0/dir/foo ++cleanup ++ +diff --git a/tests/bugs/shard/unlinks-and-renames.t b/tests/bugs/shard/unlinks-and-renames.t +index 990ca69..3280fcb 100644 +--- a/tests/bugs/shard/unlinks-and-renames.t ++++ b/tests/bugs/shard/unlinks-and-renames.t +@@ -24,6 +24,11 @@ TEST pidof glusterd + TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} + TEST $CLI volume set $V0 features.shard on + TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.write-behind off ++ + TEST $CLI volume start $V0 + TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 8d4a970..b828ff9 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -1242,7 +1242,8 @@ out: + + static inode_t * + shard_link_internal_dir_inode(shard_local_t *local, inode_t *inode, +- struct iatt *buf, shard_internal_dir_type_t type) ++ xlator_t *this, struct iatt *buf, ++ shard_internal_dir_type_t type) + { + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; +@@ -1250,7 +1251,7 @@ shard_link_internal_dir_inode(shard_local_t *local, inode_t *inode, + inode_t **priv_inode = NULL; + inode_t *parent = NULL; + +- priv = THIS->private; ++ priv = this->private; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: +@@ -1294,7 +1295,7 @@ shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie, + /* To-Do: Fix refcount increment per call to + * shard_link_internal_dir_inode(). 
+ */ +- linked_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ linked_inode = shard_link_internal_dir_inode(local, inode, this, buf, type); + shard_inode_ctx_mark_dir_refreshed(linked_inode, this); + out: + shard_common_resolve_shards(frame, this, local->post_res_handler); +@@ -1383,7 +1384,7 @@ shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + goto unwind; + } + +- link_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ link_inode = shard_link_internal_dir_inode(local, inode, this, buf, type); + if (link_inode != inode) { + shard_refresh_internal_dir(frame, this, type); + } else { +@@ -3586,7 +3587,8 @@ shard_resolve_internal_dir(xlator_t *this, shard_local_t *local, + "Lookup on %s failed, exiting", bname); + goto err; + } else { +- shard_link_internal_dir_inode(local, loc->inode, &stbuf, type); ++ shard_link_internal_dir_inode(local, loc->inode, this, &stbuf, ++ type); + } + } + ret = 0; +@@ -3633,6 +3635,45 @@ err: + return ret; + } + ++static int ++shard_nameless_lookup_base_file(xlator_t *this, char *gfid) ++{ ++ int ret = 0; ++ loc_t loc = { ++ 0, ++ }; ++ dict_t *xattr_req = dict_new(); ++ if (!xattr_req) { ++ ret = -1; ++ goto out; ++ } ++ ++ loc.inode = inode_new(this->itable); ++ if (loc.inode == NULL) { ++ ret = -1; ++ goto out; ++ } ++ ++ ret = gf_uuid_parse(gfid, loc.gfid); ++ if (ret < 0) ++ goto out; ++ ++ ret = dict_set_uint32(xattr_req, GF_UNLINKED_LOOKUP, 1); ++ if (ret < 0) ++ goto out; ++ ++ ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, xattr_req, NULL); ++ if (ret < 0) ++ goto out; ++ ++out: ++ if (xattr_req) ++ dict_unref(xattr_req); ++ loc_wipe(&loc); ++ ++ return ret; ++} ++ + int + shard_delete_shards(void *opaque) + { +@@ -3734,6 +3775,11 @@ shard_delete_shards(void *opaque) + if (ret < 0) + continue; + } ++ ++ ret = shard_nameless_lookup_base_file(this, entry->d_name); ++ if (!ret) ++ continue; ++ + link_inode = inode_link(entry->inode, local->fd->inode, + entry->d_name, &entry->d_stat); + +@@ -4105,6 +4151,9 @@ err: + int + shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); + ++static int ++shard_unlink_handler_spawn(xlator_t *this); ++ + int + shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, +@@ -4126,7 +4175,7 @@ shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + if (xdata) + local->xattr_rsp = dict_ref(xdata); + if (local->cleanup_required) +- shard_start_background_deletion(this); ++ shard_unlink_handler_spawn(this); + } + + if (local->entrylk_frame) { +@@ -5785,7 +5834,7 @@ shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + } + } + +- link_inode = shard_link_internal_dir_inode(local, inode, buf, type); ++ link_inode = shard_link_internal_dir_inode(local, inode, this, buf, type); + if (link_inode != inode) { + shard_refresh_internal_dir(frame, this, type); + } else { +@@ -7098,6 +7147,132 @@ shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + return 0; + } + ++static void ++shard_unlink_wait(shard_unlink_thread_t *ti) ++{ ++ struct timespec wait_till = { ++ 0, ++ }; ++ ++ pthread_mutex_lock(&ti->mutex); ++ { ++ /* shard_unlink_handler() runs every 10 mins of interval */ ++ wait_till.tv_sec = time(NULL) + 600; ++ ++ while (!ti->rerun) { ++ if (pthread_cond_timedwait(&ti->cond, &ti->mutex, &wait_till) == ++ ETIMEDOUT) ++ break; ++ } ++ ti->rerun = _gf_false; ++ } ++ pthread_mutex_unlock(&ti->mutex); ++} ++ ++static void * 
++shard_unlink_handler(void *data) ++{ ++ shard_unlink_thread_t *ti = data; ++ xlator_t *this = ti->this; ++ ++ THIS = this; ++ ++ while (!ti->stop) { ++ shard_start_background_deletion(this); ++ shard_unlink_wait(ti); ++ } ++ return NULL; ++} ++ ++static int ++shard_unlink_handler_spawn(xlator_t *this) ++{ ++ int ret = 0; ++ shard_priv_t *priv = this->private; ++ shard_unlink_thread_t *ti = &priv->thread_info; ++ ++ ti->this = this; ++ ++ pthread_mutex_lock(&ti->mutex); ++ { ++ if (ti->running) { ++ pthread_cond_signal(&ti->cond); ++ } else { ++ ret = gf_thread_create(&ti->thread, NULL, shard_unlink_handler, ti, ++ "shard_unlink"); ++ if (ret < 0) { ++ gf_log(this->name, GF_LOG_ERROR, ++ "Failed to create \"shard_unlink\" thread"); ++ goto unlock; ++ } ++ ti->running = _gf_true; ++ } ++ ++ ti->rerun = _gf_true; ++ } ++unlock: ++ pthread_mutex_unlock(&ti->mutex); ++ return ret; ++} ++ ++static int ++shard_unlink_handler_init(shard_unlink_thread_t *ti) ++{ ++ int ret = 0; ++ xlator_t *this = THIS; ++ ++ ret = pthread_mutex_init(&ti->mutex, NULL); ++ if (ret) { ++ gf_log(this->name, GF_LOG_ERROR, ++ "Failed to init mutex for \"shard_unlink\" thread"); ++ goto out; ++ } ++ ++ ret = pthread_cond_init(&ti->cond, NULL); ++ if (ret) { ++ gf_log(this->name, GF_LOG_ERROR, ++ "Failed to init cond var for \"shard_unlink\" thread"); ++ pthread_mutex_destroy(&ti->mutex); ++ goto out; ++ } ++ ++ ti->running = _gf_false; ++ ti->rerun = _gf_false; ++ ti->stop = _gf_false; ++ ++out: ++ return -ret; ++} ++ ++static void ++shard_unlink_handler_fini(shard_unlink_thread_t *ti) ++{ ++ int ret = 0; ++ xlator_t *this = THIS; ++ if (!ti) ++ return; ++ ++ pthread_mutex_lock(&ti->mutex); ++ if (ti->running) { ++ ti->rerun = _gf_true; ++ ti->stop = _gf_true; ++ pthread_cond_signal(&ti->cond); ++ } ++ pthread_mutex_unlock(&ti->mutex); ++ ++ if (ti->running) { ++ ret = pthread_join(ti->thread, NULL); ++ if (ret) ++ gf_msg(this->name, GF_LOG_WARNING, 0, 0, ++ "Failed to clean up shard unlink thread."); ++ ti->running = _gf_false; ++ } ++ ti->thread = 0; ++ ++ pthread_cond_destroy(&ti->cond); ++ pthread_mutex_destroy(&ti->mutex); ++} ++ + int32_t + mem_acct_init(xlator_t *this) + { +@@ -7164,6 +7339,14 @@ init(xlator_t *this) + this->private = priv; + LOCK_INIT(&priv->lock); + INIT_LIST_HEAD(&priv->ilist_head); ++ ++ ret = shard_unlink_handler_init(&priv->thread_info); ++ if (ret) { ++ gf_log(this->name, GF_LOG_ERROR, ++ "Failed to initialize resources for \"shard_unlink\" thread"); ++ goto out; ++ } ++ + ret = 0; + out: + if (ret) { +@@ -7188,6 +7371,8 @@ fini(xlator_t *this) + if (!priv) + goto out; + ++ shard_unlink_handler_fini(&priv->thread_info); ++ + this->private = NULL; + LOCK_DESTROY(&priv->lock); + GF_FREE(priv); +diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h +index 4fe181b..3dcb112 100644 +--- a/xlators/features/shard/src/shard.h ++++ b/xlators/features/shard/src/shard.h +@@ -207,6 +207,16 @@ typedef enum { + + /* rm = "remove me" */ + ++typedef struct shard_unlink_thread { ++ pthread_mutex_t mutex; ++ pthread_cond_t cond; ++ pthread_t thread; ++ gf_boolean_t running; ++ gf_boolean_t rerun; ++ gf_boolean_t stop; ++ xlator_t *this; ++} shard_unlink_thread_t; ++ + typedef struct shard_priv { + uint64_t block_size; + uuid_t dot_shard_gfid; +@@ -220,6 +230,7 @@ typedef struct shard_priv { + shard_bg_deletion_state_t bg_del_state; + gf_boolean_t first_lookup_done; + uint64_t lru_limit; ++ shard_unlink_thread_t thread_info; + } shard_priv_t; + + typedef struct { +diff --git 
a/xlators/storage/posix/src/posix-entry-ops.c b/xlators/storage/posix/src/posix-entry-ops.c +index b3a5381..1511e68 100644 +--- a/xlators/storage/posix/src/posix-entry-ops.c ++++ b/xlators/storage/posix/src/posix-entry-ops.c +@@ -183,6 +183,11 @@ posix_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) + struct posix_private *priv = NULL; + posix_inode_ctx_t *ctx = NULL; + int ret = 0; ++ uint32_t lookup_unlink_dir = 0; ++ char *unlink_path = NULL; ++ struct stat lstatbuf = { ++ 0, ++ }; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); +@@ -208,7 +213,36 @@ posix_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) + op_ret = -1; + if (gf_uuid_is_null(loc->pargfid) || (loc->name == NULL)) { + /* nameless lookup */ ++ op_ret = op_errno = errno = 0; + MAKE_INODE_HANDLE(real_path, this, loc, &buf); ++ ++ /* The gfid will be renamed to ".glusterfs/unlink" in case ++ * there are any open fds on the file in posix_unlink path. ++ * So client can request server to do nameless lookup with ++ * xdata = GF_UNLINKED_LOOKUP in ".glusterfs/unlink" ++ * dir if a client wants to know the status of the all open fds ++ * on the unlinked file. If the file still present in the ++ * ".glusterfs/unlink" dir then it indicates there still ++ * open fds present on the file and the file is still under ++ * unlink process */ ++ if (op_ret < 0 && errno == ENOENT) { ++ ret = dict_get_uint32(xdata, GF_UNLINKED_LOOKUP, ++ &lookup_unlink_dir); ++ if (!ret && lookup_unlink_dir) { ++ op_ret = op_errno = errno = 0; ++ POSIX_GET_FILE_UNLINK_PATH(priv->base_path, loc->gfid, ++ unlink_path); ++ ret = sys_lstat(unlink_path, &lstatbuf); ++ if (ret) { ++ op_ret = -1; ++ op_errno = errno; ++ } else { ++ iatt_from_stat(&buf, &lstatbuf); ++ buf.ia_nlink = 0; ++ } ++ goto nameless_lookup_unlink_dir_out; ++ } ++ } + } else { + MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, &buf); + if (!real_path || !par_path) { +@@ -328,6 +362,8 @@ out: + + if (op_ret == 0) + op_errno = 0; ++ ++nameless_lookup_unlink_dir_out: + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, + (loc) ? 
loc->inode : NULL, &buf, xattr, &postparent); + +diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c +index 761e018..4c2983a 100644 +--- a/xlators/storage/posix/src/posix-inode-fd-ops.c ++++ b/xlators/storage/posix/src/posix-inode-fd-ops.c +@@ -2504,6 +2504,39 @@ out: + return 0; + } + ++static int ++posix_unlink_renamed_file(xlator_t *this, inode_t *inode) ++{ ++ int ret = 0; ++ char *unlink_path = NULL; ++ uint64_t ctx_uint = 0; ++ posix_inode_ctx_t *ctx = NULL; ++ struct posix_private *priv = this->private; ++ ++ ret = inode_ctx_get(inode, this, &ctx_uint); ++ ++ if (ret < 0) ++ goto out; ++ ++ ctx = (posix_inode_ctx_t *)(uintptr_t)ctx_uint; ++ ++ if (ctx->unlink_flag == GF_UNLINK_TRUE) { ++ POSIX_GET_FILE_UNLINK_PATH(priv->base_path, inode->gfid, unlink_path); ++ if (!unlink_path) { ++ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_UNLINK_FAILED, ++ "Failed to remove gfid :%s", uuid_utoa(inode->gfid)); ++ ret = -1; ++ } else { ++ ret = sys_unlink(unlink_path); ++ if (!ret) ++ ctx->unlink_flag = GF_UNLINK_FALSE; ++ } ++ } ++ ++out: ++ return ret; ++} ++ + int32_t + posix_release(xlator_t *this, fd_t *fd) + { +@@ -2514,6 +2547,9 @@ posix_release(xlator_t *this, fd_t *fd) + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd, out); + ++ if (fd->inode->active_fd_count == 0) ++ posix_unlink_renamed_file(this, fd->inode); ++ + ret = fd_ctx_del(fd, this, &tmp_pfd); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_PFD_NULL, +@@ -5881,41 +5917,33 @@ posix_forget(xlator_t *this, inode_t *inode) + uint64_t ctx_uint1 = 0; + uint64_t ctx_uint2 = 0; + posix_inode_ctx_t *ctx = NULL; +- posix_mdata_t *mdata = NULL; +- struct posix_private *priv_posix = NULL; +- +- priv_posix = (struct posix_private *)this->private; +- if (!priv_posix) +- return 0; ++ struct posix_private *priv = this->private; + + ret = inode_ctx_del2(inode, this, &ctx_uint1, &ctx_uint2); ++ ++ if (ctx_uint2) ++ GF_FREE((posix_mdata_t *)(uintptr_t)ctx_uint2); ++ + if (!ctx_uint1) +- goto check_ctx2; ++ return 0; + + ctx = (posix_inode_ctx_t *)(uintptr_t)ctx_uint1; + + if (ctx->unlink_flag == GF_UNLINK_TRUE) { +- POSIX_GET_FILE_UNLINK_PATH(priv_posix->base_path, inode->gfid, +- unlink_path); ++ POSIX_GET_FILE_UNLINK_PATH(priv->base_path, inode->gfid, unlink_path); + if (!unlink_path) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_UNLINK_FAILED, + "Failed to remove gfid :%s", uuid_utoa(inode->gfid)); + ret = -1; +- goto ctx_free; ++ } else { ++ ret = sys_unlink(unlink_path); + } +- ret = sys_unlink(unlink_path); + } +-ctx_free: ++ + pthread_mutex_destroy(&ctx->xattrop_lock); + pthread_mutex_destroy(&ctx->write_atomic_lock); + pthread_mutex_destroy(&ctx->pgfid_lock); + GF_FREE(ctx); + +-check_ctx2: +- if (ctx_uint2) { +- mdata = (posix_mdata_t *)(uintptr_t)ctx_uint2; +- } +- +- GF_FREE(mdata); + return ret; + } +-- +1.8.3.1 + diff --git a/SOURCES/0575-libglusterfs-add-functions-to-calculate-time-differe.patch b/SOURCES/0575-libglusterfs-add-functions-to-calculate-time-differe.patch new file mode 100644 index 0000000..98ffc3c --- /dev/null +++ b/SOURCES/0575-libglusterfs-add-functions-to-calculate-time-differe.patch @@ -0,0 +1,160 @@ +From 59e69ae1c7ccda74a8cbf8c9b2ae37bc74cbf612 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Fri, 4 Jun 2021 10:55:37 +0530 +Subject: [PATCH 575/584] libglusterfs: add functions to calculate time + difference + +Add gf_tvdiff() and gf_tsdiff() to calculate the difference +between 'struct timeval' and 'struct timespec' values, use 
+them where appropriate. + +Upstream patch details: +> https://github.com/gluster/glusterfs/commit/ba7f24b1cedf2549394c21b3f0df1661227cefae +> Change-Id: I172be06ee84e99a1da76847c15e5ea3fbc059338 +> Signed-off-by: Dmitry Antipov +> Updates: #1002 + +BUG: 1928676 +Change-Id: I723ab9555b0f8caef108742acc2cb63d6a32eb96 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245294 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + glusterfsd/src/glusterfsd-mgmt.c | 4 ++-- + libglusterfs/src/glusterfs/common-utils.h | 32 +++++++++++++++++++++++++++++++ + libglusterfs/src/latency.c | 3 +-- + xlators/cluster/dht/src/dht-rebalance.c | 6 ++---- + xlators/debug/io-stats/src/io-stats.c | 8 ++------ + 5 files changed, 39 insertions(+), 14 deletions(-) + +diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c +index 61d1b21..a51dd9e 100644 +--- a/glusterfsd/src/glusterfsd-mgmt.c ++++ b/glusterfsd/src/glusterfsd-mgmt.c +@@ -534,7 +534,7 @@ glusterfs_volume_top_write_perf(uint32_t blk_size, uint32_t blk_count, + } + + gettimeofday(&end, NULL); +- *time = (end.tv_sec - begin.tv_sec) * 1e6 + (end.tv_usec - begin.tv_usec); ++ *time = gf_tvdiff(&begin, &end); + *throughput = total_blks / *time; + gf_log("glusterd", GF_LOG_INFO, + "Throughput %.2f Mbps time %.2f secs " +@@ -653,7 +653,7 @@ glusterfs_volume_top_read_perf(uint32_t blk_size, uint32_t blk_count, + } + + gettimeofday(&end, NULL); +- *time = (end.tv_sec - begin.tv_sec) * 1e6 + (end.tv_usec - begin.tv_usec); ++ *time = gf_tvdiff(&begin, &end); + *throughput = total_blks / *time; + gf_log("glusterd", GF_LOG_INFO, + "Throughput %.2f Mbps time %.2f secs " +diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h +index 604afd0..bd48b6f 100644 +--- a/libglusterfs/src/glusterfs/common-utils.h ++++ b/libglusterfs/src/glusterfs/common-utils.h +@@ -1090,4 +1090,36 @@ find_xlator_option_in_cmd_args_t(const char *option_name, cmd_args_t *args); + int + gf_d_type_from_ia_type(ia_type_t type); + ++/* Return delta value in microseconds. */ ++ ++static inline double ++gf_tvdiff(struct timeval *start, struct timeval *end) ++{ ++ struct timeval t; ++ ++ if (start->tv_usec > end->tv_usec) ++ t.tv_sec = end->tv_sec - 1, t.tv_usec = end->tv_usec + 1000000; ++ else ++ t.tv_sec = end->tv_sec, t.tv_usec = end->tv_usec; ++ ++ return (double)(t.tv_sec - start->tv_sec) * 1e6 + ++ (double)(t.tv_usec - start->tv_usec); ++} ++ ++/* Return delta value in nanoseconds. 
*/ ++ ++static inline double ++gf_tsdiff(struct timespec *start, struct timespec *end) ++{ ++ struct timespec t; ++ ++ if (start->tv_nsec > end->tv_nsec) ++ t.tv_sec = end->tv_sec - 1, t.tv_nsec = end->tv_nsec + 1000000000; ++ else ++ t.tv_sec = end->tv_sec, t.tv_nsec = end->tv_nsec; ++ ++ return (double)(t.tv_sec - start->tv_sec) * 1e9 + ++ (double)(t.tv_nsec - start->tv_nsec); ++} ++ + #endif /* _COMMON_UTILS_H */ +diff --git a/libglusterfs/src/latency.c b/libglusterfs/src/latency.c +index e1e6de7..ce61399 100644 +--- a/libglusterfs/src/latency.c ++++ b/libglusterfs/src/latency.c +@@ -33,8 +33,7 @@ gf_update_latency(call_frame_t *frame) + if (!(begin->tv_sec && end->tv_sec)) + goto out; + +- elapsed = (end->tv_sec - begin->tv_sec) * 1e9 + +- (end->tv_nsec - begin->tv_nsec); ++ elapsed = gf_tsdiff(begin, end); + + if (frame->op < 0 || frame->op >= GF_FOP_MAXVALUE) { + gf_log("[core]", GF_LOG_WARNING, "Invalid frame op value: %d", +diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c +index eab7558..e07dec0 100644 +--- a/xlators/cluster/dht/src/dht-rebalance.c ++++ b/xlators/cluster/dht/src/dht-rebalance.c +@@ -2927,8 +2927,7 @@ gf_defrag_migrate_single_file(void *opaque) + + if (defrag->stats == _gf_true) { + gettimeofday(&end, NULL); +- elapsed = (end.tv_sec - start.tv_sec) * 1e6 + +- (end.tv_usec - start.tv_usec); ++ elapsed = gf_tvdiff(&start, &end); + gf_log(this->name, GF_LOG_INFO, + "Migration of " + "file:%s size:%" PRIu64 +@@ -3529,8 +3528,7 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + } + + gettimeofday(&end, NULL); +- elapsed = (end.tv_sec - dir_start.tv_sec) * 1e6 + +- (end.tv_usec - dir_start.tv_usec); ++ elapsed = gf_tvdiff(&dir_start, &end); + gf_log(this->name, GF_LOG_INFO, + "Migration operation on dir %s took " + "%.2f secs", +diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c +index 9b34895..8ad96fb 100644 +--- a/xlators/debug/io-stats/src/io-stats.c ++++ b/xlators/debug/io-stats/src/io-stats.c +@@ -281,9 +281,7 @@ is_fop_latency_started(call_frame_t *frame) + begin = &frame->begin; \ + end = &frame->end; \ + \ +- elapsed = ((end->tv_sec - begin->tv_sec) * 1e9 + \ +- (end->tv_nsec - begin->tv_nsec)) / \ +- 1000; \ ++ elapsed = gf_tsdiff(begin, end) / 1000.0; \ + throughput = op_ret / elapsed; \ + \ + conf = this->private; \ +@@ -1774,9 +1772,7 @@ update_ios_latency(struct ios_conf *conf, call_frame_t *frame, + begin = &frame->begin; + end = &frame->end; + +- elapsed = ((end->tv_sec - begin->tv_sec) * 1e9 + +- (end->tv_nsec - begin->tv_nsec)) / +- 1000; ++ elapsed = gf_tsdiff(begin, end) / 1000.0; + + update_ios_latency_stats(&conf->cumulative, elapsed, op); + update_ios_latency_stats(&conf->incremental, elapsed, op); +-- +1.8.3.1 + diff --git a/SOURCES/0576-rpcsvc-Add-latency-tracking-for-rpc-programs.patch b/SOURCES/0576-rpcsvc-Add-latency-tracking-for-rpc-programs.patch new file mode 100644 index 0000000..6883559 --- /dev/null +++ b/SOURCES/0576-rpcsvc-Add-latency-tracking-for-rpc-programs.patch @@ -0,0 +1,573 @@ +From f2b9d3a089cc9ff9910da0075defe306851aca5c Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Fri, 4 Jun 2021 12:27:57 +0530 +Subject: [PATCH 576/584] rpcsvc: Add latency tracking for rpc programs + +Added latency tracking of rpc-handling code. With this change we +should be able to monitor the amount of time rpc-handling code is +consuming for each of the rpc call. 
+ +Upstream patch details: +> https://review.gluster.org/#/c/glusterfs/+/24955/ +> fixes: #1466 +> Change-Id: I04fc7f3b12bfa5053c0fc36885f271cb78f581cd +> Signed-off-by: Pranith Kumar K + +BUG: 1928676 +Change-Id: Ibcedddb5db3ff4906607050cf9f7ea3ebb266cc5 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245295 +Tested-by: RHGS Build Bot +Reviewed-by: Xavi Hernandez Juan +Reviewed-by: Ashish Pandey +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/latency.h | 22 +++++--- + libglusterfs/src/glusterfs/mem-types.h | 1 + + libglusterfs/src/glusterfs/stack.h | 7 +-- + libglusterfs/src/glusterfs/statedump.h | 2 + + libglusterfs/src/glusterfs/xlator.h | 2 +- + libglusterfs/src/latency.c | 93 +++++++++++++++------------------- + libglusterfs/src/libglusterfs.sym | 5 ++ + libglusterfs/src/monitoring.c | 8 +-- + libglusterfs/src/statedump.c | 38 +++++++++++++- + libglusterfs/src/xlator.c | 5 ++ + rpc/rpc-lib/src/libgfrpc.sym | 1 + + rpc/rpc-lib/src/rpcsvc.c | 72 +++++++++++++++++++++++++- + rpc/rpc-lib/src/rpcsvc.h | 5 ++ + xlators/protocol/server/src/server.c | 2 + + 14 files changed, 193 insertions(+), 70 deletions(-) + +diff --git a/libglusterfs/src/glusterfs/latency.h b/libglusterfs/src/glusterfs/latency.h +index ed47b1f..4d601bb 100644 +--- a/libglusterfs/src/glusterfs/latency.h ++++ b/libglusterfs/src/glusterfs/latency.h +@@ -11,13 +11,23 @@ + #ifndef __LATENCY_H__ + #define __LATENCY_H__ + +-#include "glusterfs/glusterfs.h" ++#include ++#include + +-typedef struct fop_latency { +- double min; /* min time for the call (microseconds) */ +- double max; /* max time for the call (microseconds) */ +- double total; /* total time (microseconds) */ ++typedef struct _gf_latency { ++ uint64_t min; /* min time for the call (nanoseconds) */ ++ uint64_t max; /* max time for the call (nanoseconds) */ ++ uint64_t total; /* total time (nanoseconds) */ + uint64_t count; +-} fop_latency_t; ++} gf_latency_t; + ++gf_latency_t * ++gf_latency_new(size_t n); ++ ++void ++gf_latency_reset(gf_latency_t *lat); ++ ++void ++gf_latency_update(gf_latency_t *lat, struct timespec *begin, ++ struct timespec *end); + #endif /* __LATENCY_H__ */ +diff --git a/libglusterfs/src/glusterfs/mem-types.h b/libglusterfs/src/glusterfs/mem-types.h +index 92730a9..970b9ff 100644 +--- a/libglusterfs/src/glusterfs/mem-types.h ++++ b/libglusterfs/src/glusterfs/mem-types.h +@@ -139,6 +139,7 @@ enum gf_common_mem_types_ { + gf_common_mt_mgmt_v3_lock_timer_t, /* used only in one location */ + gf_common_mt_server_cmdline_t, /* used only in one location */ + gf_mt_gfdb_query_record_t, ++ gf_common_mt_latency_t, + gf_common_mt_end + }; + #endif +diff --git a/libglusterfs/src/glusterfs/stack.h b/libglusterfs/src/glusterfs/stack.h +index bd466d8..536a330 100644 +--- a/libglusterfs/src/glusterfs/stack.h ++++ b/libglusterfs/src/glusterfs/stack.h +@@ -45,6 +45,9 @@ typedef int32_t (*ret_fn_t)(call_frame_t *frame, call_frame_t *prev_frame, + xlator_t *this, int32_t op_ret, int32_t op_errno, + ...); + ++void ++gf_frame_latency_update(call_frame_t *frame); ++ + struct call_pool { + union { + struct list_head all_frames; +@@ -149,8 +152,6 @@ struct _call_stack { + } while (0); + + struct xlator_fops; +-void +-gf_update_latency(call_frame_t *frame); + + static inline void + FRAME_DESTROY(call_frame_t *frame) +@@ -158,7 +159,7 @@ FRAME_DESTROY(call_frame_t *frame) + void *local = NULL; + + if (frame->root->ctx->measure_latency) +- gf_update_latency(frame); ++ 
gf_frame_latency_update(frame); + + list_del_init(&frame->frames); + if (frame->local) { +diff --git a/libglusterfs/src/glusterfs/statedump.h b/libglusterfs/src/glusterfs/statedump.h +index 89d04f9..ce08270 100644 +--- a/libglusterfs/src/glusterfs/statedump.h ++++ b/libglusterfs/src/glusterfs/statedump.h +@@ -127,4 +127,6 @@ gf_proc_dump_xlator_meminfo(xlator_t *this, strfd_t *strfd); + void + gf_proc_dump_xlator_profile(xlator_t *this, strfd_t *strfd); + ++void ++gf_latency_statedump_and_reset(char *key, gf_latency_t *lat); + #endif /* STATEDUMP_H */ +diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h +index 273039a..ecb9fa4 100644 +--- a/libglusterfs/src/glusterfs/xlator.h ++++ b/libglusterfs/src/glusterfs/xlator.h +@@ -808,7 +808,7 @@ struct _xlator { + + struct { + /* for latency measurement */ +- fop_latency_t latencies[GF_FOP_MAXVALUE]; ++ gf_latency_t latencies[GF_FOP_MAXVALUE]; + /* for latency measurement */ + fop_metrics_t metrics[GF_FOP_MAXVALUE]; + +diff --git a/libglusterfs/src/latency.c b/libglusterfs/src/latency.c +index ce61399..ce4b0e8 100644 +--- a/libglusterfs/src/latency.c ++++ b/libglusterfs/src/latency.c +@@ -14,39 +14,34 @@ + */ + + #include "glusterfs/glusterfs.h" +-#include "glusterfs/xlator.h" +-#include "glusterfs/common-utils.h" + #include "glusterfs/statedump.h" +-#include "glusterfs/libglusterfs-messages.h" + +-void +-gf_update_latency(call_frame_t *frame) ++gf_latency_t * ++gf_latency_new(size_t n) + { +- double elapsed; +- struct timespec *begin, *end; +- +- fop_latency_t *lat; +- +- begin = &frame->begin; +- end = &frame->end; ++ int i = 0; ++ gf_latency_t *lat = NULL; + +- if (!(begin->tv_sec && end->tv_sec)) +- goto out; ++ lat = GF_MALLOC(n * sizeof(*lat), gf_common_mt_latency_t); ++ if (!lat) ++ return NULL; + +- elapsed = gf_tsdiff(begin, end); ++ for (i = 0; i < n; i++) { ++ gf_latency_reset(lat + i); ++ } ++ return lat; ++} + +- if (frame->op < 0 || frame->op >= GF_FOP_MAXVALUE) { +- gf_log("[core]", GF_LOG_WARNING, "Invalid frame op value: %d", +- frame->op); ++void ++gf_latency_update(gf_latency_t *lat, struct timespec *begin, ++ struct timespec *end) ++{ ++ if (!(begin->tv_sec && end->tv_sec)) { ++ /*Measure latency might have been enabled/disabled during the op*/ + return; + } + +- /* Can happen mostly at initiator xlator, as STACK_WIND/UNWIND macros +- set it right anyways for those frames */ +- if (!frame->op) +- frame->op = frame->root->op; +- +- lat = &frame->this->stats.interval.latencies[frame->op]; ++ double elapsed = gf_tsdiff(begin, end); + + if (lat->max < elapsed) + lat->max = elapsed; +@@ -56,40 +51,34 @@ gf_update_latency(call_frame_t *frame) + + lat->total += elapsed; + lat->count++; +-out: +- return; + } + + void +-gf_proc_dump_latency_info(xlator_t *xl) ++gf_latency_reset(gf_latency_t *lat) + { +- char key_prefix[GF_DUMP_MAX_BUF_LEN]; +- char key[GF_DUMP_MAX_BUF_LEN]; +- int i; +- +- snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.latency", xl->name); +- gf_proc_dump_add_section("%s", key_prefix); +- +- for (i = 0; i < GF_FOP_MAXVALUE; i++) { +- gf_proc_dump_build_key(key, key_prefix, "%s", (char *)gf_fop_list[i]); +- +- fop_latency_t *lat = &xl->stats.interval.latencies[i]; ++ if (!lat) ++ return; ++ memset(lat, 0, sizeof(*lat)); ++ lat->min = ULLONG_MAX; ++ /* make sure 'min' is set to high value, so it would be ++ properly set later */ ++} + +- /* Doesn't make sense to continue if there are no fops +- came in the given interval */ +- if (!lat->count) +- continue; ++void 
++gf_frame_latency_update(call_frame_t *frame) ++{ ++ gf_latency_t *lat; ++ /* Can happen mostly at initiator xlator, as STACK_WIND/UNWIND macros ++ set it right anyways for those frames */ ++ if (!frame->op) ++ frame->op = frame->root->op; + +- gf_proc_dump_write(key, "%.03f,%" PRId64 ",%.03f", +- (lat->total / lat->count), lat->count, lat->total); ++ if (frame->op < 0 || frame->op >= GF_FOP_MAXVALUE) { ++ gf_log("[core]", GF_LOG_WARNING, "Invalid frame op value: %d", ++ frame->op); ++ return; + } + +- memset(xl->stats.interval.latencies, 0, +- sizeof(xl->stats.interval.latencies)); +- +- /* make sure 'min' is set to high value, so it would be +- properly set later */ +- for (i = 0; i < GF_FOP_MAXVALUE; i++) { +- xl->stats.interval.latencies[i].min = 0xffffffff; +- } ++ lat = &frame->this->stats.interval.latencies[frame->op]; ++ gf_latency_update(lat, &frame->begin, &frame->end); + } +diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym +index 9072afa..4f968e1 100644 +--- a/libglusterfs/src/libglusterfs.sym ++++ b/libglusterfs/src/libglusterfs.sym +@@ -1183,3 +1183,8 @@ gf_latency_reset + gf_latency_update + gf_frame_latency_update + gf_assert ++gf_latency_statedump_and_reset ++gf_latency_new ++gf_latency_reset ++gf_latency_update ++gf_frame_latency_update +diff --git a/libglusterfs/src/monitoring.c b/libglusterfs/src/monitoring.c +index 6d9bfb1..20b7f52 100644 +--- a/libglusterfs/src/monitoring.c ++++ b/libglusterfs/src/monitoring.c +@@ -113,15 +113,15 @@ dump_latency_and_count(xlator_t *xl, int fd) + dprintf(fd, "%s.interval.%s.fail_count %" PRIu64 "\n", xl->name, + gf_fop_list[index], cbk); + } +- if (xl->stats.interval.latencies[index].count != 0.0) { ++ if (xl->stats.interval.latencies[index].count != 0) { + dprintf(fd, "%s.interval.%s.latency %lf\n", xl->name, + gf_fop_list[index], +- (xl->stats.interval.latencies[index].total / ++ (((double)xl->stats.interval.latencies[index].total) / + xl->stats.interval.latencies[index].count)); +- dprintf(fd, "%s.interval.%s.max %lf\n", xl->name, ++ dprintf(fd, "%s.interval.%s.max %" PRIu64 "\n", xl->name, + gf_fop_list[index], + xl->stats.interval.latencies[index].max); +- dprintf(fd, "%s.interval.%s.min %lf\n", xl->name, ++ dprintf(fd, "%s.interval.%s.min %" PRIu64 "\n", xl->name, + gf_fop_list[index], + xl->stats.interval.latencies[index].min); + } +diff --git a/libglusterfs/src/statedump.c b/libglusterfs/src/statedump.c +index d18b50f..4bf4cc2 100644 +--- a/libglusterfs/src/statedump.c ++++ b/libglusterfs/src/statedump.c +@@ -201,6 +201,40 @@ gf_proc_dump_write(char *key, char *value, ...) 
+ return ret; + } + ++void ++gf_latency_statedump_and_reset(char *key, gf_latency_t *lat) ++{ ++ /* Doesn't make sense to continue if there are no fops ++ came in the given interval */ ++ if (!lat || !lat->count) ++ return; ++ gf_proc_dump_write(key, ++ "AVG:%lf CNT:%" PRIu64 " TOTAL:%" PRIu64 " MIN:%" PRIu64 ++ " MAX:%" PRIu64, ++ (((double)lat->total) / lat->count), lat->count, ++ lat->total, lat->min, lat->max); ++ gf_latency_reset(lat); ++} ++ ++void ++gf_proc_dump_xl_latency_info(xlator_t *xl) ++{ ++ char key_prefix[GF_DUMP_MAX_BUF_LEN]; ++ char key[GF_DUMP_MAX_BUF_LEN]; ++ int i; ++ ++ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.latency", xl->name); ++ gf_proc_dump_add_section("%s", key_prefix); ++ ++ for (i = 0; i < GF_FOP_MAXVALUE; i++) { ++ gf_proc_dump_build_key(key, key_prefix, "%s", (char *)gf_fop_list[i]); ++ ++ gf_latency_t *lat = &xl->stats.interval.latencies[i]; ++ ++ gf_latency_statedump_and_reset(key, lat); ++ } ++} ++ + static void + gf_proc_dump_xlator_mem_info(xlator_t *xl) + { +@@ -487,7 +521,7 @@ gf_proc_dump_single_xlator_info(xlator_t *trav) + return; + + if (ctx->measure_latency) +- gf_proc_dump_latency_info(trav); ++ gf_proc_dump_xl_latency_info(trav); + + gf_proc_dump_xlator_mem_info(trav); + +@@ -1024,7 +1058,7 @@ gf_proc_dump_xlator_profile(xlator_t *this, strfd_t *strfd) + { + gf_dump_strfd = strfd; + +- gf_proc_dump_latency_info(this); ++ gf_proc_dump_xl_latency_info(this); + + gf_dump_strfd = NULL; + } +diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c +index 36cc32c..b9ad411 100644 +--- a/libglusterfs/src/xlator.c ++++ b/libglusterfs/src/xlator.c +@@ -246,6 +246,7 @@ xlator_dynload_apis(xlator_t *xl) + void *handle = NULL; + volume_opt_list_t *vol_opt = NULL; + xlator_api_t *xlapi = NULL; ++ int i = 0; + + handle = xl->dlhandle; + +@@ -343,6 +344,10 @@ xlator_dynload_apis(xlator_t *xl) + memcpy(xl->op_version, xlapi->op_version, + sizeof(uint32_t) * GF_MAX_RELEASES); + ++ for (i = 0; i < GF_FOP_MAXVALUE; i++) { ++ gf_latency_reset(&xl->stats.interval.latencies[i]); ++ } ++ + ret = 0; + out: + return ret; +diff --git a/rpc/rpc-lib/src/libgfrpc.sym b/rpc/rpc-lib/src/libgfrpc.sym +index f3544e3..a1757cc 100644 +--- a/rpc/rpc-lib/src/libgfrpc.sym ++++ b/rpc/rpc-lib/src/libgfrpc.sym +@@ -66,3 +66,4 @@ rpc_transport_unix_options_build + rpc_transport_unref + rpc_clnt_mgmt_pmap_signout + rpcsvc_autoscale_threads ++rpcsvc_statedump +diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c +index b031d93..855b512 100644 +--- a/rpc/rpc-lib/src/rpcsvc.c ++++ b/rpc/rpc-lib/src/rpcsvc.c +@@ -25,6 +25,7 @@ + #include + #include "rpc-drc.h" + #include "protocol-common.h" ++#include + + #include + #include +@@ -377,6 +378,10 @@ rpcsvc_program_actor(rpcsvc_request_t *req) + goto err; + } + ++ if (svc->xl->ctx->measure_latency) { ++ timespec_now(&req->begin); ++ } ++ + req->ownthread = program->ownthread; + req->synctask = program->synctask; + +@@ -1526,10 +1531,18 @@ rpcsvc_submit_generic(rpcsvc_request_t *req, struct iovec *proghdr, + size_t hdrlen = 0; + char new_iobref = 0; + rpcsvc_drc_globals_t *drc = NULL; ++ gf_latency_t *lat = NULL; + + if ((!req) || (!req->trans)) + return -1; + ++ if (req->prog && req->begin.tv_sec) { ++ if ((req->procnum >= 0) && (req->procnum < req->prog->numactors)) { ++ timespec_now(&req->end); ++ lat = &req->prog->latencies[req->procnum]; ++ gf_latency_update(lat, &req->begin, &req->end); ++ } ++ } + trans = req->trans; + + for (i = 0; i < hdrcount; i++) { +@@ -1860,6 +1873,15 @@ 
rpcsvc_submit_message(rpcsvc_request_t *req, struct iovec *proghdr, + iobref); + } + ++void ++rpcsvc_program_destroy(rpcsvc_program_t *program) ++{ ++ if (program) { ++ GF_FREE(program->latencies); ++ GF_FREE(program); ++ } ++} ++ + int + rpcsvc_program_unregister(rpcsvc_t *svc, rpcsvc_program_t *program) + { +@@ -1917,8 +1939,7 @@ rpcsvc_program_unregister(rpcsvc_t *svc, rpcsvc_program_t *program) + + ret = 0; + out: +- if (prog) +- GF_FREE(prog); ++ rpcsvc_program_destroy(prog); + + if (ret == -1) { + if (program) { +@@ -2303,6 +2324,11 @@ rpcsvc_program_register(rpcsvc_t *svc, rpcsvc_program_t *program, + } + + memcpy(newprog, program, sizeof(*program)); ++ newprog->latencies = gf_latency_new(program->numactors); ++ if (!newprog->latencies) { ++ rpcsvc_program_destroy(newprog); ++ goto out; ++ } + + INIT_LIST_HEAD(&newprog->program); + pthread_mutexattr_init(&thr_attr); +@@ -3240,6 +3266,48 @@ out: + return ret; + } + ++void ++rpcsvc_program_dump(rpcsvc_program_t *prog) ++{ ++ char key_prefix[GF_DUMP_MAX_BUF_LEN]; ++ char key[GF_DUMP_MAX_BUF_LEN]; ++ int i; ++ ++ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s", prog->progname); ++ gf_proc_dump_add_section("%s", key_prefix); ++ ++ gf_proc_dump_build_key(key, key_prefix, "program-number"); ++ gf_proc_dump_write(key, "%d", prog->prognum); ++ ++ gf_proc_dump_build_key(key, key_prefix, "program-version"); ++ gf_proc_dump_write(key, "%d", prog->progver); ++ ++ strncat(key_prefix, ".latency", ++ sizeof(key_prefix) - strlen(key_prefix) - 1); ++ ++ for (i = 0; i < prog->numactors; i++) { ++ gf_proc_dump_build_key(key, key_prefix, "%s", prog->actors[i].procname); ++ gf_latency_statedump_and_reset(key, &prog->latencies[i]); ++ } ++} ++ ++void ++rpcsvc_statedump(rpcsvc_t *svc) ++{ ++ rpcsvc_program_t *prog = NULL; ++ int ret = 0; ++ ret = pthread_rwlock_tryrdlock(&svc->rpclock); ++ if (ret) ++ return; ++ { ++ list_for_each_entry(prog, &svc->programs, program) ++ { ++ rpcsvc_program_dump(prog); ++ } ++ } ++ pthread_rwlock_unlock(&svc->rpclock); ++} ++ + rpcsvc_actor_t gluster_dump_actors[GF_DUMP_MAXVALUE] = { + [GF_DUMP_NULL] = {"NULL", GF_DUMP_NULL, NULL, NULL, 0, DRC_NA}, + [GF_DUMP_DUMP] = {"DUMP", GF_DUMP_DUMP, rpcsvc_dump, NULL, 0, DRC_NA}, +diff --git a/rpc/rpc-lib/src/rpcsvc.h b/rpc/rpc-lib/src/rpcsvc.h +index a51edc7..e336d00 100644 +--- a/rpc/rpc-lib/src/rpcsvc.h ++++ b/rpc/rpc-lib/src/rpcsvc.h +@@ -275,6 +275,8 @@ struct rpcsvc_request { + gf_boolean_t ownthread; + + gf_boolean_t synctask; ++ struct timespec begin; /*req handling start time*/ ++ struct timespec end; /*req handling end time*/ + }; + + #define rpcsvc_request_program(req) ((rpcsvc_program_t *)((req)->prog)) +@@ -431,6 +433,7 @@ struct rpcsvc_program { + + /* Program specific state handed to actors */ + void *private; ++ gf_latency_t *latencies; /*Tracks latency statistics for the rpc call*/ + + /* This upcall is provided by the program during registration. 
+ * It is used to notify the program about events like connection being +@@ -696,4 +699,6 @@ rpcsvc_autoscale_threads(glusterfs_ctx_t *ctx, rpcsvc_t *rpc, int incr); + + extern int + rpcsvc_destroy(rpcsvc_t *svc); ++void ++rpcsvc_statedump(rpcsvc_t *svc); + #endif +diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c +index 54d9c0f..90eb3ff 100644 +--- a/xlators/protocol/server/src/server.c ++++ b/xlators/protocol/server/src/server.c +@@ -267,6 +267,8 @@ server_priv(xlator_t *this) + gf_proc_dump_build_key(key, "server", "total-bytes-write"); + gf_proc_dump_write(key, "%" PRIu64, total_write); + ++ rpcsvc_statedump(conf->rpc); ++ + ret = 0; + out: + if (ret) +-- +1.8.3.1 + diff --git a/SOURCES/0577-protocol-client-don-t-reopen-fds-on-which-POSIX-lock.patch b/SOURCES/0577-protocol-client-don-t-reopen-fds-on-which-POSIX-lock.patch new file mode 100644 index 0000000..1a5d0ea --- /dev/null +++ b/SOURCES/0577-protocol-client-don-t-reopen-fds-on-which-POSIX-lock.patch @@ -0,0 +1,472 @@ +From d7665cf3249310c5faf87368f395b4e25cb86b48 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Thu, 15 Apr 2021 10:29:06 +0530 +Subject: [PATCH 577/584] protocol/client: don't reopen fds on which POSIX + locks are held after a reconnect + +XXXXXXXXXXXXXXXXXXX + IMPORTANT: +XXXXXXXXXXXXXXXXXXX +As a best pratice, with this patch we are bumping up the op-version +from GD_OP_VERSION_7_1 to GD_OP_VERSION_7_2 since it introduces a +new volume option. Enabling the new option will have effect only +after all the servers and clients are upgraded to this version. +---------------------------------------------------------------------- + +Bricks cleanup any granted locks after a client disconnects and +currently these locks are not healed after a reconnect. This means +post reconnect a competing process could be granted a lock even though +the first process which was granted locks has not unlocked. By not +re-opening fds, subsequent operations on such fds will fail forcing +the application to close the current fd and reopen a new one. This way +we prevent any silent corruption. + +A new option "client.strict-locks" is introduced to control this +behaviour. This option is set to "off" by default. 
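
To make the new behaviour concrete: on reconnect the client walks its saved fds and reopens each one only if it is not already open on the brick and, when client.strict-locks is on, no POSIX locks are recorded against it. The sketch below is illustrative only and not part of the patch; struct saved_fd, should_reopen() and the main() driver are simplified stand-ins for the real clnt_fd_ctx_t lock list and clnt_conf_t flag checked in client_post_handshake() further down.

    /* Illustrative sketch only: a simplified model of the reopen decision
     * that this patch adds to client_post_handshake(); the real code walks
     * clnt_fd_ctx_t entries and their lock_list. */
    #include <stdbool.h>
    #include <stdio.h>

    struct saved_fd {
        int remote_fd;   /* -1 means the fd is not currently open on the brick */
        int held_locks;  /* number of POSIX locks recorded against this fd */
    };

    /* Reopen only when the fd is closed on the brick and, with strict-locks
     * enabled, no locks are held on it. */
    static bool
    should_reopen(const struct saved_fd *fd, bool strict_locks)
    {
        if (fd->remote_fd != -1)
            return false;   /* already open, nothing to do */
        if (strict_locks && fd->held_locks > 0)
            return false;   /* locks held: leave the fd unopened ("bad") */
        return true;
    }

    int
    main(void)
    {
        struct saved_fd fds[] = {
            { .remote_fd = -1, .held_locks = 0 },  /* plain fd: reopened */
            { .remote_fd = -1, .held_locks = 1 },  /* locked fd: skipped */
        };

        for (int i = 0; i < 2; i++)
            printf("fd %d: %s\n", i,
                   should_reopen(&fds[i], true) ? "reopen" : "skip");
        return 0;
    }

Operations issued on a skipped fd then fail, which is what forces the application to close and reopen the file instead of writing under a lock it may no longer hold; the bug-1694920.t test added below and the do-not-reopen-fd.t test in a later patch assert this end to end.
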
+ +> Upstream patch: https://review.gluster.org/#/c/glusterfs/+/22712/ +> Change-Id: Ieed545efea466cb5e8f5a36199aa26380c301b9e +> Signed-off-by: Raghavendra G +> updates: bz#1694920 + +BUG: 1689375 +Change-Id: Ieed545efea466cb5e8f5a36199aa26380c301b9e +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244909 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Reviewed-by: Ravishankar Narayanankutty +--- + libglusterfs/src/glusterfs/globals.h | 4 +- + tests/bugs/bug-1694920.t | 63 ++++++++++++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 14 ++++++ + xlators/protocol/client/src/client-handshake.c | 3 +- + xlators/protocol/client/src/client-helpers.c | 5 +- + xlators/protocol/client/src/client-lk.c | 2 +- + xlators/protocol/client/src/client-rpc-fops.c | 45 ++++++++++++++++- + xlators/protocol/client/src/client-rpc-fops_v2.c | 32 +++++++++++- + xlators/protocol/client/src/client.c | 13 +++++ + xlators/protocol/client/src/client.h | 16 ++++++ + 10 files changed, 190 insertions(+), 7 deletions(-) + create mode 100644 tests/bugs/bug-1694920.t + +diff --git a/libglusterfs/src/glusterfs/globals.h b/libglusterfs/src/glusterfs/globals.h +index 33fb023..ce2d110 100644 +--- a/libglusterfs/src/glusterfs/globals.h ++++ b/libglusterfs/src/glusterfs/globals.h +@@ -50,7 +50,7 @@ + 1 /* MIN is the fresh start op-version, mostly \ + should not change */ + #define GD_OP_VERSION_MAX \ +- GD_OP_VERSION_7_1 /* MAX VERSION is the maximum \ ++ GD_OP_VERSION_7_2 /* MAX VERSION is the maximum \ + count in VME table, should \ + keep changing with \ + introduction of newer \ +@@ -140,6 +140,8 @@ + + #define GD_OP_VERSION_7_1 70100 /* Op-version for GlusterFS 7.1 */ + ++#define GD_OP_VERSION_7_2 70200 /* Op-version for GlusterFS 7.2 */ ++ + #include "glusterfs/xlator.h" + #include "glusterfs/options.h" + +diff --git a/tests/bugs/bug-1694920.t b/tests/bugs/bug-1694920.t +new file mode 100644 +index 0000000..5bf93c9 +--- /dev/null ++++ b/tests/bugs/bug-1694920.t +@@ -0,0 +1,63 @@ ++#!/bin/bash ++ ++SCRIPT_TIMEOUT=300 ++ ++. $(dirname $0)/../include.rc ++. $(dirname $0)/../volume.rc ++. $(dirname $0)/../fileio.rc ++cleanup; ++ ++TEST glusterd; ++TEST pidof glusterd ++ ++TEST $CLI volume create $V0 $H0:$B0/${V0}; ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.open-behind off ++TEST $CLI volume set $V0 performance.stat-prefetch off ++TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume start $V0 ++TEST $GFS -s $H0 --volfile-id=$V0 $M0; ++ ++TEST touch $M0/a ++ ++#When all bricks are up, lock and unlock should succeed ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'w' $M0/a ++TEST flock -x $fd1 ++TEST fd_close $fd1 ++ ++#When all bricks are down, lock/unlock should fail ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'w' $M0/a ++TEST $CLI volume stop $V0 ++TEST ! 
flock -x $fd1 ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" client_connected_status_meta $M0 $V0-client-0 ++TEST fd_close $fd1 ++ ++#When a brick goes down and comes back up operations on fd which had locks on it should succeed by default ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'w' $M0/a ++TEST flock -x $fd1 ++TEST $CLI volume stop $V0 ++sleep 2 ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" client_connected_status_meta $M0 $V0-client-0 ++TEST fd_write $fd1 "data" ++TEST fd_close $fd1 ++ ++#When a brick goes down and comes back up operations on fd which had locks on it should fail when client.strict-locks is on ++TEST $CLI volume set $V0 client.strict-locks on ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'w' $M0/a ++TEST flock -x $fd1 ++TEST $CLI volume stop $V0 ++sleep 2 ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" client_connected_status_meta $M0 $V0-client-0 ++TEST ! fd_write $fd1 "data" ++TEST fd_close $fd1 ++ ++cleanup +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +index c1ca190..01f3912 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +@@ -2022,6 +2022,20 @@ struct volopt_map_entry glusterd_volopt_map[] = { + .value = "9", + .flags = VOLOPT_FLAG_CLIENT_OPT}, + ++ {.key = "client.strict-locks", ++ .voltype = "protocol/client", ++ .option = "strict-locks", ++ .value = "off", ++ .op_version = GD_OP_VERSION_7_2, ++ .validate_fn = validate_boolean, ++ .type = GLOBAL_DOC, ++ .description = "When set, doesn't reopen saved fds after reconnect " ++ "if POSIX locks are held on them. Hence subsequent " ++ "operations on these fds will fail. This is " ++ "necessary for stricter lock complaince as bricks " ++ "cleanup any granted locks when a client " ++ "disconnects."}, ++ + /* Server xlator options */ + {.key = "network.tcp-window-size", + .voltype = "protocol/server", +diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c +index 6b20d92..a12472b 100644 +--- a/xlators/protocol/client/src/client-handshake.c ++++ b/xlators/protocol/client/src/client-handshake.c +@@ -910,7 +910,8 @@ client_post_handshake(call_frame_t *frame, xlator_t *this) + { + list_for_each_entry_safe(fdctx, tmp, &conf->saved_fds, sfd_pos) + { +- if (fdctx->remote_fd != -1) ++ if (fdctx->remote_fd != -1 || ++ (!list_empty(&fdctx->lock_list) && conf->strict_locks)) + continue; + + fdctx->reopen_done = client_child_up_reopen_done; +diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c +index 53b4484..6543100 100644 +--- a/xlators/protocol/client/src/client-helpers.c ++++ b/xlators/protocol/client/src/client-helpers.c +@@ -410,6 +410,7 @@ client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd) + { + clnt_fd_ctx_t *fdctx = NULL; + clnt_conf_t *conf = NULL; ++ gf_boolean_t locks_held = _gf_false; + + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, remote_fd, out); +@@ -431,11 +432,13 @@ client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd) + *remote_fd = -1; + else + *remote_fd = fdctx->remote_fd; ++ ++ locks_held = !list_empty(&fdctx->lock_list); + } + } + pthread_spin_unlock(&conf->fd_lock); + +- if ((flags & FALLBACK_TO_ANON_FD) && (*remote_fd == -1)) ++ if ((flags & FALLBACK_TO_ANON_FD) && (*remote_fd == -1) && (!locks_held)) + *remote_fd = 
GF_ANON_FD_NO; + + return 0; +diff --git a/xlators/protocol/client/src/client-lk.c b/xlators/protocol/client/src/client-lk.c +index 679e198..c1fb055 100644 +--- a/xlators/protocol/client/src/client-lk.c ++++ b/xlators/protocol/client/src/client-lk.c +@@ -351,7 +351,7 @@ delete_granted_locks_owner(fd_t *fd, gf_lkowner_t *owner) + + list_for_each_entry_safe(lock, tmp, &fdctx->lock_list, list) + { +- if (!is_same_lkowner(&lock->owner, owner)) { ++ if (is_same_lkowner(&lock->owner, owner)) { + list_del_init(&lock->list); + list_add_tail(&lock->list, &delete_list); + count++; +diff --git a/xlators/protocol/client/src/client-rpc-fops.c b/xlators/protocol/client/src/client-rpc-fops.c +index 1c8b31b..3110c78 100644 +--- a/xlators/protocol/client/src/client-rpc-fops.c ++++ b/xlators/protocol/client/src/client-rpc-fops.c +@@ -22,8 +22,18 @@ int32_t + client3_getspec(call_frame_t *frame, xlator_t *this, void *data); + rpc_clnt_prog_t clnt3_3_fop_prog; + +-/* CBK */ ++int ++client_is_setlk(int32_t cmd) ++{ ++ if ((cmd == F_SETLK) || (cmd == F_SETLK64) || (cmd == F_SETLKW) || ++ (cmd == F_SETLKW64)) { ++ return 1; ++ } + ++ return 0; ++} ++ ++/* CBK */ + int + client3_3_symlink_cbk(struct rpc_req *req, struct iovec *iov, int count, + void *myframe) +@@ -816,7 +826,8 @@ client3_3_flush_cbk(struct rpc_req *req, struct iovec *iov, int count, + goto out; + } + +- if (rsp.op_ret >= 0 && !fd_is_anonymous(local->fd)) { ++ if ((rsp.op_ret >= 0 || (rsp.op_errno == ENOTCONN)) && ++ !fd_is_anonymous(local->fd)) { + /* Delete all saved locks of the owner issuing flush */ + ret = delete_granted_locks_owner(local->fd, &local->owner); + gf_msg_trace(this->name, 0, "deleting locks of owner (%s) returned %d", +@@ -2388,10 +2399,12 @@ client3_3_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + int ret = 0; + xlator_t *this = NULL; + dict_t *xdata = NULL; ++ clnt_local_t *local = NULL; + + this = THIS; + + frame = myframe; ++ local = frame->local; + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; +@@ -2412,6 +2425,18 @@ client3_3_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + ret = client_post_lk(this, &rsp, &lock, &xdata); + if (ret < 0) + goto out; ++ ++ /* Save the lock to the client lock cache to be able ++ to recover in the case of server reboot.*/ ++ ++ if (client_is_setlk(local->cmd)) { ++ ret = client_add_lock_for_recovery(local->fd, &lock, &local->owner, ++ local->cmd); ++ if (ret < 0) { ++ rsp.op_ret = -1; ++ rsp.op_errno = -ret; ++ } ++ } + } + + out: +@@ -4263,8 +4288,16 @@ client3_3_flush(call_frame_t *frame, xlator_t *this, void *data) + ret = client_pre_flush(this, &req, args->fd, args->xdata); + if (ret) { + op_errno = -ret; ++ if (op_errno == EBADF) { ++ ret = delete_granted_locks_owner(local->fd, &local->owner); ++ gf_msg_trace(this->name, 0, ++ "deleting locks of owner (%s) returned %d", ++ lkowner_utoa(&local->owner), ret); ++ } ++ + goto unwind; + } ++ + ret = client_submit_request(this, &req, frame, conf->fops, GFS3_OP_FLUSH, + client3_3_flush_cbk, NULL, + (xdrproc_t)xdr_gfs3_flush_req); +@@ -5199,8 +5232,16 @@ client3_3_lk(call_frame_t *frame, xlator_t *this, void *data) + args->xdata); + if (ret) { + op_errno = -ret; ++ ++ if ((op_errno == EBADF) && (args->flock->l_type == F_UNLCK) && ++ client_is_setlk(local->cmd)) { ++ client_add_lock_for_recovery(local->fd, args->flock, &local->owner, ++ local->cmd); ++ } ++ + goto unwind; + } ++ + ret = client_submit_request(this, &req, frame, conf->fops, GFS3_OP_LK, + client3_3_lk_cbk, NULL, + (xdrproc_t)xdr_gfs3_lk_req); +diff --git 
a/xlators/protocol/client/src/client-rpc-fops_v2.c b/xlators/protocol/client/src/client-rpc-fops_v2.c +index 613dda8..954fc58 100644 +--- a/xlators/protocol/client/src/client-rpc-fops_v2.c ++++ b/xlators/protocol/client/src/client-rpc-fops_v2.c +@@ -723,7 +723,8 @@ client4_0_flush_cbk(struct rpc_req *req, struct iovec *iov, int count, + goto out; + } + +- if (rsp.op_ret >= 0 && !fd_is_anonymous(local->fd)) { ++ if ((rsp.op_ret >= 0 || (rsp.op_errno == ENOTCONN)) && ++ !fd_is_anonymous(local->fd)) { + /* Delete all saved locks of the owner issuing flush */ + ret = delete_granted_locks_owner(local->fd, &local->owner); + gf_msg_trace(this->name, 0, "deleting locks of owner (%s) returned %d", +@@ -2193,10 +2194,12 @@ client4_0_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + int ret = 0; + xlator_t *this = NULL; + dict_t *xdata = NULL; ++ clnt_local_t *local = NULL; + + this = THIS; + + frame = myframe; ++ local = frame->local; + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; +@@ -2217,6 +2220,18 @@ client4_0_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + ret = client_post_lk_v2(this, &rsp, &lock, &xdata); + if (ret < 0) + goto out; ++ ++ /* Save the lock to the client lock cache to be able ++ to recover in the case of server reboot.*/ ++ ++ if (client_is_setlk(local->cmd)) { ++ ret = client_add_lock_for_recovery(local->fd, &lock, &local->owner, ++ local->cmd); ++ if (ret < 0) { ++ rsp.op_ret = -1; ++ rsp.op_errno = -ret; ++ } ++ } + } + + out: +@@ -3998,6 +4013,13 @@ client4_0_flush(call_frame_t *frame, xlator_t *this, void *data) + ret = client_pre_flush_v2(this, &req, args->fd, args->xdata); + if (ret) { + op_errno = -ret; ++ if (op_errno == EBADF) { ++ ret = delete_granted_locks_owner(local->fd, &local->owner); ++ gf_msg_trace(this->name, 0, ++ "deleting locks of owner (%s) returned %d", ++ lkowner_utoa(&local->owner), ret); ++ } ++ + goto unwind; + } + ret = client_submit_request(this, &req, frame, conf->fops, GFS3_OP_FLUSH, +@@ -4771,8 +4793,16 @@ client4_0_lk(call_frame_t *frame, xlator_t *this, void *data) + args->xdata); + if (ret) { + op_errno = -ret; ++ ++ if ((op_errno == EBADF) && (args->flock->l_type == F_UNLCK) && ++ client_is_setlk(local->cmd)) { ++ client_add_lock_for_recovery(local->fd, args->flock, &local->owner, ++ local->cmd); ++ } ++ + goto unwind; + } ++ + ret = client_submit_request(this, &req, frame, conf->fops, GFS3_OP_LK, + client4_0_lk_cbk, NULL, + (xdrproc_t)xdr_gfx_lk_req); +diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c +index ed855ca..63c90ea 100644 +--- a/xlators/protocol/client/src/client.c ++++ b/xlators/protocol/client/src/client.c +@@ -2491,6 +2491,7 @@ build_client_config(xlator_t *this, clnt_conf_t *conf) + GF_OPTION_INIT("filter-O_DIRECT", conf->filter_o_direct, bool, out); + + GF_OPTION_INIT("send-gids", conf->send_gids, bool, out); ++ GF_OPTION_INIT("strict-locks", conf->strict_locks, bool, out); + + conf->client_id = glusterfs_leaf_position(this); + +@@ -2676,6 +2677,7 @@ reconfigure(xlator_t *this, dict_t *options) + out); + + GF_OPTION_RECONF("send-gids", conf->send_gids, options, bool, out); ++ GF_OPTION_RECONF("strict-locks", conf->strict_locks, options, bool, out); + + ret = 0; + out: +@@ -3032,6 +3034,17 @@ struct volume_options options[] = { + " power. 
Range 1-32 threads.", + .op_version = {GD_OP_VERSION_RHS_3_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, ++ {.key = {"strict-locks"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .default_value = "off", ++ .op_version = {GD_OP_VERSION_7_2}, ++ .flags = OPT_FLAG_SETTABLE, ++ .description = "When set, doesn't reopen saved fds after reconnect " ++ "if POSIX locks are held on them. Hence subsequent " ++ "operations on these fds will fail. This is " ++ "necessary for stricter lock complaince as bricks " ++ "cleanup any granted locks when a client " ++ "disconnects."}, + {.key = {NULL}}, + }; + +diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h +index f12fa61..bde3d1a 100644 +--- a/xlators/protocol/client/src/client.h ++++ b/xlators/protocol/client/src/client.h +@@ -235,6 +235,15 @@ typedef struct clnt_conf { + * up, disconnects can be + * logged + */ ++ ++ gf_boolean_t strict_locks; /* When set, doesn't reopen saved fds after ++ reconnect if POSIX locks are held on them. ++ Hence subsequent operations on these fds will ++ fail. This is necessary for stricter lock ++ complaince as bricks cleanup any granted ++ locks when a client disconnects. ++ */ ++ + } clnt_conf_t; + + typedef struct _client_fd_ctx { +@@ -513,4 +522,11 @@ compound_request_cleanup_v2(gfx_compound_req *req); + void + client_compound_rsp_cleanup_v2(gfx_compound_rsp *rsp, int len); + ++int ++client_add_lock_for_recovery(fd_t *fd, struct gf_flock *flock, ++ gf_lkowner_t *owner, int32_t cmd); ++ ++int ++client_is_setlk(int32_t cmd); ++ + #endif /* !_CLIENT_H */ +-- +1.8.3.1 + diff --git a/SOURCES/0578-protocol-client-fallback-to-anonymous-fd-for-fsync.patch b/SOURCES/0578-protocol-client-fallback-to-anonymous-fd-for-fsync.patch new file mode 100644 index 0000000..d5df9e2 --- /dev/null +++ b/SOURCES/0578-protocol-client-fallback-to-anonymous-fd-for-fsync.patch @@ -0,0 +1,46 @@ +From ffb4085b3e04878e85bf505a541203aa2ee71e9c Mon Sep 17 00:00:00 2001 +From: l17zhou +Date: Fri, 6 Mar 2020 03:54:02 +0200 +Subject: [PATCH 578/584] protocol/client: fallback to anonymous fd for fsync + +> Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24203/ +> Change-Id: I32f801206ce7fbd05aa693f44c2f140304f2e275 +> Fixes: bz#1810842 + +BUG: 1689375 +Change-Id: I32f801206ce7fbd05aa693f44c2f140304f2e275 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245538 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/protocol/client/src/client-common.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/xlators/protocol/client/src/client-common.c b/xlators/protocol/client/src/client-common.c +index 64db98d..1417a60 100644 +--- a/xlators/protocol/client/src/client-common.c ++++ b/xlators/protocol/client/src/client-common.c +@@ -449,7 +449,8 @@ client_pre_fsync(xlator_t *this, gfs3_fsync_req *req, fd_t *fd, int32_t flags, + int64_t remote_fd = -1; + int op_errno = 0; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, ++ out); + + req->fd = remote_fd; + req->data = flags; +@@ -2641,7 +2642,8 @@ client_pre_fsync_v2(xlator_t *this, gfx_fsync_req *req, fd_t *fd, int32_t flags, + int64_t remote_fd = -1; + int op_errno = 0; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, ++ out); + + req->fd = 
remote_fd; + req->data = flags; +-- +1.8.3.1 + diff --git a/SOURCES/0579-cli-changing-rebal-task-ID-to-None-in-case-status-is.patch b/SOURCES/0579-cli-changing-rebal-task-ID-to-None-in-case-status-is.patch new file mode 100644 index 0000000..d568966 --- /dev/null +++ b/SOURCES/0579-cli-changing-rebal-task-ID-to-None-in-case-status-is.patch @@ -0,0 +1,168 @@ +From 96c4c3c47c914aced8864e7d178a4d57f7fced05 Mon Sep 17 00:00:00 2001 +From: Tamar Shacked +Date: Sun, 6 Jun 2021 14:26:18 +0300 +Subject: [PATCH 579/584] cli: changing rebal task ID to "None" in case status + is being reset + +Rebalance status is being reset during replace/reset-brick operations. +This cause 'volume status' to shows rebalance as "not started". + +Fix: +change rebalance-status to "reset due to (replace|reset)-brick" + +Backport of: +> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/1869 +> Change-Id: Ia73a8bea3dcd8e51acf4faa6434c3cb0d09856d0 +> Signed-off-by: Tamar Shacked +> Fixes: #1717 + +BUG: 1889966 + +Signed-off-by: Tamar Shacked +Change-Id: Ia73a8bea3dcd8e51acf4faa6434c3cb0d09856d0 +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245402 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + cli/src/cli-rpc-ops.c | 15 ++++++- + rpc/xdr/src/cli1-xdr.x | 2 + + tests/bugs/glusterd/reset-rebalance-state.t | 46 ++++++++++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-replace-brick.c | 4 +- + xlators/mgmt/glusterd/src/glusterd-reset-brick.c | 3 +- + 5 files changed, 65 insertions(+), 5 deletions(-) + create mode 100644 tests/bugs/glusterd/reset-rebalance-state.t + +diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c +index 51b5447..4167c68 100644 +--- a/cli/src/cli-rpc-ops.c ++++ b/cli/src/cli-rpc-ops.c +@@ -72,6 +72,8 @@ char *cli_vol_task_status_str[] = {"not started", + "fix-layout stopped", + "fix-layout completed", + "fix-layout failed", ++ "reset due to replace-brick", ++ "reset due to reset-brick", + "unknown"}; + + int32_t +@@ -8357,12 +8359,21 @@ cli_print_volume_status_tasks(dict_t *dict) + ret = dict_get_str(dict, key, &task_id_str); + if (ret) + return; +- cli_out("%-20s : %-20s", "ID", task_id_str); + + snprintf(key, sizeof(key), "task%d.status", i); + ret = dict_get_int32(dict, key, &status); +- if (ret) ++ if (ret) { ++ cli_out("%-20s : %-20s", "ID", task_id_str); + return; ++ } ++ ++ if (!strcmp(op, "Rebalance") && ++ (status == GF_DEFRAG_STATUS_RESET_DUE_REPLACE_BRC || ++ status == GF_DEFRAG_STATUS_RESET_DUE_RESET_BRC)) { ++ task_id_str = "None"; ++ } ++ ++ cli_out("%-20s : %-20s", "ID", task_id_str); + + snprintf(task, sizeof(task), "task%d", i); + +diff --git a/rpc/xdr/src/cli1-xdr.x b/rpc/xdr/src/cli1-xdr.x +index 777cb00..17d96f1 100644 +--- a/rpc/xdr/src/cli1-xdr.x ++++ b/rpc/xdr/src/cli1-xdr.x +@@ -45,6 +45,8 @@ + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, ++ GF_DEFRAG_STATUS_RESET_DUE_REPLACE_BRC, ++ GF_DEFRAG_STATUS_RESET_DUE_RESET_BRC, + GF_DEFRAG_STATUS_MAX + }; + +diff --git a/tests/bugs/glusterd/reset-rebalance-state.t b/tests/bugs/glusterd/reset-rebalance-state.t +new file mode 100644 +index 0000000..829d2b1 +--- /dev/null ++++ b/tests/bugs/glusterd/reset-rebalance-state.t +@@ -0,0 +1,46 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../cluster.rc ++. 
$(dirname $0)/../../volume.rc ++ ++ ++get_rebalance_status() { ++ $CLI volume status $V0 | egrep ^"Status " | awk '{print $3}' ++} ++ ++run_rebal_check_status() { ++ TEST $CLI volume rebalance $V0 start ++ EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0 ++ REBAL_STATE=$(get_rebalance_status) ++ TEST [ $REBAL_STATE == "completed" ] ++} ++ ++replace_brick_check_status() { ++ TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}1_replace commit force ++ REBAL_STATE=$(get_rebalance_status) ++ TEST [ $REBAL_STATE == "reset" ] ++} ++ ++reset_brick_check_status() { ++ TEST $CLI volume reset-brick $V0 $H0:$B0/${V0}2 start ++ TEST $CLI volume reset-brick $V0 $H0:$B0/${V0}2 $H0:$B0/${V0}2 commit force ++ REBAL_STATE=$(get_rebalance_status) ++ TEST [ $REBAL_STATE == "reset" ] ++} ++ ++cleanup; ++ ++TEST glusterd; ++TEST pidof glusterd; ++ ++TEST $CLI volume info; ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..6} force; ++TEST $CLI volume start $V0; ++ ++run_rebal_check_status; ++replace_brick_check_status; ++reset_brick_check_status; ++ ++cleanup; ++ +diff --git a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c +index 0615081..80b80e4 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c ++++ b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c +@@ -548,8 +548,8 @@ glusterd_op_replace_brick(dict_t *dict, dict_t *rsp_dict) + (void)glusterd_svcs_manager(volinfo); + goto out; + } +- +- volinfo->rebal.defrag_status = 0; ++ if (volinfo->rebal.defrag_status != GF_DEFRAG_STATUS_NOT_STARTED) ++ volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_RESET_DUE_REPLACE_BRC; + + ret = glusterd_svcs_manager(volinfo); + if (ret) { +diff --git a/xlators/mgmt/glusterd/src/glusterd-reset-brick.c b/xlators/mgmt/glusterd/src/glusterd-reset-brick.c +index cf04ce8..19d7549 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-reset-brick.c ++++ b/xlators/mgmt/glusterd/src/glusterd-reset-brick.c +@@ -342,7 +342,8 @@ glusterd_op_reset_brick(dict_t *dict, dict_t *rsp_dict) + goto out; + } + +- volinfo->rebal.defrag_status = 0; ++ if (volinfo->rebal.defrag_status != GF_DEFRAG_STATUS_NOT_STARTED) ++ volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_RESET_DUE_RESET_BRC; + + ret = glusterd_svcs_manager(volinfo); + if (ret) { +-- +1.8.3.1 + diff --git a/SOURCES/0580-cluster-dht-suppress-file-migration-error-for-node-n.patch b/SOURCES/0580-cluster-dht-suppress-file-migration-error-for-node-n.patch new file mode 100644 index 0000000..06befeb --- /dev/null +++ b/SOURCES/0580-cluster-dht-suppress-file-migration-error-for-node-n.patch @@ -0,0 +1,138 @@ +From a5da8bb830e86b6dd77a06cd59d220052e80b21c Mon Sep 17 00:00:00 2001 +From: Tamar Shacked +Date: Sun, 6 Jun 2021 11:57:06 +0300 +Subject: [PATCH 580/584] cluster/dht: suppress file migration error for node + not supposed to migrate file + +A rebalance process does a lookup for every file in the dir it is processing +before checking if it supposed to migrate the file. +In this issue there are two rebalance processses running on a replica subvol: +R1 is migrating the FILE. +R2 is not supposed to migrate the FILE, but it does a lookup and + finds a stale linkfile which is mostly due to a stale layout. + Then, it tries to unlink the stale linkfile and gets EBUSY + as the linkfile fd is open due R1 migration. + As a result a misleading error msg about FILE migration failure + due EBUSY is logged in R2 logfile. 
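
For context on what "supposed to migrate" means here: each rebalance process decides ownership of a file by hashing its gfid across the participating node uuids, so exactly one process migrates it. The sketch below is illustrative only and not part of the patch; toy_hash(), should_i_migrate() and the hard-coded gfid are made-up stand-ins for dht_hash_compute() and the nodeuuid table consulted by gf_defrag_should_i_migrate() in the hunk further down.

    /* Illustrative sketch only: models the ownership check performed by
     * gf_defrag_should_i_migrate(); the hash and gfid below are made up. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy hash standing in for dht_hash_compute() on the gfid string. */
    static uint32_t
    toy_hash(const char *gfid)
    {
        uint32_t h = 5381;
        while (*gfid)
            h = h * 33u + (unsigned char)*gfid++;
        return h;
    }

    /* A node migrates the file only when the gfid hashes to its own slot. */
    static bool
    should_i_migrate(const char *gfid, int my_index, int node_count)
    {
        return (int)(toy_hash(gfid) % (uint32_t)node_count) == my_index;
    }

    int
    main(void)
    {
        const char *gfid = "3f2b9c3e-8d1a-4b7f-9c1d-0a1b2c3d4e5f"; /* made up */

        for (int node = 0; node < 2; node++)
            printf("node %d: %s\n", node,
                   should_i_migrate(gfid, node, 2) ? "migrate" : "skip");
        return 0;
    }
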
+ +Fix: +suppress the error in case it occured in a node that +is not supposed to migrate the file. + +Backport of: +> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/24712/ +> fixes: #1371 +> Change-Id: I37832b404e2b0cc40ac5caf45f14c32c891e71f3 +> Signed-off-by: Tamar Shacked + +BUG: 1815462 +Signed-off-by: Tamar Shacked +Change-Id: I915ee8e7470d85a849b198bfa7d58d368a246aae +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245401 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-rebalance.c | 38 ++++++++++++++++++++++----------- + 1 file changed, 25 insertions(+), 13 deletions(-) + +diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c +index e07dec0..cc0f2c9 100644 +--- a/xlators/cluster/dht/src/dht-rebalance.c ++++ b/xlators/cluster/dht/src/dht-rebalance.c +@@ -2604,10 +2604,10 @@ out: + * all hardlinks. + */ + +-int ++gf_boolean_t + gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid) + { +- int ret = 0; ++ gf_boolean_t ret = _gf_false; + int i = local_subvol_index; + char *str = NULL; + uint32_t hashval = 0; +@@ -2629,12 +2629,11 @@ gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid) + } + + str = uuid_utoa_r(gfid, buf); +- ret = dht_hash_compute(this, 0, str, &hashval); +- if (ret == 0) { ++ if (dht_hash_compute(this, 0, str, &hashval) == 0) { + index = (hashval % entry->count); + if (entry->elements[index].info == REBAL_NODEUUID_MINE) { + /* Index matches this node's nodeuuid.*/ +- ret = 1; ++ ret = _gf_true; + goto out; + } + +@@ -2647,12 +2646,12 @@ gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid) + /* None of the bricks in the subvol are up. + * CHILD_DOWN will kill the process soon */ + +- return 0; ++ return _gf_false; + } + + if (entry->elements[index].info == REBAL_NODEUUID_MINE) { + /* Index matches this node's nodeuuid.*/ +- ret = 1; ++ ret = _gf_true; + goto out; + } + } +@@ -2701,6 +2700,7 @@ gf_defrag_migrate_single_file(void *opaque) + struct iatt *iatt_ptr = NULL; + gf_boolean_t update_skippedcount = _gf_true; + int i = 0; ++ gf_boolean_t should_i_migrate = 0; + + rebal_entry = (struct dht_container *)opaque; + if (!rebal_entry) { +@@ -2754,11 +2754,29 @@ gf_defrag_migrate_single_file(void *opaque) + goto out; + } + ++ should_i_migrate = gf_defrag_should_i_migrate( ++ this, rebal_entry->local_subvol_index, entry->d_stat.ia_gfid); ++ + gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid); + + gf_uuid_copy(entry_loc.pargfid, loc->gfid); + + ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL); ++ ++ if (!should_i_migrate) { ++ /* this node isn't supposed to migrate the file. 
suppressing any ++ * potential error from lookup as this file is under migration by ++ * another node */ ++ if (ret) { ++ gf_msg_debug(this->name, -ret, ++ "Ignoring lookup failure: node isn't migrating %s", ++ entry_loc.path); ++ ret = 0; ++ } ++ gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path); ++ goto out; ++ } ++ + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s lookup failed", entry_loc.path); +@@ -2779,12 +2797,6 @@ gf_defrag_migrate_single_file(void *opaque) + goto out; + } + +- if (!gf_defrag_should_i_migrate(this, rebal_entry->local_subvol_index, +- entry->d_stat.ia_gfid)) { +- gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path); +- goto out; +- } +- + iatt_ptr = &iatt; + + hashed_subvol = dht_subvol_get_hashed(this, &entry_loc); +-- +1.8.3.1 + diff --git a/SOURCES/0581-afr-don-t-reopen-fds-on-which-POSIX-locks-are-held.patch b/SOURCES/0581-afr-don-t-reopen-fds-on-which-POSIX-locks-are-held.patch new file mode 100644 index 0000000..1267608 --- /dev/null +++ b/SOURCES/0581-afr-don-t-reopen-fds-on-which-POSIX-locks-are-held.patch @@ -0,0 +1,1431 @@ +From 57c794e31c0333f508ada740227c9afa1889f8ae Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Thu, 15 Apr 2021 11:27:57 +0530 +Subject: [PATCH 581/584] afr: don't reopen fds on which POSIX locks are held + +When client.strict-locks is enabled on a volume and there are POSIX +locks held on the files, after disconnect and reconnection of the +clients do not re-open such fds which might lead to multiple clients +acquiring the locks and cause data corruption. + +> Upstream patch: https://github.com/gluster/glusterfs/pull/1980/commits/56bde56c2741c5eac59937a6cf951a14f2878460 +> Change-Id: I8777ffbc2cc8d15ab57b58b72b56eb67521787c5 +> Fixes: #1977 +> Signed-off-by: karthik-us + +BUG: 1689375 +Change-Id: I8777ffbc2cc8d15ab57b58b72b56eb67521787c5 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245414 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Reviewed-by: Ravishankar Narayanankutty +--- + rpc/rpc-lib/src/protocol-common.h | 6 + + tests/bugs/replicate/do-not-reopen-fd.t | 206 +++++++++++++++++ + xlators/cluster/afr/src/afr-common.c | 15 +- + xlators/cluster/afr/src/afr-open.c | 280 +++++++++++++++++++---- + xlators/cluster/afr/src/afr.h | 3 + + xlators/protocol/client/src/client-common.c | 148 ++++++++---- + xlators/protocol/client/src/client-common.h | 4 + + xlators/protocol/client/src/client-helpers.c | 22 +- + xlators/protocol/client/src/client-rpc-fops.c | 23 +- + xlators/protocol/client/src/client-rpc-fops_v2.c | 25 +- + xlators/protocol/client/src/client.c | 21 +- + xlators/protocol/client/src/client.h | 8 +- + 12 files changed, 654 insertions(+), 107 deletions(-) + create mode 100644 tests/bugs/replicate/do-not-reopen-fd.t + +diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h +index 779878f..f56aaaa 100644 +--- a/rpc/rpc-lib/src/protocol-common.h ++++ b/rpc/rpc-lib/src/protocol-common.h +@@ -312,6 +312,12 @@ enum glusterd_mgmt_v3_procnum { + GLUSTERD_MGMT_V3_MAXVALUE, + }; + ++enum gf_fd_reopen_status { ++ FD_REOPEN_ALLOWED = 0, ++ FD_REOPEN_NOT_ALLOWED, ++ FD_BAD, ++}; ++ + typedef struct gf_gsync_detailed_status_ gf_gsync_status_t; + + enum gf_get_volume_info_type { +diff --git a/tests/bugs/replicate/do-not-reopen-fd.t b/tests/bugs/replicate/do-not-reopen-fd.t +new file mode 100644 +index 0000000..76d8e70 +--- /dev/null ++++ 
b/tests/bugs/replicate/do-not-reopen-fd.t +@@ -0,0 +1,206 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../fileio.rc ++ ++cleanup; ++ ++TEST glusterd; ++TEST pidof glusterd ++ ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.open-behind off ++TEST $CLI volume set $V0 client.strict-locks on ++TEST $CLI volume heal $V0 disable ++TEST $CLI volume start $V0 ++EXPECT 'Started' volinfo_field $V0 'Status'; ++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 ++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M1 ++ ++TEST touch $M0/a ++ ++# Kill one brick and take lock on the fd and do a write. ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'rw' $M0/a ++ ++TEST flock -x $fd1 ++TEST fd_write $fd1 "data-1" ++ ++# Restart the brick and then write. Now fd should not get re-opened but write ++# should still succeed as there were no quorum disconnects. ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST fd_write $fd1 "data-2" ++EXPECT "" cat $B0/${V0}0/a ++EXPECT "data-2" cat $B0/${V0}1/a ++EXPECT "data-2" cat $B0/${V0}2/a ++ ++# Check there is no fd opened on the 1st brick by checking for the gfid inside ++# /proc/pid-of-brick/fd/ directory ++gfid_a=$(gf_get_gfid_xattr $B0/${V0}0/a) ++gfid_str_a=$(gf_gfid_xattr_to_str $gfid_a) ++ ++EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++TEST fd2=`fd_available` ++TEST fd_open $fd2 'rw' $M1/a ++ ++# Kill 2nd brick and try writing to the file. The write should fail due to ++# quorum failure. ++TEST kill_brick $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++TEST ! fd_write $fd1 "data-3" ++TEST ! fd_cat $fd1 ++ ++# Restart the bricks and try writing to the file. This should fail as two bricks ++# which were down previously, will return EBADFD now. ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++TEST ! fd_write $fd1 "data-4" ++TEST ! fd_cat $fd1 ++ ++# Enable heal and check the files will have same content on all the bricks after ++# the heal is completed. ++EXPECT_WITHIN $HEAL_TIMEOUT "^2$" get_pending_heal_count $V0 ++TEST $CLI volume heal $V0 enable ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++ ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++EXPECT "data-4" cat $B0/${V0}0/a ++EXPECT "data-4" cat $B0/${V0}1/a ++EXPECT "data-4" cat $B0/${V0}2/a ++TEST $CLI volume heal $V0 disable ++ ++# Try writing to the file again on the same fd, which should fail again, since ++# it is not yet re-opened. ++TEST ! 
fd_write $fd1 "data-5" ++ ++# At this point only one brick will have the lock. Try taking the lock again on ++# the bad fd, which should also fail with EBADFD. ++TEST ! flock -x $fd1 ++ ++# Kill the only brick that is having lock and try taking lock on another client ++# which should succeed. ++TEST kill_brick $V0 $H0 $B0/${V0}2 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++TEST flock -x $fd2 ++TEST fd_write $fd2 "data-6" ++ ++# Bring the brick up and try writing & reading on the old fd, which should still ++# fail and operations on the 2nd fd should succeed. ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++TEST ! fd_write $fd1 "data-7" ++ ++TEST ! fd_cat $fd1 ++TEST fd_cat $fd2 ++ ++# Close both the fds which will release the locks and then re-open and take lock ++# on the old fd. Operations on that fd should succeed afterwards. ++TEST fd_close $fd1 ++TEST fd_close $fd2 ++ ++TEST ! ls /proc/$$/fd/$fd1 ++TEST ! ls /proc/$$/fd/$fd2 ++EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'rw' $M0/a ++EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++TEST flock -x $fd1 ++TEST fd_write $fd1 "data-8" ++TEST fd_cat $fd1 ++ ++EXPECT "data-8" head -n 1 $B0/${V0}0/a ++EXPECT "data-8" head -n 1 $B0/${V0}1/a ++EXPECT "data-8" head -n 1 $B0/${V0}2/a ++ ++TEST fd_close $fd1 ++ ++# Heal the volume ++TEST $CLI volume heal $V0 enable ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++ ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++TEST $CLI volume heal $V0 disable ++ ++# Kill one brick and open a fd. ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'rw' $M0/a ++ ++EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++# Restart the brick and then write. Now fd should get re-opened and write should ++# succeed on the previously down brick as well since there are no locks held on ++# any of the bricks. 
++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST fd_write $fd1 "data-10" ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++ ++EXPECT "data-10" head -n 1 $B0/${V0}0/a ++EXPECT "data-10" head -n 1 $B0/${V0}1/a ++EXPECT "data-10" head -n 1 $B0/${V0}2/a ++TEST fd_close $fd1 ++ ++# Kill one brick, open and take lock on a fd. ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'rw' $M0/a ++TEST flock -x $fd1 ++ ++EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++# Kill & restart another brick so that it will return EBADFD ++TEST kill_brick $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1 ++ ++# Restart the bricks and then write. Now fd should not get re-opened since lock ++# is still held on one brick and write should also fail as there is no quorum. ++ ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++TEST ! fd_write $fd1 "data-11" ++EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++EXPECT "data-10" head -n 1 $B0/${V0}0/a ++EXPECT "data-10" head -n 1 $B0/${V0}1/a ++EXPECT "data-11" head -n 1 $B0/${V0}2/a ++ ++TEST fd_close $fd1 ++cleanup +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 416012c..bd46e59 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -2067,6 +2067,8 @@ afr_local_cleanup(afr_local_t *local, xlator_t *this) + dict_unref(local->cont.entrylk.xdata); + } + ++ GF_FREE(local->need_open); ++ + if (local->xdata_req) + dict_unref(local->xdata_req); + +@@ -5689,6 +5691,14 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) + } + local->is_new_entry = _gf_false; + ++ local->need_open = GF_CALLOC(priv->child_count, sizeof(*local->need_open), ++ gf_afr_mt_char); ++ if (!local->need_open) { ++ if (op_errno) ++ *op_errno = ENOMEM; ++ goto out; ++ } ++ + INIT_LIST_HEAD(&local->healer); + return 0; + out: +@@ -6124,9 +6134,8 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc) + char *substr = NULL; + char *status = NULL; + +- ret = afr_lockless_inspect(frame, this, loc->gfid, &inode, +- &entry_selfheal, &data_selfheal, +- &metadata_selfheal, &pending); ++ ret = afr_lockless_inspect(frame, this, loc->gfid, &inode, &entry_selfheal, ++ &data_selfheal, &metadata_selfheal, &pending); + + if (ret == -ENOMEM) { + ret = -1; +diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c +index ff72c73..73c1552 100644 +--- a/xlators/cluster/afr/src/afr-open.c ++++ b/xlators/cluster/afr/src/afr-open.c +@@ -35,6 +35,8 @@ + #include "afr-dir-read.h" + 
#include "afr-dir-write.h" + #include "afr-transaction.h" ++#include "afr-self-heal.h" ++#include "protocol-common.h" + + gf_boolean_t + afr_is_fd_fixable(fd_t *fd) +@@ -239,8 +241,32 @@ afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + return 0; + } + ++static void ++afr_fd_ctx_reset_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) ++{ ++ afr_fd_ctx_t *fd_ctx = NULL; ++ afr_private_t *priv = NULL; ++ int i = 0; ++ ++ priv = this->private; ++ fd_ctx = afr_fd_ctx_get(fd, this); ++ if (!fd_ctx) ++ return; ++ ++ LOCK(&fd->lock); ++ { ++ for (i = 0; i < priv->child_count; i++) { ++ if (fd_ctx->opened_on[i] == AFR_FD_OPENING && need_open[i]) { ++ fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; ++ need_open[i] = 0; ++ } ++ } ++ } ++ UNLOCK(&fd->lock); ++} ++ + static int +-afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) ++afr_fd_ctx_set_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) + { + afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; +@@ -248,7 +274,6 @@ afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) + int count = 0; + + priv = this->private; +- + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) + return 0; +@@ -271,21 +296,217 @@ afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) + return count; + } + ++static int ++afr_do_fix_open(call_frame_t *frame, xlator_t *this) ++{ ++ afr_local_t *local = frame->local; ++ afr_private_t *priv = NULL; ++ int i = 0; ++ int need_open_count = 0; ++ ++ priv = this->private; ++ ++ need_open_count = AFR_COUNT(local->need_open, priv->child_count); ++ if (!need_open_count) { ++ goto out; ++ } ++ gf_msg_debug(this->name, 0, "need open count: %d", need_open_count); ++ local->call_count = need_open_count; ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (!local->need_open[i]) ++ continue; ++ ++ if (IA_IFDIR == local->fd->inode->ia_type) { ++ gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s", ++ local->loc.path, priv->children[i]->name); ++ STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, ++ priv->children[i], ++ priv->children[i]->fops->opendir, &local->loc, ++ local->fd, NULL); ++ } else { ++ gf_msg_debug(this->name, 0, ++ "opening fd for file %s on subvolume %s", ++ local->loc.path, priv->children[i]->name); ++ ++ STACK_WIND_COOKIE( ++ frame, afr_openfd_fix_open_cbk, (void *)(long)i, ++ priv->children[i], priv->children[i]->fops->open, &local->loc, ++ local->fd_ctx->flags & ~(O_CREAT | O_EXCL | O_TRUNC), local->fd, ++ NULL); ++ } ++ if (!--need_open_count) ++ break; ++ } ++ return 0; ++ ++out: ++ afr_fd_ctx_reset_need_open(local->fd, this, local->need_open); ++ AFR_STACK_DESTROY(frame); ++ return 0; ++} ++ ++static int ++afr_is_reopen_allowed_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct gf_flock *lock, dict_t *xdata) ++{ ++ afr_local_t *local = frame->local; ++ afr_private_t *priv = NULL; ++ int ret = -1; ++ int call_count = 0; ++ int i = (long)cookie; ++ int32_t fd_reopen_status = -1; ++ int32_t final_reopen_status = -1; ++ ++ priv = this->private; ++ local->replies[i].valid = 1; ++ local->replies[i].op_ret = op_ret; ++ local->replies[i].op_errno = op_errno; ++ if (op_ret != 0) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_DICT_GET_FAILED, ++ "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid)); ++ } ++ ++ if (xdata) ++ local->replies[i].xdata = dict_ref(xdata); ++ ++ call_count = afr_frame_return(frame); ++ ++ if 
(call_count) ++ return 0; ++ ++ /* Currently we get 3 values from the lower layer (protocol/client) in the ++ * getlk_cbk. ++ * FD_REOPEN_ALLOWED : No conflicting locks are held and reopen is allowed ++ * FD_REOPEN_NOT_ALLOWED : Conflicting locks are held and reopen is not ++ * allowed ++ * FD_BAD : FD is not valid ++ * ++ * - If we get FD_REOPEN_NOT_ALLOWED from any of the bricks, will block the ++ * reopen taking this as high priority. ++ * - If we get FD_BAD from all the replies, we will not reopen since we do ++ * not know the correct status. ++ * - If we get FD_BAD from few brick and FD_REOPEN_NOT_ALLOWED from one or ++ * more bricks, then we will block reopen. ++ * - If we get FD_BAD from few bricks and FD_REOPEN_ALLOWED from one or ++ * more bricks, then we will allow the reopen. ++ * ++ * We will update the final_reopen_status only when the value returned ++ * from lower layer is >= FD_REOPEN_ALLOWED and < FD_BAD. We will not set ++ * FD_BAD in final_reopen_status, since it can lead to unexpected ++ * behaviours. ++ * ++ * At the end of this loop, if we still have final_reopen_status as -1 ++ * i.e., the init value, it means we failed to get the fd status from any ++ * of the bricks or we do not have a valid fd on any of the bricks. We ++ * will not reopen the fd in this case as well. ++ */ ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (final_reopen_status != FD_REOPEN_NOT_ALLOWED && ++ local->replies[i].xdata) { ++ ret = dict_get_int32(xdata, "fd-reopen-status", &fd_reopen_status); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, ++ "Failed to get whether reopen is allowed or not on fd " ++ "for file %s on subvolume %s.", ++ local->loc.path, priv->children[i]->name); ++ } else if (fd_reopen_status >= FD_REOPEN_ALLOWED && ++ fd_reopen_status < FD_BAD) { ++ final_reopen_status = fd_reopen_status; ++ } ++ } ++ ++ if (final_reopen_status == FD_REOPEN_NOT_ALLOWED) ++ break; ++ } ++ ++ if (final_reopen_status == FD_REOPEN_NOT_ALLOWED) { ++ gf_log(this->name, GF_LOG_INFO, ++ "Conflicting locks held on file %s. FD reopen is not allowed.", ++ local->loc.path); ++ } else if (final_reopen_status == -1) { ++ gf_log(this->name, GF_LOG_INFO, ++ "Failed to get the lock information " ++ "on file %s. 
FD reopen is not allowed.", ++ local->loc.path); ++ } else { ++ afr_local_replies_wipe(local, priv); ++ afr_do_fix_open(frame, this); ++ return 0; ++ } ++ ++ afr_fd_ctx_reset_need_open(local->fd, this, local->need_open); ++ AFR_STACK_DESTROY(frame); ++ return 0; ++} ++ + void +-afr_fix_open(fd_t *fd, xlator_t *this) ++afr_is_reopen_allowed(xlator_t *this, call_frame_t *frame) + { + afr_private_t *priv = NULL; ++ afr_local_t *local = NULL; ++ dict_t *xdata = NULL; + int i = 0; ++ int call_count = 0; ++ struct gf_flock flock = { ++ 0, ++ }; ++ ++ local = frame->local; ++ priv = this->private; ++ ++ flock.l_type = F_WRLCK; ++ afr_set_lk_owner(frame, this, frame->root); ++ lk_owner_copy(&flock.l_owner, &frame->root->lk_owner); ++ ++ call_count = AFR_COUNT(local->child_up, priv->child_count); ++ if (!call_count) ++ goto out; ++ local->call_count = call_count; ++ ++ xdata = dict_new(); ++ if (xdata == NULL) ++ goto out; ++ ++ if (dict_set_int32(xdata, "fd-reopen-status", -1)) ++ goto out; ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (local->child_up[i]) { ++ STACK_WIND_COOKIE(frame, afr_is_reopen_allowed_cbk, (void *)(long)i, ++ priv->children[i], priv->children[i]->fops->lk, ++ local->fd, F_GETLK, &flock, xdata); ++ } else { ++ continue; ++ } ++ ++ if (!--call_count) ++ break; ++ } ++ ++ dict_unref(xdata); ++ return; ++ ++out: ++ if (xdata) ++ dict_unref(xdata); ++ afr_fd_ctx_reset_need_open(local->fd, this, local->need_open); ++ AFR_STACK_DESTROY(frame); ++ return; ++} ++ ++void ++afr_fix_open(fd_t *fd, xlator_t *this) ++{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; +- unsigned char *need_open = NULL; + int call_count = 0; + +- priv = this->private; +- + if (!afr_is_fd_fixable(fd)) + goto out; + +@@ -293,12 +514,6 @@ afr_fix_open(fd_t *fd, xlator_t *this) + if (!fd_ctx) + goto out; + +- need_open = alloca0(priv->child_count); +- +- call_count = afr_fd_ctx_need_open(fd, this, need_open); +- if (!call_count) +- goto out; +- + frame = create_frame(this, this->ctx->pool); + if (!frame) + goto out; +@@ -307,47 +522,24 @@ afr_fix_open(fd_t *fd, xlator_t *this) + if (!local) + goto out; + ++ call_count = afr_fd_ctx_set_need_open(fd, this, local->need_open); ++ if (!call_count) ++ goto out; ++ + local->loc.inode = inode_ref(fd->inode); + ret = loc_path(&local->loc, NULL); + if (ret < 0) + goto out; +- + local->fd = fd_ref(fd); + local->fd_ctx = fd_ctx; + +- local->call_count = call_count; +- +- gf_msg_debug(this->name, 0, "need open count: %d", call_count); +- +- for (i = 0; i < priv->child_count; i++) { +- if (!need_open[i]) +- continue; +- +- if (IA_IFDIR == fd->inode->ia_type) { +- gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s", +- local->loc.path, priv->children[i]->name); +- +- STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, +- priv->children[i], +- priv->children[i]->fops->opendir, &local->loc, +- local->fd, NULL); +- } else { +- gf_msg_debug(this->name, 0, +- "opening fd for file %s on subvolume %s", +- local->loc.path, priv->children[i]->name); +- +- STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, +- priv->children[i], priv->children[i]->fops->open, +- &local->loc, fd_ctx->flags & (~O_TRUNC), +- local->fd, NULL); +- } +- +- if (!--call_count) +- break; +- } +- ++ afr_is_reopen_allowed(this, frame); + return; ++ + out: ++ if (call_count) ++ afr_fd_ctx_reset_need_open(fd, this, local->need_open); + if (frame) + AFR_STACK_DESTROY(frame); ++ 
return; + } +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index 6a9a763..ffc7317 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -895,6 +895,9 @@ typedef struct _afr_local { + afr_ta_fop_state_t fop_state; + int ta_failed_subvol; + gf_boolean_t is_new_entry; ++ ++ /* For fix_open */ ++ unsigned char *need_open; + } afr_local_t; + + typedef struct afr_spbc_timeout { +diff --git a/xlators/protocol/client/src/client-common.c b/xlators/protocol/client/src/client-common.c +index 1417a60..92cda12 100644 +--- a/xlators/protocol/client/src/client-common.c ++++ b/xlators/protocol/client/src/client-common.c +@@ -343,7 +343,7 @@ client_pre_readv(xlator_t *this, gfs3_read_req *req, fd_t *fd, size_t size, + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_READ, out); + + req->size = size; + req->offset = offset; +@@ -368,7 +368,7 @@ client_pre_writev(xlator_t *this, gfs3_write_req *req, fd_t *fd, size_t size, + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_WRITE, out); + + req->size = size; + req->offset = offset; +@@ -429,7 +429,8 @@ client_pre_flush(xlator_t *this, gfs3_flush_req *req, fd_t *fd, dict_t *xdata) + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FLUSH, out); + + req->fd = remote_fd; + memcpy(req->gfid, fd->inode->gfid, 16); +@@ -450,7 +451,7 @@ client_pre_fsync(xlator_t *this, gfs3_fsync_req *req, fd_t *fd, int32_t flags, + int op_errno = 0; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FSYNC, out); + + req->fd = remote_fd; + req->data = flags; +@@ -591,7 +592,8 @@ client_pre_fsyncdir(xlator_t *this, gfs3_fsyncdir_req *req, fd_t *fd, + int32_t op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSYNCDIR, out); + + req->fd = remote_fd; + req->data = flags; +@@ -668,7 +670,8 @@ client_pre_ftruncate(xlator_t *this, gfs3_ftruncate_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = EINVAL; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FTRUNCATE, out); + + req->offset = offset; + req->fd = remote_fd; +@@ -687,7 +690,8 @@ client_pre_fstat(xlator_t *this, gfs3_fstat_req *req, fd_t *fd, dict_t *xdata) + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSTAT, out); + + req->fd = remote_fd; + memcpy(req->gfid, fd->inode->gfid, 16); +@@ -710,7 +714,8 @@ client_pre_lk(xlator_t *this, gfs3_lk_req *req, int32_t cmd, + int32_t gf_type = 0; + int ret = 0; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_LK, out); + + ret = client_cmd_to_gf_cmd(cmd, &gf_cmd); + if (ret) { +@@ -787,7 +792,8 @@ client_pre_readdir(xlator_t *this, gfs3_readdir_req *req, fd_t *fd, size_t size, + int64_t remote_fd = -1; + int op_errno = 
ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_READDIR, out); + + req->size = size; + req->offset = offset; +@@ -869,7 +875,7 @@ client_pre_finodelk(xlator_t *this, gfs3_finodelk_req *req, fd_t *fd, int cmd, + int32_t gf_cmd = 0; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FINODELK, out); + + if (cmd == F_GETLK || cmd == F_GETLK64) + gf_cmd = GF_LK_GETLK; +@@ -952,7 +958,8 @@ client_pre_fentrylk(xlator_t *this, gfs3_fentrylk_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FENTRYLK, out); + + req->fd = remote_fd; + req->cmd = cmd_entrylk; +@@ -1013,7 +1020,7 @@ client_pre_fxattrop(xlator_t *this, gfs3_fxattrop_req *req, fd_t *fd, + int64_t remote_fd = -1; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FXATTROP, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -1039,7 +1046,8 @@ client_pre_fgetxattr(xlator_t *this, gfs3_fgetxattr_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FGETXATTR, out); + + req->namelen = 1; /* Use it as a flag */ + req->fd = remote_fd; +@@ -1065,7 +1073,8 @@ client_pre_fsetxattr(xlator_t *this, gfs3_fsetxattr_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSETXATTR, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -1091,7 +1100,8 @@ client_pre_rchecksum(xlator_t *this, gfs3_rchecksum_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_RCHECKSUM, out); + + req->len = len; + req->offset = offset; +@@ -1141,7 +1151,8 @@ client_pre_fsetattr(xlator_t *this, gfs3_fsetattr_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSETATTR, out); + + req->fd = remote_fd; + req->valid = valid; +@@ -1161,7 +1172,8 @@ client_pre_readdirp(xlator_t *this, gfs3_readdirp_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_READDIRP, out); + + req->size = size; + req->offset = offset; +@@ -1187,7 +1199,8 @@ client_pre_fremovexattr(xlator_t *this, gfs3_fremovexattr_req *req, fd_t *fd, + if (!(fd && fd->inode)) + goto out; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FREMOVEXATTR, out); + + memcpy(req->gfid, fd->inode->gfid, 16); + req->name = (char *)name; +@@ -1208,7 +1221,8 @@ client_pre_fallocate(xlator_t *this, 
gfs3_fallocate_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FALLOCATE, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -1230,7 +1244,8 @@ client_pre_discard(xlator_t *this, gfs3_discard_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_DISCARD, out); + + req->fd = remote_fd; + req->offset = offset; +@@ -1251,7 +1266,8 @@ client_pre_zerofill(xlator_t *this, gfs3_zerofill_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_ZEROFILL, out); + + req->fd = remote_fd; + req->offset = offset; +@@ -1286,7 +1302,8 @@ client_pre_seek(xlator_t *this, gfs3_seek_req *req, fd_t *fd, off_t offset, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_SEEK, out); + + memcpy(req->gfid, fd->inode->gfid, 16); + req->fd = remote_fd; +@@ -2508,7 +2525,7 @@ client_pre_readv_v2(xlator_t *this, gfx_read_req *req, fd_t *fd, size_t size, + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_READ, out); + + req->size = size; + req->offset = offset; +@@ -2532,7 +2549,7 @@ client_pre_writev_v2(xlator_t *this, gfx_write_req *req, fd_t *fd, size_t size, + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_WRITE, out); + + req->size = size; + req->offset = offset; +@@ -2567,10 +2584,10 @@ client_pre_copy_file_range_v2(xlator_t *this, gfx_copy_file_range_req *req, + int op_errno = ESTALE; + + CLIENT_GET_REMOTE_FD(this, fd_in, FALLBACK_TO_ANON_FD, remote_fd_in, +- op_errno, out); ++ op_errno, GFS3_OP_COPY_FILE_RANGE, out); + + CLIENT_GET_REMOTE_FD(this, fd_out, FALLBACK_TO_ANON_FD, remote_fd_out, +- op_errno, out); ++ op_errno, GFS3_OP_COPY_FILE_RANGE, out); + req->size = size; + req->off_in = off_in; + req->off_out = off_out; +@@ -2623,7 +2640,8 @@ client_pre_flush_v2(xlator_t *this, gfx_flush_req *req, fd_t *fd, dict_t *xdata) + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FLUSH, out); + + req->fd = remote_fd; + memcpy(req->gfid, fd->inode->gfid, 16); +@@ -2643,7 +2661,7 @@ client_pre_fsync_v2(xlator_t *this, gfx_fsync_req *req, fd_t *fd, int32_t flags, + int op_errno = 0; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FSYNC, out); + + req->fd = remote_fd; + req->data = flags; +@@ -2778,7 +2796,8 @@ client_pre_fsyncdir_v2(xlator_t *this, gfx_fsyncdir_req *req, fd_t *fd, + int32_t op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSYNCDIR, out); + + req->fd = remote_fd; + req->data = 
flags; +@@ -2852,7 +2871,8 @@ client_pre_ftruncate_v2(xlator_t *this, gfx_ftruncate_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = EINVAL; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FTRUNCATE, out); + + req->offset = offset; + req->fd = remote_fd; +@@ -2870,7 +2890,8 @@ client_pre_fstat_v2(xlator_t *this, gfx_fstat_req *req, fd_t *fd, dict_t *xdata) + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSTAT, out); + + req->fd = remote_fd; + memcpy(req->gfid, fd->inode->gfid, 16); +@@ -2892,7 +2913,8 @@ client_pre_lk_v2(xlator_t *this, gfx_lk_req *req, int32_t cmd, + int32_t gf_type = 0; + int ret = 0; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_LK, out); + + ret = client_cmd_to_gf_cmd(cmd, &gf_cmd); + if (ret) { +@@ -2967,7 +2989,8 @@ client_pre_readdir_v2(xlator_t *this, gfx_readdir_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_READDIR, out); + + req->size = size; + req->offset = offset; +@@ -3048,7 +3071,7 @@ client_pre_finodelk_v2(xlator_t *this, gfx_finodelk_req *req, fd_t *fd, int cmd, + int32_t gf_cmd = 0; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FINODELK, out); + + if (cmd == F_GETLK || cmd == F_GETLK64) + gf_cmd = GF_LK_GETLK; +@@ -3129,7 +3152,8 @@ client_pre_fentrylk_v2(xlator_t *this, gfx_fentrylk_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FENTRYLK, out); + + req->fd = remote_fd; + req->cmd = cmd_entrylk; +@@ -3185,7 +3209,7 @@ client_pre_fxattrop_v2(xlator_t *this, gfx_fxattrop_req *req, fd_t *fd, + int64_t remote_fd = -1; + + CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno, +- out); ++ GFS3_OP_FXATTROP, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -3207,7 +3231,8 @@ client_pre_fgetxattr_v2(xlator_t *this, gfx_fgetxattr_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FGETXATTR, out); + + req->namelen = 1; /* Use it as a flag */ + req->fd = remote_fd; +@@ -3232,7 +3257,8 @@ client_pre_fsetxattr_v2(xlator_t *this, gfx_fsetxattr_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSETXATTR, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -3256,7 +3282,8 @@ client_pre_rchecksum_v2(xlator_t *this, gfx_rchecksum_req *req, fd_t *fd, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, 
remote_fd, op_errno, ++ GFS3_OP_RCHECKSUM, out); + + req->len = len; + req->offset = offset; +@@ -3304,7 +3331,8 @@ client_pre_fsetattr_v2(xlator_t *this, gfx_fsetattr_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FSETATTR, out); + + memcpy(req->gfid, fd->inode->gfid, 16); + req->fd = remote_fd; +@@ -3324,7 +3352,8 @@ client_pre_readdirp_v2(xlator_t *this, gfx_readdirp_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_READDIRP, out); + + req->size = size; + req->offset = offset; +@@ -3349,7 +3378,8 @@ client_pre_fremovexattr_v2(xlator_t *this, gfx_fremovexattr_req *req, fd_t *fd, + if (!(fd && fd->inode)) + goto out; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FREMOVEXATTR, out); + + memcpy(req->gfid, fd->inode->gfid, 16); + req->name = (char *)name; +@@ -3369,7 +3399,8 @@ client_pre_fallocate_v2(xlator_t *this, gfx_fallocate_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_FALLOCATE, out); + + req->fd = remote_fd; + req->flags = flags; +@@ -3390,7 +3421,8 @@ client_pre_discard_v2(xlator_t *this, gfx_discard_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_DISCARD, out); + + req->fd = remote_fd; + req->offset = offset; +@@ -3410,7 +3442,8 @@ client_pre_zerofill_v2(xlator_t *this, gfx_zerofill_req *req, fd_t *fd, + int op_errno = ESTALE; + int64_t remote_fd = -1; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_ZEROFILL, out); + + req->fd = remote_fd; + req->offset = offset; +@@ -3439,7 +3472,8 @@ client_pre_seek_v2(xlator_t *this, gfx_seek_req *req, fd_t *fd, off_t offset, + int64_t remote_fd = -1; + int op_errno = ESTALE; + +- CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out); ++ CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, ++ GFS3_OP_SEEK, out); + + memcpy(req->gfid, fd->inode->gfid, 16); + req->fd = remote_fd; +@@ -3587,3 +3621,25 @@ client_post_rename_v2(xlator_t *this, gfx_rename_rsp *rsp, struct iatt *stbuf, + + return xdr_to_dict(&rsp->xdata, xdata); + } ++ ++void ++set_fd_reopen_status(xlator_t *this, dict_t *xdata, ++ enum gf_fd_reopen_status fd_reopen_status) ++{ ++ clnt_conf_t *conf = NULL; ++ ++ conf = this->private; ++ if (!conf) { ++ gf_msg_debug(this->name, ENOMEM, "Failed to get client conf"); ++ return; ++ } ++ ++ if (!conf->strict_locks) ++ fd_reopen_status = FD_REOPEN_ALLOWED; ++ ++ if (dict_set_int32(xdata, "fd-reopen-status", fd_reopen_status)) ++ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, PC_MSG_DICT_SET_FAILED, ++ NULL); ++ ++ return; ++} +diff --git a/xlators/protocol/client/src/client-common.h 
b/xlators/protocol/client/src/client-common.h +index a2043d8..16fb167 100644 +--- a/xlators/protocol/client/src/client-common.h ++++ b/xlators/protocol/client/src/client-common.h +@@ -627,4 +627,8 @@ client_pre_copy_file_range_v2(xlator_t *this, gfx_copy_file_range_req *req, + off64_t off_out, size_t size, int32_t flags, + dict_t **xdata); + ++void ++set_fd_reopen_status(xlator_t *this, dict_t *xdata, ++ enum gf_fd_reopen_status fd_reopen_allowed); ++ + #endif /* __CLIENT_COMMON_H__ */ +diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c +index 6543100..48b6448 100644 +--- a/xlators/protocol/client/src/client-helpers.c ++++ b/xlators/protocol/client/src/client-helpers.c +@@ -406,11 +406,12 @@ clnt_readdir_rsp_cleanup_v2(gfx_readdir_rsp *rsp) + } + + int +-client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd) ++client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd, ++ enum gf_fop_procnum fop) + { + clnt_fd_ctx_t *fdctx = NULL; + clnt_conf_t *conf = NULL; +- gf_boolean_t locks_held = _gf_false; ++ gf_boolean_t locks_involved = _gf_false; + + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, remote_fd, out); +@@ -423,23 +424,32 @@ client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd) + if (fd->anonymous) { + *remote_fd = GF_ANON_FD_NO; + } else { ++ if (conf->strict_locks && ++ (fop == GFS3_OP_WRITE || fop == GFS3_OP_FTRUNCATE || ++ fop == GFS3_OP_FALLOCATE || fop == GFS3_OP_ZEROFILL || ++ fop == GFS3_OP_DISCARD)) { ++ locks_involved = _gf_true; ++ } + *remote_fd = -1; + gf_msg_debug(this->name, EBADF, "not a valid fd for gfid: %s", + uuid_utoa(fd->inode->gfid)); + } + } else { +- if (__is_fd_reopen_in_progress(fdctx)) ++ if (__is_fd_reopen_in_progress(fdctx)) { + *remote_fd = -1; +- else ++ } else { + *remote_fd = fdctx->remote_fd; ++ } + +- locks_held = !list_empty(&fdctx->lock_list); ++ locks_involved = !list_empty(&fdctx->lock_list); + } + } + pthread_spin_unlock(&conf->fd_lock); + +- if ((flags & FALLBACK_TO_ANON_FD) && (*remote_fd == -1) && (!locks_held)) ++ if ((flags & FALLBACK_TO_ANON_FD) && (*remote_fd == -1) && ++ (!locks_involved)) { + *remote_fd = GF_ANON_FD_NO; ++ } + + return 0; + out: +diff --git a/xlators/protocol/client/src/client-rpc-fops.c b/xlators/protocol/client/src/client-rpc-fops.c +index 3110c78..46ac544 100644 +--- a/xlators/protocol/client/src/client-rpc-fops.c ++++ b/xlators/protocol/client/src/client-rpc-fops.c +@@ -2439,6 +2439,13 @@ client3_3_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + } + } + ++ if (local->check_reopen) { ++ if (lock.l_type == F_WRLCK) ++ set_fd_reopen_status(this, xdata, FD_REOPEN_NOT_ALLOWED); ++ else ++ set_fd_reopen_status(this, xdata, FD_REOPEN_ALLOWED); ++ } ++ + out: + if ((rsp.op_ret == -1) && (EAGAIN != gf_error_to_errno(rsp.op_errno))) { + gf_msg(this->name, GF_LOG_WARNING, gf_error_to_errno(rsp.op_errno), +@@ -5198,6 +5205,7 @@ client3_3_lk(call_frame_t *frame, xlator_t *this, void *data) + 0, + }, + }; ++ dict_t *xdata = NULL; + int32_t gf_cmd = 0; + clnt_local_t *local = NULL; + clnt_conf_t *conf = NULL; +@@ -5224,6 +5232,10 @@ client3_3_lk(call_frame_t *frame, xlator_t *this, void *data) + goto unwind; + } + ++ ret = dict_get_int32(args->xdata, "fd-reopen-status", &local->check_reopen); ++ if (ret) ++ local->check_reopen = 0; ++ + local->owner = frame->root->lk_owner; + local->cmd = args->cmd; + local->fd = fd_ref(args->fd); +@@ -5237,6 +5249,13 @@ 
client3_3_lk(call_frame_t *frame, xlator_t *this, void *data) + client_is_setlk(local->cmd)) { + client_add_lock_for_recovery(local->fd, args->flock, &local->owner, + local->cmd); ++ } else if (local->check_reopen) { ++ xdata = dict_new(); ++ if (xdata == NULL) { ++ op_errno = ENOMEM; ++ goto unwind; ++ } ++ set_fd_reopen_status(this, xdata, FD_BAD); + } + + goto unwind; +@@ -5254,8 +5273,10 @@ client3_3_lk(call_frame_t *frame, xlator_t *this, void *data) + + return 0; + unwind: +- CLIENT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); ++ CLIENT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, xdata); + GF_FREE(req.xdata.xdata_val); ++ if (xdata) ++ dict_unref(xdata); + + return 0; + } +diff --git a/xlators/protocol/client/src/client-rpc-fops_v2.c b/xlators/protocol/client/src/client-rpc-fops_v2.c +index 954fc58..d0055e9 100644 +--- a/xlators/protocol/client/src/client-rpc-fops_v2.c ++++ b/xlators/protocol/client/src/client-rpc-fops_v2.c +@@ -2234,6 +2234,13 @@ client4_0_lk_cbk(struct rpc_req *req, struct iovec *iov, int count, + } + } + ++ if (local->check_reopen) { ++ if (lock.l_type == F_WRLCK) ++ set_fd_reopen_status(this, xdata, FD_REOPEN_NOT_ALLOWED); ++ else ++ set_fd_reopen_status(this, xdata, FD_REOPEN_ALLOWED); ++ } ++ + out: + if ((rsp.op_ret == -1) && (EAGAIN != gf_error_to_errno(rsp.op_errno))) { + gf_msg(this->name, GF_LOG_WARNING, gf_error_to_errno(rsp.op_errno), +@@ -4759,6 +4766,7 @@ client4_0_lk(call_frame_t *frame, xlator_t *this, void *data) + 0, + }, + }; ++ dict_t *xdata = NULL; + int32_t gf_cmd = 0; + clnt_local_t *local = NULL; + clnt_conf_t *conf = NULL; +@@ -4785,6 +4793,10 @@ client4_0_lk(call_frame_t *frame, xlator_t *this, void *data) + goto unwind; + } + ++ ret = dict_get_int32(args->xdata, "fd-reopen-status", &local->check_reopen); ++ if (ret) ++ local->check_reopen = 0; ++ + local->owner = frame->root->lk_owner; + local->cmd = args->cmd; + local->fd = fd_ref(args->fd); +@@ -4798,6 +4810,13 @@ client4_0_lk(call_frame_t *frame, xlator_t *this, void *data) + client_is_setlk(local->cmd)) { + client_add_lock_for_recovery(local->fd, args->flock, &local->owner, + local->cmd); ++ } else if (local->check_reopen) { ++ xdata = dict_new(); ++ if (xdata == NULL) { ++ op_errno = ENOMEM; ++ goto unwind; ++ } ++ set_fd_reopen_status(this, xdata, FD_BAD); + } + + goto unwind; +@@ -4815,8 +4834,10 @@ client4_0_lk(call_frame_t *frame, xlator_t *this, void *data) + + return 0; + unwind: +- CLIENT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); ++ CLIENT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, xdata); + GF_FREE(req.xdata.pairs.pairs_val); ++ if (xdata) ++ dict_unref(xdata); + + return 0; + } +@@ -6094,7 +6115,7 @@ client4_0_rchecksum(call_frame_t *frame, xlator_t *this, void *data) + conf = this->private; + + CLIENT_GET_REMOTE_FD(this, args->fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, +- unwind); ++ GFS3_OP_RCHECKSUM, unwind); + + req.len = args->len; + req.offset = args->offset; +diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c +index 63c90ea..35a5340 100644 +--- a/xlators/protocol/client/src/client.c ++++ b/xlators/protocol/client/src/client.c +@@ -864,9 +864,11 @@ int32_t + client_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) + { +- int ret = -1; ++ int ret = 0; ++ int op_errno = ENOTCONN; + clnt_conf_t *conf = NULL; + rpc_clnt_procedure_t *proc = NULL; ++ clnt_fd_ctx_t *fdctx = NULL; + clnt_args_t args = { + 0, + }; +@@ -875,6 +877,21 @@ client_open(call_frame_t *frame, xlator_t 
*this, loc_t *loc, int32_t flags, + if (!conf || !conf->fops) + goto out; + ++ if (conf->strict_locks) { ++ pthread_spin_lock(&conf->fd_lock); ++ { ++ fdctx = this_fd_get_ctx(fd, this); ++ if (fdctx && !list_empty(&fdctx->lock_list)) { ++ ret = -1; ++ op_errno = EBADFD; ++ } ++ } ++ pthread_spin_unlock(&conf->fd_lock); ++ ++ if (ret) ++ goto out; ++ } ++ + args.loc = loc; + args.fd = fd; + args.xdata = xdata; +@@ -888,7 +905,7 @@ client_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + + out: + if (ret) +- STACK_UNWIND_STRICT(open, frame, -1, ENOTCONN, NULL, NULL); ++ STACK_UNWIND_STRICT(open, frame, -1, op_errno, NULL, NULL); + + return 0; + } +diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h +index bde3d1a..2a50625 100644 +--- a/xlators/protocol/client/src/client.h ++++ b/xlators/protocol/client/src/client.h +@@ -98,10 +98,10 @@ typedef enum { + free(_this_rsp->xdata.xdata_val); \ + } while (0) + +-#define CLIENT_GET_REMOTE_FD(xl, fd, flags, remote_fd, op_errno, label) \ ++#define CLIENT_GET_REMOTE_FD(xl, fd, flags, remote_fd, op_errno, fop, label) \ + do { \ + int _ret = 0; \ +- _ret = client_get_remote_fd(xl, fd, flags, &remote_fd); \ ++ _ret = client_get_remote_fd(xl, fd, flags, &remote_fd, fop); \ + if (_ret < 0) { \ + op_errno = errno; \ + goto label; \ +@@ -286,6 +286,7 @@ typedef struct client_local { + client_posix_lock_t *client_lock; + gf_lkowner_t owner; + int32_t cmd; ++ int32_t check_reopen; + struct list_head lock_list; + pthread_mutex_t mutex; + char *name; +@@ -435,7 +436,8 @@ client_default_reopen_done(clnt_fd_ctx_t *fdctx, int64_t rfd, xlator_t *this); + void + client_attempt_reopen(fd_t *fd, xlator_t *this); + int +-client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd); ++client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd, ++ enum gf_fop_procnum fop); + int + client_fd_fop_prepare_local(call_frame_t *frame, fd_t *fd, int64_t remote_fd); + gf_boolean_t +-- +1.8.3.1 + diff --git a/SOURCES/0582-protocol-client-Fix-lock-memory-leak.patch b/SOURCES/0582-protocol-client-Fix-lock-memory-leak.patch new file mode 100644 index 0000000..3fd1dae --- /dev/null +++ b/SOURCES/0582-protocol-client-Fix-lock-memory-leak.patch @@ -0,0 +1,501 @@ +From adeec3d5d85baad8b50d203f34a47ad5360d7cd7 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Mon, 7 Jun 2021 18:36:11 +0530 +Subject: [PATCH 582/584] protocol/client: Fix lock memory leak + +Problem-1: +When an overlapping lock is issued the merged lock is not assigned the +owner. When flush is issued on the fd, this particular lock is not freed +leading to memory leak + +Fix-1: +Assign the owner while merging the locks. + +Problem-2: +On fd-destroy lock structs could be present in fdctx. For some reason +with flock -x command and closing of the bash fd, it leads to this code +path. Which leaks the lock structs. + +Fix-2: +When fdctx is being destroyed in client, make sure to cleanup any lock +structs. 
+ +> Upstream patch: https://github.com/gluster/glusterfs/pull/2338/commits/926402f639471d2664bf00c6692221ba297c525f +> fixes: gluster#2337 +> Change-Id: I298124213ce5a1cf2b1f1756d5e8a9745d9c0a1c +> Signed-off-by: Pranith Kumar K + +BUG: 1689375 +Change-Id: I298124213ce5a1cf2b1f1756d5e8a9745d9c0a1c +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245603 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/client/issue-2337-lock-mem-leak.c | 52 ++++++++++++++++++ + tests/bugs/client/issue-2337-lock-mem-leak.t | 42 ++++++++++++++ + tests/bugs/replicate/do-not-reopen-fd.t | 65 ++++++++++++++-------- + tests/volume.rc | 8 +++ + xlators/protocol/client/src/client-helpers.c | 10 ++++ + xlators/protocol/client/src/client-lk.c | 82 ++++++++++++++++++---------- + xlators/protocol/client/src/client.h | 8 ++- + 7 files changed, 213 insertions(+), 54 deletions(-) + create mode 100644 tests/bugs/client/issue-2337-lock-mem-leak.c + create mode 100644 tests/bugs/client/issue-2337-lock-mem-leak.t + +diff --git a/tests/bugs/client/issue-2337-lock-mem-leak.c b/tests/bugs/client/issue-2337-lock-mem-leak.c +new file mode 100644 +index 0000000..d4e02a7 +--- /dev/null ++++ b/tests/bugs/client/issue-2337-lock-mem-leak.c +@@ -0,0 +1,52 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int ++main(int argc, char *argv[]) ++{ ++ int fd = -1; ++ char *filename = NULL; ++ struct flock lock = { ++ 0, ++ }; ++ int i = 0; ++ int ret = -1; ++ ++ if (argc != 2) { ++ fprintf(stderr, "Usage: %s ", argv[0]); ++ goto out; ++ } ++ ++ filename = argv[1]; ++ ++ fd = open(filename, O_RDWR | O_CREAT, 0); ++ if (fd < 0) { ++ fprintf(stderr, "open (%s) failed (%s)\n", filename, strerror(errno)); ++ goto out; ++ } ++ ++ lock.l_type = F_WRLCK; ++ lock.l_whence = SEEK_SET; ++ lock.l_len = 2; ++ ++ while (i < 100) { ++ lock.l_start = i; ++ ret = fcntl(fd, F_SETLK, &lock); ++ if (ret < 0) { ++ fprintf(stderr, "fcntl setlk failed (%s)\n", strerror(errno)); ++ goto out; ++ } ++ ++ i++; ++ } ++ ++ ret = 0; ++ ++out: ++ return ret; ++} +diff --git a/tests/bugs/client/issue-2337-lock-mem-leak.t b/tests/bugs/client/issue-2337-lock-mem-leak.t +new file mode 100644 +index 0000000..64132a2 +--- /dev/null ++++ b/tests/bugs/client/issue-2337-lock-mem-leak.t +@@ -0,0 +1,42 @@ ++#!/bin/bash ++ ++#Test that lock fop is not leaking any memory for overlapping regions ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../fileio.rc ++ ++cleanup; ++ ++LOCK_TEST=$(dirname $0)/issue-2337-lock-mem-leak ++build_tester $(dirname $0)/issue-2337-lock-mem-leak.c -o ${LOCK_TEST} ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/${V0}1 ++#Guard against flush-behind ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume start $V0 ++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 ++ ++TEST touch $M0/a ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'w' $M0/a ++TEST flock -x $fd1 ++statedump=$(generate_mount_statedump $V0 $M0) ++EXPECT_NOT "^nostatedump$" echo $statedump ++#Making sure no one changes this mem-tracker name ++TEST grep gf_client_mt_clnt_lock_t $statedump ++TEST fd_close $fd1 ++ ++statedump=$(generate_mount_statedump $V0 $M0) ++EXPECT_NOT "^nostatedump$" echo $statedump ++TEST ! 
grep gf_client_mt_clnt_lock_t $statedump ++ ++TEST ${LOCK_TEST} $M0/a ++ ++statedump=$(generate_mount_statedump $V0 $M0) ++EXPECT_NOT "^nostatedump$" echo $statedump ++TEST ! grep gf_client_mt_clnt_lock_t $statedump ++TEST cleanup_mount_statedump $V0 ++TEST rm ${LOCK_TEST} ++cleanup +diff --git a/tests/bugs/replicate/do-not-reopen-fd.t b/tests/bugs/replicate/do-not-reopen-fd.t +index 76d8e70..13b5218 100644 +--- a/tests/bugs/replicate/do-not-reopen-fd.t ++++ b/tests/bugs/replicate/do-not-reopen-fd.t +@@ -45,13 +45,17 @@ EXPECT "data-2" cat $B0/${V0}2/a + gfid_a=$(gf_get_gfid_xattr $B0/${V0}0/a) + gfid_str_a=$(gf_gfid_xattr_to_str $gfid_a) + +-EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + TEST fd2=`fd_available` + TEST fd_open $fd2 'rw' $M1/a + ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^2$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^2$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ + # Kill 2nd brick and try writing to the file. The write should fail due to + # quorum failure. + TEST kill_brick $V0 $H0 $B0/${V0}1 +@@ -66,6 +70,9 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 + TEST ! fd_write $fd1 "data-4" + TEST ! fd_cat $fd1 ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^2$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + # Enable heal and check the files will have same content on all the bricks after + # the heal is completed. +@@ -89,7 +96,9 @@ TEST ! fd_write $fd1 "data-5" + + # At this point only one brick will have the lock. Try taking the lock again on + # the bad fd, which should also fail with EBADFD. +-TEST ! flock -x $fd1 ++# TODO: At the moment quorum failure in lk leads to unlock on the bricks where ++# lock succeeds. This will change lock state on 3rd brick, commenting for now ++#TEST ! flock -x $fd1 + + # Kill the only brick that is having lock and try taking lock on another client + # which should succeed. +@@ -97,15 +106,25 @@ TEST kill_brick $V0 $H0 $B0/${V0}2 + EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 2 + TEST flock -x $fd2 + TEST fd_write $fd2 "data-6" ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++ + + # Bring the brick up and try writing & reading on the old fd, which should still + # fail and operations on the 2nd fd should succeed. 
+ TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}2 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M1 $V0-replicate-0 2 ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + TEST ! fd_write $fd1 "data-7" + + TEST ! fd_cat $fd1 ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + TEST fd_cat $fd2 + + # Close both the fds which will release the locks and then re-open and take lock +@@ -113,17 +132,15 @@ TEST fd_cat $fd2 + TEST fd_close $fd1 + TEST fd_close $fd2 + +-TEST ! ls /proc/$$/fd/$fd1 +-TEST ! ls /proc/$$/fd/$fd2 +-EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT_WITHIN $REOPEN_TIMEOUT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a +-EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT_WITHIN $REOPEN_TIMEOUT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + TEST flock -x $fd1 + TEST fd_write $fd1 "data-8" +@@ -134,6 +151,10 @@ EXPECT "data-8" head -n 1 $B0/${V0}1/a + EXPECT "data-8" head -n 1 $B0/${V0}2/a + + TEST fd_close $fd1 ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ + + # Heal the volume + TEST $CLI volume heal $V0 enable +@@ -152,9 +173,9 @@ EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replica + TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a + +-EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + # Restart the brick and then write. 
Now fd should get re-opened and write should + # succeed on the previously down brick as well since there are no locks held on +@@ -163,7 +184,7 @@ TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd_write $fd1 "data-10" +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + + EXPECT "data-10" head -n 1 $B0/${V0}0/a + EXPECT "data-10" head -n 1 $B0/${V0}1/a +@@ -177,9 +198,9 @@ TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a + TEST flock -x $fd1 + +-EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + # Kill & restart another brick so that it will return EBADFD + TEST kill_brick $V0 $H0 $B0/${V0}1 +@@ -194,9 +215,9 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 + TEST ! fd_write $fd1 "data-11" +-EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +-EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "Y" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + EXPECT "data-10" head -n 1 $B0/${V0}0/a + EXPECT "data-10" head -n 1 $B0/${V0}1/a +diff --git a/tests/volume.rc b/tests/volume.rc +index f5dd0b1..17c3835 100644 +--- a/tests/volume.rc ++++ b/tests/volume.rc +@@ -407,6 +407,14 @@ function gf_check_file_opened_in_brick { + fi + } + ++function gf_open_file_count_in_brick { ++ vol=$1 ++ host=$2 ++ brick=$3 ++ realpath=$4 ++ ls -l /proc/$(get_brick_pid $vol $host $brick)/fd | grep "${realpath}$" | wc -l ++} ++ + function gf_get_gfid_backend_file_path { + brickpath=$1 + filepath_in_brick=$2 +diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c +index 48b6448..a80f303 100644 +--- a/xlators/protocol/client/src/client-helpers.c ++++ b/xlators/protocol/client/src/client-helpers.c +@@ -3156,11 +3156,14 @@ client_fdctx_destroy(xlator_t *this, clnt_fd_ctx_t *fdctx) + int32_t ret = -1; + char parent_down = 0; + fd_lk_ctx_t *lk_ctx = NULL; ++ gf_lkowner_t null_owner = {0}; ++ struct list_head deleted_list; + + GF_VALIDATE_OR_GOTO("client", this, out); + GF_VALIDATE_OR_GOTO(this->name, fdctx, out); + + conf = (clnt_conf_t *)this->private; ++ INIT_LIST_HEAD(&deleted_list); + + if (fdctx->remote_fd == -1) { + gf_msg_debug(this->name, 0, "not a valid fd"); +@@ -3174,6 +3177,13 @@ client_fdctx_destroy(xlator_t *this, clnt_fd_ctx_t *fdctx) + pthread_mutex_unlock(&conf->lock); + lk_ctx = fdctx->lk_ctx; + fdctx->lk_ctx = NULL; ++ pthread_spin_lock(&conf->fd_lock); ++ { ++ __delete_granted_locks_owner_from_fdctx(fdctx, &null_owner, ++ 
&deleted_list); ++ } ++ pthread_spin_unlock(&conf->fd_lock); ++ destroy_client_locks_from_list(&deleted_list); + + if (lk_ctx) + fd_lk_ctx_unref(lk_ctx); +diff --git a/xlators/protocol/client/src/client-lk.c b/xlators/protocol/client/src/client-lk.c +index c1fb055..cb4e894 100644 +--- a/xlators/protocol/client/src/client-lk.c ++++ b/xlators/protocol/client/src/client-lk.c +@@ -253,6 +253,7 @@ __insert_and_merge(clnt_fd_ctx_t *fdctx, client_posix_lock_t *lock) + sum = add_locks(lock, conf); + + sum->fd = lock->fd; ++ sum->owner = conf->owner; + + __delete_client_lock(conf); + __destroy_client_lock(conf); +@@ -320,56 +321,77 @@ destroy_client_lock(client_posix_lock_t *lock) + GF_FREE(lock); + } + +-int32_t +-delete_granted_locks_owner(fd_t *fd, gf_lkowner_t *owner) ++void ++destroy_client_locks_from_list(struct list_head *deleted) + { +- clnt_fd_ctx_t *fdctx = NULL; + client_posix_lock_t *lock = NULL; + client_posix_lock_t *tmp = NULL; +- xlator_t *this = NULL; +- clnt_conf_t *conf = NULL; +- +- struct list_head delete_list; +- int ret = 0; ++ xlator_t *this = THIS; + int count = 0; + +- INIT_LIST_HEAD(&delete_list); +- this = THIS; +- conf = this->private; ++ list_for_each_entry_safe(lock, tmp, deleted, list) ++ { ++ list_del_init(&lock->list); ++ destroy_client_lock(lock); ++ count++; ++ } + +- pthread_spin_lock(&conf->fd_lock); ++ /* FIXME: Need to actually print the locks instead of count */ ++ gf_msg_trace(this->name, 0, "Number of locks cleared=%d", count); ++} + +- fdctx = this_fd_get_ctx(fd, this); +- if (!fdctx) { +- pthread_spin_unlock(&conf->fd_lock); ++void ++__delete_granted_locks_owner_from_fdctx(clnt_fd_ctx_t *fdctx, ++ gf_lkowner_t *owner, ++ struct list_head *deleted) ++{ ++ client_posix_lock_t *lock = NULL; ++ client_posix_lock_t *tmp = NULL; + +- gf_msg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_FD_CTX_INVALID, +- "fdctx not valid"); +- ret = -1; +- goto out; ++ gf_boolean_t is_null_lkowner = _gf_false; ++ ++ if (is_lk_owner_null(owner)) { ++ is_null_lkowner = _gf_true; + } + + list_for_each_entry_safe(lock, tmp, &fdctx->lock_list, list) + { +- if (is_same_lkowner(&lock->owner, owner)) { ++ if (is_null_lkowner || is_same_lkowner(&lock->owner, owner)) { + list_del_init(&lock->list); +- list_add_tail(&lock->list, &delete_list); +- count++; ++ list_add_tail(&lock->list, deleted); + } + } ++} + +- pthread_spin_unlock(&conf->fd_lock); ++int32_t ++delete_granted_locks_owner(fd_t *fd, gf_lkowner_t *owner) ++{ ++ clnt_fd_ctx_t *fdctx = NULL; ++ xlator_t *this = NULL; ++ clnt_conf_t *conf = NULL; ++ int ret = 0; ++ struct list_head deleted_locks; + +- if (!list_empty(&delete_list)) { +- list_for_each_entry_safe(lock, tmp, &delete_list, list) +- { +- list_del_init(&lock->list); +- destroy_client_lock(lock); ++ this = THIS; ++ conf = this->private; ++ INIT_LIST_HEAD(&deleted_locks); ++ ++ pthread_spin_lock(&conf->fd_lock); ++ { ++ fdctx = this_fd_get_ctx(fd, this); ++ if (!fdctx) { ++ pthread_spin_unlock(&conf->fd_lock); ++ ++ gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_FD_CTX_INVALID, ++ NULL); ++ ret = -1; ++ goto out; + } ++ __delete_granted_locks_owner_from_fdctx(fdctx, owner, &deleted_locks); + } ++ pthread_spin_unlock(&conf->fd_lock); + +- /* FIXME: Need to actually print the locks instead of count */ +- gf_msg_trace(this->name, 0, "Number of locks cleared=%d", count); ++ destroy_client_locks_from_list(&deleted_locks); + + out: + return ret; +diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h +index 2a50625..f952aea 100644 +--- 
a/xlators/protocol/client/src/client.h ++++ b/xlators/protocol/client/src/client.h +@@ -406,8 +406,12 @@ int + client_attempt_lock_recovery(xlator_t *this, clnt_fd_ctx_t *fdctx); + int32_t + delete_granted_locks_owner(fd_t *fd, gf_lkowner_t *owner); +-int32_t +-delete_granted_locks_fd(clnt_fd_ctx_t *fdctx); ++void ++__delete_granted_locks_owner_from_fdctx(clnt_fd_ctx_t *fdctx, ++ gf_lkowner_t *owner, ++ struct list_head *deleted); ++void ++destroy_client_locks_from_list(struct list_head *deleted); + int32_t + client_cmd_to_gf_cmd(int32_t cmd, int32_t *gf_cmd); + void +-- +1.8.3.1 + diff --git a/SOURCES/0583-protocol-client-Initialize-list-head-to-prevent-NULL.patch b/SOURCES/0583-protocol-client-Initialize-list-head-to-prevent-NULL.patch new file mode 100644 index 0000000..1ac1777 --- /dev/null +++ b/SOURCES/0583-protocol-client-Initialize-list-head-to-prevent-NULL.patch @@ -0,0 +1,138 @@ +From f114ba25fab57d1ab9a51fc1f101f2b5571f167a Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Mon, 7 Jun 2021 19:24:55 +0530 +Subject: [PATCH 583/584] protocol/client: Initialize list head to prevent NULL + de-reference + +> Upstream patch: https://github.com/gluster/glusterfs/pull/2456/commits/00761df0cd14833ff256b69dba7cf8e2b699554c +> fixes: #2443 +> Change-Id: I86ef0270d41d6fb924db97fde3196d7c98c8b564 +> Signed-off-by: Pranith Kumar K + +BUG: 1689375 +Change-Id: I86ef0270d41d6fb924db97fde3196d7c98c8b564 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245613 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/locks/issue-2443-crash.c | 67 +++++++++++++++++++++++++++++++++ + tests/bugs/locks/issue-2443-crash.t | 18 +++++++++ + xlators/protocol/client/src/client-lk.c | 1 + + 3 files changed, 86 insertions(+) + create mode 100644 tests/bugs/locks/issue-2443-crash.c + create mode 100644 tests/bugs/locks/issue-2443-crash.t + +diff --git a/tests/bugs/locks/issue-2443-crash.c b/tests/bugs/locks/issue-2443-crash.c +new file mode 100644 +index 0000000..5f580bf +--- /dev/null ++++ b/tests/bugs/locks/issue-2443-crash.c +@@ -0,0 +1,67 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int ++main(int argc, char *argv[]) ++{ ++ int fd = -1; ++ char *filename = NULL; ++ struct flock lock = { ++ 0, ++ }; ++ int i = 0; ++ int ret = -1; ++ ++ if (argc != 2) { ++ fprintf(stderr, "Usage: %s ", argv[0]); ++ goto out; ++ } ++ ++ filename = argv[1]; ++ ++ fd = open(filename, O_RDWR | O_CREAT, 0); ++ if (fd < 0) { ++ fprintf(stderr, "open (%s) failed (%s)\n", filename, strerror(errno)); ++ goto out; ++ } ++ ++ lock.l_start = 0; ++ lock.l_type = F_RDLCK; ++ lock.l_whence = SEEK_SET; ++ lock.l_len = 2; ++ ++ ret = fcntl(fd, F_SETLK, &lock); ++ if (ret < 0) { ++ fprintf(stderr, "fcntl setlk failed (%s)\n", strerror(errno)); ++ goto out; ++ } ++ ++ lock.l_start = 2; ++ lock.l_type = F_WRLCK; ++ lock.l_whence = SEEK_SET; ++ lock.l_len = 2; ++ ++ ret = fcntl(fd, F_SETLK, &lock); ++ if (ret < 0) { ++ fprintf(stderr, "fcntl setlk failed (%s)\n", strerror(errno)); ++ goto out; ++ } ++ ++ lock.l_start = 0; ++ lock.l_type = F_RDLCK; ++ lock.l_whence = SEEK_SET; ++ lock.l_len = 4; ++ ++ ret = fcntl(fd, F_SETLK, &lock); ++ if (ret < 0) { ++ fprintf(stderr, "fcntl setlk failed (%s)\n", strerror(errno)); ++ goto out; ++ } ++out: ++ return ret; ++} +diff --git a/tests/bugs/locks/issue-2443-crash.t b/tests/bugs/locks/issue-2443-crash.t +new file mode 100644 +index 
0000000..162a4d7 +--- /dev/null ++++ b/tests/bugs/locks/issue-2443-crash.t +@@ -0,0 +1,18 @@ ++#!/bin/bash ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/brick0 ++TEST $CLI volume start $V0 ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++ ++build_tester $(dirname $0)/issue-2443-crash.c ++TEST mv $(dirname $0)/issue-2443-crash $M0 ++cd $M0 ++TEST ./issue-2443-crash a ++ ++cd - ++cleanup; +diff --git a/xlators/protocol/client/src/client-lk.c b/xlators/protocol/client/src/client-lk.c +index cb4e894..37c1d35 100644 +--- a/xlators/protocol/client/src/client-lk.c ++++ b/xlators/protocol/client/src/client-lk.c +@@ -101,6 +101,7 @@ add_locks(client_posix_lock_t *l1, client_posix_lock_t *l2) + sum = GF_CALLOC(1, sizeof(*sum), gf_client_mt_clnt_lock_t); + if (!sum) + return NULL; ++ INIT_LIST_HEAD(&sum->list); + + sum->fl_start = min(l1->fl_start, l2->fl_start); + sum->fl_end = max(l1->fl_end, l2->fl_end); +-- +1.8.3.1 + diff --git a/SOURCES/0584-dht-fixing-xattr-inconsistency.patch b/SOURCES/0584-dht-fixing-xattr-inconsistency.patch new file mode 100644 index 0000000..bf2c6b9 --- /dev/null +++ b/SOURCES/0584-dht-fixing-xattr-inconsistency.patch @@ -0,0 +1,429 @@ +From 2c6c4ad77ba5511a62846af932840deb5bc389ae Mon Sep 17 00:00:00 2001 +From: Tamar Shacked +Date: Mon, 7 Jun 2021 12:25:57 +0300 +Subject: [PATCH 584/584] dht - fixing xattr inconsistency + +The scenario of setting an xattr to a dir, killing one of the bricks, +removing the xattr, bringing back the brick results in xattr +inconsistency - The downed brick will still have the xattr, but the rest +won't. +This patch add a mechanism that will remove the extra xattrs during +lookup. + +Backport of: +> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/24687/ +> fixes: #1324 +> Change-Id: Ifec0b7aea6cd40daa8b0319b881191cf83e031d1 +> Signed-off-by: Barak Sason Rofman + +BUG: 1600379 +Change-Id: I588f69b283e5354cd362d74486d6ec6d226ecc96 +Signed-off-by: Tamar Shacked +Signed-off-by: srijan-sivakumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/245560 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/common-utils.c | 20 +++++++- + libglusterfs/src/glusterfs/common-utils.h | 6 +++ + tests/bugs/distribute/bug-1600379.t | 54 ++++++++++++++++++++ + xlators/cluster/dht/src/dht-common.c | 14 ++---- + xlators/cluster/dht/src/dht-common.h | 4 -- + xlators/cluster/dht/src/dht-helper.c | 4 ++ + xlators/cluster/dht/src/dht-selfheal.c | 11 ++++ + xlators/storage/posix/src/posix-helpers.c | 19 +++++++ + xlators/storage/posix/src/posix-inode-fd-ops.c | 69 ++++++++++++++++++++++++++ + xlators/storage/posix/src/posix.h | 3 ++ + 10 files changed, 189 insertions(+), 15 deletions(-) + create mode 100644 tests/bugs/distribute/bug-1600379.t + +diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c +index c2dfe28..d8b7c6e 100644 +--- a/libglusterfs/src/common-utils.c ++++ b/libglusterfs/src/common-utils.c +@@ -54,6 +54,7 @@ + #include "xxhash.h" + #include + #include "glusterfs/libglusterfs-messages.h" ++#include "glusterfs/glusterfs-acl.h" + #include "protocol-common.h" + #ifdef __FreeBSD__ + #include +@@ -82,12 +83,21 @@ gf_boolean_t gf_signal_on_assert = false; + typedef int32_t (*rw_op_t)(int32_t fd, char *buf, int32_t size); + typedef int32_t (*rwv_op_t)(int32_t fd, const struct iovec *buf, int32_t size); + +-void gf_assert(void) 
++char *xattrs_to_heal[] = {"user.", ++ POSIX_ACL_ACCESS_XATTR, ++ POSIX_ACL_DEFAULT_XATTR, ++ QUOTA_LIMIT_KEY, ++ QUOTA_LIMIT_OBJECTS_KEY, ++ GF_SELINUX_XATTR_KEY, ++ GF_XATTR_MDATA_KEY, ++ NULL}; ++ ++void ++gf_assert(void) + { + if (gf_signal_on_assert) { + raise(SIGCONT); + } +- + } + + void +@@ -5430,3 +5440,9 @@ gf_d_type_from_ia_type(ia_type_t type) + return DT_UNKNOWN; + } + } ++ ++char ** ++get_xattrs_to_heal() ++{ ++ return xattrs_to_heal; ++} +diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h +index bd48b6f..8439bb6 100644 +--- a/libglusterfs/src/glusterfs/common-utils.h ++++ b/libglusterfs/src/glusterfs/common-utils.h +@@ -183,6 +183,12 @@ enum _gf_xlator_ipc_targets { + typedef enum _gf_special_pid gf_special_pid_t; + typedef enum _gf_xlator_ipc_targets _gf_xlator_ipc_targets_t; + ++/* Array to hold custom xattr keys */ ++extern char *xattrs_to_heal[]; ++ ++char ** ++get_xattrs_to_heal(); ++ + /* The DHT file rename operation is not a straightforward rename. + * It involves creating linkto and linkfiles, and can unlink or rename the + * source file depending on the hashed and cached subvols for the source +diff --git a/tests/bugs/distribute/bug-1600379.t b/tests/bugs/distribute/bug-1600379.t +new file mode 100644 +index 0000000..8d2f615 +--- /dev/null ++++ b/tests/bugs/distribute/bug-1600379.t +@@ -0,0 +1,54 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++# Initialize ++#------------------------------------------------------------ ++cleanup; ++ ++# Start glusterd ++TEST glusterd; ++TEST pidof glusterd; ++TEST $CLI volume info; ++ ++# Create a volume ++TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2} ++ ++# Verify volume creation ++EXPECT "$V0" volinfo_field $V0 'Volume Name'; ++EXPECT 'Created' volinfo_field $V0 'Status'; ++ ++# Start volume and verify successful start ++TEST $CLI volume start $V0; ++EXPECT 'Started' volinfo_field $V0 'Status'; ++TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0; ++#------------------------------------------------------------ ++ ++# Test case - Remove xattr from killed brick on lookup ++#------------------------------------------------------------ ++# Create a dir and set custom xattr ++TEST mkdir $M0/testdir ++TEST setfattr -n user.attr -v val $M0/testdir ++xattr_val=`getfattr -d $B0/${V0}2/testdir | awk '{print $1}'`; ++TEST ${xattr_val}='user.attr="val"'; ++ ++# Kill 2nd brick process ++TEST kill_brick $V0 $H0 $B0/${V0}2 ++EXPECT_WITHIN ${PROCESS_UP_TIMEOUT} "1" online_brick_count ++ ++# Remove custom xattr ++TEST setfattr -x user.attr $M0/testdir ++ ++# Bring up the killed brick process ++TEST $CLI volume start $V0 force ++ ++# Perform lookup ++sleep 5 ++TEST ls $M0/testdir ++ ++# Check brick xattrs ++xattr_val_2=`getfattr -d $B0/${V0}2/testdir`; ++TEST [ ${xattr_val_2} = ''] ; ++ ++cleanup; +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index ce0fbbf..edfc6e7 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -127,15 +128,6 @@ dht_read_iatt_from_xdata(xlator_t *this, dict_t *xdata, struct iatt *stbuf) + int + dht_rmdir_unlock(call_frame_t *frame, xlator_t *this); + +-char *xattrs_to_heal[] = {"user.", +- POSIX_ACL_ACCESS_XATTR, +- POSIX_ACL_DEFAULT_XATTR, +- QUOTA_LIMIT_KEY, +- QUOTA_LIMIT_OBJECTS_KEY, +- GF_SELINUX_XATTR_KEY, +- 
GF_XATTR_MDATA_KEY, +- NULL}; +- + char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL}; + + /* Return true if key exists in array +@@ -143,6 +135,8 @@ char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL}; + static gf_boolean_t + dht_match_xattr(const char *key) + { ++ char **xattrs_to_heal = get_xattrs_to_heal(); ++ + return gf_get_index_by_elem(xattrs_to_heal, (char *)key) >= 0; + } + +@@ -5399,11 +5393,13 @@ dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + int call_cnt = 0; + dht_local_t *local = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; ++ char **xattrs_to_heal; + + conf = this->private; + local = frame->local; + call_cnt = conf->subvolume_cnt; + local->flags = flags; ++ xattrs_to_heal = get_xattrs_to_heal(); + + if (!gf_uuid_is_null(local->gfid)) { + gf_uuid_unparse(local->gfid, gfid_local); +diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h +index 132b3b3..b856c68 100644 +--- a/xlators/cluster/dht/src/dht-common.h ++++ b/xlators/cluster/dht/src/dht-common.h +@@ -54,10 +54,6 @@ + #define DHT_DBG_HASHED_SUBVOL_PATTERN "dht.file.hashed-subvol.*" + #define DHT_DBG_HASHED_SUBVOL_KEY "dht.file.hashed-subvol." + +-/* Array to hold custom xattr keys +- */ +-extern char *xattrs_to_heal[]; +- + /* Rebalance nodeuuid flags */ + #define REBAL_NODEUUID_MINE 0x01 + +diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c +index 4f7370d..4c3940a 100644 +--- a/xlators/cluster/dht/src/dht-helper.c ++++ b/xlators/cluster/dht/src/dht-helper.c +@@ -2289,6 +2289,7 @@ dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + int luret = -1; + int luflag = -1; + int i = 0; ++ char **xattrs_to_heal; + + if (!src || !dst) { + gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED, +@@ -2305,6 +2306,9 @@ dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + and set it to dst dict, here index start from 1 because + user xattr already checked in previous statement + */ ++ ++ xattrs_to_heal = get_xattrs_to_heal(); ++ + for (i = 1; xattrs_to_heal[i]; i++) { + keyval = dict_get(src, xattrs_to_heal[i]); + if (keyval) { +diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c +index f4e17d1..8af7301 100644 +--- a/xlators/cluster/dht/src/dht-selfheal.c ++++ b/xlators/cluster/dht/src/dht-selfheal.c +@@ -2315,6 +2315,15 @@ dht_dir_heal_xattrs(void *data) + if (subvol == mds_subvol) + continue; + if (uret || uflag) { ++ /* Custom xattr heal is required - let posix handle it */ ++ ret = dict_set_int8(xdata, "sync_backend_xattrs", _gf_true); ++ if (ret) { ++ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, ++ "path=%s", local->loc.path, "key=%s", ++ "sync_backend_xattrs", NULL); ++ goto out; ++ } ++ + ret = syncop_setxattr(subvol, &local->loc, user_xattr, 0, xdata, + NULL); + if (ret) { +@@ -2325,6 +2334,8 @@ dht_dir_heal_xattrs(void *data) + "user xattr on path %s on " + "subvol %s, gfid = %s ", + local->loc.path, subvol->name, gfid); ++ } else { ++ dict_del(xdata, "sync_backend_xattrs"); + } + } + } +diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c +index 16351d8..40a9ee4 100644 +--- a/xlators/storage/posix/src/posix-helpers.c ++++ b/xlators/storage/posix/src/posix-helpers.c +@@ -3656,3 +3656,22 @@ out: + + return is_stale; + } ++ ++/* Delete user xattr from the file at the file-path specified by data and from ++ * dict */ ++int 
++posix_delete_user_xattr(dict_t *dict, char *k, data_t *v, void *data) ++{ ++ int ret; ++ char *real_path = data; ++ ++ ret = sys_lremovexattr(real_path, k); ++ if (ret) { ++ gf_msg("posix-helpers", GF_LOG_ERROR, P_MSG_XATTR_NOT_REMOVED, errno, ++ "removexattr failed. key %s path %s", k, real_path); ++ } ++ ++ dict_del(dict, k); ++ ++ return ret; ++} +diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c +index 4c2983a..be22c5e 100644 +--- a/xlators/storage/posix/src/posix-inode-fd-ops.c ++++ b/xlators/storage/posix/src/posix-inode-fd-ops.c +@@ -62,6 +62,7 @@ + #include + #include "posix-gfid-path.h" + #include ++#include + + extern char *marker_xattrs[]; + #define ALIGN_SIZE 4096 +@@ -2733,6 +2734,7 @@ posix_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t ret = 0; + ssize_t acl_size = 0; + dict_t *xattr = NULL; ++ dict_t *subvol_xattrs = NULL; + posix_xattr_filler_t filler = { + 0, + }; +@@ -2748,6 +2750,10 @@ posix_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + struct mdata_iatt mdata_iatt = { + 0, + }; ++ int8_t sync_backend_xattrs = _gf_false; ++ data_pair_t *custom_xattrs; ++ data_t *keyval = NULL; ++ char **xattrs_to_heal = get_xattrs_to_heal(); + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID(frame->root->uid, frame->root->gid); +@@ -2930,6 +2936,66 @@ posix_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + goto out; + } + ++ ret = dict_get_int8(xdata, "sync_backend_xattrs", &sync_backend_xattrs); ++ if (ret) { ++ gf_msg_debug(this->name, -ret, "Unable to get sync_backend_xattrs"); ++ } ++ ++ if (sync_backend_xattrs) { ++ /* List all custom xattrs */ ++ subvol_xattrs = dict_new(); ++ if (!subvol_xattrs) ++ goto out; ++ ++ ret = dict_set_int32_sizen(xdata, "list-xattr", 1); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, ++ "Unable to set list-xattr in dict "); ++ goto out; ++ } ++ ++ subvol_xattrs = posix_xattr_fill(this, real_path, loc, NULL, -1, xdata, ++ NULL); ++ ++ /* Remove all user xattrs from the file */ ++ dict_foreach_fnmatch(subvol_xattrs, "user.*", posix_delete_user_xattr, ++ real_path); ++ ++ /* Remove all custom xattrs from the file */ ++ for (i = 1; xattrs_to_heal[i]; i++) { ++ keyval = dict_get(subvol_xattrs, xattrs_to_heal[i]); ++ if (keyval) { ++ ret = sys_lremovexattr(real_path, xattrs_to_heal[i]); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, P_MSG_XATTR_NOT_REMOVED, ++ errno, "removexattr failed. 
key %s path %s", ++ xattrs_to_heal[i], loc->path); ++ goto out; ++ } ++ ++ dict_del(subvol_xattrs, xattrs_to_heal[i]); ++ keyval = NULL; ++ } ++ } ++ ++ /* Set custom xattrs based on info provided by DHT */ ++ custom_xattrs = dict->members_list; ++ ++ while (custom_xattrs != NULL) { ++ ret = sys_lsetxattr(real_path, custom_xattrs->key, ++ custom_xattrs->value->data, ++ custom_xattrs->value->len, flags); ++ if (ret) { ++ op_errno = errno; ++ gf_log(this->name, GF_LOG_ERROR, "setxattr failed - %s %d", ++ custom_xattrs->key, ret); ++ goto out; ++ } ++ ++ custom_xattrs = custom_xattrs->next; ++ } ++ } ++ + xattr = dict_new(); + if (!xattr) + goto out; +@@ -3037,6 +3103,9 @@ out: + if (xattr) + dict_unref(xattr); + ++ if (subvol_xattrs) ++ dict_unref(subvol_xattrs); ++ + return 0; + } + +diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h +index 4be979c..b357d34 100644 +--- a/xlators/storage/posix/src/posix.h ++++ b/xlators/storage/posix/src/posix.h +@@ -686,4 +686,7 @@ posix_update_iatt_buf(struct iatt *buf, int fd, char *loc, dict_t *xdata); + gf_boolean_t + posix_is_layout_stale(dict_t *xdata, char *par_path, xlator_t *this); + ++int ++posix_delete_user_xattr(dict_t *dict, char *k, data_t *v, void *data); ++ + #endif /* _POSIX_H */ +-- +1.8.3.1 + diff --git a/SOURCES/0585-ganesha_ha-ganesha_grace-RA-fails-in-start-and-or-fa.patch b/SOURCES/0585-ganesha_ha-ganesha_grace-RA-fails-in-start-and-or-fa.patch new file mode 100644 index 0000000..e3fa401 --- /dev/null +++ b/SOURCES/0585-ganesha_ha-ganesha_grace-RA-fails-in-start-and-or-fa.patch @@ -0,0 +1,77 @@ +From ba399a083a56963bb7414535ede6eff6afcd1a0a Mon Sep 17 00:00:00 2001 +From: "Kaleb S. KEITHLEY" +Date: Mon, 14 Jun 2021 12:32:06 -0400 +Subject: [PATCH 585/585] ganesha_ha: ganesha_grace RA fails in start() and/or + fails in monitor () (#2523) + +shell [[ ]] string compare fails to match returned attr to the +pattern and subsequently returns status of "not running", resulting +in dependencies such as the IPaddr (cluster_ip) RA not starting + +Change-Id: I2c8d6f5c4cf0480672d52d8aa0d9226950441dc9 +commit 8ec66a43eedd505ec0b40f55c05f13a77fe8074e +PR: https://github.com/gluster/glusterfs/pull/2523 +issue: https://github.com/gluster/glusterfs/issues/2522 +BUG: 1945143 +Signed-off-by: Kaleb S. 
KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/247613 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/ganesha/ocf/ganesha_grace | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/extras/ganesha/ocf/ganesha_grace b/extras/ganesha/ocf/ganesha_grace +index edc6fa2..ca219af 100644 +--- a/extras/ganesha/ocf/ganesha_grace ++++ b/extras/ganesha/ocf/ganesha_grace +@@ -122,15 +122,18 @@ ganesha_grace_start() + + # case 1 + if [[ -z "${attr}" ]]; then ++ ocf_log debug "grace start: returning success case 1" + return ${OCF_SUCCESS} + fi + + # case 2 +- if [[ "${attr}" = *"value=1" ]]; then ++ if [[ "${attr}" = *"host=\"${host}\" value=\"1\"" ]]; then ++ ocf_log debug "grace start: returning success case 2" + return ${OCF_SUCCESS} + fi + + # case 3 ++ ocf_log info "grace start returning: not running case 3 (${attr})" + return ${OCF_NOT_RUNNING} + } + +@@ -162,7 +165,7 @@ ganesha_grace_monitor() + { + local host=$(ocf_local_nodename) + +- ocf_log debug "ganesha_grace monitor ${host}" ++ ocf_log debug "ganesha_grace_monitor ${host}" + + attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) + if [ $? -ne 0 ]; then +@@ -174,13 +177,16 @@ ganesha_grace_monitor() + # chance to create it. In which case we'll pretend + # everything is okay this time around + if [[ -z "${attr}" ]]; then ++ ocf_log debug "grace monitor: returning success case 1" + return ${OCF_SUCCESS} + fi + +- if [[ "${attr}" = *"value=1" ]]; then ++ if [[ "${attr}" = *"host=\"${host}\" value=\"1\"" ]]; then ++ ocf_log debug "grace monitor: returning success case 2" + return ${OCF_SUCCESS} + fi + ++ ocf_log info "grace monitor: returning not running case 3 (${attr})" + return ${OCF_NOT_RUNNING} + } + +-- +1.8.3.1 + diff --git a/SOURCES/0586-protocol-client-Do-not-reopen-fd-post-handshake-if-p.patch b/SOURCES/0586-protocol-client-Do-not-reopen-fd-post-handshake-if-p.patch new file mode 100644 index 0000000..62c574d --- /dev/null +++ b/SOURCES/0586-protocol-client-Do-not-reopen-fd-post-handshake-if-p.patch @@ -0,0 +1,298 @@ +From e431321f1348b5d51733a6b6c5e046fd8c6e28cc Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Mon, 5 Jul 2021 10:52:10 +0530 +Subject: [PATCH 586/586] protocol/client: Do not reopen fd post handshake if + posix lock is held + +Problem: +With client.strict-locks enabled, in some cases where the posix lock is +taken after a brick gets disconnected, the fd is getting reopened when +the brick gets reconnected to the client as part of client_post_handshake. +In such cases the saved fdctx's lock_list may not have the latest +information. + +Fix: +Check the lock information in the fdctx->lk_ctx as well post handshake +which will have the latest information on the locks. +Also check for this field in other places as well to prevent writes +happening with anonymous fd even without re-opening the fd on the +restarted brick. 
+ +> Upstream patch: https://github.com/gluster/glusterfs/pull/2582 +> Fixes: #2581 +> Change-Id: I7a0799e242ce188c6597dec0a65b4dae7dcd815b +> Signed-off-by: karthik-us ksubrahm@redhat.com + +BUG: 1689375 +Change-Id: I7a0799e242ce188c6597dec0a65b4dae7dcd815b +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/252588 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/replicate/do-not-reopen-fd.t | 76 ++++++++++++++++++-------- + xlators/protocol/client/src/client-handshake.c | 2 +- + xlators/protocol/client/src/client-helpers.c | 11 +++- + xlators/protocol/client/src/client.c | 2 +- + xlators/protocol/client/src/client.h | 3 + + 5 files changed, 67 insertions(+), 27 deletions(-) + +diff --git a/tests/bugs/replicate/do-not-reopen-fd.t b/tests/bugs/replicate/do-not-reopen-fd.t +index 13b5218..f346709 100644 +--- a/tests/bugs/replicate/do-not-reopen-fd.t ++++ b/tests/bugs/replicate/do-not-reopen-fd.t +@@ -20,10 +20,41 @@ TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 + TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M1 + + TEST touch $M0/a ++gfid_a=$(gf_get_gfid_xattr $B0/${V0}0/a) ++gfid_str_a=$(gf_gfid_xattr_to_str $gfid_a) ++ ++ ++# Open fd from a client, check for open fd on all the bricks. ++TEST fd1=`fd_available` ++TEST fd_open $fd1 'rw' $M0/a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++# Kill a brick and take lock on the fd ++TEST kill_brick $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++TEST flock -x $fd1 ++ ++# Restart the brick and check for no open fd on the restarted brick. ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a ++EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++ ++# Write on the fd. It should fail on the restarted brick. ++TEST fd_write $fd1 "data-0" ++EXPECT "" cat $B0/${V0}0/a ++EXPECT "data-0" cat $B0/${V0}1/a ++EXPECT "data-0" cat $B0/${V0}2/a ++ ++TEST fd_close $fd1 + + # Kill one brick and take lock on the fd and do a write. + TEST kill_brick $V0 $H0 $B0/${V0}0 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a + +@@ -34,7 +65,7 @@ TEST fd_write $fd1 "data-1" + # should still succeed as there were no quorum disconnects. 
+ TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd_write $fd1 "data-2" + EXPECT "" cat $B0/${V0}0/a + EXPECT "data-2" cat $B0/${V0}1/a +@@ -42,9 +73,6 @@ EXPECT "data-2" cat $B0/${V0}2/a + + # Check there is no fd opened on the 1st brick by checking for the gfid inside + # /proc/pid-of-brick/fd/ directory +-gfid_a=$(gf_get_gfid_xattr $B0/${V0}0/a) +-gfid_str_a=$(gf_gfid_xattr_to_str $gfid_a) +- + EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a +@@ -59,7 +87,7 @@ EXPECT "^2$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + # Kill 2nd brick and try writing to the file. The write should fail due to + # quorum failure. + TEST kill_brick $V0 $H0 $B0/${V0}1 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 1 + TEST ! fd_write $fd1 "data-3" + TEST ! fd_cat $fd1 + +@@ -67,7 +95,7 @@ TEST ! fd_cat $fd1 + # which were down previously, will return EBADFD now. + TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 1 + TEST ! fd_write $fd1 "data-4" + TEST ! fd_cat $fd1 + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +@@ -79,9 +107,9 @@ EXPECT "^2$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + EXPECT_WITHIN $HEAL_TIMEOUT "^2$" get_pending_heal_count $V0 + TEST $CLI volume heal $V0 enable + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2 + + TEST $CLI volume heal $V0 + EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 +@@ -103,7 +131,7 @@ TEST ! fd_write $fd1 "data-5" + # Kill the only brick that is having lock and try taking lock on another client + # which should succeed. + TEST kill_brick $V0 $H0 $B0/${V0}2 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 2 + TEST flock -x $fd2 + TEST fd_write $fd2 "data-6" + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a +@@ -114,17 +142,17 @@ EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a + # fail and operations on the 2nd fd should succeed. 
+ TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}2 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 2 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M1 $V0-replicate-0 2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M1 $V0-replicate-0 2 + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + TEST ! fd_write $fd1 "data-7" + + TEST ! fd_cat $fd1 + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +-EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a ++EXPECT "^0" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + TEST fd_cat $fd2 + + # Close both the fds which will release the locks and then re-open and take lock +@@ -159,9 +187,9 @@ EXPECT_WITHIN $REOPEN_TIMEOUT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0 + # Heal the volume + TEST $CLI volume heal $V0 enable + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2 + + TEST $CLI volume heal $V0 + EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 +@@ -169,7 +197,7 @@ TEST $CLI volume heal $V0 disable + + # Kill one brick and open a fd. + TEST kill_brick $V0 $H0 $B0/${V0}0 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a + +@@ -182,7 +210,7 @@ EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + # any of the bricks. + TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd_write $fd1 "data-10" + EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + +@@ -193,7 +221,7 @@ TEST fd_close $fd1 + + # Kill one brick, open and take lock on a fd. 
+ TEST kill_brick $V0 $H0 $B0/${V0}0 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" afr_child_up_status_meta $M0 $V0-replicate-0 0 + TEST fd1=`fd_available` + TEST fd_open $fd1 'rw' $M0/a + TEST flock -x $fd1 +@@ -204,7 +232,7 @@ EXPECT "^1$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}2 $gfid_str_a + + # Kill & restart another brick so that it will return EBADFD + TEST kill_brick $V0 $H0 $B0/${V0}1 +-EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" brick_up_status $V0 $H0 $B0/${V0}1 + + # Restart the bricks and then write. Now fd should not get re-opened since lock + # is still held on one brick and write should also fail as there is no quorum. +@@ -212,8 +240,8 @@ EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" brick_up_status $V0 $H0 $B0/${V0}1 + TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}0 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 +-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" afr_child_up_status_meta $M0 $V0-replicate-0 1 + TEST ! fd_write $fd1 "data-11" + EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}0 $gfid_str_a + EXPECT "^0$" gf_open_file_count_in_brick $V0 $H0 $B0/${V0}1 $gfid_str_a +diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c +index a12472b..20e03d8 100644 +--- a/xlators/protocol/client/src/client-handshake.c ++++ b/xlators/protocol/client/src/client-handshake.c +@@ -911,7 +911,7 @@ client_post_handshake(call_frame_t *frame, xlator_t *this) + list_for_each_entry_safe(fdctx, tmp, &conf->saved_fds, sfd_pos) + { + if (fdctx->remote_fd != -1 || +- (!list_empty(&fdctx->lock_list) && conf->strict_locks)) ++ (!fdctx_lock_lists_empty(fdctx) && conf->strict_locks)) + continue; + + fdctx->reopen_done = client_child_up_reopen_done; +diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c +index a80f303..b4a7294 100644 +--- a/xlators/protocol/client/src/client-helpers.c ++++ b/xlators/protocol/client/src/client-helpers.c +@@ -15,6 +15,15 @@ + #include + #include + ++gf_boolean_t ++fdctx_lock_lists_empty(clnt_fd_ctx_t *fdctx) ++{ ++ if (list_empty(&fdctx->lock_list) && fd_lk_ctx_empty(fdctx->lk_ctx)) ++ return _gf_true; ++ ++ return _gf_false; ++} ++ + int + client_fd_lk_list_empty(fd_lk_ctx_t *lk_ctx, gf_boolean_t try_lock) + { +@@ -441,7 +450,7 @@ client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd, + *remote_fd = fdctx->remote_fd; + } + +- locks_involved = !list_empty(&fdctx->lock_list); ++ locks_involved = !fdctx_lock_lists_empty(fdctx); + } + } + pthread_spin_unlock(&conf->fd_lock); +diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c +index 35a5340..6df2ed1 100644 +--- a/xlators/protocol/client/src/client.c ++++ b/xlators/protocol/client/src/client.c +@@ -881,7 +881,7 @@ client_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + pthread_spin_lock(&conf->fd_lock); + { + fdctx = this_fd_get_ctx(fd, this); +- if (fdctx && !list_empty(&fdctx->lock_list)) { ++ if (fdctx && 
!fdctx_lock_lists_empty(fdctx)) { + ret = -1; + op_errno = EBADFD; + } +diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h +index f952aea..799fe6e 100644 +--- a/xlators/protocol/client/src/client.h ++++ b/xlators/protocol/client/src/client.h +@@ -535,4 +535,7 @@ client_add_lock_for_recovery(fd_t *fd, struct gf_flock *flock, + int + client_is_setlk(int32_t cmd); + ++gf_boolean_t ++fdctx_lock_lists_empty(clnt_fd_ctx_t *fdctx); ++ + #endif /* !_CLIENT_H */ +-- +1.8.3.1 + diff --git a/SOURCES/0587-Update-rfc.sh-to-rhgs-3.5.6.patch b/SOURCES/0587-Update-rfc.sh-to-rhgs-3.5.6.patch new file mode 100644 index 0000000..420a4cf --- /dev/null +++ b/SOURCES/0587-Update-rfc.sh-to-rhgs-3.5.6.patch @@ -0,0 +1,26 @@ +From f72780b560ea8efe1508aa9ddc574e6dc066bf9a Mon Sep 17 00:00:00 2001 +From: Csaba Henk +Date: Wed, 29 Sep 2021 10:44:37 +0200 +Subject: [PATCH 587/610] Update rfc.sh to rhgs-3.5.6 + +Signed-off-by: Csaba Henk +--- + rfc.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/rfc.sh b/rfc.sh +index daeff32..67798cb 100755 +--- a/rfc.sh ++++ b/rfc.sh +@@ -18,7 +18,7 @@ done + shift $((OPTIND-1)) + + +-branch="rhgs-3.5.5"; ++branch="rhgs-3.5.6"; + + set_hooks_commit_msg() + { +-- +1.8.3.1 + diff --git a/SOURCES/0588-locks-Fix-null-gfid-in-lock-contention-notifications.patch b/SOURCES/0588-locks-Fix-null-gfid-in-lock-contention-notifications.patch new file mode 100644 index 0000000..1e6c488 --- /dev/null +++ b/SOURCES/0588-locks-Fix-null-gfid-in-lock-contention-notifications.patch @@ -0,0 +1,388 @@ +From e3813685237dbdf8dc7cf28726fff2caf2288706 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Mon, 19 Jul 2021 15:37:02 +0200 +Subject: [PATCH 588/610] locks: Fix null gfid in lock contention notifications + +This patch fixes 3 problems: + +First problem: + +After commit c0bd592e, the pl_inode_t object was also created in the +cbk of lookup requests. Lookup requests are a bit different than any +other request because the inode received may not be completely +initialized. In particular, inode->gfid may be null. + +This caused that the gfid stored in the pl_inode_t object was null in +some cases. This gfid is used mostly for logs, but also to send lock +contention notifications. This meant that some notifications could be +sent with a null gfid, making impossible for the client xlator to +correctly identify the contending inode, so the lock was not released +immediately when eager-lock was also enabled. + +Second problem: + +The feature introduced by c0bd592e needed to track the number of +hardlinks of each inode to detect when it was deleted. However it +was done using the 'get-link-count' special xattr on lookup, while +posix only implements it for unlink and rename. + +Also, the number of hardlinks was not incremented for mkdir, mknod, +rename, ..., so it didn't work correctly for directories. + +Third problem: + +When the last hardlink of an open file is deleted, all locks will be +denied with ESTALE error, but that's not correct. Access to the open +fd must succeed. + +The first problem is fixed by avoiding creating pl_inode_t objects +during lookup. Second and third problems are fixed by completely +ignoring if the file has been deleted or not. Even if we grant a +lock on a non-existing file, the next operation done by the client +inside the lock will return the correct error, which should be enough. 
+ +Upstream patch: +> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2553 +> Fixes: #2551 +> Change-Id: Ic73e82f6b725b838c1600b6a128ea36a75f13253 +> Signed-off-by: Xavi Hernandez + +BUG: 1962972 +Change-Id: Ic73e82f6b725b838c1600b6a128ea36a75f13253 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279192 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/locks/issue-2551.t | 58 ++++++++++++++++++ + xlators/features/locks/src/common.c | 31 +++------- + xlators/features/locks/src/locks.h | 2 - + xlators/features/locks/src/posix.c | 118 +++--------------------------------- + 4 files changed, 74 insertions(+), 135 deletions(-) + create mode 100644 tests/bugs/locks/issue-2551.t + +diff --git a/tests/bugs/locks/issue-2551.t b/tests/bugs/locks/issue-2551.t +new file mode 100644 +index 0000000..a32af02 +--- /dev/null ++++ b/tests/bugs/locks/issue-2551.t +@@ -0,0 +1,58 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++function check_time() { ++ local max="${1}" ++ local start="$(date +"%s")" ++ ++ shift ++ ++ if "${@}"; then ++ if [[ $(($(date +"%s") - ${start})) -lt ${max} ]]; then ++ return 0 ++ fi ++ fi ++ ++ return 1 ++} ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/brick{0..2} ++TEST $CLI volume set $V0 disperse.eager-lock on ++TEST $CLI volume set $V0 disperse.eager-lock-timeout 30 ++TEST $CLI volume set $V0 features.locks-notify-contention on ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.open-behind off ++TEST $CLI volume set $V0 performance.quick-read off ++ ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick2 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0 ++ ++TEST mkdir $M0/dir ++TEST dd if=/dev/zero of=$M0/dir/test bs=4k count=1 ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick2 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M1 ++ ++TEST dd if=/dev/zero of=$M0/dir/test bs=4k count=1 conv=notrunc ++TEST check_time 5 dd if=/dev/zero of=$M1/dir/test bs=4k count=1 conv=notrunc +diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c +index cddbfa6..5403086 100644 +--- a/xlators/features/locks/src/common.c ++++ b/xlators/features/locks/src/common.c +@@ -468,9 +468,7 @@ pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local) + pl_inode->check_mlock_info = _gf_true; + pl_inode->mlock_enforced = _gf_false; + +- /* -2 means never looked up. -1 means something went wrong and link +- * tracking is disabled. 
*/ +- pl_inode->links = -2; ++ pl_inode->remove_running = 0; + + ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode)); + if (ret) { +@@ -1403,11 +1401,6 @@ pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, + + pthread_mutex_lock(&pl_inode->mutex); + +- if (pl_inode->removed) { +- error = ESTALE; +- goto unlock; +- } +- + if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) { + error = -1; + /* We skip the unlock here because the caller must create a stub when +@@ -1420,7 +1413,6 @@ pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, + pl_inode->is_locked = _gf_true; + pl_inode->remove_running++; + +-unlock: + pthread_mutex_unlock(&pl_inode->mutex); + + done: +@@ -1490,20 +1482,18 @@ pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error) + + pthread_mutex_lock(&pl_inode->mutex); + +- if (error == 0) { +- if (pl_inode->links >= 0) { +- pl_inode->links--; +- } +- if (pl_inode->links == 0) { +- pl_inode->removed = _gf_true; +- } +- } +- + pl_inode->remove_running--; + + if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) { + pl_inode->is_locked = _gf_false; + ++ /* At this point it's possible that the inode has been deleted, but ++ * there could be open fd's still referencing it, so we can't prevent ++ * pending locks from being granted. If the file has really been ++ * deleted, whatever the client does once the lock is granted will ++ * fail with the appropriate error, so we don't need to worry about ++ * it here. */ ++ + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) + { + __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now, +@@ -1555,11 +1545,6 @@ pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock) + pl_dom_list_t *dom; + pl_inode_lock_t *ilock; + +- /* If the inode has been deleted, we won't allow any lock. */ +- if (pl_inode->removed) { +- return -ESTALE; +- } +- + /* We only synchronize with locks made for regular operations coming from + * the user. Locks done for internal purposes are hard to control and could + * lead to long delays or deadlocks quite easily. */ +diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h +index 6666feb..2406dcd 100644 +--- a/xlators/features/locks/src/locks.h ++++ b/xlators/features/locks/src/locks.h +@@ -202,10 +202,8 @@ struct __pl_inode { + int fop_wind_count; + pthread_cond_t check_fop_wind_count; + +- int32_t links; /* Number of hard links the inode has. */ + uint32_t remove_running; /* Number of remove operations running. */ + gf_boolean_t is_locked; /* Regular locks will be blocked. */ +- gf_boolean_t removed; /* The inode has been deleted. 
*/ + }; + typedef struct __pl_inode pl_inode_t; + +diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c +index 22ef5b8..d5effef 100644 +--- a/xlators/features/locks/src/posix.c ++++ b/xlators/features/locks/src/posix.c +@@ -2975,104 +2975,24 @@ out: + return ret; + } + +-static int32_t +-pl_request_link_count(dict_t **pxdata) +-{ +- dict_t *xdata; +- +- xdata = *pxdata; +- if (xdata == NULL) { +- xdata = dict_new(); +- if (xdata == NULL) { +- return ENOMEM; +- } +- } else { +- dict_ref(xdata); +- } +- +- if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) { +- dict_unref(xdata); +- return ENOMEM; +- } +- +- *pxdata = xdata; +- +- return 0; +-} +- +-static int32_t +-pl_check_link_count(dict_t *xdata) +-{ +- int32_t count; +- +- /* In case we are unable to read the link count from xdata, we take a +- * conservative approach and return -2, which will prevent the inode from +- * being considered deleted. In fact it will cause link tracking for this +- * inode to be disabled completely to avoid races. */ +- +- if (xdata == NULL) { +- return -2; +- } +- +- if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) { +- return -2; +- } +- +- return count; +-} +- + int32_t + pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) + { +- pl_inode_t *pl_inode; +- +- if (op_ret >= 0) { +- pl_inode = pl_inode_get(this, inode, NULL); +- if (pl_inode == NULL) { +- PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL, +- NULL); +- return 0; +- } +- +- pthread_mutex_lock(&pl_inode->mutex); +- +- /* We only update the link count if we previously didn't know it. +- * Doing it always can lead to races since lookup is not executed +- * atomically most of the times. */ +- if (pl_inode->links == -2) { +- pl_inode->links = pl_check_link_count(xdata); +- if (buf->ia_type == IA_IFDIR) { +- /* Directories have at least 2 links. To avoid special handling +- * for directories, we simply decrement the value here to make +- * them equivalent to regular files. 
*/ +- pl_inode->links--; +- } +- } +- +- pthread_mutex_unlock(&pl_inode->mutex); +- } +- + PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata, + postparent); ++ + return 0; + } + + int32_t + pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) + { +- int32_t error; ++ PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); ++ STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, xdata); + +- error = pl_request_link_count(&xdata); +- if (error == 0) { +- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); +- STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, loc, xdata); +- dict_unref(xdata); +- } else { +- STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL); +- } + return 0; + } + +@@ -3881,9 +3801,7 @@ unlock: + __dump_posixlks(pl_inode); + } + +- gf_proc_dump_write("links", "%d", pl_inode->links); + gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running); +- gf_proc_dump_write("removed", "%u", pl_inode->removed); + } + pthread_mutex_unlock(&pl_inode->mutex); + +@@ -4508,21 +4426,9 @@ pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) + { +- pl_inode_t *pl_inode = (pl_inode_t *)cookie; +- +- if (op_ret >= 0) { +- pthread_mutex_lock(&pl_inode->mutex); +- +- /* TODO: can happen pl_inode->links == 0 ? */ +- if (pl_inode->links >= 0) { +- pl_inode->links++; +- } +- +- pthread_mutex_unlock(&pl_inode->mutex); +- } +- + PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); ++ + return 0; + } + +@@ -4530,18 +4436,10 @@ int + pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) + { +- pl_inode_t *pl_inode; +- +- pl_inode = pl_inode_get(this, oldloc->inode, NULL); +- if (pl_inode == NULL) { +- STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, +- NULL); +- return 0; +- } +- + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc); +- STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); ++ STACK_WIND(frame, pl_link_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); ++ + return 0; + } + +-- +1.8.3.1 + diff --git a/SOURCES/0589-extras-fix-for-postscript-failure-on-logrotation-of-.patch b/SOURCES/0589-extras-fix-for-postscript-failure-on-logrotation-of-.patch new file mode 100644 index 0000000..861791f --- /dev/null +++ b/SOURCES/0589-extras-fix-for-postscript-failure-on-logrotation-of-.patch @@ -0,0 +1,63 @@ +From 0bb71e1492b1ad442758399eb8dcb5f087d77f12 Mon Sep 17 00:00:00 2001 +From: Nikhil Ladha +Date: Wed, 28 Apr 2021 02:14:27 +0530 +Subject: [PATCH 589/610] extras: fix for postscript failure on logrotation of + snapd logs (#2310) + +Issue: +On executing the logrotate command, the postscript runs as a separate process, +and when we do a grep for the snapd process it returns the PID of that +short-term process as well, and executing a kill on that throws the error. +To check a similar error could be seen if we replace the killall for bricks +log rotation with a for loop on PIDs. + +Fix: +Use the killall command on the list of snapd processes instead of +using the kill command to individually kill them. 
+ +>Fixes: #2360 +>Change-Id: I1ad6e3e4d74128706e71900d02e715635294ff72 +>Signed-off-by: nik-redhat + +Upstream patch: https://github.com/gluster/glusterfs/pull/2310 +BUG: 1668303 + +Change-Id: I59910fc3660e11e131b1aa813848c2e19cbffefd +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279533 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/glusterfs-logrotate | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/extras/glusterfs-logrotate b/extras/glusterfs-logrotate +index 75f700e..2b9028b 100644 +--- a/extras/glusterfs-logrotate ++++ b/extras/glusterfs-logrotate +@@ -45,3 +45,22 @@ + compress + delaycompress + } ++ ++# Rotate snapd log ++/var/log/glusterfs/snaps/*/*.log { ++ sharedscripts ++ weekly ++ maxsize 10M ++ minsize 100k ++ ++ # 6 months of logs are good enough ++ rotate 26 ++ ++ missingok ++ compress ++ delaycompress ++ notifempty ++ postrotate ++ /usr/bin/killall -HUP `pgrep -f "glusterfs.*snapd"` > /dev/null 2>&1 || true ++ endscript ++} +-- +1.8.3.1 + diff --git a/SOURCES/0590-cluster-afr-Don-t-check-for-stale-entry-index.patch b/SOURCES/0590-cluster-afr-Don-t-check-for-stale-entry-index.patch new file mode 100644 index 0000000..c7ff40a --- /dev/null +++ b/SOURCES/0590-cluster-afr-Don-t-check-for-stale-entry-index.patch @@ -0,0 +1,128 @@ +From 87138f86b8cb98d1c9d1a4c9a2393e7978d20b1d Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Tue, 5 Oct 2021 12:33:01 +0530 +Subject: [PATCH 590/610] cluster/afr: Don't check for stale entry-index + +Problem: +In every entry index heal there is a check to see if the +index is stale or not. + 1. If a file is created when the brick is down this +will lead to an extra index lookup because the name is not stale. + 2. If a file is deleted when the brick is down this will also lead to + and extra index lookup because the name is not stale. + 3. If a file is created and deleted when the brick is down then the + index is stale and this will save entry-heal i.e. 2 entrylks and 2 lookups + +Since 1, 2 happen significantly more than 3, this is a bad tradeoff. + +Fix: +Let stale index be removed as part of normal entry heal detecting 'the +name is already deleted' code path. + +> Upstream patch: https://github.com/gluster/glusterfs/pull/2612 +> fixes: gluster#2611 +> Change-Id: I29bcc07f2480877a83b30dbd7e2e5631a74df8e8 +> Signed-off-by: Pranith Kumar K + +BUG: 1994593 +Change-Id: I29bcc07f2480877a83b30dbd7e2e5631a74df8e8 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279606 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-self-heal-entry.c | 46 +++++++-------------------- + 1 file changed, 11 insertions(+), 35 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c +index a17dd93..14b7417 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-entry.c ++++ b/xlators/cluster/afr/src/afr-self-heal-entry.c +@@ -933,37 +933,8 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, + loc_t *parent, void *data) + { + int ret = 0; +- loc_t loc = { +- 0, +- }; +- struct iatt iatt = { +- 0, +- }; + afr_granular_esh_args_t *args = data; + +- /* Look up the actual inode associated with entry. If the lookup returns +- * ESTALE or ENOENT, then it means we have a stale index. Remove it. 
+- * This is analogous to the check in afr_shd_index_heal() except that +- * here it is achieved through LOOKUP and in afr_shd_index_heal() through +- * a GETXATTR. +- */ +- +- loc.inode = inode_new(args->xl->itable); +- loc.parent = inode_ref(args->heal_fd->inode); +- gf_uuid_copy(loc.pargfid, loc.parent->gfid); +- loc.name = entry->d_name; +- +- ret = syncop_lookup(args->xl, &loc, &iatt, NULL, NULL, NULL); +- if ((ret == -ENOENT) || (ret == -ESTALE)) { +- /* The name indices under the pgfid index dir are guaranteed +- * to be regular files. Hence the hardcoding. +- */ +- afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG); +- ret = 0; +- goto out; +- } +- /* TBD: afr_shd_zero_xattrop? */ +- + ret = afr_selfheal_entry_dirent(args->frame, args->xl, args->heal_fd, + entry->d_name, parent->inode, subvol, + _gf_false); +@@ -974,8 +945,6 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, + if (ret == -1) + args->mismatch = _gf_true; + +-out: +- loc_wipe(&loc); + return ret; + } + +@@ -1050,7 +1019,9 @@ afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + local = frame->local; + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, +- "performing entry selfheal on %s", uuid_utoa(fd->inode->gfid)); ++ "performing %s entry selfheal on %s", ++ (local->need_full_crawl ? "full" : "granular"), ++ uuid_utoa(fd->inode->gfid)); + + for (i = 0; i < priv->child_count; i++) { + /* Expunge */ +@@ -1112,6 +1083,7 @@ __afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd, + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t did_sh = _gf_true; ++ char *heal_type = "granular entry"; + + priv = this->private; + local = frame->local; +@@ -1194,11 +1166,15 @@ postop_unlock: + afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, + postop_lock, NULL); + out: +- if (did_sh) +- afr_log_selfheal(fd->inode->gfid, this, ret, "entry", source, sources, ++ if (did_sh) { ++ if (local->need_full_crawl) { ++ heal_type = "full entry"; ++ } ++ afr_log_selfheal(fd->inode->gfid, this, ret, heal_type, source, sources, + healed_sinks); +- else ++ } else { + ret = 1; ++ } + + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); +-- +1.8.3.1 + diff --git a/SOURCES/0591-afr-check-for-valid-iatt.patch b/SOURCES/0591-afr-check-for-valid-iatt.patch new file mode 100644 index 0000000..8f1e48e --- /dev/null +++ b/SOURCES/0591-afr-check-for-valid-iatt.patch @@ -0,0 +1,44 @@ +From 19460ebc988795eeabaeb8e25d6eba9a3cf2864b Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Mon, 4 Oct 2021 12:44:21 +0530 +Subject: [PATCH 591/610] afr: check for valid iatt + +Problem: +If the entry being processed by afr_shd_anon_inode_cleaner() is no +longer present, gfid lookup fails with ENOENT on all bricks and iatt +will never be assigned, causing a crash due to null dereference. + +Fix: +Add a null-check for iatt. 
+ +> Upstream patch: https://github.com/gluster/glusterfs/pull/2660 +> Fixes: gluster#2659 +> Change-Id: I6abfc8063677861ce9388ca4efdf491ec956dc74 +> Signed-off-by: Ravishankar N + +BUG: 1995029 +Change-Id: I6abfc8063677861ce9388ca4efdf491ec956dc74 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279529 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-self-heald.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c +index 18aed93..bc720cf 100644 +--- a/xlators/cluster/afr/src/afr-self-heald.c ++++ b/xlators/cluster/afr/src/afr-self-heald.c +@@ -870,7 +870,7 @@ afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + } + + /*Inode is deleted from subvol*/ +- if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) { ++ if (count == 1 || (iatt && iatt->ia_type != IA_IFDIR && multiple_links)) { + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type, + priv->anon_inode_name, entry->d_name, subvol->name); +-- +1.8.3.1 + diff --git a/SOURCES/0592-md-cache-fix-integer-signedness-mismatch.patch b/SOURCES/0592-md-cache-fix-integer-signedness-mismatch.patch new file mode 100644 index 0000000..94cfe88 --- /dev/null +++ b/SOURCES/0592-md-cache-fix-integer-signedness-mismatch.patch @@ -0,0 +1,119 @@ +From be3448ed5d9d59752cff4df8325ee67eb7d41531 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Mon, 19 Jul 2021 06:56:18 +0200 +Subject: [PATCH 592/610] md-cache: fix integer signedness mismatch + +md-cache uses a mechanism based on a generation number to detect +modifications made by other clients to the entries and invalidate +the cached data. + +This generation number is a 32 bit integer. When it overflows, +special management is done to avoid problems. This overflow condition +is tracked with a single bit. + +For many fops, when they are received, the overflow bit and the +current generation number are recorded in a single 64-bit value +which is used later in the cbk. + +This is the problematic function: + + uint64_t + __mdc_get_generation(xlator_t *this, struct md_cache *mdc) + { + uint64_t gen = 0, rollover; + struct mdc_conf *conf = NULL; + + conf = this->private; + + gen = GF_ATOMIC_INC(conf->generation); + if (gen == 0) { + gf_log("MDC", GF_LOG_NOTICE, "%p Reset 1", mdc); + mdc->gen_rollover = !mdc->gen_rollover; + gen = GF_ATOMIC_INC(conf->generation); + mdc->ia_time = 0; + mdc->generation = 0; + mdc->invalidation_time = gen - 1; + } + + rollover = mdc->gen_rollover; + gen |= (rollover << 32); + return gen; + } + +'conf->generation' is declared as an atomic signed 32-bit integer, +and 'gen' is an unsigned 64-bit value. When 'gen' is assigned from +a signed int, the sign bit is extended to fill the high 32 bits of +'gen'. If the counter has overflown the maximum signed positive +value, it will become negative (sign bit = 1). + +In this case, when 'rollover' is later combined with 'gen', all the +high bits remain at '1'. + +This value is used later in 'mdc_inode_iatt_set_validate' during +callback processing. The overflow condition and generation numbers +from when the operation was received are recovered this way: + + rollover = incident_time >> 32; + incident_time = (incident_time & 0xffffffff); + +('incident_time' is the saved value from '__mdc_get_generation'). 
+ +So here rollover will be 0xffffffff, when it's expected to be 0 +or 1 only. When this is compared later with the cached overflow +bit, it doesn't match, which prevents updating the cached info. + +This is bad in general, but it's even worse when an entry is not +cached and 'rollover' is 0xffffffff the first time. When md-cache +doesn't have cached data it assumes it's everything 0. This causes +a mismatch, which sends an invalidation request to the kernel, but +since the 'rollover' doesn't match, the cached data is not updated. +So the next time the cached data is checked, it will also send an +invalidation to the kernel, indefinitely. + +This patch fixes two things: + +1. The 'generation' field is made unsigned to avoid sign extension. +2. Invalidation requests are only sent if we already had valid cached + data. Otherwise it doesn't make sense to send an invalidation. + +Upstream patch: +> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2619 +> Fixes: #2617 +> Change-Id: Ie40e68288cf143e1bc1a40f46da98f51bb2d6864 +> Signed-off-by: Xavi Hernandez + +BUG: 1904137 +Change-Id: Ie40e68288cf143e1bc1a40f46da98f51bb2d6864 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279188 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/performance/md-cache/src/md-cache.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c +index bbbee3b..e0256d6 100644 +--- a/xlators/performance/md-cache/src/md-cache.c ++++ b/xlators/performance/md-cache/src/md-cache.c +@@ -79,7 +79,7 @@ struct mdc_conf { + gf_boolean_t cache_statfs; + struct mdc_statfs_cache statfs_cache; + char *mdc_xattr_str; +- gf_atomic_int32_t generation; ++ gf_atomic_uint32_t generation; + }; + + struct mdc_local; +@@ -537,7 +537,7 @@ mdc_inode_iatt_set_validate(xlator_t *this, inode_t *inode, struct iatt *prebuf, + (iatt->ia_mtime_nsec != mdc->md_mtime_nsec) || + (iatt->ia_ctime != mdc->md_ctime) || + (iatt->ia_ctime_nsec != mdc->md_ctime_nsec)) { +- if (conf->global_invalidation && ++ if (conf->global_invalidation && mdc->valid && + (!prebuf || (prebuf->ia_mtime != mdc->md_mtime) || + (prebuf->ia_mtime_nsec != mdc->md_mtime_nsec) || + (prebuf->ia_ctime != mdc->md_ctime) || +-- +1.8.3.1 + diff --git a/SOURCES/0593-dht-explicit-null-dereference.patch b/SOURCES/0593-dht-explicit-null-dereference.patch new file mode 100644 index 0000000..4ad9eea --- /dev/null +++ b/SOURCES/0593-dht-explicit-null-dereference.patch @@ -0,0 +1,58 @@ +From 76c9faf5c750428e5eb69462b82ee0c12cbdabc0 Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Fri, 25 Sep 2020 18:39:51 +0530 +Subject: [PATCH 593/610] dht: explicit null dereference + +Added a null check for uuid_list_copy, to avoid +null dereference in strtok_r() in case of strdup() +failure. 
+ +CID: 1325612 +CID: 1274223 + +>Updates: #1060 + +>Change-Id: I641a5068cd76d7b2ed92eccf39e7f97d6f7b2480 +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/25046 +BUG: 1997447 + +Change-Id: I576b4ce610948bdb84eb30377a684c54df718bdc +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280063 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-common.c | 2 ++ + xlators/cluster/dht/src/dht-shared.c | 2 ++ + 2 files changed, 4 insertions(+) + +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index edfc6e7..e6a16ff 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -4296,6 +4296,8 @@ dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + index = conf->local_subvols_cnt; + + uuid_list_copy = gf_strdup(uuid_list); ++ if (!uuid_list_copy) ++ goto unlock; + + for (uuid_str = strtok_r(uuid_list, " ", &saveptr); uuid_str; + uuid_str = next_uuid_str) { +diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c +index 58e3339..cca272a 100644 +--- a/xlators/cluster/dht/src/dht-shared.c ++++ b/xlators/cluster/dht/src/dht-shared.c +@@ -567,6 +567,8 @@ gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag, + pattern_str = strtok_r(data, ",", &tmp_str); + while (pattern_str) { + dup_str = gf_strdup(pattern_str); ++ if (!dup_str) ++ goto out; + pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1); + if (!pattern_list) { + goto out; +-- +1.8.3.1 + diff --git a/SOURCES/0594-glusterd-resource-leaks.patch b/SOURCES/0594-glusterd-resource-leaks.patch new file mode 100644 index 0000000..ccc2f3b --- /dev/null +++ b/SOURCES/0594-glusterd-resource-leaks.patch @@ -0,0 +1,52 @@ +From 663df92f9b4b9f35ae10f84487494829987e2f58 Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Fri, 25 Sep 2020 17:56:19 +0530 +Subject: [PATCH 594/610] glusterd: resource leaks + +Issue: +iobref was not freed before exiting the function. + +Fix: +Modified the code to free iobref before exiting. 
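+
+The shape of the fix, reduced to a self-contained sketch (placeholder logic,
+not the actual send_attach_req() code):
+
+    #include <stdlib.h>
+
+    int send_request(void)
+    {
+        int ret = -1;
+        char *iob = malloc(64);      /* stands in for the iobuf/iobref refs */
+        if (!iob)
+            goto err;
+
+        ret = 0;                     /* the submit call would happen here; the
+                                        old code returned 'ret' right away and
+                                        skipped the cleanup below */
+        free(iob);                   /* cleanup now runs on every path */
+    err:
+        return ret;                  /* real status instead of a hard-coded -1 */
+    }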
+ +CID: 1430107 +>Updates: #1060 + +>Change-Id: I89351b3aa645792eb8dda6292d1e559057b02d8b +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/25042 +BUG: 1997447 + +Change-Id: Iea56afca015a7c0f15ab32f490ea27f5ea323a07 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280066 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-utils.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 6d40be5..c037933 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -6042,7 +6042,6 @@ send_attach_req(xlator_t *this, struct rpc_clnt *rpc, char *path, + GF_ATOMIC_INC(conf->blockers); + ret = rpc_clnt_submit(rpc, &gd_brick_prog, op, cbkfn, &iov, 1, NULL, 0, + iobref, frame, NULL, 0, NULL, 0, NULL); +- return ret; + + free_iobref: + iobref_unref(iobref); +@@ -6051,7 +6050,7 @@ maybe_free_iobuf: + iobuf_unref(iobuf); + } + err: +- return -1; ++ return ret; + } + + extern size_t +-- +1.8.3.1 + diff --git a/SOURCES/0595-glusterd-use-after-free-coverity-issue.patch b/SOURCES/0595-glusterd-use-after-free-coverity-issue.patch new file mode 100644 index 0000000..7430838 --- /dev/null +++ b/SOURCES/0595-glusterd-use-after-free-coverity-issue.patch @@ -0,0 +1,51 @@ +From 025718f1734655c411475ea338cee1659d96763e Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Thu, 3 Sep 2020 15:42:45 +0530 +Subject: [PATCH 595/610] glusterd: use after free (coverity issue) + +Issue: +dict_unref is called on the same dict again, +in the out label of the code, which causes the +use after free issue. + +Fix: +Set the dict to NULL after unref, to avoid +use after free issue. + +CID: 1430127 + +>Updates: #1060 + +>Change-Id: Ide9a5cbc5f496705c671e72b0260da6d4c06f16d +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/24946 +BUG: 1997447 + +Change-Id: Id1e58cd6226b9329ad49bd5b75ee96a3a5ec5ab7 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280067 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c +index 386eed2..b0fa490 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c +@@ -2039,8 +2039,9 @@ glusterd_update_snaps_synctask(void *opaque) + "Failed to remove snap %s", snap->snapname); + goto out; + } +- if (dict) +- dict_unref(dict); ++ ++ dict_unref(dict); ++ dict = NULL; + } + snprintf(buf, sizeof(buf), "%s.accept_peer_data", prefix); + ret = dict_get_int32(peer_data, buf, &val); +-- +1.8.3.1 + diff --git a/SOURCES/0596-locks-null-dereference.patch b/SOURCES/0596-locks-null-dereference.patch new file mode 100644 index 0000000..4ad016f --- /dev/null +++ b/SOURCES/0596-locks-null-dereference.patch @@ -0,0 +1,43 @@ +From 099fcac6fecef6fc367d8fcae8442195f3f174db Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Fri, 25 Sep 2020 18:19:39 +0530 +Subject: [PATCH 596/610] locks: null dereference + +Added a null check before executing the strtok_r() +to avoid null dereference in case of strdup() failure. 
+ +CID: 1407938 +>Updates: #1060 + +>Change-Id: Iec6e72ae8cb54f6d0a287615c43756325b2026ec +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/25045 +BUG: 1997447 + +Change-Id: I47e6e2402badaf4103607b4164f19142a99a2f71 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280065 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/locks/src/posix.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c +index d5effef..03c4907 100644 +--- a/xlators/features/locks/src/posix.c ++++ b/xlators/features/locks/src/posix.c +@@ -494,6 +494,9 @@ pl_inodelk_xattr_fill_multiple(dict_t *this, char *key, data_t *value, + char *save_ptr = NULL; + + tmp_key = gf_strdup(key); ++ if (!tmp_key) ++ return -1; ++ + strtok_r(tmp_key, ":", &save_ptr); + if (!*save_ptr) { + gf_msg(THIS->name, GF_LOG_ERROR, 0, EINVAL, +-- +1.8.3.1 + diff --git a/SOURCES/0597-glusterd-memory-deallocated-twice.patch b/SOURCES/0597-glusterd-memory-deallocated-twice.patch new file mode 100644 index 0000000..7e2c49f --- /dev/null +++ b/SOURCES/0597-glusterd-memory-deallocated-twice.patch @@ -0,0 +1,163 @@ +From 59c05230c0df58765e30553c66bbcc0c9965d362 Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Tue, 11 Aug 2020 23:12:26 +0530 +Subject: [PATCH 597/610] glusterd: memory deallocated twice + +Issue: +If the the pointer tmptier is destroyed in the function +code it still it checks for the same in the out label. +And tries to destroy the same pointer again. + +Fix: +So, instead of passing the ptr by value, if we +pass it by reference then, on making the ptr in the +function the value will persist, in the calling +function and next time when the gf_store_iter_destory() +is called it won't try to free the ptr again. 
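+
+The same idea in isolation, with libc free() standing in for the
+store-iterator teardown:
+
+    #include <stdlib.h>
+
+    /* Passing &ptr lets the callee clear the caller's copy, so a second
+     * destroy call becomes a harmless no-op instead of a double free. */
+    static void destroy(char **ptr)
+    {
+        if (!*ptr)
+            return;
+        free(*ptr);
+        *ptr = NULL;
+    }
+
+    int main(void)
+    {
+        char *buf = malloc(16);
+        destroy(&buf);    /* frees and clears buf */
+        destroy(&buf);    /* safe: *ptr is already NULL */
+        return 0;
+    }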
+ +CID: 1430122 + +>Updates: #1060 + +>Change-Id: I019cea8e301c7cc87be792c03b58722fc96f04ef +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/24855 +BUG: 1997447 + +Change-Id: Ib403efd08d47a69d25f291ae61c9cbfcaaa05da8 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280076 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs/store.h | 2 +- + libglusterfs/src/store.c | 12 +++++++----- + xlators/mgmt/glusterd/src/glusterd-store.c | 16 ++++++++-------- + 3 files changed, 16 insertions(+), 14 deletions(-) + +diff --git a/libglusterfs/src/glusterfs/store.h b/libglusterfs/src/glusterfs/store.h +index 68a20ad..76af2df 100644 +--- a/libglusterfs/src/glusterfs/store.h ++++ b/libglusterfs/src/glusterfs/store.h +@@ -93,7 +93,7 @@ int32_t + gf_store_iter_get_matching(gf_store_iter_t *iter, char *key, char **value); + + int32_t +-gf_store_iter_destroy(gf_store_iter_t *iter); ++gf_store_iter_destroy(gf_store_iter_t **iter); + + char * + gf_store_strerror(gf_store_op_errno_t op_errno); +diff --git a/libglusterfs/src/store.c b/libglusterfs/src/store.c +index 3af627a..e4931bf 100644 +--- a/libglusterfs/src/store.c ++++ b/libglusterfs/src/store.c +@@ -606,23 +606,25 @@ out: + } + + int32_t +-gf_store_iter_destroy(gf_store_iter_t *iter) ++gf_store_iter_destroy(gf_store_iter_t **iter) + { + int32_t ret = -1; + +- if (!iter) ++ if (!(*iter)) + return 0; + + /* gf_store_iter_new will not return a valid iter object with iter->file + * being NULL*/ +- ret = fclose(iter->file); ++ ret = fclose((*iter)->file); + if (ret) + gf_msg("", GF_LOG_ERROR, errno, LG_MSG_FILE_OP_FAILED, + "Unable" + " to close file: %s, ret: %d", +- iter->filepath, ret); ++ (*iter)->filepath, ret); ++ ++ GF_FREE(*iter); ++ *iter = NULL; + +- GF_FREE(iter); + return ret; + } + +diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c +index a8651d8..e027575 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-store.c ++++ b/xlators/mgmt/glusterd/src/glusterd-store.c +@@ -2576,7 +2576,7 @@ glusterd_store_retrieve_snapd(glusterd_volinfo_t *volinfo) + ret = 0; + + out: +- if (gf_store_iter_destroy(iter)) { ++ if (gf_store_iter_destroy(&iter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; +@@ -2895,13 +2895,13 @@ glusterd_store_retrieve_bricks(glusterd_volinfo_t *volinfo) + ret = 0; + + out: +- if (gf_store_iter_destroy(tmpiter)) { ++ if (gf_store_iter_destroy(&tmpiter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; + } + +- if (gf_store_iter_destroy(iter)) { ++ if (gf_store_iter_destroy(&iter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; +@@ -3067,7 +3067,7 @@ glusterd_store_retrieve_node_state(glusterd_volinfo_t *volinfo) + ret = 0; + + out: +- if (gf_store_iter_destroy(iter)) { ++ if (gf_store_iter_destroy(&iter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; +@@ -3379,7 +3379,7 @@ glusterd_store_update_volinfo(glusterd_volinfo_t *volinfo) + ret = 0; + + out: +- if (gf_store_iter_destroy(iter)) { ++ if (gf_store_iter_destroy(&iter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; +@@ -3574,7 +3574,7 @@ 
glusterd_store_retrieve_options(xlator_t *this) + goto out; + ret = 0; + out: +- (void)gf_store_iter_destroy(iter); ++ (void)gf_store_iter_destroy(&iter); + gf_store_handle_destroy(shandle); + return ret; + } +@@ -4026,7 +4026,7 @@ glusterd_store_update_snap(glusterd_snap_t *snap) + ret = 0; + + out: +- if (gf_store_iter_destroy(iter)) { ++ if (gf_store_iter_destroy(&iter)) { + gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL, + "Failed to destroy store iter"); + ret = -1; +@@ -4774,7 +4774,7 @@ glusterd_store_retrieve_peers(xlator_t *this) + is_ok = _gf_true; + + next: +- (void)gf_store_iter_destroy(iter); ++ (void)gf_store_iter_destroy(&iter); + + if (!is_ok) { + gf_log(this->name, GF_LOG_WARNING, +-- +1.8.3.1 + diff --git a/SOURCES/0598-glusterd-null-dereference.patch b/SOURCES/0598-glusterd-null-dereference.patch new file mode 100644 index 0000000..fac1b8f --- /dev/null +++ b/SOURCES/0598-glusterd-null-dereference.patch @@ -0,0 +1,51 @@ +From 84aaaded4e958a10c7492233c053e3c681f2d575 Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Thu, 2 Jul 2020 18:10:32 +0530 +Subject: [PATCH 598/610] glusterd: null dereference + +Issue: +There has been either an explicit null +dereference or a dereference after null +check in some cases. + +Fix: +Added the proper condition for null check +and fixed null derefencing. + +CID: 1430106 : Dereference after null check +CID: 1430120 : Explicit null dereferenced +CID: 1430132 : Dereference after null check +CID: 1430134 : Dereference after null check + +>Change-Id: I7e795cf9f7146a633097c26a766f16b159881fa3 +>Updates: #1060 +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/24664 +BUG: 1997447 + +Change-Id: I2b2632c93094d0e7b9fbd65a2ca2b0eaf6212d79 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280083 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-syncop.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c +index 05c9e11..f1807cd 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-syncop.c ++++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c +@@ -1797,7 +1797,7 @@ gd_brick_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict, + pending_node = NULL; + ret = 0; + out: +- if (pending_node) ++ if (pending_node && pending_node->node) + glusterd_pending_node_put_rpc(pending_node); + + if (rsp_dict) +-- +1.8.3.1 + diff --git a/SOURCES/0599-afr-null-dereference-nagative-value.patch b/SOURCES/0599-afr-null-dereference-nagative-value.patch new file mode 100644 index 0000000..7d59cc7 --- /dev/null +++ b/SOURCES/0599-afr-null-dereference-nagative-value.patch @@ -0,0 +1,59 @@ +From 4186f81596a481a5c0c5a707fc9b2358ee8f49f0 Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Fri, 3 Jul 2020 17:18:33 +0530 +Subject: [PATCH 599/610] afr: null dereference & nagative value + +Added a check for NULL before dereferencing +the object as it may be NULL in few cases +inside the funtion. Also, added a check for +the negative value of gfid_idx. 
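+
+A minimal sketch of the index guard (names are placeholders, not the afr
+code):
+
+    /* An index that can legitimately come back negative must be rejected
+     * before it is used to address an array or a bitmask. */
+    static int pick_source(const int *sources, int count, int idx)
+    {
+        if (idx < 0 || idx >= count)
+            return -1;           /* propagate the error instead of indexing */
+        return sources[idx];
+    }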
+ +CID: 1430140 +CID: 1430145 + +>Change-Id: Ib7d23459b48bbc471dbcccab6d20572261882d11 +>Updates: #1060 +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/24671 +BUG: 1997447 + +Change-Id: I7e705a106d97001b67f5cde8589413c0c24ee507 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280085 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-self-heal-common.c | 2 +- + xlators/cluster/afr/src/afr-self-heal-name.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c +index 0954d2c..cbd5117 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-common.c ++++ b/xlators/cluster/afr/src/afr-self-heal-common.c +@@ -140,7 +140,7 @@ heal: + } + } + out: +- if (gfid_idx && (*gfid_idx == -1) && (ret == 0)) { ++ if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) { + ret = -afr_final_errno(local, priv); + } + loc_wipe(&loc); +diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c +index 9ec2066..c5ab8d7 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-name.c ++++ b/xlators/cluster/afr/src/afr-self-heal-name.c +@@ -353,7 +353,7 @@ __afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, + ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode, + replies, gfid, locked_on, source, sources, + is_gfid_absent, &gfid_idx); +- if (ret) ++ if (ret || (gfid_idx < 0)) + return ret; + + ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname, +-- +1.8.3.1 + diff --git a/SOURCES/0600-dht-xlator-integer-handling-issue.patch b/SOURCES/0600-dht-xlator-integer-handling-issue.patch new file mode 100644 index 0000000..c3970ac --- /dev/null +++ b/SOURCES/0600-dht-xlator-integer-handling-issue.patch @@ -0,0 +1,161 @@ +From 1cd16553d436fa703f5e18d71c35108d0e179e8b Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Thu, 9 Apr 2020 11:36:34 +0530 +Subject: [PATCH 600/610] dht xlator: integer handling issue + +Issue: The ret value is passed to the function +instead of the proper errno value + +Fix: Passing the errno generated to +the log function + +CID: 1415824 : Improper use of negative value +CID: 1420205 : Improper use of negative value +>Change-Id: Iaa7407ebd03eda46a2c027695e6bf0f598b371b2 +>Updates: #1060 +>Signed-off-by: nik-redhat + +Upstream link: https://review.gluster.org/c/glusterfs/+/24314 +BUG: 1997447 + +Change-Id: Ibb7f432dbcc9ffd8dff6be6f984a6705894d6bef +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280086 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-common.c | 12 ++++++++---- + xlators/cluster/dht/src/dht-common.h | 2 +- + xlators/cluster/dht/src/dht-helper.c | 9 ++++++--- + xlators/cluster/dht/src/dht-selfheal.c | 8 +++++--- + 4 files changed, 20 insertions(+), 11 deletions(-) + +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index e6a16ff..5eaaa1e 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -672,13 +672,14 @@ dht_discover_complete(xlator_t *this, call_frame_t *discover_frame) + + if (local->need_xattr_heal && !heal_path) { + local->need_xattr_heal = 0; +- ret = dht_dir_xattr_heal(this, local); +- if (ret) +- gf_msg(this->name, 
GF_LOG_ERROR, ret, ++ ret = dht_dir_xattr_heal(this, local, &op_errno); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "xattr heal failed for " + "directory gfid is %s ", + gfid_local); ++ } + } + } + +@@ -1205,7 +1206,7 @@ dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size, + to non hashed subvol + */ + int +-dht_dir_xattr_heal(xlator_t *this, dht_local_t *local) ++dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno) + { + dht_local_t *copy_local = NULL; + call_frame_t *copy = NULL; +@@ -1217,6 +1218,7 @@ dht_dir_xattr_heal(xlator_t *this, dht_local_t *local) + "No gfid exists for path %s " + "so healing xattr is not possible", + local->loc.path); ++ *op_errno = EIO; + goto out; + } + +@@ -1230,6 +1232,7 @@ dht_dir_xattr_heal(xlator_t *this, dht_local_t *local) + "Memory allocation failed " + "for path %s gfid %s ", + local->loc.path, gfid_local); ++ *op_errno = ENOMEM; + DHT_STACK_DESTROY(copy); + } else { + copy_local->stbuf = local->stbuf; +@@ -1244,6 +1247,7 @@ dht_dir_xattr_heal(xlator_t *this, dht_local_t *local) + "Synctask creation failed to heal xattr " + "for path %s gfid %s ", + local->loc.path, gfid_local); ++ *op_errno = ENOMEM; + DHT_STACK_DESTROY(copy); + } + } +diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h +index b856c68..1cb1c0c 100644 +--- a/xlators/cluster/dht/src/dht-common.h ++++ b/xlators/cluster/dht/src/dht-common.h +@@ -1493,7 +1493,7 @@ dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + dict_t *src, int *uret, int *uflag); + + int +-dht_dir_xattr_heal(xlator_t *this, dht_local_t *local); ++dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno); + + int32_t + dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size, +diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c +index 4c3940a..d3444b3 100644 +--- a/xlators/cluster/dht/src/dht-helper.c ++++ b/xlators/cluster/dht/src/dht-helper.c +@@ -2105,6 +2105,7 @@ dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data) + dht_local_t *local = NULL; + xlator_t *this = NULL; + int ret = -1; ++ int op_errno = 0; + + local = heal_frame->local; + main_frame = local->main_frame; +@@ -2114,10 +2115,12 @@ dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data) + dht_set_fixed_dir_stat(&local->postparent); + if (local->need_xattr_heal) { + local->need_xattr_heal = 0; +- ret = dht_dir_xattr_heal(this, local); +- if (ret) +- gf_msg(this->name, GF_LOG_ERROR, ret, DHT_MSG_DIR_XATTR_HEAL_FAILED, ++ ret = dht_dir_xattr_heal(this, local, &op_errno); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, ++ DHT_MSG_DIR_XATTR_HEAL_FAILED, + "xattr heal failed for directory %s ", local->loc.path); ++ } + } + + DHT_STACK_UNWIND(lookup, main_frame, 0, 0, local->inode, &local->stbuf, +diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c +index 8af7301..2da9817 100644 +--- a/xlators/cluster/dht/src/dht-selfheal.c ++++ b/xlators/cluster/dht/src/dht-selfheal.c +@@ -1471,6 +1471,7 @@ dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout, + { + int missing_dirs = 0; + int i = 0; ++ int op_errno = 0; + int ret = -1; + dht_local_t *local = NULL; + xlator_t *this = NULL; +@@ -1493,13 +1494,14 @@ dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout, + if (!__is_root_gfid(local->stbuf.ia_gfid)) { + if 
(local->need_xattr_heal) { + local->need_xattr_heal = 0; +- ret = dht_dir_xattr_heal(this, local); +- if (ret) +- gf_msg(this->name, GF_LOG_ERROR, ret, ++ ret = dht_dir_xattr_heal(this, local, &op_errno); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "%s:xattr heal failed for " + "directory (gfid = %s)", + local->loc.path, local->gfid); ++ } + } else { + if (!gf_uuid_is_null(local->gfid)) + gf_uuid_copy(loc->gfid, local->gfid); +-- +1.8.3.1 + diff --git a/SOURCES/0601-coverity-resource-leak-2321.patch b/SOURCES/0601-coverity-resource-leak-2321.patch new file mode 100644 index 0000000..35dc964 --- /dev/null +++ b/SOURCES/0601-coverity-resource-leak-2321.patch @@ -0,0 +1,99 @@ +From 6d7049a19029331266f70f68d860bbccef01a35d Mon Sep 17 00:00:00 2001 +From: Nikhil Ladha +Date: Thu, 8 Jul 2021 11:26:54 +0530 +Subject: [PATCH 601/610] coverity: resource leak (#2321) + +Issue: +Variable `arg` is not freed before the function exits, +and leads to resource leak. + +Fix: +Free the arg variable if the status of function call +`glusterd_compare_friend_volume` is +`GLUSTERD_VOL_COMP_UPDATE_REQ`, or if the `glusterd_launch_synctask` +fails to start the process. + +And, added a check for return value on calling +`glusterd_launch_synctask` function and exit if the +thread creation fails. + +CID: 1401716 +>Updates: #1060 + +>Change-Id: I4abd621771f88853d8d01e9039cdee2f3d862c4f +>Signed-off-by: nik-redhat + +Upstream link: https://github.com/gluster/glusterfs/pull/2321 +BUG: 1997447 + +Change-Id: Ida81dfcd58c5ef45d3ae036d6bd6b36dc6693538 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280090 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-utils.c | 10 +++++++--- + xlators/mgmt/glusterd/src/glusterd-utils.h | 2 +- + 2 files changed, 8 insertions(+), 4 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index c037933..cec9c20 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -5371,6 +5371,7 @@ glusterd_compare_friend_data(dict_t *peer_data, dict_t *cmp, int32_t *status, + + if (GLUSTERD_VOL_COMP_RJT == *status) { + ret = 0; ++ update = _gf_false; + goto out; + } + if (GLUSTERD_VOL_COMP_UPDATE_REQ == *status) { +@@ -5385,11 +5386,12 @@ glusterd_compare_friend_data(dict_t *peer_data, dict_t *cmp, int32_t *status, + * first brick to come up before attaching the subsequent bricks + * in case brick multiplexing is enabled + */ +- glusterd_launch_synctask(glusterd_import_friend_volumes_synctask, arg); ++ ret = glusterd_launch_synctask(glusterd_import_friend_volumes_synctask, ++ arg); + } + + out: +- if (ret && arg) { ++ if ((ret || !update) && arg) { + dict_unref(arg->peer_data); + dict_unref(arg->peer_ver_data); + GF_FREE(arg); +@@ -13115,7 +13117,7 @@ gd_default_synctask_cbk(int ret, call_frame_t *frame, void *opaque) + return ret; + } + +-void ++int + glusterd_launch_synctask(synctask_fn_t fn, void *opaque) + { + xlator_t *this = NULL; +@@ -13131,6 +13133,8 @@ glusterd_launch_synctask(synctask_fn_t fn, void *opaque) + gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_SPAWN_SVCS_FAIL, + "Failed to spawn bricks" + " and other volume related services"); ++ ++ return ret; + } + + /* +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h +index 4541471..3f4f3b8 100644 +--- 
a/xlators/mgmt/glusterd/src/glusterd-utils.h ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.h +@@ -681,7 +681,7 @@ int32_t + glusterd_take_lvm_snapshot(glusterd_brickinfo_t *brickinfo, + char *origin_brick_path); + +-void ++int + glusterd_launch_synctask(synctask_fn_t fn, void *opaque); + + int +-- +1.8.3.1 + diff --git a/SOURCES/0602-coverity-null-dereference-2395.patch b/SOURCES/0602-coverity-null-dereference-2395.patch new file mode 100644 index 0000000..6edc3aa --- /dev/null +++ b/SOURCES/0602-coverity-null-dereference-2395.patch @@ -0,0 +1,87 @@ +From 2ff83650a5f05e3f06853df6d79d3b18f88dfb23 Mon Sep 17 00:00:00 2001 +From: Nikhil Ladha +Date: Thu, 6 May 2021 10:45:46 +0530 +Subject: [PATCH 602/610] coverity: null dereference (#2395) + +Fix: +Updated the code to make it more readable and fixed +the NULL dereferencing. + +CID: 1234622 +>Updates: #1060 + +>Change-Id: I05bd203bc46fe84be86398bd664a3485409c3bfe +>Signed-off-by: nik-redhat + +Upstream link: https://github.com/gluster/glusterfs/pull/2395 +BUG: 1997447 + +Change-Id: If39cc85115de673a83b6c97137ea8d1f0f825245 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280093 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-lock.c | 32 +++++++++++++++----------------- + 1 file changed, 15 insertions(+), 17 deletions(-) + +diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c +index f9bac4f..6474dfa 100644 +--- a/xlators/cluster/dht/src/dht-lock.c ++++ b/xlators/cluster/dht/src/dht-lock.c +@@ -914,37 +914,35 @@ dht_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + dht_local_t *local = NULL; + int lk_index = 0, call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; ++ dht_ilock_wrap_t *my_layout; + + local = frame->local; + lk_index = (long)cookie; + ++ my_layout = &(local->lock[0].layout.my_layout); ++ + if (op_ret == -1) { +- local->lock[0].layout.my_layout.op_ret = -1; +- local->lock[0].layout.my_layout.op_errno = op_errno; +- +- if (local && local->lock[0].layout.my_layout.locks[lk_index]) { +- uuid_utoa_r(local->lock[0] +- .layout.my_layout.locks[lk_index] +- ->loc.inode->gfid, +- gfid); +- +- gf_msg_debug( +- this->name, op_errno, +- "inodelk failed on gfid: %s " +- "subvolume: %s", +- gfid, +- local->lock[0].layout.my_layout.locks[lk_index]->xl->name); ++ my_layout->op_ret = -1; ++ my_layout->op_errno = op_errno; ++ ++ if (my_layout->locks[lk_index]) { ++ uuid_utoa_r(my_layout->locks[lk_index]->loc.inode->gfid, gfid); ++ ++ gf_msg_debug(this->name, op_errno, ++ "inodelk failed on gfid: %s " ++ "subvolume: %s", ++ gfid, my_layout->locks[lk_index]->xl->name); + } + + goto out; + } + +- local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true; ++ my_layout->locks[lk_index]->locked = _gf_true; + + out: + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { +- if (local->lock[0].layout.my_layout.op_ret < 0) { ++ if (my_layout->op_ret < 0) { + dht_inodelk_cleanup(frame); + return 0; + } +-- +1.8.3.1 + diff --git a/SOURCES/0603-Coverity-Resource-leak-fix-CID-1356547.patch b/SOURCES/0603-Coverity-Resource-leak-fix-CID-1356547.patch new file mode 100644 index 0000000..8c6b53b --- /dev/null +++ b/SOURCES/0603-Coverity-Resource-leak-fix-CID-1356547.patch @@ -0,0 +1,51 @@ +From 015e6cac71b0a0c330f1e4792f9d60214b191f45 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Thu, 7 Oct 2021 21:07:46 +0530 +Subject: [PATCH 603/610] Coverity: Resource leak fix (CID: 
1356547) + +Issue: +In function gf_svc_readdirp() there is a chance that 'local' will be allocated +memory but not released in the failure path. + +Fix: +Assign 'local' to 'frame->local' immediately after the successful allocation, so +it will be released by the existing failure path code itself. + +> Upstream patch: https://github.com/gluster/glusterfs/pull/2362/ +> Change-Id: I4474dc4d4be5432d169cb7d434728f211054997e +> Signed-off-by: karthik-us +> Updates: gluster#1060 + +BUG: 1997447 +Change-Id: I4474dc4d4be5432d169cb7d434728f211054997e +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280100 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/snapview-client/src/snapview-client.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/features/snapview-client/src/snapview-client.c b/xlators/features/snapview-client/src/snapview-client.c +index 9c789ae..e97db89 100644 +--- a/xlators/features/snapview-client/src/snapview-client.c ++++ b/xlators/features/snapview-client/src/snapview-client.c +@@ -2156,6 +2156,7 @@ gf_svc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + "failed to allocate local"); + goto out; + } ++ frame->local = local; + + /* + * This is mainly for samba shares (or windows clients). As part of +@@ -2184,7 +2185,6 @@ gf_svc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + + local->subvolume = subvolume; + local->fd = fd_ref(fd); +- frame->local = local; + + STACK_WIND(frame, gf_svc_readdirp_cbk, subvolume, subvolume->fops->readdirp, + fd, size, off, xdata); +-- +1.8.3.1 + diff --git a/SOURCES/0604-Coverity-Fix-dereference-before-null-check-CID-13914.patch b/SOURCES/0604-Coverity-Fix-dereference-before-null-check-CID-13914.patch new file mode 100644 index 0000000..a680327 --- /dev/null +++ b/SOURCES/0604-Coverity-Fix-dereference-before-null-check-CID-13914.patch @@ -0,0 +1,50 @@ +From dee1c932df22ee12fe4568b40e58a475309e62fd Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Thu, 7 Oct 2021 21:18:49 +0530 +Subject: [PATCH 604/610] Coverity: Fix dereference before null check (CID: + 1391415) + +Problem: +In function gf_client_dump_inodes_to_dict() there is a null check for +a variable which is already dereferenced in the previous line. This +means that there could be a chance that this variable is null. But it +is not being validate for null before dereferencing it in the first +place. + +Fix: +Added null check before dereferencing the variable at the first place. 
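+
+Reduced to a sketch (struct shapes and names are placeholders):
+
+    #include <string.h>
+
+    struct xl     { const char *name; };
+    struct client { struct xl *bound_xl; };
+
+    /* Broken shape: client->bound_xl is dereferenced first and only checked
+     * for NULL afterwards, so the check can never protect anything.
+     * Fixed shape: validate before the first dereference. */
+    static int is_bound_to(const struct client *client, const char *name)
+    {
+        return client->bound_xl &&
+               strcmp(client->bound_xl->name, name) == 0;
+    }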
+ +> Upstream patch: https://github.com/gluster/glusterfs/pull/2369/ +> Change-Id: I988b0e93542782353a8059e33db1522b6a5e55f8 +> Signed-off-by: karthik-us +> Updates: gluster#1060 + +BUG: 1997447 +Change-Id: I988b0e93542782353a8059e33db1522b6a5e55f8 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280103 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/client_t.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/libglusterfs/src/client_t.c b/libglusterfs/src/client_t.c +index e875c8b..216900a 100644 +--- a/libglusterfs/src/client_t.c ++++ b/libglusterfs/src/client_t.c +@@ -828,8 +828,9 @@ gf_client_dump_inodes_to_dict(xlator_t *this, dict_t *dict) + clienttable->cliententries[count].next_free) + continue; + client = clienttable->cliententries[count].client; +- if (!strcmp(client->bound_xl->name, this->name)) { +- if (client->bound_xl && client->bound_xl->itable) { ++ if (client->bound_xl && ++ !strcmp(client->bound_xl->name, this->name)) { ++ if (client->bound_xl->itable) { + /* Presently every brick contains only + * one bound_xl for all connections. + * This will lead to duplicating of +-- +1.8.3.1 + diff --git a/SOURCES/0605-Coverity-Fix-copy-into-fixed-size-buffer-CID-1325542.patch b/SOURCES/0605-Coverity-Fix-copy-into-fixed-size-buffer-CID-1325542.patch new file mode 100644 index 0000000..849c959 --- /dev/null +++ b/SOURCES/0605-Coverity-Fix-copy-into-fixed-size-buffer-CID-1325542.patch @@ -0,0 +1,53 @@ +From 25fc2530f7ee6d7267e2ccc1b75a47a3ae539dff Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Thu, 7 Oct 2021 21:29:27 +0530 +Subject: [PATCH 605/610] Coverity: Fix copy into fixed size buffer (CID: + 1325542) + +Problem: +In __mnt3_fresh_lookup() mres->resolveloc.path is being copied into +a fixed size string mres->remainingdir, with strncpy without checking +the size of the source string. This could lead to string overflow. + +Fix: +Copy only till the destination string length and check whether the +soruce string overflows. If so log an error message and return. 
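+
+The general pattern, as a hedged standalone sketch:
+
+    #include <stdio.h>
+
+    /* strncpy(dst, src, strlen(src)) bounds the copy by the SOURCE length,
+     * so a long source silently overruns a fixed-size destination.
+     * snprintf() bounds by the destination and reports truncation. */
+    static int copy_path(char *dst, size_t dst_size, const char *src)
+    {
+        if ((size_t)snprintf(dst, dst_size, "%s", src) >= dst_size)
+            return -1;    /* would have truncated: treat it as an error */
+        return 0;
+    }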
+ +> Upstream patch: https://github.com/gluster/glusterfs/pull/2474/ +> Change-Id: I26dd0653d2636c667ad4e356d12d3d51956c77c3 +> Signed-off-by: karthik-us +> Updates: gluster#1060 + +BUG: 1997447 +Change-Id: I26dd0653d2636c667ad4e356d12d3d51956c77c3 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280106 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/nfs/server/src/mount3.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c +index 734453c..3951b9e 100644 +--- a/xlators/nfs/server/src/mount3.c ++++ b/xlators/nfs/server/src/mount3.c +@@ -1104,8 +1104,13 @@ __mnt3_fresh_lookup(mnt3_resolve_t *mres) + { + inode_unlink(mres->resolveloc.inode, mres->resolveloc.parent, + mres->resolveloc.name); +- strncpy(mres->remainingdir, mres->resolveloc.path, +- strlen(mres->resolveloc.path)); ++ if (snprintf(mres->remainingdir, sizeof(mres->remainingdir), "%s", ++ mres->resolveloc.path) >= sizeof(mres->remainingdir)) { ++ gf_msg(GF_MNT, GF_LOG_ERROR, EFAULT, NFS_MSG_RESOLVE_INODE_FAIL, ++ "Failed to copy resolve path: %s", mres->resolveloc.path); ++ nfs_loc_wipe(&mres->resolveloc); ++ return -EFAULT; ++ } + nfs_loc_wipe(&mres->resolveloc); + return __mnt3_resolve_subdir(mres); + } +-- +1.8.3.1 + diff --git a/SOURCES/0606-dht-handle-DHT_SUBVOL_STATUS_KEY-in-dht_pt_getxattr-.patch b/SOURCES/0606-dht-handle-DHT_SUBVOL_STATUS_KEY-in-dht_pt_getxattr-.patch new file mode 100644 index 0000000..05ca17b --- /dev/null +++ b/SOURCES/0606-dht-handle-DHT_SUBVOL_STATUS_KEY-in-dht_pt_getxattr-.patch @@ -0,0 +1,69 @@ +From a6ba95b73469ad81d8c5a27293f8d09cc26928a3 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Fri, 18 Dec 2020 16:28:29 +0530 +Subject: [PATCH 606/610] dht: handle DHT_SUBVOL_STATUS_KEY in dht_pt_getxattr + (#1934) + +In non distribute volumes (plain replicate, ec), DHT uses pass-through +FOPs (dht_pt_getxattr) instead of the usual FOPS (dht_getxattr). The +pass through FOP was not handling the DHT_SUBVOL_STATUS_KEY virtual +xattr because of which geo-rep session was going into a faulty state. +Fixing it now. 
+ +> updates: #1925 +> Change-Id: I766b5b5c047c954a9957ab78aca680eedef1ff1f +> Signed-off-by: Ravishankar N + +Upstream patch: https://github.com/gluster/glusterfs/pull/1934 + +BUG: 2006205 +Change-Id: I766b5b5c047c954a9957ab78aca680eedef1ff1f +Signed-off-by: Shwetha K Acharya +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280112 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-common.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index 5eaaa1e..c8980e5 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -11584,9 +11584,33 @@ int + dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *xdata) + { ++ int op_errno = -1; ++ dht_local_t *local = NULL; ++ ++ VALIDATE_OR_GOTO(frame, err); ++ VALIDATE_OR_GOTO(this, err); ++ VALIDATE_OR_GOTO(loc, err); ++ VALIDATE_OR_GOTO(loc->inode, err); ++ VALIDATE_OR_GOTO(this->private, err); ++ ++ local = dht_local_init(frame, loc, NULL, GF_FOP_GETXATTR); ++ if (!local) { ++ op_errno = ENOMEM; ++ goto err; ++ } ++ ++ if (key && ++ strncmp(key, DHT_SUBVOL_STATUS_KEY, SLEN(DHT_SUBVOL_STATUS_KEY)) == 0) { ++ dht_vgetxattr_subvol_status(frame, this, key); ++ return 0; ++ } ++ + STACK_WIND(frame, dht_pt_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, key, xdata); + return 0; ++err: ++ DHT_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL); ++ return 0; + } + + static int +-- +1.8.3.1 + diff --git a/SOURCES/0607-SELinux-Fix-boolean-management.patch b/SOURCES/0607-SELinux-Fix-boolean-management.patch new file mode 100644 index 0000000..4a62b03 --- /dev/null +++ b/SOURCES/0607-SELinux-Fix-boolean-management.patch @@ -0,0 +1,121 @@ +From 4b65ff0d1a3d70fcf3cfa8ab769135ae12f529d8 Mon Sep 17 00:00:00 2001 +From: nik-redhat +Date: Thu, 7 Oct 2021 22:02:32 +0530 +Subject: [PATCH 607/610] SELinux: Fix boolean management + +Remove %triggerun ganesha +This trigger shouldn't be needed to begin with since removing +selinux-policy-targeted means that the user is switching SELinux off, or +is is switching the policy (to "mls" or "minimum"). In either case the +current boolean setting is not going to be used any more. The last +option, removal of glusterfs-ganesha, is covered by '%postun ganesha'. +But more importantly, the trigger is called every time +selinux-policy-targeted is updated (which can be avoided). +%triggerun is executed after %triggerin - +https://docs.fedoraproject.org/en-US/packaging-guidelines/Scriptlets/#ordering +So when selinux-policy-targeted is updated, the new version is installed +first triggering `semanage boolean -m ganesha_use_fusefs --on`, +and then the old version is uninstalled triggering +`semanage boolean -m ganesha_use_fusefs --off`. + +* use selinux_[un]set_booleans instead of "semanage boolean" + The macro pair properly manages SELinux stores and doesn't disable the + boolean in case it was enabled before ${name}-ganesha was installed. + +* Only change booleans when the package is first installed or + uninstalled +Updating ${name}-ganesha would disable the boolean because %postun is +called after %post (same issue as with the triggers). + +Signed-off-by: Vit Mojzis +Signed-off-by: Kaleb S. 
KEITHLEY +Change-Id: Ibb926ffbe00c9f000bd740708c0a4b3435ee7871 +PR: https://github.com/gluster/glusterfs/pull/2833 +Issue: https://github.com/gluster/glusterfs/issues/2522 +Resolves: rhbz#1973566 +Resolves: rhbz#1975400 + +BUG: 1973566 +Change-Id: Idef6cbd6bce35151518d6f76e5b74774e5756fc9 +Signed-off-by: nik-redhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280114 +Tested-by: RHGS Build Bot +Reviewed-by: Kaleb Keithley +--- + glusterfs.spec.in | 34 +++++++++++++++++++++------------- + 1 file changed, 21 insertions(+), 13 deletions(-) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 424f4ab..a9a83b1 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -954,7 +954,10 @@ exit 0 + %if ( 0%{!?_without_server:1} ) + %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) + %post ganesha +-semanage boolean -m ganesha_use_fusefs --on ++# first install ++if [ $1 -eq 1 ]; then ++ %selinux_set_booleans ganesha_use_fusefs=1 ++fi + exit 0 + %endif + %endif +@@ -962,7 +965,9 @@ exit 0 + %if ( 0%{!?_without_georeplication:1} ) + %post geo-replication + %if ( 0%{?rhel} && 0%{?rhel} >= 8 ) +-%selinux_set_booleans %{selinuxbooleans} ++if [ $1 -eq 1 ]; then ++ %selinux_set_booleans %{selinuxbooleans} ++fi + %endif + if [ $1 -ge 1 ]; then + %systemd_postun_with_restart glusterd +@@ -1089,29 +1094,32 @@ exit 0 + %if ( 0%{!?_without_server:1} ) + %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) + %postun ganesha +-semanage boolean -m ganesha_use_fusefs --off ++if [ $1 -eq 0 ]; then ++ # use the value of ganesha_use_fusefs from before glusterfs-ganesha was installed ++ %selinux_unset_booleans ganesha_use_fusefs=1 ++fi + exit 0 + %endif + %endif + +-##----------------------------------------------------------------------------- +-## All %%trigger should be placed here and keep them sorted +-## +-%if ( 0%{!?_without_server:1} ) +-%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) +-%trigger ganesha -- selinux-policy-targeted +-semanage boolean -m ganesha_use_fusefs --on ++%if ( 0%{!?_without_georeplication:1} ) ++%postun geo-replication ++%if ( 0%{?rhel} && 0%{?rhel} >= 8 ) ++if [ $1 -eq 0 ]; then ++ %selinux_unset_booleans %{selinuxbooleans} ++fi + exit 0 + %endif + %endif + + ##----------------------------------------------------------------------------- +-## All %%triggerun should be placed here and keep them sorted ++## All %%trigger should be placed here and keep them sorted + ## + %if ( 0%{!?_without_server:1} ) + %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) +-%triggerun ganesha -- selinux-policy-targeted +-semanage boolean -m ganesha_use_fusefs --off ++# ensure ganesha_use_fusefs is on in case of policy mode switch (eg. mls->targeted) ++%triggerin ganesha -- selinux-policy-targeted ++semanage boolean -m ganesha_use_fusefs --on -S targeted + exit 0 + %endif + %endif +-- +1.8.3.1 + diff --git a/SOURCES/0608-cluster-ec-Track-heal-statistics-in-shd.patch b/SOURCES/0608-cluster-ec-Track-heal-statistics-in-shd.patch new file mode 100644 index 0000000..b08d7a9 --- /dev/null +++ b/SOURCES/0608-cluster-ec-Track-heal-statistics-in-shd.patch @@ -0,0 +1,143 @@ +From d806760f1d4c78a2519b01f1c2d07aba0c533755 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Fri, 28 Aug 2020 16:03:54 +0530 +Subject: [PATCH 608/610] cluster/ec: Track heal statistics in shd + +With this change we should be able to inspect number of heals +attempted and completed by each shd. 
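+
+For reference, a stand-alone version of the completion check that drives the
+'completed' counter (the status values are illustrative; the reply has the
+form "Good: <good>, Bad: <bad>"):
+
+    #include <stdio.h>
+    #include <string.h>
+
+    /* Completed only when the first '0' appears after the last ':',
+     * i.e. the bad count is zero (same rule as ec_is_heal_completed()). */
+    static int heal_completed(const char *status)
+    {
+        const char *zero = strchr(status, '0');
+        const char *last_colon = strrchr(status, ':');
+        return zero && last_colon && zero > last_colon;
+    }
+
+    int main(void)
+    {
+        printf("%d\n", heal_completed("Good: 3, Bad: 0"));   /* 1 */
+        printf("%d\n", heal_completed("Good: 0, Bad: 3"));   /* 0 */
+        return 0;
+    }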
+ +> Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24926/ +> fixes: #1453 +> Change-Id: I10f5d86efcc0a8e4d648da808751d37725682c39 +> Signed-off-by: Pranith Kumar K + +BUG: 1853631 +Change-Id: I10f5d86efcc0a8e4d648da808751d37725682c39 +Signed-off-by: Sheetal Pamecha +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280208 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/ec/src/ec-heald.c | 49 ++++++++++++++++++++++++++++++++++++++- + xlators/cluster/ec/src/ec-types.h | 5 ++++ + xlators/cluster/ec/src/ec.c | 6 +++++ + 3 files changed, 59 insertions(+), 1 deletion(-) + +diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c +index 4f4b6aa..cd4d3ad 100644 +--- a/xlators/cluster/ec/src/ec-heald.c ++++ b/xlators/cluster/ec/src/ec-heald.c +@@ -152,15 +152,58 @@ ec_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name) + return ret; + } + ++static gf_boolean_t ++ec_is_heal_completed(char *status) ++{ ++ char *bad_pos = NULL; ++ char *zero_pos = NULL; ++ ++ if (!status) { ++ return _gf_false; ++ } ++ ++ /*Logic: ++ * Status will be of the form Good: , Bad: ++ * If heal completes, if we do strchr for '0' it should be present after ++ * 'Bad:' i.e. strRchr for ':' ++ * */ ++ ++ zero_pos = strchr(status, '0'); ++ bad_pos = strrchr(status, ':'); ++ if (!zero_pos || !bad_pos) { ++ /*malformed status*/ ++ return _gf_false; ++ } ++ ++ if (zero_pos > bad_pos) { ++ return _gf_true; ++ } ++ ++ return _gf_false; ++} ++ + int + ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc, + gf_boolean_t full) + { + dict_t *xdata = NULL; ++ dict_t *dict = NULL; + uint32_t count; + int32_t ret; ++ char *heal_status = NULL; ++ ec_t *ec = healer->this->private; ++ ++ GF_ATOMIC_INC(ec->stats.shd.attempted); ++ ret = syncop_getxattr(healer->this, loc, &dict, EC_XATTR_HEAL, NULL, ++ &xdata); ++ if (ret == 0) { ++ if (dict && (dict_get_str(dict, EC_XATTR_HEAL, &heal_status) == 0)) { ++ if (ec_is_heal_completed(heal_status)) { ++ GF_ATOMIC_INC(ec->stats.shd.completed); ++ } ++ } ++ } + +- ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata); + if (!full && (loc->inode->ia_type == IA_IFDIR)) { + /* If we have just healed a directory, it's possible that + * other index entries have appeared to be healed. */ +@@ -179,6 +222,10 @@ ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc, + dict_unref(xdata); + } + ++ if (dict) { ++ dict_unref(dict); ++ } ++ + return ret; + } + +diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h +index 700dc39..ef7a7fe 100644 +--- a/xlators/cluster/ec/src/ec-types.h ++++ b/xlators/cluster/ec/src/ec-types.h +@@ -626,6 +626,11 @@ struct _ec_statistics { + requests. (Basically memory allocation + errors). 
*/ + } stripe_cache; ++ struct { ++ gf_atomic_t attempted; /*Number of heals attempted on ++ files/directories*/ ++ gf_atomic_t completed; /*Number of heals complted on files/directories*/ ++ } shd; + }; + + struct _ec { +diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c +index 047cdd8..24de9e8 100644 +--- a/xlators/cluster/ec/src/ec.c ++++ b/xlators/cluster/ec/src/ec.c +@@ -649,6 +649,8 @@ ec_statistics_init(ec_t *ec) + GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0); ++ GF_ATOMIC_INIT(ec->stats.shd.attempted, 0); ++ GF_ATOMIC_INIT(ec->stats.shd.completed, 0); + } + + int32_t +@@ -1445,6 +1447,10 @@ ec_dump_private(xlator_t *this) + GF_ATOMIC_GET(ec->stats.stripe_cache.allocs)); + gf_proc_dump_write("errors", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.stripe_cache.errors)); ++ gf_proc_dump_write("heals-attempted", "%" GF_PRI_ATOMIC, ++ GF_ATOMIC_GET(ec->stats.shd.attempted)); ++ gf_proc_dump_write("heals-completed", "%" GF_PRI_ATOMIC, ++ GF_ATOMIC_GET(ec->stats.shd.completed)); + + return 0; + } +-- +1.8.3.1 + diff --git a/SOURCES/0609-feature-shard-wrong-dname-results-in-dentry-not-foun.patch b/SOURCES/0609-feature-shard-wrong-dname-results-in-dentry-not-foun.patch new file mode 100644 index 0000000..a3290cb --- /dev/null +++ b/SOURCES/0609-feature-shard-wrong-dname-results-in-dentry-not-foun.patch @@ -0,0 +1,43 @@ +From 89cdfb40264c12105a1b4990fa9b45290aa6cef0 Mon Sep 17 00:00:00 2001 +From: Vinayakswami Hariharmath +Date: Fri, 8 Oct 2021 09:40:41 +0530 +Subject: [PATCH 609/610] feature/shard: wrong dname results in dentry not + found error + +Due to wrong dname passed to inode_unlink in +shard_evicted_inode_fsync_cbk() resulting in dentry not found +error. + +This patch addresses the issue. 
+ +> upstream patch: https://github.com/gluster/glusterfs/pull/2475 +> Fixes: #2470 +> Change-Id: I6c479980ae3fa7ba558327055a9e5e5c2d2a850f +> Signed-off-by: Vinayakswami Hariharmath vharihar@redhat.com + +BUG: 1911665 +Change-Id: I96aa5f57303b69a08990de039ddeecad7e7ae6af +Signed-off-by: Vinayakswami Hariharmath +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280202 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/shard/src/shard.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index b828ff9..882373f 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -950,7 +950,7 @@ shard_evicted_inode_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + { + __shard_inode_ctx_get(shard_inode, this, &ctx); + if ((list_empty(&ctx->to_fsync_list)) && (list_empty(&ctx->ilist))) { +- shard_make_block_bname(ctx->block_num, shard_inode->gfid, ++ shard_make_block_bname(ctx->block_num, ctx->base_gfid, + block_bname, sizeof(block_bname)); + inode_unlink(shard_inode, priv->dot_shard_inode, block_bname); + /* The following unref corresponds to the ref held by +-- +1.8.3.1 + diff --git a/SOURCES/0610-glusterfs.spec.in-remove-condtionals-from-tar-depend.patch b/SOURCES/0610-glusterfs.spec.in-remove-condtionals-from-tar-depend.patch new file mode 100644 index 0000000..132da9c --- /dev/null +++ b/SOURCES/0610-glusterfs.spec.in-remove-condtionals-from-tar-depend.patch @@ -0,0 +1,51 @@ +From b3e86a66de224107f6760157a7cb692227e42954 Mon Sep 17 00:00:00 2001 +From: Shwetha Acharya +Date: Mon, 30 Aug 2021 18:54:15 +0530 +Subject: [PATCH 610/610] glusterfs.spec.in: remove condtionals from tar + dependency (#2734) + +* glusterfs.spec.in: remove condtionals from tar dependency + +The conditional on rhel minor version fails and tar is not +marked as required. + +As there is not any universal macro to specify the +minor release, removing the conditionals above the +"Requires: tar" statement + +with this change irrespective of rhel 8.3 and +above, tar will be marked required for geo-rep. 
+ +> Change-Id: Id1e3320a0b1a245fc9cd8c7acb09cc119fca18b8 +> Signed-off-by: Shwetha K Acharya + +Upstream patch: https://github.com/gluster/glusterfs/pull/2734 + +BUG: 1901468 +Change-Id: Id1e3320a0b1a245fc9cd8c7acb09cc119fca18b8 +Signed-off-by: Shwetha K Acharya +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/280116 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Tested-by: RHGS Build Bot +--- + glusterfs.spec.in | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index a9a83b1..8b6646f 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -521,9 +521,8 @@ Requires: python%{_pythonver}-gluster = %{version}-%{release} + Requires: rsync + Requires: util-linux + Requires: %{name}-libs%{?_isa} = %{version}-%{release} +-%if ( 0%{?rhel} && ( ( 0%{?rhel} == 8 && 0%{?rhel_minor_version} >= 3 ) || 0%{?rhel} >= 9 ) ) + Requires: tar +-%endif ++ + # required for setting selinux bools + %if ( 0%{?rhel} && 0%{?rhel} >= 8 ) + Requires(post): policycoreutils-python-utils +-- +1.8.3.1 + diff --git a/SOURCES/0611-SELinux-Fix-boolean-management-again.patch b/SOURCES/0611-SELinux-Fix-boolean-management-again.patch new file mode 100644 index 0000000..a5b2612 --- /dev/null +++ b/SOURCES/0611-SELinux-Fix-boolean-management-again.patch @@ -0,0 +1,54 @@ +From 5ad4711f40c0e8ab7c196ac1c9025bf78b8b94e0 Mon Sep 17 00:00:00 2001 +From: "Kaleb S. KEITHLEY" +Date: Thu, 18 Nov 2021 09:21:56 -0500 +Subject: [PATCH 611/611] SELinux: Fix boolean management, again + +When upgrading from a version of the package that does not include +the previous fix this means the flawed scriptlet is still executed, +undoing the setting of the boolean. + +In order to work the boolean needs to be set in %posttrans. This is +a temporary change that can (or should) be removed in the next version +of RHGS, i.e. 3.5.7. + +Issue: https://github.com/gluster/glusterfs/issues/2522 +Resolves: rhbz#1973566 +Resolves: rhbz#1975400 + +Label: DOWNSTREAM ONLY + +BUG: 1973566 +Change-Id: Ida39a3ee5e6b4b0d3255bfef95601890afd80709 +Signed-off-by: Kaleb S. KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/292189 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + glusterfs.spec.in | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 8b6646f..87176c9 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -1123,6 +1123,17 @@ exit 0 + %endif + %endif + ++%if ( 0%{!?_without_server:1} ) ++%if ( ( 0%{?fedora} && 0%{?fedora} > 25 ) || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) ++# temporary fix to be removed in the next version (i.e. RHGS 3.5.7). This ++# is only needed when upgrading from the flawed versions (e.g. RHGS 3.5.5 ++# and earlier.) 
++%posttrans ganesha ++semanage boolean -m ganesha_use_fusefs --on -S targeted ++exit 0 ++%endif ++%endif ++ + ##----------------------------------------------------------------------------- + ## All %%files should be placed here and keep them grouped + ## +-- +1.8.3.1 + diff --git a/SPECS/glusterfs.spec b/SPECS/glusterfs.spec index c0e2ed4..50c331e 100644 --- a/SPECS/glusterfs.spec +++ b/SPECS/glusterfs.spec @@ -237,7 +237,7 @@ Release: 0.1%{?prereltag:.%{prereltag}}%{?dist} %else Name: glusterfs Version: 6.0 -Release: 56.4%{?dist} +Release: 61.2%{?dist} ExcludeArch: i686 %endif License: GPLv2 or LGPLv3+ @@ -858,7 +858,74 @@ Patch0540: 0540-extras-Disable-write-behind-for-group-samba.patch Patch0541: 0541-glusterd-volgen-Add-functionality-to-accept-any-cust.patch Patch0542: 0542-xlaotrs-mgmt-Fixing-coverity-issue-1445996.patch Patch0543: 0543-glusterd-handle-custom-xlator-failure-cases.patch -Patch0544: 0544-RHGS-3.5.4-rebuild-to-ship-with-RHEL-8.5.patch +Patch0544: 0544-tests-avoid-empty-paths-in-environment-variables.patch +Patch0545: 0545-tests-Excluded-tests-for-unsupported-components.patch +Patch0546: 0546-Update-rfc.sh-to-rhgs-3.5.5.patch +Patch0547: 0547-perf-write-behind-Clear-frame-local-on-conflict-erro.patch +Patch0548: 0548-Add-tar-as-dependency-to-geo-rep-rpm-for-RHEL-8.3-an.patch +Patch0549: 0549-geo-rep-Change-in-attribute-for-getting-function-nam.patch +Patch0550: 0550-common-ha-stability-fixes-for-ganesha_grace-and-gane.patch +Patch0551: 0551-common-ha-ensure-shared_storage-is-mounted-before-se.patch +Patch0552: 0552-cluster-afr-Change-default-self-heal-window-size-to-.patch +Patch0553: 0553-cluster-ec-Change-self-heal-window-size-to-4MiB-by-d.patch +Patch0554: 0554-dht-fix-rebalance-of-sparse-files.patch +Patch0555: 0555-geo-rep-Improve-handling-of-gfid-mismatches.patch +Patch0556: 0556-dht-don-t-ignore-xdata-in-fgetxattr.patch +Patch0557: 0557-cluster-dht-Fix-stack-overflow-in-readdir-p.patch +Patch0558: 0558-afr-fix-directory-entry-count.patch +Patch0559: 0559-afr-make-fsync-post-op-aware-of-inodelk-count-2273.patch +Patch0560: 0560-posix-Avoid-dict_del-logs-in-posix_is_layout_stale-w.patch +Patch0561: 0561-cluster-ec-Inform-failure-when-some-bricks-are-unava.patch +Patch0562: 0562-shard.c-Fix-formatting.patch +Patch0563: 0563-features-shard-Use-fd-lookup-post-file-open.patch +Patch0564: 0564-store.c-glusterd-store.c-remove-sys_stat-calls.patch +Patch0565: 0565-libglusterfs-coverity-pointer-to-local-outside-the-s.patch +Patch0566: 0566-enahancement-debug-Option-to-generate-core-dump-with.patch +Patch0567: 0567-inode-create-inode-outside-locked-region.patch +Patch0568: 0568-core-tcmu-runner-process-continuous-growing-logs-lru.patch +Patch0569: 0569-features-shard-optimization-over-shard-lookup-in-cas.patch +Patch0570: 0570-features-shard-avoid-repeatative-calls-to-gf_uuid_un.patch +Patch0571: 0571-NetBSD-build-fixes.patch +Patch0572: 0572-locks-remove-unused-conditional-switch-to-spin_lock-.patch +Patch0573: 0573-features-shard-unlink-fails-due-to-nospace-to-mknod-.patch +Patch0574: 0574-features-shard-delay-unlink-of-a-file-that-has-fd_co.patch +Patch0575: 0575-libglusterfs-add-functions-to-calculate-time-differe.patch +Patch0576: 0576-rpcsvc-Add-latency-tracking-for-rpc-programs.patch +Patch0577: 0577-protocol-client-don-t-reopen-fds-on-which-POSIX-lock.patch +Patch0578: 0578-protocol-client-fallback-to-anonymous-fd-for-fsync.patch +Patch0579: 0579-cli-changing-rebal-task-ID-to-None-in-case-status-is.patch +Patch0580: 
0580-cluster-dht-suppress-file-migration-error-for-node-n.patch +Patch0581: 0581-afr-don-t-reopen-fds-on-which-POSIX-locks-are-held.patch +Patch0582: 0582-protocol-client-Fix-lock-memory-leak.patch +Patch0583: 0583-protocol-client-Initialize-list-head-to-prevent-NULL.patch +Patch0584: 0584-dht-fixing-xattr-inconsistency.patch +Patch0585: 0585-ganesha_ha-ganesha_grace-RA-fails-in-start-and-or-fa.patch +Patch0586: 0586-protocol-client-Do-not-reopen-fd-post-handshake-if-p.patch +Patch0587: 0587-Update-rfc.sh-to-rhgs-3.5.6.patch +Patch0588: 0588-locks-Fix-null-gfid-in-lock-contention-notifications.patch +Patch0589: 0589-extras-fix-for-postscript-failure-on-logrotation-of-.patch +Patch0590: 0590-cluster-afr-Don-t-check-for-stale-entry-index.patch +Patch0591: 0591-afr-check-for-valid-iatt.patch +Patch0592: 0592-md-cache-fix-integer-signedness-mismatch.patch +Patch0593: 0593-dht-explicit-null-dereference.patch +Patch0594: 0594-glusterd-resource-leaks.patch +Patch0595: 0595-glusterd-use-after-free-coverity-issue.patch +Patch0596: 0596-locks-null-dereference.patch +Patch0597: 0597-glusterd-memory-deallocated-twice.patch +Patch0598: 0598-glusterd-null-dereference.patch +Patch0599: 0599-afr-null-dereference-nagative-value.patch +Patch0600: 0600-dht-xlator-integer-handling-issue.patch +Patch0601: 0601-coverity-resource-leak-2321.patch +Patch0602: 0602-coverity-null-dereference-2395.patch +Patch0603: 0603-Coverity-Resource-leak-fix-CID-1356547.patch +Patch0604: 0604-Coverity-Fix-dereference-before-null-check-CID-13914.patch +Patch0605: 0605-Coverity-Fix-copy-into-fixed-size-buffer-CID-1325542.patch +Patch0606: 0606-dht-handle-DHT_SUBVOL_STATUS_KEY-in-dht_pt_getxattr-.patch +Patch0607: 0607-SELinux-Fix-boolean-management.patch +Patch0608: 0608-cluster-ec-Track-heal-statistics-in-shd.patch +Patch0609: 0609-feature-shard-wrong-dname-results-in-dentry-not-foun.patch +Patch0610: 0610-glusterfs.spec.in-remove-condtionals-from-tar-depend.patch +Patch0611: 0611-SELinux-Fix-boolean-management-again.patch %description GlusterFS is a distributed file-system capable of scaling to several @@ -1067,6 +1134,8 @@ Requires: python%{_pythonver}-gluster = %{version}-%{release} Requires: rsync Requires: util-linux Requires: %{name}-libs%{?_isa} = %{version}-%{release} +Requires: tar + # required for setting selinux bools %if ( 0%{?rhel} && 0%{?rhel} >= 8 ) Requires(post): policycoreutils-python-utils @@ -1570,7 +1639,10 @@ exit 0 %if ( 0%{!?_without_server:1} ) %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) %post ganesha -semanage boolean -m ganesha_use_fusefs --on +# first install +if [ $1 -eq 1 ]; then + %selinux_set_booleans ganesha_use_fusefs=1 +fi exit 0 %endif %endif @@ -1578,7 +1650,9 @@ exit 0 %if ( 0%{!?_without_georeplication:1} ) %post geo-replication %if ( 0%{?rhel} && 0%{?rhel} >= 8 ) -%selinux_set_booleans %{selinuxbooleans} +if [ $1 -eq 1 ]; then + %selinux_set_booleans %{selinuxbooleans} +fi %endif if [ $1 -ge 1 ]; then %systemd_postun_with_restart glusterd @@ -1705,7 +1779,20 @@ exit 0 %if ( 0%{!?_without_server:1} ) %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) %postun ganesha -semanage boolean -m ganesha_use_fusefs --off +if [ $1 -eq 0 ]; then + # use the value of ganesha_use_fusefs from before glusterfs-ganesha was installed + %selinux_unset_booleans ganesha_use_fusefs=1 +fi +exit 0 +%endif +%endif + +%if ( 0%{!?_without_georeplication:1} ) +%postun geo-replication +%if ( 0%{?rhel} && 0%{?rhel} >= 8 ) +if [ $1 -eq 0 ]; then + 
%selinux_unset_booleans %{selinuxbooleans} +fi exit 0 %endif %endif @@ -1715,19 +1802,20 @@ exit 0 ## %if ( 0%{!?_without_server:1} ) %if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) -%trigger ganesha -- selinux-policy-targeted -semanage boolean -m ganesha_use_fusefs --on +# ensure ganesha_use_fusefs is on in case of policy mode switch (eg. mls->targeted) +%triggerin ganesha -- selinux-policy-targeted +semanage boolean -m ganesha_use_fusefs --on -S targeted exit 0 %endif %endif -##----------------------------------------------------------------------------- -## All %%triggerun should be placed here and keep them sorted -## %if ( 0%{!?_without_server:1} ) -%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) -%triggerun ganesha -- selinux-policy-targeted -semanage boolean -m ganesha_use_fusefs --off +%if ( ( 0%{?fedora} && 0%{?fedora} > 25 ) || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) +# temporary fix to be removed in the next version (i.e. RHGS 3.5.7). This +# is only needed when upgrading from the flawed versions (e.g. RHGS 3.5.5 +# and earlier.) +%posttrans ganesha +semanage boolean -m ganesha_use_fusefs --on -S targeted exit 0 %endif %endif @@ -2601,11 +2689,30 @@ fi %endif %changelog -* Mon Aug 30 2021 Gluster Jenkins - 6.0-56.4 -- Add gating.yaml, fixes bugs bz#1996984 +* Tue Feb 22 2022 Tamar Shacked - 6.0-61.2 +- Rebuilt with rhel-8.5.0-z-build target. fixes bugs bz#2056953 + +* Sun Feb 20 2022 Tamar Shacked - 6.0-61.1 +- Rebuilt for rhel-8.5.0.z. fixes bugs bz#2056647 + +* Mon Nov 29 2021 Gluster Jenkins - 6.0-61 +- fixes bugs bz#1973566 + +* Mon Oct 11 2021 Gluster Jenkins - 6.0-60 +- fixes bugs bz#1668303 bz#1853631 bz#1901468 bz#1904137 bz#1911665 + bz#1962972 bz#1973566 bz#1994593 bz#1995029 bz#1997447 bz#2006205 + +* Tue Jul 06 2021 Gluster Jenkins - 6.0-59 +- fixes bugs bz#1689375 + +* Wed Jun 16 2021 Gluster Jenkins - 6.0-58 +- fixes bugs bz#1945143 -* Tue Aug 24 2021 Gluster Jenkins - 6.0-56.3 -- fixes bugs bz#1996984 +* Tue Jun 08 2021 Gluster Jenkins - 6.0-57 +- fixes bugs bz#1600379 bz#1689375 bz#1782428 bz#1798897 bz#1815462 + bz#1889966 bz#1891403 bz#1901468 bz#1903911 bz#1908635 bz#1917488 bz#1918018 + bz#1919132 bz#1925425 bz#1927411 bz#1927640 bz#1928676 bz#1942816 bz#1943467 + bz#1945143 bz#1946171 bz#1957191 bz#1957641 * Thu May 06 2021 Gluster Jenkins - 6.0-56.2 - fixes bugs bz#1953901