12a457
From a468ff0af57528dbd7a711f985c0524251ec90d3 Mon Sep 17 00:00:00 2001
12a457
From: Ravishankar N <ravishankar@redhat.com>
12a457
Date: Tue, 5 Apr 2016 11:40:05 +0530
12a457
Subject: [PATCH 40/80] afr: Add throttled background client-side heals
12a457
12a457
Backport of: http://review.gluster.org/13207
12a457
12a457
If a heal is needed after inode refresh (lookup, read_txn), launch it in
12a457
the background instead of blocking the fop (that triggered refresh)
12a457
until the heal happens.
12a457
12a457
afr_replies_interpret() is modified such that the heal is
12a457
launched only if atleast one sink brick is up.
12a457
12a457
Max. no of heals that can happen in parallel is configurable via the
12a457
'background-self-heal-count' volume option. Any number greater than that
12a457
is put in a wait queue whose length is configurable via
12a457
'heal-wait-queue-leng' volume option. If the wait queue is also full,
12a457
further heals will be ignored.
12a457
12a457
Default values:  background-self-heal-count=8, heal-wait-queue-leng=128
12a457
12a457
Change-Id: Ief20b915f8b3064dfbde41e9216b080de45f31f5
12a457
BUG: 1300875
12a457
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
12a457
Reviewed-on: https://code.engineering.redhat.com/gerrit/71393
12a457
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
12a457
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
12a457
---
12a457
 tests/basic/afr/client-side-heal.t              |    1 +
12a457
 tests/bugs/glusterd/859927/repl.t               |    4 +-
12a457
 tests/bugs/quota/bug-1035576.t                  |    1 -
12a457
 tests/bugs/replicate/bug-802417.t               |    5 +-
12a457
 tests/bugs/replicate/bug-977797.t               |   52 +++++------
12a457
 xlators/cluster/afr/src/afr-common.c            |   89 +++++++++----------
12a457
 xlators/cluster/afr/src/afr-dir-write.c         |    2 +-
12a457
 xlators/cluster/afr/src/afr-self-heal-common.c  |  110 +++++++++++++++++++++++
12a457
 xlators/cluster/afr/src/afr-self-heal.h         |    3 +
12a457
 xlators/cluster/afr/src/afr.c                   |   26 +++++-
12a457
 xlators/cluster/afr/src/afr.h                   |   28 +++++--
12a457
 xlators/mgmt/glusterd/src/glusterd-volume-set.c |    8 ++
12a457
 12 files changed, 237 insertions(+), 92 deletions(-)
12a457
12a457
diff --git a/tests/basic/afr/client-side-heal.t b/tests/basic/afr/client-side-heal.t
12a457
index 18f7626..d87f4b1 100644
12a457
--- a/tests/basic/afr/client-side-heal.t
12a457
+++ b/tests/basic/afr/client-side-heal.t
12a457
@@ -70,6 +70,7 @@ EXPECT 7 get_pending_heal_count $V0
12a457
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
12a457
 TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
12a457
 TEST cat $M0/datafile
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT 6 get_pending_heal_count $V0
12a457
 
12a457
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
12a457
 TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
12a457
diff --git a/tests/bugs/glusterd/859927/repl.t b/tests/bugs/glusterd/859927/repl.t
12a457
index a500961..40e8602 100755
12a457
--- a/tests/bugs/glusterd/859927/repl.t
12a457
+++ b/tests/bugs/glusterd/859927/repl.t
12a457
@@ -23,7 +23,6 @@ TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2};
12a457
 TEST $CLI volume set $V0 cluster.self-heal-daemon off
12a457
 TEST $CLI volume set $V0 performance.stat-prefetch off
12a457
 TEST $CLI volume set $V0 client-log-level DEBUG
12a457
-TEST $CLI volume set $V0 cluster.background-self-heal-count 0
12a457
 TEST $CLI volume start $V0
12a457
 TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0;
12a457
 
12a457
@@ -34,6 +33,7 @@ EXPECT full volume_option $V0 cluster.data-self-heal-algorithm
12a457
 create_setup_for_self_heal $M0/a
12a457
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
12a457
 cat $file 2>&1 > /dev/null
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT 0 get_pending_heal_count $V0
12a457
 TEST cmp $B0/${V0}1/a $B0/${V0}2/a
12a457
 
12a457
 TEST $CLI volume set $V0 cluster.data-self-heal-algorithm diff
12a457
@@ -41,12 +41,14 @@ EXPECT diff volume_option $V0 cluster.data-self-heal-algorithm
12a457
 create_setup_for_self_heal $M0/a
12a457
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
12a457
 cat $file 2>&1 > /dev/null
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT 0 get_pending_heal_count $V0
12a457
 TEST cmp $B0/${V0}1/a $B0/${V0}2/a
12a457
 
12a457
 TEST $CLI volume reset $V0 cluster.data-self-heal-algorithm
12a457
 create_setup_for_self_heal $M0/a
12a457
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
12a457
 cat $file 2>&1 > /dev/null
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT 0 get_pending_heal_count $V0
12a457
 TEST cmp $B0/${V0}1/a $B0/${V0}2/a
12a457
 
12a457
 TEST ! $CLI volume set $V0 cluster.data-self-heal-algorithm ""
12a457
diff --git a/tests/bugs/quota/bug-1035576.t b/tests/bugs/quota/bug-1035576.t
12a457
index 99b3925..eaf4439 100644
12a457
--- a/tests/bugs/quota/bug-1035576.t
12a457
+++ b/tests/bugs/quota/bug-1035576.t
12a457
@@ -17,7 +17,6 @@ TEST $CLI volume set $V0 performance.io-cache off
12a457
 TEST $CLI volume set $V0 performance.write-behind off
12a457
 TEST $CLI volume set $V0 performance.stat-prefetch off
12a457
 TEST $CLI volume set $V0 performance.read-ahead off
12a457
-TEST $CLI volume set $V0 background-self-heal-count 0
12a457
 TEST $CLI volume set $V0 self-heal-daemon off
12a457
 TEST $CLI volume quota $V0 enable
12a457
 
12a457
diff --git a/tests/bugs/replicate/bug-802417.t b/tests/bugs/replicate/bug-802417.t
12a457
index df989b1..c5ba98b 100755
12a457
--- a/tests/bugs/replicate/bug-802417.t
12a457
+++ b/tests/bugs/replicate/bug-802417.t
12a457
@@ -32,7 +32,6 @@ TEST $CLI volume set $V0 performance.stat-prefetch off
12a457
 ## Make sure automatic self-heal doesn't perturb our results.
12a457
 TEST $CLI volume set $V0 cluster.self-heal-daemon off
12a457
 TEST $CLI volume set $V0 cluster.data-self-heal on
12a457
-TEST $CLI volume set $V0 cluster.background-self-heal-count 0
12a457
 
12a457
 ## Start volume and verify
12a457
 TEST $CLI volume start $V0;
12a457
@@ -70,8 +69,8 @@ tgt_xattr_2="trusted.afr.${V0}-client-2"
12a457
 actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_0)
12a457
 EXPECT "0x000000000000000000000000|^\$" echo $actual
12a457
 
12a457
-actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_1)
12a457
-EXPECT "0x000000000000000000000000|^\$" echo $actual
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT "0x000000000000000000000000" \
12a457
+afr_get_changelog_xattr $obs_path_0 $tgt_xattr_1
12a457
 
12a457
 actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_2)
12a457
 EXPECT "0x000000030000000000000000" echo $actual
12a457
diff --git a/tests/bugs/replicate/bug-977797.t b/tests/bugs/replicate/bug-977797.t
12a457
index 3ff14ec..72c616b 100755
12a457
--- a/tests/bugs/replicate/bug-977797.t
12a457
+++ b/tests/bugs/replicate/bug-977797.t
12a457
@@ -26,7 +26,6 @@ TEST $CLI volume set $V0 quick-read off
12a457
 TEST $CLI volume set $V0 read-ahead off
12a457
 TEST $CLI volume set $V0 write-behind off
12a457
 TEST $CLI volume set $V0 io-cache off
12a457
-TEST $CLI volume set $V0 background-self-heal-count 0
12a457
 
12a457
 TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
12a457
 
12a457
@@ -56,34 +55,29 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1;
12a457
 
12a457
 TEST dd if=$M0/a/file of=/dev/null bs=1024k
12a457
 
12a457
-b1c0dir=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a \
12a457
-          trusted.afr.$V0-client-0 "entry")
12a457
-b1c1dir=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a \
12a457
-          trusted.afr.$V0-client-1 "entry")
12a457
-b2c0dir=$(afr_get_specific_changelog_xattr \
12a457
-          $B0/$V0"2"/a trusted.afr.$V0-client-0 "entry")
12a457
-b2c1dir=$(afr_get_specific_changelog_xattr \
12a457
-          $B0/$V0"2"/a trusted.afr.$V0-client-1 "entry")
12a457
-
12a457
-
12a457
-b1c0f=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a/file \
12a457
-        trusted.afr.$V0-client-0 "data")
12a457
-b1c1f=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a/file \
12a457
-        trusted.afr.$V0-client-1 "data")
12a457
-b2c0f=$(afr_get_specific_changelog_xattr $B0/$V0"2"/a/file \
12a457
-        trusted.afr.$V0-client-0 "data")
12a457
-b2c1f=$(afr_get_specific_changelog_xattr $B0/$V0"2"/a/file \
12a457
-        trusted.afr.$V0-client-1 "data")
12a457
-
12a457
-EXPECT "00000000|^$" echo $b1c0f
12a457
-EXPECT "00000000|^$" echo $b1c1f
12a457
-EXPECT "00000000|^$" echo $b2c0f
12a457
-EXPECT "00000000|^$" echo $b2c1f
12a457
-
12a457
-EXPECT "00000000|^$" echo $b1c0dir
12a457
-EXPECT "00000000|^$" echo $b1c1dir
12a457
-EXPECT "00000000|^$" echo $b2c0dir
12a457
-EXPECT "00000000|^$" echo $b2c1dir
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000"  \
12a457
+afr_get_specific_changelog_xattr $B0/$V0"1"/a/file trusted.afr.$V0-client-0 "data"
12a457
+
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
12a457
+afr_get_specific_changelog_xattr $B0/$V0"1"/a/file trusted.afr.$V0-client-1 "data"
12a457
+
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
12a457
+afr_get_specific_changelog_xattr $B0/$V0"2"/a/file trusted.afr.$V0-client-0 "data"
12a457
+
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" \
12a457
+afr_get_specific_changelog_xattr $B0/$V0"2"/a/file trusted.afr.$V0-client-1 "data"
12a457
+
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
12a457
+afr_get_specific_changelog_xattr $B0/$V0"1"/a trusted.afr.$V0-client-0 "entry"
12a457
+
12a457
+EXPECT_WITHIN HEAL_TIMEOUT "00000000" \
12a457
+afr_get_specific_changelog_xattr $B0/$V0"1"/a trusted.afr.$V0-client-1 "entry"
12a457
+
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
12a457
+afr_get_specific_changelog_xattr $B0/$V0"2"/a trusted.afr.$V0-client-0 "entry"
12a457
+
12a457
+EXPECT_WITHIN $HEAL_TIMEOUT  "00000000" \
12a457
+afr_get_specific_changelog_xattr $B0/$V0"2"/a trusted.afr.$V0-client-1 "entry"
12a457
 
12a457
 ## Finish up
12a457
 TEST $CLI volume stop $V0;
12a457
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
12a457
index b232a2d..b5d07ac 100644
12a457
--- a/xlators/cluster/afr/src/afr-common.c
12a457
+++ b/xlators/cluster/afr/src/afr-common.c
12a457
@@ -697,7 +697,8 @@ afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
12a457
 }
12a457
 
12a457
 int
12a457
-afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode)
12a457
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
12a457
+                       gf_boolean_t *start_heal)
12a457
 {
12a457
 	afr_local_t *local = NULL;
12a457
 	afr_private_t *priv = NULL;
12a457
@@ -777,6 +778,13 @@ afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode)
12a457
 		}
12a457
 	}
12a457
 
12a457
+	for (i = 0; i < priv->child_count; i++) {
12a457
+                if (start_heal && priv->child_up[i] &&
12a457
+                    (!data_readable[i] || !metadata_readable[i])) {
12a457
+                        *start_heal = _gf_true;
12a457
+                        break;
12a457
+                }
12a457
+        }
12a457
 	afr_inode_read_subvol_set (inode, this, data_readable,
12a457
 				   metadata_readable, event_generation);
12a457
 	return ret;
12a457
@@ -815,36 +823,6 @@ ret:
12a457
 	return -err;
12a457
 }
12a457
 
12a457
-
12a457
-int
12a457
-afr_refresh_selfheal_wrap (void *opaque)
12a457
-{
12a457
-	call_frame_t *frame = opaque;
12a457
-	afr_local_t *local = NULL;
12a457
-	xlator_t *this = NULL;
12a457
-	int err = 0;
12a457
-
12a457
-	local = frame->local;
12a457
-	this = frame->this;
12a457
-
12a457
-	afr_selfheal (frame->this, local->refreshinode->gfid);
12a457
-
12a457
-	afr_selfheal_unlocked_discover (frame, local->refreshinode,
12a457
-					local->refreshinode->gfid,
12a457
-					local->replies);
12a457
-
12a457
-	afr_replies_interpret (frame, this, local->refreshinode);
12a457
-
12a457
-	err = afr_inode_refresh_err (frame, this);
12a457
-
12a457
-        afr_local_replies_wipe (local, this->private);
12a457
-
12a457
-	local->refreshfn (frame, this, err);
12a457
-
12a457
-	return 0;
12a457
-}
12a457
-
12a457
-
12a457
 gf_boolean_t
12a457
 afr_selfheal_enabled (xlator_t *this)
12a457
 {
12a457
@@ -860,35 +838,43 @@ afr_selfheal_enabled (xlator_t *this)
12a457
 	return data || priv->metadata_self_heal || priv->entry_self_heal;
12a457
 }
12a457
 
12a457
-
12a457
 int
12a457
 afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
12a457
 {
12a457
-	call_frame_t *heal = NULL;
12a457
+	call_frame_t *heal_frame = NULL;
12a457
 	afr_local_t *local = NULL;
12a457
+        gf_boolean_t start_heal = _gf_false;
12a457
+        afr_local_t *heal_local = NULL;
12a457
+        int op_errno = ENOMEM;
12a457
 	int ret = 0;
12a457
 	int err = 0;
12a457
 
12a457
 	local = frame->local;
12a457
 
12a457
-	ret = afr_replies_interpret (frame, this, local->refreshinode);
12a457
+	ret = afr_replies_interpret (frame, this, local->refreshinode,
12a457
+                                     &start_heal);
12a457
 
12a457
 	err = afr_inode_refresh_err (frame, this);
12a457
 
12a457
         afr_local_replies_wipe (local, this->private);
12a457
 
12a457
-	if (ret && afr_selfheal_enabled (this)) {
12a457
-		heal = copy_frame (frame);
12a457
-		if (heal)
12a457
-			heal->root->pid = GF_CLIENT_PID_SELF_HEALD;
12a457
-		ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap,
12a457
-				    afr_refresh_selfheal_done, heal, frame);
12a457
-		if (ret)
12a457
-			goto refresh_done;
12a457
-	} else {
12a457
-	refresh_done:
12a457
-		local->refreshfn (frame, this, err);
12a457
-	}
12a457
+	if (ret && afr_selfheal_enabled (this) && start_heal) {
12a457
+                heal_frame = copy_frame (frame);
12a457
+                if (!heal_frame)
12a457
+                        goto refresh_done;
12a457
+                heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
12a457
+                heal_local = AFR_FRAME_INIT (heal_frame, op_errno);
12a457
+                if (!heal_local) {
12a457
+                        AFR_STACK_DESTROY (heal_frame);
12a457
+                        goto refresh_done;
12a457
+                }
12a457
+                heal_local->refreshinode = inode_ref (local->refreshinode);
12a457
+                heal_local->heal_frame = heal_frame;
12a457
+                afr_throttled_selfheal (heal_frame, this);
12a457
+        }
12a457
+
12a457
+refresh_done:
12a457
+        local->refreshfn (frame, this, err);
12a457
 
12a457
 	return 0;
12a457
 }
12a457
@@ -1785,7 +1771,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
12a457
 		*/
12a457
                 gf_uuid_copy (args.gfid, read_gfid);
12a457
                 args.ia_type = ia_type;
12a457
-		if (afr_replies_interpret (frame, this, local->inode)) {
12a457
+		if (afr_replies_interpret (frame, this, local->inode, NULL)) {
12a457
                         read_subvol = afr_read_subvol_decide (local->inode,
12a457
                                                               this, &args);
12a457
 			afr_inode_read_subvol_reset (local->inode, this);
12a457
@@ -2246,7 +2232,7 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
12a457
                 goto unwind;
12a457
 	}
12a457
 
12a457
-	afr_replies_interpret (frame, this, local->inode);
12a457
+	afr_replies_interpret (frame, this, local->inode, NULL);
12a457
 
12a457
 	read_subvol = afr_read_subvol_decide (local->inode, this, NULL);
12a457
 	if (read_subvol == -1) {
12a457
@@ -3899,6 +3885,12 @@ afr_priv_dump (xlator_t *this)
12a457
         gf_proc_dump_write("favorite_child", "%d", priv->favorite_child);
12a457
         gf_proc_dump_write("wait_count", "%u", priv->wait_count);
12a457
         gf_proc_dump_write("quorum-reads", "%d", priv->quorum_reads);
12a457
+        gf_proc_dump_write("heal-wait-queue-length", "%d",
12a457
+                           priv->heal_wait_qlen);
12a457
+        gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters);
12a457
+        gf_proc_dump_write("background-self-heal-count", "%d",
12a457
+                           priv->background_self_heal_count);
12a457
+        gf_proc_dump_write("healers", "%d", priv->healers);
12a457
 
12a457
         return 0;
12a457
 }
12a457
@@ -4205,6 +4197,7 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
12a457
 		goto out;
12a457
 	}
12a457
 
12a457
+        INIT_LIST_HEAD (&local->healer);
12a457
 	return 0;
12a457
 out:
12a457
         return -1;
12a457
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
12a457
index 3d586dd..887298b 100644
12a457
--- a/xlators/cluster/afr/src/afr-dir-write.c
12a457
+++ b/xlators/cluster/afr/src/afr-dir-write.c
12a457
@@ -93,7 +93,7 @@ __afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)
12a457
 	priv = this->private;
12a457
 
12a457
 	if (local->inode) {
12a457
-		afr_replies_interpret (frame, this, local->inode);
12a457
+		afr_replies_interpret (frame, this, local->inode, NULL);
12a457
 		inode_read_subvol = afr_data_subvol_get (local->inode, this,
12a457
 							 NULL, NULL, NULL);
12a457
 	}
12a457
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
12a457
index 6e90de0..73d7e94 100644
12a457
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
12a457
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
12a457
@@ -20,6 +20,9 @@
12a457
 #include "protocol-common.h"
12a457
 #include "afr-messages.h"
12a457
 
12a457
+void
12a457
+afr_heal_synctask (xlator_t *this, afr_local_t *local);
12a457
+
12a457
 int
12a457
 afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
12a457
 			  int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
12a457
@@ -1423,3 +1426,110 @@ afr_selfheal (xlator_t *this, uuid_t gfid)
12a457
 
12a457
 	return ret;
12a457
 }
12a457
+
12a457
+afr_local_t*
12a457
+__afr_dequeue_heals (afr_private_t *priv)
12a457
+{
12a457
+        afr_local_t *local = NULL;
12a457
+
12a457
+        if (list_empty (&priv->heal_waiting))
12a457
+                goto none;
12a457
+        if ((priv->background_self_heal_count > 0) &&
12a457
+            (priv->healers >= priv->background_self_heal_count))
12a457
+                goto none;
12a457
+
12a457
+        local = list_entry (priv->heal_waiting.next, afr_local_t, healer);
12a457
+        priv->heal_waiters--;
12a457
+        GF_ASSERT (priv->heal_waiters >= 0);
12a457
+        list_del_init(&local->healer);
12a457
+        list_add(&local->healer, &priv->healing);
12a457
+        priv->healers++;
12a457
+        return local;
12a457
+none:
12a457
+        gf_msg_debug (THIS->name, 0, "Nothing dequeued. "
12a457
+                      "Num healers: %d, Num Waiters: %d",
12a457
+                      priv->healers, priv->heal_waiters);
12a457
+        return NULL;
12a457
+}
12a457
+
12a457
+int
12a457
+afr_refresh_selfheal_wrap (void *opaque)
12a457
+{
12a457
+        call_frame_t *heal_frame = opaque;
12a457
+        afr_local_t *local = heal_frame->local;
12a457
+        int ret = 0;
12a457
+
12a457
+        ret = afr_selfheal (heal_frame->this, local->refreshinode->gfid);
12a457
+        return ret;
12a457
+}
12a457
+
12a457
+int
12a457
+afr_refresh_heal_done (int ret, call_frame_t *frame, void *opaque)
12a457
+{
12a457
+        call_frame_t *heal_frame = opaque;
12a457
+        xlator_t *this = heal_frame->this;
12a457
+        afr_private_t *priv = this->private;
12a457
+        afr_local_t *local = heal_frame->local;
12a457
+
12a457
+        LOCK (&priv->lock);
12a457
+        {
12a457
+                list_del_init(&local->healer);
12a457
+                priv->healers--;
12a457
+                GF_ASSERT (priv->healers >= 0);
12a457
+                local = __afr_dequeue_heals (priv);
12a457
+        }
12a457
+        UNLOCK (&priv->lock);
12a457
+
12a457
+        if (heal_frame)
12a457
+                AFR_STACK_DESTROY (heal_frame);
12a457
+
12a457
+        if (local)
12a457
+                afr_heal_synctask (this, local);
12a457
+        return 0;
12a457
+
12a457
+}
12a457
+
12a457
+void
12a457
+afr_heal_synctask (xlator_t *this, afr_local_t *local)
12a457
+{
12a457
+        int ret = 0;
12a457
+        call_frame_t *heal_frame = NULL;
12a457
+
12a457
+        heal_frame = local->heal_frame;
12a457
+        ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap,
12a457
+                            afr_refresh_heal_done, heal_frame, heal_frame);
12a457
+        if (ret < 0)
12a457
+                /* Heal not launched. Will be queued when the next inode
12a457
+                 * refresh happens and shd hasn't healed it yet. */
12a457
+                afr_refresh_heal_done (ret, heal_frame, heal_frame);
12a457
+}
12a457
+
12a457
+void
12a457
+afr_throttled_selfheal (call_frame_t *frame, xlator_t *this)
12a457
+{
12a457
+        gf_boolean_t can_heal = _gf_true;
12a457
+        afr_private_t *priv = this->private;
12a457
+        afr_local_t *local = frame->local;
12a457
+
12a457
+        LOCK (&priv->lock);
12a457
+        {
12a457
+                if ((priv->background_self_heal_count > 0) &&
12a457
+                    (priv->heal_wait_qlen + priv->background_self_heal_count) >
12a457
+                    (priv->heal_waiters + priv->healers)) {
12a457
+                        list_add_tail(&local->healer, &priv->heal_waiting);
12a457
+                        priv->heal_waiters++;
12a457
+                        local = __afr_dequeue_heals (priv);
12a457
+                } else {
12a457
+                        can_heal = _gf_false;
12a457
+                }
12a457
+        }
12a457
+        UNLOCK (&priv->lock);
12a457
+
12a457
+        if (can_heal) {
12a457
+                if (local)
12a457
+                        afr_heal_synctask (this, local);
12a457
+                else
12a457
+                        gf_msg_debug (this->name, 0, "Max number of heals are "
12a457
+                                      "pending, background self-heal rejected.");
12a457
+        }
12a457
+}
12a457
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
12a457
index 74e852a..b298fa1 100644
12a457
--- a/xlators/cluster/afr/src/afr-self-heal.h
12a457
+++ b/xlators/cluster/afr/src/afr-self-heal.h
12a457
@@ -85,6 +85,9 @@
12a457
 int
12a457
 afr_selfheal (xlator_t *this, uuid_t gfid);
12a457
 
12a457
+void
12a457
+afr_throttled_selfheal (call_frame_t *frame, xlator_t *this);
12a457
+
12a457
 int
12a457
 afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name,
12a457
                    void *gfid_req);
12a457
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
12a457
index 5ef920a..d65895a 100644
12a457
--- a/xlators/cluster/afr/src/afr.c
12a457
+++ b/xlators/cluster/afr/src/afr.c
12a457
@@ -128,6 +128,10 @@ reconfigure (xlator_t *this, dict_t *options)
12a457
                           priv->background_self_heal_count, options, uint32,
12a457
                           out);
12a457
 
12a457
+        GF_OPTION_RECONF ("heal-wait-queue-length",
12a457
+                          priv->heal_wait_qlen, options, uint32, out);
12a457
+
12a457
+
12a457
         GF_OPTION_RECONF ("metadata-self-heal",
12a457
                           priv->metadata_self_heal, options, bool, out);
12a457
 
12a457
@@ -277,6 +281,8 @@ init (xlator_t *this)
12a457
         priv->read_child = -1;
12a457
 
12a457
         GF_OPTION_INIT ("arbiter-count", priv->arbiter_count, uint32, out);
12a457
+        INIT_LIST_HEAD (&priv->healing);
12a457
+        INIT_LIST_HEAD (&priv->heal_waiting);
12a457
 
12a457
         priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;
12a457
 
12a457
@@ -329,6 +335,9 @@ init (xlator_t *this)
12a457
         GF_OPTION_INIT ("background-self-heal-count",
12a457
                         priv->background_self_heal_count, uint32, out);
12a457
 
12a457
+        GF_OPTION_INIT ("heal-wait-queue-length",
12a457
+                        priv->heal_wait_qlen, uint32, out);
12a457
+
12a457
         GF_OPTION_INIT ("data-self-heal", priv->data_self_heal, str, out);
12a457
 
12a457
         GF_OPTION_INIT ("data-self-heal-algorithm",
12a457
@@ -587,10 +596,21 @@ struct volume_options options[] = {
12a457
         { .key  = {"background-self-heal-count"},
12a457
           .type = GF_OPTION_TYPE_INT,
12a457
           .min  = 0,
12a457
-          .default_value = "16",
12a457
+          .max  = 256,
12a457
+          .default_value = "8",
12a457
+          .validate = GF_OPT_VALIDATE_MIN,
12a457
+          .description = "This specifies the number of per client self-heal "
12a457
+                         "jobs that can perform parallel heals in the "
12a457
+                         "background."
12a457
+        },
12a457
+        { .key  = {"heal-wait-queue-length"},
12a457
+          .type = GF_OPTION_TYPE_INT,
12a457
+          .min  = 0,
12a457
+          .max  = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
12a457
+          .default_value = "128",
12a457
           .validate = GF_OPT_VALIDATE_MIN,
12a457
-          .description = "This specifies the number of self-heals that can be "
12a457
-                         " performed in background without blocking the fop"
12a457
+          .description = "This specifies the number of heals that can be queued"
12a457
+                         " for the parallel background self heal jobs."
12a457
         },
12a457
         { .key  = {"data-self-heal"},
12a457
           .type = GF_OPTION_TYPE_STR,
12a457
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
12a457
index 52f9c51..9915344 100644
12a457
--- a/xlators/cluster/afr/src/afr.h
12a457
+++ b/xlators/cluster/afr/src/afr.h
12a457
@@ -74,8 +74,17 @@ typedef struct _afr_private {
12a457
         unsigned int data_self_heal_window_size;  /* max number of pipelined
12a457
                                                      read/writes */
12a457
 
12a457
-        unsigned int background_self_heal_count;
12a457
-        unsigned int background_self_heals_started;
12a457
+        struct list_head heal_waiting; /*queue for files that need heal*/
12a457
+        uint32_t  heal_wait_qlen; /*configurable queue length for heal_waiting*/
12a457
+        int32_t  heal_waiters; /* No. of elements currently in wait queue.*/
12a457
+
12a457
+        struct list_head healing;/* queue for files that are undergoing
12a457
+                                    background heal*/
12a457
+        uint32_t  background_self_heal_count;/*configurable queue length for
12a457
+                                               healing queue*/
12a457
+        int32_t  healers;/* No. of elements currently undergoing background
12a457
+                          heal*/
12a457
+
12a457
         gf_boolean_t metadata_self_heal;   /* on/off */
12a457
         gf_boolean_t entry_self_heal;      /* on/off */
12a457
 
12a457
@@ -127,12 +136,14 @@ typedef struct _afr_private {
12a457
 
12a457
 	afr_self_heald_t       shd;
12a457
 
12a457
-	/* pump dependencies */
12a457
-	void                   *pump_private;
12a457
-	gf_boolean_t           use_afr_in_pump;
12a457
         gf_boolean_t           consistent_metadata;
12a457
         uint64_t               spb_choice_timeout;
12a457
         gf_boolean_t           need_heal;
12a457
+
12a457
+	/* pump dependencies */
12a457
+	void                   *pump_private;
12a457
+	gf_boolean_t           use_afr_in_pump;
12a457
+
12a457
 } afr_private_t;
12a457
 
12a457
 
12a457
@@ -740,6 +751,10 @@ typedef struct _afr_local {
12a457
         int             xflag;
12a457
         gf_boolean_t    do_discovery;
12a457
 	struct afr_reply *replies;
12a457
+
12a457
+        /* For  client side background heals. */
12a457
+        struct list_head healer;
12a457
+        call_frame_t *heal_frame;
12a457
 } afr_local_t;
12a457
 
12a457
 
12a457
@@ -891,7 +906,8 @@ int
12a457
 afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
12a457
 
12a457
 int
12a457
-afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode);
12a457
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
12a457
+                       gf_boolean_t *start_heal);
12a457
 
12a457
 void
12a457
 afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv);
12a457
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
12a457
index f5746c8..1b68c1b 100644
12a457
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
12a457
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
12a457
@@ -1129,6 +1129,14 @@ struct volopt_map_entry glusterd_volopt_map[] = {
12a457
           .op_version = GD_OP_VERSION_RHS_3_0_4,
12a457
           .flags      = OPT_FLAG_CLIENT_OPT
12a457
         },
12a457
+        { .key        = "cluster.heal-wait-queue-length",
12a457
+          .voltype    = "cluster/replicate",
12a457
+          .type       = DOC,
12a457
+          .op_version = GD_OP_VERSION_3_7_10,
12a457
+          .flags      = OPT_FLAG_CLIENT_OPT
12a457
+        },
12a457
+
12a457
+        /* stripe xlator options */
12a457
         { .key         = "cluster.stripe-block-size",
12a457
           .voltype     = "cluster/stripe",
12a457
           .option      = "block-size",
12a457
-- 
12a457
1.7.1
12a457