e7a346
From 93ef66173442aaf4aeaeb161c6d6108eda54014a Mon Sep 17 00:00:00 2001
e7a346
From: Krutika Dhananjay <kdhananj@redhat.com>
e7a346
Date: Thu, 12 Apr 2018 15:47:00 +0530
e7a346
Subject: [PATCH 430/444] features/shard: Perform shards deletion in the
e7a346
 background
e7a346
e7a346
> Upstream: https://review.gluster.org/19970
e7a346
> BUG: 1568521
e7a346
> Change-Id: Ia83117230c9dd7d0d9cae05235644f8475e97bc3
e7a346
e7a346
A synctask is created that scans the indices under
e7a346
.shard/.remove_me and deletes the shards associated with the
e7a346
gfid corresponding to each index bname. The rate of deletion
e7a346
is controlled by the option features.shard-deletion-rate, whose
e7a346
default value is 100.
e7a346
The task is launched on two occasions:
e7a346
1. when shard receives its first-ever lookup on the volume
e7a346
2. when a rename or unlink deletes an inode
e7a346
e7a346
Change-Id: Ia83117230c9dd7d0d9cae05235644f8475e97bc3
e7a346
BUG: 1520882
e7a346
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
e7a346
Reviewed-on: https://code.engineering.redhat.com/gerrit/154864
e7a346
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e7a346
Reviewed-by: Xavi Hernandez <xhernandez@redhat.com>
e7a346
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
e7a346
---
e7a346
 libglusterfs/src/globals.h                      |   1 +
e7a346
 tests/bugs/shard/bug-1568521-EEXIST.t           |  30 +-
e7a346
 tests/bugs/shard/bug-1568521.t                  |  53 ++
e7a346
 tests/bugs/shard/bug-shard-discard.t            |  19 +-
e7a346
 tests/bugs/shard/shard-inode-refcount-test.t    |   5 +-
e7a346
 tests/bugs/shard/unlinks-and-renames.t          | 123 ++--
e7a346
 xlators/features/shard/src/shard-messages.h     |  18 +-
e7a346
 xlators/features/shard/src/shard.c              | 816 +++++++++++++++++++-----
e7a346
 xlators/features/shard/src/shard.h              |  19 +-
e7a346
 xlators/mgmt/glusterd/src/glusterd-volume-set.c |   5 +
e7a346
 10 files changed, 829 insertions(+), 260 deletions(-)
e7a346
 create mode 100644 tests/bugs/shard/bug-1568521.t
e7a346
e7a346
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
e7a346
index 8e218cb..699e73e 100644
e7a346
--- a/libglusterfs/src/globals.h
e7a346
+++ b/libglusterfs/src/globals.h
e7a346
@@ -109,6 +109,7 @@
e7a346
 
e7a346
 #define GD_OP_VERSION_3_13_2   31302 /* Op-version for GlusterFS 3.13.2 */
e7a346
 
e7a346
+#define GD_OP_VERSION_4_2_0    40200 /* Op-version for GlusterFS 4.2.0 */
e7a346
 
e7a346
 /* Downstream only change */
e7a346
 #define GD_OP_VERSION_3_11_2   31102 /* Op-version for RHGS 3.3.1-async */
e7a346
diff --git a/tests/bugs/shard/bug-1568521-EEXIST.t b/tests/bugs/shard/bug-1568521-EEXIST.t
e7a346
index e4c3d41..7de400d 100644
e7a346
--- a/tests/bugs/shard/bug-1568521-EEXIST.t
e7a346
+++ b/tests/bugs/shard/bug-1568521-EEXIST.t
e7a346
@@ -5,6 +5,12 @@
e7a346
 
e7a346
 cleanup
e7a346
 
e7a346
+function get_file_count {
e7a346
+    ls $1* | wc -l
e7a346
+}
e7a346
+
e7a346
+FILE_COUNT_TIME=5
e7a346
+
e7a346
 TEST glusterd
e7a346
 TEST pidof glusterd
e7a346
 TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
e7a346
@@ -41,10 +47,14 @@ TEST setfattr -n trusted.glusterfs.shard.file-size -v 0x000000000050000000000000
e7a346
 sleep 2
e7a346
 
e7a346
 TEST unlink $M0/dir/file
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_file
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_file
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_file
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_file
e7a346
+
e7a346
+TEST ! stat $B0/${V0}0/dir/file
e7a346
+TEST ! stat $B0/${V0}1/dir/file
e7a346
+
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_file
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_file
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_file
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_file
e7a346
 
e7a346
 ##############################
e7a346
 ### Repeat test for rename ###
e7a346
@@ -71,9 +81,13 @@ TEST setfattr -n trusted.glusterfs.shard.file-size -v 0x000000000050000000000000
e7a346
 sleep 2
e7a346
 
e7a346
 TEST mv -f $M0/src $M0/dir/dst
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
+
e7a346
+TEST ! stat $B0/${V0}0/src
e7a346
+TEST ! stat $B0/${V0}1/src
e7a346
+
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_dst
e7a346
 
e7a346
 cleanup
e7a346
diff --git a/tests/bugs/shard/bug-1568521.t b/tests/bugs/shard/bug-1568521.t
e7a346
new file mode 100644
e7a346
index 0000000..167fb63
e7a346
--- /dev/null
e7a346
+++ b/tests/bugs/shard/bug-1568521.t
e7a346
@@ -0,0 +1,53 @@
e7a346
+#!/bin/bash
e7a346
+
e7a346
+. $(dirname $0)/../../include.rc
e7a346
+
e7a346
+
e7a346
+function delete_files {
e7a346
+        local mountpoint=$1;
e7a346
+        local success=0;
e7a346
+        local value=$2
e7a346
+        for i in {1..500}; do
e7a346
+                unlink $mountpoint/file-$i 2>/dev/null 1>/dev/null
e7a346
+                if [ $? -eq 0 ]; then
e7a346
+                        echo $2 >> $B0/output.txt
e7a346
+                fi
e7a346
+        done
e7a346
+        echo $success
e7a346
+}
e7a346
+
e7a346
+cleanup
e7a346
+
e7a346
+TEST glusterd
e7a346
+TEST pidof glusterd
e7a346
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
e7a346
+TEST $CLI volume set $V0 features.shard on
e7a346
+TEST $CLI volume set $V0 shard-block-size 4MB
e7a346
+TEST $CLI volume start $V0
e7a346
+
e7a346
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
e7a346
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1
e7a346
+
e7a346
+for i in {1..500}; do
e7a346
+        dd if=/dev/urandom of=$M0/file-$i bs=1M count=2
e7a346
+done
e7a346
+
e7a346
+for i in {1..500}; do
e7a346
+        stat $M1/file-$i > /dev/null
e7a346
+done
e7a346
+
e7a346
+delete_files $M0 0 &
e7a346
+delete_files $M1 1 &
e7a346
+wait
e7a346
+
e7a346
+success1=$(grep 0 $B0/output.txt | wc -l);
e7a346
+success2=$(grep 1 $B0/output.txt | wc -l);
e7a346
+
e7a346
+echo "Success1 is $success1";
e7a346
+echo "Success2 is $success2";
e7a346
+
e7a346
+success_total=$((success1 + success2));
e7a346
+
e7a346
+EXPECT 500 echo $success_total
e7a346
+
e7a346
+cleanup
e7a346
diff --git a/tests/bugs/shard/bug-shard-discard.t b/tests/bugs/shard/bug-shard-discard.t
e7a346
index 884d9e7..910ade1 100644
e7a346
--- a/tests/bugs/shard/bug-shard-discard.t
e7a346
+++ b/tests/bugs/shard/bug-shard-discard.t
e7a346
@@ -5,6 +5,12 @@
e7a346
 
e7a346
 cleanup
e7a346
 
e7a346
+FILE_COUNT_TIME=5
e7a346
+
e7a346
+function get_shard_count {
e7a346
+    ls $1/$2.* | wc -l
e7a346
+}
e7a346
+
e7a346
 TEST glusterd
e7a346
 TEST pidof glusterd
e7a346
 TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0..3}
e7a346
@@ -42,14 +48,11 @@ EXPECT_NOT "1" file_all_zeroes `find $B0 -name $gfid_foo.1`
e7a346
 
e7a346
 # Now unlink the file. And ensure that all shards associated with the file are cleaned up
e7a346
 TEST unlink $M0/foo
e7a346
-#TEST ! stat $B0/${V0}0/.shard/$gfid_foo.1
e7a346
-#TEST ! stat $B0/${V0}1/.shard/$gfid_foo.1
e7a346
-#TEST ! stat $B0/${V0}2/.shard/$gfid_foo.1
e7a346
-#TEST ! stat $B0/${V0}3/.shard/$gfid_foo.1
e7a346
-#TEST ! stat $B0/${V0}0/.shard/$gfid_foo.2
e7a346
-#TEST ! stat $B0/${V0}1/.shard/$gfid_foo.2
e7a346
-#TEST ! stat $B0/${V0}2/.shard/$gfid_foo.2
e7a346
-#TEST ! stat $B0/${V0}3/.shard/$gfid_foo.2
e7a346
+
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_shard_count $B0/${V0}0/.shard $gfid_foo
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_shard_count $B0/${V0}1/.shard $gfid_foo
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_shard_count $B0/${V0}2/.shard $gfid_foo
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_shard_count $B0/${V0}3/.shard $gfid_foo
e7a346
 TEST ! stat $M0/foo
e7a346
 
e7a346
 #clean up everything
e7a346
diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t
e7a346
index c92dc07..087c8ba 100644
e7a346
--- a/tests/bugs/shard/shard-inode-refcount-test.t
e7a346
+++ b/tests/bugs/shard/shard-inode-refcount-test.t
e7a346
@@ -5,6 +5,8 @@
e7a346
 
e7a346
 cleanup
e7a346
 
e7a346
+SHARD_COUNT_TIME=5
e7a346
+
e7a346
 TEST glusterd
e7a346
 TEST pidof glusterd
e7a346
 TEST $CLI volume create $V0 $H0:$B0/${V0}0
e7a346
@@ -18,7 +20,8 @@ TEST dd if=/dev/zero conv=fsync of=$M0/one-plus-five-shards bs=1M count=23
e7a346
 
e7a346
 ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0)
e7a346
 TEST rm -f $M0/one-plus-five-shards
e7a346
-#EXPECT `expr $ACTIVE_INODES_BEFORE - 4` get_mount_active_size_value $V0
e7a346
+# Expect 5 fewer inodes, offset by one additional inode because .remove_me would be created.
e7a346
+EXPECT_WITHIN $SHARD_COUNT_TIME `expr $ACTIVE_INODES_BEFORE - 5 + 1` get_mount_active_size_value $V0
e7a346
 
e7a346
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
e7a346
 TEST $CLI volume stop $V0
e7a346
diff --git a/tests/bugs/shard/unlinks-and-renames.t b/tests/bugs/shard/unlinks-and-renames.t
e7a346
index 997c397..6e5164f 100644
e7a346
--- a/tests/bugs/shard/unlinks-and-renames.t
e7a346
+++ b/tests/bugs/shard/unlinks-and-renames.t
e7a346
@@ -9,6 +9,12 @@ cleanup
e7a346
 # and rename fops in sharding and make sure they work fine.
e7a346
 #
e7a346
 
e7a346
+FILE_COUNT_TIME=5
e7a346
+
e7a346
+function get_file_count {
e7a346
+    ls $1* | wc -l
e7a346
+}
e7a346
+
e7a346
 #################################################
e7a346
 ################### UNLINK ######################
e7a346
 #################################################
e7a346
@@ -36,13 +42,8 @@ gfid_foo=$(get_gfid_string $M0/dir/foo)
e7a346
 TEST unlink $M0/dir/foo
e7a346
 TEST stat $B0/${V0}0/.shard/.remove_me
e7a346
 TEST stat $B0/${V0}1/.shard/.remove_me
e7a346
-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
-
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
 
e7a346
 ##################################################
e7a346
 ##### Unlink of a sharded file without holes #####
e7a346
@@ -56,20 +57,14 @@ TEST stat $B0/${V0}1/.shard/$gfid_new.1
e7a346
 TEST stat $B0/${V0}0/.shard/$gfid_new.2
e7a346
 TEST stat $B0/${V0}1/.shard/$gfid_new.2
e7a346
 TEST unlink $M0/dir/new
e7a346
-#TEST ! stat $B0/${V0}0/.shard/$gfid_new.1
e7a346
-#TEST ! stat $B0/${V0}1/.shard/$gfid_new.1
e7a346
-#TEST ! stat $B0/${V0}0/.shard/$gfid_new.2
e7a346
-#TEST ! stat $B0/${V0}1/.shard/$gfid_new.2
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_new
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_new
e7a346
 TEST ! stat $M0/dir/new
e7a346
 TEST ! stat $B0/${V0}0/dir/new
e7a346
 TEST ! stat $B0/${V0}1/dir/new
e7a346
-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_new
e7a346
-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_new
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_new
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_new
e7a346
 
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_new
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_new
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_new
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_new
e7a346
 #######################################
e7a346
 ##### Unlink with /.shard present #####
e7a346
 #######################################
e7a346
@@ -83,13 +78,8 @@ TEST unlink $M0/dir/foo
e7a346
 TEST ! stat $B0/${V0}0/dir/foo
e7a346
 TEST ! stat $B0/${V0}1/dir/foo
e7a346
 TEST ! stat $M0/dir/foo
e7a346
-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
-
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
 
e7a346
 #############################################################
e7a346
 ##### Unlink of a file with only one block (the zeroth) #####
e7a346
@@ -102,13 +92,9 @@ TEST unlink $M0/dir/foo
e7a346
 TEST ! stat $B0/${V0}0/dir/foo
e7a346
 TEST ! stat $B0/${V0}1/dir/foo
e7a346
 TEST ! stat $M0/dir/foo
e7a346
-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
 
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
-EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_foo
e7a346
-EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_foo
e7a346
 ####################################################
e7a346
 ##### Unlink of a sharded file with hard-links #####
e7a346
 ####################################################
e7a346
@@ -137,22 +123,15 @@ TEST stat $B0/${V0}0/link
e7a346
 TEST stat $B0/${V0}1/link
e7a346
 # Now delete the last link.
e7a346
 TEST unlink $M0/link
e7a346
-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_original
e7a346
-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_original
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_original
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_original
e7a346
 # Ensure that the shards are all cleaned up.
e7a346
-#TEST ! stat $B0/${V0}0/.shard/$gfid_original.1
e7a346
-#TEST ! stat $B0/${V0}1/.shard/$gfid_original.1
e7a346
-#TEST ! stat $B0/${V0}0/.shard/$gfid_original.2
e7a346
-#TEST ! stat $B0/${V0}1/.shard/$gfid_original.2
e7a346
-#TEST ! stat $M0/link
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_original
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_original
e7a346
+TEST ! stat $M0/link
e7a346
 TEST ! stat $B0/${V0}0/link
e7a346
 TEST ! stat $B0/${V0}1/link
e7a346
 
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_original
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_original
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_original
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_original
e7a346
-
e7a346
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
e7a346
 TEST $CLI volume stop $V0
e7a346
 TEST $CLI volume delete $V0
e7a346
@@ -190,13 +169,8 @@ TEST ! stat $B0/${V0}0/dir/src
e7a346
 TEST ! stat $B0/${V0}1/dir/src
e7a346
 TEST   stat $B0/${V0}0/dir/dst
e7a346
 TEST   stat $B0/${V0}1/dir/dst
e7a346
-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
-
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
 
e7a346
 ##################################################
e7a346
 ##### Rename to a sharded file without holes #####
e7a346
@@ -212,23 +186,16 @@ TEST stat $B0/${V0}1/.shard/$gfid_dst.1
e7a346
 TEST stat $B0/${V0}0/.shard/$gfid_dst.2
e7a346
 TEST stat $B0/${V0}1/.shard/$gfid_dst.2
e7a346
 TEST mv -f $M0/dir/src $M0/dir/dst
e7a346
-#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.1
e7a346
-#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.1
e7a346
-#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.2
e7a346
-#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.2
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_dst
e7a346
 TEST ! stat $M0/dir/src
e7a346
 TEST   stat $M0/dir/dst
e7a346
 TEST ! stat $B0/${V0}0/dir/src
e7a346
 TEST ! stat $B0/${V0}1/dir/src
e7a346
 TEST   stat $B0/${V0}0/dir/dst
e7a346
 TEST   stat $B0/${V0}1/dir/dst
e7a346
-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
-
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
 
e7a346
 ###################################################
e7a346
 ##### Rename of dst file with /.shard present #####
e7a346
@@ -245,13 +212,8 @@ TEST ! stat $B0/${V0}0/dir/src
e7a346
 TEST ! stat $B0/${V0}1/dir/src
e7a346
 TEST   stat $B0/${V0}0/dir/dst
e7a346
 TEST   stat $B0/${V0}1/dir/dst
e7a346
-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
-
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
 
e7a346
 ###############################################################
e7a346
 ##### Rename of dst file with only one block (the zeroth) #####
e7a346
@@ -268,13 +230,8 @@ TEST ! stat $B0/${V0}0/dir/src
e7a346
 TEST ! stat $B0/${V0}1/dir/src
e7a346
 TEST   stat $B0/${V0}0/dir/dst
e7a346
 TEST   stat $B0/${V0}1/dir/dst
e7a346
-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
-
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
 
e7a346
 ########################################################
e7a346
 ##### Rename to a dst sharded file with hard-links #####
e7a346
@@ -307,20 +264,18 @@ TEST ! stat $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
 TEST touch $M0/dir/src2
e7a346
 TEST mv -f $M0/dir/src2 $M0/link
e7a346
 # Ensure that the shards are all cleaned up.
e7a346
-#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.1
e7a346
-#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.1
e7a346
-#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.2
e7a346
-#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.2
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_dst
e7a346
+TEST ! stat $B0/${V0}0/.shard/$gfid_dst.1
e7a346
+TEST ! stat $B0/${V0}1/.shard/$gfid_dst.1
e7a346
+TEST ! stat $B0/${V0}0/.shard/$gfid_dst.2
e7a346
+TEST ! stat $B0/${V0}1/.shard/$gfid_dst.2
e7a346
 TEST ! stat $M0/dir/src2
e7a346
 TEST ! stat $B0/${V0}0/dir/src2
e7a346
 TEST ! stat $B0/${V0}1/dir/src2
e7a346
-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
 
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst
e7a346
-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst
e7a346
 # Rename with non-existent dst and a sharded src
e7a346
 TEST touch $M0/dir/src
e7a346
 TEST dd if=/dev/zero of=$M0/dir/src bs=1024 count=9216
e7a346
diff --git a/xlators/features/shard/src/shard-messages.h b/xlators/features/shard/src/shard-messages.h
e7a346
index 0267f8a..bc04e5e 100644
e7a346
--- a/xlators/features/shard/src/shard-messages.h
e7a346
+++ b/xlators/features/shard/src/shard-messages.h
e7a346
@@ -40,7 +40,7 @@
e7a346
  */
e7a346
 
e7a346
 #define GLFS_COMP_BASE_SHARD      GLFS_MSGID_COMP_SHARD
e7a346
-#define GLFS_NUM_MESSAGES         20
e7a346
+#define GLFS_NUM_MESSAGES         22
e7a346
 #define GLFS_MSGID_END          (GLFS_COMP_BASE_SHARD + GLFS_NUM_MESSAGES + 1)
e7a346
 
e7a346
 #define glfs_msg_start_x GLFS_COMP_BASE_SHARD, "Invalid: Start of messages"
e7a346
@@ -58,7 +58,7 @@
e7a346
  * @diagnosis
e7a346
  * @recommendedaction
e7a346
  */
e7a346
-#define SHARD_MSG_DICT_SET_FAILED                     (GLFS_COMP_BASE_SHARD + 2)
e7a346
+#define SHARD_MSG_DICT_OP_FAILED                     (GLFS_COMP_BASE_SHARD + 2)
e7a346
 
e7a346
 
e7a346
 /*!
e7a346
@@ -194,5 +194,19 @@
e7a346
 */
e7a346
 #define SHARD_MSG_FOP_FAILED                         (GLFS_COMP_BASE_SHARD + 20)
e7a346
 
e7a346
+/*!
e7a346
+ * @messageid 133021
e7a346
+ * @diagnosis
e7a346
+ * @recommendedaction
e7a346
+*/
e7a346
+#define SHARD_MSG_SHARDS_DELETION_FAILED             (GLFS_COMP_BASE_SHARD + 21)
e7a346
+
e7a346
+/*!
e7a346
+ * @messageid 133022
e7a346
+ * @diagnosis
e7a346
+ * @recommendedaction
e7a346
+*/
e7a346
+#define SHARD_MSG_SHARDS_DELETION_COMPLETED          (GLFS_COMP_BASE_SHARD + 22)
e7a346
+
e7a346
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
e7a346
 #endif /* !_SHARD_MESSAGES_H_ */
e7a346
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
e7a346
index 492341c..2faf711 100644
e7a346
--- a/xlators/features/shard/src/shard.c
e7a346
+++ b/xlators/features/shard/src/shard.c
e7a346
@@ -677,7 +677,8 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
e7a346
                          * keep it alive by holding a ref on it.
e7a346
                          */
e7a346
                         inode_ref (linked_inode);
e7a346
-                        gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
e7a346
+                        if (base_inode)
e7a346
+                                gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
e7a346
                         ctx->block_num = block_num;
e7a346
                         list_add_tail (&ctx->ilist, &priv->ilist_head);
e7a346
                         priv->inode_count++;
e7a346
@@ -738,7 +739,8 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
e7a346
                          * keep it alive by holding a ref on it.
e7a346
                          */
e7a346
                         inode_ref (linked_inode);
e7a346
-                        gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
e7a346
+                        if (base_inode)
e7a346
+                                gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
e7a346
                         ctx->block_num = block_num;
e7a346
                         ctx->base_inode = base_inode;
e7a346
                         list_add_tail (&ctx->ilist, &priv->ilist_head);
e7a346
@@ -977,6 +979,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this,
e7a346
         int                   i              = -1;
e7a346
         uint32_t              shard_idx_iter = 0;
e7a346
         char                  path[PATH_MAX] = {0,};
e7a346
+        uuid_t                gfid           = {0,};
e7a346
         inode_t              *inode          = NULL;
e7a346
         inode_t              *res_inode      = NULL;
e7a346
         inode_t              *fsync_inode    = NULL;
e7a346
@@ -988,6 +991,10 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this,
e7a346
         local->call_count = 0;
e7a346
         shard_idx_iter = local->first_block;
e7a346
         res_inode = local->resolver_base_inode;
e7a346
+        if (res_inode)
e7a346
+                gf_uuid_copy (gfid, res_inode->gfid);
e7a346
+        else
e7a346
+                gf_uuid_copy (gfid, local->base_gfid);
e7a346
 
e7a346
         if ((local->op_ret < 0) || (local->resolve_not))
e7a346
                 goto out;
e7a346
@@ -1000,7 +1007,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this,
e7a346
                         continue;
e7a346
                 }
e7a346
 
e7a346
-                shard_make_block_abspath (shard_idx_iter, res_inode->gfid, path,
e7a346
+                shard_make_block_abspath (shard_idx_iter, gfid, path,
e7a346
                                           sizeof(path));
e7a346
 
e7a346
                 inode = NULL;
e7a346
@@ -1147,7 +1154,7 @@ shard_update_file_size (call_frame_t *frame, xlator_t *this, fd_t *fd,
e7a346
         ret = dict_set_bin (xattr_req, GF_XATTR_SHARD_FILE_SIZE, size_attr,
e7a346
                             8 * 4);
e7a346
         if (ret) {
e7a346
-                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_SET_FAILED,
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
e7a346
                         "Failed to set key %s into dict. gfid=%s",
e7a346
                         GF_XATTR_SHARD_FILE_SIZE, uuid_utoa (inode->gfid));
e7a346
                 GF_FREE (size_attr);
e7a346
@@ -1376,7 +1383,7 @@ shard_lookup_internal_dir (call_frame_t *frame, xlator_t *this,
e7a346
 
e7a346
         ret = dict_set_bin (xattr_req, "gfid-req", *gfid, 16);
e7a346
         if (ret) {
e7a346
-                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_SET_FAILED,
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
e7a346
                         "Failed to set gfid of %s into dict",
e7a346
                         shard_internal_dir_string (type));
e7a346
                 local->op_ret = -1;
e7a346
@@ -1431,10 +1438,49 @@ shard_inode_ctx_update (inode_t *inode, xlator_t *this, dict_t *xdata,
e7a346
 }
e7a346
 
e7a346
 int
e7a346
+shard_delete_shards (void *opaque);
e7a346
+
e7a346
+int
e7a346
+shard_delete_shards_cbk (int ret, call_frame_t *frame, void *data);
e7a346
+
e7a346
+int
e7a346
+shard_start_background_deletion (xlator_t *this)
e7a346
+{
e7a346
+        int              ret           = 0;
e7a346
+        call_frame_t    *cleanup_frame = NULL;
e7a346
+
e7a346
+        cleanup_frame = create_frame (this, this->ctx->pool);
e7a346
+        if (!cleanup_frame) {
e7a346
+                gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
e7a346
+                        SHARD_MSG_MEMALLOC_FAILED, "Failed to create "
e7a346
+                        "new frame to delete shards");
e7a346
+                return -ENOMEM;
e7a346
+        }
e7a346
+
e7a346
+        ret = synctask_new (this->ctx->env, shard_delete_shards,
e7a346
+                            shard_delete_shards_cbk, cleanup_frame,
e7a346
+                            cleanup_frame);
e7a346
+        if (ret < 0) {
e7a346
+                gf_msg (this->name, GF_LOG_WARNING, errno,
e7a346
+                        SHARD_MSG_SHARDS_DELETION_FAILED,
e7a346
+                        "failed to create task to do background "
e7a346
+                        "cleanup of shards");
e7a346
+                STACK_DESTROY (cleanup_frame->root);
e7a346
+        }
e7a346
+        return ret;
e7a346
+}
e7a346
+
e7a346
+int
e7a346
 shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                   int32_t op_ret, int32_t op_errno, inode_t *inode,
e7a346
                   struct iatt *buf, dict_t *xdata, struct iatt *postparent)
e7a346
 {
e7a346
+        int             ret             = 0;
e7a346
+        shard_priv_t   *priv            = NULL;
e7a346
+        gf_boolean_t    i_start_cleanup = _gf_false;
e7a346
+
e7a346
+        priv = this->private;
e7a346
+
e7a346
         if (op_ret < 0)
e7a346
                 goto unwind;
e7a346
 
e7a346
@@ -1460,6 +1506,25 @@ shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
 
e7a346
         (void) shard_inode_ctx_update (inode, this, xdata, buf);
e7a346
 
e7a346
+        LOCK (&priv->lock);
e7a346
+        {
e7a346
+                if (priv->first_lookup == SHARD_FIRST_LOOKUP_PENDING) {
e7a346
+                        priv->first_lookup = SHARD_FIRST_LOOKUP_IN_PROGRESS;
e7a346
+                        i_start_cleanup = _gf_true;
e7a346
+                }
e7a346
+        }
e7a346
+        UNLOCK (&priv->lock);
e7a346
+
e7a346
+        if (i_start_cleanup) {
e7a346
+                ret = shard_start_background_deletion (this);
e7a346
+                if (ret) {
e7a346
+                        LOCK (&priv->lock);
e7a346
+                        {
e7a346
+                                priv->first_lookup = SHARD_FIRST_LOOKUP_PENDING;
e7a346
+                        }
e7a346
+                        UNLOCK (&priv->lock);
e7a346
+                }
e7a346
+        }
e7a346
 unwind:
e7a346
         SHARD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
e7a346
                             xdata, postparent);
e7a346
@@ -1475,6 +1540,7 @@ shard_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
e7a346
         uint64_t        block_size = 0;
e7a346
         shard_local_t  *local      = NULL;
e7a346
 
e7a346
+        this->itable = loc->inode->table;
e7a346
         if (frame->root->pid != GF_CLIENT_PID_GSYNCD) {
e7a346
                 SHARD_ENTRY_FOP_CHECK (loc, op_errno, err);
e7a346
         }
e7a346
@@ -1496,7 +1562,7 @@ shard_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
e7a346
                                        GF_XATTR_SHARD_BLOCK_SIZE, 0);
e7a346
                 if (ret) {
e7a346
                         gf_msg (this->name, GF_LOG_WARNING, 0,
e7a346
-                                SHARD_MSG_DICT_SET_FAILED, "Failed to set dict"
e7a346
+                                SHARD_MSG_DICT_OP_FAILED, "Failed to set dict"
e7a346
                                 " value: key:%s for path %s",
e7a346
                                 GF_XATTR_SHARD_BLOCK_SIZE, loc->path);
e7a346
                         goto err;
e7a346
@@ -1508,7 +1574,7 @@ shard_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
e7a346
                                        GF_XATTR_SHARD_FILE_SIZE, 8 * 4);
e7a346
                 if (ret) {
e7a346
                         gf_msg (this->name, GF_LOG_WARNING, 0,
e7a346
-                                SHARD_MSG_DICT_SET_FAILED,
e7a346
+                                SHARD_MSG_DICT_OP_FAILED,
e7a346
                                 "Failed to set dict value: key:%s for path %s.",
e7a346
                                 GF_XATTR_SHARD_FILE_SIZE, loc->path);
e7a346
                         goto err;
e7a346
@@ -1901,12 +1967,6 @@ shard_truncate_last_shard (call_frame_t *frame, xlator_t *this, inode_t *inode)
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
-int
e7a346
-shard_unlink_shards_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
-                            int32_t op_ret, int32_t op_errno,
e7a346
-                            struct iatt *preparent, struct iatt *postparent,
e7a346
-                            dict_t *xdata);
e7a346
-
e7a346
 void
e7a346
 shard_unlink_block_inode (shard_local_t *local, int shard_block_num);
e7a346
 
e7a346
@@ -1941,17 +2001,17 @@ done:
e7a346
 int
e7a346
 shard_truncate_htol (call_frame_t *frame, xlator_t *this, inode_t *inode)
e7a346
 {
e7a346
-        int i = 1;
e7a346
-        int ret = -1;
e7a346
-        int call_count = 0;
e7a346
-        uint32_t cur_block = 0;
e7a346
-        uint32_t last_block = 0;
e7a346
-        char path[PATH_MAX] = {0,};
e7a346
-        char *bname = NULL;
e7a346
-        loc_t loc = {0,};
e7a346
-        gf_boolean_t wind_failed = _gf_false;
e7a346
-        shard_local_t *local = NULL;
e7a346
-        shard_priv_t *priv = NULL;
e7a346
+        int             i              = 1;
e7a346
+        int             ret            = -1;
e7a346
+        int             call_count     = 0;
e7a346
+        uint32_t        cur_block      = 0;
e7a346
+        uint32_t        last_block     = 0;
e7a346
+        char            path[PATH_MAX] = {0,};
e7a346
+        char           *bname          = NULL;
e7a346
+        loc_t           loc            = {0,};
e7a346
+        gf_boolean_t    wind_failed    = _gf_false;
e7a346
+        shard_local_t  *local          = NULL;
e7a346
+        shard_priv_t   *priv           = NULL;
e7a346
 
e7a346
         local = frame->local;
e7a346
         priv = this->private;
e7a346
@@ -2086,6 +2146,7 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode,
e7a346
 {
e7a346
         int             list_index       = 0;
e7a346
         char            block_bname[256] = {0,};
e7a346
+        uuid_t          gfid             = {0,};
e7a346
         inode_t        *linked_inode     = NULL;
e7a346
         xlator_t       *this             = NULL;
e7a346
         inode_t        *fsync_inode      = NULL;
e7a346
@@ -2093,9 +2154,12 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode,
e7a346
 
e7a346
         this = THIS;
e7a346
         priv = this->private;
e7a346
+        if (local->loc.inode)
e7a346
+                gf_uuid_copy (gfid, local->loc.inode->gfid);
e7a346
+        else
e7a346
+                gf_uuid_copy (gfid, local->base_gfid);
e7a346
 
e7a346
-        shard_make_block_bname (block_num, (local->loc.inode)->gfid,
e7a346
-                                block_bname, sizeof (block_bname));
e7a346
+        shard_make_block_bname (block_num, gfid, block_bname, sizeof (block_bname));
e7a346
 
e7a346
         shard_inode_ctx_set (inode, this, buf, 0, SHARD_LOOKUP_MASK);
e7a346
         linked_inode = inode_link (inode, priv->dot_shard_inode, block_bname,
e7a346
@@ -2125,9 +2189,14 @@ shard_common_lookup_shards_cbk (call_frame_t *frame, void *cookie,
e7a346
 {
e7a346
         int             call_count      = 0;
e7a346
         int             shard_block_num = (long) cookie;
e7a346
+        uuid_t          gfid            = {0,};
e7a346
         shard_local_t  *local           = NULL;
e7a346
 
e7a346
         local = frame->local;
e7a346
+        if (local->resolver_base_inode)
e7a346
+                gf_uuid_copy (gfid, local->resolver_base_inode->gfid);
e7a346
+        else
e7a346
+                gf_uuid_copy (gfid, local->base_gfid);
e7a346
 
e7a346
         if (op_ret < 0) {
e7a346
                 /* Ignore absence of shards in the backend in truncate fop. */
e7a346
@@ -2162,9 +2231,7 @@ shard_common_lookup_shards_cbk (call_frame_t *frame, void *cookie,
e7a346
                 gf_msg (this->name, GF_LOG_ERROR, op_errno,
e7a346
                         SHARD_MSG_LOOKUP_SHARD_FAILED, "Lookup on shard %d "
e7a346
                         "failed. Base file gfid = %s", shard_block_num,
e7a346
-                        (local->fop == GF_FOP_RENAME) ?
e7a346
-                        uuid_utoa (local->loc2.inode->gfid)
e7a346
-                        : uuid_utoa (local->loc.inode->gfid));
e7a346
+                        uuid_utoa (gfid));
e7a346
                 local->op_ret = op_ret;
e7a346
                 local->op_errno = op_errno;
e7a346
                 goto done;
e7a346
@@ -2173,25 +2240,18 @@ shard_common_lookup_shards_cbk (call_frame_t *frame, void *cookie,
e7a346
         shard_link_block_inode (local, shard_block_num, inode, buf);
e7a346
 
e7a346
 done:
e7a346
-        call_count = shard_call_count_return (frame);
e7a346
         if (local->lookup_shards_barriered) {
e7a346
                 syncbarrier_wake (&local->barrier);
e7a346
                 return 0;
e7a346
         } else {
e7a346
+                call_count = shard_call_count_return (frame);
e7a346
                 if (call_count == 0) {
e7a346
                         if (!local->first_lookup_done)
e7a346
                                 local->first_lookup_done = _gf_true;
e7a346
-                        if (local->op_ret < 0)
e7a346
-                                goto unwind;
e7a346
-                        else
e7a346
-                                local->pls_fop_handler (frame, this);
e7a346
+                        local->pls_fop_handler (frame, this);
e7a346
                 }
e7a346
         }
e7a346
         return 0;
e7a346
-
e7a346
-unwind:
e7a346
-        local->pls_fop_handler (frame, this);
e7a346
-        return 0;
e7a346
 }
e7a346
 
e7a346
 dict_t*
e7a346
@@ -2237,6 +2297,7 @@ shard_common_lookup_shards (call_frame_t *frame, xlator_t *this, inode_t *inode,
e7a346
         int            last_block     = 0;
e7a346
         char           path[PATH_MAX] = {0,};
e7a346
         char          *bname          = NULL;
e7a346
+        uuid_t         gfid           = {0,};
e7a346
         loc_t          loc            = {0,};
e7a346
         shard_local_t *local          = NULL;
e7a346
         shard_priv_t  *priv           = NULL;
e7a346
@@ -2252,6 +2313,11 @@ shard_common_lookup_shards (call_frame_t *frame, xlator_t *this, inode_t *inode,
e7a346
         if (local->lookup_shards_barriered)
e7a346
                 local->barrier.waitfor = local->call_count;
e7a346
 
e7a346
+        if (inode)
e7a346
+                gf_uuid_copy (gfid, inode->gfid);
e7a346
+        else
e7a346
+                gf_uuid_copy (gfid, local->base_gfid);
e7a346
+
e7a346
         while (shard_idx_iter <= last_block) {
e7a346
                 if (local->inode_list[i]) {
e7a346
                         i++;
e7a346
@@ -2267,7 +2333,7 @@ shard_common_lookup_shards (call_frame_t *frame, xlator_t *this, inode_t *inode,
e7a346
                         goto next;
e7a346
                 }
e7a346
 
e7a346
-                shard_make_block_abspath (shard_idx_iter, inode->gfid, path,
e7a346
+                shard_make_block_abspath (shard_idx_iter, gfid, path,
e7a346
                                           sizeof(path));
e7a346
 
e7a346
                 bname = strrchr (path, '/') + 1;
e7a346
@@ -2279,7 +2345,7 @@ shard_common_lookup_shards (call_frame_t *frame, xlator_t *this, inode_t *inode,
e7a346
                         gf_msg (this->name, GF_LOG_ERROR, 0,
e7a346
                                 SHARD_MSG_INODE_PATH_FAILED, "Inode path failed"
e7a346
                                 " on %s, base file gfid = %s", bname,
e7a346
-                                uuid_utoa (inode->gfid));
e7a346
+                                uuid_utoa (gfid));
e7a346
                         local->op_ret = -1;
e7a346
                         local->op_errno = ENOMEM;
e7a346
                         loc_wipe (&loc);
e7a346
@@ -2322,8 +2388,10 @@ next:
e7a346
                 if (!--call_count)
e7a346
                         break;
e7a346
         }
e7a346
-        if (local->lookup_shards_barriered)
e7a346
+        if (local->lookup_shards_barriered) {
e7a346
                 syncbarrier_wait (&local->barrier, count);
e7a346
+                local->pls_fop_handler (frame, this);
e7a346
+        }
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
@@ -2779,8 +2847,9 @@ shard_post_lookup_shards_unlink_handler (call_frame_t *frame, xlator_t *this)
e7a346
         local = frame->local;
e7a346
 
e7a346
         if ((local->op_ret < 0) && (local->op_errno != ENOENT)) {
e7a346
-                shard_common_failure_unwind (local->fop, frame, local->op_ret,
e7a346
-                                             local->op_errno);
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, local->op_errno,
e7a346
+                        SHARD_MSG_FOP_FAILED, "failed to delete shards of %s",
e7a346
+                        uuid_utoa (local->resolver_base_inode->gfid));
e7a346
                 return 0;
e7a346
         }
e7a346
         local->op_ret = 0;
e7a346
@@ -2791,41 +2860,12 @@ shard_post_lookup_shards_unlink_handler (call_frame_t *frame, xlator_t *this)
e7a346
 }
e7a346
 
e7a346
 int
e7a346
-shard_rename_cbk (call_frame_t *frame, xlator_t *this);
e7a346
-
e7a346
-int32_t
e7a346
-shard_unlink_cbk (call_frame_t *frame, xlator_t *this);
e7a346
-
e7a346
-int
e7a346
 shard_post_resolve_unlink_handler (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         shard_local_t *local = NULL;
e7a346
 
e7a346
         local = frame->local;
e7a346
-
e7a346
-        if (local->op_ret < 0) {
e7a346
-                if (local->op_errno == ENOENT) {
e7a346
-                        /* If lookup on /.shard fails with ENOENT, it probably
e7a346
-                         * means that the file is being unlinked before it
e7a346
-                         * could grow beyond its first block. In this case,
e7a346
-                         * unlink boils down to unlinking the base file and
e7a346
-                         * unwinding the call.
e7a346
-                         */
e7a346
-                        local->op_ret = 0;
e7a346
-                        local->first_block = local->last_block = 0;
e7a346
-                        local->num_blocks = 1;
e7a346
-                        if (local->fop == GF_FOP_UNLINK)
e7a346
-                                shard_unlink_cbk (frame, this);
e7a346
-                        else
e7a346
-                                shard_rename_cbk (frame, this);
e7a346
-                        return 0;
e7a346
-                } else {
e7a346
-                        shard_common_failure_unwind (local->fop, frame,
e7a346
-                                                     local->op_ret,
e7a346
-                                                     local->op_errno);
e7a346
-                        return 0;
e7a346
-                }
e7a346
-        }
e7a346
+        local->lookup_shards_barriered = _gf_true;
e7a346
 
e7a346
         if (!local->call_count)
e7a346
                 shard_unlink_shards_do (frame, this,
e7a346
@@ -2841,6 +2881,7 @@ void
e7a346
 shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
e7a346
 {
e7a346
         char                  block_bname[256]  = {0,};
e7a346
+        uuid_t                gfid              = {0,};
e7a346
         inode_t              *inode             = NULL;
e7a346
         inode_t              *base_inode        = NULL;
e7a346
         xlator_t             *this              = NULL;
e7a346
@@ -2854,12 +2895,17 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
e7a346
 
e7a346
         inode = local->inode_list[shard_block_num - local->first_block];
e7a346
         base_inode = local->resolver_base_inode;
e7a346
+        if (base_inode)
e7a346
+                gf_uuid_copy (gfid, base_inode->gfid);
e7a346
+        else
e7a346
+                gf_uuid_copy (gfid, local->base_gfid);
e7a346
 
e7a346
-        shard_make_block_bname (shard_block_num, (local->loc.inode)->gfid,
e7a346
+        shard_make_block_bname (shard_block_num, gfid,
e7a346
                                 block_bname, sizeof (block_bname));
e7a346
 
e7a346
         LOCK(&priv->lock);
e7a346
-        LOCK(&base_inode->lock);
e7a346
+        if (base_inode)
e7a346
+                LOCK(&base_inode->lock);
e7a346
         LOCK(&inode->lock);
e7a346
         {
e7a346
                 __shard_inode_ctx_get (inode, this, &ctx);
e7a346
@@ -2870,14 +2916,18 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
e7a346
                         unlink_unref_forget = _gf_true;
e7a346
                 }
e7a346
                 if (ctx->fsync_needed) {
e7a346
-                        inode_unref (base_inode);
e7a346
+                        if (base_inode)
e7a346
+                                inode_unref (base_inode);
e7a346
                         list_del_init (&ctx->to_fsync_list);
e7a346
-                        __shard_inode_ctx_get (base_inode, this, &base_ictx);
e7a346
-                        base_ictx->fsync_count--;
e7a346
+                        if (base_inode) {
e7a346
+                                __shard_inode_ctx_get (base_inode, this, &base_ictx);
e7a346
+                                base_ictx->fsync_count--;
e7a346
+                        }
e7a346
                 }
e7a346
         }
e7a346
         UNLOCK(&inode->lock);
e7a346
-        UNLOCK(&base_inode->lock);
e7a346
+        if (base_inode)
e7a346
+                UNLOCK(&base_inode->lock);
e7a346
         if (unlink_unref_forget) {
e7a346
                 inode_unlink (inode, priv->dot_shard_inode, block_bname);
e7a346
                 inode_unref (inode);
e7a346
@@ -2887,7 +2937,18 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
e7a346
 }
e7a346
 
e7a346
 int
e7a346
-shard_rename_cbk (call_frame_t *frame, xlator_t *this);
e7a346
+shard_rename_cbk (call_frame_t *frame, xlator_t *this)
e7a346
+{
e7a346
+        shard_local_t *local = NULL;
e7a346
+
e7a346
+        local = frame->local;
e7a346
+
e7a346
+        SHARD_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
e7a346
+                            &local->prebuf, &local->preoldparent,
e7a346
+                            &local->postoldparent, &local->prenewparent,
e7a346
+                            &local->postnewparent, local->xattr_rsp);
e7a346
+        return 0;
e7a346
+}
e7a346
 
e7a346
 int32_t
e7a346
 shard_unlink_cbk (call_frame_t *frame, xlator_t *this)
e7a346
@@ -2906,7 +2967,6 @@ shard_unlink_shards_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                             struct iatt *preparent, struct iatt *postparent,
e7a346
                             dict_t *xdata)
e7a346
 {
e7a346
-        int            call_count      = 0;
e7a346
         int            shard_block_num = (long) cookie;
e7a346
         shard_local_t *local           = NULL;
e7a346
 
e7a346
@@ -2919,22 +2979,8 @@ shard_unlink_shards_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
         }
e7a346
 
e7a346
         shard_unlink_block_inode (local, shard_block_num);
e7a346
-
e7a346
 done:
e7a346
-        call_count = shard_call_count_return (frame);
e7a346
-        if (local->unlink_shards_barriered) {
e7a346
-                syncbarrier_wake (&local->barrier);
e7a346
-        } else {
e7a346
-
e7a346
-                if (call_count == 0) {
e7a346
-                        SHARD_UNSET_ROOT_FS_ID (frame, local);
e7a346
-
e7a346
-                        if (local->fop == GF_FOP_UNLINK)
e7a346
-                                shard_unlink_cbk (frame, this);
e7a346
-                        else if (local->fop == GF_FOP_RENAME)
e7a346
-                                shard_rename_cbk (frame, this);
e7a346
-                }
e7a346
-        }
e7a346
+        syncbarrier_wake (&local->barrier);
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
@@ -2944,11 +2990,11 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode)
e7a346
         int               i              = 0;
e7a346
         int               ret            = -1;
e7a346
         int               count          = 0;
e7a346
-        int               call_count     = 0;
e7a346
-        uint32_t          last_block     = 0;
e7a346
         uint32_t          cur_block      = 0;
e7a346
+        uint32_t          cur_block_idx  = 0;/*this is idx into inode_list[] array */
e7a346
         char             *bname          = NULL;
e7a346
         char              path[PATH_MAX] = {0,};
e7a346
+        uuid_t           gfid            = {0,};
e7a346
         loc_t             loc            = {0,};
e7a346
         gf_boolean_t      wind_failed    = _gf_false;
e7a346
         shard_local_t    *local          = NULL;
e7a346
@@ -2957,16 +3003,12 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode)
e7a346
         priv = this->private;
e7a346
         local = frame->local;
e7a346
 
e7a346
-        /* local->num_blocks includes the base file block. This function only
e7a346
-         * deletes the shards under /.shard. So subtract num_blocks by 1.
e7a346
-         */
e7a346
-        local->call_count = call_count = local->num_blocks - 1;
e7a346
-        last_block = local->last_block;
e7a346
+        if (inode)
e7a346
+                gf_uuid_copy (gfid, inode->gfid);
e7a346
+        else
e7a346
+                gf_uuid_copy (gfid, local->base_gfid);
e7a346
 
e7a346
-        /* Ignore the inode associated with the base file and start counting
e7a346
-         * from 1.
e7a346
-         */
e7a346
-        for (i = 1; i < local->num_blocks; i++) {
e7a346
+        for (i = 0; i < local->num_blocks; i++) {
e7a346
                 if (!local->inode_list[i])
e7a346
                         continue;
e7a346
                 count++;
e7a346
@@ -2975,35 +3017,21 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode)
e7a346
         if (!count) {
e7a346
                 /* callcount = 0 implies that all of the shards that need to be
e7a346
                  * unlinked are non-existent (in other words the file is full of
e7a346
-                 * holes). So shard xlator can simply return the fop to its
e7a346
-                 * parent now.
e7a346
+                 * holes).
e7a346
                  */
e7a346
                 gf_msg_debug (this->name, 0, "All shards that need to be "
e7a346
                               "unlinked are non-existent: %s",
e7a346
-                              uuid_utoa (inode->gfid));
e7a346
-                local->num_blocks = 1;
e7a346
-                if (local->fop == GF_FOP_UNLINK) {
e7a346
-                        shard_unlink_cbk (frame, this);
e7a346
-                } else if (local->fop == GF_FOP_RENAME) {
e7a346
-                        gf_msg_debug (this->name, 0, "Resuming rename()");
e7a346
-                        shard_rename_cbk (frame, this);
e7a346
-                }
e7a346
+                              uuid_utoa (gfid));
e7a346
                 return 0;
e7a346
         }
e7a346
 
e7a346
-        local->call_count = call_count = count;
e7a346
-        cur_block = 1;
e7a346
         SHARD_SET_ROOT_FS_ID (frame, local);
e7a346
-        if (local->unlink_shards_barriered)
e7a346
-                local->barrier.waitfor = count;
e7a346
+        local->barrier.waitfor = count;
e7a346
+        cur_block = cur_block_idx + local->first_block;
e7a346
 
e7a346
-        /* Ignore the base file and start iterating from the first block shard.
e7a346
-         */
e7a346
-        while (cur_block <= last_block) {
e7a346
-                if (!local->inode_list[cur_block]) {
e7a346
-                        cur_block++;
e7a346
-                        continue;
e7a346
-                }
e7a346
+        while (cur_block_idx < local->num_blocks) {
e7a346
+                if (!local->inode_list[cur_block_idx])
e7a346
+                        goto next;
e7a346
 
e7a346
                 if (wind_failed) {
e7a346
                         shard_unlink_shards_do_cbk (frame,
e7a346
@@ -3013,8 +3041,7 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode)
e7a346
                         goto next;
e7a346
                 }
e7a346
 
e7a346
-                shard_make_block_abspath (cur_block, inode->gfid, path,
e7a346
-                                          sizeof (path));
e7a346
+                shard_make_block_abspath (cur_block, gfid, path, sizeof (path));
e7a346
                 bname = strrchr (path, '/') + 1;
e7a346
                 loc.parent = inode_ref (priv->dot_shard_inode);
e7a346
                 ret = inode_path (loc.parent, bname, (char **) &(loc.path));
e7a346
@@ -3022,7 +3049,7 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode)
e7a346
                         gf_msg (this->name, GF_LOG_ERROR, 0,
e7a346
                                 SHARD_MSG_INODE_PATH_FAILED, "Inode path failed"
e7a346
                                 " on %s, base file gfid = %s", bname,
e7a346
-                                uuid_utoa (inode->gfid));
e7a346
+                                uuid_utoa (gfid));
e7a346
                         local->op_ret = -1;
e7a346
                         local->op_errno = ENOMEM;
e7a346
                         loc_wipe (&loc);
e7a346
@@ -3037,26 +3064,505 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode)
e7a346
                 loc.name = strrchr (loc.path, '/');
e7a346
                 if (loc.name)
e7a346
                         loc.name++;
e7a346
-                loc.inode = inode_ref (local->inode_list[cur_block]);
e7a346
+                loc.inode = inode_ref (local->inode_list[cur_block_idx]);
e7a346
 
e7a346
                 STACK_WIND_COOKIE (frame, shard_unlink_shards_do_cbk,
e7a346
                                    (void *) (long) cur_block, FIRST_CHILD(this),
e7a346
                                    FIRST_CHILD (this)->fops->unlink, &loc,
e7a346
                                    local->xflag, local->xattr_req);
e7a346
                 loc_wipe (&loc);
e7a346
-
e7a346
 next:
e7a346
                 cur_block++;
e7a346
-                if (!--call_count)
e7a346
-                        break;
e7a346
+                cur_block_idx++;
e7a346
         }
e7a346
-        if (local->unlink_shards_barriered)
e7a346
-                syncbarrier_wait (&local->barrier, count);
e7a346
+        syncbarrier_wait (&local->barrier, count);
e7a346
+        SHARD_UNSET_ROOT_FS_ID (frame, local);
e7a346
+        return 0;
e7a346
+}
e7a346
+
e7a346
+int
e7a346
+shard_regulated_shards_deletion (call_frame_t *cleanup_frame, xlator_t *this,
e7a346
+                                 int now, int first_block, gf_dirent_t *entry)
e7a346
+{
e7a346
+        int            i     = 0;
e7a346
+        int            ret   = 0;
e7a346
+        shard_local_t *local = NULL;
e7a346
+        uuid_t         gfid  = {0,};
e7a346
+
e7a346
+        local = cleanup_frame->local;
e7a346
+
e7a346
+        local->inode_list = GF_CALLOC (now, sizeof (inode_t *),
e7a346
+                                       gf_shard_mt_inode_list);
e7a346
+        if (!local->inode_list)
e7a346
+                return -ENOMEM;
e7a346
+
e7a346
+        local->first_block = first_block;
e7a346
+        local->last_block = first_block + now - 1;
e7a346
+        local->num_blocks = now;
e7a346
+        gf_uuid_parse (entry->d_name, gfid);
e7a346
+        gf_uuid_copy (local->base_gfid, gfid);
e7a346
+        local->resolver_base_inode = inode_find (this->itable, gfid);
e7a346
+        local->call_count = 0;
e7a346
+        syncbarrier_init (&local->barrier);
e7a346
+
e7a346
+        shard_common_resolve_shards (cleanup_frame, this,
e7a346
+                                     shard_post_resolve_unlink_handler);
e7a346
+
e7a346
+        for (i = 0; i < local->num_blocks; i++) {
e7a346
+                if (local->inode_list[i])
e7a346
+                        inode_unref (local->inode_list[i]);
e7a346
+        }
e7a346
+        GF_FREE (local->inode_list);
e7a346
+        local->inode_list = NULL;
e7a346
+        if (local->op_ret)
e7a346
+                ret = -local->op_errno;
e7a346
+        syncbarrier_destroy (&local->barrier);
e7a346
+        inode_unref (local->resolver_base_inode);
e7a346
+        local->resolver_base_inode = NULL;
e7a346
+        STACK_RESET (cleanup_frame->root);
e7a346
+        return ret;
e7a346
+}
e7a346
+
e7a346
+
e7a346
+int
e7a346
+__shard_delete_shards_of_entry (call_frame_t *cleanup_frame, xlator_t *this,
e7a346
+                                gf_dirent_t *entry, inode_t *inode)
e7a346
+{
e7a346
+        int              ret           = 0;
e7a346
+        int              shard_count   = 0;
e7a346
+        int              first_block   = 0;
e7a346
+        int              now           = 0;
e7a346
+        uint64_t         size          = 0;
e7a346
+        uint64_t         block_size    = 0;
e7a346
+        uint64_t         size_array[4] = {0,};
e7a346
+        void            *bsize         = NULL;
e7a346
+        void            *size_attr     = NULL;
e7a346
+        dict_t          *xattr_rsp     = NULL;
e7a346
+        loc_t            loc           = {0,};
e7a346
+        shard_local_t   *local         = NULL;
e7a346
+        shard_priv_t    *priv          = NULL;
e7a346
 
e7a346
+        priv = this->private;
e7a346
+        local = cleanup_frame->local;
e7a346
+        ret = dict_reset (local->xattr_req);
e7a346
+        if (ret) {
e7a346
+                gf_msg (this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
e7a346
+                        "Failed to reset dict");
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+
e7a346
+        ret = dict_set_uint64 (local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0);
e7a346
+        if (ret) {
e7a346
+                gf_msg (this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
e7a346
+                        "Failed to set dict value: key:%s",
e7a346
+                        GF_XATTR_SHARD_BLOCK_SIZE);
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+
e7a346
+        ret = dict_set_uint64 (local->xattr_req, GF_XATTR_SHARD_FILE_SIZE,
e7a346
+                               8 * 4);
e7a346
+        if (ret) {
e7a346
+                gf_msg (this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
e7a346
+                        "Failed to set dict value: key:%s",
e7a346
+                        GF_XATTR_SHARD_FILE_SIZE);
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+
e7a346
+        loc.inode = inode_ref (inode);
e7a346
+        loc.parent = inode_ref (priv->dot_shard_rm_inode);
e7a346
+        ret = inode_path (loc.parent, entry->d_name, (char **)&(loc.path));
e7a346
+        if (ret < 0) {
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
e7a346
+                        "Inode path  failed on %s", entry->d_name);
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+
e7a346
+        loc.name = strrchr (loc.path, '/');
e7a346
+        if (loc.name)
e7a346
+                loc.name++;
e7a346
+        ret = syncop_lookup (FIRST_CHILD(this), &loc, NULL, NULL,
e7a346
+                             local->xattr_req, &xattr_rsp);
e7a346
+        if (ret)
e7a346
+                goto err;
e7a346
+
e7a346
+        ret = dict_get_ptr (xattr_rsp, GF_XATTR_SHARD_BLOCK_SIZE, &bsize);
e7a346
+        if (ret) {
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
e7a346
+                        "Failed to get dict value: key:%s",
e7a346
+                        GF_XATTR_SHARD_BLOCK_SIZE);
e7a346
+                goto err;
e7a346
+        }
e7a346
+        block_size = ntoh64 (*((uint64_t *)bsize));
e7a346
+
e7a346
+        ret = dict_get_ptr (xattr_rsp, GF_XATTR_SHARD_FILE_SIZE, &size_attr);
e7a346
+        if (ret) {
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
e7a346
+                        "Failed to get dict value: key:%s",
e7a346
+                        GF_XATTR_SHARD_FILE_SIZE);
e7a346
+                goto err;
e7a346
+        }
e7a346
+
e7a346
+        memcpy (size_array, size_attr, sizeof (size_array));
e7a346
+        size = ntoh64 (size_array[0]);
e7a346
+
e7a346
+        shard_count = (size / block_size) - 1;
e7a346
+        if (shard_count < 0) {
e7a346
+                gf_msg_debug (this->name, 0, "Size of %s hasn't grown beyond "
e7a346
+                              "its shard-block-size. Nothing to delete. "
e7a346
+                              "Returning", entry->d_name);
e7a346
+                /* File size < shard-block-size, so nothing to delete */
e7a346
+                ret = 0;
e7a346
+                goto delete_marker;
e7a346
+        }
e7a346
+        if ((size % block_size) > 0)
e7a346
+                shard_count++;
e7a346
+
e7a346
+        if (shard_count == 0) {
e7a346
+                gf_msg_debug (this->name, 0, "Size of %s is exactly equal to "
e7a346
+                              "its shard-block-size. Nothing to delete. "
e7a346
+                              "Returning", entry->d_name);
e7a346
+                ret = 0;
e7a346
+                goto delete_marker;
e7a346
+        }
e7a346
+        gf_msg_debug (this->name, 0, "base file = %s, "
e7a346
+                      "shard-block-size=%"PRIu64", file-size=%"PRIu64", "
e7a346
+                      "shard_count=%d", entry->d_name, block_size, size,
e7a346
+                      shard_count);
e7a346
+
e7a346
+        /* Perform a gfid-based lookup to see if gfid corresponding to marker
e7a346
+         * file's base name exists.
e7a346
+         */
e7a346
+        loc_wipe (&loc);
e7a346
+        loc.inode = inode_new (this->itable);
e7a346
+        if (!loc.inode) {
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+        gf_uuid_parse (entry->d_name, loc.gfid);
e7a346
+        ret = syncop_lookup (FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL);
e7a346
+        if (!ret) {
e7a346
+                gf_msg_debug (this->name, 0, "Base shard corresponding to gfid "
e7a346
+                              "%s is present. Skipping shard deletion. "
e7a346
+                              "Returning", entry->d_name);
e7a346
+                ret = 0;
e7a346
+                goto delete_marker;
e7a346
+        }
e7a346
+
e7a346
+        first_block = 1;
e7a346
+
e7a346
+        while (shard_count) {
e7a346
+                if (shard_count < local->deletion_rate) {
e7a346
+                        now = shard_count;
e7a346
+                        shard_count = 0;
e7a346
+                } else {
e7a346
+                        now = local->deletion_rate;
e7a346
+                        shard_count -= local->deletion_rate;
e7a346
+                }
e7a346
+
e7a346
+                gf_msg_debug (this->name, 0, "deleting %d shards starting from "
e7a346
+                              "block %d of gfid %s", now, first_block,
e7a346
+                              entry->d_name);
e7a346
+                ret = shard_regulated_shards_deletion (cleanup_frame, this,
e7a346
+                                                       now, first_block,
e7a346
+                                                       entry);
e7a346
+                if (ret)
e7a346
+                        goto err;
e7a346
+                first_block += now;
e7a346
+        }
e7a346
+
e7a346
+delete_marker:
e7a346
+        loc_wipe (&loc);
e7a346
+        loc.inode = inode_ref (inode);
e7a346
+        loc.parent = inode_ref (priv->dot_shard_rm_inode);
e7a346
+        ret = inode_path (loc.parent, entry->d_name, (char **)&(loc.path));
e7a346
+        if (ret < 0) {
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
e7a346
+                        "Inode path  failed on %s", entry->d_name);
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+        loc.name = strrchr (loc.path, '/');
e7a346
+        if (loc.name)
e7a346
+                loc.name++;
e7a346
+        ret = syncop_unlink (FIRST_CHILD(this), &loc, NULL, NULL);
e7a346
+        if (ret)
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, 0,
e7a346
+                        SHARD_MSG_SHARDS_DELETION_FAILED, "Failed to delete %s "
e7a346
+                        "from /%s", entry->d_name, GF_SHARD_REMOVE_ME_DIR);
e7a346
+err:
e7a346
+        if (xattr_rsp)
e7a346
+                dict_unref (xattr_rsp);
e7a346
+        loc_wipe (&loc);
e7a346
+        return ret;
e7a346
+}
e7a346
+
e7a346
+int
e7a346
+shard_delete_shards_of_entry (call_frame_t *cleanup_frame, xlator_t *this,
e7a346
+                              gf_dirent_t *entry, inode_t *inode)
e7a346
+{
e7a346
+        int           ret  = -1;
e7a346
+        loc_t         loc  = {0,};
e7a346
+        shard_priv_t *priv = NULL;
e7a346
+
e7a346
+        priv = this->private;
e7a346
+        loc.inode = inode_ref (priv->dot_shard_rm_inode);
e7a346
+
e7a346
+        ret = syncop_entrylk (FIRST_CHILD(this), this->name, &loc,
e7a346
+                              entry->d_name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL,
e7a346
+                              NULL);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
+        {
e7a346
+                ret = __shard_delete_shards_of_entry (cleanup_frame, this,
e7a346
+                                                      entry, inode);
e7a346
+        }
e7a346
+        syncop_entrylk (FIRST_CHILD(this), this->name, &loc, entry->d_name,
e7a346
+                        ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL, NULL);
e7a346
+out:
e7a346
+        loc_wipe (&loc);
e7a346
+        return ret;
e7a346
+}
e7a346
+
e7a346
+int
e7a346
+shard_delete_shards_cbk (int ret, call_frame_t *frame, void *data)
e7a346
+{
e7a346
+        xlator_t     *this          = NULL;
e7a346
+        shard_priv_t *priv          = NULL;
e7a346
+
e7a346
+        this = frame->this;
e7a346
+        priv = this->private;
e7a346
+
e7a346
+        if (ret < 0) {
e7a346
+                gf_msg (this->name, GF_LOG_WARNING, -ret,
e7a346
+                        SHARD_MSG_SHARDS_DELETION_FAILED,
e7a346
+                        "Background deletion of shards failed");
e7a346
+                priv->first_lookup = SHARD_FIRST_LOOKUP_PENDING;
e7a346
+        } else {
e7a346
+                priv->first_lookup = SHARD_FIRST_LOOKUP_DONE;
e7a346
+        }
e7a346
+        SHARD_STACK_DESTROY (frame);
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
 int
e7a346
+shard_resolve_internal_dir (xlator_t *this, shard_local_t *local,
e7a346
+                            shard_internal_dir_type_t type)
e7a346
+{
e7a346
+        int                  ret   = 0;
e7a346
+        char                *bname = NULL;
e7a346
+        loc_t               *loc   = NULL;
e7a346
+        shard_priv_t        *priv  = NULL;
e7a346
+        uuid_t               gfid  = {0,};
e7a346
+        struct iatt          stbuf = {0,};
e7a346
+
e7a346
+        priv = this->private;
e7a346
+
e7a346
+        switch (type) {
e7a346
+        case SHARD_INTERNAL_DIR_DOT_SHARD:
e7a346
+                loc = &local->dot_shard_loc;
e7a346
+                gf_uuid_copy (gfid, priv->dot_shard_gfid);
e7a346
+                bname = GF_SHARD_DIR;
e7a346
+                break;
e7a346
+        case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME:
e7a346
+                loc = &local->dot_shard_rm_loc;
e7a346
+                gf_uuid_copy (gfid, priv->dot_shard_rm_gfid);
e7a346
+                bname = GF_SHARD_REMOVE_ME_DIR;
e7a346
+                break;
e7a346
+        default:
e7a346
+                break;
e7a346
+        }
e7a346
+
e7a346
+        loc->inode = inode_find (this->itable, gfid);
e7a346
+        if (!loc->inode) {
e7a346
+                ret = shard_init_internal_dir_loc (this, local, type);
e7a346
+                if (ret)
e7a346
+                        goto err;
e7a346
+                ret = dict_reset (local->xattr_req);
e7a346
+                if (ret) {
e7a346
+                        gf_msg (this->name, GF_LOG_WARNING, 0,
e7a346
+                                SHARD_MSG_DICT_OP_FAILED, "Failed to reset "
e7a346
+                                "dict");
e7a346
+                        ret = -ENOMEM;
e7a346
+                        goto err;
e7a346
+                }
e7a346
+                ret = dict_set_static_bin (local->xattr_req, "gfid-req", gfid,
e7a346
+                                           16);
e7a346
+                ret = syncop_lookup (FIRST_CHILD(this), loc, &stbuf, NULL,
e7a346
+                                     local->xattr_req, NULL);
e7a346
+                if (ret < 0) {
e7a346
+                        if (ret != -ENOENT)
e7a346
+                                gf_msg (this->name, GF_LOG_ERROR, -ret,
e7a346
+                                        SHARD_MSG_SHARDS_DELETION_FAILED,
e7a346
+                                        "Lookup on %s failed, exiting", bname);
e7a346
+                        goto err;
e7a346
+                } else {
e7a346
+                        shard_link_internal_dir_inode (local,
e7a346
+                                                       loc->inode, &stbuf,
e7a346
+                                                       type);
e7a346
+                }
e7a346
+        }
e7a346
+        ret = 0;
e7a346
+err:
e7a346
+        return ret;
e7a346
+}
e7a346
+
e7a346
+int
e7a346
+shard_lookup_marker_entry (xlator_t *this, shard_local_t *local,
e7a346
+                           gf_dirent_t *entry)
e7a346
+{
e7a346
+        int   ret = 0;
e7a346
+        loc_t loc = {0,};
e7a346
+
e7a346
+        loc.inode = inode_new (this->itable);
e7a346
+        if (!loc.inode) {
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+        loc.parent = inode_ref (local->fd->inode);
e7a346
+
e7a346
+        ret = inode_path (loc.parent, entry->d_name, (char **)&(loc.path));
e7a346
+        if (ret < 0) {
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
e7a346
+                        "Inode path failed on %s", entry->d_name);
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+
e7a346
+        loc.name = strrchr (loc.path, '/');
e7a346
+        if (loc.name)
e7a346
+                loc.name++;
e7a346
+
e7a346
+        ret = syncop_lookup (FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL);
e7a346
+        if (ret < 0) {
e7a346
+                goto err;
e7a346
+        }
e7a346
+        entry->inode = inode_ref (loc.inode);
e7a346
+        ret = 0;
e7a346
+err:
e7a346
+        loc_wipe (&loc);
e7a346
+        return ret;
e7a346
+}
e7a346
+
e7a346
+int
e7a346
+shard_delete_shards (void *opaque)
e7a346
+{
e7a346
+        int              ret                        = 0;
e7a346
+        off_t            offset                     = 0;
e7a346
+        loc_t            loc                        = {0,};
e7a346
+        inode_t         *link_inode                 = NULL;
e7a346
+        xlator_t        *this                       = NULL;
e7a346
+        shard_priv_t    *priv                       = NULL;
e7a346
+        shard_local_t   *local                      = NULL;
e7a346
+        gf_dirent_t      entries;
e7a346
+        gf_dirent_t     *entry                      = NULL;
e7a346
+        call_frame_t    *cleanup_frame              = NULL;
e7a346
+
e7a346
+        this = THIS;
e7a346
+        priv = this->private;
e7a346
+        INIT_LIST_HEAD (&entries.list);
e7a346
+
e7a346
+        cleanup_frame = opaque;
e7a346
+
e7a346
+        local = mem_get0 (this->local_pool);
e7a346
+        if (!local) {
e7a346
+                gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
e7a346
+                        SHARD_MSG_MEMALLOC_FAILED, "Failed to create local to "
e7a346
+                        "delete shards");
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+        cleanup_frame->local = local;
e7a346
+
e7a346
+        local->xattr_req = dict_new ();
e7a346
+        if (!local->xattr_req) {
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+        local->deletion_rate = priv->deletion_rate;
e7a346
+
e7a346
+        ret = shard_resolve_internal_dir (this, local,
e7a346
+                                          SHARD_INTERNAL_DIR_DOT_SHARD);
e7a346
+        if (ret == -ENOENT) {
e7a346
+                gf_msg_debug (this->name, 0, ".shard absent. Nothing to"
e7a346
+                              " delete. Exiting");
e7a346
+                ret = 0;
e7a346
+                goto err;
e7a346
+        } else if (ret < 0) {
e7a346
+                goto err;
e7a346
+        }
e7a346
+
e7a346
+        ret = shard_resolve_internal_dir (this, local,
e7a346
+                                          SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME);
e7a346
+        if (ret == -ENOENT) {
e7a346
+                gf_msg_debug (this->name, 0, ".remove_me absent. "
e7a346
+                              "Nothing to delete. Exiting");
e7a346
+                ret = 0;
e7a346
+                goto err;
e7a346
+        } else if (ret < 0) {
e7a346
+                goto err;
e7a346
+        }
e7a346
+
e7a346
+        local->fd = fd_anonymous (local->dot_shard_rm_loc.inode);
e7a346
+        if (!local->fd) {
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
+        }
e7a346
+
e7a346
+        while ((ret = syncop_readdirp (FIRST_CHILD(this), local->fd, 131072,
e7a346
+                                      offset, &entries, local->xattr_req,
e7a346
+                                      NULL))) {
e7a346
+                if (ret > 0)
e7a346
+                        ret = 0;
e7a346
+                list_for_each_entry (entry, &entries.list, list) {
e7a346
+                        offset = entry->d_off;
e7a346
+
e7a346
+                        if (!strcmp (entry->d_name, ".") ||
e7a346
+                            !strcmp (entry->d_name, ".."))
e7a346
+                                continue;
e7a346
+
e7a346
+                        if (!entry->inode) {
e7a346
+                                ret = shard_lookup_marker_entry (this, local,
e7a346
+                                                                 entry);
e7a346
+                                if (ret < 0)
e7a346
+                                        continue;
e7a346
+                        }
e7a346
+                        link_inode = inode_link (entry->inode, local->fd->inode,
e7a346
+                                                 entry->d_name, &entry->d_stat);
e7a346
+
e7a346
+                        gf_msg_debug (this->name, 0, "Initiating deletion of "
e7a346
+                                      "shards of gfid %s", entry->d_name);
e7a346
+                        ret = shard_delete_shards_of_entry (cleanup_frame, this,
e7a346
+                                                            entry, link_inode);
e7a346
+                        inode_unlink (link_inode, local->fd->inode,
e7a346
+                                      entry->d_name);
e7a346
+                        inode_unref (link_inode);
e7a346
+                        if (ret) {
e7a346
+                                gf_msg (this->name, GF_LOG_ERROR, -ret,
e7a346
+                                        SHARD_MSG_SHARDS_DELETION_FAILED,
e7a346
+                                        "Failed to clean up shards of gfid %s",
e7a346
+                                        entry->d_name);
e7a346
+                                continue;
e7a346
+                        }
e7a346
+                        gf_msg (this->name, GF_LOG_INFO, 0,
e7a346
+                                SHARD_MSG_SHARDS_DELETION_COMPLETED, "Deleted "
e7a346
+                                "shards of gfid=%s from backend",
e7a346
+                                entry->d_name);
e7a346
+                }
e7a346
+                gf_dirent_free (&entries);
e7a346
+                if (ret)
e7a346
+                        break;
e7a346
+        }
e7a346
+        ret = 0;
e7a346
+err:
e7a346
+        loc_wipe (&loc);
e7a346
+        return ret;
e7a346
+}
e7a346
+
e7a346
+int
e7a346
 shard_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                           int32_t op_ret, int32_t op_errno, dict_t *xdata)
e7a346
 {
e7a346
@@ -3394,7 +3900,10 @@ shard_unlink_base_file_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                 local->postoldparent = *postparent;
e7a346
                 if (xdata)
e7a346
                         local->xattr_rsp = dict_ref (xdata);
e7a346
+                if (local->cleanup_required)
e7a346
+                        shard_start_background_deletion (this);
e7a346
         }
e7a346
+
e7a346
         if (local->entrylk_frame) {
e7a346
                 ret = shard_unlock_entrylk (frame, this);
e7a346
                 if (ret < 0) {
e7a346
@@ -3408,6 +3917,7 @@ shard_unlink_base_file_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                 local->op_ret = -1;
e7a346
                 local->op_errno = -ret;
e7a346
         }
e7a346
+
e7a346
         shard_unlink_cbk (frame, this);
e7a346
         return 0;
e7a346
 }
e7a346
@@ -3576,6 +4086,7 @@ shard_post_lookup_base_shard_rm_handler (call_frame_t *frame, xlator_t *this)
e7a346
         } else {
e7a346
                 gf_msg_debug (this->name, 0, "link count on %s = 1, creating "
e7a346
                               "file under .remove_me", local->int_inodelk.loc.path);
e7a346
+                local->cleanup_required = _gf_true;
e7a346
                 shard_acquire_entrylk (frame, this, priv->dot_shard_rm_inode,
e7a346
                                        local->prebuf.ia_gfid);
e7a346
         }
e7a346
@@ -3788,20 +4299,6 @@ err:
e7a346
 }
e7a346
 
e7a346
 int
e7a346
-shard_rename_cbk (call_frame_t *frame, xlator_t *this)
e7a346
-{
e7a346
-        shard_local_t *local = NULL;
e7a346
-
e7a346
-        local = frame->local;
e7a346
-
e7a346
-        SHARD_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
e7a346
-                            &local->prebuf, &local->preoldparent,
e7a346
-                            &local->postoldparent, &local->prenewparent,
e7a346
-                            &local->postnewparent, local->xattr_rsp);
e7a346
-        return 0;
e7a346
-}
e7a346
-
e7a346
-int
e7a346
 shard_post_rename_lookup_handler (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         shard_rename_cbk (frame, this);
e7a346
@@ -3854,6 +4351,8 @@ shard_rename_src_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                         local->op_errno = -ret;
e7a346
                         goto err;
e7a346
                 }
e7a346
+                if (local->cleanup_required)
e7a346
+                        shard_start_background_deletion (this);
e7a346
         }
e7a346
 
e7a346
         /* Now the base file of src, if sharded, is looked up to gather ia_size
e7a346
@@ -4822,7 +5321,7 @@ shard_common_inode_write_do (call_frame_t *frame, xlator_t *this)
e7a346
 
e7a346
         if (dict_set_uint32 (local->xattr_req,
e7a346
                              GLUSTERFS_WRITE_UPDATE_ATOMIC, 4)) {
e7a346
-                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_SET_FAILED,
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
e7a346
                         "Failed to set "GLUSTERFS_WRITE_UPDATE_ATOMIC" into "
e7a346
                         "dict: %s", uuid_utoa (fd->inode->gfid));
e7a346
                 local->op_ret = -1;
e7a346
@@ -5141,7 +5640,7 @@ shard_mkdir_internal_dir (call_frame_t *frame, xlator_t *this,
e7a346
 
e7a346
         ret = dict_set_bin (xattr_req, "gfid-req", *gfid, 16);
e7a346
         if (ret) {
e7a346
-                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_SET_FAILED,
e7a346
+                gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
e7a346
                         "Failed to set gfid-req for %s",
e7a346
                         shard_internal_dir_string (type));
e7a346
                 goto err;
e7a346
@@ -6186,6 +6685,8 @@ init (xlator_t *this)
e7a346
 
e7a346
         GF_OPTION_INIT ("shard-block-size", priv->block_size, size_uint64, out);
e7a346
 
e7a346
+        GF_OPTION_INIT ("shard-deletion-rate", priv->deletion_rate, uint32, out);
e7a346
+
e7a346
         this->local_pool = mem_pool_new (shard_local_t, 128);
e7a346
         if (!this->local_pool) {
e7a346
                 ret = -1;
e7a346
@@ -6241,6 +6742,8 @@ reconfigure (xlator_t *this, dict_t *options)
e7a346
         GF_OPTION_RECONF ("shard-block-size", priv->block_size, options, size,
e7a346
                           out);
e7a346
 
e7a346
+        GF_OPTION_RECONF ("shard-deletion-rate", priv->deletion_rate, options,
e7a346
+                          uint32, out);
e7a346
         ret = 0;
e7a346
 
e7a346
 out:
e7a346
@@ -6364,5 +6867,12 @@ struct volume_options options[] = {
e7a346
            .description = "The size unit used to break a file into multiple "
e7a346
                           "chunks",
e7a346
         },
e7a346
+        {  .key = {"shard-deletion-rate"},
e7a346
+           .type = GF_OPTION_TYPE_INT,
e7a346
+           .default_value = "100",
e7a346
+           .min = 100,
e7a346
+           .max = INT_MAX,
e7a346
+           .description = "The number of shards to send deletes on at a time",
e7a346
+        },
e7a346
         { .key = {NULL} },
e7a346
 };
e7a346
diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h
e7a346
index 1783ff6..5de098a 100644
e7a346
--- a/xlators/features/shard/src/shard.h
e7a346
+++ b/xlators/features/shard/src/shard.h
e7a346
@@ -130,9 +130,9 @@ shard_unlock_entrylk (call_frame_t *frame, xlator_t *this);
e7a346
                               sizeof (*__bs));                                \
e7a346
         if (__ret) {                                                          \
e7a346
                 gf_msg (this->name, GF_LOG_WARNING, 0,                        \
e7a346
-                        SHARD_MSG_DICT_SET_FAILED, "Failed to set key: %s "   \
e7a346
+                        SHARD_MSG_DICT_OP_FAILED, "Failed to set key: %s "    \
e7a346
                         "on path %s", GF_XATTR_SHARD_BLOCK_SIZE, (loc)->path);\
e7a346
-                GF_FREE (__bs);                                               \
e7a346
+                        GF_FREE (__bs);                                       \
e7a346
                 goto label;                                                   \
e7a346
         }                                                                     \
e7a346
                                                                               \
e7a346
@@ -144,7 +144,7 @@ shard_unlock_entrylk (call_frame_t *frame, xlator_t *this);
e7a346
                               __size_attr, 8 * 4);                            \
e7a346
         if (__ret) {                                                          \
e7a346
                 gf_msg (this->name, GF_LOG_WARNING, 0,                        \
e7a346
-                        SHARD_MSG_DICT_SET_FAILED, "Failed to set key: %s "   \
e7a346
+                        SHARD_MSG_DICT_OP_FAILED, "Failed to set key: %s "   \
e7a346
                         "on path %s", GF_XATTR_SHARD_FILE_SIZE, (loc)->path);   \
e7a346
                 GF_FREE (__size_attr);                                        \
e7a346
                 goto label;                                                   \
e7a346
@@ -160,7 +160,7 @@ shard_unlock_entrylk (call_frame_t *frame, xlator_t *this);
e7a346
                 local->op_ret = -1;                                           \
e7a346
                 local->op_errno = ENOMEM;                                     \
e7a346
                 gf_msg (this->name, GF_LOG_WARNING, 0,                        \
e7a346
-                        SHARD_MSG_DICT_SET_FAILED, "Failed to set dict value:"\
e7a346
+                        SHARD_MSG_DICT_OP_FAILED, "Failed to set dict value:"\
e7a346
                         " key:%s for %s.", GF_XATTR_SHARD_FILE_SIZE,          \
e7a346
                         uuid_utoa (gfid));                                    \
e7a346
                 goto label;                                                   \
e7a346
@@ -197,6 +197,12 @@ shard_unlock_entrylk (call_frame_t *frame, xlator_t *this);
e7a346
                 }                                                             \
e7a346
         } while (0)
e7a346
 
e7a346
+typedef enum {
e7a346
+        SHARD_FIRST_LOOKUP_PENDING = 0,
e7a346
+        SHARD_FIRST_LOOKUP_IN_PROGRESS,
e7a346
+        SHARD_FIRST_LOOKUP_DONE,
e7a346
+} shard_first_lookup_state_t;
e7a346
+
e7a346
 /* rm = "remove me" */
e7a346
 
e7a346
 typedef struct shard_priv {
e7a346
@@ -208,6 +214,8 @@ typedef struct shard_priv {
e7a346
         gf_lock_t lock;
e7a346
         int inode_count;
e7a346
         struct list_head ilist_head;
e7a346
+        uint32_t deletion_rate;
e7a346
+        shard_first_lookup_state_t first_lookup;
e7a346
 } shard_priv_t;
e7a346
 
e7a346
 typedef struct {
e7a346
@@ -303,6 +311,9 @@ typedef struct shard_local {
e7a346
         call_frame_t *main_frame;
e7a346
         call_frame_t *inodelk_frame;
e7a346
         call_frame_t *entrylk_frame;
e7a346
+        uint32_t deletion_rate;
e7a346
+        gf_boolean_t cleanup_required;
e7a346
+        uuid_t base_gfid;
e7a346
 } shard_local_t;
e7a346
 
e7a346
 typedef struct shard_inode_ctx {
e7a346
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
e7a346
index 5a697cf..4357562 100644
e7a346
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
e7a346
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
e7a346
@@ -3298,6 +3298,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
e7a346
           .op_version = GD_OP_VERSION_3_7_0,
e7a346
           .flags      = OPT_FLAG_CLIENT_OPT
e7a346
         },
e7a346
+        { .key        = "features.shard-deletion-rate",
e7a346
+          .voltype    = "features/shard",
e7a346
+          .op_version = GD_OP_VERSION_4_2_0,
e7a346
+          .flags      = OPT_FLAG_CLIENT_OPT
e7a346
+        },
e7a346
         { .key        = "features.scrub-throttle",
e7a346
           .voltype    = "features/bit-rot",
e7a346
           .value      = "lazy",
e7a346
-- 
e7a346
1.8.3.1
e7a346