17b94a
From 69ac1fd2da7a57f2f0854412863911959bf71fde Mon Sep 17 00:00:00 2001
17b94a
From: Sunny Kumar <sunkumar@redhat.com>
17b94a
Date: Tue, 2 Apr 2019 12:38:09 +0530
17b94a
Subject: [PATCH 161/169] geo-rep: Fix rename with existing destination with
17b94a
 same gfid
17b94a
17b94a
Problem:
17b94a
   Geo-rep fails to sync the rename properly if destination exists.
17b94a
It results in source to be remained on slave causing more number of
17b94a
files on slave. Also heavy rename workload like logrotate caused
17b94a
lot of ESTALE errors
17b94a
17b94a
Cause:
17b94a
   Geo-rep fails to sync rename if destination exists if creation
17b94a
of source file also falls into single batch of changelogs being
17b94a
processed. This is because, after fixing problematic gfids verifying
17b94a
from master, while re-processing original entries, CREATE also was
17b94a
re-processed causing more files on slave and rename to be failed.
17b94a
17b94a
Solution:
17b94a
   Entries need to be removed from retrial list after fixing
17b94a
problematic gfids on slave so that it's not re-created again on slave.
17b94a
   Also treat ESTALE as EEXIST so that the error is properly handled
17b94a
verifying the op on master volume.
17b94a
17b94a
Backport of:
17b94a
 > Patch: https://review.gluster.org/22519
17b94a
 > Change-Id: I50cf289e06b997adddff0552bf2466d9201dd1f9
17b94a
 > BUG: 1694820
17b94a
 > Signed-off-by: Kotresh HR <khiremat@redhat.com>
17b94a
 > Signed-off-by: Sunny Kumar <sunkumar@redhat.com>
17b94a
17b94a
Change-Id: I50cf289e06b997adddff0552bf2466d9201dd1f9
17b94a
fixes: bz#1708121
17b94a
Signed-off-by: Kotresh HR <khiremat@redhat.com>
17b94a
Signed-off-by: Sunny Kumar <sunkumar@redhat.com>
17b94a
Reviewed-on: https://code.engineering.redhat.com/gerrit/172393
17b94a
Tested-by: RHGS Build Bot <nigelb@redhat.com>
17b94a
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
17b94a
---
17b94a
 geo-replication/syncdaemon/master.py              | 41 +++++++++++++++++++++--
17b94a
 geo-replication/syncdaemon/resource.py            | 10 ++++--
17b94a
 tests/00-geo-rep/georep-basic-dr-rsync-arbiter.t  |  5 +++
17b94a
 tests/00-geo-rep/georep-basic-dr-rsync.t          |  5 +++
17b94a
 tests/00-geo-rep/georep-basic-dr-tarssh-arbiter.t |  5 +++
17b94a
 tests/00-geo-rep/georep-basic-dr-tarssh.t         |  5 +++
17b94a
 tests/geo-rep.rc                                  | 36 ++++++++++++++++++++
17b94a
 7 files changed, 102 insertions(+), 5 deletions(-)
17b94a
17b94a
diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
17b94a
index 3da7610..42c86d7 100644
17b94a
--- a/geo-replication/syncdaemon/master.py
17b94a
+++ b/geo-replication/syncdaemon/master.py
17b94a
@@ -65,6 +65,9 @@ def _volinfo_hook_relax_foreign(self):
17b94a
 def edct(op, **ed):
17b94a
     dct = {}
17b94a
     dct['op'] = op
17b94a
+    # This is used in automatic gfid conflict resolution.
17b94a
+    # When marked True, it's skipped during re-processing.
17b94a
+    dct['skip_entry'] = False
17b94a
     for k in ed:
17b94a
         if k == 'stat':
17b94a
             st = ed[k]
17b94a
@@ -792,6 +795,7 @@ class GMasterChangelogMixin(GMasterCommon):
17b94a
         pfx = gauxpfx()
17b94a
         fix_entry_ops = []
17b94a
         failures1 = []
17b94a
+        remove_gfids = set()
17b94a
         for failure in failures:
17b94a
             if failure[2]['name_mismatch']:
17b94a
                 pbname = failure[2]['slave_entry']
17b94a
@@ -822,6 +826,18 @@ class GMasterChangelogMixin(GMasterCommon):
17b94a
                             edct('UNLINK',
17b94a
                                  gfid=failure[2]['slave_gfid'],
17b94a
                                  entry=pbname))
17b94a
+                    remove_gfids.add(slave_gfid)
17b94a
+                    if op in ['RENAME']:
17b94a
+                        # If renamed gfid doesn't exists on master, remove
17b94a
+                        # rename entry and unlink src on slave
17b94a
+                        st = lstat(os.path.join(pfx, failure[0]['gfid']))
17b94a
+                        if isinstance(st, int) and st == ENOENT:
17b94a
+                            logging.debug("Unlink source %s" % repr(failure))
17b94a
+                            remove_gfids.add(failure[0]['gfid'])
17b94a
+                            fix_entry_ops.append(
17b94a
+                                edct('UNLINK',
17b94a
+                                     gfid=failure[0]['gfid'],
17b94a
+                                     entry=failure[0]['entry']))
17b94a
                 # Takes care of scenarios of hardlinks/renames on master
17b94a
                 elif not isinstance(st, int):
17b94a
                     if matching_disk_gfid(slave_gfid, pbname):
17b94a
@@ -831,7 +847,12 @@ class GMasterChangelogMixin(GMasterCommon):
17b94a
                                         ' Safe to ignore, take out entry',
17b94a
                                         retry_count=retry_count,
17b94a
                                         entry=repr(failure)))
17b94a
-                        entries.remove(failure[0])
17b94a
+                        remove_gfids.add(failure[0]['gfid'])
17b94a
+                        if op == 'RENAME':
17b94a
+                            fix_entry_ops.append(
17b94a
+                                edct('UNLINK',
17b94a
+                                     gfid=failure[0]['gfid'],
17b94a
+                                     entry=failure[0]['entry']))
17b94a
                     # The file exists on master but with different name.
17b94a
                     # Probably renamed and got missed during xsync crawl.
17b94a
                     elif failure[2]['slave_isdir']:
17b94a
@@ -856,7 +877,10 @@ class GMasterChangelogMixin(GMasterCommon):
17b94a
                                             'take out entry',
17b94a
                                             retry_count=retry_count,
17b94a
                                             entry=repr(failure)))
17b94a
-                            entries.remove(failure[0])
17b94a
+                            try:
17b94a
+                                entries.remove(failure[0])
17b94a
+                            except ValueError:
17b94a
+                                pass
17b94a
                         else:
17b94a
                             rename_dict = edct('RENAME', gfid=slave_gfid,
17b94a
                                                entry=src_entry,
17b94a
@@ -896,7 +920,10 @@ class GMasterChangelogMixin(GMasterCommon):
17b94a
                                     'ignore, take out entry',
17b94a
                                     retry_count=retry_count,
17b94a
                                     entry=repr(failure)))
17b94a
-                    entries.remove(failure[0])
17b94a
+                    try:
17b94a
+                        entries.remove(failure[0])
17b94a
+                    except ValueError:
17b94a
+                        pass
17b94a
                 else:
17b94a
                     logging.info(lf('Fixing ENOENT error in slave. Create '
17b94a
                                     'parent directory on slave.',
17b94a
@@ -913,6 +940,14 @@ class GMasterChangelogMixin(GMasterCommon):
17b94a
                         edct('MKDIR', gfid=pargfid, entry=dir_entry,
17b94a
                              mode=st.st_mode, uid=st.st_uid, gid=st.st_gid))
17b94a
 
17b94a
+        logging.debug("remove_gfids: %s" % repr(remove_gfids))
17b94a
+        if remove_gfids:
17b94a
+            for e in entries:
17b94a
+                if e['op'] in ['MKDIR', 'MKNOD', 'CREATE', 'RENAME'] \
17b94a
+                   and e['gfid'] in remove_gfids:
17b94a
+                    logging.debug("Removed entry op from retrial list: entry: %s" % repr(e))
17b94a
+                    e['skip_entry'] = True
17b94a
+
17b94a
         if fix_entry_ops:
17b94a
             # Process deletions of entries whose gfids are mismatched
17b94a
             failures1 = self.slave.server.entry_ops(fix_entry_ops)
17b94a
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
17b94a
index c290d86..f54ccd9 100644
17b94a
--- a/geo-replication/syncdaemon/resource.py
17b94a
+++ b/geo-replication/syncdaemon/resource.py
17b94a
@@ -426,7 +426,7 @@ class Server(object):
17b94a
                 e['stat']['uid'] = uid
17b94a
                 e['stat']['gid'] = gid
17b94a
 
17b94a
-            if cmd_ret == EEXIST:
17b94a
+            if cmd_ret in [EEXIST, ESTALE]:
17b94a
                 if dst:
17b94a
                     en = e['entry1']
17b94a
                 else:
17b94a
@@ -510,6 +510,12 @@ class Server(object):
17b94a
             entry = e['entry']
17b94a
             uid = 0
17b94a
             gid = 0
17b94a
+
17b94a
+            # Skip entry processing if it's marked true during gfid
17b94a
+            # conflict resolution
17b94a
+            if e['skip_entry']:
17b94a
+                continue
17b94a
+
17b94a
             if e.get("stat", {}):
17b94a
                 # Copy UID/GID value and then reset to zero. Copied UID/GID
17b94a
                 # will be used to run chown once entry is created.
17b94a
@@ -688,7 +694,7 @@ class Server(object):
17b94a
             if blob:
17b94a
                 cmd_ret = errno_wrap(Xattr.lsetxattr,
17b94a
                                      [pg, 'glusterfs.gfid.newfile', blob],
17b94a
-                                     [EEXIST, ENOENT],
17b94a
+                                     [EEXIST, ENOENT, ESTALE],
17b94a
                                      [ESTALE, EINVAL, EBUSY])
17b94a
                 collect_failure(e, cmd_ret, uid, gid)
17b94a
 
17b94a
diff --git a/tests/00-geo-rep/georep-basic-dr-rsync-arbiter.t b/tests/00-geo-rep/georep-basic-dr-rsync-arbiter.t
17b94a
index 67ac167..1a55ed2 100644
17b94a
--- a/tests/00-geo-rep/georep-basic-dr-rsync-arbiter.t
17b94a
+++ b/tests/00-geo-rep/georep-basic-dr-rsync-arbiter.t
17b94a
@@ -203,6 +203,11 @@ EXPECT_WITHIN $GEO_REP_TIMEOUT 0 verify_rsnapshot_data ${slave_mnt}
17b94a
 TEST gluster volume geo-rep $master $slave config rsync-options "--whole-file"
17b94a
 TEST "echo sampledata > $master_mnt/rsync_option_test_file"
17b94a
 
17b94a
+#rename with existing destination case BUG:1694820
17b94a
+TEST create_rename_with_existing_destination ${master_mnt}
17b94a
+#verify rename with existing destination case BUG:1694820
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT 0 verify_rename_with_existing_destination ${slave_mnt}
17b94a
+
17b94a
 #Verify arequal for whole volume
17b94a
 EXPECT_WITHIN $GEO_REP_TIMEOUT 0 arequal_checksum ${master_mnt} ${slave_mnt}
17b94a
 
17b94a
diff --git a/tests/00-geo-rep/georep-basic-dr-rsync.t b/tests/00-geo-rep/georep-basic-dr-rsync.t
17b94a
index 8b64370..d0c0fc9 100644
17b94a
--- a/tests/00-geo-rep/georep-basic-dr-rsync.t
17b94a
+++ b/tests/00-geo-rep/georep-basic-dr-rsync.t
17b94a
@@ -204,6 +204,11 @@ EXPECT_WITHIN $GEO_REP_TIMEOUT 0 verify_rsnapshot_data ${slave_mnt}
17b94a
 TEST gluster volume geo-rep $master $slave config rsync-options "--whole-file"
17b94a
 TEST "echo sampledata > $master_mnt/rsync_option_test_file"
17b94a
 
17b94a
+#rename with existing destination case BUG:1694820
17b94a
+TEST create_rename_with_existing_destination ${master_mnt}
17b94a
+#verify rename with existing destination case BUG:1694820
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT 0 verify_rename_with_existing_destination ${slave_mnt}
17b94a
+
17b94a
 #Verify arequal for whole volume
17b94a
 EXPECT_WITHIN $GEO_REP_TIMEOUT 0 arequal_checksum ${master_mnt} ${slave_mnt}
17b94a
 
17b94a
diff --git a/tests/00-geo-rep/georep-basic-dr-tarssh-arbiter.t b/tests/00-geo-rep/georep-basic-dr-tarssh-arbiter.t
17b94a
index 1726d0b..cb530ad 100644
17b94a
--- a/tests/00-geo-rep/georep-basic-dr-tarssh-arbiter.t
17b94a
+++ b/tests/00-geo-rep/georep-basic-dr-tarssh-arbiter.t
17b94a
@@ -202,6 +202,11 @@ EXPECT_WITHIN $GEO_REP_TIMEOUT 0 verify_symlink_rename_mkdir_data ${slave_mnt}/s
17b94a
 #rsnapshot usecase
17b94a
 EXPECT_WITHIN $GEO_REP_TIMEOUT 0 verify_rsnapshot_data ${slave_mnt}
17b94a
 
17b94a
+#rename with existing destination case BUG:1694820
17b94a
+TEST create_rename_with_existing_destination ${master_mnt}
17b94a
+#verify rename with existing destination case BUG:1694820
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT 0 verify_rename_with_existing_destination ${slave_mnt}
17b94a
+
17b94a
 #Verify arequal for whole volume
17b94a
 EXPECT_WITHIN $GEO_REP_TIMEOUT 0 arequal_checksum ${master_mnt} ${slave_mnt}
17b94a
 
17b94a
diff --git a/tests/00-geo-rep/georep-basic-dr-tarssh.t b/tests/00-geo-rep/georep-basic-dr-tarssh.t
17b94a
index c5d16ac..9e2f613 100644
17b94a
--- a/tests/00-geo-rep/georep-basic-dr-tarssh.t
17b94a
+++ b/tests/00-geo-rep/georep-basic-dr-tarssh.t
17b94a
@@ -202,6 +202,11 @@ EXPECT_WITHIN $GEO_REP_TIMEOUT 0 verify_hardlink_rename_data ${slave_mnt}
17b94a
 #rsnapshot usecase
17b94a
 EXPECT_WITHIN $GEO_REP_TIMEOUT 0 verify_rsnapshot_data ${slave_mnt}
17b94a
 
17b94a
+#rename with existing destination case BUG:1694820
17b94a
+TEST create_rename_with_existing_destination ${master_mnt}
17b94a
+#verify rename with existing destination case BUG:1694820
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT 0 verify_rename_with_existing_destination ${slave_mnt}
17b94a
+
17b94a
 #Verify arequal for whole volume
17b94a
 EXPECT_WITHIN $GEO_REP_TIMEOUT 0 arequal_checksum ${master_mnt} ${slave_mnt}
17b94a
 
17b94a
diff --git a/tests/geo-rep.rc b/tests/geo-rep.rc
17b94a
index d723129..e357ba8 100644
17b94a
--- a/tests/geo-rep.rc
17b94a
+++ b/tests/geo-rep.rc
17b94a
@@ -403,3 +403,39 @@ function check_slave_read_only()
17b94a
     gluster volume info $1 | grep 'features.read-only: on'
17b94a
     echo $?
17b94a
 }
17b94a
+
17b94a
+function create_rename_with_existing_destination()
17b94a
+{
17b94a
+    dir=$1/rename_with_existing_destination
17b94a
+    mkdir $dir
17b94a
+    for i in {1..5}
17b94a
+    do
17b94a
+        echo "Data_set$i" > $dir/data_set$i
17b94a
+        mv $dir/data_set$i $dir/data_set -f
17b94a
+    done
17b94a
+}
17b94a
+
17b94a
+function verify_rename_with_existing_destination()
17b94a
+{
17b94a
+    dir=$1/rename_with_existing_destination
17b94a
+
17b94a
+    if [ ! -d $dir ]; then
17b94a
+        echo 1
17b94a
+    elif [ ! -f $dir/data_set ]; then
17b94a
+        echo 2
17b94a
+    elif [ -f $dir/data_set1 ]; then
17b94a
+        echo 3
17b94a
+    elif [ -f $dir/data_set2 ]; then
17b94a
+        echo 4
17b94a
+    elif [ -f $dir/data_set3 ]; then
17b94a
+        echo 5
17b94a
+    elif [ -f $dir/data_set4 ]; then
17b94a
+        echo 6
17b94a
+    elif [ -f $dir/data_set5 ]; then
17b94a
+        echo 7
17b94a
+    elif test "XData_set5" != "X$(cat $dir/data_set)"; then
17b94a
+        echo 8
17b94a
+    else
17b94a
+        echo 0
17b94a
+    fi
17b94a
+}
17b94a
-- 
17b94a
1.8.3.1
17b94a