74096c
From 80eef2f52bb92ed740ac00eeb11ee7a3e7fffff2 Mon Sep 17 00:00:00 2001
74096c
From: Raghavendra Bhat <raghavendra@redhat.com>
74096c
Date: Mon, 11 Mar 2019 12:16:50 -0400
74096c
Subject: [PATCH 459/465] features/bit-rot: Unconditionally sign the files
74096c
 during oneshot crawl
74096c
74096c
Currently bit-rot feature has an issue with disabling and reenabling it
74096c
on the same volume. Consider enabling bit-rot detection which goes on to
74096c
crawl and sign all the files present in the volume. Then some files are
74096c
modified and the bit-rot daemon goes on to sign the modified files with
74096c
the correct signature. Now, disable bit-rot feature. While, signing and
74096c
scrubbing are not happening, previous checksums of the files continue to
74096c
exist as extended attributes. Now, if some files with checksum xattrs get
74096c
modified, they are not signed with new signature as the feature is off.
74096c
74096c
At this point, if the feature is enabled again, the bit rot daemon will
74096c
go and sign those files which does not have any bit-rot specific xattrs
74096c
(i.e. those files which were created after bit-rot was disabled). Whereas
74096c
the files with bit-rot xattrs wont get signed with proper new checksum.
74096c
At this point if scrubber runs, it finds the on disk checksum and the actual
74096c
checksum of the file to be different (because the file got modified) and
74096c
marks the file as corrupted.
74096c
74096c
FIX:
74096c
74096c
The fix is to unconditionally sign the files when the bit-rot daemon
74096c
comes up (instead of skipping the files with bit-rot xattrs).
74096c
74096c
upstream fix:
74096c
	> patch: https://review.gluster.org/#/c/glusterfs/+/22360/
74096c
	> fixes: #bz1700078
74096c
	> Change-ID: Iadfb47dd39f7e2e77f22d549a4a07a385284f4f5
74096c
74096c
Change-Id: Iadfb47dd39f7e2e77f22d549a4a07a385284f4f5
74096c
BUG: 1851424
74096c
Signed-off-by: Raghavendra M <raghavendra@redhat.com>
74096c
Reviewed-on: https://code.engineering.redhat.com/gerrit/208305
74096c
Tested-by: RHGS Build Bot <nigelb@redhat.com>
74096c
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
74096c
---
74096c
 tests/bitrot/bug-1700078.t                  | 87 +++++++++++++++++++++++++++++
74096c
 xlators/features/bit-rot/src/bitd/bit-rot.c | 15 ++++-
74096c
 2 files changed, 101 insertions(+), 1 deletion(-)
74096c
 create mode 100644 tests/bitrot/bug-1700078.t
74096c
74096c
diff --git a/tests/bitrot/bug-1700078.t b/tests/bitrot/bug-1700078.t
74096c
new file mode 100644
74096c
index 0000000..f273742
74096c
--- /dev/null
74096c
+++ b/tests/bitrot/bug-1700078.t
74096c
@@ -0,0 +1,87 @@
74096c
+#!/bin/bash
74096c
+
74096c
+. $(dirname $0)/../include.rc
74096c
+. $(dirname $0)/../volume.rc
74096c
+
74096c
+cleanup;
74096c
+
74096c
+## Start glusterd
74096c
+TEST glusterd;
74096c
+TEST pidof glusterd;
74096c
+
74096c
+## Lets create and start the volume
74096c
+TEST $CLI volume create $V0 $H0:$B0/${V0}1
74096c
+TEST $CLI volume start $V0
74096c
+
74096c
+## Enable bitrot for volume $V0
74096c
+TEST $CLI volume bitrot $V0 enable
74096c
+
74096c
+## Turn off quick-read so that it wont cache the contents
74096c
+# of the file in lookup. For corrupted files, it might
74096c
+# end up in reads being served from the cache instead of
74096c
+# an error.
74096c
+TEST $CLI volume set $V0 performance.quick-read off
74096c
+
74096c
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" get_bitd_count
74096c
+
74096c
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 'Active' scrub_status $V0 'State of scrub'
74096c
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT '/var/log/glusterfs/bitd.log' scrub_status $V0 'Bitrot error log location'
74096c
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT '/var/log/glusterfs/scrub.log' scrub_status $V0 'Scrubber error log location'
74096c
+
74096c
+## Set expiry-timeout to 1 sec
74096c
+TEST $CLI volume set $V0 features.expiry-time 1
74096c
+
74096c
+##Mount $V0
74096c
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
74096c
+
74096c
+## Turn off quick-read xlator so that, the contents are not served from the
74096c
+# quick-read cache.
74096c
+TEST $CLI volume set $V0 performance.quick-read off
74096c
+
74096c
+#Create sample file
74096c
+TEST `echo "1234" > $M0/FILE1`
74096c
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 'trusted.bit-rot.signature' check_for_xattr 'trusted.bit-rot.signature' "/$B0/${V0}1/FILE1"
74096c
+
74096c
+##disable bitrot
74096c
+TEST $CLI volume bitrot $V0 disable
74096c
+
74096c
+## modify the file
74096c
+TEST `echo "write" >> $M0/FILE1`
74096c
+
74096c
+# unmount and remount when the file has to be accessed.
74096c
+# This is to ensure that, when the remount happens,
74096c
+# and the file is read, its contents are served from the
74096c
+# brick instead of cache.
74096c
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
74096c
+
74096c
+##enable bitrot
74096c
+TEST $CLI volume bitrot $V0 enable
74096c
+
74096c
+# expiry time is set to 1 second. Hence sleep for 2 seconds for the
74096c
+# oneshot crawler to finish its crawling and sign the file properly.
74096c
+sleep 2
74096c
+
74096c
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" get_bitd_count
74096c
+
74096c
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 'Active' scrub_status $V0 'State of scrub'
74096c
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT '/var/log/glusterfs/bitd.log' scrub_status $V0 'Bitrot error log location'
74096c
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT '/var/log/glusterfs/scrub.log' scrub_status $V0 'Scrubber error log location'
74096c
+
74096c
+## Ondemand scrub
74096c
+TEST $CLI volume bitrot $V0 scrub ondemand
74096c
+
74096c
+# the scrub ondemand CLI command, just ensures that
74096c
+# the scrubber has received the ondemand scrub directive
74096c
+# and started. sleep for 2 seconds for scrubber to finish
74096c
+# crawling and marking file(s) as bad (if if finds that
74096c
+# corruption has happened) which are filesystem operations.
74096c
+sleep 2
74096c
+
74096c
+TEST ! getfattr -n 'trusted.bit-rot.bad-file' $B0/${V0}1/FILE1
74096c
+
74096c
+##Mount $V0
74096c
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
74096c
+
74096c
+TEST cat $M0/FILE1
74096c
+
74096c
+cleanup;
74096c
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
74096c
index b8feef7..424c0d5 100644
74096c
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
74096c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
74096c
@@ -973,6 +973,7 @@ bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
74096c
     int32_t ret = -1;
74096c
     inode_t *linked_inode = NULL;
74096c
     gf_boolean_t need_signing = _gf_false;
74096c
+    gf_boolean_t need_reopen = _gf_true;
74096c
 
74096c
     GF_VALIDATE_OR_GOTO("bit-rot", subvol, out);
74096c
     GF_VALIDATE_OR_GOTO("bit-rot", data, out);
74096c
@@ -1046,6 +1047,18 @@ bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
74096c
                    uuid_utoa(linked_inode->gfid));
74096c
     } else {
74096c
         need_signing = br_check_object_need_sign(this, xattr, child);
74096c
+
74096c
+        /*
74096c
+         * If we are here means, bitrot daemon has started. Is it just
74096c
+         * a simple restart of the daemon or is it started because the
74096c
+         * feature is enabled is something hard to determine. Hence,
74096c
+         * if need_signing is false (because bit-rot version and signature
74096c
+         * are present), then still go ahead and sign it.
74096c
+         */
74096c
+        if (!need_signing) {
74096c
+            need_signing = _gf_true;
74096c
+            need_reopen = _gf_true;
74096c
+        }
74096c
     }
74096c
 
74096c
     if (!need_signing)
74096c
@@ -1054,7 +1067,7 @@ bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
74096c
     gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_TRIGGER_SIGN,
74096c
            "Triggering signing for %s [GFID: %s | Brick: %s]", loc.path,
74096c
            uuid_utoa(linked_inode->gfid), child->brick_path);
74096c
-    br_trigger_sign(this, child, linked_inode, &loc, _gf_true);
74096c
+    br_trigger_sign(this, child, linked_inode, &loc, need_reopen);
74096c
 
74096c
     ret = 0;
74096c
 
74096c
-- 
74096c
1.8.3.1
74096c