190130
From 80eef2f52bb92ed740ac00eeb11ee7a3e7fffff2 Mon Sep 17 00:00:00 2001
190130
From: Raghavendra Bhat <raghavendra@redhat.com>
190130
Date: Mon, 11 Mar 2019 12:16:50 -0400
190130
Subject: [PATCH 459/465] features/bit-rot: Unconditionally sign the files
190130
 during oneshot crawl
190130
190130
Currently the bit-rot feature has an issue with disabling and reenabling it
190130
on the same volume. Consider enabling bit-rot detection which goes on to
190130
crawl and sign all the files present in the volume. Then some files are
190130
modified and the bit-rot daemon goes on to sign the modified files with
190130
the correct signature. Now, disable bit-rot feature. While signing and
190130
scrubbing are not happening, previous checksums of the files continue to
190130
exist as extended attributes. Now, if some files with checksum xattrs get
190130
modified, they are not signed with new signature as the feature is off.
190130
190130
At this point, if the feature is enabled again, the bit rot daemon will
190130
go and sign those files which do not have any bit-rot specific xattrs
190130
(i.e. those files which were created after bit-rot was disabled). Whereas
190130
the files with bit-rot xattrs won't get signed with a proper new checksum.
190130
At this point, if the scrubber runs, it finds the on-disk checksum and the actual
190130
checksum of the file to be different (because the file got modified) and
190130
marks the file as corrupted.
190130
190130
FIX:
190130
190130
The fix is to unconditionally sign the files when the bit-rot daemon
190130
comes up (instead of skipping the files with bit-rot xattrs).
190130
190130
upstream fix:
190130
	> patch: https://review.gluster.org/#/c/glusterfs/+/22360/
190130
	> fixes: #bz1700078
190130
	> Change-ID: Iadfb47dd39f7e2e77f22d549a4a07a385284f4f5
190130
190130
Change-Id: Iadfb47dd39f7e2e77f22d549a4a07a385284f4f5
190130
BUG: 1851424
190130
Signed-off-by: Raghavendra M <raghavendra@redhat.com>
190130
Reviewed-on: https://code.engineering.redhat.com/gerrit/208305
190130
Tested-by: RHGS Build Bot <nigelb@redhat.com>
190130
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
190130
---
190130
 tests/bitrot/bug-1700078.t                  | 87 +++++++++++++++++++++++++++++
190130
 xlators/features/bit-rot/src/bitd/bit-rot.c | 15 ++++-
190130
 2 files changed, 101 insertions(+), 1 deletion(-)
190130
 create mode 100644 tests/bitrot/bug-1700078.t
190130
190130
diff --git a/tests/bitrot/bug-1700078.t b/tests/bitrot/bug-1700078.t
190130
new file mode 100644
190130
index 0000000..f273742
190130
--- /dev/null
190130
+++ b/tests/bitrot/bug-1700078.t
190130
@@ -0,0 +1,87 @@
190130
+#!/bin/bash
190130
+
190130
+. $(dirname $0)/../include.rc
190130
+. $(dirname $0)/../volume.rc
190130
+
190130
+cleanup;
190130
+
190130
+## Start glusterd
190130
+TEST glusterd;
190130
+TEST pidof glusterd;
190130
+
190130
+## Lets create and start the volume
190130
+TEST $CLI volume create $V0 $H0:$B0/${V0}1
190130
+TEST $CLI volume start $V0
190130
+
190130
+## Enable bitrot for volume $V0
190130
+TEST $CLI volume bitrot $V0 enable
190130
+
190130
+## Turn off quick-read so that it won't cache the contents
190130
+# of the file in lookup. For corrupted files, it might
190130
+# end up in reads being served from the cache instead of
190130
+# an error.
190130
+TEST $CLI volume set $V0 performance.quick-read off
190130
+
190130
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" get_bitd_count
190130
+
190130
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 'Active' scrub_status $V0 'State of scrub'
190130
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT '/var/log/glusterfs/bitd.log' scrub_status $V0 'Bitrot error log location'
190130
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT '/var/log/glusterfs/scrub.log' scrub_status $V0 'Scrubber error log location'
190130
+
190130
+## Set expiry-timeout to 1 sec
190130
+TEST $CLI volume set $V0 features.expiry-time 1
190130
+
190130
+##Mount $V0
190130
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
190130
+
190130
+## Turn off quick-read xlator so that, the contents are not served from the
190130
+# quick-read cache.
190130
+TEST $CLI volume set $V0 performance.quick-read off
190130
+
190130
+#Create sample file
190130
+TEST `echo "1234" > $M0/FILE1`
190130
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 'trusted.bit-rot.signature' check_for_xattr 'trusted.bit-rot.signature' "/$B0/${V0}1/FILE1"
190130
+
190130
+##disable bitrot
190130
+TEST $CLI volume bitrot $V0 disable
190130
+
190130
+## modify the file
190130
+TEST `echo "write" >> $M0/FILE1`
190130
+
190130
+# unmount and remount when the file has to be accessed.
190130
+# This is to ensure that, when the remount happens,
190130
+# and the file is read, its contents are served from the
190130
+# brick instead of cache.
190130
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
190130
+
190130
+##enable bitrot
190130
+TEST $CLI volume bitrot $V0 enable
190130
+
190130
+# expiry time is set to 1 second. Hence sleep for 2 seconds for the
190130
+# oneshot crawler to finish its crawling and sign the file properly.
190130
+sleep 2
190130
+
190130
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" get_bitd_count
190130
+
190130
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 'Active' scrub_status $V0 'State of scrub'
190130
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT '/var/log/glusterfs/bitd.log' scrub_status $V0 'Bitrot error log location'
190130
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT '/var/log/glusterfs/scrub.log' scrub_status $V0 'Scrubber error log location'
190130
+
190130
+## Ondemand scrub
190130
+TEST $CLI volume bitrot $V0 scrub ondemand
190130
+
190130
+# the scrub ondemand CLI command, just ensures that
190130
+# the scrubber has received the ondemand scrub directive
190130
+# and started. sleep for 2 seconds for scrubber to finish
190130
+# crawling and marking file(s) as bad (if it finds that
190130
+# corruption has happened) which are filesystem operations.
190130
+sleep 2
190130
+
190130
+TEST ! getfattr -n 'trusted.bit-rot.bad-file' $B0/${V0}1/FILE1
190130
+
190130
+##Mount $V0
190130
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
190130
+
190130
+TEST cat $M0/FILE1
190130
+
190130
+cleanup;
190130
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
190130
index b8feef7..424c0d5 100644
190130
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
190130
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
190130
@@ -973,6 +973,7 @@ bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
190130
     int32_t ret = -1;
190130
     inode_t *linked_inode = NULL;
190130
     gf_boolean_t need_signing = _gf_false;
190130
+    gf_boolean_t need_reopen = _gf_true;
190130
 
190130
     GF_VALIDATE_OR_GOTO("bit-rot", subvol, out);
190130
     GF_VALIDATE_OR_GOTO("bit-rot", data, out);
190130
@@ -1046,6 +1047,18 @@ bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
190130
                    uuid_utoa(linked_inode->gfid));
190130
     } else {
190130
         need_signing = br_check_object_need_sign(this, xattr, child);
190130
+
190130
+        /*
190130
+         * If we are here means, bitrot daemon has started. Is it just
190130
+         * a simple restart of the daemon or is it started because the
190130
+         * feature is enabled is something hard to determine. Hence,
190130
+         * if need_signing is false (because bit-rot version and signature
190130
+         * are present), then still go ahead and sign it.
190130
+         */
190130
+        if (!need_signing) {
190130
+            need_signing = _gf_true;
190130
+            need_reopen = _gf_true;
190130
+        }
190130
     }
190130
 
190130
     if (!need_signing)
190130
@@ -1054,7 +1067,7 @@ bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
190130
     gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_TRIGGER_SIGN,
190130
            "Triggering signing for %s [GFID: %s | Brick: %s]", loc.path,
190130
            uuid_utoa(linked_inode->gfid), child->brick_path);
190130
-    br_trigger_sign(this, child, linked_inode, &loc, _gf_true);
190130
+    br_trigger_sign(this, child, linked_inode, &loc, need_reopen);
190130
 
190130
     ret = 0;
190130
 
190130
-- 
190130
1.8.3.1
190130