cb8e9e
From a3b9ad5909923b24bec518565d945003bfecee69 Mon Sep 17 00:00:00 2001
cb8e9e
From: Raghavendra Bhat <raghavendra@redhat.com>
cb8e9e
Date: Wed, 27 May 2015 17:00:36 +0530
cb8e9e
Subject: [PATCH 156/190] features/bit-rot: check for both inmemory and ondisk staleness.
cb8e9e
cb8e9e
* Let bit-rot stub check both on disk ongoing version, signed version xattrs and
cb8e9e
  the in memory flags in the inode and then decide whether the inode is stale or
cb8e9e
  not. This information is used by one shot crawler in BitD to decide whether to
cb8e9e
  trigger the sign for the object or skip it.
cb8e9e
cb8e9e
  NOTE: The above check should be done only for BitD. For scrubber it's still the
cb8e9e
        old way of comparing on disk ongoing version with signed version.
cb8e9e
cb8e9e
* BitD's one shot crawler should not sign zero byte objects if they do not contain
cb8e9e
  a signature. (This means the object was just created and nothing was written to it.)
cb8e9e
cb8e9e
Change-Id: I6941aefc2981bf79a6aeb476e660f79908e165a8
cb8e9e
BUG: 1232309
cb8e9e
Signed-off-by: Raghavendra Bhat <raghavendra@redhat.com>
cb8e9e
Reviewed-on: https://code.engineering.redhat.com/gerrit/51738
cb8e9e
---
cb8e9e
 xlators/features/bit-rot/src/bitd/bit-rot.c      |   14 +-
cb8e9e
 xlators/features/bit-rot/src/stub/bit-rot-stub.c |  143 ++++++++++++++++++++--
cb8e9e
 2 files changed, 138 insertions(+), 19 deletions(-)
cb8e9e
cb8e9e
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
cb8e9e
index a4821ba..228cf34 100644
cb8e9e
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
cb8e9e
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
cb8e9e
@@ -858,7 +858,6 @@ br_check_object_need_sign (xlator_t *this, dict_t *xattr, br_child_t *child)
cb8e9e
 {
cb8e9e
         int32_t              ret       = -1;
cb8e9e
         gf_boolean_t         need_sign = _gf_false;
cb8e9e
-        struct timeval       tv        = {0,};
cb8e9e
         br_isignature_out_t *sign      = NULL;
cb8e9e
 
cb8e9e
         GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
cb8e9e
@@ -873,11 +872,8 @@ br_check_object_need_sign (xlator_t *this, dict_t *xattr, br_child_t *child)
cb8e9e
                 goto out;
cb8e9e
         }
cb8e9e
 
cb8e9e
-        tv.tv_sec  = ntohl (sign->time[0]);
cb8e9e
-        tv.tv_usec = ntohl (sign->time[1]);
cb8e9e
-
cb8e9e
         /* Object has been opened and hence dirty. Do not sign it */
cb8e9e
-        if (sign->stale && !br_time_equal (child, &tv))
cb8e9e
+        if (sign->stale)
cb8e9e
                 need_sign = _gf_true;
cb8e9e
 
cb8e9e
 out:
cb8e9e
@@ -1007,7 +1003,11 @@ bitd_oneshot_crawl (xlator_t *subvol,
cb8e9e
                 op_errno = -ret;
cb8e9e
                 br_log_object (this, "getxattr", linked_inode->gfid, op_errno);
cb8e9e
 
cb8e9e
-                if (op_errno == ENODATA)
cb8e9e
+                /**
cb8e9e
+                 * No need to sign the zero byte objects as the signing
cb8e9e
+                 * happens upon first modification of the object.
cb8e9e
+                 */
cb8e9e
+                if (op_errno == ENODATA && (iatt.ia_size != 0))
cb8e9e
                         need_signing = _gf_true;
cb8e9e
                 if (op_errno == EINVAL)
cb8e9e
                         gf_log (this->name, GF_LOG_WARNING, "Partial version "
cb8e9e
@@ -1236,7 +1236,7 @@ br_brick_connect (xlator_t *this, br_child_t *child)
cb8e9e
 
cb8e9e
         memcpy (child->brick_path, stub->export, strlen (stub->export) + 1);
cb8e9e
         child->tv.tv_sec = ntohl (stub->timebuf[0]);
cb8e9e
-        child->tv.tv_usec = ntohl (stub->timebuf[0]);
cb8e9e
+        child->tv.tv_usec = ntohl (stub->timebuf[1]);
cb8e9e
 
cb8e9e
         if (priv->iamscrubber)
cb8e9e
                 ret = br_enact_scrubber (this, child);
cb8e9e
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
cb8e9e
index 4f0605d..d4aecdc 100644
cb8e9e
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c
cb8e9e
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
cb8e9e
@@ -949,6 +949,79 @@ br_stub_listxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
+/**
cb8e9e
+ * ONE SHOT CRAWLER from BitD signs the objects that it encounters while
cb8e9e
+ * crawling, if the object is identified as stale by the stub. Stub follows
cb8e9e
+ * the below logic to mark an object as stale or not.
cb8e9e
+ * If the ongoing version and the signed_version match, then the object is not
cb8e9e
+ * stale. Just return. Otherwise, if they do not match, then it means one
cb8e9e
+ * of the below things.
cb8e9e
+ * 1) If the inode does not need write back of the version and the sign state is
cb8e9e
+ *    NORMAL, then some active i/o is going on the object. So skip it.
cb8e9e
+ *    A notification will be sent to trigger the sign once the release is
cb8e9e
+ *    received on the object.
cb8e9e
+ * 2) If inode does not need writeback of the version and the sign state is
cb8e9e
+ *    either reopen wait or quick sign, then it means:
cb8e9e
+ *    A) BitD restarted and it is not sure whether the object it encountered
cb8e9e
+ *       while crawling is in its timer wheel or not. Since there is no way to
cb8e9e
+ *       scan the timer wheel as of now, ONE SHOT CRAWLER just goes ahead and
cb8e9e
+ *       signs the object. Since the inode does not need writeback, version will
cb8e9e
+ *       not be incremented and directly the object will be signed.
cb8e9e
+ * 3) If the inode needs writeback, then it means the inode was forgotten after
cb8e9e
+ *    the versioning and it has to be signed now.
cb8e9e
+ *
cb8e9e
+ * This is the algorithm followed:
cb8e9e
+ * if (ongoing_version == signed_version); then
cb8e9e
+ *     object_is_not_stale;
cb8e9e
+ *     return;
cb8e9e
+ * else; then
cb8e9e
+ *      if (!inode_needs_writeback && inode_sign_state != NORMAL); then
cb8e9e
+ *            object_is_stale;
cb8e9e
+ *      if (inode_needs_writeback); then
cb8e9e
+ *            object_is_stale;
cb8e9e
+ *
cb8e9e
+ * For SCRUBBER, no need to check for the sign state and inode writeback.
cb8e9e
+ * If the ondisk ongoingversion and the ondisk signed version do not match,
cb8e9e
+ * then treat the object as stale.
cb8e9e
+ */
cb8e9e
+char
cb8e9e
+br_stub_is_object_stale (xlator_t *this, call_frame_t *frame, inode_t *inode,
cb8e9e
+                         br_version_t *obuf, br_signature_t *sbuf)
cb8e9e
+{
cb8e9e
+        uint64_t   ctx_addr = 0;
cb8e9e
+        br_stub_inode_ctx_t *ctx = NULL;
cb8e9e
+        int32_t  ret = -1;
cb8e9e
+        char stale = 0;
cb8e9e
+
cb8e9e
+        if (obuf->ongoingversion == sbuf->signedversion)
cb8e9e
+                goto out;
cb8e9e
+
cb8e9e
+        if (frame->root->pid == GF_CLIENT_PID_SCRUB) {
cb8e9e
+                stale = 1;
cb8e9e
+                goto out;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        ret = br_stub_get_inode_ctx (this, inode, &ctx_addr);
cb8e9e
+        if (ret) {
cb8e9e
+                gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
cb8e9e
+                        "context for %s", uuid_utoa (inode->gfid));
cb8e9e
+                goto out;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
cb8e9e
+
cb8e9e
+        LOCK (&inode->lock);
cb8e9e
+        {
cb8e9e
+                if ((!__br_stub_is_inode_dirty (ctx) &&
cb8e9e
+                     ctx->info_sign != BR_SIGN_NORMAL) ||
cb8e9e
+                    __br_stub_is_inode_dirty (ctx))
cb8e9e
+                        stale = 1;
cb8e9e
+        }
cb8e9e
+        UNLOCK (&inode->lock);
cb8e9e
+
cb8e9e
+out:
cb8e9e
+        return stale;
cb8e9e
+}
cb8e9e
 
cb8e9e
 int
cb8e9e
 br_stub_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
@@ -961,12 +1034,18 @@ br_stub_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
         br_signature_t      *sbuf         = NULL;
cb8e9e
         br_isignature_out_t *sign         = NULL;
cb8e9e
         br_vxattr_status_t   status;
cb8e9e
+        br_stub_local_t     *local        = NULL;
cb8e9e
+        inode_t             *inode        = NULL;
cb8e9e
 
cb8e9e
         if (op_ret < 0)
cb8e9e
                 goto unwind;
cb8e9e
         if (cookie != (void *) BR_STUB_REQUEST_COOKIE)
cb8e9e
                 goto unwind;
cb8e9e
 
cb8e9e
+        local = frame->local;
cb8e9e
+        frame->local = NULL;
cb8e9e
+        inode = local->u.context.inode;
cb8e9e
+
cb8e9e
         op_ret   = -1;
cb8e9e
         status = br_version_xattr_state (xattr, &obuf, &sbuf);
cb8e9e
 
cb8e9e
@@ -1005,7 +1084,7 @@ br_stub_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
 
cb8e9e
         /* Object's dirty state & current signed version */
cb8e9e
         sign->version = sbuf->signedversion;
cb8e9e
-        sign->stale = (obuf->ongoingversion != sbuf->signedversion) ? 1 : 0;
cb8e9e
+        sign->stale = br_stub_is_object_stale (this, frame, inode, obuf, sbuf);
cb8e9e
 
cb8e9e
         /* Object's signature */
cb8e9e
         sign->signaturelen  = signaturelen;
cb8e9e
@@ -1025,6 +1104,10 @@ br_stub_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
 
cb8e9e
  unwind:
cb8e9e
         STACK_UNWIND (frame, op_ret, op_errno, xattr, xdata);
cb8e9e
+        if (local) {
cb8e9e
+                br_stub_cleanup_local (local);
cb8e9e
+                br_stub_dealloc_local (local);
cb8e9e
+        }
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
@@ -1070,9 +1153,16 @@ int
cb8e9e
 br_stub_getxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
                   loc_t *loc, const char *name, dict_t *xdata)
cb8e9e
 {
cb8e9e
-        void *cookie = NULL;
cb8e9e
-        uuid_t rootgfid = {0, };
cb8e9e
-        fop_getxattr_cbk_t cbk = br_stub_getxattr_cbk;
cb8e9e
+        void               *cookie   = NULL;
cb8e9e
+        uuid_t              rootgfid = {0, };
cb8e9e
+        fop_getxattr_cbk_t  cbk      = br_stub_getxattr_cbk;
cb8e9e
+        int32_t             op_ret   = -1;
cb8e9e
+        int32_t             op_errno = EINVAL;
cb8e9e
+        br_stub_local_t    *local    = NULL;
cb8e9e
+
cb8e9e
+        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind);
cb8e9e
 
cb8e9e
         rootgfid[15] = 1;
cb8e9e
 
cb8e9e
@@ -1081,10 +1171,8 @@ br_stub_getxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
                 goto wind;
cb8e9e
         }
cb8e9e
 
cb8e9e
-        if (br_stub_is_internal_xattr (name)) {
cb8e9e
-                STACK_UNWIND (frame, -1, EINVAL, NULL, NULL);
cb8e9e
-                return 0;
cb8e9e
-        }
cb8e9e
+        if (br_stub_is_internal_xattr (name))
cb8e9e
+                goto unwind;
cb8e9e
 
cb8e9e
         /**
cb8e9e
          * this special extended attribute is allowed only on root
cb8e9e
@@ -1104,6 +1192,18 @@ br_stub_getxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
         if (name && (strncmp (name, GLUSTERFS_GET_OBJECT_SIGNATURE,
cb8e9e
                               strlen (GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0)) {
cb8e9e
                 cookie = (void *) BR_STUB_REQUEST_COOKIE;
cb8e9e
+
cb8e9e
+                local = br_stub_alloc_local (this);
cb8e9e
+                if (!local) {
cb8e9e
+                        op_ret = -1;
cb8e9e
+                        op_errno = ENOMEM;
cb8e9e
+                        goto unwind;
cb8e9e
+                }
cb8e9e
+
cb8e9e
+                br_stub_fill_local (local, NULL, NULL, loc->inode,
cb8e9e
+                                    loc->inode->gfid,
cb8e9e
+                                    BR_STUB_NO_VERSIONING, 0);
cb8e9e
+                frame->local = local;
cb8e9e
         }
cb8e9e
 
cb8e9e
  wind:
cb8e9e
@@ -1111,6 +1211,9 @@ br_stub_getxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
                       (frame, cbk, cookie, FIRST_CHILD (this),
cb8e9e
                        FIRST_CHILD (this)->fops->getxattr, loc, name, xdata);
cb8e9e
         return 0;
cb8e9e
+unwind:
cb8e9e
+        STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
cb8e9e
+                return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
 int
cb8e9e
@@ -1120,6 +1223,9 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
         void *cookie = NULL;
cb8e9e
         uuid_t rootgfid = {0, };
cb8e9e
         fop_fgetxattr_cbk_t cbk = br_stub_getxattr_cbk;
cb8e9e
+        int32_t op_ret = -1;
cb8e9e
+        int32_t op_errno = EINVAL;
cb8e9e
+        br_stub_local_t *local = NULL;
cb8e9e
 
cb8e9e
         rootgfid[15] = 1;
cb8e9e
 
cb8e9e
@@ -1128,10 +1234,8 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
                 goto wind;
cb8e9e
         }
cb8e9e
 
cb8e9e
-        if (br_stub_is_internal_xattr (name)) {
cb8e9e
-                STACK_UNWIND (frame, -1, EINVAL, NULL, NULL);
cb8e9e
-                return 0;
cb8e9e
-        }
cb8e9e
+        if (br_stub_is_internal_xattr (name))
cb8e9e
+                goto unwind;
cb8e9e
 
cb8e9e
         /**
cb8e9e
          * this special extended attribute is allowed only on root
cb8e9e
@@ -1150,6 +1254,18 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
         if (name && (strncmp (name, GLUSTERFS_GET_OBJECT_SIGNATURE,
cb8e9e
                               strlen (GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0)) {
cb8e9e
                 cookie = (void *) BR_STUB_REQUEST_COOKIE;
cb8e9e
+
cb8e9e
+                local = br_stub_alloc_local (this);
cb8e9e
+                if (!local) {
cb8e9e
+                        op_ret = -1;
cb8e9e
+                        op_errno = ENOMEM;
cb8e9e
+                        goto unwind;
cb8e9e
+                }
cb8e9e
+
cb8e9e
+                br_stub_fill_local (local, NULL, fd, fd->inode,
cb8e9e
+                                    fd->inode->gfid,
cb8e9e
+                                    BR_STUB_NO_VERSIONING, 0);
cb8e9e
+                frame->local = local;
cb8e9e
         }
cb8e9e
 
cb8e9e
  wind:
cb8e9e
@@ -1157,6 +1273,9 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
                       (frame, cbk, cookie, FIRST_CHILD (this),
cb8e9e
                        FIRST_CHILD (this)->fops->fgetxattr, fd, name, xdata);
cb8e9e
         return 0;
cb8e9e
+unwind:
cb8e9e
+        STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
cb8e9e
+        return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
 /**
cb8e9e
-- 
cb8e9e
1.7.1
cb8e9e