cb8e9e
From 27465e9f8b567db4a5265b1cfd0f08f300667416 Mon Sep 17 00:00:00 2001
cb8e9e
From: Raghavendra Bhat <raghavendra@redhat.com>
cb8e9e
Date: Tue, 26 May 2015 19:22:14 +0530
cb8e9e
Subject: [PATCH 185/190] features/bit-rot-stub: deny access to bad objects
cb8e9e
cb8e9e
       Backport of http://review.gluster.org/11126
cb8e9e
cb8e9e
* Access to bad objects (especially operations such as open, readv, writev)
cb8e9e
  should be denied to prevent applications from getting wrong data.
cb8e9e
cb8e9e
* Do not allow anyone apart from scrubber to set bad object xattr.
cb8e9e
cb8e9e
* Do not allow bad object xattr to be removed.
cb8e9e
cb8e9e
Change-Id: Id4e43b8318a7b0822231485c60bbc551b9adf7e8
cb8e9e
BUG: 1224227
cb8e9e
Signed-off-by: Raghavendra Bhat <raghavendra@redhat.com>
cb8e9e
Reviewed-on: https://code.engineering.redhat.com/gerrit/51757
cb8e9e
Reviewed-by: Venky Shankar <vshankar@redhat.com>
cb8e9e
Tested-by: Venky Shankar <vshankar@redhat.com>
cb8e9e
---
cb8e9e
 libglusterfs/src/glusterfs.h                       |    3 +
cb8e9e
 xlators/features/bit-rot/src/bitd/bit-rot.c        |    7 +-
cb8e9e
 xlators/features/bit-rot/src/stub/bit-rot-common.h |   15 +-
cb8e9e
 .../bit-rot/src/stub/bit-rot-stub-messages.h       |   28 ++
cb8e9e
 xlators/features/bit-rot/src/stub/bit-rot-stub.c   |  310 ++++++++++++++++++--
cb8e9e
 xlators/features/bit-rot/src/stub/bit-rot-stub.h   |   94 ++++++
cb8e9e
 xlators/performance/quick-read/src/quick-read.c    |    5 +
cb8e9e
 7 files changed, 430 insertions(+), 32 deletions(-)
cb8e9e
cb8e9e
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
cb8e9e
index c00bf55..97965ab 100644
cb8e9e
--- a/libglusterfs/src/glusterfs.h
cb8e9e
+++ b/libglusterfs/src/glusterfs.h
cb8e9e
@@ -127,6 +127,9 @@
cb8e9e
 #define BITROT_CURRENT_VERSION_KEY  "trusted.bit-rot.version"
cb8e9e
 #define BITROT_SIGNING_VERSION_KEY  "trusted.bit-rot.signature"
cb8e9e
 
cb8e9e
+/* globally usable bad file marker */
cb8e9e
+#define GLUSTERFS_BAD_INODE         "glusterfs.bad-inode"
cb8e9e
+
cb8e9e
 /* on-disk size of signing xattr (not the signature itself) */
cb8e9e
 #define BITROT_SIGNING_XATTR_SIZE_KEY  "trusted.glusterfs.bit-rot.size"
cb8e9e
 
cb8e9e
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
cb8e9e
index cf9e8e2..94063cb 100644
cb8e9e
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
cb8e9e
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
cb8e9e
@@ -164,11 +164,10 @@ bitd_is_bad_file (xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd)
cb8e9e
 
cb8e9e
         if (fd)
cb8e9e
                 ret = syncop_fgetxattr (child->xl, fd, &xattr,
cb8e9e
-                                        "trusted.glusterfs.bad-file", NULL,
cb8e9e
-                                        NULL);
cb8e9e
+                                        BITROT_OBJECT_BAD_KEY, NULL, NULL);
cb8e9e
         else if (loc)
cb8e9e
-                ret = syncop_getxattr (child->xl, loc, &xattr,
cb8e9e
-                                       "trusted.glusterfs.bad-file", NULL,
cb8e9e
+                ret = syncop_getxattr (child->xl, loc,
cb8e9e
+                                       &xattr, BITROT_OBJECT_BAD_KEY, NULL,
cb8e9e
                                        NULL);
cb8e9e
 
cb8e9e
         if (!ret) {
cb8e9e
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h
cb8e9e
index a8285d2..f8d03de 100644
cb8e9e
--- a/xlators/features/bit-rot/src/stub/bit-rot-common.h
cb8e9e
+++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h
cb8e9e
@@ -41,12 +41,23 @@ typedef enum br_sign_state {
cb8e9e
 } br_sign_state_t;
cb8e9e
 
cb8e9e
 static inline br_vxattr_status_t
cb8e9e
-br_version_xattr_state (dict_t *xattr,
cb8e9e
-                        br_version_t **obuf, br_signature_t **sbuf)
cb8e9e
+br_version_xattr_state (dict_t *xattr, br_version_t **obuf,
cb8e9e
+                        br_signature_t **sbuf, gf_boolean_t *objbad)
cb8e9e
 {
cb8e9e
         int32_t             ret    = 0;
cb8e9e
         int32_t             vxattr = 0;
cb8e9e
         br_vxattr_status_t  status;
cb8e9e
+        void               *data   = NULL;
cb8e9e
+
cb8e9e
+        /**
cb8e9e
+         * The key being present in the dict indicates the xattr was set on
cb8e9e
+         * disk. The presence of xattr itself as of now is sufficient to say
cb8e9e
+         * that the object is bad.
cb8e9e
+         */
cb8e9e
+        *objbad = _gf_false;
cb8e9e
+        ret = dict_get_bin (xattr, BITROT_OBJECT_BAD_KEY, (void **)&data);
cb8e9e
+        if (!ret)
cb8e9e
+                *objbad = _gf_true;
cb8e9e
 
cb8e9e
         ret = dict_get_bin (xattr, BITROT_CURRENT_VERSION_KEY, (void **)obuf);
cb8e9e
         if (ret)
cb8e9e
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h
cb8e9e
index d940b65..db5736a 100644
cb8e9e
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h
cb8e9e
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h
cb8e9e
@@ -153,6 +153,34 @@
cb8e9e
  * @recommendedaction
cb8e9e
  *
cb8e9e
  */
cb8e9e
+#define BRS_MSG_BAD_OBJ_MARK_FAIL           (GLFS_BITROT_STUB_BASE + 16)
cb8e9e
+/*!
cb8e9e
+ * @messageid
cb8e9e
+ * @diagnosis
cb8e9e
+ * @recommendedaction
cb8e9e
+ *
cb8e9e
+ */
cb8e9e
+#define BRS_MSG_NON_SCRUB_BAD_OBJ_MARK      (GLFS_BITROT_STUB_BASE + 17)
cb8e9e
+/*!
cb8e9e
+ * @messageid
cb8e9e
+ * @diagnosis
cb8e9e
+ * @recommendedaction
cb8e9e
+ *
cb8e9e
+ */
cb8e9e
+#define BRS_MSG_REMOVE_BAD_OBJECT_XATTR     (GLFS_BITROT_STUB_BASE + 18)
cb8e9e
+/*!
cb8e9e
+ * @messageid
cb8e9e
+ * @diagnosis
cb8e9e
+ * @recommendedaction
cb8e9e
+ *
cb8e9e
+ */
cb8e9e
+#define BRS_MSG_BAD_OBJECT_ACCESS           (GLFS_BITROT_STUB_BASE + 20)
cb8e9e
+/*!
cb8e9e
+ * @messageid
cb8e9e
+ * @diagnosis
cb8e9e
+ * @recommendedaction
cb8e9e
+ *
cb8e9e
+ */
cb8e9e
 /*------------*/
cb8e9e
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
cb8e9e
 #endif /* !_BITROT_STUB_MESSAGES_H_ */
cb8e9e
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
cb8e9e
index 600eb80..de81510 100644
cb8e9e
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c
cb8e9e
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
cb8e9e
@@ -237,7 +237,8 @@ br_stub_prepare_signing_request (dict_t *dict,
cb8e9e
  */
cb8e9e
 static inline int
cb8e9e
 br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode,
cb8e9e
-                             unsigned long version, gf_boolean_t markdirty)
cb8e9e
+                             unsigned long version, gf_boolean_t markdirty,
cb8e9e
+                             gf_boolean_t bad_object)
cb8e9e
 {
cb8e9e
         int32_t ret = 0;
cb8e9e
         br_stub_inode_ctx_t *ctx = NULL;
cb8e9e
@@ -252,17 +253,21 @@ br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode,
cb8e9e
                 : __br_stub_mark_inode_synced (ctx);
cb8e9e
         __br_stub_set_ongoing_version (ctx, version);
cb8e9e
 
cb8e9e
+        if (bad_object)
cb8e9e
+                __br_stub_mark_object_bad (ctx);
cb8e9e
+
cb8e9e
         if (fd) {
cb8e9e
                 ret = br_stub_add_fd_to_inode (this, fd, ctx);
cb8e9e
                 if (ret)
cb8e9e
                         goto free_ctx;
cb8e9e
         }
cb8e9e
+
cb8e9e
         ret = br_stub_set_inode_ctx (this, inode, ctx);
cb8e9e
         if (ret)
cb8e9e
                 goto free_ctx;
cb8e9e
         return 0;
cb8e9e
 
cb8e9e
- free_ctx:
cb8e9e
+free_ctx:
cb8e9e
         GF_FREE (ctx);
cb8e9e
  error_return:
cb8e9e
         return -1;
cb8e9e
@@ -290,7 +295,7 @@ br_stub_mod_inode_versions (xlator_t *this,
cb8e9e
 
cb8e9e
                 ret = 0;
cb8e9e
         }
cb8e9e
- unblock:
cb8e9e
+unblock:
cb8e9e
         UNLOCK (&inode->lock);
cb8e9e
 
cb8e9e
         return ret;
cb8e9e
@@ -623,7 +628,7 @@ int32_t
cb8e9e
 br_stub_perform_objsign (call_frame_t *frame, xlator_t *this,
cb8e9e
                          fd_t *fd, dict_t *dict, int flags, dict_t *xdata)
cb8e9e
 {
cb8e9e
-        STACK_WIND (frame, default_setxattr_cbk,
cb8e9e
+        STACK_WIND (frame, default_fsetxattr_cbk,
cb8e9e
                     FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd,
cb8e9e
                     dict, flags, xdata);
cb8e9e
 
cb8e9e
@@ -900,13 +905,101 @@ br_stub_handle_object_reopen (call_frame_t *frame,
cb8e9e
         STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, NULL);
cb8e9e
 }
cb8e9e
 
cb8e9e
+/**
cb8e9e
+ * This function only handles bad file identification. Instead of checking in
cb8e9e
+ * fops like open, readv, writev whether the object is bad or not by doing
cb8e9e
+ * getxattr calls, better to catch them when scrubber marks it as bad.
cb8e9e
+ * So this callback is called only when the fsetxattr is sent by the scrubber
cb8e9e
+ * to mark the object as bad.
cb8e9e
+ */
cb8e9e
+int
cb8e9e
+br_stub_fsetxattr_bad_object_cbk (call_frame_t *frame, void *cookie,
cb8e9e
+                                  xlator_t *this, int32_t op_ret,
cb8e9e
+                                  int32_t op_errno, dict_t *xdata)
cb8e9e
+{
cb8e9e
+        br_stub_local_t *local = NULL;
cb8e9e
+        int32_t          ret   = -1;
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+        frame->local = NULL;
cb8e9e
+
cb8e9e
+        if (op_ret < 0)
cb8e9e
+                goto unwind;
cb8e9e
+
cb8e9e
+        /*
cb8e9e
+         * What to do if marking the object as bad fails? (i.e. in memory
cb8e9e
+         * marking within the inode context. If we are here means fsetxattr
cb8e9e
+         * fop has succeeded on disk and the bad object xattr has been set).
cb8e9e
+         * We can return failure to scrubber, but there is nothing the scrubber
cb8e9e
+         * can do with it (it might assume that the on disk setxattr itself has
cb8e9e
+         * failed). The main purpose of this operation is to help identify the
cb8e9e
+         * bad object by checking the inode context itself (thus avoiding the
cb8e9e
+         * necessity of doing a getxattr fop on the disk).
cb8e9e
+         *
cb8e9e
+         * So as of now, success itself is being returned even though inode
cb8e9e
+         * context set operation fails.
cb8e9e
+         * In future if there is any change in the policy which can handle this,
cb8e9e
+         * then appropriate response should be sent (i.e. success or error).
cb8e9e
+         */
cb8e9e
+        ret = br_stub_mark_object_bad (this, local->u.context.inode);
cb8e9e
+        if (ret)
cb8e9e
+                gf_msg (this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_MARK_FAIL,
cb8e9e
+                        "failed to mark object %s as bad",
cb8e9e
+                        uuid_utoa (local->u.context.inode->gfid));
cb8e9e
+
cb8e9e
+unwind:
cb8e9e
+        STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata);
cb8e9e
+        br_stub_cleanup_local (local);
cb8e9e
+        br_stub_dealloc_local (local);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+static int32_t
cb8e9e
+br_stub_handle_bad_object_key (call_frame_t *frame, xlator_t *this, fd_t *fd,
cb8e9e
+                               dict_t *dict, int flags, dict_t *xdata)
cb8e9e
+{
cb8e9e
+        br_stub_local_t *local    = NULL;
cb8e9e
+        int32_t          op_ret   = -1;
cb8e9e
+        int32_t         op_errno = EINVAL;
cb8e9e
+
cb8e9e
+        if (frame->root->pid != GF_CLIENT_PID_SCRUB) {
cb8e9e
+                gf_msg (this->name, GF_LOG_ERROR, 0,
cb8e9e
+                        BRS_MSG_NON_SCRUB_BAD_OBJ_MARK, "bad object marking "
cb8e9e
+                        "on %s is not from the scrubber",
cb8e9e
+                        uuid_utoa (fd->inode->gfid));
cb8e9e
+                goto unwind;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        local = br_stub_alloc_local (this);
cb8e9e
+        if (!local) {
cb8e9e
+                gf_msg (this->name, GF_LOG_ERROR, 0, BRS_MSG_NO_MEMORY,
cb8e9e
+                        "failed to allocate memory for fsetxattr on %s",
cb8e9e
+                        uuid_utoa (fd->inode->gfid));
cb8e9e
+                op_ret = -1;
cb8e9e
+                op_errno = ENOMEM;
cb8e9e
+                goto unwind;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        br_stub_fill_local (local, NULL, fd, fd->inode,
cb8e9e
+                            fd->inode->gfid, BR_STUB_NO_VERSIONING, 0);
cb8e9e
+        frame->local = local;
cb8e9e
+
cb8e9e
+        STACK_WIND (frame, br_stub_fsetxattr_bad_object_cbk, FIRST_CHILD (this),
cb8e9e
+                    FIRST_CHILD (this)->fops->fsetxattr, fd, dict, flags,
cb8e9e
+                    xdata);
cb8e9e
+        return 0;
cb8e9e
+unwind:
cb8e9e
+        STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, NULL);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
 int
cb8e9e
 br_stub_fsetxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
                    fd_t *fd, dict_t *dict, int flags, dict_t *xdata)
cb8e9e
 {
cb8e9e
-        int32_t          ret  = 0;
cb8e9e
-        uint32_t val = 0;
cb8e9e
-        br_isignature_t *sign = NULL;
cb8e9e
+        int32_t              ret      = 0;
cb8e9e
+        uint32_t             val      = 0;
cb8e9e
+        br_isignature_t     *sign     = NULL;
cb8e9e
 
cb8e9e
         if (!IA_ISREG (fd->inode->ia_type))
cb8e9e
                 goto wind;
cb8e9e
@@ -927,11 +1020,18 @@ br_stub_fsetxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
                 goto done;
cb8e9e
         }
cb8e9e
 
cb8e9e
- wind:
cb8e9e
-        STACK_WIND (frame, default_setxattr_cbk,
cb8e9e
-                    FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd,
cb8e9e
-                    dict, flags, xdata);
cb8e9e
- done:
cb8e9e
+        /* handle bad object */
cb8e9e
+        if (dict_get (dict, BITROT_OBJECT_BAD_KEY)) {
cb8e9e
+                br_stub_handle_bad_object_key (frame, this, fd,
cb8e9e
+                                               dict, flags, xdata);
cb8e9e
+                goto done;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+wind:
cb8e9e
+        STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD (this),
cb8e9e
+                    FIRST_CHILD (this)->fops->fsetxattr, fd, dict, flags,
cb8e9e
+                    xdata);
cb8e9e
+done:
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
@@ -940,6 +1040,59 @@ br_stub_fsetxattr (call_frame_t *frame, xlator_t *this,
cb8e9e
 
cb8e9e
 /** {{{ */
cb8e9e
 
cb8e9e
+/* {f}removexattr() */
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+br_stub_removexattr (call_frame_t *frame, xlator_t *this,
cb8e9e
+                     loc_t *loc, const char *name, dict_t *xdata)
cb8e9e
+{
cb8e9e
+        int32_t op_ret    = -1;
cb8e9e
+        int32_t op_errno  = EINVAL;
cb8e9e
+
cb8e9e
+        if (!strcmp (BITROT_OBJECT_BAD_KEY, name)) {
cb8e9e
+                gf_msg (this->name, GF_LOG_WARNING, 0,
cb8e9e
+                        BRS_MSG_REMOVE_BAD_OBJECT_XATTR, "Remove xattr called"
cb8e9e
+                        " on bad object xattr for file %s", loc->path);
cb8e9e
+                goto unwind;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        STACK_WIND_TAIL (frame, FIRST_CHILD(this),
cb8e9e
+                         FIRST_CHILD(this)->fops->removexattr,
cb8e9e
+                         loc, name, xdata);
cb8e9e
+        return 0;
cb8e9e
+unwind:
cb8e9e
+        STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, NULL);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+br_stub_fremovexattr (call_frame_t *frame, xlator_t *this,
cb8e9e
+                      fd_t *fd, const char *name, dict_t *xdata)
cb8e9e
+{
cb8e9e
+        int32_t op_ret    = -1;
cb8e9e
+        int32_t op_errno  = EINVAL;
cb8e9e
+
cb8e9e
+        if (!strcmp (BITROT_OBJECT_BAD_KEY, name)) {
cb8e9e
+                gf_msg (this->name, GF_LOG_WARNING, 0,
cb8e9e
+                        BRS_MSG_REMOVE_BAD_OBJECT_XATTR, "Remove xattr called"
cb8e9e
+                        " on bad object xattr for inode %s",
cb8e9e
+                        uuid_utoa (fd->inode->gfid));
cb8e9e
+                goto unwind;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        STACK_WIND_TAIL (frame, FIRST_CHILD(this),
cb8e9e
+                         FIRST_CHILD(this)->fops->fremovexattr,
cb8e9e
+                         fd, name, xdata);
cb8e9e
+        return 0;
cb8e9e
+unwind:
cb8e9e
+        STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, NULL);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+/** }}} */
cb8e9e
+
cb8e9e
+/** {{{ */
cb8e9e
+
cb8e9e
 /* {f}getxattr() */
cb8e9e
 
cb8e9e
 int
cb8e9e
@@ -1044,6 +1197,7 @@ br_stub_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
         br_vxattr_status_t   status;
cb8e9e
         br_stub_local_t     *local        = NULL;
cb8e9e
         inode_t             *inode        = NULL;
cb8e9e
+        gf_boolean_t         bad_object   = _gf_false;
cb8e9e
 
cb8e9e
         if (op_ret < 0)
cb8e9e
                 goto unwind;
cb8e9e
@@ -1055,7 +1209,11 @@ br_stub_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
         inode = local->u.context.inode;
cb8e9e
 
cb8e9e
         op_ret   = -1;
cb8e9e
-        status = br_version_xattr_state (xattr, &obuf, &sbuf);
cb8e9e
+        status = br_version_xattr_state (xattr, &obuf, &sbuf, &bad_object);
cb8e9e
+
cb8e9e
+        op_errno = EIO;
cb8e9e
+        if (bad_object)
cb8e9e
+                goto delkeys;
cb8e9e
 
cb8e9e
         op_errno = EINVAL;
cb8e9e
         if (status == BR_VXATTR_STATUS_INVALID)
cb8e9e
@@ -1286,6 +1444,31 @@ unwind:
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
+int32_t
cb8e9e
+br_stub_readv (call_frame_t *frame, xlator_t *this,
cb8e9e
+               fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
cb8e9e
+{
cb8e9e
+        int32_t              op_ret   = -1;
cb8e9e
+        int32_t              op_errno = EINVAL;
cb8e9e
+
cb8e9e
+        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind);
cb8e9e
+
cb8e9e
+        BR_STUB_HANDLE_BAD_OBJECT (this, fd->inode, op_ret, op_errno, unwind);
cb8e9e
+
cb8e9e
+        STACK_WIND_TAIL (frame, FIRST_CHILD(this),
cb8e9e
+                         FIRST_CHILD(this)->fops->readv, fd, size, offset,
cb8e9e
+                         flags, xdata);
cb8e9e
+        return 0;
cb8e9e
+
cb8e9e
+unwind:
cb8e9e
+        STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, NULL, 0, NULL,
cb8e9e
+                             NULL, NULL);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
 /**
cb8e9e
  * The first write response on the first fd in the list of fds will set
cb8e9e
  * the flag to indicate that the inode is modified. The subsequent write
cb8e9e
@@ -1367,6 +1550,8 @@ br_stub_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
cb8e9e
         if (ret)
cb8e9e
                 goto unwind;
cb8e9e
 
cb8e9e
+        BR_STUB_HANDLE_BAD_OBJECT (this, fd->inode, op_ret, op_errno, unwind);
cb8e9e
+
cb8e9e
         /**
cb8e9e
          * The inode is not dirty and also witnessed atleast one successful
cb8e9e
          * modification operation. Therefore, subsequent operations need not
cb8e9e
@@ -1486,6 +1671,8 @@ br_stub_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
cb8e9e
         if (ret)
cb8e9e
                 goto unwind;
cb8e9e
 
cb8e9e
+        BR_STUB_HANDLE_BAD_OBJECT (this, fd->inode, op_ret, op_errno, unwind);
cb8e9e
+
cb8e9e
         if (!inc_version && modified)
cb8e9e
                 goto wind;
cb8e9e
 
cb8e9e
@@ -1616,6 +1803,8 @@ br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
cb8e9e
         if (ret)
cb8e9e
                 goto cleanup_fd;
cb8e9e
 
cb8e9e
+        BR_STUB_HANDLE_BAD_OBJECT (this, fd->inode, op_ret, op_errno, unwind);
cb8e9e
+
cb8e9e
         if (!inc_version && modified)
cb8e9e
                 goto wind;
cb8e9e
 
cb8e9e
@@ -1689,15 +1878,14 @@ br_stub_open (call_frame_t *frame, xlator_t *this,
cb8e9e
         int32_t              ret      = -1;
cb8e9e
         br_stub_inode_ctx_t *ctx      = NULL;
cb8e9e
         uint64_t             ctx_addr = 0;
cb8e9e
+        int32_t              op_ret   = -1;
cb8e9e
+        int32_t              op_errno = EINVAL;
cb8e9e
 
cb8e9e
         GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
cb8e9e
         GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
cb8e9e
         GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
cb8e9e
         GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind);
cb8e9e
 
cb8e9e
-        if (frame->root->pid == GF_CLIENT_PID_SCRUB)
cb8e9e
-                goto wind;
cb8e9e
-
cb8e9e
         ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
cb8e9e
         if (ret) {
cb8e9e
                 gf_msg (this->name, GF_LOG_ERROR, 0,
cb8e9e
@@ -1708,6 +1896,12 @@ br_stub_open (call_frame_t *frame, xlator_t *this,
cb8e9e
         }
cb8e9e
 
cb8e9e
         ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
cb8e9e
+
cb8e9e
+        BR_STUB_HANDLE_BAD_OBJECT (this, loc->inode, op_ret, op_errno, unwind);
cb8e9e
+
cb8e9e
+        if (frame->root->pid == GF_CLIENT_PID_SCRUB)
cb8e9e
+                goto wind;
cb8e9e
+
cb8e9e
         if (flags == O_RDONLY)
cb8e9e
                 goto wind;
cb8e9e
 
cb8e9e
@@ -1725,7 +1919,7 @@ wind:
cb8e9e
                     FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata);
cb8e9e
         return 0;
cb8e9e
 unwind:
cb8e9e
-        STACK_UNWIND_STRICT (open, frame, -1, EINVAL, NULL, NULL);
cb8e9e
+        STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, NULL, NULL);
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
@@ -1784,7 +1978,7 @@ br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
         ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
cb8e9e
         if (ret < 0) {
cb8e9e
                 ret = br_stub_init_inode_versions (this, fd, inode, version,
cb8e9e
-                                                   _gf_true);
cb8e9e
+                                                   _gf_true, _gf_false);
cb8e9e
                 if (ret) {
cb8e9e
                         op_ret = -1;
cb8e9e
                         op_errno = EINVAL;
cb8e9e
@@ -1834,7 +2028,7 @@ br_stub_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
                 goto unwind;
cb8e9e
 
cb8e9e
         ret = br_stub_init_inode_versions (this, NULL, inode, version,
cb8e9e
-                                           _gf_true);
cb8e9e
+                                           _gf_true, _gf_false);
cb8e9e
         /**
cb8e9e
          * Like lookup, if init_inode_versions fail, return EINVAL
cb8e9e
          */
cb8e9e
@@ -1869,6 +2063,23 @@ unwind:
cb8e9e
 
cb8e9e
 /** }}} */
cb8e9e
 
cb8e9e
+/**
cb8e9e
+ * As of now, only lookup searches for bad object xattr and marks the
cb8e9e
+ * object as bad in its inode context if the xattr is present. But there
cb8e9e
+ * is a possibility that, at the time of the lookup the object was not
cb8e9e
+ * marked bad (i.e. bad object xattr was not set), and later it is marked
cb8e9e
+ * as bad. In this case, object is not bad, so when a fop such as open or
cb8e9e
+ * readv or writev comes on the object, the fop will be sent downward instead
cb8e9e
+ * of sending as error upwards.
cb8e9e
+ * The solution for this is to do a getxattr for the below list of fops.
cb8e9e
+ * lookup, readdirp, open, readv, writev.
cb8e9e
+ * But doing getxattr for each of the above fops might be costly.
cb8e9e
+ * So another method followed is to catch the bad file marking by the scrubber
cb8e9e
+ * and set that info within the object's inode context. In this way getxattr
cb8e9e
+ * calls can be avoided and bad objects can be caught instantly. Fetching the
cb8e9e
+ * xattr is needed only in lookups when there is a brick restart or inode
cb8e9e
+ * forget.
cb8e9e
+ */
cb8e9e
 static inline int32_t
cb8e9e
 br_stub_lookup_version (xlator_t *this,
cb8e9e
                         uuid_t gfid, inode_t *inode, dict_t *xattr)
cb8e9e
@@ -1877,6 +2088,7 @@ br_stub_lookup_version (xlator_t *this,
cb8e9e
         br_version_t       *obuf    = NULL;
cb8e9e
         br_signature_t     *sbuf    = NULL;
cb8e9e
         br_vxattr_status_t  status;
cb8e9e
+        gf_boolean_t        bad_object = _gf_false;
cb8e9e
 
cb8e9e
         /**
cb8e9e
          * versioning xattrs were requested from POSIX. if available, figure
cb8e9e
@@ -1886,13 +2098,13 @@ br_stub_lookup_version (xlator_t *this,
cb8e9e
          * operation (such as write(), etc..) triggers synchronization to
cb8e9e
          * disk.
cb8e9e
          */
cb8e9e
-        status = br_version_xattr_state (xattr, &obuf, &sbuf);
cb8e9e
-
cb8e9e
+        status = br_version_xattr_state (xattr, &obuf, &sbuf, &bad_object);
cb8e9e
         version = ((status == BR_VXATTR_STATUS_FULL)
cb8e9e
                    || (status == BR_VXATTR_STATUS_UNSIGNED))
cb8e9e
                         ? obuf->ongoingversion : BITROT_DEFAULT_CURRENT_VERSION;
cb8e9e
-        return br_stub_init_inode_versions (this, NULL,
cb8e9e
-                                            inode, version, _gf_true);
cb8e9e
+
cb8e9e
+        return br_stub_init_inode_versions (this, NULL, inode, version,
cb8e9e
+                                            _gf_true, bad_object);
cb8e9e
 }
cb8e9e
 
cb8e9e
 
cb8e9e
@@ -1975,6 +2187,9 @@ br_stub_readdirp (call_frame_t *frame, xlator_t *this,
cb8e9e
         ret = dict_set_uint32 (dict, BITROT_SIGNING_VERSION_KEY, 0);
cb8e9e
         if (ret)
cb8e9e
                 goto unwind;
cb8e9e
+        ret = dict_set_uint32 (dict, BITROT_OBJECT_BAD_KEY, 0);
cb8e9e
+        if (ret)
cb8e9e
+                goto unwind;
cb8e9e
 
cb8e9e
         STACK_WIND (frame, br_stub_readdirp_cbk, FIRST_CHILD (this),
cb8e9e
                     FIRST_CHILD(this)->fops->readdirp, fd, size,
cb8e9e
@@ -2009,18 +2224,51 @@ br_stub_lookup_cbk (call_frame_t *frame, void *cookie,
cb8e9e
                 goto unwind;
cb8e9e
         if (!IA_ISREG (stbuf->ia_type))
cb8e9e
                 goto unwind;
cb8e9e
-        if (cookie != (void *) BR_STUB_REQUEST_COOKIE)
cb8e9e
+
cb8e9e
+        /**
cb8e9e
+         * If the object is bad, then "bad inode" marker has to be sent back
cb8e9e
+         * in response, for revalidated lookups as well. Some xlators such as
cb8e9e
+         * quick-read might cache the data in revalidated lookup as fresh
cb8e9e
+         * lookup would anyway have sent "bad inode" marker.
cb8e9e
+         * In general send bad inode marker for every lookup operation on the
cb8e9e
+         * bad object.
cb8e9e
+         */
cb8e9e
+        if (cookie != (void *) BR_STUB_REQUEST_COOKIE) {
cb8e9e
+                ret =  br_stub_mark_xdata_bad_object (this, inode, xattr);
cb8e9e
+                if (ret) {
cb8e9e
+                        op_ret = -1;
cb8e9e
+                        op_errno = EIO;
cb8e9e
+                        goto unwind;
cb8e9e
+                }
cb8e9e
+
cb8e9e
                 goto delkey;
cb8e9e
+        }
cb8e9e
 
cb8e9e
         ret = br_stub_lookup_version (this, stbuf->ia_gfid, inode, xattr);
cb8e9e
         if (ret < 0) {
cb8e9e
                 op_ret   = -1;
cb8e9e
                 op_errno = EINVAL;
cb8e9e
+                goto delkey;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        /**
cb8e9e
+         * If the object is bad, send "bad inode" marker back in response
cb8e9e
+         * for xlator(s) to act accordingly (such as quick-read, etc..)
cb8e9e
+         */
cb8e9e
+        ret = br_stub_mark_xdata_bad_object (this, inode, xattr);
cb8e9e
+        if (ret) {
cb8e9e
+                /**
cb8e9e
+                 * aaha! bad object, but sorry we would not
cb8e9e
+                 * satisfy the request on allocation failures.
cb8e9e
+                 */
cb8e9e
+                op_ret = -1;
cb8e9e
+                op_errno = EIO;
cb8e9e
+                goto unwind;
cb8e9e
         }
cb8e9e
 
cb8e9e
- delkey:
cb8e9e
+delkey:
cb8e9e
         br_stub_remove_vxattrs (xattr);
cb8e9e
- unwind:
cb8e9e
+unwind:
cb8e9e
         STACK_UNWIND_STRICT (lookup, frame,
cb8e9e
                              op_ret, op_errno, inode, stbuf, xattr, postparent);
cb8e9e
 
cb8e9e
@@ -2037,6 +2285,10 @@ br_stub_lookup (call_frame_t *frame,
cb8e9e
         uint64_t ctx_addr = 0;
cb8e9e
         gf_boolean_t xref = _gf_false;
cb8e9e
 
cb8e9e
+        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind);
cb8e9e
+
cb8e9e
         ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr);
cb8e9e
         if (ret < 0)
cb8e9e
                 ctx_addr = 0;
cb8e9e
@@ -2069,6 +2321,9 @@ br_stub_lookup (call_frame_t *frame,
cb8e9e
         ret = dict_set_uint32 (xdata, BITROT_SIGNING_VERSION_KEY, 0);
cb8e9e
         if (ret)
cb8e9e
                 goto unwind;
cb8e9e
+        ret = dict_set_uint32 (xdata, BITROT_OBJECT_BAD_KEY, 0);
cb8e9e
+        if (ret)
cb8e9e
+                goto unwind;
cb8e9e
         cookie = (void *) BR_STUB_REQUEST_COOKIE;
cb8e9e
 
cb8e9e
  wind:
cb8e9e
@@ -2335,6 +2590,9 @@ struct xlator_fops fops = {
cb8e9e
         .truncate  = br_stub_truncate,
cb8e9e
         .ftruncate = br_stub_ftruncate,
cb8e9e
         .mknod     = br_stub_mknod,
cb8e9e
+        .readv     = br_stub_readv,
cb8e9e
+        .removexattr = br_stub_removexattr,
cb8e9e
+        .fremovexattr = br_stub_fremovexattr,
cb8e9e
 };
cb8e9e
 
cb8e9e
 struct xlator_cbks cbks = {
cb8e9e
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
cb8e9e
index 48c7a37..e5649fc 100644
cb8e9e
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.h
cb8e9e
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
cb8e9e
@@ -37,6 +37,7 @@ typedef struct br_stub_inode_ctx {
cb8e9e
         int            info_sign;
cb8e9e
         struct list_head fd_list; /* list of open fds or fds participating in
cb8e9e
                                      write operations */
cb8e9e
+        gf_boolean_t bad_object;
cb8e9e
 } br_stub_inode_ctx_t;
cb8e9e
 
cb8e9e
 typedef struct br_stub_fd {
cb8e9e
@@ -85,6 +86,18 @@ typedef struct br_stub_private {
cb8e9e
         struct mem_pool *local_pool;
cb8e9e
 } br_stub_private_t;
cb8e9e
 
cb8e9e
+static inline gf_boolean_t
cb8e9e
+__br_stub_is_bad_object (br_stub_inode_ctx_t *ctx)
cb8e9e
+{
cb8e9e
+        return ctx->bad_object;
cb8e9e
+}
cb8e9e
+
cb8e9e
+static inline void
cb8e9e
+__br_stub_mark_object_bad (br_stub_inode_ctx_t *ctx)
cb8e9e
+{
cb8e9e
+        ctx->bad_object = _gf_true;
cb8e9e
+}
cb8e9e
+
cb8e9e
 /* inode writeback helpers */
cb8e9e
 static inline void
cb8e9e
 __br_stub_mark_inode_dirty (br_stub_inode_ctx_t *ctx)
cb8e9e
@@ -370,12 +383,93 @@ static inline void
cb8e9e
 br_stub_remove_vxattrs (dict_t *xattr)
cb8e9e
 {
cb8e9e
         if (xattr) {
cb8e9e
+                dict_del (xattr, BITROT_OBJECT_BAD_KEY);
cb8e9e
                 dict_del (xattr, BITROT_CURRENT_VERSION_KEY);
cb8e9e
                 dict_del (xattr, BITROT_SIGNING_VERSION_KEY);
cb8e9e
                 dict_del (xattr, BITROT_SIGNING_XATTR_SIZE_KEY);
cb8e9e
         }
cb8e9e
 }
cb8e9e
 
cb8e9e
+#define BR_STUB_HANDLE_BAD_OBJECT(this, inode, op_ret, op_errno, label) \
cb8e9e
+        do {                                                            \
cb8e9e
+                if (br_stub_is_bad_object (this, inode)) {              \
cb8e9e
+                        gf_msg (this->name, GF_LOG_ERROR, 0,            \
cb8e9e
+                                 BRS_MSG_BAD_OBJECT_ACCESS,             \
cb8e9e
+                                 "%s is a bad object. Returning",       \
cb8e9e
+                                 uuid_utoa (inode->gfid));              \
cb8e9e
+                        op_ret = -1;                                    \
cb8e9e
+                        op_errno = EIO;                                 \
cb8e9e
+                        goto label;                                     \
cb8e9e
+                }                                                       \
cb8e9e
+        } while (0)
cb8e9e
+
cb8e9e
+static inline gf_boolean_t
cb8e9e
+br_stub_is_bad_object (xlator_t *this, inode_t *inode)
cb8e9e
+{
cb8e9e
+        gf_boolean_t         bad_object = _gf_false;
cb8e9e
+        uint64_t             ctx_addr   = 0;
cb8e9e
+        br_stub_inode_ctx_t *ctx        = NULL;
cb8e9e
+        int32_t              ret        = -1;
cb8e9e
+
cb8e9e
+        ret = br_stub_get_inode_ctx (this, inode, &ctx_addr);
cb8e9e
+        if (ret) {
cb8e9e
+                gf_msg (this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_CONTEXT_FAILED,
cb8e9e
+                        "failed to get the inode context for the inode %s",
cb8e9e
+                        uuid_utoa (inode->gfid));
cb8e9e
+                goto out;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
cb8e9e
+
cb8e9e
+        LOCK (&inode->lock);
cb8e9e
+        {
cb8e9e
+                bad_object = __br_stub_is_bad_object (ctx);
cb8e9e
+        }
cb8e9e
+        UNLOCK (&inode->lock);
cb8e9e
+
cb8e9e
+out:
cb8e9e
+        return bad_object;
cb8e9e
+}
cb8e9e
+
cb8e9e
+static inline int32_t
cb8e9e
+br_stub_mark_object_bad (xlator_t *this, inode_t *inode)
cb8e9e
+{
cb8e9e
+        int32_t  ret = -1;
cb8e9e
+        uint64_t ctx_addr = 0;
cb8e9e
+        br_stub_inode_ctx_t *ctx = NULL;
cb8e9e
+
cb8e9e
+        ret = br_stub_get_inode_ctx (this, inode, &ctx_addr);
cb8e9e
+        if (ret) {
cb8e9e
+                gf_msg (this->name, GF_LOG_ERROR, 0,
cb8e9e
+                        BRS_MSG_GET_INODE_CONTEXT_FAILED, "failed to get the "
cb8e9e
+                        "inode context for the inode %s",
cb8e9e
+                        uuid_utoa (inode->gfid));
cb8e9e
+               goto out;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
cb8e9e
+
cb8e9e
+        LOCK (&inode->lock);
cb8e9e
+        {
cb8e9e
+                __br_stub_mark_object_bad (ctx);
cb8e9e
+        }
cb8e9e
+        UNLOCK (&inode->lock);
cb8e9e
+
cb8e9e
+out:
cb8e9e
+        return ret;
cb8e9e
+}
cb8e9e
+
cb8e9e
+static inline int32_t
cb8e9e
+br_stub_mark_xdata_bad_object (xlator_t *this, inode_t *inode, dict_t *xdata)
cb8e9e
+{
cb8e9e
+        int32_t    ret = 0;
cb8e9e
+
cb8e9e
+        if (br_stub_is_bad_object (this, inode))
cb8e9e
+                ret = dict_set_int32 (xdata, GLUSTERFS_BAD_INODE, 1);
cb8e9e
+
cb8e9e
+        return ret;
cb8e9e
+}
cb8e9e
+
cb8e9e
 int32_t
cb8e9e
 br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx);
cb8e9e
 
cb8e9e
diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c
cb8e9e
index c6913ee..1426ae5 100644
cb8e9e
--- a/xlators/performance/quick-read/src/quick-read.c
cb8e9e
+++ b/xlators/performance/quick-read/src/quick-read.c
cb8e9e
@@ -409,6 +409,11 @@ qr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
                 goto out;
cb8e9e
 	}
cb8e9e
 
cb8e9e
+        if (dict_get (xdata, GLUSTERFS_BAD_INODE)) {
cb8e9e
+                qr_inode_prune (this, inode);
cb8e9e
+                goto out;
cb8e9e
+        }
cb8e9e
+
cb8e9e
 	if (dict_get (xdata, "sh-failed")) {
cb8e9e
 		qr_inode_prune (this, inode);
cb8e9e
 		goto out;
cb8e9e
-- 
cb8e9e
1.7.1
cb8e9e