Blob Blame History Raw
From 994fc131f10953679b7bf70d0603f5ec8f878fd2 Mon Sep 17 00:00:00 2001
From: Poornima G <pgurusid@redhat.com>
Date: Sun, 4 Sep 2016 08:27:47 +0530
Subject: [PATCH 134/141] md-cache, afr: Reduce the window of stale read

Problem:
Consider a replica setup, where one mount writes data to a
file and the other mount reads the file. In afr, read operations
are not transaction based, a brick(read subvolume) is chosen as
a part of lookup or other operations, read is always wound only
to the read subvolume, even if there was write from a different client
that failed on this brick. This stale read continues until there is
a lookup or any write operation from the mount point. Currently, this
is not a major issue, as a lookup is issued before every read and it will
switch the read subvolume to a correct one. But with the plan of
increasing md-cache timeout to 600s, the stale read problem will be
more pronounced, i.e. stale read can continue for 600s(or more if cascaded
with readdirp), as there will be no lookups.

Solution:
Afr doesn't have any built-in solution for stale read(without affecting
the performance). The solution that came up, was to use upcall. When a file
on any brick is marked bad for the first time, upcall sends a notification
to all the clients that had recently accessed the file. The solution has
2 parts:
- Identifying when a file is marked bad, on any of the bricks,
  for the first time
- Client side actions on recieving the notifications

Identifying when a file is marked bad on any of the bricks for the first time:
-----------------------------------------------------------------------------
The idea is to track xattrop in upcall. xattrop currently comes with 2 afr
xattrs - afr dirty bit and afr pending xattrs.
   Dirty xattr is set to 1 before every write, and is unset if write succeeds.
In certain scenarios, dirty xattr can be 0 and still the file could be bad
copy. Hence do not track dirty xattr.
   Pending xattr is set on the good copy, indicating the other bricks that have
bad copy. It is still not as simple as, notifying when any of the pending xattrs
change. It could lead to flood of notifcations, in case the other brick is
completely down or consistantly failing. Hence it is important to notify only
once, the first time a good copy is marked bad.

Client side actions on recieving pending xattr change, notification:
--------------------------------------------------------------------
md-cache will invalidate the cache of that file, so that further lookup is
passed down to afr and hence update the read subvolume. Invalidating only in
md-cache is not enough, consider the folling oder of opertaions:
- pending xattr invalidation - invalidate md-cache
- readdirp on the bad read subvolume - fill md-cache
- lookup (served from md-cache)
- read - wound to the old read subvol.
Hence, along with invalidating md-cache, it is very important to reset the
read subvolume for that file, in afr.

Design Credit: Anuradha Talur, Ravishankar N

1. xattrop doesn't carry info saying post op/pre op.
2. Pre xattrop will have 0 value for all pending xattrs,
   the cbk of pre xattrop carries the on-disk xattr value.
   Non zero indicated healing is required.
3. Post xattrop will have non zero value for any of the
   pending xattrs, if the fop failed on any of the bricks.

Change-Id: I469cbc111714c433984fe1c922be2ef113c25804
BUG: 1284873
Signed-off-by: Poornima G <pgurusid@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/87047
Reviewed-by: Rajesh Joseph <rjoseph@redhat.com>
Tested-by: Rajesh Joseph <rjoseph@redhat.com>
---
 libglusterfs/src/glusterfs.h                  |    3 +
 tests/bugs/md-cache/afr-stale-read.t          |   44 ++++
 xlators/cluster/afr/src/afr-common.c          |   41 +++-
 xlators/cluster/afr/src/afr.h                 |    1 -
 xlators/features/upcall/src/upcall-internal.c |   60 +++++-
 xlators/features/upcall/src/upcall.c          |  330 ++++++++++++++++---------
 xlators/features/upcall/src/upcall.h          |    7 +-
 7 files changed, 370 insertions(+), 116 deletions(-)
 create mode 100755 tests/bugs/md-cache/afr-stale-read.t

diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index 8206e23..b5ae685 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -169,6 +169,9 @@
 #define VIRTUAL_QUOTA_XATTR_CLEANUP_KEY "glusterfs.quota-xattr-cleanup"
 #define QUOTA_READ_ONLY_KEY "trusted.glusterfs.quota.read-only"
 
+/* afr related */
+#define AFR_XATTR_PREFIX "trusted.afr"
+
 /* Index xlator related */
 #define GF_XATTROP_INDEX_GFID "glusterfs.xattrop_index_gfid"
 #define GF_XATTROP_ENTRY_CHANGES_GFID "glusterfs.xattrop_entry_changes_gfid"
diff --git a/tests/bugs/md-cache/afr-stale-read.t b/tests/bugs/md-cache/afr-stale-read.t
new file mode 100755
index 0000000..7cee5af
--- /dev/null
+++ b/tests/bugs/md-cache/afr-stale-read.t
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+#. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+#Basic checks
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info
+
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1..2};
+
+TEST $CLI volume set $V0 features.cache-invalidation on
+TEST $CLI volume set $V0 features.cache-invalidation-timeout 600
+TEST $CLI volume set $V0 performance.cache-invalidation on
+TEST $CLI volume set $V0 performance.md-cache-timeout 600
+TEST $CLI volume set $V0 performance.cache-samba-metadata on
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 read-subvolume $V0-client-0
+TEST $CLI volume set $V0 performance.quick-read off
+
+TEST $CLI volume start $V0
+
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M1
+
+#Write some data from M0 and read it from M1,
+#so that M1 selects a read subvol, and caches the lookup
+TEST `echo "one" > $M0/file1`
+EXPECT "one" cat $M1/file1
+
+#Fail few writes from M0 on brick-0, as a result of this failure
+#upcall in brick-0 will invalidate the read subvolume of M1.
+TEST chattr +i $B0/${V0}1/file1
+TEST `echo "two" > $M0/file1`
+TEST `echo "three" > $M0/file1`
+TEST `echo "four" > $M0/file1`
+TEST `echo "five" > $M0/file1`
+
+EXPECT_WITHIN $MDC_TIMEOUT "five" cat $M1/file1
+TEST chattr -i $B0/${V0}1/file1
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index d819a22..c2922fb 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -32,7 +32,7 @@
 #include "statedump.h"
 #include "inode.h"
 #include "events.h"
-
+#include "upcall-utils.h"
 #include "fd.h"
 
 #include "afr-inode-read.h"
@@ -4102,6 +4102,14 @@ afr_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
                 goto err;
 
         call_cnt = local->call_count;
+
+        if (xdata) {
+                for (i = 0; i < priv->child_count; i++) {
+                        if (dict_set_int8 (xdata, priv->pending_key[i], 0) < 0)
+                                goto err;
+                }
+        }
+
         for (i = 0; i < priv->child_count; i++) {
                 if (!local->child_up[i])
                         continue;
@@ -4339,6 +4347,10 @@ afr_notify (xlator_t *this, int32_t event,
         dict_t          *output             = NULL;
         gf_boolean_t    had_quorum          = _gf_false;
         gf_boolean_t    has_quorum          = _gf_false;
+        struct gf_upcall *up_data           = NULL;
+        struct gf_upcall_cache_invalidation *up_ci = NULL;
+        inode_table_t  *itable              = NULL;
+        inode_t        *inode               = NULL;
 
         priv = this->private;
 
@@ -4459,7 +4471,34 @@ afr_notify (xlator_t *this, int32_t event,
                 case GF_EVENT_SOME_CHILD_DOWN:
                         priv->last_event[idx] = event;
                         break;
+                case GF_EVENT_UPCALL:
+                        up_data = (struct gf_upcall *)data;
+                        if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
+                                break;
+                        up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
+
+                        /* Since md-cache will be aggressively filtering
+                         * lookups, the stale read issue will be more
+                         * pronounced. Hence when a pending xattr is set notify
+                         * all the md-cache clients to invalidate the existing
+                         * stat cache and send the lookup next time */
+                        if (up_ci->dict) {
+                                for (i = 0; i < priv->child_count; i++) {
+                                        if (dict_get (up_ci->dict, priv->pending_key[i])) {
+                                                 ret = dict_set_int8 (up_ci->dict,
+                                                                      MDC_INVALIDATE_IATT , 0);
+                                                 break;
+                                        }
+                                }
+                        }
+                        itable = ((xlator_t *)this->graph->top)->itable;
+                       /*Internal processes may not have itable for top xlator*/
+                        if (itable)
+                                inode = inode_find (itable, up_data->gfid);
+                        if (inode)
+                                afr_inode_read_subvol_reset (inode, this);
 
+                        break;
                 default:
                         propagate = 1;
                         break;
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index d04775d..17b997e 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -23,7 +23,6 @@
 #include "afr-self-heald.h"
 #include "afr-messages.h"
 
-#define AFR_XATTR_PREFIX "trusted.afr"
 #define AFR_PATHINFO_HEADER "REPLICATE:"
 #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
 #define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
diff --git a/xlators/features/upcall/src/upcall-internal.c b/xlators/features/upcall/src/upcall-internal.c
index d3c9b91..c7d7dab 100644
--- a/xlators/features/upcall/src/upcall-internal.c
+++ b/xlators/features/upcall/src/upcall-internal.c
@@ -435,7 +435,36 @@ upcall_reaper_thread_init (xlator_t *this)
         return ret;
 }
 
+
 int
+up_compare_afr_xattr (dict_t *d, char *k, data_t *v, void *tmp)
+{
+        dict_t *dict = tmp;
+
+        if (!strncmp (k, AFR_XATTR_PREFIX, strlen (AFR_XATTR_PREFIX))
+            && (!is_data_equal (v, dict_get (dict, k))))
+                return -1;
+
+        return 0;
+}
+
+
+static void
+up_filter_afr_xattr (dict_t *xattrs, char *xattr, data_t *v)
+{
+        /* Filter the afr pending xattrs, with value 0. Ideally this should
+         * be executed only in case of xattrop and not in set and removexattr,
+         * butset and remove xattr fops do not come with keys AFR_XATTR_PREFIX
+         */
+        if (!strncmp (xattr, AFR_XATTR_PREFIX, strlen (AFR_XATTR_PREFIX))
+            && (mem_0filled (v->data, v->len) == 0)) {
+                dict_del (xattrs, xattr);
+        }
+        return;
+}
+
+
+static int
 up_filter_unregd_xattr (dict_t *xattrs, char *xattr, data_t *v,
                         void *regd_xattrs)
 {
@@ -446,11 +475,40 @@ up_filter_unregd_xattr (dict_t *xattrs, char *xattr, data_t *v,
                  * send notification for its change
                  */
                 dict_del (xattrs, xattr);
+                goto out;
         }
-
+        up_filter_afr_xattr (xattrs, xattr, v);
+out:
         return 0;
 }
 
+
+int
+up_filter_xattr (dict_t *xattr, dict_t *regd_xattrs)
+{
+        int ret = 0;
+
+        /* Remove the xattrs from the dict, if they are not registered for
+         * cache invalidation */
+        ret = dict_foreach (xattr, up_filter_unregd_xattr, regd_xattrs);
+        return ret;
+}
+
+
+gf_boolean_t
+up_invalidate_needed (dict_t *xattrs)
+{
+        if (dict_key_count (xattrs) == 0) {
+                gf_msg_trace ("upcall", 0, "None of xattrs requested for"
+                              " invalidation, were changed. Nothing to "
+                              "invalidate");
+                return _gf_false;
+        }
+
+        return _gf_true;
+}
+
+
 /*
  * Given a client, first fetch upcall_entry_t from the inode_ctx client list.
  * Later traverse through the client list of that upcall entry. If this client
diff --git a/xlators/features/upcall/src/upcall.c b/xlators/features/upcall/src/upcall.c
index db4aa19..ba5d118 100644
--- a/xlators/features/upcall/src/upcall.c
+++ b/xlators/features/upcall/src/upcall.c
@@ -29,7 +29,7 @@
 #include "protocol-common.h"
 #include "defaults.h"
 
-int32_t
+static int32_t
 up_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
              int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
 {
@@ -56,7 +56,7 @@ out:
 }
 
 
-int32_t
+static int32_t
 up_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
          fd_t *fd, dict_t *xdata)
 {
@@ -84,7 +84,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                int op_ret, int op_errno, struct iatt *prebuf,
                struct iatt *postbuf, dict_t *xdata)
@@ -111,7 +111,7 @@ out:
 }
 
 
-int32_t
+static int32_t
 up_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
            struct iovec *vector, int count, off_t off, uint32_t flags,
            struct iobref *iobref, dict_t *xdata)
@@ -141,7 +141,7 @@ err:
 }
 
 
-int32_t
+static int32_t
 up_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
               int op_ret, int op_errno,
               struct iovec *vector, int count, struct iatt *stbuf,
@@ -170,7 +170,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_readv (call_frame_t *frame, xlator_t *this,
           fd_t *fd, size_t size, off_t offset,
           uint32_t flags, dict_t *xdata)
@@ -200,7 +200,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
            int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
            dict_t *xdata)
@@ -227,7 +227,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_lk (call_frame_t *frame, xlator_t *this,
        fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
 {
@@ -254,7 +254,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                  int op_ret, int op_errno, struct iatt *prebuf,
                  struct iatt *postbuf, dict_t *xdata)
@@ -282,7 +282,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
              dict_t *xdata)
 {
@@ -310,7 +310,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                 int op_ret, int op_errno, struct iatt *statpre,
                 struct iatt *statpost, dict_t *xdata)
@@ -352,7 +352,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
             struct iatt *stbuf, int32_t valid, dict_t *xdata)
 {
@@ -381,7 +381,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
                struct iatt *preoldparent, struct iatt *postoldparent,
@@ -412,7 +412,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_rename (call_frame_t *frame, xlator_t *this,
           loc_t *oldloc, loc_t *newloc, dict_t *xdata)
 {
@@ -443,7 +443,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                int op_ret, int op_errno, struct iatt *preparent,
                struct iatt *postparent, dict_t *xdata)
@@ -471,7 +471,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
            dict_t *xdata)
 {
@@ -499,7 +499,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
              int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
@@ -527,7 +527,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
          loc_t *newloc, dict_t *xdata)
 {
@@ -556,7 +556,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
               int op_ret, int op_errno, struct iatt *preparent,
               struct iatt *postparent, dict_t *xdata)
@@ -585,7 +585,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
           dict_t *xdata)
 {
@@ -613,7 +613,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
               int op_ret, int op_errno, inode_t *inode,
               struct iatt *stbuf, struct iatt *preparent,
@@ -644,7 +644,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_mkdir (call_frame_t *frame, xlator_t *this,
           loc_t *loc, mode_t mode, mode_t umask, dict_t *params)
 {
@@ -673,7 +673,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                int op_ret, int op_errno, fd_t *fd, inode_t *inode,
                struct iatt *stbuf, struct iatt *preparent,
@@ -705,7 +705,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_create (call_frame_t *frame, xlator_t *this,
            loc_t *loc, int32_t flags, mode_t mode,
            mode_t umask, fd_t *fd, dict_t *params)
@@ -736,7 +736,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                int op_ret, int op_errno,
                inode_t *inode, struct iatt *stbuf, dict_t *xattr,
@@ -765,7 +765,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_lookup (call_frame_t *frame, xlator_t *this,
            loc_t *loc, dict_t *xattr_req)
 {
@@ -794,7 +794,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_stat_cbk (call_frame_t *frame, void *cookie,
              xlator_t *this, int32_t op_ret, int32_t op_errno,
              struct iatt *buf, dict_t *xdata)
@@ -822,7 +822,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
         int32_t          op_errno        = -1;
@@ -849,7 +849,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_fstat (call_frame_t *frame, xlator_t *this,
           fd_t *fd, dict_t *xdata)
 {
@@ -877,7 +877,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_ftruncate (call_frame_t *frame, xlator_t *this,
               fd_t *fd, off_t offset, dict_t *xdata)
 {
@@ -906,7 +906,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                int op_ret, int op_errno, dict_t *xdata)
 {
@@ -932,7 +932,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_access (call_frame_t *frame, xlator_t *this,
            loc_t *loc, int32_t mask, dict_t *xdata)
 {
@@ -960,7 +960,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                  int op_ret, int op_errno, const char *path,
                  struct iatt *stbuf, dict_t *xdata)
@@ -988,7 +988,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_readlink (call_frame_t *frame, xlator_t *this,
              loc_t *loc, size_t size, dict_t *xdata)
 {
@@ -1017,7 +1017,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
               int32_t op_ret, int32_t op_errno, inode_t *inode,
               struct iatt *buf, struct iatt *preparent,
@@ -1048,7 +1048,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
           mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata)
 {
@@ -1077,7 +1077,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                 int32_t op_ret, int32_t op_errno, inode_t *inode,
                 struct iatt *buf, struct iatt *preparent,
@@ -1108,7 +1108,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_symlink (call_frame_t   *frame, xlator_t *this,
             const char *linkpath, loc_t *loc, mode_t umask,
             dict_t *xdata)
@@ -1138,7 +1138,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                 int32_t op_ret, int32_t op_errno, fd_t *fd,
                 dict_t *xdata)
@@ -1165,7 +1165,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_opendir (call_frame_t *frame, xlator_t *this,
             loc_t *loc, fd_t *fd, dict_t *xdata)
 {
@@ -1193,7 +1193,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                int32_t op_ret, int32_t op_errno, struct statvfs *buf,
                dict_t *xdata)
@@ -1220,7 +1220,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_statfs (call_frame_t *frame, xlator_t *this,
            loc_t *loc, dict_t *xdata)
 {
@@ -1248,7 +1248,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                 int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
                 dict_t *xdata)
@@ -1275,7 +1275,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_readdir (call_frame_t  *frame, xlator_t *this,
             fd_t *fd, size_t size, off_t off, dict_t *xdata)
 {
@@ -1303,7 +1303,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                  int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
                  dict_t *xdata)
@@ -1344,7 +1344,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_readdirp (call_frame_t *frame, xlator_t *this,
              fd_t *fd, size_t size, off_t off, dict_t *dict)
 {
@@ -1372,7 +1372,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
              struct iatt  *stbuf, int32_t valid, dict_t *xdata)
 {
@@ -1401,7 +1401,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                  int32_t op_ret, int32_t op_errno, struct iatt *pre,
                  struct iatt *post, dict_t *xdata)
@@ -1429,7 +1429,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd,
              int32_t mode, off_t offset, size_t len, dict_t *xdata)
 {
@@ -1458,7 +1458,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                int32_t op_ret, int32_t op_errno, struct iatt *pre,
                struct iatt *post, dict_t *xdata)
@@ -1486,7 +1486,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_discard(call_frame_t *frame, xlator_t *this, fd_t *fd,
            off_t offset, size_t len, dict_t *xdata)
 {
@@ -1515,7 +1515,7 @@ err:
         return 0;
 }
 
-int32_t
+static int32_t
 up_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                 int32_t op_ret, int32_t op_errno, struct iatt *pre,
                 struct iatt *post, dict_t *xdata)
@@ -1543,7 +1543,7 @@ out:
         return 0;
 }
 
-int
+static int
 up_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
             off_t offset, off_t len, dict_t *xdata)
 {
@@ -1573,7 +1573,7 @@ err:
 }
 
 
-int32_t
+static int32_t
 up_seek_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
              int op_errno, off_t offset, dict_t *xdata)
 {
@@ -1600,7 +1600,7 @@ out:
 }
 
 
-int32_t
+static int32_t
 up_seek (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
          gf_seek_what_t what, dict_t *xdata)
 {
@@ -1628,7 +1628,7 @@ err:
 }
 
 
-int32_t
+static int32_t
 up_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
@@ -1652,20 +1652,14 @@ up_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         }
 
         flags = UP_XATTR;
-        /* Remove the xattrs from the dict, if they are not registered for
-         * cache invalidation */
-        ret = dict_foreach (local->xattr, up_filter_unregd_xattr, priv->xattrs);
+
+        ret = up_filter_xattr (local->xattr, priv->xattrs);
         if (ret < 0) {
                 op_ret = ret;
                 goto out;
         }
-
-        if (dict_key_count(local->xattr) == 0) {
-                gf_msg_trace (this->name, 0, "None of xattrs requested for"
-                              " invalidation, were changed. Nothing to "
-                              "invalidate");
-                goto out; /* nothing to invalidate */
-        }
+        if (!up_invalidate_needed (local->xattr))
+                goto out;
 
         ret = syncop_stat (FIRST_CHILD(frame->this), &local->loc, &stbuf,
                            NULL, NULL);
@@ -1682,7 +1676,7 @@ out:
 }
 
 
-int32_t
+static int32_t
 up_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
              int32_t flags, dict_t *xdata)
 {
@@ -1718,7 +1712,7 @@ err:
 }
 
 
-int32_t
+static int32_t
 up_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
@@ -1742,20 +1736,14 @@ up_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         }
 
         flags = UP_XATTR;
-         /* Remove the xattrs from the dict, if they are not registered for
-         * cache invalidation */
-        ret = dict_foreach (local->xattr, up_filter_unregd_xattr, priv->xattrs);
+
+        ret = up_filter_xattr (local->xattr, priv->xattrs);
         if (ret < 0) {
                 op_ret = ret;
                 goto out;
         }
-
-        if (dict_key_count(local->xattr) == 0) {
-                gf_msg_trace (this->name, 0, "None of xattrs requested for"
-                              " invalidation, were changed. Nothing to "
-                              "invalidate");
-                goto out; /* nothing to invalidate */
-        }
+        if (!up_invalidate_needed (local->xattr))
+                goto out;
 
         ret = syncop_fstat (FIRST_CHILD(frame->this), local->fd, &stbuf, NULL,
                             NULL);
@@ -1772,7 +1760,7 @@ out:
 }
 
 
-int32_t
+static int32_t
 up_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
               int32_t flags, dict_t *xdata)
 {
@@ -1808,7 +1796,7 @@ err:
 }
 
 
-int32_t
+static int32_t
 up_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
@@ -1832,20 +1820,13 @@ up_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         }
         flags = UP_XATTR_RM;
 
-        /* Remove the xattrs from the dict, if they are not registered for
-         * cache invalidation */
-        ret = dict_foreach (local->xattr, up_filter_unregd_xattr, priv->xattrs);
+        ret = up_filter_xattr (local->xattr, priv->xattrs);
         if (ret < 0) {
                 op_ret = ret;
                 goto out;
         }
-
-        if (dict_key_count(local->xattr) == 0) {
-                gf_msg_trace (this->name, 0, "None of xattrs requested for"
-                              " invalidation, were changed. Nothing to "
-                              "invalidate");
-                goto out; /* nothing to invalidate */
-        }
+        if (!up_invalidate_needed (local->xattr))
+                goto out;
 
         ret = syncop_fstat (FIRST_CHILD(frame->this), local->fd, &stbuf, NULL,
                             NULL);
@@ -1862,7 +1843,7 @@ out:
 }
 
 
-int32_t
+static int32_t
 up_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
                  const char *name, dict_t *xdata)
 {
@@ -1897,7 +1878,7 @@ err:
 }
 
 
-int32_t
+static int32_t
 up_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
@@ -1921,20 +1902,13 @@ up_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         }
         flags = UP_XATTR_RM;
 
-         /* Remove the xattrs from the dict, if they are not registered for
-         * cache invalidation */
-        ret = dict_foreach (local->xattr, up_filter_unregd_xattr, priv->xattrs);
+        ret = up_filter_xattr (local->xattr, priv->xattrs);
         if (ret < 0) {
                 op_ret = ret;
                 goto out;
         }
-
-        if (dict_key_count(local->xattr) == 0) {
-                gf_msg_trace (this->name, 0, "None of xattrs requested for"
-                              " invalidation, were changed. Nothing to "
-                              "invalidate");
-                goto out; /* nothing to invalidate */
-        }
+        if (!up_invalidate_needed (local->xattr))
+                goto out;
 
         ret = syncop_stat (FIRST_CHILD(frame->this), &local->loc, &stbuf, NULL,
                            NULL);
@@ -1951,7 +1925,7 @@ out:
 }
 
 
-int32_t
+static int32_t
 up_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
                 const char *name, dict_t *xdata)
 {
@@ -1986,7 +1960,7 @@ err:
 }
 
 
-int32_t
+static int32_t
 up_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                   int32_t op_ret, int32_t op_errno, dict_t *dict,
                   dict_t *xdata)
@@ -2015,7 +1989,7 @@ out:
 }
 
 
-int32_t
+static int32_t
 up_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
               const char *name, dict_t *xdata)
 {
@@ -2042,7 +2016,7 @@ err:
 }
 
 
-int32_t
+static int32_t
 up_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                  int32_t op_ret, int32_t op_errno, dict_t *dict,
                  dict_t *xdata)
@@ -2070,7 +2044,7 @@ out:
         return 0;
 }
 
-int32_t
+static int32_t
 up_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
              const char *name, dict_t *xdata)
 {
@@ -2097,6 +2071,141 @@ err:
 }
 
 
+/* The xattrops here mainly tracks changes in afr pending xattr.
+ *    1. xattrop doesn't carry info saying post op/pre op.
+ *    2. Pre xattrop will have 0 value for all pending xattrs,
+ *       the cbk of pre xattrop carries the on-disk xattr value.
+ *       Non zero on-disk xattr indicates pending healing.
+ *    3. Post xattrop will either have 0 or 1 as value of pending xattrs,
+ *       0 on success, 1 on failure. But the post xattrop cbk will have
+ *       0 or 1 or any higher value.
+ *       0 - if no healing required*
+ *       1 - if this is the first time pending xattr is being set.
+ *       n - if there is already a pending xattr set, it will increment
+ *       the on-disk value and send that in cbk.
+ * Our aim is to send an invalidation, only the first time a pending
+ * xattr was set on a file. Below are some of the exceptions in handling
+ * xattrop:
+ * - Do not filter unregistered xattrs in the cbk, but in the call path.
+ *   Else, we will be invalidating on every preop, if the file already has
+ *   pending xattr set. Filtering unregistered xattrs on the fop path
+ *   ensures we invalidate only in postop, everytime a postop comes with
+ *   pending xattr value 1.
+ * - Consider a brick is down, and the postop sets pending xattrs as long
+ *   as the other brick is down. But we do not want to invalidate everytime
+ *   a pending xattr is set, but we wan't to inalidate only the first time
+ *   a pending xattr is set on any file. Hence, to identify if its the first
+ *   time a pending xattr is set, we compare the value of pending xattrs that
+ *   came in postop and postop cbk, if its same then its the first time.
+ */
+static int32_t
+up_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+        client_t         *client        = NULL;
+        upcall_local_t   *local         = NULL;
+
+        EXIT_IF_UPCALL_OFF (this, out);
+
+        client = frame->root->client;
+        local = frame->local;
+
+        if ((op_ret < 0) || !local) {
+                goto out;
+        }
+
+        if (up_invalidate_needed (local->xattr)) {
+                if (dict_foreach (local->xattr, up_compare_afr_xattr, dict) < 0)
+                        goto out;
+
+                upcall_cache_invalidate (frame, this, client, local->inode,
+                                         UP_XATTR, NULL, NULL, NULL,
+                                         local->xattr);
+        }
+out:
+        if (frame->root->op == GF_FOP_FXATTROP) {
+                UPCALL_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict,
+                                     xdata);
+        } else {
+                UPCALL_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict,
+                xdata);
+        }
+        return 0;
+}
+
+
+static int32_t
+up_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+            gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+        int32_t          op_errno        = -1;
+        upcall_local_t   *local          = NULL;
+        int              ret             = 0;
+        upcall_private_t *priv           = NULL;
+
+        EXIT_IF_UPCALL_OFF (this, out);
+
+        priv = this->private;
+        GF_VALIDATE_OR_GOTO (this->name, priv, out);
+
+        local = upcall_local_init (frame, this, loc, NULL, loc->inode, xattr);
+        if (!local) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        ret = up_filter_xattr (local->xattr, priv->xattrs);
+        if (ret < 0) {
+                goto err;
+        }
+
+out:
+        STACK_WIND (frame, up_xattrop_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->xattrop, loc, optype, xattr,
+                    xdata);
+        return 0;
+err:
+        UPCALL_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+static int32_t
+up_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+             gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+        int32_t          op_errno        = -1;
+        upcall_local_t   *local          = NULL;
+        int              ret             = 0;
+        upcall_private_t *priv           = NULL;
+
+        EXIT_IF_UPCALL_OFF (this, out);
+
+        priv = this->private;
+        GF_VALIDATE_OR_GOTO (this->name, priv, out);
+
+        local = upcall_local_init (frame, this, NULL, fd, fd->inode, xattr);
+        if (!local) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        ret = up_filter_xattr (local->xattr, priv->xattrs);
+        if (ret < 0) {
+                goto err;
+        }
+
+out:
+        STACK_WIND (frame, up_xattrop_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fxattrop, fd, optype, xattr,
+                    xdata);
+        return 0;
+err:
+        STACK_UNWIND_STRICT (fxattrop, frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
 int32_t
 mem_acct_init (xlator_t *this)
 {
@@ -2150,7 +2259,7 @@ upcall_local_init (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
 
         local->inode = inode_ref (inode);
         if (xattr)
-                local->xattr = dict_ref (xattr);
+                local->xattr = dict_copy_with_ref (xattr, NULL);
 
         /* Shall we get inode_ctx and store it here itself? */
         local->upcall_inode_ctx = upcall_inode_ctx_get (inode, this);
@@ -2430,6 +2539,8 @@ struct xlator_fops fops = {
         .fgetxattr   = up_fgetxattr,
         .fremovexattr = up_fremovexattr,
         .removexattr = up_removexattr,
+        .xattrop     = up_xattrop,
+        .fxattrop    = up_fxattrop,
 
 #ifdef NOT_SUPPORTED
         /* internal lk fops */
@@ -2444,9 +2555,6 @@ struct xlator_fops fops = {
         .flush       = up_flush,
         .fsync       = up_fsync,
         .fsyncdir    = up_fsyncdir,
-
-        .xattrop     = up_xattrop,
-        .fxattrop    = up_fxattrop,
 #endif
 };
 
diff --git a/xlators/features/upcall/src/upcall.h b/xlators/features/upcall/src/upcall.h
index 852f551..4554248 100644
--- a/xlators/features/upcall/src/upcall.h
+++ b/xlators/features/upcall/src/upcall.h
@@ -132,6 +132,9 @@ void upcall_client_cache_invalidate (xlator_t *xl, uuid_t gfid,
                                      struct iatt *p_stbuf,
                                      struct iatt *oldp_stbuf, dict_t *xattr);
 
-int up_filter_unregd_xattr (dict_t *xattrs, char *xattr, data_t *v,
-                            void *regd_xattrs);
+int up_filter_xattr (dict_t *xattr, dict_t *regd_xattrs);
+
+int up_compare_afr_xattr (dict_t *d, char *k, data_t *v, void *tmp);
+
+gf_boolean_t up_invalidate_needed (dict_t *xattrs);
 #endif /* __UPCALL_H__ */
-- 
1.7.1