Blob Blame History Raw
From 66ea04f631a46c12fe1e40d1bb710d64e34764b2 Mon Sep 17 00:00:00 2001
From: Poornima G <pgurusid@redhat.com>
Date: Fri, 11 Dec 2015 05:12:07 -0500
Subject: [PATCH 116/141] md-cache: Add cache invalidation support for metadata cache

Problem:
md-cache currently updates its stat in cbks of selected fops.
The default cache time is 1 second, if this is increasd to reap the
benefits of caching, we may end up with stale cache for long time,
as there is no logic yet to notify md-cache of backend changes by
another client.

Solution:
Use the existing upcall mechanism to invalidate the cache.
For this feature to work, "features.cache-invalidation" volume
option should be enabled.

This patch as is doesn't improve any performance, the benifit of the
patch is that it provides coherency for stat cache, hence the cache
timeout can be quite longer which in turn can improve the performance.

Change-Id: I2dbb0afa7b5e4a5a248f910188e0918e02f18692
BUG: 1284873
Signed-off-by: Poornima G <pgurusid@redhat.com>
Reviewed-on: http://review.gluster.org/12951
Smoke: Gluster Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/87029
Reviewed-by: Rajesh Joseph <rjoseph@redhat.com>
Tested-by: Rajesh Joseph <rjoseph@redhat.com>
---
 tests/bugs/md-cache/bug-1211863.t                  |   60 ++++-
 xlators/mgmt/glusterd/src/glusterd-volume-set.c    |    6 +
 .../performance/md-cache/src/md-cache-messages.h   |   10 +-
 xlators/performance/md-cache/src/md-cache.c        |  258 ++++++++++++++++++--
 4 files changed, 304 insertions(+), 30 deletions(-)

diff --git a/tests/bugs/md-cache/bug-1211863.t b/tests/bugs/md-cache/bug-1211863.t
index b969fbb..4fa9e33 100644
--- a/tests/bugs/md-cache/bug-1211863.t
+++ b/tests/bugs/md-cache/bug-1211863.t
@@ -5,18 +5,64 @@
 
 cleanup;
 
-TEST glusterd
+## 1. Start glusterd
+TEST glusterd;
 
-TEST $CLI volume create $V0 $H0:$B0/$V0
+## 2. Lets create volume
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2,3};
+
+## 3. Start the volume
 TEST $CLI volume start $V0
 
-TEST $CLI volume set $V0 cache-samba-metadata on
-EXPECT 'on' volinfo_field $V0 'performance.cache-samba-metadata'
+## 4. Enable the upcall xlator, and increase the md-cache timeout to max
+TEST $CLI volume set $V0 performance.md-cache-timeout 600
+TEST $CLI volume set $V0 performance.cache-samba-metadata on
 
-TEST $CLI volume set $V0 cache-samba-metadata off
-EXPECT 'off' volinfo_field $V0 'performance.cache-samba-metadata'
+## 6. Create two gluster mounts
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M1
+
+## 8. Create a file
+TEST touch $M0/file1
+
+## 9. Setxattr from mount-0
+TEST "setfattr -n user.DOSATTRIB -v "abc" $M0/file1"
+## 10. Getxattr from mount-1, this should return the correct value as it is a fresh getxattr
+TEST "getfattr -n user.DOSATTRIB $M1/file1 | grep -q abc"
+
+## 11. Now modify the same xattr from mount-0 again
+TEST "setfattr -n user.DOSATTRIB -v "xyz" $M0/file1"
+## 12. Since the xattr is already cached in mount-1 it returns the old xattr
+       #value, until the timeout (600)
+TEST "getfattr -n user.DOSATTRIB $M1/file1 | grep -q abc"
+
+## 13. Unmount to clean all the cache
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M1
 
+TEST $CLI volume set $V0 features.cache-invalidation on
+TEST $CLI volume set $V0 features.cache-invalidation-timeout 600
+
+## 18. Restart the volume to restart the bick process
 TEST $CLI volume stop $V0
-TEST $CLI volume delete $V0
+TEST $CLI volume start $V0
+
+## 20. Create two gluster mounts
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M1
+
+## 22. Repeat the tests 11-14, but this time since cache invalidation is on,
+       #the getxattr will reflect the new value
+TEST "setfattr -n user.DOSATTRIB -v "abc" $M0/file1"
+TEST "getfattr -n user.DOSATTRIB $M1/file1 | grep -q abc"
+TEST "setfattr -n user.DOSATTRIB -v "xyz" $M0/file1"
+sleep 2; #There can be a very very small window where the next getxattr
+         #reaches md-cache, before the cache-invalidation caused by previous
+         #setxattr, reaches md-cache. Hence sleeping for 2 sec.
+         #Also it should not be > 600.
+TEST "getfattr -n user.DOSATTRIB $M1/file1 | grep -q xyz"
+
+TEST $CLI volume set $V0 cache-samba-metadata off
+EXPECT 'off' volinfo_field $V0 'performance.cache-samba-metadata'
 
 cleanup;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 100e032..de7bcb4 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1967,6 +1967,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .op_version = 2,
           .flags      = OPT_FLAG_CLIENT_OPT
         },
+        { .key        = "features.cache-invalidation",
+          .voltype    = "performance/md-cache",
+          .option     = "cache-invalidation",
+          .op_version = GD_OP_VERSION_3_9_0,
+          .flags      = OPT_FLAG_CLIENT_OPT
+        },
 
 	/* Feature translators */
         { .key         = "features.uss",
diff --git a/xlators/performance/md-cache/src/md-cache-messages.h b/xlators/performance/md-cache/src/md-cache-messages.h
index 2fe8d45..a4259ba 100644
--- a/xlators/performance/md-cache/src/md-cache-messages.h
+++ b/xlators/performance/md-cache/src/md-cache-messages.h
@@ -40,7 +40,7 @@
  */
 
 #define GLFS_MD_CACHE_BASE                   GLFS_MSGID_COMP_MD_CACHE
-#define GLFS_MD_CACHE_NUM_MESSAGES           1
+#define GLFS_MD_CACHE_NUM_MESSAGES           2
 #define GLFS_MSGID_END  (GLFS_MD_CACHE_BASE + GLFS_MD_CACHE_NUM_MESSAGES + 1)
 
 /* Messages with message IDs */
@@ -58,6 +58,14 @@
 
 #define MD_CACHE_MSG_NO_MEMORY        (GLFS_MD_CACHE_BASE + 1)
 
+/*!
+ * @messageid
+ * @diagnosis
+ * @recommendedaction  None
+ *
+ */
+
+#define MD_CACHE_MSG_DISCARD_UPDATE    (GLFS_MD_CACHE_BASE + 2)
 
 /*------------*/
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c
index 1ad9e8e..fd2b366 100644
--- a/xlators/performance/md-cache/src/md-cache.c
+++ b/xlators/performance/md-cache/src/md-cache.c
@@ -16,6 +16,8 @@
 #include "md-cache-mem-types.h"
 #include "compat-errno.h"
 #include "glusterfs-acl.h"
+#include "defaults.h"
+#include "upcall-utils.h"
 #include <assert.h>
 #include <sys/time.h>
 #include "md-cache-messages.h"
@@ -34,6 +36,9 @@ struct mdc_conf {
 	gf_boolean_t force_readdirp;
         gf_boolean_t cache_swift_metadata;
         gf_boolean_t cache_samba_metadata;
+        gf_boolean_t mdc_invalidation;
+        time_t last_child_down;
+        gf_lock_t lock;
 };
 
 
@@ -328,20 +333,58 @@ unlock:
 }
 
 
+/* Cache is valid if:
+ * - It is not cached before any brick was down. Brick down case is handled by
+ *   invalidating all the cache when any brick went down.
+ * - The cache time is not expired
+ */
+static gf_boolean_t
+__is_cache_valid (xlator_t *this, time_t mdc_time)
+{
+        time_t           now             = 0;
+        gf_boolean_t     ret             = _gf_true;
+        struct mdc_conf *conf            = NULL;
+        int              timeout         = 0;
+        time_t           last_child_down = 0;
+
+        conf = this->private;
+
+        /* conf->lock here is not taken deliberately, so that the multi
+         * threaded IO doesn't contend on a global lock. While updating
+         * the variable, the lock is taken, so that atleast the writes are
+         * intact. The read of last_child_down may return junk, but that
+         * is for a very short period of time.
+         */
+        last_child_down = conf->last_child_down;
+        timeout = conf->timeout;
+
+        time (&now);
+
+        if ((mdc_time == 0) ||
+            ((last_child_down != 0) && (mdc_time < last_child_down))) {
+                ret = _gf_false;
+                goto out;
+        }
+
+        if (now >= (mdc_time + timeout)) {
+                ret = _gf_false;
+        }
+
+out:
+        return ret;
+}
+
+
 static gf_boolean_t
 is_md_cache_iatt_valid (xlator_t *this, struct md_cache *mdc)
 {
-	struct mdc_conf *conf = NULL;
-	time_t           now = 0;
         gf_boolean_t     ret = _gf_true;
-	conf = this->private;
-
-	time (&now);
 
         LOCK (&mdc->lock);
         {
-                if (now >= (mdc->ia_time + conf->timeout))
-                        ret = _gf_false;
+                ret = __is_cache_valid (this, mdc->ia_time);
+                if (ret == _gf_false)
+                        mdc->ia_time = 0;
         }
         UNLOCK (&mdc->lock);
 
@@ -352,18 +395,13 @@ is_md_cache_iatt_valid (xlator_t *this, struct md_cache *mdc)
 static gf_boolean_t
 is_md_cache_xatt_valid (xlator_t *this, struct md_cache *mdc)
 {
-	struct mdc_conf *conf = NULL;
-	time_t           now = 0;
         gf_boolean_t     ret = _gf_true;
 
-	conf = this->private;
-
-	time (&now);
-
         LOCK (&mdc->lock);
         {
-                if (now >= (mdc->xa_time + conf->timeout))
-                        ret = _gf_false;
+                ret = __is_cache_valid (this, mdc->xa_time);
+                if (ret == _gf_false)
+                        mdc->xa_time = 0;
         }
         UNLOCK (&mdc->lock);
 
@@ -413,12 +451,14 @@ int
 mdc_inode_iatt_set_validate(xlator_t *this, inode_t *inode, struct iatt *prebuf,
 			    struct iatt *iatt)
 {
-        int              ret = -1;
+        int              ret = 0;
         struct md_cache *mdc = NULL;
 
         mdc = mdc_inode_prep (this, inode);
-        if (!mdc)
+        if (!mdc) {
+                ret = -1;
                 goto out;
+        }
 
         LOCK (&mdc->lock);
         {
@@ -427,6 +467,33 @@ mdc_inode_iatt_set_validate(xlator_t *this, inode_t *inode, struct iatt *prebuf,
                         goto unlock;
                 }
 
+                /* There could be a race in invalidation, where the
+                 * invalidations in order A, B reaches md-cache in the order
+                 * B, A. Hence, make sure the invalidation A is discarded if
+                 * it comes after B. ctime of a file is always in ascending
+                 * order unlike atime and mtime(which can be changed by user
+                 * to any date), also ctime gets updates when atime/mtime
+                 * changes, hence check for ctime only.
+                 */
+                if (mdc->md_ctime > iatt->ia_ctime) {
+                        gf_msg_callingfn (this->name, GF_LOG_DEBUG, EINVAL,
+                                          MD_CACHE_MSG_DISCARD_UPDATE,
+                                          "discarding the iatt validate "
+                                          "request");
+                        ret = -1;
+                        goto unlock;
+
+                }
+                if ((mdc->md_ctime == iatt->ia_ctime) &&
+                    (mdc->md_ctime_nsec > iatt->ia_ctime_nsec)) {
+                        gf_msg_callingfn (this->name, GF_LOG_DEBUG, EINVAL,
+                                          MD_CACHE_MSG_DISCARD_UPDATE,
+                                          "discarding the iatt validate "
+                                          "request(ctime_nsec)");
+                        ret = -1;
+                        goto unlock;
+                }
+
 		/*
 		 * Invalidate the inode if the mtime or ctime has changed
 		 * and the prebuf doesn't match the value we have cached.
@@ -450,7 +517,7 @@ mdc_inode_iatt_set_validate(xlator_t *this, inode_t *inode, struct iatt *prebuf,
         }
 unlock:
         UNLOCK (&mdc->lock);
-        ret = 0;
+
 out:
         return ret;
 }
@@ -2300,15 +2367,81 @@ mdc_key_load_set (struct mdc_key *keys, char *pattern, gf_boolean_t val)
 	return 0;
 }
 
+struct set {
+       inode_t *inode;
+       xlator_t *this;
+};
+
+static int
+mdc_inval_xatt (dict_t *d, char *k, data_t *v, void *tmp)
+{
+        struct set *tmp1 = NULL;
+        int         ret  = 0;
+
+        tmp1 = (struct set *)tmp;
+        ret = mdc_inode_xatt_unset (tmp1->this, tmp1->inode, k);
+        return ret;
+}
+
+static int
+mdc_invalidate (xlator_t *this, void *data)
+{
+        struct gf_upcall                    *up_data    = NULL;
+        struct gf_upcall_cache_invalidation *up_ci      = NULL;
+        inode_t                             *inode      = NULL;
+        int                                  ret        = 0;
+        struct set                           tmp        = {0, };
+        inode_table_t                       *itable     = NULL;
+
+        up_data = (struct gf_upcall *)data;
+
+        if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
+                goto out;
+
+        up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
+
+        itable = ((xlator_t *)this->graph->top)->itable;
+        inode = inode_find (itable, up_data->gfid);
+        if (!inode) {
+                ret = -1;
+                goto out;
+        }
+
+        if (up_ci->flags & IATT_UPDATE_FLAGS) {
+                ret = mdc_inode_iatt_set_validate (this, inode, NULL,
+                                                   &up_ci->stat);
+                /* one of the scenarios where ret < 0 is when this invalidate
+                 * is older than the current stat, in that case do not
+                 * update the xattrs as well
+                 */
+                if (ret < 0)
+                        goto out;
+        }
+        if (up_ci->flags & UP_XATTR) {
+                ret = mdc_inode_xatt_update (this, inode, up_ci->dict);
+        } else if (up_ci->flags & UP_XATTR_RM) {
+                tmp.inode = inode;
+                tmp.this = this;
+                ret = dict_foreach (up_ci->dict, mdc_inval_xatt, &tmp);
+        }
+
+out:
+        if (inode)
+                inode_unref (inode);
+
+        return ret;
+}
+
 
 int
 reconfigure (xlator_t *this, dict_t *options)
 {
 	struct mdc_conf *conf = NULL;
+        int    timeout = 0;
 
 	conf = this->private;
 
-	GF_OPTION_RECONF ("md-cache-timeout", conf->timeout, options, int32, out);
+	GF_OPTION_RECONF ("md-cache-timeout", timeout, options, int32, out);
 
 	GF_OPTION_RECONF ("cache-selinux", conf->cache_selinux, options, bool, out);
 	mdc_key_load_set (mdc_keys, "security.", conf->cache_selinux);
@@ -2330,7 +2463,19 @@ reconfigure (xlator_t *this, dict_t *options)
                           conf->cache_samba_metadata);
 
 	GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, out);
-
+        GF_OPTION_RECONF("cache-invalidation", conf->mdc_invalidation, options,
+                         bool, out);
+
+        /* If timeout is greater than 60s (default before the patch that added
+         * cache invalidation support was added) then, cache invalidation
+         * feature for md-cache needs to be enabled, if not set timeout to the
+         * previous max which is 60s
+         */
+        if ((timeout > 60) && (!conf->mdc_invalidation)) {
+                        conf->timeout = 60;
+                        goto out;
+        }
+        conf->timeout = timeout;
 out:
 	return 0;
 }
@@ -2348,6 +2493,7 @@ int
 init (xlator_t *this)
 {
 	struct mdc_conf *conf = NULL;
+        int    timeout = 0;
 
 	conf = GF_CALLOC (sizeof (*conf), 1, gf_mdc_mt_mdc_conf_t);
 	if (!conf) {
@@ -2356,7 +2502,7 @@ init (xlator_t *this)
 		return -1;
 	}
 
-        GF_OPTION_INIT ("md-cache-timeout", conf->timeout, int32, out);
+        GF_OPTION_INIT ("md-cache-timeout", timeout, int32, out);
 
 	GF_OPTION_INIT ("cache-selinux", conf->cache_selinux, bool, out);
 	mdc_key_load_set (mdc_keys, "security.", conf->cache_selinux);
@@ -2378,6 +2524,22 @@ init (xlator_t *this)
                           conf->cache_samba_metadata);
 
 	GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out);
+        GF_OPTION_INIT("cache-invalidation", conf->mdc_invalidation, bool, out);
+
+        LOCK_INIT (&conf->lock);
+        time (&conf->last_child_down);
+
+        /* If timeout is greater than 60s (default before the patch that added
+         * cache invalidation support was added) then, cache invalidation
+         * feature for md-cache needs to be enabled, if not set timeout to the
+         * previous max which is 60s
+         */
+        if ((timeout > 60) && (!conf->mdc_invalidation)) {
+                        conf->timeout = 60;
+                        goto out;
+        }
+        conf->timeout = timeout;
+
 out:
 	this->private = conf;
 
@@ -2386,6 +2548,52 @@ out:
 
 
 void
+mdc_update_child_down_time (xlator_t *this, time_t *now)
+{
+        struct mdc_conf *conf = NULL;
+
+        conf = this->private;
+
+        LOCK (&conf->lock);
+        {
+                conf->last_child_down = *now;
+        }
+        UNLOCK (&conf->lock);
+}
+
+
+int
+notify (xlator_t *this, int event, void *data, ...)
+{
+        int ret = 0;
+        struct mdc_conf *conf = NULL;
+        time_t           now = 0;
+
+        conf = this->private;
+        switch (event) {
+        case GF_EVENT_CHILD_DOWN:
+        case GF_EVENT_SOME_CHILD_DOWN:
+        case GF_EVENT_CHILD_MODIFIED:
+                time (&now);
+                mdc_update_child_down_time (this, &now);
+                ret = default_notify (this, event, data);
+                break;
+        case GF_EVENT_UPCALL:
+                if (conf->mdc_invalidation)
+                        ret = mdc_invalidate (this, data);
+                        if (default_notify (this, event, data) != 0)
+                                ret = -1;
+                break;
+        default:
+                ret = default_notify (this, event, data);
+                break;
+        }
+
+        return ret;
+}
+
+
+void
 fini (xlator_t *this)
 {
         return;
@@ -2453,7 +2661,7 @@ struct volume_options options[] = {
         { .key = {"md-cache-timeout"},
           .type = GF_OPTION_TYPE_INT,
           .min = 0,
-          .max = 60,
+          .max = 600,
           .default_value = "1",
           .description = "Time period after which cache has to be refreshed",
         },
@@ -2463,5 +2671,11 @@ struct volume_options options[] = {
 	  .description = "Convert all readdir requests to readdirplus to "
 			 "collect stat info on each entry.",
 	},
+        { .key = {"cache-invalidation"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "false",
+          .description = "When \"on\", invalidates/updates the metadata cache "
+                         "on receiving of the cache-invalidation notifications",
+        },
     { .key = {NULL} },
 };
-- 
1.7.1