Blob Blame History Raw
From 55e67fb41ae3b4388839723ac929cd239280a0fc Mon Sep 17 00:00:00 2001
From: Amar Tumballi <amarts@redhat.com>
Date: Thu, 7 Feb 2019 18:06:43 +0530
Subject: [PATCH 522/529] fuse: add --lru-limit option

The inode LRU mechanism is moot in fuse xlator (ie. there is no
limit for the LRU list), as fuse inodes are referenced from
kernel context, and thus they can only be dropped on request of
the kernel. This might result in a high number of passive
inodes which are useless for the glusterfs client, causing a
significant memory overhead.

This change tries to remedy this by extending the LRU semantics
and allowing to set a finite limit on the fuse inode LRU.

A brief history of problem:

When gluster's inode table was designed, fuse didn't have any
'invalidate' method, which means, userspace application could
never ask kernel to send a 'forget()' fop, instead had to wait
for kernel to send it based on kernel's parameters. Inode table
remembers the number of times kernel has cached the inode based
on the 'nlookup' parameter. And the 'nlookup' field is not used by
any other entry points (like server-protocol, gfapi etc).

Hence the inode_table of fuse module always has to have lru-limit
as '0', which means no limit. GlusterFS always had to keep all
inodes in memory as kernel would have had a reference to it.
Again, the reason for this is that the kernel's reference to a
glusterfs inode was a pointer to the 'inode_t' structure in
glusterfs. As it is a pointer, we could never free it (to prevent
a segfault or memory corruption).

Solution:

In the inode table, handle the prune case of inodes with 'nlookup'
differently, and call an 'invalidator' method, which in this case is
fuse_invalidate(), and it sends the request to kernel for getting
the forget request.

When the kernel sends the forget, it means it has dropped all
references to the inode, and it will send the forget with the
'nlookup' parameter too. We just need to make sure to reduce the
'nlookup' value we have when we get the forget. That automatically
causes the relevant prune to happen.

Credits: Csaba Henk, Xavier Hernandez, Raghavendra Gowdappa, Nithya B

Upstream:
> URL: https://review.gluster.org/19778

BUG: 1511779
Change-Id: Iabe22a62e0f819b7eb67d4ecb850dd559b0c937f
Signed-off-by: Amar Tumballi <amarts@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/162494
Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 doc/mount.glusterfs.8                       |   4 +
 glusterfsd/src/glusterfsd.c                 |  24 +++
 glusterfsd/src/glusterfsd.h                 |   1 +
 libglusterfs/src/glusterfs.h                |   1 +
 libglusterfs/src/inode.c                    | 256 ++++++++++++++++++++++++----
 libglusterfs/src/inode.h                    |  17 +-
 tests/features/fuse-lru-limit.t             |  42 +++++
 xlators/mount/fuse/src/fuse-bridge.c        | 121 ++++++++-----
 xlators/mount/fuse/src/fuse-bridge.h        |   3 +
 xlators/mount/fuse/utils/mount.glusterfs.in |   7 +
 10 files changed, 393 insertions(+), 83 deletions(-)
 create mode 100644 tests/features/fuse-lru-limit.t

diff --git a/doc/mount.glusterfs.8 b/doc/mount.glusterfs.8
index 95aad02..ed6b410 100644
--- a/doc/mount.glusterfs.8
+++ b/doc/mount.glusterfs.8
@@ -119,6 +119,10 @@ Provide list of backup volfile servers in the following format [default: None]
 \fBDeprecated\fR option - placed here for backward compatibility [default: 1]
 .TP
 .TP
+\fBlru-limit=\fRN
+Set fuse module's limit for number of inodes kept in LRU list to N [default: 131072]
+.TP
+.TP
 \fBbackground-qlen=\fRN
 Set fuse module's background queue length to N [default: 64]
 .TP
diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c
index 990036c..2e2cd77 100644
--- a/glusterfsd/src/glusterfsd.c
+++ b/glusterfsd/src/glusterfsd.c
@@ -203,6 +203,9 @@ static struct argp_option gf_options[] = {
 	 "[default: 300]"},
         {"resolve-gids", ARGP_RESOLVE_GIDS_KEY, 0, 0,
          "Resolve all auxiliary groups in fuse translator (max 32 otherwise)"},
+        {"lru-limit", ARGP_FUSE_LRU_LIMIT_KEY, "N", 0,
+         "Set fuse module's limit for number of inodes kept in LRU list to N "
+         "[default: 131072]"},
 	{"background-qlen", ARGP_FUSE_BACKGROUND_QLEN_KEY, "N", 0,
 	 "Set fuse module's background queue length to N "
 	 "[default: 64]"},
@@ -462,6 +465,15 @@ set_fuse_mount_options (glusterfs_ctx_t *ctx, dict_t *options)
                 }
         }
 
+        if (cmd_args->lru_limit >= 0) {
+                ret = dict_set_int32(options, "lru-limit", cmd_args->lru_limit);
+                if (ret < 0) {
+                        gf_msg("glusterfsd", GF_LOG_ERROR, 0, glusterfsd_msg_4,
+                               "lru-limit");
+                        goto err;
+                }
+        }
+
 	if (cmd_args->background_qlen) {
 		ret = dict_set_int32 (options, "background-qlen",
                                       cmd_args->background_qlen);
@@ -1169,6 +1181,13 @@ parse_opts (int key, char *arg, struct argp_state *state)
                 cmd_args->resolve_gids = 1;
                 break;
 
+        case ARGP_FUSE_LRU_LIMIT_KEY:
+                if (!gf_string2int32(arg, &cmd_args->lru_limit))
+                         break;
+
+                argp_failure(state, -1, 0, "unknown LRU limit option %s", arg);
+                break;
+
         case ARGP_FUSE_BACKGROUND_QLEN_KEY:
                 if (!gf_string2int (arg, &cmd_args->background_qlen))
                         break;
@@ -1937,6 +1956,11 @@ parse_cmdline (int argc, char *argv[], glusterfs_ctx_t *ctx)
                 ctx->ssl_cert_depth = glusterfs_read_secure_access_file ();
         }
 
+        /* Need to set lru_limit to below 0 to indicate there was nothing
+           specified. This is needed as 0 is a valid option, and may not be
+           default value. */
+        cmd_args->lru_limit = -1;
+
         argp_parse (&argp, argc, argv, ARGP_IN_ORDER, NULL, cmd_args);
         if (cmd_args->print_netgroups) {
                 /* When this option is set we don't want to do anything else
diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h
index 75cb1d8..1550a30 100644
--- a/glusterfsd/src/glusterfsd.h
+++ b/glusterfsd/src/glusterfsd.h
@@ -100,6 +100,7 @@ enum argp_option_keys {
         ARGP_SUBDIR_MOUNT_KEY             = 178,
         ARGP_FUSE_EVENT_HISTORY_KEY       = 179,
         ARGP_READER_THREAD_COUNT_KEY      = 180,
+        ARGP_FUSE_LRU_LIMIT_KEY           = 190,
 };
 
 struct _gfd_vol_top_priv {
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index 157437c..2690306 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -413,6 +413,7 @@ struct _cmd_args {
         pid_t              client_pid;
         int                client_pid_set;
         unsigned           uid_map_root;
+        int32_t            lru_limit;
         int                background_qlen;
         int                congestion_threshold;
         char              *fuse_mountopts;
diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c
index 29d3c8f..f57020a 100644
--- a/libglusterfs/src/inode.c
+++ b/libglusterfs/src/inode.c
@@ -24,6 +24,100 @@
    move latest accessed dentry to list_head of inode
 */
 
+/* clang-format off */
+/*
+
+Details as per Xavi:
+
+ I think we should have 3 lists: active, lru and invalidate.
+
+We'll need 3 things: refs, nlookups and invalidate_sent flag. Any change of
+refs, invalidate_sent flag and moving from one list to another must be done
+atomically.
+
+With this information, these are the states that cause a transition:
+
+    refs nlookups inv_sent    op
+      1      0        0      unref  -> refs = 0, active--->destroy
+      1      1        0      unref  -> refs = 0, active--->lru
+      1      1        0     forget  -> nlookups = 0, active--->active
+     *0      1        0     forget  -> nlookups = 0, lru--->destroy
+     *0      1        1     forget  -> nlookups = 0, invalidate--->destroy
+      0      1        0       ref   -> refs = 1, lru--->active
+      0      1        1       ref   -> refs = 1, inv_sent = 0, invalidate--->active
+      0      1        0    overflow -> refs = 1, inv_sent = 1, lru--->invalidate
+      1      1        1      unref  -> refs = 0, invalidate--->invalidate
+      1      1        1     forget  -> nlookups = 0, inv_sent = 0, invalidate--->active
+
+(*) technically these combinations cannot happen because a forget sent by the
+kernel first calls ref() and then unref(). However it's equivalent.
+
+overflow means that lru list has grown beyond the limit and the inode needs to
+be invalidated. All other combinations do not cause a change in state or are not
+possible.
+
+Based on this, the code could be similar to this:
+
+    ref(inode, inv)
+    {
+        if (refs == 0) {
+            if (inv_sent) {
+                invalidate_count--;
+                inv_sent = 0;
+            } else {
+                lru_count--;
+            }
+            if (inv) {
+                inv_sent = 1;
+                invalidate_count++;
+                list_move(inode, invalidate);
+            } else {
+                active_count++;
+                list_move(inode, active);
+            }
+        }
+        refs++;
+    }
+
+    unref(inode, clear)
+    {
+        if (clear && inv_sent) {
+            // there is a case of fuse itself sending forget, without
+            // invalidate, after entry delete, like unlink(), rmdir().
+            inv_sent = 0;
+            invalidate_count--;
+            active_count++;
+            list_move(inode, active);
+        }
+        refs--;
+        if ((refs == 0) && !inv_sent) {
+            active_count--;
+            if (nlookups == 0) {
+                destroy(inode);
+            } else {
+                lru_count++;
+                list_move(inode, lru);
+            }
+        }
+    }
+
+    forget(inode)
+    {
+        ref(inode, false);
+        nlookups--;
+        unref(inode, true);
+    }
+
+    overflow(inode)
+    {
+        ref(inode, true);
+        invalidator(inode);
+        unref(inode, false);
+    }
+
+*/
+/* clang-format on */
+
 #define INODE_DUMP_LIST(head, key_buf, key_prefix, list_type)           \
         {                                                               \
                 int i = 1;                                              \
@@ -37,7 +131,7 @@
         }
 
 static inode_t *
-__inode_unref (inode_t *inode);
+__inode_unref (inode_t *inode, gf_boolean_t clear);
 
 static int
 inode_table_prune (inode_table_t *table);
@@ -138,7 +232,7 @@ __dentry_unset (dentry_t *dentry)
         dentry->name = NULL;
 
         if (dentry->parent) {
-                __inode_unref (dentry->parent);
+                __inode_unref (dentry->parent, _gf_false);
                 dentry->parent = NULL;
         }
 
@@ -465,7 +559,7 @@ out:
 
 
 static inode_t *
-__inode_unref (inode_t *inode)
+__inode_unref (inode_t *inode, gf_boolean_t clear)
 {
         int       index = 0;
         xlator_t *this  = NULL;
@@ -473,8 +567,6 @@ __inode_unref (inode_t *inode)
         if (!inode)
                 return NULL;
 
-        this = THIS;
-
         /*
          * Root inode should always be in active list of inode table. So unrefs
          * on root inode are no-ops.
@@ -482,6 +574,14 @@ __inode_unref (inode_t *inode)
         if (__is_root_gfid(inode->gfid))
                 return inode;
 
+        this = THIS;
+
+        if (clear && inode->invalidate_sent) {
+                inode->invalidate_sent = _gf_false;
+                inode->table->invalidate_size--;
+                __inode_activate(inode);
+        }
+
         GF_ASSERT (inode->ref);
 
         --inode->ref;
@@ -492,7 +592,7 @@ __inode_unref (inode_t *inode)
                 inode->_ctx[index].ref--;
         }
 
-        if (!inode->ref) {
+        if (!inode->ref && !inode->invalidate_sent) {
                 inode->table->active_size--;
 
                 if (inode->nlookup)
@@ -506,7 +606,7 @@ __inode_unref (inode_t *inode)
 
 
 static inode_t *
-__inode_ref (inode_t *inode)
+__inode_ref (inode_t *inode, gf_boolean_t is_invalidate)
 {
         int       index = 0;
         xlator_t *this  = NULL;
@@ -516,11 +616,6 @@ __inode_ref (inode_t *inode)
 
         this = THIS;
 
-        if (!inode->ref) {
-                inode->table->lru_size--;
-                __inode_activate (inode);
-        }
-
         /*
          * Root inode should always be in active list of inode table. So unrefs
          * on root inode are no-ops. If we do not allow unrefs but allow refs,
@@ -532,6 +627,22 @@ __inode_ref (inode_t *inode)
         if (__is_root_gfid(inode->gfid) && inode->ref)
                 return inode;
 
+        if (!inode->ref) {
+                if (inode->invalidate_sent) {
+                        inode->invalidate_sent = _gf_false;
+                        inode->table->invalidate_size--;
+                } else {
+                        inode->table->lru_size--;
+                }
+                if (is_invalidate) {
+                        inode->invalidate_sent = _gf_true;
+                        inode->table->invalidate_size++;
+                        list_move_tail(&inode->list, &inode->table->invalidate);
+                } else {
+                        __inode_activate(inode);
+                }
+        }
+
         inode->ref++;
 
         index = __inode_get_xl_index (inode, this);
@@ -556,7 +667,7 @@ inode_unref (inode_t *inode)
 
         pthread_mutex_lock (&table->lock);
         {
-                inode = __inode_unref (inode);
+                inode = __inode_unref (inode, _gf_false);
         }
         pthread_mutex_unlock (&table->lock);
 
@@ -578,7 +689,7 @@ inode_ref (inode_t *inode)
 
         pthread_mutex_lock (&table->lock);
         {
-                inode = __inode_ref (inode);
+                inode = __inode_ref (inode, _gf_false);
         }
         pthread_mutex_unlock (&table->lock);
 
@@ -614,7 +725,7 @@ __dentry_create (inode_t *inode, inode_t *parent, const char *name)
         }
 
         if (parent)
-                newd->parent = __inode_ref (parent);
+                newd->parent = __inode_ref (parent, _gf_false);
 
         list_add (&newd->inode_list, &inode->dentry_list);
         newd->inode = inode;
@@ -685,7 +796,7 @@ inode_new (inode_table_t *table)
         {
                 inode = __inode_create (table);
                 if (inode != NULL) {
-                        __inode_ref (inode);
+                       __inode_ref (inode, _gf_false);
                 }
         }
         pthread_mutex_unlock (&table->lock);
@@ -802,7 +913,7 @@ inode_grep (inode_table_t *table, inode_t *parent, const char *name)
                         inode = dentry->inode;
 
                 if (inode)
-                        __inode_ref (inode);
+                        __inode_ref (inode, _gf_false);
         }
         pthread_mutex_unlock (&table->lock);
 
@@ -947,7 +1058,7 @@ inode_find (inode_table_t *table, uuid_t gfid)
         {
                 inode = __inode_find (table, gfid);
                 if (inode)
-                        __inode_ref (inode);
+                        __inode_ref (inode, _gf_false);
         }
         pthread_mutex_unlock (&table->lock);
 
@@ -1096,7 +1207,7 @@ inode_link (inode_t *inode, inode_t *parent, const char *name,
                 linked_inode = __inode_link (inode, parent, name, iatt);
 
                 if (linked_inode)
-                        __inode_ref (linked_inode);
+                        __inode_ref (linked_inode, _gf_false);
         }
         pthread_mutex_unlock (&table->lock);
 
@@ -1178,6 +1289,31 @@ inode_forget (inode_t *inode, uint64_t nlookup)
         return 0;
 }
 
+int
+inode_forget_with_unref(inode_t *inode, uint64_t nlookup)
+{
+    inode_table_t *table = NULL;
+
+    if (!inode) {
+            gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND,
+                             "inode not found");
+            return -1;
+    }
+
+    table = inode->table;
+
+    pthread_mutex_lock(&table->lock);
+    {
+            __inode_forget(inode, nlookup);
+            __inode_unref(inode, _gf_true);
+    }
+    pthread_mutex_unlock(&table->lock);
+
+    inode_table_prune(table);
+
+    return 0;
+}
+
 /*
  * Invalidate an inode. This is invoked when a translator decides that an inode's
  * cache is no longer valid. Any translator interested in taking action in this
@@ -1356,7 +1492,7 @@ inode_parent (inode_t *inode, uuid_t pargfid, const char *name)
                         parent = dentry->parent;
 
                 if (parent)
-                        __inode_ref (parent);
+                        __inode_ref (parent, _gf_false);
         }
         pthread_mutex_unlock (&table->lock);
 
@@ -1540,6 +1676,7 @@ inode_table_prune (inode_table_t *table)
         inode_t          *del = NULL;
         inode_t          *tmp = NULL;
         inode_t          *entry = NULL;
+        int64_t           lru_size = 0;
 
         if (!table)
                 return -1;
@@ -1548,8 +1685,11 @@ inode_table_prune (inode_table_t *table)
 
         pthread_mutex_lock (&table->lock);
         {
-                while (table->lru_limit
-                       && table->lru_size > (table->lru_limit)) {
+                if (!table->lru_limit)
+                        goto purge_list;
+
+                lru_size = table->lru_size;
+                while (lru_size > (table->lru_limit)) {
                         if (list_empty (&table->lru)) {
                                 gf_msg_callingfn (THIS->name, GF_LOG_WARNING, 0,
                                                   LG_MSG_INVALID_INODE_LIST,
@@ -1559,7 +1699,18 @@ inode_table_prune (inode_table_t *table)
                                 break;
                         }
 
+                        lru_size--;
                         entry = list_entry (table->lru.next, inode_t, list);
+                        /* The logic of invalidation is required only if invalidator_fn
+                           is present */
+                        if (table->invalidator_fn) {
+                          /* check for valid inode with 'nlookup' */
+                          if (entry->nlookup) {
+                            __inode_ref(entry, _gf_true);
+                            tmp = entry;
+                            break;
+                          }
+                        }
 
                         table->lru_size--;
                         __inode_retire (entry);
@@ -1567,17 +1718,25 @@ inode_table_prune (inode_table_t *table)
                         ret++;
                 }
 
+        purge_list:
                 list_splice_init (&table->purge, &purge);
                 table->purge_size = 0;
         }
         pthread_mutex_unlock (&table->lock);
 
-        {
-                list_for_each_entry_safe (del, tmp, &purge, list) {
-                        list_del_init (&del->list);
-                        __inode_forget (del, 0);
-                        __inode_destroy (del);
-                }
+        /* Pick 1 inode for invalidation */
+        if (tmp) {
+          xlator_t *old_THIS = THIS;
+          THIS = table->invalidator_xl;
+          table->invalidator_fn(table->invalidator_xl, tmp);
+          THIS = old_THIS;
+          inode_unref(tmp);
+        }
+
+        list_for_each_entry_safe (del, tmp, &purge, list) {
+                list_del_init (&del->list);
+                __inode_forget (del, 0);
+                __inode_destroy (del);
         }
 
         return ret;
@@ -1605,9 +1764,12 @@ __inode_table_init_root (inode_table_t *table)
 
 
 inode_table_t *
-inode_table_new (size_t lru_limit, xlator_t *xl)
+inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl,
+                             int32_t (*invalidator_fn)(xlator_t *, inode_t *),
+                             xlator_t *invalidator_xl)
 {
         inode_table_t *new = NULL;
+        uint32_t mem_pool_size = lru_limit;
         int            ret = -1;
         int            i = 0;
 
@@ -1619,20 +1781,19 @@ inode_table_new (size_t lru_limit, xlator_t *xl)
         new->ctxcount = xl->graph->xl_count + 1;
 
         new->lru_limit = lru_limit;
+        new->invalidator_fn = invalidator_fn;
+        new->invalidator_xl = invalidator_xl;
 
         new->hashsize = 14057; /* TODO: Random Number?? */
 
-        /* In case FUSE is initing the inode table. */
-        if (lru_limit == 0)
-                lru_limit = DEFAULT_INODE_MEMPOOL_ENTRIES;
-
-        new->inode_pool = mem_pool_new (inode_t, lru_limit);
+        if (!mem_pool_size || (mem_pool_size > DEFAULT_INODE_MEMPOOL_ENTRIES))
+                mem_pool_size = DEFAULT_INODE_MEMPOOL_ENTRIES;
 
+        new->inode_pool = mem_pool_new(inode_t, mem_pool_size);
         if (!new->inode_pool)
                 goto out;
 
-        new->dentry_pool = mem_pool_new (dentry_t, lru_limit);
-
+        new->dentry_pool = mem_pool_new (dentry_t, mem_pool_size);
         if (!new->dentry_pool)
                 goto out;
 
@@ -1667,6 +1828,7 @@ inode_table_new (size_t lru_limit, xlator_t *xl)
         INIT_LIST_HEAD (&new->active);
         INIT_LIST_HEAD (&new->lru);
         INIT_LIST_HEAD (&new->purge);
+        INIT_LIST_HEAD(&new->invalidate);
 
         ret = gf_asprintf (&new->name, "%s/inode", xl->name);
         if (-1 == ret) {
@@ -1696,6 +1858,14 @@ out:
         return new;
 }
 
+inode_table_t *
+inode_table_new(uint32_t lru_limit, xlator_t *xl)
+{
+        /* Only fuse for now requires the inode table with invalidator */
+        return inode_table_with_invalidator(lru_limit, xl, NULL, NULL);
+}
+
+
 int
 inode_table_ctx_free (inode_table_t *table)
 {
@@ -1830,6 +2000,15 @@ inode_table_destroy (inode_table_t *inode_table) {
                         inode_table->lru_size--;
                 }
 
+                /* Same logic for invalidate list */
+                while (!list_empty(&inode_table->invalidate)) {
+                        trav = list_first_entry(&inode_table->invalidate,
+                                                inode_t, list);
+                        __inode_forget(trav, 0);
+                        __inode_retire(trav);
+                        inode_table->invalidate_size--;
+                }
+
                 while (!list_empty (&inode_table->active)) {
                         trav = list_first_entry (&inode_table->active,
                                                  inode_t, list);
@@ -2347,6 +2526,8 @@ inode_dump (inode_t *inode, char *prefix)
                 gf_proc_dump_write("active-fd-count", "%u",
                                    inode->active_fd_count);
                 gf_proc_dump_write("ref", "%u", inode->ref);
+                gf_proc_dump_write("invalidate-sent", "%d",
+                                   inode->invalidate_sent);
                 gf_proc_dump_write("ia_type", "%d", inode->ia_type);
                 if (inode->_ctx) {
                         inode_ctx = GF_CALLOC (inode->table->ctxcount,
@@ -2427,10 +2608,13 @@ inode_table_dump (inode_table_t *itable, char *prefix)
         gf_proc_dump_write(key, "%d", itable->lru_size);
         gf_proc_dump_build_key(key, prefix, "purge_size");
         gf_proc_dump_write(key, "%d", itable->purge_size);
+        gf_proc_dump_build_key(key, prefix, "invalidate_size");
+        gf_proc_dump_write(key, "%d", itable->invalidate_size);
 
         INODE_DUMP_LIST(&itable->active, key, prefix, "active");
         INODE_DUMP_LIST(&itable->lru, key, prefix, "lru");
         INODE_DUMP_LIST(&itable->purge, key, prefix, "purge");
+        INODE_DUMP_LIST(&itable->invalidate, key, prefix, "invalidate");
 
         pthread_mutex_unlock(&itable->lock);
 }
diff --git a/libglusterfs/src/inode.h b/libglusterfs/src/inode.h
index 7a87748..6a96447 100644
--- a/libglusterfs/src/inode.h
+++ b/libglusterfs/src/inode.h
@@ -55,6 +55,13 @@ struct _inode_table {
         struct mem_pool   *dentry_pool; /* memory pool for dentrys */
         struct mem_pool   *fd_mem_pool; /* memory pool for fd_t */
         int                ctxcount;    /* number of slots in inode->ctx */
+
+        /* This is required for 'invalidation' when 'nlookup' would be used,
+           specially in case of fuse-bridge */
+        int32_t (*invalidator_fn)(xlator_t *, inode_t *);
+        xlator_t *invalidator_xl;
+        struct list_head invalidate; /* inodes which are in invalidation queue */
+        uint32_t invalidate_size;    /* count of inodes in invalidation list */
 };
 
 
@@ -102,6 +109,7 @@ struct _inode {
         struct list_head     list;          /* active/lru/purge */
 
         struct _inode_ctx   *_ctx;    /* replacement for dict_t *(inode->ctx) */
+        gf_boolean_t invalidate_sent; /* Set it if invalidator_fn is called for inode */
 };
 
 
@@ -110,7 +118,14 @@ struct _inode {
 #define GFID_STR_PFX_LEN (sizeof (GFID_STR_PFX) - 1)
 
 inode_table_t *
-inode_table_new (size_t lru_limit, xlator_t *xl);
+inode_table_new(uint32_t lru_limit, xlator_t *xl);
+
+inode_table_t *
+inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl,
+                             int32_t (*invalidator_fn)(xlator_t *, inode_t *),
+                             xlator_t *invalidator_xl);
+int
+inode_forget_with_unref(inode_t *inode, uint64_t nlookup);
 
 void
 inode_table_destroy_all (glusterfs_ctx_t *ctx);
diff --git a/tests/features/fuse-lru-limit.t b/tests/features/fuse-lru-limit.t
new file mode 100644
index 0000000..9f12116
--- /dev/null
+++ b/tests/features/fuse-lru-limit.t
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1}
+TEST $CLI volume start $V0
+TEST glusterfs -s $H0 --volfile-id $V0 $M0
+
+EXPECT "1" get_mount_active_size_value $V0 $M0
+EXPECT "0" get_mount_lru_size_value $V0 $M0
+
+mkdir ${M0}/dir-{1..9}
+for i in {1..9}; do
+    for j in {1..1000}; do
+        echo "Test file" > ${M0}/dir-$i/file-$j;
+    done;
+done
+lc=$(get_mount_lru_size_value $V0 ${M0})
+# ideally it should be 9000+
+TEST [ $lc -ge 9000 ]
+
+TEST umount $M0
+
+TEST glusterfs -s $H0 --volfile-id $V0 --lru-limit 1000 $M0
+
+TEST find $M0
+lc=$(get_mount_lru_size_value $V0 ${M0})
+# ideally it should be <1000
+# Not sure if there are any possibilities of buffer need.
+TEST [ $lc -le 1000 ]
+
+TEST rm -rf $M0/*
+
+EXPECT "1" get_mount_active_size_value $V0 $M0
+EXPECT "0" get_mount_lru_size_value $V0 $M0
+
+cleanup
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
index 8d1e3a0..f3188d6 100644
--- a/xlators/mount/fuse/src/fuse-bridge.c
+++ b/xlators/mount/fuse/src/fuse-bridge.c
@@ -279,29 +279,31 @@ send_fuse_data (xlator_t *this, fuse_in_header_t *finh, void *data, size_t size)
         send_fuse_data (this, finh, obj, sizeof (*(obj)))
 
 
-#if FUSE_KERNEL_MINOR_VERSION >= 11
 static void
 fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
 {
+#if FUSE_KERNEL_MINOR_VERSION >= 11
         struct fuse_out_header             *fouh   = NULL;
         struct fuse_notify_inval_entry_out *fnieo  = NULL;
         fuse_private_t                     *priv   = NULL;
         dentry_t                           *dentry = NULL;
+        dentry_t                           *tmp = NULL;
         inode_t                            *inode  = NULL;
         size_t                              nlen   = 0;
         fuse_invalidate_node_t             *node   = NULL;
+        char gfid_str[UUID_CANONICAL_FORM_LEN + 1];
 
         priv = this->private;
 
         if (!priv->reverse_fuse_thread_started)
                 return;
 
-        inode = fuse_ino_to_inode(fuse_ino, this);
+        inode = (inode_t *)(unsigned long)fuse_ino;
         if (inode == NULL) {
                 return;
         }
 
-        list_for_each_entry (dentry, &inode->dentry_list, inode_list) {
+        list_for_each_entry_safe (dentry, tmp, &inode->dentry_list, inode_list) {
                 node = GF_CALLOC (1, sizeof (*node),
                                   gf_fuse_mt_invalidate_node_t);
                 if (node == NULL)
@@ -315,14 +317,31 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
                 fouh->unique = 0;
                 fouh->error = FUSE_NOTIFY_INVAL_ENTRY;
 
-                nlen = strlen (dentry->name);
-                fouh->len = sizeof (*fouh) + sizeof (*fnieo) + nlen + 1;
-                fnieo->parent = inode_to_fuse_nodeid (dentry->parent);
+                if (dentry->name) {
+                  nlen = strlen (dentry->name);
+                  fouh->len = sizeof (*fouh) + sizeof (*fnieo) + nlen + 1;
+                  fnieo->parent = inode_to_fuse_nodeid (dentry->parent);
+
+                  fnieo->namelen = nlen;
+                  strcpy (node->inval_buf + sizeof (*fouh) + sizeof (*fnieo),
+                          dentry->name);
+                }
 
-                fnieo->namelen = nlen;
-                strcpy (node->inval_buf + sizeof (*fouh) + sizeof (*fnieo),
-                        dentry->name);
+                gf_log ("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE entry: "
+                        "%"PRIu64"/%s (gfid:%s)", fnieo->parent, dentry->name,
+                        uuid_utoa(inode->gfid));
 
+                if (dentry->parent) {
+                        fuse_log_eh (this, "Invalidated entry %s (parent: %s)"
+                                     "(gfid: %s)", dentry->name,
+                                     uuid_utoa (dentry->parent->gfid),
+                                     uuid_utoa_r(inode->gfid, gfid_str));
+                } else {
+                        fuse_log_eh (this, "Invalidated entry %s(nodeid: %"
+                                     PRIu64 ") gfid: %s",
+                                     dentry->name, fnieo->parent,
+                                     uuid_utoa (inode->gfid));
+                }
                 pthread_mutex_lock (&priv->invalidate_mutex);
                 {
                         list_add_tail (&node->next, &priv->invalidate_list);
@@ -330,23 +349,10 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
                 }
                 pthread_mutex_unlock (&priv->invalidate_mutex);
 
-                gf_log ("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE entry: "
-                        "%"PRIu64"/%s", fnieo->parent, dentry->name);
-
-                if (dentry->parent) {
-                        fuse_log_eh (this, "Invalidated entry %s (parent: %s)",
-                                     dentry->name,
-                                     uuid_utoa (dentry->parent->gfid));
-                } else {
-                        fuse_log_eh (this, "Invalidated entry %s(nodeid: %" PRIu64 ")",
-                                     dentry->name, fnieo->parent);
-                }
         }
-
-        if (inode)
-                inode_unref (inode);
+#endif /* KERNEL_VERSION */
+        return;
 }
-#endif
 
 /*
  * Send an inval inode notification to fuse. This causes an invalidation of the
@@ -367,6 +373,10 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
         if (!priv->reverse_fuse_thread_started)
                 return;
 
+        inode = (inode_t *)(unsigned long)fuse_ino;
+        if (inode == NULL)
+                return;
+
         node = GF_CALLOC (1, sizeof (*node), gf_fuse_mt_invalidate_node_t);
         if (node == NULL)
                 return;
@@ -386,7 +396,11 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
         fniio->off = 0;
         fniio->len = -1;
 
-        inode = fuse_ino_to_inode (fuse_ino, this);
+        fuse_log_eh(this, "Invalidated inode %" PRIu64 " (gfid: %s)", fuse_ino,
+                    uuid_utoa(inode->gfid));
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "INVALIDATE inode: %" PRIu64 "(gfid:%s)", fuse_ino,
+               uuid_utoa(inode->gfid));
 
         pthread_mutex_lock (&priv->invalidate_mutex);
         {
@@ -395,24 +409,23 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
         }
         pthread_mutex_unlock (&priv->invalidate_mutex);
 
-        gf_log ("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE inode: %" PRIu64,
-                fuse_ino);
-
-        if (inode) {
-                fuse_log_eh (this, "Invalidated inode %" PRIu64 " (gfid: %s)",
-                             fuse_ino, uuid_utoa (inode->gfid));
-        } else {
-                fuse_log_eh (this, "Invalidated inode %" PRIu64, fuse_ino);
-        }
-
-        if (inode)
-                inode_unref (inode);
 #else
 	gf_log ("glusterfs-fuse", GF_LOG_WARNING,
-		"fuse_invalidate_inode not implemented on OS X due to missing FUSE notification");
+		"fuse_invalidate_inode not implemented on this system");
 #endif
+        return;
 }
 
+#if FUSE_KERNEL_MINOR_VERSION >= 11
+/* Adapter with the inode_t * signature the inode-table invalidator needs; passes the pointer on as the uint64_t fuse_ino */
+static int32_t
+fuse_inode_invalidate_fn(xlator_t *this, inode_t *inode)
+{
+    fuse_invalidate_entry(this, (uint64_t)(uintptr_t)inode);
+    return 0;
+}
+#endif
+
 
 int
 send_fuse_err (xlator_t *this, fuse_in_header_t *finh, int error)
@@ -686,11 +699,14 @@ do_forget(xlator_t *this, uint64_t unique, uint64_t nodeid, uint64_t nlookup)
 {
 	inode_t *fuse_inode = fuse_ino_to_inode(nodeid, this);
 
+        gf_log("fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": FORGET %" PRIu64 "/%" PRIu64 " gfid: (%s)", unique,
+               nodeid, nlookup, uuid_utoa(fuse_inode->gfid));
+
 	fuse_log_eh(this, "%"PRIu64": FORGET %"PRIu64"/%"PRIu64" gfid: (%s)",
 		    unique, nodeid, nlookup, uuid_utoa(fuse_inode->gfid));
 
-	inode_forget(fuse_inode, nlookup);
-	inode_unref(fuse_inode);
+	inode_forget_with_unref(fuse_inode, nlookup);
 }
 
 static void
@@ -705,10 +721,6 @@ fuse_forget (xlator_t *this, fuse_in_header_t *finh, void *msg,
                 return;
         }
 
-        gf_log ("glusterfs-fuse", GF_LOG_TRACE,
-                "%"PRIu64": FORGET %"PRIu64"/%"PRIu64,
-                finh->unique, finh->nodeid, ffi->nlookup);
-
 	do_forget(this, finh->unique, finh->nodeid, ffi->nlookup);
 
         GF_FREE (finh);
@@ -4940,7 +4952,9 @@ fuse_thread_proc (void *data)
         fuse_in_header_t         *finh = NULL;
         struct iovec              iov_in[2];
         void                     *msg = NULL;
-        const size_t              msg0_size = sizeof (*finh) + 128;
+        /* Reserve 512 extra bytes for the BATCH_FORGET fop. Tests showed that
+           this reduces the number of 'REALLOC()' calls in the loop */
+        const size_t              msg0_size = sizeof (*finh) + 512;
         fuse_handler_t          **fuse_ops = NULL;
         struct pollfd             pfd[2] = {{0,}};
 
@@ -5283,7 +5297,12 @@ fuse_graph_setup (xlator_t *this, glusterfs_graph_t *graph)
                         goto unlock;
                 }
 
-                itable = inode_table_new (0, graph->top);
+#if FUSE_KERNEL_MINOR_VERSION >= 11
+                itable = inode_table_with_invalidator(priv->lru_limit, graph->top,
+                                                      fuse_inode_invalidate_fn, this);
+#else
+                itable = inode_table_new(0, graph->top);
+#endif
                 if (!itable) {
                         ret = -1;
                         goto unlock;
@@ -5740,6 +5759,8 @@ init (xlator_t *this_xl)
                 }
         }
 
+        GF_OPTION_INIT("lru-limit", priv->lru_limit, uint32, cleanup_exit);
+
         GF_OPTION_INIT("event-history", priv->event_history, bool,
                        cleanup_exit);
 
@@ -6061,5 +6082,13 @@ struct volume_options options[] = {
           .max = 64,
           .description = "Sets fuse reader thread count.",
         },
+        {
+         .key = {"lru-limit"},
+         .type = GF_OPTION_TYPE_INT,
+         .default_value = "131072",
+         .min = 0,
+         .description = "makes glusterfs invalidate kernel inodes after "
+         "reaching this limit (0 means 'unlimited')",
+        },
         { .key = {NULL} },
 };
diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h
index 4ca76e9..4e32a7f 100644
--- a/xlators/mount/fuse/src/fuse-bridge.h
+++ b/xlators/mount/fuse/src/fuse-bridge.h
@@ -144,6 +144,9 @@ struct fuse_private {
         gf_boolean_t         mount_finished;
         gf_boolean_t         handle_graph_switch;
         pthread_cond_t       migrate_cond;
+
+        /* LRU Limit, if not set, default is 128k for now */
+        uint32_t lru_limit;
 };
 typedef struct fuse_private fuse_private_t;
 
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
index 817619e..9a0404f 100755
--- a/xlators/mount/fuse/utils/mount.glusterfs.in
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
@@ -245,6 +245,10 @@ start_glusterfs ()
         cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
     fi
 
+    if [ -n "$lru_limit" ]; then
+        cmd_line=$(echo "$cmd_line --lru-limit=$lru_limit");
+    fi
+
     if [ -n "$bg_qlen" ]; then
         cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
     fi
@@ -467,6 +471,9 @@ with_options()
         "gid-timeout")
             gid_timeout=$value
             ;;
+        "lru-limit")
+            lru_limit=$value
+            ;;
         "background-qlen")
             bg_qlen=$value
             ;;
-- 
1.8.3.1