887953
From 55e67fb41ae3b4388839723ac929cd239280a0fc Mon Sep 17 00:00:00 2001
887953
From: Amar Tumballi <amarts@redhat.com>
887953
Date: Thu, 7 Feb 2019 18:06:43 +0530
887953
Subject: [PATCH 522/529] fuse: add --lru-limit option
887953
887953
The inode LRU mechanism is moot in fuse xlator (ie. there is no
887953
limit for the LRU list), as fuse inodes are referenced from
887953
kernel context, and thus they can only be dropped on request of
887953
the kernel. This might result in a high number of passive
887953
inodes which are useless for the glusterfs client, causing a
887953
significant memory overhead.
887953
887953
This change tries to remedy this by extending the LRU semantics
887953
and allowing to set a finite limit on the fuse inode LRU.
887953
887953
A brief history of problem:
887953
887953
When gluster's inode table was designed, fuse didn't have any
887953
'invalidate' method, which means, userspace application could
887953
never ask kernel to send a 'forget()' fop, instead had to wait
887953
for kernel to send it based on kernel's parameters. Inode table
887953
remembers the number of times kernel has cached the inode based
887953
on the 'nlookup' parameter. And 'nlookup' field is not used by
887953
any other entry points (like server-protocol, gfapi etc).
887953
887953
Hence the inode_table of fuse module always has to have lru-limit
887953
as '0', which means no limit. GlusterFS always had to keep all
887953
inodes in memory as kernel would have had a reference to it.
887953
Again, the reason for this is, kernel's glusterfs inode reference
887953
was pointer of 'inode_t' structure in glusterfs. As it is a
887953
pointer, we could never free it (to prevent segfault, or memory
887953
corruption).
887953
887953
Solution:
887953
887953
In the inode table, handle the prune case of inodes with 'nlookup'
887953
differently, and call an 'invalidator' method, which in this case is
887953
fuse_invalidate(), and it sends the request to kernel for getting
887953
the forget request.
887953
887953
When the kernel sends the forget, it means, it has dropped all
887953
the references to the inode, and it will send the forget with the
887953
'nlookup' parameter too. We just need to make sure to reduce the
887953
'nlookup' value we have when we get forget. That automatically
887953
causes the relevant prune to happen.
887953
887953
Credits: Csaba Henk, Xavier Hernandez, Raghavendra Gowdappa, Nithya B
887953
887953
Upstream:
887953
> URL: https://review.gluster.org/19778
887953
887953
BUG: 1511779
887953
Change-Id: Iabe22a62e0f819b7eb67d4ecb850dd559b0c937f
887953
Signed-off-by: Amar Tumballi <amarts@redhat.com>
887953
Reviewed-on: https://code.engineering.redhat.com/gerrit/162494
887953
Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
887953
Tested-by: RHGS Build Bot <nigelb@redhat.com>
887953
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
887953
---
887953
 doc/mount.glusterfs.8                       |   4 +
887953
 glusterfsd/src/glusterfsd.c                 |  24 +++
887953
 glusterfsd/src/glusterfsd.h                 |   1 +
887953
 libglusterfs/src/glusterfs.h                |   1 +
887953
 libglusterfs/src/inode.c                    | 256 ++++++++++++++++++++++++----
887953
 libglusterfs/src/inode.h                    |  17 +-
887953
 tests/features/fuse-lru-limit.t             |  42 +++++
887953
 xlators/mount/fuse/src/fuse-bridge.c        | 121 ++++++++-----
887953
 xlators/mount/fuse/src/fuse-bridge.h        |   3 +
887953
 xlators/mount/fuse/utils/mount.glusterfs.in |   7 +
887953
 10 files changed, 393 insertions(+), 83 deletions(-)
887953
 create mode 100644 tests/features/fuse-lru-limit.t
887953
887953
diff --git a/doc/mount.glusterfs.8 b/doc/mount.glusterfs.8
887953
index 95aad02..ed6b410 100644
887953
--- a/doc/mount.glusterfs.8
887953
+++ b/doc/mount.glusterfs.8
887953
@@ -119,6 +119,10 @@ Provide list of backup volfile servers in the following format [default: None]
887953
 \fBDeprecated\fR option - placed here for backward compatibility [default: 1]
887953
 .TP
887953
 .TP
887953
+\fBlru-limit=\fRN
887953
+Set fuse module's limit for number of inodes kept in LRU list to N [default: 131072]
887953
+.TP
887953
+.TP
887953
 \fBbackground-qlen=\fRN
887953
 Set fuse module's background queue length to N [default: 64]
887953
 .TP
887953
diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c
887953
index 990036c..2e2cd77 100644
887953
--- a/glusterfsd/src/glusterfsd.c
887953
+++ b/glusterfsd/src/glusterfsd.c
887953
@@ -203,6 +203,9 @@ static struct argp_option gf_options[] = {
887953
 	 "[default: 300]"},
887953
         {"resolve-gids", ARGP_RESOLVE_GIDS_KEY, 0, 0,
887953
          "Resolve all auxiliary groups in fuse translator (max 32 otherwise)"},
887953
+        {"lru-limit", ARGP_FUSE_LRU_LIMIT_KEY, "N", 0,
887953
+         "Set fuse module's limit for number of inodes kept in LRU list to N "
887953
+         "[default: 131072]"},
887953
 	{"background-qlen", ARGP_FUSE_BACKGROUND_QLEN_KEY, "N", 0,
887953
 	 "Set fuse module's background queue length to N "
887953
 	 "[default: 64]"},
887953
@@ -462,6 +465,15 @@ set_fuse_mount_options (glusterfs_ctx_t *ctx, dict_t *options)
887953
                 }
887953
         }
887953
 
887953
+        if (cmd_args->lru_limit >= 0) {
887953
+                ret = dict_set_int32(options, "lru-limit", cmd_args->lru_limit);
887953
+                if (ret < 0) {
887953
+                        gf_msg("glusterfsd", GF_LOG_ERROR, 0, glusterfsd_msg_4,
887953
+                               "lru-limit");
887953
+                        goto err;
887953
+                }
887953
+        }
887953
+
887953
 	if (cmd_args->background_qlen) {
887953
 		ret = dict_set_int32 (options, "background-qlen",
887953
                                       cmd_args->background_qlen);
887953
@@ -1169,6 +1181,13 @@ parse_opts (int key, char *arg, struct argp_state *state)
887953
                 cmd_args->resolve_gids = 1;
887953
                 break;
887953
 
887953
+        case ARGP_FUSE_LRU_LIMIT_KEY:
887953
+                if (!gf_string2int32(arg, &cmd_args->lru_limit))
887953
+                         break;
887953
+
887953
+                argp_failure(state, -1, 0, "unknown LRU limit option %s", arg);
887953
+                break;
887953
+
887953
         case ARGP_FUSE_BACKGROUND_QLEN_KEY:
887953
                 if (!gf_string2int (arg, &cmd_args->background_qlen))
887953
                         break;
887953
@@ -1937,6 +1956,11 @@ parse_cmdline (int argc, char *argv[], glusterfs_ctx_t *ctx)
887953
                 ctx->ssl_cert_depth = glusterfs_read_secure_access_file ();
887953
         }
887953
 
887953
+        /* Need to set lru_limit to below 0 to indicate there was nothing
887953
+           specified. This is needed as 0 is a valid option, and may not be
887953
+           default value. */
887953
+        cmd_args->lru_limit = -1;
887953
+
887953
         argp_parse (&argp, argc, argv, ARGP_IN_ORDER, NULL, cmd_args);
887953
         if (cmd_args->print_netgroups) {
887953
                 /* When this option is set we don't want to do anything else
887953
diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h
887953
index 75cb1d8..1550a30 100644
887953
--- a/glusterfsd/src/glusterfsd.h
887953
+++ b/glusterfsd/src/glusterfsd.h
887953
@@ -100,6 +100,7 @@ enum argp_option_keys {
887953
         ARGP_SUBDIR_MOUNT_KEY             = 178,
887953
         ARGP_FUSE_EVENT_HISTORY_KEY       = 179,
887953
         ARGP_READER_THREAD_COUNT_KEY      = 180,
887953
+        ARGP_FUSE_LRU_LIMIT_KEY           = 190,
887953
 };
887953
 
887953
 struct _gfd_vol_top_priv {
887953
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
887953
index 157437c..2690306 100644
887953
--- a/libglusterfs/src/glusterfs.h
887953
+++ b/libglusterfs/src/glusterfs.h
887953
@@ -413,6 +413,7 @@ struct _cmd_args {
887953
         pid_t              client_pid;
887953
         int                client_pid_set;
887953
         unsigned           uid_map_root;
887953
+        int32_t            lru_limit;
887953
         int                background_qlen;
887953
         int                congestion_threshold;
887953
         char              *fuse_mountopts;
887953
diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c
887953
index 29d3c8f..f57020a 100644
887953
--- a/libglusterfs/src/inode.c
887953
+++ b/libglusterfs/src/inode.c
887953
@@ -24,6 +24,100 @@
887953
    move latest accessed dentry to list_head of inode
887953
 */
887953
 
887953
+/* clang-format off */
887953
+/*
887953
+
887953
+Details as per Xavi:
887953
+
887953
+ I think we should have 3 lists: active, lru and invalidate.
887953
+
887953
+We'll need 3 things: refs, nlookups and invalidate_sent flag. Any change of
887953
+refs, invalidate_sent flag and moving from one list to another must be done
887953
+atomically.
887953
+
887953
+With this information, these are the states that cause a transition:
887953
+
887953
+    refs nlookups inv_sent    op
887953
+      1      0        0      unref  -> refs = 0, active--->destroy
887953
+      1      1        0      unref  -> refs = 0, active--->lru
887953
+      1      1        0     forget  -> nlookups = 0, active--->active
887953
+     *0      1        0     forget  -> nlookups = 0, lru--->destroy
887953
+     *0      1        1     forget  -> nlookups = 0, invalidate--->destroy
887953
+      0      1        0       ref   -> refs = 1, lru--->active
887953
+      0      1        1       ref   -> refs = 1, inv_sent = 0, invalidate--->active
887953
+      0      1        0    overflow -> refs = 1, inv_sent = 1, lru--->invalidate
887953
+      1      1        1      unref  -> refs = 0, invalidate--->invalidate
887953
+      1      1        1     forget  -> nlookups = 0, inv_sent = 0, invalidate--->active
887953
+
887953
+(*) technically these combinations cannot happen because a forget sent by the
887953
+kernel first calls ref() and then unref(). However it's equivalent.
887953
+
887953
+overflow means that lru list has grown beyond the limit and the inode needs to
887953
+be invalidated. All other combinations do not cause a change in state or are not
887953
+possible.
887953
+
887953
+Based on this, the code could be similar to this:
887953
+
887953
+    ref(inode, inv)
887953
+    {
887953
+        if (refs == 0) {
887953
+            if (inv_sent) {
887953
+                invalidate_count--;
887953
+                inv_sent = 0;
887953
+            } else {
887953
+                lru_count--;
887953
+            }
887953
+            if (inv) {
887953
+                inv_sent = 1;
887953
+                invalidate_count++;
887953
+                list_move(inode, invalidate);
887953
+            } else {
887953
+                active_count++;
887953
+                list_move(inode, active);
887953
+            }
887953
+        }
887953
+        refs++;
887953
+    }
887953
+
887953
+    unref(inode, clear)
887953
+    {
887953
+        if (clear && inv_sent) {
887953
+            // there is a case of fuse itself sending forget, without
887953
+            // invalidate, after entry delete, like unlink(), rmdir().
887953
+            inv_sent = 0;
887953
+            invalidate_count--;
887953
+            active_count++;
887953
+            list_move(inode, active);
887953
+        }
887953
+        refs--;
887953
+        if ((refs == 0) && !inv_sent) {
887953
+            active_count--;
887953
+            if (nlookups == 0) {
887953
+                destroy(inode);
887953
+            } else {
887953
+                lru_count++;
887953
+                list_move(inode, lru);
887953
+            }
887953
+        }
887953
+    }
887953
+
887953
+    forget(inode)
887953
+    {
887953
+        ref(inode, false);
887953
+        nlookups--;
887953
+        unref(inode, true);
887953
+    }
887953
+
887953
+    overflow(inode)
887953
+    {
887953
+        ref(inode, true);
887953
+        invalidator(inode);
887953
+        unref(inode, false);
887953
+    }
887953
+
887953
+*/
887953
+/* clang-format on */
887953
+
887953
 #define INODE_DUMP_LIST(head, key_buf, key_prefix, list_type)           \
887953
         {                                                               \
887953
                 int i = 1;                                              \
887953
@@ -37,7 +131,7 @@
887953
         }
887953
 
887953
 static inode_t *
887953
-__inode_unref (inode_t *inode);
887953
+__inode_unref (inode_t *inode, gf_boolean_t clear);
887953
 
887953
 static int
887953
 inode_table_prune (inode_table_t *table);
887953
@@ -138,7 +232,7 @@ __dentry_unset (dentry_t *dentry)
887953
         dentry->name = NULL;
887953
 
887953
         if (dentry->parent) {
887953
-                __inode_unref (dentry->parent);
887953
+                __inode_unref (dentry->parent, _gf_false);
887953
                 dentry->parent = NULL;
887953
         }
887953
 
887953
@@ -465,7 +559,7 @@ out:
887953
 
887953
 
887953
 static inode_t *
887953
-__inode_unref (inode_t *inode)
887953
+__inode_unref (inode_t *inode, gf_boolean_t clear)
887953
 {
887953
         int       index = 0;
887953
         xlator_t *this  = NULL;
887953
@@ -473,8 +567,6 @@ __inode_unref (inode_t *inode)
887953
         if (!inode)
887953
                 return NULL;
887953
 
887953
-        this = THIS;
887953
-
887953
         /*
887953
          * Root inode should always be in active list of inode table. So unrefs
887953
          * on root inode are no-ops.
887953
@@ -482,6 +574,14 @@ __inode_unref (inode_t *inode)
887953
         if (__is_root_gfid(inode->gfid))
887953
                 return inode;
887953
 
887953
+        this = THIS;
887953
+
887953
+        if (clear && inode->invalidate_sent) {
887953
+                inode->invalidate_sent = _gf_false;
887953
+                inode->table->invalidate_size--;
887953
+                __inode_activate(inode);
887953
+        }
887953
+
887953
         GF_ASSERT (inode->ref);
887953
 
887953
         --inode->ref;
887953
@@ -492,7 +592,7 @@ __inode_unref (inode_t *inode)
887953
                 inode->_ctx[index].ref--;
887953
         }
887953
 
887953
-        if (!inode->ref) {
887953
+        if (!inode->ref && !inode->invalidate_sent) {
887953
                 inode->table->active_size--;
887953
 
887953
                 if (inode->nlookup)
887953
@@ -506,7 +606,7 @@ __inode_unref (inode_t *inode)
887953
 
887953
 
887953
 static inode_t *
887953
-__inode_ref (inode_t *inode)
887953
+__inode_ref (inode_t *inode, gf_boolean_t is_invalidate)
887953
 {
887953
         int       index = 0;
887953
         xlator_t *this  = NULL;
887953
@@ -516,11 +616,6 @@ __inode_ref (inode_t *inode)
887953
 
887953
         this = THIS;
887953
 
887953
-        if (!inode->ref) {
887953
-                inode->table->lru_size--;
887953
-                __inode_activate (inode);
887953
-        }
887953
-
887953
         /*
887953
          * Root inode should always be in active list of inode table. So unrefs
887953
          * on root inode are no-ops. If we do not allow unrefs but allow refs,
887953
@@ -532,6 +627,22 @@ __inode_ref (inode_t *inode)
887953
         if (__is_root_gfid(inode->gfid) && inode->ref)
887953
                 return inode;
887953
 
887953
+        if (!inode->ref) {
887953
+                if (inode->invalidate_sent) {
887953
+                        inode->invalidate_sent = _gf_false;
887953
+                        inode->table->invalidate_size--;
887953
+                } else {
887953
+                        inode->table->lru_size--;
887953
+                }
887953
+                if (is_invalidate) {
887953
+                        inode->invalidate_sent = _gf_true;
887953
+                        inode->table->invalidate_size++;
887953
+                        list_move_tail(&inode->list, &inode->table->invalidate);
887953
+                } else {
887953
+                        __inode_activate(inode);
887953
+                }
887953
+        }
887953
+
887953
         inode->ref++;
887953
 
887953
         index = __inode_get_xl_index (inode, this);
887953
@@ -556,7 +667,7 @@ inode_unref (inode_t *inode)
887953
 
887953
         pthread_mutex_lock (&table->lock);
887953
         {
887953
-                inode = __inode_unref (inode);
887953
+                inode = __inode_unref (inode, _gf_false);
887953
         }
887953
         pthread_mutex_unlock (&table->lock);
887953
 
887953
@@ -578,7 +689,7 @@ inode_ref (inode_t *inode)
887953
 
887953
         pthread_mutex_lock (&table->lock);
887953
         {
887953
-                inode = __inode_ref (inode);
887953
+                inode = __inode_ref (inode, _gf_false);
887953
         }
887953
         pthread_mutex_unlock (&table->lock);
887953
 
887953
@@ -614,7 +725,7 @@ __dentry_create (inode_t *inode, inode_t *parent, const char *name)
887953
         }
887953
 
887953
         if (parent)
887953
-                newd->parent = __inode_ref (parent);
887953
+                newd->parent = __inode_ref (parent, _gf_false);
887953
 
887953
         list_add (&newd->inode_list, &inode->dentry_list);
887953
         newd->inode = inode;
887953
@@ -685,7 +796,7 @@ inode_new (inode_table_t *table)
887953
         {
887953
                 inode = __inode_create (table);
887953
                 if (inode != NULL) {
887953
-                        __inode_ref (inode);
887953
+                       __inode_ref (inode, _gf_false);
887953
                 }
887953
         }
887953
         pthread_mutex_unlock (&table->lock);
887953
@@ -802,7 +913,7 @@ inode_grep (inode_table_t *table, inode_t *parent, const char *name)
887953
                         inode = dentry->inode;
887953
 
887953
                 if (inode)
887953
-                        __inode_ref (inode);
887953
+                        __inode_ref (inode, _gf_false);
887953
         }
887953
         pthread_mutex_unlock (&table->lock);
887953
 
887953
@@ -947,7 +1058,7 @@ inode_find (inode_table_t *table, uuid_t gfid)
887953
         {
887953
                 inode = __inode_find (table, gfid);
887953
                 if (inode)
887953
-                        __inode_ref (inode);
887953
+                        __inode_ref (inode, _gf_false);
887953
         }
887953
         pthread_mutex_unlock (&table->lock);
887953
 
887953
@@ -1096,7 +1207,7 @@ inode_link (inode_t *inode, inode_t *parent, const char *name,
887953
                 linked_inode = __inode_link (inode, parent, name, iatt);
887953
 
887953
                 if (linked_inode)
887953
-                        __inode_ref (linked_inode);
887953
+                        __inode_ref (linked_inode, _gf_false);
887953
         }
887953
         pthread_mutex_unlock (&table->lock);
887953
 
887953
@@ -1178,6 +1289,31 @@ inode_forget (inode_t *inode, uint64_t nlookup)
887953
         return 0;
887953
 }
887953
 
887953
+int
887953
+inode_forget_with_unref(inode_t *inode, uint64_t nlookup)
887953
+{
887953
+    inode_table_t *table = NULL;
887953
+
887953
+    if (!inode) {
887953
+            gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND,
887953
+                             "inode not found");
887953
+            return -1;
887953
+    }
887953
+
887953
+    table = inode->table;
887953
+
887953
+    pthread_mutex_lock(&table->lock);
887953
+    {
887953
+            __inode_forget(inode, nlookup);
887953
+            __inode_unref(inode, _gf_true);
887953
+    }
887953
+    pthread_mutex_unlock(&table->lock);
887953
+
887953
+    inode_table_prune(table);
887953
+
887953
+    return 0;
887953
+}
887953
+
887953
 /*
887953
  * Invalidate an inode. This is invoked when a translator decides that an inode's
887953
  * cache is no longer valid. Any translator interested in taking action in this
887953
@@ -1356,7 +1492,7 @@ inode_parent (inode_t *inode, uuid_t pargfid, const char *name)
887953
                         parent = dentry->parent;
887953
 
887953
                 if (parent)
887953
-                        __inode_ref (parent);
887953
+                        __inode_ref (parent, _gf_false);
887953
         }
887953
         pthread_mutex_unlock (&table->lock);
887953
 
887953
@@ -1540,6 +1676,7 @@ inode_table_prune (inode_table_t *table)
887953
         inode_t          *del = NULL;
887953
         inode_t          *tmp = NULL;
887953
         inode_t          *entry = NULL;
887953
+        int64_t           lru_size = 0;
887953
 
887953
         if (!table)
887953
                 return -1;
887953
@@ -1548,8 +1685,11 @@ inode_table_prune (inode_table_t *table)
887953
 
887953
         pthread_mutex_lock (&table->lock);
887953
         {
887953
-                while (table->lru_limit
887953
-                       && table->lru_size > (table->lru_limit)) {
887953
+                if (!table->lru_limit)
887953
+                        goto purge_list;
887953
+
887953
+                lru_size = table->lru_size;
887953
+                while (lru_size > (table->lru_limit)) {
887953
                         if (list_empty (&table->lru)) {
887953
                                 gf_msg_callingfn (THIS->name, GF_LOG_WARNING, 0,
887953
                                                   LG_MSG_INVALID_INODE_LIST,
887953
@@ -1559,7 +1699,18 @@ inode_table_prune (inode_table_t *table)
887953
                                 break;
887953
                         }
887953
 
887953
+                        lru_size--;
887953
                         entry = list_entry (table->lru.next, inode_t, list);
887953
+                        /* The logic of invalidation is required only if invalidator_fn
887953
+                           is present */
887953
+                        if (table->invalidator_fn) {
887953
+                          /* check for valid inode with 'nlookup' */
887953
+                          if (entry->nlookup) {
887953
+                            __inode_ref(entry, _gf_true);
887953
+                            tmp = entry;
887953
+                            break;
887953
+                          }
887953
+                        }
887953
 
887953
                         table->lru_size--;
887953
                         __inode_retire (entry);
887953
@@ -1567,17 +1718,25 @@ inode_table_prune (inode_table_t *table)
887953
                         ret++;
887953
                 }
887953
 
887953
+        purge_list:
887953
                 list_splice_init (&table->purge, &purge);
887953
                 table->purge_size = 0;
887953
         }
887953
         pthread_mutex_unlock (&table->lock);
887953
 
887953
-        {
887953
-                list_for_each_entry_safe (del, tmp, &purge, list) {
887953
-                        list_del_init (&del->list);
887953
-                        __inode_forget (del, 0);
887953
-                        __inode_destroy (del);
887953
-                }
887953
+        /* Pick 1 inode for invalidation */
887953
+        if (tmp) {
887953
+          xlator_t *old_THIS = THIS;
887953
+          THIS = table->invalidator_xl;
887953
+          table->invalidator_fn(table->invalidator_xl, tmp);
887953
+          THIS = old_THIS;
887953
+          inode_unref(tmp);
887953
+        }
887953
+
887953
+        list_for_each_entry_safe (del, tmp, &purge, list) {
887953
+                list_del_init (&del->list);
887953
+                __inode_forget (del, 0);
887953
+                __inode_destroy (del);
887953
         }
887953
 
887953
         return ret;
887953
@@ -1605,9 +1764,12 @@ __inode_table_init_root (inode_table_t *table)
887953
 
887953
 
887953
 inode_table_t *
887953
-inode_table_new (size_t lru_limit, xlator_t *xl)
887953
+inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl,
887953
+                             int32_t (*invalidator_fn)(xlator_t *, inode_t *),
887953
+                             xlator_t *invalidator_xl)
887953
 {
887953
         inode_table_t *new = NULL;
887953
+        uint32_t mem_pool_size = lru_limit;
887953
         int            ret = -1;
887953
         int            i = 0;
887953
 
887953
@@ -1619,20 +1781,19 @@ inode_table_new (size_t lru_limit, xlator_t *xl)
887953
         new->ctxcount = xl->graph->xl_count + 1;
887953
 
887953
         new->lru_limit = lru_limit;
887953
+        new->invalidator_fn = invalidator_fn;
887953
+        new->invalidator_xl = invalidator_xl;
887953
 
887953
         new->hashsize = 14057; /* TODO: Random Number?? */
887953
 
887953
-        /* In case FUSE is initing the inode table. */
887953
-        if (lru_limit == 0)
887953
-                lru_limit = DEFAULT_INODE_MEMPOOL_ENTRIES;
887953
-
887953
-        new->inode_pool = mem_pool_new (inode_t, lru_limit);
887953
+        if (!mem_pool_size || (mem_pool_size > DEFAULT_INODE_MEMPOOL_ENTRIES))
887953
+                mem_pool_size = DEFAULT_INODE_MEMPOOL_ENTRIES;
887953
 
887953
+        new->inode_pool = mem_pool_new(inode_t, mem_pool_size);
887953
         if (!new->inode_pool)
887953
                 goto out;
887953
 
887953
-        new->dentry_pool = mem_pool_new (dentry_t, lru_limit);
887953
-
887953
+        new->dentry_pool = mem_pool_new (dentry_t, mem_pool_size);
887953
         if (!new->dentry_pool)
887953
                 goto out;
887953
 
887953
@@ -1667,6 +1828,7 @@ inode_table_new (size_t lru_limit, xlator_t *xl)
887953
         INIT_LIST_HEAD (&new->active);
887953
         INIT_LIST_HEAD (&new->lru);
887953
         INIT_LIST_HEAD (&new->purge);
887953
+        INIT_LIST_HEAD(&new->invalidate);
887953
 
887953
         ret = gf_asprintf (&new->name, "%s/inode", xl->name);
887953
         if (-1 == ret) {
887953
@@ -1696,6 +1858,14 @@ out:
887953
         return new;
887953
 }
887953
 
887953
+inode_table_t *
887953
+inode_table_new(uint32_t lru_limit, xlator_t *xl)
887953
+{
887953
+        /* Only fuse for now requires the inode table with invalidator */
887953
+        return inode_table_with_invalidator(lru_limit, xl, NULL, NULL);
887953
+}
887953
+
887953
+
887953
 int
887953
 inode_table_ctx_free (inode_table_t *table)
887953
 {
887953
@@ -1830,6 +2000,15 @@ inode_table_destroy (inode_table_t *inode_table) {
887953
                         inode_table->lru_size--;
887953
                 }
887953
 
887953
+                /* Same logic for invalidate list */
887953
+                while (!list_empty(&inode_table->invalidate)) {
887953
+                        trav = list_first_entry(&inode_table->invalidate,
887953
+                                                inode_t, list);
887953
+                        __inode_forget(trav, 0);
887953
+                        __inode_retire(trav);
887953
+                        inode_table->invalidate_size--;
887953
+                }
887953
+
887953
                 while (!list_empty (&inode_table->active)) {
887953
                         trav = list_first_entry (&inode_table->active,
887953
                                                  inode_t, list);
887953
@@ -2347,6 +2526,8 @@ inode_dump (inode_t *inode, char *prefix)
887953
                 gf_proc_dump_write("active-fd-count", "%u",
887953
                                    inode->active_fd_count);
887953
                 gf_proc_dump_write("ref", "%u", inode->ref);
887953
+                gf_proc_dump_write("invalidate-sent", "%d",
887953
+                                   inode->invalidate_sent);
887953
                 gf_proc_dump_write("ia_type", "%d", inode->ia_type);
887953
                 if (inode->_ctx) {
887953
                         inode_ctx = GF_CALLOC (inode->table->ctxcount,
887953
@@ -2427,10 +2608,13 @@ inode_table_dump (inode_table_t *itable, char *prefix)
887953
         gf_proc_dump_write(key, "%d", itable->lru_size);
887953
         gf_proc_dump_build_key(key, prefix, "purge_size");
887953
         gf_proc_dump_write(key, "%d", itable->purge_size);
887953
+        gf_proc_dump_build_key(key, prefix, "invalidate_size");
887953
+        gf_proc_dump_write(key, "%d", itable->invalidate_size);
887953
 
887953
         INODE_DUMP_LIST(&itable->active, key, prefix, "active");
887953
         INODE_DUMP_LIST(&itable->lru, key, prefix, "lru");
887953
         INODE_DUMP_LIST(&itable->purge, key, prefix, "purge");
887953
+        INODE_DUMP_LIST(&itable->invalidate, key, prefix, "invalidate");
887953
 
887953
         pthread_mutex_unlock(&itable->lock);
887953
 }
887953
diff --git a/libglusterfs/src/inode.h b/libglusterfs/src/inode.h
887953
index 7a87748..6a96447 100644
887953
--- a/libglusterfs/src/inode.h
887953
+++ b/libglusterfs/src/inode.h
887953
@@ -55,6 +55,13 @@ struct _inode_table {
887953
         struct mem_pool   *dentry_pool; /* memory pool for dentrys */
887953
         struct mem_pool   *fd_mem_pool; /* memory pool for fd_t */
887953
         int                ctxcount;    /* number of slots in inode->ctx */
887953
+
887953
+        /* This is required for 'invalidation' when 'nlookup' would be used,
887953
+           specially in case of fuse-bridge */
887953
+        int32_t (*invalidator_fn)(xlator_t *, inode_t *);
887953
+        xlator_t *invalidator_xl;
887953
+        struct list_head invalidate; /* inodes which are in invalidation queue */
887953
+        uint32_t invalidate_size;    /* count of inodes in invalidation list */
887953
 };
887953
 
887953
 
887953
@@ -102,6 +109,7 @@ struct _inode {
887953
         struct list_head     list;          /* active/lru/purge */
887953
 
887953
         struct _inode_ctx   *_ctx;    /* replacement for dict_t *(inode->ctx) */
887953
+        gf_boolean_t invalidate_sent; /* Set it if invalidator_fn is called for inode */
887953
 };
887953
 
887953
 
887953
@@ -110,7 +118,14 @@ struct _inode {
887953
 #define GFID_STR_PFX_LEN (sizeof (GFID_STR_PFX) - 1)
887953
 
887953
 inode_table_t *
887953
-inode_table_new (size_t lru_limit, xlator_t *xl);
887953
+inode_table_new(uint32_t lru_limit, xlator_t *xl);
887953
+
887953
+inode_table_t *
887953
+inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl,
887953
+                             int32_t (*invalidator_fn)(xlator_t *, inode_t *),
887953
+                             xlator_t *invalidator_xl);
887953
+int
887953
+inode_forget_with_unref(inode_t *inode, uint64_t nlookup);
887953
 
887953
 void
887953
 inode_table_destroy_all (glusterfs_ctx_t *ctx);
887953
diff --git a/tests/features/fuse-lru-limit.t b/tests/features/fuse-lru-limit.t
887953
new file mode 100644
887953
index 0000000..9f12116
887953
--- /dev/null
887953
+++ b/tests/features/fuse-lru-limit.t
887953
@@ -0,0 +1,42 @@
887953
+#!/bin/bash
887953
+
887953
+. $(dirname $0)/../include.rc
887953
+. $(dirname $0)/../volume.rc
887953
+
887953
+cleanup
887953
+
887953
+TEST glusterd
887953
+TEST pidof glusterd
887953
+TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1}
887953
+TEST $CLI volume start $V0
887953
+TEST glusterfs -s $H0 --volfile-id $V0 $M0
887953
+
887953
+EXPECT "1" get_mount_active_size_value $V0 $M0
887953
+EXPECT "0" get_mount_lru_size_value $V0 $M0
887953
+
887953
+mkdir ${M0}/dir-{1..9}
887953
+for i in {1..9}; do
887953
+    for j in {1..1000}; do
887953
+        echo "Test file" > ${M0}/dir-$i/file-$j;
887953
+    done;
887953
+done
887953
+lc=$(get_mount_lru_size_value $V0 ${M0})
887953
+# ideally it should be 9000+
887953
+TEST [ $lc -ge 9000 ]
887953
+
887953
+TEST umount $M0
887953
+
887953
+TEST glusterfs -s $H0 --volfile-id $V0 --lru-limit 1000 $M0
887953
+
887953
+TEST find $M0
887953
+lc=$(get_mount_lru_size_value $V0 ${M0})
887953
+# ideally it should be <1000
887953
+# Not sure if there are any possibilities of buffer need.
887953
+TEST [ $lc -le 1000 ]
887953
+
887953
+TEST rm -rf $M0/*
887953
+
887953
+EXPECT "1" get_mount_active_size_value $V0 $M0
887953
+EXPECT "0" get_mount_lru_size_value $V0 $M0
887953
+
887953
+cleanup
887953
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
887953
index 8d1e3a0..f3188d6 100644
887953
--- a/xlators/mount/fuse/src/fuse-bridge.c
887953
+++ b/xlators/mount/fuse/src/fuse-bridge.c
887953
@@ -279,29 +279,31 @@ send_fuse_data (xlator_t *this, fuse_in_header_t *finh, void *data, size_t size)
887953
         send_fuse_data (this, finh, obj, sizeof (*(obj)))
887953
 
887953
 
887953
-#if FUSE_KERNEL_MINOR_VERSION >= 11
887953
 static void
887953
 fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
887953
 {
887953
+#if FUSE_KERNEL_MINOR_VERSION >= 11
887953
         struct fuse_out_header             *fouh   = NULL;
887953
         struct fuse_notify_inval_entry_out *fnieo  = NULL;
887953
         fuse_private_t                     *priv   = NULL;
887953
         dentry_t                           *dentry = NULL;
887953
+        dentry_t                           *tmp = NULL;
887953
         inode_t                            *inode  = NULL;
887953
         size_t                              nlen   = 0;
887953
         fuse_invalidate_node_t             *node   = NULL;
887953
+        char gfid_str[UUID_CANONICAL_FORM_LEN + 1];
887953
 
887953
         priv = this->private;
887953
 
887953
         if (!priv->reverse_fuse_thread_started)
887953
                 return;
887953
 
887953
-        inode = fuse_ino_to_inode(fuse_ino, this);
887953
+        inode = (inode_t *)(unsigned long)fuse_ino;
887953
         if (inode == NULL) {
887953
                 return;
887953
         }
887953
 
887953
-        list_for_each_entry (dentry, &inode->dentry_list, inode_list) {
887953
+        list_for_each_entry_safe (dentry, tmp, &inode->dentry_list, inode_list) {
887953
                 node = GF_CALLOC (1, sizeof (*node),
887953
                                   gf_fuse_mt_invalidate_node_t);
887953
                 if (node == NULL)
887953
@@ -315,14 +317,31 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
887953
                 fouh->unique = 0;
887953
                 fouh->error = FUSE_NOTIFY_INVAL_ENTRY;
887953
 
887953
-                nlen = strlen (dentry->name);
887953
-                fouh->len = sizeof (*fouh) + sizeof (*fnieo) + nlen + 1;
887953
-                fnieo->parent = inode_to_fuse_nodeid (dentry->parent);
887953
+                if (dentry->name) {
887953
+                  nlen = strlen (dentry->name);
887953
+                  fouh->len = sizeof (*fouh) + sizeof (*fnieo) + nlen + 1;
887953
+                  fnieo->parent = inode_to_fuse_nodeid (dentry->parent);
887953
+
887953
+                  fnieo->namelen = nlen;
887953
+                  strcpy (node->inval_buf + sizeof (*fouh) + sizeof (*fnieo),
887953
+                          dentry->name);
887953
+                }
887953
 
887953
-                fnieo->namelen = nlen;
887953
-                strcpy (node->inval_buf + sizeof (*fouh) + sizeof (*fnieo),
887953
-                        dentry->name);
887953
+                gf_log ("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE entry: "
887953
+                        "%"PRIu64"/%s (gfid:%s)", fnieo->parent, dentry->name,
887953
+                        uuid_utoa(inode->gfid));
887953
 
887953
+                if (dentry->parent) {
887953
+                        fuse_log_eh (this, "Invalidated entry %s (parent: %s)"
887953
+                                     "(gfid: %s)", dentry->name,
887953
+                                     uuid_utoa (dentry->parent->gfid),
887953
+                                     uuid_utoa_r(inode->gfid, gfid_str));
887953
+                } else {
887953
+                        fuse_log_eh (this, "Invalidated entry %s(nodeid: %"
887953
+                                     PRIu64 ") gfid: %s",
887953
+                                     dentry->name, fnieo->parent,
887953
+                                     uuid_utoa (inode->gfid));
887953
+                }
887953
                 pthread_mutex_lock (&priv->invalidate_mutex);
887953
                 {
887953
                         list_add_tail (&node->next, &priv->invalidate_list);
887953
@@ -330,23 +349,10 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
887953
                 }
887953
                 pthread_mutex_unlock (&priv->invalidate_mutex);
887953
 
887953
-                gf_log ("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE entry: "
887953
-                        "%"PRIu64"/%s", fnieo->parent, dentry->name);
887953
-
887953
-                if (dentry->parent) {
887953
-                        fuse_log_eh (this, "Invalidated entry %s (parent: %s)",
887953
-                                     dentry->name,
887953
-                                     uuid_utoa (dentry->parent->gfid));
887953
-                } else {
887953
-                        fuse_log_eh (this, "Invalidated entry %s(nodeid: %" PRIu64 ")",
887953
-                                     dentry->name, fnieo->parent);
887953
-                }
887953
         }
887953
-
887953
-        if (inode)
887953
-                inode_unref (inode);
887953
+#endif /* KERNEL_VERSION */
887953
+        return;
887953
 }
887953
-#endif
887953
 
887953
 /*
887953
  * Send an inval inode notification to fuse. This causes an invalidation of the
887953
@@ -367,6 +373,10 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
887953
         if (!priv->reverse_fuse_thread_started)
887953
                 return;
887953
 
887953
+        inode = (inode_t *)(unsigned long)fuse_ino;
887953
+        if (inode == NULL)
887953
+                return;
887953
+
887953
         node = GF_CALLOC (1, sizeof (*node), gf_fuse_mt_invalidate_node_t);
887953
         if (node == NULL)
887953
                 return;
887953
@@ -386,7 +396,11 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
887953
         fniio->off = 0;
887953
         fniio->len = -1;
887953
 
887953
-        inode = fuse_ino_to_inode (fuse_ino, this);
887953
+        fuse_log_eh(this, "Invalidated inode %" PRIu64 " (gfid: %s)", fuse_ino,
887953
+                    uuid_utoa(inode->gfid));
887953
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
887953
+               "INVALIDATE inode: %" PRIu64 "(gfid:%s)", fuse_ino,
887953
+               uuid_utoa(inode->gfid));
887953
 
887953
         pthread_mutex_lock (&priv->invalidate_mutex);
887953
         {
887953
@@ -395,24 +409,23 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
887953
         }
887953
         pthread_mutex_unlock (&priv->invalidate_mutex);
887953
 
887953
-        gf_log ("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE inode: %" PRIu64,
887953
-                fuse_ino);
887953
-
887953
-        if (inode) {
887953
-                fuse_log_eh (this, "Invalidated inode %" PRIu64 " (gfid: %s)",
887953
-                             fuse_ino, uuid_utoa (inode->gfid));
887953
-        } else {
887953
-                fuse_log_eh (this, "Invalidated inode %" PRIu64, fuse_ino);
887953
-        }
887953
-
887953
-        if (inode)
887953
-                inode_unref (inode);
887953
 #else
887953
 	gf_log ("glusterfs-fuse", GF_LOG_WARNING,
887953
-		"fuse_invalidate_inode not implemented on OS X due to missing FUSE notification");
887953
+		"fuse_invalidate_inode not implemented on this system");
887953
 #endif
887953
+        return;
887953
 }
887953
 
887953
+#if FUSE_KERNEL_MINOR_VERSION >= 11
887953
+/* Adapter with the invalidator signature (inode_t *, not uint64_t); casts via unsigned long like the decode in fuse_invalidate_entry() */
887953
+static int32_t
887953
+fuse_inode_invalidate_fn(xlator_t *this, inode_t *inode)
887953
+{
887953
+    fuse_invalidate_entry(this, (uint64_t)(unsigned long)inode);
887953
+    return 0;
887953
+}
887953
+#endif
887953
+
887953
 
887953
 int
887953
 send_fuse_err (xlator_t *this, fuse_in_header_t *finh, int error)
887953
@@ -686,11 +699,14 @@ do_forget(xlator_t *this, uint64_t unique, uint64_t nodeid, uint64_t nlookup)
887953
 {
887953
 	inode_t *fuse_inode = fuse_ino_to_inode(nodeid, this);
887953
 
887953
+        gf_log("fuse", GF_LOG_TRACE,
887953
+               "%" PRIu64 ": FORGET %" PRIu64 "/%" PRIu64 " gfid: (%s)", unique,
887953
+               nodeid, nlookup, uuid_utoa(fuse_inode->gfid));
887953
+
887953
 	fuse_log_eh(this, "%"PRIu64": FORGET %"PRIu64"/%"PRIu64" gfid: (%s)",
887953
 		    unique, nodeid, nlookup, uuid_utoa(fuse_inode->gfid));
887953
 
887953
-	inode_forget(fuse_inode, nlookup);
887953
-	inode_unref(fuse_inode);
887953
+	inode_forget_with_unref(fuse_inode, nlookup);
887953
 }
887953
 
887953
 static void
887953
@@ -705,10 +721,6 @@ fuse_forget (xlator_t *this, fuse_in_header_t *finh, void *msg,
887953
                 return;
887953
         }
887953
 
887953
-        gf_log ("glusterfs-fuse", GF_LOG_TRACE,
887953
-                "%"PRIu64": FORGET %"PRIu64"/%"PRIu64,
887953
-                finh->unique, finh->nodeid, ffi->nlookup);
887953
-
887953
 	do_forget(this, finh->unique, finh->nodeid, ffi->nlookup);
887953
 
887953
         GF_FREE (finh);
887953
@@ -4940,7 +4952,9 @@ fuse_thread_proc (void *data)
887953
         fuse_in_header_t         *finh = NULL;
887953
         struct iovec              iov_in[2];
887953
         void                     *msg = NULL;
887953
-        const size_t              msg0_size = sizeof (*finh) + 128;
887953
+        /* we need 512 extra buffer size for BATCH_FORGET fop. By tests, it is
887953
+           found to reduce 'REALLOC()' calls in the loop */
887953
+        const size_t              msg0_size = sizeof (*finh) + 512;
887953
         fuse_handler_t          **fuse_ops = NULL;
887953
         struct pollfd             pfd[2] = {{0,}};
887953
 
887953
@@ -5283,7 +5297,12 @@ fuse_graph_setup (xlator_t *this, glusterfs_graph_t *graph)
887953
                         goto unlock;
887953
                 }
887953
 
887953
-                itable = inode_table_new (0, graph->top);
887953
+#if FUSE_KERNEL_MINOR_VERSION >= 11
887953
+                itable = inode_table_with_invalidator(priv->lru_limit, graph->top,
887953
+                                                      fuse_inode_invalidate_fn, this);
887953
+#else
887953
+                itable = inode_table_new(0, graph->top);
887953
+#endif
887953
                 if (!itable) {
887953
                         ret = -1;
887953
                         goto unlock;
887953
@@ -5740,6 +5759,8 @@ init (xlator_t *this_xl)
887953
                 }
887953
         }
887953
 
887953
+        GF_OPTION_INIT("lru-limit", priv->lru_limit, uint32, cleanup_exit);
887953
+
887953
         GF_OPTION_INIT("event-history", priv->event_history, bool,
887953
                        cleanup_exit);
887953
 
887953
@@ -6061,5 +6082,13 @@ struct volume_options options[] = {
887953
           .max = 64,
887953
           .description = "Sets fuse reader thread count.",
887953
         },
887953
+        {
887953
+         .key = {"lru-limit"},
887953
+         .type = GF_OPTION_TYPE_INT,
887953
+         .default_value = "131072",
887953
+         .min = 0,
887953
+         .description = "makes glusterfs invalidate kernel inodes after "
887953
+         "reaching this limit (0 means 'unlimited')",
887953
+        },
887953
         { .key = {NULL} },
887953
 };
887953
diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h
887953
index 4ca76e9..4e32a7f 100644
887953
--- a/xlators/mount/fuse/src/fuse-bridge.h
887953
+++ b/xlators/mount/fuse/src/fuse-bridge.h
887953
@@ -144,6 +144,9 @@ struct fuse_private {
887953
         gf_boolean_t         mount_finished;
887953
         gf_boolean_t         handle_graph_switch;
887953
         pthread_cond_t       migrate_cond;
887953
+
887953
+        /* LRU Limit, if not set, default is 128k for now */
887953
+        uint32_t lru_limit;
887953
 };
887953
 typedef struct fuse_private fuse_private_t;
887953
 
887953
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
887953
index 817619e..9a0404f 100755
887953
--- a/xlators/mount/fuse/utils/mount.glusterfs.in
887953
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
887953
@@ -245,6 +245,10 @@ start_glusterfs ()
887953
         cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
887953
     fi
887953
 
887953
+    if [ -n "$lru_limit" ]; then
887953
+        cmd_line=$(echo "$cmd_line --lru-limit=$lru_limit");
887953
+    fi
887953
+
887953
     if [ -n "$bg_qlen" ]; then
887953
         cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
887953
     fi
887953
@@ -467,6 +471,9 @@ with_options()
887953
         "gid-timeout")
887953
             gid_timeout=$value
887953
             ;;
887953
+        "lru-limit")
887953
+            lru_limit=$value
887953
+            ;;
887953
         "background-qlen")
887953
             bg_qlen=$value
887953
             ;;
887953
-- 
887953
1.8.3.1
887953