b7d4d7
From b924c8ca8a133fc9413c8ed1407e63f1658c7e79 Mon Sep 17 00:00:00 2001
b7d4d7
From: Xavi Hernandez <xhernandez@redhat.com>
b7d4d7
Date: Tue, 12 May 2020 23:54:54 +0200
b7d4d7
Subject: [PATCH 523/526] open-behind: rewrite of internal logic
b7d4d7
b7d4d7
There was a critical flaw in the previous implementation of open-behind.
b7d4d7
b7d4d7
When an open is done in the background, it's necessary to take a
b7d4d7
reference on the fd_t object because once we "fake" the open answer,
b7d4d7
the fd could be destroyed. However as long as there's a reference,
b7d4d7
the release function won't be called. So, if the application closes
b7d4d7
the file descriptor without having actually opened it, there will
b7d4d7
always remain at least 1 reference, causing a leak.
b7d4d7
b7d4d7
To avoid this problem, the previous implementation didn't take a
b7d4d7
reference on the fd_t, so there were races where the fd could be
b7d4d7
destroyed while it was still in use.
b7d4d7
b7d4d7
To fix this, I've implemented a new xlator cbk that gets called from
b7d4d7
fuse when the application closes a file descriptor.
b7d4d7
b7d4d7
The whole logic of handling background opens has been simplified and
b7d4d7
it's more efficient now. Only if the fop needs to be delayed until an
b7d4d7
open completes, a stub is created. Otherwise no memory allocations are
b7d4d7
needed.
b7d4d7
b7d4d7
Correctly handling the close request while the open is still pending
b7d4d7
has added a bit of complexity, but overall normal operation is simpler.
b7d4d7
b7d4d7
Upstream patch:
b7d4d7
> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/24451
b7d4d7
> Change-Id: I6376a5491368e0e1c283cc452849032636261592
b7d4d7
> Fixes: #1225
b7d4d7
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
b7d4d7
b7d4d7
BUG: 1830713
b7d4d7
Change-Id: I6376a5491368e0e1c283cc452849032636261592
b7d4d7
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
b7d4d7
Reviewed-on: https://code.engineering.redhat.com/gerrit/224487
b7d4d7
Tested-by: RHGS Build Bot <nigelb@redhat.com>
b7d4d7
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
b7d4d7
---
b7d4d7
 libglusterfs/src/fd.c                              |   26 +
b7d4d7
 libglusterfs/src/glusterfs/fd.h                    |    3 +
b7d4d7
 libglusterfs/src/glusterfs/xlator.h                |    4 +
b7d4d7
 libglusterfs/src/libglusterfs.sym                  |    1 +
b7d4d7
 tests/basic/open-behind/open-behind.t              |  183 +++
b7d4d7
 tests/basic/open-behind/tester-fd.c                |   99 ++
b7d4d7
 tests/basic/open-behind/tester.c                   |  444 +++++++
b7d4d7
 tests/basic/open-behind/tester.h                   |  145 +++
b7d4d7
 tests/bugs/glusterfs/bug-873962-spb.t              |    1 +
b7d4d7
 xlators/mount/fuse/src/fuse-bridge.c               |    2 +
b7d4d7
 .../open-behind/src/open-behind-messages.h         |    6 +-
b7d4d7
 xlators/performance/open-behind/src/open-behind.c  | 1302 ++++++++------------
b7d4d7
 12 files changed, 1393 insertions(+), 823 deletions(-)
b7d4d7
 create mode 100644 tests/basic/open-behind/open-behind.t
b7d4d7
 create mode 100644 tests/basic/open-behind/tester-fd.c
b7d4d7
 create mode 100644 tests/basic/open-behind/tester.c
b7d4d7
 create mode 100644 tests/basic/open-behind/tester.h
b7d4d7
b7d4d7
diff --git a/libglusterfs/src/fd.c b/libglusterfs/src/fd.c
b7d4d7
index 314546a..e4ec401 100644
b7d4d7
--- a/libglusterfs/src/fd.c
b7d4d7
+++ b/libglusterfs/src/fd.c
b7d4d7
@@ -501,6 +501,32 @@ out:
b7d4d7
 }
b7d4d7
 
b7d4d7
 void
b7d4d7
+fd_close(fd_t *fd)
b7d4d7
+{
b7d4d7
+    xlator_t *xl, *old_THIS;
b7d4d7
+
b7d4d7
+    old_THIS = THIS;
b7d4d7
+
b7d4d7
+    for (xl = fd->inode->table->xl->graph->first; xl != NULL; xl = xl->next) {
b7d4d7
+        if (!xl->call_cleanup) {
b7d4d7
+            THIS = xl;
b7d4d7
+
b7d4d7
+            if (IA_ISDIR(fd->inode->ia_type)) {
b7d4d7
+                if (xl->cbks->fdclosedir != NULL) {
b7d4d7
+                    xl->cbks->fdclosedir(xl, fd);
b7d4d7
+                }
b7d4d7
+            } else {
b7d4d7
+                if (xl->cbks->fdclose != NULL) {
b7d4d7
+                    xl->cbks->fdclose(xl, fd);
b7d4d7
+                }
b7d4d7
+            }
b7d4d7
+        }
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    THIS = old_THIS;
b7d4d7
+}
b7d4d7
+
b7d4d7
+void
b7d4d7
 fd_unref(fd_t *fd)
b7d4d7
 {
b7d4d7
     int32_t refcount = 0;
b7d4d7
diff --git a/libglusterfs/src/glusterfs/fd.h b/libglusterfs/src/glusterfs/fd.h
b7d4d7
index cdbe289..4d157c4 100644
b7d4d7
--- a/libglusterfs/src/glusterfs/fd.h
b7d4d7
+++ b/libglusterfs/src/glusterfs/fd.h
b7d4d7
@@ -107,6 +107,9 @@ fd_ref(fd_t *fd);
b7d4d7
 void
b7d4d7
 fd_unref(fd_t *fd);
b7d4d7
 
b7d4d7
+void
b7d4d7
+fd_close(fd_t *fd);
b7d4d7
+
b7d4d7
 fd_t *
b7d4d7
 fd_create(struct _inode *inode, pid_t pid);
b7d4d7
 
b7d4d7
diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h
b7d4d7
index 8650ccc..273039a 100644
b7d4d7
--- a/libglusterfs/src/glusterfs/xlator.h
b7d4d7
+++ b/libglusterfs/src/glusterfs/xlator.h
b7d4d7
@@ -705,6 +705,8 @@ typedef size_t (*cbk_inodectx_size_t)(xlator_t *this, inode_t *inode);
b7d4d7
 
b7d4d7
 typedef size_t (*cbk_fdctx_size_t)(xlator_t *this, fd_t *fd);
b7d4d7
 
b7d4d7
+typedef void (*cbk_fdclose_t)(xlator_t *this, fd_t *fd);
b7d4d7
+
b7d4d7
 struct xlator_cbks {
b7d4d7
     cbk_forget_t forget;
b7d4d7
     cbk_release_t release;
b7d4d7
@@ -715,6 +717,8 @@ struct xlator_cbks {
b7d4d7
     cbk_ictxmerge_t ictxmerge;
b7d4d7
     cbk_inodectx_size_t ictxsize;
b7d4d7
     cbk_fdctx_size_t fdctxsize;
b7d4d7
+    cbk_fdclose_t fdclose;
b7d4d7
+    cbk_fdclose_t fdclosedir;
b7d4d7
 };
b7d4d7
 
b7d4d7
 typedef int32_t (*dumpop_priv_t)(xlator_t *this);
b7d4d7
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
b7d4d7
index bc770e2..0a0862e 100644
b7d4d7
--- a/libglusterfs/src/libglusterfs.sym
b7d4d7
+++ b/libglusterfs/src/libglusterfs.sym
b7d4d7
@@ -456,6 +456,7 @@ event_unregister_close
b7d4d7
 fd_anonymous
b7d4d7
 fd_anonymous_with_flags
b7d4d7
 fd_bind
b7d4d7
+fd_close
b7d4d7
 fd_create
b7d4d7
 fd_create_uint64
b7d4d7
 __fd_ctx_del
b7d4d7
diff --git a/tests/basic/open-behind/open-behind.t b/tests/basic/open-behind/open-behind.t
b7d4d7
new file mode 100644
b7d4d7
index 0000000..5e865d6
b7d4d7
--- /dev/null
b7d4d7
+++ b/tests/basic/open-behind/open-behind.t
b7d4d7
@@ -0,0 +1,183 @@
b7d4d7
+#!/bin/bash
b7d4d7
+
b7d4d7
+WD="$(dirname "${0}")"
b7d4d7
+
b7d4d7
+. ${WD}/../../include.rc
b7d4d7
+. ${WD}/../../volume.rc
b7d4d7
+
b7d4d7
+function assign() {
b7d4d7
+    local _assign_var="${1}"
b7d4d7
+    local _assign_value="${2}"
b7d4d7
+
b7d4d7
+    printf -v "${_assign_var}" "%s" "${_assign_value}"
b7d4d7
+}
b7d4d7
+
b7d4d7
+function pipe_create() {
b7d4d7
+    local _pipe_create_var="${1}"
b7d4d7
+    local _pipe_create_name
b7d4d7
+    local _pipe_create_fd
b7d4d7
+
b7d4d7
+    _pipe_create_name="$(mktemp -u)"
b7d4d7
+    mkfifo "${_pipe_create_name}"
b7d4d7
+    exec {_pipe_create_fd}<>"${_pipe_create_name}"
b7d4d7
+    rm "${_pipe_create_name}"
b7d4d7
+
b7d4d7
+    assign "${_pipe_create_var}" "${_pipe_create_fd}"
b7d4d7
+}
b7d4d7
+
b7d4d7
+function pipe_close() {
b7d4d7
+    local _pipe_close_fd="${!1}"
b7d4d7
+
b7d4d7
+    exec {_pipe_close_fd}>&-
b7d4d7
+}
b7d4d7
+
b7d4d7
+function tester_start() {
b7d4d7
+    declare -ag tester
b7d4d7
+    local tester_in
b7d4d7
+    local tester_out
b7d4d7
+
b7d4d7
+    pipe_create tester_in
b7d4d7
+    pipe_create tester_out
b7d4d7
+
b7d4d7
+    ${WD}/tester <&${tester_in} >&${tester_out} &
b7d4d7
+
b7d4d7
+    tester=("$!" "${tester_in}" "${tester_out}")
b7d4d7
+}
b7d4d7
+
b7d4d7
+function tester_send() {
b7d4d7
+    declare -ag tester
b7d4d7
+    local tester_res
b7d4d7
+    local tester_extra
b7d4d7
+
b7d4d7
+    echo "${*}" >&${tester[1]}
b7d4d7
+
b7d4d7
+    read -t 3 -u ${tester[2]} tester_res tester_extra
b7d4d7
+    echo "${tester_res} ${tester_extra}"
b7d4d7
+    if [[ "${tester_res}" == "OK" ]]; then
b7d4d7
+        return 0
b7d4d7
+    fi
b7d4d7
+
b7d4d7
+    return 1
b7d4d7
+}
b7d4d7
+
b7d4d7
+function tester_stop() {
b7d4d7
+    declare -ag tester
b7d4d7
+    local tester_res
b7d4d7
+
b7d4d7
+    tester_send "quit"
b7d4d7
+
b7d4d7
+    # Take the status from `wait` itself: within `if ! wait ...; then` the
b7d4d7
+    # value of $? is that of the negated condition, i.e. always 0.
b7d4d7
+    tester_res=0
b7d4d7
+    wait ${tester[0]} || tester_res=$?
b7d4d7
+
b7d4d7
+    unset tester
b7d4d7
+
b7d4d7
+    return ${tester_res}
b7d4d7
+}
b7d4d7
+
b7d4d7
+function count_open() {
b7d4d7
+    local file="$(realpath "${B0}/${V0}/${1}")"
b7d4d7
+    local count="0"
b7d4d7
+    local inode
b7d4d7
+    local ref
b7d4d7
+
b7d4d7
+    inode="$(stat -c %i "${file}")"
b7d4d7
+
b7d4d7
+    for fd in /proc/${BRICK_PID}/fd/*; do
b7d4d7
+        ref="$(readlink "${fd}")"
b7d4d7
+        if [[ "${ref}" == "${B0}/${V0}/"* ]]; then
b7d4d7
+            if [[ "$(stat -c %i "${ref}")" == "${inode}" ]]; then
b7d4d7
+                count="$((${count} + 1))"
b7d4d7
+            fi
b7d4d7
+        fi
b7d4d7
+    done
b7d4d7
+
b7d4d7
+    echo "${count}"
b7d4d7
+}
b7d4d7
+
b7d4d7
+cleanup
b7d4d7
+
b7d4d7
+TEST build_tester ${WD}/tester.c ${WD}/tester-fd.c
b7d4d7
+
b7d4d7
+TEST glusterd
b7d4d7
+TEST pidof glusterd
b7d4d7
+TEST ${CLI} volume create ${V0} ${H0}:${B0}/${V0}
b7d4d7
+TEST ${CLI} volume set ${V0} flush-behind off
b7d4d7
+TEST ${CLI} volume set ${V0} write-behind off
b7d4d7
+TEST ${CLI} volume set ${V0} quick-read off
b7d4d7
+TEST ${CLI} volume set ${V0} stat-prefetch on
b7d4d7
+TEST ${CLI} volume set ${V0} io-cache off
b7d4d7
+TEST ${CLI} volume set ${V0} open-behind on
b7d4d7
+TEST ${CLI} volume set ${V0} lazy-open off
b7d4d7
+TEST ${CLI} volume set ${V0} read-after-open off
b7d4d7
+TEST ${CLI} volume start ${V0}
b7d4d7
+
b7d4d7
+TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0};
b7d4d7
+
b7d4d7
+BRICK_PID="$(get_brick_pid ${V0} ${H0} ${B0}/${V0})"
b7d4d7
+
b7d4d7
+TEST touch "${M0}/test"
b7d4d7
+
b7d4d7
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
b7d4d7
+TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0};
b7d4d7
+
b7d4d7
+TEST tester_start
b7d4d7
+
b7d4d7
+TEST tester_send fd open 0 "${M0}/test"
b7d4d7
+EXPECT_WITHIN 5 "1" count_open "/test"
b7d4d7
+TEST tester_send fd close 0
b7d4d7
+EXPECT_WITHIN 5 "0" count_open "/test"
b7d4d7
+
b7d4d7
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
b7d4d7
+TEST ${CLI} volume set ${V0} lazy-open on
b7d4d7
+TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0};
b7d4d7
+
b7d4d7
+TEST tester_send fd open 0 "${M0}/test"
b7d4d7
+sleep 2
b7d4d7
+EXPECT "0" count_open "/test"
b7d4d7
+TEST tester_send fd write 0 "test"
b7d4d7
+EXPECT "1" count_open "/test"
b7d4d7
+TEST tester_send fd close 0
b7d4d7
+EXPECT_WITHIN 5 "0" count_open "/test"
b7d4d7
+
b7d4d7
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
b7d4d7
+TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0};
b7d4d7
+
b7d4d7
+TEST tester_send fd open 0 "${M0}/test"
b7d4d7
+EXPECT "0" count_open "/test"
b7d4d7
+EXPECT "test" tester_send fd read 0 64
b7d4d7
+# Even though read-after-open is disabled, use-anonymous-fd is also disabled,
b7d4d7
+# so reads need to open the file first.
b7d4d7
+EXPECT "1" count_open "/test"
b7d4d7
+TEST tester_send fd close 0
b7d4d7
+EXPECT "0" count_open "/test"
b7d4d7
+
b7d4d7
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
b7d4d7
+TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0};
b7d4d7
+
b7d4d7
+TEST tester_send fd open 0 "${M0}/test"
b7d4d7
+EXPECT "0" count_open "/test"
b7d4d7
+TEST tester_send fd open 1 "${M0}/test"
b7d4d7
+EXPECT "2" count_open "/test"
b7d4d7
+TEST tester_send fd close 0
b7d4d7
+EXPECT_WITHIN 5 "1" count_open "/test"
b7d4d7
+TEST tester_send fd close 1
b7d4d7
+EXPECT_WITHIN 5 "0" count_open "/test"
b7d4d7
+
b7d4d7
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
b7d4d7
+TEST ${CLI} volume set ${V0} read-after-open on
b7d4d7
+TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0};
b7d4d7
+
b7d4d7
+TEST tester_send fd open 0 "${M0}/test"
b7d4d7
+EXPECT "0" count_open "/test"
b7d4d7
+EXPECT "test" tester_send fd read 0 64
b7d4d7
+EXPECT "1" count_open "/test"
b7d4d7
+TEST tester_send fd close 0
b7d4d7
+EXPECT_WITHIN 5 "0" count_open "/test"
b7d4d7
+
b7d4d7
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
b7d4d7
+
b7d4d7
+TEST tester_stop
b7d4d7
+
b7d4d7
+cleanup
b7d4d7
diff --git a/tests/basic/open-behind/tester-fd.c b/tests/basic/open-behind/tester-fd.c
b7d4d7
new file mode 100644
b7d4d7
index 0000000..00f02bc
b7d4d7
--- /dev/null
b7d4d7
+++ b/tests/basic/open-behind/tester-fd.c
b7d4d7
@@ -0,0 +1,99 @@
b7d4d7
+/*
b7d4d7
+  Copyright (c) 2020 Red Hat, Inc. <http://www.redhat.com>
b7d4d7
+  This file is part of GlusterFS.
b7d4d7
+
b7d4d7
+  This file is licensed to you under your choice of the GNU Lesser
b7d4d7
+  General Public License, version 3 or any later version (LGPLv3 or
b7d4d7
+  later), or the GNU General Public License, version 2 (GPLv2), in all
b7d4d7
+  cases as published by the Free Software Foundation.
b7d4d7
+*/
b7d4d7
+
b7d4d7
+#include "tester.h"
b7d4d7
+
b7d4d7
+#include <stdlib.h>
b7d4d7
+#include <unistd.h>
b7d4d7
+#include <sys/types.h>
b7d4d7
+#include <sys/stat.h>
b7d4d7
+#include <fcntl.h>
b7d4d7
+#include <string.h>
b7d4d7
+#include <ctype.h>
b7d4d7
+#include <errno.h>
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+fd_open(context_t *ctx, command_t *cmd)
b7d4d7
+{
b7d4d7
+    obj_t *obj;
b7d4d7
+    int32_t fd;
b7d4d7
+
b7d4d7
+    obj = cmd->args[0].obj.ref;
b7d4d7
+
b7d4d7
+    fd = open(cmd->args[1].str.data, O_RDWR);
b7d4d7
+    if (fd < 0) {
b7d4d7
+        return error(errno, "open() failed");
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    obj->type = OBJ_TYPE_FD;
b7d4d7
+    obj->fd = fd;
b7d4d7
+
b7d4d7
+    out_ok("%d", fd);
b7d4d7
+
b7d4d7
+    return 0;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+fd_close(context_t *ctx, command_t *cmd)
b7d4d7
+{
b7d4d7
+    obj_t *obj;
b7d4d7
+
b7d4d7
+    obj = cmd->args[0].obj.ref;
b7d4d7
+    obj->type = OBJ_TYPE_NONE;
b7d4d7
+
b7d4d7
+    if (close(obj->fd) != 0) {
b7d4d7
+        return error(errno, "close() failed");
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    out_ok();
b7d4d7
+
b7d4d7
+    return 0;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+fd_write(context_t *ctx, command_t *cmd)
b7d4d7
+{
b7d4d7
+    ssize_t len, ret;
b7d4d7
+
b7d4d7
+    len = strlen(cmd->args[1].str.data);
b7d4d7
+    ret = write(cmd->args[0].obj.ref->fd, cmd->args[1].str.data, len);
b7d4d7
+    if (ret < 0) {
b7d4d7
+        return error(errno, "write() failed");
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    out_ok("%zd", ret);
b7d4d7
+
b7d4d7
+    return 0;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+fd_read(context_t *ctx, command_t *cmd)
b7d4d7
+{
b7d4d7
+    char data[cmd->args[1].num.value + 1];
b7d4d7
+    ssize_t ret;
b7d4d7
+
b7d4d7
+    ret = read(cmd->args[0].obj.ref->fd, data, cmd->args[1].num.value);
b7d4d7
+    if (ret < 0) {
b7d4d7
+        return error(errno, "read() failed");
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    data[ret] = 0;
b7d4d7
+
b7d4d7
+    out_ok("%zd %s", ret, data);
b7d4d7
+
b7d4d7
+    return 0;
b7d4d7
+}
b7d4d7
+
b7d4d7
+command_t fd_commands[] = {
b7d4d7
+    {"open", fd_open, CMD_ARGS(ARG_VAL(OBJ_TYPE_NONE), ARG_STR(1024))},
b7d4d7
+    {"close", fd_close, CMD_ARGS(ARG_VAL(OBJ_TYPE_FD))},
b7d4d7
+    {"write", fd_write, CMD_ARGS(ARG_VAL(OBJ_TYPE_FD), ARG_STR(1024))},
b7d4d7
+    {"read", fd_read, CMD_ARGS(ARG_VAL(OBJ_TYPE_FD), ARG_NUM(0, 1024))},
b7d4d7
+    CMD_END};
b7d4d7
diff --git a/tests/basic/open-behind/tester.c b/tests/basic/open-behind/tester.c
b7d4d7
new file mode 100644
b7d4d7
index 0000000..b2da71c
b7d4d7
--- /dev/null
b7d4d7
+++ b/tests/basic/open-behind/tester.c
b7d4d7
@@ -0,0 +1,444 @@
b7d4d7
+/*
b7d4d7
+  Copyright (c) 2020 Red Hat, Inc. <http://www.redhat.com>
b7d4d7
+  This file is part of GlusterFS.
b7d4d7
+
b7d4d7
+  This file is licensed to you under your choice of the GNU Lesser
b7d4d7
+  General Public License, version 3 or any later version (LGPLv3 or
b7d4d7
+  later), or the GNU General Public License, version 2 (GPLv2), in all
b7d4d7
+  cases as published by the Free Software Foundation.
b7d4d7
+*/
b7d4d7
+
b7d4d7
+#include "tester.h"
b7d4d7
+
b7d4d7
+#include <stdlib.h>
b7d4d7
+#include <unistd.h>
b7d4d7
+#include <string.h>
b7d4d7
+#include <ctype.h>
b7d4d7
+#include <errno.h>
b7d4d7
+
b7d4d7
+static void *
b7d4d7
+mem_alloc(size_t size)
b7d4d7
+{
b7d4d7
+    void *ptr;
b7d4d7
+
b7d4d7
+    ptr = malloc(size);
b7d4d7
+    if (ptr == NULL) {
b7d4d7
+        error(ENOMEM, "Failed to allocate memory (%zu bytes)", size);
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    return ptr;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static void
b7d4d7
+mem_free(void *ptr)
b7d4d7
+{
b7d4d7
+    free(ptr);
b7d4d7
+}
b7d4d7
+
b7d4d7
+static bool
b7d4d7
+buffer_create(context_t *ctx, size_t size)
b7d4d7
+{
b7d4d7
+    ctx->buffer.base = mem_alloc(size);
b7d4d7
+    if (ctx->buffer.base == NULL) {
b7d4d7
+        return false;
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    ctx->buffer.size = size;
b7d4d7
+    ctx->buffer.len = 0;
b7d4d7
+    ctx->buffer.pos = 0;
b7d4d7
+
b7d4d7
+    return true;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static void
b7d4d7
+buffer_destroy(context_t *ctx)
b7d4d7
+{
b7d4d7
+    mem_free(ctx->buffer.base);
b7d4d7
+    ctx->buffer.size = 0;
b7d4d7
+    ctx->buffer.len = 0;
b7d4d7
+}
b7d4d7
+
b7d4d7
+/* Return the next input byte, 0 on end of input, or a negative error code. */
b7d4d7
+static int32_t
b7d4d7
+buffer_get(context_t *ctx)
b7d4d7
+{
b7d4d7
+    ssize_t len;
b7d4d7
+
b7d4d7
+    if (ctx->buffer.pos >= ctx->buffer.len) {
b7d4d7
+        len = read(0, ctx->buffer.base, ctx->buffer.size);
b7d4d7
+        if (len < 0) {
b7d4d7
+            return error(errno, "read() failed");
b7d4d7
+        }
b7d4d7
+        if (len == 0) {
b7d4d7
+            return 0;
b7d4d7
+        }
b7d4d7
+        ctx->buffer.len = len;
b7d4d7
+        ctx->buffer.pos = 0;
b7d4d7
+    }
b7d4d7
+    /* Go through unsigned char so bytes >= 0x80 do not look like errors. */
b7d4d7
+    return (unsigned char)ctx->buffer.base[ctx->buffer.pos++];
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+str_skip_spaces(context_t *ctx, int32_t current)
b7d4d7
+{
b7d4d7
+    while ((current > 0) && (current != '\n') && isspace(current)) {
b7d4d7
+        current = buffer_get(ctx);
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    return current;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+str_token(context_t *ctx, char *buffer, uint32_t size, int32_t current)
b7d4d7
+{
b7d4d7
+    uint32_t len;
b7d4d7
+
b7d4d7
+    current = str_skip_spaces(ctx, current);
b7d4d7
+
b7d4d7
+    len = 0;
b7d4d7
+    while ((size > 0) && (current > 0) && (current != '\n') &&
b7d4d7
+           !isspace(current)) {
b7d4d7
+        len++;
b7d4d7
+        *buffer++ = current;
b7d4d7
+        size--;
b7d4d7
+        current = buffer_get(ctx);
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    if (len == 0) {
b7d4d7
+        return error(ENODATA, "Expecting a token");
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    if (size == 0) {
b7d4d7
+        return error(ENOBUFS, "Token too long");
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    *buffer = 0;
b7d4d7
+
b7d4d7
+    return current;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+str_number(context_t *ctx, uint64_t min, uint64_t max, uint64_t *value,
b7d4d7
+           int32_t current)
b7d4d7
+{
b7d4d7
+    char text[32], *ptr;
b7d4d7
+    uint64_t num;
b7d4d7
+
b7d4d7
+    current = str_token(ctx, text, sizeof(text), current);
b7d4d7
+    if (current > 0) {
b7d4d7
+        num = strtoul(text, &ptr, 0);
b7d4d7
+        if ((*ptr != 0) || (num < min) || (num > max)) {
b7d4d7
+            return error(ERANGE, "Invalid number");
b7d4d7
+        }
b7d4d7
+        *value = num;
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    return current;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+str_eol(context_t *ctx, int32_t current)
b7d4d7
+{
b7d4d7
+    current = str_skip_spaces(ctx, current);
b7d4d7
+    if (current != '\n') {
b7d4d7
+        return error(EINVAL, "Expecting end of command");
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    return current;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static void
b7d4d7
+str_skip(context_t *ctx, int32_t current)
b7d4d7
+{
b7d4d7
+    while ((current > 0) && (current != '\n')) {
b7d4d7
+        current = buffer_get(ctx);
b7d4d7
+    }
b7d4d7
+}
b7d4d7
+
b7d4d7
+/* Resolve an object-id argument to its slot in ctx->objs, checking its type. */
b7d4d7
+static int32_t
b7d4d7
+cmd_parse_obj(context_t *ctx, arg_t *arg, int32_t current)
b7d4d7
+{
b7d4d7
+    obj_t *obj;
b7d4d7
+    uint64_t id;
b7d4d7
+
b7d4d7
+    /* ids index ctx->objs[], so the highest valid one is obj_count - 1 */
b7d4d7
+    current = str_number(ctx, 0, ctx->obj_count - 1, &id, current);
b7d4d7
+    if (current <= 0) {
b7d4d7
+        return current;
b7d4d7
+    }
b7d4d7
+    obj = &ctx->objs[id];
b7d4d7
+    if (obj->type != arg->obj.type) {
b7d4d7
+        if (obj->type != OBJ_TYPE_NONE) {
b7d4d7
+            return error(EBUSY, "Object is in use");
b7d4d7
+        }
b7d4d7
+        return error(ENOENT, "Object is not defined");
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    arg->obj.ref = obj;
b7d4d7
+    return current;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+cmd_parse_num(context_t *ctx, arg_t *arg, int32_t current)
b7d4d7
+{
b7d4d7
+    return str_number(ctx, arg->num.min, arg->num.max, &arg->num.value,
b7d4d7
+                      current);
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+cmd_parse_str(context_t *ctx, arg_t *arg, int32_t current)
b7d4d7
+{
b7d4d7
+    return str_token(ctx, arg->str.data, arg->str.size, current);
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+cmd_parse_args(context_t *ctx, command_t *cmd, int32_t current)
b7d4d7
+{
b7d4d7
+    arg_t *arg;
b7d4d7
+
b7d4d7
+    for (arg = cmd->args; arg->type != ARG_TYPE_NONE; arg++) {
b7d4d7
+        switch (arg->type) {
b7d4d7
+            case ARG_TYPE_OBJ:
b7d4d7
+                current = cmd_parse_obj(ctx, arg, current);
b7d4d7
+                break;
b7d4d7
+            case ARG_TYPE_NUM:
b7d4d7
+                current = cmd_parse_num(ctx, arg, current);
b7d4d7
+                break;
b7d4d7
+            case ARG_TYPE_STR:
b7d4d7
+                current = cmd_parse_str(ctx, arg, current);
b7d4d7
+                break;
b7d4d7
+            default:
b7d4d7
+                return error(EINVAL, "Unknown argument type");
b7d4d7
+        }
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    if (current < 0) {
b7d4d7
+        return current;
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    current = str_eol(ctx, current);
b7d4d7
+    if (current <= 0) {
b7d4d7
+        return error(EINVAL, "Syntax error");
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    return cmd->handler(ctx, cmd);
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+cmd_parse(context_t *ctx, command_t *cmds)
b7d4d7
+{
b7d4d7
+    char text[32];
b7d4d7
+    command_t *cmd;
b7d4d7
+    int32_t current;
b7d4d7
+
b7d4d7
+    cmd = cmds;
b7d4d7
+    do {
b7d4d7
+        current = str_token(ctx, text, sizeof(text), buffer_get(ctx));
b7d4d7
+        if (current <= 0) {
b7d4d7
+            return current;
b7d4d7
+        }
b7d4d7
+
b7d4d7
+        while (cmd->name != NULL) {
b7d4d7
+            if (strcmp(cmd->name, text) == 0) {
b7d4d7
+                if (cmd->handler != NULL) {
b7d4d7
+                    return cmd_parse_args(ctx, cmd, current);
b7d4d7
+                }
b7d4d7
+                cmd = cmd->cmds;
b7d4d7
+                break;
b7d4d7
+            }
b7d4d7
+            cmd++;
b7d4d7
+        }
b7d4d7
+    } while (cmd->name != NULL);
b7d4d7
+
b7d4d7
+    str_skip(ctx, current);
b7d4d7
+
b7d4d7
+    return error(ENOTSUP, "Unknown command");
b7d4d7
+}
b7d4d7
+
b7d4d7
+static void
b7d4d7
+cmd_fini(context_t *ctx, command_t *cmds)
b7d4d7
+{
b7d4d7
+    command_t *cmd;
b7d4d7
+    arg_t *arg;
b7d4d7
+
b7d4d7
+    for (cmd = cmds; cmd->name != NULL; cmd++) {
b7d4d7
+        if (cmd->handler == NULL) {
b7d4d7
+            cmd_fini(ctx, cmd->cmds);
b7d4d7
+        } else {
b7d4d7
+            for (arg = cmd->args; arg->type != ARG_TYPE_NONE; arg++) {
b7d4d7
+                switch (arg->type) {
b7d4d7
+                    case ARG_TYPE_STR:
b7d4d7
+                        mem_free(arg->str.data);
b7d4d7
+                        arg->str.data = NULL;
b7d4d7
+                        break;
b7d4d7
+                    default:
b7d4d7
+                        break;
b7d4d7
+                }
b7d4d7
+            }
b7d4d7
+        }
b7d4d7
+    }
b7d4d7
+}
b7d4d7
+
b7d4d7
+static bool
b7d4d7
+cmd_init(context_t *ctx, command_t *cmds)
b7d4d7
+{
b7d4d7
+    command_t *cmd;
b7d4d7
+    arg_t *arg;
b7d4d7
+
b7d4d7
+    for (cmd = cmds; cmd->name != NULL; cmd++) {
b7d4d7
+        if (cmd->handler == NULL) {
b7d4d7
+            if (!cmd_init(ctx, cmd->cmds)) {
b7d4d7
+                return false;
b7d4d7
+            }
b7d4d7
+        } else {
b7d4d7
+            for (arg = cmd->args; arg->type != ARG_TYPE_NONE; arg++) {
b7d4d7
+                switch (arg->type) {
b7d4d7
+                    case ARG_TYPE_STR:
b7d4d7
+                        arg->str.data = mem_alloc(arg->str.size);
b7d4d7
+                        if (arg->str.data == NULL) {
b7d4d7
+                            return false;
b7d4d7
+                        }
b7d4d7
+                        break;
b7d4d7
+                    default:
b7d4d7
+                        break;
b7d4d7
+                }
b7d4d7
+            }
b7d4d7
+        }
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    return true;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static bool
b7d4d7
+objs_create(context_t *ctx, uint32_t count)
b7d4d7
+{
b7d4d7
+    uint32_t i;
b7d4d7
+
b7d4d7
+    ctx->objs = mem_alloc(sizeof(obj_t) * count);
b7d4d7
+    if (ctx->objs == NULL) {
b7d4d7
+        return false;
b7d4d7
+    }
b7d4d7
+    ctx->obj_count = count;
b7d4d7
+
b7d4d7
+    for (i = 0; i < count; i++) {
b7d4d7
+        ctx->objs[i].type = OBJ_TYPE_NONE;
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    return true;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+objs_destroy(context_t *ctx)
b7d4d7
+{
b7d4d7
+    uint32_t i;
b7d4d7
+    int32_t err;
b7d4d7
+
b7d4d7
+    err = 0;
b7d4d7
+    for (i = 0; i < ctx->obj_count; i++) {
b7d4d7
+        if (ctx->objs[i].type != OBJ_TYPE_NONE) {
b7d4d7
+            err = error(ENOTEMPTY, "Objects not destroyed");
b7d4d7
+            break;
b7d4d7
+        }
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    mem_free(ctx->objs);
b7d4d7
+    ctx->objs = NULL;
b7d4d7
+    ctx->obj_count = 0;
b7d4d7
+
b7d4d7
+    return err;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static context_t *
b7d4d7
+init(size_t size, uint32_t objs, command_t *cmds)
b7d4d7
+{
b7d4d7
+    context_t *ctx;
b7d4d7
+
b7d4d7
+    ctx = mem_alloc(sizeof(context_t));
b7d4d7
+    if (ctx == NULL) {
b7d4d7
+        goto failed;
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    if (!buffer_create(ctx, size)) {
b7d4d7
+        goto failed_ctx;
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    if (!objs_create(ctx, objs)) {
b7d4d7
+        goto failed_buffer;
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    if (!cmd_init(ctx, cmds)) {
b7d4d7
+        goto failed_objs;
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    ctx->active = true;
b7d4d7
+
b7d4d7
+    return ctx;
b7d4d7
+
b7d4d7
+failed_objs:
b7d4d7
+    cmd_fini(ctx, cmds);
b7d4d7
+    objs_destroy(ctx);
b7d4d7
+failed_buffer:
b7d4d7
+    buffer_destroy(ctx);
b7d4d7
+failed_ctx:
b7d4d7
+    mem_free(ctx);
b7d4d7
+failed:
b7d4d7
+    return NULL;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+fini(context_t *ctx, command_t *cmds)
b7d4d7
+{
b7d4d7
+    int32_t ret;
b7d4d7
+
b7d4d7
+    cmd_fini(ctx, cmds);
b7d4d7
+    buffer_destroy(ctx);
b7d4d7
+
b7d4d7
+    ret = objs_destroy(ctx);
b7d4d7
+
b7d4d7
+    ctx->active = false;
b7d4d7
+
b7d4d7
+    return ret;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static int32_t
b7d4d7
+exec_quit(context_t *ctx, command_t *cmd)
b7d4d7
+{
b7d4d7
+    ctx->active = false;
b7d4d7
+
b7d4d7
+    return 0;
b7d4d7
+}
b7d4d7
+
b7d4d7
+static command_t commands[] = {{"fd", NULL, CMD_SUB(fd_commands)},
b7d4d7
+                               {"quit", exec_quit, CMD_ARGS()},
b7d4d7
+                               CMD_END};
b7d4d7
+
b7d4d7
+int32_t
b7d4d7
+main(int32_t argc, char *argv[])
b7d4d7
+{
b7d4d7
+    context_t *ctx;
b7d4d7
+    int32_t res;
b7d4d7
+
b7d4d7
+    ctx = init(1024, 16, commands);
b7d4d7
+    if (ctx == NULL) {
b7d4d7
+        return 1;
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    do {
b7d4d7
+        res = cmd_parse(ctx, commands);
b7d4d7
+        if (res < 0) {
b7d4d7
+            out_err(-res);
b7d4d7
+        }
b7d4d7
+    } while (ctx->active);
b7d4d7
+
b7d4d7
+    res = fini(ctx, commands);
b7d4d7
+    if (res >= 0) {
b7d4d7
+        out_ok();
b7d4d7
+        return 0;
b7d4d7
+    }
b7d4d7
+
b7d4d7
+    out_err(-res);
b7d4d7
+
b7d4d7
+    return 1;
b7d4d7
+}
b7d4d7
diff --git a/tests/basic/open-behind/tester.h b/tests/basic/open-behind/tester.h
b7d4d7
new file mode 100644
b7d4d7
index 0000000..64e940c
b7d4d7
--- /dev/null
b7d4d7
+++ b/tests/basic/open-behind/tester.h
b7d4d7
@@ -0,0 +1,145 @@
b7d4d7
+/*
b7d4d7
+  Copyright (c) 2020 Red Hat, Inc. <http://www.redhat.com>
b7d4d7
+  This file is part of GlusterFS.
b7d4d7
+
b7d4d7
+  This file is licensed to you under your choice of the GNU Lesser
b7d4d7
+  General Public License, version 3 or any later version (LGPLv3 or
b7d4d7
+  later), or the GNU General Public License, version 2 (GPLv2), in all
b7d4d7
+  cases as published by the Free Software Foundation.
b7d4d7
+*/
b7d4d7
+
b7d4d7
+#ifndef __TESTER_H__
b7d4d7
+#define __TESTER_H__
b7d4d7
+
b7d4d7
+#include <stdio.h>
b7d4d7
+#include <inttypes.h>
b7d4d7
+#include <stdbool.h>
b7d4d7
+
b7d4d7
+enum _obj_type;
b7d4d7
+typedef enum _obj_type obj_type_t;
b7d4d7
+
b7d4d7
+enum _arg_type;
b7d4d7
+typedef enum _arg_type arg_type_t;
b7d4d7
+
b7d4d7
+struct _buffer;
b7d4d7
+typedef struct _buffer buffer_t;
b7d4d7
+
b7d4d7
+struct _obj;
b7d4d7
+typedef struct _obj obj_t;
b7d4d7
+
b7d4d7
+struct _context;
b7d4d7
+typedef struct _context context_t;
b7d4d7
+
b7d4d7
+struct _arg;
b7d4d7
+typedef struct _arg arg_t;
b7d4d7
+
b7d4d7
+struct _command;
b7d4d7
+typedef struct _command command_t;
b7d4d7
+
b7d4d7
+enum _obj_type { OBJ_TYPE_NONE, OBJ_TYPE_FD };
b7d4d7
+
b7d4d7
+enum _arg_type { ARG_TYPE_NONE, ARG_TYPE_OBJ, ARG_TYPE_NUM, ARG_TYPE_STR };
b7d4d7
+
b7d4d7
+struct _buffer {
b7d4d7
+    char *base;
b7d4d7
+    uint32_t size;
b7d4d7
+    uint32_t len;
b7d4d7
+    uint32_t pos;
b7d4d7
+};
b7d4d7
+
b7d4d7
+struct _obj {
b7d4d7
+    obj_type_t type;
b7d4d7
+    union {
b7d4d7
+        int32_t fd;
b7d4d7
+    };
b7d4d7
+};
b7d4d7
+
b7d4d7
+struct _context {
b7d4d7
+    obj_t *objs;
b7d4d7
+    buffer_t buffer;
b7d4d7
+    uint32_t obj_count;
b7d4d7
+    bool active;
b7d4d7
+};
b7d4d7
+
b7d4d7
+struct _arg {
b7d4d7
+    arg_type_t type;
b7d4d7
+    union {
b7d4d7
+        struct {
b7d4d7
+            obj_type_t type;
b7d4d7
+            obj_t *ref;
b7d4d7
+        } obj;
b7d4d7
+        struct {
b7d4d7
+            uint64_t value;
b7d4d7
+            uint64_t min;
b7d4d7
+            uint64_t max;
b7d4d7
+        } num;
b7d4d7
+        struct {
b7d4d7
+            uint32_t size;
b7d4d7
+            char *data;
b7d4d7
+        } str;
b7d4d7
+    };
b7d4d7
+};
b7d4d7
+
b7d4d7
+struct _command {
b7d4d7
+    const char *name;
b7d4d7
+    int32_t (*handler)(context_t *ctx, command_t *cmd);
b7d4d7
+    union {
b7d4d7
+        arg_t *args;
b7d4d7
+        command_t *cmds;
b7d4d7
+    };
b7d4d7
+};
b7d4d7
+
b7d4d7
+#define msg(_stream, _fmt, _args...)                                           \
b7d4d7
+    do {                                                                       \
b7d4d7
+        fprintf(_stream, _fmt "\n", ##_args);                                  \
b7d4d7
+        fflush(_stream);                                                       \
b7d4d7
+    } while (0)
b7d4d7
+
b7d4d7
+#define msg_out(_fmt, _args...) msg(stdout, _fmt, ##_args)
b7d4d7
+#define msg_err(_err, _fmt, _args...)                                          \
b7d4d7
+    ({                                                                         \
b7d4d7
+        int32_t __msg_err = (_err);                                            \
b7d4d7
+        msg(stderr, "[%4u:%-15s] " _fmt, __LINE__, __FUNCTION__, __msg_err,    \
b7d4d7
+            ##_args);                                                          \
b7d4d7
+        -__msg_err;                                                            \
b7d4d7
+    })
b7d4d7
+
b7d4d7
+#define error(_err, _fmt, _args...) msg_err(_err, "E(%4d) " _fmt, ##_args)
b7d4d7
+#define warn(_err, _fmt, _args...) msg_err(_err, "W(%4d) " _fmt, ##_args)
b7d4d7
+#define info(_err, _fmt, _args...) msg_err(_err, "I(%4d) " _fmt, ##_args)
b7d4d7
+
b7d4d7
+#define out_ok(_args...) msg_out("OK " _args)
b7d4d7
+#define out_err(_err) msg_out("ERR %d", _err)
b7d4d7
+
b7d4d7
+#define ARG_END                                                                \
b7d4d7
+    {                                                                          \
b7d4d7
+        ARG_TYPE_NONE                                                          \
b7d4d7
+    }
b7d4d7
+
b7d4d7
+#define CMD_ARGS1(_x, _args...)                                                \
b7d4d7
+    .args = (arg_t[]) { _args }
b7d4d7
+#define CMD_ARGS(_args...) CMD_ARGS1(, ##_args, ARG_END)
b7d4d7
+
b7d4d7
+#define CMD_SUB(_cmds) .cmds = _cmds
b7d4d7
+
b7d4d7
+#define CMD_END                                                                \
b7d4d7
+    {                                                                          \
b7d4d7
+        NULL, NULL, CMD_SUB(NULL)                                              \
b7d4d7
+    }
b7d4d7
+
b7d4d7
+#define ARG_VAL(_type)                                                         \
b7d4d7
+    {                                                                          \
b7d4d7
+        ARG_TYPE_OBJ, .obj = {.type = _type }                                  \
b7d4d7
+    }
b7d4d7
+#define ARG_NUM(_min, _max)                                                    \
b7d4d7
+    {                                                                          \
b7d4d7
+        ARG_TYPE_NUM, .num = {.min = _min, .max = _max }                       \
b7d4d7
+    }
b7d4d7
+#define ARG_STR(_size)                                                         \
b7d4d7
+    {                                                                          \
b7d4d7
+        ARG_TYPE_STR, .str = {.size = _size }                                  \
b7d4d7
+    }
b7d4d7
+
b7d4d7
+extern command_t fd_commands[];
b7d4d7
+
b7d4d7
+#endif /* __TESTER_H__ */
b7d4d7
\ No newline at end of file
b7d4d7
diff --git a/tests/bugs/glusterfs/bug-873962-spb.t b/tests/bugs/glusterfs/bug-873962-spb.t
b7d4d7
index db84a22..db71cc0 100644
b7d4d7
--- a/tests/bugs/glusterfs/bug-873962-spb.t
b7d4d7
+++ b/tests/bugs/glusterfs/bug-873962-spb.t
b7d4d7
@@ -14,6 +14,7 @@ TEST $CLI volume set $V0 performance.io-cache off
b7d4d7
 TEST $CLI volume set $V0 performance.write-behind off
b7d4d7
 TEST $CLI volume set $V0 performance.stat-prefetch off
b7d4d7
 TEST $CLI volume set $V0 performance.read-ahead off
b7d4d7
+TEST $CLI volume set $V0 performance.open-behind off
b7d4d7
 TEST $CLI volume set $V0 cluster.background-self-heal-count 0
b7d4d7
 TEST $CLI volume start $V0
b7d4d7
 TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0 --direct-io-mode=enable
b7d4d7
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
b7d4d7
index 919eea3..76b5809 100644
b7d4d7
--- a/xlators/mount/fuse/src/fuse-bridge.c
b7d4d7
+++ b/xlators/mount/fuse/src/fuse-bridge.c
b7d4d7
@@ -3398,6 +3398,8 @@ fuse_release(xlator_t *this, fuse_in_header_t *finh, void *msg,
b7d4d7
     gf_log("glusterfs-fuse", GF_LOG_TRACE,
b7d4d7
            "finh->unique: %" PRIu64 ": RELEASE %p", finh->unique, state->fd);
b7d4d7
 
b7d4d7
+    fd_close(state->fd);
b7d4d7
+
b7d4d7
     fuse_fd_ctx_destroy(this, state->fd);
b7d4d7
     fd_unref(fd);
b7d4d7
 
b7d4d7
diff --git a/xlators/performance/open-behind/src/open-behind-messages.h b/xlators/performance/open-behind/src/open-behind-messages.h
b7d4d7
index f250824..0e78917 100644
b7d4d7
--- a/xlators/performance/open-behind/src/open-behind-messages.h
b7d4d7
+++ b/xlators/performance/open-behind/src/open-behind-messages.h
b7d4d7
@@ -23,6 +23,10 @@
b7d4d7
  */
b7d4d7
 
b7d4d7
 GLFS_MSGID(OPEN_BEHIND, OPEN_BEHIND_MSG_XLATOR_CHILD_MISCONFIGURED,
b7d4d7
-           OPEN_BEHIND_MSG_VOL_MISCONFIGURED, OPEN_BEHIND_MSG_NO_MEMORY);
b7d4d7
+           OPEN_BEHIND_MSG_VOL_MISCONFIGURED, OPEN_BEHIND_MSG_NO_MEMORY,
b7d4d7
+           OPEN_BEHIND_MSG_FAILED, OPEN_BEHIND_MSG_BAD_STATE);
b7d4d7
+
b7d4d7
+#define OPEN_BEHIND_MSG_FAILED_STR "Failed to submit fop"
b7d4d7
+#define OPEN_BEHIND_MSG_BAD_STATE_STR "Unexpected state"
b7d4d7
 
b7d4d7
 #endif /* _OPEN_BEHIND_MESSAGES_H_ */
b7d4d7
diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c
b7d4d7
index cbe89ec..e43fe73 100644
b7d4d7
--- a/xlators/performance/open-behind/src/open-behind.c
b7d4d7
+++ b/xlators/performance/open-behind/src/open-behind.c
b7d4d7
@@ -16,6 +16,18 @@
b7d4d7
 #include "open-behind-messages.h"
b7d4d7
 #include <glusterfs/glusterfs-acl.h>
b7d4d7
 
b7d4d7
+/* Note: The initial design of open-behind was made to cover the simple case
b7d4d7
+ *       of open, read, close for small files. This pattern combined with
b7d4d7
+ *       quick-read can do the whole operation without a single request to the
b7d4d7
+ *       bricks (except the initial lookup).
b7d4d7
+ *
b7d4d7
+ *       The way to do this has been improved, but the logic remains the same.
b7d4d7
+ *       Basically, this means that any operation sent to the fd or the inode
b7d4d7
+ *       that is not a read, causes the open request to be sent to the
b7d4d7
+ *       bricks, and all future operations will be executed synchronously,
b7d4d7
+ *       including opens (it's reset once all fd's are closed).
b7d4d7
+ */
b7d4d7
+
b7d4d7
 typedef struct ob_conf {
b7d4d7
     gf_boolean_t use_anonymous_fd; /* use anonymous FDs wherever safe
b7d4d7
                                       e.g - fstat() readv()
b7d4d7
@@ -32,1096 +44,754 @@ typedef struct ob_conf {
b7d4d7
                                         */
b7d4d7
 } ob_conf_t;
b7d4d7
 
b7d4d7
-typedef struct ob_inode {
b7d4d7
-    inode_t *inode;
b7d4d7
-    struct list_head resume_fops;
b7d4d7
-    struct list_head ob_fds;
b7d4d7
-    int count;
b7d4d7
-    int op_ret;
b7d4d7
-    int op_errno;
b7d4d7
-    gf_boolean_t open_in_progress;
b7d4d7
-    int unlinked;
b7d4d7
-} ob_inode_t;
b7d4d7
+/* A negative state represents an errno value negated. In this case the
b7d4d7
+ * current operation cannot be processed. */
b7d4d7
+typedef enum _ob_state {
b7d4d7
+    /* There are no opens on the inode or the first open is already
b7d4d7
+     * completed. The current operation can be sent directly. */
b7d4d7
+    OB_STATE_READY = 0,
b7d4d7
 
b7d4d7
-typedef struct ob_fd {
b7d4d7
-    call_frame_t *open_frame;
b7d4d7
-    loc_t loc;
b7d4d7
-    dict_t *xdata;
b7d4d7
-    int flags;
b7d4d7
-    int op_errno;
b7d4d7
-    ob_inode_t *ob_inode;
b7d4d7
-    fd_t *fd;
b7d4d7
-    gf_boolean_t opened;
b7d4d7
-    gf_boolean_t ob_inode_fops_waiting;
b7d4d7
-    struct list_head list;
b7d4d7
-    struct list_head ob_fds_on_inode;
b7d4d7
-} ob_fd_t;
b7d4d7
+    /* There's an open pending and it has been triggered. The current
b7d4d7
+     * operation should be "stubbified" and processed with
b7d4d7
+     * ob_stub_dispatch(). */
b7d4d7
+    OB_STATE_OPEN_TRIGGERED,
b7d4d7
 
b7d4d7
-ob_inode_t *
b7d4d7
-ob_inode_alloc(inode_t *inode)
b7d4d7
-{
b7d4d7
-    ob_inode_t *ob_inode = NULL;
b7d4d7
+    /* There's an open pending but it has not been triggered. The current
b7d4d7
+     * operation can be processed directly but using an anonymous fd. */
b7d4d7
+    OB_STATE_OPEN_PENDING,
b7d4d7
 
b7d4d7
-    ob_inode = GF_CALLOC(1, sizeof(*ob_inode), gf_ob_mt_inode_t);
b7d4d7
-    if (ob_inode == NULL)
b7d4d7
-        goto out;
b7d4d7
+    /* The current operation is the first open on the inode. */
b7d4d7
+    OB_STATE_FIRST_OPEN
b7d4d7
+} ob_state_t;
b7d4d7
 
b7d4d7
-    ob_inode->inode = inode;
b7d4d7
-    INIT_LIST_HEAD(&ob_inode->resume_fops);
b7d4d7
-    INIT_LIST_HEAD(&ob_inode->ob_fds);
b7d4d7
-out:
b7d4d7
-    return ob_inode;
b7d4d7
-}
b7d4d7
-
b7d4d7
-void
b7d4d7
-ob_inode_free(ob_inode_t *ob_inode)
b7d4d7
-{
b7d4d7
-    if (ob_inode == NULL)
b7d4d7
-        goto out;
b7d4d7
+typedef struct ob_inode {
b7d4d7
+    /* List of stubs pending on the first open. Once the first open is
b7d4d7
+     * complete, all these stubs will be resubmitted, and dependencies
b7d4d7
+     * will be checked again. */
b7d4d7
+    struct list_head resume_fops;
b7d4d7
 
b7d4d7
-    list_del_init(&ob_inode->resume_fops);
b7d4d7
-    list_del_init(&ob_inode->ob_fds);
b7d4d7
+    /* The inode this object references. */
b7d4d7
+    inode_t *inode;
b7d4d7
 
b7d4d7
-    GF_FREE(ob_inode);
b7d4d7
-out:
b7d4d7
-    return;
b7d4d7
-}
b7d4d7
+    /* The fd from the first open sent to this inode. It will be set
b7d4d7
+     * from the moment the open is processed until the open is fully
b7d4d7
+     * executed or closed before actually opened. It's NULL in all
b7d4d7
+     * other cases. */
b7d4d7
+    fd_t *first_fd;
b7d4d7
+
b7d4d7
+    /* The stub from the first open operation. When open fop starts
b7d4d7
+     * being processed, it's assigned the OB_OPEN_PREPARING value
b7d4d7
+     * until the actual stub is created. This is necessary to avoid
b7d4d7
+     * creating the stub inside a locked region. Once the stub is
b7d4d7
+     * successfully created, it's assigned here. This value is set
b7d4d7
+     * to NULL once the stub is resumed. */
b7d4d7
+    call_stub_t *first_open;
b7d4d7
+
b7d4d7
+    /* The total number of currently open fd's on this inode. */
b7d4d7
+    int32_t open_count;
b7d4d7
+
b7d4d7
+    /* This flag is set as soon as we know that the open will be
b7d4d7
+     * sent to the bricks, even before the stub is ready. */
b7d4d7
+    bool triggered;
b7d4d7
+} ob_inode_t;
b7d4d7
 
b7d4d7
-ob_inode_t *
b7d4d7
-ob_inode_get(xlator_t *this, inode_t *inode)
b7d4d7
+/* Dummy pointer used temporarily while the actual open stub is being created */
b7d4d7
+#define OB_OPEN_PREPARING ((call_stub_t *)-1)
b7d4d7
+
b7d4d7
+#define OB_POST_COMMON(_fop, _xl, _frame, _fd, _args...)                       \
b7d4d7
+    case OB_STATE_FIRST_OPEN:                                                  \
b7d4d7
+        gf_smsg((_xl)->name, GF_LOG_ERROR, EINVAL, OPEN_BEHIND_MSG_BAD_STATE,  \
b7d4d7
+                "fop=%s", #_fop, "state=%d", __ob_state, NULL);                \
b7d4d7
+        default_##_fop##_failure_cbk(_frame, EINVAL);                          \
b7d4d7
+        break;                                                                 \
b7d4d7
+    case OB_STATE_READY:                                                       \
b7d4d7
+        default_##_fop(_frame, _xl, ##_args);                                  \
b7d4d7
+        break;                                                                 \
b7d4d7
+    case OB_STATE_OPEN_TRIGGERED: {                                            \
b7d4d7
+        call_stub_t *__ob_stub = fop_##_fop##_stub(_frame, ob_##_fop,          \
b7d4d7
+                                                   ##_args);                   \
b7d4d7
+        if (__ob_stub != NULL) {                                               \
b7d4d7
+            ob_stub_dispatch(_xl, __ob_inode, _fd, __ob_stub);                 \
b7d4d7
+            break;                                                             \
b7d4d7
+        }                                                                      \
b7d4d7
+        __ob_state = -ENOMEM;                                                  \
b7d4d7
+    }                                                                          \
b7d4d7
+    default:                                                                   \
b7d4d7
+        gf_smsg((_xl)->name, GF_LOG_ERROR, -__ob_state,                        \
b7d4d7
+                OPEN_BEHIND_MSG_FAILED, "fop=%s", #_fop, NULL);                \
b7d4d7
+        default_##_fop##_failure_cbk(_frame, -__ob_state)
b7d4d7
+
b7d4d7
+#define OB_POST_FD(_fop, _xl, _frame, _fd, _trigger, _args...)                 \
b7d4d7
+    do {                                                                       \
b7d4d7
+        ob_inode_t *__ob_inode;                                                \
b7d4d7
+        fd_t *__first_fd;                                                      \
b7d4d7
+        ob_state_t __ob_state = ob_open_and_resume_fd(                         \
b7d4d7
+            _xl, _fd, 0, true, _trigger, &__ob_inode, &__first_fd);            \
b7d4d7
+        switch (__ob_state) {                                                  \
b7d4d7
+            case OB_STATE_OPEN_PENDING:                                        \
b7d4d7
+                if (!(_trigger)) {                                             \
b7d4d7
+                    fd_t *__ob_fd = fd_anonymous_with_flags((_fd)->inode,      \
b7d4d7
+                                                            (_fd)->flags);     \
b7d4d7
+                    if (__ob_fd != NULL) {                                     \
b7d4d7
+                        default_##_fop(_frame, _xl, ##_args);                  \
b7d4d7
+                        fd_unref(__ob_fd);                                     \
b7d4d7
+                        break;                                                 \
b7d4d7
+                    }                                                          \
b7d4d7
+                    __ob_state = -ENOMEM;                                      \
b7d4d7
+                }                                                              \
b7d4d7
+                OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args);        \
b7d4d7
+        }                                                                      \
b7d4d7
+    } while (0)
b7d4d7
+
b7d4d7
+#define OB_POST_FLUSH(_xl, _frame, _fd, _args...)                              \
b7d4d7
+    do {                                                                       \
b7d4d7
+        ob_inode_t *__ob_inode;                                                \
b7d4d7
+        fd_t *__first_fd;                                                      \
b7d4d7
+        ob_state_t __ob_state = ob_open_and_resume_fd(                         \
b7d4d7
+            _xl, _fd, 0, true, false, &__ob_inode, &__first_fd);               \
b7d4d7
+        switch (__ob_state) {                                                  \
b7d4d7
+            case OB_STATE_OPEN_PENDING:                                        \
b7d4d7
+                default_flush_cbk(_frame, NULL, _xl, 0, 0, NULL);              \
b7d4d7
+                break;                                                         \
b7d4d7
+                OB_POST_COMMON(flush, _xl, _frame, __first_fd, ##_args);       \
b7d4d7
+        }                                                                      \
b7d4d7
+    } while (0)
b7d4d7
+
b7d4d7
+#define OB_POST_INODE(_fop, _xl, _frame, _inode, _trigger, _args...)           \
b7d4d7
+    do {                                                                       \
b7d4d7
+        ob_inode_t *__ob_inode;                                                \
b7d4d7
+        fd_t *__first_fd;                                                      \
b7d4d7
+        ob_state_t __ob_state = ob_open_and_resume_inode(                      \
b7d4d7
+            _xl, _inode, NULL, 0, true, _trigger, &__ob_inode, &__first_fd);   \
b7d4d7
+        switch (__ob_state) {                                                  \
b7d4d7
+            case OB_STATE_OPEN_PENDING:                                        \
b7d4d7
+                OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args);        \
b7d4d7
+        }                                                                      \
b7d4d7
+    } while (0)
b7d4d7
+
b7d4d7
+static ob_inode_t *
b7d4d7
+ob_inode_get_locked(xlator_t *this, inode_t *inode)
b7d4d7
 {
b7d4d7
     ob_inode_t *ob_inode = NULL;
b7d4d7
     uint64_t value = 0;
b7d4d7
-    int ret = 0;
b7d4d7
 
b7d4d7
-    if (!inode)
b7d4d7
-        goto out;
b7d4d7
+    if ((__inode_ctx_get(inode, this, &value) == 0) && (value != 0)) {
b7d4d7
+        return (ob_inode_t *)(uintptr_t)value;
b7d4d7
+    }
b7d4d7
 
b7d4d7
-    LOCK(&inode->lock);
b7d4d7
-    {
b7d4d7
-        __inode_ctx_get(inode, this, &value);
b7d4d7
-        if (value == 0) {
b7d4d7
-            ob_inode = ob_inode_alloc(inode);
b7d4d7
-            if (ob_inode == NULL)
b7d4d7
-                goto unlock;
b7d4d7
-
b7d4d7
-            value = (uint64_t)(uintptr_t)ob_inode;
b7d4d7
-            ret = __inode_ctx_set(inode, this, &value);
b7d4d7
-            if (ret < 0) {
b7d4d7
-                ob_inode_free(ob_inode);
b7d4d7
-                ob_inode = NULL;
b7d4d7
-            }
b7d4d7
-        } else {
b7d4d7
-            ob_inode = (ob_inode_t *)(uintptr_t)value;
b7d4d7
+    ob_inode = GF_CALLOC(1, sizeof(*ob_inode), gf_ob_mt_inode_t);
b7d4d7
+    if (ob_inode != NULL) {
b7d4d7
+        ob_inode->inode = inode;
b7d4d7
+        INIT_LIST_HEAD(&ob_inode->resume_fops);
b7d4d7
+
b7d4d7
+        value = (uint64_t)(uintptr_t)ob_inode;
b7d4d7
+        if (__inode_ctx_set(inode, this, &value) < 0) {
b7d4d7
+            GF_FREE(ob_inode);
b7d4d7
+            ob_inode = NULL;
b7d4d7
         }
b7d4d7
     }
b7d4d7
-unlock:
b7d4d7
-    UNLOCK(&inode->lock);
b7d4d7
 
b7d4d7
-out:
b7d4d7
     return ob_inode;
b7d4d7
 }
b7d4d7
 
b7d4d7
-ob_fd_t *
b7d4d7
-__ob_fd_ctx_get(xlator_t *this, fd_t *fd)
b7d4d7
+static ob_state_t
b7d4d7
+ob_open_and_resume_inode(xlator_t *xl, inode_t *inode, fd_t *fd,
b7d4d7
+                         int32_t open_count, bool synchronous, bool trigger,
b7d4d7
+                         ob_inode_t **pob_inode, fd_t **pfd)
b7d4d7
 {
b7d4d7
-    uint64_t value = 0;
b7d4d7
-    int ret = -1;
b7d4d7
-    ob_fd_t *ob_fd = NULL;
b7d4d7
+    ob_conf_t *conf;
b7d4d7
+    ob_inode_t *ob_inode;
b7d4d7
+    call_stub_t *open_stub;
b7d4d7
 
b7d4d7
-    ret = __fd_ctx_get(fd, this, &value);
b7d4d7
-    if (ret)
b7d4d7
-        return NULL;
b7d4d7
+    if (inode == NULL) {
b7d4d7
+        return OB_STATE_READY;
b7d4d7
+    }
b7d4d7
 
b7d4d7
-    ob_fd = (void *)((long)value);
b7d4d7
+    conf = xl->private;
b7d4d7
 
b7d4d7
-    return ob_fd;
b7d4d7
-}
b7d4d7
+    *pfd = NULL;
b7d4d7
 
b7d4d7
-ob_fd_t *
b7d4d7
-ob_fd_ctx_get(xlator_t *this, fd_t *fd)
b7d4d7
-{
b7d4d7
-    ob_fd_t *ob_fd = NULL;
b7d4d7
-
b7d4d7
-    LOCK(&fd->lock);
b7d4d7
+    LOCK(&inode->lock);
b7d4d7
     {
b7d4d7
-        ob_fd = __ob_fd_ctx_get(this, fd);
b7d4d7
-    }
b7d4d7
-    UNLOCK(&fd->lock);
b7d4d7
-
b7d4d7
-    return ob_fd;
b7d4d7
-}
b7d4d7
+        ob_inode = ob_inode_get_locked(xl, inode);
b7d4d7
+        if (ob_inode == NULL) {
b7d4d7
+            UNLOCK(&inode->lock);
b7d4d7
 
b7d4d7
-int
b7d4d7
-__ob_fd_ctx_set(xlator_t *this, fd_t *fd, ob_fd_t *ob_fd)
b7d4d7
-{
b7d4d7
-    uint64_t value = 0;
b7d4d7
-    int ret = -1;
b7d4d7
+            return -ENOMEM;
b7d4d7
+        }
b7d4d7
+        *pob_inode = ob_inode;
b7d4d7
+
b7d4d7
+        ob_inode->open_count += open_count;
b7d4d7
+
b7d4d7
+        /* If first_fd is not NULL, it means that there's a previous open not
b7d4d7
+         * yet completed. */
b7d4d7
+        if (ob_inode->first_fd != NULL) {
b7d4d7
+            *pfd = ob_inode->first_fd;
b7d4d7
+            /* If the current request doesn't trigger the open and it hasn't
b7d4d7
+             * been triggered yet, we can continue without issuing the open
b7d4d7
+             * only if the current request belongs to the same fd as the
b7d4d7
+             * first one. */
b7d4d7
+            if (!trigger && !ob_inode->triggered &&
b7d4d7
+                (ob_inode->first_fd == fd)) {
b7d4d7
+                UNLOCK(&inode->lock);
b7d4d7
+
b7d4d7
+                return OB_STATE_OPEN_PENDING;
b7d4d7
+            }
b7d4d7
 
b7d4d7
-    value = (long)((void *)ob_fd);
b7d4d7
+            /* We need to issue the open. It could have already been triggered
b7d4d7
+             * before. In this case open_stub will be NULL. Or the initial open
b7d4d7
+             * may not be completely ready yet. In this case open_stub will be
b7d4d7
+             * OB_OPEN_PREPARING. */
b7d4d7
+            open_stub = ob_inode->first_open;
b7d4d7
+            ob_inode->first_open = NULL;
b7d4d7
+            ob_inode->triggered = true;
b7d4d7
 
b7d4d7
-    ret = __fd_ctx_set(fd, this, value);
b7d4d7
+            UNLOCK(&inode->lock);
b7d4d7
 
b7d4d7
-    return ret;
b7d4d7
-}
b7d4d7
+            if ((open_stub != NULL) && (open_stub != OB_OPEN_PREPARING)) {
b7d4d7
+                call_resume(open_stub);
b7d4d7
+            }
b7d4d7
 
b7d4d7
-int
b7d4d7
-ob_fd_ctx_set(xlator_t *this, fd_t *fd, ob_fd_t *ob_fd)
b7d4d7
-{
b7d4d7
-    int ret = -1;
b7d4d7
+            return OB_STATE_OPEN_TRIGGERED;
b7d4d7
+        }
b7d4d7
 
b7d4d7
-    LOCK(&fd->lock);
b7d4d7
-    {
b7d4d7
-        ret = __ob_fd_ctx_set(this, fd, ob_fd);
b7d4d7
-    }
b7d4d7
-    UNLOCK(&fd->lock);
b7d4d7
+        /* There's no pending open. Only opens can be non synchronous, so all
b7d4d7
+         * regular fops will be processed directly. For non synchronous opens,
b7d4d7
+         * we'll still process them normally (i.e. synchronously) if there are
b7d4d7
+         * more file descriptors open. */
b7d4d7
+        if (synchronous || (ob_inode->open_count > open_count)) {
b7d4d7
+            UNLOCK(&inode->lock);
b7d4d7
 
b7d4d7
-    return ret;
b7d4d7
-}
b7d4d7
+            return OB_STATE_READY;
b7d4d7
+        }
b7d4d7
 
b7d4d7
-ob_fd_t *
b7d4d7
-ob_fd_new(void)
b7d4d7
-{
b7d4d7
-    ob_fd_t *ob_fd = NULL;
b7d4d7
+        *pfd = fd;
b7d4d7
 
b7d4d7
-    ob_fd = GF_CALLOC(1, sizeof(*ob_fd), gf_ob_mt_fd_t);
b7d4d7
+        /* This is the first open. We keep a reference on the fd and set
b7d4d7
+         * first_open stub to OB_OPEN_PREPARING until the actual stub can
b7d4d7
+         * be assigned (we don't create the stub here to avoid doing memory
b7d4d7
+         * allocations inside the mutex). */
b7d4d7
+        ob_inode->first_fd = __fd_ref(fd);
b7d4d7
+        ob_inode->first_open = OB_OPEN_PREPARING;
b7d4d7
 
b7d4d7
-    INIT_LIST_HEAD(&ob_fd->list);
b7d4d7
-    INIT_LIST_HEAD(&ob_fd->ob_fds_on_inode);
b7d4d7
+        /* If lazy_open is not set, we'll need to immediately send the open,
b7d4d7
+         * so we set triggered right now. */
b7d4d7
+        ob_inode->triggered = !conf->lazy_open;
b7d4d7
+    }
b7d4d7
+    UNLOCK(&inode->lock);
b7d4d7
 
b7d4d7
-    return ob_fd;
b7d4d7
+    return OB_STATE_FIRST_OPEN;
b7d4d7
 }
b7d4d7
 
b7d4d7
-void
b7d4d7
-ob_fd_free(ob_fd_t *ob_fd)
b7d4d7
+static ob_state_t
b7d4d7
+ob_open_and_resume_fd(xlator_t *xl, fd_t *fd, int32_t open_count,
b7d4d7
+                      bool synchronous, bool trigger, ob_inode_t **pob_inode,
b7d4d7
+                      fd_t **pfd)
b7d4d7
 {
b7d4d7
-    LOCK(&ob_fd->fd->inode->lock);
b7d4d7
-    {
b7d4d7
-        list_del_init(&ob_fd->ob_fds_on_inode);
b7d4d7
-    }
b7d4d7
-    UNLOCK(&ob_fd->fd->inode->lock);
b7d4d7
-
b7d4d7
-    loc_wipe(&ob_fd->loc);
b7d4d7
-
b7d4d7
-    if (ob_fd->xdata)
b7d4d7
-        dict_unref(ob_fd->xdata);
b7d4d7
+    uint64_t err;
b7d4d7
 
b7d4d7
-    if (ob_fd->open_frame) {
b7d4d7
-        /* If we sill have a frame it means that background open has never
b7d4d7
-         * been triggered. We need to release the pending reference. */
b7d4d7
-        fd_unref(ob_fd->fd);
b7d4d7
-
b7d4d7
-        STACK_DESTROY(ob_fd->open_frame->root);
b7d4d7
+    if ((fd_ctx_get(fd, xl, &err) == 0) && (err != 0)) {
b7d4d7
+        return (ob_state_t)-err;
b7d4d7
     }
b7d4d7
 
b7d4d7
-    GF_FREE(ob_fd);
b7d4d7
+    return ob_open_and_resume_inode(xl, fd->inode, fd, open_count, synchronous,
b7d4d7
+                                    trigger, pob_inode, pfd);
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
-ob_wake_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
b7d4d7
-            int op_errno, fd_t *fd_ret, dict_t *xdata)
b7d4d7
+static ob_state_t
b7d4d7
+ob_open_behind(xlator_t *xl, fd_t *fd, int32_t flags, ob_inode_t **pob_inode,
b7d4d7
+               fd_t **pfd)
b7d4d7
 {
b7d4d7
-    fd_t *fd = NULL;
b7d4d7
-    int count = 0;
b7d4d7
-    int ob_inode_op_ret = 0;
b7d4d7
-    int ob_inode_op_errno = 0;
b7d4d7
-    ob_fd_t *ob_fd = NULL;
b7d4d7
-    call_stub_t *stub = NULL, *tmp = NULL;
b7d4d7
-    ob_inode_t *ob_inode = NULL;
b7d4d7
-    gf_boolean_t ob_inode_fops_waiting = _gf_false;
b7d4d7
-    struct list_head fops_waiting_on_fd, fops_waiting_on_inode;
b7d4d7
+    bool synchronous;
b7d4d7
 
b7d4d7
-    fd = frame->local;
b7d4d7
-    frame->local = NULL;
b7d4d7
-
b7d4d7
-    INIT_LIST_HEAD(&fops_waiting_on_fd);
b7d4d7
-    INIT_LIST_HEAD(&fops_waiting_on_inode);
b7d4d7
+    /* TODO: If O_CREAT, O_APPEND, O_WRONLY or O_DIRECT are specified, shouldn't
b7d4d7
+     *       we also execute this open synchronously ? */
b7d4d7
+    synchronous = (flags & O_TRUNC) != 0;
b7d4d7
 
b7d4d7
-    ob_inode = ob_inode_get(this, fd->inode);
b7d4d7
+    return ob_open_and_resume_fd(xl, fd, 1, synchronous, true, pob_inode, pfd);
b7d4d7
+}
b7d4d7
 
b7d4d7
-    LOCK(&fd->lock);
b7d4d7
+static int32_t
b7d4d7
+ob_stub_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd,
b7d4d7
+                 call_stub_t *stub)
b7d4d7
+{
b7d4d7
+    LOCK(&ob_inode->inode->lock);
b7d4d7
     {
b7d4d7
-        ob_fd = __ob_fd_ctx_get(this, fd);
b7d4d7
-        ob_fd->opened = _gf_true;
b7d4d7
-
b7d4d7
-        ob_inode_fops_waiting = ob_fd->ob_inode_fops_waiting;
b7d4d7
-
b7d4d7
-        list_splice_init(&ob_fd->list, &fops_waiting_on_fd);
b7d4d7
-
b7d4d7
-        if (op_ret < 0) {
b7d4d7
-            /* mark fd BAD for ever */
b7d4d7
-            ob_fd->op_errno = op_errno;
b7d4d7
-            ob_fd = NULL; /*shouldn't be freed*/
b7d4d7
-        } else {
b7d4d7
-            __fd_ctx_del(fd, this, NULL);
b7d4d7
-        }
b7d4d7
-    }
b7d4d7
-    UNLOCK(&fd->lock);
b7d4d7
-
b7d4d7
-    if (ob_inode_fops_waiting) {
b7d4d7
-        LOCK(&fd->inode->lock);
b7d4d7
-        {
b7d4d7
-            count = --ob_inode->count;
b7d4d7
-            if (op_ret < 0) {
b7d4d7
-                /* TODO: when to reset the error? */
b7d4d7
-                ob_inode->op_ret = -1;
b7d4d7
-                ob_inode->op_errno = op_errno;
b7d4d7
-            }
b7d4d7
-
b7d4d7
-            if (count == 0) {
b7d4d7
-                ob_inode->open_in_progress = _gf_false;
b7d4d7
-                ob_inode_op_ret = ob_inode->op_ret;
b7d4d7
-                ob_inode_op_errno = ob_inode->op_errno;
b7d4d7
-                list_splice_init(&ob_inode->resume_fops,
b7d4d7
-                                 &fops_waiting_on_inode);
b7d4d7
-            }
b7d4d7
+        /* We only queue a stub if the open has not been completed or
b7d4d7
+         * cancelled. */
b7d4d7
+        if (ob_inode->first_fd == fd) {
b7d4d7
+            list_add_tail(&stub->list, &ob_inode->resume_fops);
b7d4d7
+            stub = NULL;
b7d4d7
         }
b7d4d7
-        UNLOCK(&fd->inode->lock);
b7d4d7
-    }
b7d4d7
-
b7d4d7
-    if (ob_fd)
b7d4d7
-        ob_fd_free(ob_fd);
b7d4d7
-
b7d4d7
-    list_for_each_entry_safe(stub, tmp, &fops_waiting_on_fd, list)
b7d4d7
-    {
b7d4d7
-        list_del_init(&stub->list);
b7d4d7
-
b7d4d7
-        if (op_ret < 0)
b7d4d7
-            call_unwind_error(stub, -1, op_errno);
b7d4d7
-        else
b7d4d7
-            call_resume(stub);
b7d4d7
     }
b7d4d7
+    UNLOCK(&ob_inode->inode->lock);
b7d4d7
 
b7d4d7
-    list_for_each_entry_safe(stub, tmp, &fops_waiting_on_inode, list)
b7d4d7
-    {
b7d4d7
-        list_del_init(&stub->list);
b7d4d7
-
b7d4d7
-        if (ob_inode_op_ret < 0)
b7d4d7
-            call_unwind_error(stub, -1, ob_inode_op_errno);
b7d4d7
-        else
b7d4d7
-            call_resume(stub);
b7d4d7
+    if (stub != NULL) {
b7d4d7
+        call_resume(stub);
b7d4d7
     }
b7d4d7
 
b7d4d7
-    /* The background open is completed. We can release the 'fd' reference. */
b7d4d7
-    fd_unref(fd);
b7d4d7
-
b7d4d7
-    STACK_DESTROY(frame->root);
b7d4d7
-
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
-ob_fd_wake(xlator_t *this, fd_t *fd, ob_fd_t *ob_fd)
b7d4d7
+static int32_t
b7d4d7
+ob_open_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd,
b7d4d7
+                 call_stub_t *stub)
b7d4d7
 {
b7d4d7
-    call_frame_t *frame = NULL;
b7d4d7
-
b7d4d7
-    if (ob_fd == NULL) {
b7d4d7
-        LOCK(&fd->lock);
b7d4d7
-        {
b7d4d7
-            ob_fd = __ob_fd_ctx_get(this, fd);
b7d4d7
-            if (!ob_fd)
b7d4d7
-                goto unlock;
b7d4d7
+    bool closed;
b7d4d7
 
b7d4d7
-            frame = ob_fd->open_frame;
b7d4d7
-            ob_fd->open_frame = NULL;
b7d4d7
-        }
b7d4d7
-    unlock:
b7d4d7
-        UNLOCK(&fd->lock);
b7d4d7
-    } else {
b7d4d7
-        LOCK(&fd->lock);
b7d4d7
-        {
b7d4d7
-            frame = ob_fd->open_frame;
b7d4d7
-            ob_fd->open_frame = NULL;
b7d4d7
+    LOCK(&ob_inode->inode->lock);
b7d4d7
+    {
b7d4d7
+        closed = ob_inode->first_fd != fd;
b7d4d7
+        if (!closed) {
b7d4d7
+            if (ob_inode->triggered) {
b7d4d7
+                ob_inode->first_open = NULL;
b7d4d7
+            } else {
b7d4d7
+                ob_inode->first_open = stub;
b7d4d7
+                stub = NULL;
b7d4d7
+            }
b7d4d7
         }
b7d4d7
-        UNLOCK(&fd->lock);
b7d4d7
     }
b7d4d7
+    UNLOCK(&ob_inode->inode->lock);
b7d4d7
 
b7d4d7
-    if (frame) {
b7d4d7
-        /* We don't need to take a reference here. We already have a reference
b7d4d7
-         * while the open is pending. */
b7d4d7
-        frame->local = fd;
b7d4d7
-
b7d4d7
-        STACK_WIND(frame, ob_wake_cbk, FIRST_CHILD(this),
b7d4d7
-                   FIRST_CHILD(this)->fops->open, &ob_fd->loc, ob_fd->flags, fd,
b7d4d7
-                   ob_fd->xdata);
b7d4d7
+    if (stub != NULL) {
b7d4d7
+        if (closed) {
b7d4d7
+            call_stub_destroy(stub);
b7d4d7
+            fd_unref(fd);
b7d4d7
+        } else {
b7d4d7
+            call_resume(stub);
b7d4d7
+        }
b7d4d7
     }
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-void
b7d4d7
-ob_inode_wake(xlator_t *this, struct list_head *ob_fds)
b7d4d7
+static void
b7d4d7
+ob_resume_pending(struct list_head *list)
b7d4d7
 {
b7d4d7
-    ob_fd_t *ob_fd = NULL, *tmp = NULL;
b7d4d7
+    call_stub_t *stub;
b7d4d7
 
b7d4d7
-    if (!list_empty(ob_fds)) {
b7d4d7
-        list_for_each_entry_safe(ob_fd, tmp, ob_fds, ob_fds_on_inode)
b7d4d7
-        {
b7d4d7
-            ob_fd_wake(this, ob_fd->fd, ob_fd);
b7d4d7
-            ob_fd_free(ob_fd);
b7d4d7
-        }
b7d4d7
-    }
b7d4d7
-}
b7d4d7
+    while (!list_empty(list)) {
b7d4d7
+        stub = list_first_entry(list, call_stub_t, list);
b7d4d7
+        list_del_init(&stub->list);
b7d4d7
 
b7d4d7
-/* called holding inode->lock and fd->lock */
b7d4d7
-void
b7d4d7
-ob_fd_copy(ob_fd_t *src, ob_fd_t *dst)
b7d4d7
-{
b7d4d7
-    if (!src || !dst)
b7d4d7
-        goto out;
b7d4d7
-
b7d4d7
-    dst->fd = src->fd;
b7d4d7
-    dst->loc.inode = inode_ref(src->loc.inode);
b7d4d7
-    gf_uuid_copy(dst->loc.gfid, src->loc.gfid);
b7d4d7
-    dst->flags = src->flags;
b7d4d7
-    dst->xdata = dict_ref(src->xdata);
b7d4d7
-    dst->ob_inode = src->ob_inode;
b7d4d7
-out:
b7d4d7
-    return;
b7d4d7
+        call_resume(stub);
b7d4d7
+    }
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
-open_all_pending_fds_and_resume(xlator_t *this, inode_t *inode,
b7d4d7
-                                call_stub_t *stub)
b7d4d7
+static void
b7d4d7
+ob_open_completed(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, int32_t op_ret,
b7d4d7
+                  int32_t op_errno)
b7d4d7
 {
b7d4d7
-    ob_inode_t *ob_inode = NULL;
b7d4d7
-    ob_fd_t *ob_fd = NULL, *tmp = NULL;
b7d4d7
-    gf_boolean_t was_open_in_progress = _gf_false;
b7d4d7
-    gf_boolean_t wait_for_open = _gf_false;
b7d4d7
-    struct list_head ob_fds;
b7d4d7
+    struct list_head list;
b7d4d7
 
b7d4d7
-    ob_inode = ob_inode_get(this, inode);
b7d4d7
-    if (ob_inode == NULL)
b7d4d7
-        goto out;
b7d4d7
+    INIT_LIST_HEAD(&list);
b7d4d7
 
b7d4d7
-    INIT_LIST_HEAD(&ob_fds);
b7d4d7
+    if (op_ret < 0) {
b7d4d7
+        fd_ctx_set(fd, xl, op_errno <= 0 ? EIO : op_errno);
b7d4d7
+    }
b7d4d7
 
b7d4d7
-    LOCK(&inode->lock);
b7d4d7
+    LOCK(&ob_inode->inode->lock);
b7d4d7
     {
b7d4d7
-        was_open_in_progress = ob_inode->open_in_progress;
b7d4d7
-        ob_inode->unlinked = 1;
b7d4d7
-
b7d4d7
-        if (was_open_in_progress) {
b7d4d7
-            list_add_tail(&stub->list, &ob_inode->resume_fops);
b7d4d7
-            goto inode_unlock;
b7d4d7
-        }
b7d4d7
-
b7d4d7
-        list_for_each_entry(ob_fd, &ob_inode->ob_fds, ob_fds_on_inode)
b7d4d7
-        {
b7d4d7
-            LOCK(&ob_fd->fd->lock);
b7d4d7
-            {
b7d4d7
-                if (ob_fd->opened)
b7d4d7
-                    goto fd_unlock;
b7d4d7
-
b7d4d7
-                ob_inode->count++;
b7d4d7
-                ob_fd->ob_inode_fops_waiting = _gf_true;
b7d4d7
-
b7d4d7
-                if (ob_fd->open_frame == NULL) {
b7d4d7
-                    /* open in progress no need of wake */
b7d4d7
-                } else {
b7d4d7
-                    tmp = ob_fd_new();
b7d4d7
-                    tmp->open_frame = ob_fd->open_frame;
b7d4d7
-                    ob_fd->open_frame = NULL;
b7d4d7
-
b7d4d7
-                    ob_fd_copy(ob_fd, tmp);
b7d4d7
-                    list_add_tail(&tmp->ob_fds_on_inode, &ob_fds);
b7d4d7
-                }
b7d4d7
-            }
b7d4d7
-        fd_unlock:
b7d4d7
-            UNLOCK(&ob_fd->fd->lock);
b7d4d7
-        }
b7d4d7
-
b7d4d7
-        if (ob_inode->count) {
b7d4d7
-            wait_for_open = ob_inode->open_in_progress = _gf_true;
b7d4d7
-            list_add_tail(&stub->list, &ob_inode->resume_fops);
b7d4d7
+        /* Only update the fields if the file has not been closed before
b7d4d7
+         * getting here. */
b7d4d7
+        if (ob_inode->first_fd == fd) {
b7d4d7
+            list_splice_init(&ob_inode->resume_fops, &list);
b7d4d7
+            ob_inode->first_fd = NULL;
b7d4d7
+            ob_inode->first_open = NULL;
b7d4d7
+            ob_inode->triggered = false;
b7d4d7
         }
b7d4d7
     }
b7d4d7
-inode_unlock:
b7d4d7
-    UNLOCK(&inode->lock);
b7d4d7
+    UNLOCK(&ob_inode->inode->lock);
b7d4d7
 
b7d4d7
-out:
b7d4d7
-    if (!was_open_in_progress) {
b7d4d7
-        if (!wait_for_open) {
b7d4d7
-            call_resume(stub);
b7d4d7
-        } else {
b7d4d7
-            ob_inode_wake(this, &ob_fds);
b7d4d7
-        }
b7d4d7
-    }
b7d4d7
+    ob_resume_pending(&list);
b7d4d7
 
b7d4d7
-    return 0;
b7d4d7
+    fd_unref(fd);
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
-open_and_resume(xlator_t *this, fd_t *fd, call_stub_t *stub)
b7d4d7
+static int32_t
b7d4d7
+ob_open_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, int32_t op_ret,
b7d4d7
+            int32_t op_errno, fd_t *fd, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    ob_fd_t *ob_fd = NULL;
b7d4d7
-    int op_errno = 0;
b7d4d7
-
b7d4d7
-    if (!fd)
b7d4d7
-        goto nofd;
b7d4d7
-
b7d4d7
-    LOCK(&fd->lock);
b7d4d7
-    {
b7d4d7
-        ob_fd = __ob_fd_ctx_get(this, fd);
b7d4d7
-        if (!ob_fd)
b7d4d7
-            goto unlock;
b7d4d7
+    ob_inode_t *ob_inode;
b7d4d7
 
b7d4d7
-        if (ob_fd->op_errno) {
b7d4d7
-            op_errno = ob_fd->op_errno;
b7d4d7
-            goto unlock;
b7d4d7
-        }
b7d4d7
+    ob_inode = frame->local;
b7d4d7
+    frame->local = NULL;
b7d4d7
 
b7d4d7
-        list_add_tail(&stub->list, &ob_fd->list);
b7d4d7
-    }
b7d4d7
-unlock:
b7d4d7
-    UNLOCK(&fd->lock);
b7d4d7
+    ob_open_completed(xl, ob_inode, cookie, op_ret, op_errno);
b7d4d7
 
b7d4d7
-nofd:
b7d4d7
-    if (op_errno)
b7d4d7
-        call_unwind_error(stub, -1, op_errno);
b7d4d7
-    else if (ob_fd)
b7d4d7
-        ob_fd_wake(this, fd, NULL);
b7d4d7
-    else
b7d4d7
-        call_resume(stub);
b7d4d7
+    STACK_DESTROY(frame->root);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
-ob_open_behind(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
b7d4d7
+static int32_t
b7d4d7
+ob_open_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
b7d4d7
                fd_t *fd, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    ob_fd_t *ob_fd = NULL;
b7d4d7
-    int ret = -1;
b7d4d7
-    ob_conf_t *conf = NULL;
b7d4d7
-    ob_inode_t *ob_inode = NULL;
b7d4d7
-    gf_boolean_t open_in_progress = _gf_false;
b7d4d7
-    int unlinked = 0;
b7d4d7
-
b7d4d7
-    conf = this->private;
b7d4d7
-
b7d4d7
-    if (flags & O_TRUNC) {
b7d4d7
-        STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this),
b7d4d7
-                   FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
b7d4d7
-        return 0;
b7d4d7
-    }
b7d4d7
-
b7d4d7
-    ob_inode = ob_inode_get(this, fd->inode);
b7d4d7
-
b7d4d7
-    ob_fd = ob_fd_new();
b7d4d7
-    if (!ob_fd)
b7d4d7
-        goto enomem;
b7d4d7
-
b7d4d7
-    ob_fd->ob_inode = ob_inode;
b7d4d7
-
b7d4d7
-    ob_fd->fd = fd;
b7d4d7
-
b7d4d7
-    ob_fd->open_frame = copy_frame(frame);
b7d4d7
-    if (!ob_fd->open_frame)
b7d4d7
-        goto enomem;
b7d4d7
-    ret = loc_copy(&ob_fd->loc, loc);
b7d4d7
-    if (ret)
b7d4d7
-        goto enomem;
b7d4d7
-
b7d4d7
-    ob_fd->flags = flags;
b7d4d7
-    if (xdata)
b7d4d7
-        ob_fd->xdata = dict_ref(xdata);
b7d4d7
-
b7d4d7
-    LOCK(&fd->inode->lock);
b7d4d7
-    {
b7d4d7
-        open_in_progress = ob_inode->open_in_progress;
b7d4d7
-        unlinked = ob_inode->unlinked;
b7d4d7
-        if (!open_in_progress && !unlinked) {
b7d4d7
-            ret = ob_fd_ctx_set(this, fd, ob_fd);
b7d4d7
-            if (ret) {
b7d4d7
-                UNLOCK(&fd->inode->lock);
b7d4d7
-                goto enomem;
b7d4d7
-            }
b7d4d7
-
b7d4d7
-            list_add(&ob_fd->ob_fds_on_inode, &ob_inode->ob_fds);
b7d4d7
-        }
b7d4d7
-    }
b7d4d7
-    UNLOCK(&fd->inode->lock);
b7d4d7
-
b7d4d7
-    /* We take a reference while the background open is pending or being
b7d4d7
-     * processed. If we finally wind the request in the foreground, then
b7d4d7
-     * ob_fd_free() will take care of this additional reference. */
b7d4d7
-    fd_ref(fd);
b7d4d7
-
b7d4d7
-    if (!open_in_progress && !unlinked) {
b7d4d7
-        STACK_UNWIND_STRICT(open, frame, 0, 0, fd, xdata);
b7d4d7
-
b7d4d7
-        if (!conf->lazy_open)
b7d4d7
-            ob_fd_wake(this, fd, NULL);
b7d4d7
-    } else {
b7d4d7
-        ob_fd_free(ob_fd);
b7d4d7
-        STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this),
b7d4d7
-                   FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
b7d4d7
-    }
b7d4d7
+    STACK_WIND_COOKIE(frame, ob_open_cbk, fd, FIRST_CHILD(this),
b7d4d7
+                      FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
-enomem:
b7d4d7
-    if (ob_fd) {
b7d4d7
-        if (ob_fd->open_frame)
b7d4d7
-            STACK_DESTROY(ob_fd->open_frame->root);
b7d4d7
-
b7d4d7
-        loc_wipe(&ob_fd->loc);
b7d4d7
-        if (ob_fd->xdata)
b7d4d7
-            dict_unref(ob_fd->xdata);
b7d4d7
-
b7d4d7
-        GF_FREE(ob_fd);
b7d4d7
-    }
b7d4d7
-
b7d4d7
-    return -1;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd,
b7d4d7
         dict_t *xdata)
b7d4d7
 {
b7d4d7
-    fd_t *old_fd = NULL;
b7d4d7
-    int ret = -1;
b7d4d7
-    int op_errno = ENOMEM;
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    old_fd = fd_lookup(fd->inode, 0);
b7d4d7
-    if (old_fd) {
b7d4d7
-        /* open-behind only when this is the first FD */
b7d4d7
-        stub = fop_open_stub(frame, default_open_resume, loc, flags, fd, xdata);
b7d4d7
-        if (!stub) {
b7d4d7
-            fd_unref(old_fd);
b7d4d7
-            goto err;
b7d4d7
-        }
b7d4d7
-
b7d4d7
-        open_and_resume(this, old_fd, stub);
b7d4d7
+    ob_inode_t *ob_inode;
b7d4d7
+    call_frame_t *open_frame;
b7d4d7
+    call_stub_t *stub;
b7d4d7
+    fd_t *first_fd;
b7d4d7
+    ob_state_t state;
b7d4d7
+
b7d4d7
+    state = ob_open_behind(this, fd, flags, &ob_inode, &first_fd);
b7d4d7
+    if (state == OB_STATE_READY) {
b7d4d7
+        /* There's no pending open, but there are other file descriptors opened
b7d4d7
+         * or the current flags require a synchronous open. */
b7d4d7
+        return default_open(frame, this, loc, flags, fd, xdata);
b7d4d7
+    }
b7d4d7
 
b7d4d7
-        fd_unref(old_fd);
b7d4d7
+    if (state == OB_STATE_OPEN_TRIGGERED) {
b7d4d7
+        /* The first open is in progress (either because it was already issued
b7d4d7
+         * or because this request triggered it). We try to create a new stub
b7d4d7
+         * to retry the operation once the initial open completes. */
b7d4d7
+        stub = fop_open_stub(frame, ob_open, loc, flags, fd, xdata);
b7d4d7
+        if (stub != NULL) {
b7d4d7
+            return ob_stub_dispatch(this, ob_inode, first_fd, stub);
b7d4d7
+        }
b7d4d7
 
b7d4d7
-        return 0;
b7d4d7
+        state = -ENOMEM;
b7d4d7
     }
b7d4d7
 
b7d4d7
-    ret = ob_open_behind(frame, this, loc, flags, fd, xdata);
b7d4d7
-    if (ret) {
b7d4d7
-        goto err;
b7d4d7
-    }
b7d4d7
+    if (state == OB_STATE_FIRST_OPEN) {
b7d4d7
+        /* We try to create a stub for the new open. A new frame needs to be
b7d4d7
+         * used because the current one may be destroyed soon after sending
b7d4d7
+         * the open's reply. */
b7d4d7
+        open_frame = copy_frame(frame);
b7d4d7
+        if (open_frame != NULL) {
b7d4d7
+            stub = fop_open_stub(open_frame, ob_open_resume, loc, flags, fd,
b7d4d7
+                                 xdata);
b7d4d7
+            if (stub != NULL) {
b7d4d7
+                open_frame->local = ob_inode;
b7d4d7
 
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    gf_msg(this->name, GF_LOG_ERROR, op_errno, OPEN_BEHIND_MSG_NO_MEMORY, "%s",
b7d4d7
-           loc->path);
b7d4d7
+                /* TODO: Previous version passed xdata back to the caller, but
b7d4d7
+                 *       probably this doesn't make sense since it won't contain
b7d4d7
+                 *       any requested data. I think it would be better to pass
b7d4d7
+                 *       NULL for xdata. */
b7d4d7
+                default_open_cbk(frame, NULL, this, 0, 0, fd, xdata);
b7d4d7
 
b7d4d7
-    STACK_UNWIND_STRICT(open, frame, -1, op_errno, 0, 0);
b7d4d7
+                return ob_open_dispatch(this, ob_inode, first_fd, stub);
b7d4d7
+            }
b7d4d7
 
b7d4d7
-    return 0;
b7d4d7
-}
b7d4d7
+            STACK_DESTROY(open_frame->root);
b7d4d7
+        }
b7d4d7
 
b7d4d7
-fd_t *
b7d4d7
-ob_get_wind_fd(xlator_t *this, fd_t *fd, uint32_t *flag)
b7d4d7
-{
b7d4d7
-    fd_t *wind_fd = NULL;
b7d4d7
-    ob_fd_t *ob_fd = NULL;
b7d4d7
-    ob_conf_t *conf = NULL;
b7d4d7
+        /* In case of error, simulate a regular completion but with an error
b7d4d7
+         * code. */
b7d4d7
+        ob_open_completed(this, ob_inode, first_fd, -1, ENOMEM);
b7d4d7
 
b7d4d7
-    conf = this->private;
b7d4d7
+        state = -ENOMEM;
b7d4d7
+    }
b7d4d7
 
b7d4d7
-    ob_fd = ob_fd_ctx_get(this, fd);
b7d4d7
+    /* In case of failure we need to decrement the number of open files because
b7d4d7
+     * ob_fdclose() won't be called. */
b7d4d7
 
b7d4d7
-    if (ob_fd && ob_fd->open_frame && conf->use_anonymous_fd) {
b7d4d7
-        wind_fd = fd_anonymous(fd->inode);
b7d4d7
-        if ((ob_fd->flags & O_DIRECT) && (flag))
b7d4d7
-            *flag = *flag | O_DIRECT;
b7d4d7
-    } else {
b7d4d7
-        wind_fd = fd_ref(fd);
b7d4d7
+    LOCK(&fd->inode->lock);
b7d4d7
+    {
b7d4d7
+        ob_inode->open_count--;
b7d4d7
     }
b7d4d7
+    UNLOCK(&fd->inode->lock);
b7d4d7
 
b7d4d7
-    return wind_fd;
b7d4d7
+    gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s",
b7d4d7
+            "open", "path=%s", loc->path, NULL);
b7d4d7
+
b7d4d7
+    return default_open_failure_cbk(frame, -state);
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
b7d4d7
          off_t offset, uint32_t flags, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-    fd_t *wind_fd = NULL;
b7d4d7
-    ob_conf_t *conf = NULL;
b7d4d7
+    ob_conf_t *conf = this->private;
b7d4d7
+    bool trigger = conf->read_after_open || !conf->use_anonymous_fd;
b7d4d7
 
b7d4d7
-    conf = this->private;
b7d4d7
-
b7d4d7
-    if (!conf->read_after_open)
b7d4d7
-        wind_fd = ob_get_wind_fd(this, fd, &flags);
b7d4d7
-    else
b7d4d7
-        wind_fd = fd_ref(fd);
b7d4d7
-
b7d4d7
-    stub = fop_readv_stub(frame, default_readv_resume, wind_fd, size, offset,
b7d4d7
-                          flags, xdata);
b7d4d7
-    fd_unref(wind_fd);
b7d4d7
-
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, wind_fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0);
b7d4d7
+    OB_POST_FD(readv, this, frame, fd, trigger, fd, size, offset, flags, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov,
b7d4d7
           int count, off_t offset, uint32_t flags, struct iobref *iobref,
b7d4d7
           dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_writev_stub(frame, default_writev_resume, fd, iov, count, offset,
b7d4d7
-                           flags, iobref, xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(writev, frame, -1, ENOMEM, 0, 0, 0);
b7d4d7
+    OB_POST_FD(writev, this, frame, fd, true, fd, iov, count, offset, flags,
b7d4d7
+               iobref, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-    fd_t *wind_fd = NULL;
b7d4d7
-
b7d4d7
-    wind_fd = ob_get_wind_fd(this, fd, NULL);
b7d4d7
-
b7d4d7
-    stub = fop_fstat_stub(frame, default_fstat_resume, wind_fd, xdata);
b7d4d7
+    ob_conf_t *conf = this->private;
b7d4d7
+    bool trigger = !conf->use_anonymous_fd;
b7d4d7
 
b7d4d7
-    fd_unref(wind_fd);
b7d4d7
-
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, wind_fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, 0, 0);
b7d4d7
+    OB_POST_FD(fstat, this, frame, fd, trigger, fd, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
b7d4d7
         gf_seek_what_t what, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-    fd_t *wind_fd = NULL;
b7d4d7
-
b7d4d7
-    wind_fd = ob_get_wind_fd(this, fd, NULL);
b7d4d7
+    ob_conf_t *conf = this->private;
b7d4d7
+    bool trigger = !conf->use_anonymous_fd;
b7d4d7
 
b7d4d7
-    stub = fop_seek_stub(frame, default_seek_resume, wind_fd, offset, what,
b7d4d7
-                         xdata);
b7d4d7
-
b7d4d7
-    fd_unref(wind_fd);
b7d4d7
-
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, wind_fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, 0, 0);
b7d4d7
+    OB_POST_FD(seek, this, frame, fd, trigger, fd, offset, what, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-    ob_fd_t *ob_fd = NULL;
b7d4d7
-    gf_boolean_t unwind = _gf_false;
b7d4d7
-
b7d4d7
-    LOCK(&fd->lock);
b7d4d7
-    {
b7d4d7
-        ob_fd = __ob_fd_ctx_get(this, fd);
b7d4d7
-        if (ob_fd && ob_fd->open_frame)
b7d4d7
-            /* if open() was never wound to backend,
b7d4d7
-               no need to wind flush() either.
b7d4d7
-            */
b7d4d7
-            unwind = _gf_true;
b7d4d7
-    }
b7d4d7
-    UNLOCK(&fd->lock);
b7d4d7
-
b7d4d7
-    if (unwind)
b7d4d7
-        goto unwind;
b7d4d7
-
b7d4d7
-    stub = fop_flush_stub(frame, default_flush_resume, fd, xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, 0);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-
b7d4d7
-unwind:
b7d4d7
-    STACK_UNWIND_STRICT(flush, frame, 0, 0, 0);
b7d4d7
+    OB_POST_FLUSH(this, frame, fd, fd, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int flag, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_fsync_stub(frame, default_fsync_resume, fd, flag, xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, 0, 0, 0);
b7d4d7
+    OB_POST_FD(fsync, this, frame, fd, true, fd, flag, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd,
b7d4d7
       struct gf_flock *flock, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_lk_stub(frame, default_lk_resume, fd, cmd, flock, xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(lk, frame, -1, ENOMEM, 0, 0);
b7d4d7
+    OB_POST_FD(lk, this, frame, fd, true, fd, cmd, flock, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
b7d4d7
              dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_ftruncate_stub(frame, default_ftruncate_resume, fd, offset,
b7d4d7
-                              xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(ftruncate, frame, -1, ENOMEM, 0, 0, 0);
b7d4d7
+    OB_POST_FD(ftruncate, this, frame, fd, true, fd, offset, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
b7d4d7
              int flags, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_fsetxattr_stub(frame, default_fsetxattr_resume, fd, xattr, flags,
b7d4d7
-                              xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(fsetxattr, frame, -1, ENOMEM, 0);
b7d4d7
+    OB_POST_FD(fsetxattr, this, frame, fd, true, fd, xattr, flags, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
b7d4d7
              dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_fgetxattr_stub(frame, default_fgetxattr_resume, fd, name, xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(fgetxattr, frame, -1, ENOMEM, 0, 0);
b7d4d7
+    OB_POST_FD(fgetxattr, this, frame, fd, true, fd, name, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
b7d4d7
                 dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_fremovexattr_stub(frame, default_fremovexattr_resume, fd, name,
b7d4d7
-                                 xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(fremovexattr, frame, -1, ENOMEM, 0);
b7d4d7
+    OB_POST_FD(fremovexattr, this, frame, fd, true, fd, name, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
b7d4d7
             int cmd, struct gf_flock *flock, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = fop_finodelk_stub(frame, default_finodelk_resume,
b7d4d7
-                                          volume, fd, cmd, flock, xdata);
b7d4d7
-    if (stub)
b7d4d7
-        open_and_resume(this, fd, stub);
b7d4d7
-    else
b7d4d7
-        STACK_UNWIND_STRICT(finodelk, frame, -1, ENOMEM, 0);
b7d4d7
+    OB_POST_FD(finodelk, this, frame, fd, true, volume, fd, cmd, flock, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
b7d4d7
             const char *basename, entrylk_cmd cmd, entrylk_type type,
b7d4d7
             dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = fop_fentrylk_stub(
b7d4d7
-        frame, default_fentrylk_resume, volume, fd, basename, cmd, type, xdata);
b7d4d7
-    if (stub)
b7d4d7
-        open_and_resume(this, fd, stub);
b7d4d7
-    else
b7d4d7
-        STACK_UNWIND_STRICT(fentrylk, frame, -1, ENOMEM, 0);
b7d4d7
+    OB_POST_FD(fentrylk, this, frame, fd, true, volume, fd, basename, cmd, type,
b7d4d7
+               xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
b7d4d7
             gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = fop_fxattrop_stub(frame, default_fxattrop_resume, fd,
b7d4d7
-                                          optype, xattr, xdata);
b7d4d7
-    if (stub)
b7d4d7
-        open_and_resume(this, fd, stub);
b7d4d7
-    else
b7d4d7
-        STACK_UNWIND_STRICT(fxattrop, frame, -1, ENOMEM, 0, 0);
b7d4d7
+    OB_POST_FD(fxattrop, this, frame, fd, true, fd, optype, xattr, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *iatt,
b7d4d7
             int valid, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_fsetattr_stub(frame, default_fsetattr_resume, fd, iatt, valid,
b7d4d7
-                             xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(fsetattr, frame, -1, ENOMEM, 0, 0, 0);
b7d4d7
+    OB_POST_FD(fsetattr, this, frame, fd, true, fd, iatt, valid, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
b7d4d7
              off_t offset, size_t len, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub;
b7d4d7
-
b7d4d7
-    stub = fop_fallocate_stub(frame, default_fallocate_resume, fd, mode, offset,
b7d4d7
-                              len, xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
+    OB_POST_FD(fallocate, this, frame, fd, true, fd, mode, offset, len, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
b7d4d7
-    return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
b7d4d7
            size_t len, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub;
b7d4d7
-
b7d4d7
-    stub = fop_discard_stub(frame, default_discard_resume, fd, offset, len,
b7d4d7
-                            xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
+    OB_POST_FD(discard, this, frame, fd, true, fd, offset, len, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL);
b7d4d7
-    return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
b7d4d7
             off_t len, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub;
b7d4d7
-
b7d4d7
-    stub = fop_zerofill_stub(frame, default_zerofill_resume, fd, offset, len,
b7d4d7
-                             xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
+    OB_POST_FD(zerofill, this, frame, fd, true, fd, offset, len, xdata);
b7d4d7
 
b7d4d7
-    open_and_resume(this, fd, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
b7d4d7
           dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_unlink_stub(frame, default_unlink_resume, loc, xflags, xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_all_pending_fds_and_resume(this, loc->inode, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(unlink, frame, -1, ENOMEM, 0, 0, 0);
b7d4d7
+    OB_POST_INODE(unlink, this, frame, loc->inode, true, loc, xflags, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
+static int32_t
b7d4d7
 ob_rename(call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst,
b7d4d7
           dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_rename_stub(frame, default_rename_resume, src, dst, xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_all_pending_fds_and_resume(this, dst->inode, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(rename, frame, -1, ENOMEM, 0, 0, 0, 0, 0, 0);
b7d4d7
+    OB_POST_INODE(rename, this, frame, dst->inode, true, src, dst, xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int32_t
b7d4d7
+static int32_t
b7d4d7
 ob_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
b7d4d7
            int32_t valid, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-
b7d4d7
-    stub = fop_setattr_stub(frame, default_setattr_resume, loc, stbuf, valid,
b7d4d7
-                            xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
+    OB_POST_INODE(setattr, this, frame, loc->inode, true, loc, stbuf, valid,
b7d4d7
+                  xdata);
b7d4d7
 
b7d4d7
-    open_all_pending_fds_and_resume(this, loc->inode, stub);
b7d4d7
-
b7d4d7
-    return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(setattr, frame, -1, ENOMEM, NULL, NULL, NULL);
b7d4d7
     return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int32_t
b7d4d7
+static int32_t
b7d4d7
 ob_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
b7d4d7
             int32_t flags, dict_t *xdata)
b7d4d7
 {
b7d4d7
-    call_stub_t *stub = NULL;
b7d4d7
-    gf_boolean_t access_xattr = _gf_false;
b7d4d7
-
b7d4d7
     if (dict_get(dict, POSIX_ACL_DEFAULT_XATTR) ||
b7d4d7
         dict_get(dict, POSIX_ACL_ACCESS_XATTR) ||
b7d4d7
-        dict_get(dict, GF_SELINUX_XATTR_KEY))
b7d4d7
-        access_xattr = _gf_true;
b7d4d7
-
b7d4d7
-    if (!access_xattr)
b7d4d7
+        dict_get(dict, GF_SELINUX_XATTR_KEY)) {
b7d4d7
         return default_setxattr(frame, this, loc, dict, flags, xdata);
b7d4d7
+    }
b7d4d7
 
b7d4d7
-    stub = fop_setxattr_stub(frame, default_setxattr_resume, loc, dict, flags,
b7d4d7
-                             xdata);
b7d4d7
-    if (!stub)
b7d4d7
-        goto err;
b7d4d7
-
b7d4d7
-    open_all_pending_fds_and_resume(this, loc->inode, stub);
b7d4d7
+    OB_POST_INODE(setxattr, this, frame, loc->inode, true, loc, dict, flags,
b7d4d7
+                  xdata);
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
-err:
b7d4d7
-    STACK_UNWIND_STRICT(setxattr, frame, -1, ENOMEM, NULL);
b7d4d7
-    return 0;
b7d4d7
 }
b7d4d7
 
b7d4d7
-int
b7d4d7
-ob_release(xlator_t *this, fd_t *fd)
b7d4d7
+static void
b7d4d7
+ob_fdclose(xlator_t *this, fd_t *fd)
b7d4d7
 {
b7d4d7
-    ob_fd_t *ob_fd = NULL;
b7d4d7
+    struct list_head list;
b7d4d7
+    ob_inode_t *ob_inode;
b7d4d7
+    call_stub_t *stub;
b7d4d7
+
b7d4d7
+    INIT_LIST_HEAD(&list);
b7d4d7
+    stub = NULL;
b7d4d7
 
b7d4d7
-    ob_fd = ob_fd_ctx_get(this, fd);
b7d4d7
+    LOCK(&fd->inode->lock);
b7d4d7
+    {
b7d4d7
+        ob_inode = ob_inode_get_locked(this, fd->inode);
b7d4d7
+        if (ob_inode != NULL) {
b7d4d7
+            ob_inode->open_count--;
b7d4d7
+
b7d4d7
+            /* If this fd is the same as ob_inode->first_fd, it means that
b7d4d7
+             * the initial open has not fully completed. We'll try to cancel
b7d4d7
+             * it. */
b7d4d7
+            if (ob_inode->first_fd == fd) {
b7d4d7
+                if (ob_inode->first_open == OB_OPEN_PREPARING) {
b7d4d7
+                    /* In this case ob_open_dispatch() has not been called yet.
b7d4d7
+                     * We clear first_fd and first_open to allow that function
b7d4d7
+                     * to know that the open is not really needed. This also
b7d4d7
+                     * allows other requests to work as expected if they
b7d4d7
+                     * arrive before the dispatch function is called. If there
b7d4d7
+                     * are pending fops, we can directly process them here.
b7d4d7
+                     * (note that there shouldn't be any fd related fops, but
b7d4d7
+                     * if there are, it's fine if they fail). */
b7d4d7
+                    ob_inode->first_fd = NULL;
b7d4d7
+                    ob_inode->first_open = NULL;
b7d4d7
+                    ob_inode->triggered = false;
b7d4d7
+                    list_splice_init(&ob_inode->resume_fops, &list);
b7d4d7
+                } else if (!ob_inode->triggered) {
b7d4d7
+                    /* If the open has already been dispatched, we can only
b7d4d7
+                     * cancel it if it has not been triggered. Otherwise we
b7d4d7
+                     * simply wait until it completes. While it's not triggered,
b7d4d7
+                     * first_open must be a valid stub and there can't be any
b7d4d7
+                     * pending fops. */
b7d4d7
+                    GF_ASSERT((ob_inode->first_open != NULL) &&
b7d4d7
+                              list_empty(&ob_inode->resume_fops));
b7d4d7
+
b7d4d7
+                    ob_inode->first_fd = NULL;
b7d4d7
+                    stub = ob_inode->first_open;
b7d4d7
+                    ob_inode->first_open = NULL;
b7d4d7
+                }
b7d4d7
+            }
b7d4d7
+        }
b7d4d7
+    }
b7d4d7
+    UNLOCK(&fd->inode->lock);
b7d4d7
 
b7d4d7
-    ob_fd_free(ob_fd);
b7d4d7
+    if (stub != NULL) {
b7d4d7
+        call_stub_destroy(stub);
b7d4d7
+        fd_unref(fd);
b7d4d7
+    }
b7d4d7
 
b7d4d7
-    return 0;
b7d4d7
+    ob_resume_pending(&list);
b7d4d7
 }
b7d4d7
 
b7d4d7
 int
b7d4d7
 ob_forget(xlator_t *this, inode_t *inode)
b7d4d7
 {
b7d4d7
-    ob_inode_t *ob_inode = NULL;
b7d4d7
+    ob_inode_t *ob_inode;
b7d4d7
     uint64_t value = 0;
b7d4d7
 
b7d4d7
-    inode_ctx_del(inode, this, &value);
b7d4d7
-
b7d4d7
-    if (value) {
b7d4d7
+    if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0)) {
b7d4d7
         ob_inode = (ob_inode_t *)(uintptr_t)value;
b7d4d7
-        ob_inode_free(ob_inode);
b7d4d7
+        GF_FREE(ob_inode);
b7d4d7
     }
b7d4d7
 
b7d4d7
     return 0;
b7d4d7
@@ -1153,20 +823,18 @@ ob_priv_dump(xlator_t *this)
b7d4d7
 int
b7d4d7
 ob_fdctx_dump(xlator_t *this, fd_t *fd)
b7d4d7
 {
b7d4d7
-    ob_fd_t *ob_fd = NULL;
b7d4d7
     char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
b7d4d7
         0,
b7d4d7
     };
b7d4d7
-    int ret = 0;
b7d4d7
+    uint64_t value = 0;
b7d4d7
+    int ret = 0, error = 0;
b7d4d7
 
b7d4d7
     ret = TRY_LOCK(&fd->lock);
b7d4d7
     if (ret)
b7d4d7
         return 0;
b7d4d7
 
b7d4d7
-    ob_fd = __ob_fd_ctx_get(this, fd);
b7d4d7
-    if (!ob_fd) {
b7d4d7
-        UNLOCK(&fd->lock);
b7d4d7
-        return 0;
b7d4d7
+    if ((__fd_ctx_get(fd, this, &value) == 0) && (value != 0)) {
b7d4d7
+        error = (int32_t)value;
b7d4d7
     }
b7d4d7
 
b7d4d7
     gf_proc_dump_build_key(key_prefix, "xlator.performance.open-behind",
b7d4d7
@@ -1175,17 +843,7 @@ ob_fdctx_dump(xlator_t *this, fd_t *fd)
b7d4d7
 
b7d4d7
     gf_proc_dump_write("fd", "%p", fd);
b7d4d7
 
b7d4d7
-    gf_proc_dump_write("open_frame", "%p", ob_fd->open_frame);
b7d4d7
-
b7d4d7
-    if (ob_fd->open_frame)
b7d4d7
-        gf_proc_dump_write("open_frame.root.unique", "%" PRIu64,
b7d4d7
-                           ob_fd->open_frame->root->unique);
b7d4d7
-
b7d4d7
-    gf_proc_dump_write("loc.path", "%s", ob_fd->loc.path);
b7d4d7
-
b7d4d7
-    gf_proc_dump_write("loc.ino", "%s", uuid_utoa(ob_fd->loc.gfid));
b7d4d7
-
b7d4d7
-    gf_proc_dump_write("flags", "%d", ob_fd->flags);
b7d4d7
+    gf_proc_dump_write("error", "%d", error);
b7d4d7
 
b7d4d7
     UNLOCK(&fd->lock);
b7d4d7
 
b7d4d7
@@ -1307,7 +965,7 @@ struct xlator_fops fops = {
b7d4d7
 };
b7d4d7
 
b7d4d7
 struct xlator_cbks cbks = {
b7d4d7
-    .release = ob_release,
b7d4d7
+    .fdclose = ob_fdclose,
b7d4d7
     .forget = ob_forget,
b7d4d7
 };
b7d4d7
 
b7d4d7
-- 
b7d4d7
1.8.3.1
b7d4d7