1df6c8
From 399fad1ac0f9273483270e8af06a5b2d28927533 Mon Sep 17 00:00:00 2001
1df6c8
From: Pranith Kumar K <pkarampu@redhat.com>
1df6c8
Date: Fri, 29 May 2020 14:24:53 +0530
1df6c8
Subject: [PATCH 387/392] cluster/afr: Delay post-op for fsync
1df6c8
1df6c8
Problem:
1df6c8
AFR doesn't delay post-op for the fsync fop. For fsync-heavy workloads
1df6c8
this leads to unnecessary fxattrop/finodelk for every fsync, leading
1df6c8
to bad performance.
1df6c8
1df6c8
Fix:
1df6c8
Have delayed post-op for fsync. Add a special flag in xdata to indicate
1df6c8
that afr shouldn't delay post-op in cases where either the
1df6c8
process will terminate or graph-switch would happen. Otherwise it leads
1df6c8
to unnecessary heals when the graph-switch/process-termination
1df6c8
happens before delayed-post-op completes.
1df6c8
1df6c8
> Upstream-patch: https://review.gluster.org/c/glusterfs/+/24473
1df6c8
> Fixes: #1253
1df6c8
1df6c8
BUG: 1848896
1df6c8
Change-Id: I531940d13269a111c49e0510d49514dc169f4577
1df6c8
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
1df6c8
(cherry picked from commit 3ed98fc9dcb39223032e343fd5b0ad17fa3cae14)
1df6c8
Reviewed-on: https://code.engineering.redhat.com/gerrit/203694
1df6c8
Tested-by: RHGS Build Bot <nigelb@redhat.com>
1df6c8
Tested-by: Karthik Subrahmanya <ksubrahm@redhat.com>
1df6c8
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
1df6c8
---
1df6c8
 api/src/glfs-resolve.c                         |  14 ++-
1df6c8
 tests/basic/afr/durability-off.t               |   2 +
1df6c8
 tests/basic/gfapi/gfapi-graph-switch-open-fd.t |  44 +++++++++
1df6c8
 tests/basic/gfapi/gfapi-keep-writing.c         | 129 +++++++++++++++++++++++++
1df6c8
 xlators/cluster/afr/src/afr-inode-write.c      |  11 ++-
1df6c8
 xlators/cluster/afr/src/afr-transaction.c      |   9 +-
1df6c8
 xlators/cluster/afr/src/afr.h                  |   2 +-
1df6c8
 xlators/cluster/dht/src/dht-rebalance.c        |  15 ++-
1df6c8
 xlators/mount/fuse/src/fuse-bridge.c           |  23 ++++-
1df6c8
 9 files changed, 239 insertions(+), 10 deletions(-)
1df6c8
 create mode 100644 tests/basic/gfapi/gfapi-graph-switch-open-fd.t
1df6c8
 create mode 100644 tests/basic/gfapi/gfapi-keep-writing.c
1df6c8
1df6c8
diff --git a/api/src/glfs-resolve.c b/api/src/glfs-resolve.c
1df6c8
index a79f490..062b7dc 100644
1df6c8
--- a/api/src/glfs-resolve.c
1df6c8
+++ b/api/src/glfs-resolve.c
1df6c8
@@ -722,6 +722,7 @@ glfs_migrate_fd_safe(struct glfs *fs, xlator_t *newsubvol, fd_t *oldfd)
1df6c8
         0,
1df6c8
     };
1df6c8
     char uuid1[64];
1df6c8
+    dict_t *xdata = NULL;
1df6c8
 
1df6c8
     oldinode = oldfd->inode;
1df6c8
     oldsubvol = oldinode->table->xl;
1df6c8
@@ -730,7 +731,15 @@ glfs_migrate_fd_safe(struct glfs *fs, xlator_t *newsubvol, fd_t *oldfd)
1df6c8
         return fd_ref(oldfd);
1df6c8
 
1df6c8
     if (!oldsubvol->switched) {
1df6c8
-        ret = syncop_fsync(oldsubvol, oldfd, 0, NULL, NULL, NULL, NULL);
1df6c8
+        xdata = dict_new();
1df6c8
+        if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
1df6c8
+            gf_msg(fs->volname, GF_LOG_WARNING, ENOMEM, API_MSG_FSYNC_FAILED,
1df6c8
+                   "last-fsync set failed on %s graph %s (%d)",
1df6c8
+                   uuid_utoa_r(oldfd->inode->gfid, uuid1),
1df6c8
+                   graphid_str(oldsubvol), oldsubvol->graph->id);
1df6c8
+        }
1df6c8
+
1df6c8
+        ret = syncop_fsync(oldsubvol, oldfd, 0, NULL, NULL, xdata, NULL);
1df6c8
         DECODE_SYNCOP_ERR(ret);
1df6c8
         if (ret) {
1df6c8
             gf_msg(fs->volname, GF_LOG_WARNING, errno, API_MSG_FSYNC_FAILED,
1df6c8
@@ -809,6 +818,9 @@ out:
1df6c8
         newfd = NULL;
1df6c8
     }
1df6c8
 
1df6c8
+    if (xdata)
1df6c8
+        dict_unref(xdata);
1df6c8
+
1df6c8
     return newfd;
1df6c8
 }
1df6c8
 
1df6c8
diff --git a/tests/basic/afr/durability-off.t b/tests/basic/afr/durability-off.t
1df6c8
index 155ffa0..6e0f18b 100644
1df6c8
--- a/tests/basic/afr/durability-off.t
1df6c8
+++ b/tests/basic/afr/durability-off.t
1df6c8
@@ -26,6 +26,8 @@ TEST $CLI volume heal $V0
1df6c8
 EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
1df6c8
 EXPECT "^0$" echo $($CLI volume profile $V0 info | grep -w FSYNC | wc -l)
1df6c8
 
1df6c8
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
1df6c8
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
1df6c8
 #Test that fsyncs happen when durability is on
1df6c8
 TEST $CLI volume set $V0 cluster.ensure-durability on
1df6c8
 TEST $CLI volume set $V0 performance.strict-write-ordering on
1df6c8
diff --git a/tests/basic/gfapi/gfapi-graph-switch-open-fd.t b/tests/basic/gfapi/gfapi-graph-switch-open-fd.t
1df6c8
new file mode 100644
1df6c8
index 0000000..2e666be
1df6c8
--- /dev/null
1df6c8
+++ b/tests/basic/gfapi/gfapi-graph-switch-open-fd.t
1df6c8
@@ -0,0 +1,44 @@
1df6c8
+#!/bin/bash
1df6c8
+
1df6c8
+. $(dirname $0)/../../include.rc
1df6c8
+. $(dirname $0)/../../volume.rc
1df6c8
+
1df6c8
+cleanup;
1df6c8
+
1df6c8
+TEST glusterd
1df6c8
+
1df6c8
+TEST $CLI volume create $V0 replica 3 ${H0}:$B0/brick{0..2};
1df6c8
+EXPECT 'Created' volinfo_field $V0 'Status';
1df6c8
+
1df6c8
+TEST $CLI volume start $V0;
1df6c8
+EXPECT 'Started' volinfo_field $V0 'Status';
1df6c8
+
1df6c8
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
1df6c8
+TEST touch $M0/sync
1df6c8
+logdir=`gluster --print-logdir`
1df6c8
+
1df6c8
+TEST build_tester $(dirname $0)/gfapi-keep-writing.c -lgfapi
1df6c8
+
1df6c8
+
1df6c8
+#Launch a program that keeps writing on an fd while $M0/sync exists
1df6c8
+./$(dirname $0)/gfapi-keep-writing ${H0} $V0 $logdir/gfapi-async-calls-test.log sync &
1df6c8
+p=$!
1df6c8
+sleep 1 #Let some writes go through
1df6c8
+#Check if graph switch will lead to any pending markers forever
1df6c8
+TEST $CLI volume set $V0 performance.quick-read off
1df6c8
+TEST $CLI volume set $V0 performance.io-cache off
1df6c8
+TEST $CLI volume set $V0 performance.stat-prefetch off
1df6c8
+TEST $CLI volume set $V0 performance.read-ahead off
1df6c8
+
1df6c8
+
1df6c8
+TEST rm -f $M0/sync #Make sure the glfd is closed
1df6c8
+TEST wait #Wait for background process to die
1df6c8
+#Goal is to check if there is permanent FOOL changelog
1df6c8
+sleep 5
1df6c8
+EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/brick0/glfs_test.txt trusted.afr.dirty
1df6c8
+EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/brick1/glfs_test.txt trusted.afr.dirty
1df6c8
+EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/brick2/glfs_test.txt trusted.afr.dirty
1df6c8
+
1df6c8
+cleanup_tester $(dirname $0)/gfapi-keep-writing #Remove the tester built above
1df6c8
+
1df6c8
+cleanup;
1df6c8
diff --git a/tests/basic/gfapi/gfapi-keep-writing.c b/tests/basic/gfapi/gfapi-keep-writing.c
1df6c8
new file mode 100644
1df6c8
index 0000000..91b59ce
1df6c8
--- /dev/null
1df6c8
+++ b/tests/basic/gfapi/gfapi-keep-writing.c
1df6c8
@@ -0,0 +1,129 @@
1df6c8
+#include <fcntl.h>
1df6c8
+#include <unistd.h>
1df6c8
+#include <time.h>
1df6c8
+#include <limits.h>
1df6c8
+#include <string.h>
1df6c8
+#include <stdio.h>
1df6c8
+#include <stdlib.h>
1df6c8
+#include <errno.h>
1df6c8
+#include <glusterfs/api/glfs.h>
1df6c8
+#include <glusterfs/api/glfs-handles.h>
1df6c8
+
1df6c8
+#define LOG_ERR(msg)                                                           \
1df6c8
+    do {                                                                       \
1df6c8
+        fprintf(stderr, "%s : Error (%s)\n", msg, strerror(errno));            \
1df6c8
+    } while (0)
1df6c8
+
1df6c8
+glfs_t *
1df6c8
+init_glfs(const char *hostname, const char *volname, const char *logfile)
1df6c8
+{ /* Create and initialise a glfs_t for volname; returns NULL on failure. */
1df6c8
+    int ret = -1;
1df6c8
+    glfs_t *fs = NULL;
1df6c8
+
1df6c8
+    fs = glfs_new(volname);
1df6c8
+    if (!fs) {
1df6c8
+        LOG_ERR("glfs_new failed");
1df6c8
+        return NULL; /* nothing was created, so nothing to tear down */
1df6c8
+    }
1df6c8
+
1df6c8
+    ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007);
1df6c8
+    if (ret < 0) {
1df6c8
+        LOG_ERR("glfs_set_volfile_server failed");
1df6c8
+        goto out;
1df6c8
+    }
1df6c8
+
1df6c8
+    ret = glfs_set_logging(fs, logfile, 7); /* 7: presumably the log level */
1df6c8
+    if (ret < 0) {
1df6c8
+        LOG_ERR("glfs_set_logging failed");
1df6c8
+        goto out;
1df6c8
+    }
1df6c8
+
1df6c8
+    ret = glfs_init(fs);
1df6c8
+    if (ret < 0) {
1df6c8
+        LOG_ERR("glfs_init failed");
1df6c8
+        goto out;
1df6c8
+    }
1df6c8
+
1df6c8
+    ret = 0;
1df6c8
+out: /* reached with ret == 0 on success, non-zero on any setup failure */
1df6c8
+    if (ret) {
1df6c8
+        glfs_fini(fs); /* release the partially set-up instance */
1df6c8
+        fs = NULL;
1df6c8
+    }
1df6c8
+
1df6c8
+    return fs;
1df6c8
+}
1df6c8
+
1df6c8
+int
1df6c8
+glfs_test_function(const char *hostname, const char *volname,
1df6c8
+                   const char *logfile, const char *syncfile)
1df6c8
+{ /* Write to glfs_test.txt while syncfile exists; returns 0 on success. */
1df6c8
+    int ret = -1;
1df6c8
+    int flags = O_CREAT | O_RDWR;
1df6c8
+    glfs_t *fs = NULL;
1df6c8
+    glfs_fd_t *glfd = NULL;
1df6c8
+    const char *buff = "This is from my prog\n";
1df6c8
+    const char *filename = "glfs_test.txt";
1df6c8
+    struct stat buf = {0};
1df6c8
+
1df6c8
+    fs = init_glfs(hostname, volname, logfile);
1df6c8
+    if (fs == NULL) {
1df6c8
+        LOG_ERR("init_glfs failed");
1df6c8
+        return -1;
1df6c8
+    }
1df6c8
+
1df6c8
+    glfd = glfs_creat(fs, filename, flags, 0644);
1df6c8
+    if (glfd == NULL) {
1df6c8
+        LOG_ERR("glfs_creat failed");
1df6c8
+        goto out; /* ret is still -1 here */
1df6c8
+    }
1df6c8
+
1df6c8
+    while (glfs_stat(fs, syncfile, &buf) == 0) { /* until stat fails */
1df6c8
+        ret = glfs_write(glfd, buff, strlen(buff), flags);
1df6c8
+        if (ret < 0) {
1df6c8
+            LOG_ERR("glfs_write failed");
1df6c8
+            goto out;
1df6c8
+        }
1df6c8
+    }
1df6c8
+
1df6c8
+    ret = glfs_close(glfd);
1df6c8
+    if (ret < 0) {
1df6c8
+        LOG_ERR("glfs_close failed");
1df6c8
+        goto out;
1df6c8
+    }
1df6c8
+
1df6c8
+out:
1df6c8
+    if (glfs_fini(fs)) { /* keep any earlier failure already held in ret */
1df6c8
+        LOG_ERR("glfs_fini failed");
1df6c8
+        ret = -1;
1df6c8
+    }
1df6c8
+
1df6c8
+    return ret;
1df6c8
+}
1df6c8
+
1df6c8
+int
1df6c8
+main(int argc, char *argv[])
1df6c8
+{ /* Usage: <prog> <hostname> <volname> <logfile> <syncfile> */
1df6c8
+    int ret = 0;
1df6c8
+    char *hostname = NULL;
1df6c8
+    char *volname = NULL;
1df6c8
+    char *logfile = NULL;
1df6c8
+    char *syncfile = NULL;
1df6c8
+
1df6c8
+    if (argc != 5) { /* exactly four arguments are required */
1df6c8
+        fprintf(stderr, "Invalid argument\n");
1df6c8
+        exit(1);
1df6c8
+    }
1df6c8
+
1df6c8
+    hostname = argv[1];
1df6c8
+    volname = argv[2];
1df6c8
+    logfile = argv[3];
1df6c8
+    syncfile = argv[4];
1df6c8
+
1df6c8
+    ret = glfs_test_function(hostname, volname, logfile, syncfile);
1df6c8
+    if (ret) {
1df6c8
+        LOG_ERR("glfs_test_function failed");
1df6c8
+    }
1df6c8
+
1df6c8
+    return ret; /* the test function's result is returned from main */
1df6c8
+}
1df6c8
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
1df6c8
index 7fcc9d4..df82b6e 100644
1df6c8
--- a/xlators/cluster/afr/src/afr-inode-write.c
1df6c8
+++ b/xlators/cluster/afr/src/afr-inode-write.c
1df6c8
@@ -2492,6 +2492,7 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
1df6c8
     call_frame_t *transaction_frame = NULL;
1df6c8
     int ret = -1;
1df6c8
     int32_t op_errno = ENOMEM;
1df6c8
+    int8_t last_fsync = 0;
1df6c8
 
1df6c8
     transaction_frame = copy_frame(frame);
1df6c8
     if (!transaction_frame)
1df6c8
@@ -2501,10 +2502,16 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
1df6c8
     if (!local)
1df6c8
         goto out;
1df6c8
 
1df6c8
-    if (xdata)
1df6c8
+    if (xdata) {
1df6c8
         local->xdata_req = dict_copy_with_ref(xdata, NULL);
1df6c8
-    else
1df6c8
+        if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) {
1df6c8
+            if (last_fsync) {
1df6c8
+                local->transaction.disable_delayed_post_op = _gf_true;
1df6c8
+            }
1df6c8
+        }
1df6c8
+    } else {
1df6c8
         local->xdata_req = dict_new();
1df6c8
+    }
1df6c8
 
1df6c8
     if (!local->xdata_req)
1df6c8
         goto out;
1df6c8
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
1df6c8
index 8e65ae2..ffd0ab8 100644
1df6c8
--- a/xlators/cluster/afr/src/afr-transaction.c
1df6c8
+++ b/xlators/cluster/afr/src/afr-transaction.c
1df6c8
@@ -2385,8 +2385,13 @@ afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this,
1df6c8
         goto out;
1df6c8
     }
1df6c8
 
1df6c8
-    if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP)) {
1df6c8
-        /*Only allow writes but shard does [f]xattrops on writes, so
1df6c8
+    if (local->transaction.disable_delayed_post_op) {
1df6c8
+        goto out;
1df6c8
+    }
1df6c8
+
1df6c8
+    if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) &&
1df6c8
+        (local->op != GF_FOP_FSYNC)) {
1df6c8
+        /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so
1df6c8
          * they are fine too*/
1df6c8
         goto out;
1df6c8
     }
1df6c8
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
1df6c8
index e731cfa..6bc4721 100644
1df6c8
--- a/xlators/cluster/afr/src/afr.h
1df6c8
+++ b/xlators/cluster/afr/src/afr.h
1df6c8
@@ -854,7 +854,7 @@ typedef struct _afr_local {
1df6c8
 
1df6c8
         int (*unwind)(call_frame_t *frame, xlator_t *this);
1df6c8
 
1df6c8
-        /* post-op hook */
1df6c8
+        gf_boolean_t disable_delayed_post_op;
1df6c8
     } transaction;
1df6c8
 
1df6c8
     syncbarrier_t barrier;
1df6c8
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
1df6c8
index 8f31dca..145e616 100644
1df6c8
--- a/xlators/cluster/dht/src/dht-rebalance.c
1df6c8
+++ b/xlators/cluster/dht/src/dht-rebalance.c
1df6c8
@@ -1564,6 +1564,7 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
1df6c8
     xlator_t *old_target = NULL;
1df6c8
     xlator_t *hashed_subvol = NULL;
1df6c8
     fd_t *linkto_fd = NULL;
1df6c8
+    dict_t *xdata = NULL;
1df6c8
 
1df6c8
     if (from == to) {
1df6c8
         gf_msg_debug(this->name, 0,
1df6c8
@@ -1882,7 +1883,15 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
1df6c8
 
1df6c8
     /* TODO: Sync the locks */
1df6c8
 
1df6c8
-    ret = syncop_fsync(to, dst_fd, 0, NULL, NULL, NULL, NULL);
1df6c8
+    xdata = dict_new();
1df6c8
+    if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
1df6c8
+        gf_log(this->name, GF_LOG_ERROR,
1df6c8
+               "%s: failed to set last-fsync flag on "
1df6c8
+               "%s (%s)",
1df6c8
+               loc->path, to->name, strerror(ENOMEM));
1df6c8
+    }
1df6c8
+
1df6c8
+    ret = syncop_fsync(to, dst_fd, 0, NULL, NULL, xdata, NULL);
1df6c8
     if (ret) {
1df6c8
         gf_log(this->name, GF_LOG_WARNING, "%s: failed to fsync on %s (%s)",
1df6c8
                loc->path, to->name, strerror(-ret));
1df6c8
@@ -2356,11 +2365,15 @@ out:
1df6c8
 
1df6c8
     if (dst_fd)
1df6c8
         syncop_close(dst_fd);
1df6c8
+
1df6c8
     if (src_fd)
1df6c8
         syncop_close(src_fd);
1df6c8
     if (linkto_fd)
1df6c8
         syncop_close(linkto_fd);
1df6c8
 
1df6c8
+    if (xdata)
1df6c8
+        dict_unref(xdata);
1df6c8
+
1df6c8
     loc_wipe(&tmp_loc);
1df6c8
     loc_wipe(&parent_loc);
1df6c8
 
1df6c8
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
1df6c8
index 6e99053..1592067 100644
1df6c8
--- a/xlators/mount/fuse/src/fuse-bridge.c
1df6c8
+++ b/xlators/mount/fuse/src/fuse-bridge.c
1df6c8
@@ -5551,6 +5551,7 @@ fuse_migrate_fd(xlator_t *this, fd_t *basefd, xlator_t *old_subvol,
1df6c8
     char create_in_progress = 0;
1df6c8
     fuse_fd_ctx_t *basefd_ctx = NULL;
1df6c8
     fd_t *oldfd = NULL;
1df6c8
+    dict_t *xdata = NULL;
1df6c8
 
1df6c8
     basefd_ctx = fuse_fd_ctx_get(this, basefd);
1df6c8
     GF_VALIDATE_OR_GOTO("glusterfs-fuse", basefd_ctx, out);
1df6c8
@@ -5587,10 +5588,23 @@ fuse_migrate_fd(xlator_t *this, fd_t *basefd, xlator_t *old_subvol,
1df6c8
     }
1df6c8
 
1df6c8
     if (oldfd->inode->table->xl == old_subvol) {
1df6c8
-        if (IA_ISDIR(oldfd->inode->ia_type))
1df6c8
+        if (IA_ISDIR(oldfd->inode->ia_type)) {
1df6c8
             ret = syncop_fsyncdir(old_subvol, oldfd, 0, NULL, NULL);
1df6c8
-        else
1df6c8
-            ret = syncop_fsync(old_subvol, oldfd, 0, NULL, NULL, NULL, NULL);
1df6c8
+        } else {
1df6c8
+            xdata = dict_new();
1df6c8
+            if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
1df6c8
+                gf_log("glusterfs-fuse", GF_LOG_WARNING,
1df6c8
+                       "last-fsync set failed (%s) on fd (%p)"
1df6c8
+                       "(basefd:%p basefd-inode.gfid:%s) "
1df6c8
+                       "(old-subvolume:%s-%d new-subvolume:%s-%d)",
1df6c8
+                       strerror(ENOMEM), oldfd, basefd,
1df6c8
+                       uuid_utoa(basefd->inode->gfid), old_subvol->name,
1df6c8
+                       old_subvol->graph->id, new_subvol->name,
1df6c8
+                       new_subvol->graph->id);
1df6c8
+            }
1df6c8
+
1df6c8
+            ret = syncop_fsync(old_subvol, oldfd, 0, NULL, NULL, xdata, NULL);
1df6c8
+        }
1df6c8
 
1df6c8
         if (ret < 0) {
1df6c8
             gf_log("glusterfs-fuse", GF_LOG_WARNING,
1df6c8
@@ -5645,6 +5659,9 @@ out:
1df6c8
 
1df6c8
     fd_unref(oldfd);
1df6c8
 
1df6c8
+    if (xdata)
1df6c8
+        dict_unref(xdata);
1df6c8
+
1df6c8
     return ret;
1df6c8
 }
1df6c8
 
1df6c8
-- 
1df6c8
1.8.3.1
1df6c8