14f8ab
From 546f412c155dd5aca2b3cd4202f80c9977b215dc Mon Sep 17 00:00:00 2001
14f8ab
From: Pranith Kumar K <pkarampu@redhat.com>
14f8ab
Date: Wed, 4 Sep 2019 12:06:34 +0530
14f8ab
Subject: [PATCH 287/297] cluster/ec: Fail fsync/flush for files on update
14f8ab
 size/version failure
14f8ab
14f8ab
Problem:
14f8ab
If update size/version is not successful on the file, updates on the
14f8ab
same stripe could lead to data corruptions if the earlier un-aligned
14f8ab
write is not successful on all the bricks. Application won't have
14f8ab
any knowledge of this because update size/version happens in the
14f8ab
background.
14f8ab
14f8ab
Fix:
14f8ab
Fail fsync/flush on fds that are opened before update-size-version
14f8ab
went bad.
14f8ab
14f8ab
Upstream-patch: https://review.gluster.org/c/glusterfs/+/23355
14f8ab
fixes: bz#1745107
14f8ab
Change-Id: I9d323eddcda703bd27d55f340c4079d76e06e492
14f8ab
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
14f8ab
Reviewed-on: https://code.engineering.redhat.com/gerrit/180672
14f8ab
Tested-by: RHGS Build Bot <nigelb@redhat.com>
14f8ab
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
14f8ab
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
14f8ab
---
14f8ab
 tests/basic/ec/ec-badfd.c            | 124 +++++++++++++++++++++++++++++++++++
14f8ab
 tests/basic/ec/ec-badfd.t            |  26 ++++++++
14f8ab
 xlators/cluster/ec/src/ec-common.c   |  23 +++++++
14f8ab
 xlators/cluster/ec/src/ec-generic.c  |  47 +++++++++++++
14f8ab
 xlators/cluster/ec/src/ec-helpers.c  |   7 ++
14f8ab
 xlators/cluster/ec/src/ec-messages.h |   2 +-
14f8ab
 xlators/cluster/ec/src/ec-types.h    |   2 +
14f8ab
 7 files changed, 230 insertions(+), 1 deletion(-)
14f8ab
 create mode 100644 tests/basic/ec/ec-badfd.c
14f8ab
 create mode 100755 tests/basic/ec/ec-badfd.t
14f8ab
14f8ab
diff --git a/tests/basic/ec/ec-badfd.c b/tests/basic/ec/ec-badfd.c
14f8ab
new file mode 100644
14f8ab
index 0000000..8be23c1
14f8ab
--- /dev/null
14f8ab
+++ b/tests/basic/ec/ec-badfd.c
14f8ab
@@ -0,0 +1,124 @@
14f8ab
+#include <stdio.h>
14f8ab
+#include <fcntl.h>
14f8ab
+#include <unistd.h>
14f8ab
+#include <time.h>
14f8ab
+#include <limits.h>
14f8ab
+#include <string.h>
14f8ab
+#include <stdlib.h>
14f8ab
+#include <errno.h>
14f8ab
+#include <glusterfs/api/glfs.h>
14f8ab
+#include <glusterfs/api/glfs-handles.h>
14f8ab
+
14f8ab
+int
14f8ab
+fill_iov(struct iovec *iov, char fillchar, int count)
14f8ab
+{
14f8ab
+    int ret = -1;
14f8ab
+
14f8ab
+    iov->iov_base = malloc(count + 1);
14f8ab
+    if (iov->iov_base == NULL) {
14f8ab
+        return ret;
14f8ab
+    } else {
14f8ab
+        iov->iov_len = count;
14f8ab
+        ret = 0;
14f8ab
+    }
14f8ab
+    memset(iov->iov_base, fillchar, count);
14f8ab
+    memset(iov->iov_base + count, '\0', 1);
14f8ab
+
14f8ab
+    return ret;
14f8ab
+}
14f8ab
+
14f8ab
+int
14f8ab
+write_sync(glfs_t *fs, glfs_fd_t *glfd, int char_count)
14f8ab
+{
14f8ab
+    ssize_t ret = -1;
14f8ab
+    int flags = O_RDWR;
14f8ab
+    struct iovec iov = {0};
14f8ab
+
14f8ab
+    ret = fill_iov(&iov, 'a', char_count);
14f8ab
+    if (ret) {
14f8ab
+        fprintf(stderr, "failed to create iov");
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
+    ret = glfs_pwritev(glfd, &iov, 1, 0, flags);
14f8ab
+out:
14f8ab
+    if (ret < 0) {
14f8ab
+        fprintf(stderr, "glfs_pwritev failed, %d", errno);
14f8ab
+    }
14f8ab
+    return ret;
14f8ab
+}
14f8ab
+
14f8ab
+int
14f8ab
+main(int argc, char *argv[])
14f8ab
+{
14f8ab
+    glfs_t *fs = NULL;
14f8ab
+    glfs_fd_t *fd = NULL;
14f8ab
+    int ret = 1;
14f8ab
+    char volume_cmd[4096] = {0};
14f8ab
+
14f8ab
+    if (argc != 4) {
14f8ab
+        fprintf(stderr, "Syntax: %s <host> <volname> <file>\n", argv[0]);
14f8ab
+        return 1;
14f8ab
+    }
14f8ab
+
14f8ab
+    fs = glfs_new(argv[2]);
14f8ab
+    if (!fs) {
14f8ab
+        fprintf(stderr, "glfs_new: returned NULL\n");
14f8ab
+        return 1;
14f8ab
+    }
14f8ab
+
14f8ab
+    ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
14f8ab
+    if (ret != 0) {
14f8ab
+        fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+    ret = glfs_set_logging(fs, "/tmp/ec-badfd.log", 7);
14f8ab
+    if (ret != 0) {
14f8ab
+        fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+    ret = glfs_init(fs);
14f8ab
+    if (ret != 0) {
14f8ab
+        fprintf(stderr, "glfs_init: returned %d\n", ret);
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
+    fd = glfs_open(fs, argv[3], O_RDWR);
14f8ab
+    if (fd == NULL) {
14f8ab
+        fprintf(stderr, "glfs_open: returned NULL\n");
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
+    ret = write_sync(fs, fd, 16);
14f8ab
+    if (ret < 0) {
14f8ab
+        fprintf(stderr, "write_sync failed\n");
14f8ab
+    }
14f8ab
+
14f8ab
+    snprintf(volume_cmd, sizeof(volume_cmd),
14f8ab
+             "gluster --mode=script volume stop %s", argv[2]);
14f8ab
+    /*Stop the volume so that update-size-version fails*/
14f8ab
+    system(volume_cmd);
14f8ab
+    sleep(8); /* 3 seconds more than eager-lock-timeout*/
14f8ab
+    snprintf(volume_cmd, sizeof(volume_cmd),
14f8ab
+             "gluster --mode=script volume start %s", argv[2]);
14f8ab
+    system(volume_cmd);
14f8ab
+    sleep(8); /*wait for bricks to come up*/
14f8ab
+    ret = glfs_fsync(fd, NULL, NULL);
14f8ab
+    if (ret == 0) {
14f8ab
+        fprintf(stderr, "fsync succeeded on a BADFD\n");
14f8ab
+        exit(1);
14f8ab
+    }
14f8ab
+
14f8ab
+    ret = glfs_close(fd);
14f8ab
+    if (ret == 0) {
14f8ab
+        fprintf(stderr, "flush succeeded on a BADFD\n");
14f8ab
+        exit(1);
14f8ab
+    }
14f8ab
+    ret = 0;
14f8ab
+
14f8ab
+out:
14f8ab
+    unlink("/tmp/ec-badfd.log");
14f8ab
+    glfs_fini(fs);
14f8ab
+
14f8ab
+    return ret;
14f8ab
+}
14f8ab
diff --git a/tests/basic/ec/ec-badfd.t b/tests/basic/ec/ec-badfd.t
14f8ab
new file mode 100755
14f8ab
index 0000000..56feb47
14f8ab
--- /dev/null
14f8ab
+++ b/tests/basic/ec/ec-badfd.t
14f8ab
@@ -0,0 +1,26 @@
14f8ab
+#!/bin/bash
14f8ab
+
14f8ab
+. $(dirname $0)/../../include.rc
14f8ab
+. $(dirname $0)/../../volume.rc
14f8ab
+
14f8ab
+cleanup;
14f8ab
+
14f8ab
+TEST glusterd
14f8ab
+TEST pidof glusterd
14f8ab
+
14f8ab
+TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{1..6}
14f8ab
+TEST $CLI volume set $V0 performance.write-behind off
14f8ab
+TEST $CLI volume set $V0 disperse.eager-lock-timeout 5
14f8ab
+
14f8ab
+TEST $CLI volume start $V0
14f8ab
+EXPECT 'Started' volinfo_field $V0 'Status'
14f8ab
+
14f8ab
+TEST $GFS -s $H0 --volfile-id $V0 $M0
14f8ab
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
14f8ab
+TEST touch $M0/file
14f8ab
+
14f8ab
+TEST build_tester $(dirname $0)/ec-badfd.c -lgfapi -Wall -O2
14f8ab
+TEST $(dirname $0)/ec-badfd $H0 $V0 /file
14f8ab
+cleanup_tester $(dirname ${0})/ec-badfd
14f8ab
+
14f8ab
+cleanup;
14f8ab
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
14f8ab
index 5fb4610..92d4e5d 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-common.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-common.c
14f8ab
@@ -2255,6 +2255,23 @@ ec_unlock_lock(ec_lock_link_t *link)
14f8ab
     }
14f8ab
 }
14f8ab
 
14f8ab
+void
14f8ab
+ec_inode_bad_inc(inode_t *inode, xlator_t *xl)
14f8ab
+{
14f8ab
+    ec_inode_t *ctx = NULL;
14f8ab
+
14f8ab
+    LOCK(&inode->lock);
14f8ab
+    {
14f8ab
+        ctx = __ec_inode_get(inode, xl);
14f8ab
+        if (ctx == NULL) {
14f8ab
+            goto unlock;
14f8ab
+        }
14f8ab
+        ctx->bad_version++;
14f8ab
+    }
14f8ab
+unlock:
14f8ab
+    UNLOCK(&inode->lock);
14f8ab
+}
14f8ab
+
14f8ab
 int32_t
14f8ab
 ec_update_size_version_done(call_frame_t *frame, void *cookie, xlator_t *this,
14f8ab
                             int32_t op_ret, int32_t op_errno, dict_t *xattr,
14f8ab
@@ -2270,6 +2287,12 @@ ec_update_size_version_done(call_frame_t *frame, void *cookie, xlator_t *this,
14f8ab
     ctx = lock->ctx;
14f8ab
 
14f8ab
     if (op_ret < 0) {
14f8ab
+        if (link->lock->fd == NULL) {
14f8ab
+            ec_inode_bad_inc(link->lock->loc.inode, this);
14f8ab
+        } else {
14f8ab
+            ec_inode_bad_inc(link->lock->fd->inode, this);
14f8ab
+        }
14f8ab
+
14f8ab
         gf_msg(fop->xl->name, fop_log_level(fop->id, op_errno), op_errno,
14f8ab
                EC_MSG_SIZE_VERS_UPDATE_FAIL,
14f8ab
                "Failed to update version and size. %s", ec_msg_str(fop));
14f8ab
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
14f8ab
index acc16b5..b019050 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-generic.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-generic.c
14f8ab
@@ -150,6 +150,37 @@ ec_manager_flush(ec_fop_data_t *fop, int32_t state)
14f8ab
     }
14f8ab
 }
14f8ab
 
14f8ab
+static int32_t
14f8ab
+ec_validate_fd(fd_t *fd, xlator_t *xl)
14f8ab
+{
14f8ab
+    uint64_t iversion = 0;
14f8ab
+    uint64_t fversion = 0;
14f8ab
+    ec_inode_t *inode_ctx = NULL;
14f8ab
+    ec_fd_t *fd_ctx = NULL;
14f8ab
+
14f8ab
+    LOCK(&fd->lock);
14f8ab
+    {
14f8ab
+        fd_ctx = __ec_fd_get(fd, xl);
14f8ab
+        if (fd_ctx) {
14f8ab
+            fversion = fd_ctx->bad_version;
14f8ab
+        }
14f8ab
+    }
14f8ab
+    UNLOCK(&fd->lock);
14f8ab
+
14f8ab
+    LOCK(&fd->inode->lock);
14f8ab
+    {
14f8ab
+        inode_ctx = __ec_inode_get(fd->inode, xl);
14f8ab
+        if (inode_ctx) {
14f8ab
+            iversion = inode_ctx->bad_version;
14f8ab
+        }
14f8ab
+    }
14f8ab
+    UNLOCK(&fd->inode->lock);
14f8ab
+    if (fversion < iversion) {
14f8ab
+        return EBADF;
14f8ab
+    }
14f8ab
+    return 0;
14f8ab
+}
14f8ab
+
14f8ab
 void
14f8ab
 ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target,
14f8ab
          uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd,
14f8ab
@@ -165,6 +196,14 @@ ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target,
14f8ab
     GF_VALIDATE_OR_GOTO(this->name, frame, out);
14f8ab
     GF_VALIDATE_OR_GOTO(this->name, this->private, out);
14f8ab
 
14f8ab
+    error = ec_validate_fd(fd, this);
14f8ab
+    if (error) {
14f8ab
+        gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD,
14f8ab
+               "Failing %s on %s", gf_fop_list[GF_FOP_FLUSH],
14f8ab
+               fd->inode ? uuid_utoa(fd->inode->gfid) : "");
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
     fop = ec_fop_data_allocate(frame, this, GF_FOP_FLUSH, 0, target, fop_flags,
14f8ab
                                ec_wind_flush, ec_manager_flush, callback, data);
14f8ab
     if (fop == NULL) {
14f8ab
@@ -381,6 +420,14 @@ ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target,
14f8ab
     GF_VALIDATE_OR_GOTO(this->name, frame, out);
14f8ab
     GF_VALIDATE_OR_GOTO(this->name, this->private, out);
14f8ab
 
14f8ab
+    error = ec_validate_fd(fd, this);
14f8ab
+    if (error) {
14f8ab
+        gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD,
14f8ab
+               "Failing %s on %s", gf_fop_list[GF_FOP_FSYNC],
14f8ab
+               fd->inode ? uuid_utoa(fd->inode->gfid) : "");
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
     fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNC, 0, target, fop_flags,
14f8ab
                                ec_wind_fsync, ec_manager_fsync, callback, data);
14f8ab
     if (fop == NULL) {
14f8ab
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
14f8ab
index 43f6e3b..baac001 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-helpers.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-helpers.c
14f8ab
@@ -753,6 +753,7 @@ __ec_fd_get(fd_t *fd, xlator_t *xl)
14f8ab
 {
14f8ab
     int i = 0;
14f8ab
     ec_fd_t *ctx = NULL;
14f8ab
+    ec_inode_t *ictx = NULL;
14f8ab
     uint64_t value = 0;
14f8ab
     ec_t *ec = xl->private;
14f8ab
 
14f8ab
@@ -775,6 +776,12 @@ __ec_fd_get(fd_t *fd, xlator_t *xl)
14f8ab
                 GF_FREE(ctx);
14f8ab
                 return NULL;
14f8ab
             }
14f8ab
+            /* Only refering bad-version so no need for lock
14f8ab
+             * */
14f8ab
+            ictx = __ec_inode_get(fd->inode, xl);
14f8ab
+            if (ictx) {
14f8ab
+                ctx->bad_version = ictx->bad_version;
14f8ab
+            }
14f8ab
         }
14f8ab
     } else {
14f8ab
         ctx = (ec_fd_t *)(uintptr_t)value;
14f8ab
diff --git a/xlators/cluster/ec/src/ec-messages.h b/xlators/cluster/ec/src/ec-messages.h
14f8ab
index 7c28808..be86b37 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-messages.h
14f8ab
+++ b/xlators/cluster/ec/src/ec-messages.h
14f8ab
@@ -55,6 +55,6 @@ GLFS_MSGID(EC, EC_MSG_INVALID_CONFIG, EC_MSG_HEAL_FAIL,
14f8ab
            EC_MSG_CONFIG_XATTR_INVALID, EC_MSG_EXTENSION, EC_MSG_EXTENSION_NONE,
14f8ab
            EC_MSG_EXTENSION_UNKNOWN, EC_MSG_EXTENSION_UNSUPPORTED,
14f8ab
            EC_MSG_EXTENSION_FAILED, EC_MSG_NO_GF, EC_MSG_MATRIX_FAILED,
14f8ab
-           EC_MSG_DYN_CREATE_FAILED, EC_MSG_DYN_CODEGEN_FAILED);
14f8ab
+           EC_MSG_DYN_CREATE_FAILED, EC_MSG_DYN_CODEGEN_FAILED, EC_MSG_FD_BAD);
14f8ab
 
14f8ab
 #endif /* !_EC_MESSAGES_H_ */
14f8ab
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
14f8ab
index 1c295c0..f27f2ec 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-types.h
14f8ab
+++ b/xlators/cluster/ec/src/ec-types.h
14f8ab
@@ -150,6 +150,7 @@ struct _ec_fd {
14f8ab
     loc_t loc;
14f8ab
     uintptr_t open;
14f8ab
     int32_t flags;
14f8ab
+    uint64_t bad_version;
14f8ab
     ec_fd_status_t fd_status[0];
14f8ab
 };
14f8ab
 
14f8ab
@@ -180,6 +181,7 @@ struct _ec_inode {
14f8ab
     uint64_t dirty[2];
14f8ab
     struct list_head heal;
14f8ab
     ec_stripe_list_t stripe_cache;
14f8ab
+    uint64_t bad_version;
14f8ab
 };
14f8ab
 
14f8ab
 typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
14f8ab
-- 
14f8ab
1.8.3.1
14f8ab