21ab4e
From 1ceab6768cc5dfc50b0c00e07655d907cf489f09 Mon Sep 17 00:00:00 2001
21ab4e
From: Pranith Kumar K <pkarampu@redhat.com>
21ab4e
Date: Wed, 24 May 2017 22:30:29 +0530
21ab4e
Subject: [PATCH 480/486] features/shard: Handle offset in appending writes
21ab4e
21ab4e
When a file is opened with append, all writes are appended at the end of file
21ab4e
irrespective of the offset given in the write syscall. This needs to be
21ab4e
considered in shard size update function and also for choosing which shard to
21ab4e
write to.
21ab4e
21ab4e
At the moment shard piggybacks on queuing from write-behind
21ab4e
xlator for ordering of the operations. So if write-behind is disabled and
21ab4e
two parallel appending-writes come both of which can increase the file size
21ab4e
beyond shard-size the file will be corrupted.
21ab4e
21ab4e
 >BUG: 1455301
21ab4e
 >Change-Id: I9007e6a39098ab0b5d5386367bd07eb5f89cb09e
21ab4e
 >Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
21ab4e
 >Reviewed-on: https://review.gluster.org/17387
21ab4e
 >Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
 >Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
21ab4e
 >NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
21ab4e
 >CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
21ab4e
BUG: 1454313
21ab4e
Change-Id: I9007e6a39098ab0b5d5386367bd07eb5f89cb09e
21ab4e
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/107640
21ab4e
---
21ab4e
 extras/group-gluster-block           |   1 -
21ab4e
 tests/bugs/shard/shard-append-test.c | 179 +++++++++++++++++++++++++++++++++++
21ab4e
 tests/bugs/shard/shard-append-test.t |  32 +++++++
21ab4e
 xlators/features/shard/src/shard.c   | 109 +++++++++++++--------
21ab4e
 4 files changed, 278 insertions(+), 43 deletions(-)
21ab4e
 create mode 100644 tests/bugs/shard/shard-append-test.c
21ab4e
 create mode 100644 tests/bugs/shard/shard-append-test.t
21ab4e
21ab4e
diff --git a/extras/group-gluster-block b/extras/group-gluster-block
21ab4e
index 0753d26..a4a6367 100644
21ab4e
--- a/extras/group-gluster-block
21ab4e
+++ b/extras/group-gluster-block
21ab4e
@@ -2,7 +2,6 @@ performance.quick-read=off
21ab4e
 performance.read-ahead=off
21ab4e
 performance.io-cache=off
21ab4e
 performance.stat-prefetch=off
21ab4e
-performance.write-behind=off
21ab4e
 performance.open-behind=off
21ab4e
 performance.readdir-ahead=off
21ab4e
 network.remote-dio=enable
21ab4e
diff --git a/tests/bugs/shard/shard-append-test.c b/tests/bugs/shard/shard-append-test.c
21ab4e
new file mode 100644
21ab4e
index 0000000..92dff3d
21ab4e
--- /dev/null
21ab4e
+++ b/tests/bugs/shard/shard-append-test.c
21ab4e
@@ -0,0 +1,179 @@
21ab4e
+#include <fcntl.h>
21ab4e
+#include <unistd.h>
21ab4e
+#include <time.h>
21ab4e
+#include <limits.h>
21ab4e
+#include <string.h>
21ab4e
+#include <pthread.h>
21ab4e
+#include <stdio.h>
21ab4e
+#include <stdlib.h>
21ab4e
+#include <errno.h>
21ab4e
+#include <glusterfs/api/glfs.h>
21ab4e
+#include <glusterfs/api/glfs-handles.h>
21ab4e
+
21ab4e
+#define LOG_ERR(msg) do { \
21ab4e
+        fprintf (stderr, "%s : Error (%s)\n", msg, strerror (errno)); \
21ab4e
+        } while (0)
21ab4e
+
21ab4e
+/*This test tests that shard xlator handles offset in appending writes
21ab4e
+ * correctly. This test performs writes of 1025 bytes 1025 times, in 5 threads
21ab4e
+ * with different threads. The buffer to be written is same character repeated
21ab4e
+ * 1025 times in the buffer for a thread. At the end it reads the buffer till
21ab4e
+ * end of file and tests that the read of 1025 bytes is always same character
21ab4e
+ * and the content read is 5*1025*1025 size. 1025 bytes is chosen because it
21ab4e
+ * will lead to write on more than one shard at some point when the size is
21ab4e
+ * going over the initial shard*/
21ab4e
+pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
21ab4e
+int thread_data = '1';
21ab4e
+
21ab4e
+glfs_t *
21ab4e
+init_glfs (const char *hostname, const char *volname,
21ab4e
+           const char *logfile)
21ab4e
+{
21ab4e
+        int     ret     = -1;
21ab4e
+        glfs_t *fs      = NULL;
21ab4e
+
21ab4e
+        fs = glfs_new (volname);
21ab4e
+        if (!fs) {
21ab4e
+                LOG_ERR ("glfs_new failed");
21ab4e
+                return NULL;
21ab4e
+        }
21ab4e
+
21ab4e
+        ret = glfs_set_volfile_server (fs, "tcp", hostname, 24007);
21ab4e
+        if (ret < 0) {
21ab4e
+                LOG_ERR ("glfs_set_volfile_server failed");
21ab4e
+                goto out;
21ab4e
+        }
21ab4e
+
21ab4e
+        ret = glfs_set_logging (fs, logfile, 7);
21ab4e
+        if (ret < 0) {
21ab4e
+                LOG_ERR ("glfs_set_logging failed");
21ab4e
+                goto out;
21ab4e
+        }
21ab4e
+
21ab4e
+        ret = glfs_init (fs);
21ab4e
+        if (ret < 0) {
21ab4e
+                LOG_ERR ("glfs_init failed");
21ab4e
+                goto out;
21ab4e
+        }
21ab4e
+
21ab4e
+        ret = 0;
21ab4e
+out:
21ab4e
+        if (ret) {
21ab4e
+                glfs_fini (fs);
21ab4e
+                fs = NULL;
21ab4e
+        }
21ab4e
+
21ab4e
+        return fs;
21ab4e
+}
21ab4e
+
21ab4e
+void*
21ab4e
+write_data (void *data)
21ab4e
+{
21ab4e
+        char           buf[1025] = {0};
21ab4e
+        glfs_fd_t      *glfd = NULL;
21ab4e
+        glfs_t         *fs       = data;
21ab4e
+        int            i     = 0;
21ab4e
+
21ab4e
+        pthread_mutex_lock (&lock);
21ab4e
+        {
21ab4e
+                memset(buf, thread_data, sizeof(buf));
21ab4e
+                thread_data++;
21ab4e
+        }
21ab4e
+        pthread_mutex_unlock (&lock);
21ab4e
+
21ab4e
+        for (i = 0; i < 1025; i++) {
21ab4e
+                glfd = glfs_creat(fs, "parallel-write.txt", O_WRONLY | O_APPEND,
21ab4e
+                                   S_IRUSR | S_IWUSR | O_SYNC);
21ab4e
+                if (!glfd) {
21ab4e
+                        LOG_ERR ("Failed to create file");
21ab4e
+                        exit(1);
21ab4e
+                }
21ab4e
+
21ab4e
+                if (glfs_write (glfd, buf, sizeof(buf), 0) < 0) {
21ab4e
+                        LOG_ERR ("Failed to write to file");
21ab4e
+                        exit(1);
21ab4e
+                }
21ab4e
+                if (glfs_close(glfd) != 0) {
21ab4e
+                        LOG_ERR ("Failed to close file");
21ab4e
+                        exit(1);
21ab4e
+                }
21ab4e
+        }
21ab4e
+        return NULL;
21ab4e
+}
21ab4e
+
21ab4e
+int
21ab4e
+main (int argc, char *argv[])
21ab4e
+{
21ab4e
+        pthread_t  tid[5] = {0};
21ab4e
+        char       buf[1025] = {0};
21ab4e
+        char       cmp_buf[1025] = {0};
21ab4e
+        int         ret      = 0;
21ab4e
+        char       *hostname = NULL;
21ab4e
+        char       *volname  = NULL;
21ab4e
+        char       *logfile  = NULL;
21ab4e
+        glfs_t     *fs       = NULL;
21ab4e
+        glfs_fd_t  *glfd     = NULL;
21ab4e
+        ssize_t     bytes_read = 0;
21ab4e
+        ssize_t     total_bytes_read = 0;
21ab4e
+        int i = 0;
21ab4e
+
21ab4e
+        if (argc != 4) {
21ab4e
+                fprintf (stderr, "Invalid argument\n");
21ab4e
+                exit(1);
21ab4e
+        }
21ab4e
+
21ab4e
+        hostname = argv[1];
21ab4e
+        volname = argv[2];
21ab4e
+        logfile = argv[3];
21ab4e
+
21ab4e
+        fs = init_glfs (hostname, volname, logfile);
21ab4e
+        if (fs == NULL) {
21ab4e
+                LOG_ERR ("init_glfs failed");
21ab4e
+                return -1;
21ab4e
+        }
21ab4e
+
21ab4e
+        for (i = 0; i < 5; i++) {
21ab4e
+                pthread_create(&tid[i], NULL, write_data, fs);
21ab4e
+        }
21ab4e
+
21ab4e
+        for (i = 0; i < 5; i++) {
21ab4e
+                pthread_join(tid[i], NULL);
21ab4e
+        }
21ab4e
+        glfd = glfs_open(fs, "parallel-write.txt", O_RDONLY);
21ab4e
+        if (!glfd) {
21ab4e
+                LOG_ERR ("Failed to open file for reading");
21ab4e
+                exit(1);
21ab4e
+        }
21ab4e
+
21ab4e
+        while ((bytes_read = glfs_read (glfd, buf, sizeof(buf), 0)) > 0) {
21ab4e
+                if (bytes_read != sizeof(buf)) {
21ab4e
+                        fprintf (stderr, "Didn't read complete data read: %zd "
21ab4e
+                                 "expected: %lu", bytes_read, sizeof(buf));
21ab4e
+                        exit(1);
21ab4e
+                }
21ab4e
+
21ab4e
+                total_bytes_read += bytes_read;
21ab4e
+                if (buf[0] < '1' || buf[0] >= thread_data) {
21ab4e
+                        fprintf(stderr, "Invalid character found: %c", buf[0]);
21ab4e
+                        exit(1);
21ab4e
+                }
21ab4e
+                memset(cmp_buf, buf[0], sizeof(cmp_buf));
21ab4e
+                if (memcmp(cmp_buf, buf, sizeof(cmp_buf))) {
21ab4e
+                        LOG_ERR ("Data corrupted");
21ab4e
+                        exit(1);
21ab4e
+                }
21ab4e
+                memset(cmp_buf, 0, sizeof(cmp_buf));
21ab4e
+        }
21ab4e
+
21ab4e
+        if (total_bytes_read != 5*1025*1025) {
21ab4e
+                fprintf(stderr, "Failed to read what is written, read; %zd, "
21ab4e
+                        "expected %zu", total_bytes_read, 5*1025*1025);
21ab4e
+                exit(1);
21ab4e
+        }
21ab4e
+
21ab4e
+        if (glfs_close(glfd) != 0) {
21ab4e
+                LOG_ERR ("Failed to close");
21ab4e
+                exit(1);
21ab4e
+        }
21ab4e
+        return 0;
21ab4e
+}
21ab4e
diff --git a/tests/bugs/shard/shard-append-test.t b/tests/bugs/shard/shard-append-test.t
21ab4e
new file mode 100644
21ab4e
index 0000000..f8719f2
21ab4e
--- /dev/null
21ab4e
+++ b/tests/bugs/shard/shard-append-test.t
21ab4e
@@ -0,0 +1,32 @@
21ab4e
+#!/bin/bash
21ab4e
+
21ab4e
+. $(dirname $0)/../../include.rc
21ab4e
+. $(dirname $0)/../../volume.rc
21ab4e
+
21ab4e
+cleanup;
21ab4e
+
21ab4e
+TEST glusterd
21ab4e
+
21ab4e
+TEST $CLI volume create $V0 replica 3 ${H0}:$B0/brick{1,2,3};
21ab4e
+TEST $CLI volume set $V0 features.shard on
21ab4e
+TEST $CLI volume set $V0 features.shard-block-size 4MB
21ab4e
+TEST $CLI volume set $V0 performance.quick-read off
21ab4e
+TEST $CLI volume set $V0 performance.io-cache off
21ab4e
+
21ab4e
+#Uncomment the following line after shard-queuing is implemented
21ab4e
+#TEST $CLI volume set $V0 performance.write-behind off
21ab4e
+
21ab4e
+TEST $CLI volume set $V0 performance.strict-o-direct on
21ab4e
+TEST $CLI volume set $V0 performance.stat-prefetch off
21ab4e
+TEST $CLI volume set $V0 performance.read-ahead off
21ab4e
+TEST $CLI volume start $V0;
21ab4e
+
21ab4e
+logdir=`gluster --print-logdir`
21ab4e
+
21ab4e
+TEST build_tester $(dirname $0)/shard-append-test.c -lgfapi -lpthread
21ab4e
+
21ab4e
+TEST ./$(dirname $0)/shard-append-test ${H0} $V0 $logdir/shard-append-test.log
21ab4e
+
21ab4e
+cleanup_tester $(dirname $0)/shard-append-test
21ab4e
+
21ab4e
+cleanup;
21ab4e
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
21ab4e
index bbd566a..141c42f 100644
21ab4e
--- a/xlators/features/shard/src/shard.c
21ab4e
+++ b/xlators/features/shard/src/shard.c
21ab4e
@@ -3634,6 +3634,18 @@ shard_common_inode_write_post_update_size_handler (call_frame_t *frame,
21ab4e
         return 0;
21ab4e
 }
21ab4e
 
21ab4e
+static gf_boolean_t
21ab4e
+shard_is_appending_write (shard_local_t *local)
21ab4e
+{
21ab4e
+        if (local->fop != GF_FOP_WRITE)
21ab4e
+                return _gf_false;
21ab4e
+        if (local->flags & O_APPEND)
21ab4e
+                return _gf_true;
21ab4e
+        if (local->fd->flags & O_APPEND)
21ab4e
+                return _gf_true;
21ab4e
+        return _gf_false;
21ab4e
+}
21ab4e
+
21ab4e
 int
21ab4e
 __shard_get_delta_size_from_inode_ctx (shard_local_t *local, inode_t *inode,
21ab4e
                                        xlator_t *this)
21ab4e
@@ -3648,13 +3660,15 @@ __shard_get_delta_size_from_inode_ctx (shard_local_t *local, inode_t *inode,
21ab4e
 
21ab4e
         ctx = (shard_inode_ctx_t *) ctx_uint;
21ab4e
 
21ab4e
-        if (local->offset + local->total_size > ctx->stat.ia_size) {
21ab4e
+        if (shard_is_appending_write (local)) {
21ab4e
+                local->delta_size = local->total_size;
21ab4e
+        } else if (local->offset + local->total_size > ctx->stat.ia_size) {
21ab4e
                 local->delta_size = (local->offset + local->total_size) -
21ab4e
                                     ctx->stat.ia_size;
21ab4e
-                ctx->stat.ia_size += (local->delta_size);
21ab4e
         } else {
21ab4e
                 local->delta_size = 0;
21ab4e
         }
21ab4e
+        ctx->stat.ia_size += (local->delta_size);
21ab4e
         local->postbuf = ctx->stat;
21ab4e
 
21ab4e
         return 0;
21ab4e
@@ -3960,8 +3974,11 @@ shard_common_inode_write_post_mknod_handler (call_frame_t *frame,
21ab4e
 }
21ab4e
 
21ab4e
 int
21ab4e
-shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
21ab4e
-                                              xlator_t *this)
21ab4e
+shard_mkdir_dot_shard (call_frame_t *frame, xlator_t *this,
21ab4e
+                       shard_post_resolve_fop_handler_t handler);
21ab4e
+int
21ab4e
+shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
21ab4e
+                                               xlator_t *this)
21ab4e
 {
21ab4e
         shard_local_t *local = NULL;
21ab4e
 
21ab4e
@@ -3974,8 +3991,6 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
21ab4e
                 return 0;
21ab4e
         }
21ab4e
 
21ab4e
-        local->postbuf = local->prebuf;
21ab4e
-
21ab4e
         if (local->call_count) {
21ab4e
                 shard_common_lookup_shards (frame, this,
21ab4e
                                             local->resolver_base_inode,
21ab4e
@@ -3988,12 +4003,11 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
21ab4e
 }
21ab4e
 
21ab4e
 int
21ab4e
-shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
21ab4e
-                                               xlator_t *this)
21ab4e
+shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
21ab4e
+                                              xlator_t *this)
21ab4e
 {
21ab4e
-        shard_local_t *local = NULL;
21ab4e
-
21ab4e
-        local = frame->local;
21ab4e
+        shard_local_t *local = frame->local;
21ab4e
+        shard_priv_t  *priv  = this->private;
21ab4e
 
21ab4e
         if (local->op_ret < 0) {
21ab4e
                 shard_common_inode_write_failure_unwind (local->fop, frame,
21ab4e
@@ -4002,8 +4016,46 @@ shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
21ab4e
                 return 0;
21ab4e
         }
21ab4e
 
21ab4e
-        shard_lookup_base_file (frame, this, &local->loc,
21ab4e
-                                shard_common_inode_write_post_lookup_handler);
21ab4e
+        local->postbuf = local->prebuf;
21ab4e
+
21ab4e
+        /*Adjust offset to EOF so that correct shard is chosen for append*/
21ab4e
+        if (shard_is_appending_write (local))
21ab4e
+                local->offset = local->prebuf.ia_size;
21ab4e
+
21ab4e
+        local->first_block = get_lowest_block (local->offset,
21ab4e
+                                               local->block_size);
21ab4e
+        local->last_block = get_highest_block (local->offset, local->total_size,
21ab4e
+                                               local->block_size);
21ab4e
+        local->num_blocks = local->last_block - local->first_block + 1;
21ab4e
+        local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *),
21ab4e
+                                       gf_shard_mt_inode_list);
21ab4e
+        if (!local->inode_list) {
21ab4e
+                shard_common_inode_write_failure_unwind (local->fop, frame,
21ab4e
+                                                         -1, ENOMEM);
21ab4e
+                return 0;
21ab4e
+        }
21ab4e
+
21ab4e
+        gf_msg_trace (this->name, 0, "%s: gfid=%s first_block=%"PRIu32" "
21ab4e
+                      "last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64
21ab4e
+                      " total_size=%zu flags=%"PRId32"",
21ab4e
+                      gf_fop_list[local->fop],
21ab4e
+                      uuid_utoa (local->resolver_base_inode->gfid),
21ab4e
+                      local->first_block, local->last_block, local->num_blocks,
21ab4e
+                      local->offset, local->total_size, local->flags);
21ab4e
+
21ab4e
+        local->dot_shard_loc.inode = inode_find (this->itable,
21ab4e
+                                                 priv->dot_shard_gfid);
21ab4e
+
21ab4e
+        if (!local->dot_shard_loc.inode) {
21ab4e
+                /*change handler*/
21ab4e
+                shard_mkdir_dot_shard (frame, this,
21ab4e
+                                 shard_common_inode_write_post_resolve_handler);
21ab4e
+        } else {
21ab4e
+                /*change handler*/
21ab4e
+                local->post_res_handler =
21ab4e
+                                shard_common_inode_write_post_resolve_handler;
21ab4e
+                shard_refresh_dot_shard (frame, this);
21ab4e
+        }
21ab4e
         return 0;
21ab4e
 }
21ab4e
 
21ab4e
@@ -4706,9 +4758,6 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this,
21ab4e
         int             i              = 0;
21ab4e
         uint64_t        block_size     = 0;
21ab4e
         shard_local_t  *local          = NULL;
21ab4e
-        shard_priv_t   *priv           = NULL;
21ab4e
-
21ab4e
-        priv = this->private;
21ab4e
 
21ab4e
         ret = shard_inode_ctx_get_block_size (fd->inode, this, &block_size);
21ab4e
         if (ret) {
21ab4e
@@ -4784,37 +4833,13 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this,
21ab4e
                 local->iobref = iobref_ref (iobref);
21ab4e
         local->fd = fd_ref (fd);
21ab4e
         local->block_size = block_size;
21ab4e
-        local->first_block = get_lowest_block (offset, local->block_size);
21ab4e
-        local->last_block = get_highest_block (offset, local->total_size,
21ab4e
-                                               local->block_size);
21ab4e
-        local->num_blocks = local->last_block - local->first_block + 1;
21ab4e
         local->resolver_base_inode = local->fd->inode;
21ab4e
-        local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *),
21ab4e
-                                       gf_shard_mt_inode_list);
21ab4e
-        if (!local->inode_list)
21ab4e
-                goto out;
21ab4e
 
21ab4e
         local->loc.inode = inode_ref (fd->inode);
21ab4e
         gf_uuid_copy (local->loc.gfid, fd->inode->gfid);
21ab4e
 
21ab4e
-        gf_msg_trace (this->name, 0, "%s: gfid=%s first_block=%"PRIu32" "
21ab4e
-                      "last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64""
21ab4e
-                      " total_size=%zu flags=%"PRId32"", gf_fop_list[fop],
21ab4e
-                      uuid_utoa (fd->inode->gfid), local->first_block,
21ab4e
-                      local->last_block, local->num_blocks, offset,
21ab4e
-                      local->total_size, local->flags);
21ab4e
-
21ab4e
-        local->dot_shard_loc.inode = inode_find (this->itable,
21ab4e
-                                                 priv->dot_shard_gfid);
21ab4e
-
21ab4e
-        if (!local->dot_shard_loc.inode) {
21ab4e
-                shard_mkdir_dot_shard (frame, this,
21ab4e
-                                 shard_common_inode_write_post_resolve_handler);
21ab4e
-        } else {
21ab4e
-                local->post_res_handler = shard_common_inode_write_post_resolve_handler;
21ab4e
-                shard_refresh_dot_shard (frame, this);
21ab4e
-        }
21ab4e
-
21ab4e
+        shard_lookup_base_file (frame, this, &local->loc,
21ab4e
+                                shard_common_inode_write_post_lookup_handler);
21ab4e
         return 0;
21ab4e
 out:
21ab4e
         shard_common_inode_write_failure_unwind (fop, frame, -1, ENOMEM);
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e