14f8ab
From 48f6929590157d9a1697e11c02441207afdc1bed Mon Sep 17 00:00:00 2001
14f8ab
From: Xavi Hernandez <xhernandez@redhat.com>
14f8ab
Date: Fri, 27 Mar 2020 23:56:15 +0100
14f8ab
Subject: [PATCH 362/362] write-behind: fix data corruption
14f8ab
14f8ab
There was a bug in write-behind that allowed a previous completed write
14f8ab
to overwrite the overlapping region of data from a future write.
14f8ab
14f8ab
Suppose we want to send three writes (W1, W2 and W3). W1 and W2 are
14f8ab
sequential, and W3 writes at the same offset of W2:
14f8ab
14f8ab
    W2.offset = W3.offset = W1.offset + W1.size
14f8ab
14f8ab
Both W1 and W2 are sent in parallel. W3 is only sent after W2 completes.
14f8ab
So W3 should *always* overwrite the overlapping part of W2.
14f8ab
14f8ab
Suppose write-behind processes the requests from 2 concurrent threads:
14f8ab
14f8ab
    Thread 1                    Thread 2
14f8ab
14f8ab
    <received W1>
14f8ab
                                <received W2>
14f8ab
    wb_enqueue_tempted(W1)
14f8ab
    /* W1 is assigned gen X */
14f8ab
                                wb_enqueue_tempted(W2)
14f8ab
                                /* W2 is assigned gen X */
14f8ab
14f8ab
                                wb_process_queue()
14f8ab
                                  __wb_preprocess_winds()
14f8ab
                                    /* W1 and W2 are sequential and all
14f8ab
                                     * other requisites are met to merge
14f8ab
                                     * both requests. */
14f8ab
                                    __wb_collapse_small_writes(W1, W2)
14f8ab
                                    __wb_fulfill_request(W2)
14f8ab
14f8ab
                                  __wb_pick_unwinds() -> W2
14f8ab
                                  /* In this case, since the request is
14f8ab
                                   * already fulfilled, wb_inode->gen
14f8ab
                                   * is not updated. */
14f8ab
14f8ab
                                wb_do_unwinds()
14f8ab
                                  STACK_UNWIND(W2)
14f8ab
14f8ab
                                /* The application has received the
14f8ab
                                 * result of W2, so it can send W3. */
14f8ab
                                <received W3>
14f8ab
14f8ab
                                wb_enqueue_tempted(W3)
14f8ab
                                /* W3 is assigned gen X */
14f8ab
14f8ab
                                wb_process_queue()
14f8ab
                                  /* Here we have W1 (which contains
14f8ab
                                   * the conflicting W2) and W3 with
14f8ab
                                   * same gen, so they are interpreted
14f8ab
                                   * as concurrent writes that do not
14f8ab
                                   * conflict. */
14f8ab
                                  __wb_pick_winds() -> W3
14f8ab
14f8ab
                                wb_do_winds()
14f8ab
                                  STACK_WIND(W3)
14f8ab
14f8ab
    wb_process_queue()
14f8ab
      /* Eventually W1 will be
14f8ab
       * ready to be sent */
14f8ab
      __wb_pick_winds() -> W1
14f8ab
      __wb_pick_unwinds() -> W1
14f8ab
        /* Here wb_inode->gen is
14f8ab
         * incremented. */
14f8ab
14f8ab
    wb_do_unwinds()
14f8ab
      STACK_UNWIND(W1)
14f8ab
14f8ab
    wb_do_winds()
14f8ab
      STACK_WIND(W1)
14f8ab
14f8ab
So, as we can see, W3 is sent before W1, which shouldn't happen.
14f8ab
14f8ab
The problem is that wb_inode->gen is only incremented for requests that
14f8ab
have not been fulfilled but, after a merge, the request is marked as
14f8ab
fulfilled even though it has not been sent to the brick. This allows
14f8ab
that future requests are assigned to the same generation, which could
14f8ab
be internally reordered.
14f8ab
14f8ab
Solution:
14f8ab
14f8ab
Increment wb_inode->gen before any unwind, even if it's for a fulfilled
14f8ab
request.
14f8ab
14f8ab
Special thanks to Stefan Ring for writing a reproducer that has been
14f8ab
crucial to identify the issue.
14f8ab
14f8ab
Upstream patch:
14f8ab
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24263
14f8ab
> Change-Id: Id4ab0f294a09aca9a863ecaeef8856474662ab45
14f8ab
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
14f8ab
> Fixes: #884
14f8ab
14f8ab
Change-Id: Id4ab0f294a09aca9a863ecaeef8856474662ab45
14f8ab
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
14f8ab
BUG: 1819059
14f8ab
Reviewed-on: https://code.engineering.redhat.com/gerrit/196250
14f8ab
Tested-by: RHGS Build Bot <nigelb@redhat.com>
14f8ab
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
14f8ab
---
14f8ab
 tests/bugs/write-behind/issue-884.c                | 267 +++++++++++++++++++++
14f8ab
 tests/bugs/write-behind/issue-884.t                |  40 +++
14f8ab
 .../performance/write-behind/src/write-behind.c    |   4 +-
14f8ab
 3 files changed, 309 insertions(+), 2 deletions(-)
14f8ab
 create mode 100644 tests/bugs/write-behind/issue-884.c
14f8ab
 create mode 100755 tests/bugs/write-behind/issue-884.t
14f8ab
14f8ab
diff --git a/tests/bugs/write-behind/issue-884.c b/tests/bugs/write-behind/issue-884.c
14f8ab
new file mode 100644
14f8ab
index 0000000..e9c33b3
14f8ab
--- /dev/null
14f8ab
+++ b/tests/bugs/write-behind/issue-884.c
14f8ab
@@ -0,0 +1,267 @@
14f8ab
+
14f8ab
+#define _GNU_SOURCE
14f8ab
+
14f8ab
+#include <stdlib.h>
14f8ab
+#include <stdio.h>
14f8ab
+#include <string.h>
14f8ab
+#include <time.h>
14f8ab
+#include <assert.h>
14f8ab
+#include <errno.h>
14f8ab
+#include <sys/types.h>
14f8ab
+#include <sys/stat.h>
14f8ab
+#include <pthread.h>
14f8ab
+
14f8ab
+#include <glusterfs/api/glfs.h>
14f8ab
+
14f8ab
+/* Based on a reproducer by Stefan Ring. It seems to be quite sensible to any
14f8ab
+ * timing modification, so the code has been maintained as is, only with minor
14f8ab
+ * changes. */
14f8ab
+
14f8ab
+struct glfs *glfs;
14f8ab
+
14f8ab
+pthread_mutex_t the_mutex = PTHREAD_MUTEX_INITIALIZER;
14f8ab
+pthread_cond_t the_cond = PTHREAD_COND_INITIALIZER;
14f8ab
+
14f8ab
+typedef struct _my_aiocb {
14f8ab
+    int64_t size;
14f8ab
+    volatile int64_t seq;
14f8ab
+    int which;
14f8ab
+} my_aiocb;
14f8ab
+
14f8ab
+typedef struct _worker_data {
14f8ab
+    my_aiocb cb;
14f8ab
+    struct iovec iov;
14f8ab
+    int64_t offset;
14f8ab
+} worker_data;
14f8ab
+
14f8ab
+typedef struct {
14f8ab
+    worker_data wdata[2];
14f8ab
+
14f8ab
+    volatile unsigned busy;
14f8ab
+} all_data_t;
14f8ab
+
14f8ab
+all_data_t all_data;
14f8ab
+
14f8ab
+static void
14f8ab
+completion_fnc(struct glfs_fd *fd, ssize_t ret, struct glfs_stat *pre,
14f8ab
+               struct glfs_stat *post, void *arg)
14f8ab
+{
14f8ab
+    void *the_thread;
14f8ab
+    my_aiocb *cb = (my_aiocb *)arg;
14f8ab
+    long seq = cb->seq;
14f8ab
+
14f8ab
+    assert(ret == cb->size);
14f8ab
+
14f8ab
+    pthread_mutex_lock(&the_mutex);
14f8ab
+    pthread_cond_broadcast(&the_cond);
14f8ab
+
14f8ab
+    all_data.busy &= ~(1 << cb->which);
14f8ab
+    cb->seq = -1;
14f8ab
+
14f8ab
+    the_thread = (void *)pthread_self();
14f8ab
+    printf("worker %d is done from thread %p, seq %ld!\n", cb->which,
14f8ab
+           the_thread, seq);
14f8ab
+
14f8ab
+    pthread_mutex_unlock(&the_mutex);
14f8ab
+}
14f8ab
+
14f8ab
+static void
14f8ab
+init_wdata(worker_data *data, int which)
14f8ab
+{
14f8ab
+    data->cb.which = which;
14f8ab
+    data->cb.seq = -1;
14f8ab
+
14f8ab
+    data->iov.iov_base = malloc(1024 * 1024);
14f8ab
+    memset(data->iov.iov_base, 6,
14f8ab
+           1024 * 1024); /* tail part never overwritten */
14f8ab
+}
14f8ab
+
14f8ab
+static void
14f8ab
+init()
14f8ab
+{
14f8ab
+    all_data.busy = 0;
14f8ab
+
14f8ab
+    init_wdata(&all_data.wdata[0], 0);
14f8ab
+    init_wdata(&all_data.wdata[1], 1);
14f8ab
+}
14f8ab
+
14f8ab
+static void
14f8ab
+do_write(struct glfs_fd *fd, int content, int size, int64_t seq,
14f8ab
+         worker_data *wdata, const char *name)
14f8ab
+{
14f8ab
+    int ret;
14f8ab
+
14f8ab
+    wdata->cb.size = size;
14f8ab
+    wdata->cb.seq = seq;
14f8ab
+
14f8ab
+    if (content >= 0)
14f8ab
+        memset(wdata->iov.iov_base, content, size);
14f8ab
+    wdata->iov.iov_len = size;
14f8ab
+
14f8ab
+    pthread_mutex_lock(&the_mutex);
14f8ab
+    printf("(%d) dispatching write \"%s\", offset %lx, len %x, seq %ld\n",
14f8ab
+           wdata->cb.which, name, (long)wdata->offset, size, (long)seq);
14f8ab
+    pthread_mutex_unlock(&the_mutex);
14f8ab
+    ret = glfs_pwritev_async(fd, &wdata->iov, 1, wdata->offset, 0,
14f8ab
+                             completion_fnc, &wdata->cb);
14f8ab
+    assert(ret >= 0);
14f8ab
+}
14f8ab
+
14f8ab
+#define IDLE 0  // both workers must be idle
14f8ab
+#define ANY 1   // use any worker, other one may be busy
14f8ab
+
14f8ab
+int
14f8ab
+get_worker(int waitfor, int64_t excl_seq)
14f8ab
+{
14f8ab
+    int which;
14f8ab
+
14f8ab
+    pthread_mutex_lock(&the_mutex);
14f8ab
+
14f8ab
+    while (waitfor == IDLE && (all_data.busy & 3) != 0 ||
14f8ab
+           waitfor == ANY &&
14f8ab
+               ((all_data.busy & 3) == 3 ||
14f8ab
+                excl_seq >= 0 && (all_data.wdata[0].cb.seq == excl_seq ||
14f8ab
+                                  all_data.wdata[1].cb.seq == excl_seq)))
14f8ab
+        pthread_cond_wait(&the_cond, &the_mutex);
14f8ab
+
14f8ab
+    if (!(all_data.busy & 1))
14f8ab
+        which = 0;
14f8ab
+    else
14f8ab
+        which = 1;
14f8ab
+
14f8ab
+    all_data.busy |= (1 << which);
14f8ab
+
14f8ab
+    pthread_mutex_unlock(&the_mutex);
14f8ab
+
14f8ab
+    return which;
14f8ab
+}
14f8ab
+
14f8ab
+static int
14f8ab
+doit(struct glfs_fd *fd)
14f8ab
+{
14f8ab
+    int ret;
14f8ab
+    int64_t seq = 0;
14f8ab
+    int64_t offset = 0;     // position in file, in blocks
14f8ab
+    int64_t base = 0x1000;  // where to place the data, in blocks
14f8ab
+
14f8ab
+    int async_mode = ANY;
14f8ab
+
14f8ab
+    init();
14f8ab
+
14f8ab
+    for (;;) {
14f8ab
+        int which;
14f8ab
+        worker_data *wdata;
14f8ab
+
14f8ab
+        // for growing to the first offset
14f8ab
+        for (;;) {
14f8ab
+            int gap = base + 0x42 - offset;
14f8ab
+            if (!gap)
14f8ab
+                break;
14f8ab
+            if (gap > 80)
14f8ab
+                gap = 80;
14f8ab
+
14f8ab
+            which = get_worker(IDLE, -1);
14f8ab
+            wdata = &all_data.wdata[which];
14f8ab
+
14f8ab
+            wdata->offset = offset << 9;
14f8ab
+            do_write(fd, 0, gap << 9, seq++, wdata, "gap-filling");
14f8ab
+
14f8ab
+            offset += gap;
14f8ab
+        }
14f8ab
+
14f8ab
+        // 8700
14f8ab
+        which = get_worker(IDLE, -1);
14f8ab
+        wdata = &all_data.wdata[which];
14f8ab
+
14f8ab
+        wdata->offset = (base + 0x42) << 9;
14f8ab
+        do_write(fd, 1, 62 << 9, seq++, wdata, "!8700");
14f8ab
+
14f8ab
+        // 8701
14f8ab
+        which = get_worker(IDLE, -1);
14f8ab
+        wdata = &all_data.wdata[which];
14f8ab
+
14f8ab
+        wdata->offset = (base + 0x42) << 9;
14f8ab
+        do_write(fd, 2, 55 << 9, seq++, wdata, "!8701");
14f8ab
+
14f8ab
+        // 8702
14f8ab
+        which = get_worker(async_mode, -1);
14f8ab
+        wdata = &all_data.wdata[which];
14f8ab
+
14f8ab
+        wdata->offset = (base + 0x79) << 9;
14f8ab
+        do_write(fd, 3, 54 << 9, seq++, wdata, "!8702");
14f8ab
+
14f8ab
+        // 8703
14f8ab
+        which = get_worker(async_mode, -1);
14f8ab
+        wdata = &all_data.wdata[which];
14f8ab
+
14f8ab
+        wdata->offset = (base + 0xaf) << 9;
14f8ab
+        do_write(fd, 4, 81 << 9, seq++, wdata, "!8703");
14f8ab
+
14f8ab
+        // 8704
14f8ab
+        // this writes both 5s and 6s
14f8ab
+        // the range of 5s is the one that overwrites 8703
14f8ab
+
14f8ab
+        which = get_worker(async_mode, seq - 1);
14f8ab
+        wdata = &all_data.wdata[which];
14f8ab
+
14f8ab
+        memset(wdata->iov.iov_base, 5, 81 << 9);
14f8ab
+        wdata->offset = (base + 0xaf) << 9;
14f8ab
+        do_write(fd, -1, 1623 << 9, seq++, wdata, "!8704");
14f8ab
+
14f8ab
+        offset = base + 0x706;
14f8ab
+        base += 0x1000;
14f8ab
+        if (base >= 0x100000)
14f8ab
+            break;
14f8ab
+    }
14f8ab
+
14f8ab
+    printf("done!\n");
14f8ab
+    fflush(stdout);
14f8ab
+
14f8ab
+    pthread_mutex_lock(&the_mutex);
14f8ab
+
14f8ab
+    while ((all_data.busy & 3) != 0)
14f8ab
+        pthread_cond_wait(&the_cond, &the_mutex);
14f8ab
+
14f8ab
+    pthread_mutex_unlock(&the_mutex);
14f8ab
+
14f8ab
+    ret = glfs_close(fd);
14f8ab
+    assert(ret >= 0);
14f8ab
+    /*
14f8ab
+        ret = glfs_fini(glfs);
14f8ab
+        assert(ret >= 0);
14f8ab
+    */
14f8ab
+    return 0;
14f8ab
+}
14f8ab
+
14f8ab
+int
14f8ab
+main(int argc, char *argv[])
14f8ab
+{
14f8ab
+    int ret;
14f8ab
+    int open_flags = O_RDWR | O_DIRECT | O_TRUNC;
14f8ab
+    struct glfs_fd *fd;
14f8ab
+
14f8ab
+    glfs = glfs_new(argv[1]);
14f8ab
+    if (!glfs) {
14f8ab
+        printf("glfs_new!\n");
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+    ret = glfs_set_volfile_server(glfs, "tcp", "localhost", 24007);
14f8ab
+    if (ret < 0) {
14f8ab
+        printf("set_volfile!\n");
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+    ret = glfs_init(glfs);
14f8ab
+    if (ret) {
14f8ab
+        printf("init!\n");
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+    fd = glfs_open(glfs, argv[2], open_flags);
14f8ab
+    if (!fd) {
14f8ab
+        printf("open!\n");
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+    srand(time(NULL));
14f8ab
+    return doit(fd);
14f8ab
+out:
14f8ab
+    return 1;
14f8ab
+}
14f8ab
diff --git a/tests/bugs/write-behind/issue-884.t b/tests/bugs/write-behind/issue-884.t
14f8ab
new file mode 100755
14f8ab
index 0000000..2bcf7d1
14f8ab
--- /dev/null
14f8ab
+++ b/tests/bugs/write-behind/issue-884.t
14f8ab
@@ -0,0 +1,40 @@
14f8ab
+#!/bin/bash
14f8ab
+
14f8ab
+. $(dirname $0)/../../include.rc
14f8ab
+. $(dirname $0)/../../volume.rc
14f8ab
+
14f8ab
+# This test tries to detect a race condition in write-behind. It's based on a
14f8ab
+# reproducer written by Stefan Ring that is able to hit it sometimes. On my
14f8ab
+# system, it happened around 10% of the runs. This means that if this bug
14f8ab
+# appears again, this test will fail once every 10 runs. Most probably this
14f8ab
+# failure will be hidden by the automatic test retry of the testing framework.
14f8ab
+#
14f8ab
+# Please, if this test fails, it needs to be analyzed in detail.
14f8ab
+
14f8ab
+function run() {
14f8ab
+    "${@}" >/dev/null
14f8ab
+}
14f8ab
+
14f8ab
+cleanup
14f8ab
+
14f8ab
+TEST glusterd
14f8ab
+TEST pidof glusterd
14f8ab
+
14f8ab
+TEST $CLI volume create $V0 $H0:$B0/$V0
14f8ab
+# This makes it easier to hit the issue
14f8ab
+TEST $CLI volume set $V0 client-log-level TRACE
14f8ab
+TEST $CLI volume start $V0
14f8ab
+
14f8ab
+TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0
14f8ab
+
14f8ab
+build_tester $(dirname $0)/issue-884.c -lgfapi
14f8ab
+
14f8ab
+TEST touch $M0/testfile
14f8ab
+
14f8ab
+# This program generates a file of 535694336 bytes with a fixed pattern
14f8ab
+TEST run $(dirname $0)/issue-884 $V0 testfile
14f8ab
+
14f8ab
+# This is the md5sum of the expected pattern without corruption
14f8ab
+EXPECT "ad105f9349345a70fc697632cbb5eec8" echo "$(md5sum $B0/$V0/testfile | awk '{ print $1; }')"
14f8ab
+
14f8ab
+cleanup
14f8ab
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
14f8ab
index 70e281a..90a0bcf 100644
14f8ab
--- a/xlators/performance/write-behind/src/write-behind.c
14f8ab
+++ b/xlators/performance/write-behind/src/write-behind.c
14f8ab
@@ -1284,14 +1284,14 @@ __wb_pick_unwinds(wb_inode_t *wb_inode, list_head_t *lies)
14f8ab
 
14f8ab
         wb_inode->window_current += req->orig_size;
14f8ab
 
14f8ab
+        wb_inode->gen++;
14f8ab
+
14f8ab
         if (!req->ordering.fulfilled) {
14f8ab
             /* burden increased */
14f8ab
             list_add_tail(&req->lie, &wb_inode->liability);
14f8ab
 
14f8ab
             req->ordering.lied = 1;
14f8ab
 
14f8ab
-            wb_inode->gen++;
14f8ab
-
14f8ab
             uuid_utoa_r(req->gfid, gfid);
14f8ab
             gf_msg_debug(wb_inode->this->name, 0,
14f8ab
                          "(unique=%" PRIu64
14f8ab
-- 
14f8ab
1.8.3.1
14f8ab