9f5ccc
From 48f6929590157d9a1697e11c02441207afdc1bed Mon Sep 17 00:00:00 2001
9f5ccc
From: Xavi Hernandez <xhernandez@redhat.com>
9f5ccc
Date: Fri, 27 Mar 2020 23:56:15 +0100
9f5ccc
Subject: [PATCH 362/362] write-behind: fix data corruption
9f5ccc
9f5ccc
There was a bug in write-behind that allowed a previous completed write
9f5ccc
to overwrite the overlapping region of data from a future write.
9f5ccc
9f5ccc
Suppose we want to send three writes (W1, W2 and W3). W1 and W2 are
9f5ccc
sequential, and W3 writes at the same offset of W2:
9f5ccc
9f5ccc
    W2.offset = W3.offset = W1.offset + W1.size
9f5ccc
9f5ccc
Both W1 and W2 are sent in parallel. W3 is only sent after W2 completes.
9f5ccc
So W3 should *always* overwrite the overlapping part of W2.
9f5ccc
9f5ccc
Suppose write-behind processes the requests from 2 concurrent threads:
9f5ccc
9f5ccc
    Thread 1                    Thread 2
9f5ccc
9f5ccc
    <received W1>
9f5ccc
                                <received W2>
9f5ccc
    wb_enqueue_tempted(W1)
9f5ccc
    /* W1 is assigned gen X */
9f5ccc
                                wb_enqueue_tempted(W2)
9f5ccc
                                /* W2 is assigned gen X */
9f5ccc
9f5ccc
                                wb_process_queue()
9f5ccc
                                  __wb_preprocess_winds()
9f5ccc
                                    /* W1 and W2 are sequential and all
9f5ccc
                                     * other requisites are met to merge
9f5ccc
                                     * both requests. */
9f5ccc
                                    __wb_collapse_small_writes(W1, W2)
9f5ccc
                                    __wb_fulfill_request(W2)
9f5ccc
9f5ccc
                                  __wb_pick_unwinds() -> W2
9f5ccc
                                  /* In this case, since the request is
9f5ccc
                                   * already fulfilled, wb_inode->gen
9f5ccc
                                   * is not updated. */
9f5ccc
9f5ccc
                                wb_do_unwinds()
9f5ccc
                                  STACK_UNWIND(W2)
9f5ccc
9f5ccc
                                /* The application has received the
9f5ccc
                                 * result of W2, so it can send W3. */
9f5ccc
                                <received W3>
9f5ccc
9f5ccc
                                wb_enqueue_tempted(W3)
9f5ccc
                                /* W3 is assigned gen X */
9f5ccc
9f5ccc
                                wb_process_queue()
9f5ccc
                                  /* Here we have W1 (which contains
9f5ccc
                                   * the conflicting W2) and W3 with
9f5ccc
                                   * same gen, so they are interpreted
9f5ccc
                                   * as concurrent writes that do not
9f5ccc
                                   * conflict. */
9f5ccc
                                  __wb_pick_winds() -> W3
9f5ccc
9f5ccc
                                wb_do_winds()
9f5ccc
                                  STACK_WIND(W3)
9f5ccc
9f5ccc
    wb_process_queue()
9f5ccc
      /* Eventually W1 will be
9f5ccc
       * ready to be sent */
9f5ccc
      __wb_pick_winds() -> W1
9f5ccc
      __wb_pick_unwinds() -> W1
9f5ccc
        /* Here wb_inode->gen is
9f5ccc
         * incremented. */
9f5ccc
9f5ccc
    wb_do_unwinds()
9f5ccc
      STACK_UNWIND(W1)
9f5ccc
9f5ccc
    wb_do_winds()
9f5ccc
      STACK_WIND(W1)
9f5ccc
9f5ccc
So, as we can see, W3 is sent before W1, which shouldn't happen.
9f5ccc
9f5ccc
The problem is that wb_inode->gen is only incremented for requests that
9f5ccc
have not been fulfilled but, after a merge, the request is marked as
9f5ccc
fulfilled even though it has not been sent to the brick. This allows
9f5ccc
that future requests are assigned to the same generation, which could
9f5ccc
be internally reordered.
9f5ccc
9f5ccc
Solution:
9f5ccc
9f5ccc
Increment wb_inode->gen before any unwind, even if it's for a fulfilled
9f5ccc
request.
9f5ccc
9f5ccc
Special thanks to Stefan Ring for writing a reproducer that has been
9f5ccc
crucial to identify the issue.
9f5ccc
9f5ccc
Upstream patch:
9f5ccc
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24263
9f5ccc
> Change-Id: Id4ab0f294a09aca9a863ecaeef8856474662ab45
9f5ccc
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
9f5ccc
> Fixes: #884
9f5ccc
9f5ccc
Change-Id: Id4ab0f294a09aca9a863ecaeef8856474662ab45
9f5ccc
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
9f5ccc
BUG: 1819059
9f5ccc
Reviewed-on: https://code.engineering.redhat.com/gerrit/196250
9f5ccc
Tested-by: RHGS Build Bot <nigelb@redhat.com>
9f5ccc
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
9f5ccc
---
9f5ccc
 tests/bugs/write-behind/issue-884.c                | 267 +++++++++++++++++++++
9f5ccc
 tests/bugs/write-behind/issue-884.t                |  40 +++
9f5ccc
 .../performance/write-behind/src/write-behind.c    |   4 +-
9f5ccc
 3 files changed, 309 insertions(+), 2 deletions(-)
9f5ccc
 create mode 100644 tests/bugs/write-behind/issue-884.c
9f5ccc
 create mode 100755 tests/bugs/write-behind/issue-884.t
9f5ccc
9f5ccc
diff --git a/tests/bugs/write-behind/issue-884.c b/tests/bugs/write-behind/issue-884.c
9f5ccc
new file mode 100644
9f5ccc
index 0000000..e9c33b3
9f5ccc
--- /dev/null
9f5ccc
+++ b/tests/bugs/write-behind/issue-884.c
9f5ccc
@@ -0,0 +1,267 @@
9f5ccc
+
9f5ccc
+#define _GNU_SOURCE
9f5ccc
+
9f5ccc
+#include <stdlib.h>
9f5ccc
+#include <stdio.h>
9f5ccc
+#include <string.h>
9f5ccc
+#include <time.h>
9f5ccc
+#include <assert.h>
9f5ccc
+#include <errno.h>
9f5ccc
+#include <sys/types.h>
9f5ccc
+#include <sys/stat.h>
9f5ccc
+#include <pthread.h>
9f5ccc
+
9f5ccc
+#include <glusterfs/api/glfs.h>
9f5ccc
+
9f5ccc
+/* Based on a reproducer by Stefan Ring. It seems to be quite sensible to any
9f5ccc
+ * timing modification, so the code has been maintained as is, only with minor
9f5ccc
+ * changes. */
9f5ccc
+
9f5ccc
+struct glfs *glfs;
9f5ccc
+
9f5ccc
+pthread_mutex_t the_mutex = PTHREAD_MUTEX_INITIALIZER;
9f5ccc
+pthread_cond_t the_cond = PTHREAD_COND_INITIALIZER;
9f5ccc
+
9f5ccc
+typedef struct _my_aiocb {
9f5ccc
+    int64_t size;
9f5ccc
+    volatile int64_t seq;
9f5ccc
+    int which;
9f5ccc
+} my_aiocb;
9f5ccc
+
9f5ccc
+typedef struct _worker_data {
9f5ccc
+    my_aiocb cb;
9f5ccc
+    struct iovec iov;
9f5ccc
+    int64_t offset;
9f5ccc
+} worker_data;
9f5ccc
+
9f5ccc
+typedef struct {
9f5ccc
+    worker_data wdata[2];
9f5ccc
+
9f5ccc
+    volatile unsigned busy;
9f5ccc
+} all_data_t;
9f5ccc
+
9f5ccc
+all_data_t all_data;
9f5ccc
+
9f5ccc
+static void
9f5ccc
+completion_fnc(struct glfs_fd *fd, ssize_t ret, struct glfs_stat *pre,
9f5ccc
+               struct glfs_stat *post, void *arg)
9f5ccc
+{
9f5ccc
+    void *the_thread;
9f5ccc
+    my_aiocb *cb = (my_aiocb *)arg;
9f5ccc
+    long seq = cb->seq;
9f5ccc
+
9f5ccc
+    assert(ret == cb->size);
9f5ccc
+
9f5ccc
+    pthread_mutex_lock(&the_mutex);
9f5ccc
+    pthread_cond_broadcast(&the_cond);
9f5ccc
+
9f5ccc
+    all_data.busy &= ~(1 << cb->which);
9f5ccc
+    cb->seq = -1;
9f5ccc
+
9f5ccc
+    the_thread = (void *)pthread_self();
9f5ccc
+    printf("worker %d is done from thread %p, seq %ld!\n", cb->which,
9f5ccc
+           the_thread, seq);
9f5ccc
+
9f5ccc
+    pthread_mutex_unlock(&the_mutex);
9f5ccc
+}
9f5ccc
+
9f5ccc
+static void
9f5ccc
+init_wdata(worker_data *data, int which)
9f5ccc
+{
9f5ccc
+    data->cb.which = which;
9f5ccc
+    data->cb.seq = -1;
9f5ccc
+
9f5ccc
+    data->iov.iov_base = malloc(1024 * 1024);
9f5ccc
+    memset(data->iov.iov_base, 6,
9f5ccc
+           1024 * 1024); /* tail part never overwritten */
9f5ccc
+}
9f5ccc
+
9f5ccc
+static void
9f5ccc
+init()
9f5ccc
+{
9f5ccc
+    all_data.busy = 0;
9f5ccc
+
9f5ccc
+    init_wdata(&all_data.wdata[0], 0);
9f5ccc
+    init_wdata(&all_data.wdata[1], 1);
9f5ccc
+}
9f5ccc
+
9f5ccc
+static void
9f5ccc
+do_write(struct glfs_fd *fd, int content, int size, int64_t seq,
9f5ccc
+         worker_data *wdata, const char *name)
9f5ccc
+{
9f5ccc
+    int ret;
9f5ccc
+
9f5ccc
+    wdata->cb.size = size;
9f5ccc
+    wdata->cb.seq = seq;
9f5ccc
+
9f5ccc
+    if (content >= 0)
9f5ccc
+        memset(wdata->iov.iov_base, content, size);
9f5ccc
+    wdata->iov.iov_len = size;
9f5ccc
+
9f5ccc
+    pthread_mutex_lock(&the_mutex);
9f5ccc
+    printf("(%d) dispatching write \"%s\", offset %lx, len %x, seq %ld\n",
9f5ccc
+           wdata->cb.which, name, (long)wdata->offset, size, (long)seq);
9f5ccc
+    pthread_mutex_unlock(&the_mutex);
9f5ccc
+    ret = glfs_pwritev_async(fd, &wdata->iov, 1, wdata->offset, 0,
9f5ccc
+                             completion_fnc, &wdata->cb);
9f5ccc
+    assert(ret >= 0);
9f5ccc
+}
9f5ccc
+
9f5ccc
+#define IDLE 0  // both workers must be idle
9f5ccc
+#define ANY 1   // use any worker, other one may be busy
9f5ccc
+
9f5ccc
+int
9f5ccc
+get_worker(int waitfor, int64_t excl_seq)
9f5ccc
+{
9f5ccc
+    int which;
9f5ccc
+
9f5ccc
+    pthread_mutex_lock(&the_mutex);
9f5ccc
+
9f5ccc
+    while (waitfor == IDLE && (all_data.busy & 3) != 0 ||
9f5ccc
+           waitfor == ANY &&
9f5ccc
+               ((all_data.busy & 3) == 3 ||
9f5ccc
+                excl_seq >= 0 && (all_data.wdata[0].cb.seq == excl_seq ||
9f5ccc
+                                  all_data.wdata[1].cb.seq == excl_seq)))
9f5ccc
+        pthread_cond_wait(&the_cond, &the_mutex);
9f5ccc
+
9f5ccc
+    if (!(all_data.busy & 1))
9f5ccc
+        which = 0;
9f5ccc
+    else
9f5ccc
+        which = 1;
9f5ccc
+
9f5ccc
+    all_data.busy |= (1 << which);
9f5ccc
+
9f5ccc
+    pthread_mutex_unlock(&the_mutex);
9f5ccc
+
9f5ccc
+    return which;
9f5ccc
+}
9f5ccc
+
9f5ccc
+static int
9f5ccc
+doit(struct glfs_fd *fd)
9f5ccc
+{
9f5ccc
+    int ret;
9f5ccc
+    int64_t seq = 0;
9f5ccc
+    int64_t offset = 0;     // position in file, in blocks
9f5ccc
+    int64_t base = 0x1000;  // where to place the data, in blocks
9f5ccc
+
9f5ccc
+    int async_mode = ANY;
9f5ccc
+
9f5ccc
+    init();
9f5ccc
+
9f5ccc
+    for (;;) {
9f5ccc
+        int which;
9f5ccc
+        worker_data *wdata;
9f5ccc
+
9f5ccc
+        // for growing to the first offset
9f5ccc
+        for (;;) {
9f5ccc
+            int gap = base + 0x42 - offset;
9f5ccc
+            if (!gap)
9f5ccc
+                break;
9f5ccc
+            if (gap > 80)
9f5ccc
+                gap = 80;
9f5ccc
+
9f5ccc
+            which = get_worker(IDLE, -1);
9f5ccc
+            wdata = &all_data.wdata[which];
9f5ccc
+
9f5ccc
+            wdata->offset = offset << 9;
9f5ccc
+            do_write(fd, 0, gap << 9, seq++, wdata, "gap-filling");
9f5ccc
+
9f5ccc
+            offset += gap;
9f5ccc
+        }
9f5ccc
+
9f5ccc
+        // 8700
9f5ccc
+        which = get_worker(IDLE, -1);
9f5ccc
+        wdata = &all_data.wdata[which];
9f5ccc
+
9f5ccc
+        wdata->offset = (base + 0x42) << 9;
9f5ccc
+        do_write(fd, 1, 62 << 9, seq++, wdata, "!8700");
9f5ccc
+
9f5ccc
+        // 8701
9f5ccc
+        which = get_worker(IDLE, -1);
9f5ccc
+        wdata = &all_data.wdata[which];
9f5ccc
+
9f5ccc
+        wdata->offset = (base + 0x42) << 9;
9f5ccc
+        do_write(fd, 2, 55 << 9, seq++, wdata, "!8701");
9f5ccc
+
9f5ccc
+        // 8702
9f5ccc
+        which = get_worker(async_mode, -1);
9f5ccc
+        wdata = &all_data.wdata[which];
9f5ccc
+
9f5ccc
+        wdata->offset = (base + 0x79) << 9;
9f5ccc
+        do_write(fd, 3, 54 << 9, seq++, wdata, "!8702");
9f5ccc
+
9f5ccc
+        // 8703
9f5ccc
+        which = get_worker(async_mode, -1);
9f5ccc
+        wdata = &all_data.wdata[which];
9f5ccc
+
9f5ccc
+        wdata->offset = (base + 0xaf) << 9;
9f5ccc
+        do_write(fd, 4, 81 << 9, seq++, wdata, "!8703");
9f5ccc
+
9f5ccc
+        // 8704
9f5ccc
+        // this writes both 5s and 6s
9f5ccc
+        // the range of 5s is the one that overwrites 8703
9f5ccc
+
9f5ccc
+        which = get_worker(async_mode, seq - 1);
9f5ccc
+        wdata = &all_data.wdata[which];
9f5ccc
+
9f5ccc
+        memset(wdata->iov.iov_base, 5, 81 << 9);
9f5ccc
+        wdata->offset = (base + 0xaf) << 9;
9f5ccc
+        do_write(fd, -1, 1623 << 9, seq++, wdata, "!8704");
9f5ccc
+
9f5ccc
+        offset = base + 0x706;
9f5ccc
+        base += 0x1000;
9f5ccc
+        if (base >= 0x100000)
9f5ccc
+            break;
9f5ccc
+    }
9f5ccc
+
9f5ccc
+    printf("done!\n");
9f5ccc
+    fflush(stdout);
9f5ccc
+
9f5ccc
+    pthread_mutex_lock(&the_mutex);
9f5ccc
+
9f5ccc
+    while ((all_data.busy & 3) != 0)
9f5ccc
+        pthread_cond_wait(&the_cond, &the_mutex);
9f5ccc
+
9f5ccc
+    pthread_mutex_unlock(&the_mutex);
9f5ccc
+
9f5ccc
+    ret = glfs_close(fd);
9f5ccc
+    assert(ret >= 0);
9f5ccc
+    /*
9f5ccc
+        ret = glfs_fini(glfs);
9f5ccc
+        assert(ret >= 0);
9f5ccc
+    */
9f5ccc
+    return 0;
9f5ccc
+}
9f5ccc
+
9f5ccc
+int
9f5ccc
+main(int argc, char *argv[])
9f5ccc
+{
9f5ccc
+    int ret;
9f5ccc
+    int open_flags = O_RDWR | O_DIRECT | O_TRUNC;
9f5ccc
+    struct glfs_fd *fd;
9f5ccc
+
9f5ccc
+    glfs = glfs_new(argv[1]);
9f5ccc
+    if (!glfs) {
9f5ccc
+        printf("glfs_new!\n");
9f5ccc
+        goto out;
9f5ccc
+    }
9f5ccc
+    ret = glfs_set_volfile_server(glfs, "tcp", "localhost", 24007);
9f5ccc
+    if (ret < 0) {
9f5ccc
+        printf("set_volfile!\n");
9f5ccc
+        goto out;
9f5ccc
+    }
9f5ccc
+    ret = glfs_init(glfs);
9f5ccc
+    if (ret) {
9f5ccc
+        printf("init!\n");
9f5ccc
+        goto out;
9f5ccc
+    }
9f5ccc
+    fd = glfs_open(glfs, argv[2], open_flags);
9f5ccc
+    if (!fd) {
9f5ccc
+        printf("open!\n");
9f5ccc
+        goto out;
9f5ccc
+    }
9f5ccc
+    srand(time(NULL));
9f5ccc
+    return doit(fd);
9f5ccc
+out:
9f5ccc
+    return 1;
9f5ccc
+}
9f5ccc
diff --git a/tests/bugs/write-behind/issue-884.t b/tests/bugs/write-behind/issue-884.t
9f5ccc
new file mode 100755
9f5ccc
index 0000000..2bcf7d1
9f5ccc
--- /dev/null
9f5ccc
+++ b/tests/bugs/write-behind/issue-884.t
9f5ccc
@@ -0,0 +1,40 @@
9f5ccc
+#!/bin/bash
9f5ccc
+
9f5ccc
+. $(dirname $0)/../../include.rc
9f5ccc
+. $(dirname $0)/../../volume.rc
9f5ccc
+
9f5ccc
+# This test tries to detect a race condition in write-behind. It's based on a
9f5ccc
+# reproducer written by Stefan Ring that is able to hit it sometimes. On my
9f5ccc
+# system, it happened around 10% of the runs. This means that if this bug
9f5ccc
+# appears again, this test will fail once every 10 runs. Most probably this
9f5ccc
+# failure will be hidden by the automatic test retry of the testing framework.
9f5ccc
+#
9f5ccc
+# Please, if this test fails, it needs to be analyzed in detail.
9f5ccc
+
9f5ccc
+function run() {
9f5ccc
+    "${@}" >/dev/null
9f5ccc
+}
9f5ccc
+
9f5ccc
+cleanup
9f5ccc
+
9f5ccc
+TEST glusterd
9f5ccc
+TEST pidof glusterd
9f5ccc
+
9f5ccc
+TEST $CLI volume create $V0 $H0:$B0/$V0
9f5ccc
+# This makes it easier to hit the issue
9f5ccc
+TEST $CLI volume set $V0 client-log-level TRACE
9f5ccc
+TEST $CLI volume start $V0
9f5ccc
+
9f5ccc
+TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0
9f5ccc
+
9f5ccc
+build_tester $(dirname $0)/issue-884.c -lgfapi
9f5ccc
+
9f5ccc
+TEST touch $M0/testfile
9f5ccc
+
9f5ccc
+# This program generates a file of 535694336 bytes with a fixed pattern
9f5ccc
+TEST run $(dirname $0)/issue-884 $V0 testfile
9f5ccc
+
9f5ccc
+# This is the md5sum of the expected pattern without corruption
9f5ccc
+EXPECT "ad105f9349345a70fc697632cbb5eec8" echo "$(md5sum $B0/$V0/testfile | awk '{ print $1; }')"
9f5ccc
+
9f5ccc
+cleanup
9f5ccc
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
9f5ccc
index 70e281a..90a0bcf 100644
9f5ccc
--- a/xlators/performance/write-behind/src/write-behind.c
9f5ccc
+++ b/xlators/performance/write-behind/src/write-behind.c
9f5ccc
@@ -1284,14 +1284,14 @@ __wb_pick_unwinds(wb_inode_t *wb_inode, list_head_t *lies)
9f5ccc
 
9f5ccc
         wb_inode->window_current += req->orig_size;
9f5ccc
 
9f5ccc
+        wb_inode->gen++;
9f5ccc
+
9f5ccc
         if (!req->ordering.fulfilled) {
9f5ccc
             /* burden increased */
9f5ccc
             list_add_tail(&req->lie, &wb_inode->liability);
9f5ccc
 
9f5ccc
             req->ordering.lied = 1;
9f5ccc
 
9f5ccc
-            wb_inode->gen++;
9f5ccc
-
9f5ccc
             uuid_utoa_r(req->gfid, gfid);
9f5ccc
             gf_msg_debug(wb_inode->this->name, 0,
9f5ccc
                          "(unique=%" PRIu64
9f5ccc
-- 
9f5ccc
1.8.3.1
9f5ccc