Tree - rpms/glusterfs - CentOS Git server

rpms / glusterfs

Blame SOURCES/0362-write-behind-fix-data-corruption.patch

Blob History Raw

		1df6c8	`From 48f6929590157d9a1697e11c02441207afdc1bed Mon Sep 17 00:00:00 2001`
		1df6c8	`From: Xavi Hernandez <xhernandez@redhat.com>`
		1df6c8	`Date: Fri, 27 Mar 2020 23:56:15 +0100`
		1df6c8	`Subject: [PATCH 362/362] write-behind: fix data corruption`
		1df6c8
		1df6c8	`There was a bug in write-behind that allowed a previous completed write`
		1df6c8	`to overwrite the overlapping region of data from a future write.`
		1df6c8
		1df6c8	`Suppose we want to send three writes (W1, W2 and W3). W1 and W2 are`
		1df6c8	`sequential, and W3 writes at the same offset of W2:`
		1df6c8
		1df6c8	`W2.offset = W3.offset = W1.offset + W1.size`
		1df6c8
		1df6c8	`Both W1 and W2 are sent in parallel. W3 is only sent after W2 completes.`
		1df6c8	`So W3 should always overwrite the overlapping part of W2.`
		1df6c8
		1df6c8	`Suppose write-behind processes the requests from 2 concurrent threads:`
		1df6c8
		1df6c8	`Thread 1 Thread 2`
		1df6c8
		1df6c8	`<received W1>`
		1df6c8	`<received W2>`
		1df6c8	`wb_enqueue_tempted(W1)`
		1df6c8	`/* W1 is assigned gen X */`
		1df6c8	`wb_enqueue_tempted(W2)`
		1df6c8	`/* W2 is assigned gen X */`
		1df6c8
		1df6c8	`wb_process_queue()`
		1df6c8	`__wb_preprocess_winds()`
		1df6c8	`/* W1 and W2 are sequential and all`
		1df6c8	`* other requisites are met to merge`
		1df6c8	`* both requests. */`
		1df6c8	`__wb_collapse_small_writes(W1, W2)`
		1df6c8	`__wb_fulfill_request(W2)`
		1df6c8
		1df6c8	`__wb_pick_unwinds() -> W2`
		1df6c8	`/* In this case, since the request is`
		1df6c8	`* already fulfilled, wb_inode->gen`
		1df6c8	`* is not updated. */`
		1df6c8
		1df6c8	`wb_do_unwinds()`
		1df6c8	`STACK_UNWIND(W2)`
		1df6c8
		1df6c8	`/* The application has received the`
		1df6c8	`* result of W2, so it can send W3. */`
		1df6c8	`<received W3>`
		1df6c8
		1df6c8	`wb_enqueue_tempted(W3)`
		1df6c8	`/* W3 is assigned gen X */`
		1df6c8
		1df6c8	`wb_process_queue()`
		1df6c8	`/* Here we have W1 (which contains`
		1df6c8	`* the conflicting W2) and W3 with`
		1df6c8	`* same gen, so they are interpreted`
		1df6c8	`* as concurrent writes that do not`
		1df6c8	`* conflict. */`
		1df6c8	`__wb_pick_winds() -> W3`
		1df6c8
		1df6c8	`wb_do_winds()`
		1df6c8	`STACK_WIND(W3)`
		1df6c8
		1df6c8	`wb_process_queue()`
		1df6c8	`/* Eventually W1 will be`
		1df6c8	`* ready to be sent */`
		1df6c8	`__wb_pick_winds() -> W1`
		1df6c8	`__wb_pick_unwinds() -> W1`
		1df6c8	`/* Here wb_inode->gen is`
		1df6c8	`* incremented. */`
		1df6c8
		1df6c8	`wb_do_unwinds()`
		1df6c8	`STACK_UNWIND(W1)`
		1df6c8
		1df6c8	`wb_do_winds()`
		1df6c8	`STACK_WIND(W1)`
		1df6c8
		1df6c8	`So, as we can see, W3 is sent before W1, which shouldn't happen.`
		1df6c8
		1df6c8	`The problem is that wb_inode->gen is only incremented for requests that`
		1df6c8	`have not been fulfilled but, after a merge, the request is marked as`
		1df6c8	`fulfilled even though it has not been sent to the brick. This allows`
		1df6c8	`that future requests are assigned to the same generation, which could`
		1df6c8	`be internally reordered.`
		1df6c8
		1df6c8	`Solution:`
		1df6c8
		1df6c8	`Increment wb_inode->gen before any unwind, even if it's for a fulfilled`
		1df6c8	`request.`
		1df6c8
		1df6c8	`Special thanks to Stefan Ring for writing a reproducer that has been`
		1df6c8	`crucial to identify the issue.`
		1df6c8
		1df6c8	`Upstream patch:`
		1df6c8	`> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24263`
		1df6c8	`> Change-Id: Id4ab0f294a09aca9a863ecaeef8856474662ab45`
		1df6c8	`> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>`
		1df6c8	`> Fixes: #884`
		1df6c8
		1df6c8	`Change-Id: Id4ab0f294a09aca9a863ecaeef8856474662ab45`
		1df6c8	`Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>`
		1df6c8	`BUG: 1819059`
		1df6c8	`Reviewed-on: https://code.engineering.redhat.com/gerrit/196250`
		1df6c8	`Tested-by: RHGS Build Bot <nigelb@redhat.com>`
		1df6c8	`Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>`
		1df6c8	`---`
		1df6c8	`tests/bugs/write-behind/issue-884.c \| 267 +++++++++++++++++++++`
		1df6c8	`tests/bugs/write-behind/issue-884.t \| 40 +++`
		1df6c8	`.../performance/write-behind/src/write-behind.c \| 4 +-`
		1df6c8	`3 files changed, 309 insertions(+), 2 deletions(-)`
		1df6c8	`create mode 100644 tests/bugs/write-behind/issue-884.c`
		1df6c8	`create mode 100755 tests/bugs/write-behind/issue-884.t`
		1df6c8
		1df6c8	`diff --git a/tests/bugs/write-behind/issue-884.c b/tests/bugs/write-behind/issue-884.c`
		1df6c8	`new file mode 100644`
		1df6c8	`index 0000000..e9c33b3`
		1df6c8	`--- /dev/null`
		1df6c8	`+++ b/tests/bugs/write-behind/issue-884.c`
		1df6c8	`@@ -0,0 +1,267 @@`
		1df6c8	`+`
		1df6c8	`+#define _GNU_SOURCE`
		1df6c8	`+`
		1df6c8	`+#include <stdlib.h>`
		1df6c8	`+#include <stdio.h>`
		1df6c8	`+#include <string.h>`
		1df6c8	`+#include <time.h>`
		1df6c8	`+#include <assert.h>`
		1df6c8	`+#include <errno.h>`
		1df6c8	`+#include <sys/types.h>`
		1df6c8	`+#include <sys/stat.h>`
		1df6c8	`+#include <pthread.h>`
		1df6c8	`+`
		1df6c8	`+#include <glusterfs/api/glfs.h>`
		1df6c8	`+`
		1df6c8	`+/* Based on a reproducer by Stefan Ring. It seems to be quite sensible to any`
		1df6c8	`+ * timing modification, so the code has been maintained as is, only with minor`
		1df6c8	`+ * changes. */`
		1df6c8	`+`
		1df6c8	`+struct glfs *glfs;`
		1df6c8	`+`
		1df6c8	`+pthread_mutex_t the_mutex = PTHREAD_MUTEX_INITIALIZER;`
		1df6c8	`+pthread_cond_t the_cond = PTHREAD_COND_INITIALIZER;`
		1df6c8	`+`
		1df6c8	`+typedef struct _my_aiocb {`
		1df6c8	`+ int64_t size;`
		1df6c8	`+ volatile int64_t seq;`
		1df6c8	`+ int which;`
		1df6c8	`+} my_aiocb;`
		1df6c8	`+`
		1df6c8	`+typedef struct _worker_data {`
		1df6c8	`+ my_aiocb cb;`
		1df6c8	`+ struct iovec iov;`
		1df6c8	`+ int64_t offset;`
		1df6c8	`+} worker_data;`
		1df6c8	`+`
		1df6c8	`+typedef struct {`
		1df6c8	`+ worker_data wdata[2];`
		1df6c8	`+`
		1df6c8	`+ volatile unsigned busy;`
		1df6c8	`+} all_data_t;`
		1df6c8	`+`
		1df6c8	`+all_data_t all_data;`
		1df6c8	`+`
		1df6c8	`+static void`
		1df6c8	`+completion_fnc(struct glfs_fd fd, ssize_t ret, struct glfs_stat pre,`
		1df6c8	`+ struct glfs_stat post, void arg)`
		1df6c8	`+{`
		1df6c8	`+ void *the_thread;`
		1df6c8	`+ my_aiocb cb = (my_aiocb )arg;`
		1df6c8	`+ long seq = cb->seq;`
		1df6c8	`+`
		1df6c8	`+ assert(ret == cb->size);`
		1df6c8	`+`
		1df6c8	`+ pthread_mutex_lock(&the_mutex);`
		1df6c8	`+ pthread_cond_broadcast(&the_cond);`
		1df6c8	`+`
		1df6c8	`+ all_data.busy &= ~(1 << cb->which);`
		1df6c8	`+ cb->seq = -1;`
		1df6c8	`+`
		1df6c8	`+ the_thread = (void *)pthread_self();`
		1df6c8	`+ printf("worker %d is done from thread %p, seq %ld!\n", cb->which,`
		1df6c8	`+ the_thread, seq);`
		1df6c8	`+`
		1df6c8	`+ pthread_mutex_unlock(&the_mutex);`
		1df6c8	`+}`
		1df6c8	`+`
		1df6c8	`+static void`
		1df6c8	`+init_wdata(worker_data *data, int which)`
		1df6c8	`+{`
		1df6c8	`+ data->cb.which = which;`
		1df6c8	`+ data->cb.seq = -1;`
		1df6c8	`+`
		1df6c8	`+ data->iov.iov_base = malloc(1024 * 1024);`
		1df6c8	`+ memset(data->iov.iov_base, 6,`
		1df6c8	`+ 1024 * 1024); /* tail part never overwritten */`
		1df6c8	`+}`
		1df6c8	`+`
		1df6c8	`+static void`
		1df6c8	`+init()`
		1df6c8	`+{`
		1df6c8	`+ all_data.busy = 0;`
		1df6c8	`+`
		1df6c8	`+ init_wdata(&all_data.wdata[0], 0);`
		1df6c8	`+ init_wdata(&all_data.wdata[1], 1);`
		1df6c8	`+}`
		1df6c8	`+`
		1df6c8	`+static void`
		1df6c8	`+do_write(struct glfs_fd *fd, int content, int size, int64_t seq,`
		1df6c8	`+ worker_data wdata, const char name)`
		1df6c8	`+{`
		1df6c8	`+ int ret;`
		1df6c8	`+`
		1df6c8	`+ wdata->cb.size = size;`
		1df6c8	`+ wdata->cb.seq = seq;`
		1df6c8	`+`
		1df6c8	`+ if (content >= 0)`
		1df6c8	`+ memset(wdata->iov.iov_base, content, size);`
		1df6c8	`+ wdata->iov.iov_len = size;`
		1df6c8	`+`
		1df6c8	`+ pthread_mutex_lock(&the_mutex);`
		1df6c8	`+ printf("(%d) dispatching write \"%s\", offset %lx, len %x, seq %ld\n",`
		1df6c8	`+ wdata->cb.which, name, (long)wdata->offset, size, (long)seq);`
		1df6c8	`+ pthread_mutex_unlock(&the_mutex);`
		1df6c8	`+ ret = glfs_pwritev_async(fd, &wdata->iov, 1, wdata->offset, 0,`
		1df6c8	`+ completion_fnc, &wdata->cb);`
		1df6c8	`+ assert(ret >= 0);`
		1df6c8	`+}`
		1df6c8	`+`
		1df6c8	`+#define IDLE 0 // both workers must be idle`
		1df6c8	`+#define ANY 1 // use any worker, other one may be busy`
		1df6c8	`+`
		1df6c8	`+int`
		1df6c8	`+get_worker(int waitfor, int64_t excl_seq)`
		1df6c8	`+{`
		1df6c8	`+ int which;`
		1df6c8	`+`
		1df6c8	`+ pthread_mutex_lock(&the_mutex);`
		1df6c8	`+`
		1df6c8	`+ while (waitfor == IDLE && (all_data.busy & 3) != 0 \|\|`
		1df6c8	`+ waitfor == ANY &&`
		1df6c8	`+ ((all_data.busy & 3) == 3 \|\|`
		1df6c8	`+ excl_seq >= 0 && (all_data.wdata[0].cb.seq == excl_seq \|\|`
		1df6c8	`+ all_data.wdata[1].cb.seq == excl_seq)))`
		1df6c8	`+ pthread_cond_wait(&the_cond, &the_mutex);`
		1df6c8	`+`
		1df6c8	`+ if (!(all_data.busy & 1))`
		1df6c8	`+ which = 0;`
		1df6c8	`+ else`
		1df6c8	`+ which = 1;`
		1df6c8	`+`
		1df6c8	`+ all_data.busy \|= (1 << which);`
		1df6c8	`+`
		1df6c8	`+ pthread_mutex_unlock(&the_mutex);`
		1df6c8	`+`
		1df6c8	`+ return which;`
		1df6c8	`+}`
		1df6c8	`+`
		1df6c8	`+static int`
		1df6c8	`+doit(struct glfs_fd *fd)`
		1df6c8	`+{`
		1df6c8	`+ int ret;`
		1df6c8	`+ int64_t seq = 0;`
		1df6c8	`+ int64_t offset = 0; // position in file, in blocks`
		1df6c8	`+ int64_t base = 0x1000; // where to place the data, in blocks`
		1df6c8	`+`
		1df6c8	`+ int async_mode = ANY;`
		1df6c8	`+`
		1df6c8	`+ init();`
		1df6c8	`+`
		1df6c8	`+ for (;;) {`
		1df6c8	`+ int which;`
		1df6c8	`+ worker_data *wdata;`
		1df6c8	`+`
		1df6c8	`+ // for growing to the first offset`
		1df6c8	`+ for (;;) {`
		1df6c8	`+ int gap = base + 0x42 - offset;`
		1df6c8	`+ if (!gap)`
		1df6c8	`+ break;`
		1df6c8	`+ if (gap > 80)`
		1df6c8	`+ gap = 80;`
		1df6c8	`+`
		1df6c8	`+ which = get_worker(IDLE, -1);`
		1df6c8	`+ wdata = &all_data.wdata[which];`
		1df6c8	`+`
		1df6c8	`+ wdata->offset = offset << 9;`
		1df6c8	`+ do_write(fd, 0, gap << 9, seq++, wdata, "gap-filling");`
		1df6c8	`+`
		1df6c8	`+ offset += gap;`
		1df6c8	`+ }`
		1df6c8	`+`
		1df6c8	`+ // 8700`
		1df6c8	`+ which = get_worker(IDLE, -1);`
		1df6c8	`+ wdata = &all_data.wdata[which];`
		1df6c8	`+`
		1df6c8	`+ wdata->offset = (base + 0x42) << 9;`
		1df6c8	`+ do_write(fd, 1, 62 << 9, seq++, wdata, "!8700");`
		1df6c8	`+`
		1df6c8	`+ // 8701`
		1df6c8	`+ which = get_worker(IDLE, -1);`
		1df6c8	`+ wdata = &all_data.wdata[which];`
		1df6c8	`+`
		1df6c8	`+ wdata->offset = (base + 0x42) << 9;`
		1df6c8	`+ do_write(fd, 2, 55 << 9, seq++, wdata, "!8701");`
		1df6c8	`+`
		1df6c8	`+ // 8702`
		1df6c8	`+ which = get_worker(async_mode, -1);`
		1df6c8	`+ wdata = &all_data.wdata[which];`
		1df6c8	`+`
		1df6c8	`+ wdata->offset = (base + 0x79) << 9;`
		1df6c8	`+ do_write(fd, 3, 54 << 9, seq++, wdata, "!8702");`
		1df6c8	`+`
		1df6c8	`+ // 8703`
		1df6c8	`+ which = get_worker(async_mode, -1);`
		1df6c8	`+ wdata = &all_data.wdata[which];`
		1df6c8	`+`
		1df6c8	`+ wdata->offset = (base + 0xaf) << 9;`
		1df6c8	`+ do_write(fd, 4, 81 << 9, seq++, wdata, "!8703");`
		1df6c8	`+`
		1df6c8	`+ // 8704`
		1df6c8	`+ // this writes both 5s and 6s`
		1df6c8	`+ // the range of 5s is the one that overwrites 8703`
		1df6c8	`+`
		1df6c8	`+ which = get_worker(async_mode, seq - 1);`
		1df6c8	`+ wdata = &all_data.wdata[which];`
		1df6c8	`+`
		1df6c8	`+ memset(wdata->iov.iov_base, 5, 81 << 9);`
		1df6c8	`+ wdata->offset = (base + 0xaf) << 9;`
		1df6c8	`+ do_write(fd, -1, 1623 << 9, seq++, wdata, "!8704");`
		1df6c8	`+`
		1df6c8	`+ offset = base + 0x706;`
		1df6c8	`+ base += 0x1000;`
		1df6c8	`+ if (base >= 0x100000)`
		1df6c8	`+ break;`
		1df6c8	`+ }`
		1df6c8	`+`
		1df6c8	`+ printf("done!\n");`
		1df6c8	`+ fflush(stdout);`
		1df6c8	`+`
		1df6c8	`+ pthread_mutex_lock(&the_mutex);`
		1df6c8	`+`
		1df6c8	`+ while ((all_data.busy & 3) != 0)`
		1df6c8	`+ pthread_cond_wait(&the_cond, &the_mutex);`
		1df6c8	`+`
		1df6c8	`+ pthread_mutex_unlock(&the_mutex);`
		1df6c8	`+`
		1df6c8	`+ ret = glfs_close(fd);`
		1df6c8	`+ assert(ret >= 0);`
		1df6c8	`+ /*`
		1df6c8	`+ ret = glfs_fini(glfs);`
		1df6c8	`+ assert(ret >= 0);`
		1df6c8	`+ */`
		1df6c8	`+ return 0;`
		1df6c8	`+}`
		1df6c8	`+`
		1df6c8	`+int`
		1df6c8	`+main(int argc, char *argv[])`
		1df6c8	`+{`
		1df6c8	`+ int ret;`
		1df6c8	`+ int open_flags = O_RDWR \| O_DIRECT \| O_TRUNC;`
		1df6c8	`+ struct glfs_fd *fd;`
		1df6c8	`+`
		1df6c8	`+ glfs = glfs_new(argv[1]);`
		1df6c8	`+ if (!glfs) {`
		1df6c8	`+ printf("glfs_new!\n");`
		1df6c8	`+ goto out;`
		1df6c8	`+ }`
		1df6c8	`+ ret = glfs_set_volfile_server(glfs, "tcp", "localhost", 24007);`
		1df6c8	`+ if (ret < 0) {`
		1df6c8	`+ printf("set_volfile!\n");`
		1df6c8	`+ goto out;`
		1df6c8	`+ }`
		1df6c8	`+ ret = glfs_init(glfs);`
		1df6c8	`+ if (ret) {`
		1df6c8	`+ printf("init!\n");`
		1df6c8	`+ goto out;`
		1df6c8	`+ }`
		1df6c8	`+ fd = glfs_open(glfs, argv[2], open_flags);`
		1df6c8	`+ if (!fd) {`
		1df6c8	`+ printf("open!\n");`
		1df6c8	`+ goto out;`
		1df6c8	`+ }`
		1df6c8	`+ srand(time(NULL));`
		1df6c8	`+ return doit(fd);`
		1df6c8	`+out:`
		1df6c8	`+ return 1;`
		1df6c8	`+}`
		1df6c8	`diff --git a/tests/bugs/write-behind/issue-884.t b/tests/bugs/write-behind/issue-884.t`
		1df6c8	`new file mode 100755`
		1df6c8	`index 0000000..2bcf7d1`
		1df6c8	`--- /dev/null`
		1df6c8	`+++ b/tests/bugs/write-behind/issue-884.t`
		1df6c8	`@@ -0,0 +1,40 @@`
		1df6c8	`+#!/bin/bash`
		1df6c8	`+`
		1df6c8	`+. $(dirname $0)/../../include.rc`
		1df6c8	`+. $(dirname $0)/../../volume.rc`
		1df6c8	`+`
		1df6c8	`+# This test tries to detect a race condition in write-behind. It's based on a`
		1df6c8	`+# reproducer written by Stefan Ring that is able to hit it sometimes. On my`
		1df6c8	`+# system, it happened around 10% of the runs. This means that if this bug`
		1df6c8	`+# appears again, this test will fail once every 10 runs. Most probably this`
		1df6c8	`+# failure will be hidden by the automatic test retry of the testing framework.`
		1df6c8	`+#`
		1df6c8	`+# Please, if this test fails, it needs to be analyzed in detail.`
		1df6c8	`+`
		1df6c8	`+function run() {`
		1df6c8	`+ "${@}" >/dev/null`
		1df6c8	`+}`
		1df6c8	`+`
		1df6c8	`+cleanup`
		1df6c8	`+`
		1df6c8	`+TEST glusterd`
		1df6c8	`+TEST pidof glusterd`
		1df6c8	`+`
		1df6c8	`+TEST $CLI volume create $V0 $H0:$B0/$V0`
		1df6c8	`+# This makes it easier to hit the issue`
		1df6c8	`+TEST $CLI volume set $V0 client-log-level TRACE`
		1df6c8	`+TEST $CLI volume start $V0`
		1df6c8	`+`
		1df6c8	`+TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0`
		1df6c8	`+`
		1df6c8	`+build_tester $(dirname $0)/issue-884.c -lgfapi`
		1df6c8	`+`
		1df6c8	`+TEST touch $M0/testfile`
		1df6c8	`+`
		1df6c8	`+# This program generates a file of 535694336 bytes with a fixed pattern`
		1df6c8	`+TEST run $(dirname $0)/issue-884 $V0 testfile`
		1df6c8	`+`
		1df6c8	`+# This is the md5sum of the expected pattern without corruption`
		1df6c8	`+EXPECT "ad105f9349345a70fc697632cbb5eec8" echo "$(md5sum $B0/$V0/testfile \| awk '{ print $1; }')"`
		1df6c8	`+`
		1df6c8	`+cleanup`
		1df6c8	`diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c`
		1df6c8	`index 70e281a..90a0bcf 100644`
		1df6c8	`--- a/xlators/performance/write-behind/src/write-behind.c`
		1df6c8	`+++ b/xlators/performance/write-behind/src/write-behind.c`
		1df6c8	`@@ -1284,14 +1284,14 @@ __wb_pick_unwinds(wb_inode_t wb_inode, list_head_t lies)`
		1df6c8
		1df6c8	`wb_inode->window_current += req->orig_size;`
		1df6c8
		1df6c8	`+ wb_inode->gen++;`
		1df6c8	`+`
		1df6c8	`if (!req->ordering.fulfilled) {`
		1df6c8	`/* burden increased */`
		1df6c8	`list_add_tail(&req->lie, &wb_inode->liability);`
		1df6c8
		1df6c8	`req->ordering.lied = 1;`
		1df6c8
		1df6c8	`- wb_inode->gen++;`
		1df6c8	`-`
		1df6c8	`uuid_utoa_r(req->gfid, gfid);`
		1df6c8	`gf_msg_debug(wb_inode->this->name, 0,`
		1df6c8	`"(unique=%" PRIu64`
		1df6c8	`--`
		1df6c8	`1.8.3.1`
		1df6c8

rpms / glusterfs

Source Code

Blame SOURCES/0362-write-behind-fix-data-corruption.patch