|
|
383d26 |
From 5627c9fb0b86809d42914f1beef9b68226141d4b Mon Sep 17 00:00:00 2001
|
|
|
383d26 |
From: John Snow <jsnow@redhat.com>
|
|
|
383d26 |
Date: Wed, 18 Jul 2018 22:54:58 +0200
|
|
|
383d26 |
Subject: [PATCH 73/89] block/backup: fix fleecing scheme: use serialized
|
|
|
383d26 |
writes
|
|
|
383d26 |
|
|
|
383d26 |
RH-Author: John Snow <jsnow@redhat.com>
|
|
|
383d26 |
Message-id: <20180718225511.14878-23-jsnow@redhat.com>
|
|
|
383d26 |
Patchwork-id: 81396
|
|
|
383d26 |
O-Subject: [RHEL-7.6 qemu-kvm-rhev PATCH 22/35] block/backup: fix fleecing scheme: use serialized writes
|
|
|
383d26 |
Bugzilla: 1207657
|
|
|
383d26 |
RH-Acked-by: Eric Blake <eblake@redhat.com>
|
|
|
383d26 |
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
|
|
383d26 |
RH-Acked-by: Fam Zheng <famz@redhat.com>
|
|
|
383d26 |
|
|
|
383d26 |
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
|
|
|
383d26 |
|
|
|
383d26 |
The fleecing scheme works as follows: we want a kind of temporary snapshot
|
|
|
383d26 |
of the active drive A. We create a temporary image B, with B->backing = A.
|
|
|
383d26 |
Then we start backup(sync=none) from A to B. From this point, B reads
|
|
|
383d26 |
as a point-in-time snapshot of A (A continues to be the active drive,
|
|
|
383d26 |
accepting guest IO).
|
|
|
383d26 |
|
|
|
383d26 |
This scheme needs some additional synchronization between reads from B
|
|
|
383d26 |
and backup COW operations; otherwise, the following situation is
|
|
|
383d26 |
theoretically possible:
|
|
|
383d26 |
|
|
|
383d26 |
(assume B is qcow2, client is NBD client, reading from B)
|
|
|
383d26 |
|
|
|
383d26 |
1. client starts reading and takes the qcow2 mutex in qcow2_co_preadv, and
|
|
|
383d26 |
goes on to load the L2 table (assume cache miss)
|
|
|
383d26 |
|
|
|
383d26 |
2) guest write => backup COW => qcow2 write =>
|
|
|
383d26 |
try to take qcow2 mutex => waiting
|
|
|
383d26 |
|
|
|
383d26 |
3. L2 table is loaded; we see that the cluster is UNALLOCATED, go to
|
|
|
383d26 |
"case QCOW2_CLUSTER_UNALLOCATED" and unlock mutex before
|
|
|
383d26 |
bdrv_co_preadv(bs->backing, ...)
|
|
|
383d26 |
|
|
|
383d26 |
4) aha, mutex unlocked, backup COW continues, and we finally finish
|
|
|
383d26 |
guest write and change cluster in our active disk A
|
|
|
383d26 |
|
|
|
383d26 |
5. actually, do bdrv_co_preadv(bs->backing, ...) and read
|
|
|
383d26 |
_new updated_ data.
|
|
|
383d26 |
|
|
|
383d26 |
To avoid this, let's make backup writes serializing, so that they do not intersect
|
|
|
383d26 |
with reads from B.
|
|
|
383d26 |
|
|
|
383d26 |
Note: we expand the range of handled cases from (sync=none and
|
|
|
383d26 |
B->backing = A) to just (A in backing chain of B), to finally allow
|
|
|
383d26 |
safe reading from B during backup for all cases when A is in the backing chain
|
|
|
383d26 |
of B, i.e. B formally looks like a point-in-time snapshot of A.
|
|
|
383d26 |
|
|
|
383d26 |
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
|
|
|
383d26 |
Reviewed-by: Fam Zheng <famz@redhat.com>
|
|
|
383d26 |
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
|
|
383d26 |
(cherry picked from commit f8d59dfb40bbc6f5aeea57c8aac1e68c1d2454ee)
|
|
|
383d26 |
Signed-off-by: John Snow <jsnow@redhat.com>
|
|
|
383d26 |
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
|
|
|
383d26 |
---
|
|
|
383d26 |
block/backup.c | 20 ++++++++++++++------
|
|
|
383d26 |
1 file changed, 14 insertions(+), 6 deletions(-)
|
|
|
383d26 |
|
|
|
383d26 |
diff --git a/block/backup.c b/block/backup.c
|
|
|
383d26 |
index 369155a..4ba1a6a 100644
|
|
|
383d26 |
--- a/block/backup.c
|
|
|
383d26 |
+++ b/block/backup.c
|
|
|
383d26 |
@@ -47,6 +47,8 @@ typedef struct BackupBlockJob {
|
|
|
383d26 |
HBitmap *copy_bitmap;
|
|
|
383d26 |
bool use_copy_range;
|
|
|
383d26 |
int64_t copy_range_size;
|
|
|
383d26 |
+
|
|
|
383d26 |
+ bool serialize_target_writes;
|
|
|
383d26 |
} BackupBlockJob;
|
|
|
383d26 |
|
|
|
383d26 |
static const BlockJobDriver backup_job_driver;
|
|
|
383d26 |
@@ -102,6 +104,8 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
|
|
|
383d26 |
QEMUIOVector qiov;
|
|
|
383d26 |
BlockBackend *blk = job->common.blk;
|
|
|
383d26 |
int nbytes;
|
|
|
383d26 |
+ int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
|
|
|
383d26 |
+ int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
|
|
|
383d26 |
|
|
|
383d26 |
hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1);
|
|
|
383d26 |
nbytes = MIN(job->cluster_size, job->len - start);
|
|
|
383d26 |
@@ -112,8 +116,7 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
|
|
|
383d26 |
iov.iov_len = nbytes;
|
|
|
383d26 |
qemu_iovec_init_external(&qiov, &iov, 1);
|
|
|
383d26 |
|
|
|
383d26 |
- ret = blk_co_preadv(blk, start, qiov.size, &qiov,
|
|
|
383d26 |
- is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
|
|
|
383d26 |
+ ret = blk_co_preadv(blk, start, qiov.size, &qiov, read_flags);
|
|
|
383d26 |
if (ret < 0) {
|
|
|
383d26 |
trace_backup_do_cow_read_fail(job, start, ret);
|
|
|
383d26 |
if (error_is_read) {
|
|
|
383d26 |
@@ -124,11 +127,11 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
|
|
|
383d26 |
|
|
|
383d26 |
if (qemu_iovec_is_zero(&qiov)) {
|
|
|
383d26 |
ret = blk_co_pwrite_zeroes(job->target, start,
|
|
|
383d26 |
- qiov.size, BDRV_REQ_MAY_UNMAP);
|
|
|
383d26 |
+ qiov.size, write_flags | BDRV_REQ_MAY_UNMAP);
|
|
|
383d26 |
} else {
|
|
|
383d26 |
ret = blk_co_pwritev(job->target, start,
|
|
|
383d26 |
- qiov.size, &qiov,
|
|
|
383d26 |
- job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
|
|
|
383d26 |
+ qiov.size, &qiov, write_flags |
|
|
|
383d26 |
+ (job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0));
|
|
|
383d26 |
}
|
|
|
383d26 |
if (ret < 0) {
|
|
|
383d26 |
trace_backup_do_cow_write_fail(job, start, ret);
|
|
|
383d26 |
@@ -156,6 +159,8 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job,
|
|
|
383d26 |
int nr_clusters;
|
|
|
383d26 |
BlockBackend *blk = job->common.blk;
|
|
|
383d26 |
int nbytes;
|
|
|
383d26 |
+ int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
|
|
|
383d26 |
+ int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
|
|
|
383d26 |
|
|
|
383d26 |
assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size));
|
|
|
383d26 |
nbytes = MIN(job->copy_range_size, end - start);
|
|
|
383d26 |
@@ -163,7 +168,7 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job,
|
|
|
383d26 |
hbitmap_reset(job->copy_bitmap, start / job->cluster_size,
|
|
|
383d26 |
nr_clusters);
|
|
|
383d26 |
ret = blk_co_copy_range(blk, start, job->target, start, nbytes,
|
|
|
383d26 |
- is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0, 0);
|
|
|
383d26 |
+ read_flags, write_flags);
|
|
|
383d26 |
if (ret < 0) {
|
|
|
383d26 |
trace_backup_do_cow_copy_range_fail(job, start, ret);
|
|
|
383d26 |
hbitmap_set(job->copy_bitmap, start / job->cluster_size,
|
|
|
383d26 |
@@ -701,6 +706,9 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
|
|
|
383d26 |
sync_bitmap : NULL;
|
|
|
383d26 |
job->compress = compress;
|
|
|
383d26 |
|
|
|
383d26 |
+ /* Detect image-fleecing (and similar) schemes */
|
|
|
383d26 |
+ job->serialize_target_writes = bdrv_chain_contains(target, bs);
|
|
|
383d26 |
+
|
|
|
383d26 |
/* If there is no backing file on the target, we cannot rely on COW if our
|
|
|
383d26 |
* backup cluster size is smaller than the target cluster size. Even for
|
|
|
383d26 |
* targets with a backing file, try to avoid COW if possible. */
|
|
|
383d26 |
--
|
|
|
383d26 |
1.8.3.1
|
|
|
383d26 |
|