|
|
9db63c |
From: Nir Soffer <nirsof@gmail.com>
|
|
|
9db63c |
Date: Tue, 13 Aug 2019 21:21:03 +0300
|
|
|
9db63c |
Subject: [PATCH] file-posix: Handle undetectable alignment
|
|
|
9db63c |
|
|
|
9db63c |
In some cases buf_align or request_alignment cannot be detected:
|
|
|
9db63c |
|
|
|
9db63c |
1. With Gluster, buf_align cannot be detected since the actual I/O is
|
|
|
9db63c |
done on Gluster server, and qemu buffer alignment does not matter.
|
|
|
9db63c |
Since we don't have alignment requirement, buf_align=1 is the best
|
|
|
9db63c |
value.
|
|
|
9db63c |
|
|
|
9db63c |
2. With local XFS filesystem, buf_align cannot be detected if reading
|
|
|
9db63c |
from unallocated area. In this we must align the buffer, but we don't
|
|
|
9db63c |
know what is the correct size. Using the wrong alignment results in
|
|
|
9db63c |
I/O error.
|
|
|
9db63c |
|
|
|
9db63c |
3. With Gluster backed by XFS, request_alignment cannot be detected if
|
|
|
9db63c |
reading from unallocated area. In this case we need to use the
|
|
|
9db63c |
correct alignment, and failing to do so results in I/O errors.
|
|
|
9db63c |
|
|
|
9db63c |
4. With NFS, the server does not use direct I/O, so both buf_align cannot
|
|
|
9db63c |
be detected. In this case we don't need any alignment so we can use
|
|
|
9db63c |
buf_align=1 and request_alignment=1.
|
|
|
9db63c |
|
|
|
9db63c |
These cases seems to work when storage sector size is 512 bytes, because
|
|
|
9db63c |
the current code starts checking align=512. If the check succeeds
|
|
|
9db63c |
because alignment cannot be detected we use 512. But this does not work
|
|
|
9db63c |
for storage with 4k sector size.
|
|
|
9db63c |
|
|
|
9db63c |
To determine if we can detect the alignment, we probe first with
|
|
|
9db63c |
align=1. If probing succeeds, maybe there are no alignment requirement
|
|
|
9db63c |
(cases 1, 4) or we are probing unallocated area (cases 2, 3). Since we
|
|
|
9db63c |
don't have any way to tell, we treat this as undetectable alignment. If
|
|
|
9db63c |
probing with align=1 fails with EINVAL, but probing with one of the
|
|
|
9db63c |
expected alignments succeeds, we know that we found a working alignment.
|
|
|
9db63c |
|
|
|
9db63c |
Practically the alignment requirements are the same for buffer
|
|
|
9db63c |
alignment, buffer length, and offset in file. So in case we cannot
|
|
|
9db63c |
detect buf_align, we can use request alignment. If we cannot detect
|
|
|
9db63c |
request alignment, we can fallback to a safe value. To use this logic,
|
|
|
9db63c |
we probe first request alignment instead of buf_align.
|
|
|
9db63c |
|
|
|
9db63c |
Here is a table showing the behaviour with current code (the value in
|
|
|
9db63c |
parenthesis is the optimal value).
|
|
|
9db63c |
|
|
|
9db63c |
Case Sector buf_align (opt) request_alignment (opt) result
|
|
|
9db63c |
======================================================================
|
|
|
9db63c |
1 512 512 (1) 512 (512) OK
|
|
|
9db63c |
1 4096 512 (1) 4096 (4096) FAIL
|
|
|
9db63c |
----------------------------------------------------------------------
|
|
|
9db63c |
2 512 512 (512) 512 (512) OK
|
|
|
9db63c |
2 4096 512 (4096) 4096 (4096) FAIL
|
|
|
9db63c |
----------------------------------------------------------------------
|
|
|
9db63c |
3 512 512 (1) 512 (512) OK
|
|
|
9db63c |
3 4096 512 (1) 512 (4096) FAIL
|
|
|
9db63c |
----------------------------------------------------------------------
|
|
|
9db63c |
4 512 512 (1) 512 (1) OK
|
|
|
9db63c |
4 4096 512 (1) 512 (1) OK
|
|
|
9db63c |
|
|
|
9db63c |
Same cases with this change:
|
|
|
9db63c |
|
|
|
9db63c |
Case Sector buf_align (opt) request_alignment (opt) result
|
|
|
9db63c |
======================================================================
|
|
|
9db63c |
1 512 512 (1) 512 (512) OK
|
|
|
9db63c |
1 4096 4096 (1) 4096 (4096) OK
|
|
|
9db63c |
----------------------------------------------------------------------
|
|
|
9db63c |
2 512 512 (512) 512 (512) OK
|
|
|
9db63c |
2 4096 4096 (4096) 4096 (4096) OK
|
|
|
9db63c |
----------------------------------------------------------------------
|
|
|
9db63c |
3 512 4096 (1) 4096 (512) OK
|
|
|
9db63c |
3 4096 4096 (1) 4096 (4096) OK
|
|
|
9db63c |
----------------------------------------------------------------------
|
|
|
9db63c |
4 512 4096 (1) 4096 (1) OK
|
|
|
9db63c |
4 4096 4096 (1) 4096 (1) OK
|
|
|
9db63c |
|
|
|
9db63c |
I tested that provisioning VMs and copying disks on local XFS and
|
|
|
9db63c |
Gluster with 4k bytes sector size work now, resolving bugs [1],[2].
|
|
|
9db63c |
I tested also on XFS, NFS, Gluster with 512 bytes sector size.
|
|
|
9db63c |
|
|
|
9db63c |
[1] https://bugzilla.redhat.com/1737256
|
|
|
9db63c |
[2] https://bugzilla.redhat.com/1738657
|
|
|
9db63c |
|
|
|
9db63c |
Signed-off-by: Nir Soffer <nsoffer@redhat.com>
|
|
|
9db63c |
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
|
|
9db63c |
(cherry picked from commit a6b257a08e3d72219f03e461a52152672fec0612)
|
|
|
9db63c |
---
|
|
|
9db63c |
block/file-posix.c | 36 +++++++++++++++++++++++++-----------
|
|
|
9db63c |
1 file changed, 25 insertions(+), 11 deletions(-)
|
|
|
9db63c |
|
|
|
9db63c |
diff --git a/block/file-posix.c b/block/file-posix.c
|
|
|
9db63c |
index 4479cc7ab4..b8b4dad553 100644
|
|
|
9db63c |
--- a/block/file-posix.c
|
|
|
9db63c |
+++ b/block/file-posix.c
|
|
|
9db63c |
@@ -323,6 +323,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
|
|
|
9db63c |
BDRVRawState *s = bs->opaque;
|
|
|
9db63c |
char *buf;
|
|
|
9db63c |
size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
|
|
|
9db63c |
+ size_t alignments[] = {1, 512, 1024, 2048, 4096};
|
|
|
9db63c |
|
|
|
9db63c |
/* For SCSI generic devices the alignment is not really used.
|
|
|
9db63c |
With buffered I/O, we don't have any restrictions. */
|
|
|
9db63c |
@@ -349,25 +350,38 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
|
|
|
9db63c |
}
|
|
|
9db63c |
#endif
|
|
|
9db63c |
|
|
|
9db63c |
- /* If we could not get the sizes so far, we can only guess them */
|
|
|
9db63c |
- if (!s->buf_align) {
|
|
|
9db63c |
+ /*
|
|
|
9db63c |
+ * If we could not get the sizes so far, we can only guess them. First try
|
|
|
9db63c |
+ * to detect request alignment, since it is more likely to succeed. Then
|
|
|
9db63c |
+ * try to detect buf_align, which cannot be detected in some cases (e.g.
|
|
|
9db63c |
+ * Gluster). If buf_align cannot be detected, we fallback to the value of
|
|
|
9db63c |
+ * request_alignment.
|
|
|
9db63c |
+ */
|
|
|
9db63c |
+
|
|
|
9db63c |
+ if (!bs->bl.request_alignment) {
|
|
|
9db63c |
+ int i;
|
|
|
9db63c |
size_t align;
|
|
|
9db63c |
- buf = qemu_memalign(max_align, 2 * max_align);
|
|
|
9db63c |
- for (align = 512; align <= max_align; align <<= 1) {
|
|
|
9db63c |
- if (raw_is_io_aligned(fd, buf + align, max_align)) {
|
|
|
9db63c |
- s->buf_align = align;
|
|
|
9db63c |
+ buf = qemu_memalign(max_align, max_align);
|
|
|
9db63c |
+ for (i = 0; i < ARRAY_SIZE(alignments); i++) {
|
|
|
9db63c |
+ align = alignments[i];
|
|
|
9db63c |
+ if (raw_is_io_aligned(fd, buf, align)) {
|
|
|
9db63c |
+ /* Fallback to safe value. */
|
|
|
9db63c |
+ bs->bl.request_alignment = (align != 1) ? align : max_align;
|
|
|
9db63c |
break;
|
|
|
9db63c |
}
|
|
|
9db63c |
}
|
|
|
9db63c |
qemu_vfree(buf);
|
|
|
9db63c |
}
|
|
|
9db63c |
|
|
|
9db63c |
- if (!bs->bl.request_alignment) {
|
|
|
9db63c |
+ if (!s->buf_align) {
|
|
|
9db63c |
+ int i;
|
|
|
9db63c |
size_t align;
|
|
|
9db63c |
- buf = qemu_memalign(s->buf_align, max_align);
|
|
|
9db63c |
- for (align = 512; align <= max_align; align <<= 1) {
|
|
|
9db63c |
- if (raw_is_io_aligned(fd, buf, align)) {
|
|
|
9db63c |
- bs->bl.request_alignment = align;
|
|
|
9db63c |
+ buf = qemu_memalign(max_align, 2 * max_align);
|
|
|
9db63c |
+ for (i = 0; i < ARRAY_SIZE(alignments); i++) {
|
|
|
9db63c |
+ align = alignments[i];
|
|
|
9db63c |
+ if (raw_is_io_aligned(fd, buf + align, max_align)) {
|
|
|
9db63c |
+ /* Fallback to request_aligment. */
|
|
|
9db63c |
+ s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
|
|
|
9db63c |
break;
|
|
|
9db63c |
}
|
|
|
9db63c |
}
|