b38b0f
From 5935958fc4eb9934b1493486a69f0f571e7da112 Mon Sep 17 00:00:00 2001
b38b0f
From: Thomas Huth <thuth@redhat.com>
b38b0f
Date: Fri, 30 Aug 2019 12:56:24 +0100
b38b0f
Subject: [PATCH 06/10] file-posix: Handle undetectable alignment
b38b0f
b38b0f
RH-Author: Thomas Huth <thuth@redhat.com>
b38b0f
Message-id: <20190830125628.23668-2-thuth@redhat.com>
b38b0f
Patchwork-id: 90209
b38b0f
O-Subject: [RHEL-8.1.0 qemu-kvm PATCH v2 1/5] file-posix: Handle undetectable alignment
b38b0f
Bugzilla: 1738839
b38b0f
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
b38b0f
RH-Acked-by: Max Reitz <mreitz@redhat.com>
b38b0f
RH-Acked-by: David Hildenbrand <david@redhat.com>
b38b0f
b38b0f
In some cases buf_align or request_alignment cannot be detected:
b38b0f
b38b0f
1. With Gluster, buf_align cannot be detected since the actual I/O is
b38b0f
   done on Gluster server, and qemu buffer alignment does not matter.
b38b0f
   Since we don't have alignment requirement, buf_align=1 is the best
b38b0f
   value.
b38b0f
b38b0f
2. With local XFS filesystem, buf_align cannot be detected if reading
b38b0f
   from unallocated area. In this we must align the buffer, but we don't
b38b0f
   know what is the correct size. Using the wrong alignment results in
b38b0f
   I/O error.
b38b0f
b38b0f
3. With Gluster backed by XFS, request_alignment cannot be detected if
b38b0f
   reading from unallocated area. In this case we need to use the
b38b0f
   correct alignment, and failing to do so results in I/O errors.
b38b0f
b38b0f
4. With NFS, the server does not use direct I/O, so both buf_align cannot
b38b0f
   be detected. In this case we don't need any alignment so we can use
b38b0f
   buf_align=1 and request_alignment=1.
b38b0f
b38b0f
These cases seems to work when storage sector size is 512 bytes, because
b38b0f
the current code starts checking align=512. If the check succeeds
b38b0f
because alignment cannot be detected we use 512. But this does not work
b38b0f
for storage with 4k sector size.
b38b0f
b38b0f
To determine if we can detect the alignment, we probe first with
b38b0f
align=1. If probing succeeds, maybe there are no alignment requirement
b38b0f
(cases 1, 4) or we are probing unallocated area (cases 2, 3). Since we
b38b0f
don't have any way to tell, we treat this as undetectable alignment. If
b38b0f
probing with align=1 fails with EINVAL, but probing with one of the
b38b0f
expected alignments succeeds, we know that we found a working alignment.
b38b0f
b38b0f
Practically the alignment requirements are the same for buffer
b38b0f
alignment, buffer length, and offset in file. So in case we cannot
b38b0f
detect buf_align, we can use request alignment. If we cannot detect
b38b0f
request alignment, we can fallback to a safe value. To use this logic,
b38b0f
we probe first request alignment instead of buf_align.
b38b0f
b38b0f
Here is a table showing the behaviour with current code (the value in
b38b0f
parenthesis is the optimal value).
b38b0f
b38b0f
Case    Sector    buf_align (opt)   request_alignment (opt)     result
b38b0f
b38b0f
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
b38b0f
---
b38b0f
 block/file-posix.c | 36 +++++++++++++++++++++++++-----------
b38b0f
 1 file changed, 25 insertions(+), 11 deletions(-)
b38b0f
b38b0f
diff --git a/block/file-posix.c b/block/file-posix.c
b38b0f
index 4b404e4..84c5a31 100644
b38b0f
--- a/block/file-posix.c
b38b0f
+++ b/block/file-posix.c
b38b0f
@@ -324,6 +324,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
b38b0f
     BDRVRawState *s = bs->opaque;
b38b0f
     char *buf;
b38b0f
     size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
b38b0f
+    size_t alignments[] = {1, 512, 1024, 2048, 4096};
b38b0f
 
b38b0f
     /* For SCSI generic devices the alignment is not really used.
b38b0f
        With buffered I/O, we don't have any restrictions. */
b38b0f
@@ -350,25 +351,38 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
b38b0f
     }
b38b0f
 #endif
b38b0f
 
b38b0f
-    /* If we could not get the sizes so far, we can only guess them */
b38b0f
-    if (!s->buf_align) {
b38b0f
+    /*
b38b0f
+     * If we could not get the sizes so far, we can only guess them. First try
b38b0f
+     * to detect request alignment, since it is more likely to succeed. Then
b38b0f
+     * try to detect buf_align, which cannot be detected in some cases (e.g.
b38b0f
+     * Gluster). If buf_align cannot be detected, we fallback to the value of
b38b0f
+     * request_alignment.
b38b0f
+     */
b38b0f
+
b38b0f
+    if (!bs->bl.request_alignment) {
b38b0f
+        int i;
b38b0f
         size_t align;
b38b0f
-        buf = qemu_memalign(max_align, 2 * max_align);
b38b0f
-        for (align = 512; align <= max_align; align <<= 1) {
b38b0f
-            if (raw_is_io_aligned(fd, buf + align, max_align)) {
b38b0f
-                s->buf_align = align;
b38b0f
+        buf = qemu_memalign(max_align, max_align);
b38b0f
+        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
b38b0f
+            align = alignments[i];
b38b0f
+            if (raw_is_io_aligned(fd, buf, align)) {
b38b0f
+                /* Fallback to safe value. */
b38b0f
+                bs->bl.request_alignment = (align != 1) ? align : max_align;
b38b0f
                 break;
b38b0f
             }
b38b0f
         }
b38b0f
         qemu_vfree(buf);
b38b0f
     }
b38b0f
 
b38b0f
-    if (!bs->bl.request_alignment) {
b38b0f
+    if (!s->buf_align) {
b38b0f
+        int i;
b38b0f
         size_t align;
b38b0f
-        buf = qemu_memalign(s->buf_align, max_align);
b38b0f
-        for (align = 512; align <= max_align; align <<= 1) {
b38b0f
-            if (raw_is_io_aligned(fd, buf, align)) {
b38b0f
-                bs->bl.request_alignment = align;
b38b0f
+        buf = qemu_memalign(max_align, 2 * max_align);
b38b0f
+        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
b38b0f
+            align = alignments[i];
b38b0f
+            if (raw_is_io_aligned(fd, buf + align, max_align)) {
b38b0f
+                /* Fallback to request_aligment. */
b38b0f
+                s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
b38b0f
                 break;
b38b0f
             }
b38b0f
         }
b38b0f
-- 
b38b0f
1.8.3.1
b38b0f