b38b0f
From 273237507842493f78cd492cd54137e828a986ef Mon Sep 17 00:00:00 2001
b38b0f
From: Thomas Huth <thuth@redhat.com>
b38b0f
Date: Fri, 30 Aug 2019 12:56:27 +0100
b38b0f
Subject: [PATCH 09/10] block: posix: Always allocate the first block
b38b0f
b38b0f
RH-Author: Thomas Huth <thuth@redhat.com>
b38b0f
Message-id: <20190830125628.23668-5-thuth@redhat.com>
b38b0f
Patchwork-id: 90210
b38b0f
O-Subject: [RHEL-8.1.0 qemu-kvm PATCH v2 4/5] block: posix: Always allocate the first block
b38b0f
Bugzilla: 1738839
b38b0f
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
b38b0f
RH-Acked-by: Max Reitz <mreitz@redhat.com>
b38b0f
RH-Acked-by: David Hildenbrand <david@redhat.com>
b38b0f
b38b0f
From: Nir Soffer <nirsof@gmail.com>
b38b0f
b38b0f
When creating an image with preallocation "off" or "falloc", the first
b38b0f
block of the image is typically not allocated. When using Gluster
b38b0f
storage backed by XFS filesystem, reading this block using direct I/O
b38b0f
succeeds regardless of request length, fooling alignment detection.
b38b0f
b38b0f
In this case we fallback to a safe value (4096) instead of the optimal
b38b0f
value (512), which may lead to unneeded data copying when aligning
b38b0f
requests.  Allocating the first block avoids the fallback.
b38b0f
b38b0f
Since we allocate the first block even with preallocation=off, we no
b38b0f
longer create images with zero disk size:
b38b0f
b38b0f
    $ ./qemu-img create -f raw test.raw 1g
b38b0f
    Formatting 'test.raw', fmt=raw size=1073741824
b38b0f
b38b0f
    $ ls -lhs test.raw
b38b0f
    4.0K -rw-r--r--. 1 nsoffer nsoffer 1.0G Aug 16 23:48 test.raw
b38b0f
b38b0f
And converting the image requires additional cluster:
b38b0f
b38b0f
    $ ./qemu-img measure -f raw -O qcow2 test.raw
b38b0f
    required size: 458752
b38b0f
    fully allocated size: 1074135040
b38b0f
b38b0f
When using format like vmdk with multiple files per image, we allocate
b38b0f
one block per file:
b38b0f
b38b0f
    $ ./qemu-img create -f vmdk -o subformat=twoGbMaxExtentFlat test.vmdk 4g
b38b0f
    Formatting 'test.vmdk', fmt=vmdk size=4294967296 compat6=off hwversion=undefined subformat=twoGbMaxExtentFlat
b38b0f
b38b0f
    $ ls -lhs test*.vmdk
b38b0f
    4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f001.vmdk
b38b0f
    4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f002.vmdk
b38b0f
    4.0K -rw-r--r--. 1 nsoffer nsoffer  353 Aug 27 03:23 test.vmdk
b38b0f
b38b0f
I did quick performance test for copying disks with qemu-img convert to
b38b0f
new raw target image to Gluster storage with sector size of 512 bytes:
b38b0f
b38b0f
    for i in $(seq 10); do
b38b0f
        rm -f dst.raw
b38b0f
        sleep 10
b38b0f
        time ./qemu-img convert -f raw -O raw -t none -T none src.raw dst.raw
b38b0f
    done
b38b0f
b38b0f
Here is a table comparing the total time spent:
b38b0f
b38b0f
Type    Before(s)   After(s)    Diff(%)
b38b0f
---------------------------------------
b38b0f
real      530.028    469.123      -11.4
b38b0f
user       17.204     10.768      -37.4
b38b0f
sys        17.881      7.011      -60.7
b38b0f
b38b0f
We can see very clear improvement in CPU usage.
b38b0f
b38b0f
Signed-off-by: Nir Soffer <nsoffer@redhat.com>
b38b0f
Message-id: 20190827010528.8818-2-nsoffer@redhat.com
b38b0f
Reviewed-by: Max Reitz <mreitz@redhat.com>
b38b0f
Signed-off-by: Max Reitz <mreitz@redhat.com>
b38b0f
(cherry picked from commit 3f900188502670a15f8915d5363533512ecd035f)
b38b0f
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
b38b0f
b38b0f
Conflicts:
b38b0f
	block/file-posix.c (simple contextual conflict)
b38b0f
	tests/qemu-iotests/059.out (Needed to adapt output a little bit)
b38b0f
b38b0f
Signed-off-by: Thomas Huth <thuth@redhat.com>
b38b0f
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
b38b0f
---
b38b0f
 block/file-posix.c               | 51 ++++++++++++++++++++++++++++++++++++++++
b38b0f
 tests/qemu-iotests/059.out       |  2 +-
b38b0f
 tests/qemu-iotests/150.out       | 11 ---------
b38b0f
 tests/qemu-iotests/150.out.qcow2 | 11 +++++++++
b38b0f
 tests/qemu-iotests/150.out.raw   | 12 ++++++++++
b38b0f
 tests/qemu-iotests/175           | 19 ++++++++++-----
b38b0f
 tests/qemu-iotests/175.out       |  8 +++----
b38b0f
 tests/qemu-iotests/178.out.qcow2 |  4 ++--
b38b0f
 tests/qemu-iotests/221.out       | 12 ++++++----
b38b0f
 tests/qemu-iotests/253.out       | 12 ++++++----
b38b0f
 10 files changed, 110 insertions(+), 32 deletions(-)
b38b0f
 delete mode 100644 tests/qemu-iotests/150.out
b38b0f
 create mode 100644 tests/qemu-iotests/150.out.qcow2
b38b0f
 create mode 100644 tests/qemu-iotests/150.out.raw
b38b0f
b38b0f
diff --git a/block/file-posix.c b/block/file-posix.c
b38b0f
index 84c5a31..dfe0bca 100644
b38b0f
--- a/block/file-posix.c
b38b0f
+++ b/block/file-posix.c
b38b0f
@@ -1605,6 +1605,43 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
b38b0f
     return ret;
b38b0f
 }
b38b0f
 
b38b0f
+/*
b38b0f
+ * Help alignment probing by allocating the first block.
b38b0f
+ *
b38b0f
+ * When reading with direct I/O from unallocated area on Gluster backed by XFS,
b38b0f
+ * reading succeeds regardless of request length. In this case we fallback to
b38b0f
+ * safe alignment which is not optimal. Allocating the first block avoids this
b38b0f
+ * fallback.
b38b0f
+ *
b38b0f
+ * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
b38b0f
+ * request alignment, so we use safe values.
b38b0f
+ *
b38b0f
+ * Returns: 0 on success, -errno on failure. Since this is an optimization,
b38b0f
+ * caller may ignore failures.
b38b0f
+ */
b38b0f
+static int allocate_first_block(int fd, size_t max_size)
b38b0f
+{
b38b0f
+    size_t write_size = (max_size < MAX_BLOCKSIZE)
b38b0f
+        ? BDRV_SECTOR_SIZE
b38b0f
+        : MAX_BLOCKSIZE;
b38b0f
+    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
b38b0f
+    void *buf;
b38b0f
+    ssize_t n;
b38b0f
+    int ret;
b38b0f
+
b38b0f
+    buf = qemu_memalign(max_align, write_size);
b38b0f
+    memset(buf, 0, write_size);
b38b0f
+
b38b0f
+    do {
b38b0f
+        n = pwrite(fd, buf, write_size, 0);
b38b0f
+    } while (n == -1 && errno == EINTR);
b38b0f
+
b38b0f
+    ret = (n == -1) ? -errno : 0;
b38b0f
+
b38b0f
+    qemu_vfree(buf);
b38b0f
+    return ret;
b38b0f
+}
b38b0f
+
b38b0f
 static int handle_aiocb_truncate(RawPosixAIOData *aiocb)
b38b0f
 {
b38b0f
     int result = 0;
b38b0f
@@ -1642,6 +1679,17 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb)
b38b0f
                 /* posix_fallocate() doesn't set errno. */
b38b0f
                 error_setg_errno(errp, -result,
b38b0f
                                  "Could not preallocate new data");
b38b0f
+            } else if (current_length == 0) {
b38b0f
+                /*
b38b0f
+                 * posix_fallocate() uses fallocate() if the filesystem
b38b0f
+                 * supports it, or fallback to manually writing zeroes. If
b38b0f
+                 * fallocate() was used, unaligned reads from the fallocated
b38b0f
+                 * area in raw_probe_alignment() will succeed, hence we need to
b38b0f
+                 * allocate the first block.
b38b0f
+                 *
b38b0f
+                 * Optimize future alignment probing; ignore failures.
b38b0f
+                 */
b38b0f
+                allocate_first_block(fd, offset);
b38b0f
             }
b38b0f
         } else {
b38b0f
             result = 0;
b38b0f
@@ -1700,6 +1748,9 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb)
b38b0f
         if (ftruncate(fd, offset) != 0) {
b38b0f
             result = -errno;
b38b0f
             error_setg_errno(errp, -result, "Could not resize file");
b38b0f
+        } else if (current_length == 0 && offset > current_length) {
b38b0f
+            /* Optimize future alignment probing; ignore failures. */
b38b0f
+            allocate_first_block(fd, offset);
b38b0f
         }
b38b0f
         return result;
b38b0f
     default:
b38b0f
diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
b38b0f
index f6dce79..19cd591 100644
b38b0f
--- a/tests/qemu-iotests/059.out
b38b0f
+++ b/tests/qemu-iotests/059.out
b38b0f
@@ -27,7 +27,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824000 subformat=twoGbMax
b38b0f
 image: TEST_DIR/t.vmdk
b38b0f
 file format: vmdk
b38b0f
 virtual size: 1.0T (1073741824000 bytes)
b38b0f
-disk size: 16K
b38b0f
+disk size: 2.0M
b38b0f
 Format specific information:
b38b0f
     cid: XXXXXXXX
b38b0f
     parent cid: XXXXXXXX
b38b0f
diff --git a/tests/qemu-iotests/150.out b/tests/qemu-iotests/150.out
b38b0f
deleted file mode 100644
b38b0f
index 2a54e8d..0000000
b38b0f
--- a/tests/qemu-iotests/150.out
b38b0f
+++ /dev/null
b38b0f
@@ -1,11 +0,0 @@
b38b0f
-QA output created by 150
b38b0f
-
b38b0f
-=== Mapping sparse conversion ===
b38b0f
-
b38b0f
-Offset          Length          File
b38b0f
-
b38b0f
-=== Mapping non-sparse conversion ===
b38b0f
-
b38b0f
-Offset          Length          File
b38b0f
-0               0x100000        TEST_DIR/t.IMGFMT
b38b0f
-*** done
b38b0f
diff --git a/tests/qemu-iotests/150.out.qcow2 b/tests/qemu-iotests/150.out.qcow2
b38b0f
new file mode 100644
b38b0f
index 0000000..2a54e8d
b38b0f
--- /dev/null
b38b0f
+++ b/tests/qemu-iotests/150.out.qcow2
b38b0f
@@ -0,0 +1,11 @@
b38b0f
+QA output created by 150
b38b0f
+
b38b0f
+=== Mapping sparse conversion ===
b38b0f
+
b38b0f
+Offset          Length          File
b38b0f
+
b38b0f
+=== Mapping non-sparse conversion ===
b38b0f
+
b38b0f
+Offset          Length          File
b38b0f
+0               0x100000        TEST_DIR/t.IMGFMT
b38b0f
+*** done
b38b0f
diff --git a/tests/qemu-iotests/150.out.raw b/tests/qemu-iotests/150.out.raw
b38b0f
new file mode 100644
b38b0f
index 0000000..3cdc772
b38b0f
--- /dev/null
b38b0f
+++ b/tests/qemu-iotests/150.out.raw
b38b0f
@@ -0,0 +1,12 @@
b38b0f
+QA output created by 150
b38b0f
+
b38b0f
+=== Mapping sparse conversion ===
b38b0f
+
b38b0f
+Offset          Length          File
b38b0f
+0               0x1000          TEST_DIR/t.IMGFMT
b38b0f
+
b38b0f
+=== Mapping non-sparse conversion ===
b38b0f
+
b38b0f
+Offset          Length          File
b38b0f
+0               0x100000        TEST_DIR/t.IMGFMT
b38b0f
+*** done
b38b0f
diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175
b38b0f
index 2e37c9a..b3b7712 100755
b38b0f
--- a/tests/qemu-iotests/175
b38b0f
+++ b/tests/qemu-iotests/175
b38b0f
@@ -38,14 +38,16 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
b38b0f
 # the file size.  This function hides the resulting difference in the
b38b0f
 # stat -c '%b' output.
b38b0f
 # Parameter 1: Number of blocks an empty file occupies
b38b0f
-# Parameter 2: Image size in bytes
b38b0f
+# Parameter 2: Minimal number of blocks in an image
b38b0f
+# Parameter 3: Image size in bytes
b38b0f
 _filter_blocks()
b38b0f
 {
b38b0f
     extra_blocks=$1
b38b0f
-    img_size=$2
b38b0f
+    min_blocks=$2
b38b0f
+    img_size=$3
b38b0f
 
b38b0f
-    sed -e "s/blocks=$extra_blocks\\(\$\\|[^0-9]\\)/nothing allocated/" \
b38b0f
-        -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/everything allocated/"
b38b0f
+    sed -e "s/blocks=$min_blocks\\(\$\\|[^0-9]\\)/min allocation/" \
b38b0f
+        -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/max allocation/"
b38b0f
 }
b38b0f
 
b38b0f
 # get standard environment, filters and checks
b38b0f
@@ -61,16 +63,21 @@ size=$((1 * 1024 * 1024))
b38b0f
 touch "$TEST_DIR/empty"
b38b0f
 extra_blocks=$(stat -c '%b' "$TEST_DIR/empty")
b38b0f
 
b38b0f
+# We always write the first byte; check how many blocks this filesystem
b38b0f
+# allocates to match empty image alloation.
b38b0f
+printf "\0" > "$TEST_DIR/empty"
b38b0f
+min_blocks=$(stat -c '%b' "$TEST_DIR/empty")
b38b0f
+
b38b0f
 echo
b38b0f
 echo "== creating image with default preallocation =="
b38b0f
 _make_test_img $size | _filter_imgfmt
b38b0f
-stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
b38b0f
+stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size
b38b0f
 
b38b0f
 for mode in off full falloc; do
b38b0f
     echo
b38b0f
     echo "== creating image with preallocation $mode =="
b38b0f
     IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt
b38b0f
-    stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
b38b0f
+    stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size
b38b0f
 done
b38b0f
 
b38b0f
 # success, all done
b38b0f
diff --git a/tests/qemu-iotests/175.out b/tests/qemu-iotests/175.out
b38b0f
index 6d9a5ed..263e521 100644
b38b0f
--- a/tests/qemu-iotests/175.out
b38b0f
+++ b/tests/qemu-iotests/175.out
b38b0f
@@ -2,17 +2,17 @@ QA output created by 175
b38b0f
 
b38b0f
 == creating image with default preallocation ==
b38b0f
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
b38b0f
-size=1048576, nothing allocated
b38b0f
+size=1048576, min allocation
b38b0f
 
b38b0f
 == creating image with preallocation off ==
b38b0f
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=off
b38b0f
-size=1048576, nothing allocated
b38b0f
+size=1048576, min allocation
b38b0f
 
b38b0f
 == creating image with preallocation full ==
b38b0f
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=full
b38b0f
-size=1048576, everything allocated
b38b0f
+size=1048576, max allocation
b38b0f
 
b38b0f
 == creating image with preallocation falloc ==
b38b0f
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=falloc
b38b0f
-size=1048576, everything allocated
b38b0f
+size=1048576, max allocation
b38b0f
  *** done
b38b0f
diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2
b38b0f
index d42d4a4..12edc3d 100644
b38b0f
--- a/tests/qemu-iotests/178.out.qcow2
b38b0f
+++ b/tests/qemu-iotests/178.out.qcow2
b38b0f
@@ -96,7 +96,7 @@ converted image file size in bytes: 196608
b38b0f
 == raw input image with data (human) ==
b38b0f
 
b38b0f
 Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
b38b0f
-required size: 393216
b38b0f
+required size: 458752
b38b0f
 fully allocated size: 1074135040
b38b0f
 wrote 512/512 bytes at offset 512
b38b0f
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
b38b0f
@@ -240,7 +240,7 @@ converted image file size in bytes: 196608
b38b0f
 
b38b0f
 Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
b38b0f
 {
b38b0f
-    "required": 393216,
b38b0f
+    "required": 458752,
b38b0f
     "fully-allocated": 1074135040
b38b0f
 }
b38b0f
 wrote 512/512 bytes at offset 512
b38b0f
diff --git a/tests/qemu-iotests/221.out b/tests/qemu-iotests/221.out
b38b0f
index 9f9dd52..dca024a 100644
b38b0f
--- a/tests/qemu-iotests/221.out
b38b0f
+++ b/tests/qemu-iotests/221.out
b38b0f
@@ -3,14 +3,18 @@ QA output created by 221
b38b0f
 === Check mapping of unaligned raw image ===
b38b0f
 
b38b0f
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65537
b38b0f
-[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
b38b0f
-[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
b38b0f
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
b38b0f
+{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
b38b0f
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
b38b0f
+{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
b38b0f
 wrote 1/1 bytes at offset 65536
b38b0f
 1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
b38b0f
-[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
b38b0f
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
b38b0f
+{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
b38b0f
 { "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
b38b0f
 { "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
b38b0f
-[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
b38b0f
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
b38b0f
+{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
b38b0f
 { "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
b38b0f
 { "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
b38b0f
 *** done
b38b0f
diff --git a/tests/qemu-iotests/253.out b/tests/qemu-iotests/253.out
b38b0f
index 607c0ba..3d08b30 100644
b38b0f
--- a/tests/qemu-iotests/253.out
b38b0f
+++ b/tests/qemu-iotests/253.out
b38b0f
@@ -3,12 +3,16 @@ QA output created by 253
b38b0f
 === Check mapping of unaligned raw image ===
b38b0f
 
b38b0f
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048575
b38b0f
-[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
b38b0f
-[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
b38b0f
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
b38b0f
+{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
b38b0f
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
b38b0f
+{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
b38b0f
 wrote 65535/65535 bytes at offset 983040
b38b0f
 63.999 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
b38b0f
-[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
b38b0f
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
b38b0f
+{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
b38b0f
 { "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
b38b0f
-[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
b38b0f
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
b38b0f
+{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
b38b0f
 { "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
b38b0f
 *** done
b38b0f
-- 
b38b0f
1.8.3.1
b38b0f