Blame 0002-block-posix-Always-allocate-the-first-block.patch

9db63c
From: Nir Soffer <nirsof@gmail.com>
9db63c
Date: Tue, 27 Aug 2019 04:05:27 +0300
9db63c
Subject: [PATCH] block: posix: Always allocate the first block
9db63c
9db63c
When creating an image with preallocation "off" or "falloc", the first
9db63c
block of the image is typically not allocated. When using Gluster
9db63c
storage backed by XFS filesystem, reading this block using direct I/O
9db63c
succeeds regardless of request length, fooling alignment detection.
9db63c
9db63c
In this case we fall back to a safe value (4096) instead of the optimal
9db63c
value (512), which may lead to unneeded data copying when aligning
9db63c
requests.  Allocating the first block avoids the fallback.
9db63c
9db63c
Since we allocate the first block even with preallocation=off, we no
9db63c
longer create images with zero disk size:
9db63c
9db63c
    $ ./qemu-img create -f raw test.raw 1g
9db63c
    Formatting 'test.raw', fmt=raw size=1073741824
9db63c
9db63c
    $ ls -lhs test.raw
9db63c
    4.0K -rw-r--r--. 1 nsoffer nsoffer 1.0G Aug 16 23:48 test.raw
9db63c
9db63c
And converting the image requires an additional cluster:
9db63c
9db63c
    $ ./qemu-img measure -f raw -O qcow2 test.raw
9db63c
    required size: 458752
9db63c
    fully allocated size: 1074135040
9db63c
9db63c
When using a format like vmdk with multiple files per image, we allocate
9db63c
one block per file:
9db63c
9db63c
    $ ./qemu-img create -f vmdk -o subformat=twoGbMaxExtentFlat test.vmdk 4g
9db63c
    Formatting 'test.vmdk', fmt=vmdk size=4294967296 compat6=off hwversion=undefined subformat=twoGbMaxExtentFlat
9db63c
9db63c
    $ ls -lhs test*.vmdk
9db63c
    4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f001.vmdk
9db63c
    4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f002.vmdk
9db63c
    4.0K -rw-r--r--. 1 nsoffer nsoffer  353 Aug 27 03:23 test.vmdk
9db63c
9db63c
I did a quick performance test for copying disks with qemu-img convert to
9db63c
a new raw target image on Gluster storage with a sector size of 512 bytes:
9db63c
9db63c
    for i in $(seq 10); do
9db63c
        rm -f dst.raw
9db63c
        sleep 10
9db63c
        time ./qemu-img convert -f raw -O raw -t none -T none src.raw dst.raw
9db63c
    done
9db63c
9db63c
Here is a table comparing the total time spent:
9db63c
9db63c
Type    Before(s)   After(s)    Diff(%)
9db63c
---------------------------------------
9db63c
real      530.028    469.123      -11.4
9db63c
user       17.204     10.768      -37.4
9db63c
sys        17.881      7.011      -60.7
9db63c
9db63c
We can see a very clear improvement in CPU usage.
9db63c
9db63c
Signed-off-by: Nir Soffer <nsoffer@redhat.com>
9db63c
Message-id: 20190827010528.8818-2-nsoffer@redhat.com
9db63c
Reviewed-by: Max Reitz <mreitz@redhat.com>
9db63c
Signed-off-by: Max Reitz <mreitz@redhat.com>
9db63c
(cherry picked from commit 3a20013fbb26d2a1bd11ef148eefdb1508783787)
9db63c
---
9db63c
 block/file-posix.c               | 51 ++++++++++++++++++++++++++++++++
9db63c
 tests/qemu-iotests/059.out       |  2 +-
9db63c
 tests/qemu-iotests/150.out       | 11 -------
9db63c
 tests/qemu-iotests/150.out.qcow2 | 11 +++++++
9db63c
 tests/qemu-iotests/150.out.raw   | 12 ++++++++
9db63c
 tests/qemu-iotests/175           | 19 ++++++++----
9db63c
 tests/qemu-iotests/175.out       |  8 ++---
9db63c
 tests/qemu-iotests/178.out.qcow2 |  4 +--
9db63c
 tests/qemu-iotests/221.out       | 12 +++++---
9db63c
 tests/qemu-iotests/253.out       | 12 +++++---
9db63c
 10 files changed, 110 insertions(+), 32 deletions(-)
9db63c
 delete mode 100644 tests/qemu-iotests/150.out
9db63c
 create mode 100644 tests/qemu-iotests/150.out.qcow2
9db63c
 create mode 100644 tests/qemu-iotests/150.out.raw
9db63c
9db63c
diff --git a/block/file-posix.c b/block/file-posix.c
9db63c
index b8b4dad553..8ea98896ce 100644
9db63c
--- a/block/file-posix.c
9db63c
+++ b/block/file-posix.c
9db63c
@@ -1749,6 +1749,43 @@ static int handle_aiocb_discard(void *opaque)
9db63c
     return ret;
9db63c
 }
9db63c
 
9db63c
+/*
9db63c
+ * Help alignment probing by allocating the first block.
9db63c
+ *
9db63c
+ * When reading with direct I/O from unallocated area on Gluster backed by XFS,
9db63c
+ * reading succeeds regardless of request length. In this case we fallback to
9db63c
+ * safe alignment which is not optimal. Allocating the first block avoids this
9db63c
+ * fallback.
9db63c
+ *
9db63c
+ * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
9db63c
+ * request alignment, so we use safe values.
9db63c
+ *
9db63c
+ * Returns: 0 on success, -errno on failure. Since this is an optimization,
9db63c
+ * caller may ignore failures.
9db63c
+ */
9db63c
+static int allocate_first_block(int fd, size_t max_size)
9db63c
+{
9db63c
+    size_t write_size = (max_size < MAX_BLOCKSIZE)
9db63c
+        ? BDRV_SECTOR_SIZE
9db63c
+        : MAX_BLOCKSIZE;
9db63c
+    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
9db63c
+    void *buf;
9db63c
+    ssize_t n;
9db63c
+    int ret;
9db63c
+
9db63c
+    buf = qemu_memalign(max_align, write_size);
9db63c
+    memset(buf, 0, write_size);
9db63c
+
9db63c
+    do {
9db63c
+        n = pwrite(fd, buf, write_size, 0);
9db63c
+    } while (n == -1 && errno == EINTR);
9db63c
+
9db63c
+    ret = (n == -1) ? -errno : 0;
9db63c
+
9db63c
+    qemu_vfree(buf);
9db63c
+    return ret;
9db63c
+}
9db63c
+
9db63c
 static int handle_aiocb_truncate(void *opaque)
9db63c
 {
9db63c
     RawPosixAIOData *aiocb = opaque;
9db63c
@@ -1788,6 +1825,17 @@ static int handle_aiocb_truncate(void *opaque)
9db63c
                 /* posix_fallocate() doesn't set errno. */
9db63c
                 error_setg_errno(errp, -result,
9db63c
                                  "Could not preallocate new data");
9db63c
+            } else if (current_length == 0) {
9db63c
+                /*
9db63c
+                 * posix_fallocate() uses fallocate() if the filesystem
9db63c
+                 * supports it, or fallback to manually writing zeroes. If
9db63c
+                 * fallocate() was used, unaligned reads from the fallocated
9db63c
+                 * area in raw_probe_alignment() will succeed, hence we need to
9db63c
+                 * allocate the first block.
9db63c
+                 *
9db63c
+                 * Optimize future alignment probing; ignore failures.
9db63c
+                 */
9db63c
+                allocate_first_block(fd, offset);
9db63c
             }
9db63c
         } else {
9db63c
             result = 0;
9db63c
@@ -1849,6 +1897,9 @@ static int handle_aiocb_truncate(void *opaque)
9db63c
         if (ftruncate(fd, offset) != 0) {
9db63c
             result = -errno;
9db63c
             error_setg_errno(errp, -result, "Could not resize file");
9db63c
+        } else if (current_length == 0 && offset > current_length) {
9db63c
+            /* Optimize future alignment probing; ignore failures. */
9db63c
+            allocate_first_block(fd, offset);
9db63c
         }
9db63c
         return result;
9db63c
     default:
9db63c
diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
9db63c
index 4fab42a28c..fe3f861f3c 100644
9db63c
--- a/tests/qemu-iotests/059.out
9db63c
+++ b/tests/qemu-iotests/059.out
9db63c
@@ -27,7 +27,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824000 subformat=twoGbMax
9db63c
 image: TEST_DIR/t.vmdk
9db63c
 file format: vmdk
9db63c
 virtual size: 0.977 TiB (1073741824000 bytes)
9db63c
-disk size: 16 KiB
9db63c
+disk size: 1.97 MiB
9db63c
 Format specific information:
9db63c
     cid: XXXXXXXX
9db63c
     parent cid: XXXXXXXX
9db63c
diff --git a/tests/qemu-iotests/150.out b/tests/qemu-iotests/150.out
9db63c
deleted file mode 100644
9db63c
index 2a54e8dcfa..0000000000
9db63c
--- a/tests/qemu-iotests/150.out
9db63c
+++ /dev/null
9db63c
@@ -1,11 +0,0 @@
9db63c
-QA output created by 150
9db63c
-
9db63c
-=== Mapping sparse conversion ===
9db63c
-
9db63c
-Offset          Length          File
9db63c
-
9db63c
-=== Mapping non-sparse conversion ===
9db63c
-
9db63c
-Offset          Length          File
9db63c
-0               0x100000        TEST_DIR/t.IMGFMT
9db63c
-*** done
9db63c
diff --git a/tests/qemu-iotests/150.out.qcow2 b/tests/qemu-iotests/150.out.qcow2
9db63c
new file mode 100644
9db63c
index 0000000000..2a54e8dcfa
9db63c
--- /dev/null
9db63c
+++ b/tests/qemu-iotests/150.out.qcow2
9db63c
@@ -0,0 +1,11 @@
9db63c
+QA output created by 150
9db63c
+
9db63c
+=== Mapping sparse conversion ===
9db63c
+
9db63c
+Offset          Length          File
9db63c
+
9db63c
+=== Mapping non-sparse conversion ===
9db63c
+
9db63c
+Offset          Length          File
9db63c
+0               0x100000        TEST_DIR/t.IMGFMT
9db63c
+*** done
9db63c
diff --git a/tests/qemu-iotests/150.out.raw b/tests/qemu-iotests/150.out.raw
9db63c
new file mode 100644
9db63c
index 0000000000..3cdc7727a5
9db63c
--- /dev/null
9db63c
+++ b/tests/qemu-iotests/150.out.raw
9db63c
@@ -0,0 +1,12 @@
9db63c
+QA output created by 150
9db63c
+
9db63c
+=== Mapping sparse conversion ===
9db63c
+
9db63c
+Offset          Length          File
9db63c
+0               0x1000          TEST_DIR/t.IMGFMT
9db63c
+
9db63c
+=== Mapping non-sparse conversion ===
9db63c
+
9db63c
+Offset          Length          File
9db63c
+0               0x100000        TEST_DIR/t.IMGFMT
9db63c
+*** done
9db63c
diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175
9db63c
index 51e62c8276..7ba28b3c1b 100755
9db63c
--- a/tests/qemu-iotests/175
9db63c
+++ b/tests/qemu-iotests/175
9db63c
@@ -37,14 +37,16 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
9db63c
 # the file size.  This function hides the resulting difference in the
9db63c
 # stat -c '%b' output.
9db63c
 # Parameter 1: Number of blocks an empty file occupies
9db63c
-# Parameter 2: Image size in bytes
9db63c
+# Parameter 2: Minimal number of blocks in an image
9db63c
+# Parameter 3: Image size in bytes
9db63c
 _filter_blocks()
9db63c
 {
9db63c
     extra_blocks=$1
9db63c
-    img_size=$2
9db63c
+    min_blocks=$2
9db63c
+    img_size=$3
9db63c
 
9db63c
-    sed -e "s/blocks=$extra_blocks\\(\$\\|[^0-9]\\)/nothing allocated/" \
9db63c
-        -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/everything allocated/"
9db63c
+    sed -e "s/blocks=$min_blocks\\(\$\\|[^0-9]\\)/min allocation/" \
9db63c
+        -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/max allocation/"
9db63c
 }
9db63c
 
9db63c
 # get standard environment, filters and checks
9db63c
@@ -60,16 +62,21 @@ size=$((1 * 1024 * 1024))
9db63c
 touch "$TEST_DIR/empty"
9db63c
 extra_blocks=$(stat -c '%b' "$TEST_DIR/empty")
9db63c
 
9db63c
+# We always write the first byte; check how many blocks this filesystem
9db63c
+# allocates to match empty image alloation.
9db63c
+printf "\0" > "$TEST_DIR/empty"
9db63c
+min_blocks=$(stat -c '%b' "$TEST_DIR/empty")
9db63c
+
9db63c
 echo
9db63c
 echo "== creating image with default preallocation =="
9db63c
 _make_test_img $size | _filter_imgfmt
9db63c
-stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
9db63c
+stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size
9db63c
 
9db63c
 for mode in off full falloc; do
9db63c
     echo
9db63c
     echo "== creating image with preallocation $mode =="
9db63c
     IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt
9db63c
-    stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
9db63c
+    stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size
9db63c
 done
9db63c
 
9db63c
 # success, all done
9db63c
diff --git a/tests/qemu-iotests/175.out b/tests/qemu-iotests/175.out
9db63c
index 6d9a5ed84e..263e521262 100644
9db63c
--- a/tests/qemu-iotests/175.out
9db63c
+++ b/tests/qemu-iotests/175.out
9db63c
@@ -2,17 +2,17 @@ QA output created by 175
9db63c
 
9db63c
 == creating image with default preallocation ==
9db63c
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
9db63c
-size=1048576, nothing allocated
9db63c
+size=1048576, min allocation
9db63c
 
9db63c
 == creating image with preallocation off ==
9db63c
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=off
9db63c
-size=1048576, nothing allocated
9db63c
+size=1048576, min allocation
9db63c
 
9db63c
 == creating image with preallocation full ==
9db63c
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=full
9db63c
-size=1048576, everything allocated
9db63c
+size=1048576, max allocation
9db63c
 
9db63c
 == creating image with preallocation falloc ==
9db63c
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=falloc
9db63c
-size=1048576, everything allocated
9db63c
+size=1048576, max allocation
9db63c
  *** done
9db63c
diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2
9db63c
index 55a8dc926f..9e7d8c44df 100644
9db63c
--- a/tests/qemu-iotests/178.out.qcow2
9db63c
+++ b/tests/qemu-iotests/178.out.qcow2
9db63c
@@ -101,7 +101,7 @@ converted image file size in bytes: 196608
9db63c
 == raw input image with data (human) ==
9db63c
 
9db63c
 Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
9db63c
-required size: 393216
9db63c
+required size: 458752
9db63c
 fully allocated size: 1074135040
9db63c
 wrote 512/512 bytes at offset 512
9db63c
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
9db63c
@@ -257,7 +257,7 @@ converted image file size in bytes: 196608
9db63c
 
9db63c
 Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
9db63c
 {
9db63c
-    "required": 393216,
9db63c
+    "required": 458752,
9db63c
     "fully-allocated": 1074135040
9db63c
 }
9db63c
 wrote 512/512 bytes at offset 512
9db63c
diff --git a/tests/qemu-iotests/221.out b/tests/qemu-iotests/221.out
9db63c
index 9f9dd52bb0..dca024a0c3 100644
9db63c
--- a/tests/qemu-iotests/221.out
9db63c
+++ b/tests/qemu-iotests/221.out
9db63c
@@ -3,14 +3,18 @@ QA output created by 221
9db63c
 === Check mapping of unaligned raw image ===
9db63c
 
9db63c
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65537
9db63c
-[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
9db63c
-[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
9db63c
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
9db63c
+{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
9db63c
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
9db63c
+{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
9db63c
 wrote 1/1 bytes at offset 65536
9db63c
 1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
9db63c
-[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
9db63c
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
9db63c
+{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
9db63c
 { "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
9db63c
 { "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
9db63c
-[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
9db63c
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
9db63c
+{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
9db63c
 { "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
9db63c
 { "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
9db63c
 *** done
9db63c
diff --git a/tests/qemu-iotests/253.out b/tests/qemu-iotests/253.out
9db63c
index 607c0baa0b..3d08b305d7 100644
9db63c
--- a/tests/qemu-iotests/253.out
9db63c
+++ b/tests/qemu-iotests/253.out
9db63c
@@ -3,12 +3,16 @@ QA output created by 253
9db63c
 === Check mapping of unaligned raw image ===
9db63c
 
9db63c
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048575
9db63c
-[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
9db63c
-[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
9db63c
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
9db63c
+{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
9db63c
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
9db63c
+{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
9db63c
 wrote 65535/65535 bytes at offset 983040
9db63c
 63.999 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
9db63c
-[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
9db63c
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
9db63c
+{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
9db63c
 { "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
9db63c
-[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
9db63c
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
9db63c
+{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
9db63c
 { "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
9db63c
 *** done