0a122b
From de29d97cab7b97d776cbb1077079eff0bfc46186 Mon Sep 17 00:00:00 2001
0a122b
From: Kevin Wolf <kwolf@redhat.com>
0a122b
Date: Tue, 29 Nov 2011 12:42:20 +0100
0a122b
Subject: [PATCH 10/37] raw: Probe required direct I/O alignment
0a122b
0a122b
Message-id: <1392117622-28812-11-git-send-email-kwolf@redhat.com>
0a122b
Patchwork-id: 57175
0a122b
O-Subject: [RHEL-7.0 qemu-kvm PATCH v2 10/37] raw: Probe required direct I/O alignment
0a122b
Bugzilla: 748906
0a122b
RH-Acked-by: Laszlo Ersek <lersek@redhat.com>
0a122b
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
0a122b
RH-Acked-by: Max Reitz <mreitz@redhat.com>
0a122b
0a122b
From: Paolo Bonzini <pbonzini@redhat.com>
0a122b
0a122b
Add a bs->request_alignment field that contains the required
0a122b
offset/length alignment for I/O requests and fill it in the raw block
0a122b
drivers. Use ioctls if possible, else see what alignment it takes for
0a122b
O_DIRECT to succeed.
0a122b
0a122b
While at it, also expose the memory alignment requirements, which may be
0a122b
(and in practice are) different from the disk alignment requirements.
0a122b
0a122b
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
0a122b
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
0a122b
Reviewed-by: Max Reitz <mreitz@redhat.com>
0a122b
(cherry picked from commit c25f53b06eba1575d5d0e92a0132455c97825b83)
0a122b
0a122b
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
0a122b
---
0a122b
 block.c                   |   3 ++
0a122b
 block/raw-posix.c         | 102 ++++++++++++++++++++++++++++++++++++++--------
0a122b
 block/raw-win32.c         |  41 +++++++++++++++++++
0a122b
 include/block/block_int.h |   3 ++
0a122b
 4 files changed, 132 insertions(+), 17 deletions(-)
0a122b
---
0a122b
 block.c                   |    3 +
0a122b
 block/raw-posix.c         |  102 +++++++++++++++++++++++++++++++++++++-------
0a122b
 block/raw-win32.c         |   41 ++++++++++++++++++
0a122b
 include/block/block_int.h |    3 +
0a122b
 4 files changed, 132 insertions(+), 17 deletions(-)
0a122b
0a122b
diff --git a/block.c b/block.c
0a122b
index f2102bc..40a4a34 100644
0a122b
--- a/block.c
0a122b
+++ b/block.c
0a122b
@@ -780,6 +780,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
0a122b
 
0a122b
     bs->open_flags = flags;
0a122b
     bs->guest_block_size = 512;
0a122b
+    bs->request_alignment = 512;
0a122b
     bs->zero_beyond_eof = true;
0a122b
     open_flags = bdrv_open_flags(bs, flags);
0a122b
     bs->read_only = !(open_flags & BDRV_O_RDWR);
0a122b
@@ -845,6 +846,8 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
0a122b
     }
0a122b
 
0a122b
     bdrv_refresh_limits(bs);
0a122b
+    assert(bdrv_opt_mem_align(bs) != 0);
0a122b
+    assert(bs->request_alignment != 0);
0a122b
 
0a122b
 #ifndef _WIN32
0a122b
     if (bs->is_temporary) {
0a122b
diff --git a/block/raw-posix.c b/block/raw-posix.c
0a122b
index f410668..9ee5b8e 100644
0a122b
--- a/block/raw-posix.c
0a122b
+++ b/block/raw-posix.c
0a122b
@@ -127,6 +127,8 @@ typedef struct BDRVRawState {
0a122b
     int fd;
0a122b
     int type;
0a122b
     int open_flags;
0a122b
+    size_t buf_align;
0a122b
+
0a122b
 #if defined(__linux__)
0a122b
     /* linux floppy specific */
0a122b
     int64_t fd_open_time;
0a122b
@@ -213,6 +215,76 @@ static int raw_normalize_devicepath(const char **filename)
0a122b
 }
0a122b
 #endif
0a122b
 
0a122b
+static void raw_probe_alignment(BlockDriverState *bs)
0a122b
+{
0a122b
+    BDRVRawState *s = bs->opaque;
0a122b
+    char *buf;
0a122b
+    unsigned int sector_size;
0a122b
+
0a122b
+    /* For /dev/sg devices the alignment is not really used.
0a122b
+       With buffered I/O, we don't have any restrictions. */
0a122b
+    if (bs->sg || !(s->open_flags & O_DIRECT)) {
0a122b
+        bs->request_alignment = 1;
0a122b
+        s->buf_align = 1;
0a122b
+        return;
0a122b
+    }
0a122b
+
0a122b
+    /* Try a few ioctls to get the right size */
0a122b
+    bs->request_alignment = 0;
0a122b
+    s->buf_align = 0;
0a122b
+
0a122b
+#ifdef BLKSSZGET
0a122b
+    if (ioctl(s->fd, BLKSSZGET, &sector_size) >= 0) {
0a122b
+        bs->request_alignment = sector_size;
0a122b
+    }
0a122b
+#endif
0a122b
+#ifdef DKIOCGETBLOCKSIZE
0a122b
+    if (ioctl(s->fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) {
0a122b
+        bs->request_alignment = sector_size;
0a122b
+    }
0a122b
+#endif
0a122b
+#ifdef DIOCGSECTORSIZE
0a122b
+    if (ioctl(s->fd, DIOCGSECTORSIZE, &sector_size) >= 0) {
0a122b
+        bs->request_alignment = sector_size;
0a122b
+    }
0a122b
+#endif
0a122b
+#ifdef CONFIG_XFS
0a122b
+    if (s->is_xfs) {
0a122b
+        struct dioattr da;
0a122b
+        if (xfsctl(NULL, s->fd, XFS_IOC_DIOINFO, &da) >= 0) {
0a122b
+            bs->request_alignment = da.d_miniosz;
0a122b
+            /* The kernel returns wrong information for d_mem */
0a122b
+            /* s->buf_align = da.d_mem; */
0a122b
+        }
0a122b
+    }
0a122b
+#endif
0a122b
+
0a122b
+    /* If we could not get the sizes so far, probe them with trial reads */
0a122b
+    if (!s->buf_align) {
0a122b
+        size_t align;
0a122b
+        buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
0a122b
+        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
0a122b
+            if (pread(s->fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
0a122b
+                s->buf_align = align;
0a122b
+                break;
0a122b
+            }
0a122b
+        }
0a122b
+        qemu_vfree(buf);
0a122b
+    }
0a122b
+
0a122b
+    if (!bs->request_alignment) {
0a122b
+        size_t align;
0a122b
+        buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
0a122b
+        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
0a122b
+            if (pread(s->fd, buf, align, 0) >= 0) {
0a122b
+                bs->request_alignment = align;
0a122b
+                break;
0a122b
+            }
0a122b
+        }
0a122b
+        qemu_vfree(buf);
0a122b
+    }
0a122b
+}
0a122b
+
0a122b
 static void raw_parse_flags(int bdrv_flags, int *open_flags)
0a122b
 {
0a122b
     assert(open_flags != NULL);
0a122b
@@ -464,7 +536,6 @@ static int raw_reopen_prepare(BDRVReopenState *state,
0a122b
     return ret;
0a122b
 }
0a122b
 
0a122b
-
0a122b
 static void raw_reopen_commit(BDRVReopenState *state)
0a122b
 {
0a122b
     BDRVRawReopenState *raw_s = state->opaque;
0a122b
@@ -500,23 +571,15 @@ static void raw_reopen_abort(BDRVReopenState *state)
0a122b
     state->opaque = NULL;
0a122b
 }
0a122b
 
0a122b
+static int raw_refresh_limits(BlockDriverState *bs)
0a122b
+{
0a122b
+    BDRVRawState *s = bs->opaque;
0a122b
 
0a122b
-/* XXX: use host sector size if necessary with:
0a122b
-#ifdef DIOCGSECTORSIZE
0a122b
-        {
0a122b
-            unsigned int sectorsize = 512;
0a122b
-            if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) &&
0a122b
-                sectorsize > bufsize)
0a122b
-                bufsize = sectorsize;
0a122b
-        }
0a122b
-#endif
0a122b
-#ifdef CONFIG_COCOA
0a122b
-        uint32_t blockSize = 512;
0a122b
-        if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) {
0a122b
-            bufsize = blockSize;
0a122b
-        }
0a122b
-#endif
0a122b
-*/
0a122b
+    raw_probe_alignment(bs);
0a122b
+    bs->bl.opt_mem_alignment = s->buf_align;
0a122b
+
0a122b
+    return 0;
0a122b
+}
0a122b
 
0a122b
 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
0a122b
 {
0a122b
@@ -1363,6 +1426,7 @@ static BlockDriver bdrv_file = {
0a122b
     .bdrv_aio_writev = raw_aio_writev,
0a122b
     .bdrv_aio_flush = raw_aio_flush,
0a122b
     .bdrv_aio_discard = raw_aio_discard,
0a122b
+    .bdrv_refresh_limits = raw_refresh_limits,
0a122b
 
0a122b
     .bdrv_truncate = raw_truncate,
0a122b
     .bdrv_getlength = raw_getlength,
0a122b
@@ -1739,6 +1803,7 @@ static BlockDriver bdrv_host_device = {
0a122b
     .bdrv_aio_writev	= raw_aio_writev,
0a122b
     .bdrv_aio_flush	= raw_aio_flush,
0a122b
     .bdrv_aio_discard   = hdev_aio_discard,
0a122b
+    .bdrv_refresh_limits = raw_refresh_limits,
0a122b
 
0a122b
     .bdrv_truncate      = raw_truncate,
0a122b
     .bdrv_getlength	= raw_getlength,
0a122b
@@ -1869,6 +1934,7 @@ static BlockDriver bdrv_host_floppy = {
0a122b
     .bdrv_aio_readv     = raw_aio_readv,
0a122b
     .bdrv_aio_writev    = raw_aio_writev,
0a122b
     .bdrv_aio_flush	= raw_aio_flush,
0a122b
+    .bdrv_refresh_limits = raw_refresh_limits,
0a122b
 
0a122b
     .bdrv_truncate      = raw_truncate,
0a122b
     .bdrv_getlength      = raw_getlength,
0a122b
@@ -1978,6 +2044,7 @@ static BlockDriver bdrv_host_cdrom = {
0a122b
     .bdrv_aio_readv     = raw_aio_readv,
0a122b
     .bdrv_aio_writev    = raw_aio_writev,
0a122b
     .bdrv_aio_flush	= raw_aio_flush,
0a122b
+    .bdrv_refresh_limits = raw_refresh_limits,
0a122b
 
0a122b
     .bdrv_truncate      = raw_truncate,
0a122b
     .bdrv_getlength      = raw_getlength,
0a122b
@@ -2105,6 +2172,7 @@ static BlockDriver bdrv_host_cdrom = {
0a122b
     .bdrv_aio_readv     = raw_aio_readv,
0a122b
     .bdrv_aio_writev    = raw_aio_writev,
0a122b
     .bdrv_aio_flush	= raw_aio_flush,
0a122b
+    .bdrv_refresh_limits = raw_refresh_limits,
0a122b
 
0a122b
     .bdrv_truncate      = raw_truncate,
0a122b
     .bdrv_getlength      = raw_getlength,
0a122b
diff --git a/block/raw-win32.c b/block/raw-win32.c
0a122b
index 6ac3797..ac20370 100644
0a122b
--- a/block/raw-win32.c
0a122b
+++ b/block/raw-win32.c
0a122b
@@ -201,6 +201,35 @@ static int set_sparse(int fd)
0a122b
 				 NULL, 0, NULL, 0, &returned, NULL);
0a122b
 }
0a122b
 
0a122b
+static void raw_probe_alignment(BlockDriverState *bs)
0a122b
+{
0a122b
+    BDRVRawState *s = bs->opaque;
0a122b
+    DWORD sectorsPerCluster, freeClusters, totalClusters, count;
0a122b
+    DISK_GEOMETRY_EX dg;
0a122b
+    BOOL status;
0a122b
+
0a122b
+    if (s->type == FTYPE_CD) {
0a122b
+        bs->request_alignment = 2048;
0a122b
+        return;
0a122b
+    }
0a122b
+    if (s->type == FTYPE_HARDDISK) {
0a122b
+        status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,
0a122b
+                                 NULL, 0, &dg, sizeof(dg), &count, NULL);
0a122b
+        if (status != 0) {
0a122b
+            bs->request_alignment = dg.Geometry.BytesPerSector;
0a122b
+            return;
0a122b
+        }
0a122b
+        /* the ioctl failed: fall through and try GetDiskFreeSpace too */
0a122b
+    }
0a122b
+
0a122b
+    if (s->drive_path[0]) {
0a122b
+        GetDiskFreeSpace(s->drive_path, &sectorsPerCluster,
0a122b
+                         &dg.Geometry.BytesPerSector,
0a122b
+                         &freeClusters, &totalClusters);
0a122b
+        bs->request_alignment = dg.Geometry.BytesPerSector;
0a122b
+    }
0a122b
+}
0a122b
+
0a122b
 static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
0a122b
 {
0a122b
     assert(access_flags != NULL);
0a122b
@@ -268,6 +297,17 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
0a122b
         }
0a122b
     }
0a122b
 
0a122b
+    if (filename[0] && filename[1] == ':') {
0a122b
+        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]);
0a122b
+    } else if (filename[0] == '\\' && filename[1] == '\\') {
0a122b
+        s->drive_path[0] = 0;
0a122b
+    } else {
0a122b
+        /* Relative path: use the drive of the current directory.  */
0a122b
+        char buf[MAX_PATH];
0a122b
+        GetCurrentDirectory(MAX_PATH, buf);
0a122b
+        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]);
0a122b
+    }
0a122b
+
0a122b
     s->hfile = CreateFile(filename, access_flags,
0a122b
                           FILE_SHARE_READ, NULL,
0a122b
                           OPEN_EXISTING, overlapped, NULL);
0a122b
@@ -293,6 +333,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
0a122b
         s->aio = aio;
0a122b
     }
0a122b
 
0a122b
+    raw_probe_alignment(bs);
0a122b
     ret = 0;
0a122b
 fail:
0a122b
     qemu_opts_del(opts);
0a122b
diff --git a/include/block/block_int.h b/include/block/block_int.h
0a122b
index 0445e6b..e66bd5f 100644
0a122b
--- a/include/block/block_int.h
0a122b
+++ b/include/block/block_int.h
0a122b
@@ -307,6 +307,9 @@ struct BlockDriverState {
0a122b
     /* Whether produces zeros when read beyond eof */
0a122b
     bool zero_beyond_eof;
0a122b
 
0a122b
+    /* Alignment requirement for offset/length of I/O requests */
0a122b
+    unsigned int request_alignment;
0a122b
+
0a122b
     /* the block size for which the guest device expects atomicity */
0a122b
     int guest_block_size;
0a122b
 
0a122b
-- 
0a122b
1.7.1
0a122b