Blame SOURCES/kvm-mmap-alloc-fix-hugetlbfs-misaligned-length-in-ppc64.patch

016a62
From e69f257e657473ba59f48692d387e292a24892bb Mon Sep 17 00:00:00 2001
016a62
From: "plai@redhat.com" <plai@redhat.com>
016a62
Date: Tue, 20 Aug 2019 16:12:50 +0100
016a62
Subject: [PATCH 03/11] mmap-alloc: fix hugetlbfs misaligned length in ppc64
016a62
016a62
RH-Author: plai@redhat.com
016a62
Message-id: <1566317571-5697-4-git-send-email-plai@redhat.com>
016a62
Patchwork-id: 90082
016a62
O-Subject: [RHEL8.2 qemu-kvm PATCH 3/4] mmap-alloc: fix hugetlbfs misaligned length in ppc64
016a62
Bugzilla: 1539282
016a62
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
016a62
RH-Acked-by: Pankaj Gupta <pagupta@redhat.com>
016a62
RH-Acked-by: Eduardo Habkost <ehabkost@redhat.com>
016a62
016a62
From: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
016a62
016a62
The commit 7197fb4058bcb68986bae2bb2c04d6370f3e7218 ("util/mmap-alloc:
016a62
fix hugetlb support on ppc64") fixed Huge TLB mappings on ppc64.
016a62
016a62
However, we still need to consider the underlying huge page size
016a62
during munmap() because it requires that both address and length be a
016a62
multiple of the underlying huge page size for Huge TLB mappings.
016a62
Quote from "Huge page (Huge TLB) mappings" paragraph under NOTES
016a62
section of the munmap(2) manual:
016a62
016a62
  "For munmap(), addr and length must both be a multiple of the
016a62
  underlying huge page size."
016a62
016a62
On ppc64, the munmap() in qemu_ram_munmap() does not work for Huge TLB
016a62
mappings because the mapped segment can be aligned to the underlying
016a62
huge page size rather than to the native system page size returned
016a62
by getpagesize().
016a62
016a62
This has the side effect of not releasing huge pages back to the pool
016a62
after a hugetlbfs file-backed memory device is hot-unplugged.
016a62
016a62
This patch fixes the situation in qemu_ram_mmap() and
016a62
qemu_ram_munmap() by considering the underlying page size on ppc64.
016a62
016a62
After this patch, memory hot-unplug releases huge pages back to the
016a62
pool.
016a62
016a62
Fixes: 7197fb4058bcb68986bae2bb2c04d6370f3e7218
016a62
Signed-off-by: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
016a62
Reviewed-by: Greg Kurz <groug@kaod.org>
016a62
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
016a62
(cherry picked from commit 53adb9d43e1abba187387a51f238e878e934c647)
016a62
Signed-off-by: Paul Lai <plai@redhat.com>
016a62
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
016a62
---
016a62
 exec.c                    |  4 ++--
016a62
 include/qemu/mmap-alloc.h |  2 +-
016a62
 util/mmap-alloc.c         | 22 ++++++++++++++++------
016a62
 util/oslib-posix.c        |  2 +-
016a62
 4 files changed, 20 insertions(+), 10 deletions(-)
016a62
016a62
diff --git a/exec.c b/exec.c
016a62
index a79eaa3..9112d8b 100644
016a62
--- a/exec.c
016a62
+++ b/exec.c
016a62
@@ -1679,7 +1679,7 @@ static void *file_ram_alloc(RAMBlock *block,
016a62
     if (mem_prealloc) {
016a62
         os_mem_prealloc(fd, area, memory, smp_cpus, errp);
016a62
         if (errp && *errp) {
016a62
-            qemu_ram_munmap(area, memory);
016a62
+            qemu_ram_munmap(fd, area, memory);
016a62
             return NULL;
016a62
         }
016a62
     }
016a62
@@ -2200,7 +2200,7 @@ static void reclaim_ramblock(RAMBlock *block)
016a62
         xen_invalidate_map_cache_entry(block->host);
016a62
 #ifndef _WIN32
016a62
     } else if (block->fd >= 0) {
016a62
-        qemu_ram_munmap(block->host, block->max_length);
016a62
+        qemu_ram_munmap(block->fd, block->host, block->max_length);
016a62
         close(block->fd);
016a62
 #endif
016a62
     } else {
016a62
diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
016a62
index 190688a..eec98d8 100644
016a62
--- a/include/qemu/mmap-alloc.h
016a62
+++ b/include/qemu/mmap-alloc.h
016a62
@@ -28,6 +28,6 @@ void *qemu_ram_mmap(int fd,
016a62
                     bool shared,
016a62
                     bool is_pmem);
016a62
 
016a62
-void qemu_ram_munmap(void *ptr, size_t size);
016a62
+void qemu_ram_munmap(int fd, void *ptr, size_t size);
016a62
 
016a62
 #endif
016a62
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
016a62
index b29fcee..bbd9077 100644
016a62
--- a/util/mmap-alloc.c
016a62
+++ b/util/mmap-alloc.c
016a62
@@ -82,6 +82,7 @@ void *qemu_ram_mmap(int fd,
016a62
     int flags;
016a62
     int guardfd;
016a62
     size_t offset;
016a62
+    size_t pagesize;
016a62
     size_t total;
016a62
     void *guardptr;
016a62
     void *ptr;
016a62
@@ -102,7 +103,8 @@ void *qemu_ram_mmap(int fd,
016a62
      * anonymous memory is OK.
016a62
      */
016a62
     flags = MAP_PRIVATE;
016a62
-    if (fd == -1 || qemu_fd_getpagesize(fd) == getpagesize()) {
016a62
+    pagesize = qemu_fd_getpagesize(fd);
016a62
+    if (fd == -1 || pagesize == getpagesize()) {
016a62
         guardfd = -1;
016a62
         flags |= MAP_ANONYMOUS;
016a62
     } else {
016a62
@@ -111,6 +113,7 @@ void *qemu_ram_mmap(int fd,
016a62
     }
016a62
 #else
016a62
     guardfd = -1;
016a62
+    pagesize = getpagesize();
016a62
     flags = MAP_PRIVATE | MAP_ANONYMOUS;
016a62
 #endif
016a62
 
016a62
@@ -122,7 +125,7 @@ void *qemu_ram_mmap(int fd,
016a62
 
016a62
     assert(is_power_of_2(align));
016a62
     /* Always align to host page size */
016a62
-    assert(align >= getpagesize());
016a62
+    assert(align >= pagesize);
016a62
 
016a62
     flags = MAP_FIXED;
016a62
     flags |= fd == -1 ? MAP_ANONYMOUS : 0;
016a62
@@ -145,17 +148,24 @@ void *qemu_ram_mmap(int fd,
016a62
      * a guard page guarding against potential buffer overflows.
016a62
      */
016a62
     total -= offset;
016a62
-    if (total > size + getpagesize()) {
016a62
-        munmap(ptr + size + getpagesize(), total - size - getpagesize());
016a62
+    if (total > size + pagesize) {
016a62
+        munmap(ptr + size + pagesize, total - size - pagesize);
016a62
     }
016a62
 
016a62
     return ptr;
016a62
 }
016a62
 
016a62
-void qemu_ram_munmap(void *ptr, size_t size)
016a62
+void qemu_ram_munmap(int fd, void *ptr, size_t size)
016a62
 {
016a62
+    size_t pagesize;
016a62
+
016a62
     if (ptr) {
016a62
         /* Unmap both the RAM block and the guard page */
016a62
-        munmap(ptr, size + getpagesize());
016a62
+#if defined(__powerpc64__) && defined(__linux__)
016a62
+        pagesize = qemu_fd_getpagesize(fd);
016a62
+#else
016a62
+        pagesize = getpagesize();
016a62
+#endif
016a62
+        munmap(ptr, size + pagesize);
016a62
     }
016a62
 }
016a62
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
016a62
index c36b2bb..7b6db04 100644
016a62
--- a/util/oslib-posix.c
016a62
+++ b/util/oslib-posix.c
016a62
@@ -153,7 +153,7 @@ void qemu_vfree(void *ptr)
016a62
 void qemu_anon_ram_free(void *ptr, size_t size)
016a62
 {
016a62
     trace_qemu_anon_ram_free(ptr, size);
016a62
-    qemu_ram_munmap(ptr, size);
016a62
+    qemu_ram_munmap(-1, ptr, size);
016a62
 }
016a62
 
016a62
 void qemu_set_block(int fd)
016a62
-- 
016a62
1.8.3.1
016a62