d740ea
From e9148733cef44bebb0d74a731a70b3304e720634 Mon Sep 17 00:00:00 2001
d740ea
From: Alex Williamson <alex.williamson@redhat.com>
d740ea
Date: Thu, 13 Dec 2018 21:55:26 +0100
d740ea
Subject: [PATCH 5/5] vfio: Inhibit ballooning based on group attachment to a
d740ea
 container
d740ea
d740ea
RH-Author: Alex Williamson <alex.williamson@redhat.com>
d740ea
Message-id: <154473812659.22725.6814768117383324849.stgit@gimli.home>
d740ea
Patchwork-id: 83497
d740ea
O-Subject: [RHEL-7.7 qemu-kvm PATCH 5/5] vfio: Inhibit ballooning based on group attachment to a container
d740ea
Bugzilla: 1659229
d740ea
RH-Acked-by: Peter Xu <peterx@redhat.com>
d740ea
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
d740ea
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
d740ea
d740ea
Bugzilla: 1659229
d740ea
Notes: Error path has more exit paths versus upstream
d740ea
d740ea
We use a VFIOContainer to associate an AddressSpace to one or more
d740ea
VFIOGroups.  The VFIOContainer represents the DMA context for that
d740ea
AddressSpace for those VFIOGroups and is synchronized to changes in
d740ea
that AddressSpace via a MemoryListener.  For IOMMU backed devices,
d740ea
maintaining the DMA context for a VFIOGroup generally involves
d740ea
pinning a host virtual address in order to create a stable host
d740ea
physical address and then mapping a translation from the associated
d740ea
guest physical address to that host physical address into the IOMMU.
d740ea
d740ea
While the above maintains the VFIOContainer synchronized to the QEMU
d740ea
memory API of the VM, memory ballooning occurs outside of that API.
d740ea
Inflating the memory balloon (i.e. cooperatively capturing pages from
d740ea
the guest for use by the host) simply uses MADV_DONTNEED to "zap"
d740ea
pages from QEMU's host virtual address space.  The page pinning and
d740ea
IOMMU mapping above remains in place, negating the host's ability to
d740ea
reuse the page, but the host virtual to host physical mapping of the
d740ea
page is invalidated outside of QEMU's memory API.
d740ea
d740ea
When the balloon is later deflated, attempting to cooperatively
d740ea
return pages to the guest, the page is simply freed by the guest
d740ea
balloon driver, allowing it to be used in the guest and incurring a
d740ea
page fault when that occurs.  The page fault maps a new host physical
d740ea
page backing the existing host virtual address, meanwhile the
d740ea
VFIOContainer still maintains the translation to the original host
d740ea
physical address.  At this point the guest vCPU and any assigned
d740ea
devices will map different host physical addresses to the same guest
d740ea
physical address.  Badness.
d740ea
d740ea
The IOMMU typically does not have page level granularity with which
d740ea
it can track this mapping without also incurring inefficiencies in
d740ea
using page size mappings throughout.  MMU notifiers in the host
d740ea
kernel also provide indicators for invalidating the mapping on
d740ea
balloon inflation, not for updating the mapping when the balloon is
d740ea
deflated.  For these reasons we assume a default behavior that the
d740ea
mapping of each VFIOGroup into the VFIOContainer is incompatible
d740ea
with memory ballooning and increment the balloon inhibitor to match
d740ea
the attached VFIOGroups.
d740ea
d740ea
Reviewed-by: Peter Xu <peterx@redhat.com>
d740ea
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
d740ea
(cherry picked from commit c65ee433153b5925e183a00ebf568e160077c694)
d740ea
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
d740ea
---
d740ea
 hw/misc/vfio.c | 35 +++++++++++++++++++++++++++++++++++
d740ea
 1 file changed, 35 insertions(+)
d740ea
d740ea
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
d740ea
index f91eecb..f7360bf 100644
d740ea
--- a/hw/misc/vfio.c
d740ea
+++ b/hw/misc/vfio.c
d740ea
@@ -37,6 +37,7 @@
d740ea
 #include "qemu/event_notifier.h"
d740ea
 #include "qemu/queue.h"
d740ea
 #include "qemu/range.h"
d740ea
+#include "sysemu/balloon.h"
d740ea
 #include "sysemu/kvm.h"
d740ea
 #include "sysemu/sysemu.h"
d740ea
 #include "trace.h"
d740ea
@@ -3667,6 +3668,33 @@ static int vfio_connect_container(VFIOGroup *group)
d740ea
         return 0;
d740ea
     }
d740ea
 
d740ea
+    /*
d740ea
+     * VFIO is currently incompatible with memory ballooning insofar as the
d740ea
+     * madvise to purge (zap) the page from QEMU's address space does not
d740ea
+     * interact with the memory API and therefore leaves stale virtual to
d740ea
+     * physical mappings in the IOMMU if the page was previously pinned.  We
d740ea
+     * therefore add a balloon inhibit for each group added to a container,
d740ea
+     * whether the container is used individually or shared.  This provides
d740ea
+     * us with options to allow devices within a group to opt-in and allow
d740ea
+     * ballooning, so long as it is done consistently for a group (for instance
d740ea
+     * if the device is an mdev device where it is known that the host vendor
d740ea
+     * driver will never pin pages outside of the working set of the guest
d740ea
+     * driver, which would thus not be ballooning candidates).
d740ea
+     *
d740ea
+     * The first opportunity to induce pinning occurs here where we attempt to
d740ea
+     * attach the group to existing containers within the AddressSpace.  If any
d740ea
+     * pages are already zapped from the virtual address space, such as from a
d740ea
+     * previous ballooning opt-in, new pinning will cause valid mappings to be
d740ea
+     * re-established.  Likewise, when the overall MemoryListener for a new
d740ea
+     * container is registered, a replay of mappings within the AddressSpace
d740ea
+     * will occur, re-establishing any previously zapped pages as well.
d740ea
+     *
d740ea
+     * NB. Balloon inhibiting does not currently block operation of the
d740ea
+     * balloon driver or revoke previously pinned pages, it only prevents
d740ea
+     * calling madvise to modify the virtual mapping of ballooned pages.
d740ea
+     */
d740ea
+    qemu_balloon_inhibit(true);
d740ea
+
d740ea
     QLIST_FOREACH(container, &container_list, next) {
d740ea
         if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
d740ea
             group->container = container;
d740ea
@@ -3678,6 +3706,7 @@ static int vfio_connect_container(VFIOGroup *group)
d740ea
     fd = qemu_open("/dev/vfio/vfio", O_RDWR);
d740ea
     if (fd < 0) {
d740ea
         error_report("vfio: failed to open /dev/vfio/vfio: %m");
d740ea
+        qemu_balloon_inhibit(false);
d740ea
         return -errno;
d740ea
     }
d740ea
 
d740ea
@@ -3686,6 +3715,7 @@ static int vfio_connect_container(VFIOGroup *group)
d740ea
         error_report("vfio: supported vfio version: %d, "
d740ea
                      "reported version: %d", VFIO_API_VERSION, ret);
d740ea
         close(fd);
d740ea
+        qemu_balloon_inhibit(false);
d740ea
         return -EINVAL;
d740ea
     }
d740ea
 
d740ea
@@ -3701,6 +3731,7 @@ static int vfio_connect_container(VFIOGroup *group)
d740ea
             error_report("vfio: failed to set group container: %m");
d740ea
             g_free(container);
d740ea
             close(fd);
d740ea
+            qemu_balloon_inhibit(false);
d740ea
             return -errno;
d740ea
         }
d740ea
 
d740ea
@@ -3710,6 +3741,7 @@ static int vfio_connect_container(VFIOGroup *group)
d740ea
             error_report("vfio: failed to set iommu for container: %m");
d740ea
             g_free(container);
d740ea
             close(fd);
d740ea
+            qemu_balloon_inhibit(false);
d740ea
             return -errno;
d740ea
         }
d740ea
 
d740ea
@@ -3724,6 +3756,7 @@ static int vfio_connect_container(VFIOGroup *group)
d740ea
             vfio_listener_release(container);
d740ea
             g_free(container);
d740ea
             close(fd);
d740ea
+            qemu_balloon_inhibit(false);
d740ea
             error_report("vfio: memory listener initialization failed for container\n");
d740ea
             return ret;
d740ea
         }
d740ea
@@ -3734,6 +3767,7 @@ static int vfio_connect_container(VFIOGroup *group)
d740ea
         error_report("vfio: No available IOMMU models");
d740ea
         g_free(container);
d740ea
         close(fd);
d740ea
+        qemu_balloon_inhibit(false);
d740ea
         return -EINVAL;
d740ea
     }
d740ea
 
d740ea
@@ -3834,6 +3868,7 @@ static void vfio_put_group(VFIOGroup *group)
d740ea
         return;
d740ea
     }
d740ea
 
d740ea
+    qemu_balloon_inhibit(false);
d740ea
     vfio_kvm_device_del_group(group);
d740ea
     vfio_disconnect_container(group);
d740ea
     QLIST_REMOVE(group, next);
d740ea
-- 
d740ea
1.8.3.1
d740ea