9ae3a8
From e9148733cef44bebb0d74a731a70b3304e720634 Mon Sep 17 00:00:00 2001
9ae3a8
From: Alex Williamson <alex.williamson@redhat.com>
9ae3a8
Date: Thu, 13 Dec 2018 21:55:26 +0100
9ae3a8
Subject: [PATCH 5/5] vfio: Inhibit ballooning based on group attachment to a
9ae3a8
 container
9ae3a8
9ae3a8
RH-Author: Alex Williamson <alex.williamson@redhat.com>
9ae3a8
Message-id: <154473812659.22725.6814768117383324849.stgit@gimli.home>
9ae3a8
Patchwork-id: 83497
9ae3a8
O-Subject: [RHEL-7.7 qemu-kvm PATCH 5/5] vfio: Inhibit ballooning based on group attachment to a container
9ae3a8
Bugzilla: 1659229
9ae3a8
RH-Acked-by: Peter Xu <peterx@redhat.com>
9ae3a8
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
9ae3a8
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
9ae3a8
9ae3a8
Bugzilla: 1659229
9ae3a8
Notes: Error path has more exit paths versus upstream
9ae3a8
9ae3a8
We use a VFIOContainer to associate an AddressSpace to one or more
9ae3a8
VFIOGroups.  The VFIOContainer represents the DMA context for that
9ae3a8
AddressSpace for those VFIOGroups and is synchronized to changes in
9ae3a8
that AddressSpace via a MemoryListener.  For IOMMU backed devices,
9ae3a8
maintaining the DMA context for a VFIOGroup generally involves
9ae3a8
pinning a host virtual address in order to create a stable host
9ae3a8
physical address and then mapping a translation from the associated
9ae3a8
guest physical address to that host physical address into the IOMMU.
9ae3a8
9ae3a8
While the above maintains the VFIOContainer synchronized to the QEMU
9ae3a8
memory API of the VM, memory ballooning occurs outside of that API.
9ae3a8
Inflating the memory balloon (i.e. cooperatively capturing pages from
9ae3a8
the guest for use by the host) simply uses MADV_DONTNEED to "zap"
9ae3a8
pages from QEMU's host virtual address space.  The page pinning and
9ae3a8
IOMMU mapping above remains in place, negating the host's ability to
9ae3a8
reuse the page, but the host virtual to host physical mapping of the
9ae3a8
page is invalidated outside of QEMU's memory API.
9ae3a8
9ae3a8
When the balloon is later deflated, attempting to cooperatively
9ae3a8
return pages to the guest, the page is simply freed by the guest
9ae3a8
balloon driver, allowing it to be used in the guest and incurring a
9ae3a8
page fault when that occurs.  The page fault maps a new host physical
9ae3a8
page backing the existing host virtual address, meanwhile the
9ae3a8
VFIOContainer still maintains the translation to the original host
9ae3a8
physical address.  At this point the guest vCPU and any assigned
9ae3a8
devices will map different host physical addresses to the same guest
9ae3a8
physical address.  Badness.
9ae3a8
9ae3a8
The IOMMU typically does not have page level granularity with which
9ae3a8
it can track this mapping without also incurring inefficiencies in
9ae3a8
using page size mappings throughout.  MMU notifiers in the host
9ae3a8
kernel also provide indicators for invalidating the mapping on
9ae3a8
balloon inflation, not for updating the mapping when the balloon is
9ae3a8
deflated.  For these reasons we assume a default behavior that the
9ae3a8
mapping of each VFIOGroup into the VFIOContainer is incompatible
9ae3a8
with memory ballooning and increment the balloon inhibitor to match
9ae3a8
the attached VFIOGroups.
9ae3a8
9ae3a8
Reviewed-by: Peter Xu <peterx@redhat.com>
9ae3a8
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
9ae3a8
(cherry picked from commit c65ee433153b5925e183a00ebf568e160077c694)
9ae3a8
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
9ae3a8
---
9ae3a8
 hw/misc/vfio.c | 35 +++++++++++++++++++++++++++++++++++
9ae3a8
 1 file changed, 35 insertions(+)
9ae3a8
9ae3a8
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
9ae3a8
index f91eecb..f7360bf 100644
9ae3a8
--- a/hw/misc/vfio.c
9ae3a8
+++ b/hw/misc/vfio.c
9ae3a8
@@ -37,6 +37,7 @@
9ae3a8
 #include "qemu/event_notifier.h"
9ae3a8
 #include "qemu/queue.h"
9ae3a8
 #include "qemu/range.h"
9ae3a8
+#include "sysemu/balloon.h"
9ae3a8
 #include "sysemu/kvm.h"
9ae3a8
 #include "sysemu/sysemu.h"
9ae3a8
 #include "trace.h"
9ae3a8
@@ -3667,6 +3668,33 @@ static int vfio_connect_container(VFIOGroup *group)
9ae3a8
         return 0;
9ae3a8
     }
9ae3a8
 
9ae3a8
+    /*
9ae3a8
+     * VFIO is currently incompatible with memory ballooning insofar as the
9ae3a8
+     * madvise to purge (zap) the page from QEMU's address space does not
9ae3a8
+     * interact with the memory API and therefore leaves stale virtual to
9ae3a8
+     * physical mappings in the IOMMU if the page was previously pinned.  We
9ae3a8
+     * therefore add a balloon inhibit for each group added to a container,
9ae3a8
+     * whether the container is used individually or shared.  This provides
9ae3a8
+     * us with options to allow devices within a group to opt-in and allow
9ae3a8
+     * ballooning, so long as it is done consistently for a group (for instance
9ae3a8
+     * if the device is an mdev device where it is known that the host vendor
9ae3a8
+     * driver will never pin pages outside of the working set of the guest
9ae3a8
+     * driver, which would thus not be ballooning candidates).
9ae3a8
+     *
9ae3a8
+     * The first opportunity to induce pinning occurs here where we attempt to
9ae3a8
+     * attach the group to existing containers within the AddressSpace.  If any
9ae3a8
+     * pages are already zapped from the virtual address space, such as from a
9ae3a8
+     * previous ballooning opt-in, new pinning will cause valid mappings to be
9ae3a8
+     * re-established.  Likewise, when the overall MemoryListener for a new
9ae3a8
+     * container is registered, a replay of mappings within the AddressSpace
9ae3a8
+     * will occur, re-establishing any previously zapped pages as well.
9ae3a8
+     *
9ae3a8
+     * NB. Balloon inhibiting does not currently block operation of the
9ae3a8
+     * balloon driver or revoke previously pinned pages, it only prevents
9ae3a8
+     * calling madvise to modify the virtual mapping of ballooned pages.
9ae3a8
+     */
9ae3a8
+    qemu_balloon_inhibit(true);
9ae3a8
+
9ae3a8
     QLIST_FOREACH(container, &container_list, next) {
9ae3a8
         if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
9ae3a8
             group->container = container;
9ae3a8
@@ -3678,6 +3706,7 @@ static int vfio_connect_container(VFIOGroup *group)
9ae3a8
     fd = qemu_open("/dev/vfio/vfio", O_RDWR);
9ae3a8
     if (fd < 0) {
9ae3a8
         error_report("vfio: failed to open /dev/vfio/vfio: %m");
9ae3a8
+        qemu_balloon_inhibit(false);
9ae3a8
         return -errno;
9ae3a8
     }
9ae3a8
 
9ae3a8
@@ -3686,6 +3715,7 @@ static int vfio_connect_container(VFIOGroup *group)
9ae3a8
         error_report("vfio: supported vfio version: %d, "
9ae3a8
                      "reported version: %d", VFIO_API_VERSION, ret);
9ae3a8
         close(fd);
9ae3a8
+        qemu_balloon_inhibit(false);
9ae3a8
         return -EINVAL;
9ae3a8
     }
9ae3a8
 
9ae3a8
@@ -3701,6 +3731,7 @@ static int vfio_connect_container(VFIOGroup *group)
9ae3a8
             error_report("vfio: failed to set group container: %m");
9ae3a8
             g_free(container);
9ae3a8
             close(fd);
9ae3a8
+            qemu_balloon_inhibit(false);
9ae3a8
             return -errno;
9ae3a8
         }
9ae3a8
 
9ae3a8
@@ -3710,6 +3741,7 @@ static int vfio_connect_container(VFIOGroup *group)
9ae3a8
             error_report("vfio: failed to set iommu for container: %m");
9ae3a8
             g_free(container);
9ae3a8
             close(fd);
9ae3a8
+            qemu_balloon_inhibit(false);
9ae3a8
             return -errno;
9ae3a8
         }
9ae3a8
 
9ae3a8
@@ -3724,6 +3756,7 @@ static int vfio_connect_container(VFIOGroup *group)
9ae3a8
             vfio_listener_release(container);
9ae3a8
             g_free(container);
9ae3a8
             close(fd);
9ae3a8
+            qemu_balloon_inhibit(false);
9ae3a8
             error_report("vfio: memory listener initialization failed for container\n");
9ae3a8
             return ret;
9ae3a8
         }
9ae3a8
@@ -3734,6 +3767,7 @@ static int vfio_connect_container(VFIOGroup *group)
9ae3a8
         error_report("vfio: No available IOMMU models");
9ae3a8
         g_free(container);
9ae3a8
         close(fd);
9ae3a8
+        qemu_balloon_inhibit(false);
9ae3a8
         return -EINVAL;
9ae3a8
     }
9ae3a8
 
9ae3a8
@@ -3834,6 +3868,7 @@ static void vfio_put_group(VFIOGroup *group)
9ae3a8
         return;
9ae3a8
     }
9ae3a8
 
9ae3a8
+    qemu_balloon_inhibit(false);
9ae3a8
     vfio_kvm_device_del_group(group);
9ae3a8
     vfio_disconnect_container(group);
9ae3a8
     QLIST_REMOVE(group, next);
9ae3a8
-- 
9ae3a8
1.8.3.1
9ae3a8