Blob Blame Raw
From f37a1e337dd62c873f18aabd31863c8df144c7ea Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 3 Dec 2018 22:01:54 +0000
Subject: [PATCH 13/16] vfio/ccw/pci: Allow devices to opt-in for ballooning

RH-Author: Alex Williamson <alex.williamson@redhat.com>
Message-id: <154387451469.27651.8657130146789267501.stgit@gimli.home>
Patchwork-id: 83236
O-Subject: [RHEL-8.0 qemu-kvm PATCH 4/7] vfio/ccw/pci: Allow devices to opt-in for ballooning
Bugzilla: 1650272
RH-Acked-by: Peter Xu <peterx@redhat.com>
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
RH-Acked-by: David Hildenbrand <david@redhat.com>

Bugzilla: 1650272

If a vfio assigned device makes use of a physical IOMMU, then memory
ballooning is necessarily inhibited due to the page pinning, lack of
page level granularity at the IOMMU, and sufficient notifiers to both
remove the page on balloon inflation and add it back on deflation.
However, not all devices are backed by a physical IOMMU.  In the case
of mediated devices, if a vendor driver is well synchronized with the
guest driver, such that only pages actively used by the guest driver
are pinned by the host mdev vendor driver, then there should be no
overlap between pages available for the balloon driver and pages
actively in use by the device.  Under these conditions, ballooning
should be safe.

vfio-ccw devices are always mediated devices and always operate under
the constraints above.  Therefore we can consider all vfio-ccw devices
as balloon compatible.

The situation is far from straightforward with vfio-pci.  These
devices can be physical devices with physical IOMMU backing or
mediated devices where it is unknown whether a physical IOMMU is in
use or whether the vendor driver is well synchronized to the working
set of the guest driver.  The safest approach is therefore to assume
all vfio-pci devices are incompatible with ballooning, but allow user
opt-in should they have further insight into mediated devices.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
(cherry picked from commit 238e91728503d400e1c4e644e3a9b80f9e621682)
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
---
 hw/vfio/ccw.c                 |  9 +++++++++
 hw/vfio/common.c              | 23 ++++++++++++++++++++++-
 hw/vfio/pci.c                 | 26 +++++++++++++++++++++++++-
 hw/vfio/trace-events          |  1 +
 include/hw/vfio/vfio-common.h |  2 ++
 5 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index fe34b50..0c74dda 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -362,6 +362,15 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
         }
     }
 
+    /*
+     * All vfio-ccw devices are believed to operate in a way compatible with
+     * memory ballooning, ie. pages pinned in the host are in the current
+     * working set of the guest driver and therefore never overlap with pages
+     * available to the guest balloon driver.  This needs to be set before
+     * vfio_get_device() for vfio common to handle the balloon inhibitor.
+     */
+    vcdev->vdev.balloon_allowed = true;
+
     if (vfio_get_device(group, cdev->mdevid, &vcdev->vdev, &err)) {
         g_free(vcdev->vdev.name);
         goto out_device_err;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 7e8f289..cda2d1f 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1376,7 +1376,9 @@ void vfio_put_group(VFIOGroup *group)
         return;
     }
 
-    qemu_balloon_inhibit(false);
+    if (!group->balloon_allowed) {
+        qemu_balloon_inhibit(false);
+    }
     vfio_kvm_device_del_group(group);
     vfio_disconnect_container(group);
     QLIST_REMOVE(group, next);
@@ -1412,6 +1414,25 @@ int vfio_get_device(VFIOGroup *group, const char *name,
         return ret;
     }
 
+    /*
+     * Clear the balloon inhibitor for this group if the driver knows the
+     * device operates compatibly with ballooning.  Setting must be consistent
+     * per group, but since compatibility is really only possible with mdev
+     * currently, we expect singleton groups.
+     */
+    if (vbasedev->balloon_allowed != group->balloon_allowed) {
+        if (!QLIST_EMPTY(&group->device_list)) {
+            error_setg(errp,
+                       "Inconsistent device balloon setting within group");
+            return -1;
+        }
+
+        if (!group->balloon_allowed) {
+            group->balloon_allowed = true;
+            qemu_balloon_inhibit(false);
+        }
+    }
+
     vbasedev->fd = fd;
     vbasedev->group = group;
     QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 4683eb4..d43727f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2803,12 +2803,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
     VFIODevice *vbasedev_iter;
     VFIOGroup *group;
-    char *tmp, group_path[PATH_MAX], *group_name;
+    char *tmp, *subsys, group_path[PATH_MAX], *group_name;
     Error *err = NULL;
     ssize_t len;
     struct stat st;
     int groupid;
     int ret, i = 0;
+    bool is_mdev;
 
     QLIST_FOREACH(group, &vfio_group_list, next) {
         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
@@ -2880,6 +2881,27 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
         }
     }
 
+    /*
+     * Mediated devices *might* operate compatibly with memory ballooning, but
+     * we cannot know for certain, it depends on whether the mdev vendor driver
+     * stays in sync with the active working set of the guest driver.  Prevent
+     * the x-balloon-allowed option unless this is minimally an mdev device.
+     */
+    tmp = g_strdup_printf("%s/subsystem", vdev->vbasedev.sysfsdev);
+    subsys = realpath(tmp, NULL);
+    g_free(tmp);
+    is_mdev = (strcmp(subsys, "/sys/bus/mdev") == 0);
+    free(subsys);
+
+    trace_vfio_mdev(vdev->vbasedev.name, is_mdev);
+
+    if (vdev->vbasedev.balloon_allowed && !is_mdev) {
+        error_setg(errp, "x-balloon-allowed only potentially compatible "
+                   "with mdev devices");
+        vfio_put_group(group);
+        goto error;
+    }
+
     ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp);
     if (ret) {
         vfio_put_group(group);
@@ -3177,6 +3199,8 @@ static Property vfio_pci_dev_properties[] = {
     DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
                     VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
+    DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
+                     vbasedev.balloon_allowed, false),
     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 20109cb..9487887 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -39,6 +39,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %
 vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
 vfio_populate_device_get_irq_info_failure(void) "VFIO_DEVICE_GET_IRQ_INFO failure: %m"
 vfio_realize(const char *name, int group_id) " (%s) group %d"
+vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d"
 vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x"
 vfio_pci_reset(const char *name) " (%s)"
 vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f29df6e..36ee657 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -123,6 +123,7 @@ typedef struct VFIODevice {
     bool reset_works;
     bool needs_reset;
     bool no_mmap;
+    bool balloon_allowed;
     VFIODeviceOps *ops;
     unsigned int num_irqs;
     unsigned int num_regions;
@@ -142,6 +143,7 @@ typedef struct VFIOGroup {
     QLIST_HEAD(, VFIODevice) device_list;
     QLIST_ENTRY(VFIOGroup) next;
     QLIST_ENTRY(VFIOGroup) container_next;
+    bool balloon_allowed;
 } VFIOGroup;
 
 typedef struct VFIODMABuf {
-- 
1.8.3.1