26ba25
From f37a1e337dd62c873f18aabd31863c8df144c7ea Mon Sep 17 00:00:00 2001
26ba25
From: Alex Williamson <alex.williamson@redhat.com>
26ba25
Date: Mon, 3 Dec 2018 22:01:54 +0000
26ba25
Subject: [PATCH 13/16] vfio/ccw/pci: Allow devices to opt-in for ballooning
26ba25
26ba25
RH-Author: Alex Williamson <alex.williamson@redhat.com>
26ba25
Message-id: <154387451469.27651.8657130146789267501.stgit@gimli.home>
26ba25
Patchwork-id: 83236
26ba25
O-Subject: [RHEL-8.0 qemu-kvm PATCH 4/7] vfio/ccw/pci: Allow devices to opt-in for ballooning
26ba25
Bugzilla: 1650272
26ba25
RH-Acked-by: Peter Xu <peterx@redhat.com>
26ba25
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
26ba25
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
26ba25
RH-Acked-by: David Hildenbrand <david@redhat.com>
26ba25
26ba25
Bugzilla: 1650272
26ba25
26ba25
If a vfio assigned device makes use of a physical IOMMU, then memory
26ba25
ballooning is necessarily inhibited due to the page pinning, lack of
26ba25
page level granularity at the IOMMU, and lack of sufficient notifiers to both
26ba25
remove the page on balloon inflation and add it back on deflation.
26ba25
However, not all devices are backed by a physical IOMMU.  In the case
26ba25
of mediated devices, if a vendor driver is well synchronized with the
26ba25
guest driver, such that only pages actively used by the guest driver
26ba25
are pinned by the host mdev vendor driver, then there should be no
26ba25
overlap between pages available for the balloon driver and pages
26ba25
actively in use by the device.  Under these conditions, ballooning
26ba25
should be safe.
26ba25
26ba25
vfio-ccw devices are always mediated devices and always operate under
26ba25
the constraints above.  Therefore we can consider all vfio-ccw devices
26ba25
as balloon compatible.
26ba25
26ba25
The situation is far from straightforward with vfio-pci.  These
26ba25
devices can be physical devices with physical IOMMU backing or
26ba25
mediated devices where it is unknown whether a physical IOMMU is in
26ba25
use or whether the vendor driver is well synchronized to the working
26ba25
set of the guest driver.  The safest approach is therefore to assume
26ba25
all vfio-pci devices are incompatible with ballooning, but allow user
26ba25
opt-in should they have further insight into mediated devices.
26ba25
26ba25
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
26ba25
(cherry picked from commit 238e91728503d400e1c4e644e3a9b80f9e621682)
26ba25
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
26ba25
---
26ba25
 hw/vfio/ccw.c                 |  9 +++++++++
26ba25
 hw/vfio/common.c              | 23 ++++++++++++++++++++++-
26ba25
 hw/vfio/pci.c                 | 26 +++++++++++++++++++++++++-
26ba25
 hw/vfio/trace-events          |  1 +
26ba25
 include/hw/vfio/vfio-common.h |  2 ++
26ba25
 5 files changed, 59 insertions(+), 2 deletions(-)
26ba25
26ba25
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
26ba25
index fe34b50..0c74dda 100644
26ba25
--- a/hw/vfio/ccw.c
26ba25
+++ b/hw/vfio/ccw.c
26ba25
@@ -362,6 +362,15 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
26ba25
         }
26ba25
     }
26ba25
 
26ba25
+    /*
26ba25
+     * All vfio-ccw devices are believed to operate in a way compatible with
26ba25
+     * memory ballooning, i.e. pages pinned in the host are in the current
26ba25
+     * working set of the guest driver and therefore never overlap with pages
26ba25
+     * available to the guest balloon driver.  This needs to be set before
26ba25
+     * vfio_get_device() for vfio common to handle the balloon inhibitor.
26ba25
+     */
26ba25
+    vcdev->vdev.balloon_allowed = true;
26ba25
+
26ba25
     if (vfio_get_device(group, cdev->mdevid, &vcdev->vdev, &err)) {
26ba25
         g_free(vcdev->vdev.name);
26ba25
         goto out_device_err;
26ba25
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
26ba25
index 7e8f289..cda2d1f 100644
26ba25
--- a/hw/vfio/common.c
26ba25
+++ b/hw/vfio/common.c
26ba25
@@ -1376,7 +1376,9 @@ void vfio_put_group(VFIOGroup *group)
26ba25
         return;
26ba25
     }
26ba25
 
26ba25
-    qemu_balloon_inhibit(false);
26ba25
+    if (!group->balloon_allowed) {
26ba25
+        qemu_balloon_inhibit(false);
26ba25
+    }
26ba25
     vfio_kvm_device_del_group(group);
26ba25
     vfio_disconnect_container(group);
26ba25
     QLIST_REMOVE(group, next);
26ba25
@@ -1412,6 +1414,25 @@ int vfio_get_device(VFIOGroup *group, const char *name,
26ba25
         return ret;
26ba25
     }
26ba25
 
26ba25
+    /*
26ba25
+     * Clear the balloon inhibitor for this group if the driver knows the
26ba25
+     * device operates compatibly with ballooning.  Setting must be consistent
26ba25
+     * per group, but since compatibility is really only possible with mdev
26ba25
+     * currently, we expect singleton groups.
26ba25
+     */
26ba25
+    if (vbasedev->balloon_allowed != group->balloon_allowed) {
26ba25
+        if (!QLIST_EMPTY(&group->device_list)) {
26ba25
+            error_setg(errp,
26ba25
+                       "Inconsistent device balloon setting within group");
26ba25
+            return -1;
26ba25
+        }
26ba25
+
26ba25
+        if (!group->balloon_allowed) {
26ba25
+            group->balloon_allowed = true;
26ba25
+            qemu_balloon_inhibit(false);
26ba25
+        }
26ba25
+    }
26ba25
+
26ba25
     vbasedev->fd = fd;
26ba25
     vbasedev->group = group;
26ba25
     QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
26ba25
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
26ba25
index 4683eb4..d43727f 100644
26ba25
--- a/hw/vfio/pci.c
26ba25
+++ b/hw/vfio/pci.c
26ba25
@@ -2803,12 +2803,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
26ba25
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
26ba25
     VFIODevice *vbasedev_iter;
26ba25
     VFIOGroup *group;
26ba25
-    char *tmp, group_path[PATH_MAX], *group_name;
26ba25
+    char *tmp, *subsys, group_path[PATH_MAX], *group_name;
26ba25
     Error *err = NULL;
26ba25
     ssize_t len;
26ba25
     struct stat st;
26ba25
     int groupid;
26ba25
     int ret, i = 0;
26ba25
+    bool is_mdev;
26ba25
 
26ba25
     QLIST_FOREACH(group, &vfio_group_list, next) {
26ba25
         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
26ba25
@@ -2880,6 +2881,27 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
26ba25
         }
26ba25
     }
26ba25
 
26ba25
+    /*
26ba25
+     * Mediated devices *might* operate compatibly with memory ballooning, but
26ba25
+     * we cannot know for certain, it depends on whether the mdev vendor driver
26ba25
+     * stays in sync with the active working set of the guest driver.  Prevent
26ba25
+     * the x-balloon-allowed option unless this is minimally an mdev device.
26ba25
+     */
26ba25
+    tmp = g_strdup_printf("%s/subsystem", vdev->vbasedev.sysfsdev);
26ba25
+    subsys = realpath(tmp, NULL);
26ba25
+    g_free(tmp);
26ba25
+    is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
26ba25
+    free(subsys);
26ba25
+
26ba25
+    trace_vfio_mdev(vdev->vbasedev.name, is_mdev);
26ba25
+
26ba25
+    if (vdev->vbasedev.balloon_allowed && !is_mdev) {
26ba25
+        error_setg(errp, "x-balloon-allowed only potentially compatible "
26ba25
+                   "with mdev devices");
26ba25
+        vfio_put_group(group);
26ba25
+        goto error;
26ba25
+    }
26ba25
+
26ba25
     ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp);
26ba25
     if (ret) {
26ba25
         vfio_put_group(group);
26ba25
@@ -3177,6 +3199,8 @@ static Property vfio_pci_dev_properties[] = {
26ba25
     DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
26ba25
                     VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
26ba25
     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
26ba25
+    DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
26ba25
+                     vbasedev.balloon_allowed, false),
26ba25
     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
26ba25
     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
26ba25
     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
26ba25
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
26ba25
index 20109cb..9487887 100644
26ba25
--- a/hw/vfio/trace-events
26ba25
+++ b/hw/vfio/trace-events
26ba25
@@ -39,6 +39,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %
26ba25
 vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
26ba25
 vfio_populate_device_get_irq_info_failure(void) "VFIO_DEVICE_GET_IRQ_INFO failure: %m"
26ba25
 vfio_realize(const char *name, int group_id) " (%s) group %d"
26ba25
+vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d"
26ba25
 vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x"
26ba25
 vfio_pci_reset(const char *name) " (%s)"
26ba25
 vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET"
26ba25
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
26ba25
index f29df6e..36ee657 100644
26ba25
--- a/include/hw/vfio/vfio-common.h
26ba25
+++ b/include/hw/vfio/vfio-common.h
26ba25
@@ -123,6 +123,7 @@ typedef struct VFIODevice {
26ba25
     bool reset_works;
26ba25
     bool needs_reset;
26ba25
     bool no_mmap;
26ba25
+    bool balloon_allowed;
26ba25
     VFIODeviceOps *ops;
26ba25
     unsigned int num_irqs;
26ba25
     unsigned int num_regions;
26ba25
@@ -142,6 +143,7 @@ typedef struct VFIOGroup {
26ba25
     QLIST_HEAD(, VFIODevice) device_list;
26ba25
     QLIST_ENTRY(VFIOGroup) next;
26ba25
     QLIST_ENTRY(VFIOGroup) container_next;
26ba25
+    bool balloon_allowed;
26ba25
 } VFIOGroup;
26ba25
 
26ba25
 typedef struct VFIODMABuf {
26ba25
-- 
26ba25
1.8.3.1
26ba25