218e99
From 8ea5e0ba0b1ba054d71bb10f8c45c167dd3d7792 Mon Sep 17 00:00:00 2001
218e99
From: Alex Williamson <alex.williamson@redhat.com>
218e99
Date: Tue, 5 Nov 2013 17:31:11 +0100
218e99
Subject: [PATCH 19/25] vfio-pci: Implement PCI hot reset
218e99
218e99
RH-Author: Alex Williamson <alex.williamson@redhat.com>
218e99
Message-id: <20131105173110.19372.22420.stgit@bling.home>
218e99
Patchwork-id: 55440
218e99
O-Subject: [RHEL7 qemu-kvm PATCH 2/2] vfio-pci: Implement PCI hot reset
218e99
Bugzilla: 1025472
218e99
RH-Acked-by: Bandan Das <bsd@redhat.com>
218e99
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
218e99
RH-Acked-by: Laszlo Ersek <lersek@redhat.com>
218e99
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
218e99
218e99
Bugzilla: 1025472
218e99
Upstream commit: f16f39c3fc973c5d7cbc2224eefb4ef5eb1e64ff
218e99
218e99
Now that VFIO has a PCI hot reset interface, take advantage of it.
218e99
There are two modes that we need to consider.  The first is when only
218e99
one device within the set of devices affected is actually assigned to
218e99
the guest.  In this case the other devices are just held by VFIO
218e99
for isolation and we can pretend they're not there, doing an entire
218e99
bus reset whenever the device reset callback is triggered.  Supporting
218e99
this case separately allows us to do the best reset we can do of the
218e99
device even if the device is hotplugged.
218e99
218e99
The second mode is when multiple affected devices are all exposed to
218e99
the guest.  In this case we can only do a hot reset when the entire
218e99
system is being reset.  However, this also allows us to track which
218e99
individual devices are affected by a reset and only do them once.
218e99
218e99
We split our reset function into pre- and post-reset helper functions,
218e99
prioritize the types of device resets available to us, and create
218e99
separate _one vs _multi reset interfaces to handle the distinct cases
218e99
above.
218e99
218e99
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
218e99
---
218e99
 hw/misc/vfio.c |  338 ++++++++++++++++++++++++++++++++++++++++++++++++++------
218e99
 1 file changed, 300 insertions(+), 38 deletions(-)
218e99
218e99
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
218e99
---
218e99
 hw/misc/vfio.c |  338 +++++++++++++++++++++++++++++++++++++++++++++++++-------
218e99
 1 files changed, 300 insertions(+), 38 deletions(-)
218e99
218e99
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
218e99
index 6178221..331ae5f 100644
218e99
--- a/hw/misc/vfio.c
218e99
+++ b/hw/misc/vfio.c
218e99
@@ -188,6 +188,7 @@ typedef struct VFIODevice {
218e99
     bool pci_aer;
218e99
     bool has_flr;
218e99
     bool has_pm_reset;
218e99
+    bool needs_reset;
218e99
 } VFIODevice;
218e99
 
218e99
 typedef struct VFIOGroup {
218e99
@@ -2759,6 +2760,279 @@ static int vfio_add_capabilities(VFIODevice *vdev)
218e99
     return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
218e99
 }
218e99
 
218e99
+static void vfio_pci_pre_reset(VFIODevice *vdev)
218e99
+{
218e99
+    PCIDevice *pdev = &vdev->pdev;
218e99
+    uint16_t cmd;
218e99
+
218e99
+    vfio_disable_interrupts(vdev);
218e99
+
218e99
+    /* Make sure the device is in D0 */
218e99
+    if (vdev->pm_cap) {
218e99
+        uint16_t pmcsr;
218e99
+        uint8_t state;
218e99
+
218e99
+        pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
218e99
+        state = pmcsr & PCI_PM_CTRL_STATE_MASK;
218e99
+        if (state) {
218e99
+            pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
218e99
+            vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
218e99
+            /* vfio handles the necessary delay here */
218e99
+            pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
218e99
+            state = pmcsr & PCI_PM_CTRL_STATE_MASK;
218e99
+            if (state) {
218e99
+                error_report("vfio: Unable to power on device, stuck in D%d\n",
218e99
+                             state);
218e99
+            }
218e99
+        }
218e99
+    }
218e99
+
218e99
+    /*
218e99
+     * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
218e99
+     * Also put INTx Disable in known state.
218e99
+     */
218e99
+    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
218e99
+    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
218e99
+             PCI_COMMAND_INTX_DISABLE);
218e99
+    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
218e99
+}
218e99
+
218e99
+static void vfio_pci_post_reset(VFIODevice *vdev)
218e99
+{
218e99
+    vfio_enable_intx(vdev);
218e99
+}
218e99
+
218e99
+static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
218e99
+                                PCIHostDeviceAddress *host2)
218e99
+{
218e99
+    return (host1->domain == host2->domain && host1->bus == host2->bus &&
218e99
+            host1->slot == host2->slot && host1->function == host2->function);
218e99
+}
218e99
+
218e99
+static int vfio_pci_hot_reset(VFIODevice *vdev, bool single)
218e99
+{
218e99
+    VFIOGroup *group;
218e99
+    struct vfio_pci_hot_reset_info *info;
218e99
+    struct vfio_pci_dependent_device *devices;
218e99
+    struct vfio_pci_hot_reset *reset;
218e99
+    int32_t *fds;
218e99
+    int ret, i, count;
218e99
+    bool multi = false;
218e99
+
218e99
+    DPRINTF("%s(%04x:%02x:%02x.%x) %s\n", __func__, vdev->host.domain,
218e99
+            vdev->host.bus, vdev->host.slot, vdev->host.function,
218e99
+            single ? "one" : "multi");
218e99
+
218e99
+    vfio_pci_pre_reset(vdev);
218e99
+    vdev->needs_reset = false;
218e99
+
218e99
+    info = g_malloc0(sizeof(*info));
218e99
+    info->argsz = sizeof(*info);
218e99
+
218e99
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
218e99
+    if (ret && errno != ENOSPC) {
218e99
+        ret = -errno;
218e99
+        if (!vdev->has_pm_reset) {
218e99
+            error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
218e99
+                         "no available reset mechanism.", vdev->host.domain,
218e99
+                         vdev->host.bus, vdev->host.slot, vdev->host.function);
218e99
+        }
218e99
+        goto out_single;
218e99
+    }
218e99
+
218e99
+    count = info->count;
218e99
+    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
218e99
+    info->argsz = sizeof(*info) + (count * sizeof(*devices));
218e99
+    devices = &info->devices[0];
218e99
+
218e99
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
218e99
+    if (ret) {
218e99
+        ret = -errno;
218e99
+        error_report("vfio: hot reset info failed: %m");
218e99
+        goto out_single;
218e99
+    }
218e99
+
218e99
+    DPRINTF("%04x:%02x:%02x.%x: hot reset dependent devices:\n",
218e99
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
218e99
+            vdev->host.function);
218e99
+
218e99
+    /* Verify that we have all the groups required */
218e99
+    for (i = 0; i < info->count; i++) {
218e99
+        PCIHostDeviceAddress host;
218e99
+        VFIODevice *tmp;
218e99
+
218e99
+        host.domain = devices[i].segment;
218e99
+        host.bus = devices[i].bus;
218e99
+        host.slot = PCI_SLOT(devices[i].devfn);
218e99
+        host.function = PCI_FUNC(devices[i].devfn);
218e99
+
218e99
+        DPRINTF("\t%04x:%02x:%02x.%x group %d\n", host.domain,
218e99
+                host.bus, host.slot, host.function, devices[i].group_id);
218e99
+
218e99
+        if (vfio_pci_host_match(&host, &vdev->host)) {
218e99
+            continue;
218e99
+        }
218e99
+
218e99
+        QLIST_FOREACH(group, &group_list, next) {
218e99
+            if (group->groupid == devices[i].group_id) {
218e99
+                break;
218e99
+            }
218e99
+        }
218e99
+
218e99
+        if (!group) {
218e99
+            if (!vdev->has_pm_reset) {
218e99
+                error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
218e99
+                             "depends on group %d which is not owned.",
218e99
+                             vdev->host.domain, vdev->host.bus, vdev->host.slot,
218e99
+                             vdev->host.function, devices[i].group_id);
218e99
+            }
218e99
+            ret = -EPERM;
218e99
+            goto out;
218e99
+        }
218e99
+
218e99
+        /* Prep dependent devices for reset and clear our marker. */
218e99
+        QLIST_FOREACH(tmp, &group->device_list, next) {
218e99
+            if (vfio_pci_host_match(&host, &tmp->host)) {
218e99
+                if (single) {
218e99
+                    DPRINTF("vfio: found another in-use device "
218e99
+                            "%04x:%02x:%02x.%x\n", host.domain, host.bus,
218e99
+                            host.slot, host.function);
218e99
+                    ret = -EINVAL;
218e99
+                    goto out_single;
218e99
+                }
218e99
+                vfio_pci_pre_reset(tmp);
218e99
+                tmp->needs_reset = false;
218e99
+                multi = true;
218e99
+                break;
218e99
+            }
218e99
+        }
218e99
+    }
218e99
+
218e99
+    if (!single && !multi) {
218e99
+        DPRINTF("vfio: No other in-use devices for multi hot reset\n");
218e99
+        ret = -EINVAL;
218e99
+        goto out_single;
218e99
+    }
218e99
+
218e99
+    /* Determine how many group fds need to be passed */
218e99
+    count = 0;
218e99
+    QLIST_FOREACH(group, &group_list, next) {
218e99
+        for (i = 0; i < info->count; i++) {
218e99
+            if (group->groupid == devices[i].group_id) {
218e99
+                count++;
218e99
+                break;
218e99
+            }
218e99
+        }
218e99
+    }
218e99
+
218e99
+    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
218e99
+    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
218e99
+    fds = &reset->group_fds[0];
218e99
+
218e99
+    /* Fill in group fds */
218e99
+    QLIST_FOREACH(group, &group_list, next) {
218e99
+        for (i = 0; i < info->count; i++) {
218e99
+            if (group->groupid == devices[i].group_id) {
218e99
+                fds[reset->count++] = group->fd;
218e99
+                break;
218e99
+            }
218e99
+        }
218e99
+    }
218e99
+
218e99
+    /* Bus reset! */
218e99
+    ret = ioctl(vdev->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
218e99
+    g_free(reset);
218e99
+
218e99
+    DPRINTF("%04x:%02x:%02x.%x hot reset: %s\n", vdev->host.domain,
218e99
+            vdev->host.bus, vdev->host.slot, vdev->host.function,
218e99
+            ret ? "%m" : "Success");
218e99
+
218e99
+out:
218e99
+    /* Re-enable INTx on affected devices */
218e99
+    for (i = 0; i < info->count; i++) {
218e99
+        PCIHostDeviceAddress host;
218e99
+        VFIODevice *tmp;
218e99
+
218e99
+        host.domain = devices[i].segment;
218e99
+        host.bus = devices[i].bus;
218e99
+        host.slot = PCI_SLOT(devices[i].devfn);
218e99
+        host.function = PCI_FUNC(devices[i].devfn);
218e99
+
218e99
+        if (vfio_pci_host_match(&host, &vdev->host)) {
218e99
+            continue;
218e99
+        }
218e99
+
218e99
+        QLIST_FOREACH(group, &group_list, next) {
218e99
+            if (group->groupid == devices[i].group_id) {
218e99
+                break;
218e99
+            }
218e99
+        }
218e99
+
218e99
+        if (!group) {
218e99
+            break;
218e99
+        }
218e99
+
218e99
+        QLIST_FOREACH(tmp, &group->device_list, next) {
218e99
+            if (vfio_pci_host_match(&host, &tmp->host)) {
218e99
+                vfio_pci_post_reset(tmp);
218e99
+                break;
218e99
+            }
218e99
+        }
218e99
+    }
218e99
+out_single:
218e99
+    vfio_pci_post_reset(vdev);
218e99
+    g_free(info);
218e99
+
218e99
+    return ret;
218e99
+}
218e99
+
218e99
+/*
218e99
+ * We want to differentiate hot reset of multiple in-use devices vs hot reset
218e99
+ * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
218e99
+ * of doing hot resets when there is only a single device per bus.  The in-use
218e99
+ * here refers to how many VFIODevices are affected.  A hot reset that affects
218e99
+ * multiple devices, but only a single in-use device, means that we can call
218e99
+ * it from our bus ->reset() callback since the extent is effectively a single
218e99
+ * device.  This allows us to make use of it in the hotplug path.  When there
218e99
+ * are multiple in-use devices, we can only trigger the hot reset during a
218e99
+ * system reset and thus from our reset handler.  We separate _one vs _multi
218e99
+ * here so that we don't overlap and do a double reset on the system reset
218e99
+ * path where both our reset handler and ->reset() callback are used.  Calling
218e99
+ * _one() will only do a hot reset for the one in-use device case, calling
218e99
+ * _multi() will do nothing if a _one() would have been sufficient.
218e99
+ */
218e99
+static int vfio_pci_hot_reset_one(VFIODevice *vdev)
218e99
+{
218e99
+    return vfio_pci_hot_reset(vdev, true);
218e99
+}
218e99
+
218e99
+static int vfio_pci_hot_reset_multi(VFIODevice *vdev)
218e99
+{
218e99
+    return vfio_pci_hot_reset(vdev, false);
218e99
+}
218e99
+
218e99
+static void vfio_pci_reset_handler(void *opaque)
218e99
+{
218e99
+    VFIOGroup *group;
218e99
+    VFIODevice *vdev;
218e99
+
218e99
+    QLIST_FOREACH(group, &group_list, next) {
218e99
+        QLIST_FOREACH(vdev, &group->device_list, next) {
218e99
+            if (!vdev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
218e99
+                vdev->needs_reset = true;
218e99
+            }
218e99
+        }
218e99
+    }
218e99
+
218e99
+    QLIST_FOREACH(group, &group_list, next) {
218e99
+        QLIST_FOREACH(vdev, &group->device_list, next) {
218e99
+            if (vdev->needs_reset) {
218e99
+                vfio_pci_hot_reset_multi(vdev);
218e99
+            }
218e99
+        }
218e99
+    }
218e99
+}
218e99
+
218e99
 static int vfio_connect_container(VFIOGroup *group)
218e99
 {
218e99
     VFIOContainer *container;
218e99
@@ -2901,6 +3175,10 @@ static VFIOGroup *vfio_get_group(int groupid)
218e99
         return NULL;
218e99
     }
218e99
 
218e99
+    if (QLIST_EMPTY(&group_list)) {
218e99
+        qemu_register_reset(vfio_pci_reset_handler, NULL);
218e99
+    }
218e99
+
218e99
     QLIST_INSERT_HEAD(&group_list, group, next);
218e99
 
218e99
     return group;
218e99
@@ -2917,6 +3195,10 @@ static void vfio_put_group(VFIOGroup *group)
218e99
     DPRINTF("vfio_put_group: close group->fd\n");
218e99
     close(group->fd);
218e99
     g_free(group);
218e99
+
218e99
+    if (QLIST_EMPTY(&group_list)) {
218e99
+        qemu_unregister_reset(vfio_pci_reset_handler, NULL);
218e99
+    }
218e99
 }
218e99
 
218e99
 static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
218e99
@@ -2955,9 +3237,6 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
218e99
     }
218e99
 
218e99
     vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
218e99
-    if (!vdev->reset_works) {
218e99
-        error_report("Warning, device %s does not support reset", name);
218e99
-    }
218e99
 
218e99
     if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
218e99
         error_report("vfio: unexpected number of io regions %u",
218e99
@@ -3363,51 +3642,34 @@ static void vfio_pci_reset(DeviceState *dev)
218e99
 {
218e99
     PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
218e99
     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
218e99
-    uint16_t cmd;
218e99
 
218e99
     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
218e99
             vdev->host.bus, vdev->host.slot, vdev->host.function);
218e99
 
218e99
-    vfio_disable_interrupts(vdev);
218e99
-
218e99
-    /* Make sure the device is in D0 */
218e99
-    if (vdev->pm_cap) {
218e99
-        uint16_t pmcsr;
218e99
-        uint8_t state;
218e99
+    vfio_pci_pre_reset(vdev);
218e99
 
218e99
-        pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
218e99
-        state = pmcsr & PCI_PM_CTRL_STATE_MASK;
218e99
-        if (state) {
218e99
-            pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
218e99
-            vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
218e99
-            /* vfio handles the necessary delay here */
218e99
-            pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
218e99
-            state = pmcsr & PCI_PM_CTRL_STATE_MASK;
218e99
-            if (state) {
218e99
-                error_report("vfio: Unable to power on device, stuck in D%d\n",
218e99
-                             state);
218e99
-            }
218e99
-        }
218e99
+    if (vdev->reset_works && (vdev->has_flr || !vdev->has_pm_reset) &&
218e99
+        !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
218e99
+        DPRINTF("%04x:%02x:%02x.%x FLR/VFIO_DEVICE_RESET\n", vdev->host.domain,
218e99
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
218e99
+        goto post_reset;
218e99
     }
218e99
 
218e99
-    /*
218e99
-     * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
218e99
-     * Also put INTx Disable in known state.
218e99
-     */
218e99
-    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
218e99
-    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
218e99
-             PCI_COMMAND_INTX_DISABLE);
218e99
-    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
218e99
+    /* See if we can do our own bus reset */
218e99
+    if (!vfio_pci_hot_reset_one(vdev)) {
218e99
+        goto post_reset;
218e99
+    }
218e99
 
218e99
-    if (vdev->reset_works) {
218e99
-        if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
218e99
-            error_report("vfio: Error unable to reset physical device "
218e99
-                         "(%04x:%02x:%02x.%x): %m", vdev->host.domain,
218e99
-                         vdev->host.bus, vdev->host.slot, vdev->host.function);
218e99
-        }
218e99
+    /* If nothing else works and the device supports PM reset, use it */
218e99
+    if (vdev->reset_works && vdev->has_pm_reset &&
218e99
+        !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
218e99
+        DPRINTF("%04x:%02x:%02x.%x PCI PM Reset\n", vdev->host.domain,
218e99
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
218e99
+        goto post_reset;
218e99
     }
218e99
 
218e99
-    vfio_enable_intx(vdev);
218e99
+post_reset:
218e99
+    vfio_pci_post_reset(vdev);
218e99
 }
218e99
 
218e99
 static Property vfio_pci_dev_properties[] = {
218e99
-- 
218e99
1.7.1
218e99