218e99
From 742bae4cea60c8601bbb6e5ec643167d8ca664d7 Mon Sep 17 00:00:00 2001
218e99
From: Nigel Croxon <ncroxon@redhat.com>
218e99
Date: Tue, 6 Aug 2013 19:52:04 +0200
218e99
Subject: vfio: QEMU-AER: Qemu changes to support AER for VFIO-PCI devices
218e99
218e99
RH-Author: Nigel Croxon <ncroxon@redhat.com>
218e99
Message-id: <1375818724-41239-3-git-send-email-ncroxon@redhat.com>
218e99
Patchwork-id: 53017
218e99
O-Subject: [RHEL7.0 qemu-kvm PATCH v2 2/2] vfio: QEMU-AER: Qemu changes to support AER for VFIO-PCI devices
218e99
Bugzilla: 984604
218e99
RH-Acked-by: Laszlo Ersek <lersek@redhat.com>
218e99
RH-Acked-by: Alex Williamson <alex.williamson@redhat.com>
218e99
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
218e99
218e99
From: Vijay Mohan Pandarathil <vijaymohan.pandarathil@hp.com>
218e99
218e99
Add support for error containment when a VFIO device assigned to a KVM
218e99
guest encounters an error. This is for PCIe devices/drivers that support AER
218e99
functionality. When the host OS is notified of an error in a device either
218e99
through the firmware-first approach or through an interrupt handled by the AER
218e99
root port driver, the error handler registered by the vfio-pci driver gets
218e99
invoked. The qemu process is signaled through an eventfd registered per
218e99
VFIO device by the qemu process. In the eventfd handler, qemu decides on
218e99
what action to take. In this implementation, the guest is brought down to
218e99
contain the error.
218e99
218e99
The kernel patches for the above functionality have already been accepted.
218e99
218e99
This is a refresh of the QEMU patch which was reviewed earlier.
218e99
http://marc.info/?l=linux-kernel&m=136281557608087&w=2
218e99
This patch has the same contents and has been built after refreshing
218e99
to the latest upstream and after the Linux headers have been updated in QEMU.
218e99
218e99
	- Create eventfd per vfio device assigned to a guest and register an
218e99
          event handler
218e99
218e99
	- This fd is passed to the vfio_pci driver through the SET_IRQ ioctl
218e99
218e99
	- When the device encounters an error, the eventfd is signalled
218e99
          and the qemu eventfd handler gets invoked.
218e99
218e99
	- In the handler, decide what action to take. The current action taken
218e99
          is to stop the guest.
218e99
218e99
Signed-off-by: Vijay Mohan Pandarathil <vijaymohan.pandarathil@hp.com>
218e99
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
218e99
(cherry picked from commit 7b4b0e9eda51902b53bc1a2318df53cdb8b72eed)
218e99
218e99
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
218e99
index 693a9ff..f8fef8c 100644
218e99
--- a/hw/misc/vfio.c
218e99
+++ b/hw/misc/vfio.c
218e99
@@ -158,6 +158,7 @@ typedef struct VFIODevice {
218e99
     PCIHostDeviceAddress host;
218e99
     QLIST_ENTRY(VFIODevice) next;
218e99
     struct VFIOGroup *group;
218e99
+    EventNotifier err_notifier;
218e99
     uint32_t features;
218e99
 #define VFIO_FEATURE_ENABLE_VGA_BIT 0
218e99
 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
218e99
@@ -165,6 +166,7 @@ typedef struct VFIODevice {
218e99
     uint8_t pm_cap;
218e99
     bool reset_works;
218e99
     bool has_vga;
218e99
+    bool pci_aer;
218e99
 } VFIODevice;
218e99
 
218e99
 typedef struct VFIOGroup {
218e99
@@ -2776,6 +2778,7 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
218e99
 {
218e99
     struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
218e99
     struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
218e99
+    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
218e99
     int ret, i;
218e99
 
218e99
     ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
218e99
@@ -2919,6 +2922,19 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
218e99
 
218e99
         vdev->has_vga = true;
218e99
     }
218e99
+    irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
218e99
+
218e99
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
218e99
+    if (ret) {
218e99
+        /* This can fail for an old kernel or legacy PCI dev */
218e99
+        DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure ret=%d\n", ret);
218e99
+        ret = 0;
218e99
+    } else if (irq_info.count == 1) {
218e99
+        vdev->pci_aer = true;
218e99
+    } else {
218e99
+        error_report("vfio: Warning: "
218e99
+                     "Could not enable error recovery for the device\n");
218e99
+    }
218e99
 
218e99
 error:
218e99
     if (ret) {
218e99
@@ -2941,6 +2957,113 @@ static void vfio_put_device(VFIODevice *vdev)
218e99
     }
218e99
 }
218e99
 
218e99
+static void vfio_err_notifier_handler(void *opaque)
218e99
+{
218e99
+    VFIODevice *vdev = opaque;
218e99
+
218e99
+    if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
218e99
+        return;
218e99
+    }
218e99
+
218e99
+    /*
218e99
+     * TBD. Retrieve the error details and decide what action
218e99
+     * needs to be taken. One of the actions could be to pass
218e99
+     * the error to the guest and have the guest driver recover
218e99
+     * from the error. This requires that PCIe capabilities be
218e99
+     * exposed to the guest. For now, we just terminate the
218e99
+     * guest to contain the error.
218e99
+     */
218e99
+
218e99
+    error_report("%s (%04x:%02x:%02x.%x)"
218e99
+        "Unrecoverable error detected...\n"
218e99
+        "Please collect any data possible and then kill the guest",
218e99
+        __func__, vdev->host.domain, vdev->host.bus,
218e99
+        vdev->host.slot, vdev->host.function);
218e99
+
218e99
+    vm_stop(RUN_STATE_IO_ERROR);
218e99
+}
218e99
+
218e99
+/*
218e99
+ * Registers error notifier for devices supporting error recovery.
218e99
+ * If we encounter a failure in this function, we report an error
218e99
+ * and continue after disabling error recovery support for the
218e99
+ * device.
218e99
+ */
218e99
+static void vfio_register_err_notifier(VFIODevice *vdev)
218e99
+{
218e99
+    int ret;
218e99
+    int argsz;
218e99
+    struct vfio_irq_set *irq_set;
218e99
+    int32_t *pfd;
218e99
+
218e99
+    if (!vdev->pci_aer) {
218e99
+        return;
218e99
+    }
218e99
+
218e99
+    if (event_notifier_init(&vdev->err_notifier, 0)) {
218e99
+        error_report("vfio: Warning: "
218e99
+                     "Unable to init event notifier for error detection\n");
218e99
+        vdev->pci_aer = false;
218e99
+        return;
218e99
+    }
218e99
+
218e99
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
218e99
+
218e99
+    irq_set = g_malloc0(argsz);
218e99
+    irq_set->argsz = argsz;
218e99
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
218e99
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
218e99
+    irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
218e99
+    irq_set->start = 0;
218e99
+    irq_set->count = 1;
218e99
+    pfd = (int32_t *)&irq_set->data;
218e99
+
218e99
+    *pfd = event_notifier_get_fd(&vdev->err_notifier);
218e99
+    qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);
218e99
+
218e99
+    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
218e99
+    if (ret) {
218e99
+        error_report("vfio: Failed to set up error notification\n");
218e99
+        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
218e99
+        event_notifier_cleanup(&vdev->err_notifier);
218e99
+        vdev->pci_aer = false;
218e99
+    }
218e99
+    g_free(irq_set);
218e99
+}
218e99
+
218e99
+static void vfio_unregister_err_notifier(VFIODevice *vdev)
218e99
+{
218e99
+    int argsz;
218e99
+    struct vfio_irq_set *irq_set;
218e99
+    int32_t *pfd;
218e99
+    int ret;
218e99
+
218e99
+    if (!vdev->pci_aer) {
218e99
+        return;
218e99
+    }
218e99
+
218e99
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
218e99
+
218e99
+    irq_set = g_malloc0(argsz);
218e99
+    irq_set->argsz = argsz;
218e99
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
218e99
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
218e99
+    irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
218e99
+    irq_set->start = 0;
218e99
+    irq_set->count = 1;
218e99
+    pfd = (int32_t *)&irq_set->data;
218e99
+    *pfd = -1;
218e99
+
218e99
+    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
218e99
+    if (ret) {
218e99
+        error_report("vfio: Failed to de-assign error fd: %d\n", ret);
218e99
+    }
218e99
+    g_free(irq_set);
218e99
+    qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
218e99
+                        NULL, NULL, vdev);
218e99
+    event_notifier_cleanup(&vdev->err_notifier);
218e99
+}
218e99
+
218e99
 static int vfio_initfn(PCIDevice *pdev)
218e99
 {
218e99
     VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
218e99
@@ -3073,6 +3196,7 @@ static int vfio_initfn(PCIDevice *pdev)
218e99
     }
218e99
 
218e99
     add_boot_device_path(vdev->bootindex, &pdev->qdev, NULL);
218e99
+    vfio_register_err_notifier(vdev);
218e99
 
218e99
     return 0;
218e99
 
218e99
@@ -3092,6 +3216,7 @@ static void vfio_exitfn(PCIDevice *pdev)
218e99
     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
218e99
     VFIOGroup *group = vdev->group;
218e99
 
218e99
+    vfio_unregister_err_notifier(vdev);
218e99
     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
218e99
     vfio_disable_interrupts(vdev);
218e99
     if (vdev->intx.mmap_timer) {