958e1b
From 81ee28858d0082b300184c9fcfc10480c67aa74d Mon Sep 17 00:00:00 2001
2382db
From: Alex Williamson <alex.williamson@redhat.com>
2382db
Date: Thu, 7 Aug 2014 21:03:08 +0200
958e1b
Subject: [PATCH 5/7] vfio-pci: Fix MSI-X masking performance
2382db
2382db
Message-id: <20140807210308.11689.93757.stgit@gimli.home>
2382db
Patchwork-id: 60481
2382db
O-Subject: [RHEL7.0/z qemu-kvm PATCH v2 4/6] vfio-pci: Fix MSI-X masking performance
958e1b
Bugzilla: 1098976
2382db
RH-Acked-by: Bandan Das <bsd@redhat.com>
2382db
RH-Acked-by: Amos Kong <akong@redhat.com>
2382db
RH-Acked-by: Laszlo Ersek <lersek@redhat.com>
2382db
2382db
There are still old guests out there that over-exercise MSI-X masking.
2382db
The current code completely sets-up and tears-down an MSI-X vector on
2382db
the "use" and "release" callbacks.  While this is functional, it can
2382db
slow an old guest to a crawl.  We can easily skip the KVM parts of
2382db
this so that we keep the MSI route and irqfd setup.  We do however
2382db
need to switch VFIO to trigger a different eventfd while masked.
2382db
Actually, we have the option of continuing to use -1 to disable the
2382db
trigger, but by using another EventNotifier we can allow the MSI-X
2382db
core to emulate pending bits and re-fire the vector once unmasked.
2382db
MSI code gets updated as well to use the same setup and teardown
2382db
structures and functions.
2382db
2382db
Prior to this change, an igbvf assigned to a RHEL5 guest gets about
2382db
20Mbps and 50 transactions/s with netperf (remote or VF->PF).  With
2382db
this change, we get line rate and 3k transactions/s remote or 2Gbps
2382db
and 6k+ transactions/s to the PF.  No significant change is expected
2382db
for newer guests with more well behaved MSI-X support.
2382db
2382db
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2382db
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
2382db
---
958e1b
 hw/misc/vfio.c | 233 ++++++++++++++++++++++++++++++++-------------------------
958e1b
 1 file changed, 131 insertions(+), 102 deletions(-)
2382db
2382db
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
2382db
index 5f0460a..bd37924 100644
2382db
--- a/hw/misc/vfio.c
2382db
+++ b/hw/misc/vfio.c
2382db
@@ -120,6 +120,7 @@ typedef struct VFIOINTx {
2382db
 
2382db
 typedef struct VFIOMSIVector {
2382db
     EventNotifier interrupt; /* eventfd triggered on interrupt */
2382db
+    EventNotifier kvm_interrupt; /* eventfd triggered for KVM irqfd bypass */
2382db
     struct VFIODevice *vdev; /* back pointer to device */
2382db
     MSIMessage msg; /* cache the MSI message so we know when it changes */
2382db
     int virq; /* KVM irqchip route for QEMU bypass */
2382db
@@ -663,10 +664,11 @@ static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
2382db
     for (i = 0; i < vdev->nr_vectors; i++) {
2382db
         if (!vdev->msi_vectors[i].use) {
2382db
             fds[i] = -1;
2382db
-            continue;
2382db
+        } else if (vdev->msi_vectors[i].virq >= 0) {
2382db
+            fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
2382db
+        } else {
2382db
+            fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
2382db
         }
2382db
-
2382db
-        fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
2382db
     }
2382db
 
2382db
     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
2382db
@@ -676,6 +678,52 @@ static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
2382db
     return ret;
2382db
 }
2382db
 
2382db
+static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg,
2382db
+                                  bool msix)
2382db
+{
2382db
+    int virq;
2382db
+
2382db
+    if ((msix && !VFIO_ALLOW_KVM_MSIX) ||
2382db
+        (!msix && !VFIO_ALLOW_KVM_MSI) || !msg) {
2382db
+        return;
2382db
+    }
2382db
+
2382db
+    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
2382db
+        return;
2382db
+    }
2382db
+
2382db
+    virq = kvm_irqchip_add_msi_route(kvm_state, *msg);
2382db
+    if (virq < 0) {
2382db
+        event_notifier_cleanup(&vector->kvm_interrupt);
2382db
+        return;
2382db
+    }
2382db
+
2382db
+    if (kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
2382db
+                                       virq) < 0) {
2382db
+        kvm_irqchip_release_virq(kvm_state, virq);
2382db
+        event_notifier_cleanup(&vector->kvm_interrupt);
2382db
+        return;
2382db
+    }
2382db
+
2382db
+    vector->msg = *msg;
2382db
+    vector->virq = virq;
2382db
+}
2382db
+
2382db
+static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
2382db
+{
2382db
+    kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
2382db
+                                      vector->virq);
2382db
+    kvm_irqchip_release_virq(kvm_state, vector->virq);
2382db
+    vector->virq = -1;
2382db
+    event_notifier_cleanup(&vector->kvm_interrupt);
2382db
+}
2382db
+
2382db
+static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg)
2382db
+{
2382db
+    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
2382db
+    vector->msg = msg;
2382db
+}
2382db
+
2382db
 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
2382db
                                    MSIMessage *msg, IOHandler *handler)
2382db
 {
2382db
@@ -688,30 +736,32 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
2382db
             vdev->host.function, nr);
2382db
 
2382db
     vector = &vdev->msi_vectors[nr];
2382db
-    vector->vdev = vdev;
2382db
-    vector->use = true;
2382db
-
2382db
-    msix_vector_use(pdev, nr);
2382db
 
2382db
-    if (event_notifier_init(&vector->interrupt, 0)) {
2382db
-        error_report("vfio: Error: event_notifier_init failed");
2382db
+    if (!vector->use) {
2382db
+        vector->vdev = vdev;
2382db
+        vector->virq = -1;
2382db
+        if (event_notifier_init(&vector->interrupt, 0)) {
2382db
+            error_report("vfio: Error: event_notifier_init failed");
2382db
+        }
2382db
+        vector->use = true;
2382db
+        msix_vector_use(pdev, nr);
2382db
     }
2382db
 
2382db
+    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
2382db
+                        handler, NULL, vector);
2382db
+
2382db
     /*
2382db
      * Attempt to enable route through KVM irqchip,
2382db
      * default to userspace handling if unavailable.
2382db
      */
2382db
-    vector->virq = msg && VFIO_ALLOW_KVM_MSIX ?
2382db
-                   kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
2382db
-    if (vector->virq < 0 ||
2382db
-        kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
2382db
-                                       vector->virq) < 0) {
2382db
-        if (vector->virq >= 0) {
2382db
-            kvm_irqchip_release_virq(kvm_state, vector->virq);
2382db
-            vector->virq = -1;
2382db
+    if (vector->virq >= 0) {
2382db
+        if (!msg) {
2382db
+            vfio_remove_kvm_msi_virq(vector);
2382db
+        } else {
2382db
+            vfio_update_kvm_msi_virq(vector, *msg);
2382db
         }
2382db
-        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
2382db
-                            handler, NULL, vector);
2382db
+    } else {
2382db
+        vfio_add_kvm_msi_virq(vector, msg, true);
2382db
     }
2382db
 
2382db
     /*
2382db
@@ -742,7 +792,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
2382db
         irq_set->count = 1;
2382db
         pfd = (int32_t *)&irq_set->data;
2382db
 
2382db
-        *pfd = event_notifier_get_fd(&vector->interrupt);
2382db
+        if (vector->virq >= 0) {
2382db
+            *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
2382db
+        } else {
2382db
+            *pfd = event_notifier_get_fd(&vector->interrupt);
2382db
+        }
2382db
 
2382db
         ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
2382db
         g_free(irq_set);
2382db
@@ -764,50 +818,41 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
2382db
 {
2382db
     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
2382db
     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
2382db
-    int argsz;
2382db
-    struct vfio_irq_set *irq_set;
2382db
-    int32_t *pfd;
2382db
 
2382db
     DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
2382db
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
2382db
             vdev->host.function, nr);
2382db
 
2382db
     /*
2382db
-     * XXX What's the right thing to do here?  This turns off the interrupt
2382db
-     * completely, but do we really just want to switch the interrupt to
2382db
-     * bouncing through userspace and let msix.c drop it?  Not sure.
2382db
+     * There are still old guests that mask and unmask vectors on every
2382db
+     * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
2382db
+     * the KVM setup in place, simply switch VFIO to use the non-bypass
2382db
+     * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
2382db
+     * core will mask the interrupt and set pending bits, allowing it to
2382db
+     * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
2382db
      */
2382db
-    msix_vector_unuse(pdev, nr);
2382db
-
2382db
-    argsz = sizeof(*irq_set) + sizeof(*pfd);
2382db
+    if (vector->virq >= 0) {
2382db
+        int argsz;
2382db
+        struct vfio_irq_set *irq_set;
2382db
+        int32_t *pfd;
2382db
 
2382db
-    irq_set = g_malloc0(argsz);
2382db
-    irq_set->argsz = argsz;
2382db
-    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
2382db
-                     VFIO_IRQ_SET_ACTION_TRIGGER;
2382db
-    irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
2382db
-    irq_set->start = nr;
2382db
-    irq_set->count = 1;
2382db
-    pfd = (int32_t *)&irq_set->data;
2382db
+        argsz = sizeof(*irq_set) + sizeof(*pfd);
2382db
 
2382db
-    *pfd = -1;
2382db
+        irq_set = g_malloc0(argsz);
2382db
+        irq_set->argsz = argsz;
2382db
+        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
2382db
+                         VFIO_IRQ_SET_ACTION_TRIGGER;
2382db
+        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
2382db
+        irq_set->start = nr;
2382db
+        irq_set->count = 1;
2382db
+        pfd = (int32_t *)&irq_set->data;
2382db
 
2382db
-    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
2382db
+        *pfd = event_notifier_get_fd(&vector->interrupt);
2382db
 
2382db
-    g_free(irq_set);
2382db
+        ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
2382db
 
2382db
-    if (vector->virq < 0) {
2382db
-        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
2382db
-                            NULL, NULL, NULL);
2382db
-    } else {
2382db
-        kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
2382db
-                                          vector->virq);
2382db
-        kvm_irqchip_release_virq(kvm_state, vector->virq);
2382db
-        vector->virq = -1;
2382db
+        g_free(irq_set);
2382db
     }
2382db
-
2382db
-    event_notifier_cleanup(&vector->interrupt);
2382db
-    vector->use = false;
2382db
 }
2382db
 
2382db
 static void vfio_enable_msix(VFIODevice *vdev)
2382db
@@ -857,28 +902,28 @@ retry:
2382db
         VFIOMSIVector *vector = &vdev->msi_vectors[i];
2382db
 
2382db
         vector->vdev = vdev;
2382db
+        vector->virq = -1;
2382db
         vector->use = true;
2382db
 
2382db
         if (event_notifier_init(&vector->interrupt, 0)) {
2382db
             error_report("vfio: Error: event_notifier_init failed");
2382db
         }
2382db
 
2382db
+        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
2382db
+                            vfio_msi_interrupt, NULL, vector);
2382db
+
2382db
         vector->msg = msi_get_message(&vdev->pdev, i);
2382db
 
2382db
         /*
2382db
          * Attempt to enable route through KVM irqchip,
2382db
          * default to userspace handling if unavailable.
2382db
          */
2382db
-        vector->virq = VFIO_ALLOW_KVM_MSI ?
2382db
-                       kvm_irqchip_add_msi_route(kvm_state, vector->msg) : -1;
2382db
-        if (vector->virq < 0 ||
2382db
-            kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
2382db
-                                           vector->virq) < 0) {
2382db
-            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
2382db
-                                vfio_msi_interrupt, NULL, vector);
2382db
-        }
2382db
+        vfio_add_kvm_msi_virq(vector, &vector->msg, false);
2382db
     }
2382db
 
2382db
+    /* Set interrupt type prior to possible interrupts */
2382db
+    vdev->interrupt = VFIO_INT_MSI;
2382db
+
2382db
     ret = vfio_enable_vectors(vdev, false);
2382db
     if (ret) {
2382db
         if (ret < 0) {
2382db
@@ -891,14 +936,10 @@ retry:
2382db
         for (i = 0; i < vdev->nr_vectors; i++) {
2382db
             VFIOMSIVector *vector = &vdev->msi_vectors[i];
2382db
             if (vector->virq >= 0) {
2382db
-                kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
2382db
-                                                  vector->virq);
2382db
-                kvm_irqchip_release_virq(kvm_state, vector->virq);
2382db
-                vector->virq = -1;
2382db
-            } else {
2382db
-                qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
2382db
-                                    NULL, NULL, NULL);
2382db
+                vfio_remove_kvm_msi_virq(vector);
2382db
             }
2382db
+            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
2382db
+                                NULL, NULL, NULL);
2382db
             event_notifier_cleanup(&vector->interrupt);
2382db
         }
2382db
 
2382db
@@ -910,11 +951,17 @@ retry:
2382db
         }
2382db
         vdev->nr_vectors = 0;
2382db
 
2382db
+        /*
2382db
+         * Failing to setup MSI doesn't really fall within any specification.
2382db
+         * Let's try leaving interrupts disabled and hope the guest figures
2382db
+         * out to fall back to INTx for this device.
2382db
+         */
2382db
+        error_report("vfio: Error: Failed to enable MSI");
2382db
+        vdev->interrupt = VFIO_INT_NONE;
2382db
+
2382db
         return;
2382db
     }
2382db
 
2382db
-    vdev->interrupt = VFIO_INT_MSI;
2382db
-
2382db
     DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
2382db
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
2382db
             vdev->host.function, vdev->nr_vectors);
2382db
@@ -922,6 +969,20 @@ retry:
2382db
 
2382db
 static void vfio_disable_msi_common(VFIODevice *vdev)
2382db
 {
2382db
+    int i;
2382db
+
2382db
+    for (i = 0; i < vdev->nr_vectors; i++) {
2382db
+        VFIOMSIVector *vector = &vdev->msi_vectors[i];
2382db
+        if (vdev->msi_vectors[i].use) {
2382db
+            if (vector->virq >= 0) {
2382db
+                vfio_remove_kvm_msi_virq(vector);
2382db
+            }
2382db
+            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
2382db
+                                NULL, NULL, NULL);
2382db
+            event_notifier_cleanup(&vector->interrupt);
2382db
+        }
2382db
+    }
2382db
+
2382db
     g_free(vdev->msi_vectors);
2382db
     vdev->msi_vectors = NULL;
2382db
     vdev->nr_vectors = 0;
2382db
@@ -943,6 +1004,7 @@ static void vfio_disable_msix(VFIODevice *vdev)
2382db
     for (i = 0; i < vdev->nr_vectors; i++) {
2382db
         if (vdev->msi_vectors[i].use) {
2382db
             vfio_msix_vector_release(&vdev->pdev, i);
2382db
+            msix_vector_unuse(&vdev->pdev, i);
2382db
         }
2382db
     }
2382db
 
2382db
@@ -958,30 +1020,7 @@ static void vfio_disable_msix(VFIODevice *vdev)
2382db
 
2382db
 static void vfio_disable_msi(VFIODevice *vdev)
2382db
 {
2382db
-    int i;
2382db
-
2382db
     vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);
2382db
-
2382db
-    for (i = 0; i < vdev->nr_vectors; i++) {
2382db
-        VFIOMSIVector *vector = &vdev->msi_vectors[i];
2382db
-
2382db
-        if (!vector->use) {
2382db
-            continue;
2382db
-        }
2382db
-
2382db
-        if (vector->virq >= 0) {
2382db
-            kvm_irqchip_remove_irqfd_notifier(kvm_state,
2382db
-                                              &vector->interrupt, vector->virq);
2382db
-            kvm_irqchip_release_virq(kvm_state, vector->virq);
2382db
-            vector->virq = -1;
2382db
-        } else {
2382db
-            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
2382db
-                                NULL, NULL, NULL);
2382db
-        }
2382db
-
2382db
-        event_notifier_cleanup(&vector->interrupt);
2382db
-    }
2382db
-
2382db
     vfio_disable_msi_common(vdev);
2382db
 
2382db
     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
2382db
@@ -1001,17 +1040,7 @@ static void vfio_update_msi(VFIODevice *vdev)
2382db
         }
2382db
 
2382db
         msg = msi_get_message(&vdev->pdev, i);
2382db
-
2382db
-        if (msg.address != vector->msg.address ||
2382db
-            msg.data != vector->msg.data) {
2382db
-
2382db
-            DPRINTF("%s(%04x:%02x:%02x.%x) MSI vector %d changed\n",
2382db
-                    __func__, vdev->host.domain, vdev->host.bus,
2382db
-                    vdev->host.slot, vdev->host.function, i);
2382db
-
2382db
-            kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
2382db
-            vector->msg = msg;
2382db
-        }
2382db
+        vfio_update_kvm_msi_virq(vector, msg);
2382db
     }
2382db
 }
2382db
 
2382db
-- 
958e1b
1.8.3.1
2382db