d740ea
From 1cdac7e143db48d28605da1feb85a229197d9692 Mon Sep 17 00:00:00 2001
d740ea
From: Alex Williamson <alex.williamson@redhat.com>
d740ea
Date: Mon, 18 Mar 2019 19:41:40 +0100
d740ea
Subject: [PATCH 2/3] vfio/pci: Lazy PBA emulation
d740ea
d740ea
RH-Author: Alex Williamson <alex.williamson@redhat.com>
d740ea
Message-id: <155293738623.17152.7817154338901863813.stgit@gimli.home>
d740ea
Patchwork-id: 84905
d740ea
O-Subject: [RHEL7.7 qemu-kvm PATCH] vfio/pci: Lazy PBA emulation
d740ea
Bugzilla: 1459077
d740ea
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
d740ea
RH-Acked-by: Laurent Vivier <lvivier@redhat.com>
d740ea
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
d740ea
d740ea
Bugzilla: 1459077
d740ea
Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=20652144
d740ea
d740ea
Notes: Intel's Omnipath card seems to depend on direct access to the
d740ea
       MSI-X PBA, when it's emulated the driver in the guest will
d740ea
       report:
d740ea
d740ea
         hfi1_0: Interrupt registers not properly mapped by VMM
d740ea
d740ea
       The patch here disables the PBA memory region except when
d740ea
       vectors are masked, which is essentially never, and therefore
d740ea
       provides that direct access.  Tested against an "Omni-Path
d740ea
       HFI Silicon 100 Series" card as well as regression tested
d740ea
       against a Intel 82576 PF and VF, both of which also use MSI-X.
d740ea
d740ea
The PCI spec recommends devices use additional alignment for MSI-X
d740ea
data structures to allow software to map them to separate processor
d740ea
pages.  One advantage of doing this is that we can emulate those data
d740ea
structures without a significant performance impact to the operation
d740ea
of the device.  Some devices fail to implement that suggestion and
d740ea
assigned device performance suffers.
d740ea
d740ea
One such case of this is a Mellanox MT27500 series, ConnectX-3 VF,
d740ea
where the MSI-X vector table and PBA are aligned on separate 4K
d740ea
pages.  If PBA emulation is enabled, performance suffers.  It's not
d740ea
clear how much value we get from PBA emulation, but the solution here
d740ea
is to only lazily enable the emulated PBA when a masked MSI-X vector
d740ea
fires.  We then attempt to more aggresively disable the PBA memory
d740ea
region any time a vector is unmasked.  The expectation is then that
d740ea
a typical VM will run entirely with PBA emulation disabled, and only
d740ea
when used is that emulation re-enabled.
d740ea
d740ea
Reported-by: Shyam Kaushik <shyam.kaushik@gmail.com>
d740ea
Tested-by: Shyam Kaushik <shyam.kaushik@gmail.com>
d740ea
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
d740ea
(cherry picked from 95239e162518dc6577164be3d9a789aba7f591a3)
d740ea
d740ea
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
d740ea
---
d740ea
 hw/misc/vfio.c | 40 ++++++++++++++++++++++++++++++++++++++++
d740ea
 trace-events   |  2 ++
d740ea
 2 files changed, 42 insertions(+)
d740ea
d740ea
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
d740ea
index f7360bf..781acd9 100644
d740ea
--- a/hw/misc/vfio.c
d740ea
+++ b/hw/misc/vfio.c
d740ea
@@ -190,6 +190,7 @@ typedef struct VFIOMSIXInfo {
d740ea
     uint32_t pba_offset;
d740ea
     MemoryRegion mmap_mem;
d740ea
     void *mmap;
d740ea
+    unsigned long *pending;
d740ea
 } VFIOMSIXInfo;
d740ea
 
d740ea
 typedef struct VFIODeviceOps VFIODeviceOps;
d740ea
@@ -691,6 +692,13 @@ static void vfio_msi_interrupt(void *opaque)
d740ea
 #endif
d740ea
 
d740ea
     if (vdev->interrupt == VFIO_INT_MSIX) {
d740ea
+        /* A masked vector firing needs to use the PBA, enable it */
d740ea
+        if (msix_is_masked(&vdev->pdev, nr)) {
d740ea
+            set_bit(nr, vdev->msix->pending);
d740ea
+            memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
d740ea
+            trace_vfio_msix_pba_enable(vdev->vbasedev.name);
d740ea
+        }
d740ea
+
d740ea
         msix_notify(&vdev->pdev, nr);
d740ea
     } else if (vdev->interrupt == VFIO_INT_MSI) {
d740ea
         msi_notify(&vdev->pdev, nr);
d740ea
@@ -866,6 +874,14 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
d740ea
         }
d740ea
     }
d740ea
 
d740ea
+    /* Disable PBA emulation when nothing more is pending. */
d740ea
+    clear_bit(nr, vdev->msix->pending);
d740ea
+    if (find_first_bit(vdev->msix->pending,
d740ea
+                       vdev->nr_vectors) == vdev->nr_vectors) {
d740ea
+        memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
d740ea
+        trace_vfio_msix_pba_disable(vdev->vbasedev.name);
d740ea
+    }
d740ea
+
d740ea
     return 0;
d740ea
 }
d740ea
 
d740ea
@@ -1070,6 +1086,9 @@ static void vfio_disable_msix(VFIOPCIDevice *vdev)
d740ea
 
d740ea
     vfio_disable_msi_common(vdev);
d740ea
 
d740ea
+    memset(vdev->msix->pending, 0,
d740ea
+           BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
d740ea
+
d740ea
     DPRINTF("%s(%s)\n", __func__, vdev->vbasedev.name);
d740ea
 }
d740ea
 
d740ea
@@ -2561,6 +2580,8 @@ static int vfio_setup_msix(VFIOPCIDevice *vdev, int pos)
d740ea
 {
d740ea
     int ret;
d740ea
 
d740ea
+    vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
d740ea
+                                    sizeof(unsigned long));
d740ea
     ret = msix_init(&vdev->pdev, vdev->msix->entries,
d740ea
                     vdev->bars[vdev->msix->table_bar].region.mem,
d740ea
                     vdev->msix->table_bar, vdev->msix->table_offset,
d740ea
@@ -2574,6 +2595,24 @@ static int vfio_setup_msix(VFIOPCIDevice *vdev, int pos)
d740ea
         return ret;
d740ea
     }
d740ea
 
d740ea
+    /*
d740ea
+     * The PCI spec suggests that devices provide additional alignment for
d740ea
+     * MSI-X structures and avoid overlapping non-MSI-X related registers.
d740ea
+     * For an assigned device, this hopefully means that emulation of MSI-X
d740ea
+     * structures does not affect the performance of the device.  If devices
d740ea
+     * fail to provide that alignment, a significant performance penalty may
d740ea
+     * result, for instance Mellanox MT27500 VFs:
d740ea
+     * http://www.spinics.net/lists/kvm/msg125881.html
d740ea
+     *
d740ea
+     * The PBA is simply not that important for such a serious regression and
d740ea
+     * most drivers do not appear to look at it.  The solution for this is to
d740ea
+     * disable the PBA MemoryRegion unless it's being used.  We disable it
d740ea
+     * here and only enable it if a masked vector fires through QEMU.  As the
d740ea
+     * vector-use notifier is called, which occurs on unmask, we test whether
d740ea
+     * PBA emulation is needed and again disable if not.
d740ea
+     */
d740ea
+    memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
d740ea
+
d740ea
     return 0;
d740ea
 }
d740ea
 
d740ea
@@ -2585,6 +2624,7 @@ static void vfio_teardown_msi(VFIOPCIDevice *vdev)
d740ea
         msix_uninit(&vdev->pdev,
d740ea
                     vdev->bars[vdev->msix->table_bar].region.mem,
d740ea
                     vdev->bars[vdev->msix->pba_bar].region.mem);
d740ea
+	g_free(vdev->msix->pending);
d740ea
     }
d740ea
 }
d740ea
 
d740ea
diff --git a/trace-events b/trace-events
d740ea
index 7b7aad1..0fb2745 100644
d740ea
--- a/trace-events
d740ea
+++ b/trace-events
d740ea
@@ -1166,3 +1166,5 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d"
d740ea
 vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
d740ea
 vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
d740ea
 vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
d740ea
+vfio_msix_pba_disable(const char *name) " (%s)"
d740ea
+vfio_msix_pba_enable(const char *name) " (%s)"
d740ea
-- 
d740ea
1.8.3.1
d740ea