9ae3a8
From 1cdac7e143db48d28605da1feb85a229197d9692 Mon Sep 17 00:00:00 2001
9ae3a8
From: Alex Williamson <alex.williamson@redhat.com>
9ae3a8
Date: Mon, 18 Mar 2019 19:41:40 +0100
9ae3a8
Subject: [PATCH 2/3] vfio/pci: Lazy PBA emulation
9ae3a8
9ae3a8
RH-Author: Alex Williamson <alex.williamson@redhat.com>
9ae3a8
Message-id: <155293738623.17152.7817154338901863813.stgit@gimli.home>
9ae3a8
Patchwork-id: 84905
9ae3a8
O-Subject: [RHEL7.7 qemu-kvm PATCH] vfio/pci: Lazy PBA emulation
9ae3a8
Bugzilla: 1459077
9ae3a8
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
9ae3a8
RH-Acked-by: Laurent Vivier <lvivier@redhat.com>
9ae3a8
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
9ae3a8
9ae3a8
Bugzilla: 1459077
9ae3a8
Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=20652144
9ae3a8
9ae3a8
Notes: Intel's Omnipath card seems to depend on direct access to the
9ae3a8
       MSI-X PBA, when it's emulated the driver in the guest will
9ae3a8
       report:
9ae3a8
9ae3a8
         hfi1_0: Interrupt registers not properly mapped by VMM
9ae3a8
9ae3a8
       The patch here disables the PBA memory region except when
9ae3a8
       vectors are masked, which is essentially never, and therefore
9ae3a8
       provides that direct access.  Tested against an "Omni-Path
9ae3a8
       HFI Silicon 100 Series" card as well as regression tested
9ae3a8
       against a Intel 82576 PF and VF, both of which also use MSI-X.
9ae3a8
9ae3a8
The PCI spec recommends devices use additional alignment for MSI-X
9ae3a8
data structures to allow software to map them to separate processor
9ae3a8
pages.  One advantage of doing this is that we can emulate those data
9ae3a8
structures without a significant performance impact to the operation
9ae3a8
of the device.  Some devices fail to implement that suggestion and
9ae3a8
assigned device performance suffers.
9ae3a8
9ae3a8
One such case of this is a Mellanox MT27500 series, ConnectX-3 VF,
9ae3a8
where the MSI-X vector table and PBA are aligned on separate 4K
9ae3a8
pages.  If PBA emulation is enabled, performance suffers.  It's not
9ae3a8
clear how much value we get from PBA emulation, but the solution here
9ae3a8
is to only lazily enable the emulated PBA when a masked MSI-X vector
9ae3a8
fires.  We then attempt to more aggresively disable the PBA memory
9ae3a8
region any time a vector is unmasked.  The expectation is then that
9ae3a8
a typical VM will run entirely with PBA emulation disabled, and only
9ae3a8
when used is that emulation re-enabled.
9ae3a8
9ae3a8
Reported-by: Shyam Kaushik <shyam.kaushik@gmail.com>
9ae3a8
Tested-by: Shyam Kaushik <shyam.kaushik@gmail.com>
9ae3a8
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
9ae3a8
(cherry picked from 95239e162518dc6577164be3d9a789aba7f591a3)
9ae3a8
9ae3a8
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
9ae3a8
---
9ae3a8
 hw/misc/vfio.c | 40 ++++++++++++++++++++++++++++++++++++++++
9ae3a8
 trace-events   |  2 ++
9ae3a8
 2 files changed, 42 insertions(+)
9ae3a8
9ae3a8
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
9ae3a8
index f7360bf..781acd9 100644
9ae3a8
--- a/hw/misc/vfio.c
9ae3a8
+++ b/hw/misc/vfio.c
9ae3a8
@@ -190,6 +190,7 @@ typedef struct VFIOMSIXInfo {
9ae3a8
     uint32_t pba_offset;
9ae3a8
     MemoryRegion mmap_mem;
9ae3a8
     void *mmap;
9ae3a8
+    unsigned long *pending;
9ae3a8
 } VFIOMSIXInfo;
9ae3a8
 
9ae3a8
 typedef struct VFIODeviceOps VFIODeviceOps;
9ae3a8
@@ -691,6 +692,13 @@ static void vfio_msi_interrupt(void *opaque)
9ae3a8
 #endif
9ae3a8
 
9ae3a8
     if (vdev->interrupt == VFIO_INT_MSIX) {
9ae3a8
+        /* A masked vector firing needs to use the PBA, enable it */
9ae3a8
+        if (msix_is_masked(&vdev->pdev, nr)) {
9ae3a8
+            set_bit(nr, vdev->msix->pending);
9ae3a8
+            memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
9ae3a8
+            trace_vfio_msix_pba_enable(vdev->vbasedev.name);
9ae3a8
+        }
9ae3a8
+
9ae3a8
         msix_notify(&vdev->pdev, nr);
9ae3a8
     } else if (vdev->interrupt == VFIO_INT_MSI) {
9ae3a8
         msi_notify(&vdev->pdev, nr);
9ae3a8
@@ -866,6 +874,14 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
9ae3a8
         }
9ae3a8
     }
9ae3a8
 
9ae3a8
+    /* Disable PBA emulation when nothing more is pending. */
9ae3a8
+    clear_bit(nr, vdev->msix->pending);
9ae3a8
+    if (find_first_bit(vdev->msix->pending,
9ae3a8
+                       vdev->nr_vectors) == vdev->nr_vectors) {
9ae3a8
+        memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
9ae3a8
+        trace_vfio_msix_pba_disable(vdev->vbasedev.name);
9ae3a8
+    }
9ae3a8
+
9ae3a8
     return 0;
9ae3a8
 }
9ae3a8
 
9ae3a8
@@ -1070,6 +1086,9 @@ static void vfio_disable_msix(VFIOPCIDevice *vdev)
9ae3a8
 
9ae3a8
     vfio_disable_msi_common(vdev);
9ae3a8
 
9ae3a8
+    memset(vdev->msix->pending, 0,
9ae3a8
+           BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
9ae3a8
+
9ae3a8
     DPRINTF("%s(%s)\n", __func__, vdev->vbasedev.name);
9ae3a8
 }
9ae3a8
 
9ae3a8
@@ -2561,6 +2580,8 @@ static int vfio_setup_msix(VFIOPCIDevice *vdev, int pos)
9ae3a8
 {
9ae3a8
     int ret;
9ae3a8
 
9ae3a8
+    vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
9ae3a8
+                                    sizeof(unsigned long));
9ae3a8
     ret = msix_init(&vdev->pdev, vdev->msix->entries,
9ae3a8
                     vdev->bars[vdev->msix->table_bar].region.mem,
9ae3a8
                     vdev->msix->table_bar, vdev->msix->table_offset,
9ae3a8
@@ -2574,6 +2595,24 @@ static int vfio_setup_msix(VFIOPCIDevice *vdev, int pos)
9ae3a8
         return ret;
9ae3a8
     }
9ae3a8
 
9ae3a8
+    /*
9ae3a8
+     * The PCI spec suggests that devices provide additional alignment for
9ae3a8
+     * MSI-X structures and avoid overlapping non-MSI-X related registers.
9ae3a8
+     * For an assigned device, this hopefully means that emulation of MSI-X
9ae3a8
+     * structures does not affect the performance of the device.  If devices
9ae3a8
+     * fail to provide that alignment, a significant performance penalty may
9ae3a8
+     * result, for instance Mellanox MT27500 VFs:
9ae3a8
+     * http://www.spinics.net/lists/kvm/msg125881.html
9ae3a8
+     *
9ae3a8
+     * The PBA is simply not that important for such a serious regression and
9ae3a8
+     * most drivers do not appear to look at it.  The solution for this is to
9ae3a8
+     * disable the PBA MemoryRegion unless it's being used.  We disable it
9ae3a8
+     * here and only enable it if a masked vector fires through QEMU.  As the
9ae3a8
+     * vector-use notifier is called, which occurs on unmask, we test whether
9ae3a8
+     * PBA emulation is needed and again disable if not.
9ae3a8
+     */
9ae3a8
+    memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
9ae3a8
+
9ae3a8
     return 0;
9ae3a8
 }
9ae3a8
 
9ae3a8
@@ -2585,6 +2624,7 @@ static void vfio_teardown_msi(VFIOPCIDevice *vdev)
9ae3a8
         msix_uninit(&vdev->pdev,
9ae3a8
                     vdev->bars[vdev->msix->table_bar].region.mem,
9ae3a8
                     vdev->bars[vdev->msix->pba_bar].region.mem);
9ae3a8
+	g_free(vdev->msix->pending);
9ae3a8
     }
9ae3a8
 }
9ae3a8
 
9ae3a8
diff --git a/trace-events b/trace-events
9ae3a8
index 7b7aad1..0fb2745 100644
9ae3a8
--- a/trace-events
9ae3a8
+++ b/trace-events
9ae3a8
@@ -1166,3 +1166,5 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d"
9ae3a8
 vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
9ae3a8
 vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
9ae3a8
 vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
9ae3a8
+vfio_msix_pba_disable(const char *name) " (%s)"
9ae3a8
+vfio_msix_pba_enable(const char *name) " (%s)"
9ae3a8
-- 
9ae3a8
1.8.3.1
9ae3a8