|
|
9ae3a8 |
From 1cdac7e143db48d28605da1feb85a229197d9692 Mon Sep 17 00:00:00 2001
|
|
|
9ae3a8 |
From: Alex Williamson <alex.williamson@redhat.com>
|
|
|
9ae3a8 |
Date: Mon, 18 Mar 2019 19:41:40 +0100
|
|
|
9ae3a8 |
Subject: [PATCH 2/3] vfio/pci: Lazy PBA emulation
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
RH-Author: Alex Williamson <alex.williamson@redhat.com>
|
|
|
9ae3a8 |
Message-id: <155293738623.17152.7817154338901863813.stgit@gimli.home>
|
|
|
9ae3a8 |
Patchwork-id: 84905
|
|
|
9ae3a8 |
O-Subject: [RHEL7.7 qemu-kvm PATCH] vfio/pci: Lazy PBA emulation
|
|
|
9ae3a8 |
Bugzilla: 1459077
|
|
|
9ae3a8 |
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
|
|
|
9ae3a8 |
RH-Acked-by: Laurent Vivier <lvivier@redhat.com>
|
|
|
9ae3a8 |
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
Bugzilla: 1459077
|
|
|
9ae3a8 |
Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=20652144
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
Notes: Intel's Omnipath card seems to depend on direct access to the
|
|
|
9ae3a8 |
MSI-X PBA, when it's emulated the driver in the guest will
|
|
|
9ae3a8 |
report:
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
hfi1_0: Interrupt registers not properly mapped by VMM
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
The patch here disables the PBA memory region except when
|
|
|
9ae3a8 |
vectors are masked, which is essentially never, and therefore
|
|
|
9ae3a8 |
provides that direct access. Tested against an "Omni-Path
|
|
|
9ae3a8 |
HFI Silicon 100 Series" card as well as regression tested
|
|
|
9ae3a8 |
against a Intel 82576 PF and VF, both of which also use MSI-X.
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
The PCI spec recommends devices use additional alignment for MSI-X
|
|
|
9ae3a8 |
data structures to allow software to map them to separate processor
|
|
|
9ae3a8 |
pages. One advantage of doing this is that we can emulate those data
|
|
|
9ae3a8 |
structures without a significant performance impact to the operation
|
|
|
9ae3a8 |
of the device. Some devices fail to implement that suggestion and
|
|
|
9ae3a8 |
assigned device performance suffers.
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
One such case of this is a Mellanox MT27500 series, ConnectX-3 VF,
|
|
|
9ae3a8 |
where the MSI-X vector table and PBA are aligned on separate 4K
|
|
|
9ae3a8 |
pages. If PBA emulation is enabled, performance suffers. It's not
|
|
|
9ae3a8 |
clear how much value we get from PBA emulation, but the solution here
|
|
|
9ae3a8 |
is to only lazily enable the emulated PBA when a masked MSI-X vector
|
|
|
9ae3a8 |
fires. We then attempt to more aggresively disable the PBA memory
|
|
|
9ae3a8 |
region any time a vector is unmasked. The expectation is then that
|
|
|
9ae3a8 |
a typical VM will run entirely with PBA emulation disabled, and only
|
|
|
9ae3a8 |
when used is that emulation re-enabled.
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
Reported-by: Shyam Kaushik <shyam.kaushik@gmail.com>
|
|
|
9ae3a8 |
Tested-by: Shyam Kaushik <shyam.kaushik@gmail.com>
|
|
|
9ae3a8 |
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
|
|
|
9ae3a8 |
(cherry picked from 95239e162518dc6577164be3d9a789aba7f591a3)
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
|
|
|
9ae3a8 |
---
|
|
|
9ae3a8 |
hw/misc/vfio.c | 40 ++++++++++++++++++++++++++++++++++++++++
|
|
|
9ae3a8 |
trace-events | 2 ++
|
|
|
9ae3a8 |
2 files changed, 42 insertions(+)
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
|
|
|
9ae3a8 |
index f7360bf..781acd9 100644
|
|
|
9ae3a8 |
--- a/hw/misc/vfio.c
|
|
|
9ae3a8 |
+++ b/hw/misc/vfio.c
|
|
|
9ae3a8 |
@@ -190,6 +190,7 @@ typedef struct VFIOMSIXInfo {
|
|
|
9ae3a8 |
uint32_t pba_offset;
|
|
|
9ae3a8 |
MemoryRegion mmap_mem;
|
|
|
9ae3a8 |
void *mmap;
|
|
|
9ae3a8 |
+ unsigned long *pending;
|
|
|
9ae3a8 |
} VFIOMSIXInfo;
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
typedef struct VFIODeviceOps VFIODeviceOps;
|
|
|
9ae3a8 |
@@ -691,6 +692,13 @@ static void vfio_msi_interrupt(void *opaque)
|
|
|
9ae3a8 |
#endif
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
if (vdev->interrupt == VFIO_INT_MSIX) {
|
|
|
9ae3a8 |
+ /* A masked vector firing needs to use the PBA, enable it */
|
|
|
9ae3a8 |
+ if (msix_is_masked(&vdev->pdev, nr)) {
|
|
|
9ae3a8 |
+ set_bit(nr, vdev->msix->pending);
|
|
|
9ae3a8 |
+ memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
|
|
|
9ae3a8 |
+ trace_vfio_msix_pba_enable(vdev->vbasedev.name);
|
|
|
9ae3a8 |
+ }
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
msix_notify(&vdev->pdev, nr);
|
|
|
9ae3a8 |
} else if (vdev->interrupt == VFIO_INT_MSI) {
|
|
|
9ae3a8 |
msi_notify(&vdev->pdev, nr);
|
|
|
9ae3a8 |
@@ -866,6 +874,14 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
+ /* Disable PBA emulation when nothing more is pending. */
|
|
|
9ae3a8 |
+ clear_bit(nr, vdev->msix->pending);
|
|
|
9ae3a8 |
+ if (find_first_bit(vdev->msix->pending,
|
|
|
9ae3a8 |
+ vdev->nr_vectors) == vdev->nr_vectors) {
|
|
|
9ae3a8 |
+ memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
|
|
|
9ae3a8 |
+ trace_vfio_msix_pba_disable(vdev->vbasedev.name);
|
|
|
9ae3a8 |
+ }
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
return 0;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
@@ -1070,6 +1086,9 @@ static void vfio_disable_msix(VFIOPCIDevice *vdev)
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
vfio_disable_msi_common(vdev);
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
+ memset(vdev->msix->pending, 0,
|
|
|
9ae3a8 |
+ BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
DPRINTF("%s(%s)\n", __func__, vdev->vbasedev.name);
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
@@ -2561,6 +2580,8 @@ static int vfio_setup_msix(VFIOPCIDevice *vdev, int pos)
|
|
|
9ae3a8 |
{
|
|
|
9ae3a8 |
int ret;
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
+ vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
|
|
|
9ae3a8 |
+ sizeof(unsigned long));
|
|
|
9ae3a8 |
ret = msix_init(&vdev->pdev, vdev->msix->entries,
|
|
|
9ae3a8 |
vdev->bars[vdev->msix->table_bar].region.mem,
|
|
|
9ae3a8 |
vdev->msix->table_bar, vdev->msix->table_offset,
|
|
|
9ae3a8 |
@@ -2574,6 +2595,24 @@ static int vfio_setup_msix(VFIOPCIDevice *vdev, int pos)
|
|
|
9ae3a8 |
return ret;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
+ /*
|
|
|
9ae3a8 |
+ * The PCI spec suggests that devices provide additional alignment for
|
|
|
9ae3a8 |
+ * MSI-X structures and avoid overlapping non-MSI-X related registers.
|
|
|
9ae3a8 |
+ * For an assigned device, this hopefully means that emulation of MSI-X
|
|
|
9ae3a8 |
+ * structures does not affect the performance of the device. If devices
|
|
|
9ae3a8 |
+ * fail to provide that alignment, a significant performance penalty may
|
|
|
9ae3a8 |
+ * result, for instance Mellanox MT27500 VFs:
|
|
|
9ae3a8 |
+ * http://www.spinics.net/lists/kvm/msg125881.html
|
|
|
9ae3a8 |
+ *
|
|
|
9ae3a8 |
+ * The PBA is simply not that important for such a serious regression and
|
|
|
9ae3a8 |
+ * most drivers do not appear to look at it. The solution for this is to
|
|
|
9ae3a8 |
+ * disable the PBA MemoryRegion unless it's being used. We disable it
|
|
|
9ae3a8 |
+ * here and only enable it if a masked vector fires through QEMU. As the
|
|
|
9ae3a8 |
+ * vector-use notifier is called, which occurs on unmask, we test whether
|
|
|
9ae3a8 |
+ * PBA emulation is needed and again disable if not.
|
|
|
9ae3a8 |
+ */
|
|
|
9ae3a8 |
+ memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
return 0;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
@@ -2585,6 +2624,7 @@ static void vfio_teardown_msi(VFIOPCIDevice *vdev)
|
|
|
9ae3a8 |
msix_uninit(&vdev->pdev,
|
|
|
9ae3a8 |
vdev->bars[vdev->msix->table_bar].region.mem,
|
|
|
9ae3a8 |
vdev->bars[vdev->msix->pba_bar].region.mem);
|
|
|
9ae3a8 |
+ g_free(vdev->msix->pending);
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
diff --git a/trace-events b/trace-events
|
|
|
9ae3a8 |
index 7b7aad1..0fb2745 100644
|
|
|
9ae3a8 |
--- a/trace-events
|
|
|
9ae3a8 |
+++ b/trace-events
|
|
|
9ae3a8 |
@@ -1166,3 +1166,5 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d"
|
|
|
9ae3a8 |
vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
|
|
|
9ae3a8 |
vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
|
|
|
9ae3a8 |
vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
|
|
|
9ae3a8 |
+vfio_msix_pba_disable(const char *name) " (%s)"
|
|
|
9ae3a8 |
+vfio_msix_pba_enable(const char *name) " (%s)"
|
|
|
9ae3a8 |
--
|
|
|
9ae3a8 |
1.8.3.1
|
|
|
9ae3a8 |
|