From 5347b12008842b5c86f766e391c6f3756afbff7d Mon Sep 17 00:00:00 2001
Message-Id: <5347b12008842b5c86f766e391c6f3756afbff7d@dist-git>
From: Daniel Henrique Barboza <danielhb413@gmail.com>
Date: Fri, 3 May 2019 13:54:53 +0200
Subject: [PATCH] PPC64 support for NVIDIA V100 GPU with NVLink2 passthrough

The NVIDIA V100 GPU has onboard RAM that is mapped into the host
memory and is accessible as normal RAM via an NVLink2 bridge. When
passed through in a guest, QEMU puts the NVIDIA RAM window in a
non-contiguous area, above the PCI MMIO area that starts at 32TiB.
This means that the NVIDIA RAM window starts at 64TiB and goes all
the way to 128TiB.

As a result, the guest might request a 64-bit window, for each PCI
Host Bridge, that goes all the way to 128TiB. However, the NVIDIA RAM
window isn't counted as regular RAM, thus this window is considered
only for the allocation of the Translation Control Entry (TCE) table.
For more information about how NVLink2 support works in QEMU,
refer to the accepted implementation [1].
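
To make the layout concrete, the guest physical address space ends up
looking roughly like this (an illustrative sketch derived from the
description above and from [1], not an exact map):

    0 .. maxMemory          guest RAM (including hotpluggable memory)
    32 TiB .. 64 TiB        PCI MMIO window
    64 TiB .. 128 TiB       NVIDIA V100 RAM exposed through NVLink2

The last range is counted only towards the TCE table size, not as
regular guest RAM.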

This memory layout differs from the existing VFIO case, requiring its
own formula. This patch changes the PPC64 code of
@qemuDomainGetMemLockLimitBytes to:

- detect if we have an NVLink2 bridge being passed through to the
guest. This is done by using the @ppc64VFIODeviceIsNV2Bridge function
added in the previous patch. The existence of the NVLink2 bridge in
the guest means that we are dealing with the NVLink2 memory layout;

- if an IBM NVLink2 bridge exists, passthroughLimit is calculated in a
different way to account for the extra memory the TCE table can
allocate (a worked example follows below). The 64TiB..128TiB window
is more than enough to fit all possible GPUs, thus the memLimit is
the same regardless of passing through 1 or
multiple V100 GPUs.
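
As a worked example of the new formula (the figures below are chosen
for illustration in this backport note only, they are not part of the
upstream commit): for a guest with maxMemory = 32 GiB and a single
PCI Host Bridge, the NVLink2 path computes

    passthroughLimit = maxMemory + (128 TiB / 512) * #PHBs + 8 MiB
                     = 32 GiB + 256 GiB + 8 MiB
                     ~= 288 GiB

whereas the pre-existing VFIO formula would reserve only about
memory + memory/512 + 8 MiB ~= 32.07 GiB for the same guest, which
does not account for the 64TiB..128TiB NVIDIA RAM window that the
TCE tables have to map.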

Further reading explaining the background:
[1] https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg03700.html
[2] https://www.redhat.com/archives/libvir-list/2019-March/msg00660.html
[3] https://www.redhat.com/archives/libvir-list/2019-April/msg00527.html

Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Reviewed-by: Erik Skultety <eskultet@redhat.com>
(cherry picked from commit 1a922648f67f56c4374d647feebf2adb9a642f96)

https://bugzilla.redhat.com/show_bug.cgi?id=1505998

Conflicts:
    The upstream commit relied on:
        - v4.7.0-37-gb72183223f
        - v4.7.0-38-ga14f597266
    which were not backported, so virPCIDeviceAddressAsString had to be
    swapped for the former virDomainPCIAddressAsString in order to
    compile.

Signed-off-by: Erik Skultety <eskultet@redhat.com>
Message-Id: <03c00ebf46d85b0615134ef8655e67a4c909b7da.1556884443.git.eskultet@redhat.com>
Reviewed-by: Andrea Bolognani <abologna@redhat.com>
---
 src/qemu/qemu_domain.c | 80 ++++++++++++++++++++++++++++++++----------
 1 file changed, 61 insertions(+), 19 deletions(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index a8bc618389..21f0722495 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -9813,7 +9813,7 @@ qemuDomainUpdateCurrentMemorySize(virQEMUDriverPtr driver,
  * such as '0004:04:00.0', and tells if the device is a NVLink2
  * bridge.
  */
-static ATTRIBUTE_UNUSED bool
+static bool
 ppc64VFIODeviceIsNV2Bridge(const char *device)
 {
     const char *nvlink2Files[] = {"ibm,gpu", "ibm,nvlink",
@@ -9851,7 +9851,9 @@ getPPC64MemLockLimitBytes(virDomainDefPtr def)
     unsigned long long maxMemory = 0;
     unsigned long long passthroughLimit = 0;
     size_t i, nPCIHostBridges = 0;
+    virPCIDeviceAddressPtr pciAddr;
     bool usesVFIO = false;
+    bool nvlink2Capable = false;
 
     for (i = 0; i < def->ncontrollers; i++) {
         virDomainControllerDefPtr cont = def->controllers[i];
@@ -9869,7 +9871,17 @@ getPPC64MemLockLimitBytes(virDomainDefPtr def)
             dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
             dev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
             usesVFIO = true;
-            break;
+
+            pciAddr = &dev->source.subsys.u.pci.addr;
+            if (virPCIDeviceAddressIsValid(pciAddr, false)) {
+                VIR_AUTOFREE(char *) pciAddrStr = NULL;
+
+                pciAddrStr = virDomainPCIAddressAsString(pciAddr);
+                if (ppc64VFIODeviceIsNV2Bridge(pciAddrStr)) {
+                    nvlink2Capable = true;
+                    break;
+                }
+            }
         }
     }
 
@@ -9896,29 +9908,59 @@ getPPC64MemLockLimitBytes(virDomainDefPtr def)
                 4096 * nPCIHostBridges +
                 8192;
 
-    /* passthroughLimit := max( 2 GiB * #PHBs,                       (c)
-     *                          memory                               (d)
-     *                          + memory * 1/512 * #PHBs + 8 MiB )   (e)
+    /* NVLink2 support in QEMU is a special case of the passthrough
+     * mechanics explained in the usesVFIO case below. The GPU RAM
+     * is placed with a gap after maxMemory. The current QEMU
+     * implementation puts the NVIDIA RAM above the PCI MMIO, which
+     * starts at 32TiB and is the MMIO reserved for the guest main RAM.
      *
-     * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2 GiB
-     * rather than 1 GiB
+     * This window ends at 64TiB, and this is where the GPUs are being
+     * placed. The next available window size is at 128TiB, and
+     * 64TiB..128TiB will fit all possible NVIDIA GPUs.
      *
-     * (d) is the with-DDW (and memory pre-registration and related
-     * features) DMA window accounting - assuming that we only account RAM
-     * once, even if mapped to multiple PHBs
+     * The same assumption as the most common case applies here:
+     * the guest will request a 64-bit DMA window, per PHB, that is
+     * big enough to map all its RAM, which is now at 128TiB due
+     * to the GPUs.
      *
-     * (e) is the with-DDW userspace view and overhead for the 64-bit DMA
-     * window. This is based a bit on expected guest behaviour, but there
-     * really isn't a way to completely avoid that. We assume the guest
-     * requests a 64-bit DMA window (per PHB) just big enough to map all
-     * its RAM. 4 kiB page size gives the 1/512; it will be less with 64
-     * kiB pages, less still if the guest is mapped with hugepages (unlike
-     * the default 32-bit DMA window, DDW windows can use large IOMMU
-     * pages). 8 MiB is for second and further level overheads, like (b) */
-    if (usesVFIO)
+     * Note that the NVIDIA RAM window must be accounted for the TCE
+     * table size, but *not* for the main RAM (maxMemory). This gives
+     * us the following passthroughLimit for the NVLink2 case:
+     *
+     * passthroughLimit = maxMemory +
+     *                    128TiB/512KiB * #PHBs + 8 MiB */
+    if (nvlink2Capable) {
+        passthroughLimit = maxMemory +
+                           128 * (1ULL<<30) / 512 * nPCIHostBridges +
+                           8192;
+    } else if (usesVFIO) {
+        /* For regular (non-NVLink2 present) VFIO passthrough, the value
+         * of passthroughLimit is:
+         *
+         * passthroughLimit := max( 2 GiB * #PHBs,                       (c)
+         *                          memory                               (d)
+         *                          + memory * 1/512 * #PHBs + 8 MiB )   (e)
+         *
+         * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2
+         * GiB rather than 1 GiB
+         *
+         * (d) is the with-DDW (and memory pre-registration and related
+         * features) DMA window accounting - assuming that we only account
+         * RAM once, even if mapped to multiple PHBs
+         *
+         * (e) is the with-DDW userspace view and overhead for the 64-bit
+         * DMA window. This is based a bit on expected guest behaviour, but
+         * there really isn't a way to completely avoid that. We assume the
+         * guest requests a 64-bit DMA window (per PHB) just big enough to
+         * map all its RAM. 4 kiB page size gives the 1/512; it will be
+         * less with 64 kiB pages, less still if the guest is mapped with
+         * hugepages (unlike the default 32-bit DMA window, DDW windows
+         * can use large IOMMU pages). 8 MiB is for second and further level
+         * overheads, like (b) */
         passthroughLimit = MAX(2 * 1024 * 1024 * nPCIHostBridges,
                                memory +
                                memory / 512 * nPCIHostBridges + 8192);
+    }
 
     memKB = baseLimit + passthroughLimit;
 
-- 
2.21.0