c480ed
From 5347b12008842b5c86f766e391c6f3756afbff7d Mon Sep 17 00:00:00 2001
c480ed
Message-Id: <5347b12008842b5c86f766e391c6f3756afbff7d@dist-git>
c480ed
From: Daniel Henrique Barboza <danielhb413@gmail.com>
c480ed
Date: Fri, 3 May 2019 13:54:53 +0200
c480ed
Subject: [PATCH] PPC64 support for NVIDIA V100 GPU with NVLink2 passthrough
c480ed
c480ed
The NVIDIA V100 GPU has an onboard RAM that is mapped into the
c480ed
host memory and accessible as normal RAM via an NVLink2 bridge. When
c480ed
passed through in a guest, QEMU puts the NVIDIA RAM window in a
c480ed
non-contiguous area, above the PCI MMIO area that starts at 32TiB.
c480ed
This means that the NVIDIA RAM window starts at 64TiB and goes all the
c480ed
way to 128TiB.
c480ed
c480ed
This means that the guest might request a 64-bit window, for each PCI
c480ed
Host Bridge, that goes all the way to 128TiB. However, the NVIDIA RAM
c480ed
window isn't counted as regular RAM, thus this window is considered
c480ed
only for the allocation of the Translation Control Entry (TCE) table.
c480ed
For more information about how NVLink2 support works in QEMU,
c480ed
refer to the accepted implementation [1].
c480ed
c480ed
This memory layout differs from the existing VFIO case, requiring its
c480ed
own formula. This patch changes the PPC64 code of
c480ed
@qemuDomainGetMemLockLimitBytes to:
c480ed
c480ed
- detect if we have a NVLink2 bridge being passed through to the
c480ed
guest. This is done by using the @ppc64VFIODeviceIsNV2Bridge function
c480ed
added in the previous patch. The existence of the NVLink2 bridge in
c480ed
the guest means that we are dealing with the NVLink2 memory layout;
c480ed
c480ed
- if an IBM NVLink2 bridge exists, passthroughLimit is calculated in a
c480ed
different way to account for the extra memory the TCE table can allocate.
c480ed
The 64TiB..128TiB window is more than enough to fit all possible
c480ed
GPUs, thus the memLimit is the same regardless of passing through 1 or
c480ed
multiple V100 GPUs.
c480ed
c480ed
Further reading explaining the background
c480ed
[1] https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg03700.html
c480ed
[2] https://www.redhat.com/archives/libvir-list/2019-March/msg00660.html
c480ed
[3] https://www.redhat.com/archives/libvir-list/2019-April/msg00527.html
c480ed
c480ed
Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
c480ed
Reviewed-by: Erik Skultety <eskultet@redhat.com>
c480ed
(cherry picked from commit 1a922648f67f56c4374d647feebf2adb9a642f96)
c480ed
c480ed
https://bugzilla.redhat.com/show_bug.cgi?id=1505998
c480ed
c480ed
Conflicts:
c480ed
    The upstream commit relied on:
c480ed
        - v4.7.0-37-gb72183223f
c480ed
        - v4.7.0-38-ga14f597266
c480ed
    which were not backported so virPCIDeviceAddressAsString had to
c480ed
    be swapped for the former virDomainPCIAddressAsString in order to
c480ed
    compile.
c480ed
c480ed
Signed-off-by: Erik Skultety <eskultet@redhat.com>
c480ed
Message-Id: <03c00ebf46d85b0615134ef8655e67a4c909b7da.1556884443.git.eskultet@redhat.com>
c480ed
Reviewed-by: Andrea Bolognani <abologna@redhat.com>
c480ed
---
c480ed
 src/qemu/qemu_domain.c | 80 ++++++++++++++++++++++++++++++++----------
c480ed
 1 file changed, 61 insertions(+), 19 deletions(-)
c480ed
c480ed
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
c480ed
index a8bc618389..21f0722495 100644
c480ed
--- a/src/qemu/qemu_domain.c
c480ed
+++ b/src/qemu/qemu_domain.c
c480ed
@@ -9813,7 +9813,7 @@ qemuDomainUpdateCurrentMemorySize(virQEMUDriverPtr driver,
c480ed
  * such as '0004:04:00.0', and tells if the device is a NVLink2
c480ed
  * bridge.
c480ed
  */
c480ed
-static ATTRIBUTE_UNUSED bool
c480ed
+static bool
c480ed
 ppc64VFIODeviceIsNV2Bridge(const char *device)
c480ed
 {
c480ed
     const char *nvlink2Files[] = {"ibm,gpu", "ibm,nvlink",
c480ed
@@ -9851,7 +9851,9 @@ getPPC64MemLockLimitBytes(virDomainDefPtr def)
c480ed
     unsigned long long maxMemory = 0;
c480ed
     unsigned long long passthroughLimit = 0;
c480ed
     size_t i, nPCIHostBridges = 0;
c480ed
+    virPCIDeviceAddressPtr pciAddr;
c480ed
     bool usesVFIO = false;
c480ed
+    bool nvlink2Capable = false;
c480ed
 
c480ed
     for (i = 0; i < def->ncontrollers; i++) {
c480ed
         virDomainControllerDefPtr cont = def->controllers[i];
c480ed
@@ -9869,7 +9871,17 @@ getPPC64MemLockLimitBytes(virDomainDefPtr def)
c480ed
             dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
c480ed
             dev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
c480ed
             usesVFIO = true;
c480ed
-            break;
c480ed
+
c480ed
+            pciAddr = &dev->source.subsys.u.pci.addr;
c480ed
+            if (virPCIDeviceAddressIsValid(pciAddr, false)) {
c480ed
+                VIR_AUTOFREE(char *) pciAddrStr = NULL;
c480ed
+
c480ed
+                pciAddrStr = virDomainPCIAddressAsString(pciAddr);
c480ed
+                if (ppc64VFIODeviceIsNV2Bridge(pciAddrStr)) {
c480ed
+                    nvlink2Capable = true;
c480ed
+                    break;
c480ed
+                }
c480ed
+            }
c480ed
         }
c480ed
     }
c480ed
 
c480ed
@@ -9896,29 +9908,59 @@ getPPC64MemLockLimitBytes(virDomainDefPtr def)
c480ed
                 4096 * nPCIHostBridges +
c480ed
                 8192;
c480ed
 
c480ed
-    /* passthroughLimit := max( 2 GiB * #PHBs,                       (c)
c480ed
-     *                          memory                               (d)
c480ed
-     *                          + memory * 1/512 * #PHBs + 8 MiB )   (e)
c480ed
+    /* NVLink2 support in QEMU is a special case of the passthrough
c480ed
+     * mechanics explained in the usesVFIO case below. The GPU RAM
c480ed
+     * is placed with a gap after maxMemory. The current QEMU
c480ed
+     * implementation puts the NVIDIA RAM above the PCI MMIO, which
c480ed
+     * starts at 32TiB and is the MMIO reserved for the guest main RAM.
c480ed
      *
c480ed
-     * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2 GiB
c480ed
-     * rather than 1 GiB
c480ed
+     * This window ends at 64TiB, and this is where the GPUs are being
c480ed
+     * placed. The next available window size is at 128TiB, and
c480ed
+     * 64TiB..128TiB will fit all possible NVIDIA GPUs.
c480ed
      *
c480ed
-     * (d) is the with-DDW (and memory pre-registration and related
c480ed
-     * features) DMA window accounting - assuming that we only account RAM
c480ed
-     * once, even if mapped to multiple PHBs
c480ed
+     * The same assumption as the most common case applies here:
c480ed
+     * the guest will request a 64-bit DMA window, per PHB, that is
c480ed
+     * big enough to map all its RAM, which is now at 128TiB due
c480ed
+     * to the GPUs.
c480ed
      *
c480ed
-     * (e) is the with-DDW userspace view and overhead for the 64-bit DMA
c480ed
-     * window. This is based a bit on expected guest behaviour, but there
c480ed
-     * really isn't a way to completely avoid that. We assume the guest
c480ed
-     * requests a 64-bit DMA window (per PHB) just big enough to map all
c480ed
-     * its RAM. 4 kiB page size gives the 1/512; it will be less with 64
c480ed
-     * kiB pages, less still if the guest is mapped with hugepages (unlike
c480ed
-     * the default 32-bit DMA window, DDW windows can use large IOMMU
c480ed
-     * pages). 8 MiB is for second and further level overheads, like (b) */
c480ed
-    if (usesVFIO)
c480ed
+     * Note that the NVIDIA RAM window must be accounted for the TCE
c480ed
+     * table size, but *not* for the main RAM (maxMemory). This gives
c480ed
+     * us the following passthroughLimit for the NVLink2 case:
c480ed
+     *
c480ed
+     * passthroughLimit = maxMemory +
c480ed
+     *                    128 TiB / 512 * #PHBs + 8 MiB */
c480ed
+    if (nvlink2Capable) {
c480ed
+        passthroughLimit = maxMemory +
c480ed
+                           128 * (1ULL<<30) / 512 * nPCIHostBridges +
c480ed
+                           8192;
c480ed
+    } else if (usesVFIO) {
c480ed
+        /* For regular (non-NVLink2 present) VFIO passthrough, the value
c480ed
+         * of passthroughLimit is:
c480ed
+         *
c480ed
+         * passthroughLimit := max( 2 GiB * #PHBs,                       (c)
c480ed
+         *                          memory                               (d)
c480ed
+         *                          + memory * 1/512 * #PHBs + 8 MiB )   (e)
c480ed
+         *
c480ed
+         * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2
c480ed
+         * GiB rather than 1 GiB
c480ed
+         *
c480ed
+         * (d) is the with-DDW (and memory pre-registration and related
c480ed
+         * features) DMA window accounting - assuming that we only account
c480ed
+         * RAM once, even if mapped to multiple PHBs
c480ed
+         *
c480ed
+         * (e) is the with-DDW userspace view and overhead for the 64-bit
c480ed
+         * DMA window. This is based a bit on expected guest behaviour, but
c480ed
+         * there really isn't a way to completely avoid that. We assume the
c480ed
+         * guest requests a 64-bit DMA window (per PHB) just big enough to
c480ed
+         * map all its RAM. 4 kiB page size gives the 1/512; it will be
c480ed
+         * less with 64 kiB pages, less still if the guest is mapped with
c480ed
+         * hugepages (unlike the default 32-bit DMA window, DDW windows
c480ed
+         * can use large IOMMU pages). 8 MiB is for second and further level
c480ed
+         * overheads, like (b) */
c480ed
         passthroughLimit = MAX(2 * 1024 * 1024 * nPCIHostBridges,
c480ed
                                memory +
c480ed
                                memory / 512 * nPCIHostBridges + 8192);
c480ed
+    }
c480ed
 
c480ed
     memKB = baseLimit + passthroughLimit;
c480ed
 
c480ed
-- 
c480ed
2.21.0
c480ed