38f2fd
From c1bbe50778b3a114bccccb1d94fa9beefcc52ae5 Mon Sep 17 00:00:00 2001
38f2fd
Message-Id: <c1bbe50778b3a114bccccb1d94fa9beefcc52ae5@dist-git>
38f2fd
From: Andrea Bolognani <abologna@redhat.com>
38f2fd
Date: Fri, 20 Nov 2015 13:29:14 +0100
38f2fd
Subject: [PATCH] qemu: Add ppc64-specific math to
38f2fd
 qemuDomainGetMlockLimitBytes()
38f2fd
38f2fd
The amount of memory a ppc64 domain might need to lock is different
38f2fd
than that of an equally-sized x86 domain, so we need to check the
38f2fd
domain's architecture and act accordingly.
38f2fd
38f2fd
Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1273480
38f2fd
38f2fd
(cherry picked from commit d269ef165c178ad62b48e5179fc4f3b4fa5e590b)
38f2fd
38f2fd
Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1283924
38f2fd
38f2fd
Signed-off-by: Andrea Bolognani <abologna@redhat.com>
38f2fd
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
38f2fd
---
38f2fd
 src/qemu/qemu_domain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++-
38f2fd
 1 file changed, 79 insertions(+), 1 deletion(-)
38f2fd
38f2fd
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
38f2fd
index be3dcf6..4615e3e 100644
38f2fd
--- a/src/qemu/qemu_domain.c
38f2fd
+++ b/src/qemu/qemu_domain.c
38f2fd
@@ -3430,7 +3430,7 @@ qemuDomainUpdateCurrentMemorySize(virQEMUDriverPtr driver,
38f2fd
  * @def: domain definition
38f2fd
  *
38f2fd
  * Returns the size of the memory in bytes that needs to be set as
38f2fd
- * RLIMIT_MEMLOCK for purpose of VFIO device passthrough.
38f2fd
+ * RLIMIT_MEMLOCK for the QEMU process.
38f2fd
  * If a mem.hard_limit is set, then that value is preferred; otherwise, the
38f2fd
  * value returned may depend upon the architecture or devices present.
38f2fd
  */
38f2fd
@@ -3445,6 +3445,84 @@ qemuDomainGetMlockLimitBytes(virDomainDefPtr def)
38f2fd
         goto done;
38f2fd
     }
38f2fd
 
38f2fd
+    if (ARCH_IS_PPC64(def->os.arch)) {
38f2fd
+        unsigned long long maxMemory;
38f2fd
+        unsigned long long memory;
38f2fd
+        unsigned long long baseLimit;
38f2fd
+        unsigned long long passthroughLimit;
38f2fd
+        size_t nPCIHostBridges;
38f2fd
+        size_t i;
38f2fd
+        bool usesVFIO = false;
38f2fd
+
38f2fd
+        /* TODO: Detect at runtime once we start using more than just
38f2fd
+         *       the default PCI Host Bridge */
38f2fd
+        nPCIHostBridges = 1;
38f2fd
+
38f2fd
+        for (i = 0; i < def->nhostdevs; i++) {
38f2fd
+            virDomainHostdevDefPtr dev = def->hostdevs[i];
38f2fd
+
38f2fd
+            if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
38f2fd
+                dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
38f2fd
+                dev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
38f2fd
+                usesVFIO = true;
38f2fd
+                break;
38f2fd
+            }
38f2fd
+        }
38f2fd
+
38f2fd
+        memory = virDomainDefGetMemoryActual(def);
38f2fd
+
38f2fd
+        if (def->mem.max_memory)
38f2fd
+            maxMemory = def->mem.max_memory;
38f2fd
+        else
38f2fd
+            maxMemory = memory;
38f2fd
+
38f2fd
+        /* baseLimit := maxMemory / 128                                  (a)
38f2fd
+         *              + 4 MiB * #PHBs + 8 MiB                          (b)
38f2fd
+         *
38f2fd
+         * (a) is the hash table
38f2fd
+         *
38f2fd
+         * (b) is accounting for the 32-bit DMA window - it could be either the
38f2fd
+         * KVM accelerated TCE tables for emulated devices, or the VFIO
38f2fd
+         * userspace view. The 4 MiB per-PHB (including the default one) covers
38f2fd
+         * a 2GiB DMA window: default is 1GiB, but it's possible it'll be
38f2fd
+         * increased to help performance. The 8 MiB extra should be plenty for
38f2fd
+         * the TCE table index for any reasonable number of PHBs and several
38f2fd
+         * spapr-vlan or spapr-vscsi devices (512kB + a tiny bit each) */
38f2fd
+        baseLimit = maxMemory / 128 +
38f2fd
+                    4096 * nPCIHostBridges +
38f2fd
+                    8192;
38f2fd
+
38f2fd
+        /* passthroughLimit := max( 2 GiB * #PHBs,                       (c)
38f2fd
+         *                          memory                               (d)
38f2fd
+         *                          + memory * 1/512 * #PHBs + 8 MiB )   (e)
38f2fd
+         *
38f2fd
+         * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2 GiB
38f2fd
+         * rather than 1 GiB
38f2fd
+         *
38f2fd
+         * (d) is the with-DDW (and memory pre-registration and related
38f2fd
+         * features) DMA window accounting - assuming that we only account RAM
38f2fd
+         * once, even if mapped to multiple PHBs
38f2fd
+         *
38f2fd
+         * (e) is the with-DDW userspace view and overhead for the 64-bit DMA
38f2fd
+         * window. This is based a bit on expected guest behaviour, but there
38f2fd
+         * really isn't a way to completely avoid that. We assume the guest
38f2fd
+         * requests a 64-bit DMA window (per PHB) just big enough to map all
38f2fd
+         * its RAM. 4 KiB page size gives the 1/512; it will be less with 64
38f2fd
+         * KiB pages, less still if the guest is mapped with hugepages (unlike
38f2fd
+         * the default 32-bit DMA window, DDW windows can use large IOMMU
38f2fd
+         * pages). 8 MiB is for second and further level overheads, like (b) */
38f2fd
+        passthroughLimit = MAX(2 * 1024 * 1024 * nPCIHostBridges,
38f2fd
+                               memory +
38f2fd
+                               memory / 512 * nPCIHostBridges + 8192);
38f2fd
+
38f2fd
+        if (usesVFIO)
38f2fd
+            memKB = baseLimit + passthroughLimit;
38f2fd
+        else
38f2fd
+            memKB = baseLimit;
38f2fd
+
38f2fd
+        goto done;
38f2fd
+    }
38f2fd
+
38f2fd
     /* For device passthrough using VFIO the guest memory and MMIO memory
38f2fd
      * regions need to be locked persistent in order to allow DMA.
38f2fd
      *
38f2fd
-- 
38f2fd
2.6.3
38f2fd