c313de
From dc27c829fd5909394e69ed253979f19b47644569 Mon Sep 17 00:00:00 2001
c313de
Message-Id: <dc27c829fd5909394e69ed253979f19b47644569@dist-git>
c313de
From: Michal Privoznik <mprivozn@redhat.com>
c313de
Date: Wed, 5 Jun 2019 11:33:28 +0200
c313de
Subject: [PATCH] qemu: Rework setting process affinity
c313de
MIME-Version: 1.0
c313de
Content-Type: text/plain; charset=UTF-8
c313de
Content-Transfer-Encoding: 8bit
c313de
c313de
https://bugzilla.redhat.com/show_bug.cgi?id=1503284
c313de
c313de
The way we currently start qemu from CPU affinity POV is as
c313de
follows:
c313de
c313de
  1) the child process is set affinity to all online CPUs (unless
c313de
  some vcpu pinning was given in the domain XML)
c313de
c313de
  2) Once qemu is running, cpuset cgroup is configured taking
c313de
  memory pinning into account
c313de
c313de
Problem is that we let qemu allocate its memory just anywhere in
c313de
1) and then rely on 2) to be able to move the memory to
c313de
configured NUMA nodes. This might not always be possible (e.g.
c313de
qemu might lock some parts of its memory) and is very suboptimal
c313de
(copying large memory between NUMA nodes takes significant amount
c313de
of time).
c313de
c313de
The solution is to set affinity to one of (in priority order):
c313de
  - The CPUs associated with NUMA memory affinity mask
c313de
  - The CPUs associated with emulator pinning
c313de
  - All online host CPUs
c313de
c313de
Later (once QEMU has allocated its memory) we then change this
c313de
again to (again in priority order):
c313de
  - The CPUs associated with emulator pinning
c313de
  - The CPUs returned by numad
c313de
  - The CPUs associated with vCPU pinning
c313de
  - All online host CPUs
c313de
c313de
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
c313de
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
c313de
(cherry picked from commit f136b83139c63f20de0df3285d9e82df2fb97bfc)
c313de
c313de
RHEL-8.1.0: https://bugzilla.redhat.com/show_bug.cgi?id=1716943
c313de
c313de
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
c313de
Message-Id: <c5f31a30daef2be65dc404ab0f1fbfb15be0d062.1559727075.git.mprivozn@redhat.com>
c313de
Reviewed-by: Andrea Bolognani <abologna@redhat.com>
c313de
---
c313de
 src/qemu/qemu_process.c | 132 +++++++++++++++++++---------------------
c313de
 1 file changed, 63 insertions(+), 69 deletions(-)
c313de
c313de
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
c313de
index 2d2954ba18..6071b3ba3d 100644
c313de
--- a/src/qemu/qemu_process.c
c313de
+++ b/src/qemu/qemu_process.c
c313de
@@ -2335,6 +2335,21 @@ qemuProcessDetectIOThreadPIDs(virQEMUDriverPtr driver,
c313de
 }
c313de
 
c313de
 
c313de
+static int
c313de
+qemuProcessGetAllCpuAffinity(virBitmapPtr *cpumapRet)
c313de
+{
c313de
+    *cpumapRet = NULL;
c313de
+
c313de
+    if (!virHostCPUHasBitmap())
c313de
+        return 0;
c313de
+
c313de
+    if (!(*cpumapRet = virHostCPUGetOnlineBitmap()))
c313de
+        return -1;
c313de
+
c313de
+    return 0;
c313de
+}
c313de
+
c313de
+
c313de
 /*
c313de
  * To be run between fork/exec of QEMU only
c313de
  */
c313de
@@ -2342,9 +2357,9 @@ static int
c313de
 qemuProcessInitCpuAffinity(virDomainObjPtr vm)
c313de
 {
c313de
     int ret = -1;
c313de
-    virBitmapPtr cpumap = NULL;
c313de
     virBitmapPtr cpumapToSet = NULL;
c313de
-    virBitmapPtr hostcpumap = NULL;
c313de
+    VIR_AUTOPTR(virBitmap) hostcpumap = NULL;
c313de
+    virDomainNumatuneMemMode mem_mode;
c313de
     qemuDomainObjPrivatePtr priv = vm->privateData;
c313de
 
c313de
     if (!vm->pid) {
c313de
@@ -2353,59 +2368,39 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm)
c313de
         return -1;
c313de
     }
c313de
 
c313de
-    if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
c313de
-        VIR_DEBUG("Set CPU affinity with advisory nodeset from numad");
c313de
-        cpumapToSet = priv->autoCpuset;
c313de
+    /* Here is the deal, we can't set cpuset.mems before qemu is
c313de
+     * started as it clashes with KVM allocation. Therefore, we
c313de
+     * used to let qemu allocate its memory anywhere as we would
c313de
+     * then move the memory to desired NUMA node via CGroups.
c313de
+     * However, that might not be always possible because qemu
c313de
+     * might lock some parts of its memory (e.g. due to VFIO).
c313de
+     * Even if it possible, memory has to be copied between NUMA
c313de
+     * nodes which is suboptimal.
c313de
+     * Solution is to set affinity that matches the best what we
c313de
+     * would have set in CGroups and then fix it later, once qemu
c313de
+     * is already running. */
c313de
+    if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
c313de
+        virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
c313de
+        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
c313de
+        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
c313de
+                                             priv->autoNodeset,
c313de
+                                             &cpumapToSet,
c313de
+                                             -1) < 0)
c313de
+            goto cleanup;
c313de
+    } else if (vm->def->cputune.emulatorpin) {
c313de
+        cpumapToSet = vm->def->cputune.emulatorpin;
c313de
     } else {
c313de
-        VIR_DEBUG("Set CPU affinity with specified cpuset");
c313de
-        if (vm->def->cpumask) {
c313de
-            cpumapToSet = vm->def->cpumask;
c313de
-        } else {
c313de
-            /* You may think this is redundant, but we can't assume libvirtd
c313de
-             * itself is running on all pCPUs, so we need to explicitly set
c313de
-             * the spawned QEMU instance to all pCPUs if no map is given in
c313de
-             * its config file */
c313de
-            int hostcpus;
c313de
-
c313de
-            if (virHostCPUHasBitmap()) {
c313de
-                hostcpumap = virHostCPUGetOnlineBitmap();
c313de
-                cpumap = virProcessGetAffinity(vm->pid);
c313de
-            }
c313de
-
c313de
-            if (hostcpumap && cpumap && virBitmapEqual(hostcpumap, cpumap)) {
c313de
-                /* we're using all available CPUs, no reason to set
c313de
-                 * mask. If libvirtd is running without explicit
c313de
-                 * affinity, we can use hotplugged CPUs for this VM */
c313de
-                ret = 0;
c313de
-                goto cleanup;
c313de
-            } else {
c313de
-                /* setaffinity fails if you set bits for CPUs which
c313de
-                 * aren't present, so we have to limit ourselves */
c313de
-                if ((hostcpus = virHostCPUGetCount()) < 0)
c313de
-                    goto cleanup;
c313de
-
c313de
-                if (hostcpus > QEMUD_CPUMASK_LEN)
c313de
-                    hostcpus = QEMUD_CPUMASK_LEN;
c313de
-
c313de
-                virBitmapFree(cpumap);
c313de
-                if (!(cpumap = virBitmapNew(hostcpus)))
c313de
-                    goto cleanup;
c313de
-
c313de
-                virBitmapSetAll(cpumap);
c313de
-
c313de
-                cpumapToSet = cpumap;
c313de
-            }
c313de
-        }
c313de
+        if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
c313de
+            goto cleanup;
c313de
+        cpumapToSet = hostcpumap;
c313de
     }
c313de
 
c313de
-    if (virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
c313de
+    if (cpumapToSet &&
c313de
+        virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
c313de
         goto cleanup;
c313de
 
c313de
     ret = 0;
c313de
-
c313de
  cleanup:
c313de
-    virBitmapFree(cpumap);
c313de
-    virBitmapFree(hostcpumap);
c313de
     return ret;
c313de
 }
c313de
 
c313de
@@ -2478,7 +2473,8 @@ qemuProcessSetupPid(virDomainObjPtr vm,
c313de
     qemuDomainObjPrivatePtr priv = vm->privateData;
c313de
     virDomainNumatuneMemMode mem_mode;
c313de
     virCgroupPtr cgroup = NULL;
c313de
-    virBitmapPtr use_cpumask;
c313de
+    virBitmapPtr use_cpumask = NULL;
c313de
+    VIR_AUTOPTR(virBitmap) hostcpumap = NULL;
c313de
     char *mem_mask = NULL;
c313de
     int ret = -1;
c313de
 
c313de
@@ -2490,12 +2486,21 @@ qemuProcessSetupPid(virDomainObjPtr vm,
c313de
     }
c313de
 
c313de
     /* Infer which cpumask shall be used. */
c313de
-    if (cpumask)
c313de
+    if (cpumask) {
c313de
         use_cpumask = cpumask;
c313de
-    else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
c313de
+    } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
c313de
         use_cpumask = priv->autoCpuset;
c313de
-    else
c313de
+    } else if (vm->def->cpumask) {
c313de
         use_cpumask = vm->def->cpumask;
c313de
+    } else {
c313de
+        /* You may think this is redundant, but we can't assume libvirtd
c313de
+         * itself is running on all pCPUs, so we need to explicitly set
c313de
+         * the spawned QEMU instance to all pCPUs if no map is given in
c313de
+         * its config file */
c313de
+        if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
c313de
+            goto cleanup;
c313de
+        use_cpumask = hostcpumap;
c313de
+    }
c313de
 
c313de
     /*
c313de
      * If CPU cgroup controller is not initialized here, then we need
c313de
@@ -2520,13 +2525,7 @@ qemuProcessSetupPid(virDomainObjPtr vm,
c313de
                 qemuSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
c313de
                 goto cleanup;
c313de
 
c313de
-            /*
c313de
-             * Don't setup cpuset.mems for the emulator, they need to
c313de
-             * be set up after initialization in order for kvm
c313de
-             * allocations to succeed.
c313de
-             */
c313de
-            if (nameval != VIR_CGROUP_THREAD_EMULATOR &&
c313de
-                mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
c313de
+            if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
c313de
                 goto cleanup;
c313de
 
c313de
         }
c313de
@@ -6440,12 +6439,7 @@ qemuProcessLaunch(virConnectPtr conn,
c313de
 
c313de
     /* This must be done after cgroup placement to avoid resetting CPU
c313de
      * affinity */
c313de
-    if (!vm->def->cputune.emulatorpin &&
c313de
-        qemuProcessInitCpuAffinity(vm) < 0)
c313de
-        goto cleanup;
c313de
-
c313de
-    VIR_DEBUG("Setting emulator tuning/settings");
c313de
-    if (qemuProcessSetupEmulator(vm) < 0)
c313de
+    if (qemuProcessInitCpuAffinity(vm) < 0)
c313de
         goto cleanup;
c313de
 
c313de
     VIR_DEBUG("Setting cgroup for external devices (if required)");
c313de
@@ -6514,10 +6508,6 @@ qemuProcessLaunch(virConnectPtr conn,
c313de
     if (qemuProcessUpdateAndVerifyCPU(driver, vm, asyncJob) < 0)
c313de
         goto cleanup;
c313de
 
c313de
-    VIR_DEBUG("Setting up post-init cgroup restrictions");
c313de
-    if (qemuSetupCpusetMems(vm) < 0)
c313de
-        goto cleanup;
c313de
-
c313de
     VIR_DEBUG("setting up hotpluggable cpus");
c313de
     if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) {
c313de
         if (qemuDomainRefreshVcpuInfo(driver, vm, asyncJob, false) < 0)
c313de
@@ -6543,6 +6533,10 @@ qemuProcessLaunch(virConnectPtr conn,
c313de
     if (qemuProcessDetectIOThreadPIDs(driver, vm, asyncJob) < 0)
c313de
         goto cleanup;
c313de
 
c313de
+    VIR_DEBUG("Setting emulator tuning/settings");
c313de
+    if (qemuProcessSetupEmulator(vm) < 0)
c313de
+        goto cleanup;
c313de
+
c313de
     VIR_DEBUG("Setting global CPU cgroup (if required)");
c313de
     if (qemuSetupGlobalCpuCgroup(vm) < 0)
c313de
         goto cleanup;
c313de
-- 
c313de
2.22.0
c313de