From eb7ef8053311d82d43912a5cc1e82d0266bb29de Mon Sep 17 00:00:00 2001
Message-Id: <eb7ef8053311d82d43912a5cc1e82d0266bb29de@dist-git>
From: Michal Privoznik <mprivozn@redhat.com>
Date: Thu, 18 Apr 2019 18:58:57 +0200
Subject: [PATCH] qemu: Rework setting process affinity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RHEL-7.7: https://bugzilla.redhat.com/show_bug.cgi?id=1695434
RHEL-8.0.1: https://bugzilla.redhat.com/show_bug.cgi?id=1503284

The way we currently start qemu, from a CPU affinity POV, is as
follows:

  1) the child process has its affinity set to all online CPUs
  (unless some vcpu pinning was given in the domain XML)

  2) once qemu is running, the cpuset cgroup is configured, taking
  memory pinning into account

The problem is that we let qemu allocate its memory just anywhere
in 1) and then rely on 2) to be able to move the memory to the
configured NUMA nodes. This might not always be possible (e.g.
qemu might lock some parts of its memory) and it is very
suboptimal (copying large amounts of memory between NUMA nodes
takes a significant amount of time).

The solution is to set affinity to one of (in priority order):
  - The CPUs associated with the NUMA memory affinity mask
  - The CPUs associated with emulator pinning
  - All online host CPUs

Later (once QEMU has allocated its memory) we then change this
again to (again in priority order; both cascades are sketched
below):
  - The CPUs associated with emulator pinning
  - The CPUs returned by numad
  - The CPUs associated with vCPU pinning
  - All online host CPUs
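
To illustrate the two cascades, here is a minimal standalone C
sketch. The cpumask type, helper values, and names are illustrative
stand-ins, not libvirt's virBitmap API:

    /* affinity-cascade.c: build with `cc affinity-cascade.c` */
    #include <stdio.h>

    typedef unsigned long cpumask;      /* stand-in for virBitmapPtr */

    /* Pretend domain config; 0 means "not set in the domain XML". */
    static const cpumask numa_nodeset_cpus = 0x0f; /* strict <numatune> */
    static const cpumask emulatorpin_cpus  = 0;    /* <emulatorpin> */
    static const cpumask numad_cpus        = 0;    /* placement="auto" */
    static const cpumask vcpupin_cpus      = 0;    /* <vcpupin> union */
    static const cpumask all_online_cpus   = 0xff; /* 8 online host CPUs */

    /* Affinity applied before QEMU allocates its memory. */
    static cpumask pre_start_mask(void)
    {
        if (numa_nodeset_cpus)
            return numa_nodeset_cpus;
        if (emulatorpin_cpus)
            return emulatorpin_cpus;
        return all_online_cpus;
    }

    /* Affinity applied once the memory is already allocated. */
    static cpumask post_start_mask(void)
    {
        if (emulatorpin_cpus)
            return emulatorpin_cpus;
        if (numad_cpus)
            return numad_cpus;
        if (vcpupin_cpus)
            return vcpupin_cpus;
        return all_online_cpus;
    }

    int main(void)
    {
        printf("pre-start:  0x%lx\n", pre_start_mask());
        printf("post-start: 0x%lx\n", post_start_mask());
        return 0;
    }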

Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
(cherry picked from commit f136b83139c63f20de0df3285d9e82df2fb97bfc)

I had to explicitly free bitmaps, because there is no VIR_AUTOPTR
just yet.
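
(For context: VIR_AUTOPTR-style cleanup is built on the GCC/Clang
cleanup attribute. A simplified, hypothetical illustration follows,
not libvirt's exact macro:

    #include <stdlib.h>

    /* Called automatically when an AUTOFREE variable leaves scope. */
    static void freep(void *p) { free(*(void **)p); }
    #define AUTOFREE __attribute__((cleanup(freep)))

    int scope_demo(void)
    {
        AUTOFREE char *buf = malloc(64);
        if (!buf)
            return -1;
        return 0;   /* buf is freed on every exit path */
    }

Without such a macro, every exit path needs an explicit free, hence
the extra virBitmapFree() calls in this backport.)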

Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
Message-Id: <a6edd347c999f999a49d1a878c74c690eb2ab619.1555606711.git.mprivozn@redhat.com>
Reviewed-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_process.c | 130 +++++++++++++++++++---------------------
 1 file changed, 63 insertions(+), 67 deletions(-)

diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 6945d76b18..7d04a6e226 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -2335,6 +2335,21 @@ qemuProcessDetectIOThreadPIDs(virQEMUDriverPtr driver,
 }
 
 
+static int
+qemuProcessGetAllCpuAffinity(virBitmapPtr *cpumapRet)
+{
+    *cpumapRet = NULL;
+
+    if (!virHostCPUHasBitmap())
+        return 0;
+
+    if (!(*cpumapRet = virHostCPUGetOnlineBitmap()))
+        return -1;
+
+    return 0;
+}
+
+
 /*
  * To be run between fork/exec of QEMU only
  */
@@ -2342,9 +2357,9 @@ static int
 qemuProcessInitCpuAffinity(virDomainObjPtr vm)
 {
     int ret = -1;
-    virBitmapPtr cpumap = NULL;
     virBitmapPtr cpumapToSet = NULL;
     virBitmapPtr hostcpumap = NULL;
+    virDomainNumatuneMemMode mem_mode;
     qemuDomainObjPrivatePtr priv = vm->privateData;
 
     if (!vm->pid) {
@@ -2353,58 +2368,39 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm)
         return -1;
     }
 
-    if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
-        VIR_DEBUG("Set CPU affinity with advisory nodeset from numad");
-        cpumapToSet = priv->autoCpuset;
+    /* Here is the deal: we can't set cpuset.mems before qemu is
+     * started as it clashes with KVM allocation. Therefore, we
+     * used to let qemu allocate its memory anywhere, as we would
+     * then move the memory to the desired NUMA node via CGroups.
+     * However, that might not always be possible, because qemu
+     * might lock some parts of its memory (e.g. due to VFIO).
+     * Even if it is possible, memory has to be copied between
+     * NUMA nodes, which is suboptimal.
+     * The solution is to set an affinity that best matches what
+     * we would have set in CGroups, and then to fix it later,
+     * once qemu is already running. */
+    if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
+        virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
+        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
+        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
+                                             priv->autoNodeset,
+                                             &cpumapToSet,
+                                             -1) < 0)
+            goto cleanup;
+    } else if (vm->def->cputune.emulatorpin) {
+        cpumapToSet = vm->def->cputune.emulatorpin;
     } else {
-        VIR_DEBUG("Set CPU affinity with specified cpuset");
-        if (vm->def->cpumask) {
-            cpumapToSet = vm->def->cpumask;
-        } else {
-            /* You may think this is redundant, but we can't assume libvirtd
-             * itself is running on all pCPUs, so we need to explicitly set
-             * the spawned QEMU instance to all pCPUs if no map is given in
-             * its config file */
-            int hostcpus;
-
-            if (virHostCPUHasBitmap()) {
-                hostcpumap = virHostCPUGetOnlineBitmap();
-                cpumap = virProcessGetAffinity(vm->pid);
-            }
-
-            if (hostcpumap && cpumap && virBitmapEqual(hostcpumap, cpumap)) {
-                /* we're using all available CPUs, no reason to set
-                 * mask. If libvirtd is running without explicit
-                 * affinity, we can use hotplugged CPUs for this VM */
-                ret = 0;
-                goto cleanup;
-            } else {
-                /* setaffinity fails if you set bits for CPUs which
-                 * aren't present, so we have to limit ourselves */
-                if ((hostcpus = virHostCPUGetCount()) < 0)
-                    goto cleanup;
-
-                if (hostcpus > QEMUD_CPUMASK_LEN)
-                    hostcpus = QEMUD_CPUMASK_LEN;
-
-                virBitmapFree(cpumap);
-                if (!(cpumap = virBitmapNew(hostcpus)))
-                    goto cleanup;
-
-                virBitmapSetAll(cpumap);
-
-                cpumapToSet = cpumap;
-            }
-        }
+        if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
+            goto cleanup;
+        cpumapToSet = hostcpumap;
     }
 
-    if (virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
+    if (cpumapToSet &&
+        virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
         goto cleanup;
 
     ret = 0;
-
 cleanup:
-    virBitmapFree(cpumap);
     virBitmapFree(hostcpumap);
     return ret;
 }
@@ -2478,7 +2474,8 @@ qemuProcessSetupPid(virDomainObjPtr vm,
     qemuDomainObjPrivatePtr priv = vm->privateData;
     virDomainNumatuneMemMode mem_mode;
     virCgroupPtr cgroup = NULL;
-    virBitmapPtr use_cpumask;
+    virBitmapPtr use_cpumask = NULL;
+    virBitmapPtr hostcpumap = NULL;
     char *mem_mask = NULL;
     int ret = -1;
 
@@ -2490,12 +2487,21 @@ qemuProcessSetupPid(virDomainObjPtr vm,
     }
 
     /* Infer which cpumask shall be used. */
-    if (cpumask)
+    if (cpumask) {
         use_cpumask = cpumask;
-    else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
+    } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
         use_cpumask = priv->autoCpuset;
-    else
+    } else if (vm->def->cpumask) {
         use_cpumask = vm->def->cpumask;
+    } else {
+        /* You may think this is redundant, but we can't assume libvirtd
+         * itself is running on all pCPUs, so we need to explicitly set
+         * the spawned QEMU instance to all pCPUs if no map is given in
+         * its config file */
+        if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
+            goto cleanup;
+        use_cpumask = hostcpumap;
+    }
 
     /*
      * If CPU cgroup controller is not initialized here, then we need
@@ -2520,13 +2526,7 @@ qemuProcessSetupPid(virDomainObjPtr vm,
                 qemuSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
                 goto cleanup;
 
-            /*
-             * Don't setup cpuset.mems for the emulator, they need to
-             * be set up after initialization in order for kvm
-             * allocations to succeed.
-             */
-            if (nameval != VIR_CGROUP_THREAD_EMULATOR &&
-                mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
+            if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
                 goto cleanup;
 
         }
@@ -2553,6 +2553,7 @@ qemuProcessSetupPid(virDomainObjPtr vm,
     ret = 0;
 cleanup:
     VIR_FREE(mem_mask);
+    virBitmapFree(hostcpumap);
    if (cgroup) {
         if (ret < 0)
             virCgroupRemove(cgroup);
@@ -6428,12 +6429,7 @@ qemuProcessLaunch(virConnectPtr conn,
 
     /* This must be done after cgroup placement to avoid resetting CPU
      * affinity */
-    if (!vm->def->cputune.emulatorpin &&
-        qemuProcessInitCpuAffinity(vm) < 0)
-        goto cleanup;
-
-    VIR_DEBUG("Setting emulator tuning/settings");
-    if (qemuProcessSetupEmulator(vm) < 0)
+    if (qemuProcessInitCpuAffinity(vm) < 0)
         goto cleanup;
 
     VIR_DEBUG("Setting cgroup for external devices (if required)");
@@ -6502,10 +6498,6 @@ qemuProcessLaunch(virConnectPtr conn,
     if (qemuProcessUpdateAndVerifyCPU(driver, vm, asyncJob) < 0)
         goto cleanup;
 
-    VIR_DEBUG("Setting up post-init cgroup restrictions");
-    if (qemuSetupCpusetMems(vm) < 0)
-        goto cleanup;
-
     VIR_DEBUG("setting up hotpluggable cpus");
     if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) {
         if (qemuDomainRefreshVcpuInfo(driver, vm, asyncJob, false) < 0)
@@ -6531,6 +6523,10 @@ qemuProcessLaunch(virConnectPtr conn,
     if (qemuProcessDetectIOThreadPIDs(driver, vm, asyncJob) < 0)
         goto cleanup;
 
+    VIR_DEBUG("Setting emulator tuning/settings");
+    if (qemuProcessSetupEmulator(vm) < 0)
+        goto cleanup;
+
     VIR_DEBUG("Setting global CPU cgroup (if required)");
     if (qemuSetupGlobalCpuCgroup(vm) < 0)
         goto cleanup;
-- 
2.21.0