c480ed
From dc27c829fd5909394e69ed253979f19b47644569 Mon Sep 17 00:00:00 2001
c480ed
Message-Id: <dc27c829fd5909394e69ed253979f19b47644569@dist-git>
c480ed
From: Michal Privoznik <mprivozn@redhat.com>
c480ed
Date: Wed, 5 Jun 2019 11:33:28 +0200
c480ed
Subject: [PATCH] qemu: Rework setting process affinity
c480ed
MIME-Version: 1.0
c480ed
Content-Type: text/plain; charset=UTF-8
c480ed
Content-Transfer-Encoding: 8bit
c480ed
c480ed
https://bugzilla.redhat.com/show_bug.cgi?id=1503284
c480ed
c480ed
The way we currently start qemu from CPU affinity POV is as
c480ed
follows:
c480ed
c480ed
  1) the child process is set affinity to all online CPUs (unless
c480ed
  some vcpu pinning was given in the domain XML)
c480ed
c480ed
  2) Once qemu is running, cpuset cgroup is configured taking
c480ed
  memory pinning into account
c480ed
c480ed
Problem is that we let qemu allocate its memory just anywhere in
c480ed
1) and then rely in 2) to be able to move the memory to
c480ed
configured NUMA nodes. This might not be always possible (e.g.
c480ed
qemu might lock some parts of its memory) and is very suboptimal
c480ed
(copying large memory between NUMA nodes takes significant amount
c480ed
of time).
c480ed
c480ed
The solution is to set affinity to one of (in priority order):
c480ed
  - The CPUs associated with NUMA memory affinity mask
c480ed
  - The CPUs associated with emulator pinning
c480ed
  - All online host CPUs
c480ed
c480ed
Later (once QEMU has allocated its memory) we then change this
c480ed
again to (again in priority order):
c480ed
  - The CPUs associated with emulator pinning
c480ed
  - The CPUs returned by numad
c480ed
  - The CPUs associated with vCPU pinning
c480ed
  - All online host CPUs
c480ed
c480ed
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
c480ed
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
c480ed
(cherry picked from commit f136b83139c63f20de0df3285d9e82df2fb97bfc)
c480ed
c480ed
RHEL-8.1.0: https://bugzilla.redhat.com/show_bug.cgi?id=1716943
c480ed
c480ed
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
c480ed
Message-Id: <c5f31a30daef2be65dc404ab0f1fbfb15be0d062.1559727075.git.mprivozn@redhat.com>
c480ed
Reviewed-by: Andrea Bolognani <abologna@redhat.com>
c480ed
---
c480ed
 src/qemu/qemu_process.c | 132 +++++++++++++++++++---------------------
c480ed
 1 file changed, 63 insertions(+), 69 deletions(-)
c480ed
c480ed
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
c480ed
index 2d2954ba18..6071b3ba3d 100644
c480ed
--- a/src/qemu/qemu_process.c
c480ed
+++ b/src/qemu/qemu_process.c
c480ed
@@ -2335,6 +2335,21 @@ qemuProcessDetectIOThreadPIDs(virQEMUDriverPtr driver,
c480ed
 }
c480ed
 
c480ed
 
c480ed
+static int
c480ed
+qemuProcessGetAllCpuAffinity(virBitmapPtr *cpumapRet)
c480ed
+{
c480ed
+    *cpumapRet = NULL;
c480ed
+
c480ed
+    if (!virHostCPUHasBitmap())
c480ed
+        return 0;
c480ed
+
c480ed
+    if (!(*cpumapRet = virHostCPUGetOnlineBitmap()))
c480ed
+        return -1;
c480ed
+
c480ed
+    return 0;
c480ed
+}
c480ed
+
c480ed
+
c480ed
 /*
c480ed
  * To be run between fork/exec of QEMU only
c480ed
  */
c480ed
@@ -2342,9 +2357,9 @@ static int
c480ed
 qemuProcessInitCpuAffinity(virDomainObjPtr vm)
c480ed
 {
c480ed
     int ret = -1;
c480ed
-    virBitmapPtr cpumap = NULL;
c480ed
     virBitmapPtr cpumapToSet = NULL;
c480ed
-    virBitmapPtr hostcpumap = NULL;
c480ed
+    VIR_AUTOPTR(virBitmap) hostcpumap = NULL;
c480ed
+    virDomainNumatuneMemMode mem_mode;
c480ed
     qemuDomainObjPrivatePtr priv = vm->privateData;
c480ed
 
c480ed
     if (!vm->pid) {
c480ed
@@ -2353,59 +2368,39 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm)
c480ed
         return -1;
c480ed
     }
c480ed
 
c480ed
-    if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
c480ed
-        VIR_DEBUG("Set CPU affinity with advisory nodeset from numad");
c480ed
-        cpumapToSet = priv->autoCpuset;
c480ed
+    /* Here is the deal, we can't set cpuset.mems before qemu is
c480ed
+     * started as it clashes with KVM allocation. Therefore, we
c480ed
+     * used to let qemu allocate its memory anywhere as we would
c480ed
+     * then move the memory to desired NUMA node via CGroups.
c480ed
+     * However, that might not be always possible because qemu
c480ed
+     * might lock some parts of its memory (e.g. due to VFIO).
c480ed
+     * Even if it possible, memory has to be copied between NUMA
c480ed
+     * nodes which is suboptimal.
c480ed
+     * Solution is to set affinity that matches the best what we
c480ed
+     * would have set in CGroups and then fix it later, once qemu
c480ed
+     * is already running. */
c480ed
+    if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
c480ed
+        virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
c480ed
+        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
c480ed
+        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
c480ed
+                                             priv->autoNodeset,
c480ed
+                                             &cpumapToSet,
c480ed
+                                             -1) < 0)
c480ed
+            goto cleanup;
c480ed
+    } else if (vm->def->cputune.emulatorpin) {
c480ed
+        cpumapToSet = vm->def->cputune.emulatorpin;
c480ed
     } else {
c480ed
-        VIR_DEBUG("Set CPU affinity with specified cpuset");
c480ed
-        if (vm->def->cpumask) {
c480ed
-            cpumapToSet = vm->def->cpumask;
c480ed
-        } else {
c480ed
-            /* You may think this is redundant, but we can't assume libvirtd
c480ed
-             * itself is running on all pCPUs, so we need to explicitly set
c480ed
-             * the spawned QEMU instance to all pCPUs if no map is given in
c480ed
-             * its config file */
c480ed
-            int hostcpus;
c480ed
-
c480ed
-            if (virHostCPUHasBitmap()) {
c480ed
-                hostcpumap = virHostCPUGetOnlineBitmap();
c480ed
-                cpumap = virProcessGetAffinity(vm->pid);
c480ed
-            }
c480ed
-
c480ed
-            if (hostcpumap && cpumap && virBitmapEqual(hostcpumap, cpumap)) {
c480ed
-                /* we're using all available CPUs, no reason to set
c480ed
-                 * mask. If libvirtd is running without explicit
c480ed
-                 * affinity, we can use hotplugged CPUs for this VM */
c480ed
-                ret = 0;
c480ed
-                goto cleanup;
c480ed
-            } else {
c480ed
-                /* setaffinity fails if you set bits for CPUs which
c480ed
-                 * aren't present, so we have to limit ourselves */
c480ed
-                if ((hostcpus = virHostCPUGetCount()) < 0)
c480ed
-                    goto cleanup;
c480ed
-
c480ed
-                if (hostcpus > QEMUD_CPUMASK_LEN)
c480ed
-                    hostcpus = QEMUD_CPUMASK_LEN;
c480ed
-
c480ed
-                virBitmapFree(cpumap);
c480ed
-                if (!(cpumap = virBitmapNew(hostcpus)))
c480ed
-                    goto cleanup;
c480ed
-
c480ed
-                virBitmapSetAll(cpumap);
c480ed
-
c480ed
-                cpumapToSet = cpumap;
c480ed
-            }
c480ed
-        }
c480ed
+        if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
c480ed
+            goto cleanup;
c480ed
+        cpumapToSet = hostcpumap;
c480ed
     }
c480ed
 
c480ed
-    if (virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
c480ed
+    if (cpumapToSet &&
c480ed
+        virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
c480ed
         goto cleanup;
c480ed
 
c480ed
     ret = 0;
c480ed
-
c480ed
  cleanup:
c480ed
-    virBitmapFree(cpumap);
c480ed
-    virBitmapFree(hostcpumap);
c480ed
     return ret;
c480ed
 }
c480ed
 
c480ed
@@ -2478,7 +2473,8 @@ qemuProcessSetupPid(virDomainObjPtr vm,
c480ed
     qemuDomainObjPrivatePtr priv = vm->privateData;
c480ed
     virDomainNumatuneMemMode mem_mode;
c480ed
     virCgroupPtr cgroup = NULL;
c480ed
-    virBitmapPtr use_cpumask;
c480ed
+    virBitmapPtr use_cpumask = NULL;
c480ed
+    VIR_AUTOPTR(virBitmap) hostcpumap = NULL;
c480ed
     char *mem_mask = NULL;
c480ed
     int ret = -1;
c480ed
 
c480ed
@@ -2490,12 +2486,21 @@ qemuProcessSetupPid(virDomainObjPtr vm,
c480ed
     }
c480ed
 
c480ed
     /* Infer which cpumask shall be used. */
c480ed
-    if (cpumask)
c480ed
+    if (cpumask) {
c480ed
         use_cpumask = cpumask;
c480ed
-    else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
c480ed
+    } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
c480ed
         use_cpumask = priv->autoCpuset;
c480ed
-    else
c480ed
+    } else if (vm->def->cpumask) {
c480ed
         use_cpumask = vm->def->cpumask;
c480ed
+    } else {
c480ed
+        /* You may think this is redundant, but we can't assume libvirtd
c480ed
+         * itself is running on all pCPUs, so we need to explicitly set
c480ed
+         * the spawned QEMU instance to all pCPUs if no map is given in
c480ed
+         * its config file */
c480ed
+        if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
c480ed
+            goto cleanup;
c480ed
+        use_cpumask = hostcpumap;
c480ed
+    }
c480ed
 
c480ed
     /*
c480ed
      * If CPU cgroup controller is not initialized here, then we need
c480ed
@@ -2520,13 +2525,7 @@ qemuProcessSetupPid(virDomainObjPtr vm,
c480ed
                 qemuSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
c480ed
                 goto cleanup;
c480ed
 
c480ed
-            /*
c480ed
-             * Don't setup cpuset.mems for the emulator, they need to
c480ed
-             * be set up after initialization in order for kvm
c480ed
-             * allocations to succeed.
c480ed
-             */
c480ed
-            if (nameval != VIR_CGROUP_THREAD_EMULATOR &&
c480ed
-                mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
c480ed
+            if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
c480ed
                 goto cleanup;
c480ed
 
c480ed
         }
c480ed
@@ -6440,12 +6439,7 @@ qemuProcessLaunch(virConnectPtr conn,
c480ed
 
c480ed
     /* This must be done after cgroup placement to avoid resetting CPU
c480ed
      * affinity */
c480ed
-    if (!vm->def->cputune.emulatorpin &&
c480ed
-        qemuProcessInitCpuAffinity(vm) < 0)
c480ed
-        goto cleanup;
c480ed
-
c480ed
-    VIR_DEBUG("Setting emulator tuning/settings");
c480ed
-    if (qemuProcessSetupEmulator(vm) < 0)
c480ed
+    if (qemuProcessInitCpuAffinity(vm) < 0)
c480ed
         goto cleanup;
c480ed
 
c480ed
     VIR_DEBUG("Setting cgroup for external devices (if required)");
c480ed
@@ -6514,10 +6508,6 @@ qemuProcessLaunch(virConnectPtr conn,
c480ed
     if (qemuProcessUpdateAndVerifyCPU(driver, vm, asyncJob) < 0)
c480ed
         goto cleanup;
c480ed
 
c480ed
-    VIR_DEBUG("Setting up post-init cgroup restrictions");
c480ed
-    if (qemuSetupCpusetMems(vm) < 0)
c480ed
-        goto cleanup;
c480ed
-
c480ed
     VIR_DEBUG("setting up hotpluggable cpus");
c480ed
     if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) {
c480ed
         if (qemuDomainRefreshVcpuInfo(driver, vm, asyncJob, false) < 0)
c480ed
@@ -6543,6 +6533,10 @@ qemuProcessLaunch(virConnectPtr conn,
c480ed
     if (qemuProcessDetectIOThreadPIDs(driver, vm, asyncJob) < 0)
c480ed
         goto cleanup;
c480ed
 
c480ed
+    VIR_DEBUG("Setting emulator tuning/settings");
c480ed
+    if (qemuProcessSetupEmulator(vm) < 0)
c480ed
+        goto cleanup;
c480ed
+
c480ed
     VIR_DEBUG("Setting global CPU cgroup (if required)");
c480ed
     if (qemuSetupGlobalCpuCgroup(vm) < 0)
c480ed
         goto cleanup;
c480ed
-- 
c480ed
2.22.0
c480ed