|
Pablo Greco |
40546a |
From dc27c829fd5909394e69ed253979f19b47644569 Mon Sep 17 00:00:00 2001
|
|
Pablo Greco |
40546a |
Message-Id: <dc27c829fd5909394e69ed253979f19b47644569@dist-git>
|
|
Pablo Greco |
40546a |
From: Michal Privoznik <mprivozn@redhat.com>
|
|
Pablo Greco |
40546a |
Date: Wed, 5 Jun 2019 11:33:28 +0200
|
|
Pablo Greco |
40546a |
Subject: [PATCH] qemu: Rework setting process affinity
|
|
Pablo Greco |
40546a |
MIME-Version: 1.0
|
|
Pablo Greco |
40546a |
Content-Type: text/plain; charset=UTF-8
|
|
Pablo Greco |
40546a |
Content-Transfer-Encoding: 8bit
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
https://bugzilla.redhat.com/show_bug.cgi?id=1503284
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
The way we currently start qemu from CPU affinity POV is as
|
|
Pablo Greco |
40546a |
follows:
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
1) the child process has its affinity set to all online CPUs (unless
|
|
Pablo Greco |
40546a |
some vcpu pinning was given in the domain XML)
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
2) Once qemu is running, cpuset cgroup is configured taking
|
|
Pablo Greco |
40546a |
memory pinning into account
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
Problem is that we let qemu allocate its memory just anywhere in
|
|
Pablo Greco |
40546a |
1) and then rely on 2) to be able to move the memory to
|
|
Pablo Greco |
40546a |
configured NUMA nodes. This might not be always possible (e.g.
|
|
Pablo Greco |
40546a |
qemu might lock some parts of its memory) and is very suboptimal
|
|
Pablo Greco |
40546a |
(copying large memory between NUMA nodes takes significant amount
|
|
Pablo Greco |
40546a |
of time).
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
The solution is to set affinity to one of (in priority order):
|
|
Pablo Greco |
40546a |
- The CPUs associated with NUMA memory affinity mask
|
|
Pablo Greco |
40546a |
- The CPUs associated with emulator pinning
|
|
Pablo Greco |
40546a |
- All online host CPUs
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
Later (once QEMU has allocated its memory) we then change this
|
|
Pablo Greco |
40546a |
again to (again in priority order):
|
|
Pablo Greco |
40546a |
- The CPUs associated with emulator pinning
|
|
Pablo Greco |
40546a |
- The CPUs returned by numad
|
|
Pablo Greco |
40546a |
- The CPUs associated with vCPU pinning
|
|
Pablo Greco |
40546a |
- All online host CPUs
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
|
|
Pablo Greco |
40546a |
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
|
|
Pablo Greco |
40546a |
(cherry picked from commit f136b83139c63f20de0df3285d9e82df2fb97bfc)
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
RHEL-8.1.0: https://bugzilla.redhat.com/show_bug.cgi?id=1716943
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
|
|
Pablo Greco |
40546a |
Message-Id: <c5f31a30daef2be65dc404ab0f1fbfb15be0d062.1559727075.git.mprivozn@redhat.com>
|
|
Pablo Greco |
40546a |
Reviewed-by: Andrea Bolognani <abologna@redhat.com>
|
|
Pablo Greco |
40546a |
---
|
|
Pablo Greco |
40546a |
src/qemu/qemu_process.c | 132 +++++++++++++++++++---------------------
|
|
Pablo Greco |
40546a |
1 file changed, 63 insertions(+), 69 deletions(-)
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
|
|
Pablo Greco |
40546a |
index 2d2954ba18..6071b3ba3d 100644
|
|
Pablo Greco |
40546a |
--- a/src/qemu/qemu_process.c
|
|
Pablo Greco |
40546a |
+++ b/src/qemu/qemu_process.c
|
|
Pablo Greco |
40546a |
@@ -2335,6 +2335,21 @@ qemuProcessDetectIOThreadPIDs(virQEMUDriverPtr driver,
|
|
Pablo Greco |
40546a |
}
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
+static int
|
|
Pablo Greco |
40546a |
+qemuProcessGetAllCpuAffinity(virBitmapPtr *cpumapRet)
|
|
Pablo Greco |
40546a |
+{
|
|
Pablo Greco |
40546a |
+ *cpumapRet = NULL;
|
|
Pablo Greco |
40546a |
+
|
|
Pablo Greco |
40546a |
+ if (!virHostCPUHasBitmap())
|
|
Pablo Greco |
40546a |
+ return 0;
|
|
Pablo Greco |
40546a |
+
|
|
Pablo Greco |
40546a |
+ if (!(*cpumapRet = virHostCPUGetOnlineBitmap()))
|
|
Pablo Greco |
40546a |
+ return -1;
|
|
Pablo Greco |
40546a |
+
|
|
Pablo Greco |
40546a |
+ return 0;
|
|
Pablo Greco |
40546a |
+}
|
|
Pablo Greco |
40546a |
+
|
|
Pablo Greco |
40546a |
+
|
|
Pablo Greco |
40546a |
/*
|
|
Pablo Greco |
40546a |
* To be run between fork/exec of QEMU only
|
|
Pablo Greco |
40546a |
*/
|
|
Pablo Greco |
40546a |
@@ -2342,9 +2357,9 @@ static int
|
|
Pablo Greco |
40546a |
qemuProcessInitCpuAffinity(virDomainObjPtr vm)
|
|
Pablo Greco |
40546a |
{
|
|
Pablo Greco |
40546a |
int ret = -1;
|
|
Pablo Greco |
40546a |
- virBitmapPtr cpumap = NULL;
|
|
Pablo Greco |
40546a |
virBitmapPtr cpumapToSet = NULL;
|
|
Pablo Greco |
40546a |
- virBitmapPtr hostcpumap = NULL;
|
|
Pablo Greco |
40546a |
+ VIR_AUTOPTR(virBitmap) hostcpumap = NULL;
|
|
Pablo Greco |
40546a |
+ virDomainNumatuneMemMode mem_mode;
|
|
Pablo Greco |
40546a |
qemuDomainObjPrivatePtr priv = vm->privateData;
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
if (!vm->pid) {
|
|
Pablo Greco |
40546a |
@@ -2353,59 +2368,39 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm)
|
|
Pablo Greco |
40546a |
return -1;
|
|
Pablo Greco |
40546a |
}
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
- if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
|
|
Pablo Greco |
40546a |
- VIR_DEBUG("Set CPU affinity with advisory nodeset from numad");
|
|
Pablo Greco |
40546a |
- cpumapToSet = priv->autoCpuset;
|
|
Pablo Greco |
40546a |
+ /* Here is the deal, we can't set cpuset.mems before qemu is
|
|
Pablo Greco |
40546a |
+ * started as it clashes with KVM allocation. Therefore, we
|
|
Pablo Greco |
40546a |
+ * used to let qemu allocate its memory anywhere as we would
|
|
Pablo Greco |
40546a |
+ * then move the memory to desired NUMA node via CGroups.
|
|
Pablo Greco |
40546a |
+ * However, that might not be always possible because qemu
|
|
Pablo Greco |
40546a |
+ * might lock some parts of its memory (e.g. due to VFIO).
|
|
Pablo Greco |
40546a |
+     * Even if it is possible, memory has to be copied between NUMA
|
|
Pablo Greco |
40546a |
+ * nodes which is suboptimal.
|
|
Pablo Greco |
40546a |
+ * Solution is to set affinity that matches the best what we
|
|
Pablo Greco |
40546a |
+ * would have set in CGroups and then fix it later, once qemu
|
|
Pablo Greco |
40546a |
+ * is already running. */
|
|
Pablo Greco |
40546a |
+ if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
|
|
Pablo Greco |
40546a |
+ virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
|
|
Pablo Greco |
40546a |
+ mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
|
|
Pablo Greco |
40546a |
+ if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
|
|
Pablo Greco |
40546a |
+ priv->autoNodeset,
|
|
Pablo Greco |
40546a |
+ &cpumapToSet,
|
|
Pablo Greco |
40546a |
+ -1) < 0)
|
|
Pablo Greco |
40546a |
+ goto cleanup;
|
|
Pablo Greco |
40546a |
+ } else if (vm->def->cputune.emulatorpin) {
|
|
Pablo Greco |
40546a |
+ cpumapToSet = vm->def->cputune.emulatorpin;
|
|
Pablo Greco |
40546a |
} else {
|
|
Pablo Greco |
40546a |
- VIR_DEBUG("Set CPU affinity with specified cpuset");
|
|
Pablo Greco |
40546a |
- if (vm->def->cpumask) {
|
|
Pablo Greco |
40546a |
- cpumapToSet = vm->def->cpumask;
|
|
Pablo Greco |
40546a |
- } else {
|
|
Pablo Greco |
40546a |
- /* You may think this is redundant, but we can't assume libvirtd
|
|
Pablo Greco |
40546a |
- * itself is running on all pCPUs, so we need to explicitly set
|
|
Pablo Greco |
40546a |
- * the spawned QEMU instance to all pCPUs if no map is given in
|
|
Pablo Greco |
40546a |
- * its config file */
|
|
Pablo Greco |
40546a |
- int hostcpus;
|
|
Pablo Greco |
40546a |
-
|
|
Pablo Greco |
40546a |
- if (virHostCPUHasBitmap()) {
|
|
Pablo Greco |
40546a |
- hostcpumap = virHostCPUGetOnlineBitmap();
|
|
Pablo Greco |
40546a |
- cpumap = virProcessGetAffinity(vm->pid);
|
|
Pablo Greco |
40546a |
- }
|
|
Pablo Greco |
40546a |
-
|
|
Pablo Greco |
40546a |
- if (hostcpumap && cpumap && virBitmapEqual(hostcpumap, cpumap)) {
|
|
Pablo Greco |
40546a |
- /* we're using all available CPUs, no reason to set
|
|
Pablo Greco |
40546a |
- * mask. If libvirtd is running without explicit
|
|
Pablo Greco |
40546a |
- * affinity, we can use hotplugged CPUs for this VM */
|
|
Pablo Greco |
40546a |
- ret = 0;
|
|
Pablo Greco |
40546a |
- goto cleanup;
|
|
Pablo Greco |
40546a |
- } else {
|
|
Pablo Greco |
40546a |
- /* setaffinity fails if you set bits for CPUs which
|
|
Pablo Greco |
40546a |
- * aren't present, so we have to limit ourselves */
|
|
Pablo Greco |
40546a |
- if ((hostcpus = virHostCPUGetCount()) < 0)
|
|
Pablo Greco |
40546a |
- goto cleanup;
|
|
Pablo Greco |
40546a |
-
|
|
Pablo Greco |
40546a |
- if (hostcpus > QEMUD_CPUMASK_LEN)
|
|
Pablo Greco |
40546a |
- hostcpus = QEMUD_CPUMASK_LEN;
|
|
Pablo Greco |
40546a |
-
|
|
Pablo Greco |
40546a |
- virBitmapFree(cpumap);
|
|
Pablo Greco |
40546a |
- if (!(cpumap = virBitmapNew(hostcpus)))
|
|
Pablo Greco |
40546a |
- goto cleanup;
|
|
Pablo Greco |
40546a |
-
|
|
Pablo Greco |
40546a |
- virBitmapSetAll(cpumap);
|
|
Pablo Greco |
40546a |
-
|
|
Pablo Greco |
40546a |
- cpumapToSet = cpumap;
|
|
Pablo Greco |
40546a |
- }
|
|
Pablo Greco |
40546a |
- }
|
|
Pablo Greco |
40546a |
+ if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
|
|
Pablo Greco |
40546a |
+ goto cleanup;
|
|
Pablo Greco |
40546a |
+ cpumapToSet = hostcpumap;
|
|
Pablo Greco |
40546a |
}
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
- if (virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
|
|
Pablo Greco |
40546a |
+ if (cpumapToSet &&
|
|
Pablo Greco |
40546a |
+ virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
|
|
Pablo Greco |
40546a |
goto cleanup;
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
ret = 0;
|
|
Pablo Greco |
40546a |
-
|
|
Pablo Greco |
40546a |
cleanup:
|
|
Pablo Greco |
40546a |
- virBitmapFree(cpumap);
|
|
Pablo Greco |
40546a |
- virBitmapFree(hostcpumap);
|
|
Pablo Greco |
40546a |
return ret;
|
|
Pablo Greco |
40546a |
}
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
@@ -2478,7 +2473,8 @@ qemuProcessSetupPid(virDomainObjPtr vm,
|
|
Pablo Greco |
40546a |
qemuDomainObjPrivatePtr priv = vm->privateData;
|
|
Pablo Greco |
40546a |
virDomainNumatuneMemMode mem_mode;
|
|
Pablo Greco |
40546a |
virCgroupPtr cgroup = NULL;
|
|
Pablo Greco |
40546a |
- virBitmapPtr use_cpumask;
|
|
Pablo Greco |
40546a |
+ virBitmapPtr use_cpumask = NULL;
|
|
Pablo Greco |
40546a |
+ VIR_AUTOPTR(virBitmap) hostcpumap = NULL;
|
|
Pablo Greco |
40546a |
char *mem_mask = NULL;
|
|
Pablo Greco |
40546a |
int ret = -1;
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
@@ -2490,12 +2486,21 @@ qemuProcessSetupPid(virDomainObjPtr vm,
|
|
Pablo Greco |
40546a |
}
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
/* Infer which cpumask shall be used. */
|
|
Pablo Greco |
40546a |
- if (cpumask)
|
|
Pablo Greco |
40546a |
+ if (cpumask) {
|
|
Pablo Greco |
40546a |
use_cpumask = cpumask;
|
|
Pablo Greco |
40546a |
- else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
|
|
Pablo Greco |
40546a |
+ } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
|
|
Pablo Greco |
40546a |
use_cpumask = priv->autoCpuset;
|
|
Pablo Greco |
40546a |
- else
|
|
Pablo Greco |
40546a |
+ } else if (vm->def->cpumask) {
|
|
Pablo Greco |
40546a |
use_cpumask = vm->def->cpumask;
|
|
Pablo Greco |
40546a |
+ } else {
|
|
Pablo Greco |
40546a |
+ /* You may think this is redundant, but we can't assume libvirtd
|
|
Pablo Greco |
40546a |
+ * itself is running on all pCPUs, so we need to explicitly set
|
|
Pablo Greco |
40546a |
+ * the spawned QEMU instance to all pCPUs if no map is given in
|
|
Pablo Greco |
40546a |
+ * its config file */
|
|
Pablo Greco |
40546a |
+ if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
|
|
Pablo Greco |
40546a |
+ goto cleanup;
|
|
Pablo Greco |
40546a |
+ use_cpumask = hostcpumap;
|
|
Pablo Greco |
40546a |
+ }
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
/*
|
|
Pablo Greco |
40546a |
* If CPU cgroup controller is not initialized here, then we need
|
|
Pablo Greco |
40546a |
@@ -2520,13 +2525,7 @@ qemuProcessSetupPid(virDomainObjPtr vm,
|
|
Pablo Greco |
40546a |
qemuSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
|
|
Pablo Greco |
40546a |
goto cleanup;
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
- /*
|
|
Pablo Greco |
40546a |
- * Don't setup cpuset.mems for the emulator, they need to
|
|
Pablo Greco |
40546a |
- * be set up after initialization in order for kvm
|
|
Pablo Greco |
40546a |
- * allocations to succeed.
|
|
Pablo Greco |
40546a |
- */
|
|
Pablo Greco |
40546a |
- if (nameval != VIR_CGROUP_THREAD_EMULATOR &&
|
|
Pablo Greco |
40546a |
- mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
|
|
Pablo Greco |
40546a |
+ if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
|
|
Pablo Greco |
40546a |
goto cleanup;
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
}
|
|
Pablo Greco |
40546a |
@@ -6440,12 +6439,7 @@ qemuProcessLaunch(virConnectPtr conn,
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
/* This must be done after cgroup placement to avoid resetting CPU
|
|
Pablo Greco |
40546a |
* affinity */
|
|
Pablo Greco |
40546a |
- if (!vm->def->cputune.emulatorpin &&
|
|
Pablo Greco |
40546a |
- qemuProcessInitCpuAffinity(vm) < 0)
|
|
Pablo Greco |
40546a |
- goto cleanup;
|
|
Pablo Greco |
40546a |
-
|
|
Pablo Greco |
40546a |
- VIR_DEBUG("Setting emulator tuning/settings");
|
|
Pablo Greco |
40546a |
- if (qemuProcessSetupEmulator(vm) < 0)
|
|
Pablo Greco |
40546a |
+ if (qemuProcessInitCpuAffinity(vm) < 0)
|
|
Pablo Greco |
40546a |
goto cleanup;
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
VIR_DEBUG("Setting cgroup for external devices (if required)");
|
|
Pablo Greco |
40546a |
@@ -6514,10 +6508,6 @@ qemuProcessLaunch(virConnectPtr conn,
|
|
Pablo Greco |
40546a |
if (qemuProcessUpdateAndVerifyCPU(driver, vm, asyncJob) < 0)
|
|
Pablo Greco |
40546a |
goto cleanup;
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
- VIR_DEBUG("Setting up post-init cgroup restrictions");
|
|
Pablo Greco |
40546a |
- if (qemuSetupCpusetMems(vm) < 0)
|
|
Pablo Greco |
40546a |
- goto cleanup;
|
|
Pablo Greco |
40546a |
-
|
|
Pablo Greco |
40546a |
VIR_DEBUG("setting up hotpluggable cpus");
|
|
Pablo Greco |
40546a |
if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) {
|
|
Pablo Greco |
40546a |
if (qemuDomainRefreshVcpuInfo(driver, vm, asyncJob, false) < 0)
|
|
Pablo Greco |
40546a |
@@ -6543,6 +6533,10 @@ qemuProcessLaunch(virConnectPtr conn,
|
|
Pablo Greco |
40546a |
if (qemuProcessDetectIOThreadPIDs(driver, vm, asyncJob) < 0)
|
|
Pablo Greco |
40546a |
goto cleanup;
|
|
Pablo Greco |
40546a |
|
|
Pablo Greco |
40546a |
+ VIR_DEBUG("Setting emulator tuning/settings");
|
|
Pablo Greco |
40546a |
+ if (qemuProcessSetupEmulator(vm) < 0)
|
|
Pablo Greco |
40546a |
+ goto cleanup;
|
|
Pablo Greco |
40546a |
+
|
|
Pablo Greco |
40546a |
VIR_DEBUG("Setting global CPU cgroup (if required)");
|
|
Pablo Greco |
40546a |
if (qemuSetupGlobalCpuCgroup(vm) < 0)
|
|
Pablo Greco |
40546a |
goto cleanup;
|
|
Pablo Greco |
40546a |
--
|
|
Pablo Greco |
40546a |
2.22.0
|
|
Pablo Greco |
40546a |
|