016a62
From 1cfbcbeebc6d9ca1f1f7656fff572bf6ac50de76 Mon Sep 17 00:00:00 2001
016a62
From: "plai@redhat.com" <plai@redhat.com>
016a62
Date: Tue, 26 Nov 2019 19:36:52 +0000
016a62
Subject: [PATCH 08/11] kvm: support -overcommit cpu-pm=on|off
016a62
016a62
RH-Author: plai@redhat.com
016a62
Message-id: <1574797015-32564-5-git-send-email-plai@redhat.com>
016a62
Patchwork-id: 92697
016a62
O-Subject: [RHEL8.2 qemu-kvm PATCH 4/7] kvm: support -overcommit cpu-pm=on|off
016a62
Bugzilla: 1634827
016a62
RH-Acked-by: Eduardo Habkost <ehabkost@redhat.com>
016a62
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
016a62
RH-Acked-by: Igor Mammedov <imammedo@redhat.com>
016a62
016a62
From: "Michael S. Tsirkin" <mst@redhat.com>
016a62
016a62
With this flag, kvm allows guest to control host CPU power state.  This
016a62
increases latency for other processes using same host CPU in an
016a62
unpredictable way, but it decreases idle entry/exit times for the
016a62
running VCPU, so to use it QEMU needs a hint about whether host CPU is
016a62
overcommitted, hence the flag name.
016a62
016a62
Follow-up patches will expose this capability to guest
016a62
(using mwait leaf).
016a62
016a62
Based on a patch by Wanpeng Li <kernellwp@gmail.com>.
016a62
016a62
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
016a62
Message-Id: <20180622192148.178309-2-mst@redhat.com>
016a62
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
016a62
(cherry picked from commit 6f131f13e68d648a8e4f083c667ab1acd88ce4cd)
016a62
Signed-off-by: Paul Lai <plai@redhat.com>
016a62
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
016a62
---
016a62
 include/sysemu/sysemu.h |  1 +
016a62
 qemu-options.hx         | 24 ++++++++++++++++++++++++
016a62
 target/i386/kvm.c       | 23 +++++++++++++++++++++++
016a62
 vl.c                    | 32 +++++++++++++++++++++++++++++++-
016a62
 4 files changed, 79 insertions(+), 1 deletion(-)
016a62
016a62
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
016a62
index f20e4f5..f38fad0 100644
016a62
--- a/include/sysemu/sysemu.h
016a62
+++ b/include/sysemu/sysemu.h
016a62
@@ -131,6 +131,7 @@ extern bool boot_strict;
016a62
 extern uint8_t *boot_splash_filedata;
016a62
 extern size_t boot_splash_filedata_size;
016a62
 extern bool enable_mlock;
016a62
+extern bool enable_cpu_pm;
016a62
 extern uint8_t qemu_extra_params_fw[2];
016a62
 extern QEMUClockType rtc_clock;
016a62
 extern const char *mem_path;
016a62
diff --git a/qemu-options.hx b/qemu-options.hx
016a62
index 1243057..99933a0 100644
016a62
--- a/qemu-options.hx
016a62
+++ b/qemu-options.hx
016a62
@@ -3331,6 +3331,30 @@ mlocking qemu-kvm and guest memory can be enabled via @option{mlock=on}
016a62
 (enabled by default).
016a62
 ETEXI
016a62
 
016a62
+DEF("overcommit", HAS_ARG, QEMU_OPTION_overcommit,
016a62
+    "--overcommit [mem-lock=on|off][cpu-pm=on|off]\n"
016a62
+    "                run qemu with overcommit hints\n"
016a62
+    "                mem-lock=on|off controls memory lock support (default: off)\n"
016a62
+    "                cpu-pm=on|off controls cpu power management (default: off)\n",
016a62
+    QEMU_ARCH_ALL)
016a62
+STEXI
016a62
+@item -overcommit mem-lock=on|off
016a62
+@item -overcommit cpu-pm=on|off
016a62
+@findex -overcommit
016a62
+Run qemu with hints about host resource overcommit. The default is
016a62
+to assume that host overcommits all resources.
016a62
+
016a62
+Locking qemu and guest memory can be enabled via @option{mem-lock=on} (disabled
016a62
+by default).  This works when host memory is not overcommitted and reduces the
016a62
+worst-case latency for guest.  This is equivalent to @option{realtime}.
016a62
+
016a62
+Guest ability to manage power state of host cpus (increasing latency for other
016a62
+processes on the same host cpu, but decreasing latency for guest) can be
016a62
+enabled via @option{cpu-pm=on} (disabled by default).  This works best when
016a62
+host CPU is not overcommitted. When used, host estimates of CPU cycle and power
016a62
+utilization will be incorrect, not taking into account guest idle time.
016a62
+ETEXI
016a62
+
016a62
 DEF("gdb", HAS_ARG, QEMU_OPTION_gdb, \
016a62
     "-gdb dev        wait for gdb connection on 'dev'\n", QEMU_ARCH_ALL)
016a62
 STEXI
016a62
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
016a62
index 107c53b..879c3e0 100644
016a62
--- a/target/i386/kvm.c
016a62
+++ b/target/i386/kvm.c
016a62
@@ -1606,6 +1606,29 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
016a62
         smram_machine_done.notify = register_smram_listener;
016a62
         qemu_add_machine_init_done_notifier(&smram_machine_done);
016a62
     }
016a62
+
016a62
+    if (enable_cpu_pm) {
016a62
+        int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
016a62
+        int ret;
016a62
+
016a62
+/* Work around for kernel header with a typo. TODO: fix header and drop. */
016a62
+#if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
016a62
+#define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
016a62
+#endif
016a62
+        if (disable_exits) {
016a62
+            disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
016a62
+                              KVM_X86_DISABLE_EXITS_HLT |
016a62
+                              KVM_X86_DISABLE_EXITS_PAUSE);
016a62
+        }
016a62
+
016a62
+        ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
016a62
+                                disable_exits);
016a62
+        if (ret < 0) {
016a62
+            error_report("kvm: guest stopping CPU not supported: %s",
016a62
+                         strerror(-ret));
016a62
+        }
016a62
+    }
016a62
+
016a62
     return 0;
016a62
 }
016a62
 
016a62
diff --git a/vl.c b/vl.c
016a62
index 932c1cf..aa08ab5 100644
016a62
--- a/vl.c
016a62
+++ b/vl.c
016a62
@@ -150,6 +150,7 @@ ram_addr_t ram_size;
016a62
 const char *mem_path = NULL;
016a62
 int mem_prealloc = 0; /* force preallocation of physical target memory */
016a62
 bool enable_mlock = false;
016a62
+bool enable_cpu_pm = false;
016a62
 int nb_nics;
016a62
 NICInfo nd_table[MAX_NICS];
016a62
 int autostart;
016a62
@@ -428,6 +429,22 @@ static QemuOptsList qemu_realtime_opts = {
016a62
     },
016a62
 };
016a62
 
016a62
+static QemuOptsList qemu_overcommit_opts = {
016a62
+    .name = "overcommit",
016a62
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_overcommit_opts.head),
016a62
+    .desc = {
016a62
+        {
016a62
+            .name = "mem-lock",
016a62
+            .type = QEMU_OPT_BOOL,
016a62
+        },
016a62
+        {
016a62
+            .name = "cpu-pm",
016a62
+            .type = QEMU_OPT_BOOL,
016a62
+        },
016a62
+        { /* end of list */ }
016a62
+    },
016a62
+};
016a62
+
016a62
 static QemuOptsList qemu_msg_opts = {
016a62
     .name = "msg",
016a62
     .head = QTAILQ_HEAD_INITIALIZER(qemu_msg_opts.head),
016a62
@@ -4089,7 +4106,20 @@ int main(int argc, char **argv, char **envp)
016a62
                 if (!opts) {
016a62
                     exit(1);
016a62
                 }
016a62
-                enable_mlock = qemu_opt_get_bool(opts, "mlock", true);
016a62
+                /* Don't override the -overcommit option if set */
016a62
+                enable_mlock = enable_mlock ||
016a62
+                    qemu_opt_get_bool(opts, "mlock", true);
016a62
+                break;
016a62
+            case QEMU_OPTION_overcommit:
016a62
+                opts = qemu_opts_parse_noisily(qemu_find_opts("overcommit"),
016a62
+                                               optarg, false);
016a62
+                if (!opts) {
016a62
+                    exit(1);
016a62
+                }
016a62
+                /* Don't override the -realtime option if set */
016a62
+                enable_mlock = enable_mlock ||
016a62
+                    qemu_opt_get_bool(opts, "mem-lock", false);
016a62
+                enable_cpu_pm = qemu_opt_get_bool(opts, "cpu-pm", false);
016a62
                 break;
016a62
             case QEMU_OPTION_msg:
016a62
                 opts = qemu_opts_parse_noisily(qemu_find_opts("msg"), optarg,
016a62
-- 
016a62
1.8.3.1
016a62