Blame SOURCES/kvm-target-i386-kvm-Add-support-for-KVM_CAP_EXCEPTION_PA.patch

4ec855
From 05a54f3fc44598f917d72a1f2570c43ec042cdb8 Mon Sep 17 00:00:00 2001
4ec855
From: Paolo Bonzini <pbonzini@redhat.com>
4ec855
Date: Mon, 22 Jul 2019 18:22:16 +0100
4ec855
Subject: [PATCH 35/39] target/i386: kvm: Add support for
4ec855
 KVM_CAP_EXCEPTION_PAYLOAD
4ec855
4ec855
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
4ec855
Message-id: <20190722182220.19374-15-pbonzini@redhat.com>
4ec855
Patchwork-id: 89631
4ec855
O-Subject: [RHEL-8.1.0 PATCH qemu-kvm v3 14/18] target/i386: kvm: Add support for KVM_CAP_EXCEPTION_PAYLOAD
4ec855
Bugzilla: 1689269
4ec855
RH-Acked-by: Peter Xu <zhexu@redhat.com>
4ec855
RH-Acked-by: Laurent Vivier <lvivier@redhat.com>
4ec855
RH-Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
4ec855
4ec855
From: Liran Alon <liran.alon@oracle.com>
4ec855
4ec855
Kernel commit c4f55198c7c2 ("kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD")
4ec855
introduced a new KVM capability which allows userspace to correctly
4ec855
distinguish between pending and injected exceptions.
4ec855
4ec855
This distinction is important in case of nested virtualization scenarios
4ec855
because a L2 pending exception can still be intercepted by the L1 hypervisor
4ec855
while a L2 injected exception cannot.
4ec855
4ec855
Furthermore, when an exception is attempted to be injected by QEMU,
4ec855
QEMU should specify the exception payload (CR2 in case of #PF or
4ec855
DR6 in case of #DB) instead of having the payload already delivered in
4ec855
the respective vCPU register. Because in case exception is injected to
4ec855
L2 guest and is intercepted by L1 hypervisor, then payload needs to be
4ec855
reported to L1 intercept (VMExit handler) while still preserving
4ec855
respective vCPU register unchanged.
4ec855
4ec855
This commit adds support for QEMU to properly utilise this new KVM
4ec855
capability (KVM_CAP_EXCEPTION_PAYLOAD).
4ec855
4ec855
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
4ec855
Signed-off-by: Liran Alon <liran.alon@oracle.com>
4ec855
Message-Id: <20190619162140.133674-10-liran.alon@oracle.com>
4ec855
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
4ec855
(cherry picked from commit fd13f23b8c95311eff74426921557eee592b0ed3)
4ec855
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
4ec855
---
4ec855
 target/i386/cpu.c        |   6 ++-
4ec855
 target/i386/cpu.h        |   6 ++-
4ec855
 target/i386/hvf/hvf.c    |  10 +++--
4ec855
 target/i386/hvf/x86hvf.c |   4 +-
4ec855
 target/i386/kvm.c        | 101 +++++++++++++++++++++++++++++++++++++++--------
4ec855
 target/i386/machine.c    |  84 ++++++++++++++++++++++++++++++++++++++-
4ec855
 6 files changed, 187 insertions(+), 24 deletions(-)
4ec855
4ec855
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
4ec855
index bd0b784..f71b044 100644
4ec855
--- a/target/i386/cpu.c
4ec855
+++ b/target/i386/cpu.c
4ec855
@@ -4645,7 +4645,11 @@ static void x86_cpu_reset(CPUState *s)
4ec855
     memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed));
4ec855
 
4ec855
     env->interrupt_injected = -1;
4ec855
-    env->exception_injected = -1;
4ec855
+    env->exception_nr = -1;
4ec855
+    env->exception_pending = 0;
4ec855
+    env->exception_injected = 0;
4ec855
+    env->exception_has_payload = false;
4ec855
+    env->exception_payload = 0;
4ec855
     env->nmi_injected = false;
4ec855
 #if !defined(CONFIG_USER_ONLY)
4ec855
     /* We hard-wire the BSP to the first CPU. */
4ec855
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
4ec855
index 86f3d98..d120f62 100644
4ec855
--- a/target/i386/cpu.h
4ec855
+++ b/target/i386/cpu.h
4ec855
@@ -1325,10 +1325,14 @@ typedef struct CPUX86State {
4ec855
 
4ec855
     /* For KVM */
4ec855
     uint32_t mp_state;
4ec855
-    int32_t exception_injected;
4ec855
+    int32_t exception_nr;
4ec855
     int32_t interrupt_injected;
4ec855
     uint8_t soft_interrupt;
4ec855
+    uint8_t exception_pending;
4ec855
+    uint8_t exception_injected;
4ec855
     uint8_t has_error_code;
4ec855
+    uint8_t exception_has_payload;
4ec855
+    uint64_t exception_payload;
4ec855
     uint32_t ins_len;
4ec855
     uint32_t sipi_vector;
4ec855
     bool tsc_valid;
4ec855
diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
4ec855
index c367539..acc0bb9 100644
4ec855
--- a/target/i386/hvf/hvf.c
4ec855
+++ b/target/i386/hvf/hvf.c
4ec855
@@ -617,7 +617,9 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
4ec855
     X86CPU *x86_cpu = X86_CPU(cpu);
4ec855
     CPUX86State *env = &x86_cpu->env;
4ec855
 
4ec855
-    env->exception_injected = -1;
4ec855
+    env->exception_nr = -1;
4ec855
+    env->exception_pending = 0;
4ec855
+    env->exception_injected = 0;
4ec855
     env->interrupt_injected = -1;
4ec855
     env->nmi_injected = false;
4ec855
     if (idtvec_info & VMCS_IDT_VEC_VALID) {
4ec855
@@ -631,7 +633,8 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
4ec855
             break;
4ec855
         case VMCS_IDT_VEC_HWEXCEPTION:
4ec855
         case VMCS_IDT_VEC_SWEXCEPTION:
4ec855
-            env->exception_injected = idtvec_info & VMCS_IDT_VEC_VECNUM;
4ec855
+            env->exception_nr = idtvec_info & VMCS_IDT_VEC_VECNUM;
4ec855
+            env->exception_injected = 1;
4ec855
             break;
4ec855
         case VMCS_IDT_VEC_PRIV_SWEXCEPTION:
4ec855
         default:
4ec855
@@ -925,7 +928,8 @@ int hvf_vcpu_exec(CPUState *cpu)
4ec855
             macvm_set_rip(cpu, rip + ins_len);
4ec855
             break;
4ec855
         case VMX_REASON_VMCALL:
4ec855
-            env->exception_injected = EXCP0D_GPF;
4ec855
+            env->exception_nr = EXCP0D_GPF;
4ec855
+            env->exception_injected = 1;
4ec855
             env->has_error_code = true;
4ec855
             env->error_code = 0;
4ec855
             break;
4ec855
diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c
4ec855
index 6c88939..f0e58a8 100644
4ec855
--- a/target/i386/hvf/x86hvf.c
4ec855
+++ b/target/i386/hvf/x86hvf.c
4ec855
@@ -362,8 +362,8 @@ bool hvf_inject_interrupts(CPUState *cpu_state)
4ec855
     if (env->interrupt_injected != -1) {
4ec855
         vector = env->interrupt_injected;
4ec855
         intr_type = VMCS_INTR_T_SWINTR;
4ec855
-    } else if (env->exception_injected != -1) {
4ec855
-        vector = env->exception_injected;
4ec855
+    } else if (env->exception_nr != -1) {
4ec855
+        vector = env->exception_nr;
4ec855
         if (vector == EXCP03_INT3 || vector == EXCP04_INTO) {
4ec855
             intr_type = VMCS_INTR_T_SWEXCEPTION;
4ec855
         } else {
4ec855
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
4ec855
index ddceb7d..aa2d589 100644
4ec855
--- a/target/i386/kvm.c
4ec855
+++ b/target/i386/kvm.c
4ec855
@@ -103,6 +103,7 @@ static uint32_t num_architectural_pmu_fixed_counters;
4ec855
 static int has_xsave;
4ec855
 static int has_xcrs;
4ec855
 static int has_pit_state2;
4ec855
+static int has_exception_payload;
4ec855
 
4ec855
 static bool has_msr_mcg_ext_ctl;
4ec855
 
4ec855
@@ -569,15 +570,56 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
4ec855
     /* Hope we are lucky for AO MCE */
4ec855
 }
4ec855
 
4ec855
+static void kvm_reset_exception(CPUX86State *env)
4ec855
+{
4ec855
+    env->exception_nr = -1;
4ec855
+    env->exception_pending = 0;
4ec855
+    env->exception_injected = 0;
4ec855
+    env->exception_has_payload = false;
4ec855
+    env->exception_payload = 0;
4ec855
+}
4ec855
+
4ec855
+static void kvm_queue_exception(CPUX86State *env,
4ec855
+                                int32_t exception_nr,
4ec855
+                                uint8_t exception_has_payload,
4ec855
+                                uint64_t exception_payload)
4ec855
+{
4ec855
+    assert(env->exception_nr == -1);
4ec855
+    assert(!env->exception_pending);
4ec855
+    assert(!env->exception_injected);
4ec855
+    assert(!env->exception_has_payload);
4ec855
+
4ec855
+    env->exception_nr = exception_nr;
4ec855
+
4ec855
+    if (has_exception_payload) {
4ec855
+        env->exception_pending = 1;
4ec855
+
4ec855
+        env->exception_has_payload = exception_has_payload;
4ec855
+        env->exception_payload = exception_payload;
4ec855
+    } else {
4ec855
+        env->exception_injected = 1;
4ec855
+
4ec855
+        if (exception_nr == EXCP01_DB) {
4ec855
+            assert(exception_has_payload);
4ec855
+            env->dr[6] = exception_payload;
4ec855
+        } else if (exception_nr == EXCP0E_PAGE) {
4ec855
+            assert(exception_has_payload);
4ec855
+            env->cr[2] = exception_payload;
4ec855
+        } else {
4ec855
+            assert(!exception_has_payload);
4ec855
+        }
4ec855
+    }
4ec855
+}
4ec855
+
4ec855
 static int kvm_inject_mce_oldstyle(X86CPU *cpu)
4ec855
 {
4ec855
     CPUX86State *env = &cpu->env;
4ec855
 
4ec855
-    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
4ec855
+    if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
4ec855
         unsigned int bank, bank_num = env->mcg_cap & 0xff;
4ec855
         struct kvm_x86_mce mce;
4ec855
 
4ec855
-        env->exception_injected = -1;
4ec855
+        kvm_reset_exception(env);
4ec855
 
4ec855
         /*
4ec855
          * There must be at least one bank in use if an MCE is pending.
4ec855
@@ -1458,6 +1500,16 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
4ec855
     has_pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
4ec855
 #endif
4ec855
 
4ec855
+    has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
4ec855
+    if (has_exception_payload) {
4ec855
+        ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
4ec855
+        if (ret < 0) {
4ec855
+            error_report("kvm: Failed to enable exception payload cap: %s",
4ec855
+                         strerror(-ret));
4ec855
+            return ret;
4ec855
+        }
4ec855
+    }
4ec855
+
4ec855
     ret = kvm_get_supported_msrs(s);
4ec855
     if (ret < 0) {
4ec855
         return ret;
4ec855
@@ -2717,8 +2769,16 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
4ec855
         return 0;
4ec855
     }
4ec855
 
4ec855
-    events.exception.injected = (env->exception_injected >= 0);
4ec855
-    events.exception.nr = env->exception_injected;
4ec855
+    events.flags = 0;
4ec855
+
4ec855
+    if (has_exception_payload) {
4ec855
+        events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4ec855
+        events.exception.pending = env->exception_pending;
4ec855
+        events.exception_has_payload = env->exception_has_payload;
4ec855
+        events.exception_payload = env->exception_payload;
4ec855
+    }
4ec855
+    events.exception.nr = env->exception_nr;
4ec855
+    events.exception.injected = env->exception_injected;
4ec855
     events.exception.has_error_code = env->has_error_code;
4ec855
     events.exception.error_code = env->error_code;
4ec855
 
4ec855
@@ -2731,7 +2791,6 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
4ec855
     events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
4ec855
 
4ec855
     events.sipi_vector = env->sipi_vector;
4ec855
-    events.flags = 0;
4ec855
 
4ec855
     if (has_msr_smbase) {
4ec855
         events.smi.smm = !!(env->hflags & HF_SMM_MASK);
4ec855
@@ -2781,8 +2840,19 @@ static int kvm_get_vcpu_events(X86CPU *cpu)
4ec855
     if (ret < 0) {
4ec855
        return ret;
4ec855
     }
4ec855
-    env->exception_injected =
4ec855
-       events.exception.injected ? events.exception.nr : -1;
4ec855
+
4ec855
+    if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4ec855
+        env->exception_pending = events.exception.pending;
4ec855
+        env->exception_has_payload = events.exception_has_payload;
4ec855
+        env->exception_payload = events.exception_payload;
4ec855
+    } else {
4ec855
+        env->exception_pending = 0;
4ec855
+        env->exception_has_payload = false;
4ec855
+    }
4ec855
+    env->exception_injected = events.exception.injected;
4ec855
+    env->exception_nr =
4ec855
+        (env->exception_pending || env->exception_injected) ?
4ec855
+        events.exception.nr : -1;
4ec855
     env->has_error_code = events.exception.has_error_code;
4ec855
     env->error_code = events.exception.error_code;
4ec855
 
4ec855
@@ -2834,12 +2904,12 @@ static int kvm_guest_debug_workarounds(X86CPU *cpu)
4ec855
     unsigned long reinject_trap = 0;
4ec855
 
4ec855
     if (!kvm_has_vcpu_events()) {
4ec855
-        if (env->exception_injected == EXCP01_DB) {
4ec855
+        if (env->exception_nr == EXCP01_DB) {
4ec855
             reinject_trap = KVM_GUESTDBG_INJECT_DB;
4ec855
         } else if (env->exception_injected == EXCP03_INT3) {
4ec855
             reinject_trap = KVM_GUESTDBG_INJECT_BP;
4ec855
         }
4ec855
-        env->exception_injected = -1;
4ec855
+        kvm_reset_exception(env);
4ec855
     }
4ec855
 
4ec855
     /*
4ec855
@@ -3215,13 +3285,13 @@ int kvm_arch_process_async_events(CPUState *cs)
4ec855
 
4ec855
         kvm_cpu_synchronize_state(cs);
4ec855
 
4ec855
-        if (env->exception_injected == EXCP08_DBLE) {
4ec855
+        if (env->exception_nr == EXCP08_DBLE) {
4ec855
             /* this means triple fault */
4ec855
             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
4ec855
             cs->exit_request = 1;
4ec855
             return 0;
4ec855
         }
4ec855
-        env->exception_injected = EXCP12_MCHK;
4ec855
+        kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
4ec855
         env->has_error_code = 0;
4ec855
 
4ec855
         cs->halted = 0;
4ec855
@@ -3436,14 +3506,13 @@ static int kvm_handle_debug(X86CPU *cpu,
4ec855
     }
4ec855
     if (ret == 0) {
4ec855
         cpu_synchronize_state(cs);
4ec855
-        assert(env->exception_injected == -1);
4ec855
+        assert(env->exception_nr == -1);
4ec855
 
4ec855
         /* pass to guest */
4ec855
-        env->exception_injected = arch_info->exception;
4ec855
+        kvm_queue_exception(env, arch_info->exception,
4ec855
+                            arch_info->exception == EXCP01_DB,
4ec855
+                            arch_info->dr6);
4ec855
         env->has_error_code = 0;
4ec855
-        if (arch_info->exception == EXCP01_DB) {
4ec855
-            env->dr[6] = arch_info->dr6;
4ec855
-        }
4ec855
     }
4ec855
 
4ec855
     return ret;
4ec855
diff --git a/target/i386/machine.c b/target/i386/machine.c
4ec855
index a2ddbba..5ffee8f 100644
4ec855
--- a/target/i386/machine.c
4ec855
+++ b/target/i386/machine.c
4ec855
@@ -239,6 +239,41 @@ static int cpu_pre_save(void *opaque)
4ec855
     }
4ec855
 #endif
4ec855
 
4ec855
+    /*
4ec855
+     * When vCPU is running L2 and exception is still pending,
4ec855
+     * it can potentially be intercepted by L1 hypervisor.
4ec855
+     * In contrast to an injected exception which cannot be
4ec855
+     * intercepted anymore.
4ec855
+     *
4ec855
+     * Furthermore, when a L2 exception is intercepted by L1
4ec855
+     * hypervisor, its exception payload (CR2/DR6 on #PF/#DB)
4ec855
+     * should not be set yet in the respective vCPU register.
4ec855
+     * Thus, in case an exception is pending, it is
4ec855
+     * important to save the exception payload separately.
4ec855
+     *
4ec855
+     * Therefore, if an exception is not in a pending state
4ec855
+     * or vCPU is not in guest-mode, it is not important to
4ec855
+     * distinguish between a pending and injected exception
4ec855
+     * and we don't need to store the exception payload separately.
4ec855
+     *
4ec855
+     * In order to preserve better backwards-compatible migration,
4ec855
+     * convert a pending exception to an injected exception in
4ec855
+     * case it is not important to distinguish between them
4ec855
+     * as described above.
4ec855
+     */
4ec855
+    if (env->exception_pending && !(env->hflags & HF_GUEST_MASK)) {
4ec855
+        env->exception_pending = 0;
4ec855
+        env->exception_injected = 1;
4ec855
+
4ec855
+        if (env->exception_has_payload) {
4ec855
+            if (env->exception_nr == EXCP01_DB) {
4ec855
+                env->dr[6] = env->exception_payload;
4ec855
+            } else if (env->exception_nr == EXCP0E_PAGE) {
4ec855
+                env->cr[2] = env->exception_payload;
4ec855
+            }
4ec855
+        }
4ec855
+    }
4ec855
+
4ec855
     return 0;
4ec855
 }
4ec855
 
4ec855
@@ -296,6 +331,23 @@ static int cpu_post_load(void *opaque, int version_id)
4ec855
     }
4ec855
 #endif
4ec855
 
4ec855
+    /*
4ec855
+     * There are cases that we can get valid exception_nr with both
4ec855
+     * exception_pending and exception_injected being cleared.
4ec855
+     * This can happen in one of the following scenarios:
4ec855
+     * 1) Source is older QEMU without KVM_CAP_EXCEPTION_PAYLOAD support.
4ec855
+     * 2) Source is running on kernel without KVM_CAP_EXCEPTION_PAYLOAD support.
4ec855
+     * 3) "cpu/exception_info" subsection not sent because there is no exception
4ec855
+     *    pending or guest wasn't running L2 (See comment in cpu_pre_save()).
4ec855
+     *
4ec855
+     * In those cases, we can just deduce that a valid exception_nr means
4ec855
+     * we can treat the exception as already injected.
4ec855
+     */
4ec855
+    if ((env->exception_nr != -1) &&
4ec855
+        !env->exception_pending && !env->exception_injected) {
4ec855
+        env->exception_injected = 1;
4ec855
+    }
4ec855
+
4ec855
     env->fpstt = (env->fpus_vmstate >> 11) & 7;
4ec855
     env->fpus = env->fpus_vmstate & ~0x3800;
4ec855
     env->fptag_vmstate ^= 0xff;
4ec855
@@ -341,6 +393,35 @@ static bool steal_time_msr_needed(void *opaque)
4ec855
     return cpu->env.steal_time_msr != 0;
4ec855
 }
4ec855
 
4ec855
+static bool exception_info_needed(void *opaque)
4ec855
+{
4ec855
+    X86CPU *cpu = opaque;
4ec855
+    CPUX86State *env = &cpu->env;
4ec855
+
4ec855
+    /*
4ec855
+     * It is important to save exception-info only in case
4ec855
+     * we need to distinguish between a pending and injected
4ec855
+     * exception. Which is only required in case there is a
4ec855
+     * pending exception and vCPU is running L2.
4ec855
+     * For more info, refer to comment in cpu_pre_save().
4ec855
+     */
4ec855
+    return env->exception_pending && (env->hflags & HF_GUEST_MASK);
4ec855
+}
4ec855
+
4ec855
+static const VMStateDescription vmstate_exception_info = {
4ec855
+    .name = "cpu/exception_info",
4ec855
+    .version_id = 1,
4ec855
+    .minimum_version_id = 1,
4ec855
+    .needed = exception_info_needed,
4ec855
+    .fields = (VMStateField[]) {
4ec855
+        VMSTATE_UINT8(env.exception_pending, X86CPU),
4ec855
+        VMSTATE_UINT8(env.exception_injected, X86CPU),
4ec855
+        VMSTATE_UINT8(env.exception_has_payload, X86CPU),
4ec855
+        VMSTATE_UINT64(env.exception_payload, X86CPU),
4ec855
+        VMSTATE_END_OF_LIST()
4ec855
+    }
4ec855
+};
4ec855
+
4ec855
 static const VMStateDescription vmstate_steal_time_msr = {
4ec855
     .name = "cpu/steal_time_msr",
4ec855
     .version_id = 1,
4ec855
@@ -1219,7 +1300,7 @@ VMStateDescription vmstate_x86_cpu = {
4ec855
         VMSTATE_INT32(env.interrupt_injected, X86CPU),
4ec855
         VMSTATE_UINT32(env.mp_state, X86CPU),
4ec855
         VMSTATE_UINT64(env.tsc, X86CPU),
4ec855
-        VMSTATE_INT32(env.exception_injected, X86CPU),
4ec855
+        VMSTATE_INT32(env.exception_nr, X86CPU),
4ec855
         VMSTATE_UINT8(env.soft_interrupt, X86CPU),
4ec855
         VMSTATE_UINT8(env.nmi_injected, X86CPU),
4ec855
         VMSTATE_UINT8(env.nmi_pending, X86CPU),
4ec855
@@ -1243,6 +1324,7 @@ VMStateDescription vmstate_x86_cpu = {
4ec855
         /* The above list is not sorted /wrt version numbers, watch out! */
4ec855
     },
4ec855
     .subsections = (const VMStateDescription*[]) {
4ec855
+        &vmstate_exception_info,
4ec855
         &vmstate_async_pf_msr,
4ec855
         &vmstate_pv_eoi_msr,
4ec855
         &vmstate_steal_time_msr,
4ec855
-- 
4ec855
1.8.3.1
4ec855