1be5c7
From 50840e01d05a466a1dfbc219e49233834e5d7ed0 Mon Sep 17 00:00:00 2001
1be5c7
From: Yang Zhong <yang.zhong@intel.com>
1be5c7
Date: Wed, 16 Feb 2022 22:04:29 -0800
1be5c7
Subject: [PATCH 07/24] x86: Grant AMX permission for guest
1be5c7
1be5c7
RH-Author: Paul Lai <plai@redhat.com>
1be5c7
RH-MergeRequest: 176: Enable KVM AMX support
1be5c7
RH-Commit: [7/13] 437578191f61139ca710cc7045ab38eb0d05eae2
1be5c7
RH-Bugzilla: 1916415
1be5c7
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
1be5c7
RH-Acked-by: Igor Mammedov <imammedo@redhat.com>
1be5c7
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
1be5c7
1be5c7
Kernel allocates 4K xstate buffer by default. For XSAVE features
1be5c7
which require large state component (e.g. AMX), Linux kernel
1be5c7
dynamically expands the xstate buffer only after the process has
1be5c7
acquired the necessary permissions. Those are called dynamically-
1be5c7
enabled XSAVE features (or dynamic xfeatures).
1be5c7
1be5c7
There are separate permissions for native tasks and guests.
1be5c7
1be5c7
Qemu should request the guest permissions for dynamic xfeatures
1be5c7
which will be exposed to the guest. This only needs to be done
1be5c7
once before the first vcpu is created.
1be5c7
1be5c7
KVM implemented one new ARCH_GET_XCOMP_SUPP system attribute API to
1be5c7
get host side supported_xcr0 and Qemu can decide if it can request
1be5c7
dynamically enabled XSAVE features permission.
1be5c7
https://lore.kernel.org/all/20220126152210.3044876-1-pbonzini@redhat.com/
1be5c7
1be5c7
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
1be5c7
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
1be5c7
Signed-off-by: Jing Liu <jing2.liu@intel.com>
1be5c7
Message-Id: <20220217060434.52460-4-yang.zhong@intel.com>
1be5c7
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
1be5c7
(cherry picked from commit 19db68ca68a78fa033a21d419036b6e416554564)
1be5c7
Signed-off-by: Paul Lai <plai@redhat.com>
1be5c7
---
1be5c7
 target/i386/cpu.c          |  7 +++++
1be5c7
 target/i386/cpu.h          |  4 +++
1be5c7
 target/i386/kvm/kvm-cpu.c  | 12 ++++----
1be5c7
 target/i386/kvm/kvm.c      | 57 ++++++++++++++++++++++++++++++++++++++
1be5c7
 target/i386/kvm/kvm_i386.h |  1 +
1be5c7
 5 files changed, 75 insertions(+), 6 deletions(-)
1be5c7
1be5c7
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
1be5c7
index 0453c27c9d..c19b51ea32 100644
1be5c7
--- a/target/i386/cpu.c
1be5c7
+++ b/target/i386/cpu.c
1be5c7
@@ -6027,6 +6027,7 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu)
1be5c7
     CPUX86State *env = &cpu->env;
1be5c7
     int i;
1be5c7
     uint64_t mask;
1be5c7
+    static bool request_perm;
1be5c7
 
1be5c7
     if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) {
1be5c7
         env->features[FEAT_XSAVE_COMP_LO] = 0;
1be5c7
@@ -6042,6 +6043,12 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu)
1be5c7
         }
1be5c7
     }
1be5c7
 
1be5c7
+    /* Only request permission for first vcpu */
1be5c7
+    if (kvm_enabled() && !request_perm) {
1be5c7
+        kvm_request_xsave_components(cpu, mask);
1be5c7
+        request_perm = true;
1be5c7
+    }
1be5c7
+
1be5c7
     env->features[FEAT_XSAVE_COMP_LO] = mask;
1be5c7
     env->features[FEAT_XSAVE_COMP_HI] = mask >> 32;
1be5c7
 }
1be5c7
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
1be5c7
index e1dd8b9555..58676390e6 100644
1be5c7
--- a/target/i386/cpu.h
1be5c7
+++ b/target/i386/cpu.h
1be5c7
@@ -549,6 +549,10 @@ typedef enum X86Seg {
1be5c7
 #define XSTATE_ZMM_Hi256_MASK           (1ULL << XSTATE_ZMM_Hi256_BIT)
1be5c7
 #define XSTATE_Hi16_ZMM_MASK            (1ULL << XSTATE_Hi16_ZMM_BIT)
1be5c7
 #define XSTATE_PKRU_MASK                (1ULL << XSTATE_PKRU_BIT)
1be5c7
+#define XSTATE_XTILE_CFG_MASK           (1ULL << XSTATE_XTILE_CFG_BIT)
1be5c7
+#define XSTATE_XTILE_DATA_MASK          (1ULL << XSTATE_XTILE_DATA_BIT)
1be5c7
+
1be5c7
+#define XSTATE_DYNAMIC_MASK             (XSTATE_XTILE_DATA_MASK)
1be5c7
 
1be5c7
 #define ESA_FEATURE_ALIGN64_BIT         1
1be5c7
 
1be5c7
diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
1be5c7
index 86ef7b2712..bdc967c484 100644
1be5c7
--- a/target/i386/kvm/kvm-cpu.c
1be5c7
+++ b/target/i386/kvm/kvm-cpu.c
1be5c7
@@ -84,7 +84,7 @@ static void kvm_cpu_max_instance_init(X86CPU *cpu)
1be5c7
 static void kvm_cpu_xsave_init(void)
1be5c7
 {
1be5c7
     static bool first = true;
1be5c7
-    KVMState *s = kvm_state;
1be5c7
+    uint32_t eax, ebx, ecx, edx;
1be5c7
     int i;
1be5c7
 
1be5c7
     if (!first) {
1be5c7
@@ -100,11 +100,11 @@ static void kvm_cpu_xsave_init(void)
1be5c7
         ExtSaveArea *esa = &x86_ext_save_areas[i];
1be5c7
 
1be5c7
         if (esa->size) {
1be5c7
-            int sz = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EAX);
1be5c7
-            if (sz != 0) {
1be5c7
-                assert(esa->size == sz);
1be5c7
-                esa->offset = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EBX);
1be5c7
-                esa->ecx = kvm_arch_get_supported_cpuid(s, 0xd, i, R_ECX);
1be5c7
+            host_cpuid(0xd, i, &eax, &ebx, &ecx, &edx);
1be5c7
+            if (eax != 0) {
1be5c7
+                assert(esa->size == eax);
1be5c7
+                esa->offset = ebx;
1be5c7
+                esa->ecx = ecx;
1be5c7
             }
1be5c7
         }
1be5c7
     }
1be5c7
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
1be5c7
index a668f521ac..b5d98c4361 100644
1be5c7
--- a/target/i386/kvm/kvm.c
1be5c7
+++ b/target/i386/kvm/kvm.c
1be5c7
@@ -17,6 +17,7 @@
1be5c7
 #include "qapi/error.h"
1be5c7
 #include <sys/ioctl.h>
1be5c7
 #include <sys/utsname.h>
1be5c7
+#include <sys/syscall.h>
1be5c7
 
1be5c7
 #include <linux/kvm.h>
1be5c7
 #include "standard-headers/asm-x86/kvm_para.h"
1be5c7
@@ -347,6 +348,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
1be5c7
     struct kvm_cpuid2 *cpuid;
1be5c7
     uint32_t ret = 0;
1be5c7
     uint32_t cpuid_1_edx;
1be5c7
+    uint64_t bitmask;
1be5c7
 
1be5c7
     cpuid = get_supported_cpuid(s);
1be5c7
 
1be5c7
@@ -404,6 +406,25 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
1be5c7
         if (!has_msr_arch_capabs) {
1be5c7
             ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
1be5c7
         }
1be5c7
+    } else if (function == 0xd && index == 0 &&
1be5c7
+               (reg == R_EAX || reg == R_EDX)) {
1be5c7
+        struct kvm_device_attr attr = {
1be5c7
+            .group = 0,
1be5c7
+            .attr = KVM_X86_XCOMP_GUEST_SUPP,
1be5c7
+            .addr = (unsigned long) &bitmask
1be5c7
+        };
1be5c7
+
1be5c7
+        bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES);
1be5c7
+        if (!sys_attr) {
1be5c7
+            warn_report("cannot get sys attribute capabilities %d", sys_attr);
1be5c7
+        }
1be5c7
+
1be5c7
+        int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr);
1be5c7
+        if (rc == -1 && (errno == ENXIO || errno == EINVAL)) {
1be5c7
+            warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) "
1be5c7
+                        "error: %d", rc);
1be5c7
+        }
1be5c7
+        ret = (reg == R_EAX) ? bitmask : bitmask >> 32;
1be5c7
     } else if (function == 0x80000001 && reg == R_ECX) {
1be5c7
         /*
1be5c7
          * It's safe to enable TOPOEXT even if it's not returned by
1be5c7
@@ -5054,3 +5075,39 @@ bool kvm_arch_cpu_check_are_resettable(void)
1be5c7
 {
1be5c7
     return !sev_es_enabled();
1be5c7
 }
1be5c7
+
1be5c7
+#define ARCH_REQ_XCOMP_GUEST_PERM       0x1025
1be5c7
+
1be5c7
+void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
1be5c7
+{
1be5c7
+    KVMState *s = kvm_state;
1be5c7
+    uint64_t supported;
1be5c7
+
1be5c7
+    mask &= XSTATE_DYNAMIC_MASK;
1be5c7
+    if (!mask) {
1be5c7
+        return;
1be5c7
+    }
1be5c7
+    /*
1be5c7
+     * Just ignore bits that are not in CPUID[EAX=0xD,ECX=0].
1be5c7
+     * ARCH_REQ_XCOMP_GUEST_PERM would fail, and QEMU has warned
1be5c7
+     * about them already because they are not supported features.
1be5c7
+     */
1be5c7
+    supported = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
1be5c7
+    supported |= (uint64_t)kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EDX) << 32;
1be5c7
+    mask &= supported;
1be5c7
+
1be5c7
+    while (mask) {
1be5c7
+        int bit = ctz64(mask);
1be5c7
+        int rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
1be5c7
+        if (rc) {
1be5c7
+            /*
1be5c7
+             * Older kernel versions (<5.17) do not support
1be5c7
+             * ARCH_REQ_XCOMP_GUEST_PERM, but also do not return
1be5c7
+             * any dynamic feature from kvm_arch_get_supported_cpuid.
1be5c7
+             */
1be5c7
+            warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure "
1be5c7
+                        "for feature bit %d", bit);
1be5c7
+        }
1be5c7
+        mask &= ~BIT_ULL(bit);
1be5c7
+    }
1be5c7
+}
1be5c7
diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
1be5c7
index a978509d50..4124912c20 100644
1be5c7
--- a/target/i386/kvm/kvm_i386.h
1be5c7
+++ b/target/i386/kvm/kvm_i386.h
1be5c7
@@ -52,5 +52,6 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp);
1be5c7
 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address);
1be5c7
 
1be5c7
 bool kvm_enable_sgx_provisioning(KVMState *s);
1be5c7
+void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask);
1be5c7
 
1be5c7
 #endif
1be5c7
-- 
1be5c7
2.35.3
1be5c7