62547e
From 86aeb4fd7ff9395afba574e422d83f990ce1f047 Mon Sep 17 00:00:00 2001
62547e
From: Janosch Frank <frankja@linux.ibm.com>
62547e
Date: Mon, 17 Oct 2022 08:38:22 +0000
62547e
Subject: [PATCH 41/42] s390x: pv: Add dump support
62547e
MIME-Version: 1.0
62547e
Content-Type: text/plain; charset=UTF-8
62547e
Content-Transfer-Encoding: 8bit
62547e
62547e
RH-Author: Cédric Le Goater <clg@redhat.com>
62547e
RH-MergeRequest: 226: s390: Enhanced Interpretation for PCI Functions and Secure Execution guest dump
62547e
RH-Bugzilla: 1664378 2043909
62547e
RH-Acked-by: Thomas Huth <thuth@redhat.com>
62547e
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
62547e
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
62547e
RH-Commit: [41/41] 2731c2329276e76013e3b3df21e9743bc74edd2b
62547e
62547e
Sometimes dumping a guest from the outside is the only way to get the
62547e
data that is needed. This can be the case if a dumping mechanism like
62547e
KDUMP hasn't been configured or data needs to be fetched at a specific
62547e
point. Dumping a protected guest from the outside without help from
62547e
fw/hw doesn't yield sufficient data to be useful. Hence we now
62547e
introduce PV dump support.
62547e
62547e
The PV dump support works by integrating the firmware into the dump
62547e
process. New Ultravisor calls are used to initiate the dump process,
62547e
dump cpu data, dump memory state and lastly complete the dump process.
62547e
The UV calls are exposed by KVM via the new KVM_PV_DUMP command and
62547e
its subcommands. The guest's data is fully encrypted and can only be
62547e
decrypted by the entity that owns the customer communication key for
62547e
the dumped guest. Also dumping needs to be allowed via a flag in the
62547e
SE header.
62547e
62547e
On the QEMU side of things we store the PV dump data in the newly
62547e
introduced architecture ELF sections (storage state and completion
62547e
data) and the cpu notes (for cpu dump data).
62547e
62547e
Users can use the zgetdump tool to convert the encrypted QEMU dump to an
62547e
unencrypted one.
62547e
62547e
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
62547e
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
62547e
Message-Id: <20221017083822.43118-11-frankja@linux.ibm.com>
62547e
(cherry picked from commit 113d8f4e95cf0450bea421263de6ec016c779ad0)
62547e
Signed-off-by: Cédric Le Goater <clg@redhat.com>
62547e
---
62547e
 dump/dump.c              |  12 +-
62547e
 include/sysemu/dump.h    |   5 +
62547e
 target/s390x/arch_dump.c | 262 +++++++++++++++++++++++++++++++++++----
62547e
 3 files changed, 246 insertions(+), 33 deletions(-)
62547e
62547e
diff --git a/dump/dump.c b/dump/dump.c
62547e
index 4aa8fb64d2..5dee060b73 100644
62547e
--- a/dump/dump.c
62547e
+++ b/dump/dump.c
62547e
@@ -709,9 +709,9 @@ static void dump_begin(DumpState *s, Error **errp)
62547e
     write_elf_notes(s, errp);
62547e
 }
62547e
 
62547e
-static int64_t dump_filtered_memblock_size(GuestPhysBlock *block,
62547e
-                                           int64_t filter_area_start,
62547e
-                                           int64_t filter_area_length)
62547e
+int64_t dump_filtered_memblock_size(GuestPhysBlock *block,
62547e
+                                    int64_t filter_area_start,
62547e
+                                    int64_t filter_area_length)
62547e
 {
62547e
     int64_t size, left, right;
62547e
 
62547e
@@ -729,9 +729,9 @@ static int64_t dump_filtered_memblock_size(GuestPhysBlock *block,
62547e
     return size;
62547e
 }
62547e
 
62547e
-static int64_t dump_filtered_memblock_start(GuestPhysBlock *block,
62547e
-                                            int64_t filter_area_start,
62547e
-                                            int64_t filter_area_length)
62547e
+int64_t dump_filtered_memblock_start(GuestPhysBlock *block,
62547e
+                                     int64_t filter_area_start,
62547e
+                                     int64_t filter_area_length)
62547e
 {
62547e
     if (filter_area_length) {
62547e
         /* return -1 if the block is not within filter area */
62547e
diff --git a/include/sysemu/dump.h b/include/sysemu/dump.h
62547e
index 38ccac7190..4ffed0b659 100644
62547e
--- a/include/sysemu/dump.h
62547e
+++ b/include/sysemu/dump.h
62547e
@@ -215,4 +215,9 @@ typedef struct DumpState {
62547e
 uint16_t cpu_to_dump16(DumpState *s, uint16_t val);
62547e
 uint32_t cpu_to_dump32(DumpState *s, uint32_t val);
62547e
 uint64_t cpu_to_dump64(DumpState *s, uint64_t val);
62547e
+
62547e
+int64_t dump_filtered_memblock_size(GuestPhysBlock *block, int64_t filter_area_start,
62547e
+                                    int64_t filter_area_length);
62547e
+int64_t dump_filtered_memblock_start(GuestPhysBlock *block, int64_t filter_area_start,
62547e
+                                     int64_t filter_area_length);
62547e
 #endif
62547e
diff --git a/target/s390x/arch_dump.c b/target/s390x/arch_dump.c
62547e
index f60a14920d..a2329141e8 100644
62547e
--- a/target/s390x/arch_dump.c
62547e
+++ b/target/s390x/arch_dump.c
62547e
@@ -12,11 +12,13 @@
62547e
  */
62547e
 
62547e
 #include "qemu/osdep.h"
62547e
+#include "qemu/units.h"
62547e
 #include "cpu.h"
62547e
 #include "s390x-internal.h"
62547e
 #include "elf.h"
62547e
 #include "sysemu/dump.h"
62547e
-
62547e
+#include "hw/s390x/pv.h"
62547e
+#include "kvm/kvm_s390x.h"
62547e
 
62547e
 struct S390xUserRegsStruct {
62547e
     uint64_t psw[2];
62547e
@@ -76,9 +78,16 @@ typedef struct noteStruct {
62547e
         uint64_t todcmp;
62547e
         uint32_t todpreg;
62547e
         uint64_t ctrs[16];
62547e
+        uint8_t dynamic[1];  /*
62547e
+                              * Would be a flexible array member, if
62547e
+                              * that was legal inside a union. Real
62547e
+                              * size comes from PV info interface.
62547e
+                              */
62547e
     } contents;
62547e
 } QEMU_PACKED Note;
62547e
 
62547e
+static bool pv_dump_initialized;
62547e
+
62547e
 static void s390x_write_elf64_prstatus(Note *note, S390CPU *cpu, int id)
62547e
 {
62547e
     int i;
62547e
@@ -177,28 +186,39 @@ static void s390x_write_elf64_prefix(Note *note, S390CPU *cpu, int id)
62547e
     note->contents.prefix = cpu_to_be32((uint32_t)(cpu->env.psa));
62547e
 }
62547e
 
62547e
+static void s390x_write_elf64_pv(Note *note, S390CPU *cpu, int id)
62547e
+{
62547e
+    note->hdr.n_type = cpu_to_be32(NT_S390_PV_CPU_DATA);
62547e
+    if (!pv_dump_initialized) {
62547e
+        return;
62547e
+    }
62547e
+    kvm_s390_dump_cpu(cpu, &note->contents.dynamic);
62547e
+}
62547e
 
62547e
 typedef struct NoteFuncDescStruct {
62547e
     int contents_size;
62547e
+    uint64_t (*note_size_func)(void); /* NULL for non-dynamic sized contents */
62547e
     void (*note_contents_func)(Note *note, S390CPU *cpu, int id);
62547e
+    bool pvonly;
62547e
 } NoteFuncDesc;
62547e
 
62547e
 static const NoteFuncDesc note_core[] = {
62547e
-    {sizeof_field(Note, contents.prstatus), s390x_write_elf64_prstatus},
62547e
-    {sizeof_field(Note, contents.fpregset), s390x_write_elf64_fpregset},
62547e
-    { 0, NULL}
62547e
+    {sizeof_field(Note, contents.prstatus), NULL, s390x_write_elf64_prstatus, false},
62547e
+    {sizeof_field(Note, contents.fpregset), NULL, s390x_write_elf64_fpregset, false},
62547e
+    { 0, NULL, NULL, false}
62547e
 };
62547e
 
62547e
 static const NoteFuncDesc note_linux[] = {
62547e
-    {sizeof_field(Note, contents.prefix),   s390x_write_elf64_prefix},
62547e
-    {sizeof_field(Note, contents.ctrs),     s390x_write_elf64_ctrs},
62547e
-    {sizeof_field(Note, contents.timer),    s390x_write_elf64_timer},
62547e
-    {sizeof_field(Note, contents.todcmp),   s390x_write_elf64_todcmp},
62547e
-    {sizeof_field(Note, contents.todpreg),  s390x_write_elf64_todpreg},
62547e
-    {sizeof_field(Note, contents.vregslo),  s390x_write_elf64_vregslo},
62547e
-    {sizeof_field(Note, contents.vregshi),  s390x_write_elf64_vregshi},
62547e
-    {sizeof_field(Note, contents.gscb),     s390x_write_elf64_gscb},
62547e
-    { 0, NULL}
62547e
+    {sizeof_field(Note, contents.prefix),   NULL, s390x_write_elf64_prefix,  false},
62547e
+    {sizeof_field(Note, contents.ctrs),     NULL, s390x_write_elf64_ctrs,    false},
62547e
+    {sizeof_field(Note, contents.timer),    NULL, s390x_write_elf64_timer,   false},
62547e
+    {sizeof_field(Note, contents.todcmp),   NULL, s390x_write_elf64_todcmp,  false},
62547e
+    {sizeof_field(Note, contents.todpreg),  NULL, s390x_write_elf64_todpreg, false},
62547e
+    {sizeof_field(Note, contents.vregslo),  NULL, s390x_write_elf64_vregslo, false},
62547e
+    {sizeof_field(Note, contents.vregshi),  NULL, s390x_write_elf64_vregshi, false},
62547e
+    {sizeof_field(Note, contents.gscb),     NULL, s390x_write_elf64_gscb,    false},
62547e
+    {0, kvm_s390_pv_dmp_get_size_cpu,       s390x_write_elf64_pv, true},
62547e
+    { 0, NULL, NULL, false}
62547e
 };
62547e
 
62547e
 static int s390x_write_elf64_notes(const char *note_name,
62547e
@@ -207,22 +227,41 @@ static int s390x_write_elf64_notes(const char *note_name,
62547e
                                        DumpState *s,
62547e
                                        const NoteFuncDesc *funcs)
62547e
 {
62547e
-    Note note;
62547e
+    Note note, *notep;
62547e
     const NoteFuncDesc *nf;
62547e
-    int note_size;
62547e
+    int note_size, content_size;
62547e
     int ret = -1;
62547e
 
62547e
     assert(strlen(note_name) < sizeof(note.name));
62547e
 
62547e
     for (nf = funcs; nf->note_contents_func; nf++) {
62547e
-        memset(&note, 0, sizeof(note));
62547e
-        note.hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
62547e
-        note.hdr.n_descsz = cpu_to_be32(nf->contents_size);
62547e
-        g_strlcpy(note.name, note_name, sizeof(note.name));
62547e
-        (*nf->note_contents_func)(&note, cpu, id);
62547e
+        notep = &note;
62547e
+        if (nf->pvonly && !s390_is_pv()) {
62547e
+            continue;
62547e
+        }
62547e
+
62547e
+        content_size = nf->note_size_func ? nf->note_size_func() : nf->contents_size;
62547e
+        note_size = sizeof(note) - sizeof(notep->contents) + content_size;
62547e
+
62547e
+        /* Notes with dynamic sizes need to allocate a note */
62547e
+        if (nf->note_size_func) {
62547e
+            notep = g_malloc(note_size);
62547e
+        }
62547e
+
62547e
+        memset(notep, 0, sizeof(note));
62547e
 
62547e
-        note_size = sizeof(note) - sizeof(note.contents) + nf->contents_size;
62547e
-        ret = f(&note, note_size, s);
62547e
+        /* Setup note header data */
62547e
+        notep->hdr.n_descsz = cpu_to_be32(content_size);
62547e
+        notep->hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
62547e
+        g_strlcpy(notep->name, note_name, sizeof(notep->name));
62547e
+
62547e
+        /* Get contents and write them out */
62547e
+        (*nf->note_contents_func)(notep, cpu, id);
62547e
+        ret = f(notep, note_size, s);
62547e
+
62547e
+        if (nf->note_size_func) {
62547e
+            g_free(notep);
62547e
+        }
62547e
 
62547e
         if (ret < 0) {
62547e
             return -1;
62547e
@@ -247,13 +286,179 @@ int s390_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
62547e
     return s390x_write_elf64_notes("LINUX", f, cpu, cpuid, s, note_linux);
62547e
 }
62547e
 
62547e
+/* PV dump section size functions */
62547e
+static uint64_t get_mem_state_size_from_len(uint64_t len)
62547e
+{
62547e
+    return (len / (MiB)) * kvm_s390_pv_dmp_get_size_mem_state();
62547e
+}
62547e
+
62547e
+static uint64_t get_size_mem_state(DumpState *s)
62547e
+{
62547e
+    return get_mem_state_size_from_len(s->total_size);
62547e
+}
62547e
+
62547e
+static uint64_t get_size_completion_data(DumpState *s)
62547e
+{
62547e
+    return kvm_s390_pv_dmp_get_size_completion_data();
62547e
+}
62547e
+
62547e
+/* PV dump section data functions*/
62547e
+static int get_data_completion(DumpState *s, uint8_t *buff)
62547e
+{
62547e
+    int rc;
62547e
+
62547e
+    if (!pv_dump_initialized) {
62547e
+        return 0;
62547e
+    }
62547e
+    rc = kvm_s390_dump_completion_data(buff);
62547e
+    if (!rc) {
62547e
+            pv_dump_initialized = false;
62547e
+    }
62547e
+    return rc;
62547e
+}
62547e
+
62547e
+static int get_mem_state(DumpState *s, uint8_t *buff)
62547e
+{
62547e
+    int64_t memblock_size, memblock_start;
62547e
+    GuestPhysBlock *block;
62547e
+    uint64_t off;
62547e
+    int rc;
62547e
+
62547e
+    QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) {
62547e
+        memblock_start = dump_filtered_memblock_start(block, s->filter_area_begin,
62547e
+                                                      s->filter_area_length);
62547e
+        if (memblock_start == -1) {
62547e
+            continue;
62547e
+        }
62547e
+
62547e
+        memblock_size = dump_filtered_memblock_size(block, s->filter_area_begin,
62547e
+                                                    s->filter_area_length);
62547e
+
62547e
+        off = get_mem_state_size_from_len(block->target_start);
62547e
+
62547e
+        rc = kvm_s390_dump_mem_state(block->target_start,
62547e
+                                     get_mem_state_size_from_len(memblock_size),
62547e
+                                     buff + off);
62547e
+        if (rc) {
62547e
+            return rc;
62547e
+        }
62547e
+    }
62547e
+
62547e
+    return 0;
62547e
+}
62547e
+
62547e
+static struct sections {
62547e
+    uint64_t (*sections_size_func)(DumpState *s);
62547e
+    int (*sections_contents_func)(DumpState *s, uint8_t *buff);
62547e
+    char sctn_str[12];
62547e
+} sections[] = {
62547e
+    { get_size_mem_state, get_mem_state, "pv_mem_meta"},
62547e
+    { get_size_completion_data, get_data_completion, "pv_compl"},
62547e
+    {NULL , NULL, ""}
62547e
+};
62547e
+
62547e
+static uint64_t arch_sections_write_hdr(DumpState *s, uint8_t *buff)
62547e
+{
62547e
+    Elf64_Shdr *shdr = (void *)buff;
62547e
+    struct sections *sctn = sections;
62547e
+    uint64_t off = s->section_offset;
62547e
+
62547e
+    if (!pv_dump_initialized) {
62547e
+        return 0;
62547e
+    }
62547e
+
62547e
+    for (; sctn->sections_size_func; off += shdr->sh_size, sctn++, shdr++) {
62547e
+        memset(shdr, 0, sizeof(*shdr));
62547e
+        shdr->sh_type = SHT_PROGBITS;
62547e
+        shdr->sh_offset = off;
62547e
+        shdr->sh_size = sctn->sections_size_func(s);
62547e
+        shdr->sh_name = s->string_table_buf->len;
62547e
+        g_array_append_vals(s->string_table_buf, sctn->sctn_str, sizeof(sctn->sctn_str));
62547e
+    }
62547e
+
62547e
+    return (uintptr_t)shdr - (uintptr_t)buff;
62547e
+}
62547e
+
62547e
+
62547e
+/* Add arch specific number of sections and their respective sizes */
62547e
+static void arch_sections_add(DumpState *s)
62547e
+{
62547e
+    struct sections *sctn = sections;
62547e
+
62547e
+    /*
62547e
+     * We only do a PV dump if we are running a PV guest, KVM supports
62547e
+     * the dump API and we got valid dump length information.
62547e
+     */
62547e
+    if (!s390_is_pv() || !kvm_s390_get_protected_dump() ||
62547e
+        !kvm_s390_pv_info_basic_valid()) {
62547e
+        return;
62547e
+    }
62547e
+
62547e
+    /*
62547e
+     * Start the UV dump process by doing the initialize dump call via
62547e
+     * KVM as the proxy.
62547e
+     */
62547e
+    if (!kvm_s390_dump_init()) {
62547e
+        pv_dump_initialized = true;
62547e
+    } else {
62547e
+        /*
62547e
+         * Dump init failed, maybe the guest owner disabled dumping.
62547e
+         * We'll continue the non-PV dump process since this is no
62547e
+         * reason to crash qemu.
62547e
+         */
62547e
+        return;
62547e
+    }
62547e
+
62547e
+    for (; sctn->sections_size_func; sctn++) {
62547e
+        s->shdr_num += 1;
62547e
+        s->elf_section_data_size += sctn->sections_size_func(s);
62547e
+    }
62547e
+}
62547e
+
62547e
+/*
62547e
+ * After the PV dump has been initialized, the CPU data has been
62547e
+ * fetched and memory has been dumped, we need to grab the tweak data
62547e
+ * and the completion data.
62547e
+ */
62547e
+static int arch_sections_write(DumpState *s, uint8_t *buff)
62547e
+{
62547e
+    struct sections *sctn = sections;
62547e
+    int rc;
62547e
+
62547e
+    if (!pv_dump_initialized) {
62547e
+        return -EINVAL;
62547e
+    }
62547e
+
62547e
+    for (; sctn->sections_size_func; sctn++) {
62547e
+        rc = sctn->sections_contents_func(s, buff);
62547e
+        buff += sctn->sections_size_func(s);
62547e
+        if (rc) {
62547e
+            return rc;
62547e
+        }
62547e
+    }
62547e
+    return 0;
62547e
+}
62547e
+
62547e
 int cpu_get_dump_info(ArchDumpInfo *info,
62547e
                       const struct GuestPhysBlockList *guest_phys_blocks)
62547e
 {
62547e
     info->d_machine = EM_S390;
62547e
     info->d_endian = ELFDATA2MSB;
62547e
     info->d_class = ELFCLASS64;
62547e
-
62547e
+    /*
62547e
+     * This is evaluated for each dump so we can freely switch
62547e
+     * between PV and non-PV.
62547e
+     */
62547e
+    if (s390_is_pv() && kvm_s390_get_protected_dump() &&
62547e
+        kvm_s390_pv_info_basic_valid()) {
62547e
+        info->arch_sections_add_fn = *arch_sections_add;
62547e
+        info->arch_sections_write_hdr_fn = *arch_sections_write_hdr;
62547e
+        info->arch_sections_write_fn = *arch_sections_write;
62547e
+    } else {
62547e
+        info->arch_sections_add_fn = NULL;
62547e
+        info->arch_sections_write_hdr_fn = NULL;
62547e
+        info->arch_sections_write_fn = NULL;
62547e
+    }
62547e
     return 0;
62547e
 }
62547e
 
62547e
@@ -261,7 +466,7 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
62547e
 {
62547e
     int name_size = 8; /* "LINUX" or "CORE" + pad */
62547e
     size_t elf_note_size = 0;
62547e
-    int note_head_size;
62547e
+    int note_head_size, content_size;
62547e
     const NoteFuncDesc *nf;
62547e
 
62547e
     assert(class == ELFCLASS64);
62547e
@@ -270,12 +475,15 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
62547e
     note_head_size = sizeof(Elf64_Nhdr);
62547e
 
62547e
     for (nf = note_core; nf->note_contents_func; nf++) {
62547e
-        elf_note_size = elf_note_size + note_head_size + name_size +
62547e
-                        nf->contents_size;
62547e
+        elf_note_size = elf_note_size + note_head_size + name_size + nf->contents_size;
62547e
     }
62547e
     for (nf = note_linux; nf->note_contents_func; nf++) {
62547e
+        if (nf->pvonly && !s390_is_pv()) {
62547e
+            continue;
62547e
+        }
62547e
+        content_size = nf->contents_size ? nf->contents_size : nf->note_size_func();
62547e
         elf_note_size = elf_note_size + note_head_size + name_size +
62547e
-                        nf->contents_size;
62547e
+                        content_size;
62547e
     }
62547e
 
62547e
     return (elf_note_size) * nr_cpus;
62547e
-- 
62547e
2.37.3
62547e