7f1c5b
From e13fdc97ff05cdee46c112c2dee70b6ef33e7fa7 Mon Sep 17 00:00:00 2001
7f1c5b
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
7f1c5b
Date: Mon, 16 Jan 2023 07:17:31 -0500
7f1c5b
Subject: [PATCH 31/31] kvm: Atomic memslot updates
7f1c5b
7f1c5b
RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
7f1c5b
RH-MergeRequest: 138: accel: introduce accelerator blocker API
7f1c5b
RH-Bugzilla: 1979276
7f1c5b
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
7f1c5b
RH-Acked-by: David Hildenbrand <david@redhat.com>
7f1c5b
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
7f1c5b
RH-Commit: [3/3] 9f03181ebcad2474fbe859acbce7b9891caa216b (eesposit/qemu-kvm)
7f1c5b
7f1c5b
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1979276
7f1c5b
7f1c5b
commit f39b7d2b96e3e73c01bb678cd096f7baf0b9ab39
7f1c5b
Author: David Hildenbrand <david@redhat.com>
7f1c5b
Date:   Fri Nov 11 10:47:58 2022 -0500
7f1c5b
7f1c5b
    kvm: Atomic memslot updates
7f1c5b
7f1c5b
    If we update an existing memslot (e.g., resize, split), we temporarily
7f1c5b
    remove the memslot to re-add it immediately afterwards. These updates
7f1c5b
    are not atomic, especially not for KVM VCPU threads, such that we can
7f1c5b
    get spurious faults.
7f1c5b
7f1c5b
    Let's inhibit most KVM ioctls while performing relevant updates, such
7f1c5b
    that we can perform the update just as if it would happen atomically
7f1c5b
    without additional kernel support.
7f1c5b
7f1c5b
    We capture the add/del changes and apply them in the notifier commit
7f1c5b
    stage instead. There, we can check for overlaps and perform the ioctl
7f1c5b
    inhibiting only if really required (-> overlap).
7f1c5b
7f1c5b
    To keep things simple we don't perform additional checks that wouldn't
7f1c5b
    actually result in an overlap -- such as !RAM memory regions in some
7f1c5b
    cases (see kvm_set_phys_mem()).
7f1c5b
7f1c5b
    To minimize cache-line bouncing, use a separate indicator
7f1c5b
    (in_ioctl_lock) per CPU.  Also, make sure to hold the kvm_slots_lock
7f1c5b
    while performing both actions (removing+re-adding).
7f1c5b
7f1c5b
    We have to wait until all IOCTLs were exited and block new ones from
7f1c5b
    getting executed.
7f1c5b
7f1c5b
    This approach cannot result in a deadlock as long as the inhibitor does
7f1c5b
    not hold any locks that might hinder an IOCTL from getting finished and
7f1c5b
    exited - something fairly unusual. The inhibitor will always hold the BQL.
7f1c5b
7f1c5b
    AFAIKs, one possible candidate would be userfaultfd. If a page cannot be
7f1c5b
    placed (e.g., during postcopy), because we're waiting for a lock, or if the
7f1c5b
    userfaultfd thread cannot process a fault, because it is waiting for a
7f1c5b
    lock, there could be a deadlock. However, the BQL is not applicable here,
7f1c5b
    because any other guest memory access while holding the BQL would already
7f1c5b
    result in a deadlock.
7f1c5b
7f1c5b
    Nothing else in the kernel should block forever and wait for userspace
7f1c5b
    intervention.
7f1c5b
7f1c5b
    Note: pause_all_vcpus()/resume_all_vcpus() or
7f1c5b
    start_exclusive()/end_exclusive() cannot be used, as they either drop
7f1c5b
    the BQL or require to be called without the BQL - something inhibitors
7f1c5b
    cannot handle. We need a low-level locking mechanism that is
7f1c5b
    deadlock-free even when not releasing the BQL.
7f1c5b
7f1c5b
    Signed-off-by: David Hildenbrand <david@redhat.com>
7f1c5b
    Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
7f1c5b
    Tested-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
7f1c5b
    Message-Id: <20221111154758.1372674-4-eesposit@redhat.com>
7f1c5b
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
7f1c5b
7f1c5b
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
7f1c5b
---
7f1c5b
 accel/kvm/kvm-all.c      | 101 ++++++++++++++++++++++++++++++++++-----
7f1c5b
 include/sysemu/kvm_int.h |   8 ++++
7f1c5b
 2 files changed, 98 insertions(+), 11 deletions(-)
7f1c5b
7f1c5b
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
7f1c5b
index ff660fd469..39ed30ab59 100644
7f1c5b
--- a/accel/kvm/kvm-all.c
7f1c5b
+++ b/accel/kvm/kvm-all.c
7f1c5b
@@ -31,6 +31,7 @@
7f1c5b
 #include "sysemu/kvm_int.h"
7f1c5b
 #include "sysemu/runstate.h"
7f1c5b
 #include "sysemu/cpus.h"
7f1c5b
+#include "sysemu/accel-blocker.h"
7f1c5b
 #include "qemu/bswap.h"
7f1c5b
 #include "exec/memory.h"
7f1c5b
 #include "exec/ram_addr.h"
7f1c5b
@@ -46,6 +47,7 @@
7f1c5b
 #include "sysemu/hw_accel.h"
7f1c5b
 #include "kvm-cpus.h"
7f1c5b
 #include "sysemu/dirtylimit.h"
7f1c5b
+#include "qemu/range.h"
7f1c5b
 
7f1c5b
 #include "hw/boards.h"
7f1c5b
 #include "monitor/stats.h"
7f1c5b
@@ -1292,6 +1294,7 @@ void kvm_set_max_memslot_size(hwaddr max_slot_size)
7f1c5b
     kvm_max_slot_size = max_slot_size;
7f1c5b
 }
7f1c5b
 
7f1c5b
+/* Called with KVMMemoryListener.slots_lock held */
7f1c5b
 static void kvm_set_phys_mem(KVMMemoryListener *kml,
7f1c5b
                              MemoryRegionSection *section, bool add)
7f1c5b
 {
7f1c5b
@@ -1326,14 +1329,12 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
7f1c5b
     ram = memory_region_get_ram_ptr(mr) + mr_offset;
7f1c5b
     ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
7f1c5b
 
7f1c5b
-    kvm_slots_lock();
7f1c5b
-
7f1c5b
     if (!add) {
7f1c5b
         do {
7f1c5b
             slot_size = MIN(kvm_max_slot_size, size);
7f1c5b
             mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
7f1c5b
             if (!mem) {
7f1c5b
-                goto out;
7f1c5b
+                return;
7f1c5b
             }
7f1c5b
             if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
7f1c5b
                 /*
7f1c5b
@@ -1371,7 +1372,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
7f1c5b
             start_addr += slot_size;
7f1c5b
             size -= slot_size;
7f1c5b
         } while (size);
7f1c5b
-        goto out;
7f1c5b
+        return;
7f1c5b
     }
7f1c5b
 
7f1c5b
     /* register the new slot */
7f1c5b
@@ -1396,9 +1397,6 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
7f1c5b
         ram += slot_size;
7f1c5b
         size -= slot_size;
7f1c5b
     } while (size);
7f1c5b
-
7f1c5b
-out:
7f1c5b
-    kvm_slots_unlock();
7f1c5b
 }
7f1c5b
 
7f1c5b
 static void *kvm_dirty_ring_reaper_thread(void *data)
7f1c5b
@@ -1455,18 +1453,95 @@ static void kvm_region_add(MemoryListener *listener,
7f1c5b
                            MemoryRegionSection *section)
7f1c5b
 {
7f1c5b
     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
7f1c5b
+    KVMMemoryUpdate *update;
7f1c5b
+
7f1c5b
+    update = g_new0(KVMMemoryUpdate, 1);
7f1c5b
+    update->section = *section;
7f1c5b
 
7f1c5b
-    memory_region_ref(section->mr);
7f1c5b
-    kvm_set_phys_mem(kml, section, true);
7f1c5b
+    QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
7f1c5b
 }
7f1c5b
 
7f1c5b
 static void kvm_region_del(MemoryListener *listener,
7f1c5b
                            MemoryRegionSection *section)
7f1c5b
 {
7f1c5b
     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
7f1c5b
+    KVMMemoryUpdate *update;
7f1c5b
+
7f1c5b
+    update = g_new0(KVMMemoryUpdate, 1);
7f1c5b
+    update->section = *section;
7f1c5b
+
7f1c5b
+    QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
7f1c5b
+}
7f1c5b
+
7f1c5b
+static void kvm_region_commit(MemoryListener *listener)
7f1c5b
+{
7f1c5b
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
7f1c5b
+                                          listener);
7f1c5b
+    KVMMemoryUpdate *u1, *u2;
7f1c5b
+    bool need_inhibit = false;
7f1c5b
+
7f1c5b
+    if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
7f1c5b
+        QSIMPLEQ_EMPTY(&kml->transaction_del)) {
7f1c5b
+        return;
7f1c5b
+    }
7f1c5b
+
7f1c5b
+    /*
7f1c5b
+     * We have to be careful when regions to add overlap with ranges to remove.
7f1c5b
+     * We have to simulate atomic KVM memslot updates by making sure no ioctl()
7f1c5b
+     * is currently active.
7f1c5b
+     *
7f1c5b
+     * The lists are order by addresses, so it's easy to find overlaps.
7f1c5b
+     */
7f1c5b
+    u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
7f1c5b
+    u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
7f1c5b
+    while (u1 && u2) {
7f1c5b
+        Range r1, r2;
7f1c5b
+
7f1c5b
+        range_init_nofail(&r1, u1->section.offset_within_address_space,
7f1c5b
+                          int128_get64(u1->section.size));
7f1c5b
+        range_init_nofail(&r2, u2->section.offset_within_address_space,
7f1c5b
+                          int128_get64(u2->section.size));
7f1c5b
+
7f1c5b
+        if (range_overlaps_range(&r1, &r2)) {
7f1c5b
+            need_inhibit = true;
7f1c5b
+            break;
7f1c5b
+        }
7f1c5b
+        if (range_lob(&r1) < range_lob(&r2)) {
7f1c5b
+            u1 = QSIMPLEQ_NEXT(u1, next);
7f1c5b
+        } else {
7f1c5b
+            u2 = QSIMPLEQ_NEXT(u2, next);
7f1c5b
+        }
7f1c5b
+    }
7f1c5b
+
7f1c5b
+    kvm_slots_lock();
7f1c5b
+    if (need_inhibit) {
7f1c5b
+        accel_ioctl_inhibit_begin();
7f1c5b
+    }
7f1c5b
+
7f1c5b
+    /* Remove all memslots before adding the new ones. */
7f1c5b
+    while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
7f1c5b
+        u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
7f1c5b
+        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
7f1c5b
 
7f1c5b
-    kvm_set_phys_mem(kml, section, false);
7f1c5b
-    memory_region_unref(section->mr);
7f1c5b
+        kvm_set_phys_mem(kml, &u1->section, false);
7f1c5b
+        memory_region_unref(u1->section.mr);
7f1c5b
+
7f1c5b
+        g_free(u1);
7f1c5b
+    }
7f1c5b
+    while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
7f1c5b
+        u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
7f1c5b
+        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
7f1c5b
+
7f1c5b
+        memory_region_ref(u1->section.mr);
7f1c5b
+        kvm_set_phys_mem(kml, &u1->section, true);
7f1c5b
+
7f1c5b
+        g_free(u1);
7f1c5b
+    }
7f1c5b
+
7f1c5b
+    if (need_inhibit) {
7f1c5b
+        accel_ioctl_inhibit_end();
7f1c5b
+    }
7f1c5b
+    kvm_slots_unlock();
7f1c5b
 }
7f1c5b
 
7f1c5b
 static void kvm_log_sync(MemoryListener *listener,
7f1c5b
@@ -1610,8 +1685,12 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
7f1c5b
         kml->slots[i].slot = i;
7f1c5b
     }
7f1c5b
 
7f1c5b
+    QSIMPLEQ_INIT(&kml->transaction_add);
7f1c5b
+    QSIMPLEQ_INIT(&kml->transaction_del);
7f1c5b
+
7f1c5b
     kml->listener.region_add = kvm_region_add;
7f1c5b
     kml->listener.region_del = kvm_region_del;
7f1c5b
+    kml->listener.commit = kvm_region_commit;
7f1c5b
     kml->listener.log_start = kvm_log_start;
7f1c5b
     kml->listener.log_stop = kvm_log_stop;
7f1c5b
     kml->listener.priority = 10;
7f1c5b
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
7f1c5b
index 3b4adcdc10..60b520a13e 100644
7f1c5b
--- a/include/sysemu/kvm_int.h
7f1c5b
+++ b/include/sysemu/kvm_int.h
7f1c5b
@@ -12,6 +12,7 @@
7f1c5b
 #include "exec/memory.h"
7f1c5b
 #include "qapi/qapi-types-common.h"
7f1c5b
 #include "qemu/accel.h"
7f1c5b
+#include "qemu/queue.h"
7f1c5b
 #include "sysemu/kvm.h"
7f1c5b
 
7f1c5b
 typedef struct KVMSlot
7f1c5b
@@ -31,10 +32,17 @@ typedef struct KVMSlot
7f1c5b
     ram_addr_t ram_start_offset;
7f1c5b
 } KVMSlot;
7f1c5b
 
7f1c5b
+typedef struct KVMMemoryUpdate {
7f1c5b
+    QSIMPLEQ_ENTRY(KVMMemoryUpdate) next;
7f1c5b
+    MemoryRegionSection section;
7f1c5b
+} KVMMemoryUpdate;
7f1c5b
+
7f1c5b
 typedef struct KVMMemoryListener {
7f1c5b
     MemoryListener listener;
7f1c5b
     KVMSlot *slots;
7f1c5b
     int as_id;
7f1c5b
+    QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add;
7f1c5b
+    QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;
7f1c5b
 } KVMMemoryListener;
7f1c5b
 
7f1c5b
 #define KVM_MSI_HASHTAB_SIZE    256
7f1c5b
-- 
7f1c5b
2.31.1
7f1c5b