97168e
From 93ec857c46911b95ed8e3abc6a9d432ae847c084 Mon Sep 17 00:00:00 2001
97168e
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
97168e
Date: Mon, 16 Jan 2023 07:51:56 -0500
97168e
Subject: [PATCH 06/11] kvm: Atomic memslot updates
97168e
97168e
RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
97168e
RH-MergeRequest: 247: accel: introduce accelerator blocker API
97168e
RH-Bugzilla: 2161188
97168e
RH-Acked-by: David Hildenbrand <david@redhat.com>
97168e
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
97168e
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
97168e
RH-Commit: [3/3] 520e41c0f58066a7381a5f6b32b81bc01cce51c0
97168e
97168e
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2161188
97168e
97168e
commit f39b7d2b96e3e73c01bb678cd096f7baf0b9ab39
97168e
Author: David Hildenbrand <david@redhat.com>
97168e
Date:   Fri Nov 11 10:47:58 2022 -0500
97168e
97168e
    kvm: Atomic memslot updates
97168e
97168e
    If we update an existing memslot (e.g., resize, split), we temporarily
97168e
    remove the memslot to re-add it immediately afterwards. These updates
97168e
    are not atomic, especially not for KVM VCPU threads, such that we can
97168e
    get spurious faults.
97168e
97168e
    Let's inhibit most KVM ioctls while performing relevant updates, such
97168e
    that we can perform the update just as if it would happen atomically
97168e
    without additional kernel support.
97168e
97168e
    We capture the add/del changes and apply them in the notifier commit
97168e
    stage instead. There, we can check for overlaps and perform the ioctl
97168e
    inhibiting only if really required (-> overlap).
97168e
97168e
    To keep things simple we don't perform additional checks that wouldn't
97168e
    actually result in an overlap -- such as !RAM memory regions in some
97168e
    cases (see kvm_set_phys_mem()).
97168e
97168e
    To minimize cache-line bouncing, use a separate indicator
97168e
    (in_ioctl_lock) per CPU.  Also, make sure to hold the kvm_slots_lock
97168e
    while performing both actions (removing+re-adding).
97168e
97168e
    We have to wait until all IOCTLs were exited and block new ones from
97168e
    getting executed.
97168e
97168e
    This approach cannot result in a deadlock as long as the inhibitor does
97168e
    not hold any locks that might hinder an IOCTL from getting finished and
97168e
    exited - something fairly unusual. The inhibitor will always hold the BQL.
97168e
97168e
    AFAIKs, one possible candidate would be userfaultfd. If a page cannot be
97168e
    placed (e.g., during postcopy), because we're waiting for a lock, or if the
97168e
    userfaultfd thread cannot process a fault, because it is waiting for a
97168e
    lock, there could be a deadlock. However, the BQL is not applicable here,
97168e
    because any other guest memory access while holding the BQL would already
97168e
    result in a deadlock.
97168e
97168e
    Nothing else in the kernel should block forever and wait for userspace
97168e
    intervention.
97168e
97168e
    Note: pause_all_vcpus()/resume_all_vcpus() or
97168e
    start_exclusive()/end_exclusive() cannot be used, as they either drop
97168e
    the BQL or require to be called without the BQL - something inhibitors
97168e
    cannot handle. We need a low-level locking mechanism that is
97168e
    deadlock-free even when not releasing the BQL.
97168e
97168e
    Signed-off-by: David Hildenbrand <david@redhat.com>
97168e
    Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
97168e
    Tested-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
97168e
    Message-Id: <20221111154758.1372674-4-eesposit@redhat.com>
97168e
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
97168e
97168e
Conflicts:
97168e
	accel/kvm/kvm-all.c: include "sysemu/dirtylimit.h" is missing in
97168e
	rhel 8.8.0
97168e
97168e
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
97168e
---
97168e
 accel/kvm/kvm-all.c      | 101 ++++++++++++++++++++++++++++++++++-----
97168e
 include/sysemu/kvm_int.h |   8 ++++
97168e
 2 files changed, 98 insertions(+), 11 deletions(-)
97168e
97168e
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
97168e
index 221aadfda7..3b7bc39823 100644
97168e
--- a/accel/kvm/kvm-all.c
97168e
+++ b/accel/kvm/kvm-all.c
97168e
@@ -31,6 +31,7 @@
97168e
 #include "sysemu/kvm_int.h"
97168e
 #include "sysemu/runstate.h"
97168e
 #include "sysemu/cpus.h"
97168e
+#include "sysemu/accel-blocker.h"
97168e
 #include "qemu/bswap.h"
97168e
 #include "exec/memory.h"
97168e
 #include "exec/ram_addr.h"
97168e
@@ -45,6 +46,7 @@
97168e
 #include "qemu/guest-random.h"
97168e
 #include "sysemu/hw_accel.h"
97168e
 #include "kvm-cpus.h"
97168e
+#include "qemu/range.h"
97168e
 
97168e
 #include "hw/boards.h"
97168e
 
97168e
@@ -1334,6 +1336,7 @@ void kvm_set_max_memslot_size(hwaddr max_slot_size)
97168e
     kvm_max_slot_size = max_slot_size;
97168e
 }
97168e
 
97168e
+/* Called with KVMMemoryListener.slots_lock held */
97168e
 static void kvm_set_phys_mem(KVMMemoryListener *kml,
97168e
                              MemoryRegionSection *section, bool add)
97168e
 {
97168e
@@ -1368,14 +1371,12 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
97168e
     ram = memory_region_get_ram_ptr(mr) + mr_offset;
97168e
     ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
97168e
 
97168e
-    kvm_slots_lock();
97168e
-
97168e
     if (!add) {
97168e
         do {
97168e
             slot_size = MIN(kvm_max_slot_size, size);
97168e
             mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
97168e
             if (!mem) {
97168e
-                goto out;
97168e
+                return;
97168e
             }
97168e
             if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
97168e
                 /*
97168e
@@ -1413,7 +1414,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
97168e
             start_addr += slot_size;
97168e
             size -= slot_size;
97168e
         } while (size);
97168e
-        goto out;
97168e
+        return;
97168e
     }
97168e
 
97168e
     /* register the new slot */
97168e
@@ -1438,9 +1439,6 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
97168e
         ram += slot_size;
97168e
         size -= slot_size;
97168e
     } while (size);
97168e
-
97168e
-out:
97168e
-    kvm_slots_unlock();
97168e
 }
97168e
 
97168e
 static void *kvm_dirty_ring_reaper_thread(void *data)
97168e
@@ -1492,18 +1490,95 @@ static void kvm_region_add(MemoryListener *listener,
97168e
                            MemoryRegionSection *section)
97168e
 {
97168e
     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
97168e
+    KVMMemoryUpdate *update;
97168e
+
97168e
+    update = g_new0(KVMMemoryUpdate, 1);
97168e
+    update->section = *section;
97168e
 
97168e
-    memory_region_ref(section->mr);
97168e
-    kvm_set_phys_mem(kml, section, true);
97168e
+    QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
97168e
 }
97168e
 
97168e
 static void kvm_region_del(MemoryListener *listener,
97168e
                            MemoryRegionSection *section)
97168e
 {
97168e
     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
97168e
+    KVMMemoryUpdate *update;
97168e
+
97168e
+    update = g_new0(KVMMemoryUpdate, 1);
97168e
+    update->section = *section;
97168e
+
97168e
+    QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
97168e
+}
97168e
+
97168e
+static void kvm_region_commit(MemoryListener *listener)
97168e
+{
97168e
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
97168e
+                                          listener);
97168e
+    KVMMemoryUpdate *u1, *u2;
97168e
+    bool need_inhibit = false;
97168e
+
97168e
+    if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
97168e
+        QSIMPLEQ_EMPTY(&kml->transaction_del)) {
97168e
+        return;
97168e
+    }
97168e
+
97168e
+    /*
97168e
+     * We have to be careful when regions to add overlap with ranges to remove.
97168e
+     * We have to simulate atomic KVM memslot updates by making sure no ioctl()
97168e
+     * is currently active.
97168e
+     *
97168e
+     * The lists are order by addresses, so it's easy to find overlaps.
97168e
+     */
97168e
+    u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
97168e
+    u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
97168e
+    while (u1 && u2) {
97168e
+        Range r1, r2;
97168e
+
97168e
+        range_init_nofail(&r1, u1->section.offset_within_address_space,
97168e
+                          int128_get64(u1->section.size));
97168e
+        range_init_nofail(&r2, u2->section.offset_within_address_space,
97168e
+                          int128_get64(u2->section.size));
97168e
+
97168e
+        if (range_overlaps_range(&r1, &r2)) {
97168e
+            need_inhibit = true;
97168e
+            break;
97168e
+        }
97168e
+        if (range_lob(&r1) < range_lob(&r2)) {
97168e
+            u1 = QSIMPLEQ_NEXT(u1, next);
97168e
+        } else {
97168e
+            u2 = QSIMPLEQ_NEXT(u2, next);
97168e
+        }
97168e
+    }
97168e
+
97168e
+    kvm_slots_lock();
97168e
+    if (need_inhibit) {
97168e
+        accel_ioctl_inhibit_begin();
97168e
+    }
97168e
+
97168e
+    /* Remove all memslots before adding the new ones. */
97168e
+    while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
97168e
+        u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
97168e
+        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
97168e
 
97168e
-    kvm_set_phys_mem(kml, section, false);
97168e
-    memory_region_unref(section->mr);
97168e
+        kvm_set_phys_mem(kml, &u1->section, false);
97168e
+        memory_region_unref(u1->section.mr);
97168e
+
97168e
+        g_free(u1);
97168e
+    }
97168e
+    while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
97168e
+        u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
97168e
+        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
97168e
+
97168e
+        memory_region_ref(u1->section.mr);
97168e
+        kvm_set_phys_mem(kml, &u1->section, true);
97168e
+
97168e
+        g_free(u1);
97168e
+    }
97168e
+
97168e
+    if (need_inhibit) {
97168e
+        accel_ioctl_inhibit_end();
97168e
+    }
97168e
+    kvm_slots_unlock();
97168e
 }
97168e
 
97168e
 static void kvm_log_sync(MemoryListener *listener,
97168e
@@ -1647,8 +1722,12 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
97168e
         kml->slots[i].slot = i;
97168e
     }
97168e
 
97168e
+    QSIMPLEQ_INIT(&kml->transaction_add);
97168e
+    QSIMPLEQ_INIT(&kml->transaction_del);
97168e
+
97168e
     kml->listener.region_add = kvm_region_add;
97168e
     kml->listener.region_del = kvm_region_del;
97168e
+    kml->listener.commit = kvm_region_commit;
97168e
     kml->listener.log_start = kvm_log_start;
97168e
     kml->listener.log_stop = kvm_log_stop;
97168e
     kml->listener.priority = 10;
97168e
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
97168e
index 1f5487d9b7..7e18c0a3c0 100644
97168e
--- a/include/sysemu/kvm_int.h
97168e
+++ b/include/sysemu/kvm_int.h
97168e
@@ -11,6 +11,7 @@
97168e
 
97168e
 #include "exec/memory.h"
97168e
 #include "qemu/accel.h"
97168e
+#include "qemu/queue.h"
97168e
 #include "sysemu/kvm.h"
97168e
 
97168e
 typedef struct KVMSlot
97168e
@@ -30,10 +31,17 @@ typedef struct KVMSlot
97168e
     ram_addr_t ram_start_offset;
97168e
 } KVMSlot;
97168e
 
97168e
+typedef struct KVMMemoryUpdate {
97168e
+    QSIMPLEQ_ENTRY(KVMMemoryUpdate) next;
97168e
+    MemoryRegionSection section;
97168e
+} KVMMemoryUpdate;
97168e
+
97168e
 typedef struct KVMMemoryListener {
97168e
     MemoryListener listener;
97168e
     KVMSlot *slots;
97168e
     int as_id;
97168e
+    QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add;
97168e
+    QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;
97168e
 } KVMMemoryListener;
97168e
 
97168e
 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
97168e
-- 
97168e
2.37.3
97168e