bf143f
From 93ec857c46911b95ed8e3abc6a9d432ae847c084 Mon Sep 17 00:00:00 2001
bf143f
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
bf143f
Date: Mon, 16 Jan 2023 07:51:56 -0500
bf143f
Subject: [PATCH 06/11] kvm: Atomic memslot updates
bf143f
bf143f
RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
bf143f
RH-MergeRequest: 247: accel: introduce accelerator blocker API
bf143f
RH-Bugzilla: 2161188
bf143f
RH-Acked-by: David Hildenbrand <david@redhat.com>
bf143f
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
bf143f
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
bf143f
RH-Commit: [3/3] 520e41c0f58066a7381a5f6b32b81bc01cce51c0
bf143f
bf143f
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2161188
bf143f
bf143f
commit f39b7d2b96e3e73c01bb678cd096f7baf0b9ab39
bf143f
Author: David Hildenbrand <david@redhat.com>
bf143f
Date:   Fri Nov 11 10:47:58 2022 -0500
bf143f
bf143f
    kvm: Atomic memslot updates
bf143f
bf143f
    If we update an existing memslot (e.g., resize, split), we temporarily
bf143f
    remove the memslot to re-add it immediately afterwards. These updates
bf143f
    are not atomic, especially not for KVM VCPU threads, such that we can
bf143f
    get spurious faults.
bf143f
bf143f
    Let's inhibit most KVM ioctls while performing relevant updates, such
bf143f
    that we can perform the update just as if it would happen atomically
bf143f
    without additional kernel support.
bf143f
bf143f
    We capture the add/del changes and apply them in the notifier commit
bf143f
    stage instead. There, we can check for overlaps and perform the ioctl
bf143f
    inhibiting only if really required (-> overlap).
bf143f
bf143f
    To keep things simple we don't perform additional checks that wouldn't
bf143f
    actually result in an overlap -- such as !RAM memory regions in some
bf143f
    cases (see kvm_set_phys_mem()).
bf143f
bf143f
    To minimize cache-line bouncing, use a separate indicator
bf143f
    (in_ioctl_lock) per CPU.  Also, make sure to hold the kvm_slots_lock
bf143f
    while performing both actions (removing+re-adding).
bf143f
bf143f
    We have to wait until all IOCTLs were exited and block new ones from
bf143f
    getting executed.
bf143f
bf143f
    This approach cannot result in a deadlock as long as the inhibitor does
bf143f
    not hold any locks that might hinder an IOCTL from getting finished and
bf143f
    exited - something fairly unusual. The inhibitor will always hold the BQL.
bf143f
bf143f
    AFAIKs, one possible candidate would be userfaultfd. If a page cannot be
bf143f
    placed (e.g., during postcopy), because we're waiting for a lock, or if the
bf143f
    userfaultfd thread cannot process a fault, because it is waiting for a
bf143f
    lock, there could be a deadlock. However, the BQL is not applicable here,
bf143f
    because any other guest memory access while holding the BQL would already
bf143f
    result in a deadlock.
bf143f
bf143f
    Nothing else in the kernel should block forever and wait for userspace
bf143f
    intervention.
bf143f
bf143f
    Note: pause_all_vcpus()/resume_all_vcpus() or
bf143f
    start_exclusive()/end_exclusive() cannot be used, as they either drop
bf143f
    the BQL or require to be called without the BQL - something inhibitors
bf143f
    cannot handle. We need a low-level locking mechanism that is
bf143f
    deadlock-free even when not releasing the BQL.
bf143f
bf143f
    Signed-off-by: David Hildenbrand <david@redhat.com>
bf143f
    Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
bf143f
    Tested-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
bf143f
    Message-Id: <20221111154758.1372674-4-eesposit@redhat.com>
bf143f
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
bf143f
bf143f
Conflicts:
bf143f
	accel/kvm/kvm-all.c: include "sysemu/dirtylimit.h" is missing in
bf143f
	rhel 8.8.0
bf143f
bf143f
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
bf143f
---
bf143f
 accel/kvm/kvm-all.c      | 101 ++++++++++++++++++++++++++++++++++-----
bf143f
 include/sysemu/kvm_int.h |   8 ++++
bf143f
 2 files changed, 98 insertions(+), 11 deletions(-)
bf143f
bf143f
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
bf143f
index 221aadfda7..3b7bc39823 100644
bf143f
--- a/accel/kvm/kvm-all.c
bf143f
+++ b/accel/kvm/kvm-all.c
bf143f
@@ -31,6 +31,7 @@
bf143f
 #include "sysemu/kvm_int.h"
bf143f
 #include "sysemu/runstate.h"
bf143f
 #include "sysemu/cpus.h"
bf143f
+#include "sysemu/accel-blocker.h"
bf143f
 #include "qemu/bswap.h"
bf143f
 #include "exec/memory.h"
bf143f
 #include "exec/ram_addr.h"
bf143f
@@ -45,6 +46,7 @@
bf143f
 #include "qemu/guest-random.h"
bf143f
 #include "sysemu/hw_accel.h"
bf143f
 #include "kvm-cpus.h"
bf143f
+#include "qemu/range.h"
bf143f
 
bf143f
 #include "hw/boards.h"
bf143f
 
bf143f
@@ -1334,6 +1336,7 @@ void kvm_set_max_memslot_size(hwaddr max_slot_size)
bf143f
     kvm_max_slot_size = max_slot_size;
bf143f
 }
bf143f
 
bf143f
+/* Called with KVMMemoryListener.slots_lock held */
bf143f
 static void kvm_set_phys_mem(KVMMemoryListener *kml,
bf143f
                              MemoryRegionSection *section, bool add)
bf143f
 {
bf143f
@@ -1368,14 +1371,12 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
bf143f
     ram = memory_region_get_ram_ptr(mr) + mr_offset;
bf143f
     ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
bf143f
 
bf143f
-    kvm_slots_lock();
bf143f
-
bf143f
     if (!add) {
bf143f
         do {
bf143f
             slot_size = MIN(kvm_max_slot_size, size);
bf143f
             mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
bf143f
             if (!mem) {
bf143f
-                goto out;
bf143f
+                return;
bf143f
             }
bf143f
             if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
bf143f
                 /*
bf143f
@@ -1413,7 +1414,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
bf143f
             start_addr += slot_size;
bf143f
             size -= slot_size;
bf143f
         } while (size);
bf143f
-        goto out;
bf143f
+        return;
bf143f
     }
bf143f
 
bf143f
     /* register the new slot */
bf143f
@@ -1438,9 +1439,6 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
bf143f
         ram += slot_size;
bf143f
         size -= slot_size;
bf143f
     } while (size);
bf143f
-
bf143f
-out:
bf143f
-    kvm_slots_unlock();
bf143f
 }
bf143f
 
bf143f
 static void *kvm_dirty_ring_reaper_thread(void *data)
bf143f
@@ -1492,18 +1490,95 @@ static void kvm_region_add(MemoryListener *listener,
bf143f
                            MemoryRegionSection *section)
bf143f
 {
bf143f
     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
bf143f
+    KVMMemoryUpdate *update;
bf143f
+
bf143f
+    update = g_new0(KVMMemoryUpdate, 1);
bf143f
+    update->section = *section;
bf143f
 
bf143f
-    memory_region_ref(section->mr);
bf143f
-    kvm_set_phys_mem(kml, section, true);
bf143f
+    QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
bf143f
 }
bf143f
 
bf143f
 static void kvm_region_del(MemoryListener *listener,
bf143f
                            MemoryRegionSection *section)
bf143f
 {
bf143f
     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
bf143f
+    KVMMemoryUpdate *update;
bf143f
+
bf143f
+    update = g_new0(KVMMemoryUpdate, 1);
bf143f
+    update->section = *section;
bf143f
+
bf143f
+    QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
bf143f
+}
bf143f
+
bf143f
+static void kvm_region_commit(MemoryListener *listener)
bf143f
+{
bf143f
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
bf143f
+                                          listener);
bf143f
+    KVMMemoryUpdate *u1, *u2;
bf143f
+    bool need_inhibit = false;
bf143f
+
bf143f
+    if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
bf143f
+        QSIMPLEQ_EMPTY(&kml->transaction_del)) {
bf143f
+        return;
bf143f
+    }
bf143f
+
bf143f
+    /*
bf143f
+     * We have to be careful when regions to add overlap with ranges to remove.
bf143f
+     * We have to simulate atomic KVM memslot updates by making sure no ioctl()
bf143f
+     * is currently active.
bf143f
+     *
bf143f
+     * The lists are order by addresses, so it's easy to find overlaps.
bf143f
+     */
bf143f
+    u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
bf143f
+    u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
bf143f
+    while (u1 && u2) {
bf143f
+        Range r1, r2;
bf143f
+
bf143f
+        range_init_nofail(&r1, u1->section.offset_within_address_space,
bf143f
+                          int128_get64(u1->section.size));
bf143f
+        range_init_nofail(&r2, u2->section.offset_within_address_space,
bf143f
+                          int128_get64(u2->section.size));
bf143f
+
bf143f
+        if (range_overlaps_range(&r1, &r2)) {
bf143f
+            need_inhibit = true;
bf143f
+            break;
bf143f
+        }
bf143f
+        if (range_lob(&r1) < range_lob(&r2)) {
bf143f
+            u1 = QSIMPLEQ_NEXT(u1, next);
bf143f
+        } else {
bf143f
+            u2 = QSIMPLEQ_NEXT(u2, next);
bf143f
+        }
bf143f
+    }
bf143f
+
bf143f
+    kvm_slots_lock();
bf143f
+    if (need_inhibit) {
bf143f
+        accel_ioctl_inhibit_begin();
bf143f
+    }
bf143f
+
bf143f
+    /* Remove all memslots before adding the new ones. */
bf143f
+    while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
bf143f
+        u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
bf143f
+        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
bf143f
 
bf143f
-    kvm_set_phys_mem(kml, section, false);
bf143f
-    memory_region_unref(section->mr);
bf143f
+        kvm_set_phys_mem(kml, &u1->section, false);
bf143f
+        memory_region_unref(u1->section.mr);
bf143f
+
bf143f
+        g_free(u1);
bf143f
+    }
bf143f
+    while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
bf143f
+        u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
bf143f
+        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
bf143f
+
bf143f
+        memory_region_ref(u1->section.mr);
bf143f
+        kvm_set_phys_mem(kml, &u1->section, true);
bf143f
+
bf143f
+        g_free(u1);
bf143f
+    }
bf143f
+
bf143f
+    if (need_inhibit) {
bf143f
+        accel_ioctl_inhibit_end();
bf143f
+    }
bf143f
+    kvm_slots_unlock();
bf143f
 }
bf143f
 
bf143f
 static void kvm_log_sync(MemoryListener *listener,
bf143f
@@ -1647,8 +1722,12 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
bf143f
         kml->slots[i].slot = i;
bf143f
     }
bf143f
 
bf143f
+    QSIMPLEQ_INIT(&kml->transaction_add);
bf143f
+    QSIMPLEQ_INIT(&kml->transaction_del);
bf143f
+
bf143f
     kml->listener.region_add = kvm_region_add;
bf143f
     kml->listener.region_del = kvm_region_del;
bf143f
+    kml->listener.commit = kvm_region_commit;
bf143f
     kml->listener.log_start = kvm_log_start;
bf143f
     kml->listener.log_stop = kvm_log_stop;
bf143f
     kml->listener.priority = 10;
bf143f
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
bf143f
index 1f5487d9b7..7e18c0a3c0 100644
bf143f
--- a/include/sysemu/kvm_int.h
bf143f
+++ b/include/sysemu/kvm_int.h
bf143f
@@ -11,6 +11,7 @@
bf143f
 
bf143f
 #include "exec/memory.h"
bf143f
 #include "qemu/accel.h"
bf143f
+#include "qemu/queue.h"
bf143f
 #include "sysemu/kvm.h"
bf143f
 
bf143f
 typedef struct KVMSlot
bf143f
@@ -30,10 +31,17 @@ typedef struct KVMSlot
bf143f
     ram_addr_t ram_start_offset;
bf143f
 } KVMSlot;
bf143f
 
bf143f
+typedef struct KVMMemoryUpdate {
bf143f
+    QSIMPLEQ_ENTRY(KVMMemoryUpdate) next;
bf143f
+    MemoryRegionSection section;
bf143f
+} KVMMemoryUpdate;
bf143f
+
bf143f
 typedef struct KVMMemoryListener {
bf143f
     MemoryListener listener;
bf143f
     KVMSlot *slots;
bf143f
     int as_id;
bf143f
+    QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add;
bf143f
+    QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;
bf143f
 } KVMMemoryListener;
bf143f
 
bf143f
 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
bf143f
-- 
bf143f
2.37.3
bf143f