ed5979
From a91da7741464dadeb306a741b4fb562e49ffea57 Mon Sep 17 00:00:00 2001
ed5979
From: Peter Xu <peterx@redhat.com>
ed5979
Date: Tue, 7 Feb 2023 15:57:11 -0500
ed5979
Subject: [PATCH 5/8] util/userfaultfd: Support /dev/userfaultfd
ed5979
ed5979
RH-Author: Peter Xu <peterx@redhat.com>
ed5979
RH-MergeRequest: 149: Support /dev/userfaultfd
ed5979
RH-Bugzilla: 2158704
ed5979
RH-Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
ed5979
RH-Acked-by: quintela1 <quintela@redhat.com>
ed5979
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
ed5979
RH-Commit: [3/3] 5f427d8c18c210ff8f66724c9e358a7120619e69 (peterx/qemu-kvm)
ed5979
ed5979
Teach QEMU to use /dev/userfaultfd when it exists and fall back to the
ed5979
system call if the device is missing or QEMU lacks permission to open it.
ed5979
ed5979
Firstly, as long as the app has permission to access /dev/userfaultfd, it
ed5979
always has the ability to trap kernel faults, which QEMU mostly wants.
ed5979
Meanwhile, in some contexts (e.g. containers) the userfaultfd syscall can be
ed5979
forbidden, so /dev/userfaultfd can be the major way to use postcopy in a
ed5979
restricted environment with a strict seccomp setup.
ed5979
ed5979
Signed-off-by: Peter Xu <peterx@redhat.com>
ed5979
Reviewed-by: Juan Quintela <quintela@redhat.com>
ed5979
Signed-off-by: Juan Quintela <quintela@redhat.com>
ed5979
(cherry picked from commit c40c0463413b941c13fe5f99a90c02d7d6584828)
ed5979
Signed-off-by: Peter Xu <peterx@redhat.com>
ed5979
---
ed5979
 util/trace-events  |  1 +
ed5979
 util/userfaultfd.c | 32 ++++++++++++++++++++++++++++++++
ed5979
 2 files changed, 33 insertions(+)
ed5979
ed5979
diff --git a/util/trace-events b/util/trace-events
ed5979
index c8f53d7d9f..16f78d8fe5 100644
ed5979
--- a/util/trace-events
ed5979
+++ b/util/trace-events
ed5979
@@ -93,6 +93,7 @@ qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_siz
ed5979
 qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
ed5979
 
ed5979
 #userfaultfd.c
ed5979
+uffd_detect_open_mode(int mode) "%d"
ed5979
 uffd_query_features_nosys(int err) "errno: %i"
ed5979
 uffd_query_features_api_failed(int err) "errno: %i"
ed5979
 uffd_create_fd_nosys(int err) "errno: %i"
ed5979
diff --git a/util/userfaultfd.c b/util/userfaultfd.c
ed5979
index 4953b3137d..fdff4867e8 100644
ed5979
--- a/util/userfaultfd.c
ed5979
+++ b/util/userfaultfd.c
ed5979
@@ -18,10 +18,42 @@
ed5979
 #include <poll.h>
ed5979
 #include <sys/syscall.h>
ed5979
 #include <sys/ioctl.h>
ed5979
+#include <fcntl.h>
ed5979
+
ed5979
+typedef enum {
ed5979
+    UFFD_UNINITIALIZED = 0,
ed5979
+    UFFD_USE_DEV_PATH,
ed5979
+    UFFD_USE_SYSCALL,
ed5979
+} uffd_open_mode;
ed5979
 
ed5979
 int uffd_open(int flags)
ed5979
 {
ed5979
 #if defined(__NR_userfaultfd)
ed5979
+    static uffd_open_mode open_mode;
ed5979
+    static int uffd_dev;
ed5979
+
ed5979
+    /* Detect how to generate uffd desc when run the 1st time */
ed5979
+    if (open_mode == UFFD_UNINITIALIZED) {
ed5979
+        /*
ed5979
+         * Make /dev/userfaultfd the default approach because it has better
ed5979
+         * permission controls, meanwhile allows kernel faults without any
ed5979
+         * privilege requirement (e.g. CAP_SYS_PTRACE).
ed5979
+         */
ed5979
+        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
ed5979
+        if (uffd_dev >= 0) {
ed5979
+            open_mode = UFFD_USE_DEV_PATH;
ed5979
+        } else {
ed5979
+            /* Fallback to the system call */
ed5979
+            open_mode = UFFD_USE_SYSCALL;
ed5979
+        }
ed5979
+        trace_uffd_detect_open_mode(open_mode);
ed5979
+    }
ed5979
+
ed5979
+    if (open_mode == UFFD_USE_DEV_PATH) {
ed5979
+        assert(uffd_dev >= 0);
ed5979
+        return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
ed5979
+    }
ed5979
+
ed5979
     return syscall(__NR_userfaultfd, flags);
ed5979
 #else
ed5979
     return -EINVAL;
ed5979
-- 
ed5979
2.31.1
ed5979