Blame SOURCES/kvm-scsi-file-posix-add-support-for-persistent-reservati.patch

4a2fec
From f9b538c808178d27af2d4726a8f4b36a305b072b Mon Sep 17 00:00:00 2001
4a2fec
From: Paolo Bonzini <pbonzini@redhat.com>
4a2fec
Date: Sat, 2 Dec 2017 12:19:48 +0100
4a2fec
Subject: [PATCH 22/36] scsi, file-posix: add support for persistent
4a2fec
 reservation management
4a2fec
4a2fec
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
4a2fec
Message-id: <20171202121953.13317-13-pbonzini@redhat.com>
4a2fec
Patchwork-id: 78087
4a2fec
O-Subject: [RHEL7.4 qemu-kvm-rhev PATCH 12/17] scsi, file-posix: add support for persistent reservation management
4a2fec
Bugzilla: 1464908
4a2fec
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
4a2fec
RH-Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
4a2fec
RH-Acked-by: John Snow <jsnow@redhat.com>
4a2fec
4a2fec
It is a common requirement for virtual machines to send persistent
4a2fec
reservations, but this currently requires either running QEMU with
4a2fec
CAP_SYS_RAWIO, or using out-of-tree patches that let an unprivileged
4a2fec
QEMU bypass Linux's filter on SG_IO commands.
4a2fec
4a2fec
As an alternative mechanism, the next patches will introduce a
4a2fec
privileged helper to run persistent reservation commands without
4a2fec
expanding QEMU's attack surface unnecessarily.
4a2fec
4a2fec
The helper is invoked through a "pr-manager" QOM object, to which
4a2fec
file-posix.c passes SG_IO requests for PERSISTENT RESERVE OUT and
4a2fec
PERSISTENT RESERVE IN commands.  For example:
4a2fec
4a2fec
  $ qemu-system-x86_64 \
4a2fec
      -device virtio-scsi \
4a2fec
      -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock \
4a2fec
      -drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0 \
4a2fec
      -device scsi-block,drive=hd
4a2fec
4a2fec
or:
4a2fec
4a2fec
  $ qemu-system-x86_64 \
4a2fec
      -device virtio-scsi \
4a2fec
      -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock \
4a2fec
      -blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0 \
4a2fec
      -device scsi-block,drive=hd
4a2fec
4a2fec
Multiple pr-manager implementations are conceivable and possible, though
4a2fec
only one is implemented right now.  For example, a pr-manager could:
4a2fec
4a2fec
- talk directly to the multipath daemon from a privileged QEMU
4a2fec
  (i.e. QEMU links to libmpathpersist); this makes reservation work
4a2fec
  properly with multipath, but still requires CAP_SYS_RAWIO
4a2fec
4a2fec
- use the Linux IOC_PR_* ioctls (they require CAP_SYS_ADMIN though)
4a2fec
4a2fec
- more interestingly, implement reservations directly in QEMU
4a2fec
  through file system locks or a shared database (e.g. sqlite)
4a2fec
4a2fec
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
4a2fec
(cherry picked from commit 7c9e527659c67d4d7b41d9504f93d2d7ee482488)
4a2fec
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
4a2fec
---
4a2fec
 Makefile.objs             |   1 +
4a2fec
 block/file-posix.c        |  30 +++++++++++++
4a2fec
 docs/pr-manager.rst       |  51 ++++++++++++++++++++++
4a2fec
 include/scsi/pr-manager.h |  56 ++++++++++++++++++++++++
4a2fec
 qapi/block-core.json      |   4 ++
4a2fec
 scsi/Makefile.objs        |   2 +
4a2fec
 scsi/pr-manager.c         | 109 ++++++++++++++++++++++++++++++++++++++++++++++
4a2fec
 scsi/trace-events         |   3 ++
4a2fec
 vl.c                      |   3 +-
4a2fec
 9 files changed, 258 insertions(+), 1 deletion(-)
4a2fec
 create mode 100644 docs/pr-manager.rst
4a2fec
 create mode 100644 include/scsi/pr-manager.h
4a2fec
 create mode 100644 scsi/pr-manager.c
4a2fec
 create mode 100644 scsi/trace-events
4a2fec
4a2fec
diff --git a/Makefile.objs b/Makefile.objs
4a2fec
index f68aa3b..64bebd0 100644
4a2fec
--- a/Makefile.objs
4a2fec
+++ b/Makefile.objs
4a2fec
@@ -168,6 +168,7 @@ trace-events-subdirs += qapi
4a2fec
 trace-events-subdirs += accel/tcg
4a2fec
 trace-events-subdirs += accel/kvm
4a2fec
 trace-events-subdirs += nbd
4a2fec
+trace-events-subdirs += scsi
4a2fec
 
4a2fec
 trace-events-files = $(SRC_PATH)/trace-events $(trace-events-subdirs:%=$(SRC_PATH)/%/trace-events)
4a2fec
 
4a2fec
diff --git a/block/file-posix.c b/block/file-posix.c
4a2fec
index cb3bfce..9cacf06 100644
4a2fec
--- a/block/file-posix.c
4a2fec
+++ b/block/file-posix.c
4a2fec
@@ -34,6 +34,9 @@
4a2fec
 #include "qapi/util.h"
4a2fec
 #include "qapi/qmp/qstring.h"
4a2fec
 
4a2fec
+#include "scsi/pr-manager.h"
4a2fec
+#include "scsi/constants.h"
4a2fec
+
4a2fec
 #if defined(__APPLE__) && (__MACH__)
4a2fec
 #include <paths.h>
4a2fec
 #include <sys/param.h>
4a2fec
@@ -156,6 +159,8 @@ typedef struct BDRVRawState {
4a2fec
     bool page_cache_inconsistent:1;
4a2fec
     bool has_fallocate;
4a2fec
     bool needs_alignment;
4a2fec
+
4a2fec
+    PRManager *pr_mgr;
4a2fec
 } BDRVRawState;
4a2fec
 
4a2fec
 typedef struct BDRVRawReopenState {
4a2fec
@@ -403,6 +408,11 @@ static QemuOptsList raw_runtime_opts = {
4a2fec
             .type = QEMU_OPT_STRING,
4a2fec
             .help = "file locking mode (on/off/auto, default: auto)",
4a2fec
         },
4a2fec
+        {
4a2fec
+            .name = "pr-manager",
4a2fec
+            .type = QEMU_OPT_STRING,
4a2fec
+            .help = "id of persistent reservation manager object (default: none)",
4a2fec
+        },
4a2fec
         { /* end of list */ }
4a2fec
     },
4a2fec
 };
4a2fec
@@ -414,6 +424,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
4a2fec
     QemuOpts *opts;
4a2fec
     Error *local_err = NULL;
4a2fec
     const char *filename = NULL;
4a2fec
+    const char *str;
4a2fec
     BlockdevAioOptions aio, aio_default;
4a2fec
     int fd, ret;
4a2fec
     struct stat st;
4a2fec
@@ -475,6 +486,16 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
4a2fec
         abort();
4a2fec
     }
4a2fec
 
4a2fec
+    str = qemu_opt_get(opts, "pr-manager");
4a2fec
+    if (str) {
4a2fec
+        s->pr_mgr = pr_manager_lookup(str, &local_err);
4a2fec
+        if (local_err) {
4a2fec
+            error_propagate(errp, local_err);
4a2fec
+            ret = -EINVAL;
4a2fec
+            goto fail;
4a2fec
+        }
4a2fec
+    }
4a2fec
+
4a2fec
     s->open_flags = open_flags;
4a2fec
     raw_parse_flags(bdrv_flags, &s->open_flags);
4a2fec
 
4a2fec
@@ -2597,6 +2618,15 @@ static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
4a2fec
     if (fd_open(bs) < 0)
4a2fec
         return NULL;
4a2fec
 
4a2fec
+    if (req == SG_IO && s->pr_mgr) {
4a2fec
+        struct sg_io_hdr *io_hdr = buf;
4a2fec
+        if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
4a2fec
+            io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
4a2fec
+            return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
4a2fec
+                                      s->fd, io_hdr, cb, opaque);
4a2fec
+        }
4a2fec
+    }
4a2fec
+
4a2fec
     acb = g_new(RawPosixAIOData, 1);
4a2fec
     acb->bs = bs;
4a2fec
     acb->aio_type = QEMU_AIO_IOCTL;
4a2fec
diff --git a/docs/pr-manager.rst b/docs/pr-manager.rst
4a2fec
new file mode 100644
4a2fec
index 0000000..b6089fb
4a2fec
--- /dev/null
4a2fec
+++ b/docs/pr-manager.rst
4a2fec
@@ -0,0 +1,51 @@
4a2fec
+======================================
4a2fec
+Persistent reservation managers
4a2fec
+======================================
4a2fec
+
4a2fec
+SCSI persistent reservations allow restricting access to block devices
4a2fec
+to specific initiators in a shared storage setup.  When implementing
4a2fec
+clustering of virtual machines, it is a common requirement for virtual
4a2fec
+machines to send persistent reservation SCSI commands.  However,
4a2fec
+the operating system restricts sending these commands to unprivileged
4a2fec
+programs because incorrect usage can disrupt regular operation of the
4a2fec
+storage fabric.
4a2fec
+
4a2fec
+For this reason, QEMU's SCSI passthrough devices, ``scsi-block``
4a2fec
+and ``scsi-generic`` (both are only available on Linux) can delegate
4a2fec
+implementation of persistent reservations to a separate object,
4a2fec
+the "persistent reservation manager".  Only PERSISTENT RESERVE OUT and
4a2fec
+PERSISTENT RESERVE IN commands are passed to the persistent reservation
4a2fec
+manager object; other commands are processed by QEMU as usual.
4a2fec
+
4a2fec
+-----------------------------------------
4a2fec
+Defining a persistent reservation manager
4a2fec
+-----------------------------------------
4a2fec
+
4a2fec
+A persistent reservation manager is an instance of a subclass of the
4a2fec
+"pr-manager" QOM class.
4a2fec
+
4a2fec
+Right now only one subclass is defined, ``pr-manager-helper``, which
4a2fec
+forwards the commands to an external privileged helper program
4a2fec
+over Unix sockets.  The helper program only allows sending persistent
4a2fec
+reservation commands to devices for which QEMU has a file descriptor,
4a2fec
+so that QEMU will not be able to effect persistent reservations
4a2fec
+unless it has access to both the socket and the device.
4a2fec
+
4a2fec
+``pr-manager-helper`` has a single string property, ``path``, which
4a2fec
+accepts the path to the helper program's Unix socket.  For example,
4a2fec
+the following command line defines a ``pr-manager-helper`` object and
4a2fec
+attaches it to a SCSI passthrough device::
4a2fec
+
4a2fec
+      $ qemu-system-x86_64 \
4a2fec
+          -device virtio-scsi \
4a2fec
+          -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock \
4a2fec
+          -drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0 \
4a2fec
+          -device scsi-block,drive=hd
4a2fec
+
4a2fec
+Alternatively, using ``-blockdev``::
4a2fec
+
4a2fec
+      $ qemu-system-x86_64 \
4a2fec
+          -device virtio-scsi \
4a2fec
+          -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock \
4a2fec
+          -blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0 \
4a2fec
+          -device scsi-block,drive=hd
4a2fec
diff --git a/include/scsi/pr-manager.h b/include/scsi/pr-manager.h
4a2fec
new file mode 100644
4a2fec
index 0000000..b2b37d6
4a2fec
--- /dev/null
4a2fec
+++ b/include/scsi/pr-manager.h
4a2fec
@@ -0,0 +1,56 @@
4a2fec
+#ifndef PR_MANAGER_H
4a2fec
+#define PR_MANAGER_H
4a2fec
+
4a2fec
+#include "qom/object.h"
4a2fec
+#include "qapi/qmp/qdict.h"
4a2fec
+#include "qapi/visitor.h"
4a2fec
+#include "qom/object_interfaces.h"
4a2fec
+#include "block/aio.h"
4a2fec
+
4a2fec
+#define TYPE_PR_MANAGER "pr-manager"
4a2fec
+
4a2fec
+#define PR_MANAGER_CLASS(klass) \
4a2fec
+     OBJECT_CLASS_CHECK(PRManagerClass, (klass), TYPE_PR_MANAGER)
4a2fec
+#define PR_MANAGER_GET_CLASS(obj) \
4a2fec
+     OBJECT_GET_CLASS(PRManagerClass, (obj), TYPE_PR_MANAGER)
4a2fec
+#define PR_MANAGER(obj) \
4a2fec
+     OBJECT_CHECK(PRManager, (obj), TYPE_PR_MANAGER)
4a2fec
+
4a2fec
+struct sg_io_hdr;
4a2fec
+
4a2fec
+typedef struct PRManager {
4a2fec
+    /* <private> */
4a2fec
+    Object parent;
4a2fec
+} PRManager;
4a2fec
+
4a2fec
+/**
4a2fec
+ * PRManagerClass:
4a2fec
+ * @parent_class: the base class
4a2fec
+ * @run: callback invoked in thread pool context
4a2fec
+ */
4a2fec
+typedef struct PRManagerClass {
4a2fec
+    /* <private> */
4a2fec
+    ObjectClass parent_class;
4a2fec
+
4a2fec
+    /* <public> */
4a2fec
+    int (*run)(PRManager *pr_mgr, int fd, struct sg_io_hdr *hdr);
4a2fec
+} PRManagerClass;
4a2fec
+
4a2fec
+BlockAIOCB *pr_manager_execute(PRManager *pr_mgr,
4a2fec
+                               AioContext *ctx, int fd,
4a2fec
+                               struct sg_io_hdr *hdr,
4a2fec
+                               BlockCompletionFunc *complete,
4a2fec
+                               void *opaque);
4a2fec
+
4a2fec
+#ifdef CONFIG_LINUX
4a2fec
+PRManager *pr_manager_lookup(const char *id, Error **errp);
4a2fec
+#else
4a2fec
+static inline PRManager *pr_manager_lookup(const char *id, Error **errp)
4a2fec
+{
4a2fec
+    /* The classes do not exist at all!  */
4a2fec
+    error_setg(errp, "No persistent reservation manager with id '%s'", id);
4a2fec
+    return NULL;
4a2fec
+}
4a2fec
+#endif
4a2fec
+
4a2fec
+#endif
4a2fec
diff --git a/qapi/block-core.json b/qapi/block-core.json
4a2fec
index 8f5f105..15fc08f 100644
4a2fec
--- a/qapi/block-core.json
4a2fec
+++ b/qapi/block-core.json
4a2fec
@@ -2191,6 +2191,9 @@
4a2fec
 # Driver specific block device options for the file backend.
4a2fec
 #
4a2fec
 # @filename:    path to the image file
4a2fec
+# @pr-manager:  the id for the object that will handle persistent reservations
4a2fec
+#               for this device (default: none, forward the commands via SG_IO;
4a2fec
+#               since 2.11)
4a2fec
 # @aio:         AIO backend (default: threads) (since: 2.8)
4a2fec
 # @locking:     whether to enable file locking. If set to 'auto', only enable
4a2fec
 #               when Open File Descriptor (OFD) locking API is available
4a2fec
@@ -2200,6 +2203,7 @@
4a2fec
 ##
4a2fec
 { 'struct': 'BlockdevOptionsFile',
4a2fec
   'data': { 'filename': 'str',
4a2fec
+            '*pr-manager': 'str',
4a2fec
             '*locking': 'OnOffAuto',
4a2fec
             '*aio': 'BlockdevAioOptions' } }
4a2fec
 
4a2fec
diff --git a/scsi/Makefile.objs b/scsi/Makefile.objs
4a2fec
index 31b82a5..5496d2a 100644
4a2fec
--- a/scsi/Makefile.objs
4a2fec
+++ b/scsi/Makefile.objs
4a2fec
@@ -1 +1,3 @@
4a2fec
 block-obj-y += utils.o
4a2fec
+
4a2fec
+block-obj-$(CONFIG_LINUX) += pr-manager.o
4a2fec
diff --git a/scsi/pr-manager.c b/scsi/pr-manager.c
4a2fec
new file mode 100644
4a2fec
index 0000000..87c45db
4a2fec
--- /dev/null
4a2fec
+++ b/scsi/pr-manager.c
4a2fec
@@ -0,0 +1,109 @@
4a2fec
+/*
4a2fec
+ * Persistent reservation manager abstract class
4a2fec
+ *
4a2fec
+ * Copyright (c) 2017 Red Hat, Inc.
4a2fec
+ *
4a2fec
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
4a2fec
+ *
4a2fec
+ * This code is licensed under the LGPL.
4a2fec
+ *
4a2fec
+ */
4a2fec
+
4a2fec
+#include "qemu/osdep.h"
4a2fec
+#include <scsi/sg.h>
4a2fec
+
4a2fec
+#include "qapi/error.h"
4a2fec
+#include "block/aio.h"
4a2fec
+#include "block/thread-pool.h"
4a2fec
+#include "scsi/pr-manager.h"
4a2fec
+#include "trace.h"
4a2fec
+
4a2fec
+typedef struct PRManagerData {
4a2fec
+    PRManager *pr_mgr;
4a2fec
+    struct sg_io_hdr *hdr;
4a2fec
+    int fd;
4a2fec
+} PRManagerData;
4a2fec
+
4a2fec
+static int pr_manager_worker(void *opaque)
4a2fec
+{
4a2fec
+    PRManagerData *data = opaque;
4a2fec
+    PRManager *pr_mgr = data->pr_mgr;
4a2fec
+    PRManagerClass *pr_mgr_class =
4a2fec
+        PR_MANAGER_GET_CLASS(pr_mgr);
4a2fec
+    struct sg_io_hdr *hdr = data->hdr;
4a2fec
+    int fd = data->fd;
4a2fec
+    int r;
4a2fec
+
4a2fec
+    g_free(data);
4a2fec
+    trace_pr_manager_run(fd, hdr->cmdp[0], hdr->cmdp[1]);
4a2fec
+
4a2fec
+    /* The reference was taken in pr_manager_execute.  */
4a2fec
+    r = pr_mgr_class->run(pr_mgr, fd, hdr);
4a2fec
+    object_unref(OBJECT(pr_mgr));
4a2fec
+    return r;
4a2fec
+}
4a2fec
+
4a2fec
+
4a2fec
+BlockAIOCB *pr_manager_execute(PRManager *pr_mgr,
4a2fec
+                               AioContext *ctx, int fd,
4a2fec
+                               struct sg_io_hdr *hdr,
4a2fec
+                               BlockCompletionFunc *complete,
4a2fec
+                               void *opaque)
4a2fec
+{
4a2fec
+    PRManagerData *data = g_new(PRManagerData, 1);
4a2fec
+    ThreadPool *pool = aio_get_thread_pool(ctx);
4a2fec
+
4a2fec
+    trace_pr_manager_execute(fd, hdr->cmdp[0], hdr->cmdp[1], opaque);
4a2fec
+    data->pr_mgr = pr_mgr;
4a2fec
+    data->fd = fd;
4a2fec
+    data->hdr = hdr;
4a2fec
+
4a2fec
+    /* The matching object_unref is in pr_manager_worker.  */
4a2fec
+    object_ref(OBJECT(pr_mgr));
4a2fec
+    return thread_pool_submit_aio(pool, pr_manager_worker,
4a2fec
+                                  data, complete, opaque);
4a2fec
+}
4a2fec
+
4a2fec
+static const TypeInfo pr_manager_info = {
4a2fec
+    .parent = TYPE_OBJECT,
4a2fec
+    .name = TYPE_PR_MANAGER,
4a2fec
+    .class_size = sizeof(PRManagerClass),
4a2fec
+    .abstract = true,
4a2fec
+    .interfaces = (InterfaceInfo[]) {
4a2fec
+        { TYPE_USER_CREATABLE },
4a2fec
+        { }
4a2fec
+    }
4a2fec
+};
4a2fec
+
4a2fec
+PRManager *pr_manager_lookup(const char *id, Error **errp)
4a2fec
+{
4a2fec
+    Object *obj;
4a2fec
+    PRManager *pr_mgr;
4a2fec
+
4a2fec
+    obj = object_resolve_path_component(object_get_objects_root(), id);
4a2fec
+    if (!obj) {
4a2fec
+        error_setg(errp, "No persistent reservation manager with id '%s'", id);
4a2fec
+        return NULL;
4a2fec
+    }
4a2fec
+
4a2fec
+    pr_mgr = (PRManager *)
4a2fec
+        object_dynamic_cast(obj,
4a2fec
+                            TYPE_PR_MANAGER);
4a2fec
+    if (!pr_mgr) {
4a2fec
+        error_setg(errp,
4a2fec
+                   "Object with id '%s' is not a persistent reservation manager",
4a2fec
+                   id);
4a2fec
+        return NULL;
4a2fec
+    }
4a2fec
+
4a2fec
+    return pr_mgr;
4a2fec
+}
4a2fec
+
4a2fec
+static void
4a2fec
+pr_manager_register_types(void)
4a2fec
+{
4a2fec
+    type_register_static(&pr_manager_info);
4a2fec
+}
4a2fec
+
4a2fec
+
4a2fec
+type_init(pr_manager_register_types);
4a2fec
diff --git a/scsi/trace-events b/scsi/trace-events
4a2fec
new file mode 100644
4a2fec
index 0000000..45f5b6e
4a2fec
--- /dev/null
4a2fec
+++ b/scsi/trace-events
4a2fec
@@ -0,0 +1,3 @@
4a2fec
+# scsi/pr-manager.c
4a2fec
+pr_manager_execute(int fd, int cmd, int sa, void *opaque) "fd=%d cmd=0x%02x service action=0x%02x opaque=%p"
4a2fec
+pr_manager_run(int fd, int cmd, int sa) "fd=%d cmd=0x%02x service action=0x%02x"
4a2fec
diff --git a/vl.c b/vl.c
4a2fec
index 55949e6..bef5ae3 100644
4a2fec
--- a/vl.c
4a2fec
+++ b/vl.c
4a2fec
@@ -2820,7 +2820,8 @@ static int machine_set_property(void *opaque,
4a2fec
  */
4a2fec
 static bool object_create_initial(const char *type)
4a2fec
 {
4a2fec
-    if (g_str_equal(type, "rng-egd")) {
4a2fec
+    if (g_str_equal(type, "rng-egd") ||
4a2fec
+        g_str_has_prefix(type, "pr-manager-")) {
4a2fec
         return false;
4a2fec
     }
4a2fec
 
4a2fec
-- 
4a2fec
1.8.3.1
4a2fec