thebeanogamer / rpms / qemu-kvm

Forked from rpms/qemu-kvm 5 months ago
Clone
0727d3
From 5fd7af93a06adaddbae719aabbaf912159f4fb28 Mon Sep 17 00:00:00 2001
0727d3
From: Leonardo Bras <leobras@redhat.com>
0727d3
Date: Wed, 18 May 2022 02:52:25 -0300
0727d3
Subject: [PATCH 18/37] QIOChannelSocket: Implement io_writev zero copy flag &
0727d3
 io_flush for CONFIG_LINUX
0727d3
MIME-Version: 1.0
0727d3
Content-Type: text/plain; charset=UTF-8
0727d3
Content-Transfer-Encoding: 8bit
0727d3
0727d3
RH-Author: Leonardo Brás <leobras@redhat.com>
0727d3
RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7
0727d3
RH-Commit: [18/26] 6f65c8c879a5df57213b541d58285b65178f8547
0727d3
RH-Bugzilla: 2072049
0727d3
RH-Acked-by: Peter Xu <peterx@redhat.com>
0727d3
RH-Acked-by: Daniel P. Berrangé <berrange@redhat.com>
0727d3
RH-Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
0727d3
0727d3
For CONFIG_LINUX, implement the new zero copy flag and the optional callback
0727d3
io_flush on QIOChannelSocket, but enable it only when the MSG_ZEROCOPY
0727d3
feature is available in the host kernel, which is checked on
0727d3
qio_channel_socket_connect_sync()
0727d3
0727d3
qio_channel_socket_flush() was implemented by counting how many times
0727d3
sendmsg(...,MSG_ZEROCOPY) was successfully called, and then reading the
0727d3
socket's error queue, in order to find how many of them finished sending.
0727d3
Flush will loop until those counters are the same, or until some error occurs.
0727d3
0727d3
Notes on using writev() with QIO_CHANNEL_WRITE_FLAG_ZERO_COPY:
0727d3
1: Buffer
0727d3
- As MSG_ZEROCOPY tells the kernel to use the same user buffer to avoid copying,
0727d3
some caution is necessary to avoid overwriting any buffer before it's sent.
0727d3
If something like this happens, a newer version of the buffer may be sent instead.
0727d3
- If this is a problem, it's recommended to call qio_channel_flush() before freeing
0727d3
or re-using the buffer.
0727d3
0727d3
2: Locked memory
0727d3
- When using MSG_ZEROCOPY, the buffer memory will be locked after being queued, and
0727d3
unlocked after it's sent.
0727d3
- Depending on the size of each buffer, and how often it's sent, it may require
0727d3
a larger amount of locked memory than is usually available to a non-root user.
0727d3
- If the required amount of locked memory is not available, writev_zero_copy
0727d3
will return an error, which can abort an operation like migration.
0727d3
- Because of this, when user code wants to add zero copy as a feature, it
0727d3
requires a mechanism to disable it, so it can still be accessible to less
0727d3
privileged users.
0727d3
0727d3
Signed-off-by: Leonardo Bras <leobras@redhat.com>
0727d3
Reviewed-by: Peter Xu <peterx@redhat.com>
0727d3
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
0727d3
Reviewed-by: Juan Quintela <quintela@redhat.com>
0727d3
Message-Id: <20220513062836.965425-4-leobras@redhat.com>
0727d3
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
0727d3
(cherry picked from commit 2bc58ffc2926a4efdd03edfb5909861fefc68c3d)
0727d3
Signed-off-by: Leonardo Bras <leobras@redhat.com>
0727d3
---
0727d3
 include/io/channel-socket.h |   2 +
0727d3
 io/channel-socket.c         | 116 ++++++++++++++++++++++++++++++++++--
0727d3
 2 files changed, 114 insertions(+), 4 deletions(-)
0727d3
0727d3
diff --git a/include/io/channel-socket.h b/include/io/channel-socket.h
0727d3
index e747e63514..513c428fe4 100644
0727d3
--- a/include/io/channel-socket.h
0727d3
+++ b/include/io/channel-socket.h
0727d3
@@ -47,6 +47,8 @@ struct QIOChannelSocket {
0727d3
     socklen_t localAddrLen;
0727d3
     struct sockaddr_storage remoteAddr;
0727d3
     socklen_t remoteAddrLen;
0727d3
+    ssize_t zero_copy_queued;
0727d3
+    ssize_t zero_copy_sent;
0727d3
 };
0727d3
 
0727d3
 
0727d3
diff --git a/io/channel-socket.c b/io/channel-socket.c
0727d3
index bfbd64787e..38a46ba213 100644
0727d3
--- a/io/channel-socket.c
0727d3
+++ b/io/channel-socket.c
0727d3
@@ -26,6 +26,14 @@
0727d3
 #include "io/channel-watch.h"
0727d3
 #include "trace.h"
0727d3
 #include "qapi/clone-visitor.h"
0727d3
+#ifdef CONFIG_LINUX
0727d3
+#include <linux/errqueue.h>
0727d3
+#include <sys/socket.h>
0727d3
+
0727d3
+#if (defined(MSG_ZEROCOPY) && defined(SO_ZEROCOPY))
0727d3
+#define QEMU_MSG_ZEROCOPY
0727d3
+#endif
0727d3
+#endif
0727d3
 
0727d3
 #define SOCKET_MAX_FDS 16
0727d3
 
0727d3
@@ -55,6 +63,8 @@ qio_channel_socket_new(void)
0727d3
 
0727d3
     sioc = QIO_CHANNEL_SOCKET(object_new(TYPE_QIO_CHANNEL_SOCKET));
0727d3
     sioc->fd = -1;
0727d3
+    sioc->zero_copy_queued = 0;
0727d3
+    sioc->zero_copy_sent = 0;
0727d3
 
0727d3
     ioc = QIO_CHANNEL(sioc);
0727d3
     qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN);
0727d3
@@ -154,6 +164,16 @@ int qio_channel_socket_connect_sync(QIOChannelSocket *ioc,
0727d3
         return -1;
0727d3
     }
0727d3
 
0727d3
+#ifdef QEMU_MSG_ZEROCOPY
0727d3
+    int ret, v = 1;
0727d3
+    ret = setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &v, sizeof(v));
0727d3
+    if (ret == 0) {
0727d3
+        /* Zero copy available on host */
0727d3
+        qio_channel_set_feature(QIO_CHANNEL(ioc),
0727d3
+                                QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY);
0727d3
+    }
0727d3
+#endif
0727d3
+
0727d3
     return 0;
0727d3
 }
0727d3
 
0727d3
@@ -534,6 +554,7 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
0727d3
     char control[CMSG_SPACE(sizeof(int) * SOCKET_MAX_FDS)];
0727d3
     size_t fdsize = sizeof(int) * nfds;
0727d3
     struct cmsghdr *cmsg;
0727d3
+    int sflags = 0;
0727d3
 
0727d3
     memset(control, 0, CMSG_SPACE(sizeof(int) * SOCKET_MAX_FDS));
0727d3
 
0727d3
@@ -558,15 +579,31 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
0727d3
         memcpy(CMSG_DATA(cmsg), fds, fdsize);
0727d3
     }
0727d3
 
0727d3
+#ifdef QEMU_MSG_ZEROCOPY
0727d3
+    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
0727d3
+        sflags = MSG_ZEROCOPY;
0727d3
+    }
0727d3
+#endif
0727d3
+
0727d3
  retry:
0727d3
-    ret = sendmsg(sioc->fd, &msg, 0);
0727d3
+    ret = sendmsg(sioc->fd, &msg, sflags);
0727d3
     if (ret <= 0) {
0727d3
-        if (errno == EAGAIN) {
0727d3
+        switch (errno) {
0727d3
+        case EAGAIN:
0727d3
             return QIO_CHANNEL_ERR_BLOCK;
0727d3
-        }
0727d3
-        if (errno == EINTR) {
0727d3
+        case EINTR:
0727d3
             goto retry;
0727d3
+#ifdef QEMU_MSG_ZEROCOPY
0727d3
+        case ENOBUFS:
0727d3
+            if (sflags & MSG_ZEROCOPY) {
0727d3
+                error_setg_errno(errp, errno,
0727d3
+                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
0727d3
+                return -1;
0727d3
+            }
0727d3
+            break;
0727d3
+#endif
0727d3
         }
0727d3
+
0727d3
         error_setg_errno(errp, errno,
0727d3
                          "Unable to write to socket");
0727d3
         return -1;
0727d3
@@ -660,6 +697,74 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
0727d3
 }
0727d3
 #endif /* WIN32 */
0727d3
 
0727d3
+
0727d3
+#ifdef QEMU_MSG_ZEROCOPY
0727d3
+static int qio_channel_socket_flush(QIOChannel *ioc,
0727d3
+                                    Error **errp)
0727d3
+{
0727d3
+    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
0727d3
+    struct msghdr msg = {};
0727d3
+    struct sock_extended_err *serr;
0727d3
+    struct cmsghdr *cm;
0727d3
+    char control[CMSG_SPACE(sizeof(*serr))];
0727d3
+    int received;
0727d3
+    int ret = 1;
0727d3
+
0727d3
+    msg.msg_control = control;
0727d3
+    msg.msg_controllen = sizeof(control);
0727d3
+    memset(control, 0, sizeof(control));
0727d3
+
0727d3
+    while (sioc->zero_copy_sent < sioc->zero_copy_queued) {
0727d3
+        received = recvmsg(sioc->fd, &msg, MSG_ERRQUEUE);
0727d3
+        if (received < 0) {
0727d3
+            switch (errno) {
0727d3
+            case EAGAIN:
0727d3
+                /* Nothing on errqueue, wait until something is available */
0727d3
+                qio_channel_wait(ioc, G_IO_ERR);
0727d3
+                continue;
0727d3
+            case EINTR:
0727d3
+                continue;
0727d3
+            default:
0727d3
+                error_setg_errno(errp, errno,
0727d3
+                                 "Unable to read errqueue");
0727d3
+                return -1;
0727d3
+            }
0727d3
+        }
0727d3
+
0727d3
+        cm = CMSG_FIRSTHDR(&msg);
0727d3
+        if (cm->cmsg_level != SOL_IP &&
0727d3
+            cm->cmsg_type != IP_RECVERR) {
0727d3
+            error_setg_errno(errp, EPROTOTYPE,
0727d3
+                             "Wrong cmsg in errqueue");
0727d3
+            return -1;
0727d3
+        }
0727d3
+
0727d3
+        serr = (void *) CMSG_DATA(cm);
0727d3
+        if (serr->ee_errno != SO_EE_ORIGIN_NONE) {
0727d3
+            error_setg_errno(errp, serr->ee_errno,
0727d3
+                             "Error on socket");
0727d3
+            return -1;
0727d3
+        }
0727d3
+        if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
0727d3
+            error_setg_errno(errp, serr->ee_origin,
0727d3
+                             "Error not from zero copy");
0727d3
+            return -1;
0727d3
+        }
0727d3
+
0727d3
+        /* No errors, count successfully finished sendmsg()*/
0727d3
+        sioc->zero_copy_sent += serr->ee_data - serr->ee_info + 1;
0727d3
+
0727d3
+        /* If any sendmsg() succeeded using zero copy, return 0 at the end */
0727d3
+        if (serr->ee_code != SO_EE_CODE_ZEROCOPY_COPIED) {
0727d3
+            ret = 0;
0727d3
+        }
0727d3
+    }
0727d3
+
0727d3
+    return ret;
0727d3
+}
0727d3
+
0727d3
+#endif /* QEMU_MSG_ZEROCOPY */
0727d3
+
0727d3
 static int
0727d3
 qio_channel_socket_set_blocking(QIOChannel *ioc,
0727d3
                                 bool enabled,
0727d3
@@ -789,6 +894,9 @@ static void qio_channel_socket_class_init(ObjectClass *klass,
0727d3
     ioc_klass->io_set_delay = qio_channel_socket_set_delay;
0727d3
     ioc_klass->io_create_watch = qio_channel_socket_create_watch;
0727d3
     ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler;
0727d3
+#ifdef QEMU_MSG_ZEROCOPY
0727d3
+    ioc_klass->io_flush = qio_channel_socket_flush;
0727d3
+#endif
0727d3
 }
0727d3
 
0727d3
 static const TypeInfo qio_channel_socket_info = {
0727d3
-- 
0727d3
2.35.3
0727d3