From 881945094c0e4d33614d40959bfc20e395f5a478 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= <eperezma@redhat.com>
Date: Thu, 21 Jul 2022 16:05:40 +0200
Subject: [PATCH 24/32] vdpa: Buffer CVQ support on shadow virtqueue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RH-Author: Eugenio Pérez <eperezma@redhat.com>
RH-MergeRequest: 108: Net Control Virtqueue shadow Support
RH-Commit: [24/27] 5486f80141a3ad968a32e782bdcdead32f417352 (eperezmartin/qemu-kvm)
RH-Bugzilla: 1939363
RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
RH-Acked-by: Cindy Lu <lulu@redhat.com>
RH-Acked-by: Laurent Vivier <lvivier@redhat.com>

Bugzilla: https://bugzilla.redhat.com/1939363

Upstream Status: git://git.qemu.org/qemu.git

commit 2df4dd31e194c94da7d28c02e92449f4a989fca9
Author: Eugenio Pérez <eperezma@redhat.com>
Date:   Wed Jul 20 08:59:43 2022 +0200

    vdpa: Buffer CVQ support on shadow virtqueue

    Introduce control virtqueue support for the vDPA shadow virtqueue.
    This is needed for advanced networking features like rx filtering.

    The virtio-net control VQ handler copies the descriptors' contents to
    qemu's VA, so we avoid TOCTOU races with the guest's or device's
    memory every time there is a device model change.  Otherwise, the
    guest could change the memory content between the time qemu reads it
    and the time the device does.

    To demonstrate command handling, VIRTIO_NET_F_CTRL_MAC_ADDR is
    implemented.  If the virtio-net driver changes the MAC address, the
    virtio-net device model is updated with the new one, and an rx
    filtering change event is raised.

    More CVQ commands could be added here straightforwardly, but they
    have not been tested.

    Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
    Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
    Signed-off-by: Jason Wang <jasowang@redhat.com>

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
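
Note: the control command that the shadowed CVQ carries has the following
on-wire layout, per the virtio-net spec.  This is an illustrative sketch
only; struct mac_addr_set_cmd is a made-up name, while virtio_net_ctrl_hdr,
virtio_net_ctrl_ack and the constants are the spec's own:

    #include <stdint.h>

    struct virtio_net_ctrl_hdr {
        uint8_t class;                /* VIRTIO_NET_CTRL_MAC == 1 */
        uint8_t cmd;                  /* VIRTIO_NET_CTRL_MAC_ADDR_SET == 1 */
    };

    /* Driver-writable (out) part: header followed by the new MAC. */
    struct mac_addr_set_cmd {
        struct virtio_net_ctrl_hdr hdr;
        uint8_t mac[6];               /* ETH_ALEN octets */
    };

    /* Device-writable (in) part: a single status byte, either
     * VIRTIO_NET_OK (0) or VIRTIO_NET_ERR (1). */
    typedef uint8_t virtio_net_ctrl_ack;
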
 net/vhost-vdpa.c | 213 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 205 insertions(+), 8 deletions(-)

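The TOCTOU-avoidance scheme described above reduces to copying the guest's
scatter-gather data into a QEMU-owned bounce buffer exactly once, so that
validation and the device operate on the same immutable bytes.  A minimal
sketch of that pattern (shadow_cvq_copy() is a hypothetical helper;
iov_to_buf() is QEMU's real utility):

    /* Copy the guest command once.  Afterwards the guest may rewrite its
     * pages freely without affecting either what QEMU validated or what
     * the device will read from bounce. */
    static bool shadow_cvq_copy(const struct iovec *guest_sg,
                                unsigned int sg_num, void *bounce,
                                size_t bounce_len, size_t *copied)
    {
        *copied = iov_to_buf(guest_sg, sg_num, 0, bounce, bounce_len);
        return *copied > 0;
    }
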
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 2e3b6b10d8..df42822463 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -33,6 +33,9 @@ typedef struct VhostVDPAState {
     NetClientState nc;
     struct vhost_vdpa vhost_vdpa;
     VHostNetState *vhost_net;
+
+    /* Control commands shadow buffers */
+    void *cvq_cmd_out_buffer, *cvq_cmd_in_buffer;
     bool started;
 } VhostVDPAState;
 
@@ -131,6 +134,8 @@ static void vhost_vdpa_cleanup(NetClientState *nc)
 {
     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
 
+    qemu_vfree(s->cvq_cmd_out_buffer);
+    qemu_vfree(s->cvq_cmd_in_buffer);
     if (s->vhost_net) {
         vhost_net_cleanup(s->vhost_net);
         g_free(s->vhost_net);
@@ -190,24 +195,191 @@ static NetClientInfo net_vhost_vdpa_info = {
         .check_peer_type = vhost_vdpa_check_peer_type,
 };
 
+static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
+{
+    VhostIOVATree *tree = v->iova_tree;
+    DMAMap needle = {
+        /*
+         * No need to specify size or to look for more translations since
+         * this contiguous chunk was allocated by us.
+         */
+        .translated_addr = (hwaddr)(uintptr_t)addr,
+    };
+    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
+    int r;
+
+    if (unlikely(!map)) {
+        error_report("Cannot locate expected map");
+        return;
+    }
+
+    r = vhost_vdpa_dma_unmap(v, map->iova, map->size + 1);
+    if (unlikely(r != 0)) {
+        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
+    }
+
+    vhost_iova_tree_remove(tree, map);
+}
+
+static size_t vhost_vdpa_net_cvq_cmd_len(void)
+{
+    /*
+     * MAC_TABLE_SET is the ctrl command that produces the longest out
+     * buffer.  The in buffer is always 1 byte, so it also fits here.
+     */
+    return sizeof(struct virtio_net_ctrl_hdr) +
+           2 * sizeof(struct virtio_net_ctrl_mac) +
+           MAC_TABLE_ENTRIES * ETH_ALEN;
+}
+
+static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
+{
+    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size);
+}
+
+/** Copy and map a guest buffer. */
+static bool vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v,
+                                   const struct iovec *out_data,
+                                   size_t out_num, size_t data_len, void *buf,
+                                   size_t *written, bool write)
+{
+    DMAMap map = {};
+    int r;
+
+    if (unlikely(!data_len)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid length of %s buffer\n",
+                      __func__, write ? "in" : "out");
+        return false;
+    }
+
+    *written = iov_to_buf(out_data, out_num, 0, buf, data_len);
+    map.translated_addr = (hwaddr)(uintptr_t)buf;
+    map.size = vhost_vdpa_net_cvq_cmd_page_len() - 1;
+    map.perm = write ? IOMMU_RW : IOMMU_RO;
+    r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
+    if (unlikely(r != IOVA_OK)) {
+        error_report("Cannot map injected element");
+        return false;
+    }
+
+    r = vhost_vdpa_dma_map(v, map.iova, vhost_vdpa_net_cvq_cmd_page_len(), buf,
+                           !write);
+    if (unlikely(r < 0)) {
+        goto dma_map_err;
+    }
+
+    return true;
+
+dma_map_err:
+    vhost_iova_tree_remove(v->iova_tree, &map);
+    return false;
+}
+
 /**
- * Forward buffer for the moment.
+ * Copy the guest element into a dedicated buffer suitable to send to the NIC
+ *
+ * @iov: [0] is the out buffer, [1] is the in one
+ */
+static bool vhost_vdpa_net_cvq_map_elem(VhostVDPAState *s,
+                                        VirtQueueElement *elem,
+                                        struct iovec *iov)
+{
+    size_t in_copied;
+    bool ok;
+
+    iov[0].iov_base = s->cvq_cmd_out_buffer;
+    ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, elem->out_sg, elem->out_num,
+                                vhost_vdpa_net_cvq_cmd_len(), iov[0].iov_base,
+                                &iov[0].iov_len, false);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    iov[1].iov_base = s->cvq_cmd_in_buffer;
+    ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, NULL, 0,
+                                sizeof(virtio_net_ctrl_ack), iov[1].iov_base,
+                                &in_copied, true);
+    if (unlikely(!ok)) {
+        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
+        return false;
+    }
+
+    iov[1].iov_len = sizeof(virtio_net_ctrl_ack);
+    return true;
+}
+
+/**
+ * Do not forward commands not supported by SVQ. Otherwise, the device could
+ * accept them and qemu would not know how to update the device model.
+ */
+static bool vhost_vdpa_net_cvq_validate_cmd(const struct iovec *out,
+                                            size_t out_num)
+{
+    struct virtio_net_ctrl_hdr ctrl;
+    size_t n;
+
+    n = iov_to_buf(out, out_num, 0, &ctrl, sizeof(ctrl));
+    if (unlikely(n < sizeof(ctrl))) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "%s: invalid length of out buffer %zu\n", __func__, n);
+        return false;
+    }
+
+    switch (ctrl.class) {
+    case VIRTIO_NET_CTRL_MAC:
+        switch (ctrl.cmd) {
+        case VIRTIO_NET_CTRL_MAC_ADDR_SET:
+            return true;
+        default:
+            qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid mac cmd %u\n",
+                          __func__, ctrl.cmd);
+        }
+        break;
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid control class %u\n",
+                      __func__, ctrl.class);
+    }
+
+    return false;
+}
+
+/**
+ * Validate and copy control virtqueue commands.
+ *
+ * Following QEMU guidelines, we offer a copy of the buffers to the device to
+ * prevent TOCTOU bugs.
  */
 static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
                                             VirtQueueElement *elem,
                                             void *opaque)
 {
-    unsigned int n = elem->out_num + elem->in_num;
-    g_autofree struct iovec *dev_buffers = g_new(struct iovec, n);
+    VhostVDPAState *s = opaque;
     size_t in_len, dev_written;
     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
-    int r;
+    /* out and in buffers sent to the device */
+    struct iovec dev_buffers[2] = {
+        { .iov_base = s->cvq_cmd_out_buffer },
+        { .iov_base = s->cvq_cmd_in_buffer },
+    };
+    /* in buffer used for device model */
+    const struct iovec in = {
+        .iov_base = &status,
+        .iov_len = sizeof(status),
+    };
+    int r = -EINVAL;
+    bool ok;
+
+    ok = vhost_vdpa_net_cvq_map_elem(s, elem, dev_buffers);
+    if (unlikely(!ok)) {
+        goto out;
+    }
 
-    memcpy(dev_buffers, elem->out_sg, elem->out_num);
-    memcpy(dev_buffers + elem->out_num, elem->in_sg, elem->in_num);
+    ok = vhost_vdpa_net_cvq_validate_cmd(&dev_buffers[0], 1);
+    if (unlikely(!ok)) {
+        goto out;
+    }
 
-    r = vhost_svq_add(svq, &dev_buffers[0], elem->out_num, &dev_buffers[1],
-                      elem->in_num, elem);
+    r = vhost_svq_add(svq, &dev_buffers[0], 1, &dev_buffers[1], 1, elem);
     if (unlikely(r != 0)) {
         if (unlikely(r == -ENOSPC)) {
             qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
@@ -224,6 +396,18 @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
     dev_written = vhost_svq_poll(svq);
     if (unlikely(dev_written < sizeof(status))) {
         error_report("Insufficient written data (%zu)", dev_written);
+        goto out;
+    }
+
+    memcpy(&status, dev_buffers[1].iov_base, sizeof(status));
+    if (status != VIRTIO_NET_OK) {
+        goto out;
+    }
+
+    status = VIRTIO_NET_ERR;
+    virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, dev_buffers, 1);
+    if (status != VIRTIO_NET_OK) {
+        error_report("Bad CVQ processing in model");
     }
 
 out:
@@ -234,6 +418,12 @@ out:
     }
     vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
     g_free(elem);
+    if (dev_buffers[0].iov_base) {
+        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[0].iov_base);
+    }
+    if (dev_buffers[1].iov_base) {
+        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[1].iov_base);
+    }
     return r;
 }
 
@@ -266,6 +456,13 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
     s->vhost_vdpa.device_fd = vdpa_device_fd;
     s->vhost_vdpa.index = queue_pair_index;
     if (!is_datapath) {
+        s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size,
+                                            vhost_vdpa_net_cvq_cmd_page_len());
+        memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
+        s->cvq_cmd_in_buffer = qemu_memalign(qemu_real_host_page_size,
+                                            vhost_vdpa_net_cvq_cmd_page_len());
+        memset(s->cvq_cmd_in_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
+
         s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
         s->vhost_vdpa.shadow_vq_ops_opaque = s;
     }
-- 
2.31.1