render / rpms / qemu

Forked from rpms/qemu 5 months ago
Clone

Blame qemu-vhost-vhost-net-support.patch

Justin M. Forbes 272dfe
This adds vhost net support in qemu. Will be tied to tap device and
Justin M. Forbes 272dfe
virtio by following patches.  Raw backend is currently missing, will be
Justin M. Forbes 272dfe
worked on/submitted separately.
Justin M. Forbes 272dfe
Justin M. Forbes 272dfe
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Justin M. Forbes 272dfe
---
Justin M. Forbes 272dfe
 Makefile.target |    1 +
Justin M. Forbes 272dfe
 hw/vhost.c      |  603 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
Justin M. Forbes 272dfe
 hw/vhost.h      |   44 ++++
Justin M. Forbes 272dfe
 hw/vhost_net.c  |  147 ++++++++++++++
Justin M. Forbes 272dfe
 hw/vhost_net.h  |   20 ++
Justin M. Forbes 272dfe
 5 files changed, 815 insertions(+), 0 deletions(-)
Justin M. Forbes 272dfe
 create mode 100644 hw/vhost.c
Justin M. Forbes 272dfe
 create mode 100644 hw/vhost.h
Justin M. Forbes 272dfe
 create mode 100644 hw/vhost_net.c
Justin M. Forbes 272dfe
 create mode 100644 hw/vhost_net.h
Justin M. Forbes 272dfe
Justin M. Forbes 272dfe
diff --git a/Makefile.target b/Makefile.target
Justin M. Forbes 272dfe
index 0c844a9..2ebd30c 100644
Justin M. Forbes 272dfe
--- a/Makefile.target
Justin M. Forbes 272dfe
+++ b/Makefile.target
Justin M. Forbes 272dfe
@@ -168,6 +168,7 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o
Justin M. Forbes 272dfe
 # need to fix this properly
Justin M. Forbes 272dfe
 obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-pci.o virtio-serial-bus.o
Justin M. Forbes 272dfe
 obj-y += notifier.o
Justin M. Forbes 272dfe
+obj-y += vhost_net.o vhost.o
Justin M. Forbes 272dfe
 obj-$(CONFIG_KVM) += kvm.o kvm-all.o
Justin M. Forbes 272dfe
 # MSI-X depends on kvm for interrupt injection,
Justin M. Forbes 272dfe
 # so moved it from Makefile.hw to Makefile.target for now
Justin M. Forbes 272dfe
diff --git a/hw/vhost.c b/hw/vhost.c
Justin M. Forbes 272dfe
new file mode 100644
Justin M. Forbes 272dfe
index 0000000..e5c1ead
Justin M. Forbes 272dfe
--- /dev/null
Justin M. Forbes 272dfe
+++ b/hw/vhost.c
Justin M. Forbes 272dfe
@@ -0,0 +1,603 @@
Justin M. Forbes 272dfe
+#include "linux/vhost.h"
Justin M. Forbes 272dfe
+#include <sys/ioctl.h>
Justin M. Forbes 272dfe
+#include <sys/eventfd.h>
Justin M. Forbes 272dfe
+#include "vhost.h"
Justin M. Forbes 272dfe
+#include "hw/hw.h"
Justin M. Forbes 272dfe
+/* For range_get_last */
Justin M. Forbes 272dfe
+#include "pci.h"
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+/* Sync vhost-logged dirty pages in the overlap of [mfirst,mlast] and [rfirst,rlast]. */
Justin M. Forbes 272dfe
+static void vhost_dev_sync_region(struct vhost_dev *dev,
Justin M. Forbes 272dfe
+		uint64_t mfirst, uint64_t mlast, uint64_t rfirst, uint64_t rlast)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	uint64_t start = MAX(mfirst, rfirst);
Justin M. Forbes 272dfe
+	uint64_t end = MIN(mlast, rlast);
Justin M. Forbes 272dfe
+	vhost_log_chunk_t *from = dev->log + start / VHOST_LOG_CHUNK;
Justin M. Forbes 272dfe
+	vhost_log_chunk_t *to = dev->log + end / VHOST_LOG_CHUNK + 1;
Justin M. Forbes 272dfe
+	uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	if (end < start) { /* empty overlap: return before the bounds asserts */
Justin M. Forbes 272dfe
+		return;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	assert(end / VHOST_LOG_CHUNK < dev->log_size);
Justin M. Forbes 272dfe
+	assert(start / VHOST_LOG_CHUNK < dev->log_size);
Justin M. Forbes 272dfe
+	for (;from < to; ++from) {
Justin M. Forbes 272dfe
+		vhost_log_chunk_t log;
Justin M. Forbes 272dfe
+		int bit;
Justin M. Forbes 272dfe
+		/* We first check with non-atomic: much cheaper,
Justin M. Forbes 272dfe
+		 * and we expect non-dirty to be the common case. */
Justin M. Forbes 272dfe
+		if (!*from) {
Justin M. Forbes 272dfe
+			continue;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+		/* Data must be read atomically. We don't really
Justin M. Forbes 272dfe
+		 * need the barrier semantics of __sync
Justin M. Forbes 272dfe
+		 * builtins, but it's easier to use them than
Justin M. Forbes 272dfe
+		 * roll our own. */
Justin M. Forbes 272dfe
+		log = __sync_fetch_and_and(from, 0);
Justin M. Forbes 272dfe
+		while ((bit = sizeof(log) > sizeof(int) ?
Justin M. Forbes 272dfe
+		       ffsll(log) : ffs(log))) {
Justin M. Forbes 272dfe
+			bit -= 1;
Justin M. Forbes 272dfe
+			cpu_physical_memory_set_dirty(addr + bit * VHOST_LOG_PAGE);
Justin M. Forbes 272dfe
+			log &= ~(0x1ull << bit);
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+		addr += VHOST_LOG_CHUNK;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+static int vhost_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
Justin M. Forbes 272dfe
+					target_phys_addr_t start_addr,
Justin M. Forbes 272dfe
+					target_phys_addr_t end_addr)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
Justin M. Forbes 272dfe
+	int i;
Justin M. Forbes 272dfe
+	if (!dev->log_enabled || !dev->started) {
Justin M. Forbes 272dfe
+		return 0;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	for (i = 0; i < dev->mem->nregions; ++i) {
Justin M. Forbes 272dfe
+		struct vhost_memory_region *reg = dev->mem->regions + i;
Justin M. Forbes 272dfe
+		vhost_dev_sync_region(dev, start_addr, end_addr,
Justin M. Forbes 272dfe
+				      reg->guest_phys_addr,
Justin M. Forbes 272dfe
+				      range_get_last(reg->guest_phys_addr,
Justin M. Forbes 272dfe
+						     reg->memory_size));
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	for (i = 0; i < dev->nvqs; ++i) {
Justin M. Forbes 272dfe
+		struct vhost_virtqueue *vq = dev->vqs + i;
Justin M. Forbes 272dfe
+		unsigned size = sizeof(struct vring_used_elem) * vq->num;
Justin M. Forbes 272dfe
+		vhost_dev_sync_region(dev, start_addr, end_addr, vq->used_phys,
Justin M. Forbes 272dfe
+				      range_get_last(vq->used_phys, size));
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	return 0;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+/* Assign/unassign. Keep an unsorted array of non-overlapping
Justin M. Forbes 272dfe
+ * memory regions in dev->mem. */
Justin M. Forbes 272dfe
+static void vhost_dev_unassign_memory(struct vhost_dev *dev,
Justin M. Forbes 272dfe
+				      uint64_t start_addr,
Justin M. Forbes 272dfe
+				      uint64_t size)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	int from, to, n = dev->mem->nregions;
Justin M. Forbes 272dfe
+	/* Track overlapping/split regions for sanity checking. */
Justin M. Forbes 272dfe
+	int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	for (from = 0, to = 0; from < n; ++from, ++to) {
Justin M. Forbes 272dfe
+		struct vhost_memory_region *reg = dev->mem->regions + to;
Justin M. Forbes 272dfe
+		uint64_t reglast;
Justin M. Forbes 272dfe
+		uint64_t memlast;
Justin M. Forbes 272dfe
+		uint64_t change;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		/* clone old region */
Justin M. Forbes 272dfe
+		if (to != from) {
Justin M. Forbes 272dfe
+			memcpy(reg, dev->mem->regions + from, sizeof *reg);
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		/* No overlap is simple */
Justin M. Forbes 272dfe
+		if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
Justin M. Forbes 272dfe
+				    start_addr, size)) {
Justin M. Forbes 272dfe
+			continue;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		/* Split only happens if supplied region
Justin M. Forbes 272dfe
+		 * is in the middle of an existing one. Thus it can not
Justin M. Forbes 272dfe
+		 * overlap with any other existing region. */
Justin M. Forbes 272dfe
+		assert(!split);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
Justin M. Forbes 272dfe
+		memlast = range_get_last(start_addr, size);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		/* Remove whole region */
Justin M. Forbes 272dfe
+		if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
Justin M. Forbes 272dfe
+			--dev->mem->nregions;
Justin M. Forbes 272dfe
+			--to;
Justin M. Forbes 272dfe
+			assert(to >= 0);
Justin M. Forbes 272dfe
+			++overlap_middle;
Justin M. Forbes 272dfe
+			continue;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		/* Shrink region */
Justin M. Forbes 272dfe
+		if (memlast >= reglast) {
Justin M. Forbes 272dfe
+			reg->memory_size = start_addr - reg->guest_phys_addr;
Justin M. Forbes 272dfe
+			assert(reg->memory_size);
Justin M. Forbes 272dfe
+			assert(!overlap_end);
Justin M. Forbes 272dfe
+			++overlap_end;
Justin M. Forbes 272dfe
+			continue;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		/* Shift region */
Justin M. Forbes 272dfe
+		if (start_addr <= reg->guest_phys_addr) {
Justin M. Forbes 272dfe
+			change = memlast + 1 - reg->guest_phys_addr;
Justin M. Forbes 272dfe
+			reg->memory_size -= change;
Justin M. Forbes 272dfe
+			reg->guest_phys_addr += change;
Justin M. Forbes 272dfe
+			reg->userspace_addr += change;
Justin M. Forbes 272dfe
+			assert(reg->memory_size);
Justin M. Forbes 272dfe
+			assert(!overlap_start);
Justin M. Forbes 272dfe
+			++overlap_start;
Justin M. Forbes 272dfe
+			continue;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		/* This only happens if supplied region
Justin M. Forbes 272dfe
+		 * is in the middle of an existing one. Thus it can not
Justin M. Forbes 272dfe
+		 * overlap with any other existing region. */
Justin M. Forbes 272dfe
+		assert(!overlap_start);
Justin M. Forbes 272dfe
+		assert(!overlap_end);
Justin M. Forbes 272dfe
+		assert(!overlap_middle);
Justin M. Forbes 272dfe
+		/* Split region: shrink first part, shift second part. */
Justin M. Forbes 272dfe
+		memcpy(dev->mem->regions + n, reg, sizeof *reg);
Justin M. Forbes 272dfe
+		reg->memory_size = start_addr - reg->guest_phys_addr;
Justin M. Forbes 272dfe
+		assert(reg->memory_size);
Justin M. Forbes 272dfe
+		change = memlast + 1 - reg->guest_phys_addr;
Justin M. Forbes 272dfe
+		reg = dev->mem->regions + n;
Justin M. Forbes 272dfe
+		reg->memory_size -= change;
Justin M. Forbes 272dfe
+		assert(reg->memory_size);
Justin M. Forbes 272dfe
+		reg->guest_phys_addr += change;
Justin M. Forbes 272dfe
+		reg->userspace_addr += change;
Justin M. Forbes 272dfe
+		/* Never add more than 1 region */
Justin M. Forbes 272dfe
+		assert(dev->mem->nregions == n);
Justin M. Forbes 272dfe
+		++dev->mem->nregions;
Justin M. Forbes 272dfe
+		++split;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+/* Called after unassign, so no regions overlap the given range. */
Justin M. Forbes 272dfe
+static void vhost_dev_assign_memory(struct vhost_dev *dev,
Justin M. Forbes 272dfe
+				    uint64_t start_addr,
Justin M. Forbes 272dfe
+				    uint64_t size,
Justin M. Forbes 272dfe
+				    uint64_t uaddr)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	int from, to;
Justin M. Forbes 272dfe
+	struct vhost_memory_region *merged = NULL;
Justin M. Forbes 272dfe
+	for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
Justin M. Forbes 272dfe
+		struct vhost_memory_region *reg = dev->mem->regions + to;
Justin M. Forbes 272dfe
+		uint64_t prlast, urlast;
Justin M. Forbes 272dfe
+		uint64_t pmlast, umlast;
Justin M. Forbes 272dfe
+		uint64_t s, e, u;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		/* clone old region */
Justin M. Forbes 272dfe
+		if (to != from) {
Justin M. Forbes 272dfe
+			memcpy(reg, dev->mem->regions + from, sizeof *reg);
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+		prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
Justin M. Forbes 272dfe
+		pmlast = range_get_last(start_addr, size);
Justin M. Forbes 272dfe
+		urlast = range_get_last(reg->userspace_addr, reg->memory_size);
Justin M. Forbes 272dfe
+		umlast = range_get_last(uaddr, size);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		/* check for overlapping regions: should never happen. */
Justin M. Forbes 272dfe
+		assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
Justin M. Forbes 272dfe
+		/* Not an adjacent or overlapping region - do not merge. */
Justin M. Forbes 272dfe
+		if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
Justin M. Forbes 272dfe
+		    (pmlast + 1 != reg->guest_phys_addr ||
Justin M. Forbes 272dfe
+		     umlast + 1 != reg->userspace_addr)) {
Justin M. Forbes 272dfe
+			continue;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+		if (merged) {
Justin M. Forbes 272dfe
+			--to;
Justin M. Forbes 272dfe
+			assert(to >= 0);
Justin M. Forbes 272dfe
+		} else {
Justin M. Forbes 272dfe
+			merged = reg;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+		u = MIN(uaddr, reg->userspace_addr);
Justin M. Forbes 272dfe
+		s = MIN(start_addr, reg->guest_phys_addr);
Justin M. Forbes 272dfe
+		e = MAX(pmlast, prlast);
Justin M. Forbes 272dfe
+		uaddr = merged->userspace_addr = u;
Justin M. Forbes 272dfe
+		start_addr = merged->guest_phys_addr = s;
Justin M. Forbes 272dfe
+		size = merged->memory_size = e - s + 1;
Justin M. Forbes 272dfe
+		assert(merged->memory_size);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	if (!merged) {
Justin M. Forbes 272dfe
+		struct vhost_memory_region *reg = dev->mem->regions + to;
Justin M. Forbes 272dfe
+		memset(reg, 0, sizeof *reg);
Justin M. Forbes 272dfe
+		reg->memory_size = size;
Justin M. Forbes 272dfe
+		assert(reg->memory_size);
Justin M. Forbes 272dfe
+		reg->guest_phys_addr = start_addr;
Justin M. Forbes 272dfe
+		reg->userspace_addr = uaddr;
Justin M. Forbes 272dfe
+		++to;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	assert(to <= dev->mem->nregions + 1);
Justin M. Forbes 272dfe
+	dev->mem->nregions = to;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+static uint64_t vhost_get_log_size(struct vhost_dev *dev)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	uint64_t log_size = 0;
Justin M. Forbes 272dfe
+	int i;
Justin M. Forbes 272dfe
+	for (i = 0; i < dev->mem->nregions; ++i) {
Justin M. Forbes 272dfe
+		struct vhost_memory_region *reg = dev->mem->regions + i;
Justin M. Forbes 272dfe
+		uint64_t last = range_get_last(reg->guest_phys_addr,
Justin M. Forbes 272dfe
+					       reg->memory_size);
Justin M. Forbes 272dfe
+		log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	for (i = 0; i < dev->nvqs; ++i) {
Justin M. Forbes 272dfe
+		struct vhost_virtqueue *vq = dev->vqs + i;
Justin M. Forbes 272dfe
+		uint64_t last = vq->used_phys +
Justin M. Forbes 272dfe
+			sizeof(struct vring_used_elem) * vq->num - 1;
Justin M. Forbes 272dfe
+		log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	return log_size;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	vhost_log_chunk_t *log;
Justin M. Forbes 272dfe
+	int r;
Justin M. Forbes 272dfe
+	if (size) {
Justin M. Forbes 272dfe
+		log = qemu_mallocz(size * sizeof *log);
Justin M. Forbes 272dfe
+	} else {
Justin M. Forbes 272dfe
+		log = NULL;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	r = ioctl(dev->control, VHOST_SET_LOG_BASE,
Justin M. Forbes 272dfe
+		  (uint64_t)(unsigned long)log);
Justin M. Forbes 272dfe
+	assert(r >= 0);
Justin M. Forbes 272dfe
+	vhost_client_sync_dirty_bitmap(&dev->client, 0,
Justin M. Forbes 272dfe
+				       (target_phys_addr_t)~0x0ull);
Justin M. Forbes 272dfe
+	if (dev->log) {
Justin M. Forbes 272dfe
+		qemu_free(dev->log);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	dev->log = log;
Justin M. Forbes 272dfe
+	dev->log_size = size;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+static void vhost_client_set_memory(CPUPhysMemoryClient *client,
Justin M. Forbes 272dfe
+				    target_phys_addr_t start_addr,
Justin M. Forbes 272dfe
+				    ram_addr_t size,
Justin M. Forbes 272dfe
+				    ram_addr_t phys_offset)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
Justin M. Forbes 272dfe
+	ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
Justin M. Forbes 272dfe
+	int s = offsetof(struct vhost_memory, regions) +
Justin M. Forbes 272dfe
+		(dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
Justin M. Forbes 272dfe
+	uint64_t log_size;
Justin M. Forbes 272dfe
+	int r;
Justin M. Forbes 272dfe
+	dev->mem = qemu_realloc(dev->mem, s);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	assert(size);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	vhost_dev_unassign_memory(dev, start_addr, size);
Justin M. Forbes 272dfe
+	if (flags == IO_MEM_RAM) {
Justin M. Forbes 272dfe
+		/* Add given mapping, merging adjacent regions if any */
Justin M. Forbes 272dfe
+		vhost_dev_assign_memory(dev, start_addr, size,
Justin M. Forbes 272dfe
+				(uintptr_t)qemu_get_ram_ptr(phys_offset));
Justin M. Forbes 272dfe
+	} else {
Justin M. Forbes 272dfe
+		/* Remove old mapping for this memory, if any. */
Justin M. Forbes 272dfe
+		vhost_dev_unassign_memory(dev, start_addr, size);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	if (!dev->started) {
Justin M. Forbes 272dfe
+		return;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	if (!dev->log_enabled) {
Justin M. Forbes 272dfe
+		r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
Justin M. Forbes 272dfe
+		assert(r >= 0);
Justin M. Forbes 272dfe
+		return;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	log_size = vhost_get_log_size(dev);
Justin M. Forbes 272dfe
+	/* We allocate an extra 4K bytes to log,
Justin M. Forbes 272dfe
+	 * to reduce the number of reallocations. */
Justin M. Forbes 272dfe
+#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
Justin M. Forbes 272dfe
+	/* To log more, must increase log size before table update. */
Justin M. Forbes 272dfe
+	if (dev->log_size < log_size) {
Justin M. Forbes 272dfe
+		vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
Justin M. Forbes 272dfe
+	assert(r >= 0);
Justin M. Forbes 272dfe
+	/* To log less, can only decrease log size after table update. */
Justin M. Forbes 272dfe
+	if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
Justin M. Forbes 272dfe
+		vhost_dev_log_resize(dev, log_size);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+/* Send the acked features to vhost, adding VHOST_F_LOG_ALL iff enable_log. */
Justin M. Forbes 272dfe
+static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	uint64_t features = dev->acked_features;
Justin M. Forbes 272dfe
+	int r;
Justin M. Forbes 272dfe
+	if (enable_log) { /* use the requested state, not dev->log_enabled */
Justin M. Forbes 272dfe
+		features |= 0x1 << VHOST_F_LOG_ALL;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	r = ioctl(dev->control, VHOST_SET_FEATURES, &features);
Justin M. Forbes 272dfe
+	return r < 0 ? -errno : 0;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+/* Migration-log callback: switch dirty logging on/off; 0 or -errno. */
Justin M. Forbes 272dfe
+static int vhost_client_migration_log(struct CPUPhysMemoryClient *client,
Justin M. Forbes 272dfe
+				      int enable)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
Justin M. Forbes 272dfe
+	int r;
Justin M. Forbes 272dfe
+	if (!!enable == dev->log_enabled) {
Justin M. Forbes 272dfe
+		return 0;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	if (!dev->started) {
Justin M. Forbes 272dfe
+		dev->log_enabled = enable;
Justin M. Forbes 272dfe
+		return 0;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	if (!enable) {
Justin M. Forbes 272dfe
+		r = vhost_dev_set_log(dev, false);
Justin M. Forbes 272dfe
+		if (r < 0) {
Justin M. Forbes 272dfe
+			return r;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+		qemu_free(dev->log);
Justin M. Forbes 272dfe
+		dev->log = NULL;
Justin M. Forbes 272dfe
+		dev->log_size = 0;
Justin M. Forbes 272dfe
+	} else {
Justin M. Forbes 272dfe
+		vhost_dev_log_resize(dev, vhost_get_log_size(dev));
Justin M. Forbes 272dfe
+		r = vhost_dev_set_log(dev, true);
Justin M. Forbes 272dfe
+		if (r < 0) {
Justin M. Forbes 272dfe
+			return r;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	dev->log_enabled = enable;
Justin M. Forbes 272dfe
+	return 0;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
Justin M. Forbes 272dfe
+				    struct vhost_virtqueue *vq,
Justin M. Forbes 272dfe
+				    unsigned idx, bool enable_log)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	struct vhost_vring_addr addr = {
Justin M. Forbes 272dfe
+		.index = idx,
Justin M. Forbes 272dfe
+		.desc_user_addr = (u_int64_t)(unsigned long)vq->desc,
Justin M. Forbes 272dfe
+		.avail_user_addr = (u_int64_t)(unsigned long)vq->avail,
Justin M. Forbes 272dfe
+		.used_user_addr = (u_int64_t)(unsigned long)vq->used,
Justin M. Forbes 272dfe
+		.log_guest_addr = vq->used_phys,
Justin M. Forbes 272dfe
+		.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
Justin M. Forbes 272dfe
+	};
Justin M. Forbes 272dfe
+	int r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr);
Justin M. Forbes 272dfe
+	if (r < 0) {
Justin M. Forbes 272dfe
+		return -errno;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	return 0;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+static int vhost_virtqueue_init(struct vhost_dev *dev,
Justin M. Forbes 272dfe
+				struct VirtIODevice *vdev,
Justin M. Forbes 272dfe
+				struct vhost_virtqueue *vq,
Justin M. Forbes 272dfe
+				unsigned idx)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	target_phys_addr_t s, l, a;
Justin M. Forbes 272dfe
+	int r;
Justin M. Forbes 272dfe
+	struct vhost_vring_file file = {
Justin M. Forbes 272dfe
+		.index = idx,
Justin M. Forbes 272dfe
+	};
Justin M. Forbes 272dfe
+	struct vhost_vring_state state = {
Justin M. Forbes 272dfe
+		.index = idx,
Justin M. Forbes 272dfe
+	};
Justin M. Forbes 272dfe
+	struct VirtQueue *q = virtio_queue(vdev, idx);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	vq->num = state.num = virtio_queue_get_num(vdev, idx);
Justin M. Forbes 272dfe
+	r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state);
Justin M. Forbes 272dfe
+	if (r) {
Justin M. Forbes 272dfe
+		return -errno;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	state.num = virtio_queue_last_avail_idx(vdev, idx);
Justin M. Forbes 272dfe
+	r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state);
Justin M. Forbes 272dfe
+	if (r) {
Justin M. Forbes 272dfe
+		return -errno;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	s = l = sizeof(struct vring_desc) * vq->num;
Justin M. Forbes 272dfe
+	a = virtio_queue_get_desc(vdev, idx);
Justin M. Forbes 272dfe
+	vq->desc = cpu_physical_memory_map(a, &l, 0);
Justin M. Forbes 272dfe
+	if (!vq->desc || l != s) {
Justin M. Forbes 272dfe
+		r = -ENOMEM;
Justin M. Forbes 272dfe
+		goto fail_alloc;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	s = l = offsetof(struct vring_avail, ring) +
Justin M. Forbes 272dfe
+		sizeof(u_int64_t) * vq->num;
Justin M. Forbes 272dfe
+	a = virtio_queue_get_avail(vdev, idx);
Justin M. Forbes 272dfe
+	vq->avail = cpu_physical_memory_map(a, &l, 0);
Justin M. Forbes 272dfe
+	if (!vq->avail || l != s) {
Justin M. Forbes 272dfe
+		r = -ENOMEM;
Justin M. Forbes 272dfe
+		goto fail_alloc;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	s = l = offsetof(struct vring_used, ring) +
Justin M. Forbes 272dfe
+		sizeof(struct vring_used_elem) * vq->num;
Justin M. Forbes 272dfe
+	vq->used_phys = a = virtio_queue_get_used(vdev, idx);
Justin M. Forbes 272dfe
+	vq->used = cpu_physical_memory_map(a, &l, 1);
Justin M. Forbes 272dfe
+	if (!vq->used || l != s) {
Justin M. Forbes 272dfe
+		r = -ENOMEM;
Justin M. Forbes 272dfe
+		goto fail_alloc;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled);
Justin M. Forbes 272dfe
+	if (r < 0) {
Justin M. Forbes 272dfe
+		r = -errno;
Justin M. Forbes 272dfe
+		goto fail_alloc;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	if (!vdev->binding->guest_notifier || !vdev->binding->host_notifier) {
Justin M. Forbes 272dfe
+		fprintf(stderr, "binding does not support irqfd/queuefd\n");
Justin M. Forbes 272dfe
+		r = -ENOSYS;
Justin M. Forbes 272dfe
+		goto fail_alloc;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+        r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, true);
Justin M. Forbes 272dfe
+	if (r < 0) {
Justin M. Forbes 272dfe
+		fprintf(stderr, "Error binding guest notifier: %d\n", -r);
Justin M. Forbes 272dfe
+		goto fail_guest_notifier;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+        r = vdev->binding->host_notifier(vdev->binding_opaque, idx, true);
Justin M. Forbes 272dfe
+	if (r < 0) {
Justin M. Forbes 272dfe
+		fprintf(stderr, "Error binding host notifier: %d\n", -r);
Justin M. Forbes 272dfe
+		goto fail_host_notifier;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	file.fd = event_notifier_get_fd(virtio_queue_host_notifier(q));
Justin M. Forbes 272dfe
+	r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
Justin M. Forbes 272dfe
+	if (r) {
Justin M. Forbes 272dfe
+		goto fail_kick;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	file.fd = event_notifier_get_fd(virtio_queue_guest_notifier(q));
Justin M. Forbes 272dfe
+	r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file);
Justin M. Forbes 272dfe
+	if (r) {
Justin M. Forbes 272dfe
+		goto fail_call;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	return 0;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+fail_call:
Justin M. Forbes 272dfe
+fail_kick:
Justin M. Forbes 272dfe
+        vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
Justin M. Forbes 272dfe
+fail_host_notifier:
Justin M. Forbes 272dfe
+        vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
Justin M. Forbes 272dfe
+fail_guest_notifier:
Justin M. Forbes 272dfe
+fail_alloc:
Justin M. Forbes 272dfe
+	return r;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
Justin M. Forbes 272dfe
+				    struct VirtIODevice *vdev,
Justin M. Forbes 272dfe
+				    struct vhost_virtqueue *vq,
Justin M. Forbes 272dfe
+				    unsigned idx)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	struct vhost_vring_state state = {
Justin M. Forbes 272dfe
+		.index = idx,
Justin M. Forbes 272dfe
+	};
Justin M. Forbes 272dfe
+	int r;
Justin M. Forbes 272dfe
+	r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
Justin M. Forbes 272dfe
+	if (r < 0) {
Justin M. Forbes 272dfe
+		fprintf(stderr, "vhost VQ %d guest cleanup failed: %d\n", idx, r);
Justin M. Forbes 272dfe
+		fflush(stderr);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	assert (r >= 0);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	r = vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
Justin M. Forbes 272dfe
+	if (r < 0) {
Justin M. Forbes 272dfe
+		fprintf(stderr, "vhost VQ %d host cleanup failed: %d\n", idx, r);
Justin M. Forbes 272dfe
+		fflush(stderr);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	assert (r >= 0);
Justin M. Forbes 272dfe
+	r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state);
Justin M. Forbes 272dfe
+	if (r < 0) {
Justin M. Forbes 272dfe
+		fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
Justin M. Forbes 272dfe
+		fflush(stderr);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	virtio_queue_set_last_avail_idx(vdev, idx, state.num);
Justin M. Forbes 272dfe
+	assert (r >= 0);
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+int vhost_dev_init(struct vhost_dev *hdev, int devfd)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	uint64_t features;
Justin M. Forbes 272dfe
+	int r;
Justin M. Forbes 272dfe
+	if (devfd >= 0) {
Justin M. Forbes 272dfe
+		hdev->control = devfd;
Justin M. Forbes 272dfe
+	} else {
Justin M. Forbes 272dfe
+		hdev->control = open("/dev/vhost-net", O_RDWR);
Justin M. Forbes 272dfe
+		if (hdev->control < 0)
Justin M. Forbes 272dfe
+			return -errno;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	r = ioctl(hdev->control, VHOST_SET_OWNER, NULL);
Justin M. Forbes 272dfe
+	if (r < 0)
Justin M. Forbes 272dfe
+		goto fail;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	r = ioctl(hdev->control, VHOST_GET_FEATURES, &features);
Justin M. Forbes 272dfe
+	if (r < 0)
Justin M. Forbes 272dfe
+		goto fail;
Justin M. Forbes 272dfe
+	hdev->features = features;
Justin M. Forbes 272dfe
+	
Justin M. Forbes 272dfe
+	hdev->client.set_memory = vhost_client_set_memory;
Justin M. Forbes 272dfe
+	hdev->client.sync_dirty_bitmap = vhost_client_sync_dirty_bitmap;
Justin M. Forbes 272dfe
+	hdev->client.migration_log = vhost_client_migration_log;
Justin M. Forbes 272dfe
+	hdev->mem = qemu_mallocz(offsetof(struct vhost_memory, regions));
Justin M. Forbes 272dfe
+	hdev->log = NULL;
Justin M. Forbes 272dfe
+	hdev->log_size = 0;
Justin M. Forbes 272dfe
+	hdev->log_enabled = false;
Justin M. Forbes 272dfe
+	hdev->started = false;
Justin M. Forbes 272dfe
+	cpu_register_phys_memory_client(&hdev->client);
Justin M. Forbes 272dfe
+	return 0;
Justin M. Forbes 272dfe
+fail:
Justin M. Forbes 272dfe
+	r = -errno;
Justin M. Forbes 272dfe
+	close(hdev->control);
Justin M. Forbes 272dfe
+	return r;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+void vhost_dev_cleanup(struct vhost_dev *hdev)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	cpu_unregister_phys_memory_client(&hdev->client);
Justin M. Forbes 272dfe
+	qemu_free(hdev->mem);
Justin M. Forbes 272dfe
+	close(hdev->control);
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	int i, r;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	r = vhost_dev_set_log(hdev, hdev->log_enabled);
Justin M. Forbes 272dfe
+	if (r < 0)
Justin M. Forbes 272dfe
+		goto fail;
Justin M. Forbes 272dfe
+	r = ioctl(hdev->control, VHOST_SET_MEM_TABLE, hdev->mem);
Justin M. Forbes 272dfe
+	if (r < 0) {
Justin M. Forbes 272dfe
+		r = -errno;
Justin M. Forbes 272dfe
+		goto fail;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	if (hdev->log_enabled) {
Justin M. Forbes 272dfe
+		hdev->log_size = vhost_get_log_size(hdev);
Justin M. Forbes 272dfe
+		hdev->log = hdev->log_size ?
Justin M. Forbes 272dfe
+			qemu_mallocz(hdev->log_size * sizeof *hdev->log) : NULL;
Justin M. Forbes 272dfe
+		r = ioctl(hdev->control, VHOST_SET_LOG_BASE,
Justin M. Forbes 272dfe
+			  (uint64_t)(unsigned long)hdev->log);
Justin M. Forbes 272dfe
+		if (r < 0) {
Justin M. Forbes 272dfe
+			r = -errno;
Justin M. Forbes 272dfe
+			goto fail;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	for (i = 0; i < hdev->nvqs; ++i) {
Justin M. Forbes 272dfe
+		r = vhost_virtqueue_init(hdev,
Justin M. Forbes 272dfe
+		   			 vdev,
Justin M. Forbes 272dfe
+					 hdev->vqs + i,
Justin M. Forbes 272dfe
+					 i);
Justin M. Forbes 272dfe
+		if (r < 0)
Justin M. Forbes 272dfe
+			goto fail_vq;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	hdev->started = true;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	return 0;
Justin M. Forbes 272dfe
+fail_vq:
Justin M. Forbes 272dfe
+	while (--i >= 0) {
Justin M. Forbes 272dfe
+		vhost_virtqueue_cleanup(hdev,
Justin M. Forbes 272dfe
+					vdev,
Justin M. Forbes 272dfe
+					hdev->vqs + i,
Justin M. Forbes 272dfe
+					i);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+fail:
Justin M. Forbes 272dfe
+	return r;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+/* Stop the device: clean up each virtqueue, sync the dirty log, free the log. */
Justin M. Forbes 272dfe
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	int i;
Justin M. Forbes 272dfe
+	for (i = 0; i < hdev->nvqs; ++i) {
Justin M. Forbes 272dfe
+		vhost_virtqueue_cleanup(hdev, vdev, hdev->vqs + i, i);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	vhost_client_sync_dirty_bitmap(&hdev->client, 0,
Justin M. Forbes 272dfe
+				       (target_phys_addr_t)~0x0ull);
Justin M. Forbes 272dfe
+	hdev->started = false;
Justin M. Forbes 272dfe
+	qemu_free(hdev->log);
Justin M. Forbes 272dfe
+	/* Clear the pointer: vhost_dev_log_resize() frees dev->log if non-NULL. */
Justin M. Forbes 272dfe
+	hdev->log = NULL;
Justin M. Forbes 272dfe
+	hdev->log_size = 0;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
diff --git a/hw/vhost.h b/hw/vhost.h
Justin M. Forbes 272dfe
new file mode 100644
Justin M. Forbes 272dfe
index 0000000..2ed3933
Justin M. Forbes 272dfe
--- /dev/null
Justin M. Forbes 272dfe
+++ b/hw/vhost.h
Justin M. Forbes 272dfe
@@ -0,0 +1,44 @@
Justin M. Forbes 272dfe
+#ifndef VHOST_H
Justin M. Forbes 272dfe
+#define VHOST_H
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+#include "hw/hw.h"
Justin M. Forbes 272dfe
+#include "hw/virtio.h"
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+/* Generic structures common for any vhost based device. */
Justin M. Forbes 272dfe
+struct vhost_virtqueue {
Justin M. Forbes 272dfe
+	int kick;
Justin M. Forbes 272dfe
+	int call;
Justin M. Forbes 272dfe
+	void *desc;
Justin M. Forbes 272dfe
+	void *avail;
Justin M. Forbes 272dfe
+	void *used;
Justin M. Forbes 272dfe
+	int num;
Justin M. Forbes 272dfe
+	unsigned long long used_phys;
Justin M. Forbes 272dfe
+};
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+typedef unsigned long vhost_log_chunk_t;
Justin M. Forbes 272dfe
+#define VHOST_LOG_PAGE 0x1000
Justin M. Forbes 272dfe
+#define VHOST_LOG_BITS (8 * sizeof(vhost_log_chunk_t))
Justin M. Forbes 272dfe
+#define VHOST_LOG_CHUNK (VHOST_LOG_PAGE * VHOST_LOG_BITS)
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+struct vhost_memory;
Justin M. Forbes 272dfe
+struct vhost_dev {
Justin M. Forbes 272dfe
+	CPUPhysMemoryClient client;
Justin M. Forbes 272dfe
+	int control;
Justin M. Forbes 272dfe
+	struct vhost_memory *mem;
Justin M. Forbes 272dfe
+	struct vhost_virtqueue *vqs;
Justin M. Forbes 272dfe
+	int nvqs;
Justin M. Forbes 272dfe
+	unsigned long long features;
Justin M. Forbes 272dfe
+	unsigned long long acked_features;
Justin M. Forbes 272dfe
+	unsigned long long backend_features;
Justin M. Forbes 272dfe
+	bool started;
Justin M. Forbes 272dfe
+	bool log_enabled;
Justin M. Forbes 272dfe
+	vhost_log_chunk_t *log;
Justin M. Forbes 272dfe
+	unsigned long long log_size;
Justin M. Forbes 272dfe
+};
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+int vhost_dev_init(struct vhost_dev *hdev, int devfd);
Justin M. Forbes 272dfe
+void vhost_dev_cleanup(struct vhost_dev *hdev);
Justin M. Forbes 272dfe
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
Justin M. Forbes 272dfe
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+#endif
Justin M. Forbes 272dfe
diff --git a/hw/vhost_net.c b/hw/vhost_net.c
Justin M. Forbes 272dfe
new file mode 100644
Justin M. Forbes 272dfe
index 0000000..c89ff40
Justin M. Forbes 272dfe
--- /dev/null
Justin M. Forbes 272dfe
+++ b/hw/vhost_net.c
Justin M. Forbes 272dfe
@@ -0,0 +1,147 @@
Justin M. Forbes 272dfe
+#include <sys/eventfd.h>
Justin M. Forbes 272dfe
+#include <sys/socket.h>
Justin M. Forbes 272dfe
+#include <linux/kvm.h>
Justin M. Forbes 272dfe
+#include <fcntl.h>
Justin M. Forbes 272dfe
+#include <sys/ioctl.h>
Justin M. Forbes 272dfe
+#include <linux/vhost.h>
Justin M. Forbes 272dfe
+#include <linux/virtio_ring.h>
Justin M. Forbes 272dfe
+#include <netpacket/packet.h>
Justin M. Forbes 272dfe
+#include <net/ethernet.h>
Justin M. Forbes 272dfe
+#include <net/if.h>
Justin M. Forbes 272dfe
+#include <netinet/in.h>
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+#include <stdio.h>
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+#include "net.h"
Justin M. Forbes 272dfe
+#include "net/tap.h"
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+#include "virtio-net.h"
Justin M. Forbes 272dfe
+#include "vhost.h"
Justin M. Forbes 272dfe
+#include "vhost_net.h"
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+struct vhost_net {
Justin M. Forbes 272dfe
+	struct vhost_dev dev;
Justin M. Forbes 272dfe
+	struct vhost_virtqueue vqs[2];
Justin M. Forbes 272dfe
+	int backend;
Justin M. Forbes 272dfe
+	VLANClientState *vc;
Justin M. Forbes 272dfe
+};
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	/* Clear features not supported by host kernel. */
Justin M. Forbes 272dfe
+	if (!(net->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)))
Justin M. Forbes 272dfe
+		features &= ~(1 << VIRTIO_F_NOTIFY_ON_EMPTY);
Justin M. Forbes 272dfe
+	if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC)))
Justin M. Forbes 272dfe
+		features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
Justin M. Forbes 272dfe
+	if (!(net->dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
Justin M. Forbes 272dfe
+		features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
Justin M. Forbes 272dfe
+	return features;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+void vhost_net_ack_features(struct vhost_net *net, unsigned features)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	net->dev.acked_features = net->dev.backend_features;
Justin M. Forbes 272dfe
+	if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))
Justin M. Forbes 272dfe
+		net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY);
Justin M. Forbes 272dfe
+	if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC))
Justin M. Forbes 272dfe
+		net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+static int vhost_net_get_fd(VLANClientState *backend)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	switch (backend->info->type) {
Justin M. Forbes 272dfe
+	case NET_CLIENT_TYPE_TAP:
Justin M. Forbes 272dfe
+		return tap_get_fd(backend);
Justin M. Forbes 272dfe
+	default:
Justin M. Forbes 272dfe
+		fprintf(stderr, "vhost-net requires tap backend\n");
Justin M. Forbes 272dfe
+		return -EBADFD;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	int r;
Justin M. Forbes 272dfe
+	struct vhost_net *net = qemu_malloc(sizeof *net);
Justin M. Forbes 272dfe
+	if (!backend) {
Justin M. Forbes 272dfe
+		fprintf(stderr, "vhost-net requires backend to be setup\n");
Justin M. Forbes 272dfe
+		goto fail;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	r = vhost_net_get_fd(backend);
Justin M. Forbes 272dfe
+	if (r < 0)
Justin M. Forbes 272dfe
+		goto fail;
Justin M. Forbes 272dfe
+	net->vc = backend;
Justin M. Forbes 272dfe
+	net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 :
Justin M. Forbes 272dfe
+		(1 << VHOST_NET_F_VIRTIO_NET_HDR);
Justin M. Forbes 272dfe
+	net->backend = r;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	r = vhost_dev_init(&net->dev, devfd);
Justin M. Forbes 272dfe
+	if (r < 0)
Justin M. Forbes 272dfe
+		goto fail;
Justin M. Forbes 272dfe
+	if (~net->dev.features & net->dev.backend_features) {
Justin M. Forbes 272dfe
+		fprintf(stderr, "vhost lacks feature mask %llu for backend\n",
Justin M. Forbes 272dfe
+			~net->dev.features & net->dev.backend_features);
Justin M. Forbes 272dfe
+		vhost_dev_cleanup(&net->dev);
Justin M. Forbes 272dfe
+		goto fail;
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	/* Set sane init value. Override when guest acks. */
Justin M. Forbes 272dfe
+	vhost_net_ack_features(net, 0);
Justin M. Forbes 272dfe
+	return net;
Justin M. Forbes 272dfe
+fail:
Justin M. Forbes 272dfe
+	qemu_free(net);
Justin M. Forbes 272dfe
+	return NULL;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+int vhost_net_start(struct vhost_net *net,
Justin M. Forbes 272dfe
+		    VirtIODevice *dev)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	struct vhost_vring_file file = { };
Justin M. Forbes 272dfe
+	int r;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	net->dev.nvqs = 2;
Justin M. Forbes 272dfe
+	net->dev.vqs = net->vqs;
Justin M. Forbes 272dfe
+	r = vhost_dev_start(&net->dev, dev);
Justin M. Forbes 272dfe
+	if (r < 0)
Justin M. Forbes 272dfe
+		return r;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	net->vc->info->poll(net->vc, false);
Justin M. Forbes 272dfe
+	qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
Justin M. Forbes 272dfe
+	file.fd = net->backend;
Justin M. Forbes 272dfe
+	for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
Justin M. Forbes 272dfe
+		r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
Justin M. Forbes 272dfe
+		if (r < 0) {
Justin M. Forbes 272dfe
+			r = -errno;
Justin M. Forbes 272dfe
+			goto fail;
Justin M. Forbes 272dfe
+		}
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	return 0;
Justin M. Forbes 272dfe
+fail:
Justin M. Forbes 272dfe
+	file.fd = -1;
Justin M. Forbes 272dfe
+	while (--file.index >= 0) {
Justin M. Forbes 272dfe
+		int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
Justin M. Forbes 272dfe
+		assert(r >= 0);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	net->vc->info->poll(net->vc, true);
Justin M. Forbes 272dfe
+	vhost_dev_stop(&net->dev, dev);
Justin M. Forbes 272dfe
+	return r;
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+void vhost_net_stop(struct vhost_net *net,
Justin M. Forbes 272dfe
+		    VirtIODevice *dev)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	struct vhost_vring_file file = { .fd = -1 };
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+	for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
Justin M. Forbes 272dfe
+		int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
Justin M. Forbes 272dfe
+		assert(r >= 0);
Justin M. Forbes 272dfe
+	}
Justin M. Forbes 272dfe
+	net->vc->info->poll(net->vc, true);
Justin M. Forbes 272dfe
+	vhost_dev_stop(&net->dev, dev);
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+void vhost_net_cleanup(struct vhost_net *net)
Justin M. Forbes 272dfe
+{
Justin M. Forbes 272dfe
+	vhost_dev_cleanup(&net->dev);
Justin M. Forbes 272dfe
+	qemu_free(net);
Justin M. Forbes 272dfe
+}
Justin M. Forbes 272dfe
+/* TODO: log */
Justin M. Forbes 272dfe
diff --git a/hw/vhost_net.h b/hw/vhost_net.h
Justin M. Forbes 272dfe
new file mode 100644
Justin M. Forbes 272dfe
index 0000000..21f0277
Justin M. Forbes 272dfe
--- /dev/null
Justin M. Forbes 272dfe
+++ b/hw/vhost_net.h
Justin M. Forbes 272dfe
@@ -0,0 +1,20 @@
Justin M. Forbes 272dfe
+#ifndef VHOST_NET_H
Justin M. Forbes 272dfe
+#define VHOST_NET_H
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+#include "net.h"
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+struct vhost_net;
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+int vhost_net_start(struct vhost_net *net,
Justin M. Forbes 272dfe
+		    VirtIODevice *dev);
Justin M. Forbes 272dfe
+void vhost_net_stop(struct vhost_net *net,
Justin M. Forbes 272dfe
+		    VirtIODevice *dev);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+void vhost_net_cleanup(struct vhost_net *net);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+unsigned vhost_net_get_features(struct vhost_net *net, unsigned features);
Justin M. Forbes 272dfe
+void vhost_net_ack_features(struct vhost_net *net, unsigned features);
Justin M. Forbes 272dfe
+
Justin M. Forbes 272dfe
+#endif
Justin M. Forbes 272dfe
-- 
Justin M. Forbes 272dfe
1.6.6.144.g5c3af