Blame SOURCES/0001-vhost-improve-dirty-pages-logging-performance.patch

a6040a
From 4d8b1e6aa5d7ecfc1d2ee606b4bd838b4f1ac9d2 Mon Sep 17 00:00:00 2001
a6040a
From: Maxime Coquelin <maxime.coquelin@redhat.com>
a6040a
Date: Thu, 17 May 2018 13:44:47 +0200
a6040a
Subject: [PATCH] vhost: improve dirty pages logging performance
a6040a
a6040a
[ upstream commit c16915b8710911a75f0fbdb1aa5243f4cdfaf26a ]
a6040a
a6040a
This patch caches all dirty pages logging until the used ring index
a6040a
is updated.
a6040a
a6040a
The goal of this optimization is to fix a performance regression
a6040a
introduced when the vhost library started to use atomic operations
a6040a
to set bits in the shared dirty log map. While the fix was valid
a6040a
as the previous implementation wasn't safe against concurrent accesses,
a6040a
contention was induced.
a6040a
a6040a
With this patch, during migration, we have:
a6040a
1. Fewer atomic operations, as only a single atomic OR operation
a6040a
is needed per 32 or 64 (depending on CPU) pages.
a6040a
2. Fewer atomic operations, as during a burst the same page will
a6040a
be marked dirty only once.
a6040a
3. Fewer write memory barriers.
a6040a
a6040a
Fixes: 897f13a1f726 ("vhost: make page logging atomic")
a6040a
Cc: stable@dpdk.org
a6040a
a6040a
Suggested-by: Michael S. Tsirkin <mst@redhat.com>
a6040a
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
a6040a
Reviewed-by: Tiwei Bie <tiwei.bie@intel.com>
a6040a
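---
Backport note: within this patch vq->log_cache_nb_elem is only read and
reset to 0; vhost_log_cache_page() never increments it after filling a new
log_cache[] entry. A follow-up fix adding that increment is needed for
vhost_log_cache_sync() to flush the cached dirty pages.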
a6040a
 lib/librte_vhost/vhost.h      | 119 +++++++++++++++++++++++++++++++++++++++++-
a6040a
 lib/librte_vhost/virtio_net.c |  29 ++++++----
a6040a
 2 files changed, 137 insertions(+), 11 deletions(-)
a6040a
a6040a
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
a6040a
index 16d6b8913..42c6a3a75 100644
a6040a
--- a/lib/librte_vhost/vhost.h
a6040a
+++ b/lib/librte_vhost/vhost.h
a6040a
@@ -59,6 +59,8 @@
a6040a
 
a6040a
 #define BUF_VECTOR_MAX 256
a6040a
 
a6040a
+#define VHOST_LOG_CACHE_NR 32
a6040a
+
a6040a
 /**
a6040a
  * Structure contains buffer address, length and descriptor index
a6040a
  * from vring to do scatter RX.
a6040a
@@ -92,6 +94,14 @@ struct batch_copy_elem {
a6040a
 	uint64_t log_addr;
a6040a
 };
a6040a
 
a6040a
+/*
a6040a
+ * Structure that contains the info for batched dirty logging.
a6040a
+ */
a6040a
+struct log_cache_entry {
a6040a
+	uint32_t offset;
a6040a
+	unsigned long val;
a6040a
+};
a6040a
+
a6040a
 /**
a6040a
  * Structure contains variables relevant to RX/TX virtqueues.
a6040a
  */
a6040a
@@ -133,6 +143,9 @@ struct vhost_virtqueue {
a6040a
 	struct batch_copy_elem	*batch_copy_elems;
a6040a
 	uint16_t		batch_copy_nb_elems;
a6040a
 
a6040a
+	struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
a6040a
+	uint16_t log_cache_nb_elem;
a6040a
+
a6040a
 	rte_rwlock_t	iotlb_lock;
a6040a
 	rte_rwlock_t	iotlb_pending_lock;
a6040a
 	struct rte_mempool *iotlb_pool;
a6040a
@@ -266,7 +279,15 @@ struct virtio_net {
a6040a
 static __rte_always_inline void
a6040a
 vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
a6040a
 {
a6040a
-	__sync_fetch_and_or_8(addr, (1U << nr));
a6040a
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
a6040a
+	/*
a6040a
+	 * __sync_ built-ins are deprecated, but __atomic_ ones
a6040a
+	 * are sub-optimized in older GCC versions.
a6040a
+	 */
a6040a
+	__sync_fetch_and_or_1(addr, (1U << nr));
a6040a
+#else
a6040a
+	__atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
a6040a
+#endif
a6040a
 }
a6040a
 
a6040a
 static __rte_always_inline void
a6040a
@@ -297,6 +318,102 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
a6040a
 	}
a6040a
 }
a6040a
 
a6040a
+static __rte_always_inline void
a6040a
+vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
a6040a
+{
a6040a
+	unsigned long *log_base;
a6040a
+	int i;
a6040a
+
a6040a
+	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
a6040a
+		   !dev->log_base))
a6040a
+		return;
a6040a
+
a6040a
+	log_base = (unsigned long *)(uintptr_t)dev->log_base;
a6040a
+
a6040a
+	/*
a6040a
+	 * It is expected a write memory barrier has been issued
a6040a
+	 * before this function is called.
a6040a
+	 */
a6040a
+
a6040a
+	for (i = 0; i < vq->log_cache_nb_elem; i++) {
a6040a
+		struct log_cache_entry *elem = vq->log_cache + i;
a6040a
+
a6040a
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
a6040a
+		/*
a6040a
+		 * '__sync' builtins are deprecated, but '__atomic' ones
a6040a
+		 * are sub-optimized in older GCC versions.
a6040a
+		 */
a6040a
+		__sync_fetch_and_or(log_base + elem->offset, elem->val);
a6040a
+#else
a6040a
+		__atomic_fetch_or(log_base + elem->offset, elem->val,
a6040a
+				__ATOMIC_RELAXED);
a6040a
+#endif
a6040a
+	}
a6040a
+
a6040a
+	rte_smp_wmb();
a6040a
+
a6040a
+	vq->log_cache_nb_elem = 0;
a6040a
+}
a6040a
+
a6040a
+static __rte_always_inline void
a6040a
+vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
+			uint64_t page)
a6040a
+{
a6040a
+	uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
a6040a
+	uint32_t offset = page / (sizeof(unsigned long) << 3);
a6040a
+	int i;
a6040a
+
a6040a
+	for (i = 0; i < vq->log_cache_nb_elem; i++) {
a6040a
+		struct log_cache_entry *elem = vq->log_cache + i;
a6040a
+
a6040a
+		if (elem->offset == offset) {
a6040a
+			elem->val |= (1UL << bit_nr);
a6040a
+			return;
a6040a
+		}
a6040a
+	}
a6040a
+
a6040a
+	if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
a6040a
+		/*
a6040a
+		 * No more room for a new log cache entry,
a6040a
+		 * so write the dirty log map directly.
a6040a
+		 */
a6040a
+		rte_smp_wmb();
a6040a
+		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
a6040a
+
a6040a
+		return;
a6040a
+	}
a6040a
+
a6040a
+	vq->log_cache[i].offset = offset;
a6040a
+	vq->log_cache[i].val = (1UL << bit_nr);
a6040a
+}
a6040a
+
a6040a
+static __rte_always_inline void
a6040a
+vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
+			uint64_t addr, uint64_t len)
a6040a
+{
a6040a
+	uint64_t page;
a6040a
+
a6040a
+	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
a6040a
+		   !dev->log_base || !len))
a6040a
+		return;
a6040a
+
a6040a
+	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
a6040a
+		return;
a6040a
+
a6040a
+	page = addr / VHOST_LOG_PAGE;
a6040a
+	while (page * VHOST_LOG_PAGE < addr + len) {
a6040a
+		vhost_log_cache_page(dev, vq, page);
a6040a
+		page += 1;
a6040a
+	}
a6040a
+}
a6040a
+
a6040a
+static __rte_always_inline void
a6040a
+vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
+			uint64_t offset, uint64_t len)
a6040a
+{
a6040a
+	vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
a6040a
+}
a6040a
+
a6040a
 static __rte_always_inline void
a6040a
 vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
 		     uint64_t offset, uint64_t len)
a6040a
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
a6040a
index a013c07b0..5f8763d3a 100644
a6040a
--- a/lib/librte_vhost/virtio_net.c
a6040a
+++ b/lib/librte_vhost/virtio_net.c
a6040a
@@ -107,7 +107,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
 	rte_memcpy(&vq->used->ring[to],
a6040a
 			&vq->shadow_used_ring[from],
a6040a
 			size * sizeof(struct vring_used_elem));
a6040a
-	vhost_log_used_vring(dev, vq,
a6040a
+	vhost_log_cache_used_vring(dev, vq,
a6040a
 			offsetof(struct vring_used, ring[to]),
a6040a
 			size * sizeof(struct vring_used_elem));
a6040a
 }
a6040a
@@ -135,6 +135,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
a6040a
 
a6040a
 	rte_smp_wmb();
a6040a
 
a6040a
+	vhost_log_cache_sync(dev, vq);
a6040a
+
a6040a
 	*(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
a6040a
 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
a6040a
 		sizeof(vq->used->idx));
a6040a
@@ -159,7 +161,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
a6040a
 
a6040a
 	for (i = 0; i < count; i++) {
a6040a
 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
a6040a
-		vhost_log_write(dev, elem[i].log_addr, elem[i].len);
a6040a
+		vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len);
a6040a
 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
a6040a
 	}
a6040a
 }
a6040a
@@ -275,7 +277,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
 		virtio_enqueue_offload(m,
a6040a
 				(struct virtio_net_hdr *)(uintptr_t)desc_addr);
a6040a
 		PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
a6040a
-		vhost_log_write(dev, desc_gaddr, dev->vhost_hlen);
a6040a
+		vhost_log_cache_write(dev, vq, desc_gaddr, dev->vhost_hlen);
a6040a
 	} else {
a6040a
 		struct virtio_net_hdr vnet_hdr;
a6040a
 		uint64_t remain = dev->vhost_hlen;
a6040a
@@ -298,7 +300,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
 					(void *)(uintptr_t)src, len);
a6040a
 
a6040a
 			PRINT_PACKET(dev, (uintptr_t)dst, len, 0);
a6040a
-			vhost_log_write(dev, guest_addr, len);
a6040a
+			vhost_log_cache_write(dev, vq, guest_addr, len);
a6040a
 			remain -= len;
a6040a
 			guest_addr += len;
a6040a
 			dst += len;
a6040a
@@ -379,7 +381,8 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
 							desc_offset)),
a6040a
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
a6040a
 				cpy_len);
a6040a
-			vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
a6040a
+			vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
a6040a
+					cpy_len);
a6040a
 			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
a6040a
 				     cpy_len, 0);
a6040a
 		} else {
a6040a
@@ -468,7 +471,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
a6040a
 		vq->used->ring[used_idx].id = desc_indexes[i];
a6040a
 		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
a6040a
 					       dev->vhost_hlen;
a6040a
-		vhost_log_used_vring(dev, vq,
a6040a
+		vhost_log_cache_used_vring(dev, vq,
a6040a
 			offsetof(struct vring_used, ring[used_idx]),
a6040a
 			sizeof(vq->used->ring[used_idx]));
a6040a
 	}
a6040a
@@ -528,6 +531,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
a6040a
 
a6040a
 	rte_smp_wmb();
a6040a
 
a6040a
+	vhost_log_cache_sync(dev, vq);
a6040a
+
a6040a
 	*(volatile uint16_t *)&vq->used->idx += count;
a6040a
 	vq->last_used_idx += count;
a6040a
 	vhost_log_used_vring(dev, vq,
a6040a
@@ -797,7 +802,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
 
a6040a
 					PRINT_PACKET(dev, (uintptr_t)dst,
a6040a
 							len, 0);
a6040a
-					vhost_log_write(dev, guest_addr, len);
a6040a
+					vhost_log_cache_write(dev, vq,
a6040a
+							guest_addr, len);
a6040a
 
a6040a
 					remain -= len;
a6040a
 					guest_addr += len;
a6040a
@@ -806,7 +812,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
 			} else {
a6040a
 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
a6040a
 						dev->vhost_hlen, 0);
a6040a
-				vhost_log_write(dev, hdr_phys_addr,
a6040a
+				vhost_log_cache_write(dev, vq, hdr_phys_addr,
a6040a
 						dev->vhost_hlen);
a6040a
 			}
a6040a
 
a6040a
@@ -820,7 +826,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
 							desc_offset)),
a6040a
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
a6040a
 				cpy_len);
a6040a
-			vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
a6040a
+			vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
a6040a
+					cpy_len);
a6040a
 			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
a6040a
 				cpy_len, 0);
a6040a
 		} else {
a6040a
@@ -1384,7 +1391,7 @@ update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
 {
a6040a
 	vq->used->ring[used_idx].id  = desc_idx;
a6040a
 	vq->used->ring[used_idx].len = 0;
a6040a
-	vhost_log_used_vring(dev, vq,
a6040a
+	vhost_log_cache_used_vring(dev, vq,
a6040a
 			offsetof(struct vring_used, ring[used_idx]),
a6040a
 			sizeof(vq->used->ring[used_idx]));
a6040a
 }
a6040a
@@ -1399,6 +1406,8 @@ update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
a6040a
 	rte_smp_wmb();
a6040a
 	rte_smp_rmb();
a6040a
 
a6040a
+	vhost_log_cache_sync(dev, vq);
a6040a
+
a6040a
 	vq->used->idx += count;
a6040a
 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
a6040a
 			sizeof(vq->used->idx));
a6040a
-- 
a6040a
2.14.3
a6040a