Blame SOURCES/0011-netdrv-net-mlx5e-XDP-Close-TX-MPWQE-session-when-no-.patch

From eee2fd0e4f3d4d9f833a2eec6169c8c46c9388c2 Mon Sep 17 00:00:00 2001
From: Alaa Hleihel <ahleihel@redhat.com>
Date: Sun, 10 May 2020 14:51:31 -0400
Subject: [PATCH 011/312] [netdrv] net/mlx5e: XDP, Close TX MPWQE session when
 no room for inline packet left

Message-id: <20200510145245.10054-9-ahleihel@redhat.com>
Patchwork-id: 306548
Patchwork-instance: patchwork
O-Subject: [RHEL8.3 BZ 1789378 v2 08/82] net/mlx5e: XDP, Close TX MPWQE session when no room for inline packet left
Bugzilla: 1789378
RH-Acked-by: Kamal Heib <kheib@redhat.com>
RH-Acked-by: Jarod Wilson <jarod@redhat.com>
RH-Acked-by: Tony Camuso <tcamuso@redhat.com>
RH-Acked-by: Jonathan Toppins <jtoppins@redhat.com>

Bugzilla: http://bugzilla.redhat.com/1789378
Upstream: v5.4-rc1

commit 6c085a8aab5183d8658c9a692bcfda3e24195b7a
Author: Shay Agroskin <shayag@mellanox.com>
Date:   Sun May 12 18:28:27 2019 +0300

    net/mlx5e: XDP, Close TX MPWQE session when no room for inline packet left

    In MPWQE mode, when transmitting packets with XDP, a packet that is smaller
    than a certain size (set to 256 bytes) would be sent inline within its WQE
    TX descriptor (mem-copied), in case the hardware tx queue is congested
    beyond a pre-defined water-mark.

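    [Editor's illustration; not part of the upstream commit message.] A minimal
    sketch of the per-packet decision described above, using the names this
    patch introduces in en/xdp.h: the 256-byte threshold is
    MLX5E_XDP_INLINE_WQE_SZ_THRSD, and the congestion water-mark check itself
    lives in mlx5e_xdp_update_inline_state() (not modified by this patch),
    which sets session->inline_on.

        /* Sketch only: inline small packets while the SQ is congested enough
         * for session->inline_on to be set; otherwise post a pointer dseg.
         */
        if (session->inline_on && dma_len <= MLX5E_XDP_INLINE_WQE_SZ_THRSD) {
                /* mem-copy the packet into the WQE as an inline data segment */
        } else {
                /* post a regular data segment pointing at the DMA-mapped buffer */
        }
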
    If a MPWQE cannot contain an additional inline packet, we close this
    MPWQE session, and send the packet inlined within the next MPWQE.
    To save some MPWQE session close+open operations, we don't open MPWQE
    sessions whose contiguous room is smaller than a certain size (set to the
    HW MPWQE maximum size). If there isn't enough contiguous room in the
    send queue, we fill it with NOPs and wrap the send queue index around.

    This way, qualified packets are always sent inline.

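    [Editor's illustration; not part of the upstream commit message.] The two
    mechanisms above, condensed from the hunks below: if the next inline packet
    could not fit in the current MPWQE, the session is closed early; and if the
    contiguous room before the SQ frag edge is smaller than a full MPWQE, it is
    padded with NOPs (counted by the new "nops" statistic) before a new session
    is opened.

        /* After adding a descriptor, in mlx5e_xmit_xdp_frame_mpwqe(): */
        if (unlikely(mlx5e_xdp_no_room_for_inline_pkt(session) ||
                     session->ds_count == MLX5E_XDP_MPW_MAX_NUM_DS))
                mlx5e_xdp_mpwqe_complete(sq); /* next packet opens a new MPWQE */

        /* At session start, in mlx5e_xdp_mpwqe_session_start(): */
        pi            = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
        contig_wqebbs = mlx5_wq_cyc_get_contig_wqebbs(wq, pi);
        if (unlikely(contig_wqebbs < MLX5_SEND_WQE_MAX_WQEBBS))
                mlx5e_fill_xdpsq_frag_edge(sq, wq, pi, contig_wqebbs); /* NOP fill */
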
    Perf tests:
    Tested packet rate for UDP 64Byte multi-stream
    over two dual port ConnectX-5 100Gbps NICs.
    CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz

    XDP_TX:

    With 24 channels:

    | ------ | bounced packets | inlined packets | inline ratio |
    | before | 113.6Mpps       | 96.3Mpps        | 84%          |
    | after  |   115Mpps       | 99.5Mpps        | 86%          |

    With one channel:

    | ------ | bounced packets | inlined packets | inline ratio |
    | before | 6.7Mpps         | 0pps            | 0%           |
    | after  | 6.8Mpps         | 0pps            | 0%           |

    As we can see, there is improvement in both inline ratio and overall
    packet rate for 24 channels. Also, we see no degradation for the
    one-channel case.

    Signed-off-by: Shay Agroskin <shayag@mellanox.com>
    Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
    Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>

Signed-off-by: Alaa Hleihel <ahleihel@redhat.com>
Signed-off-by: Frantisek Hrbata <fhrbata@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  2 -
 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c   | 32 ++++---------
 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h   | 53 ++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  6 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  3 ++
 5 files changed, 63 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3b77b43db748..bc2c38faadc8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -488,8 +488,6 @@ struct mlx5e_xdp_mpwqe {
 	struct mlx5e_tx_wqe *wqe;
 	u8                   ds_count;
 	u8                   pkt_count;
-	u8                   max_ds_count;
-	u8                   complete;
 	u8                   inline_on;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index b0b982cf69bb..8cb98326531f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -179,34 +179,22 @@ static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq)
 	struct mlx5e_xdp_mpwqe *session = &sq->mpwqe;
 	struct mlx5e_xdpsq_stats *stats = sq->stats;
 	struct mlx5_wq_cyc *wq = &sq->wq;
-	u8  wqebbs;
-	u16 pi;
+	u16 pi, contig_wqebbs;
+
+	pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+	contig_wqebbs = mlx5_wq_cyc_get_contig_wqebbs(wq, pi);
+
+	if (unlikely(contig_wqebbs < MLX5_SEND_WQE_MAX_WQEBBS))
+		mlx5e_fill_xdpsq_frag_edge(sq, wq, pi, contig_wqebbs);
 
 	mlx5e_xdpsq_fetch_wqe(sq, &session->wqe);
 
 	prefetchw(session->wqe->data);
 	session->ds_count  = MLX5E_XDP_TX_EMPTY_DS_COUNT;
 	session->pkt_count = 0;
-	session->complete  = 0;
 
 	pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
 
-/* The mult of MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS
- * (16 * 4 == 64) does not fit in the 6-bit DS field of Ctrl Segment.
- * We use a bound lower that MLX5_SEND_WQE_MAX_WQEBBS to let a
- * full-session WQE be cache-aligned.
- */
-#if L1_CACHE_BYTES < 128
-#define MLX5E_XDP_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 1)
-#else
-#define MLX5E_XDP_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 2)
-#endif
-
-	wqebbs = min_t(u16, mlx5_wq_cyc_get_contig_wqebbs(wq, pi),
-		       MLX5E_XDP_MPW_MAX_WQEBBS);
-
-	session->max_ds_count = MLX5_SEND_WQEBB_NUM_DS * wqebbs;
-
 	mlx5e_xdp_update_inline_state(sq);
 
 	stats->mpwqe++;
@@ -244,7 +232,7 @@ static int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq *sq)
 {
 	if (unlikely(!sq->mpwqe.wqe)) {
 		if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc,
-						     MLX5_SEND_WQE_MAX_WQEBBS))) {
+						     MLX5E_XDPSQ_STOP_ROOM))) {
 			/* SQ is full, ring doorbell */
 			mlx5e_xmit_xdp_doorbell(sq);
 			sq->stats->full++;
@@ -285,8 +273,8 @@ static bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq,
 
 	mlx5e_xdp_mpwqe_add_dseg(sq, xdptxd, stats);
 
-	if (unlikely(session->complete ||
-		     session->ds_count == session->max_ds_count))
+	if (unlikely(mlx5e_xdp_no_room_for_inline_pkt(session) ||
+		     session->ds_count == MLX5E_XDP_MPW_MAX_NUM_DS))
 		mlx5e_xdp_mpwqe_complete(sq);
 
 	mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
index d5b0d55d434b..c52f72062b33 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
@@ -40,6 +40,26 @@
 	(sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS)
 #define MLX5E_XDP_TX_DS_COUNT (MLX5E_XDP_TX_EMPTY_DS_COUNT + 1 /* SG DS */)
 
+#define MLX5E_XDPSQ_STOP_ROOM (MLX5E_SQ_STOP_ROOM)
+
+#define MLX5E_XDP_INLINE_WQE_SZ_THRSD (256 - sizeof(struct mlx5_wqe_inline_seg))
+#define MLX5E_XDP_INLINE_WQE_MAX_DS_CNT \
+	DIV_ROUND_UP(MLX5E_XDP_INLINE_WQE_SZ_THRSD, MLX5_SEND_WQE_DS)
+
+/* The mult of MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS
+ * (16 * 4 == 64) does not fit in the 6-bit DS field of Ctrl Segment.
+ * We use a bound lower that MLX5_SEND_WQE_MAX_WQEBBS to let a
+ * full-session WQE be cache-aligned.
+ */
+#if L1_CACHE_BYTES < 128
+#define MLX5E_XDP_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 1)
+#else
+#define MLX5E_XDP_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 2)
+#endif
+
+#define MLX5E_XDP_MPW_MAX_NUM_DS \
+	(MLX5E_XDP_MPW_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS)
+
 struct mlx5e_xsk_param;
 int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk);
 bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
@@ -110,6 +130,30 @@ static inline void mlx5e_xdp_update_inline_state(struct mlx5e_xdpsq *sq)
 		session->inline_on = 1;
 }
 
+static inline bool
+mlx5e_xdp_no_room_for_inline_pkt(struct mlx5e_xdp_mpwqe *session)
+{
+	return session->inline_on &&
+	       session->ds_count + MLX5E_XDP_INLINE_WQE_MAX_DS_CNT > MLX5E_XDP_MPW_MAX_NUM_DS;
+}
+
+static inline void
+mlx5e_fill_xdpsq_frag_edge(struct mlx5e_xdpsq *sq, struct mlx5_wq_cyc *wq,
+			   u16 pi, u16 nnops)
+{
+	struct mlx5e_xdp_wqe_info *edge_wi, *wi = &sq->db.wqe_info[pi];
+
+	edge_wi = wi + nnops;
+	/* fill sq frag edge with nops to avoid wqe wrapping two pages */
+	for (; wi < edge_wi; wi++) {
+		wi->num_wqebbs = 1;
+		wi->num_pkts   = 0;
+		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+	}
+
+	sq->stats->nops += nnops;
+}
+
 static inline void
 mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq,
 			 struct mlx5e_xdp_xmit_data *xdptxd,
@@ -122,20 +166,12 @@ mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq,
 
 	session->pkt_count++;
 
-#define MLX5E_XDP_INLINE_WQE_SZ_THRSD (256 - sizeof(struct mlx5_wqe_inline_seg))
-
 	if (session->inline_on && dma_len <= MLX5E_XDP_INLINE_WQE_SZ_THRSD) {
 		struct mlx5_wqe_inline_seg *inline_dseg =
 			(struct mlx5_wqe_inline_seg *)dseg;
 		u16 ds_len = sizeof(*inline_dseg) + dma_len;
 		u16 ds_cnt = DIV_ROUND_UP(ds_len, MLX5_SEND_WQE_DS);
 
-		if (unlikely(session->ds_count + ds_cnt > session->max_ds_count)) {
-			/* Not enough space for inline wqe, send with memory pointer */
-			session->complete = true;
-			goto no_inline;
-		}
-
 		inline_dseg->byte_count = cpu_to_be32(dma_len | MLX5_INLINE_SEG);
 		memcpy(inline_dseg->data, xdptxd->data, dma_len);
 
@@ -144,7 +180,6 @@ mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq,
 		return;
 	}
 
-no_inline:
 	dseg->addr       = cpu_to_be64(xdptxd->dma_addr);
 	dseg->byte_count = cpu_to_be32(dma_len);
 	dseg->lkey       = sq->mkey_be;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index b4f5ae30dae2..3d993e2e7bea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -126,6 +126,7 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_xmit) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_mpwqe) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_inlnw) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_nops) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_full) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_cqe) },
@@ -142,6 +143,7 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_xmit) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_mpwqe) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_inlnw) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_nops) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_full) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_cqes) },
@@ -252,6 +254,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw)
 		s->rx_xdp_tx_xmit  += xdpsq_stats->xmit;
 		s->rx_xdp_tx_mpwqe += xdpsq_stats->mpwqe;
 		s->rx_xdp_tx_inlnw += xdpsq_stats->inlnw;
+		s->rx_xdp_tx_nops  += xdpsq_stats->nops;
 		s->rx_xdp_tx_full  += xdpsq_stats->full;
 		s->rx_xdp_tx_err   += xdpsq_stats->err;
 		s->rx_xdp_tx_cqe   += xdpsq_stats->cqes;
@@ -279,6 +282,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw)
 		s->tx_xdp_xmit    += xdpsq_red_stats->xmit;
 		s->tx_xdp_mpwqe   += xdpsq_red_stats->mpwqe;
 		s->tx_xdp_inlnw   += xdpsq_red_stats->inlnw;
+		s->tx_xdp_nops	  += xdpsq_red_stats->nops;
 		s->tx_xdp_full    += xdpsq_red_stats->full;
 		s->tx_xdp_err     += xdpsq_red_stats->err;
 		s->tx_xdp_cqes    += xdpsq_red_stats->cqes;
@@ -1517,6 +1521,7 @@ static const struct counter_desc rq_xdpsq_stats_desc[] = {
 	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, xmit) },
 	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, mpwqe) },
 	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, inlnw) },
+	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, nops) },
 	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, full) },
 	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, err) },
 	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, cqes) },
@@ -1526,6 +1531,7 @@ static const struct counter_desc xdpsq_stats_desc[] = {
 	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, xmit) },
 	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, mpwqe) },
 	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, inlnw) },
+	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, nops) },
 	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, full) },
 	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, err) },
 	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, cqes) },
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 0f9fa22a955e..a4a43613d026 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -132,6 +132,7 @@ struct mlx5e_sw_stats {
 	u64 rx_xdp_tx_xmit;
 	u64 rx_xdp_tx_mpwqe;
 	u64 rx_xdp_tx_inlnw;
+	u64 rx_xdp_tx_nops;
 	u64 rx_xdp_tx_full;
 	u64 rx_xdp_tx_err;
 	u64 rx_xdp_tx_cqe;
@@ -148,6 +149,7 @@ struct mlx5e_sw_stats {
 	u64 tx_xdp_xmit;
 	u64 tx_xdp_mpwqe;
 	u64 tx_xdp_inlnw;
+	u64 tx_xdp_nops;
 	u64 tx_xdp_full;
 	u64 tx_xdp_err;
 	u64 tx_xdp_cqes;
@@ -341,6 +343,7 @@ struct mlx5e_xdpsq_stats {
 	u64 xmit;
 	u64 mpwqe;
 	u64 inlnw;
+	u64 nops;
 	u64 full;
 	u64 err;
 	/* dirtied @completion */
-- 
2.13.6