Blame SOURCES/0030-netdrv-net-mlx5e-Report-and-recover-from-CQE-error-o.patch

d8f823
From beae62dd1772b395964f8e73f82c202f1ad346d9 Mon Sep 17 00:00:00 2001
d8f823
From: Alaa Hleihel <ahleihel@redhat.com>
d8f823
Date: Sun, 10 May 2020 14:51:54 -0400
d8f823
Subject: [PATCH 030/312] [netdrv] net/mlx5e: Report and recover from CQE error
d8f823
 on ICOSQ
d8f823
d8f823
Message-id: <20200510145245.10054-32-ahleihel@redhat.com>
d8f823
Patchwork-id: 306571
d8f823
Patchwork-instance: patchwork
d8f823
O-Subject: [RHEL8.3 BZ 1789378 v2 31/82] net/mlx5e: Report and recover from CQE error on ICOSQ
d8f823
Bugzilla: 1790198 1789378
d8f823
RH-Acked-by: Kamal Heib <kheib@redhat.com>
d8f823
RH-Acked-by: Jarod Wilson <jarod@redhat.com>
d8f823
RH-Acked-by: Tony Camuso <tcamuso@redhat.com>
d8f823
RH-Acked-by: Jonathan Toppins <jtoppins@redhat.com>
d8f823
d8f823
Bugzilla: http://bugzilla.redhat.com/1789378
d8f823
Bugzilla: http://bugzilla.redhat.com/1790198
d8f823
Upstream: v5.4-rc1
d8f823
Conflicts:
d8f823
 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c
d8f823
 - drivers/net/ethernet/mellanox/mlx5/core/en.h
d8f823
   Dropped hunks that were previously applied for fixing incremental build.
d8f823
d8f823
 - drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
d8f823
   Adapt mlx5e_rx_reporter_recover parameters to current API due to already
d8f823
   backported commit:
d8f823
   e7a981050a7f ("devlink: propagate extack down to health reporter ops")
d8f823
   ---> .recover callback now expects to get extack as well.
d8f823
d8f823
commit be5323c8379f488f1de53206edeaf80fc20d7686
d8f823
Author: Aya Levin <ayal@mellanox.com>
d8f823
Date:   Tue Jun 25 17:44:28 2019 +0300
d8f823
d8f823
    net/mlx5e: Report and recover from CQE error on ICOSQ
d8f823
d8f823
    Add support for report and recovery from error on completion on ICOSQ.
d8f823
    Deactivate RQ and flush, then deactivate ICOSQ. Set the queue back to
d8f823
    ready state (firmware) and reset the ICOSQ and the RQ (software
d8f823
    resources). Finally, activate the ICOSQ and the RQ.
d8f823
d8f823
    Signed-off-by: Aya Levin <ayal@mellanox.com>
d8f823
    Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
d8f823
    Acked-by: Jiri Pirko <jiri@mellanox.com>
d8f823
    Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
d8f823
d8f823
Signed-off-by: Alaa Hleihel <ahleihel@redhat.com>
d8f823
Signed-off-by: Frantisek Hrbata <fhrbata@redhat.com>
d8f823
---
d8f823
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   6 ++
d8f823
 .../net/ethernet/mellanox/mlx5/core/en/health.h    |   1 +
d8f823
 .../ethernet/mellanox/mlx5/core/en/reporter_rx.c   | 110 ++++++++++++++++++++-
d8f823
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  18 +++-
d8f823
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |   2 +
d8f823
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |   3 +
d8f823
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |   2 +
d8f823
 7 files changed, 137 insertions(+), 5 deletions(-)
d8f823
d8f823
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
d8f823
index 21926cb209f9..f0ba350579ae 100644
d8f823
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
d8f823
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
d8f823
@@ -559,6 +559,8 @@ struct mlx5e_icosq {
d8f823
 	/* control path */
d8f823
 	struct mlx5_wq_ctrl        wq_ctrl;
d8f823
 	struct mlx5e_channel      *channel;
d8f823
+
d8f823
+	struct work_struct         recover_work;
d8f823
 } ____cacheline_aligned_in_smp;
d8f823
 
d8f823
 struct mlx5e_wqe_frag_info {
d8f823
@@ -1037,6 +1039,10 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
d8f823
 void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
d8f823
 void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
d8f823
 			       struct mlx5e_params *params);
d8f823
+int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state);
d8f823
+void mlx5e_activate_rq(struct mlx5e_rq *rq);
d8f823
+void mlx5e_deactivate_rq(struct mlx5e_rq *rq);
d8f823
+void mlx5e_free_rx_descs(struct mlx5e_rq *rq);
d8f823
 void mlx5e_activate_icosq(struct mlx5e_icosq *icosq);
d8f823
 void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq);
d8f823
 
d8f823
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
d8f823
index a751c5316baf..8acd9dc520cf 100644
d8f823
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
d8f823
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
d8f823
@@ -18,6 +18,7 @@ int mlx5e_reporter_named_obj_nest_end(struct devlink_fmsg *fmsg);
d8f823
 
d8f823
 int mlx5e_reporter_rx_create(struct mlx5e_priv *priv);
d8f823
 void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv);
d8f823
+void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq);
d8f823
 
d8f823
 #define MLX5E_REPORTER_PER_Q_MAX_LEN 256
d8f823
 
d8f823
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
d8f823
index 7cd767f0b8c7..661de567ca6c 100644
d8f823
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
d8f823
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
d8f823
@@ -27,6 +27,110 @@ static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state)
d8f823
 	return err;
d8f823
 }
d8f823
 
d8f823
+static int mlx5e_wait_for_icosq_flush(struct mlx5e_icosq *icosq)
d8f823
+{
d8f823
+	unsigned long exp_time = jiffies + msecs_to_jiffies(2000);
d8f823
+
d8f823
+	while (time_before(jiffies, exp_time)) {
d8f823
+		if (icosq->cc == icosq->pc)
d8f823
+			return 0;
d8f823
+
d8f823
+		msleep(20);
d8f823
+	}
d8f823
+
d8f823
+	netdev_err(icosq->channel->netdev,
d8f823
+		   "Wait for ICOSQ 0x%x flush timeout (cc = 0x%x, pc = 0x%x)\n",
d8f823
+		   icosq->sqn, icosq->cc, icosq->pc);
d8f823
+
d8f823
+	return -ETIMEDOUT;
d8f823
+}
d8f823
+
d8f823
+static void mlx5e_reset_icosq_cc_pc(struct mlx5e_icosq *icosq)
d8f823
+{
d8f823
+	WARN_ONCE(icosq->cc != icosq->pc, "ICOSQ 0x%x: cc (0x%x) != pc (0x%x)\n",
d8f823
+		  icosq->sqn, icosq->cc, icosq->pc);
d8f823
+	icosq->cc = 0;
d8f823
+	icosq->pc = 0;
d8f823
+}
d8f823
+
d8f823
+static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx)
d8f823
+{
d8f823
+	struct mlx5_core_dev *mdev;
d8f823
+	struct mlx5e_icosq *icosq;
d8f823
+	struct net_device *dev;
d8f823
+	struct mlx5e_rq *rq;
d8f823
+	u8 state;
d8f823
+	int err;
d8f823
+
d8f823
+	icosq = ctx;
d8f823
+	rq = &icosq->channel->rq;
d8f823
+	mdev = icosq->channel->mdev;
d8f823
+	dev = icosq->channel->netdev;
d8f823
+	err = mlx5_core_query_sq_state(mdev, icosq->sqn, &state);
d8f823
+	if (err) {
d8f823
+		netdev_err(dev, "Failed to query ICOSQ 0x%x state. err = %d\n",
d8f823
+			   icosq->sqn, err);
d8f823
+		goto out;
d8f823
+	}
d8f823
+
d8f823
+	if (state != MLX5_SQC_STATE_ERR)
d8f823
+		goto out;
d8f823
+
d8f823
+	mlx5e_deactivate_rq(rq);
d8f823
+	err = mlx5e_wait_for_icosq_flush(icosq);
d8f823
+	if (err)
d8f823
+		goto out;
d8f823
+
d8f823
+	mlx5e_deactivate_icosq(icosq);
d8f823
+
d8f823
+	/* At this point, both the rq and the icosq are disabled */
d8f823
+
d8f823
+	err = mlx5e_health_sq_to_ready(icosq->channel, icosq->sqn);
d8f823
+	if (err)
d8f823
+		goto out;
d8f823
+
d8f823
+	mlx5e_reset_icosq_cc_pc(icosq);
d8f823
+	mlx5e_free_rx_descs(rq);
d8f823
+	clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state);
d8f823
+	mlx5e_activate_icosq(icosq);
d8f823
+	mlx5e_activate_rq(rq);
d8f823
+
d8f823
+	rq->stats->recover++;
d8f823
+	return 0;
d8f823
+out:
d8f823
+	clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state);
d8f823
+	return err;
d8f823
+}
d8f823
+
d8f823
+void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq)
d8f823
+{
d8f823
+	struct mlx5e_priv *priv = icosq->channel->priv;
d8f823
+	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
d8f823
+	struct mlx5e_err_ctx err_ctx = {};
d8f823
+
d8f823
+	err_ctx.ctx = icosq;
d8f823
+	err_ctx.recover = mlx5e_rx_reporter_err_icosq_cqe_recover;
d8f823
+	sprintf(err_str, "ERR CQE on ICOSQ: 0x%x", icosq->sqn);
d8f823
+
d8f823
+	mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
d8f823
+}
d8f823
+
d8f823
+static int mlx5e_rx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
d8f823
+{
d8f823
+	return err_ctx->recover(err_ctx->ctx);
d8f823
+}
d8f823
+
d8f823
+static int mlx5e_rx_reporter_recover(struct devlink_health_reporter *reporter,
d8f823
+				     void *context,
d8f823
+				     struct netlink_ext_ack *extack)
d8f823
+{
d8f823
+	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
d8f823
+	struct mlx5e_err_ctx *err_ctx = context;
d8f823
+
d8f823
+	return err_ctx ? mlx5e_rx_reporter_recover_from_ctx(err_ctx) :
d8f823
+			 mlx5e_health_recover_channels(priv);
d8f823
+}
d8f823
+
d8f823
 static int mlx5e_rx_reporter_build_diagnose_output(struct mlx5e_rq *rq,
d8f823
 						   struct devlink_fmsg *fmsg)
d8f823
 {
d8f823
@@ -168,9 +272,12 @@ static int mlx5e_rx_reporter_diagnose(struct devlink_health_reporter *reporter,
d8f823
 
d8f823
 static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = {
d8f823
 	.name = "rx",
d8f823
+	.recover = mlx5e_rx_reporter_recover,
d8f823
 	.diagnose = mlx5e_rx_reporter_diagnose,
d8f823
 };
d8f823
 
d8f823
+#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500
d8f823
+
d8f823
 int mlx5e_reporter_rx_create(struct mlx5e_priv *priv)
d8f823
 {
d8f823
 	struct devlink *devlink = priv_to_devlink(priv->mdev);
d8f823
@@ -178,7 +285,8 @@ int mlx5e_reporter_rx_create(struct mlx5e_priv *priv)
d8f823
 
d8f823
 	reporter = devlink_health_reporter_create(devlink,
d8f823
 						  &mlx5_rx_reporter_ops,
d8f823
-						  0, false, priv);
d8f823
+						  MLX5E_REPORTER_RX_GRACEFUL_PERIOD,
d8f823
+						  true, priv);
d8f823
 	if (IS_ERR(reporter)) {
d8f823
 		netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n",
d8f823
 			    PTR_ERR(reporter));
d8f823
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
d8f823
index 7dde1be49f35..430fb04ea96f 100644
d8f823
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
d8f823
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
d8f823
@@ -691,8 +691,7 @@ static int mlx5e_create_rq(struct mlx5e_rq *rq,
d8f823
 	return err;
d8f823
 }
d8f823
 
d8f823
-static int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state,
d8f823
-				 int next_state)
d8f823
+int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state)
d8f823
 {
d8f823
 	struct mlx5_core_dev *mdev = rq->mdev;
d8f823
 
d8f823
@@ -803,7 +802,7 @@ int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
d8f823
 	return -ETIMEDOUT;
d8f823
 }
d8f823
 
d8f823
-static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
d8f823
+void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
d8f823
 {
d8f823
 	__be16 wqe_ix_be;
d8f823
 	u16 wqe_ix;
d8f823
@@ -882,7 +881,7 @@ int mlx5e_open_rq(struct mlx5e_channel *c, struct mlx5e_params *params,
d8f823
 	return err;
d8f823
 }
d8f823
 
d8f823
-static void mlx5e_activate_rq(struct mlx5e_rq *rq)
d8f823
+void mlx5e_activate_rq(struct mlx5e_rq *rq)
d8f823
 {
d8f823
 	set_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);
d8f823
 	mlx5e_trigger_irq(&rq->channel->icosq);
d8f823
@@ -897,6 +896,7 @@ void mlx5e_deactivate_rq(struct mlx5e_rq *rq)
d8f823
 void mlx5e_close_rq(struct mlx5e_rq *rq)
d8f823
 {
d8f823
 	cancel_work_sync(&rq->dim.work);
d8f823
+	cancel_work_sync(&rq->channel->icosq.recover_work);
d8f823
 	mlx5e_destroy_rq(rq);
d8f823
 	mlx5e_free_rx_descs(rq);
d8f823
 	mlx5e_free_rq(rq);
d8f823
@@ -1013,6 +1013,14 @@ static int mlx5e_alloc_icosq_db(struct mlx5e_icosq *sq, int numa)
d8f823
 	return 0;
d8f823
 }
d8f823
 
d8f823
+static void mlx5e_icosq_err_cqe_work(struct work_struct *recover_work)
d8f823
+{
d8f823
+	struct mlx5e_icosq *sq = container_of(recover_work, struct mlx5e_icosq,
d8f823
+					      recover_work);
d8f823
+
d8f823
+	mlx5e_reporter_icosq_cqe_err(sq);
d8f823
+}
d8f823
+
d8f823
 static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
d8f823
 			     struct mlx5e_sq_param *param,
d8f823
 			     struct mlx5e_icosq *sq)
d8f823
@@ -1035,6 +1043,8 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
d8f823
 	if (err)
d8f823
 		goto err_sq_wq_destroy;
d8f823
 
d8f823
+	INIT_WORK(&sq->recover_work, mlx5e_icosq_err_cqe_work);
d8f823
+
d8f823
 	return 0;
d8f823
 
d8f823
 err_sq_wq_destroy:
d8f823
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
d8f823
index a22b3a3db253..ce4d357188df 100644
d8f823
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
d8f823
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
d8f823
@@ -616,6 +616,8 @@ void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
d8f823
 		if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) {
d8f823
 			netdev_WARN_ONCE(cq->channel->netdev,
d8f823
 					 "Bad OP in ICOSQ CQE: 0x%x\n", get_cqe_opcode(cqe));
d8f823
+			if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
d8f823
+				queue_work(cq->channel->priv->wq, &sq->recover_work);
d8f823
 			break;
d8f823
 		}
d8f823
 		do {
d8f823
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
d8f823
index 3d993e2e7bea..79b3ec005f43 100644
d8f823
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
d8f823
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
d8f823
@@ -161,6 +161,7 @@ static const struct counter_desc sw_stats_desc[] = {
d8f823
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_waive) },
d8f823
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_congst_umr) },
d8f823
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_err) },
d8f823
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_recover) },
d8f823
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_events) },
d8f823
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_poll) },
d8f823
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_arm) },
d8f823
@@ -272,6 +273,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw)
d8f823
 		s->rx_cache_waive += rq_stats->cache_waive;
d8f823
 		s->rx_congst_umr  += rq_stats->congst_umr;
d8f823
 		s->rx_arfs_err    += rq_stats->arfs_err;
d8f823
+		s->rx_recover     += rq_stats->recover;
d8f823
 		s->ch_events      += ch_stats->events;
d8f823
 		s->ch_poll        += ch_stats->poll;
d8f823
 		s->ch_arm         += ch_stats->arm;
d8f823
@@ -1484,6 +1486,7 @@ static const struct counter_desc rq_stats_desc[] = {
d8f823
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_waive) },
d8f823
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, congst_umr) },
d8f823
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_err) },
d8f823
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, recover) },
d8f823
 };
d8f823
 
d8f823
 static const struct counter_desc sq_stats_desc[] = {
d8f823
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
d8f823
index a4a43613d026..ab1c3366ff7d 100644
d8f823
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
d8f823
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
d8f823
@@ -167,6 +167,7 @@ struct mlx5e_sw_stats {
d8f823
 	u64 rx_cache_waive;
d8f823
 	u64 rx_congst_umr;
d8f823
 	u64 rx_arfs_err;
d8f823
+	u64 rx_recover;
d8f823
 	u64 ch_events;
d8f823
 	u64 ch_poll;
d8f823
 	u64 ch_arm;
d8f823
@@ -302,6 +303,7 @@ struct mlx5e_rq_stats {
d8f823
 	u64 cache_waive;
d8f823
 	u64 congst_umr;
d8f823
 	u64 arfs_err;
d8f823
+	u64 recover;
d8f823
 };
d8f823
 
d8f823
 struct mlx5e_sq_stats {
d8f823
-- 
d8f823
2.13.6
d8f823