Blame SOURCES/0079-netdrv-bnxt_en-Add-FW-fatal-devlink_health_reporter.patch

f95c89
From b3e7152c648b111070c144a01ce482ec7f3f593c Mon Sep 17 00:00:00 2001
f95c89
From: Jonathan Toppins <jtoppins@redhat.com>
f95c89
Date: Wed, 2 Oct 2019 18:23:34 -0400
f95c89
Subject: [PATCH 79/96] [netdrv] bnxt_en: Add FW fatal devlink_health_reporter
f95c89
f95c89
Message-id: <f7f97c323916640b6204ae069cfe0aaf36db26da.1570027456.git.jtoppins@redhat.com>
f95c89
Patchwork-id: 276494
f95c89
O-Subject: [RHEL-8.2 PATCH 72/78] bnxt_en: Add FW fatal devlink_health_reporter.
f95c89
Bugzilla: 1724766
f95c89
RH-Acked-by: John Linville <linville@redhat.com>
f95c89
RH-Acked-by: Jarod Wilson <jarod@redhat.com>
f95c89
f95c89
Health show command example and output:
f95c89
f95c89
$ devlink health show pci/0000:af:00.0 reporter fw_fatal
f95c89
f95c89
pci/0000:af:00.0:
f95c89
  name fw_fatal
f95c89
    state healthy error 1 recover 1 grace_period 0 auto_recover true
f95c89
f95c89
Fatal events from firmware or missing periodic heartbeats will
f95c89
be reported and recovery will be handled.
f95c89
f95c89
We also turn on the support flags when we register with the firmware to
f95c89
enable this health and recovery feature in the firmware.
f95c89
f95c89
Cc: Jiri Pirko <jiri@mellanox.com>
f95c89
Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
f95c89
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
f95c89
Signed-off-by: David S. Miller <davem@davemloft.net>
f95c89
(cherry picked from commit acfb50e4e773c9a5755a3c265c7c20d37a8642e5)
f95c89
Bugzilla: 1724766
f95c89
Build Info: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=23809532
f95c89
Tested: build, boot, basic ping
f95c89
Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>
f95c89
Signed-off-by: Bruno Meneguele <bmeneg@redhat.com>
f95c89
---
f95c89
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 80 ++++++++++++++++++++++-
f95c89
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |  7 ++
f95c89
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 56 ++++++++++++++++
f95c89
 3 files changed, 141 insertions(+), 2 deletions(-)
f95c89
f95c89
Index: src/drivers/net/ethernet/broadcom/bnxt/bnxt.c
f95c89
===================================================================
f95c89
--- src.orig/drivers/net/ethernet/broadcom/bnxt/bnxt.c	2020-02-06 16:23:20.864465843 +0100
f95c89
+++ src/drivers/net/ethernet/broadcom/bnxt/bnxt.c	2020-02-06 16:23:21.000464594 +0100
f95c89
@@ -1990,7 +1990,9 @@
f95c89
 			goto async_event_process_exit;
f95c89
 		set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event);
f95c89
 		break;
f95c89
-	case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY:
f95c89
+	case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY: {
f95c89
+		u32 data1 = le32_to_cpu(cmpl->event_data1);
f95c89
+
f95c89
 		bp->fw_reset_timestamp = jiffies;
f95c89
 		bp->fw_reset_min_dsecs = cmpl->timestamp_lo;
f95c89
 		if (!bp->fw_reset_min_dsecs)
f95c89
@@ -1998,8 +2000,16 @@
f95c89
 		bp->fw_reset_max_dsecs = le16_to_cpu(cmpl->timestamp_hi);
f95c89
 		if (!bp->fw_reset_max_dsecs)
f95c89
 			bp->fw_reset_max_dsecs = BNXT_DFLT_FW_RST_MAX_DSECS;
f95c89
+		if (EVENT_DATA1_RESET_NOTIFY_FATAL(data1)) {
f95c89
+			netdev_warn(bp->dev, "Firmware fatal reset event received\n");
f95c89
+			set_bit(BNXT_STATE_FW_FATAL_COND, &bp->state);
f95c89
+		} else {
f95c89
+			netdev_warn(bp->dev, "Firmware non-fatal reset event received, max wait time %d msec\n",
f95c89
+				    bp->fw_reset_max_dsecs * 100);
f95c89
+		}
f95c89
 		set_bit(BNXT_FW_RESET_NOTIFY_SP_EVENT, &bp->sp_event);
f95c89
 		break;
f95c89
+	}
f95c89
 	case ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY: {
f95c89
 		struct bnxt_fw_health *fw_health = bp->fw_health;
f95c89
 		u32 data1 = le32_to_cpu(cmpl->event_data1);
f95c89
@@ -4419,6 +4429,7 @@
f95c89
 {
f95c89
 	struct hwrm_func_drv_rgtr_output *resp = bp->hwrm_cmd_resp_addr;
f95c89
 	struct hwrm_func_drv_rgtr_input req = {0};
f95c89
+	u32 flags;
f95c89
 	int rc;
f95c89
 
f95c89
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1);
f95c89
@@ -4428,7 +4439,11 @@
f95c89
 			    FUNC_DRV_RGTR_REQ_ENABLES_VER);
f95c89
 
f95c89
 	req.os_type = cpu_to_le16(FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX);
f95c89
-	req.flags = cpu_to_le32(FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE);
f95c89
+	flags = FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE |
f95c89
+		FUNC_DRV_RGTR_REQ_FLAGS_HOT_RESET_SUPPORT;
f95c89
+	if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY)
f95c89
+		flags |= FUNC_DRV_RGTR_REQ_FLAGS_ERROR_RECOVERY_SUPPORT;
f95c89
+	req.flags = cpu_to_le32(flags);
f95c89
 	req.ver_maj_8b = DRV_VER_MAJ;
f95c89
 	req.ver_min_8b = DRV_VER_MIN;
f95c89
 	req.ver_upd_8b = DRV_VER_UPD;
f95c89
@@ -9931,6 +9946,38 @@
f95c89
 	bnxt_queue_sp_work(bp);
f95c89
 }
f95c89
 
f95c89
+static void bnxt_fw_health_check(struct bnxt *bp)
f95c89
+{
f95c89
+	struct bnxt_fw_health *fw_health = bp->fw_health;
f95c89
+	u32 val;
f95c89
+
f95c89
+	if (!fw_health || !fw_health->enabled ||
f95c89
+	    test_bit(BNXT_STATE_IN_FW_RESET, &bp->state))
f95c89
+		return;
f95c89
+
f95c89
+	if (fw_health->tmr_counter) {
f95c89
+		fw_health->tmr_counter--;
f95c89
+		return;
f95c89
+	}
f95c89
+
f95c89
+	val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG);
f95c89
+	if (val == fw_health->last_fw_heartbeat)
f95c89
+		goto fw_reset;
f95c89
+
f95c89
+	fw_health->last_fw_heartbeat = val;
f95c89
+
f95c89
+	val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
f95c89
+	if (val != fw_health->last_fw_reset_cnt)
f95c89
+		goto fw_reset;
f95c89
+
f95c89
+	fw_health->tmr_counter = fw_health->tmr_multiplier;
f95c89
+	return;
f95c89
+
f95c89
+fw_reset:
f95c89
+	set_bit(BNXT_FW_EXCEPTION_SP_EVENT, &bp->sp_event);
f95c89
+	bnxt_queue_sp_work(bp);
f95c89
+}
f95c89
+
f95c89
 static void bnxt_timer(struct timer_list *t)
f95c89
 {
f95c89
 	struct bnxt *bp = from_timer(bp, t, timer);
f95c89
@@ -9942,6 +9989,9 @@
f95c89
 	if (atomic_read(&bp->intr_sem) != 0)
f95c89
 		goto bnxt_restart_timer;
f95c89
 
f95c89
+	if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY)
f95c89
+		bnxt_fw_health_check(bp);
f95c89
+
f95c89
 	if (bp->link_info.link_up && (bp->flags & BNXT_FLAG_PORT_STATS) &&
f95c89
 	    bp->stats_coal_ticks) {
f95c89
 		set_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event);
f95c89
@@ -10008,6 +10058,26 @@
f95c89
 	bp->ctx = NULL;
f95c89
 }
f95c89
 
f95c89
+static bool is_bnxt_fw_ok(struct bnxt *bp)
f95c89
+{
f95c89
+	struct bnxt_fw_health *fw_health = bp->fw_health;
f95c89
+	bool no_heartbeat = false, has_reset = false;
f95c89
+	u32 val;
f95c89
+
f95c89
+	val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG);
f95c89
+	if (val == fw_health->last_fw_heartbeat)
f95c89
+		no_heartbeat = true;
f95c89
+
f95c89
+	val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
f95c89
+	if (val != fw_health->last_fw_reset_cnt)
f95c89
+		has_reset = true;
f95c89
+
f95c89
+	if (!no_heartbeat && has_reset)
f95c89
+		return true;
f95c89
+
f95c89
+	return false;
f95c89
+}
f95c89
+
f95c89
 /* rtnl_lock is acquired before calling this function */
f95c89
 static void bnxt_force_fw_reset(struct bnxt *bp)
f95c89
 {
f95c89
@@ -10212,6 +10282,12 @@
f95c89
 	if (test_and_clear_bit(BNXT_FW_RESET_NOTIFY_SP_EVENT, &bp->sp_event))
f95c89
 		bnxt_devlink_health_report(bp, BNXT_FW_RESET_NOTIFY_SP_EVENT);
f95c89
 
f95c89
+	if (test_and_clear_bit(BNXT_FW_EXCEPTION_SP_EVENT, &bp->sp_event)) {
f95c89
+		if (!is_bnxt_fw_ok(bp))
f95c89
+			bnxt_devlink_health_report(bp,
f95c89
+						   BNXT_FW_EXCEPTION_SP_EVENT);
f95c89
+	}
f95c89
+
f95c89
 	smp_mb__before_atomic();
f95c89
 	clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state);
f95c89
 }
f95c89
Index: src/drivers/net/ethernet/broadcom/bnxt/bnxt.h
f95c89
===================================================================
f95c89
--- src.orig/drivers/net/ethernet/broadcom/bnxt/bnxt.h	2020-02-06 16:23:20.864465843 +0100
f95c89
+++ src/drivers/net/ethernet/broadcom/bnxt/bnxt.h	2020-02-06 16:23:21.001464585 +0100
f95c89
@@ -472,6 +472,11 @@
f95c89
 	((le32_to_cpu((rx_tpa_end_ext)->rx_tpa_end_cmp_dup_acks) &	\
f95c89
 	 RX_TPA_END_CMP_AGG_BUFS_P5) >> RX_TPA_END_CMP_AGG_BUFS_SHIFT_P5)
f95c89
 
f95c89
+#define EVENT_DATA1_RESET_NOTIFY_FATAL(data1)				\
f95c89
+	(((data1) &							\
f95c89
+	  ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_MASK) ==\
f95c89
+	 ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_FATAL)
f95c89
+
f95c89
 #define EVENT_DATA1_RECOVERY_MASTER_FUNC(data1)				\
f95c89
 	!!((data1) &							\
f95c89
 	   ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_MASTER_FUNC)
f95c89
@@ -1372,6 +1377,7 @@
f95c89
 	u32 fw_reset_seq_delay_msec[16];
f95c89
 	struct devlink_health_reporter	*fw_reporter;
f95c89
 	struct devlink_health_reporter *fw_reset_reporter;
f95c89
+	struct devlink_health_reporter *fw_fatal_reporter;
f95c89
 };
f95c89
 
f95c89
 struct bnxt_fw_reporter_ctx {
f95c89
@@ -1728,6 +1734,7 @@
f95c89
 #define BNXT_UPDATE_PHY_SP_EVENT	16
f95c89
 #define BNXT_RING_COAL_NOW_SP_EVENT	17
f95c89
 #define BNXT_FW_RESET_NOTIFY_SP_EVENT	18
f95c89
+#define BNXT_FW_EXCEPTION_SP_EVENT	19
f95c89
 
f95c89
 	struct delayed_work	fw_reset_task;
f95c89
 	int			fw_reset_state;
f95c89
Index: src/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
f95c89
===================================================================
f95c89
--- src.orig/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c	2020-02-06 16:23:20.308470946 +0100
f95c89
+++ src/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c	2020-02-06 16:23:21.001464585 +0100
f95c89
@@ -83,6 +83,31 @@
f95c89
 	.recover = bnxt_fw_reset_recover,
f95c89
 };
f95c89
 
f95c89
+static int bnxt_fw_fatal_recover(struct devlink_health_reporter *reporter,
f95c89
+				 void *priv_ctx)
f95c89
+{
f95c89
+	struct bnxt *bp = devlink_health_reporter_priv(reporter);
f95c89
+	struct bnxt_fw_reporter_ctx *fw_reporter_ctx = priv_ctx;
f95c89
+	unsigned long event;
f95c89
+
f95c89
+	if (!priv_ctx)
f95c89
+		return -EOPNOTSUPP;
f95c89
+
f95c89
+	event = fw_reporter_ctx->sp_event;
f95c89
+	if (event == BNXT_FW_RESET_NOTIFY_SP_EVENT)
f95c89
+		bnxt_fw_reset(bp);
f95c89
+	else if (event == BNXT_FW_EXCEPTION_SP_EVENT)
f95c89
+		bnxt_fw_exception(bp);
f95c89
+
f95c89
+	return 0;
f95c89
+}
f95c89
+
f95c89
+static const
f95c89
+struct devlink_health_reporter_ops bnxt_dl_fw_fatal_reporter_ops = {
f95c89
+	.name = "fw_fatal",
f95c89
+	.recover = bnxt_fw_fatal_recover,
f95c89
+};
f95c89
+
f95c89
 static void bnxt_dl_fw_reporters_create(struct bnxt *bp)
f95c89
 {
f95c89
 	struct bnxt_fw_health *health = bp->fw_health;
f95c89
@@ -108,6 +133,16 @@
f95c89
 			    PTR_ERR(health->fw_reset_reporter));
f95c89
 		health->fw_reset_reporter = NULL;
f95c89
 	}
f95c89
+
f95c89
+	health->fw_fatal_reporter =
f95c89
+		devlink_health_reporter_create(bp->dl,
f95c89
+					       &bnxt_dl_fw_fatal_reporter_ops,
f95c89
+					       0, true, bp);
f95c89
+	if (IS_ERR(health->fw_fatal_reporter)) {
f95c89
+		netdev_warn(bp->dev, "Failed to create FW fatal health reporter, rc = %ld\n",
f95c89
+			    PTR_ERR(health->fw_fatal_reporter));
f95c89
+		health->fw_fatal_reporter = NULL;
f95c89
+	}
f95c89
 }
f95c89
 
f95c89
 static void bnxt_dl_fw_reporters_destroy(struct bnxt *bp)
f95c89
@@ -122,6 +157,9 @@
f95c89
 
f95c89
 	if (health->fw_reset_reporter)
f95c89
 		devlink_health_reporter_destroy(health->fw_reset_reporter);
f95c89
+
f95c89
+	if (health->fw_fatal_reporter)
f95c89
+		devlink_health_reporter_destroy(health->fw_fatal_reporter);
f95c89
 }
f95c89
 
f95c89
 void bnxt_devlink_health_report(struct bnxt *bp, unsigned long event)
f95c89
@@ -135,6 +173,15 @@
f95c89
 	fw_reporter_ctx.sp_event = event;
f95c89
 	switch (event) {
f95c89
 	case BNXT_FW_RESET_NOTIFY_SP_EVENT:
f95c89
+		if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) {
f95c89
+			if (!fw_health->fw_fatal_reporter)
f95c89
+				return;
f95c89
+
f95c89
+			devlink_health_report(fw_health->fw_fatal_reporter,
f95c89
+					      "FW fatal async event received",
f95c89
+					      &fw_reporter_ctx);
f95c89
+			return;
f95c89
+		}
f95c89
 		if (!fw_health->fw_reset_reporter)
f95c89
 			return;
f95c89
 
f95c89
@@ -142,6 +189,15 @@
f95c89
 				      "FW non-fatal reset event received",
f95c89
 				      &fw_reporter_ctx);
f95c89
 		return;
f95c89
+
f95c89
+	case BNXT_FW_EXCEPTION_SP_EVENT:
f95c89
+		if (!fw_health->fw_fatal_reporter)
f95c89
+			return;
f95c89
+
f95c89
+		devlink_health_report(fw_health->fw_fatal_reporter,
f95c89
+				      "FW fatal error reported",
f95c89
+				      &fw_reporter_ctx);
f95c89
+		return;
f95c89
 	}
f95c89
 }
f95c89