Blame rasdaemon-openbmc-ipmitool-sel-logging.diff

Michel Lind 3a9b30
From e4f0bdc7b62459ff50605ea867898ce1026e5905 Mon Sep 17 00:00:00 2001
Michel Lind 3a9b30
From: Krishna Dhulipala <krishnad@meta.com>
Michel Lind 3a9b30
Date: Thu, 19 Sep 2024 07:58:37 -0700
Michel Lind 3a9b30
Subject: [PATCH] ipmitool SEL logging of AER CEs on OpenBMC platforms
Michel Lind 3a9b30
Michel Lind 3a9b30
Signed-off-by: Krishna Dhulipala <krishnad@meta.com>
Michel Lind 3a9b30
---
Michel Lind 3a9b30
 Makefile.am       |  6 ++--
Michel Lind 3a9b30
 configure.ac      | 11 ++++++
Michel Lind 3a9b30
 ras-aer-handler.c | 24 +++++++++++++
Michel Lind 3a9b30
 ras-aer-handler.h |  1 +
Michel Lind 3a9b30
 ras-events.c      |  3 +-
Michel Lind 3a9b30
 ras-events.h      |  3 +-
Michel Lind 3a9b30
 rasdaemon.c       | 11 +++++-
Michel Lind 3a9b30
 unified-sel.c     | 89 +++++++++++++++++++++++++++++++++++++++++++++++
Michel Lind 3a9b30
 unified-sel.h     | 17 +++++++++
Michel Lind 3a9b30
 9 files changed, 160 insertions(+), 5 deletions(-)
Michel Lind 3a9b30
 create mode 100644 unified-sel.c
Michel Lind 3a9b30
 create mode 100644 unified-sel.h
Michel Lind 3a9b30
Michel Lind 3a9b30
--- a/Makefile.am
Michel Lind 3a9b30
+++ b/Makefile.am
Michel Lind 3a9b30
@@ -76,7 +76,9 @@ endif
Michel Lind 3a9b30
 if WITH_CPU_FAULT_ISOLATION
Michel Lind 3a9b30
    rasdaemon_SOURCES += ras-cpu-isolation.c queue.c
Michel Lind 3a9b30
 endif
Michel Lind 3a9b30
-
Michel Lind 3a9b30
+if WITH_OPENBMC_UNIFIED_SEL
Michel Lind 3a9b30
+   rasdaemon_SOURCES += unified-sel.c
Michel Lind 3a9b30
+endif
Michel Lind 3a9b30
 if WITH_CXL
Michel Lind 3a9b30
    rasdaemon_SOURCES += ras-cxl-handler.c
Michel Lind 3a9b30
 endif
Michel Lind 3a9b30
@@ -96,7 +98,7 @@ include_HEADERS = config.h  ras-events.h
Michel Lind 3a9b30
 		  ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
Michel Lind 3a9b30
 		  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
Michel Lind 3a9b30
 		  ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
Michel Lind 3a9b30
-		  non-standard-jaguarmicro.h trigger.h
Michel Lind 3a9b30
+		  non-standard-jaguarmicro.h trigger.h unified-sel.h
Michel Lind 3a9b30
 
Michel Lind 3a9b30
 # This rule can't be called with more than one Makefile job (like make -j8)
Michel Lind 3a9b30
 # I can't figure out a way to fix that
Michel Lind 3a9b30
--- a/configure.ac
Michel Lind 3a9b30
+++ b/configure.ac
Michel Lind 3a9b30
@@ -190,6 +190,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "
Michel Lind 3a9b30
 AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all = xyes])
Michel Lind 3a9b30
 AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"])
Michel Lind 3a9b30
 
Michel Lind 3a9b30
+AC_ARG_ENABLE([openbmc_unified_sel],
Michel Lind 3a9b30
+    AS_HELP_STRING([--enable-openbmc-unified-sel], [enable OPENBMC_UNIFIED_SEL events (currently experimental)]))
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+AS_IF([test "x$enable_openbmc_unified_sel" = "xyes" || test "x$enable_all" = "xyes"], [
Michel Lind 3a9b30
+  AC_DEFINE(HAVE_OPENBMC_UNIFIED_SEL,1,"have OpenBMC unified SEL")
Michel Lind 3a9b30
+  AC_SUBST([WITH_OPENBMC_UNIFIED_SEL])
Michel Lind 3a9b30
+])
Michel Lind 3a9b30
+AM_CONDITIONAL([WITH_OPENBMC_UNIFIED_SEL], [test x$enable_openbmc_unified_sel = xyes || test x$enable_all = xyes])
Michel Lind 3a9b30
+AM_COND_IF([WITH_OPENBMC_UNIFIED_SEL], [USE_OPENBMC_UNIFIED_SEL="yes"], [USE_OPENBMC_UNIFIED_SEL="no"])
Michel Lind 3a9b30
+
Michel Lind 3a9b30
 AC_ARG_ENABLE([jaguar_ns_decode],
Michel Lind 3a9b30
     AS_HELP_STRING([--enable-jaguar-ns-decode], [enable JAGUAR_NS_DECODE events (currently experimental)]))
Michel Lind 3a9b30
 
Michel Lind 3a9b30
@@ -261,6 +271,7 @@ compile time options summary
Michel Lind 3a9b30
     CXL events          : $USE_CXL
Michel Lind 3a9b30
     Memory CE PFA       : $USE_MEMORY_CE_PFA
Michel Lind 3a9b30
     AMP RAS errors      : $USE_AMP_NS_DECODE
Michel Lind 3a9b30
+    OpenBMC unified     : $USE_OPENBMC_UNIFIED_SEL
Michel Lind 3a9b30
     CPU fault isolation : $USE_CPU_FAULT_ISOLATION
Michel Lind 3a9b30
     YITIAN RAS errors   : $USE_YITIAN_NS_DECODE
Michel Lind 3a9b30
     JAGUAR RAS errors   : $USE_JAGUAR_NS_DECODE
Michel Lind 3a9b30
--- a/ras-aer-handler.c
Michel Lind 3a9b30
+++ b/ras-aer-handler.c
Michel Lind 3a9b30
@@ -25,6 +25,7 @@
Michel Lind 3a9b30
 #include "ras-logger.h"
Michel Lind 3a9b30
 #include "bitfield.h"
Michel Lind 3a9b30
 #include "ras-report.h"
Michel Lind 3a9b30
+#include "unified-sel.h"
Michel Lind 3a9b30
 
Michel Lind 3a9b30
 /* bit field meaning for correctable error */
Michel Lind 3a9b30
 static const char *aer_cor_errors[32] = {
Michel Lind 3a9b30
@@ -36,12 +37,14 @@ static const char *aer_cor_errors[32] =
Michel Lind 3a9b30
 	[12] = "Replay Timer Timeout",
Michel Lind 3a9b30
 	[13] = "Advisory Non-Fatal",
Michel Lind 3a9b30
 	[14] = "Corrected Internal Error",
Michel Lind 3a9b30
+	[15] = "Header Log Overflow",
Michel Lind 3a9b30
 };
Michel Lind 3a9b30
 
Michel Lind 3a9b30
 /* bit field meaning for uncorrectable error */
Michel Lind 3a9b30
 static const char *aer_uncor_errors[32] = {
Michel Lind 3a9b30
 	/* Uncorrectable errors */
Michel Lind 3a9b30
 	[4]  = "Data Link Protocol",
Michel Lind 3a9b30
+	[5]  = "Surprise Link Down",
Michel Lind 3a9b30
 	[12] = "Poisoned TLP",
Michel Lind 3a9b30
 	[13] = "Flow Control Protocol",
Michel Lind 3a9b30
 	[14] = "Completion Timeout",
Michel Lind 3a9b30
@@ -51,8 +54,23 @@ static const char *aer_uncor_errors[32]
Michel Lind 3a9b30
 	[18] = "Malformed TLP",
Michel Lind 3a9b30
 	[19] = "ECRC",
Michel Lind 3a9b30
 	[20] = "Unsupported Request",
Michel Lind 3a9b30
+	[21] = "ACS Violation",
Michel Lind 3a9b30
+	[22] = "Uncorrected Internal",
Michel Lind 3a9b30
+	[23] = "MC Blocked TLP",
Michel Lind 3a9b30
+	[24] = "AtomicOp Egress Blocked",
Michel Lind 3a9b30
+	[25] = "TLP Prefix Blocked",
Michel Lind 3a9b30
+	[26] = "Poisoned TLP Egrees Blocked",
Michel Lind 3a9b30
 };
Michel Lind 3a9b30
 
Michel Lind 3a9b30
+static bool use_ipmitool = false;
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+void ras_aer_handler_init(int enable_ipmitool)
Michel Lind 3a9b30
+{
Michel Lind 3a9b30
+#ifdef HAVE_OPENBMC_UNIFIED_SEL
Michel Lind 3a9b30
+	use_ipmitool = (enable_ipmitool > 0) ? 1 : 0;
Michel Lind 3a9b30
+#endif
Michel Lind 3a9b30
+}
Michel Lind 3a9b30
+
Michel Lind 3a9b30
 #define BUF_LEN	1024
Michel Lind 3a9b30
 
Michel Lind 3a9b30
 int ras_aer_event_handler(struct trace_seq *s,
Michel Lind 3a9b30
@@ -195,5 +213,11 @@ int ras_aer_event_handler(struct trace_s
Michel Lind 3a9b30
 		log(SYSLOG, LOG_WARNING, "Failed to execute ipmitool\n");
Michel Lind 3a9b30
 #endif
Michel Lind 3a9b30
 
Michel Lind 3a9b30
+#ifdef HAVE_OPENBMC_UNIFIED_SEL
Michel Lind 3a9b30
+	if (use_ipmitool)
Michel Lind 3a9b30
+		if (openbmc_unified_sel_log(severity_val, ev.dev_name, status_val) < 0)
Michel Lind 3a9b30
+			return -1;
Michel Lind 3a9b30
+#endif
Michel Lind 3a9b30
+
Michel Lind 3a9b30
 	return 0;
Michel Lind 3a9b30
 }
Michel Lind 3a9b30
--- a/ras-aer-handler.h
Michel Lind 3a9b30
+++ b/ras-aer-handler.h
Michel Lind 3a9b30
@@ -26,4 +26,5 @@ int ras_aer_event_handler(struct trace_s
Michel Lind 3a9b30
 			  struct tep_record *record,
Michel Lind 3a9b30
 			  struct tep_event *event, void *context);
Michel Lind 3a9b30
 
Michel Lind 3a9b30
+void ras_aer_handler_init(int enable_ipmitool);
Michel Lind 3a9b30
 #endif
Michel Lind 3a9b30
--- a/ras-events.c
Michel Lind 3a9b30
+++ b/ras-events.c
Michel Lind 3a9b30
@@ -894,7 +894,7 @@ static int add_event_handler(struct ras_
Michel Lind 3a9b30
 	return 0;
Michel Lind 3a9b30
 }
Michel Lind 3a9b30
 
Michel Lind 3a9b30
-int handle_ras_events(int record_events)
Michel Lind 3a9b30
+int handle_ras_events(int record_events, int enable_ipmitool)
Michel Lind 3a9b30
 {
Michel Lind 3a9b30
 	int rc, page_size, i;
Michel Lind 3a9b30
 	int num_events = 0;
Michel Lind 3a9b30
@@ -951,6 +951,7 @@ int handle_ras_events(int record_events)
Michel Lind 3a9b30
 		    "ras", "mc_event");
Michel Lind 3a9b30
 
Michel Lind 3a9b30
 #ifdef HAVE_AER
Michel Lind 3a9b30
+	ras_aer_handler_init(enable_ipmitool);
Michel Lind 3a9b30
 	rc = add_event_handler(ras, pevent, page_size, "ras", "aer_event",
Michel Lind 3a9b30
 			       ras_aer_event_handler, NULL, AER_EVENT);
Michel Lind 3a9b30
 	if (!rc)
Michel Lind 3a9b30
--- a/ras-events.h
Michel Lind 3a9b30
+++ b/ras-events.h
Michel Lind 3a9b30
@@ -109,7 +109,8 @@ enum ghes_severity {
Michel Lind 3a9b30
 
Michel Lind 3a9b30
 /* Function prototypes */
Michel Lind 3a9b30
 int toggle_ras_mc_event(int enable);
Michel Lind 3a9b30
+int handle_ras_events(int record_events, int enable_ipmitool);
Michel Lind 3a9b30
 int ras_offline_mce_event(struct ras_mc_offline_event *event);
Michel Lind 3a9b30
-int handle_ras_events(int record_events);
Michel Lind 3a9b30
+int handle_ras_events(int record_events, int enable_ipmitool);
Michel Lind 3a9b30
 
Michel Lind 3a9b30
 #endif
Michel Lind 3a9b30
--- a/rasdaemon.c
Michel Lind 3a9b30
+++ b/rasdaemon.c
Michel Lind 3a9b30
@@ -42,6 +42,7 @@ const char *argp_program_bug_address = "
Michel Lind 3a9b30
 struct arguments {
Michel Lind 3a9b30
 	int record_events;
Michel Lind 3a9b30
 	int enable_ras;
Michel Lind 3a9b30
+	int enable_ipmitool;
Michel Lind 3a9b30
 	int foreground;
Michel Lind 3a9b30
 	int offline;
Michel Lind 3a9b30
 };
Michel Lind 3a9b30
@@ -74,6 +75,11 @@ static error_t parse_opt(int k, char *ar
Michel Lind 3a9b30
 		args->record_events++;
Michel Lind 3a9b30
 		break;
Michel Lind 3a9b30
 #endif
Michel Lind 3a9b30
+#ifdef HAVE_OPENBMC_UNIFIED_SEL
Michel Lind 3a9b30
+	case 'i':
Michel Lind 3a9b30
+		args->enable_ipmitool++;
Michel Lind 3a9b30
+		break;
Michel Lind 3a9b30
+#endif
Michel Lind 3a9b30
 	case 'f':
Michel Lind 3a9b30
 		args->foreground++;
Michel Lind 3a9b30
 		break;
Michel Lind 3a9b30
@@ -164,6 +170,9 @@ int main(int argc, char *argv[])
Michel Lind 3a9b30
 		{"record",  'r', 0, 0, "record events via sqlite3", 0},
Michel Lind 3a9b30
 #endif
Michel Lind 3a9b30
 		{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
Michel Lind 3a9b30
+#ifdef HAVE_OPENBMC_UNIFIED_SEL
Michel Lind 3a9b30
+		{"ipmitool", 'i', 0, 0, "enable ipmitool logging", 0},
Michel Lind 3a9b30
+#endif
Michel Lind 3a9b30
 #ifdef HAVE_MCE
Michel Lind 3a9b30
 		{"post-processing", 'p', 0, 0,
Michel Lind 3a9b30
 		"Post-processing MCE's with raw register values"},
Michel Lind 3a9b30
@@ -212,7 +221,7 @@ int main(int argc, char *argv[])
Michel Lind 3a9b30
 		if (daemon(0, 0))
Michel Lind 3a9b30
 			exit(EXIT_FAILURE);
Michel Lind 3a9b30
 
Michel Lind 3a9b30
-	handle_ras_events(args.record_events);
Michel Lind 3a9b30
+	handle_ras_events(args.record_events, args.enable_ipmitool);
Michel Lind 3a9b30
 
Michel Lind 3a9b30
 	return 0;
Michel Lind 3a9b30
 }
Michel Lind 3a9b30
--- /dev/null
Michel Lind 3a9b30
+++ b/unified-sel.c
Michel Lind 3a9b30
@@ -0,0 +1,89 @@
Michel Lind 3a9b30
+/*
Michel Lind 3a9b30
+ * Copyright (c) 2023, Meta Platforms Inc.
Michel Lind 3a9b30
+ *
Michel Lind 3a9b30
+ * This program is free software; you can redistribute it and/or modify
Michel Lind 3a9b30
+ * it under the terms of the GNU General Public License as published by
Michel Lind 3a9b30
+ * the Free Software Foundation; either version 2 of the License, or
Michel Lind 3a9b30
+ * (at your option) any later version.
Michel Lind 3a9b30
+ *
Michel Lind 3a9b30
+ */
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+#include <stdio.h>
Michel Lind 3a9b30
+#include <stdlib.h>
Michel Lind 3a9b30
+#include <string.h>
Michel Lind 3a9b30
+#include <stdbool.h>
Michel Lind 3a9b30
+#include "ras-record.h"
Michel Lind 3a9b30
+#include "ras-logger.h"
Michel Lind 3a9b30
+#include "ras-report.h"
Michel Lind 3a9b30
+#include "unified-sel.h"
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+/* CPU Root Port Error ID corresponding to each status bit set */
Michel Lind 3a9b30
+static const char *cor_error_ids[32] = {
Michel Lind 3a9b30
+	/* Correctable errors */
Michel Lind 3a9b30
+	[0]  = "0x00", /* Receiver Error */
Michel Lind 3a9b30
+	[6]  = "0x01", /* Bad TLP */
Michel Lind 3a9b30
+	[7]  = "0x02", /* Bad DLLP */
Michel Lind 3a9b30
+	[8]  = "0x04", /* REPLAY_NUM Rollover */
Michel Lind 3a9b30
+	[12] = "0x03", /* Replay Timer Timeout */
Michel Lind 3a9b30
+	[13] = "0x05", /* Advisory Non-Fatal */
Michel Lind 3a9b30
+	[14] = "0x06", /* Corrected Internal */
Michel Lind 3a9b30
+	[15] = "0x07", /* Header Log Overflow */
Michel Lind 3a9b30
+};
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+static int verify_id_log_sel(uint64_t status,
Michel Lind 3a9b30
+			     const char **idarray,
Michel Lind 3a9b30
+			     unsigned bus,
Michel Lind 3a9b30
+			     unsigned dev_fn)
Michel Lind 3a9b30
+{
Michel Lind 3a9b30
+	int i;
Michel Lind 3a9b30
+	char openbmc_ipmi_add_sel[105];
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+	/*
Michel Lind 3a9b30
+	 * Get PCIe AER error source bus/dev/fn and save it to the BMC SEL
Michel Lind 3a9b30
+	 * as an OpenBMC unified SEL record type.
Michel Lind 3a9b30
+	 * The IPMI command and record fields are defined in IPMI Specification v2.0 (IPMI Spec)
Michel Lind 3a9b30
+	 * ipmitool raw 0x0a 0x44 is "Add SEL Entry Command" defined in IPMI spec chapter 31.6
Michel Lind 3a9b30
+	 * The 16 bytes that follow form the SEL Record
Michel Lind 3a9b30
+	 * defined in IPMI spec chapter 32.1 "SEL Event Records"
Michel Lind 3a9b30
+	 * Byte 1~2 are Record ID = 0x00 0x00, unused
Michel Lind 3a9b30
+	 * Byte 3 is Record Type = 0xFB, OEM non-timestamped record type for OpenBMC unified SEL
Michel Lind 3a9b30
+	 * Byte 4~16 are OEM defined
Michel Lind 3a9b30
+	 * Byte 11:
Michel Lind 3a9b30
+	 * Byte11[7:3] Device#
Michel Lind 3a9b30
+	 * Byte11[2:0] Function#
Michel Lind 3a9b30
+	 * Byte 12: Bus number
Michel Lind 3a9b30
+	 * Byte 13-15: Reserved
Michel Lind 3a9b30
+	 * Byte 16: ID of the error detected on the PCIe device that triggered this SEL record
Michel Lind 3a9b30
+	 */
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+	/* Potentially all error status bits could be set for a given PCIe device.
Michel Lind 3a9b30
+	 * Therefore, iterate over all 32 bits each of cor and uncor errors
Michel Lind 3a9b30
+	 */
Michel Lind 3a9b30
+	for (i = 0; i < 32; i++) {
Michel Lind 3a9b30
+		if ((status & (1 << i)) && idarray[i]) {
Michel Lind 3a9b30
+			sprintf(openbmc_ipmi_add_sel,
Michel Lind 3a9b30
+				"ipmitool raw 0x0a 0x44 0x00 0x00 0xFB 0x20 0x00 0x00 0x00 0x00 0x01 0x00 0x%02x 0x%02x 0x01 0x00 0xff %s",
Michel Lind 3a9b30
+				dev_fn, bus, idarray[i]);
Michel Lind 3a9b30
+			if (system(openbmc_ipmi_add_sel) != 0)
Michel Lind 3a9b30
+				return -1;
Michel Lind 3a9b30
+		}
Michel Lind 3a9b30
+	}
Michel Lind 3a9b30
+	return 0;
Michel Lind 3a9b30
+}
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status)
Michel Lind 3a9b30
+{
Michel Lind 3a9b30
+	int bus, dev, dev_fn, fn;
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+	sscanf(dev_name, "%*x:%x:%x.%x", &bus, &dev, &fn);
Michel Lind 3a9b30
+	dev_fn = (((dev & 0x1f) << 3) | (fn & 0x7));
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+	/* Use the appropriate correctable error status ID
Michel Lind 3a9b30
+	 * for a given severity level
Michel Lind 3a9b30
+	 * */
Michel Lind 3a9b30
+	if (severity == HW_EVENT_AER_CORRECTED) {
Michel Lind 3a9b30
+		if (verify_id_log_sel(status, cor_error_ids, bus, dev_fn) < 0)
Michel Lind 3a9b30
+			return -1;
Michel Lind 3a9b30
+	}
Michel Lind 3a9b30
+	return 0;
Michel Lind 3a9b30
+}
Michel Lind 3a9b30
--- /dev/null
Michel Lind 3a9b30
+++ b/unified-sel.h
Michel Lind 3a9b30
@@ -0,0 +1,17 @@
Michel Lind 3a9b30
+/*
Michel Lind 3a9b30
+ * Copyright (c) 2023, Meta Platforms Inc.
Michel Lind 3a9b30
+ *
Michel Lind 3a9b30
+ * This program is free software; you can redistribute it and/or modify
Michel Lind 3a9b30
+ * it under the terms of the GNU General Public License as published by
Michel Lind 3a9b30
+ * the Free Software Foundation; either version 2 of the License, or
Michel Lind 3a9b30
+ * (at your option) any later version.
Michel Lind 3a9b30
+ *
Michel Lind 3a9b30
+ */
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+#ifndef _UNIFIED_SEL_H
Michel Lind 3a9b30
+#define _UNIFIED_SEL_H
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status);
Michel Lind 3a9b30
+
Michel Lind 3a9b30
+#endif