krishnanadh / rpms / rasdaemon

Forked from rpms/rasdaemon a year ago
Clone

Blame SOURCES/aecf33aa70331670c06db6b652712b476e24051c.patch

7fca25
commit aecf33aa70331670c06db6b652712b476e24051c
7fca25
Author: Muralidhara M K <muralimk@amd.com>
7fca25
Date:   Mon Jul 12 05:40:46 2021 -0500
7fca25
7fca25
    rasdaemon: Enumerate memory on noncpu nodes
7fca25
    
7fca25
    Newer heterogeneous systems from AMD have GPU nodes (with HBM2 memory
7fca25
    banks) connected via xGMI links to the CPUs.
7fca25
    
7fca25
    The node id information is available in the InstanceHI[47:44] of
7fca25
    the IPID register.
7fca25
    
7fca25
    The UMC Phys on Aldebaran nodes are enumerated as csrow.
7fca25
    The UMC channels connected to HBMs are enumerated as ranks.
7fca25
    
7fca25
    Signed-off-by: Muralidhara M K <muralimk@amd.com>
7fca25
    Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
7fca25
    Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
7fca25
7fca25
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
7fca25
index 3c346f4..f3379fc 100644
7fca25
--- a/mce-amd-smca.c
7fca25
+++ b/mce-amd-smca.c
7fca25
@@ -78,6 +78,12 @@ enum smca_bank_types {
7fca25
 /* Maximum number of MCA banks per CPU. */
7fca25
 #define MAX_NR_BANKS	64
7fca25
 
7fca25
+/*
7fca25
+ * On newer heterogeneous systems from AMD with CPU and GPU nodes connected
7fca25
+ * via xGMI links, the non-CPU nodes are enumerated from index 8
7fca25
+ */
7fca25
+#define NONCPU_NODE_INDEX	8
7fca25
+
7fca25
 /* SMCA Extended error strings */
7fca25
 /* Load Store */
7fca25
 static const char * const smca_ls_mce_desc[] = {
7fca25
@@ -531,6 +537,26 @@ static int find_umc_channel(struct mce_event *e)
7fca25
 {
7fca25
 	return EXTRACT(e->ipid, 0, 31) >> 20;
7fca25
 }
7fca25
+
7fca25
+/*
7fca25
+ * The HBM channel managed by the UMCCH of the noncpu node
7fca25
+ * is calculated from bits [15:12] of IPID
7fca25
+ */
7fca25
+static int find_hbm_channel(struct mce_event *e)
7fca25
+{
7fca25
+	int umc, tmp;
7fca25
+
7fca25
+	umc = EXTRACT(e->ipid, 0, 31) >> 20;
7fca25
+
7fca25
+	/*
7fca25
+	 * The HBM channel managed by the UMC of the noncpu node is taken from
7fca25
+	 * bits [15:12] of IPID, plus 4 when the UMC number is odd
7fca25
+	 */
7fca25
+	tmp = ((e->ipid >> 12) & 0xf);
7fca25
+
7fca25
+	return (umc % 2) ? tmp + 4 : tmp;
7fca25
+}
7fca25
+
7fca25
 /* Decode extended errors according to Scalable MCA specification */
7fca25
 static void decode_smca_error(struct mce_event *e)
7fca25
 {
7fca25
@@ -539,6 +565,7 @@ static void decode_smca_error(struct mce_event *e)
7fca25
 	unsigned short xec = (e->status >> 16) & 0x3f;
7fca25
 	const struct smca_hwid *s_hwid;
7fca25
 	uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
7fca25
+	uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
7fca25
 	unsigned int csrow = -1, channel = -1;
7fca25
 	unsigned int i;
7fca25
 
7fca25
@@ -548,14 +575,16 @@ static void decode_smca_error(struct mce_event *e)
7fca25
 			bank_type = s_hwid->bank_type;
7fca25
 			break;
7fca25
 		}
7fca25
+		if (mcatype_instancehi >= NONCPU_NODE_INDEX)
7fca25
+			bank_type = SMCA_UMC_V2;
7fca25
 	}
7fca25
 
7fca25
-	if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
7fca25
+	if (i >= MAX_NR_BANKS) {
7fca25
 		strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
7fca25
 		return;
7fca25
 	}
7fca25
 
7fca25
-	if (bank_type >= N_SMCA_BANK_TYPES) {
7fca25
+	if (bank_type >= MAX_NR_BANKS) {
7fca25
 		strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
7fca25
 		return;
7fca25
 	}
7fca25
@@ -580,6 +609,16 @@ static void decode_smca_error(struct mce_event *e)
7fca25
 		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
7fca25
 			     channel, csrow);
7fca25
 	}
7fca25
+
7fca25
+	if (bank_type == SMCA_UMC_V2 && xec == 0) {
7fca25
+		/* The UMCPHY is reported as csrow in case of noncpu nodes */
7fca25
+		csrow = find_umc_channel(e) / 2;
7fca25
+		/* UMCCH is managing the HBM memory */
7fca25
+		channel = find_hbm_channel(e);
7fca25
+		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
7fca25
+			     channel, csrow);
7fca25
+	}
7fca25
+
7fca25
 }
7fca25
 
7fca25
 int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)