krishnanadh / rpms / rasdaemon

Forked from rpms/rasdaemon a year ago
Clone

Blame SOURCES/aecf33aa70331670c06db6b652712b476e24051c.patch

df8b6a
commit aecf33aa70331670c06db6b652712b476e24051c
df8b6a
Author: Muralidhara M K <muralimk@amd.com>
df8b6a
Date:   Mon Jul 12 05:40:46 2021 -0500
df8b6a
df8b6a
    rasdaemon: Enumerate memory on noncpu nodes
df8b6a
    
df8b6a
    Newer heterogeneous systems from AMD have GPU nodes (with HBM2 memory
df8b6a
    banks) connected via xGMI links to the CPUs.
df8b6a
    
df8b6a
    The node id information is available in the InstanceHI[47:44] of
df8b6a
    the IPID register.
df8b6a
    
df8b6a
    The UMC PHYs on Aldebaran nodes are enumerated as csrows.
df8b6a
    The UMC channels connected to HBMs are enumerated as ranks.
df8b6a
    
df8b6a
    Signed-off-by: Muralidhara M K <muralimk@amd.com>
df8b6a
    Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
df8b6a
    Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
df8b6a
df8b6a
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
df8b6a
index 3c346f4..f3379fc 100644
df8b6a
--- a/mce-amd-smca.c
df8b6a
+++ b/mce-amd-smca.c
df8b6a
@@ -78,6 +78,12 @@ enum smca_bank_types {
df8b6a
 /* Maximum number of MCA banks per CPU. */
df8b6a
 #define MAX_NR_BANKS	64
df8b6a
 
df8b6a
+/*
df8b6a
+ * On Newer heterogeneous systems from AMD with CPU and GPU nodes connected
df8b6a
+ * via xGMI links, the NON CPU Nodes are enumerated from index 8
df8b6a
+ */
df8b6a
+#define NONCPU_NODE_INDEX	8
df8b6a
+
df8b6a
 /* SMCA Extended error strings */
df8b6a
 /* Load Store */
df8b6a
 static const char * const smca_ls_mce_desc[] = {
df8b6a
@@ -531,6 +537,26 @@ static int find_umc_channel(struct mce_event *e)
df8b6a
 {
df8b6a
 	return EXTRACT(e->ipid, 0, 31) >> 20;
df8b6a
 }
df8b6a
+
df8b6a
+/*
df8b6a
+ * The HBM memory managed by the UMCCH of the noncpu node
df8b6a
+ * can be calculated based on the [15:12]bits of IPID
df8b6a
+ */
df8b6a
+static int find_hbm_channel(struct mce_event *e)
df8b6a
+{
df8b6a
+	int umc, tmp;
df8b6a
+
df8b6a
+	umc = EXTRACT(e->ipid, 0, 31) >> 20;
df8b6a
+
df8b6a
+	/*
df8b6a
+	 * The HBM channel managed by the UMC of the noncpu node
df8b6a
+	 * can be calculated based on the [15:12]bits of IPID as follows
df8b6a
+	 */
df8b6a
+	tmp = ((e->ipid >> 12) & 0xf);
df8b6a
+
df8b6a
+	return (umc % 2) ? tmp + 4 : tmp;
df8b6a
+}
df8b6a
+
df8b6a
 /* Decode extended errors according to Scalable MCA specification */
df8b6a
 static void decode_smca_error(struct mce_event *e)
df8b6a
 {
df8b6a
@@ -539,6 +565,7 @@ static void decode_smca_error(struct mce_event *e)
df8b6a
 	unsigned short xec = (e->status >> 16) & 0x3f;
df8b6a
 	const struct smca_hwid *s_hwid;
df8b6a
 	uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
df8b6a
+	uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
df8b6a
 	unsigned int csrow = -1, channel = -1;
df8b6a
 	unsigned int i;
df8b6a
 
df8b6a
@@ -548,14 +575,16 @@ static void decode_smca_error(struct mce_event *e)
df8b6a
 			bank_type = s_hwid->bank_type;
df8b6a
 			break;
df8b6a
 		}
df8b6a
+		if (mcatype_instancehi >= NONCPU_NODE_INDEX)
df8b6a
+			bank_type = SMCA_UMC_V2;
df8b6a
 	}
df8b6a
 
df8b6a
-	if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
df8b6a
+	if (i >= MAX_NR_BANKS) {
df8b6a
 		strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
df8b6a
 		return;
df8b6a
 	}
df8b6a
 
df8b6a
-	if (bank_type >= N_SMCA_BANK_TYPES) {
df8b6a
+	if (bank_type >= MAX_NR_BANKS) {
df8b6a
 		strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
df8b6a
 		return;
df8b6a
 	}
df8b6a
@@ -580,6 +609,16 @@ static void decode_smca_error(struct mce_event *e)
df8b6a
 		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
df8b6a
 			     channel, csrow);
df8b6a
 	}
df8b6a
+
df8b6a
+	if (bank_type == SMCA_UMC_V2 && xec == 0) {
df8b6a
+		/* The UMCPHY is reported as csrow in case of noncpu nodes */
df8b6a
+		csrow = find_umc_channel(e) / 2;
df8b6a
+		/* UMCCH is managing the HBM memory */
df8b6a
+		channel = find_hbm_channel(e);
df8b6a
+		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
df8b6a
+			     channel, csrow);
df8b6a
+	}
df8b6a
+
df8b6a
 }
df8b6a
 
df8b6a
 int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)