krishnanadh / rpms / rasdaemon

Forked from rpms/rasdaemon a year ago
Clone
81d547
commit aecf33aa70331670c06db6b652712b476e24051c
81d547
Author: Muralidhara M K <muralimk@amd.com>
81d547
Date:   Mon Jul 12 05:40:46 2021 -0500
81d547
81d547
    rasdaemon: Enumerate memory on noncpu nodes
81d547
    
81d547
    Newer heterogeneous systems from AMD have GPU nodes (with HBM2 memory
81d547
    banks) connected via xGMI links to the CPUs.
81d547
    
81d547
    The node id information is available in the InstanceHI[47:44] of
81d547
    the IPID register.
81d547
    
81d547
    The UMC PHYs on Aldebaran nodes are enumerated as csrows.
81d547
    The UMC channels connected to HBMs are enumerated as ranks.
81d547
    
81d547
    Signed-off-by: Muralidhara M K <muralimk@amd.com>
81d547
    Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
81d547
    Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
81d547
81d547
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
81d547
index 3c346f4..f3379fc 100644
81d547
--- a/mce-amd-smca.c
81d547
+++ b/mce-amd-smca.c
81d547
@@ -78,6 +78,12 @@ enum smca_bank_types {
81d547
 /* Maximum number of MCA banks per CPU. */
81d547
 #define MAX_NR_BANKS	64
81d547
 
81d547
+/*
81d547
+ * On newer heterogeneous systems from AMD with CPU and GPU nodes connected
81d547
+ * via xGMI links, the non-CPU nodes are enumerated starting from index 8.
81d547
+ */
81d547
+#define NONCPU_NODE_INDEX	8
81d547
+
81d547
 /* SMCA Extended error strings */
81d547
 /* Load Store */
81d547
 static const char * const smca_ls_mce_desc[] = {
81d547
@@ -531,6 +537,26 @@ static int find_umc_channel(struct mce_event *e)
81d547
 {
81d547
 	return EXTRACT(e->ipid, 0, 31) >> 20;
81d547
 }
81d547
+
81d547
+/*
81d547
+ * Return the HBM channel managed by the UMCCH of a noncpu node, derived
81d547
+ * from bits [15:12] of IPID; 4 is added when the UMC value is odd.
81d547
+ */
81d547
+static int find_hbm_channel(struct mce_event *e)
81d547
+{
81d547
+	int umc, tmp;
81d547
+
81d547
+	umc = EXTRACT(e->ipid, 0, 31) >> 20;
81d547
+
81d547
+	/*
81d547
+	 * Bits [15:12] of IPID give the base HBM channel; the result is
81d547
+	 * offset by 4 when the UMC value computed above is odd.
81d547
+	 */
81d547
+	tmp = ((e->ipid >> 12) & 0xf);
81d547
+
81d547
+	return (umc % 2) ? tmp + 4 : tmp;
81d547
+}
81d547
+
81d547
 /* Decode extended errors according to Scalable MCA specification */
81d547
 static void decode_smca_error(struct mce_event *e)
81d547
 {
81d547
@@ -539,6 +565,7 @@ static void decode_smca_error(struct mce_event *e)
81d547
 	unsigned short xec = (e->status >> 16) & 0x3f;
81d547
 	const struct smca_hwid *s_hwid;
81d547
 	uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
81d547
+	uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
81d547
 	unsigned int csrow = -1, channel = -1;
81d547
 	unsigned int i;
81d547
 
81d547
@@ -548,14 +575,16 @@ static void decode_smca_error(struct mce_event *e)
81d547
 			bank_type = s_hwid->bank_type;
81d547
 			break;
81d547
 		}
81d547
+		if (mcatype_instancehi >= NONCPU_NODE_INDEX)
81d547
+			bank_type = SMCA_UMC_V2;
81d547
 	}
81d547
 
81d547
-	if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
81d547
+	if (i >= MAX_NR_BANKS) {
81d547
 		strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
81d547
 		return;
81d547
 	}
81d547
 
81d547
-	if (bank_type >= N_SMCA_BANK_TYPES) {
81d547
+	if (bank_type >= MAX_NR_BANKS) {
81d547
 		strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
81d547
 		return;
81d547
 	}
81d547
@@ -580,6 +609,16 @@ static void decode_smca_error(struct mce_event *e)
81d547
 		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
81d547
 			     channel, csrow);
81d547
 	}
81d547
+
81d547
+	if (bank_type == SMCA_UMC_V2 && xec == 0) {
81d547
+		/* The UMCPHY is reported as csrow in case of noncpu nodes */
81d547
+		csrow = find_umc_channel(e) / 2;
81d547
+		/* UMCCH is managing the HBM memory */
81d547
+		channel = find_hbm_channel(e);
81d547
+		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
81d547
+			     channel, csrow);
81d547
+	}
81d547
+
81d547
 }
81d547
 
81d547
 int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)