Blame SOURCES/aecf33aa70331670c06db6b652712b476e24051c.patch

9fadc0
commit aecf33aa70331670c06db6b652712b476e24051c
9fadc0
Author: Muralidhara M K <muralimk@amd.com>
9fadc0
Date:   Mon Jul 12 05:40:46 2021 -0500
9fadc0
9fadc0
    rasdaemon: Enumerate memory on noncpu nodes
9fadc0
    
9fadc0
    Newer heterogeneous systems from AMD have GPU nodes (with HBM2 memory
9fadc0
    banks) connected via xGMI links to the CPUs.
9fadc0
    
9fadc0
    The node id information is available in the InstanceHI[47:44] of
9fadc0
    the IPID register.
9fadc0
    
9fadc0
    The UMC Phys on Aldebaran nodes are enumerated as csrow.
9fadc0
    The UMC channels connected to HBMs are enumerated as ranks.
9fadc0
    
9fadc0
    Signed-off-by: Muralidhara M K <muralimk@amd.com>
9fadc0
    Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
9fadc0
    Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
9fadc0
9fadc0
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
9fadc0
index 3c346f4..f3379fc 100644
9fadc0
--- a/mce-amd-smca.c
9fadc0
+++ b/mce-amd-smca.c
9fadc0
@@ -78,6 +78,12 @@ enum smca_bank_types {
9fadc0
 /* Maximum number of MCA banks per CPU. */
9fadc0
 #define MAX_NR_BANKS	64
9fadc0
 
9fadc0
+/*
9fadc0
+ * On Newer heterogeneous systems from AMD with CPU and GPU nodes connected
9fadc0
+ * via xGMI links, the NON CPU Nodes are enumerated from index 8
9fadc0
+ */
9fadc0
+#define NONCPU_NODE_INDEX	8
9fadc0
+
9fadc0
 /* SMCA Extended error strings */
9fadc0
 /* Load Store */
9fadc0
 static const char * const smca_ls_mce_desc[] = {
9fadc0
@@ -531,6 +537,26 @@ static int find_umc_channel(struct mce_event *e)
9fadc0
 {
9fadc0
 	return EXTRACT(e->ipid, 0, 31) >> 20;
9fadc0
 }
9fadc0
+
9fadc0
+/*
9fadc0
+ * The HBM memory managed by the UMCCH of the noncpu node
9fadc0
+ * can be calculated based on the [15:12]bits of IPID
9fadc0
+ */
9fadc0
+static int find_hbm_channel(struct mce_event *e)
9fadc0
+{
9fadc0
+	int umc, tmp;
9fadc0
+
9fadc0
+	umc = EXTRACT(e->ipid, 0, 31) >> 20;
9fadc0
+
9fadc0
+	/*
9fadc0
+	 * The HBM channel managed by the UMC of the noncpu node
9fadc0
+	 * can be calculated based on the [15:12]bits of IPID as follows
9fadc0
+	 */
9fadc0
+	tmp = ((e->ipid >> 12) & 0xf);
9fadc0
+
9fadc0
+	return (umc % 2) ? tmp + 4 : tmp;
9fadc0
+}
9fadc0
+
9fadc0
 /* Decode extended errors according to Scalable MCA specification */
9fadc0
 static void decode_smca_error(struct mce_event *e)
9fadc0
 {
9fadc0
@@ -539,6 +565,7 @@ static void decode_smca_error(struct mce_event *e)
9fadc0
 	unsigned short xec = (e->status >> 16) & 0x3f;
9fadc0
 	const struct smca_hwid *s_hwid;
9fadc0
 	uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
9fadc0
+	uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
9fadc0
 	unsigned int csrow = -1, channel = -1;
9fadc0
 	unsigned int i;
9fadc0
 
9fadc0
@@ -548,14 +575,16 @@ static void decode_smca_error(struct mce_event *e)
9fadc0
 			bank_type = s_hwid->bank_type;
9fadc0
 			break;
9fadc0
 		}
9fadc0
+		if (mcatype_instancehi >= NONCPU_NODE_INDEX)
9fadc0
+			bank_type = SMCA_UMC_V2;
9fadc0
 	}
9fadc0
 
9fadc0
-	if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
9fadc0
+	if (i >= MAX_NR_BANKS) {
9fadc0
 		strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
9fadc0
 		return;
9fadc0
 	}
9fadc0
 
9fadc0
-	if (bank_type >= N_SMCA_BANK_TYPES) {
9fadc0
+	if (bank_type >= MAX_NR_BANKS) {
9fadc0
 		strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
9fadc0
 		return;
9fadc0
 	}
9fadc0
@@ -580,6 +609,16 @@ static void decode_smca_error(struct mce_event *e)
9fadc0
 		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
9fadc0
 			     channel, csrow);
9fadc0
 	}
9fadc0
+
9fadc0
+	if (bank_type == SMCA_UMC_V2 && xec == 0) {
9fadc0
+		/* The UMCPHY is reported as csrow in case of noncpu nodes */
9fadc0
+		csrow = find_umc_channel(e) / 2;
9fadc0
+		/* UMCCH is managing the HBM memory */
9fadc0
+		channel = find_hbm_channel(e);
9fadc0
+		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
9fadc0
+			     channel, csrow);
9fadc0
+	}
9fadc0
+
9fadc0
 }
9fadc0
 
9fadc0
 int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)