Blame SOURCES/aecf33aa70331670c06db6b652712b476e24051c.patch

d77db6
commit aecf33aa70331670c06db6b652712b476e24051c
d77db6
Author: Muralidhara M K <muralimk@amd.com>
d77db6
Date:   Mon Jul 12 05:40:46 2021 -0500
d77db6
d77db6
    rasdaemon: Enumerate memory on noncpu nodes
d77db6
    
d77db6
    Newer heterogeneous systems from AMD have GPU nodes (with HBM2 memory
d77db6
    banks) connected via xGMI links to the CPUs.
d77db6
    
d77db6
    The node id information is available in the InstanceHI[47:44] of
d77db6
    the IPID register.
d77db6
    
d77db6
    The UMC PHYs on Aldebaran nodes are enumerated as csrows.
d77db6
    The UMC channels connected to HBMs are enumerated as ranks.
d77db6
    
d77db6
    Signed-off-by: Muralidhara M K <muralimk@amd.com>
d77db6
    Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
d77db6
    Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
d77db6
d77db6
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
d77db6
index 3c346f4..f3379fc 100644
d77db6
--- a/mce-amd-smca.c
d77db6
+++ b/mce-amd-smca.c
d77db6
@@ -78,6 +78,12 @@ enum smca_bank_types {
d77db6
 /* Maximum number of MCA banks per CPU. */
d77db6
 #define MAX_NR_BANKS	64
d77db6
 
d77db6
+/*
d77db6
+ * On newer heterogeneous systems from AMD with CPU and GPU nodes connected
d77db6
+ * via xGMI links, the non-CPU nodes are enumerated from index 8.
d77db6
+ */
d77db6
+#define NONCPU_NODE_INDEX	8
d77db6
+
d77db6
 /* SMCA Extended error strings */
d77db6
 /* Load Store */
d77db6
 static const char * const smca_ls_mce_desc[] = {
d77db6
@@ -531,6 +537,26 @@ static int find_umc_channel(struct mce_event *e)
d77db6
 {
d77db6
 	return EXTRACT(e->ipid, 0, 31) >> 20;
d77db6
 }
d77db6
+
d77db6
+/*
d77db6
+ * The HBM memory managed by the UMCCH of the noncpu node
d77db6
+ * can be calculated based on bits [15:12] of IPID
d77db6
+ */
d77db6
+static int find_hbm_channel(struct mce_event *e)
d77db6
+{
d77db6
+	int umc, tmp;
d77db6
+
d77db6
+	umc = EXTRACT(e->ipid, 0, 31) >> 20;
d77db6
+
d77db6
+	/*
d77db6
+	 * The HBM channel managed by the UMC of the noncpu node
d77db6
+	 * can be calculated based on the [15:12]bits of IPID as follows
d77db6
+	 */
d77db6
+	tmp = ((e->ipid >> 12) & 0xf);
d77db6
+
d77db6
+	return (umc % 2) ? tmp + 4 : tmp;
d77db6
+}
d77db6
+
d77db6
 /* Decode extended errors according to Scalable MCA specification */
d77db6
 static void decode_smca_error(struct mce_event *e)
d77db6
 {
d77db6
@@ -539,6 +565,7 @@ static void decode_smca_error(struct mce_event *e)
d77db6
 	unsigned short xec = (e->status >> 16) & 0x3f;
d77db6
 	const struct smca_hwid *s_hwid;
d77db6
 	uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
d77db6
+	uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
d77db6
 	unsigned int csrow = -1, channel = -1;
d77db6
 	unsigned int i;
d77db6
 
d77db6
@@ -548,14 +575,16 @@ static void decode_smca_error(struct mce_event *e)
d77db6
 			bank_type = s_hwid->bank_type;
d77db6
 			break;
d77db6
 		}
d77db6
+		if (mcatype_instancehi >= NONCPU_NODE_INDEX)
d77db6
+			bank_type = SMCA_UMC_V2;
d77db6
 	}
d77db6
 
d77db6
-	if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
d77db6
+	if (i >= MAX_NR_BANKS) {
d77db6
 		strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
d77db6
 		return;
d77db6
 	}
d77db6
 
d77db6
-	if (bank_type >= N_SMCA_BANK_TYPES) {
d77db6
+	if (bank_type >= MAX_NR_BANKS) {
d77db6
 		strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
d77db6
 		return;
d77db6
 	}
d77db6
@@ -580,6 +609,16 @@ static void decode_smca_error(struct mce_event *e)
d77db6
 		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
d77db6
 			     channel, csrow);
d77db6
 	}
d77db6
+
d77db6
+	if (bank_type == SMCA_UMC_V2 && xec == 0) {
d77db6
+		/* The UMCPHY is reported as csrow in case of noncpu nodes */
d77db6
+		csrow = find_umc_channel(e) / 2;
d77db6
+		/* UMCCH is managing the HBM memory */
d77db6
+		channel = find_hbm_channel(e);
d77db6
+		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
d77db6
+			     channel, csrow);
d77db6
+	}
d77db6
+
d77db6
 }
d77db6
 
d77db6
 int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)