|
|
df8b6a |
commit aecf33aa70331670c06db6b652712b476e24051c
|
|
|
df8b6a |
Author: Muralidhara M K <muralimk@amd.com>
|
|
|
df8b6a |
Date: Mon Jul 12 05:40:46 2021 -0500
|
|
|
df8b6a |
|
|
|
df8b6a |
rasdaemon: Enumerate memory on noncpu nodes
|
|
|
df8b6a |
|
|
|
df8b6a |
Newer heterogeneous systems from AMD have GPU nodes (with HBM2 memory
|
|
|
df8b6a |
banks) connected via xGMI links to the CPUs.
|
|
|
df8b6a |
|
|
|
df8b6a |
The node id information is available in the InstanceHI[47:44] of
|
|
|
df8b6a |
the IPID register.
|
|
|
df8b6a |
|
|
|
df8b6a |
The UMC Phys on Aldebaran nodes are enumerated as csrow.
|
|
|
df8b6a |
The UMC channels connected to HBMs are enumerated as ranks.
|
|
|
df8b6a |
|
|
|
df8b6a |
Signed-off-by: Muralidhara M K <muralimk@amd.com>
|
|
|
df8b6a |
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
|
|
|
df8b6a |
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
|
|
df8b6a |
|
|
|
df8b6a |
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
|
|
|
df8b6a |
index 3c346f4..f3379fc 100644
|
|
|
df8b6a |
--- a/mce-amd-smca.c
|
|
|
df8b6a |
+++ b/mce-amd-smca.c
|
|
|
df8b6a |
@@ -78,6 +78,12 @@ enum smca_bank_types {
|
|
|
df8b6a |
/* Maximum number of MCA banks per CPU. */
|
|
|
df8b6a |
#define MAX_NR_BANKS 64
|
|
|
df8b6a |
|
|
|
df8b6a |
+/*
|
|
|
df8b6a |
+ * On newer heterogeneous systems from AMD with CPU and GPU nodes connected
|
|
|
df8b6a |
+ * via xGMI links, the non-CPU nodes are enumerated from index 8
|
|
|
df8b6a |
+ */
|
|
|
df8b6a |
+#define NONCPU_NODE_INDEX 8
|
|
|
df8b6a |
+
|
|
|
df8b6a |
/* SMCA Extended error strings */
|
|
|
df8b6a |
/* Load Store */
|
|
|
df8b6a |
static const char * const smca_ls_mce_desc[] = {
|
|
|
df8b6a |
@@ -531,6 +537,26 @@ static int find_umc_channel(struct mce_event *e)
|
|
|
df8b6a |
{
|
|
|
df8b6a |
return EXTRACT(e->ipid, 0, 31) >> 20;
|
|
|
df8b6a |
}
|
|
|
df8b6a |
+
|
|
|
df8b6a |
+/*
|
|
|
df8b6a |
+ * The HBM memory managed by the UMCCH of the noncpu node
|
|
|
df8b6a |
+ * can be calculated based on bits [15:12] of IPID
|
|
|
df8b6a |
+ */
|
|
|
df8b6a |
+static int find_hbm_channel(struct mce_event *e)
|
|
|
df8b6a |
+{
|
|
|
df8b6a |
+ int umc, tmp;
|
|
|
df8b6a |
+
|
|
|
df8b6a |
+ umc = EXTRACT(e->ipid, 0, 31) >> 20;
|
|
|
df8b6a |
+
|
|
|
df8b6a |
+ /*
|
|
|
df8b6a |
+ * The HBM channel managed by the UMC of the noncpu node
|
|
|
df8b6a |
+ * can be calculated based on the [15:12]bits of IPID as follows
|
|
|
df8b6a |
+ */
|
|
|
df8b6a |
+ tmp = ((e->ipid >> 12) & 0xf);
|
|
|
df8b6a |
+
|
|
|
df8b6a |
+ return (umc % 2) ? tmp + 4 : tmp;
|
|
|
df8b6a |
+}
|
|
|
df8b6a |
+
|
|
|
df8b6a |
/* Decode extended errors according to Scalable MCA specification */
|
|
|
df8b6a |
static void decode_smca_error(struct mce_event *e)
|
|
|
df8b6a |
{
|
|
|
df8b6a |
@@ -539,6 +565,7 @@ static void decode_smca_error(struct mce_event *e)
|
|
|
df8b6a |
unsigned short xec = (e->status >> 16) & 0x3f;
|
|
|
df8b6a |
const struct smca_hwid *s_hwid;
|
|
|
df8b6a |
uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
|
|
|
df8b6a |
+ uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
|
|
|
df8b6a |
unsigned int csrow = -1, channel = -1;
|
|
|
df8b6a |
unsigned int i;
|
|
|
df8b6a |
|
|
|
df8b6a |
@@ -548,14 +575,16 @@ static void decode_smca_error(struct mce_event *e)
|
|
|
df8b6a |
bank_type = s_hwid->bank_type;
|
|
|
df8b6a |
break;
|
|
|
df8b6a |
}
|
|
|
df8b6a |
+ if (mcatype_instancehi >= NONCPU_NODE_INDEX)
|
|
|
df8b6a |
+ bank_type = SMCA_UMC_V2;
|
|
|
df8b6a |
}
|
|
|
df8b6a |
|
|
|
df8b6a |
- if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
|
|
|
df8b6a |
+ if (i >= MAX_NR_BANKS) {
|
|
|
df8b6a |
strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
|
|
|
df8b6a |
return;
|
|
|
df8b6a |
}
|
|
|
df8b6a |
|
|
|
df8b6a |
- if (bank_type >= N_SMCA_BANK_TYPES) {
|
|
|
df8b6a |
+ if (bank_type >= MAX_NR_BANKS) {
|
|
|
df8b6a |
strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
|
|
|
df8b6a |
return;
|
|
|
df8b6a |
}
|
|
|
df8b6a |
@@ -580,6 +609,16 @@ static void decode_smca_error(struct mce_event *e)
|
|
|
df8b6a |
mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
|
|
|
df8b6a |
channel, csrow);
|
|
|
df8b6a |
}
|
|
|
df8b6a |
+
|
|
|
df8b6a |
+ if (bank_type == SMCA_UMC_V2 && xec == 0) {
|
|
|
df8b6a |
+ /* The UMCPHY is reported as csrow in case of noncpu nodes */
|
|
|
df8b6a |
+ csrow = find_umc_channel(e) / 2;
|
|
|
df8b6a |
+ /* UMCCH is managing the HBM memory */
|
|
|
df8b6a |
+ channel = find_hbm_channel(e);
|
|
|
df8b6a |
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
|
|
|
df8b6a |
+ channel, csrow);
|
|
|
df8b6a |
+ }
|
|
|
df8b6a |
+
|
|
|
df8b6a |
}
|
|
|
df8b6a |
|
|
|
df8b6a |
int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
|