krishnanadh / rpms / rasdaemon

Forked from rpms/rasdaemon a year ago
Clone
7ea41b
commit a16ca0711001957ee98f2c124abce0fa1f801529
7ea41b
Author: Chandu-babu Namburu <chandu@amd.com>
7ea41b
Date:   Wed Jan 30 20:36:45 2019 +0530
7ea41b
7ea41b
    rasdaemon: add support for AMD Scalable MCA
7ea41b
    
7ea41b
    Add logic here to decode errors from all known IP blocks for
7ea41b
    AMD Scalable MCA supported processors
7ea41b
    
7ea41b
    Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
7ea41b
    Signed-off-by: Chandu-babu Namburu <chandu@amd.com>
7ea41b
7ea41b
---
7ea41b
 mce-amd-smca.c    |  371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
7ea41b
 mce-amd.c         |  122 +++++++++++++++++
7ea41b
 ras-mce-handler.c |   24 +++
7ea41b
 ras-mce-handler.h |   15 ++
7ea41b
 4 files changed, 530 insertions(+), 2 deletions(-)
7ea41b
7ea41b
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
7ea41b
+++ rasdaemon-0.6.1/mce-amd-smca.c	2019-07-12 11:35:04.836470461 -0400
7ea41b
@@ -0,0 +1,371 @@
7ea41b
+/*
7ea41b
+ * Copyright (c) 2018, AMD, Inc. All rights reserved.
7ea41b
+ *
7ea41b
+ * This program is free software; you can redistribute it and/or modify
7ea41b
+ * it under the terms of the GNU General Public License version 2 and
7ea41b
+ * only version 2 as published by the Free Software Foundation.
7ea41b
+ *
7ea41b
+ * This program is distributed in the hope that it will be useful,
7ea41b
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
7ea41b
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
7ea41b
+ * GNU General Public License for more details.
7ea41b
+ */
7ea41b
+
7ea41b
+#include <stdio.h>
7ea41b
+#include <string.h>
7ea41b
+
7ea41b
+#include "ras-mce-handler.h"
7ea41b
+#include "bitfield.h"
7ea41b
+
7ea41b
+/* MCA_STATUS REGISTER FOR FAMILY 17H
7ea41b
+ *********************** Higher 32-bits *****************************
7ea41b
+ * 63: VALIDERROR, 62: OVERFLOW, 61: UC, 60: Err ENABLE,
7ea41b
+ * 59: Misc Valid, 58: Addr Valid, 57: PCC, 56: ErrCoreID Valid,
7ea41b
+ * 55: TCC, 54: RES, 53: Syndrome Valid, 52: Transparent,
7ea41b
+ * 51: RES, 50: RES, 49: RES, 48: RES,
7ea41b
+ * 47: RES, 46: CECC, 45: UECC, 44: Deferred,
7ea41b
+ * 43: Poison, 42: RES, 41: RES, 40: RES,
7ea41b
+ * 39: RES, 38: RES, 37: ErrCoreID[5], 36: ErrCoreID[4],
7ea41b
+ * 35: ErrCoreID[3], 34: ErrCoreID[2], 33: ErrCoreID[1], 32: ErrCoreID[0]
7ea41b
+ *********************** Lower 32-bits ******************************
7ea41b
+ * 31: RES, 30: RES, 29: RES, 28: RES,
7ea41b
+ * 27: RES, 26: RES, 25: RES, 24: RES
7ea41b
+ * 23: RES, 22: RES, 21: XEC[5], 20: XEC[4],
7ea41b
+ * 19: XEC[3], 18: XEC[2], 17: XEC[1], 16: XEC[0]
7ea41b
+ * 15: EC[15], 14: EC[14], 13: EC[13], 12: EC[12],
7ea41b
+ * 11: EC[11], 10: EC[10], 09: EC[9], 08: EC[8],
7ea41b
+ * 07: EC[7], 06: EC[6], 05: EC[5], 04: EC[4],
7ea41b
+ * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0]
7ea41b
+ */
7ea41b
+
7ea41b
+/* These may be used by multiple smca_hwid_mcatypes */
7ea41b
+enum smca_bank_types {
7ea41b
+	SMCA_LS = 0,    /* Load Store */
7ea41b
+	SMCA_IF,        /* Instruction Fetch */
7ea41b
+	SMCA_L2_CACHE,  /* L2 Cache */
7ea41b
+	SMCA_DE,        /* Decoder Unit */
7ea41b
+	SMCA_RESERVED,  /* Reserved */
7ea41b
+	SMCA_EX,        /* Execution Unit */
7ea41b
+	SMCA_FP,        /* Floating Point */
7ea41b
+	SMCA_L3_CACHE,  /* L3 Cache */
7ea41b
+	SMCA_CS,        /* Coherent Slave */
7ea41b
+	SMCA_PIE,       /* Power, Interrupts, etc. */
7ea41b
+	SMCA_UMC,       /* Unified Memory Controller */
7ea41b
+	SMCA_PB,        /* Parameter Block */
7ea41b
+	SMCA_PSP,       /* Platform Security Processor */
7ea41b
+	SMCA_SMU,       /* System Management Unit */
7ea41b
+	N_SMCA_BANK_TYPES
7ea41b
+};
7ea41b
+
7ea41b
+/* SMCA Extended error strings */
7ea41b
+/* Load Store */
7ea41b
+static const char * const smca_ls_mce_desc[] = {
7ea41b
+	"Load queue parity",
7ea41b
+	"Store queue parity",
7ea41b
+	"Miss address buffer payload parity",
7ea41b
+	"L1 TLB parity",
7ea41b
+	"Reserved",
7ea41b
+	"DC tag error type 6",
7ea41b
+	"DC tag error type 1",
7ea41b
+	"Internal error type 1",
7ea41b
+	"Internal error type 2",
7ea41b
+	"Sys Read data error thread 0",
7ea41b
+	"Sys read data error thread 1",
7ea41b
+	"DC tag error type 2",
7ea41b
+	"DC data error type 1 (poison consumption)",
7ea41b
+	"DC data error type 2",
7ea41b
+	"DC data error type 3",
7ea41b
+	"DC tag error type 4",
7ea41b
+	"L2 TLB parity",
7ea41b
+	"PDC parity error",
7ea41b
+	"DC tag error type 3",
7ea41b
+	"DC tag error type 5",
7ea41b
+	"L2 fill data error",
7ea41b
+};
7ea41b
+/* Instruction Fetch */
7ea41b
+static const char * const smca_if_mce_desc[] = {
7ea41b
+	"microtag probe port parity error",
7ea41b
+	"IC microtag or full tag multi-hit error",
7ea41b
+	"IC full tag parity",
7ea41b
+	"IC data array parity",
7ea41b
+	"Decoupling queue phys addr parity error",
7ea41b
+	"L0 ITLB parity error",
7ea41b
+	"L1 ITLB parity error",
7ea41b
+	"L2 ITLB parity error",
7ea41b
+	"BPQ snoop parity on Thread 0",
7ea41b
+	"BPQ snoop parity on Thread 1",
7ea41b
+	"L1 BTB multi-match error",
7ea41b
+	"L2 BTB multi-match error",
7ea41b
+	"L2 Cache Response Poison error",
7ea41b
+	"System Read Data error",
7ea41b
+};
7ea41b
+/* L2 Cache */
7ea41b
+static const char * const smca_l2_mce_desc[] = {
7ea41b
+	"L2M tag multi-way-hit error",
7ea41b
+	"L2M tag ECC error",
7ea41b
+	"L2M data ECC error",
7ea41b
+	"HW assert",
7ea41b
+};
7ea41b
+/* Decoder Unit */
7ea41b
+static const char * const smca_de_mce_desc[] = {
7ea41b
+	"uop cache tag parity error",
7ea41b
+	"uop cache data parity error",
7ea41b
+	"Insn buffer parity error",
7ea41b
+	"uop queue parity error",
7ea41b
+	"Insn dispatch queue parity error",
7ea41b
+	"Fetch address FIFO parity",
7ea41b
+	"Patch RAM data parity",
7ea41b
+	"Patch RAM sequencer parity",
7ea41b
+	"uop buffer parity"
7ea41b
+};
7ea41b
+/* Execution Unit */
7ea41b
+static const char * const smca_ex_mce_desc[] = {
7ea41b
+	"Watchdog timeout error",
7ea41b
+	"Phy register file parity",
7ea41b
+	"Flag register file parity",
7ea41b
+	"Immediate displacement register file parity",
7ea41b
+	"Address generator payload parity",
7ea41b
+	"EX payload parity",
7ea41b
+	"Checkpoint queue parity",
7ea41b
+	"Retire dispatch queue parity",
7ea41b
+	"Retire status queue parity error",
7ea41b
+	"Scheduling queue parity error",
7ea41b
+	"Branch buffer queue parity error",
7ea41b
+};
7ea41b
+/* Floating Point Unit */
7ea41b
+static const char * const smca_fp_mce_desc[] = {
7ea41b
+	"Physical register file parity",
7ea41b
+	"Freelist parity error",
7ea41b
+	"Schedule queue parity",
7ea41b
+	"NSQ parity error",
7ea41b
+	"Retire queue parity",
7ea41b
+	"Status register file parity",
7ea41b
+	"Hardware assertion",
7ea41b
+};
7ea41b
+/* L3 Cache */
7ea41b
+static const char * const smca_l3_mce_desc[] = {
7ea41b
+	"Shadow tag macro ECC error",
7ea41b
+	"Shadow tag macro multi-way-hit error",
7ea41b
+	"L3M tag ECC error",
7ea41b
+	"L3M tag multi-way-hit error",
7ea41b
+	"L3M data ECC error",
7ea41b
+	"XI parity, L3 fill done channel error",
7ea41b
+	"L3 victim queue parity",
7ea41b
+	"L3 HW assert",
7ea41b
+};
7ea41b
+/* Coherent Slave Unit */
7ea41b
+static const char * const smca_cs_mce_desc[] = {
7ea41b
+	"Illegal request from transport layer",
7ea41b
+	"Address violation",
7ea41b
+	"Security violation",
7ea41b
+	"Illegal response from transport layer",
7ea41b
+	"Unexpected response",
7ea41b
+	"Parity error on incoming request or probe response data",
7ea41b
+	"Parity error on incoming read response data",
7ea41b
+	"Atomic request parity",
7ea41b
+	"ECC error on probe filter access",
7ea41b
+};
7ea41b
+/* Power, Interrupts, etc. */
7ea41b
+static const char * const smca_pie_mce_desc[] = {
7ea41b
+	"HW assert",
7ea41b
+	"Internal PIE register security violation",
7ea41b
+	"Error on GMI link",
7ea41b
+	"Poison data written to internal PIE register",
7ea41b
+};
7ea41b
+/* Unified Memory Controller */
7ea41b
+static const char * const smca_umc_mce_desc[] = {
7ea41b
+	"DRAM ECC error",
7ea41b
+	"Data poison error on DRAM",
7ea41b
+	"SDP parity error",
7ea41b
+	"Advanced peripheral bus error",
7ea41b
+	"Command/address parity error",
7ea41b
+	"Write data CRC error",
7ea41b
+};
7ea41b
+/* Parameter Block */
7ea41b
+static const char * const smca_pb_mce_desc[] = {
7ea41b
+	"Parameter Block RAM ECC error",
7ea41b
+};
7ea41b
+/* Platform Security Processor */
7ea41b
+static const char * const smca_psp_mce_desc[] = {
7ea41b
+	"PSP RAM ECC or parity error",
7ea41b
+};
7ea41b
+/* System Management Unit */
7ea41b
+static const char * const smca_smu_mce_desc[] = {
7ea41b
+	"SMU RAM ECC or parity error",
7ea41b
+};
7ea41b
+
7ea41b
+struct smca_mce_desc {
7ea41b
+	const char * const *descs;
7ea41b
+	unsigned int num_descs;
7ea41b
+};
7ea41b
+
7ea41b
+static struct smca_mce_desc smca_mce_descs[] = {
7ea41b
+	[SMCA_LS]       = { smca_ls_mce_desc,   ARRAY_SIZE(smca_ls_mce_desc)  },
7ea41b
+	[SMCA_IF]       = { smca_if_mce_desc,   ARRAY_SIZE(smca_if_mce_desc)  },
7ea41b
+	[SMCA_L2_CACHE] = { smca_l2_mce_desc,   ARRAY_SIZE(smca_l2_mce_desc)  },
7ea41b
+	[SMCA_DE]       = { smca_de_mce_desc,   ARRAY_SIZE(smca_de_mce_desc)  },
7ea41b
+	[SMCA_EX]       = { smca_ex_mce_desc,   ARRAY_SIZE(smca_ex_mce_desc)  },
7ea41b
+	[SMCA_FP]       = { smca_fp_mce_desc,   ARRAY_SIZE(smca_fp_mce_desc)  },
7ea41b
+	[SMCA_L3_CACHE] = { smca_l3_mce_desc,   ARRAY_SIZE(smca_l3_mce_desc)  },
7ea41b
+	[SMCA_CS]       = { smca_cs_mce_desc,   ARRAY_SIZE(smca_cs_mce_desc)  },
7ea41b
+	[SMCA_PIE]      = { smca_pie_mce_desc,  ARRAY_SIZE(smca_pie_mce_desc) },
7ea41b
+	[SMCA_UMC]      = { smca_umc_mce_desc,  ARRAY_SIZE(smca_umc_mce_desc) },
7ea41b
+	[SMCA_PB]       = { smca_pb_mce_desc,   ARRAY_SIZE(smca_pb_mce_desc)  },
7ea41b
+	[SMCA_PSP]      = { smca_psp_mce_desc,  ARRAY_SIZE(smca_psp_mce_desc) },
7ea41b
+	[SMCA_SMU]      = { smca_smu_mce_desc,  ARRAY_SIZE(smca_smu_mce_desc) },
7ea41b
+};
7ea41b
+
7ea41b
+struct smca_hwid {
7ea41b
+	unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/
7ea41b
+	uint32_t mcatype_hwid;  /* mcatype,hwid bit 63-32 in MCx_IPID Register*/
7ea41b
+};
7ea41b
+
7ea41b
+static struct smca_hwid smca_hwid_mcatypes[] = {
7ea41b
+	/* { bank_type, mcatype_hwid } */
7ea41b
+
7ea41b
+	/* ZN Core (HWID=0xB0) MCA types */
7ea41b
+	{ SMCA_LS,       0x000000B0 },
7ea41b
+	{ SMCA_IF,       0x000100B0 },
7ea41b
+	{ SMCA_L2_CACHE, 0x000200B0 },
7ea41b
+	{ SMCA_DE,       0x000300B0 },
7ea41b
+	/* HWID 0xB0 MCATYPE 0x4 is Reserved */
7ea41b
+	{ SMCA_EX,       0x000500B0 },
7ea41b
+	{ SMCA_FP,       0x000600B0 },
7ea41b
+	{ SMCA_L3_CACHE, 0x000700B0 },
7ea41b
+
7ea41b
+	/* Data Fabric MCA types */
7ea41b
+	{ SMCA_CS,       0x0000002E },
7ea41b
+	{ SMCA_PIE,      0x0001002E },
7ea41b
+
7ea41b
+	/* Unified Memory Controller MCA type */
7ea41b
+	{ SMCA_UMC,      0x00000096 },
7ea41b
+
7ea41b
+	/* Parameter Block MCA type */
7ea41b
+	{ SMCA_PB,       0x00000005 },
7ea41b
+
7ea41b
+	/* Platform Security Processor MCA type */
7ea41b
+	{ SMCA_PSP,      0x000000FF },
7ea41b
+
7ea41b
+	/* System Management Unit MCA type */
7ea41b
+	{ SMCA_SMU,      0x00000001 },
7ea41b
+};
7ea41b
+
7ea41b
+struct smca_bank_name {
7ea41b
+	const char *name;
7ea41b
+};
7ea41b
+
7ea41b
+static struct smca_bank_name smca_names[] = {
7ea41b
+	[SMCA_LS]       = { "Load Store Unit" },
7ea41b
+	[SMCA_IF]       = { "Instruction Fetch Unit" },
7ea41b
+	[SMCA_L2_CACHE] = { "L2 Cache" },
7ea41b
+	[SMCA_DE]       = { "Decode Unit" },
7ea41b
+	[SMCA_RESERVED] = { "Reserved" },
7ea41b
+	[SMCA_EX]       = { "Execution Unit" },
7ea41b
+	[SMCA_FP]       = { "Floating Point Unit" },
7ea41b
+	[SMCA_L3_CACHE] = { "L3 Cache" },
7ea41b
+	[SMCA_CS]       = { "Coherent Slave" },
7ea41b
+	[SMCA_PIE]      = { "Power, Interrupts, etc." },
7ea41b
+	[SMCA_UMC]      = { "Unified Memory Controller" },
7ea41b
+	[SMCA_PB]       = { "Parameter Block" },
7ea41b
+	[SMCA_PSP]      = { "Platform Security Processor" },
7ea41b
+	[SMCA_SMU]      = { "System Management Unit" },
7ea41b
+};
7ea41b
+
7ea41b
+static void amd_decode_errcode(struct mce_event *e)
7ea41b
+{
7ea41b
+
7ea41b
+	decode_amd_errcode(e);
7ea41b
+
7ea41b
+	if (e->status & MCI_STATUS_POISON)
7ea41b
+		mce_snprintf(e->mcistatus_msg, "Poison consumed");
7ea41b
+
7ea41b
+	if (e->status & MCI_STATUS_TCC)
7ea41b
+		mce_snprintf(e->mcistatus_msg, "Task_context_corrupt");
7ea41b
+
7ea41b
+}
7ea41b
+/*
7ea41b
+ * To find the UMC channel represented by this bank we need to match on its
7ea41b
+ * instance_id. The instance_id of a bank is held in the lower 32 bits of its
7ea41b
+ * IPID.
7ea41b
+ */
7ea41b
+static int find_umc_channel(struct mce_event *e)
7ea41b
+{
7ea41b
+	uint32_t umc_instance_id[] = {0x50f00, 0x150f00};
7ea41b
+	uint32_t instance_id = EXTRACT(e->ipid, 0, 31);
7ea41b
+	int i, channel = -1;
7ea41b
+
7ea41b
+	for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++)
7ea41b
+		if (umc_instance_id[i] == instance_id)
7ea41b
+			channel = i;
7ea41b
+
7ea41b
+	return channel;
7ea41b
+}
7ea41b
+/* Decode extended errors according to Scalable MCA specification */
7ea41b
+static void decode_smca_error(struct mce_event *e)
7ea41b
+{
7ea41b
+	enum smca_bank_types bank_type;
7ea41b
+	const char *ip_name;
7ea41b
+	unsigned short xec = (e->status >> 16) & 0x3f;
7ea41b
+	const struct smca_hwid *s_hwid;
7ea41b
+	uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
7ea41b
+	unsigned int csrow = -1, channel = -1;
7ea41b
+	unsigned int i;
7ea41b
+
7ea41b
+	for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
7ea41b
+		s_hwid = &smca_hwid_mcatypes[i];
7ea41b
+		if (mcatype_hwid == s_hwid->mcatype_hwid) {
7ea41b
+			bank_type = s_hwid->bank_type;
7ea41b
+			break;
7ea41b
+		}
7ea41b
+	}
7ea41b
+
7ea41b
+	if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
7ea41b
+		strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
7ea41b
+		return;
7ea41b
+	}
7ea41b
+
7ea41b
+	if (bank_type >= N_SMCA_BANK_TYPES) {
7ea41b
+		strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
7ea41b
+		return;
7ea41b
+	}
7ea41b
+
7ea41b
+	if (bank_type == SMCA_RESERVED) {
7ea41b
+		strcpy(e->mcastatus_msg, "Bank 4 is reserved.\n");
7ea41b
+		return;
7ea41b
+	}
7ea41b
+
7ea41b
+	ip_name = smca_names[bank_type].name;
7ea41b
+
7ea41b
+	mce_snprintf(e->bank_name, "%s (bank=%d)", ip_name, e->bank);
7ea41b
+
7ea41b
+	/* Only print the descriptor of valid extended error code */
7ea41b
+	if (xec < smca_mce_descs[bank_type].num_descs)
7ea41b
+		mce_snprintf(e->mcastatus_msg,
7ea41b
+			     " %s.\n", smca_mce_descs[bank_type].descs[xec]);
7ea41b
+
7ea41b
+	if (bank_type == SMCA_UMC && xec == 0) {
7ea41b
+		channel = find_umc_channel(e);
7ea41b
+		csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
7ea41b
+		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
7ea41b
+			     channel, csrow);
7ea41b
+	}
7ea41b
+}
7ea41b
+
7ea41b
+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
7ea41b
+{
7ea41b
+	uint64_t mcgstatus = e->mcgstatus;
7ea41b
+
7ea41b
+	mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld",
7ea41b
+		    (long long)e->mcgstatus);
7ea41b
+
7ea41b
+	if (mcgstatus & MCG_STATUS_RIPV)
7ea41b
+		mce_snprintf(e->mcgstatus_msg, "RIPV");
7ea41b
+	if (mcgstatus & MCG_STATUS_EIPV)
7ea41b
+		mce_snprintf(e->mcgstatus_msg, "EIPV");
7ea41b
+	if (mcgstatus & MCG_STATUS_MCIP)
7ea41b
+		mce_snprintf(e->mcgstatus_msg, "MCIP");
7ea41b
+
7ea41b
+	decode_smca_error(e);
7ea41b
+	amd_decode_errcode(e);
7ea41b
+	return 0;
7ea41b
+}
7ea41b
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
7ea41b
+++ rasdaemon-0.6.1/mce-amd.c	2019-07-12 11:35:04.836470461 -0400
7ea41b
@@ -0,0 +1,122 @@
7ea41b
+/*
7ea41b
+ * Copyright (c) 2018, The AMD, Inc. All rights reserved.
7ea41b
+ *
7ea41b
+ * This program is free software; you can redistribute it and/or modify
7ea41b
+ * it under the terms of the GNU General Public License version 2 and
7ea41b
+ * only version 2 as published by the Free Software Foundation.
7ea41b
+ *
7ea41b
+ * This program is distributed in the hope that it will be useful,
7ea41b
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
7ea41b
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
7ea41b
+ * GNU General Public License for more details.
7ea41b
+ */
7ea41b
+
7ea41b
+#include <stdio.h>
7ea41b
+#include <string.h>
7ea41b
+
7ea41b
+#include "ras-mce-handler.h"
7ea41b
+
7ea41b
+/* Error Code Types */
7ea41b
+#define TLB_ERROR(x)                    (((x) & 0xFFF0) == 0x0010)
7ea41b
+#define MEM_ERROR(x)                    (((x) & 0xFF00) == 0x0100)
7ea41b
+#define BUS_ERROR(x)                    (((x) & 0xF800) == 0x0800)
7ea41b
+#define INT_ERROR(x)                    (((x) & 0xF4FF) == 0x0400)
7ea41b
+
7ea41b
+/* Error code: transaction type (TT) */
7ea41b
+static char *transaction[] = {
7ea41b
+	"instruction", "data", "generic", "reserved"
7ea41b
+};
7ea41b
+/* Error codes: cache level (LL) */
7ea41b
+static char *cachelevel[] = {
7ea41b
+	"reserved", "L1", "L2", "L3/generic"
7ea41b
+};
7ea41b
+/* Error codes: memory transaction type (RRRR) */
7ea41b
+static char *memtrans[] = {
7ea41b
+	"generic", "generic read", "generic write", "data read",
7ea41b
+	"data write", "instruction fetch", "prefetch", "evict", "snoop",
7ea41b
+	"?", "?", "?", "?", "?", "?", "?"
7ea41b
+};
7ea41b
+/* Participation Processor */
7ea41b
+static char *partproc[] = {
7ea41b
+	"local node origin", "local node response",
7ea41b
+	"local node observed", "generic participation"
7ea41b
+};
7ea41b
+/* Timeout */
7ea41b
+static char *timeout[] = {
7ea41b
+	"request didn't time out",
7ea41b
+	"request timed out"
7ea41b
+};
7ea41b
+/* internal unclassified error code */
7ea41b
+static char *internal[] = { "reserved",
7ea41b
+			    "reserved",
7ea41b
+			    "hardware assert",
7ea41b
+			    "reserved" };
7ea41b
+
7ea41b
+#define TT(x)         (((x) >> 2) & 0x3)   /*bit 2, bit 3*/
7ea41b
+#define TT_MSG(x)     transaction[TT(x)]
7ea41b
+#define LL(x)         ((x) & 0x3)          /*bit 0, bit 1*/
7ea41b
+#define LL_MSG(x)     cachelevel[LL(x)]
7ea41b
+
7ea41b
+#define R4(x)         (((x) >> 4) & 0xF)   /*bit 4, bit 5, bit 6, bit 7 */
7ea41b
+#define R4_MSG(x)     ((R4(x) < 9) ?  memtrans[R4(x)] : "Wrong R4!")
7ea41b
+
7ea41b
+#define TO(x)         (((x) >> 8) & 0x1)   /*bit 8*/
7ea41b
+#define TO_MSG(x)     timeout[TO(x)]
7ea41b
+#define PP(x)         (((x) >> 9) & 0x3)   /*bit 9, bit 10*/
7ea41b
+#define PP_MSG(x)     partproc[PP(x)]
7ea41b
+
7ea41b
+#define UU(x)         (((x) >> 8) & 0x3)   /*bit 8, bit 9*/
7ea41b
+#define UU_MSG(x)     internal[UU(x)]
7ea41b
+
7ea41b
+/* Decode MCA_STATUS severity, flag bits and error code into e's messages. */
7ea41b
+void decode_amd_errcode(struct mce_event *e)
7ea41b
+{
7ea41b
+	uint16_t ec = e->status & 0xffff;
7ea41b
+	uint16_t ecc = (e->status >> 45) & 0x3; /* bit0: UECC, bit1: CECC */
7ea41b
+
7ea41b
+	if (e->status & MCI_STATUS_UC) { /* exactly one severity message */
7ea41b
+		if (e->status & MCI_STATUS_PCC)
7ea41b
+			strcpy(e->error_msg, "System Fatal error.");
7ea41b
+		else if (e->mcgstatus & MCG_STATUS_RIPV)
7ea41b
+			strcpy(e->error_msg,
7ea41b
+			       "Uncorrected, software restartable error.");
7ea41b
+		else	/* neither fatal nor restartable */
7ea41b
+			strcpy(e->error_msg,
7ea41b
+			       "Uncorrected, software containable error.");
7ea41b
+	} else if (e->status & MCI_STATUS_DEFERRED)
7ea41b
+		strcpy(e->error_msg, "Deferred error, no action required.");
7ea41b
+	else
7ea41b
+		strcpy(e->error_msg, "Corrected error, no action required.");
7ea41b
+
7ea41b
+	if (!(e->status & MCI_STATUS_VAL))
7ea41b
+		mce_snprintf(e->mcistatus_msg, "MCE_INVALID");
7ea41b
+
7ea41b
+	if (e->status & MCI_STATUS_OVER)
7ea41b
+		mce_snprintf(e->mcistatus_msg, "Error_overflow");
7ea41b
+
7ea41b
+	if (e->status & MCI_STATUS_PCC)
7ea41b
+		mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt");
7ea41b
+
7ea41b
+	if (ecc)
7ea41b
+		mce_snprintf(e->mcistatus_msg,
7ea41b
+			     "%sECC", ((ecc == 2) ? "C" : "U"));
7ea41b
+
7ea41b
+	if (INT_ERROR(ec)) {
7ea41b
+		mce_snprintf(e->mcastatus_msg, "Internal '%s'", UU_MSG(ec));
7ea41b
+		return;
7ea41b
+	}
7ea41b
+
7ea41b
+	if (TLB_ERROR(ec))
7ea41b
+		mce_snprintf(e->mcastatus_msg,
7ea41b
+			     "TLB Error 'tx: %s, level: %s'",
7ea41b
+			     TT_MSG(ec), LL_MSG(ec));
7ea41b
+	else if (MEM_ERROR(ec))
7ea41b
+		mce_snprintf(e->mcastatus_msg,
7ea41b
+			     "Memory Error 'mem-tx: %s, tx: %s, level: %s'",
7ea41b
+			     R4_MSG(ec), TT_MSG(ec), LL_MSG(ec));
7ea41b
+	else if (BUS_ERROR(ec))
7ea41b
+		mce_snprintf(e->mcastatus_msg,
7ea41b
+			     "Bus Error '%s, %s, mem-tx: %s, level: %s'",
7ea41b
+			     PP_MSG(ec), TO_MSG(ec),
7ea41b
+			     R4_MSG(ec), LL_MSG(ec));
7ea41b
+}
7ea41b
--- rasdaemon-0.6.1.orig/ras-mce-handler.c	2019-07-12 11:35:01.585502811 -0400
7ea41b
+++ rasdaemon-0.6.1/ras-mce-handler.c	2019-07-12 11:35:04.836470461 -0400
7ea41b
@@ -55,6 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
7ea41b
 	[CPU_KNIGHTS_LANDING] = "Knights Landing",
7ea41b
 	[CPU_KNIGHTS_MILL] = "Knights Mill",
7ea41b
 	[CPU_SKYLAKE_XEON] = "Skylake server",
7ea41b
+	[CPU_NAPLES] = "AMD Family 17h Zen1"
7ea41b
 };
7ea41b
 
7ea41b
 static enum cputype select_intel_cputype(struct ras_events *ras)
7ea41b
@@ -190,9 +191,12 @@ ret = 0;
7ea41b
 	if (!strcmp(mce->vendor, "AuthenticAMD")) {
7ea41b
 		if (mce->family == 15)
7ea41b
 			mce->cputype = CPU_K8;
7ea41b
-		if (mce->family > 15) {
7ea41b
+		if (mce->family == 23)
7ea41b
+			mce->cputype = CPU_NAPLES;
7ea41b
+		if (mce->family > 23) {
7ea41b
 			log(ALL, LOG_INFO,
7ea41b
-			    "Can't parse MCE for this AMD CPU yet\n");
7ea41b
+			    "Can't parse MCE for this AMD CPU yet %d\n",
7ea41b
+			    mce->family);
7ea41b
 			ret = EINVAL;
7ea41b
 		}
7ea41b
 		goto ret;
7ea41b
@@ -331,6 +335,12 @@ #if 0
7ea41b
 	if (e->status & MCI_STATUS_ADDRV)
7ea41b
 		trace_seq_printf(s, ", addr= %llx", (long long)e->addr);
7ea41b
 
7ea41b
+	if (e->status & MCI_STATUS_SYNDV)
7ea41b
+		trace_seq_printf(s, ", synd= %llx", (long long)e->synd);
7ea41b
+
7ea41b
+	if (e->ipid)
7ea41b
+		trace_seq_printf(s, ", ipid= %llx", (long long)e->ipid);
7ea41b
+
7ea41b
 	if (e->mcgstatus_msg)
7ea41b
 		trace_seq_printf(s, ", %s", e->mcgstatus_msg);
7ea41b
 	else
7ea41b
@@ -411,6 +421,13 @@ if (pevent_get_field_val(s, event, "bank
7ea41b
 	if (pevent_get_field_val(s, event, "cpuvendor", record, &val, 1) < 0)
7ea41b
 		return -1;
7ea41b
 	e.cpuvendor = val;
7ea41b
+	/* Get New entries */
7ea41b
+	if (pevent_get_field_val(s, event, "synd", record, &val, 1) < 0)
7ea41b
+		return -1;
7ea41b
+	e.synd = val;
7ea41b
+	if (pevent_get_field_val(s, event, "ipid", record, &val, 1) < 0)
7ea41b
+		return -1;
7ea41b
+	e.ipid = val;
7ea41b
 
7ea41b
 	switch (mce->cputype) {
7ea41b
 	case CPU_GENERIC:
7ea41b
@@ -418,6 +435,9 @@ if (pevent_get_field_val(s, event, "cpuv
7ea41b
 	case CPU_K8:
7ea41b
 		rc = parse_amd_k8_event(ras, &e);
7ea41b
 		break;
7ea41b
+	case CPU_NAPLES:
7ea41b
+		rc = parse_amd_smca_event(ras, &e);
7ea41b
+		break;
7ea41b
 	default:			/* All other CPU types are Intel */
7ea41b
 		rc = parse_intel_event(ras, &e);
7ea41b
 	}
7ea41b
--- rasdaemon-0.6.1.orig/ras-mce-handler.h	2019-07-12 11:35:01.585502811 -0400
7ea41b
+++ rasdaemon-0.6.1/ras-mce-handler.h	2019-07-12 11:35:04.836470461 -0400
7ea41b
@@ -50,6 +50,7 @@ enum cputype {
7ea41b
 	CPU_KNIGHTS_LANDING,
7ea41b
 	CPU_KNIGHTS_MILL,
7ea41b
 	CPU_SKYLAKE_XEON,
7ea41b
+	CPU_NAPLES,
7ea41b
 };
7ea41b
 
7ea41b
 struct mce_event {
7ea41b
@@ -69,6 +70,8 @@ struct mce_event {
7ea41b
 	uint8_t		cs;
7ea41b
 	uint8_t		bank;
7ea41b
 	uint8_t		cpuvendor;
7ea41b
+	uint64_t        synd;   /* MCA_SYND MSR: only valid on SMCA systems */
7ea41b
+	uint64_t        ipid;   /* MCA_IPID MSR: only valid on SMCA systems */
7ea41b
 
7ea41b
 	/* Parsed data */
7ea41b
 	char		timestamp[64];
7ea41b
@@ -129,6 +132,9 @@ void broadwell_de_decode_model(struct ra
7ea41b
 void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e);
7ea41b
 void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e);
7ea41b
 
7ea41b
+/* AMD error code decode function */
7ea41b
+void decode_amd_errcode(struct mce_event *e);
7ea41b
+
7ea41b
 /* Software defined banks */
7ea41b
 #define MCE_EXTENDED_BANK	128
7ea41b
 
7ea41b
@@ -144,6 +150,13 @@ #define MCI_STATUS_EN    (1ULL<<60)  /*
7ea41b
 #define MCI_STATUS_S	 (1ULL<<56)  /* signalled */
7ea41b
 #define MCI_STATUS_AR	 (1ULL<<55)  /* action-required */
7ea41b
 
7ea41b
+/* AMD-specific bits */
7ea41b
+#define MCI_STATUS_TCC          (1ULL<<55)  /* Task context corrupt */
7ea41b
+#define MCI_STATUS_SYNDV        (1ULL<<53)  /* synd reg. valid */
7ea41b
+/* uncorrected error,deferred exception */
7ea41b
+#define MCI_STATUS_DEFERRED     (1ULL<<44)
7ea41b
+#define MCI_STATUS_POISON       (1ULL<<43)  /* access poisonous data */
7ea41b
+
7ea41b
 #define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
7ea41b
 #define MCG_STATUS_EIPV  (1ULL<<1)   /* eip points to correct instruction */
7ea41b
 #define MCG_STATUS_MCIP  (1ULL<<2)   /* machine check in progress */
7ea41b
@@ -154,4 +167,6 @@ int parse_intel_event(struct ras_events
7ea41b
 
7ea41b
 int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e);
7ea41b
 
7ea41b
+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e);
7ea41b
+
7ea41b
 #endif
7ea41b
--- rasdaemon-0.6.1.orig/Makefile.in	2018-04-25 06:29:05.000000000 -0400
7ea41b
+++ rasdaemon-0.6.1/Makefile.in	2019-07-15 14:41:22.308278851 -0400
7ea41b
@@ -100,7 +100,7 @@ sbin_PROGRAMS = rasdaemon$(EXEEXT)
7ea41b
 @WITH_MCE_TRUE@			mce-intel-dunnington.c mce-intel-tulsa.c \
7ea41b
 @WITH_MCE_TRUE@			mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
7ea41b
 @WITH_MCE_TRUE@			mce-intel-knl.c mce-intel-broadwell-de.c \
7ea41b
-@WITH_MCE_TRUE@			mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c
7ea41b
+@WITH_MCE_TRUE@			mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c mce-amd.c mce-amd-smca.c
7ea41b
 
7ea41b
 @WITH_EXTLOG_TRUE@am__append_6 = ras-extlog-handler.c
7ea41b
 @WITH_ABRT_REPORT_TRUE@am__append_7 = ras-report.c
7ea41b
@@ -132,7 +132,7 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c
7ea41b
 	mce-intel-ivb.c mce-intel-haswell.c mce-intel-knl.c \
7ea41b
 	mce-intel-broadwell-de.c mce-intel-broadwell-epex.c \
7ea41b
 	mce-intel-skylake-xeon.c ras-extlog-handler.c ras-report.c \
7ea41b
-	non-standard-hisi_hip07.c
7ea41b
+	non-standard-hisi_hip07.c mce-amd-smca.c mce-amd.c
7ea41b
 @WITH_SQLITE3_TRUE@am__objects_1 = ras-record.$(OBJEXT)
7ea41b
 @WITH_AER_TRUE@am__objects_2 = ras-aer-handler.$(OBJEXT)
7ea41b
 @WITH_NON_STANDARD_TRUE@am__objects_3 =  \
7ea41b
@@ -149,7 +149,9 @@ non-standard-hisi_hip07.c
7ea41b
 @WITH_MCE_TRUE@	mce-intel-knl.$(OBJEXT) \
7ea41b
 @WITH_MCE_TRUE@	mce-intel-broadwell-de.$(OBJEXT) \
7ea41b
 @WITH_MCE_TRUE@	mce-intel-broadwell-epex.$(OBJEXT) \
7ea41b
-@WITH_MCE_TRUE@	mce-intel-skylake-xeon.$(OBJEXT)
7ea41b
+@WITH_MCE_TRUE@	mce-intel-skylake-xeon.$(OBJEXT) \
7ea41b
+@WITH_MCE_TRUE@ mce-amd-smca.$(OBJEXT) \
7ea41b
+@WITH_MCE_TRUE@ mce-amd.$(OBJEXT)
7ea41b
 @WITH_EXTLOG_TRUE@am__objects_6 = ras-extlog-handler.$(OBJEXT)
7ea41b
 @WITH_ABRT_REPORT_TRUE@am__objects_7 = ras-report.$(OBJEXT)
7ea41b
 @WITH_HISI_NS_DECODE_TRUE@am__objects_8 =  \
7ea41b
@@ -595,6 +597,8 @@ distclean-compile:
7ea41b
 
7ea41b
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitfield.Po@am__quote@
7ea41b
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-k8.Po@am__quote@
7ea41b
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd.Po@am__quote@
7ea41b
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-smca.Po@am__quote@
7ea41b
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-de.Po@am__quote@
7ea41b
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-epex.Po@am__quote@
7ea41b
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-dunnington.Po@am__quote@