diff --git a/SOURCES/0058-rasdaemon-fix-typos-on-ras-mc-ctl-man-page.patch b/SOURCES/0058-rasdaemon-fix-typos-on-ras-mc-ctl-man-page.patch new file mode 100644 index 0000000..2fb8639 --- /dev/null +++ b/SOURCES/0058-rasdaemon-fix-typos-on-ras-mc-ctl-man-page.patch @@ -0,0 +1,43 @@ +From d9fe70fe7db45618f7b46b81ebee85e7a8801870 Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Mon, 10 Aug 2015 14:24:41 -0400 +Subject: [PATCH 1/5] rasdaemon: fix typos on ras-mc-ctl man page + +Fixed two markers and two typos in the documentation. + +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + man/ras-mc-ctl.8.in | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/man/ras-mc-ctl.8.in b/man/ras-mc-ctl.8.in +index 7441b3a..60997dd 100644 +--- a/man/ras-mc-ctl.8.in ++++ b/man/ras-mc-ctl.8.in +@@ -69,14 +69,14 @@ Display the configured labels for the current hardware, as + well as the current labels registered with EDAC. + .TP + .BI "--guess-labels" +-Print DMI labels, when bank locator is available at the DMI table. ++Print DMI labels, when bank locator is available in the DMI table. + It helps to fill the labels database at @sysconfdir@/ras/dimm_labels.d/. + .TP + .BI "--labeldb="DB + Specify an alternate location for the labels database. + .TP + .BI "--delay="time +-Specify a delay of \ftime\fR seconds before registering dimm labels. ++Specify a delay of \fBtime\fR seconds before registering DIMM labels. + Only meaninful if used together with --register-labels. + .TP + .BI "--layout +@@ -121,4 +121,4 @@ back to parsing output of the \fBdmidecode\fR(8) utility. Use of this + utility will most often require that \fBras-mc-ctl\fR be run as root. 
+ + .SH SEE ALSO +-\f\fBrasdaemon\fR(1) ++\fBrasdaemon\fR(1) +-- +1.8.3.1 + diff --git a/SOURCES/0059-rasdaemon-Add-support-for-Knights-Landing-processor.patch b/SOURCES/0059-rasdaemon-Add-support-for-Knights-Landing-processor.patch new file mode 100644 index 0000000..a0fa572 --- /dev/null +++ b/SOURCES/0059-rasdaemon-Add-support-for-Knights-Landing-processor.patch @@ -0,0 +1,213 @@ +From 2d656c4ec9d5f68ac39b2a8461b0cd4f77dd7c21 Mon Sep 17 00:00:00 2001 +From: Marcin Koss +Date: Thu, 3 Dec 2015 15:19:47 +0100 +Subject: [PATCH 3/5] rasdaemon: Add support for Knights Landing processor + +Signed-off-by: Mauro Carvalho Chehab +--- + Makefile.am | 3 +- + mce-intel-knl.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + mce-intel.c | 5 +++ + ras-mce-handler.c | 1 + + ras-mce-handler.h | 1 + + 5 files changed, 137 insertions(+), 1 deletion(-) + create mode 100644 mce-intel-knl.c + +diff --git a/Makefile.am b/Makefile.am +index a6bf18f..a1cb02a 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -28,7 +28,8 @@ if WITH_MCE + rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \ + mce-intel-p4-p6.c mce-intel-nehalem.c \ + mce-intel-dunnington.c mce-intel-tulsa.c \ +- mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c ++ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ ++ mce-intel-knl.c + endif + if WITH_EXTLOG + rasdaemon_SOURCES += ras-extlog-handler.c +diff --git a/mce-intel-knl.c b/mce-intel-knl.c +new file mode 100644 +index 0000000..96b0a59 +--- /dev/null ++++ b/mce-intel-knl.c +@@ -0,0 +1,128 @@ ++/* ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++ ++#include <string.h> ++#include <stdio.h> ++ ++#include "ras-mce-handler.h" ++#include "bitfield.h" ++ ++static struct field memctrl_mc7[] = { ++ SBITFIELD(16, "CA Parity error"), ++ SBITFIELD(17, "Internal Parity error except WDB"), ++ SBITFIELD(18, "Internal Parity error from WDB"), ++ SBITFIELD(19, "Correctable Patrol Scrub"), ++ SBITFIELD(20, "Uncorrectable Patrol Scrub"), ++ SBITFIELD(21, "Spare Correctable Error"), ++ SBITFIELD(22, "Spare UC Error"), ++ SBITFIELD(23, "CORR Chip fail even MC only, 4 bit burst error EDC only"), ++ {} ++}; ++ ++void knl_decode_model(struct ras_events *ras, struct mce_event *e) ++{ ++ uint64_t status = e->status; ++ uint32_t mca = status & 0xffff; ++ unsigned rank0 = -1, rank1 = -1, chan = 0; ++ ++ switch (e->bank) { ++ case 5: ++ switch (EXTRACT(status, 0, 15)) { ++ case 0x402: ++ mce_snprintf(e->mcastatus_msg, "PCU Internal Errors"); ++ break; ++ case 0x403: ++ mce_snprintf(e->mcastatus_msg, "VCU Internal Errors"); ++ break; ++ case 0x407: ++ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal Errors"); ++ break; ++ } ++ break; ++ case 7: case 8: case 9: case 10: ++ case 11: case 12: case 13: case 14: ++ case 15: case 16: ++ if ((EXTRACT(status, 0, 15)) == 0x5) { ++ mce_snprintf(e->mcastatus_msg, "Internal Parity error"); ++ } else { ++ chan = (EXTRACT(status, 0, 3)) + 3 * (e->bank == 15); ++ switch (EXTRACT(status, 4, 7)) { ++ case 0x0: ++ mce_snprintf(e->mcastatus_msg, "Undefined request on channel %d", chan); ++ break; ++ case 0x1: ++ mce_snprintf(e->mcastatus_msg, 
"Read on channel %d", chan); ++ break; ++ case 0x2: ++ mce_snprintf(e->mcastatus_msg, "Write on channel %d", chan); ++ break; ++ case 0x3: ++ mce_snprintf(e->mcastatus_msg, "CA error on channel %d", chan); ++ break; ++ case 0x4: ++ mce_snprintf(e->mcastatus_msg, "Scrub error on channel %d", chan); ++ break; ++ } ++ } ++ decode_bitfield(e, status, memctrl_mc7); ++ break; ++ default: ++ break; ++ } ++ ++ /* ++ * Memory error specific code. Returns if the error is not a MC one ++ */ ++ ++ /* Check if the error is at the memory controller */ ++ if ((mca >> 7) != 1) ++ return; ++ ++ /* Ignore unless this is an corrected extended error from an iMC bank */ ++ if (e->bank < 7 || e->bank > 16 || (status & MCI_STATUS_UC) || ++ !test_prefix(7, status & 0xefff)) ++ return; ++ ++ /* ++ * Parse the reported channel and ranks ++ */ ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ { ++ mce_snprintf(e->mc_location, "memory_channel=unspecified"); ++ } ++ else ++ { ++ chan = chan + 3 * (e->bank == 15); ++ mce_snprintf(e->mc_location, "memory_channel=%d", chan); ++ ++ if (EXTRACT(e->misc, 62, 62)) ++ rank0 = EXTRACT(e->misc, 46, 50); ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ ++ /* ++ * FIXME: The conversion from rank to dimm requires to parse the ++ * DMI tables and call failrank2dimm(). 
++ */ ++ if (rank0 != -1 && rank1 != -1) ++ mce_snprintf(e->mc_location, "ranks=%d and %d", ++ rank0, rank1); ++ else if (rank0 != -1) ++ mce_snprintf(e->mc_location, "rank=%d", rank0); ++ } ++} +diff --git a/mce-intel.c b/mce-intel.c +index 77b929b..032f4e0 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -397,6 +397,10 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e) + break; + case CPU_HASWELL_EPEX: + hsw_decode_model(ras, e); ++ break; ++ case CPU_KNIGHTS_LANDING: ++ knl_decode_model(ras, e); ++ break; + default: + break; + } +@@ -460,6 +464,7 @@ int set_intel_imc_log(enum cputype cputype, unsigned ncpus) + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + case CPU_HASWELL_EPEX: ++ case CPU_KNIGHTS_LANDING: + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 23f2488..3b0b05b 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -223,6 +223,7 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus) + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + case CPU_HASWELL_EPEX: ++ case CPU_KNIGHTS_LANDING: + set_intel_imc_log(mce->cputype, ncpus); + default: + break; +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 13b8f52..5466743 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -119,6 +119,7 @@ void dunnington_decode_model(struct mce_event *e); + void snb_decode_model(struct ras_events *ras, struct mce_event *e); + void ivb_decode_model(struct ras_events *ras, struct mce_event *e); + void hsw_decode_model(struct ras_events *ras, struct mce_event *e); ++void knl_decode_model(struct ras_events *ras, struct mce_event *e); + void tulsa_decode_model(struct mce_event *e); + + /* Software defined banks */ +-- +1.8.3.1 + diff --git a/SOURCES/0060-mce-intel-knl-Fix-CodingStyle.patch b/SOURCES/0060-mce-intel-knl-Fix-CodingStyle.patch new file mode 100644 index 0000000..3f2da38 --- /dev/null +++ 
b/SOURCES/0060-mce-intel-knl-Fix-CodingStyle.patch @@ -0,0 +1,106 @@ +From 17f4e17d9870fbd35572ae6bf6c227c787b07fe9 Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab +Date: Fri, 5 Feb 2016 15:15:18 -0200 +Subject: [PATCH 4/5] mce-intel-knl: Fix CodingStyle + +Signed-off-by: Mauro Carvalho Chehab +--- + mce-intel-knl.c | 43 +++++++++++++++++++++++++++---------------- + 1 file changed, 27 insertions(+), 16 deletions(-) + +diff --git a/mce-intel-knl.c b/mce-intel-knl.c +index 96b0a59..7062fbb 100644 +--- a/mce-intel-knl.c ++++ b/mce-intel-knl.c +@@ -48,32 +48,46 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e) + mce_snprintf(e->mcastatus_msg, "VCU Internal Errors"); + break; + case 0x407: +- mce_snprintf(e->mcastatus_msg, "Other UBOX Internal Errors"); ++ mce_snprintf(e->mcastatus_msg, ++ "Other UBOX Internal Errors"); + break; + } + break; +- case 7: case 8: case 9: case 10: +- case 11: case 12: case 13: case 14: +- case 15: case 16: ++ case 7: ++ case 8: ++ case 9: ++ case 10: ++ case 11: ++ case 12: ++ case 13: ++ case 14: ++ case 15: ++ case 16: + if ((EXTRACT(status, 0, 15)) == 0x5) { + mce_snprintf(e->mcastatus_msg, "Internal Parity error"); + } else { + chan = (EXTRACT(status, 0, 3)) + 3 * (e->bank == 15); + switch (EXTRACT(status, 4, 7)) { + case 0x0: +- mce_snprintf(e->mcastatus_msg, "Undefined request on channel %d", chan); ++ mce_snprintf(e->mcastatus_msg, ++ "Undefined request on channel %d", ++ chan); + break; + case 0x1: +- mce_snprintf(e->mcastatus_msg, "Read on channel %d", chan); ++ mce_snprintf(e->mcastatus_msg, ++ "Read on channel %d", chan); + break; + case 0x2: +- mce_snprintf(e->mcastatus_msg, "Write on channel %d", chan); ++ mce_snprintf(e->mcastatus_msg, ++ "Write on channel %d", chan); + break; + case 0x3: +- mce_snprintf(e->mcastatus_msg, "CA error on channel %d", chan); ++ mce_snprintf(e->mcastatus_msg, ++ "CA error on channel %d", chan); + break; + case 0x4: +- mce_snprintf(e->mcastatus_msg, "Scrub error on channel 
%d", chan); ++ mce_snprintf(e->mcastatus_msg, ++ "Scrub error on channel %d", chan); + break; + } + } +@@ -93,7 +107,7 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e) + + /* Ignore unless this is an corrected extended error from an iMC bank */ + if (e->bank < 7 || e->bank > 16 || (status & MCI_STATUS_UC) || +- !test_prefix(7, status & 0xefff)) ++ !test_prefix(7, status & 0xefff)) + return; + + /* +@@ -101,12 +115,9 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e) + */ + + chan = EXTRACT(status, 0, 3); +- if (chan == 0xf) +- { ++ if (chan == 0xf) { + mce_snprintf(e->mc_location, "memory_channel=unspecified"); +- } +- else +- { ++ } else { + chan = chan + 3 * (e->bank == 15); + mce_snprintf(e->mc_location, "memory_channel=%d", chan); + +@@ -121,7 +132,7 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e) + */ + if (rank0 != -1 && rank1 != -1) + mce_snprintf(e->mc_location, "ranks=%d and %d", +- rank0, rank1); ++ rank0, rank1); + else if (rank0 != -1) + mce_snprintf(e->mc_location, "rank=%d", rank0); + } +-- +1.8.3.1 + diff --git a/SOURCES/0061-Add-Broadwell-DE-MSCOD-values.patch b/SOURCES/0061-Add-Broadwell-DE-MSCOD-values.patch new file mode 100644 index 0000000..d32380c --- /dev/null +++ b/SOURCES/0061-Add-Broadwell-DE-MSCOD-values.patch @@ -0,0 +1,244 @@ +From e7b88730f8a753a50fa0b8d1f7027f79baa05ca4 Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Fri, 8 Apr 2016 15:07:18 -0400 +Subject: [PATCH 1/2] Add Broadwell DE MSCOD values + +Based on mcelog commit id 32252e9c37e97ea5083d90d2cf194bb85a4a0cda. 
+ +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + Makefile.am | 2 +- + mce-intel-broadwell-de.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++ + mce-intel.c | 3 + + ras-mce-handler.c | 6 +- + ras-mce-handler.h | 2 + + 5 files changed, 156 insertions(+), 3 deletions(-) + create mode 100644 mce-intel-broadwell-de.c + +diff --git a/Makefile.am b/Makefile.am +index a1cb02a..a8477d3 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -29,7 +29,7 @@ if WITH_MCE + mce-intel-p4-p6.c mce-intel-nehalem.c \ + mce-intel-dunnington.c mce-intel-tulsa.c \ + mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ +- mce-intel-knl.c ++ mce-intel-knl.c mce-intel-broadwell-de.c + endif + if WITH_EXTLOG + rasdaemon_SOURCES += ras-extlog-handler.c +diff --git a/mce-intel-broadwell-de.c b/mce-intel-broadwell-de.c +new file mode 100644 +index 0000000..d52c82e +--- /dev/null ++++ b/mce-intel-broadwell-de.c +@@ -0,0 +1,146 @@ ++/* ++ * The code below came from Tony Luck's mcelog code, ++ * released under GNU Public General License, v.2 ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++ ++#include <string.h> ++#include <stdio.h> ++ ++#include "ras-mce-handler.h" ++#include "bitfield.h" ++ ++/* See IA32 SDM Vol3B Table 16-24 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT", ++ [0x13] = "MC_DMI_TRAINING_TIMEOUT", ++ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX", ++ [0x25] = "MC_SVID_COMMAN_TIMEOUT", ++ [0x26] = "MCA_PKGC_DIRECT_WAKE_RING_TIMEOUT", ++ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID", ++ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x44] = "MC_CRITICAL_VR_FAILED", ++ [0x46] = "MC_VID_RAMP_DOWN_FAILED", ++ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED", ++ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0", ++ [0x4F] = "MC_SVID_COMMAND_ERROR", ++ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT", ++ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT", ++ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED", ++ [0x58] = "MC_SVID_IMON_REQUEST_FAILED", ++ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED", ++ [0x62] = "MC_INVALID_PKGS_RSP_QPI", ++ [0x64] = "MC_INVALID_PKG_STATE_CONFIG", ++ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT", ++ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT" ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-18 */ ++ ++static struct field memctrl_mc9[] = { ++ SBITFIELD(16, "Address parity error"), ++ SBITFIELD(17, "HA Wrt buffer Data parity error"), ++ SBITFIELD(18, "HA Wrt byte enable parity error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ 
SBITFIELD(23, "Corrected memory read error"), ++ SBITFIELD(24, "iMC, WDB, parity errors"), ++ {} ++}; ++ ++void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e) ++{ ++ uint64_t status = e->status; ++ uint32_t mca = status & 0xffff; ++ unsigned rank0 = -1, rank1 = -1, chan; ++ ++ switch (e->bank) { ++ case 4: ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ mce_snprintf(e->mcastatus_msg, "Internal errors "); ++ break; ++ case 0x406: ++ mce_snprintf(e->mcastatus_msg, "Intel TXT errors "); ++ break; ++ case 0x407: ++ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal errors "); ++ break; ++ } ++ if (EXTRACT(status, 16, 19) & 3) ++ mce_snprintf(e->mcastatus_msg, "PCU internal error "); ++ if (EXTRACT(status, 20, 23) & 4) ++ mce_snprintf(e->mcastatus_msg, "Ubox error "); ++ decode_bitfield(e, status, pcu_mc4); ++ break; ++ case 9: case 10: ++ mce_snprintf(e->mcastatus_msg, "MemCtrl: "); ++ decode_bitfield(e, status, memctrl_mc9); ++ break; ++ } ++ ++ /* ++ * Memory error specific code. Returns if the error is not a MC one ++ */ ++ ++ /* Check if the error is at the memory controller */ ++ if ((mca >> 7) != 1) ++ return; ++ ++ /* Ignore unless this is an corrected extended error from an iMC bank */ ++ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || ++ !test_prefix(7, status & 0xefff)) ++ return; ++ ++ /* ++ * Parse the reported channel and ranks ++ */ ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ return; ++ ++ mce_snprintf(e->mc_location, "memory_channel=%d", chan); ++ ++ if (EXTRACT(e->misc, 62, 62)) { ++ rank0 = EXTRACT(e->misc, 46, 50); ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ } ++ ++ /* ++ * FIXME: The conversion from rank to dimm requires to parse the ++ * DMI tables and call failrank2dimm(). 
++ */ ++ if (rank0 != -1 && rank1 != -1) ++ mce_snprintf(e->mc_location, "ranks=%d and %d", ++ rank0, rank1); ++ else if (rank0 != -1) ++ mce_snprintf(e->mc_location, "rank=%d", rank0); ++} +diff --git a/mce-intel.c b/mce-intel.c +index 032f4e0..b132903 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -401,6 +401,9 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e) + case CPU_KNIGHTS_LANDING: + knl_decode_model(ras, e); + break; ++ case CPU_BROADWELL_DE: ++ broadwell_de_decode_model(ras, e); ++ break; + default: + break; + } +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 3b0b05b..b58d6e0 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -50,6 +50,7 @@ static char *cputype_name[] = { + [CPU_HASWELL] = "Haswell", + [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", + [CPU_BROADWELL] = "Broadwell", ++ [CPU_BROADWELL_DE] = "Broadwell DE", + [CPU_KNIGHTS_LANDING] = "Knights Landing", + }; + +@@ -90,8 +91,9 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_HASWELL; + else if (mce->model == 0x3f) + return CPU_HASWELL_EPEX; +- else if (mce->model == 0x3d || mce->model == 0x4f || +- mce->model == 0x56) ++ else if (mce->model == 0x56) ++ return CPU_BROADWELL_DE; ++ else if (mce->model == 0x3d || mce->model == 0x4f) + return CPU_BROADWELL; + else if (mce->model == 0x57) + return CPU_KNIGHTS_LANDING; +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 5466743..2648048 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -45,6 +45,7 @@ enum cputype { + CPU_HASWELL, + CPU_HASWELL_EPEX, + CPU_BROADWELL, ++ CPU_BROADWELL_DE, + CPU_KNIGHTS_LANDING, + }; + +@@ -121,6 +122,7 @@ void ivb_decode_model(struct ras_events *ras, struct mce_event *e); + void hsw_decode_model(struct ras_events *ras, struct mce_event *e); + void knl_decode_model(struct ras_events *ras, struct mce_event *e); + void tulsa_decode_model(struct mce_event *e); ++void broadwell_de_decode_model(struct ras_events *ras, struct 
mce_event *e); + + /* Software defined banks */ + #define MCE_EXTENDED_BANK 128 +-- +1.8.3.1 + diff --git a/SOURCES/0062-Add-Broadwell-EP-EX-MSCOD-values.patch b/SOURCES/0062-Add-Broadwell-EP-EX-MSCOD-values.patch new file mode 100644 index 0000000..23f8f81 --- /dev/null +++ b/SOURCES/0062-Add-Broadwell-EP-EX-MSCOD-values.patch @@ -0,0 +1,289 @@ +From 0dd44fca9d756990acf01cd2cdaa585f369168bc Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Fri, 8 Apr 2016 15:07:19 -0400 +Subject: [PATCH 2/2] Add Broadwell EP/EX MSCOD values + +Based on mcelog commit id 32252e9c37e97ea5083d90d2cf194bb85a4a0cda. + +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + Makefile.am | 3 +- + mce-intel-broadwell-epex.c | 191 +++++++++++++++++++++++++++++++++++++++++++++ + mce-intel.c | 3 + + ras-mce-handler.c | 5 +- + ras-mce-handler.h | 2 + + 5 files changed, 202 insertions(+), 2 deletions(-) + create mode 100644 mce-intel-broadwell-epex.c + +diff --git a/Makefile.am b/Makefile.am +index a8477d3..c9e4481 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -29,7 +29,8 @@ if WITH_MCE + mce-intel-p4-p6.c mce-intel-nehalem.c \ + mce-intel-dunnington.c mce-intel-tulsa.c \ + mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ +- mce-intel-knl.c mce-intel-broadwell-de.c ++ mce-intel-knl.c mce-intel-broadwell-de.c \ ++ mce-intel-broadwell-epex.c + endif + if WITH_EXTLOG + rasdaemon_SOURCES += ras-extlog-handler.c +diff --git a/mce-intel-broadwell-epex.c b/mce-intel-broadwell-epex.c +new file mode 100644 +index 0000000..f7cd3b6 +--- /dev/null ++++ b/mce-intel-broadwell-epex.c +@@ -0,0 +1,191 @@ ++/* ++ * The code below came from Tony Luck's mcelog code, ++ * released under GNU Public General License, v.2 ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++ ++#include <string.h> ++#include <stdio.h> ++ ++#include "ras-mce-handler.h" ++#include "bitfield.h" ++ ++/* See IA32 SDM Vol3B Table 16-20 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT", ++ [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT", ++ [0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT", ++ [0x13] = "MC_DMI_TRAINING_TIMEOUT", ++ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX", ++ [0x25] = "MC_SVID_COMMAN_TIMEOUT", ++ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID", ++ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF", ++ [0x44] = "MC_CRITICAL_VR_FAILED", ++ [0x45] = "MC_ICC_MAX_NOTSUPPORTED", ++ [0x46] = "MC_VID_RAMP_DOWN_FAILED", ++ [0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP", ++ [0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED", ++ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED", ++ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0", ++ [0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1", ++ [0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2", ++ [0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3", ++ [0x4F] = "MC_SVID_COMMAND_ERROR", ++ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT", ++ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT", ++ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED", ++ [0x58] = "MC_SVID_IMON_REQUEST_FAILED", ++ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED", ++ [0x60] = "MC_INVALID_PKGS_REQ_PCH", ++ [0x61] = "MC_INVALID_PKGS_REQ_QPI", ++ [0x62] = "MC_INVALID_PKGS_RSP_QPI", ++ [0x63] = "MC_INVALID_PKGS_RSP_PCH", ++ [0x64] = 
"MC_INVALID_PKG_STATE_CONFIG", ++ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT", ++ [0x68] = "MC_IMC_RW_SMBUS_TIMEOUT", ++ [0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED", ++ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE", ++ [0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER", ++ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ", ++ [0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT", ++ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT" ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-21 */ ++ ++static char *qpi[] = { ++ [0x02] = "Intel QPI physical layer detected drift buffer alarm", ++ [0x03] = "Intel QPI physical layer detected latency buffer rollover", ++ [0x10] = "Intel QPI link layer detected control error from R3QPI", ++ [0x11] = "Rx entered LLR abort state on CRC error", ++ [0x12] = "Unsupported or undefined packet", ++ [0x13] = "Intel QPI link layer control error", ++ [0x15] = "RBT used un-initialized value", ++ [0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization", ++ [0x21] = "Link failover data self healing", ++ [0x22] = "Phy detected in-band reset (no width change)", ++ [0x23] = "Link failover clock failover", ++ [0x30] = "Rx detected CRC error - successful LLR after Phy re-init", ++ [0x31] = "Rx detected CRC error - successful LLR wihout Phy re-init", ++}; ++ ++static struct field qpi_mc[] = { ++ FIELD(16, qpi), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-26 */ ++ ++static struct field memctrl_mc9[] = { ++ SBITFIELD(16, "DDR3 address parity error"), ++ SBITFIELD(17, "Uncorrected HA write data error"), ++ SBITFIELD(18, "Uncorrected HA data byte enable error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ SBITFIELD(24, "iMC write data buffer parity error"), ++ 
SBITFIELD(25, "DDR4 command address parity error"), ++ {} ++}; ++ ++void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e) ++{ ++ uint64_t status = e->status; ++ uint32_t mca = status & 0xffff; ++ unsigned rank0 = -1, rank1 = -1, chan; ++ ++ switch (e->bank) { ++ case 4: ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ mce_snprintf(e->mcastatus_msg, "Internal errors "); ++ break; ++ case 0x406: ++ mce_snprintf(e->mcastatus_msg, "Intel TXT errors "); ++ break; ++ case 0x407: ++ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal errors "); ++ break; ++ } ++ if (EXTRACT(status, 16, 19)) ++ mce_snprintf(e->mcastatus_msg, "PCU internal error "); ++ decode_bitfield(e, status, pcu_mc4); ++ break; ++ case 5: ++ case 20: ++ case 21: ++ mce_snprintf(e->mcastatus_msg, "QPI: "); ++ decode_bitfield(e, status, qpi_mc); ++ break; ++ case 9: case 10: case 11: case 12: ++ case 13: case 14: case 15: case 16: ++ mce_snprintf(e->mcastatus_msg, "MemCtrl: "); ++ decode_bitfield(e, status, memctrl_mc9); ++ break; ++ } ++ ++ /* ++ * Memory error specific code. Returns if the error is not a MC one ++ */ ++ ++ /* Check if the error is at the memory controller */ ++ if ((mca >> 7) != 1) ++ return; ++ ++ /* Ignore unless this is an corrected extended error from an iMC bank */ ++ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || ++ !test_prefix(7, status & 0xefff)) ++ return; ++ ++ /* ++ * Parse the reported channel and ranks ++ */ ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ return; ++ ++ mce_snprintf(e->mc_location, "memory_channel=%d", chan); ++ ++ if (EXTRACT(e->misc, 62, 62)) { ++ rank0 = EXTRACT(e->misc, 46, 50); ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ } ++ ++ /* ++ * FIXME: The conversion from rank to dimm requires to parse the ++ * DMI tables and call failrank2dimm(). 
++ */ ++ if (rank0 != -1 && rank1 != -1) ++ mce_snprintf(e->mc_location, "ranks=%d and %d", ++ rank0, rank1); ++ else if (rank0 != -1) ++ mce_snprintf(e->mc_location, "rank=%d", rank0); ++} +diff --git a/mce-intel.c b/mce-intel.c +index b132903..bf68d9b 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -404,6 +404,9 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e) + case CPU_BROADWELL_DE: + broadwell_de_decode_model(ras, e); + break; ++ case CPU_BROADWELL_EPEX: ++ broadwell_epex_decode_model(ras, e); ++ break; + default: + break; + } +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index b58d6e0..b875512 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -51,6 +51,7 @@ static char *cputype_name[] = { + [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", + [CPU_BROADWELL] = "Broadwell", + [CPU_BROADWELL_DE] = "Broadwell DE", ++ [CPU_BROADWELL_EPEX] = "Broadwell EP/EX", + [CPU_KNIGHTS_LANDING] = "Knights Landing", + }; + +@@ -93,7 +94,9 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_HASWELL_EPEX; + else if (mce->model == 0x56) + return CPU_BROADWELL_DE; +- else if (mce->model == 0x3d || mce->model == 0x4f) ++ else if (mce->model == 0x4f) ++ return CPU_BROADWELL_EPEX; ++ else if (mce->model == 0x3d) + return CPU_BROADWELL; + else if (mce->model == 0x57) + return CPU_KNIGHTS_LANDING; +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 2648048..c5a3717 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -46,6 +46,7 @@ enum cputype { + CPU_HASWELL_EPEX, + CPU_BROADWELL, + CPU_BROADWELL_DE, ++ CPU_BROADWELL_EPEX, + CPU_KNIGHTS_LANDING, + }; + +@@ -123,6 +124,7 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e); + void knl_decode_model(struct ras_events *ras, struct mce_event *e); + void tulsa_decode_model(struct mce_event *e); + void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e); ++void broadwell_epex_decode_model(struct ras_events *ras, struct 
mce_event *e); + + /* Software defined banks */ + #define MCE_EXTENDED_BANK 128 +-- +1.8.3.1 + diff --git a/SOURCES/rasdaemon-dont_use_memerror_log_enable_on_knl.patch b/SOURCES/rasdaemon-dont_use_memerror_log_enable_on_knl.patch new file mode 100644 index 0000000..da6cadc --- /dev/null +++ b/SOURCES/rasdaemon-dont_use_memerror_log_enable_on_knl.patch @@ -0,0 +1,24 @@ +diff --git a/mce-intel.c b/mce-intel.c +index bf68d9b..80e4b6f 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -470,7 +470,6 @@ int set_intel_imc_log(enum cputype cputype, unsigned ncpus) + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + case CPU_HASWELL_EPEX: +- case CPU_KNIGHTS_LANDING: + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index b875512..f930fd1 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -228,7 +228,6 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus) + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + case CPU_HASWELL_EPEX: +- case CPU_KNIGHTS_LANDING: + set_intel_imc_log(mce->cputype, ncpus); + default: + break; diff --git a/SPECS/rasdaemon.spec b/SPECS/rasdaemon.spec index 1bb8756..c64fc6b 100644 --- a/SPECS/rasdaemon.spec +++ b/SPECS/rasdaemon.spec @@ -2,7 +2,7 @@ Name: rasdaemon Version: 0.4.1 -Release: 20%{?dist} +Release: 24%{?dist} Summary: Utility to receive RAS error tracings Group: Applications/System License: GPLv2 @@ -70,7 +70,13 @@ Patch44: 0054-rasdaemon-unnecessary-comma-for-empty-mc_location-st.patch Patch45: 0055-rasdaemon-use-MCA-error-msg-as-error_msg.patch Patch46: 0056-x86-rasdaemon-Add-support-to-log-Local-Machine-Check.patch Patch47: 0057-rasdaemon-add-support-for-haswell-ex.patch - +Patch48: 0058-rasdaemon-fix-typos-on-ras-mc-ctl-man-page.patch +Patch49: 0059-rasdaemon-Add-support-for-Knights-Landing-processor.patch +Patch50: 0060-mce-intel-knl-Fix-CodingStyle.patch +Patch51: 0061-Add-Broadwell-DE-MSCOD-values.patch 
+Patch52: 0062-Add-Broadwell-EP-EX-MSCOD-values.patch +# Patch53 was submitted upstream but not merged yet +Patch53: rasdaemon-dont_use_memerror_log_enable_on_knl.patch %description %{name} is a RAS (Reliability, Availability and Serviceability) logging tool. @@ -131,6 +137,12 @@ an utility for reporting current error counts from the EDAC sysfs files. %patch45 -p1 %patch46 -p1 %patch47 -p1 +%patch48 -p1 +%patch49 -p1 +%patch50 -p1 +%patch51 -p1 +%patch52 -p1 +%patch53 -p1 %build autoreconf -vfi @@ -158,6 +170,19 @@ rm -rf %{buildroot} %{_sysconfdir}/ras/dimm_labels.d %changelog +* Wed Aug 24 2016 Aristeu Rozanski 0.4.1-24.el7 +- don't use MemError Log Enable on Knights Landing [1273326] + +* Fri Apr 15 2016 Aristeu Rozanski 0.4.1-23.el7 +- add Broadwell DE/EP/EX MSCOD values [1299512] + +* Mon Feb 08 2016 Aristeu Rozanski 0.4.1-22.el7 +- add missing upstream patches for Knights Landing [1273326] +- fix documentation typos [1247562] + +* Thu Dec 03 2015 Aristeu Rozanski 0.4.1-21.el7 +- add support for Knights Landing [1273326] + * Wed Sep 30 2015 Aristeu Rozanski 0.4.1-20.el7 - add support for Haswell EP/EX [1267137]