diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..27cb2dc --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +SOURCES/mcelog-d2e13bf0.tar.bz2 diff --git a/.mcelog.metadata b/.mcelog.metadata new file mode 100644 index 0000000..dd111d8 --- /dev/null +++ b/.mcelog.metadata @@ -0,0 +1 @@ +b50f67497d5d26e3d56107abfcbb22dde0fca49c SOURCES/mcelog-d2e13bf0.tar.bz2 diff --git a/SOURCES/mcelog-fix-trigger-path-and-cacheing.patch b/SOURCES/mcelog-fix-trigger-path-and-cacheing.patch new file mode 100644 index 0000000..631a971 --- /dev/null +++ b/SOURCES/mcelog-fix-trigger-path-and-cacheing.patch @@ -0,0 +1,11 @@ +diff --git a/mcelog.cron b/mcelog.cron +index 4335caa..317b699 100755 +--- a/mcelog.cron ++++ b/mcelog.cron +@@ -1,2 +1,5 @@ + #!/bin/bash +-/usr/sbin/mcelog --ignorenodev --filter >> /var/log/mcelog ++ ++# Disabled by default on Fedora since this is run as daemon ++# using the mcelog.service systemd configuration entries. ++#/usr/sbin/mcelog --ignorenodev --filter >> /var/log/mcelog diff --git a/SOURCES/mcelog-haswell-support.patch b/SOURCES/mcelog-haswell-support.patch new file mode 100644 index 0000000..8c05ce7 --- /dev/null +++ b/SOURCES/mcelog-haswell-support.patch @@ -0,0 +1,348 @@ +The patches were in the process of being committed to Andi's upstream mcelog +tree when they were applied to the RHEL source. The patch subjects are + + Add better decoding support for Haswell server processors + More compact data structures for reporting SNB/IVB memory controller errors + +and were provided early by Tony Luck @ Intel. + +diff -urNp mcelog-d2e13bf0.orig/haswell.c mcelog-d2e13bf0/haswell.c +--- mcelog-d2e13bf0.orig/haswell.c 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/haswell.c 2014-09-08 09:59:52.998327718 -0400 +@@ -0,0 +1,150 @@ ++/* Copyright (C) 2013 Intel Corporation ++ Decode Intel Haswell specific machine check errors. 
++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system; if not, write to the Free Software Foundation, ++ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ Author: Tony Luck ++*/ ++ ++#include "mcelog.h" ++#include "bitfield.h" ++#include "haswell.h" ++#include "memdb.h" ++ ++/* See IA32 SDM Vol3B Table 16-20 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT", ++ [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT", ++ [0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT", ++ [0x13] = "MC_DMI_TRAINING_TIMEOUT", ++ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX", ++ [0x25] = "MC_SVID_COMMAN_TIMEOUT", ++ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID", ++ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF", ++ [0x44] = "MC_CRITICAL_VR_FAILED", ++ [0x45] = "MC_ICC_MAX_NOTSUPPORTED", ++ [0x46] = "MC_VID_RAMP_DOWN_FAILED", ++ [0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP", ++ [0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED", ++ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED", ++ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0", ++ [0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1", ++ [0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2", ++ [0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3", ++ [0x4F] = "MC_SVID_COMMAND_ERROR", ++ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT", ++ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT", ++ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED", ++ [0x58] = "MC_SVID_IMON_REQUEST_FAILED", ++ [0x59] = 
"MC_SVID_ALERT_REQUEST_FAILED", ++ [0x60] = "MC_INVALID_PKGS_REQ_PCH", ++ [0x61] = "MC_INVALID_PKGS_REQ_QPI", ++ [0x62] = "MC_INVALID_PKGS_RSP_QPI", ++ [0x63] = "MC_INVALID_PKGS_RSP_PCH", ++ [0x64] = "MC_INVALID_PKG_STATE_CONFIG", ++ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT", ++ [0x68] = "MC_IMC_RW_SMBUS_TIMEOUT", ++ [0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED", ++ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE", ++ [0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER", ++ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ", ++ [0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT", ++ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT" ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-21 */ ++ ++static char *qpi[] = { ++ [0x02] = "Intel QPI physical layer detected drift buffer alarm", ++ [0x03] = "Intel QPI physical layer detected latency buffer rollover", ++ [0x10] = "Intel QPI link layer detected control error from R3QPI", ++ [0x11] = "Rx entered LLR abort state on CRC error", ++ [0x12] = "Unsupported or undefined packet", ++ [0x13] = "Intel QPI link layer control error", ++ [0x15] = "RBT used un-initialized value", ++ [0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization", ++ [0x21] = "Link failover data self healing", ++ [0x22] = "Phy detected in-band reset (no width change)", ++ [0x23] = "Link failover clock failover", ++ [0x30] = "Rx detected CRC error - successful LLR after Phy re-init", ++ [0x31] = "Rx detected CRC error - successful LLR wihout Phy re-init", ++}; ++ ++static struct field qpi_mc[] = { ++ FIELD(16, qpi), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-22 */ ++ ++static struct field memctrl_mc9[] = { ++ SBITFIELD(16, "DDR3 address parity error"), ++ SBITFIELD(17, "Uncorrected HA write data error"), ++ SBITFIELD(18, "Uncorrected HA data byte enable error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ 
SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ SBITFIELD(23, "Corrected memory read error"), ++ SBITFIELD(24, "iMC write data buffer parity error"), ++ SBITFIELD(25, "DDR4 command address parity error"), ++ {} ++}; ++ ++void hsw_decode_model(int cputype, int bank, u64 status, u64 misc) ++{ ++ switch (bank) { ++ case 4: ++ Wprintf("PCU: "); ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ Wprintf("Internal errors "); ++ break; ++ case 0x406: ++ Wprintf("Intel TXT errors "); ++ break; ++ case 0x407: ++ Wprintf("Other UBOX Internal errors "); ++ break; ++ } ++ if (EXTRACT(status, 16, 19)) ++ Wprintf("PCU internal error "); ++ decode_bitfield(status, pcu_mc4); ++ break; ++ case 5: ++ case 20: ++ case 21: ++ Wprintf("QPI: "); ++ decode_bitfield(status, qpi_mc); ++ break; ++ case 9: case 10: case 11: case 12: ++ case 13: case 14: case 15: case 16: ++ Wprintf("MemCtrl: "); ++ decode_bitfield(status, memctrl_mc9); ++ break; ++ } ++} +diff -urNp mcelog-d2e13bf0.orig/haswell.h mcelog-d2e13bf0/haswell.h +--- mcelog-d2e13bf0.orig/haswell.h 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/haswell.h 2014-09-08 09:59:52.998327718 -0400 +@@ -0,0 +1,2 @@ ++void hsw_decode_model(int cputype, int bank, u64 status, u64 misc); ++void haswell_ep_memerr_misc(struct mce *m, int *channel, int *dimm); +diff -urNp mcelog-d2e13bf0.orig/intel.c mcelog-d2e13bf0/intel.c +--- mcelog-d2e13bf0.orig/intel.c 2014-09-08 09:59:39.622699389 -0400 ++++ mcelog-d2e13bf0/intel.c 2014-09-08 09:59:52.998327718 -0400 +@@ -24,6 +24,7 @@ + #include "page.h" + #include "sandy-bridge.h" + #include "ivy-bridge.h" ++#include "haswell.h" + #include "xeon75xx.h" + + int memory_error_support; +@@ -33,7 +34,7 @@ void intel_cpu_init(enum cputype cpu) + if (cpu == CPU_NEHALEM || cpu == CPU_XEON75XX || cpu == CPU_INTEL || + cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP || + cpu == 
CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX || +- cpu == CPU_HASWELL) ++ cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX) + memory_error_support = 1; + } + +@@ -67,9 +68,10 @@ enum cputype select_intel_cputype(int fa + return CPU_IVY_BRIDGE; + else if (model == 0x3e) + return CPU_IVY_BRIDGE_EPEX; +- else if (model == 0x3c || model == 0x3f || model == 0x45 || +- model == 0x46) ++ else if (model == 0x3c || model == 0x45 || model == 0x46) + return CPU_HASWELL; ++ else if (model == 0x3f) ++ return CPU_HASWELL_EPEX; + if (model > 0x1a) { + Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n", + model); +diff -urNp mcelog-d2e13bf0.orig/intel.h mcelog-d2e13bf0/intel.h +--- mcelog-d2e13bf0.orig/intel.h 2014-09-08 09:59:39.621699344 -0400 ++++ mcelog-d2e13bf0/intel.h 2014-09-08 09:59:52.998327718 -0400 +@@ -18,5 +18,6 @@ extern int memory_error_support; + case CPU_SANDY_BRIDGE: \ + case CPU_IVY_BRIDGE: \ + case CPU_IVY_BRIDGE_EPEX: \ +- case CPU_HASWELL ++ case CPU_HASWELL: \ ++ case CPU_HASWELL_EPEX + +diff -urNp mcelog-d2e13bf0.orig/ivy-bridge.c mcelog-d2e13bf0/ivy-bridge.c +--- mcelog-d2e13bf0.orig/ivy-bridge.c 2014-09-08 09:59:39.621699344 -0400 ++++ mcelog-d2e13bf0/ivy-bridge.c 2014-09-08 09:59:56.033470497 -0400 +@@ -68,20 +68,16 @@ static struct field pcu_mc4[] = { + + /* See IA32 SDM Vol3B Table 16-18 */ + +-static char *memctrl_1[] = { +- [0x001] = "Address parity error", +- [0x002] = "HA Wrt buffer Data parity error", +- [0x004] = "HA Wrt byte enable parity error", +- [0x008] = "Corrected patrol scrub error", +- [0x010] = "Uncorrected patrol scrub error", +- [0x020] = "Corrected spare error", +- [0x040] = "Uncorrected spare error", +- [0x080] = "Corrected memory read error", +- [0x100] = "iMC, WDB, parity errors", +-}; +- + static struct field memctrl_mc9[] = { +- FIELD(16, memctrl_1), ++ SBITFIELD(16, "Address parity error"), ++ SBITFIELD(17, "HA Wrt buffer Data parity error"), ++ SBITFIELD(18, "HA Wrt byte enable parity error"), ++ SBITFIELD(19, 
"Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ SBITFIELD(23, "Corrected memory read error"), ++ SBITFIELD(24, "iMC, WDB, parity errors"), + {} + }; + +diff -urNp mcelog-d2e13bf0.orig/Makefile mcelog-d2e13bf0/Makefile +--- mcelog-d2e13bf0.orig/Makefile 2014-09-08 09:59:39.610698703 -0400 ++++ mcelog-d2e13bf0/Makefile 2014-09-08 09:59:52.998327718 -0400 +@@ -32,7 +32,7 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o co + nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \ + eventloop.o leaky-bucket.o memdb.o server.o trigger.o \ + client.o cache.o sysfs.o yellow.o page.o rbtree.o \ +- xeon75xx.o sandy-bridge.o ivy-bridge.o msr.o ++ xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o + DISKDB_OBJ := diskdb.o dimm.o db.o + CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} + DOC := mce.pdf +diff -urNp mcelog-d2e13bf0.orig/mcelog.c mcelog-d2e13bf0/mcelog.c +--- mcelog-d2e13bf0.orig/mcelog.c 2014-09-08 09:59:39.622699389 -0400 ++++ mcelog-d2e13bf0/mcelog.c 2014-09-08 09:59:52.999327768 -0400 +@@ -228,6 +228,7 @@ static char *cputype_name[] = { + [CPU_IVY_BRIDGE] = "Ivy Bridge", /* Fill in better name */ + [CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */ + [CPU_HASWELL] = "Haswell", /* Fill in better name */ ++ [CPU_HASWELL_EPEX] = "Haswell EP/EX", /* Fill in better name */ + }; + + static struct config_choice cpu_choices[] = { +@@ -264,6 +265,8 @@ static struct config_choice cpu_choices[ + { "ivybridge-ep", CPU_IVY_BRIDGE_EPEX }, /* Fill in better name */ + { "ivybridge-ex", CPU_IVY_BRIDGE_EPEX }, /* Fill in better name */ + { "haswell", CPU_HASWELL }, /* Fill in better name */ ++ { "haswell-ep", CPU_HASWELL_EPEX }, /* Fill in better name */ ++ { "haswell-ex", CPU_HASWELL_EPEX }, /* Fill in better name */ + {} + }; + +@@ -424,7 +427,8 @@ static void dump_mce(struct mce *m, unsi + fam, + mod); + } +- 
if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX) ++ if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX && ++ cputype != CPU_HASWELL_EPEX) + resolveaddr(m->addr); + if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) { + diskdb_resolve_addr(m->addr); +diff -urNp mcelog-d2e13bf0.orig/mcelog.h mcelog-d2e13bf0/mcelog.h +--- mcelog-d2e13bf0.orig/mcelog.h 2014-09-08 09:59:39.621699344 -0400 ++++ mcelog-d2e13bf0/mcelog.h 2014-09-08 09:59:52.999327768 -0400 +@@ -118,6 +118,7 @@ enum cputype { + CPU_IVY_BRIDGE, + CPU_IVY_BRIDGE_EPEX, + CPU_HASWELL, ++ CPU_HASWELL_EPEX, + }; + + enum option_ranges { +diff -urNp mcelog-d2e13bf0.orig/p4.c mcelog-d2e13bf0/p4.c +--- mcelog-d2e13bf0.orig/p4.c 2014-09-08 09:59:39.621699344 -0400 ++++ mcelog-d2e13bf0/p4.c 2014-09-08 09:59:52.999327768 -0400 +@@ -33,6 +33,7 @@ + #include "bitfield.h" + #include "sandy-bridge.h" + #include "ivy-bridge.h" ++#include "haswell.h" + + /* decode mce for P4/Xeon and Core2 family */ + +@@ -360,6 +361,9 @@ void decode_intel_mc(struct mce *log, in + case CPU_IVY_BRIDGE_EPEX: + ivb_decode_model(cputype, log->bank, log->status, log->misc); + break; ++ case CPU_HASWELL_EPEX: ++ hsw_decode_model(cputype, log->bank, log->status, log->misc); ++ break; + } + + /* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values +diff -urNp mcelog-d2e13bf0.orig/sandy-bridge.c mcelog-d2e13bf0/sandy-bridge.c +--- mcelog-d2e13bf0.orig/sandy-bridge.c 2014-09-08 09:59:39.616699077 -0400 ++++ mcelog-d2e13bf0/sandy-bridge.c 2014-09-08 09:59:56.033470497 -0400 +@@ -63,18 +63,14 @@ static struct field pcu_mc4[] = { + {} + }; + +-static char *memctrl_1[] = { +- [0x001] = "Address parity error", +- [0x002] = "HA Wrt buffer Data parity error", +- [0x004] = "HA Wrt byte enable parity error", +- [0x008] = "Corrected patrol scrub error", +- [0x010] = "Uncorrected patrol scrub error", +- [0x020] = "Corrected spare error", +- [0x040] = "Uncorrected spare error", +-}; +- + 
static struct field memctrl_mc8[] = { +- FIELD(16, memctrl_1), ++ SBITFIELD(16, "Address parity error"), ++ SBITFIELD(17, "HA Wrt buffer Data parity error"), ++ SBITFIELD(18, "HA Wrt byte enable parity error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), + {} + }; + diff --git a/SOURCES/mcelog-patch-0755b55af.patch b/SOURCES/mcelog-patch-0755b55af.patch new file mode 100644 index 0000000..391ca15 --- /dev/null +++ b/SOURCES/mcelog-patch-0755b55af.patch @@ -0,0 +1,45 @@ +From: Prarit Bhargava + +Subject: Makefile: Make `git` check portable (POSIX compatible) + +commit 0755b55afbf76fcacc31e98bb8c163c8e055ccb2 +Author: Paul Menzel +Date: Mon Jan 30 15:56:41 2017 +0100 + + Makefile: Make `git` check portable (POSIX compatible) + + On systems using not the *GNU Bourne-Again SHell* (bash) as shell, the + git check might fail. This happens on Debian systems where the *Debian + Almquist Shell* (dash) is used by default for `/bin/sh`. + + As a result, after running `make` the file `version.c` always contains + `unknown` instead of the correct version. + + The problem is, that `type -p git` is not portable. Use `command -v` + instead [1][2]. + + > Where bash is your shell/hashbang, consistently use hash (for + > commands) or type (to consider built-ins & keywords). + > + > When writing a POSIX script, use command -v. + + Fixes: #42 + + [1] https://stackoverflow.com/questions/592620/check-if-a-program-exists-from-a-bash-script + "Check if a program exists from a Bash script" + [2] https://unix.stackexchange.com/questions/85249/why-not-use-which-what-to-use-then + "Why not use “which”? What to use then?" 
+ +diff --git a/Makefile b/Makefile +index 9dbbdbf8eb97e21a6d2efee7f308c1e7e742845b..e4341fceb882597ceebb56883625338c89923ee0 100644 +--- a/Makefile ++++ b/Makefile +@@ -82,7 +82,7 @@ depend: .depend + + version.tmp: FORCE + ( echo -n "char version[] = \"" ; \ +- if type -p git >/dev/null; then \ ++ if command -v git >/dev/null; then \ + if [ -d .git ] ; then \ + git describe --tags HEAD | tr -d '\n'; \ + else \ diff --git a/SOURCES/mcelog-patch-1bd2984.patch b/SOURCES/mcelog-patch-1bd2984.patch new file mode 100644 index 0000000..8cc0ffc --- /dev/null +++ b/SOURCES/mcelog-patch-1bd2984.patch @@ -0,0 +1,35 @@ +From: Prarit Bhargava + +fix: server does not start because it assumed it is already running + +commit 1bd29846db20a76ec7492dd54716a11976a5d9fb +Author: Balázs Póka +Date: Sat Aug 22 14:06:14 2015 +0200 + + fix: server does not start because it assumed it is already running + +diff --git a/server.c b/server.c +index a1fa7da16ba79663bdf2f4542d0b82fdd2a786b9..eddf147a527e721419b752ea56577ee36c85056a 100644 +--- a/server.c ++++ b/server.c +@@ -297,9 +297,9 @@ static int server_ping(struct sockaddr_un *un) + if (fd < 0) + return 0; + +- sigaction(SIGALRM, &sa, &oldsa); ++ sigaction(SIGALRM, &sa, &oldsa); + if (sigsetjmp(ping_timeout_ctx, 1) == 0) { +- ret = 0; ++ ret = -1; + alarm(initial_ping_timeout); + if (connect(fd, un, sizeof(struct sockaddr_un)) < 0) + goto cleanup; +@@ -315,7 +315,7 @@ cleanup: + sigaction(SIGALRM, &oldsa, NULL); + alarm(0); + close(fd); +- return ret; ++ return ret; + } + + void server_setup(void) diff --git a/SOURCES/mcelog-patch-595a2dcfe.patch b/SOURCES/mcelog-patch-595a2dcfe.patch new file mode 100644 index 0000000..e19d57c --- /dev/null +++ b/SOURCES/mcelog-patch-595a2dcfe.patch @@ -0,0 +1,24 @@ +diff -urNp mcelog-d2e13bf0.orig/mcelog.c mcelog-d2e13bf0/mcelog.c +--- mcelog-d2e13bf0.orig/mcelog.c 2018-06-28 11:53:15.907862669 -0400 ++++ mcelog-d2e13bf0/mcelog.c 2018-06-28 11:54:56.247852219 -0400 +@@ -446,6 +446,9 @@ static void 
dump_mce(struct mce *m, unsi + if (recordlen >= offsetof(struct mce, ppin) && m->ppin) + n += Wprintf("PPIN %llx\n", m->ppin); + ++ if (recordlen > offsetof(struct mce, microcode) && m->microcode) ++ n += Wprintf("MICROCODE %x\n", m->microcode); ++ + if (recordlen >= offsetof(struct mce, cpuid) && m->cpuid) { + u32 fam, mod; + parse_cpuid(m->cpuid, &fam, &mod); +diff -urNp mcelog-d2e13bf0.orig/mcelog.h mcelog-d2e13bf0/mcelog.h +--- mcelog-d2e13bf0.orig/mcelog.h 2018-06-28 11:53:15.908862699 -0400 ++++ mcelog-d2e13bf0/mcelog.h 2018-06-28 11:54:04.754318009 -0400 +@@ -34,6 +34,7 @@ struct mce { + __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ + __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + __u64 ppin; /* Protected Processor Inventory Number */ ++ __u32 microcode;/* Microcode revision */ + }; + + #define X86_VENDOR_INTEL 0 diff --git a/SOURCES/mcelog-patch-59b8cab3f.patch b/SOURCES/mcelog-patch-59b8cab3f.patch new file mode 100644 index 0000000..bba7666 --- /dev/null +++ b/SOURCES/mcelog-patch-59b8cab3f.patch @@ -0,0 +1,99 @@ +From: Prarit Bhargava + +Subject: mcelog version: Add ability for OS to define version + +commit 59b8cab3f607573cb55b6841ece2f2577be6b419 +Author: Prarit Bhargava +Date: Mon May 22 10:25:57 2017 -0400 + + mcelog version: Add ability for OS to define version + + Example usage of this patch: + + [root@intel-purley-03 mcelog]# cat .os_version + mcelog-144-3.94d853b2ea81.el7 + [root@intel-purley-03 mcelog]# make version.tmp + ( printf "char version[] = \"" ; \ + if test -e .os_version; then \ + cat .os_version | tr -d '\n' ; \ + elif command -v git >/dev/null; then \ + if [ -d .git ] ; then \ + git describe --tags HEAD | tr -d '\n'; \ + else \ + printf "unknown" ; \ + fi ; \ + else \ + printf "unknown" ; \ + fi ; \ + printf '";\n' \ + ) > version.tmp + [root@intel-purley-03 mcelog]# cat version.tmp + char version[] = "mcelog-144-3.94d853b2ea81.el7"; + [root@intel-purley-03 mcelog]# rm .os_version + rm: remove regular file 
‘.os_version’? y + [root@intel-purley-03 mcelog]# make version.tmp + ( printf "char version[] = \"" ; \ + if test -e .os_version; then \ + cat .os_version | tr -d '\n' ; \ + elif command -v git >/dev/null; then \ + if [ -d .git ] ; then \ + git describe --tags HEAD | tr -d '\n'; \ + else \ + printf "unknown" ; \ + fi ; \ + else \ + printf "unknown" ; \ + fi ; \ + printf '";\n' \ + ) > version.tmp + [root@intel-purley-03 mcelog]# cat version.tmp + char version[] = "v149-7-g62e645e4c0e9"; + + ----8<---- + + OSes package mcelog (rpm, deb, etc.) and the package version may differ + from the git version. + + Add the ability for the OS to define a version in a local .os_version + file. If the file exists, the version in the file is returned for + 'mcelog --version'. If the file isn't specified the old method of using + git and defaulting to 'unknown' are used. + + Signed-off-by: Prarit Bhargava + Signed-off-by: Andi Kleen + +diff --git a/Makefile b/Makefile +index 864378eb72de315f07d424540509e844241fd932..57373afaf6c77f40d2188dc855c5bc2bd5a02657 100644 +--- a/Makefile ++++ b/Makefile +@@ -81,16 +81,20 @@ depend: .depend + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(WARNINGS) $(ADD_DEFINES) -o $@ $< + + version.tmp: FORCE +- ( echo -n "char version[] = \"" ; \ +- if command -v git >/dev/null; then \ +- if [ -d .git ] ; then \ +- git describe --tags HEAD | tr -d '\n'; \ +- else \ +- echo -n "unknown" ; \ +- fi ; \ +- else echo -n "unknown" ; fi ; \ +- echo '";' \ +- ) > version.tmp ++ ( printf "char version[] = \"" ; \ ++ if test -e .os_version; then \ ++ cat .os_version | tr -d '\n' ; \ ++ elif command -v git >/dev/null; then \ ++ if [ -d .git ] ; then \ ++ git describe --tags HEAD | tr -d '\n'; \ ++ else \ ++ printf "unknown" ; \ ++ fi ; \ ++ else \ ++ printf "unknown" ; \ ++ fi ; \ ++ printf '";\n' \ ++ ) > version.tmp + + version.c: version.tmp + cmp version.tmp version.c || mv version.tmp version.c diff --git a/SOURCES/mcelog-patch-6ed93e30f835.patch 
b/SOURCES/mcelog-patch-6ed93e30f835.patch new file mode 100644 index 0000000..1362751 --- /dev/null +++ b/SOURCES/mcelog-patch-6ed93e30f835.patch @@ -0,0 +1,161 @@ +From: Prarit Bhargava + +Subject: mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems + +commit 6ed93e30f83519b0ab71f8ecd156b8ff0b2912b6 +Author: Tony Luck +Date: Mon Sep 24 11:14:45 2018 -0700 + + mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems + + Ivy Bridge was the last system that gave us enough information + to figure out the exact DIMM that is the source of a memory error. + We gave up on DIMM logging at that point. + + But we can still figure out the socket, memory controller and channel. + + Signed-off-by: Tony Luck + Signed-off-by: Andi Kleen + +diff --git a/haswell.c b/haswell.c +index 892ebc7248e808248798f21506b54faca147db9b..4eccbeb21a281467495e024b376d81be96b2183e 100644 +--- a/haswell.c ++++ b/haswell.c +@@ -148,3 +148,45 @@ void hsw_decode_model(int cputype, int bank, u64 status, u64 misc) + break; + } + } ++ ++/* ++ * There isn't enough information to identify the DIMM. But ++ * we can derive the channel from the bank number. ++ * There can be two memory controllers. 
We number the channels ++ * on the second controller: 4, 5, 6, 7 ++ */ ++void haswell_memerr_misc(struct mce *m, int *channel, int *dimm) ++{ ++ u64 status = m->status; ++ unsigned chan; ++ ++ /* Check this is a memory error */ ++ if (!test_prefix(7, status & 0xefff)) ++ return; ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ return; ++ ++ switch (m->bank) { ++ case 7: ++ /* Home agent 0 */ ++ break; ++ case 8: ++ /* Home agent 1 */ ++ chan += 4; ++ break; ++ case 9: case 10: case 11: case 12: ++ /* Memory controller 0 */ ++ chan = m->bank - 9; ++ break; ++ case 13: case 14: case 15: case 16: ++ /* Memory controller 1 */ ++ chan = (m->bank - 13) + 4; ++ break; ++ default: ++ return; ++ } ++ ++ channel[0] = chan; ++} +diff --git a/haswell.h b/haswell.h +index ba3fb1c3c985aec0ac1a0a271dca3c3afd18874c..712c8eb66d50a1bf63a7dbd67382fe775b59d69b 100644 +--- a/haswell.h ++++ b/haswell.h +@@ -1,2 +1,3 @@ + void hsw_decode_model(int cputype, int bank, u64 status, u64 misc); + void haswell_ep_memerr_misc(struct mce *m, int *channel, int *dimm); ++void haswell_memerr_misc(struct mce *m, int *channel, int *dimm); +diff --git a/intel.c b/intel.c +index 20d2acdc12daa1128d72471d53639aebf82f4854..b655c4162f8980d5d826640fa4375c7ba6b1e97d 100644 +--- a/intel.c ++++ b/intel.c +@@ -25,6 +25,7 @@ + #include "sandy-bridge.h" + #include "ivy-bridge.h" + #include "haswell.h" ++#include "skylake_xeon.h" + + int memory_error_support; + +@@ -140,6 +141,13 @@ static int intel_memory_error(struct mce *m, unsigned recordlen) + case CPU_IVY_BRIDGE_EPEX: + ivy_bridge_ep_memerr_misc(m, channel, dimm); + break; ++ case CPU_HASWELL_EPEX: ++ case CPU_BROADWELL_EPEX: ++ haswell_memerr_misc(m, channel, dimm); ++ break; ++ case CPU_SKYLAKE_XEON: ++ skylake_memerr_misc(m, channel, dimm); ++ break; + default: + break; + } +diff --git a/skylake_xeon.c b/skylake_xeon.c +index 16c6181987f0126d377b64a8f5d4a96a01bfa1c4..b02f8acd806e2a64ed1653f44349fd3e9abf374e 100644 +--- a/skylake_xeon.c ++++ 
b/skylake_xeon.c +@@ -228,3 +228,45 @@ int skylake_s_ce_type(int bank, u64 status, u64 misc) + + return 0; + } ++ ++/* ++ * There isn't enough information to identify the DIMM. But ++ * we can derive the channel from the bank number. ++ * There can be two memory controllers. We number the channels ++ * on the second controller: 3, 4, 5 ++ */ ++void skylake_memerr_misc(struct mce *m, int *channel, int *dimm) ++{ ++ u64 status = m->status; ++ unsigned chan; ++ ++ /* Check this is a memory error */ ++ if (!test_prefix(7, status & 0xefff)) ++ return; ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ return; ++ ++ switch (m->bank) { ++ case 7: ++ /* Home agent 0 */ ++ break; ++ case 8: ++ /* Home agent 1 */ ++ chan += 3; ++ break; ++ case 13: case 14: case 15: ++ /* Memory controller 0 */ ++ chan = m->bank - 13; ++ break; ++ case 16: case 17: case 18: ++ /* Memory controller 1 */ ++ chan = (m->bank - 16) + 3; ++ break; ++ default: ++ return; ++ } ++ ++ channel[0] = chan; ++} +diff --git a/skylake_xeon.h b/skylake_xeon.h +index edcd9c030fa70f10ac23f2df9be948b10c73f4a1..098e6fa0e3eaff1b1d7e3040eddfb9187dabd7dd 100644 +--- a/skylake_xeon.h ++++ b/skylake_xeon.h +@@ -1,2 +1,3 @@ + void skylake_s_decode_model(int cputype, int bank, u64 status, u64 misc); + int skylake_s_ce_type(int bank, u64 status, u64 misc); ++void skylake_memerr_misc(struct mce *m, int *channel, int *dimm); diff --git a/SOURCES/mcelog-patch-cfa11588ad8b.patch b/SOURCES/mcelog-patch-cfa11588ad8b.patch new file mode 100644 index 0000000..f821414 --- /dev/null +++ b/SOURCES/mcelog-patch-cfa11588ad8b.patch @@ -0,0 +1,45 @@ +From cfa11588ad8b95b81b272e6fcec41b788455e8ec Mon Sep 17 00:00:00 2001 +From: Tony Luck +Date: Fri, 3 Feb 2017 16:51:04 -0800 +Subject: [PATCH] Intel Xeons from Ivy Bridge onwards support a processor + identification number. Kernels v4.9 and higher include it + in the "mce" record. 
+ +Signed-off-by: Tony Luck +Signed-off-by: Andi Kleen +--- + mcelog.c | 3 +++ + mcelog.h | 3 +++ + 2 files changed, 6 insertions(+) + +diff --git a/mcelog.c b/mcelog.c +index 3ae230dc7ef3..507f11bdbccb 100644 +--- a/mcelog.c ++++ b/mcelog.c +@@ -445,6 +445,9 @@ static void dump_mce(struct mce *m, unsigned recordlen) + if (n > 0) + Wprintf("\n"); + ++ if (recordlen >= offsetof(struct mce, ppin) && m->ppin) ++ n += Wprintf("PPIN %llx\n", m->ppin); ++ + if (recordlen >= offsetof(struct mce, cpuid) && m->cpuid) { + u32 fam, mod; + parse_cpuid(m->cpuid, &fam, &mod); +diff --git a/mcelog.h b/mcelog.h +index 6e175fede0f4..1f9453459b5d 100644 +--- a/mcelog.h ++++ b/mcelog.h +@@ -31,6 +31,9 @@ struct mce { + __u32 socketid; /* CPU socket ID */ + __u32 apicid; /* CPU initial apic ID */ + __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ ++ __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ ++ __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ ++ __u64 ppin; /* Protected Processor Inventory Number */ + }; + + #define X86_VENDOR_INTEL 0 +-- +1.7.9.3 + diff --git a/SOURCES/mcelog-patch-commit-916015663906.patch b/SOURCES/mcelog-patch-commit-916015663906.patch new file mode 100644 index 0000000..518a18c --- /dev/null +++ b/SOURCES/mcelog-patch-commit-916015663906.patch @@ -0,0 +1,157 @@ +From: Prarit Bhargava + +Subject: mcelog: Add --help option. + +commit 91601566390676d3c590bbe4b680f4009b6caa22 +Author: Prarit Bhargava +Date: Thu Oct 12 13:35:33 2017 -0400 + + mcelog: Add --help option. + + The mcelog man page states "See mcelog --help for a list of valid CPUs.". + This command returns 1 because --help is not a valid option. + + Separate the exit(1) from the usage() function and add a --help option. 
+ + Signed-off-by: Prarit Bhargava + Signed-off-by: Andi Kleen + +diff --git a/mcelog.c b/mcelog.c +index 507f11bdbccb87cad45a2f182edd2a8146bd89f3..58a0aac26b53fa382567e034b2e2a8f25735e3aa 100644 +--- a/mcelog.c ++++ b/mcelog.c +@@ -982,10 +982,10 @@ void usage(void) + "--pidfile file Write pid of daemon into file\n" + "--no-imc-log Disable extended iMC logging\n" + "--is-cpu-supported Exit with return code indicating whether the CPU is supported\n" ++"--help Display this message.\n" + ); + printf("\n"); + print_cputypes(); +- exit(1); + } + + enum options { +@@ -1017,6 +1017,7 @@ enum options { + O_DEBUG_NUMERRORS, + O_NO_IMC_LOG, + O_IS_CPU_SUPPORTED, ++ O_HELP, + }; + + static struct option options[] = { +@@ -1050,6 +1051,7 @@ static struct option options[] = { + { "pidfile", 1, NULL, O_PIDFILE }, + { "debug-numerrors", 0, NULL, O_DEBUG_NUMERRORS }, /* undocumented: for testing */ + { "no-imc-log", 0, NULL, O_NO_IMC_LOG }, ++ { "help", 0, NULL, O_HELP }, + { "is-cpu-supported", 0, NULL, O_IS_CPU_SUPPORTED }, + {} + }; +@@ -1080,12 +1082,15 @@ static int modifier(int opt) + break; + case O_INTEL_CPU: { + unsigned fam, mod; +- if (sscanf(optarg, "%i,%i", &fam, &mod) != 2) ++ if (sscanf(optarg, "%i,%i", &fam, &mod) != 2) { + usage(); ++ exit(1); ++ } + cputype = select_intel_cputype(fam, mod); + if (cputype == CPU_GENERIC) { + fprintf(stderr, "Unknown Intel CPU\n"); + usage(); ++ exit(1); + } + cpu_forced = 1; + break; +@@ -1104,8 +1109,10 @@ static int modifier(int opt) + do_dmi = 0; + break; + case O_DMI_VERBOSE: +- if (sscanf(optarg, "%i", &v) != 1) ++ if (sscanf(optarg, "%i", &v) != 1) { + usage(); ++ exit(1); ++ } + dmi_set_verbosity(v); + break; + case O_SYSLOG: +@@ -1117,8 +1124,10 @@ static int modifier(int opt) + break; + case O_CPUMHZ: + cpumhz_forced = 1; +- if (sscanf(optarg, "%lf", &cpumhz) != 1) ++ if (sscanf(optarg, "%lf", &cpumhz) != 1) { + usage(); ++ exit(1); ++ } + break; + case O_SYSLOG_ERROR: + syslog_level = LOG_ERR; +@@ -1155,6 +1164,10 @@ 
static int modifier(int opt) + case O_IS_CPU_SUPPORTED: + check_only = 1; + break; ++ case O_HELP: ++ usage(); ++ exit(0); ++ break; + case 0: + break; + default: +@@ -1184,8 +1197,10 @@ void argsleft(int ac, char **av) + int opt; + + while ((opt = getopt_long(ac, av, "", options, NULL)) != -1) { +- if (modifier(opt) != 1) ++ if (modifier(opt) != 1) { + usage(); ++ exit(1); ++ } + } + } + +@@ -1284,16 +1299,20 @@ static void process(int fd, unsigned recordlen, unsigned loglen, char *buf) + + static void noargs(int ac, char **av) + { +- if (getopt_long(ac, av, "", options, NULL) != -1) ++ if (getopt_long(ac, av, "", options, NULL) != -1) { + usage(); ++ exit(1); ++ } + } + + static void parse_config(char **av) + { + static const char config_fn[] = CONFIG_FILENAME; + const char *fn = config_file(av, config_fn); +- if (!fn) ++ if (!fn) { + usage(); ++ exit(1); ++ } + if (parse_config_file(fn) < 0) { + /* If it's the default file don't complain if it isn't there */ + if (fn != config_fn) { +@@ -1362,6 +1381,7 @@ int main(int ac, char **av) + while ((opt = getopt_long(ac, av, "", options, NULL)) != -1) { + if (opt == '?') { + usage(); ++ exit(1); + } else if (combined_modifier(opt) > 0) { + continue; + } else if (opt == O_ASCII) { +@@ -1404,8 +1424,10 @@ int main(int ac, char **av) + modifier_finish(); + if (av[optind]) + logfn = av[optind++]; +- if (av[optind]) ++ if (av[optind]) { + usage(); ++ exit(1); ++ } + checkdmi(); + general_setup(); + diff --git a/SOURCES/mcelog-patch-d1f37aae14d4.patch b/SOURCES/mcelog-patch-d1f37aae14d4.patch new file mode 100644 index 0000000..d720464 --- /dev/null +++ b/SOURCES/mcelog-patch-d1f37aae14d4.patch @@ -0,0 +1,36 @@ +From: Prarit Bhargava + +Subject: mcelog: Add decoding for Optane DC persistent memory mode + +commit d1f37aae14d476af6260af01904aed0f2e942542 +Author: Tony Luck +Date: Tue Feb 5 10:10:38 2019 -0800 + + mcelog: Add decoding for Optane DC persistent memory mode + + Recognise the new MCACOD for errors in DDR memory in 
use as a cache + for Optane DC persistent memory. + + In binary the new code looks similar to the old memory controller + compound error code. The only difference is that bit 9 is set in + addition to bit 7: + + 000F 0010 1MMM CCCC + + Signed-off-by: Tony Luck + Signed-off-by: Andi Kleen + +diff --git a/p4.c b/p4.c +index adb4c5eab976edf2c99efddd65e6abfcfeb97da2..8e1ec2fe435f0883c5146fa55e8e703713236d0a 100644 +--- a/p4.c ++++ b/p4.c +@@ -199,6 +199,9 @@ static int decode_mca(u64 status, u64 misc, u64 track, int cpu, int *ismemerr, i + CACHE_RRRR_SHIFT)); + if (track == 2) + run_yellow_trigger(cpu, typenum, levelnum, type, level,socket); ++ } else if (test_prefix(9, mca) && EXTRACT(mca, 7, 8) == 1) { ++ Wprintf("Memory as cache: "); ++ decode_memory_controller(mca, bank); + } else if (test_prefix(10, mca)) { + if (mca == 0x400) + Wprintf("Internal Timer error\n"); diff --git a/SOURCES/mcelog-patch-e9aeed03f3d1.patch b/SOURCES/mcelog-patch-e9aeed03f3d1.patch new file mode 100644 index 0000000..60b048d --- /dev/null +++ b/SOURCES/mcelog-patch-e9aeed03f3d1.patch @@ -0,0 +1,39 @@ +From 4e9aeed03f3d17fb92662ff656566b0afb2ec99f Mon Sep 17 00:00:00 2001 +From: Prarit Bhargava +Date: Wed, 11 Jan 2017 10:01:40 -0500 +Subject: [PATCH] mcelog: is_cpu_supported() error message must be printed + Eprintf + +SYSERRprintf outputs a ": Success" message so the error message looks like + +mcelog: ERROR: AMD Processor family 21: mcelog does not support this +processor. Please use the edac_mce_amd module instead. +: Success + +which is confusing for end-users. + +I changed this to do Eprintf which keeps the error return to userspace +but drops the ": Success" line. 
+ +Signed-off-by: Prarit Bhargava +Signed-off-by: Andi Kleen +--- + mcelog.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mcelog.c b/mcelog.c +index 96bebee14afe..37c0af624870 100644 +--- a/mcelog.c ++++ b/mcelog.c +@@ -542,7 +542,7 @@ int is_cpu_supported(void) + if (family == 15) { + cputype = CPU_K8; + } else if (family >= 16) { +- SYSERRprintf("ERROR: AMD Processor family %d: mcelog does not support this processor. Please use the edac_mce_amd module instead.\n", family); ++ Eprintf("ERROR: AMD Processor family %d: mcelog does not support this processor. Please use the edac_mce_amd module instead.\n", family); + return 0; + } + } else if (!strcmp(vendor,"GenuineIntel")) +-- +1.7.9.3 + diff --git a/SOURCES/mcelog-patch-f8f1490cb.patch b/SOURCES/mcelog-patch-f8f1490cb.patch new file mode 100644 index 0000000..bebb773 --- /dev/null +++ b/SOURCES/mcelog-patch-f8f1490cb.patch @@ -0,0 +1,27 @@ +From: Prarit Bhargava + +Subject: Document .os_release in README + +commit f8f1490cb2c6754aa108779e04fa2f099e394895 +Author: Andi Kleen +Date: Mon May 22 09:38:47 2017 -0700 + + Document .os_release in README + + Signed-off-by: Andi Kleen + +diff --git a/README.md b/README.md +index 7e848a598e40ee4ef7162bbad8c1280d001b9dbd..d47928ff614934730105c314aef1c7a084e16a6a 100644 +--- a/README.md ++++ b/README.md +@@ -55,6 +55,10 @@ error predictive failure analysis) require a continuously running daemon. + You can run mcelog from systemd or similar daemons. An example systemd unit + file is in `mcelog.service`. + ++By default mcelog reports its version as the git tag. This can be overridden ++by setting up a .os_release file in the source directory. A build system ++could write the OS version to this file to mark the binary. ++ + ### For older distributions using init scripts + + Please install an init script by default that runs mcelog in daemon mode. 
diff --git a/SOURCES/mcelog-update-2577aeb.patch b/SOURCES/mcelog-update-2577aeb.patch new file mode 100644 index 0000000..3bbdeab --- /dev/null +++ b/SOURCES/mcelog-update-2577aeb.patch @@ -0,0 +1,129 @@ +diff --git a/intel.c b/intel.c +index 099c4ad..ba353c2 100644 +--- a/intel.c ++++ b/intel.c +@@ -32,7 +32,8 @@ void intel_cpu_init(enum cputype cpu) + { + if (cpu == CPU_NEHALEM || cpu == CPU_XEON75XX || cpu == CPU_INTEL || + cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP || +- cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX) ++ cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX || ++ cpu == CPU_HASWELL) + memory_error_support = 1; + } + +@@ -58,7 +59,7 @@ enum cputype select_intel_cputype(int family, int model) + return CPU_NEHALEM; + else if (model == 0x2e || model == 0x2f) + return CPU_XEON75XX; +- else if (model == 0x2a || model == 0x3a) ++ else if (model == 0x2a) + return CPU_SANDY_BRIDGE; + else if (model == 0x2d) + return CPU_SANDY_BRIDGE_EP; +@@ -66,6 +67,8 @@ enum cputype select_intel_cputype(int family, int model) + return CPU_IVY_BRIDGE; + else if (model == 0x3e) + return CPU_IVY_BRIDGE_EPEX; ++ else if (model == 0x3c || model == 0x45 || model == 0x46) ++ return CPU_HASWELL; + if (model > 0x1a) { + Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n", + model); +diff --git a/intel.h b/intel.h +index 105f806..b2b949a 100644 +--- a/intel.h ++++ b/intel.h +@@ -17,5 +17,6 @@ extern int memory_error_support; + case CPU_SANDY_BRIDGE_EP: \ + case CPU_SANDY_BRIDGE: \ + case CPU_IVY_BRIDGE: \ +- case CPU_IVY_BRIDGE_EPEX ++ case CPU_IVY_BRIDGE_EPEX: \ ++ case CPU_HASWELL + +diff --git a/ivy-bridge.c b/ivy-bridge.c +index 46ef7ae..5908ee2 100644 +--- a/ivy-bridge.c ++++ b/ivy-bridge.c +@@ -76,6 +76,7 @@ static char *memctrl_1[] = { + [0x010] = "Uncorrected patrol scrub error", + [0x020] = "Corrected spare error", + [0x040] = "Uncorrected spare error", ++ [0x080] = "Corrected memory read error", + [0x100] = "iMC, WDB, parity errors", + 
}; + +diff --git a/leaky-bucket.c b/leaky-bucket.c +index 66c2f08..6bb345a 100644 +--- a/leaky-bucket.c ++++ b/leaky-bucket.c +@@ -69,7 +69,7 @@ static int timeconv(char unit, int *out) + unsigned corr = 1; + switch (unit) { + case 'd': corr *= 24; +- case 'h': corr *= 3600; ++ case 'h': corr *= 60; + case 'm': corr *= 60; + case 0: break; + default: return -1; +diff --git a/mcelog.c b/mcelog.c +index 1a4c691..83eb607 100644 +--- a/mcelog.c ++++ b/mcelog.c +@@ -227,6 +227,7 @@ static char *cputype_name[] = { + [CPU_SANDY_BRIDGE_EP] = "Sandy Bridge EP", /* Fill in better name */ + [CPU_IVY_BRIDGE] = "Ivy Bridge", /* Fill in better name */ + [CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */ ++ [CPU_HASWELL] = "Haswell", /* Fill in better name */ + }; + + static struct config_choice cpu_choices[] = { +@@ -262,6 +263,7 @@ static struct config_choice cpu_choices[] = { + { "ivybridge", CPU_IVY_BRIDGE }, /* Fill in better name */ + { "ivybridge-ep", CPU_IVY_BRIDGE_EPEX }, /* Fill in better name */ + { "ivybridge-ex", CPU_IVY_BRIDGE_EPEX }, /* Fill in better name */ ++ { "haswell", CPU_HASWELL }, /* Fill in better name */ + {} + }; + +diff --git a/mcelog.h b/mcelog.h +index ad69351..ddf90ac 100644 +--- a/mcelog.h ++++ b/mcelog.h +@@ -117,6 +117,7 @@ enum cputype { + CPU_SANDY_BRIDGE_EP, + CPU_IVY_BRIDGE, + CPU_IVY_BRIDGE_EPEX, ++ CPU_HASWELL, + }; + + enum option_ranges { +diff --git a/p4.c b/p4.c +index b5b34e2..93b59f3 100644 +--- a/p4.c ++++ b/p4.c +@@ -360,6 +360,23 @@ void decode_intel_mc(struct mce *log, int cputype, int *ismemerr, unsigned size) + ivb_decode_model(cputype, log->bank, log->status, log->misc); + break; + } ++ ++ /* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values ++ * and MISCV set. MISC register points to root port that reported the error ++ * need to cross check with AER logs for more details. 
++ * See: http://www.intel.com/content/www/us/en/architecture-and-technology/enhanced-mca-logging-xeon-paper.html ++ */ ++ if ((log->status & MCI_STATUS_MISCV) && ++ (log->status & 0xefff) == 0x0e0b) { ++ int seg, bus, dev, fn; ++ ++ seg = EXTRACT(log->misc, 32, 39); ++ bus = EXTRACT(log->misc, 24, 31); ++ dev = EXTRACT(log->misc, 19, 23); ++ fn = EXTRACT(log->misc, 16, 18); ++ Wprintf("IO MCA reported by root port %x:%02x:%02x.%x\n", ++ seg, bus, dev, fn); ++ } + } + + char *intel_bank_name(int num) diff --git a/SOURCES/mcelog-update-94d853b2ea81.patch b/SOURCES/mcelog-update-94d853b2ea81.patch new file mode 100644 index 0000000..741dc58 --- /dev/null +++ b/SOURCES/mcelog-update-94d853b2ea81.patch @@ -0,0 +1,1894 @@ +diff -urNp mcelog-d2e13bf0.orig/broadwell_epex.c mcelog-d2e13bf0/broadwell_epex.c +--- mcelog-d2e13bf0.orig/broadwell_epex.c 2016-11-30 11:23:54.542909636 -0500 ++++ mcelog-d2e13bf0/broadwell_epex.c 2016-11-30 11:24:12.203619329 -0500 +@@ -23,6 +23,11 @@ + #include "broadwell_epex.h" + #include "memdb.h" + ++/* Memory error was corrected by mirroring with channel failover */ ++#define BDW_MCI_MISC_FO (1ULL<<41) ++/* Memory error was corrected by mirroring and primary channel scrubbed successfully */ ++#define BDW_MCI_MISC_MC (1ULL<<42) ++ + /* See IA32 SDM Vol3B Table 16-20 */ + + static char *pcu_1[] = { +@@ -147,3 +152,23 @@ void bdw_epex_decode_model(int cputype, + break; + } + } ++ ++/* ++ * return: 0 - CE by normal ECC ++ * 1 - CE by mirroring with channel failover ++ * 2 - CE by mirroring and primary channel scrubbed successfully ++ */ ++int bdw_epex_ce_type(int bank, u64 status, u64 misc) ++{ ++ if (!(bank == 7 || bank == 8)) ++ return 0; ++ ++ if (status & MCI_STATUS_MISCV) { ++ if (misc & BDW_MCI_MISC_FO) ++ return 1; ++ if (misc & BDW_MCI_MISC_MC) ++ return 2; ++ } ++ ++ return 0; ++} +diff -urNp mcelog-d2e13bf0.orig/broadwell_epex.h mcelog-d2e13bf0/broadwell_epex.h +--- mcelog-d2e13bf0.orig/broadwell_epex.h 2016-11-30 11:23:54.542909636 
-0500 ++++ mcelog-d2e13bf0/broadwell_epex.h 2016-11-30 11:24:12.203619329 -0500 +@@ -1 +1,2 @@ + void bdw_epex_decode_model(int cputype, int bank, u64 status, u64 misc); ++int bdw_epex_ce_type(int bank, u64 status, u64 misc); +diff -urNp mcelog-d2e13bf0.orig/client.c mcelog-d2e13bf0/client.c +--- mcelog-d2e13bf0.orig/client.c 2016-11-30 11:23:54.530909154 -0500 ++++ mcelog-d2e13bf0/client.c 2016-11-30 11:24:12.203619329 -0500 +@@ -67,3 +67,11 @@ void ask_server(char *command) + + SYSERRprintf("client read"); + } ++ ++void client_cleanup(void) ++{ ++ char *path = config_string("server", "socket-path"); ++ if (!path) ++ path = SOCKET_PATH; ++ unlink(path); ++} +diff -urNp mcelog-d2e13bf0.orig/client.h mcelog-d2e13bf0/client.h +--- mcelog-d2e13bf0.orig/client.h 2016-11-30 11:23:54.531909194 -0500 ++++ mcelog-d2e13bf0/client.h 2016-11-30 11:24:12.203619329 -0500 +@@ -1 +1,2 @@ + void ask_server(char *command); ++void client_cleanup(void); +diff -urNp mcelog-d2e13bf0.orig/db.c mcelog-d2e13bf0/db.c +--- mcelog-d2e13bf0.orig/db.c 2016-11-30 11:23:54.531909194 -0500 ++++ mcelog-d2e13bf0/db.c 1969-12-31 19:00:00.000000000 -0500 +@@ -1,599 +0,0 @@ +-/* Copyright (C) 2006 Andi Kleen, SuSE Labs. +- Dumb database manager. +- not suitable for large datasets, but human readable files and simple. +- assumes groups and entries-per-group are max low double digits. +- the in memory presentation could be easily optimized with a few +- hashes, but that shouldn't be needed for now. +- Note: obsolete, new design uses in memory databases only +- +- mcelog is free software; you can redistribute it and/or +- modify it under the terms of the GNU General Public +- License as published by the Free Software Foundation; version +- 2. +- +- mcelog is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- General Public License for more details. 
+- +- You should find a copy of v2 of the GNU General Public License somewhere +- on your Linux system; if not, write to the Free Software Foundation, +- Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +- +-/* TBD: +- add lock file to protect final rename +- timeout for locks +-*/ +- +-#define _GNU_SOURCE 1 +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include "db.h" +-#include "memutil.h" +- +-/* file format +- +-# comment +-[group1] +-entry1: value +-entry2: value +- +-# comment +-# comment2 +-[group2] +-entry: value +- +-value is anything before new line, but first will be skipped +-spaces are allowed in entry names or groups +-comments are preserved, but moved in front of the group +-blank lines allowed. +- +-code doesnt check for unique records/entries right now. first wins. +- +-*/ +- +-struct entry { +- char *name; +- char *val; +-}; +- +-struct group { +- struct group *next; +- char *name; +- struct entry *entries; +- char *comment; +- int numentries; +-}; +- +-#define ENTRY_CHUNK (128 / sizeof(struct entry)) +- +-struct database { +- struct group *groups; +- FILE *fh; +- char *fn; +- int dirty; +-}; +- +-static int read_db(struct database *db); +-static FILE *open_file(char *fn, int wr); +-static void free_group(struct group *g); +- +-static void DBerror(char *fmt, ...) 
+-{ +- va_list ap; +- va_start(ap,fmt); +- vfprintf(stderr, fmt, ap); +- va_end(ap); +- exit(1); +-} +- +-#define DB_NEW(p) ((p) = xalloc(sizeof(*(p)))) +- +-static struct group *alloc_group(char *name) +-{ +- struct group *g; +- DB_NEW(g); +- g->entries = xalloc(ENTRY_CHUNK * sizeof(struct entry)); +- g->name = name; +- return g; +-} +- +-static char *cleanline(char *s) +-{ +- char *p; +- while (isspace(*s)) +- s++; +- if (*s == 0) +- return NULL; +- p = strchr(s, '\n'); +- if (p) +- *p = 0; +- return s; +-} +- +-struct database *open_db(char *fn, int wr) +-{ +- struct database *db; +- +- DB_NEW(db); +- db->fh = open_file(fn, wr); +- if (!db->fh) { +- DBerror("Cannot open database %s\n", fn); +- free(db); +- return NULL; +- } +- db->fn = xstrdup(fn); +- if (read_db(db) < 0) { +- free(db->fn); +- free(db); +- return NULL; +- } +- return db; +-} +- +-static int read_db(struct database *db) +-{ +- char *line = NULL; +- size_t linesz = 0; +- struct group *group = NULL, **pgroup = &db->groups; +- int linenr = 0; +- +- while (getline(&line, &linesz, db->fh) > 0) { +- char *s; +- s = strchr(line, '#'); +- if (s) { +- struct group *cmt; +- DB_NEW(cmt); +- *pgroup = cmt; +- pgroup = &cmt->next; +- cmt->comment = xstrdup(s + 1); +- *s = 0; +- } +- s = cleanline(line); +- linenr++; +- if (!s) +- continue; +- if (*s == '[') { +- int n; +- char *name; +- ++s; +- n = strcspn(s, "]"); +- if (s[n] == 0) +- goto parse_error; +- name = xalloc(n + 1); +- memcpy(name, s, n); +- group = alloc_group(name); +- *pgroup = group; +- pgroup = &group->next; +- } else { +- char *p; +- if (!group) +- goto parse_error; +- p = s + strcspn(s, ":"); +- if (*p != ':') +- goto parse_error; +- *p++ = 0; +- if (*p == ' ') +- p++; +- else +- goto parse_error; +- change_entry(db, group, line, p); +- } +- } +- +- if (ferror(db->fh)) { +- DBerror("IO error while reading database %s: %s\n", db->fn, +- strerror(errno)); +- goto error; +- } +- +- free(line); +- return 0; +- +-parse_error: +- DBerror("Parse 
error in database %s at line %d\n", db->fn, linenr); +-error: +- free(line); +- return -1; +-} +- +-/* +-Crash safety strategy: +- +-While the database is opened hold a exclusive flock on the file +-When writing write to a temporary file (.out). Only when the file +-is written rename to another temporary file (.complete). +- +-Then sync and swap tmp file with main file, then sync directory +-(later is linux specific) +- +-During open if the main file doesn't exist and a .complete file does +-rename the .complete file to main first; or open the .complete +-file if the file system is read only. +- +-*/ +- +-/* Flush directory. Useful on ext2, on journaling file systems +- the later fsync would usually force earlier transactions on the +- metadata too. */ +-static int flush_dir(char *fn) +-{ +- int err, fd; +- char *p; +- char dir[strlen(fn) + 1]; +- strcpy(dir, fn); +- p = strrchr(dir, '/'); +- if (p) +- *p = 0; +- else +- strcpy(dir, "."); +- fd = open(dir, O_DIRECTORY|O_RDONLY); +- if (fd < 0) +- return -1; +- err = 0; +- if (fsync(fd) < 0) +- err = -1; +- if (close(fd) < 0) +- err = -1; +- return err; +-} +- +-static int force_rename(char *a, char *b) +-{ +- unlink(b); /* ignore error */ +- return rename(a, b); +-} +- +-static int rewrite_db(struct database *db) +-{ +- FILE *fhtmp; +- int err; +- +- int tmplen = strlen(db->fn) + 10; +- char fn_complete[tmplen], fn_old[tmplen], fn_out[tmplen]; +- +- sprintf(fn_complete, "%s.complete", db->fn); +- sprintf(fn_old, "%s~", db->fn); +- sprintf(fn_out, "%s.out", db->fn); +- +- fhtmp = fopen(fn_out, "w"); +- if (!fhtmp) { +- DBerror("Cannot open `%s' output file: %s\n", fn_out, +- strerror(errno)); +- return -1; +- } +- +- dump_database(db, fhtmp); +- +- err = 0; +- /* Finish the output file */ +- if (ferror(fhtmp) || fflush(fhtmp) != 0 || fsync(fileno(fhtmp)) != 0 || +- fclose(fhtmp)) +- err = -1; +- /* Rename to .complete */ +- else if (force_rename(fn_out, fn_complete)) +- err = -1; +- /* RED-PEN: need to do retry for 
race */ +- /* Move to final name */ +- else if (force_rename(db->fn, fn_old) || rename(fn_complete, db->fn)) +- err = -1; +- /* Hit disk */ +- else if (flush_dir(db->fn)) +- err = -1; +- +- if (err) { +- DBerror("Error writing to database %s: %s\n", db->fn, +- strerror(errno)); +- } +- +- return err; +-} +- +-int sync_db(struct database *db) +-{ +- if (!db->dirty) +- return 0; +- /* RED-PEN window without lock */ +- if (rewrite_db(db)) +- return -1; +- fclose(db->fh); +- db->dirty = 0; +- db->fh = open_file(db->fn, 1); +- if (!db->fh) +- return -1; +- return 0; +-} +- +-static void free_group(struct group *g) +-{ +- free(g->entries); +- free(g->name); +- free(g->comment); +- free(g); +-} +- +-static void free_data(struct database *db) +-{ +- struct group *g, *gnext; +- for (g = db->groups; g; g = gnext) { +- gnext = g->next; +- free_group(g); +- } +-} +- +-int close_db(struct database *db) +-{ +- if (db->dirty && rewrite_db(db)) +- return -1; +- if (fclose(db->fh)) +- return -1; +- free_data(db); +- free(db->fn); +- free(db); +- return 0; +-} +- +-static FILE *open_file(char *fn, int wr) +-{ +- char tmp[strlen(fn) + 10]; +- FILE *fh; +- if (access(fn, wr ? (R_OK|W_OK) : R_OK)) { +- switch (errno) { +- case EROFS: +- wr = 0; +- break; +- case ENOENT: +- /* No main DB file */ +- sprintf(tmp, "%s.complete", fn); +- /* Handle race */ +- if (!access(tmp, R_OK)) { +- if (rename(tmp, fn) < 0 && errno == EEXIST) +- return open_file(fn, wr); +- } else +- creat(fn, 0644); +- break; +- } +- } +- fh = fopen(fn, wr ? "r+" : "r"); +- if (fh) { +- if (flock(fileno(fh), wr ? 
LOCK_EX : LOCK_SH) < 0) { +- fclose(fh); +- return NULL; +- } +- } +- return fh; +-} +- +-void dump_group(struct group *g, FILE *out) +-{ +- struct entry *e; +- fprintf(out, "[%s]\n", g->name); +- for (e = &g->entries[0]; e->name && !ferror(out); e++) +- fprintf(out, "%s: %s\n", e->name, e->val); +-} +- +-void dump_database(struct database *db, FILE *out) +-{ +- struct group *g; +- for (g = db->groups; g && !ferror(out); g = g->next) { +- if (g->comment) { +- fprintf(out, "#%s", g->comment); +- continue; +- } +- dump_group(g, out); +- } +-} +- +-struct group *find_group(struct database *db, char *name) +-{ +- struct group *g; +- for (g = db->groups; g; g = g->next) +- if (g->name && !strcmp(g->name, name)) +- return g; +- return NULL; +-} +- +-int delete_group(struct database *db, struct group *group) +-{ +- struct group *g, **gprev; +- gprev = &db->groups; +- for (g = *gprev; g; gprev = &g->next, g = g->next) { +- if (g == group) { +- *gprev = g->next; +- free_group(g); +- return 0; +- } +- } +- db->dirty = 1; +- return -1; +-} +- +-char *entry_val(struct group *g, char *entry) +-{ +- struct entry *e; +- for (e = &g->entries[0]; e->name; e++) +- if (!strcmp(e->name, entry)) +- return e->val; +- return NULL; +-} +- +-struct group *add_group(struct database *db, char *name, int *existed) +-{ +- struct group *g, **gprev = &db->groups; +- for (g = *gprev; g; gprev = &g->next, g = g->next) +- if (g->name && !strcmp(g->name, name)) +- break; +- if (existed) +- *existed = (g != NULL); +- if (!g) { +- g = alloc_group(xstrdup(name)); +- g->next = *gprev; +- *gprev = g; +- } +- db->dirty = 1; +- return g; +- +-} +- +-void change_entry(struct database *db, struct group *g, +- char *entry, char *newval) +-{ +- int i; +- struct entry *e, *entries; +- db->dirty = 1; +- entries = &g->entries[0]; +- for (e = entries; e->name; e++) { +- if (!strcmp(e->name, entry)) { +- free(e->val); +- e->val = xstrdup(newval); +- return; +- } +- } +- i = e - entries; +- assert(i == 
g->numentries); +- if (i > 0 && (i % ENTRY_CHUNK) == 0) { +- int new = (i + ENTRY_CHUNK) * sizeof(struct entry); +- g->entries = xrealloc(g->entries, new); +- } +- entries = &g->entries[0]; +- e = &entries[i]; +- e->name = xstrdup(entry); +- e->val = xstrdup(newval); +- g->numentries++; +-} +- +-void delete_entry(struct database *db, struct group *g, char *entry) +-{ +- struct entry *e; +- for (e = &g->entries[0]; e->name; e++) +- if (!strcmp(e->name, entry)) +- break; +- if (e->name == NULL) +- return; +- while ((++e)->name) +- e[-1] = e[0]; +- g->numentries--; +-} +- +-struct group * +-clone_group(struct database *db, struct group *gold, char *newname) +-{ +- struct entry *e; +- struct group *gnew = add_group(db, newname, NULL); +- for (e = &gold->entries[0]; e->name; e++) +- change_entry(db, gnew, e->name, e->val); +- return gnew; +-} +- +-static char *save_comment(char *c) +-{ +- int len = strlen(c); +- char *s = xalloc(len + 2); +- strcpy(s, c); +- if (len == 0 || c[len - 1] != '\n') +- s[len] = '\n'; +- return s; +-} +- +-void add_comment(struct database *db, struct group *group, char *comment) +-{ +- struct group *g; +- struct group **gprev = &db->groups; +- for (g = *gprev; g; gprev = &g->next, g = g->next) { +- if ((group && g == group) || (!group && g->comment == NULL)) +- break; +- } +- DB_NEW(g); +- g->comment = save_comment(comment); +- g->next = *gprev; +- *gprev = g; +- db->dirty = 1; +-} +- +-struct group *first_group(struct database *db) +-{ +- return next_group(db->groups); +-} +- +-struct group *next_group(struct group *g) +-{ +- struct group *n; +- if (!g) +- return NULL; +- n = g->next; +- while (n && n->comment) +- n = n->next; +- return n; +-} +- +-char *group_name(struct group *g) +-{ +- return g->name; +-} +- +-struct group *find_entry(struct database *db, struct group *prev, +- char *entry, char *value) +-{ +- int previ = 0; +- struct entry *e; +- struct group *g; +- if (prev) +- g = prev->next; +- else +- g = db->groups; +- for (; g; g = 
g->next) { +- if (g->comment) +- continue; +- /* Short cut when entry is at the same place as previous */ +- if (previ < g->numentries) { +- e = &g->entries[previ]; +- if (!strcmp(e->name, entry)) { +- if (!strcmp(e->val, value)) +- return g; +- continue; +- } +- } +- for (e = &g->entries[0]; e->name; e++) { +- if (strcmp(e->name, entry)) +- continue; +- if (!strcmp(e->val, value)) +- return g; +- previ = e - &g->entries[0]; +- break; +- } +- } +- return NULL; +-} +- +-void rename_group(struct database *db, struct group *g, char *newname) +-{ +- free(g->name); +- g->name = xstrdup(newname); +- db->dirty = 1; +-} +- +-unsigned long entry_num(struct group *g, char *entry) +-{ +- char *e = entry_val(g, entry); +- unsigned long val = 0; +- if (e) +- sscanf(e, "%lu", &val); +- return val; +-} +- +-void change_entry_num(struct database *db, struct group *g, +- char *entry, unsigned long val) +-{ +- char buf[20]; +- sprintf(buf, "%lu", val); +- change_entry(db, g, entry, buf); +-} +diff -urNp mcelog-d2e13bf0.orig/db.h mcelog-d2e13bf0/db.h +--- mcelog-d2e13bf0.orig/db.h 2016-11-30 11:23:54.531909194 -0500 ++++ mcelog-d2e13bf0/db.h 1969-12-31 19:00:00.000000000 -0500 +@@ -1,29 +0,0 @@ +-#include +-struct database; +-struct group; +- +-struct database *open_db(char *fn, int wr); +-int sync_db(struct database *db); +-int close_db(struct database *db); +-struct group *find_group(struct database *db, char *name); +-char *entry_val(struct group *g, char *entry); +-struct group *add_group(struct database *db, char *name, int *existed); +-int delete_group(struct database *db, struct group *g); +-void change_entry(struct database *db, struct group *g, +- char *entry, char *newval); +-void add_comment(struct database *db, struct group *group, char *comment); +-struct group *first_group(struct database *db); +-struct group *next_group(struct group *g); +-void dump_group(struct group *g, FILE *out); +-void dump_database(struct database *db, FILE *out); +-struct group 
*find_entry(struct database *db, struct group *prev, +- char *entry, char *value); +-void rename_group(struct database *db, struct group *group, char *newname); +-char *group_name(struct group *g); +-unsigned long entry_num(struct group *g, char *entry); +-void change_entry_num(struct database *db, struct group *g, char *entry, +- unsigned long val); +-void delete_entry(struct database *db, struct group *g, char *entry); +-struct group * +-clone_group(struct database *db, struct group *gold, char *newname); +- +diff -urNp mcelog-d2e13bf0.orig/dbquery.c mcelog-d2e13bf0/dbquery.c +--- mcelog-d2e13bf0.orig/dbquery.c 2016-11-30 11:23:54.531909194 -0500 ++++ mcelog-d2e13bf0/dbquery.c 1969-12-31 19:00:00.000000000 -0500 +@@ -1,130 +0,0 @@ +-/* Access db files. This is for testing and debugging only. */ +-#define _GNU_SOURCE 1 +-#include +-#include +-#include +-#include +-#include +-#include +-#include "db.h" +- +-#define C(x) if (x) printf(#x " failed: %s\n", strerror(errno)) +-#define NEEDGROUP if (!group) { printf("need group first\n"); break; } +- +-void Eprintf(char *fmt, ...) 
+-{ +- va_list ap; +- va_start(ap, fmt); +- vfprintf(stderr, fmt, ap); +- va_end(ap); +-} +- +-void usage(void) +-{ +- printf( +- "s sync\n" +- "q close/quit\n" +- "ggroup find group\n" +- "G delete group\n" +- "agroup add group\n" +- "ventry dump entry\n" +- "centry,val change entry to val\n" +- "fentry,val find entry with value and dump its group\n" +- "Ccomment add comment\n" +- "Lnewname clone group to newname\n" +- "d dump group\n" +- "D dump database\n"); +-} +- +-int main(int ac, char **av) +-{ +- struct database *db; +- struct group *group = NULL; +- char *line = NULL; +- size_t linesz = 0; +- if (!av[1]) { +- printf("%s database\n", av[0]); +- exit(1); +- } +- printf("dbtest\n"); +- db = open_db(av[1], 1); +- while (printf("> "), +- fflush(stdout), +- getline(&line, &linesz, stdin) > 0) { +- char *p = line + strlen(line) - 1; +- while (p >= line && isspace(*p)) +- *p-- = 0; +- switch (line[0]) { +- case 's': +- C(sync_db(db)); +- break; +- case 'q': +- C(close_db(db)); +- exit(0); +- case 'g': +- group = find_group(db, line + 1); +- if (group) +- printf("found\n"); +- break; +- case 'G': +- NEEDGROUP; +- C(delete_group(db, group)); +- group = NULL; +- break; +- case 'a': { +- int existed = 0; +- group = add_group(db, line + 1, &existed); +- if (existed) +- printf("existed\n"); +- break; +- } +- case 'v': +- NEEDGROUP; +- printf("%s\n", entry_val(group, line + 1)); +- break; +- case 'c': { +- p = line + 1; +- char *entry = strsep(&p, ","); +- NEEDGROUP; +- change_entry(db, group, entry, strsep(&p, "")); +- break; +- } +- case 'L': +- NEEDGROUP; +- clone_group(db, group, line + 1); +- break; +- case 'f': { +- struct group *g; +- p = line + 1; +- char *entry = strsep(&p, ","); +- char *val = strsep(&p, ""); +- g = NULL; +- int nr = 0; +- while ((g = find_entry(db, g, entry, val)) != NULL) { +- if (nr == 0) +- group = g; +- nr++; +- dump_group(group, stdout); +- } +- if (nr == 0) +- printf("not found\n"); +- break; +- } +- case 'C': +- NEEDGROUP; +- 
add_comment(db, group, line + 1); +- break; +- case 'd': +- NEEDGROUP; +- dump_group(group, stdout); +- break; +- case 'D': +- dump_database(db, stdout); +- break; +- default: +- usage(); +- break; +- } +- } +- return 0; +-} +diff -urNp mcelog-d2e13bf0.orig/denverton.c mcelog-d2e13bf0/denverton.c +--- mcelog-d2e13bf0.orig/denverton.c 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/denverton.c 2016-11-30 11:24:12.204619369 -0500 +@@ -0,0 +1,45 @@ ++/* Copyright (C) 2016 Intel Corporation ++ Decode Intel Denverton specific machine check errors. ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system; if not, write to the Free Software Foundation, ++ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ Author: Tony Luck ++*/ ++ ++#include "mcelog.h" ++#include "bitfield.h" ++#include "denverton.h" ++#include "memdb.h" ++ ++/* See IA32 SDM Vol3B Table 16-33 */ ++ ++static struct field mc_bits[] = { ++ SBITFIELD(16, "Cmd/Addr parity"), ++ SBITFIELD(17, "Corrected Demand/Patrol Scrub Error"), ++ SBITFIELD(18, "Uncorrected patrol scrub error"), ++ SBITFIELD(19, "Uncorrected demand read error"), ++ SBITFIELD(20, "WDB read ECC"), ++ {} ++}; ++ ++void denverton_decode_model(int cputype, int bank, u64 status, u64 misc) ++{ ++ switch (bank) { ++ case 6: case 7: ++ Wprintf("MemCtrl: "); ++ decode_bitfield(status, mc_bits); ++ break; ++ } ++} +diff -urNp mcelog-d2e13bf0.orig/denverton.h mcelog-d2e13bf0/denverton.h +--- mcelog-d2e13bf0.orig/denverton.h 1969-12-31 19:00:00.000000000 -0500 
++++ mcelog-d2e13bf0/denverton.h 2016-11-30 11:24:12.204619369 -0500 +@@ -0,0 +1 @@ ++void denverton_decode_model(int cputype, int bank, u64 status, u64 misc); +diff -urNp mcelog-d2e13bf0.orig/diskdb.c mcelog-d2e13bf0/diskdb.c +--- mcelog-d2e13bf0.orig/diskdb.c 2016-11-30 11:23:54.531909194 -0500 ++++ mcelog-d2e13bf0/diskdb.c 1969-12-31 19:00:00.000000000 -0500 +@@ -1,96 +0,0 @@ +-/* High level interface to disk based DIMM database */ +-/* Note: obsolete: new design is in memdb.c */ +-#include +-#include +-#include +-#include "mcelog.h" +-#include "diskdb.h" +-#include "paths.h" +-#include "dimm.h" +-#include "dmi.h" +- +-char *error_trigger; +-unsigned error_thresh = 20; +-char *dimm_db_fn = DIMM_DB_FILENAME; +- +-static void checkdimmdb(void) +-{ +- if (open_dimm_db(dimm_db_fn) < 0) +- exit(1); +-} +- +-int diskdb_modifier(int opt) +-{ +- char *end; +- +- switch (opt) { +- case O_DATABASE: +- dimm_db_fn = optarg; +- checkdmi(); +- checkdimmdb(); +- break; +- case O_ERROR_TRIGGER: +- checkdmi(); +- open_dimm_db(dimm_db_fn); +- error_thresh = strtoul(optarg, &end, 0); +- if (end == optarg || *end != ',') +- usage(); +- error_trigger = end + 1; +- break; +- default: +- return 0; +- } +- return 1; +-} +- +-void diskdb_resolve_addr(u64 addr) +-{ +- if (open_dimm_db(dimm_db_fn) >= 0) +- new_error(addr, error_thresh, error_trigger); +-} +- +- +-void diskdb_usage(void) +-{ +- fprintf(stderr, +- "Manage disk DIMM error database\n" +- " mcelog [options] --drop-old-memory|--reset-memory locator\n" +- " mcelog --dump-memory locator\n" +- " old can be either locator or name\n" +- "Disk database options:" +- "--database fn Set filename of DIMM database (default " DIMM_DB_FILENAME ")\n" +- "--error-trigger cmd,thresh Run cmd on exceeding thresh errors per DIMM\n"); +-} +- +- +-static void dimm_common(int ac, char **av) +-{ +- no_syslog(); +- checkdmi(); +- checkdimmdb(); +- argsleft(ac, av); +-} +- +-int diskdb_cmd(int opt, int ac, char **av) +-{ +- char *arg = optarg; +- +- 
switch (opt) { +- case O_DUMP_MEMORY: +- dimm_common(ac, av); +- if (arg) +- dump_dimm(arg); +- else +- dump_all_dimms(); +- return 1; +- case O_RESET_MEMORY: +- dimm_common(ac, av); +- reset_dimm(arg); +- return 1; +- case O_DROP_OLD_MEMORY: +- dimm_common(ac, av); +- gc_dimms(); +- return 1; +- } +- return 0; +-} +diff -urNp mcelog-d2e13bf0.orig/diskdb.h mcelog-d2e13bf0/diskdb.h +--- mcelog-d2e13bf0.orig/diskdb.h 2016-11-30 11:23:54.531909194 -0500 ++++ mcelog-d2e13bf0/diskdb.h 1969-12-31 19:00:00.000000000 -0500 +@@ -1,32 +0,0 @@ +- +-#ifdef CONFIG_DISKDB +-enum diskdb_options { +- O_DATABASE = O_DISKDB, +- O_ERROR_TRIGGER, +- O_DUMP_MEMORY, +- O_RESET_MEMORY, +- O_DROP_OLD_MEMORY, +-}; +- +-void diskdb_resolve_addr(u64 addr); +-int diskdb_modifier(int opt); +-int diskdb_cmd(int opt, int ac, char **av); +-void diskdb_usage(void); +- +-#define DISKDB_OPTIONS \ +- { "database", 1, NULL, O_DATABASE }, \ +- { "error-trigger", 1, NULL, O_ERROR_TRIGGER }, \ +- { "dump-memory", 2, NULL, O_DUMP_MEMORY }, \ +- { "reset-memory", 2, NULL, O_RESET_MEMORY }, \ +- { "drop-old-memory", 0, NULL, O_DROP_OLD_MEMORY }, +- +-#else +- +-static inline void diskdb_resolve_addr(u64 addr) {} +-static inline int diskdb_modifier(int opt) { return 0; } +-static inline int diskdb_cmd(int opt, int ac, char **av) { return 0; } +-static inline void diskdb_usage(void) {} +- +-#define DISKDB_OPTIONS +- +-#endif +diff -urNp mcelog-d2e13bf0.orig/dmi.h mcelog-d2e13bf0/dmi.h +--- mcelog-d2e13bf0.orig/dmi.h 2016-11-30 11:23:54.534909314 -0500 ++++ mcelog-d2e13bf0/dmi.h 2016-11-30 11:24:12.205619409 -0500 +@@ -3,7 +3,7 @@ struct dmi_entry { + unsigned char type; + unsigned char length; + unsigned short handle; +-}; ++} __attribute__((packed)); + + enum { + DMI_MEMORY_ARRAY = 16, +diff -urNp mcelog-d2e13bf0.orig/.gitignore mcelog-d2e13bf0/.gitignore +--- mcelog-d2e13bf0.orig/.gitignore 2016-11-30 11:23:54.530909154 -0500 ++++ mcelog-d2e13bf0/.gitignore 2016-11-30 11:24:12.202619289 -0500 +@@ -8,3 +8,5 
@@ dbquery + .depend + tsc + core ++version.c ++version.tmp +diff -urNp mcelog-d2e13bf0.orig/input/bdw_mirror1 mcelog-d2e13bf0/input/bdw_mirror1 +--- mcelog-d2e13bf0.orig/input/bdw_mirror1 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/input/bdw_mirror1 2016-11-30 11:24:12.205619409 -0500 +@@ -0,0 +1,6 @@ ++# Broadwell mirror corrected with mirror failover ++CPU 0 7 ++PROCESSOR 0:0x406f0 ++STATUS 0x8800000000000080 ++MISC 20000000000 ++ +diff -urNp mcelog-d2e13bf0.orig/input/bdw_mirror2 mcelog-d2e13bf0/input/bdw_mirror2 +--- mcelog-d2e13bf0.orig/input/bdw_mirror2 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/input/bdw_mirror2 2016-11-30 11:24:12.205619409 -0500 +@@ -0,0 +1,6 @@ ++# Broadwell mirror corrected with successful scrub ++CPU 0 7 ++PROCESSOR 0:0x406f0 ++STATUS 0x8800000000000080 ++MISC 40000000000 ++ +diff -urNp mcelog-d2e13bf0.orig/input/GENMEM mcelog-d2e13bf0/input/GENMEM +--- mcelog-d2e13bf0.orig/input/GENMEM 2016-11-30 11:23:54.532909234 -0500 ++++ mcelog-d2e13bf0/input/GENMEM 2016-11-30 11:24:12.205619409 -0500 +@@ -11,7 +11,7 @@ dimm=${3:-0} + corr_err_cnt=${4:-0} + + if [ ! 
-z "$5" ] ; then +- ucflag=$[1 << (61-32)] ++ ucflag=$[(1 << (61-32)) | (1 << (60-32)) | (1 << (56-32))] + else + ucflag=0 + fi +diff -urNp mcelog-d2e13bf0.orig/input/skx_mirror1 mcelog-d2e13bf0/input/skx_mirror1 +--- mcelog-d2e13bf0.orig/input/skx_mirror1 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/input/skx_mirror1 2016-11-30 11:24:12.205619409 -0500 +@@ -0,0 +1,6 @@ ++# Skylake mirror corrected with mirror failover ++CPU 0 7 ++PROCESSOR 0:0x50650 ++STATUS 0x8800000000000080 ++MISC 8000000000000000 ++ +diff -urNp mcelog-d2e13bf0.orig/input/skx_mirror2 mcelog-d2e13bf0/input/skx_mirror2 +--- mcelog-d2e13bf0.orig/input/skx_mirror2 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/input/skx_mirror2 2016-11-30 11:24:12.205619409 -0500 +@@ -0,0 +1,6 @@ ++# Skylake mirror corrected with successful scrub ++CPU 0 7 ++PROCESSOR 0:0x50650 ++STATUS 0x8800000000000080 ++MISC 4000000000000000 ++ +diff -urNp mcelog-d2e13bf0.orig/intel.c mcelog-d2e13bf0/intel.c +--- mcelog-d2e13bf0.orig/intel.c 2016-11-30 11:23:54.538909475 -0500 ++++ mcelog-d2e13bf0/intel.c 2016-11-30 11:24:12.206619450 -0500 +@@ -25,7 +25,6 @@ + #include "sandy-bridge.h" + #include "ivy-bridge.h" + #include "haswell.h" +-#include "xeon75xx.h" + + int memory_error_support; + +@@ -36,7 +35,9 @@ void intel_cpu_init(enum cputype cpu) + cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX || + cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX || cpu == CPU_BROADWELL || + cpu == CPU_BROADWELL_DE || cpu == CPU_BROADWELL_EPEX || +- cpu == CPU_KNIGHTS_LANDING || cpu == CPU_SKYLAKE || cpu == CPU_SKYLAKE_XEON) ++ cpu == CPU_KNIGHTS_LANDING || cpu == CPU_KNIGHTS_MILL || ++ cpu == CPU_SKYLAKE || cpu == CPU_SKYLAKE_XEON || ++ cpu == CPU_KABYLAKE || cpu == CPU_DENVERTON) + memory_error_support = 1; + } + +@@ -82,6 +83,8 @@ enum cputype select_intel_cputype(int fa + return CPU_BROADWELL_DE; + else if (model == 0x57) + return CPU_KNIGHTS_LANDING; ++ else if (model == 0x85) ++ return CPU_KNIGHTS_MILL; + else if 
(model == 0x1c || model == 0x26 || model == 0x27 || + model == 0x35 || model == 0x36 || model == 0x36 || + model == 0x37 || model == 0x4a || model == 0x4c || +@@ -91,18 +94,22 @@ enum cputype select_intel_cputype(int fa + return CPU_SKYLAKE; + else if (model == 0x55) + return CPU_SKYLAKE_XEON; ++ else if (model == 0x8E || model == 0x9E) ++ return CPU_KABYLAKE; ++ else if (model == 0x5f) ++ return CPU_DENVERTON; + if (model > 0x1a) { +- Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n", ++ Eprintf("Family 6 Model %u CPU: only decoding architectural errors\n", + model); + return CPU_INTEL; + } + } + if (family > 6) { +- Eprintf("Family %u Model %x CPU: only decoding architectural errors\n", ++ Eprintf("Family %u Model %u CPU: only decoding architectural errors\n", + family, model); + return CPU_INTEL; + } +- Eprintf("Unknown Intel CPU type family %x model %x\n", family, model); ++ Eprintf("Unknown Intel CPU type family %u model %u\n", family, model); + return family == 6 ? 
CPU_P6OLD : CPU_GENERIC; + } + +@@ -127,9 +134,6 @@ static int intel_memory_error(struct mce + case CPU_NEHALEM: + nehalem_memerr_misc(m, channel, dimm); + break; +- case CPU_XEON75XX: +- xeon75xx_memory_error(m, recordlen, channel, dimm); +- break; + case CPU_SANDY_BRIDGE_EP: + sandy_bridge_ep_memerr_misc(m, channel, dimm); + break; +diff -urNp mcelog-d2e13bf0.orig/intel.h mcelog-d2e13bf0/intel.h +--- mcelog-d2e13bf0.orig/intel.h 2016-11-30 11:23:54.530909154 -0500 ++++ mcelog-d2e13bf0/intel.h 2016-11-30 11:24:12.206619450 -0500 +@@ -25,6 +25,9 @@ extern int memory_error_support; + case CPU_BROADWELL_EPEX: \ + case CPU_ATOM: \ + case CPU_KNIGHTS_LANDING: \ ++ case CPU_KNIGHTS_MILL: \ + case CPU_SKYLAKE: \ +- case CPU_SKYLAKE_XEON ++ case CPU_SKYLAKE_XEON: \ ++ case CPU_KABYLAKE: \ ++ case CPU_DENVERTON + +diff -urNp mcelog-d2e13bf0.orig/leaky-bucket.c mcelog-d2e13bf0/leaky-bucket.c +--- mcelog-d2e13bf0.orig/leaky-bucket.c 2016-11-30 11:23:54.537909435 -0500 ++++ mcelog-d2e13bf0/leaky-bucket.c 2016-11-30 11:24:12.206619450 -0500 +@@ -72,7 +72,9 @@ static int timeconv(char unit, int *out) + case 'h': corr *= 60; + case 'm': corr *= 60; + case 0: break; +- default: return -1; ++ default: ++ *out = 1; ++ return -1; + } + *out = corr; + return 0; +diff -urNp mcelog-d2e13bf0.orig/Makefile mcelog-d2e13bf0/Makefile +--- mcelog-d2e13bf0.orig/Makefile 2016-11-30 11:23:54.538909475 -0500 ++++ mcelog-d2e13bf0/Makefile 2016-11-30 11:24:12.202619289 -0500 +@@ -17,11 +17,6 @@ WARNINGS := -Wall -Wextra -Wno-missing-f + -Wstrict-prototypes -Wformat-security -Wmissing-declarations \ + -Wdeclaration-after-statement + +-# The on disk database has still many problems (partly in this code and partly +-# due to missing support from BIOS), so it's disabled by default. 
You can +-# enable it here by uncommenting the following line +-# CONFIG_DISKDB = 1 +- + TRIGGERS=cache-error-trigger dimm-error-trigger page-error-trigger \ + socket-memory-error-trigger \ + bus-error-trigger \ +@@ -36,23 +31,16 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o co + nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \ + eventloop.o leaky-bucket.o memdb.o server.o trigger.o \ + client.o cache.o sysfs.o yellow.o page.o rbtree.o \ +- xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o \ ++ sandy-bridge.o ivy-bridge.o haswell.o \ + broadwell_de.o broadwell_epex.o skylake_xeon.o \ ++ denverton.o \ + msr.o bus.o unknown.o +-DISKDB_OBJ := diskdb.o dimm.o db.o +-CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} \ ++CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o \ + version.o version.c version.tmp + DOC := mce.pdf + + ADD_DEFINES := + +-ifdef CONFIG_DISKDB +-ADD_DEFINES := -DCONFIG_DISKDB=1 +-OBJ += ${DISKDB_OBJ} +- +-all: dbquery +-endif +- + SRC := $(OBJ:.o=.c) + + mcelog: ${OBJ} version.o +diff -urNp mcelog-d2e13bf0.orig/mcelog.c mcelog-d2e13bf0/mcelog.c +--- mcelog-d2e13bf0.orig/mcelog.c 2016-11-30 11:23:54.531909194 -0500 ++++ mcelog-d2e13bf0/mcelog.c 2016-11-30 11:25:24.563516902 -0500 +@@ -48,7 +48,6 @@ + #include "tsc.h" + #include "version.h" + #include "config.h" +-#include "diskdb.h" + #include "memutil.h" + #include "eventloop.h" + #include "memdb.h" +@@ -236,9 +235,12 @@ static char *cputype_name[] = { + [CPU_BROADWELL_DE] = "Intel Xeon (Broadwell) D family", + [CPU_BROADWELL_EPEX] = "Intel Xeon v4 (Broadwell) EP/EX", + [CPU_KNIGHTS_LANDING] = "Knights Landing", ++ [CPU_KNIGHTS_MILL] = "Knights Mill", + [CPU_ATOM] = "ATOM", + [CPU_SKYLAKE] = "Skylake", + [CPU_SKYLAKE_XEON] = "Skylake server", ++ [CPU_KABYLAKE] = "Kabylake", ++ [CPU_DENVERTON] = "Denverton", + }; + + static struct config_choice cpu_choices[] = { +@@ -282,10 +284,13 @@ static struct config_choice cpu_choices[ + { "broadwell-ep", CPU_BROADWELL_EPEX 
}, + { "broadwell-ex", CPU_BROADWELL_EPEX }, + { "knightslanding", CPU_KNIGHTS_LANDING }, ++ { "knightsmill", CPU_KNIGHTS_MILL }, + { "xeon-v4", CPU_BROADWELL_EPEX }, + { "atom", CPU_ATOM }, + { "skylake", CPU_SKYLAKE }, + { "skylake_server", CPU_SKYLAKE_XEON }, ++ { "kabylake", CPU_KABYLAKE }, ++ { "denverton", CPU_DENVERTON }, + { NULL } + }; + +@@ -356,7 +361,7 @@ static enum cputype setup_cpuid(u32 cpuv + return CPU_K8; + /* FALL THROUGH */ + default: +- Eprintf("Unknown CPU type vendor %u family %x model %x", ++ Eprintf("Unknown CPU type vendor %u family %u model %u", + cpuvendor, family, model); + return CPU_GENERIC; + } +@@ -449,12 +454,10 @@ static void dump_mce(struct mce *m, unsi + if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX && + cputype != CPU_HASWELL_EPEX && cputype != CPU_BROADWELL && + cputype != CPU_BROADWELL_DE && cputype != CPU_BROADWELL_EPEX && +- cputype != CPU_KNIGHTS_LANDING && cputype != CPU_SKYLAKE && +- cputype != CPU_SKYLAKE_XEON) ++ cputype != CPU_KNIGHTS_LANDING && cputype != CPU_KNIGHTS_MILL && ++ cputype != CPU_SKYLAKE && cputype != CPU_SKYLAKE_XEON && ++ cputype != CPU_KABYLAKE && cputype != CPU_DENVERTON) + resolveaddr(m->addr); +- if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) { +- diskdb_resolve_addr(m->addr); +- } + } + + static void dump_mce_raw_ascii(struct mce *m, unsigned recordlen) +@@ -889,6 +892,7 @@ static void remove_pidfile(void) + static void signal_exit(int sig) + { + remove_pidfile(); ++ client_cleanup(); + _exit(sig); + } + +@@ -974,7 +978,6 @@ void usage(void) + "--no-imc-log Disable extended iMC logging\n" + "--is-cpu-supported Exit with return code indicating whether the CPU is supported\n" + ); +- diskdb_usage(); + printf("\n"); + print_cputypes(); + exit(1); +@@ -1043,7 +1046,6 @@ static struct option options[] = { + { "debug-numerrors", 0, NULL, O_DEBUG_NUMERRORS }, /* undocumented: for testing */ + { "no-imc-log", 0, NULL, O_NO_IMC_LOG }, + { "is-cpu-supported", 0, NULL, 
O_IS_CPU_SUPPORTED }, +- DISKDB_OPTIONS + {} + }; + +@@ -1191,8 +1193,6 @@ void no_syslog(void) + static int combined_modifier(int opt) + { + int r = modifier(opt); +- if (r == 0) +- r = diskdb_modifier(opt); + return r; + } + +@@ -1369,8 +1369,6 @@ int main(int ac, char **av) + noargs(ac, av); + fprintf(stderr, "mcelog %s\n", MCELOG_VERSION); + exit(0); +- } else if (diskdb_cmd(opt, ac, av)) { +- exit(0); + } else if (opt == 0) + break; + } +diff -urNp mcelog-d2e13bf0.orig/mcelog.h mcelog-d2e13bf0/mcelog.h +--- mcelog-d2e13bf0.orig/mcelog.h 2016-11-30 11:23:54.539909515 -0500 ++++ mcelog-d2e13bf0/mcelog.h 2016-11-30 11:24:12.207619490 -0500 +@@ -127,9 +127,12 @@ enum cputype { + CPU_BROADWELL_DE, + CPU_BROADWELL_EPEX, + CPU_KNIGHTS_LANDING, ++ CPU_KNIGHTS_MILL, + CPU_ATOM, + CPU_SKYLAKE, + CPU_SKYLAKE_XEON, ++ CPU_KABYLAKE, ++ CPU_DENVERTON, + }; + + enum option_ranges { +diff -urNp mcelog-d2e13bf0.orig/mcelog.service mcelog-d2e13bf0/mcelog.service +--- mcelog-d2e13bf0.orig/mcelog.service 2016-11-30 11:23:54.540909556 -0500 ++++ mcelog-d2e13bf0/mcelog.service 2016-11-30 11:24:12.207619490 -0500 +@@ -5,6 +5,7 @@ After=syslog.target + [Service] + ExecStart=/usr/sbin/mcelog --ignorenodev --daemon --foreground + StandardOutput=syslog ++SuccessExitStatus=0 15 + + [Install] + WantedBy=multi-user.target +diff -urNp mcelog-d2e13bf0.orig/msr.c mcelog-d2e13bf0/msr.c +--- mcelog-d2e13bf0.orig/msr.c 2016-11-30 11:23:54.538909475 -0500 ++++ mcelog-d2e13bf0/msr.c 2016-11-30 11:24:12.207619490 -0500 +@@ -25,19 +25,20 @@ static void domsr(int cpu, int msr, int + } + if (pread(fd, &data, sizeof data, msr) != sizeof data) { + SYSERRprintf("Cannot read MSR_ERROR_CONTROL from %s\n", fpath); +- return; ++ goto out; + } + data |= bit; + if (pwrite(fd, &data, sizeof data, msr) != sizeof data) { + SYSERRprintf("Cannot write MSR_ERROR_CONTROL to %s\n", fpath); +- return; ++ goto out; + } + if (pread(fd, &data, sizeof data, msr) != sizeof data) { + SYSERRprintf("Cannot re-read 
MSR_ERROR_CONTROL from %s\n", fpath); +- return; ++ goto out; + } + if ((data & bit) == 0) + Lprintf("No DIMM detection available on cpu %d (normal in virtual environments)\n", cpu); ++out: + close(fd); + } + +diff -urNp mcelog-d2e13bf0.orig/nehalem.c mcelog-d2e13bf0/nehalem.c +--- mcelog-d2e13bf0.orig/nehalem.c 2016-11-30 11:23:54.537909435 -0500 ++++ mcelog-d2e13bf0/nehalem.c 2016-11-30 11:24:12.207619490 -0500 +@@ -24,7 +24,6 @@ + #include "nehalem.h" + #include "bitfield.h" + #include "memdb.h" +-#include "xeon75xx.h" + + /* See IA32 SDM Vol3B Appendix E.3.2 ff */ + +@@ -130,7 +129,8 @@ void decode_memory_controller(u32 status + if ((status & 0xf) == 0xf) + strcpy(channel, "unspecified"); + else { +- if (cputype == CPU_KNIGHTS_LANDING) /* Fix for Knights Landing MIC */ ++ /* Fix for Knights Landing/Mill MIC */ ++ if (cputype == CPU_KNIGHTS_LANDING || cputype == CPU_KNIGHTS_MILL) + sprintf(channel, "%u", (status & 0xf) + 3 * (bank == 15)); + else + sprintf(channel, "%u", status & 0xf); +@@ -170,7 +170,6 @@ void xeon75xx_decode_model(struct mce *m + decode_bitfield(status, internal_error_status); + decode_numfield(status, internal_error_numbers); + } +- xeon75xx_decode_dimm(m, msize); + } + + /* Nehalem-EP specific DIMM decoding */ +diff -urNp mcelog-d2e13bf0.orig/p4.c mcelog-d2e13bf0/p4.c +--- mcelog-d2e13bf0.orig/p4.c 2016-11-30 11:23:54.534909314 -0500 ++++ mcelog-d2e13bf0/p4.c 2016-11-30 11:24:12.208619530 -0500 +@@ -39,6 +39,7 @@ + #include "broadwell_de.h" + #include "broadwell_epex.h" + #include "skylake_xeon.h" ++#include "denverton.h" + + /* decode mce for P4/Xeon and Core2 family */ + +@@ -289,10 +290,29 @@ static const char *arstate[4] = { + [3] = "SRAR" + }; + ++static const char *ce_types[] = { ++ [0] = "ecc", ++ [1] = "mirroring with channel failover", ++ [2] = "mirroring. 
Primary channel scrubbed successfully" ++}; ++ ++static int check_for_mirror(__u8 bank, __u64 status, __u64 misc) ++{ ++ switch (cputype) { ++ case CPU_BROADWELL_EPEX: ++ return bdw_epex_ce_type(bank, status, misc); ++ case CPU_SKYLAKE_XEON: ++ return skylake_s_ce_type(bank, status, misc); ++ default: ++ return 0; ++ } ++} ++ + static int decode_mci(__u64 status, __u64 misc, int cpu, unsigned mcgcap, int *ismemerr, + int socket, __u8 bank) + { + u64 track = 0; ++ int i; + + Wprintf("MCi status:\n"); + if (!(status & MCI_STATUS_VAL)) +@@ -303,6 +323,8 @@ static int decode_mci(__u64 status, __u6 + + if (status & MCI_STATUS_UC) + Wprintf("Uncorrected error\n"); ++ else if ((i = check_for_mirror(bank, status, misc))) ++ Wprintf("Corrected error by %s\n", ce_types[i]); + else + Wprintf("Corrected error\n"); + +@@ -428,6 +450,9 @@ void decode_intel_mc(struct mce *log, in + case CPU_SKYLAKE_XEON: + skylake_s_decode_model(cputype, log->bank, log->status, log->misc); + break; ++ case CPU_DENVERTON: ++ denverton_decode_model(cputype, log->bank, log->status, log->misc); ++ break; + } + } + +diff -urNp mcelog-d2e13bf0.orig/README mcelog-d2e13bf0/README +--- mcelog-d2e13bf0.orig/README 2016-11-30 11:23:54.538909475 -0500 ++++ mcelog-d2e13bf0/README 1969-12-31 19:00:00.000000000 -0500 +@@ -1,119 +0,0 @@ +-mcelog is the user space backend for logging machine check errors +-reported by the hardware to the kernel. The kernel does the immediate +-actions (like killing processes etc.) and mcelog decodes the errors +-and manages various other advanced error responses like +-offlining memory, CPUs or triggering events. In addition +-mcelog also handles corrected errors, by logging and accounting them. +- +-It primarily handles machine checks and thermal events, which +-are reported for errors detected by the CPU. 
+- +-For more details on what mcelog can do and the underlying theory +-see http://www.mcelog.org +- +-It is recommended that mcelog runs on all x86 machines, both +-64bit (since early 2.6) and 32bit (since 2.6.32) +- +-mcelog can run in several modi: cronjob, trigger, daemon +- +-cronjob is the old method. mcelog runs every 5 minutes from cron and checks +-for errors. Disadvantage of this is that it can delay error reporting +-significantly (upto 10 minutes) and does not allow mcelog to keep extended state. +- +-trigger is a newer method where the kernel runs mcelog on a error. +-This is configured with +-echo /usr/sbin/mcelog > /sys/devices/system/machinecheck/machinecheck0/trigger +-This is faster, but still doesn't allow mcelog to keep state, +-and has relatively high overhead for each error because a program has +-to be initialized from scratch. +- +-In daemon mode mcelog runs continuously as a daemon in the background +-and wait for errors. It is enabled by running mcelog --daemon & +-from a init script. This is the fastest and most feature-ful. +- +-The recommended mode is daemon, because several new functions (like page error +-predictive failure analysis) require a continuously running daemon. +- +-Documentation: +- +-The primary reference documentation are the man pages. +-lk10-mcelog.pdf has a overview over the errors mcelog handles +-(originally from Linux Kongress 2010) +-mce.pdf is a very old paper describing the first releases of mcelog +-(some parts are obsolete) +- +-For distributors: +- +-You can run mcelog from systemd or similar daemons. An example +-systemd unit file is in mcelog.service. +- +-For older distributions using init scripts: +- +-Please install a init script by default that runs mcelog in daemon mode. +-The mcelog.init script is a good starting point. +- +-Also install a logrotated file (mcelog.logrotate) or equivalent +-when mcelog is running in daemon mode. +- +-These two are not in make install. 
+- +-The installation also requires a config file (/etc/mcelog.conf) and +-the default triggers. These are all installed by "make install" +- +-/dev/mcelog is needed for mcelog operation +-If it's not there it can be created with mknod /dev/mcelog c 10 227 +-Normally it should be created automatically in udev. +- +-Security: +- +-mcelog needs to run as root because it might trigger actions like +-page-offlining, which require CAP_SYS_ADMIN. Also it opens /dev/mcelog +-and a unix socket for client support. +- +-It also opens /dev/mem to parse the BIOS DMI tables. It is careful +-to close the file descriptor and unmap any mappings after using them. +- +-There is support for changing the user in daemon mode after opening +-the device and the sockets, but that would stop triggers from +-doing corrective action that require root. +- +-In principle it would be possible to only keep CAP_SYS_ADMIN +-for page-offling, but that would prevent triggers from doing root +-only actions not covered by it (and CAP_SYS_ADMIN is not that different +-from full root) +- +-In daemon mode mcelog listens to a unix socket and processes +-requests from mcelog --client. This can be disabled in the configuration file. +-The uid/gid of the requestor is checked on access and is configurable +-(default 0/0 only). The command parsing code is very straight forward +-(server.c) The client parsing/reply is currently done with full privileges +-of the daemon. +- +-Testing: +- +-There is a simple test suite in tests/. The test suite requires root to +-run and access to mce-inject and a kernel with MCE injection support +-(CONFIG_X86_MCE_INJECT). It will kill any running mcelog daemon. +- +-Run it with "make test" +- +-The test suite requires the mce-inject tool, available from +-git://git.kernel.org/pub/utils/cpu/mce/mce-inject.git +-The mce-inject executable must be either in $PATH or in the +-../mce-inject directory. +- +-You can also test under valgrind with "make valgrind-test". 
For +-this valgrind needs to be installed of course. Advanced +-valgrind options can be specified with +-make VALGRIND="valgrind --option" valgrind-test +- +-Other checks: +- +-make iccverify and make clangverify run the static verifiers +-in clang and icc respectively. +- +-License: +- +-This program is licensed under the subject of the GNU Public General +-License, v.2 +- +diff -urNp mcelog-d2e13bf0.orig/README.md mcelog-d2e13bf0/README.md +--- mcelog-d2e13bf0.orig/README.md 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/README.md 2016-11-30 11:24:12.202619289 -0500 +@@ -0,0 +1,129 @@ ++# mcelog ++ ++mcelog is the user space backend for logging machine check errors reported ++by the hardware to the kernel. The kernel does the immediate actions ++(like killing processes etc.) and mcelog decodes the errors and manages ++various other advanced error responses like offlining memory, CPUs or triggering ++events. In addition mcelog also handles corrected errors, by logging and ++accounting them. ++It primarily handles machine checks and thermal events, which are reported ++for errors detected by the CPU. ++ ++For more details on what mcelog can do and the underlying theory ++see [mcelog.org](http://www.mcelog.org). ++ ++It is recommended that mcelog runs on all x86 machines, both 64bit ++(since early 2.6) and 32bit (since 2.6.32). ++ ++mcelog can run in several modes: ++ ++- cronjob ++- trigger ++- daemon ++ ++**cronjob** is the old method. mcelog runs every 5 minutes from cron and checks ++for errors. Disadvantage of this is that it can delay error reporting ++significantly (upto 10 minutes) and does not allow mcelog to keep extended state. ++ ++**trigger** is a newer method where the kernel runs mcelog on a error. 
++ ++This is configured with: ++```sh ++echo /usr/sbin/mcelog > /sys/devices/system/machinecheck/machinecheck0/trigger ++``` ++This is faster, but still doesn't allow mcelog to keep state, ++and has relatively high overhead for each error because a program has ++to be initialized from scratch. ++ ++In **daemon** mode mcelog runs continuously as a daemon in the background and ++waits for errors. It is enabled by running `mcelog --daemon &` ++from an init script. This is the fastest and most feature-ful. ++ ++The recommended mode is **daemon**, because several new functions (like page ++error predictive failure analysis) require a continuously running daemon. ++ ++## Documentation ++ ++- The primary reference documentation is the man pages. ++- [lk10-mcelog.pdf](https://github.com/andikleen/mcelog/blob/master/lk10-mcelog.pdf) ++ has an overview of the errors mcelog handles (originally from Linux Kongress 2010). ++- [mce.pdf](https://github.com/mjtrangoni/mcelog/blob/README.md/mce.pdf) ++ is a very old paper describing the first releases of mcelog (some parts are obsolete). ++ ++## For distributors ++ ++You can run mcelog from systemd or similar daemons. An example systemd unit ++file is in `mcelog.service`. ++ ++### For older distributions using init scripts ++ ++Please install an init script by default that runs mcelog in daemon mode. ++The `mcelog.init` script is a good starting point. Also install a ++logrotated file (mcelog.logrotate) or equivalent when mcelog is running ++in daemon mode. ++These two are not in make install. ++ ++The installation also requires a config file `/etc/mcelog.conf` and the default ++triggers. These are all installed by `make install`. ++ ++`/dev/mcelog` is needed for mcelog operation. If it's not there it can be ++created with: ++```sh ++mknod /dev/mcelog c 10 227 ++``` ++ ++Normally it should be created automatically in udev.
++ ++## Security ++ ++mcelog needs to run as root because it might trigger actions like ++page-offlining, which require `CAP_SYS_ADMIN`. Also it opens `/dev/mcelog` ++and a UNIX socket for client support. ++ ++It also opens `/dev/mem` to parse the BIOS DMI tables. It is careful to close ++the file descriptor and unmap any mappings after using them. ++ ++There is support for changing the user in daemon mode after opening the device ++and the sockets, but that would stop triggers from doing corrective actions ++that require `root`. ++ ++In principle it would be possible to only keep `CAP_SYS_ADMIN` for page-offlining, ++but that would prevent triggers from doing root-only actions not covered by ++it (and `CAP_SYS_ADMIN` is not that different from full root). ++ ++In `daemon` mode mcelog listens to a UNIX socket and processes requests from ++`mcelog --client`. This can be disabled in the configuration file. ++The uid/gid of the requestor is checked on access and is configurable ++(default 0/0 only). The command parsing code is very straightforward ++(server.c). The client parsing/reply is currently done with full privileges ++of the `daemon`. ++ ++## Testing ++ ++There is a simple test suite in `tests/`. The test suite requires root to ++run and access to mce-inject and a kernel with MCE injection support ++(`CONFIG_X86_MCE_INJECT`). It will kill any running mcelog daemon. ++ ++Run it with `make test`. ++ ++The test suite requires the ++[mce-inject](git://git.kernel.org/pub/utils/cpu/mce/mce-inject.git) tool. ++The `mce-inject` executable must be either in `$PATH` or in the ++`../mce-inject` directory. ++ ++You can also test under **valgrind** with `make valgrind-test`. For this ++valgrind needs to be installed of course.
Advanced valgrind options can be ++specified with: ++```sh ++make VALGRIND="valgrind --option" valgrind-test ++``` ++ ++### Other checks ++ ++`make iccverify` and `make clangverify` run the static verifiers in *clang* ++and *icc* respectively. ++ ++## License ++ ++This program is licensed under the subject of the GNU Public General ++License, v.2 +diff -urNp mcelog-d2e13bf0.orig/skylake_xeon.c mcelog-d2e13bf0/skylake_xeon.c +--- mcelog-d2e13bf0.orig/skylake_xeon.c 2016-11-30 11:23:54.538909475 -0500 ++++ mcelog-d2e13bf0/skylake_xeon.c 2016-11-30 11:24:12.208619530 -0500 +@@ -23,6 +23,11 @@ + #include "skylake_xeon.h" + #include "memdb.h" + ++/* Memory error was corrected by mirroring with channel failover */ ++#define SKX_MCI_MISC_FO (1ULL<<63) ++/* Memory error was corrected by mirroring and primary channel scrubbed successfully */ ++#define SKX_MCI_MISC_MC (1ULL<<62) ++ + /* See IA32 SDM Vol3B Table 16-27 */ + + static char *pcu_1[] = { +@@ -208,3 +213,18 @@ void skylake_s_decode_model(int cputype, + break; + } + } ++ ++int skylake_s_ce_type(int bank, u64 status, u64 misc) ++{ ++ if (!(bank == 7 || bank == 8)) ++ return 0; ++ ++ if (status & MCI_STATUS_MISCV) { ++ if (misc & SKX_MCI_MISC_FO) ++ return 1; ++ if (misc & SKX_MCI_MISC_MC) ++ return 2; ++ } ++ ++ return 0; ++} +diff -urNp mcelog-d2e13bf0.orig/skylake_xeon.h mcelog-d2e13bf0/skylake_xeon.h +--- mcelog-d2e13bf0.orig/skylake_xeon.h 2016-11-30 11:23:54.539909515 -0500 ++++ mcelog-d2e13bf0/skylake_xeon.h 2016-11-30 11:24:12.208619530 -0500 +@@ -1 +1,2 @@ + void skylake_s_decode_model(int cputype, int bank, u64 status, u64 misc); ++int skylake_s_ce_type(int bank, u64 status, u64 misc); +diff -urNp mcelog-d2e13bf0.orig/sysfs.c mcelog-d2e13bf0/sysfs.c +--- mcelog-d2e13bf0.orig/sysfs.c 2016-11-30 11:23:54.534909314 -0500 ++++ mcelog-d2e13bf0/sysfs.c 2016-11-30 11:24:12.208619530 -0500 +@@ -37,10 +37,10 @@ char *read_field(char *base, char *name) + + asprintf(&fn, "%s/%s", base, name); + fd = open(fn, O_RDONLY); 
++ free(fn); + if (fstat(fd, &st) < 0) + goto bad; + buf = xalloc(st.st_size); +- free(fn); + if (fd < 0) + goto bad; + n = read(fd, buf, st.st_size); +@@ -81,10 +81,12 @@ unsigned read_field_map(char *base, char + if (!strcmp(val, map->name)) + break; + } +- free(val); +- if (map->name) ++ if (map->name) { ++ free(val); + return map->value; ++ } + Eprintf("sysfs field %s/%s has unknown string value `%s'\n", base, name, val); ++ free(val); + return -1; + } + +diff -urNp mcelog-d2e13bf0.orig/TODO-diskdb mcelog-d2e13bf0/TODO-diskdb +--- mcelog-d2e13bf0.orig/TODO-diskdb 2016-11-30 11:23:54.530909154 -0500 ++++ mcelog-d2e13bf0/TODO-diskdb 1969-12-31 19:00:00.000000000 -0500 +@@ -1,31 +0,0 @@ +- +-diskdb was a experimental attempt to track errors per DIMM +-on disk. It ran into problems unfortunately. +- +-diskdb is not compiled by default now. It can be enabled with +-make CONFIG_DISKDB=1 +- +-It is replaced with a new memory only database now that +-relies on daemon mode. +- +-Open fundamental issues: +-- DIMM tracking over boot doesn't work due to SMBIOS not reporting +-serial numbers +- +-Code problems: +-- Missing aging +-- For Intel Nehalem CE errors need reverse smbios translation +-- SMBIOS interleaving decoding missing +-- Some crash races in db.c (see comments there) +-- Need lock timeout +-- Default enable/disable heuristics (smbios check etc.) +-- write db test suite (with crash) +- +-General: +-- Missing CPU database +- +-Missing: +-- rename to different name without memory +- +-Old: +-- add ifdef for memory because it's broken +diff -urNp mcelog-d2e13bf0.orig/xeon75xx.c mcelog-d2e13bf0/xeon75xx.c +--- mcelog-d2e13bf0.orig/xeon75xx.c 2016-11-30 11:23:54.537909435 -0500 ++++ mcelog-d2e13bf0/xeon75xx.c 1969-12-31 19:00:00.000000000 -0500 +@@ -1,39 +0,0 @@ +-/* Copyright (C) 2009/2010 Intel Corporation +- +- Decode Intel Xeon75xx memory errors. Requires the mce-75xx.ko driver +- load. The core errors are the same as Nehalem. 
+- +- mcelog is free software; you can redistribute it and/or +- modify it under the terms of the GNU General Public +- License as published by the Free Software Foundation; version +- 2. +- +- mcelog is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- General Public License for more details. +- +- You should find a copy of v2 of the GNU General Public License somewhere +- on your Linux system; if not, write to the Free Software Foundation, +- Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +- +- Author: Andi Kleen +-*/ +- +-#include +-#include +-#include "mcelog.h" +-#include "xeon75xx.h" +- +-/* This used to decode the old xeon 75xx memory error aux format. But that has never +- been merged into mainline kernels, so removed it again. */ +- +-void +-xeon75xx_memory_error(struct mce *m, unsigned msize, int *channel, int *dimm) +-{ +-} +- +- +-void xeon75xx_decode_dimm(struct mce *m, unsigned msize) +-{ +-} +diff -urNp mcelog-d2e13bf0.orig/xeon75xx.h mcelog-d2e13bf0/xeon75xx.h +--- mcelog-d2e13bf0.orig/xeon75xx.h 2016-11-30 11:23:54.537909435 -0500 ++++ mcelog-d2e13bf0/xeon75xx.h 1969-12-31 19:00:00.000000000 -0500 +@@ -1,2 +0,0 @@ +-void xeon75xx_memory_error(struct mce *m, unsigned msize, int *channel, int *dimm); +-void xeon75xx_decode_dimm(struct mce *m, unsigned msize); diff --git a/SOURCES/mcelog-update-9de4924.patch b/SOURCES/mcelog-update-9de4924.patch new file mode 100644 index 0000000..00f6812 --- /dev/null +++ b/SOURCES/mcelog-update-9de4924.patch @@ -0,0 +1,687 @@ +diff --git a/Makefile b/Makefile +index a91950c..f8199f6 100644 +--- a/Makefile ++++ b/Makefile +@@ -22,7 +22,10 @@ WARNINGS := -Wall -Wextra -Wno-missing-field-initializers -Wno-unused-parameter + # CONFIG_DISKDB = 1 + + TRIGGERS=cache-error-trigger dimm-error-trigger page-error-trigger \ +- socket-memory-error-trigger ++ 
socket-memory-error-trigger \ ++ bus-error-trigger \ ++ iomca-error-trigger \ ++ unknown-error-trigger + + all: mcelog + +@@ -32,7 +35,7 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o core2.o bitfield.o intel.o \ + nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \ + eventloop.o leaky-bucket.o memdb.o server.o trigger.o \ + client.o cache.o sysfs.o yellow.o page.o rbtree.o \ +- xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o ++ xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o bus.o unknown.o + DISKDB_OBJ := diskdb.o dimm.o db.o + CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} + DOC := mce.pdf +diff --git a/bus.c b/bus.c +new file mode 100644 +index 0000000..f48bc38 +--- /dev/null ++++ b/bus.c +@@ -0,0 +1,129 @@ ++/* Copyright (C) 2014 Intel Corporation ++ Author: Rui Wang ++ Handle 'Bus and Interconnect' error threshold indications. ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system.
*/ ++#define _GNU_SOURCE 1 ++#include ++#include ++#include ++#include ++#include ++#include "memutil.h" ++#include "mcelog.h" ++#include "config.h" ++#include "trigger.h" ++#include "bus.h" ++ ++static char *bus_trigger, *iomca_trigger; ++ ++enum { ++ MAX_ENV = 20, ++}; ++ ++void bus_setup(void) ++{ ++ bus_trigger = config_string("socket", "bus-uc-threshold-trigger"); ++ if (bus_trigger && trigger_check(bus_trigger) < 0) { ++ SYSERRprintf("Cannot access bus threshold trigger `%s'", ++ bus_trigger); ++ exit(1); ++ } ++ ++ iomca_trigger = config_string("socket", "iomca-threshold-trigger"); ++ if (iomca_trigger && trigger_check(iomca_trigger) < 0) { ++ SYSERRprintf("Cannot access iomca threshold trigger `%s'", ++ iomca_trigger); ++ exit(1); ++ } ++} ++ ++void run_bus_trigger(int socket, int cpu, char *level, char *pp, char *rrrr, ++ char *ii, char *timeout) ++{ ++ int ei = 0; ++ char *env[MAX_ENV]; ++ int i; ++ char *msg; ++ char *location; ++ ++ if (socket >= 0) ++ asprintf(&location, "CPU %d on socket %d", cpu, socket); ++ else ++ asprintf(&location, "CPU %d", cpu); ++ asprintf(&msg, "%s received Bus and Interconnect Errors in %s", ++ location, ii); ++ asprintf(&env[ei++], "LOCATION=%s", location); ++ free(location); ++ ++ if (!bus_trigger) ++ goto out; ++ ++ if (socket >= 0) ++ asprintf(&env[ei++], "SOCKETID=%d", socket); ++ asprintf(&env[ei++], "MESSAGE=%s", msg); ++ asprintf(&env[ei++], "CPU=%d", cpu); ++ asprintf(&env[ei++], "LEVEL=%s", level); ++ asprintf(&env[ei++], "PARTICIPATION=%s", pp); ++ asprintf(&env[ei++], "REQUEST=%s", rrrr); ++ asprintf(&env[ei++], "ORIGIN=%s", ii); ++ asprintf(&env[ei++], "TIMEOUT=%s", timeout); ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(bus_trigger, NULL, env); ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++out: ++ free(msg); ++} ++ ++void run_iomca_trigger(int socket, int cpu, int seg, int bus, int dev, int fn) ++{ ++ int ei = 0; ++ char *env[MAX_ENV]; ++ int i; ++ char *msg; ++ char *location; ++ ++ if (socket 
>= 0) ++ asprintf(&location, "CPU %d on socket %d", cpu, socket); ++ else ++ asprintf(&location, "CPU %d", cpu); ++ asprintf(&msg, "%s received IO MCA Errors from %x:%02x:%02x.%x", ++ location, seg, bus, dev, fn); ++ asprintf(&env[ei++], "LOCATION=%s", location); ++ free(location); ++ ++ if (!iomca_trigger) ++ goto out; ++ ++ if (socket >= 0) ++ asprintf(&env[ei++], "SOCKETID=%d", socket); ++ asprintf(&env[ei++], "MESSAGE=%s", msg); ++ asprintf(&env[ei++], "CPU=%d", cpu); ++ asprintf(&env[ei++], "SEG=%x", seg); ++ asprintf(&env[ei++], "BUS=%02x", bus); ++ asprintf(&env[ei++], "DEVICE=%02x", dev); ++ asprintf(&env[ei++], "FUNCTION=%x", fn); ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(iomca_trigger, NULL, env); ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++out: ++ free(msg); ++ ++} +diff --git a/bus.h b/bus.h +new file mode 100644 +index 0000000..37ac592 +--- /dev/null ++++ b/bus.h +@@ -0,0 +1,4 @@ ++void bus_setup(void); ++void run_bus_trigger(int socket, int cpu, char *level, char *pp, char *rrrr, ++ char *ii, char *timeout); ++void run_iomca_trigger(int socket, int cpu, int seg, int bus, int dev, int fn); +diff --git a/input/iomca b/input/iomca +new file mode 100644 +index 0000000..9a1e27d +--- /dev/null ++++ b/input/iomca +@@ -0,0 +1,4 @@ ++CPU 0 BANK 1 ++STATUS 0x9c00000000000e0b ++MISC 0xabcdef ++ADDR 0xabcd +diff --git a/input/unknown b/input/unknown +new file mode 100644 +index 0000000..29a2436 +--- /dev/null ++++ b/input/unknown +@@ -0,0 +1,4 @@ ++CPU 0 BANK 1 ++STATUS 0x9c0000000000040b ++MISC 0xabcdef ++ADDR 0xabcd +diff --git a/mcelog.c b/mcelog.c +index 89bb537..95a913f 100644 +--- a/mcelog.c ++++ b/mcelog.c +@@ -58,6 +58,8 @@ + #include "msg.h" + #include "yellow.h" + #include "page.h" ++#include "bus.h" ++#include "unknown.h" + + enum cputype cputype = CPU_GENERIC; + +@@ -567,6 +569,12 @@ static char *skipgunk(char *s) + if (*s == ']') + ++s; + } ++ ++ s = skipspace(s); ++ ++ if (strncmp(s, "mce: [Hardware Error]:", 22) == 0) ++ s += 
22; ++ + return skipspace(s); + } + +@@ -1153,6 +1161,8 @@ static void general_setup(void) + { + trigger_setup(); + yellow_setup(); ++ bus_setup(); ++ unknown_setup(); + config_cred("global", "run-credentials", &runcred); + if (config_bool("global", "filter-memory-errors") == 1) + filter_memory_errors = 1; +diff --git a/mcelog.conf b/mcelog.conf +index 1bab3ee..6a2be26 100644 +--- a/mcelog.conf ++++ b/mcelog.conf +@@ -127,6 +127,9 @@ mem-ce-error-threshold = 100 / 24h + # Log socket error threshold explicitely? + mem-ce-error-log = yes + ++bus-uc-threshold-trigger = bus-error-trigger ++iomca-threshold-trigger = iomca-error-trigger ++unknown-threshold-trigger = unknown-error-trigger + + [cache] + # Processing of cache error thresholds reported by Intel CPUs +diff --git a/msr.c b/msr.c +index 2eef9d2..665cac3 100644 +--- a/msr.c ++++ b/msr.c +@@ -36,10 +36,8 @@ static void domsr(int cpu, int msr, int bit) + SYSERRprintf("Cannot re-read MSR_ERROR_CONTROL from %s\n", fpath); + exit(1); + } +- if ((data & bit) == 0) { +- SYSERRprintf("Failed to set imc_log on cpu %d\n", cpu); +- exit(1); +- } ++ if ((data & bit) == 0) ++ Lprintf("No DIMM detection available on cpu %d (normal in virtual environments)\n", cpu); + close(fd); + } + +@@ -54,6 +52,8 @@ void set_imc_log(int cputype) + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; ++ default: ++ return; + } + + for (cpu = 0; cpu < ncpus; cpu++) +diff --git a/p4.c b/p4.c +index 8a3b5a6..f938196 100644 +--- a/p4.c ++++ b/p4.c +@@ -30,6 +30,8 @@ + #include "tulsa.h" + #include "intel.h" + #include "yellow.h" ++#include "bus.h" ++#include "unknown.h" + #include "bitfield.h" + #include "sandy-bridge.h" + #include "ivy-bridge.h" +@@ -116,7 +118,7 @@ static char* get_II_str(__u8 i) + return II[i]; + } + +-static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) ++static int decode_mca(u64 status, u64 misc, u64 track, int cpu, int *ismemerr, int socket) + { + #define 
TLB_LL_MASK 0x3 /*bit 0, bit 1*/ + #define TLB_LL_SHIFT 0x0 +@@ -141,6 +143,8 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) + #define BUS_PP_MASK 0x600 /*bit 9, bit 10*/ + #define BUS_PP_SHIFT 0x9 + ++ u32 mca; ++ int ret = 0; + static char *msg[] = { + [0] = "No Error", + [1] = "Unclassified", +@@ -151,6 +155,7 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) + [6] = "SMM Handler Code Access Violation", + }; + ++ mca = status & 0xffff; + if (mca & (1UL << 12)) { + Wprintf("corrected filtering (some unreported errors in same region)\n"); + mca &= ~(1UL << 12); +@@ -158,16 +163,27 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) + + if (mca < NELE(msg)) { + Wprintf("%s\n", msg[mca]); +- return; ++ return ret; + } + + if ((mca >> 2) == 3) { +- Wprintf("%s Generic memory hierarchy error\n", get_LL_str(mca & 3)); ++ unsigned levelnum; ++ char *level; ++ levelnum = mca & 3; ++ level = get_LL_str(levelnum); ++ Wprintf("%s Generic cache hierarchy error\n", level); ++ if (track == 2) ++ run_yellow_trigger(cpu, -1, levelnum, "unknown", level, socket); + } else if (test_prefix(4, mca)) { +- Wprintf("%s TLB %s Error\n", +- get_TT_str((mca & TLB_TT_MASK) >> TLB_TT_SHIFT), +- get_LL_str((mca & TLB_LL_MASK) >> +- TLB_LL_SHIFT)); ++ unsigned levelnum, typenum; ++ char *level, *type; ++ typenum = (mca & TLB_TT_MASK) >> TLB_TT_SHIFT; ++ type = get_TT_str(typenum); ++ levelnum = (mca & TLB_LL_MASK) >> TLB_LL_SHIFT; ++ level = get_LL_str(levelnum); ++ Wprintf("%s TLB %s Error\n", type, level); ++ if (track == 2) ++ run_yellow_trigger(cpu, typenum, levelnum, type, level, socket); + } else if (test_prefix(8, mca)) { + unsigned typenum = (mca & CACHE_TT_MASK) >> CACHE_TT_SHIFT; + unsigned levelnum = (mca & CACHE_LL_MASK) >> CACHE_LL_SHIFT; +@@ -177,25 +193,51 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) + get_RRRR_str((mca & CACHE_RRRR_MASK) 
>> + CACHE_RRRR_SHIFT)); + if (track == 2) +- run_yellow_trigger(cpu, typenum, levelnum, type, level, socket); ++ run_yellow_trigger(cpu, typenum, levelnum, type, level,socket); + } else if (test_prefix(10, mca)) { + if (mca == 0x400) + Wprintf("Internal Timer error\n"); + else + Wprintf("Internal unclassified error: %x\n", mca & 0xffff); ++ ++ ret = 1; + } else if (test_prefix(11, mca)) { +- Wprintf("BUS %s %s %s %s %s Error\n", +- get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT), +- get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT), +- get_RRRR_str((mca & BUS_RRRR_MASK) >> +- BUS_RRRR_SHIFT), +- get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT), +- get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT)); ++ char *level, *pp, *rrrr, *ii, *timeout; ++ ++ level = get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT); ++ pp = get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT); ++ rrrr = get_RRRR_str((mca & BUS_RRRR_MASK) >> BUS_RRRR_SHIFT); ++ ii = get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT); ++ timeout = get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT); ++ ++ Wprintf("BUS error: %d %d %s %s %s %s %s\n", socket, cpu, ++ level, pp, rrrr, ii, timeout); ++ run_bus_trigger(socket, cpu, level, pp, rrrr, ii, timeout); ++ /* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values ++ * and MISCV set. MISC register points to root port that reported the error ++ * need to cross check with AER logs for more details. 
++ * See: http://www.intel.com/content/www/us/en/architecture-and-technology/enhanced-mca-logging-xeon-paper.html ++ */ ++ if ((status & MCI_STATUS_MISCV) && ++ (status & 0xefff) == 0x0e0b) { ++ int seg, bus, dev, fn; ++ ++ seg = EXTRACT(misc, 32, 39); ++ bus = EXTRACT(misc, 24, 31); ++ dev = EXTRACT(misc, 19, 23); ++ fn = EXTRACT(misc, 16, 18); ++ Wprintf("IO MCA reported by root port %x:%02x:%02x.%x\n", ++ seg, bus, dev, fn); ++ run_iomca_trigger(socket, cpu, seg, bus, dev, fn); ++ } + } else if (test_prefix(7, mca)) { + decode_memory_controller(mca); + *ismemerr = 1; +- } else ++ } else { + Wprintf("Unknown Error %x\n", mca); ++ ret = 1; ++ } ++ return ret; + } + + static void p4_decode_model(__u32 model) +@@ -243,7 +285,7 @@ static const char *arstate[4] = { + [3] = "SRAR" + }; + +-static void decode_mci(__u64 status, int cpu, unsigned mcgcap, int *ismemerr, ++static int decode_mci(__u64 status, __u64 misc, int cpu, unsigned mcgcap, int *ismemerr, + int socket) + { + u64 track = 0; +@@ -280,7 +322,7 @@ static void decode_mci(__u64 status, int cpu, unsigned mcgcap, int *ismemerr, + decode_tracking(track); + } + Wprintf("MCA: "); +- decode_mca(status & 0xffffL, track, cpu, ismemerr, socket); ++ return decode_mca(status, misc, track, cpu, ismemerr, socket); + } + + static void decode_mcg(__u64 mcgstatus) +@@ -314,11 +356,14 @@ void decode_intel_mc(struct mce *log, int cputype, int *ismemerr, unsigned size) + + if (log->bank == MCE_THERMAL_BANK) { + decode_thermal(log, cpu); ++ run_unknown_trigger(socket, cpu, log); + return; + } + + decode_mcg(log->mcgstatus); +- decode_mci(log->status, cpu, log->mcgcap, ismemerr, socket); ++ if (decode_mci(log->status, log->misc, cpu, log->mcgcap, ismemerr, ++ socket)) ++ run_unknown_trigger(socket, cpu, log); + + if (test_prefix(11, (log->status & 0xffffL))) { + switch (cputype) { +@@ -365,23 +410,6 @@ void decode_intel_mc(struct mce *log, int cputype, int *ismemerr, unsigned size) + hsw_decode_model(cputype, log->bank, 
log->status, log->misc); + break; + } +- +- /* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values +- * and MISCV set. MISC register points to root port that reported the error +- * need to cross check with AER logs for more details. +- * See: http://www.intel.com/content/www/us/en/architecture-and-technology/enhanced-mca-logging-xeon-paper.html +- */ +- if ((log->status & MCI_STATUS_MISCV) && +- (log->status & 0xefff) == 0x0e0b) { +- int seg, bus, dev, fn; +- +- seg = EXTRACT(log->misc, 32, 39); +- bus = EXTRACT(log->misc, 24, 31); +- dev = EXTRACT(log->misc, 19, 23); +- fn = EXTRACT(log->misc, 16, 18); +- Wprintf("IO MCA reported by root port %x:%02x:%02x.%x\n", +- seg, bus, dev, fn); +- } + } + + char *intel_bank_name(int num) +diff --git a/tests/unknown/inject b/tests/unknown/inject +new file mode 100755 +index 0000000..7be39a7 +--- /dev/null ++++ b/tests/unknown/inject +@@ -0,0 +1,8 @@ ++#!/bin/sh ++ ++B=$(pwd)/../.. ++ ++PATH=$PATH:$B/../mce-inject ++ ++mce-inject $B/input/iomca ++mce-inject $B/input/unknown +diff --git a/tests/unknown/unknown.conf b/tests/unknown/unknown.conf +new file mode 100644 +index 0000000..4b86db7 +--- /dev/null ++++ b/tests/unknown/unknown.conf +@@ -0,0 +1,11 @@ ++# trigger: 3 ++ ++num-errors = 2 ++ ++[socket] ++bus-uc-threshold-trigger = ../trigger ++iomca-threshold-trigger = ../trigger ++unknown-threshold-trigger = ../trigger ++ ++[trigger] ++directory = . 
+diff --git a/triggers/bus-error-trigger b/triggers/bus-error-trigger +new file mode 100644 +index 0000000..c996001 +--- /dev/null ++++ b/triggers/bus-error-trigger +@@ -0,0 +1,23 @@ ++#!/bin/sh ++# This shell script can be executed by mcelog in daemon mode when a socket ++# receives Bus and Interconnect errors ++# ++# environment: ++# MESSAGE Human readable consolidated error message ++# LOCATION Consolidated location as a single string ++# SOCKETID Socket ID of the CPU that received the error ++# LEVEL Interconnect level ++# PARTICIPATION Processor Participation (Originator, Responder or Observer) ++# REQUEST Request type (read, write, prefetch, etc.) ++# ORIGIN Memory or IO ++# TIMEOUT The request timed out or not ++# ++# note: will run as mcelog configured user ++# this can be changed in mcelog.conf ++ ++logger -s -p daemon.err -t mcelog "$MESSAGE" ++logger -s -p daemon.err -t mcelog "Location: $LOCATION" ++ ++[ -x ./bus-error-trigger.local ] && . ./bus-error-trigger.local ++ ++exit 0 +diff --git a/triggers/iomca-error-trigger b/triggers/iomca-error-trigger +new file mode 100644 +index 0000000..3888461 +--- /dev/null ++++ b/triggers/iomca-error-trigger +@@ -0,0 +1,23 @@ ++#!/bin/sh ++# This shell script can be executed by mcelog in daemon mode when a socket ++# receives IO MCA errors ++# ++# environment: ++# MESSAGE Human readable consolidated error message ++# LOCATION Consolidated location as a single string ++# SOCKETID Socket ID of the CPU that received the error ++# CPU Linux CPU number that triggered the error ++# SEG PCI segment number ++# BUS PCI bus number ++# DEVICE PCI device number ++# FUNCTION PCI function number ++# ++# note: will run as mcelog configured user ++# this can be changed in mcelog.conf ++ ++logger -s -p daemon.err -t mcelog "$MESSAGE" ++logger -s -p daemon.err -t mcelog "Location: $LOCATION" ++ ++[ -x ./iomca-error-trigger.local ] && .
./iomca-error-trigger.local ++ ++exit 0 +diff --git a/triggers/unknown-error-trigger b/triggers/unknown-error-trigger +new file mode 100644 +index 0000000..b924a0e +--- /dev/null ++++ b/triggers/unknown-error-trigger +@@ -0,0 +1,26 @@ ++#!/bin/sh ++# This shell script is executed by mcelog in daemon mode when ++# a not otherwise handled machine check error happens. ++# ++# environment: ++# MESSAGE Human readable consolidated error message ++# LOCATION Consolidated location as a single string ++# SOCKETID Socket ID of the CPU that received the error ++# CPU Linux CPU number that triggered the error ++# STATUS IA32_MCi_STATUS register value ++# ADDR IA32_MCi_ADDR register value ++# MISC IA32_MCi_MISC register value ++# MCGSTATUS IA32_MCG_STATUS register value ++# MCGCAP IA32_MCG_CAP register value ++# For details on the register layout please see the Intel SDM http://www.intel.com/sdm ++# volume 3, chapter 15 ++# ++# note: will run as mcelog configured user ++# this can be changed in mcelog.conf ++ ++logger -s -p daemon.err -t mcelog "$MESSAGE" ++logger -s -p daemon.err -t mcelog "Location: $LOCATION" ++ ++[ -x ./unknown-error-trigger.local ] && . ./unknown-error-trigger.local ++ ++exit 0 +diff --git a/unknown.c b/unknown.c +new file mode 100644 +index 0000000..482c29e +--- /dev/null ++++ b/unknown.c +@@ -0,0 +1,82 @@ ++/* Copyright (C) 20014 Intel Corporation ++ Author: Rui Wang ++ Handle all other unknown error requests. ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details.
++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system. */ ++#define _GNU_SOURCE 1 ++#include ++#include ++#include ++#include ++#include ++#include "memutil.h" ++#include "mcelog.h" ++#include "config.h" ++#include "trigger.h" ++#include "unknown.h" ++ ++static char *unknown_trigger; ++ ++enum { ++ MAX_ENV = 20, ++}; ++ ++void unknown_setup(void) ++{ ++ unknown_trigger = config_string("socket", "unknown-threshold-trigger"); ++ if (unknown_trigger && trigger_check(unknown_trigger) < 0) { ++ SYSERRprintf("Cannot access unknown threshold trigger `%s'", ++ unknown_trigger); ++ exit(1); ++ } ++} ++ ++void run_unknown_trigger(int socket, int cpu, struct mce *log) ++{ ++ int ei = 0; ++ char *env[MAX_ENV]; ++ int i; ++ char *msg; ++ char *location; ++ ++ if (socket >= 0) ++ asprintf(&location, "CPU %d on socket %d", cpu, socket); ++ else ++ asprintf(&location, "CPU %d", cpu); ++ asprintf(&msg, "%s received unknown error", location); ++ asprintf(&env[ei++], "LOCATION=%s", location); ++ free(location); ++ ++ if (!unknown_trigger) ++ goto out; ++ ++ if (socket >= 0) ++ asprintf(&env[ei++], "SOCKETID=%d", socket); ++ asprintf(&env[ei++], "MESSAGE=%s", msg); ++ asprintf(&env[ei++], "CPU=%d", cpu); ++ asprintf(&env[ei++], "STATUS=%llx", log->status); ++ asprintf(&env[ei++], "MISC=%llx", log->misc); ++ asprintf(&env[ei++], "ADDR=%llx", log->addr); ++ asprintf(&env[ei++], "MCGSTATUS=%llx", log->mcgstatus); ++ asprintf(&env[ei++], "MCGCAP=%llx", log->mcgcap); ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(unknown_trigger, NULL, env); ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++out: ++ free(msg); ++} ++ +diff --git a/unknown.h b/unknown.h +new file mode 100644 +index 0000000..0c6d876 +--- /dev/null ++++ b/unknown.h +@@ -0,0 +1,2 @@ ++void unknown_setup(void); ++void run_unknown_trigger(int socket, int cpu, struct mce *log); +diff --git a/yellow.c b/yellow.c +index 0f8ccd0..57978ee 100644 +--- a/yellow.c ++++ 
b/yellow.c +@@ -90,6 +90,8 @@ void run_yellow_trigger(int cpu, int tnum, int lnum, char *ts, char *ls, int soc + asprintf(&env[ei++], "TYPE=%s", ts); + if (cache_to_cpus(cpu, lnum, tnum, &cpumasklen, &cpumask) >= 0) + env[ei++] = cpulist("AFFECTED_CPUS=", cpumask, cpumasklen); ++ else ++ asprintf(&env[ei++], "AFFECTED_CPUS=unknown"); + env[ei] = NULL; + assert(ei < MAX_ENV); + diff --git a/SOURCES/mcelog-update-e4aca63.patch b/SOURCES/mcelog-update-e4aca63.patch new file mode 100644 index 0000000..b3a9636 --- /dev/null +++ b/SOURCES/mcelog-update-e4aca63.patch @@ -0,0 +1,1612 @@ +diff -urNp mcelog-d2e13bf0.orig/bitfield.c mcelog-d2e13bf0/bitfield.c +--- mcelog-d2e13bf0.orig/bitfield.c 2016-05-14 08:34:40.434107718 -0400 ++++ mcelog-d2e13bf0/bitfield.c 2016-05-14 08:34:58.868975011 -0400 +@@ -56,7 +56,7 @@ void decode_numfield(u64 status, struct + u64 v = (status >> f->start) & mask; + if (v > 0 || f->force) { + char fmt[30]; +- snprintf(fmt, 30, "%%s: %s\n", f->fmt ? f->fmt : "%Lu"); ++ snprintf(fmt, 30, "%%s: %s\n", f->fmt ? 
f->fmt : "%llu"); + Wprintf(fmt, f->name, v); + } + } +diff -urNp mcelog-d2e13bf0.orig/bitfield.h mcelog-d2e13bf0/bitfield.h +--- mcelog-d2e13bf0.orig/bitfield.h 2016-05-14 08:34:40.434107718 -0400 ++++ mcelog-d2e13bf0/bitfield.h 2016-05-14 08:34:58.869975058 -0400 +@@ -16,10 +16,10 @@ struct numfield { + #define FIELD(start_bit, name) { start_bit, name, NELE(name) } + #define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 } + +-#define NUMBER(start, end, name) { start, end, name, "%Lu", 0 } +-#define NUMBERFORCE(start, end, name) { start, end, name, "%Lu", 1 } +-#define HEXNUMBER(start, end, name) { start, end, name, "%Lx", 0 } +-#define HEXNUMBERFORCE(start, end, name) { start, end, name, "%Lx", 1 } ++#define NUMBER(start, end, name) { start, end, name, "%llu", 0 } ++#define NUMBERFORCE(start, end, name) { start, end, name, "%llu", 1 } ++#define HEXNUMBER(start, end, name) { start, end, name, "%llx", 0 } ++#define HEXNUMBERFORCE(start, end, name) { start, end, name, "%llx", 1 } + + void decode_bitfield(u64 status, struct field *fields); + void decode_numfield(u64 status, struct numfield *fields); +diff -urNp mcelog-d2e13bf0.orig/broadwell_de.c mcelog-d2e13bf0/broadwell_de.c +--- mcelog-d2e13bf0.orig/broadwell_de.c 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/broadwell_de.c 2016-05-14 08:34:58.869975058 -0400 +@@ -0,0 +1,104 @@ ++/* Copyright (C) 2015 Intel Corporation ++ Decode Intel Broadwell D specific machine check errors. ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. 
++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system; if not, write to the Free Software Foundation, ++ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ Author: Tony Luck ++*/ ++ ++#include "mcelog.h" ++#include "bitfield.h" ++#include "broadwell_de.h" ++#include "memdb.h" ++ ++/* See IA32 SDM Vol3B Table 16-24 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT", ++ [0x13] = "MC_DMI_TRAINING_TIMEOUT", ++ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX", ++ [0x25] = "MC_SVID_COMMAN_TIMEOUT", ++ [0x26] = "MCA_PKGC_DIRECT_WAKE_RING_TIMEOUT", ++ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID", ++ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x44] = "MC_CRITICAL_VR_FAILED", ++ [0x46] = "MC_VID_RAMP_DOWN_FAILED", ++ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED", ++ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0", ++ [0x4F] = "MC_SVID_COMMAND_ERROR", ++ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT", ++ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT", ++ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED", ++ [0x58] = "MC_SVID_IMON_REQUEST_FAILED", ++ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED", ++ [0x62] = "MC_INVALID_PKGS_RSP_QPI", ++ [0x64] = "MC_INVALID_PKG_STATE_CONFIG", ++ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT", ++ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT" ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-18 */ ++ ++static struct field memctrl_mc9[] = { ++ SBITFIELD(16, "Address parity error"), ++ SBITFIELD(17, "HA Wrt buffer Data parity error"), ++ SBITFIELD(18, "HA Wrt byte enable parity error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected 
spare error"), ++ SBITFIELD(23, "Corrected memory read error"), ++ SBITFIELD(24, "iMC, WDB, parity errors"), ++ {} ++}; ++ ++void bdw_de_decode_model(int cputype, int bank, u64 status, u64 misc) ++{ ++ switch (bank) { ++ case 4: ++ Wprintf("PCU: "); ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ Wprintf("Internal errors "); ++ break; ++ case 0x406: ++ Wprintf("Intel TXT errors "); ++ break; ++ case 0x407: ++ Wprintf("Other UBOX Internal errors "); ++ break; ++ } ++ if (EXTRACT(status, 16, 19) & 3) ++ Wprintf("PCU internal error "); ++ if (EXTRACT(status, 20, 23) & 4) ++ Wprintf("Ubox error "); ++ decode_bitfield(status, pcu_mc4); ++ break; ++ case 9: case 10: ++ Wprintf("MemCtrl: "); ++ decode_bitfield(status, memctrl_mc9); ++ break; ++ } ++} +diff -urNp mcelog-d2e13bf0.orig/broadwell_de.h mcelog-d2e13bf0/broadwell_de.h +--- mcelog-d2e13bf0.orig/broadwell_de.h 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/broadwell_de.h 2016-05-14 08:34:58.869975058 -0400 +@@ -0,0 +1,2 @@ ++void bdw_d_decode_model(int cputype, int bank, u64 status, u64 misc); ++void bdw_de_decode_model(int cputype, int bank, u64 status, u64 misc); +diff -urNp mcelog-d2e13bf0.orig/broadwell_epex.c mcelog-d2e13bf0/broadwell_epex.c +--- mcelog-d2e13bf0.orig/broadwell_epex.c 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/broadwell_epex.c 2016-05-14 08:34:58.869975058 -0400 +@@ -0,0 +1,149 @@ ++/* Copyright (C) 2015 Intel Corporation ++ Decode Intel Broadwell specific machine check errors. ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. 
++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system; if not, write to the Free Software Foundation, ++ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ Author: Tony Luck ++*/ ++ ++#include "mcelog.h" ++#include "bitfield.h" ++#include "broadwell_epex.h" ++#include "memdb.h" ++ ++/* See IA32 SDM Vol3B Table 16-20 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT", ++ [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT", ++ [0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT", ++ [0x13] = "MC_DMI_TRAINING_TIMEOUT", ++ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX", ++ [0x25] = "MC_SVID_COMMAN_TIMEOUT", ++ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID", ++ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF", ++ [0x44] = "MC_CRITICAL_VR_FAILED", ++ [0x45] = "MC_ICC_MAX_NOTSUPPORTED", ++ [0x46] = "MC_VID_RAMP_DOWN_FAILED", ++ [0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP", ++ [0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED", ++ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED", ++ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0", ++ [0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1", ++ [0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2", ++ [0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3", ++ [0x4F] = "MC_SVID_COMMAND_ERROR", ++ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT", ++ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT", ++ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED", ++ [0x58] = "MC_SVID_IMON_REQUEST_FAILED", ++ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED", ++ [0x60] = "MC_INVALID_PKGS_REQ_PCH", ++ [0x61] = "MC_INVALID_PKGS_REQ_QPI", ++ [0x62] = "MC_INVALID_PKGS_RSP_QPI", ++ [0x63] = "MC_INVALID_PKGS_RSP_PCH", ++ [0x64] = "MC_INVALID_PKG_STATE_CONFIG", ++ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT", ++ [0x68] = "MC_IMC_RW_SMBUS_TIMEOUT", ++ [0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED", ++ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE", 
++ [0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER", ++ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ", ++ [0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT", ++ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT" ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-21 */ ++ ++static char *qpi[] = { ++ [0x02] = "Intel QPI physical layer detected drift buffer alarm", ++ [0x03] = "Intel QPI physical layer detected latency buffer rollover", ++ [0x10] = "Intel QPI link layer detected control error from R3QPI", ++ [0x11] = "Rx entered LLR abort state on CRC error", ++ [0x12] = "Unsupported or undefined packet", ++ [0x13] = "Intel QPI link layer control error", ++ [0x15] = "RBT used un-initialized value", ++ [0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization", ++ [0x21] = "Link failover data self healing", ++ [0x22] = "Phy detected in-band reset (no width change)", ++ [0x23] = "Link failover clock failover", ++ [0x30] = "Rx detected CRC error - successful LLR after Phy re-init", ++ [0x31] = "Rx detected CRC error - successful LLR without Phy re-init", ++}; ++ ++static struct field qpi_mc[] = { ++ FIELD(16, qpi), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-26 */ ++ ++static struct field memctrl_mc9[] = { ++ SBITFIELD(16, "DDR3 address parity error"), ++ SBITFIELD(17, "Uncorrected HA write data error"), ++ SBITFIELD(18, "Uncorrected HA data byte enable error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ SBITFIELD(24, "iMC write data buffer parity error"), ++ SBITFIELD(25, "DDR4 command address parity error"), ++ {} ++}; ++ ++void bdw_epex_decode_model(int cputype, int bank, u64 status, u64 misc) ++{ ++ switch (bank) { ++ case 4: ++ Wprintf("PCU: "); ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 
0x402: case 0x403: ++ Wprintf("Internal errors "); ++ break; ++ case 0x406: ++ Wprintf("Intel TXT errors "); ++ break; ++ case 0x407: ++ Wprintf("Other UBOX Internal errors "); ++ break; ++ } ++ if (EXTRACT(status, 16, 19)) ++ Wprintf("PCU internal error "); ++ decode_bitfield(status, pcu_mc4); ++ break; ++ case 5: ++ case 20: ++ case 21: ++ Wprintf("QPI: "); ++ decode_bitfield(status, qpi_mc); ++ break; ++ case 9: case 10: case 11: case 12: ++ case 13: case 14: case 15: case 16: ++ Wprintf("MemCtrl: "); ++ decode_bitfield(status, memctrl_mc9); ++ break; ++ } ++} +diff -urNp mcelog-d2e13bf0.orig/broadwell_epex.h mcelog-d2e13bf0/broadwell_epex.h +--- mcelog-d2e13bf0.orig/broadwell_epex.h 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/broadwell_epex.h 2016-05-14 08:34:58.869975058 -0400 +@@ -0,0 +1 @@ ++void bdw_epex_decode_model(int cputype, int bank, u64 status, u64 misc); +diff -urNp mcelog-d2e13bf0.orig/bus.c mcelog-d2e13bf0/bus.c +--- mcelog-d2e13bf0.orig/bus.c 2016-05-14 08:34:40.446108283 -0400 ++++ mcelog-d2e13bf0/bus.c 2016-05-14 08:34:58.869975058 -0400 +@@ -58,6 +58,9 @@ void run_bus_trigger(int socket, int cpu + char *msg; + char *location; + ++ if (!bus_trigger) ++ return; ++ + if (socket >= 0) + asprintf(&location, "CPU %d on socket %d", cpu, socket); + else +@@ -67,9 +70,6 @@ void run_bus_trigger(int socket, int cpu + asprintf(&env[ei++], "LOCATION=%s", location); + free(location); + +- if (!bus_trigger) +- goto out; +- + if (socket >= 0) + asprintf(&env[ei++], "SOCKETID=%d", socket); + asprintf(&env[ei++], "MESSAGE=%s", msg); +@@ -85,7 +85,6 @@ void run_bus_trigger(int socket, int cpu + run_trigger(bus_trigger, NULL, env); + for (i = 0; i < ei; i++) + free(env[i]); +-out: + free(msg); + } + +@@ -97,6 +96,9 @@ void run_iomca_trigger(int socket, int c + char *msg; + char *location; + ++ if (!iomca_trigger) ++ return; ++ + if (socket >= 0) + asprintf(&location, "CPU %d on socket %d", cpu, socket); + else +@@ -106,9 +108,6 @@ void 
run_iomca_trigger(int socket, int c + asprintf(&env[ei++], "LOCATION=%s", location); + free(location); + +- if (!iomca_trigger) +- goto out; +- + if (socket >= 0) + asprintf(&env[ei++], "SOCKETID=%d", socket); + asprintf(&env[ei++], "MESSAGE=%s", msg); +@@ -123,7 +122,6 @@ void run_iomca_trigger(int socket, int c + run_trigger(iomca_trigger, NULL, env); + for (i = 0; i < ei; i++) + free(env[i]); +-out: + free(msg); + + } +diff -urNp mcelog-d2e13bf0.orig/cache.c mcelog-d2e13bf0/cache.c +--- mcelog-d2e13bf0.orig/cache.c 2016-05-14 08:34:40.434107718 -0400 ++++ mcelog-d2e13bf0/cache.c 2016-05-14 08:34:58.869975058 -0400 +@@ -97,9 +97,14 @@ static void parse_cpumap(char *map, unsi + static void read_cpu_map(struct cache *c, char *cfn) + { + char *map = read_field(cfn, "shared_cpu_map"); ++ if (map[0] == 0) { ++ c->cpumap = NULL; ++ goto out; ++ } + c->cpumaplen = cpumap_len(map); + c->cpumap = xalloc(c->cpumaplen); + parse_cpumap(map, c->cpumap, c->cpumaplen); ++out: + free(map); + } + +diff -urNp mcelog-d2e13bf0.orig/core2.c mcelog-d2e13bf0/core2.c +--- mcelog-d2e13bf0.orig/core2.c 2016-05-14 08:34:40.434107718 -0400 ++++ mcelog-d2e13bf0/core2.c 2016-05-14 08:34:58.869975058 -0400 +@@ -69,7 +69,7 @@ static struct field p6old_status[] = { + FIELD(31, reserved_1bit), + FIELD(32, reserved_3bits), + SBITFIELD(35, "BINIT received from external bus"), +- SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"), ++ SBITFIELD(37, "Received hard error response on split transaction (Bus BINIT)"), + {} + }; + +diff -urNp mcelog-d2e13bf0.orig/dimm.c mcelog-d2e13bf0/dimm.c +--- mcelog-d2e13bf0.orig/dimm.c 2016-05-14 08:34:40.435107765 -0400 ++++ mcelog-d2e13bf0/dimm.c 2016-05-14 08:34:58.870975105 -0400 +@@ -351,14 +351,14 @@ static void run_trigger(char *trigger, c + Eprintf("Cannot run error trigger %s for %s\n", trigger, loc); + open_dimm_db(NULL); + } +-void new_error(unsigned long addr, unsigned long max_error, char *trigger) ++void new_error(unsigned 
long long addr, unsigned long max_error, char *trigger) + { + struct dmi_memdev **devs; + int i; + + devs = dmi_find_addr(addr); + if (devs[0] == NULL) { +- Wprintf("No memory found for address %lx\n", addr); ++ Wprintf("No memory found for address %llx\n", addr); + exit(1); + } + for (i = 0; devs[i]; i++) { +@@ -366,7 +366,7 @@ void new_error(unsigned long addr, unsig + char *loc = dmi_getstring(&d->header, d->device_locator); + struct group *g = find_entry(dimm_db, NULL, "Locator", loc); + if (!g) { // shouldn't happen +- Eprintf("No record found for %lx\n", addr); ++ Eprintf("No record found for %llx\n", addr); + return; + } + unsigned long val = inc_val(g, "corrected errors"); +diff -urNp mcelog-d2e13bf0.orig/dimm.h mcelog-d2e13bf0/dimm.h +--- mcelog-d2e13bf0.orig/dimm.h 2016-05-14 08:34:40.435107765 -0400 ++++ mcelog-d2e13bf0/dimm.h 2016-05-14 08:34:58.870975105 -0400 +@@ -1,6 +1,6 @@ + void close_dimm_db(void); + int open_dimm_db(char *fn); +-void new_error(unsigned long addr, unsigned long max_error, char *trigger); ++void new_error(unsigned long long addr, unsigned long max_error, char *trigger); + void reset_dimm(char *locator); + void gc_dimms(void); + void dump_all_dimms(void); +diff -urNp mcelog-d2e13bf0.orig/dmi.c mcelog-d2e13bf0/dmi.c +--- mcelog-d2e13bf0.orig/dmi.c 2016-05-14 08:34:40.446108283 -0400 ++++ mcelog-d2e13bf0/dmi.c 2016-05-14 08:34:58.870975105 -0400 +@@ -1,6 +1,8 @@ + /* Copyright (C) 2006 Andi Kleen, SuSE Labs. ++ Portions Copyright (C) 2016 Sergio Gelato. ++ + Use SMBIOS/DMI to map address to DIMM description. 
+- For reference see the SMBIOS specification 2.4 ++ For reference see the SMBIOS specification 2.4, 3.0 + + dmi is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public +@@ -55,9 +57,9 @@ struct anchor { + } __attribute__((packed)); + + static struct dmi_entry *entries; +-static int entrieslen; ++static size_t entrieslen; + static int numentries; +-static int dmi_length; ++static size_t dmi_length; + static struct dmi_entry **handle_to_entry; + + struct dmi_memdev **dmi_dimms; +@@ -137,6 +139,59 @@ static void fill_handles(void) + } + } + ++static int append_sysfs_dmi_entry(unsigned char type, int instance) ++{ ++ char filename[64]; /* 40 bytes should be enough */ ++ char buf[1024]; ++ int r; ++ ssize_t nr; ++ size_t l; ++ int fd; ++ r = snprintf(filename, sizeof(filename), ++ "/sys/firmware/dmi/entries/%hhu-%d/raw", ++ type, instance); ++ if (r < 0 || (unsigned int)r >= sizeof(filename)) { ++ Eprintf("Can't build pathname for DMI type %hhu instance %d\n", ++ type, instance); ++ return 0; ++ } ++ fd = open(filename, O_RDONLY); ++ if (fd == (-1)) { ++ if (errno != ENOENT) ++ perror(filename); ++ return 0; ++ } ++ l = dmi_length; ++ for (;;) { ++ nr = read(fd, buf, sizeof(buf)); ++ if (nr < 0) { ++ if (errno == EINTR) ++ continue; ++ perror(filename); ++ close(fd); ++ return 0; ++ } else if (nr > 0) { ++ while (l + nr > entrieslen) { ++ entrieslen += 4096; ++ entries = xrealloc(entries, entrieslen); ++ } ++ memcpy((char *)entries+l, buf, nr); ++ l += nr; ++ } else { ++ numentries ++; ++ dmi_length = l; ++ close(fd); ++ return 1; ++ } ++ } ++} ++ ++static void append_sysfs_dmi_entries(unsigned char type) ++{ ++ int i; ++ for (i=0; append_sysfs_dmi_entry(type, i); i++) ; ++} ++ + static int get_efi_base_addr(size_t *address) + { + FILE *efi_systab; +@@ -190,10 +245,12 @@ check_symbol: + int opendmi(void) + { + struct anchor *a, *abase; ++ void *ebase; + void *p, *q; + int pagesize = getpagesize(); + int memfd; +- unsigned 
corr; ++ off_t emapbase, corr; ++ size_t emapsize; + int err = -1; + const int segsize = 0x10000; + size_t entry_point_addr = 0; +@@ -201,6 +258,18 @@ int opendmi(void) + + if (entries) + return 0; ++ ++ if (access("/sys/firmware/dmi/entries/0-0/raw", R_OK) == 0) { ++ numentries = 0; ++ append_sysfs_dmi_entries(DMI_MEMORY_ARRAY); ++ append_sysfs_dmi_entries(DMI_MEMORY_DEVICE); ++ append_sysfs_dmi_entries(DMI_MEMORY_ARRAY_ADDR); ++ append_sysfs_dmi_entries(DMI_MEMORY_MAPPED_ADDR); ++ fill_handles(); ++ collect_dmi_dimms(); ++ return 0; ++ } ++ + memfd = open("/dev/mem", O_RDONLY); + if (memfd < 0) { + Eprintf("Cannot open /dev/mem for DMI decoding: %s", +@@ -228,8 +297,6 @@ int opendmi(void) + } + a = (struct anchor*)((char*)abase + (entry_point_addr - addr_start)); + goto fill_entries; +- } else { +- return -1; + } + + legacy: +@@ -264,17 +331,18 @@ fill_entries: + if (verbose) + printf("DMI tables at %x, %u bytes, %u entries\n", + a->table, a->length, a->numentries); +- corr = a->table - round_down(a->table, pagesize); +- entrieslen = round_up(a->table + a->length, pagesize) - +- round_down(a->table, pagesize); +- entries = mmap(NULL, entrieslen, +- PROT_READ, MAP_SHARED, memfd, +- round_down(a->table, pagesize)); +- if (entries == (struct dmi_entry *)-1) { ++ emapbase = round_down(a->table, pagesize); ++ corr = a->table - emapbase; ++ emapsize = round_up(a->table + a->length, pagesize) - emapbase; ++ ebase = mmap(NULL, emapsize, PROT_READ, MAP_SHARED, memfd, emapbase); ++ if (ebase == MAP_FAILED) { + Eprintf("Cannot mmap SMBIOS tables at %x", a->table); + goto out_mmap; + } +- entries = (struct dmi_entry *)(((char *)entries) + corr); ++ entrieslen = a->length; ++ entries = xalloc_nonzero(entrieslen); ++ memcpy(entries, (char *)ebase+corr, entrieslen); ++ munmap(ebase, emapsize); + numentries = a->numentries; + dmi_length = a->length; + fill_handles(); +@@ -307,13 +375,15 @@ static char *form_factors[] = { + "?", + "Other", "Unknown", "SIMM", "SIP", "Chip", "DIP", 
"ZIP", + "Proprietary Card", "DIMM", "TSOP", "Row of chips", "RIMM", +- "SODIMM", "SRIMM" ++ "SODIMM", "SRIMM", "FB-DIMM" + }; + static char *memory_types[] = { + "?", + "Other", "Unknown", "DRAM", "EDRAM", "VRAM", "SRAM", "RAM", + "ROM", "FLASH", "EEPROM", "FEPROM", "EPROM", "CDRAM", "3DRAM", +- "SDRAM", "SGRAM", "RDRAM", "DDR", "DDR2" ++ "SDRAM", "SGRAM", "RDRAM", "DDR", "DDR2", "DDR2 FB-DIMM", ++ "Reserved 0x15", "Reserved 0x16", "Reserved 0x17", "DDR3", ++ "FBD2", "DDR4", "LPDDR", "LPDDR2", "LPDDR3", "LPDDR4" + }; + + #define LOOKUP(array, val, buf) \ +@@ -324,7 +394,8 @@ static char *memory_types[] = { + static char *type_details[16] = { + "Reserved", "Other", "Unknown", "Fast-paged", "Static Column", + "Pseudo static", "RAMBUS", "Synchronous", "CMOS", "EDO", +- "Window DRAM", "Cache DRAM", "Non-volatile", "Res13", "Res14", "Res15" ++ "Window DRAM", "Cache DRAM", "Non-volatile", "Registered", ++ "Unbuffered", "LRDIMM" + }; + + static void dump_type_details(unsigned short td) +@@ -337,7 +408,7 @@ static void dump_type_details(unsigned s + Wprintf("%s ", type_details[i]); + } + +-static void dump_memdev(struct dmi_memdev *md, unsigned long addr) ++static void dump_memdev(struct dmi_memdev *md, unsigned long long addr) + { + char tmp[20]; + char unit[10]; +@@ -346,7 +417,7 @@ static void dump_memdev(struct dmi_memde + if (md->header.length < + offsetof(struct dmi_memdev, manufacturer)) { + if (verbose > 0) +- printf("Memory device for address %lx too short %u\n", ++ printf("Memory device for address %llx too short %u\n", + addr, md->header.length); + return; + } +@@ -500,7 +571,7 @@ int dmi_sanity_check(void) + dmi_dimms[i]->device_locator); + if (!strcmp(b, loc)) { + if (verbose > 0) +- printf("Ambigious locators `%s'<->`%s'." ++ printf("Ambiguous locators `%s'<->`%s'." 
+ FAILED, b, loc); + return 0; + } +@@ -538,7 +609,7 @@ dump_ranges(struct dmi_memdev_addr **ran + DMIGET(dmi_dimms[i],device_set)); + } + +-struct dmi_memdev **dmi_find_addr(unsigned long addr) ++struct dmi_memdev **dmi_find_addr(unsigned long long addr) + { + struct dmi_memdev **devs; + int i, k; +@@ -582,7 +653,7 @@ struct dmi_memdev **dmi_find_addr(unsign + return devs; + } + +-void dmi_decodeaddr(unsigned long addr) ++void dmi_decodeaddr(unsigned long long addr) + { + struct dmi_memdev **devs = dmi_find_addr(addr); + if (devs[0]) { +@@ -591,7 +662,7 @@ void dmi_decodeaddr(unsigned long addr) + for (i = 0; devs[i]; i++) + dump_memdev(devs[i], addr); + } else { +- Wprintf("No DIMM found for %lx in SMBIOS\n", addr); ++ Wprintf("No DIMM found for %llx in SMBIOS\n", addr); + } + free(devs); + } +@@ -625,11 +696,11 @@ void closedmi(void) + { + if (!entries) + return; +- munmap(entries, entrieslen); +- entries = NULL; + FREE(dmi_dimms); + FREE(dmi_arrays); + FREE(dmi_ranges); + FREE(dmi_array_ranges); + FREE(handle_to_entry); ++ FREE(entries); ++ entrieslen = 0; + } +diff -urNp mcelog-d2e13bf0.orig/dmi.h mcelog-d2e13bf0/dmi.h +--- mcelog-d2e13bf0.orig/dmi.h 2016-05-14 08:34:40.435107765 -0400 ++++ mcelog-d2e13bf0/dmi.h 2016-05-14 08:34:58.870975105 -0400 +@@ -62,10 +62,10 @@ struct dmi_memarray_addr { + } __attribute__((packed)); + + int opendmi(void); +-void dmi_decodeaddr(unsigned long addr); ++void dmi_decodeaddr(unsigned long long addr); + int dmi_sanity_check(void); + unsigned dmi_dimm_size(unsigned short size, char *unit); +-struct dmi_memdev **dmi_find_addr(unsigned long addr); ++struct dmi_memdev **dmi_find_addr(unsigned long long addr); + void dmi_set_verbosity(int v); + + char *dmi_getstring(struct dmi_entry *e, unsigned number); +diff -urNp mcelog-d2e13bf0.orig/genconfig.py mcelog-d2e13bf0/genconfig.py +--- mcelog-d2e13bf0.orig/genconfig.py 2016-05-14 08:34:40.436107812 -0400 ++++ mcelog-d2e13bf0/genconfig.py 2016-05-14 08:34:58.870975105 -0400 +@@ -66,7 
+66,7 @@ def new_option(): + + + print """ +-.\" Auto generated mcelog.conf manpage. Do not edit. ++.\\" Auto generated mcelog.conf manpage. Do not edit. + .TH "mcelog.conf" 5 "mcelog" + """ + +diff -urNp mcelog-d2e13bf0.orig/haswell.c mcelog-d2e13bf0/haswell.c +--- mcelog-d2e13bf0.orig/haswell.c 2016-05-14 08:34:40.445108236 -0400 ++++ mcelog-d2e13bf0/haswell.c 2016-05-14 08:34:58.870975105 -0400 +@@ -91,7 +91,7 @@ static char *qpi[] = { + [0x22] = "Phy detected in-band reset (no width change)", + [0x23] = "Link failover clock failover", + [0x30] = "Rx detected CRC error - successful LLR after Phy re-init", +- [0x31] = "Rx detected CRC error - successful LLR wihout Phy re-init", ++ [0x31] = "Rx detected CRC error - successful LLR without Phy re-init", + }; + + static struct field qpi_mc[] = { +diff -urNp mcelog-d2e13bf0.orig/intel.c mcelog-d2e13bf0/intel.c +--- mcelog-d2e13bf0.orig/intel.c 2016-05-14 08:34:40.434107718 -0400 ++++ mcelog-d2e13bf0/intel.c 2016-05-14 08:36:22.435906530 -0400 +@@ -35,7 +35,8 @@ void intel_cpu_init(enum cputype cpu) + cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP || + cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX || + cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX || cpu == CPU_BROADWELL || +- cpu == CPU_KNIGHTS_LANDING) ++ cpu == CPU_BROADWELL_DE || cpu == CPU_BROADWELL_EPEX || ++ cpu == CPU_KNIGHTS_LANDING || cpu == CPU_SKYLAKE || cpu == CPU_SKYLAKE_XEON) + memory_error_support = 1; + } + +@@ -73,15 +74,23 @@ enum cputype select_intel_cputype(int fa + return CPU_HASWELL; + else if (model == 0x3f) + return CPU_HASWELL_EPEX; +- else if (model == 0x3d || model == 0x56) ++ else if (model == 0x3d) + return CPU_BROADWELL; +- else if (model == 0x57) ++ else if (model == 0x4f) ++ return CPU_BROADWELL_EPEX; ++ else if (model == 0x56) ++ return CPU_BROADWELL_DE; ++ else if (model == 0x57) + return CPU_KNIGHTS_LANDING; + else if (model == 0x1c || model == 0x26 || model == 0x27 || + model == 0x35 || model == 0x36 || model == 0x36 || 
+ model == 0x37 || model == 0x4a || model == 0x4c || + model == 0x4d || model == 0x5a || model == 0x5d) + return CPU_ATOM; ++ else if (model == 0x4e || model == 0x5e) ++ return CPU_SKYLAKE; ++ else if (model == 0x55) ++ return CPU_SKYLAKE_XEON; + if (model > 0x1a) { + Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n", + model); +diff -urNp mcelog-d2e13bf0.orig/intel.h mcelog-d2e13bf0/intel.h +--- mcelog-d2e13bf0.orig/intel.h 2016-05-14 08:34:40.444108189 -0400 ++++ mcelog-d2e13bf0/intel.h 2016-05-14 08:34:58.871975152 -0400 +@@ -21,5 +21,10 @@ extern int memory_error_support; + case CPU_HASWELL: \ + case CPU_HASWELL_EPEX: \ + case CPU_BROADWELL: \ +- case CPU_KNIGHTS_LANDING ++ case CPU_BROADWELL_DE: \ ++ case CPU_BROADWELL_EPEX: \ ++ case CPU_ATOM: \ ++ case CPU_KNIGHTS_LANDING: \ ++ case CPU_SKYLAKE: \ ++ case CPU_SKYLAKE_XEON + +diff -urNp mcelog-d2e13bf0.orig/k8.c mcelog-d2e13bf0/k8.c +--- mcelog-d2e13bf0.orig/k8.c 2016-05-14 08:34:40.437107859 -0400 ++++ mcelog-d2e13bf0/k8.c 2016-05-14 08:34:58.871975152 -0400 +@@ -89,7 +89,7 @@ static char *highbits[32] = { + [0] = "err cpu0", + }; + static char *k8threshold[] = { +- [0 ... K8_MCELOG_THRESHOLD_DRAM_ECC - 1] = "Unknow threshold counter", ++ [0 ... 
K8_MCELOG_THRESHOLD_DRAM_ECC - 1] = "Unknown threshold counter", + [K8_MCELOG_THRESHOLD_DRAM_ECC] = "MC4_MISC0 DRAM threshold", + [K8_MCELOG_THRESHOLD_LINK] = "MC4_MISC1 Link threshold", + [K8_MCELOG_THRESHOLD_L3_CACHE] = "MC4_MISC2 L3 Cache threshold", +diff -urNp mcelog-d2e13bf0.orig/Makefile mcelog-d2e13bf0/Makefile +--- mcelog-d2e13bf0.orig/Makefile 2016-05-14 08:34:40.447108330 -0400 ++++ mcelog-d2e13bf0/Makefile 2016-05-14 08:34:58.868975011 -0400 +@@ -30,15 +30,18 @@ TRIGGERS=cache-error-trigger dimm-error- + + all: mcelog + +-.PHONY: install clean depend ++.PHONY: install clean depend FORCE + + OBJ := p4.o k8.o mcelog.o dmi.o tsc.o core2.o bitfield.o intel.o \ + nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \ + eventloop.o leaky-bucket.o memdb.o server.o trigger.o \ + client.o cache.o sysfs.o yellow.o page.o rbtree.o \ +- xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o bus.o unknown.o ++ xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o \ ++ broadwell_de.o broadwell_epex.o skylake_xeon.o \ ++ msr.o bus.o unknown.o + DISKDB_OBJ := diskdb.o dimm.o db.o +-CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} ++CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} \ ++ version.o version.c version.tmp + DOC := mce.pdf + + ADD_DEFINES := +@@ -52,7 +55,7 @@ endif + + SRC := $(OBJ:.o=.c) + +-mcelog: ${OBJ} ++mcelog: ${OBJ} version.o + + # dbquery intentionally not installed by default + install: mcelog mcelog.conf mcelog.conf.5 mcelog.triggers.5 +@@ -80,7 +83,7 @@ clean: test-clean + rm -f ${CLEAN} ${OBJ} + + tsc: tsc.c +- gcc -o tsc ${CFLAGS} -DSTANDALONE tsc.c ${LDFLAGS} ++ $(CC) -o tsc ${CFLAGS} -DSTANDALONE tsc.c ${LDFLAGS} + + dbquery: db.o dbquery.o memutil.o + +@@ -89,6 +92,21 @@ depend: .depend + %.o: %.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(WARNINGS) $(ADD_DEFINES) -o $@ $< + ++version.tmp: FORCE ++ ( echo -n "char version[] = \"" ; \ ++ if type -p git >/dev/null; then \ ++ if [ -d .git ] ; then \ ++ 
git describe --tags HEAD | tr -d '\n'; \ ++ else \ ++ echo -n "unknown" ; \ ++ fi ; \ ++ else echo -n "unknown" ; fi ; \ ++ echo '";' \ ++ ) > version.tmp ++ ++version.c: version.tmp ++ cmp version.tmp version.c || mv version.tmp version.c ++ + .depend: ${SRC} + ${CC} -MM -I. ${SRC} > .depend.X && mv .depend.X .depend + +@@ -111,7 +129,7 @@ src: + echo $(SRC) + + config-test: config.c +- gcc -DTEST=1 config.c -o config-test ++ $(CC) -DTEST=1 config.c -o config-test + + test: + $(MAKE) -C tests test DEBUG="" +diff -urNp mcelog-d2e13bf0.orig/mcelog.8 mcelog-d2e13bf0/mcelog.8 +--- mcelog-d2e13bf0.orig/mcelog.8 2016-05-14 08:34:40.437107859 -0400 ++++ mcelog-d2e13bf0/mcelog.8 2016-05-14 08:34:58.871975152 -0400 +@@ -16,6 +16,8 @@ mcelog [options] \-\-ascii + .\".br + .\"mcelog [options] \-\-dump-memory[=locator] + .br ++mcelog [options] \-\-is\-cpu\-supported ++.br + mcelog \-\-version + .SH DESCRIPTION + X86 CPUs report errors detected by the CPU as +@@ -81,6 +83,10 @@ store it anymore (different from + so the output should be always saved somewhere and mcelog + not run in uncontrolled ways. + ++When invoked with the ++.I \-\-is\-cpu\-supported ++option mcelog exits with code 0 if the current CPU is supported, 1 otherwise. ++ + .SH OPTIONS + When the + .B \-\-syslog +@@ -294,7 +300,7 @@ For more information on the config file + The kernel prefers old messages over new. If the log buffer overflows + only old ones will be kept. + +-The exact output in the log file depends on the CPU, unless the --raw option is used. ++The exact output in the log file depends on the CPU, unless the \-\-raw option is used. + + mcelog will report serious errors to the syslog during decoding. 
+ +diff -urNp mcelog-d2e13bf0.orig/mcelog.c mcelog-d2e13bf0/mcelog.c +--- mcelog-d2e13bf0.orig/mcelog.c 2016-05-14 08:34:40.444108189 -0400 ++++ mcelog-d2e13bf0/mcelog.c 2016-05-14 08:37:03.210824839 -0400 +@@ -85,6 +85,7 @@ static char *pidfile = pidfile_default; + static char *logfile; + static int debug_numerrors; + int imc_log = -1; ++static int check_only = 0; + + static int is_cpu_supported(void); + +@@ -131,7 +132,7 @@ static char *bankname(unsigned bank) + } + } + +-static void resolveaddr(unsigned long addr) ++static void resolveaddr(unsigned long long addr) + { + if (addr && do_dmi && dmi_forced) + dmi_decodeaddr(addr); +@@ -232,8 +233,12 @@ static char *cputype_name[] = { + [CPU_HASWELL] = "Haswell", /* Fill in better name */ + [CPU_HASWELL_EPEX] = "Haswell EP/EX", /* Fill in better name */ + [CPU_BROADWELL] = "Broadwell", ++ [CPU_BROADWELL_DE] = "Intel Xeon (Broadwell) D family", ++ [CPU_BROADWELL_EPEX] = "Intel Xeon v4 (Broadwell) EP/EX", + [CPU_KNIGHTS_LANDING] = "Knights Landing", + [CPU_ATOM] = "ATOM", ++ [CPU_SKYLAKE] = "Skylake", ++ [CPU_SKYLAKE_XEON] = "Skylake server", + }; + + static struct config_choice cpu_choices[] = { +@@ -273,8 +278,14 @@ static struct config_choice cpu_choices[ + { "haswell-ep", CPU_HASWELL_EPEX }, /* Fill in better name */ + { "haswell-ex", CPU_HASWELL_EPEX }, /* Fill in better name */ + { "broadwell", CPU_BROADWELL }, ++ { "broadwell-d", CPU_BROADWELL_DE }, ++ { "broadwell-ep", CPU_BROADWELL_EPEX }, ++ { "broadwell-ex", CPU_BROADWELL_EPEX }, + { "knightslanding", CPU_KNIGHTS_LANDING }, ++ { "xeon-v4", CPU_BROADWELL_EPEX }, + { "atom", CPU_ATOM }, ++ { "skylake", CPU_SKYLAKE }, ++ { "skylake_server", CPU_SKYLAKE_XEON }, + { NULL } + }; + +@@ -437,7 +448,9 @@ static void dump_mce(struct mce *m, unsi + } + if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX && + cputype != CPU_HASWELL_EPEX && cputype != CPU_BROADWELL && +- cputype != CPU_KNIGHTS_LANDING) ++ cputype != CPU_BROADWELL_DE && cputype != 
CPU_BROADWELL_EPEX && ++ cputype != CPU_KNIGHTS_LANDING && cputype != CPU_SKYLAKE && ++ cputype != CPU_SKYLAKE_XEON) + resolveaddr(m->addr); + if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) { + diskdb_resolve_addr(m->addr); +@@ -916,22 +929,35 @@ void usage(void) + { + fprintf(stderr, + "Usage:\n" ++"\n" + " mcelog [options] [mcelogdevice]\n" + "Decode machine check error records from current kernel.\n" ++"\n" + " mcelog [options] --daemon\n" + "Run mcelog in daemon mode, waiting for errors from the kernel.\n" ++"\n" + " mcelog [options] --client\n" + "Query a currently running mcelog daemon for errors\n" ++"\n" + " mcelog [options] --ascii < log\n" + " mcelog [options] --ascii --file log\n" + "Decode machine check ASCII output from kernel logs\n" ++"\n" + "Options:\n" ++"--version Show the version of mcelog and exit\n" + "--cpu CPU Set CPU type CPU to decode (see below for valid types)\n" ++"--intel-cpu FAMILY,MODEL Set CPU type for an Intel CPU based on family and model from cpuid\n" ++"--k8 Set the CPU to be an AMD K8\n" ++"--p4 Set the CPU to be an Intel Pentium4\n" ++"--core2 Set the CPU to be an Intel Core2\n" ++"--generic Set the CPU to a generic version\n" + "--cpumhz MHZ Set CPU Mhz to decode time (output unreliable, not needed on new kernels)\n" + "--raw (with --ascii) Dump in raw ASCII format for machine processing\n" + "--daemon Run in background waiting for events (needs newer kernel)\n" ++"--client Query a currently running mcelog daemon for errors\n" + "--ignorenodev Exit silently when the device cannot be opened\n" + "--file filename With --ascii read machine check log from filename instead of stdin\n" ++"--logfile filename Log decoded machine checks in file filename\n" + "--syslog Log decoded machine checks in syslog (default stdout or syslog for daemon)\n" + "--syslog-error Log decoded machine checks in syslog with error level\n" + "--no-syslog Never log anything to syslog\n" +@@ -946,8 +972,10 @@ void usage(void) + "--num-errors N 
Only process N errors (for testing)\n" + "--pidfile file Write pid of daemon into file\n" + "--no-imc-log Disable extended iMC logging\n" ++"--is-cpu-supported Exit with return code indicating whether the CPU is supported\n" + ); + diskdb_usage(); ++ printf("\n"); + print_cputypes(); + exit(1); + } +@@ -980,6 +1008,7 @@ enum options { + O_PIDFILE, + O_DEBUG_NUMERRORS, + O_NO_IMC_LOG, ++ O_IS_CPU_SUPPORTED, + }; + + static struct option options[] = { +@@ -1013,6 +1042,7 @@ static struct option options[] = { + { "pidfile", 1, NULL, O_PIDFILE }, + { "debug-numerrors", 0, NULL, O_DEBUG_NUMERRORS }, /* undocumented: for testing */ + { "no-imc-log", 0, NULL, O_NO_IMC_LOG }, ++ { "is-cpu-supported", 0, NULL, O_IS_CPU_SUPPORTED }, + DISKDB_OPTIONS + {} + }; +@@ -1115,6 +1145,9 @@ static int modifier(int opt) + case O_NO_IMC_LOG: + imc_log = 0; + break; ++ case O_IS_CPU_SUPPORTED: ++ check_only = 1; ++ break; + case 0: + break; + default: +@@ -1344,15 +1377,19 @@ int main(int ac, char **av) + + /* before doing anything else let's see if the CPUs are supported */ + if (!cpu_forced && !is_cpu_supported()) { +- fprintf(stderr, "CPU is unsupported\n"); ++ if (!check_only) ++ fprintf(stderr, "CPU is unsupported\n"); + exit(1); + } ++ if (check_only) ++ exit(0); + + /* If the user didn't tell us not to use iMC logging, check if CPU supports it */ + if (imc_log == -1) { + switch (cputype) { + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: ++ case CPU_HASWELL_EPEX: + imc_log = 1; + break; + default: +diff -urNp mcelog-d2e13bf0.orig/mcelog.conf mcelog-d2e13bf0/mcelog.conf +--- mcelog-d2e13bf0.orig/mcelog.conf 2016-05-14 08:34:40.445108236 -0400 ++++ mcelog-d2e13bf0/mcelog.conf 2016-05-14 08:34:58.872975199 -0400 +@@ -23,7 +23,7 @@ + # If this value is set incorrectly the decoded output will be likely incorrect. 
+ # By default when this parameter is not set mcelog uses the CPU it is running on + # on very new kernels the mcelog events reported by the kernel also carry +-# the CPU type which is used too when available and not overriden. ++# the CPU type which is used too when available and not overridden. + + # Enable daemon mode: + #daemon = yes +@@ -132,7 +132,7 @@ mem-ce-error-trigger = socket-memory-err + + mem-ce-error-threshold = 100 / 24h + +-# Log socket error threshold explicitely? ++# Log socket error threshold explicitly? + mem-ce-error-log = yes + + # Trigger script for uncorrected bus error events +@@ -148,7 +148,7 @@ unknown-threshold-trigger = unknown-erro + # Processing of cache error thresholds reported by Intel CPUs. + cache-threshold-trigger = cache-error-trigger + +-# Should cache threshold events be logged explicitely? ++# Should cache threshold events be logged explicitly? + cache-threshold-log = yes + + [page] +@@ -159,7 +159,7 @@ memory-ce-threshold = 10 / 24h + # Trigger script for corrected errors. + # memory-ce-trigger = page-error-trigger + +-# Should page threshold events be logged explicitely? ++# Should page threshold events be logged explicitly? + memory-ce-log = yes + + # specify the internal action in mcelog to exceeding a page error threshold +diff -urNp mcelog-d2e13bf0.orig/mcelog.conf.5 mcelog-d2e13bf0/mcelog.conf.5 +--- mcelog-d2e13bf0.orig/mcelog.conf.5 2016-05-14 08:34:40.437107859 -0400 ++++ mcelog-d2e13bf0/mcelog.conf.5 2016-05-14 08:34:58.872975199 -0400 +@@ -43,7 +43,7 @@ For valid values for type please see mce + If this value is set incorrectly the decoded output will be likely incorrect. + By default when this parameter is not set mcelog uses the CPU it is running on + on very new kernels the mcelog events reported by the kernel also carry +-the CPU type which is used too when available and not overriden. ++the CPU type which is used too when available and not overridden. 
+ .PP + .PP + Enable daemon mode: +@@ -204,7 +204,7 @@ Threshold on when to trigger a correct e + .B mem-ce-error-threshold = 100 / 24h + .PP + .PP +- log socket error threshold explicitely? ++ log socket error threshold explicitly? + .PP + .B mem-ce-error-log = yes + .PP +@@ -230,7 +230,7 @@ Processing of cache error thresholds rep + .B cache-threshold-trigger = cache-error-trigger + .PP + .PP +-Should cache threshold events be logged explicitely? ++Should cache threshold events be logged explicitly? + .PP + .B cache-threshold-log = yes + .PP +@@ -246,7 +246,7 @@ Trigger script for corrected errors. + memory-ce-trigger = page-error-trigger + .PP + .PP +-Should page threshold events be logged explicitely? ++Should page threshold events be logged explicitly? + .PP + .B memory-ce-log = yes + .PP +diff -urNp mcelog-d2e13bf0.orig/mcelog.h mcelog-d2e13bf0/mcelog.h +--- mcelog-d2e13bf0.orig/mcelog.h 2016-05-14 08:34:40.446108283 -0400 ++++ mcelog-d2e13bf0/mcelog.h 2016-05-14 08:34:58.872975199 -0400 +@@ -124,8 +124,12 @@ enum cputype { + CPU_HASWELL, + CPU_HASWELL_EPEX, + CPU_BROADWELL, ++ CPU_BROADWELL_DE, ++ CPU_BROADWELL_EPEX, + CPU_KNIGHTS_LANDING, + CPU_ATOM, ++ CPU_SKYLAKE, ++ CPU_SKYLAKE_XEON, + }; + + enum option_ranges { +diff -urNp mcelog-d2e13bf0.orig/memdb.c mcelog-d2e13bf0/memdb.c +--- mcelog-d2e13bf0.orig/memdb.c 2016-05-14 08:34:40.437107859 -0400 ++++ mcelog-d2e13bf0/memdb.c 2016-05-14 08:34:58.872975199 -0400 +@@ -379,6 +379,14 @@ parse_dimm_addr(char *bl, unsigned *sock + channel, dimm) == 3) + return 1; + /* Add more DMI formats here */ ++ /* For new AMI BIOS Node0_Bank0 */ ++ if (sscanf(bl, "Node%u_Bank%u", socketid, dimm) == 2) ++ return 1; ++ ++ /* For old AMI BIOS A1_BANK0*/ ++ if (sscanf(bl, "A%u_BANK%u", socketid, dimm) == 2) ++ return 1; ++ + return 0; + } + +diff -urNp mcelog-d2e13bf0.orig/msr.c mcelog-d2e13bf0/msr.c +--- mcelog-d2e13bf0.orig/msr.c 2016-05-14 08:34:40.438107906 -0400 ++++ mcelog-d2e13bf0/msr.c 2016-05-14 08:34:58.872975199 
-0400 +@@ -20,27 +20,28 @@ static void domsr(int cpu, int msr, int + return; + default: + SYSERRprintf("Cannot open %s to set imc_log\n", fpath); +- exit(1); ++ return; + } + } + if (pread(fd, &data, sizeof data, msr) != sizeof data) { + SYSERRprintf("Cannot read MSR_ERROR_CONTROL from %s\n", fpath); +- exit(1); ++ return; + } + data |= bit; + if (pwrite(fd, &data, sizeof data, msr) != sizeof data) { + SYSERRprintf("Cannot write MSR_ERROR_CONTROL to %s\n", fpath); +- exit(1); ++ return; + } + if (pread(fd, &data, sizeof data, msr) != sizeof data) { + SYSERRprintf("Cannot re-read MSR_ERROR_CONTROL from %s\n", fpath); +- exit(1); ++ return; + } + if ((data & bit) == 0) + Lprintf("No DIMM detection available on cpu %d (normal in virtual environments)\n", cpu); + close(fd); + } + ++/* XXX: assumes all CPUs are already onlined. */ + void set_imc_log(int cputype) + { + int cpu, ncpus = sysconf(_SC_NPROCESSORS_CONF); +@@ -49,6 +50,7 @@ void set_imc_log(int cputype) + switch (cputype) { + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: ++ case CPU_HASWELL_EPEX: + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; +diff -urNp mcelog-d2e13bf0.orig/nehalem.c mcelog-d2e13bf0/nehalem.c +--- mcelog-d2e13bf0.orig/nehalem.c 2016-05-14 08:34:40.438107906 -0400 ++++ mcelog-d2e13bf0/nehalem.c 2016-05-14 08:34:58.872975199 -0400 +@@ -124,13 +124,17 @@ static char *mmm_desc[] = { + "Reserved 7" + }; + +-void decode_memory_controller(u32 status) ++void decode_memory_controller(u32 status, u8 bank) + { + char channel[30]; + if ((status & 0xf) == 0xf) + strcpy(channel, "unspecified"); +- else +- sprintf(channel, "%u", status & 0xf); ++ else { ++ if (cputype == CPU_KNIGHTS_LANDING) /* Fix for Knights Landing MIC */ ++ sprintf(channel, "%u", (status & 0xf) + 3 * (bank == 15)); ++ else ++ sprintf(channel, "%u", status & 0xf); ++ } + Wprintf("MEMORY CONTROLLER %s_CHANNEL%s_ERR\n", + mmm_mnemonic[(status >> 4) & 7], + channel); +diff -urNp 
mcelog-d2e13bf0.orig/nehalem.h mcelog-d2e13bf0/nehalem.h +--- mcelog-d2e13bf0.orig/nehalem.h 2016-05-14 08:34:40.438107906 -0400 ++++ mcelog-d2e13bf0/nehalem.h 2016-05-14 08:34:58.872975199 -0400 +@@ -1,4 +1,4 @@ + void nehalem_decode_model(u64 status, u64 misc); + void xeon75xx_decode_model(struct mce *m, unsigned msize); +-void decode_memory_controller(u32 status); ++void decode_memory_controller(u32 status, u8 bank); + void nehalem_memerr_misc(struct mce *m, int *channel, int *dimm); +diff -urNp mcelog-d2e13bf0.orig/p4.c mcelog-d2e13bf0/p4.c +--- mcelog-d2e13bf0.orig/p4.c 2016-05-14 08:34:40.444108189 -0400 ++++ mcelog-d2e13bf0/p4.c 2016-05-14 08:34:58.872975199 -0400 +@@ -36,6 +36,9 @@ + #include "sandy-bridge.h" + #include "ivy-bridge.h" + #include "haswell.h" ++#include "broadwell_de.h" ++#include "broadwell_epex.h" ++#include "skylake_xeon.h" + + /* decode mce for P4/Xeon and Core2 family */ + +@@ -52,7 +55,7 @@ static char* get_TT_str(__u8 t) + static char* get_LL_str(__u8 ll) + { + static char* LL[] = {"Level-0", "Level-1", "Level-2", "Level-3"}; +- if (ll > NELE(LL)) { ++ if (ll >= NELE(LL)) { + return "UNKNOWN"; + } + +@@ -118,7 +121,8 @@ static char* get_II_str(__u8 i) + return II[i]; + } + +-static int decode_mca(u64 status, u64 misc, u64 track, int cpu, int *ismemerr, int socket) ++static int decode_mca(u64 status, u64 misc, u64 track, int cpu, int *ismemerr, int socket, ++ u8 bank) + { + #define TLB_LL_MASK 0x3 /*bit 0, bit 1*/ + #define TLB_LL_SHIFT 0x0 +@@ -231,7 +235,7 @@ static int decode_mca(u64 status, u64 mi + run_iomca_trigger(socket, cpu, seg, bus, dev, fn); + } + } else if (test_prefix(7, mca)) { +- decode_memory_controller(mca); ++ decode_memory_controller(mca, bank); + *ismemerr = 1; + } else { + Wprintf("Unknown Error %x\n", mca); +@@ -286,7 +290,7 @@ static const char *arstate[4] = { + }; + + static int decode_mci(__u64 status, __u64 misc, int cpu, unsigned mcgcap, int *ismemerr, +- int socket) ++ int socket, __u8 bank) + { + u64 track 
= 0; + +@@ -326,7 +330,7 @@ static int decode_mci(__u64 status, __u6 + decode_tracking(track); + } + Wprintf("MCA: "); +- return decode_mca(status, misc, track, cpu, ismemerr, socket); ++ return decode_mca(status, misc, track, cpu, ismemerr, socket, bank); + } + + static void decode_mcg(__u64 mcgstatus) +@@ -368,7 +372,7 @@ void decode_intel_mc(struct mce *log, in + + decode_mcg(log->mcgstatus); + if (decode_mci(log->status, log->misc, cpu, log->mcgcap, ismemerr, +- socket)) ++ socket, log->bank)) + run_unknown_trigger(socket, cpu, log); + + if (test_prefix(11, (log->status & 0xffffL))) { +@@ -415,6 +419,15 @@ void decode_intel_mc(struct mce *log, in + case CPU_HASWELL_EPEX: + hsw_decode_model(cputype, log->bank, log->status, log->misc); + break; ++ case CPU_BROADWELL_DE: ++ bdw_de_decode_model(cputype, log->bank, log->status, log->misc); ++ break; ++ case CPU_BROADWELL_EPEX: ++ bdw_epex_decode_model(cputype, log->bank, log->status, log->misc); ++ break; ++ case CPU_SKYLAKE_XEON: ++ skylake_s_decode_model(cputype, log->bank, log->status, log->misc); ++ break; + } + } + +diff -urNp mcelog-d2e13bf0.orig/skylake_xeon.c mcelog-d2e13bf0/skylake_xeon.c +--- mcelog-d2e13bf0.orig/skylake_xeon.c 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/skylake_xeon.c 2016-05-14 08:35:00.556054382 -0400 +@@ -0,0 +1,210 @@ ++/* Copyright (C) 2016 Intel Corporation ++ Decode Intel Skylake specific machine check errors. ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. 
++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system; if not, write to the Free Software Foundation, ++ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ Author: Tony Luck ++*/ ++ ++#include "mcelog.h" ++#include "bitfield.h" ++#include "skylake_xeon.h" ++#include "memdb.h" ++ ++/* See IA32 SDM Vol3B Table 16-27 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x0d] = "MCA_DMI_TRAINING_TIMEOUT", ++ [0x0f] = "MCA_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x10] = "MCA_MORE_THAN_ONE_LT_AGENT", ++ [0x1e] = "MCA_BIOS_RST_CPL_INVALID_SEQ", ++ [0x1f] = "MCA_BIOS_INVALID_PKG_STATE_CONFIG", ++ [0x25] = "MCA_MESSAGE_CHANNEL_TIMEOUT", ++ [0x27] = "MCA_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x30] = "MCA_PKGC_DIRECT_WAKE_RING_TIMEOUT", ++ [0x31] = "MCA_PKGC_INVALID_RSP_PCH", ++ [0x33] = "MCA_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x34] = "MCA_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x38] = "MCA_PKGC_WATCHDOG_HANG_C3_UP_SF", ++ [0x40] = "MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE", ++ [0x41] = "MCA_SVID_COMMAND_TIMEOUT", ++ [0x42] = "MCA_SVID_VCCIN_VR_VOUT_FAILURE", ++ [0x43] = "MCA_SVID_CPU_VR_CAPABILITY_ERROR", ++ [0x44] = "MCA_SVID_CRITICAL_VR_FAILED", ++ [0x45] = "MCA_SVID_SA_ITD_ERROR", ++ [0x46] = "MCA_SVID_READ_REG_FAILED", ++ [0x47] = "MCA_SVID_WRITE_REG_FAILED", ++ [0x48] = "MCA_SVID_PKGC_INIT_FAILED", ++ [0x49] = "MCA_SVID_PKGC_CONFIG_FAILED", ++ [0x4a] = "MCA_SVID_PKGC_REQUEST_FAILED", ++ [0x4b] = "MCA_SVID_IMON_REQUEST_FAILED", ++ [0x4c] = "MCA_SVID_ALERT_REQUEST_FAILED", ++ [0x4d] = "MCA_SVID_MCP_VR_ABSENT_OR_RAMP_ERROR", ++ [0x4e] = "MCA_SVID_UNEXPECTED_MCP_VR_DETECTED", ++ [0x51] = "MCA_FIVR_CATAS_OVERVOL_FAULT", ++ [0x52] = "MCA_FIVR_CATAS_OVERCUR_FAULT", ++ [0x58] = "MCA_WATCHDOG_TIMEOUT_PKGC_SLAVE", ++ [0x59] = "MCA_WATCHDOG_TIMEOUT_PKGC_MASTER", ++ [0x5a] = "MCA_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x61] = "MCA_PKGS_CPD_UNCPD_TIMEOUT", ++ [0x63] = "MCA_PKGS_INVALID_REQ_PCH", ++ [0x64] = "MCA_PKGS_INVALID_REQ_INTERNAL", ++ 
[0x65] = "MCA_PKGS_INVALID_RSP_INTERNAL", ++ [0x6b] = "MCA_PKGS_SMBUS_VPP_PAUSE_TIMEOUT", ++ [0x81] = "MCA_RECOVERABLE_DIE_THERMAL_TOO_HOT", ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-28 */ ++ ++static char *qpi[] = { ++ [0x00] = "UC Phy Initialization Failure", ++ [0x01] = "UC Phy detected drift buffer alarm", ++ [0x02] = "UC Phy detected latency buffer rollover", ++ [0x10] = "UC LL Rx detected CRC error: unsuccessful LLR: entered abort state", ++ [0x11] = "UC LL Rx unsupported or undefined packet", ++ [0x12] = "UC LL or Phy control error", ++ [0x13] = "UC LL Rx parameter exchange exception", ++ [0x1F] = "UC LL detected control error from the link-mesh interface", ++ [0x20] = "COR Phy initialization abort", ++ [0x21] = "COR Phy reset", ++ [0x22] = "COR Phy lane failure, recovery in x8 width", ++ [0x23] = "COR Phy L0c error corrected without Phy reset", ++ [0x24] = "COR Phy L0c error triggering Phy Reset", ++ [0x25] = "COR Phy L0p exit error corrected with Phy reset", ++ [0x30] = "COR LL Rx detected CRC error - successful LLR without Phy Reinit", ++ [0x31] = "COR LL Rx detected CRC error - successful LLR with Phy Reinit", ++}; ++ ++static struct field qpi_mc[] = { ++ FIELD(16, qpi), ++ {} ++}; ++ ++/* These apply to MSCOD 0x12 "UC LL or Phy control error" */ ++static struct field qpi_0x12[] = { ++ SBITFIELD(22, "Phy Control Error"), ++ SBITFIELD(23, "Unexpected Retry.Ack flit"), ++ SBITFIELD(24, "Unexpected Retry.Req flit"), ++ SBITFIELD(25, "RF parity error"), ++ SBITFIELD(26, "Routeback Table error"), ++ SBITFIELD(27, "unexpected Tx Protocol flit (EOP, Header or Data)"), ++ SBITFIELD(28, "Rx Header-or-Credit BGF credit overflow/underflow"), ++ SBITFIELD(29, "Link Layer Reset still in progress when Phy enters L0"), ++ SBITFIELD(30, "Link Layer reset initiated while protocol traffic not idle"), ++ SBITFIELD(31, "Link Layer Tx Parity Error"), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-29 */ ++ 
++static struct field mc_bits[] = { ++ SBITFIELD(16, "Address parity error"), ++ SBITFIELD(17, "HA write data parity error"), ++ SBITFIELD(18, "HA write byte enable parity error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ SBITFIELD(23, "Any HA read error"), ++ SBITFIELD(24, "WDB read parity error"), ++ SBITFIELD(25, "DDR4 command address parity error"), ++ SBITFIELD(26, "Uncorrected address parity error"), ++ {} ++}; ++ ++static char *mc_0x8xx[] = { ++ [0x0] = "Unrecognized request type", ++ [0x1] = "Read response to an invalid scoreboard entry", ++ [0x2] = "Unexpected read response", ++ [0x3] = "DDR4 completion to an invalid scoreboard entry", ++ [0x4] = "Completion to an invalid scoreboard entry", ++ [0x5] = "Completion FIFO overflow", ++ [0x6] = "Correctable parity error", ++ [0x7] = "Uncorrectable error", ++ [0x8] = "Interrupt received while outstanding interrupt was not ACKed", ++ [0x9] = "ERID FIFO overflow", ++ [0xa] = "Error on Write credits", ++ [0xb] = "Error on Read credits", ++ [0xc] = "Scheduler error", ++ [0xd] = "Error event", ++}; ++ ++static struct field memctrl_mc13[] = { ++ FIELD(16, mc_0x8xx), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-30 */ ++ ++static struct field m2m[] = { ++ SBITFIELD(16, "MscodDataRdErr"), ++ SBITFIELD(17, "Reserved"), ++ SBITFIELD(18, "MscodPtlWrErr"), ++ SBITFIELD(19, "MscodFullWrErr"), ++ SBITFIELD(20, "MscodBgfErr"), ++ SBITFIELD(21, "MscodTimeout"), ++ SBITFIELD(22, "MscodParErr"), ++ SBITFIELD(23, "MscodBucket1Err"), ++ {} ++}; ++ ++void skylake_s_decode_model(int cputype, int bank, u64 status, u64 misc) ++{ ++ switch (bank) { ++ case 4: ++ Wprintf("PCU: "); ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ Wprintf("Internal errors "); ++ break; ++ case 0x406: ++ Wprintf("Intel TXT errors "); ++ break; ++ case 0x407: ++ 
Wprintf("Other UBOX Internal errors "); ++ break; ++ } ++ if (EXTRACT(status, 16, 19)) ++ Wprintf("PCU internal error "); ++ decode_bitfield(status, pcu_mc4); ++ break; ++ case 5: ++ case 12: ++ case 19: ++ Wprintf("QPI: "); ++ decode_bitfield(status, qpi_mc); ++ if (EXTRACT(status, 16, 21) == 0x12) ++ decode_bitfield(status, qpi_0x12); ++ break; ++ case 7: case 8: ++ Wprintf("M2M: "); ++ decode_bitfield(status, m2m); ++ break; ++ case 13: case 14: case 15: ++ case 16: case 17: case 18: ++ Wprintf("MemCtrl: "); ++ if (EXTRACT(status, 27, 27)) ++ decode_bitfield(status, memctrl_mc13); ++ else ++ decode_bitfield(status, mc_bits); ++ break; ++ } ++} +diff -urNp mcelog-d2e13bf0.orig/skylake_xeon.h mcelog-d2e13bf0/skylake_xeon.h +--- mcelog-d2e13bf0.orig/skylake_xeon.h 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/skylake_xeon.h 2016-05-14 08:35:00.557054429 -0400 +@@ -0,0 +1 @@ ++void skylake_s_decode_model(int cputype, int bank, u64 status, u64 misc); +diff -urNp mcelog-d2e13bf0.orig/trigger.c mcelog-d2e13bf0/trigger.c +--- mcelog-d2e13bf0.orig/trigger.c 2016-05-14 08:34:40.439107953 -0400 ++++ mcelog-d2e13bf0/trigger.c 2016-05-14 08:35:00.557054429 -0400 +@@ -151,14 +151,11 @@ void trigger_setup(void) + + void trigger_wait(void) + { +- int sig; +- sigset_t mask; +- sigemptyset(&mask); +- sigaddset(&mask, SIGCHLD); +- while (num_children > 0) { +- if (sigwait(&mask, &sig) < 0) +- SYSERRprintf("sigwait waiting for children"); +- } ++ int status; ++ int pid; ++ ++ while ((pid = waitpid((pid_t)-1, &status, 0)) > 0) ++ finish_child(pid, status); + } + + int trigger_check(char *s) +diff -urNp mcelog-d2e13bf0.orig/tsc.c mcelog-d2e13bf0/tsc.c +--- mcelog-d2e13bf0.orig/tsc.c 2016-05-14 08:34:40.443108142 -0400 ++++ mcelog-d2e13bf0/tsc.c 2016-05-14 08:35:00.557054429 -0400 +@@ -161,7 +161,7 @@ int main(void) + { + char *buf; + u64 tsc = rdtscll(); +- printf("%Lx tsc\n", tsc); ++ printf("%llx tsc\n", tsc); + if (decode_tsc_current(&buf, 0, CPU_CORE2, 0.0, tsc) >= 0) 
+ printf("%s\n", buf); + else +diff -urNp mcelog-d2e13bf0.orig/unknown.c mcelog-d2e13bf0/unknown.c +--- mcelog-d2e13bf0.orig/unknown.c 2016-05-14 08:34:40.446108283 -0400 ++++ mcelog-d2e13bf0/unknown.c 2016-05-14 08:35:00.557054429 -0400 +@@ -50,6 +50,9 @@ void run_unknown_trigger(int socket, int + char *msg; + char *location; + ++ if (!unknown_trigger) ++ return; ++ + if (socket >= 0) + asprintf(&location, "CPU %d on socket %d", cpu, socket); + else +@@ -58,9 +61,6 @@ void run_unknown_trigger(int socket, int + asprintf(&env[ei++], "LOCATION=%s", location); + free(location); + +- if (!unknown_trigger) +- goto out; +- + if (socket >= 0) + asprintf(&env[ei++], "SOCKETID=%d", socket); + asprintf(&env[ei++], "MESSAGE=%s", msg); +@@ -76,7 +76,6 @@ void run_unknown_trigger(int socket, int + run_trigger(unknown_trigger, NULL, env); + for (i = 0; i < ei; i++) + free(env[i]); +-out: + free(msg); + } + +diff -urNp mcelog-d2e13bf0.orig/version.h mcelog-d2e13bf0/version.h +--- mcelog-d2e13bf0.orig/version.h 2016-05-14 08:34:40.444108189 -0400 ++++ mcelog-d2e13bf0/version.h 2016-05-14 08:35:00.557054429 -0400 +@@ -1,2 +1,3 @@ +-#define MCELOG_VERSION "1.0pre" ++extern char version[]; ++#define MCELOG_VERSION version + diff --git a/SOURCES/mcelog-update-e7e0ac1.patch b/SOURCES/mcelog-update-e7e0ac1.patch new file mode 100644 index 0000000..55000df --- /dev/null +++ b/SOURCES/mcelog-update-e7e0ac1.patch @@ -0,0 +1,1389 @@ +diff --git a/CHANGES b/CHANGES +index cd279c4..e3c4044 100644 +--- a/CHANGES ++++ b/CHANGES +@@ -1,5 +1,9 @@ + + ++Changes file is obsolete. ++Please see git log on https://git.kernel.org/cgit/utils/cpu/mce/mcelog.git/ ++for newer changes. 
++ + Add Linux Kongress 2010 paper + Add Sandy Bridge Support + Write pid file by default in daemon mode +diff --git a/Makefile b/Makefile +index f8199f6..f3ba998 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,7 @@ + CFLAGS := -g -Os + prefix := /usr + etcprefix := ++MANDIR := ${prefix}/share/man + # Define appropiately for your distribution + # DOCDIR := /usr/share/doc/packages/mcelog + +@@ -54,21 +55,27 @@ SRC := $(OBJ:.o=.c) + mcelog: ${OBJ} + + # dbquery intentionally not installed by default +-install: mcelog +- mkdir -p $(DESTDIR)${etcprefix}/etc/mcelog $(DESTDIR)${prefix}/sbin $(DESTDIR)${prefix}/share/man/man8 ++install: mcelog mcelog.conf mcelog.conf.5 mcelog.triggers.5 ++ mkdir -p $(DESTDIR)${etcprefix}/etc/mcelog $(DESTDIR)${prefix}/sbin $(DESTDIR)$(MANDIR)/man5 $(DESTDIR)$(MANDIR)/man8 + install -m 755 -p mcelog $(DESTDIR)${prefix}/sbin/mcelog +- install -m 644 -p mcelog.8 $(DESTDIR)${prefix}/share/man/man8 ++ install -m 644 -p mcelog.8 $(DESTDIR)$(MANDIR)/man8 ++ install -m 644 -p mcelog.conf.5 $(DESTDIR)$(MANDIR)/man5 ++ install -m 644 -p mcelog.triggers.5 $(DESTDIR)$(MANDIR)/man5 + install -m 644 -p -b mcelog.conf $(DESTDIR)${etcprefix}/etc/mcelog/mcelog.conf + for i in ${TRIGGERS} ; do \ + install -m 755 -p -b triggers/$$i $(DESTDIR)${etcprefix}/etc/mcelog ; \ + done + ifdef DOCDIR ++ install -d 755 $(DESTDIR)${DOCDIR} + install -m 644 -p ${DOC} $(DESTDIR)${DOCDIR} + else + echo + echo "Consider defining DOCDIR to install additional documentation" + endif + ++mcelog.conf.5: mcelog.conf config-intro.man ++ ./genconfig.py mcelog.conf config-intro.man > mcelog.conf.5 ++ + clean: test-clean + rm -f ${CLEAN} ${OBJ} + +diff --git a/README b/README +index 08184ed..8aa8ec4 100644 +--- a/README ++++ b/README +@@ -2,11 +2,15 @@ mcelog is the user space backend for logging machine check errors + reported by the hardware to the kernel. The kernel does the immediate + actions (like killing processes etc.) 
and mcelog decodes the errors + and manages various other advanced error responses like +-offlining memory, CPUs or triggering events. ++offlining memory, CPUs or triggering events. In addition ++mcelog also handles corrected errors, by logging and accounting them. + + It primarily handles machine checks and thermal events, which + are reported for errors detected by the CPU. + ++For more details on what mcelog can do and the underlying theory ++see http://www.mcelog.org ++ + It is recommended that mcelog runs on all x86 machines, both + 64bit (since early 2.6) and 32bit (since 2.6.32) + +@@ -40,6 +44,11 @@ mce.pdf is a very old paper describing the first releases of mcelog + + For distributors: + ++You can run mcelog from systemd or similar daemons. An example ++systemd unit file is in mcelog.service. ++ ++For older distributions using init scripts: ++ + Please install a init script by default that runs mcelog in daemon mode. + The mcelog.init script is a good starting point. + +diff --git a/client.c b/client.c +index 6a67683..7c7aeb8 100644 +--- a/client.c ++++ b/client.c +@@ -29,9 +29,9 @@ void ask_server(char *command) + { + struct sockaddr_un sun; + int fd; ++ FILE * fp; + int n; + char buf[1024]; +- int done; + char *path = config_string("server", "socket-path"); + if (!path) + path = SOCKET_PATH; +@@ -52,14 +52,18 @@ void ask_server(char *command) + if (write(fd, command, n) != n) + SYSERRprintf("client command write"); + +- done = 0; +- while (!done && (n = read(fd, buf, sizeof buf)) > 0) { +- if (n >= 5 && !memcmp(buf + n - 5, "done\n", 5)) { +- n -= 5; +- done = 1; ++ if ((fp = fdopen(fd, "r")) != NULL) { ++ while (fgets(buf, sizeof buf, fp)) { ++ n = strlen(buf); ++ if (n >= 5 && !memcmp(buf + n - 5, "done\n", 5)) { ++ fclose(fp); ++ return; ++ } ++ ++ fputs(buf, stdout); + } +- write(1, buf, n); ++ fclose(fp); + } +- if (n < 0) +- SYSERRprintf("client read"); ++ ++ SYSERRprintf("client read"); + } +diff --git a/config-intro.man b/config-intro.man +new 
file mode 100644 +index 0000000..c06610d +--- /dev/null ++++ b/config-intro.man +@@ -0,0 +1,10 @@ ++.SH NAME ++mcelog.conf \- mcelog.conf reference ++.SH SYNOPSIS ++.B /etc/mcelog.conf ++.SH DESCRIPTION ++ ++/etc/mcelog.conf is the main configuration file for ++.B mcelog(8). ++This is configuration file separated into sections including ++a default section. +diff --git a/dmi.c b/dmi.c +index 290a053..b5492cd 100644 +--- a/dmi.c ++++ b/dmi.c +@@ -162,6 +162,8 @@ static int get_efi_base_addr(size_t *address) + check_symbol: + while ((fgets(linebuf, sizeof(linebuf) - 1, efi_systab)) != NULL) { + char *addrp = strchr(linebuf, '='); ++ if (!addrp) ++ break; + *(addrp++) = '\0'; + + if (strcmp(linebuf, "SMBIOS") == 0) { +diff --git a/genconfig.py b/genconfig.py +new file mode 100755 +index 0000000..aed6992 +--- /dev/null ++++ b/genconfig.py +@@ -0,0 +1,80 @@ ++#!/usr/bin/python ++# generate man config documentation from mcelog.conf example ++# genconfig.py mcelog.conf intro.html ++import sys ++import re ++import string ++import argparse ++ ++ap = argparse.ArgumentParser(description="generate man config documentation from mcelog.conf example") ++ap.add_argument('config', type=argparse.FileType('r'), help="mcelog example config file") ++ap.add_argument('intro', type=argparse.FileType('r'), help="intro file") ++args = ap.parse_args() ++ ++def parse(f): ++ lineno = 1 ++ explanation = 0 ++ header = 1 ++ for line in f: ++ lineno += 1 ++ ++ # skip first comment ++ if header: ++ if not re.match('^#', line): ++ header = 0 ++ continue ++ ++ # explanation ++ m = re.match('^#\s(.*)', line) ++ if m: ++ explanation += 1 ++ s = m.group(1) ++ if explanation == 1: ++ s = string.capitalize(s) ++ print s ++ continue ++ ++ if explanation: ++ print ".PP" ++ explanation = 0 ++ ++ # empty line: new option ++ if re.match('\s+', line): ++ new_option() ++ continue ++ # group ++ m = re.match('\[(.*)\]', line) ++ if m: ++ start_group(m.group(1)) ++ continue ++ # config option ++ m = 
re.match('^(#?)([a-z-]+) = (.*)', line) ++ if m: ++ config_option(m.group(1), m.group(2), m.group(3)) ++ continue ++ print >>sys.stderr, "Unparseable line %d" % (lineno-1) ++ ++def config_option(enabled, name, value): ++ print ".B %s = %s" % (name, value) ++ print ".PP" ++ ++def start_group(name): ++ print ".SS \"The %s config section\"" % (name) ++ ++def new_option(): ++ print ".PP" ++ ++ ++print """ ++.\" Auto generated mcelog.conf manpage. Do not edit. ++.TH "mcelog.conf" 5 "mcelog" ++""" ++ ++print args.intro.read() ++parse(args.config) ++print """ ++.SH SEE ALSO ++.BR mcelog (8), ++.BR mcelog.triggers (5) ++.B http://www.mcelog.org ++""" +diff --git a/haswell.c b/haswell.c +index 0fef6a5..b309ae5 100644 +--- a/haswell.c ++++ b/haswell.c +@@ -1,5 +1,5 @@ + /* Copyright (C) 2013 Intel Corporation +- Decode Intel Ivy Bridge specific machine check errors. ++ Decode Intel Haswell specific machine check errors. + + mcelog is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public +diff --git a/intel.c b/intel.c +index fe08eab..f893be5 100644 +--- a/intel.c ++++ b/intel.c +@@ -34,7 +34,8 @@ void intel_cpu_init(enum cputype cpu) + if (cpu == CPU_NEHALEM || cpu == CPU_XEON75XX || cpu == CPU_INTEL || + cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP || + cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX || +- cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX) ++ cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX || cpu == CPU_BROADWELL || ++ cpu == CPU_KNIGHTS_LANDING) + memory_error_support = 1; + } + +@@ -72,6 +73,15 @@ enum cputype select_intel_cputype(int family, int model) + return CPU_HASWELL; + else if (model == 0x3f) + return CPU_HASWELL_EPEX; ++ else if (model == 0x3d || model == 0x56) ++ return CPU_BROADWELL; ++ else if (model == 0x57) ++ return CPU_KNIGHTS_LANDING; ++ else if (model == 0x1c || model == 0x26 || model == 0x27 || ++ model == 0x35 || model == 0x36 || model == 0x36 || ++ model == 0x37 || model == 0x4a || 
model == 0x4c || ++ model == 0x4d || model == 0x5a || model == 0x5d) ++ return CPU_ATOM; + if (model > 0x1a) { + Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n", + model); +diff --git a/intel.h b/intel.h +index 00191d5..9d109b1 100644 +--- a/intel.h ++++ b/intel.h +@@ -19,5 +19,7 @@ extern int memory_error_support; + case CPU_IVY_BRIDGE: \ + case CPU_IVY_BRIDGE_EPEX: \ + case CPU_HASWELL: \ +- case CPU_HASWELL_EPEX ++ case CPU_HASWELL_EPEX: \ ++ case CPU_BROADWELL: \ ++ case CPU_KNIGHTS_LANDING + +diff --git a/leaky-bucket.c b/leaky-bucket.c +index c2c501b..721ab22 100644 +--- a/leaky-bucket.c ++++ b/leaky-bucket.c +@@ -25,7 +25,7 @@ time_t __attribute__((weak)) bucket_time(void) + return time(NULL); + } + +-static void bucket_age(const struct bucket_conf *c, struct leaky_bucket *b, ++void bucket_age(const struct bucket_conf *c, struct leaky_bucket *b, + time_t now) + { + long diff; +diff --git a/leaky-bucket.h b/leaky-bucket.h +index 497719e..860ba3c 100644 +--- a/leaky-bucket.h ++++ b/leaky-bucket.h +@@ -27,5 +27,7 @@ char *bucket_output(const struct bucket_conf *c, struct leaky_bucket *b); + int bucket_conf_init(struct bucket_conf *c, const char *rate); + void bucket_init(struct leaky_bucket *b); + time_t bucket_time(void); ++void bucket_age(const struct bucket_conf *c, struct leaky_bucket *b, ++ time_t now); + + #endif +diff --git a/mcelog.8 b/mcelog.8 +index f8a77c4..3781db6 100644 +--- a/mcelog.8 ++++ b/mcelog.8 +@@ -1,5 +1,4 @@ +-.\" disk db commented out for now because it's not usable enough +-.TH MCELOG 8 "May 2009" "" "Linux's Administrator's Manual" ++.TH MCELOG 8 "Mar 2015" "" "Linux's Administrator's Manual" + .SH NAME + mcelog \- Decode kernel machine check log on x86 machines + .SH SYNOPSIS +@@ -26,13 +25,16 @@ in main memory by an integrated memory controller, data + transfer errors on the front side bus or CPU interconnect or other internal + errors. 
+ Possible causes can be cosmic radiation, instable power supplies, +-cooling problems, broken hardware, or bad luck. ++cooling problems, broken hardware, running systems out of specification, ++or bad luck. + + Most errors can be corrected by the CPU by internal error correction + mechanisms. Uncorrected errors cause machine check exceptions which +-may panic the machine. ++may kill processes or panic the machine. A small number of corrected ++errors is usually not a cause for worry, but a large number can indicate ++future failure. + +-When a corrected error happens the x86 kernel writes a record describing ++When a corrected or recovered error happens the x86 kernel writes a record describing + the MCE into a internal ring buffer available through the + .I /dev/mcelog + device +@@ -43,7 +45,11 @@ decodes them into a human readable format and prints them + on the standard output or optionally into the system log. + + Optionally it can also take more options like keeping statistics or +-triggering shell scripts on specific events. ++triggering shell scripts on specific events. By default mcelog ++supports offlining memory pages with persistent corrected errors, ++offlining CPU cores if they developed cache problems, ++and otherwise logging specific events to the system log after ++they crossed a threshold. + + The normal operating modi for mcelog are running + as a regular cron job (traditional way, deprecated), +@@ -112,12 +118,12 @@ and undocumented now. + + With the + .B \-\-dmi +-option mcelog will look up the addresses reported in machine ++option mcelog will look up the DIMMs reported in machine + checks in the + .I SMBIOS/DMI +-tables of the BIOS. +-This can sometimes tell you which DIMM or memory controller +-has developed a problem. More often the information reported ++tables of the BIOS and map the DIMMs to board identifiers. ++This only works when the BIOS reports the identifiers correctly. 
++Unfortunately often the information reported + by the BIOS is either subtly or obviously wrong or useless. + This option requires that mcelog has read access to /dev/mem + (normally requires root) and runs on the same machine +@@ -281,6 +287,9 @@ option use + use + .I logfile = /tmp/logfile + ++For more information on the config file please see ++.B mcelog.conf(5). ++ + .SH NOTES + The kernel prefers old messages over new. If the log buffer overflows + only old ones will be kept. +@@ -308,9 +317,14 @@ restarting the daemon. + + .\"/var/lib/memory-errors + .SH SEE ALSO ++.BR mcelog.conf(5), ++.BR mcelog.triggers(5) ++ ++http://www.mcelog.org ++ + AMD x86-64 architecture programmer's manual, Volume 2, System programming + + Intel 64 and IA32 Architectures Software Developer's manual, Volume 3, System programming guide +-Parts 1 and 2. Machine checks are described in Chapter 14 in Part1 and in Appendix E in Part2. ++Chapter 15 and 16. http://www.intel.com/sdm + + Datasheet of your CPU. +diff --git a/mcelog.c b/mcelog.c +index 95a913f..96c0a9d 100644 +--- a/mcelog.c ++++ b/mcelog.c +@@ -231,6 +231,9 @@ static char *cputype_name[] = { + [CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */ + [CPU_HASWELL] = "Haswell", /* Fill in better name */ + [CPU_HASWELL_EPEX] = "Haswell EP/EX", /* Fill in better name */ ++ [CPU_BROADWELL] = "Broadwell", ++ [CPU_KNIGHTS_LANDING] = "Knights Landing", ++ [CPU_ATOM] = "ATOM", + }; + + static struct config_choice cpu_choices[] = { +@@ -269,7 +272,10 @@ static struct config_choice cpu_choices[ + { "haswell", CPU_HASWELL }, /* Fill in better name */ + { "haswell-ep", CPU_HASWELL_EPEX }, /* Fill in better name */ + { "haswell-ex", CPU_HASWELL_EPEX }, /* Fill in better name */ +- {} ++ { "broadwell", CPU_BROADWELL }, ++ { "knightslanding", CPU_KNIGHTS_LANDING }, ++ { "atom", CPU_ATOM }, ++ { NULL } + }; + + static void print_cputypes(void) +@@ -430,7 +436,8 @@ static void dump_mce(struct mce *m, unsi + mod); + } + if 
(cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX && +- cputype != CPU_HASWELL_EPEX) ++ cputype != CPU_HASWELL_EPEX && cputype != CPU_BROADWELL && ++ cputype != CPU_KNIGHTS_LANDING) + resolveaddr(m->addr); + if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) { + diskdb_resolve_addr(m->addr); +@@ -517,7 +524,7 @@ int is_cpu_supported(void) + if (family == 15) { + cputype = CPU_K8; + } else if (family >= 16) { +- SYSERRprintf("AMD Processor family %d: Please use the edac_mce_amd module instead.\n", family); ++ SYSERRprintf("ERROR: AMD Processor family %d: mcelog does not support this processor. Please use the edac_mce_amd module instead.\n", family); + return 0; + } + } else if (!strcmp(vendor,"GenuineIntel")) +@@ -741,7 +748,7 @@ restart: + else + s += 3; + +- n = sscanf(s, "%02x:<%016Lx> {%100s}%n", ++ n = sscanf(s, "%02x:<%016Lx> {%99s}%n", + &cs, + &m.ip, + symbol, &next); +@@ -1377,7 +1384,7 @@ int main(int ac, char **av) + + d.buf = xalloc(d.recordlen * d.loglen); + if (daemon_mode) { +- prefill_memdb(); ++ prefill_memdb(do_dmi); + if (!do_dmi) + closedmi(); + server_setup(); +diff --git a/mcelog.conf b/mcelog.conf +index 6a2be26..f8abb99 100644 +--- a/mcelog.conf ++++ b/mcelog.conf +@@ -9,36 +9,36 @@ + # white space is not allowed in value currently, except at the end where it is dropped + # + +-# in general all command line options that are not commands work here +-# see man mcelog or mcelog --help for a list ++# In general all command line options that are not commands work here. ++# See man mcelog or mcelog --help for a list. + # e.g. to enable the --no-syslog option use + #no-syslog = yes (or no to disable) + # when the option has a argument + #logfile = /tmp/logfile +-# below are the options which are not command line options ++# below are the options which are not command line options. 
+ + # Set CPU type for which mcelog decodes events: + #cpu = type +-# for valid values for type please see mcelog --help ++# For valid values for type please see mcelog --help. + # If this value is set incorrectly the decoded output will be likely incorrect. +-# by default when this parameter is not set mcelog uses the CPU it is running on ++# By default when this parameter is not set mcelog uses the CPU it is running on + # on very new kernels the mcelog events reported by the kernel also carry + # the CPU type which is used too when available and not overriden. + + # Enable daemon mode: + #daemon = yes + # By default mcelog just processes the currently pending events and exits. +-# in daemon mode it will keep running as a daemon in the background and poll ++# In daemon mode it will keep running as a daemon in the background and poll + # the kernel for events and then decode them. + +-# Filter out known broken events by default ++# Filter out known broken events by default. + filter = yes +-# don't log memory errors individually +-# they still get accounted if that is enabled ++# Don't log memory errors individually. ++# They still get accounted if that is enabled. + #filter-memory-errors = yes + + # output in undecoded raw format to be easier machine readable +-# (default is decoded) ++# (default is decoded). + #raw = yes + + # Set CPU Mhz to decode uptime from time stamp counter (output +@@ -62,16 +62,17 @@ filter = yes + # Append log output to logfile instead of stdout. Only when no syslog logging is active + #logfile = filename + +-# Use SMBIOS information to decode DIMMs (needs root) +-# This function is not recommended to use right now and generally not needed ++# Use SMBIOS information to decode DIMMs (needs root). ++# This function is not recommended to use right now and generally not needed. + # The exception is memdb prepopulation, which is configured separately below. 
+ #dmi = no + +-# when in daemon mode run as this user after set up +-# note that the triggers will run as this user too +-# setting this to non root will mean that triggers cannot take some corrective +-# action, like offlining objects ++# When in daemon mode run as this user after set up. ++# Note that the triggers will run as this user too. ++# Setting this to non root will mean that triggers cannot take some corrective ++# action, like offlining objects. + #run-credentials-user = root ++ + # group to run as daemon with + # default to the group of the run-credentials-user + #run-credentials-group = nobody +@@ -79,72 +80,88 @@ filter = yes + [server] + # user allowed to access client socket. + # when set to * match any +-# root is always allowed to access ++# root is always allowed to access. + # default: root only + client-user = root + # group allowed to access mcelog +-# when no group is configured any group matches (but still user checking) ++# When no group is configured any group matches (but still user checking). + # when set to * match any + #client-group = root +-# path to the unix socket for client<->server communication +-# when no socket-path is configured the server will not start ++# Path to the unix socket for client<->server communication. ++# When no socket-path is configured the server will not start + #socket-path = /var/run/mcelog-client +-# when mcelog starts it checks if a server is already running. timeout ++# When mcelog starts it checks if a server is already running. This configures the timeout + # for this check. + #initial-ping-timeout = 2 + # + [dimm] + # Is the in memory DIMM error tracking enabled? + # Only works on systems with integrated memory controller and +-# which are supported +-# Only takes effect in daemon mode ++# which are supported. ++# Only takes effect in daemon mode. 
+ dimm-tracking-enabled = yes +-# Use DMI information from the BIOS to prepopulate DIMM database ++# Use DMI information from the BIOS to prepopulate DIMM database. + # Note this might not work with all BIOS and requires mcelog to run as root. + # Alternative is to let mcelog create DIMM objects on demand. + dmi-prepopulate = yes + # +-# execute these triggers when the rate of corrected or uncorrected +-# errors per DIMM exceeds the threshold ++# Execute these triggers when the rate of corrected or uncorrected ++# Errors per DIMM exceeds the threshold. + # Note when the hardware does not report DIMMs this might also +-# be per channel ++# be per channel. + # The default of 10/24h is reasonable for server quality +-# DDR3 DIMMs as of 2009/10 ++# DDR3 DIMMs as of 2009/10. + #uc-error-trigger = dimm-error-trigger + uc-error-threshold = 1 / 24h + #ce-error-trigger = dimm-error-trigger + ce-error-threshold = 10 / 24h + + [socket] +-# Memory error accounting per socket ++# Enable memory error accounting per socket. + socket-tracking-enabled = yes +-# Threshold and trigger for uncorrected memory errors on a socket ++ ++# Threshold and trigger for uncorrected memory errors on a socket. + # mem-uc-error-trigger = socket-memory-error-trigger ++ + mem-uc-error-threshold = 100 / 24h +-# Threshold and trigger for corrected memory errors on a socket ++ ++# Trigger script for corrected memory errors on a socket. + mem-ce-error-trigger = socket-memory-error-trigger ++ ++# Threshold on when to trigger a correct error for the socket. ++ + mem-ce-error-threshold = 100 / 24h ++ + # Log socket error threshold explicitely? 
+ mem-ce-error-log = yes + ++# Trigger script for uncorrected bus error events + bus-uc-threshold-trigger = bus-error-trigger ++ ++# Trigger script for uncorrected IOMCA erors + iomca-threshold-trigger = iomca-error-trigger ++ ++# Trigger script for other uncategorized errors + unknown-threshold-trigger = unknown-error-trigger + + [cache] +-# Processing of cache error thresholds reported by Intel CPUs ++# Processing of cache error thresholds reported by Intel CPUs. + cache-threshold-trigger = cache-error-trigger ++ + # Should cache threshold events be logged explicitely? + cache-threshold-log = yes + + [page] +-# Memory error accouting per 4K memory page +-# Threshold for the correct memory errors trigger script ++# Memory error accouting per 4K memory page. ++# Threshold for the correct memory errors trigger script. + memory-ce-threshold = 10 / 24h +-# Trigger script for corrected errors ++ ++# Trigger script for corrected errors. + # memory-ce-trigger = page-error-trigger ++ + # Should page threshold events be logged explicitely? + memory-ce-log = yes ++ + # specify the internal action in mcelog to exceeding a page error threshold + # this is done in addition to executing the trigger script if available + # off no action +diff --git a/mcelog.conf.5 b/mcelog.conf.5 +new file mode 100644 +index 0000000..5a9afda +--- /dev/null ++++ b/mcelog.conf.5 +@@ -0,0 +1,283 @@ ++ ++." Auto generated mcelog.conf manpage. Do not edit. ++.TH "mcelog.conf" 5 "mcelog" ++ ++.SH NAME ++mcelog.conf \- mcelog.conf reference ++.SH SYNOPSIS ++.B /etc/mcelog.conf ++.SH DESCRIPTION ++ ++/etc/mcelog.conf is the main configuration file for ++.B mcelog(8). ++This is configuration file separated into sections including ++a default section. ++ ++ ++General format ++.PP ++.B optionname = value ++.PP ++White space is not allowed in value currently, except at the end where it is dropped ++ ++.PP ++.PP ++In general all command line options that are not commands work here. 
++See man mcelog or mcelog --help for a list. ++e.g. to enable the --no-syslog option use ++.PP ++.B no-syslog = yes (or no to disable) ++.PP ++When the option has a argument ++.PP ++.B logfile = /tmp/logfile ++.PP ++Below are the options which are not command line options. ++.PP ++.PP ++Set cpu type for which mcelog decodes events: ++.PP ++.B cpu = type ++.PP ++For valid values for type please see mcelog --help. ++If this value is set incorrectly the decoded output will be likely incorrect. ++By default when this parameter is not set mcelog uses the CPU it is running on ++on very new kernels the mcelog events reported by the kernel also carry ++the CPU type which is used too when available and not overriden. ++.PP ++.PP ++Enable daemon mode: ++.PP ++.B daemon = yes ++.PP ++By default mcelog just processes the currently pending events and exits. ++In daemon mode it will keep running as a daemon in the background and poll ++the kernel for events and then decode them. ++.PP ++.PP ++Filter out known broken events by default. ++.PP ++.B filter = yes ++.PP ++Don't log memory errors individually. ++They still get accounted if that is enabled. ++.PP ++.B filter-memory-errors = yes ++.PP ++.PP ++Output in undecoded raw format to be easier machine readable ++(default is decoded). ++.PP ++.B raw = yes ++.PP ++.PP ++Set cpu mhz to decode uptime from time stamp counter (output ++unreliable, not needed on new kernels which report the event time ++directly. A lot of systems don't have a linear time stamp clock ++and the output is wrong then. ++Normally mcelog tries to figure out if it the TSC is reliable ++and only uses the current frequency then. ++Setting a frequency forces timestamp decoding. ++This setting is obsolete with modern kernels which report the time ++directly. 
++.PP ++.B cpumhz = 1800.00 ++.PP ++.PP ++Log output options ++Log decoded machine checks in syslog (default stdout or syslog for daemon) ++.PP ++.B syslog = yes ++.PP ++Log decoded machine checks in syslog with error level ++.PP ++.B syslog-error = yes ++.PP ++Never log anything to syslog ++.PP ++.B no-syslog = yes ++.PP ++Append log output to logfile instead of stdout. only when no syslog logging is active ++.PP ++.B logfile = filename ++.PP ++.PP ++Use smbios information to decode dimms (needs root). ++This function is not recommended to use right now and generally not needed. ++The exception is memdb prepopulation, which is configured separately below. ++.PP ++.B dmi = no ++.PP ++.PP ++When in daemon mode run as this user after set up. ++Note that the triggers will run as this user too. ++Setting this to non root will mean that triggers cannot take some corrective ++action, like offlining objects. ++.PP ++.B run-credentials-user = root ++.PP ++.PP ++Group to run as daemon with ++default to the group of the run-credentials-user ++.PP ++.B run-credentials-group = nobody ++.PP ++.PP ++.SS "The server config section" ++User allowed to access client socket. ++when set to * match any ++root is always allowed to access. ++default: root only ++.PP ++.B client-user = root ++.PP ++Group allowed to access mcelog ++When no group is configured any group matches (but still user checking). ++when set to * match any ++.PP ++.B client-group = root ++.PP ++Path to the unix socket for client<->server communication. ++When no socket-path is configured the server will not start ++.PP ++.B socket-path = /var/run/mcelog-client ++.PP ++When mcelog starts it checks if a server is already running. this configures the timeout ++for this check. ++.PP ++.B initial-ping-timeout = 2 ++.PP ++ ++.PP ++.SS "The dimm config section" ++Is the in memory dimm error tracking enabled? ++Only works on systems with integrated memory controller and ++which are supported. 
++Only takes effect in daemon mode. ++.PP ++.B dimm-tracking-enabled = yes ++.PP ++Use dmi information from the bios to prepopulate dimm database. ++Note this might not work with all BIOS and requires mcelog to run as root. ++Alternative is to let mcelog create DIMM objects on demand. ++.PP ++.B dmi-prepopulate = yes ++.PP ++ ++Execute these triggers when the rate of corrected or uncorrected ++Errors per DIMM exceeds the threshold. ++Note when the hardware does not report DIMMs this might also ++be per channel. ++The default of 10/24h is reasonable for server quality ++DDR3 DIMMs as of 2009/10. ++.PP ++.B uc-error-trigger = dimm-error-trigger ++.PP ++.B uc-error-threshold = 1 / 24h ++.PP ++.B ce-error-trigger = dimm-error-trigger ++.PP ++.B ce-error-threshold = 10 / 24h ++.PP ++.PP ++.SS "The socket config section" ++Enable memory error accounting per socket. ++.PP ++.B socket-tracking-enabled = yes ++.PP ++.PP ++Threshold and trigger for uncorrected memory errors on a socket. ++mem-uc-error-trigger = socket-memory-error-trigger ++.PP ++.PP ++.B mem-uc-error-threshold = 100 / 24h ++.PP ++.PP ++Trigger script for corrected memory errors on a socket. ++.PP ++.B mem-ce-error-trigger = socket-memory-error-trigger ++.PP ++.PP ++Threshold on when to trigger a correct error for the socket. ++.PP ++.PP ++.B mem-ce-error-threshold = 100 / 24h ++.PP ++.PP ++ log socket error threshold explicitely? ++.PP ++.B mem-ce-error-log = yes ++.PP ++.PP ++Trigger script for uncorrected bus error events ++.PP ++.B bus-uc-threshold-trigger = bus-error-trigger ++.PP ++.PP ++Trigger script for uncorrected iomca erors ++.PP ++.B iomca-threshold-trigger = iomca-error-trigger ++.PP ++.PP ++Trigger script for other uncategorized errors ++.PP ++.B unknown-threshold-trigger = unknown-error-trigger ++.PP ++.PP ++.SS "The cache config section" ++Processing of cache error thresholds reported by intel cpus. 
++.PP ++.B cache-threshold-trigger = cache-error-trigger ++.PP ++.PP ++Should cache threshold events be logged explicitely? ++.PP ++.B cache-threshold-log = yes ++.PP ++.PP ++.SS "The page config section" ++Memory error accouting per 4k memory page. ++Threshold for the correct memory errors trigger script. ++.PP ++.B memory-ce-threshold = 10 / 24h ++.PP ++.PP ++Trigger script for corrected errors. ++memory-ce-trigger = page-error-trigger ++.PP ++.PP ++Should page threshold events be logged explicitely? ++.PP ++.B memory-ce-log = yes ++.PP ++.PP ++Specify the internal action in mcelog to exceeding a page error threshold ++this is done in addition to executing the trigger script if available ++off no action ++account only account errors ++soft try to soft-offline page without killing any processes ++ This requires an uptodate kernel. Might not be successfull. ++hard try to hard-offline page by killing processes ++ Requires an uptodate kernel. Might not be successfull. ++soft-then-hard First try to soft offline, then try hard offlining ++.PP ++.B memory-ce-action = off|account|soft|hard|soft-then-hard ++.PP ++.B memory-ce-action = soft ++.PP ++.PP ++.SS "The trigger config section" ++Maximum number of running triggers ++.PP ++.B children-max = 2 ++.PP ++Execute triggers in this directory ++.PP ++.B directory = /etc/mcelog ++.PP ++ ++.SH SEE ALSO ++.BR mcelog (8) ++, ++.B http://www.mcelog.org ++ +diff --git a/mcelog.h b/mcelog.h +index 550a0a5..6c097cf 100644 +--- a/mcelog.h ++++ b/mcelog.h +@@ -65,14 +65,18 @@ struct mce { + #define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ + #define MCI_STATUS_S (1ULL<<56) /* signalled */ + #define MCI_STATUS_AR (1ULL<<55) /* action-required */ ++#define MCI_STATUS_FWST (1ULL<<37) /* Firmware updated status indicator */ + + #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ + #define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */ + #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in 
progress */ ++#define MCG_STATUS_LMCES (1ULL<<3) /* local machine check signaled */ + + #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ + #define MCG_TES_P (1ULL<<11) /* Yellow bit cache threshold supported */ + #define MCG_SER_P (1ULL<<24) /* MCA recovery / new status */ ++#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ ++#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */ + + #define NELE(x) (sizeof(x)/sizeof(*(x))) + #define err(x) perror(x),exit(1) +@@ -119,6 +123,9 @@ enum cputype { + CPU_IVY_BRIDGE_EPEX, + CPU_HASWELL, + CPU_HASWELL_EPEX, ++ CPU_BROADWELL, ++ CPU_KNIGHTS_LANDING, ++ CPU_ATOM, + }; + + enum option_ranges { +diff --git a/mcelog.service b/mcelog.service +new file mode 100644 +index 0000000..c5aaf07 +--- /dev/null ++++ b/mcelog.service +@@ -0,0 +1,10 @@ ++[Unit] ++Description=Machine Check Exception Logging Daemon ++After=syslog.target ++ ++[Service] ++ExecStart=/usr/sbin/mcelog --ignorenodev --daemon --foreground ++StandardOutput=syslog ++ ++[Install] ++WantedBy=multi-user.target +diff --git a/mcelog.triggers.5 b/mcelog.triggers.5 +new file mode 100644 +index 0000000..510bbef +--- /dev/null ++++ b/mcelog.triggers.5 +@@ -0,0 +1,231 @@ ++'\" t ++.TH "mcelog.triggers" 5 "mcelog" ++.SH NAME ++mcelog.triggers \- mcelog trigger scripts reference ++.SH SYNOPSIS ++.B /etc/mcelog/bus-error-trigger ++.br ++.B /etc/mcelog/cache-error-trigger ++.br ++.B /etc/mcelog/dimm-error-trigger ++.br ++.B /etc/mcelog/iomca-error-trigger ++.br ++.B /etc/mcelog/page-error-trigger ++.br ++.B /etc/mcelog/socket-memory-error-trigger ++.br ++.B /etc/mcelog/unknown-error-trigger ++.br ++.SH DESCRIPTION ++.BR mcelog(8) ++maintains thresholds of errors using a ++.I leaky-bucket ++algorithm. ++When the number of errors in a specific ++time window exceeds a pre-configured threshold a ++.I trigger ++will be executed. Triggers are usually shell scripts in the ++.B /etc/mcelog ++directory ++but can be also other internal actions. 
Thresholds and triggers ++can be configured in ++.BR mcelog.conf(5) ++ ++Trigger will run as the user configured for mcelog ++in ++.I mcelog.conf, ++by default root. The default trigger action can ++be overridden by specifying a different trigger script in the configuration file. ++Actions in addition to the default trigger ++(like notifying an administrator) can be put into the respective ++.I /etc/mcelog/*.local ++script which is executed after the default action. This allows updating the default ++scripts without overriding local actions. All trigger actions are also ++logged to syslog. ++.PP ++.B "The DIMM and socket memory error triggers" ++.PP ++The ++.B /etc/mcelog/dimm-error-trigger ++and ++.B /etc/mcelog/socket-memory-error-trigger ++scripts are executed when a DIMM or a CPU socket exceeds ++a configured corrected or uncorrected memory error threshold. ++The thresholds are configured in the ++.B mcelog.conf ++.I [dimm] ++and ++.I [socket] ++sections. ++The default triggers log a warning message in the system log. ++The triggers are only executed when mcelog runs as a daemon. ++ ++Arguments are passed as environment variables ++.TS ++tab(:); ++l l. 
++THRESHOLD:human readable threshold status ++MESSAGE:Human readable consolidated error message ++TOTALCOUNT:total corrected or uncorrected count of errors for current DIMM depending on what triggered the event ++LOCATION:Consolidated location as a single string ++DMI_LOCATION:DIMM location from DMI/SMBIOS if available ++DMI_NAME:DIMM identifier from DMI/SMBIOS if available ++DIMM:DIMM number reported by hardware ++CHANNEL:Channel number reported by hardware ++SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM ++CECOUNT:Total corrected error count for DIMM ++UCCOUNT:Total uncorrected error count for DIMM ++LASTEVENT:Time stamp of event that triggered threshold (in time_t format, seconds) ++THRESHOLD_COUNT:Total number of events in current threshold time period of specific type ++.TE ++ ++After the default action local actions in ++.B /etc/mcelog/dimm-error-trigger.local ++or respective ++.B /etc/mcelog/socket-memory-error-trigger.local ++are executed. ++ ++.PP ++.B "The page error trigger" ++.PP ++The ++.B /etc/mcelog/page-error-trigger ++script is ++executed by mcelog in daemon mode when a page ++in memory exceeds a pre-configured corrected or uncorrected error threshold. ++mcelog internally also implements offlining the page through the kernel. ++This is configured through the ++.I [page] ++section of ++.BR mcelog.conf(5) ++.PP ++The environment arguments are the same as for the ++.I dimm-error-trigger ++script ++.PP ++After the default action local actions in ++.I /etc/mcelog/page-error-trigger.local are executed. ++ ++.PP ++.B "The cache error trigger" ++.PP ++The ++.I /etc/mcelog/cache-error-trigger ++shell script is called for cache error handling in daemon mode ++when a CPU reports excessive corrected cache errors. ++This could be an indication of future uncorrected errors. ++.PP ++This trigger is configured through the ++.B [cache] ++section in the ++.BR mcelog.conf(5) ++configuration file. The threshold is defined by the CPU.
The default trigger offlines the affected CPU cores, unless it is the last core running. ++.PP ++Arguments are passed as environment variables ++.TS ++tab(:); ++l l. ++MESSAGE:Human readable error message ++CPU:Linux CPU number that triggered the error ++LEVEL:Cache level affected by error ++TYPE:Cache type affected by error (Data,Instruction,Generic) ++AFFECTED_CPUS:List of CPUs sharing the affected cache ++SOCKETID:Socket ID of affected CPU ++.TE ++.PP ++After the default action local actions in ++.I /etc/mcelog/cache-error-trigger.local are executed. ++.PP ++.B "The bus-uc-threshold-trigger" ++.PP ++The ++.B bus-uc-threshold-trigger ++runs on uncorrected errors on a IO bus. It is configured through the ++.B bus-uc-threshold-trigger ++and ++.B bus-uc-threshold-trigger-threshold ++options in ++.I /etc/mcelog.conf(5). ++By default it logs a message with the error location to the system log. ++After the default action local actions in ++.I /etc/mcelog/bus-uc-error-trigger.local ++are executed. ++.PP ++Arguments are passed as environment variables ++.TS ++tab(:); ++l l. ++MESSAGE:Human readable consolidated error message. ++LOCATION:Consolidated location as a single string ++SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM ++LEVEL:Interconnect level ++PARTICIPATION:Processor Participation (Originator, Responder or Observer) ++REQUEST:Request type (read, write, prefetch, etc.) ++ORIGIN :Memory or IO ++TIMEOUT:The request timed out or not ++.TE ++.PP ++.B "The iomca-error-trigger" ++.PP ++The ++.B iomca-error-trigger ++runs when a socket receives bus or interconnect errors. ++It is configured through the ++.B iomca-error-trigger ++and ++.B iomca-error-trigger-threshold ++options in ++.I /etc/mcelog.conf. By default it logs a message with the error location to the system log. ++After the default action local actions in ++.I /etc/mcelog/iomca-error-trigger.local are executed. 
++.PP ++Arguments are passed as environment variables ++.TS ++tab(:); ++l l. ++MESSAGE:Human readable consolidated error message ++LOCATION:Consolidated location as a single string ++SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM ++CPU:Linux CPU number that triggered the error ++SET:PCI segment number ++BUS:PCI bus number ++DEVICE:PCI device number ++FUNCTION:PCI function number ++.TE ++.PP ++.B "The unknown-error-trigger" ++.PP ++The ++.B unknown-error-trigger ++runs on any errors not otherwise categorized. ++It is configured through the ++.B unknown-error-trigger ++and ++.B unknown-error-trigger-threshold ++options in ++.I /etc/mcelog.conf. ++By default it logs a message to the system log. ++After the default action local actions in ++.I /etc/mcelog/unknown-error-trigger.local ++are executed. ++.PP ++Arguments are passed as environment variables ++.TS ++tab(:); ++l l. ++MESSAGE:Human readable consolidated error message ++LOCATION:Consolidated location as a single string ++SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM ++CPU:Linux CPU number that triggered the error ++STATUS:IA32_MCi_STATUS register value ++ADDR:IA32_MCi_ADDR register value ++MISC:IA32_MCi_MISC register value ++MCGSTATUS:IA32_MCG_STATUS register value ++MCGCAP:IA32_MCG_CAP register value ++.TE ++.SH SEE ALSO ++http://www.mcelog.org ++ ++.B mcelog(8), ++.B mcelog.conf(5) +diff --git a/memdb.c b/memdb.c +index bde8113..7a33750 100644 +--- a/memdb.c ++++ b/memdb.c +@@ -270,6 +270,7 @@ static void dump_errtype(char *name, struct err_type *e, FILE *f, enum printflag + int all = (flags & DUMP_ALL); + char *s; + ++ bucket_age(bc, &e->bucket, bucket_time()); + if (e->count || e->bucket.count || all) + fprintf(f, "%s:\n", name); + if (e->count || all) { +@@ -382,7 +383,7 @@ parse_dimm_addr(char *bl, unsigned *socketid, unsigned *channel, unsigned *dimm) + } + + /* Prepopulate DIMM database from BIOS information */ +-void prefill_memdb(void) ++void 
prefill_memdb(int do_dmi) + { + static int initialized; + int i; +@@ -395,7 +396,7 @@ void prefill_memdb(void) + if (!memdb_enabled) + return; + initialized = 1; +- if (config_bool("dimm", "dmi-prepopulate") == 0) ++ if (config_bool("dimm", "dmi-prepopulate") == 0 || !do_dmi) + return; + if (opendmi() < 0) + return; +diff --git a/memdb.h b/memdb.h +index 5c68581..afc3348 100644 +--- a/memdb.h ++++ b/memdb.h +@@ -11,7 +11,7 @@ enum printflags { + DUMP_BIOS = (1 << 1), + }; + +-void prefill_memdb(void); ++void prefill_memdb(int do_dmi); + void memdb_config(void); + void dump_memory_errors(FILE *f, enum printflags flags); + +diff --git a/p4.c b/p4.c +index f938196..2bf1eee 100644 +--- a/p4.c ++++ b/p4.c +@@ -317,6 +317,10 @@ static int decode_mci(__u64 status, __u64 misc, int cpu, unsigned mcgcap, int *i + if (status & (MCI_STATUS_S|MCI_STATUS_AR)) + Wprintf("%s\n", arstate[(status >> 55) & 3]); + ++ if ((mcgcap & MCG_SER_P) && (status & MCI_STATUS_FWST)) { ++ Wprintf("Firmware may have updated this error\n"); ++ } ++ + if ((mcgcap == 0 || (mcgcap & MCG_TES_P)) && !(status & MCI_STATUS_UC)) { + track = (status >> 53) & 3; + decode_tracking(track); +@@ -334,6 +338,8 @@ static void decode_mcg(__u64 mcgstatus) + Wprintf("EIPV "); + if (mcgstatus & MCG_STATUS_MCIP) + Wprintf("MCIP "); ++ if (mcgstatus & MCG_STATUS_LMCES) ++ Wprintf("LMCE "); + Wprintf("\n"); + } + +diff --git a/server.c b/server.c +index 344eb38..a1fa7da 100644 +--- a/server.c ++++ b/server.c +@@ -291,7 +291,7 @@ static int server_ping(struct sockaddr_un *un) + { + struct sigaction oldsa; + struct sigaction sa = { .sa_handler = ping_timeout }; +- int ret = -1, n; ++ int ret, n; + char buf[10]; + int fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (fd < 0) +@@ -299,6 +299,7 @@ static int server_ping(struct sockaddr_un *un) + + sigaction(SIGALRM, &sa, &oldsa); + if (sigsetjmp(ping_timeout_ctx, 1) == 0) { ++ ret = 0; + alarm(initial_ping_timeout); + if (connect(fd, un, sizeof(struct sockaddr_un)) < 0) + goto 
cleanup; +@@ -308,7 +309,8 @@ static int server_ping(struct sockaddr_un *un) + goto cleanup; + if (n == 5 && !memcmp(buf, "pong\n", 5)) + ret = 0; +- } ++ } else ++ ret = -1; + cleanup: + sigaction(SIGALRM, &oldsa, NULL); + alarm(0); +diff --git a/tests/test b/tests/test +index 35bebd2..148bf1f 100755 +--- a/tests/test ++++ b/tests/test +@@ -17,6 +17,8 @@ if [ "$(whoami)" != "root" ] ; then + exit 1 + fi + ++[ ! -f /dev/mce-inject ] && modprobe mce-inject ++ + echo "++++++++++++ running $1 test +++++++++++++++++++" + + # disable trigger +diff --git a/trigger.c b/trigger.c +index 19466a6..5caca34 100644 +--- a/trigger.c ++++ b/trigger.c +@@ -115,11 +115,18 @@ static void finish_child(pid_t child, int status) + static void child_handler(int sig, siginfo_t *si, void *ctx) + { + int status; ++ pid_t pid; ++ + if (waitpid(si->si_pid, &status, WNOHANG) < 0) { + SYSERRprintf("Cannot collect child %d", si->si_pid); + return; + } + finish_child(si->si_pid, status); ++ ++ /* Check other child(ren)'s status to avoid zombie process */ ++ while ((pid = waitpid(-1, &status, WNOHANG)) > 0) { ++ finish_child(pid, status); ++ } + } + + void trigger_setup(void) +diff --git a/triggers/bus-error-trigger b/triggers/bus-error-trigger +old mode 100644 +new mode 100755 +diff --git a/triggers/iomca-error-trigger b/triggers/iomca-error-trigger +old mode 100644 +new mode 100755 +diff --git a/triggers/unknown-error-trigger b/triggers/unknown-error-trigger +old mode 100644 +new mode 100755 +index b924a0e..fa2866c +--- a/triggers/unknown-error-trigger ++++ b/triggers/unknown-error-trigger +@@ -9,7 +9,7 @@ + # CPU Linux CPU number that triggered the error + # STATUS IA32_MCi_STATUS register value + # ADDR IA32_MCi_ADDR register value +-# MISC IA32_MCi_MISC regiser value ++# MISC IA32_MCi_MISC register value + # MCGSTATUS IA32_MCG_STATUS register value + # MCGCAP IA32_MCG_CAP register value + # For details on the register layout please see the Intel SDM http://www.intel.com/sdm diff --git 
a/SOURCES/mcelog-update-f30da3d.patch b/SOURCES/mcelog-update-f30da3d.patch new file mode 100644 index 0000000..1cd0a25 --- /dev/null +++ b/SOURCES/mcelog-update-f30da3d.patch @@ -0,0 +1,212 @@ +diff --git a/dmi.c b/dmi.c +index 373837e..290a053 100644 +--- a/dmi.c ++++ b/dmi.c +@@ -174,8 +174,10 @@ check_symbol: + if (fclose(efi_systab) != 0) + perror(filename); + +- if (!ret) +- Eprintf("%s: SMBIOS entry point missing", filename); ++ if (!ret || !*address){ ++ Lprintf("No valid SMBIOS entry point: Continue without DMI decoding"); ++ return 0; ++ } + + if (verbose) + printf("%s: SMBIOS entry point at 0x%08lx\n", filename, +@@ -224,6 +226,8 @@ int opendmi(void) + } + a = (struct anchor*)((char*)abase + (entry_point_addr - addr_start)); + goto fill_entries; ++ } else { ++ return -1; + } + + legacy: +diff --git a/input/GENCACHE b/input/GENCACHE +index 71f1d1a..bcf689d 100755 +--- a/input/GENCACHE ++++ b/input/GENCACHE +@@ -1,4 +1,4 @@ +-#!/bin/sh ++#!/bin/bash + # GENCACHE cpu level type track + # generate a memory error. All fields are optional. + # see SDM 3a chapter 15 for details +diff --git a/input/GENMEM b/input/GENMEM +index c0a4c53..caa61b9 100755 +--- a/input/GENMEM ++++ b/input/GENMEM +@@ -1,4 +1,4 @@ +-#!/bin/sh ++#!/bin/bash + # GENMEM socketid channel dimm corr-err-cnt uc-flag + # generate a memory error. All fields are optional. + # suitable to be fed into mce-inject or mcelog --ascii +diff --git a/input/GENPAGE b/input/GENPAGE +index c63d607..14c20ba 100755 +--- a/input/GENPAGE ++++ b/input/GENPAGE +@@ -1,4 +1,4 @@ +-#!/bin/sh ++#!/bin/bash + # GENMPAGE pfn socketid channel dimm corr-err-cnt + # generate a memory error on a page. All fields are optional. 
+ # dimm/channel can be out of sync with the address +diff --git a/intel.c b/intel.c +index ba353c2..0f5abac 100644 +--- a/intel.c ++++ b/intel.c +@@ -67,7 +67,8 @@ enum cputype select_intel_cputype(int family, int model) + return CPU_IVY_BRIDGE; + else if (model == 0x3e) + return CPU_IVY_BRIDGE_EPEX; +- else if (model == 0x3c || model == 0x45 || model == 0x46) ++ else if (model == 0x3c || model == 0x3f || model == 0x45 || ++ model == 0x46) + return CPU_HASWELL; + if (model > 0x1a) { + Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n", +diff --git a/mcelog.8 b/mcelog.8 +index fa923e7..f8a77c4 100644 +--- a/mcelog.8 ++++ b/mcelog.8 +@@ -181,7 +181,13 @@ With the + .B \-\-daemon + option mcelog will run in the background. This gives the fastest reaction + time and is the recommended operating mode. +-This option implies ++If an output option isn't selected ( ++.I \-\-logfile ++or ++.I \-\-syslog ++or ++.I \-\-syslog-error ++), this option implies + .I \-\-logfile=/var/log/mcelog. 
+ Important messages will be logged as one-liner summaries to syslog + unless +diff --git a/mcelog.c b/mcelog.c +index 7ceb43d..d90589f 100644 +--- a/mcelog.c ++++ b/mcelog.c +@@ -508,11 +508,12 @@ int is_cpu_supported(void) + } + if (seen == ALL) { + if (!strcmp(vendor,"AuthenticAMD")) { +- if (family == 15) ++ if (family == 15) { + cputype = CPU_K8; +- if (family >= 15) +- SYSERRprintf("AMD Processor family %d: Please load edac_mce_amd module.\n", family); +- return 0; ++ } else if (family >= 16) { ++ SYSERRprintf("AMD Processor family %d: Please use the edac_mce_amd module instead.\n", family); ++ return 0; ++ } + } else if (!strcmp(vendor,"GenuineIntel")) + cputype = select_intel_cputype(family, model); + /* Add checks for other CPUs here */ +@@ -1069,11 +1070,8 @@ static int modifier(int opt) + break; + case O_DAEMON: + daemon_mode = 1; +- if (!logfile && !foreground) +- logfile = logfile_default; + if (!(syslog_opt & SYSLOG_FORCE)) + syslog_opt = SYSLOG_REMARK|SYSLOG_ERROR; +- + break; + case O_FILE: + inputfile = optarg; +@@ -1082,8 +1080,6 @@ static int modifier(int opt) + foreground = 1; + if (!(syslog_opt & SYSLOG_FORCE)) + syslog_opt = SYSLOG_FORCE; +- if (logfile == logfile_default) +- logfile = NULL; + break; + case O_NUMERRORS: + numerrors = atoi(optarg); +@@ -1110,6 +1106,9 @@ static int modifier(int opt) + + static void modifier_finish(void) + { ++ if(!foreground && daemon_mode && !logfile && !(syslog_opt & SYSLOG_LOG)) { ++ logfile = logfile_default; ++ } + if (logfile) { + if (open_logfile(logfile) < 0) { + if (daemon_mode && !(syslog_opt & SYSLOG_FORCE)) +@@ -1174,8 +1173,8 @@ static void drop_cred(void) + static void process(int fd, unsigned recordlen, unsigned loglen, char *buf) + { + int i; +- int len; +- int finish = 0; ++ int len, count; ++ int finish = 0, flags; + + if (recordlen == 0) { + Wprintf("no data in mce record\n"); +@@ -1188,7 +1187,14 @@ static void process(int fd, unsigned recordlen, unsigned loglen, char *buf) + return; + } + 
+- for (i = 0; (i < len / (int)recordlen) && !finish; i++) { ++ count = len / (int)recordlen; ++ if (count == (int)loglen) { ++ if ((ioctl(fd, MCE_GETCLEAR_FLAGS, &flags) == 0) && ++ (flags & (1 << MCE_OVERFLOW))) ++ Eprintf("Warning: MCE buffer is overflowed.\n"); ++ } ++ ++ for (i = 0; (i < count) && !finish; i++) { + struct mce *mce = (struct mce *)(buf + i*recordlen); + mce_prepare(mce); + if (numerrors > 0 && --numerrors == 0) +diff --git a/p4.c b/p4.c +index 93b59f3..86e7dc5 100644 +--- a/p4.c ++++ b/p4.c +@@ -147,6 +147,7 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) + [3] = "External error", + [4] = "FRC error", + [5] = "Internal parity error", ++ [6] = "SMM Handler Code Access Violation", + }; + + if (mca & (1UL << 12)) { +diff --git a/tests/test b/tests/test +index c673eb2..35bebd2 100755 +--- a/tests/test ++++ b/tests/test +@@ -1,4 +1,4 @@ +-#!/bin/sh ++#!/bin/bash + # simple test harness for mcelog daemon trigger test cases + # ./test subdir [debugger] + # run mcelog test in specific sub directory +diff --git a/triggers/cache-error-trigger b/triggers/cache-error-trigger +index e32bfd6..beb5f07 100755 +--- a/triggers/cache-error-trigger ++++ b/triggers/cache-error-trigger +@@ -15,16 +15,11 @@ + # this can be changed in mcelog.conf + + # +-# offline the CPUs (except CPU #0) sharing the affected cache ++# offline the CPUs sharing the affected cache + # + EXIT=0 + + for i in $AFFECTED_CPUS ; do +- if [ $i = 0 ] ; then +- logger -s -p daemon.warn -t mcelog "Not offlining CPU 0" +- EXIT=1 +- continue +- fi + logger -s -p daemon.crit -t mcelog "Offlining CPU $i due to cache error threshold" + F=$(printf "/sys/devices/system/cpu/cpu%d/online" $i) + echo 0 > $F diff --git a/SOURCES/mcelog.conf b/SOURCES/mcelog.conf new file mode 100644 index 0000000..163f744 --- /dev/null +++ b/SOURCES/mcelog.conf @@ -0,0 +1,59 @@ +# +# config file for mcelog +# For further options, see the mcelog manpage and documentation +# +# by default, 
disable extended error logging on newer Intel processors +no-imc-log = yes + +# Filter out known broken events by default +filter = yes +# don't log memory errors individually +#filter-memory-errors = yes + +# output in undecoded raw format to be easier machine readable +#raw = yes + +[server] +# An upstream bug prevents this from being disabled +# Only allow root to connect by default +client-user = root +# Path to socket client uses to connect +socket-path = /var/run/mcelog-client + +[dimm] +# Enable DIMM-tracking +dimm-tracking-enabled = yes +# Disable DIMM DMI pre-population unless supported on your system +dmi-prepopulate = no + +# execute these triggers when the rate of corrected or uncorrected +# errors per DIMM exceeds the threshold +uc-error-trigger = dimm-error-trigger +uc-error-threshold = 1 / 24h +ce-error-trigger = dimm-error-trigger +ce-error-threshold = 10 / 24h + +[socket] +# Memory error accounting per socket +socket-tracking-enabled = yes +mem-uc-error-threshold = 100 / 24h +mem-ce-error-trigger = socket-memory-error-trigger +mem-ce-error-threshold = 100 / 24h +mem-ce-error-log = yes + +[cache] +# Attempt to off-line CPUs causing cache errors +cache-threshold-trigger = cache-error-trigger +cache-threshold-log = yes + +[page] +# Try to soft-offline a 4K page if it exceeds the threshold +memory-ce-threshold = 10 / 24h +memory-ce-trigger = page-error-trigger +memory-ce-log = yes +memory-ce-action = soft + +[trigger] +# Maximum number of running triggers +children-max = 2 +directory = /etc/mcelog/triggers diff --git a/SOURCES/mcelog.service b/SOURCES/mcelog.service new file mode 100644 index 0000000..1371d5e --- /dev/null +++ b/SOURCES/mcelog.service @@ -0,0 +1,19 @@ +[Unit] +Description=Machine Check Exception Logging Daemon +After=syslog.target + +# FIXME - due to upstream kernel bug always start the mcelog process +# twice using the following ExecStartPre hack. This needs fixing.
+# There is a bug filed against systemd for the ExecStartPre bit +# since it is not possible to specify that the ExecStartPre bit +# is allowed and expected to fail without aborting the daemon. + +[Service] +Type=forking +ExecStartPre=/etc/mcelog/mcelog.setup +ExecStart=/usr/sbin/mcelog --ignorenodev --daemon --syslog +SuccessExitStatus=0 15 +StandardOutput=syslog + +[Install] +WantedBy=multi-user.target diff --git a/SOURCES/mcelog.setup b/SOURCES/mcelog.setup new file mode 100644 index 0000000..c1966b8 --- /dev/null +++ b/SOURCES/mcelog.setup @@ -0,0 +1,12 @@ +#!/bin/sh +# +# An upstream kernel bug prevents mcelog from starting normally in +# daemon mode the first time it is run. So, in the systemd service, +# we want to start it twice - once as an ExecStartPre that will fail. +# But systemd will abort the process if the "pre" fails, so we use +# this script - temporarily - to start the first process. +# +# Waiting on Andi Kleen to fix upstream. +# +/usr/sbin/mcelog --ignorenodev --syslog --foreground +exit 0 diff --git a/SPECS/mcelog.spec b/SPECS/mcelog.spec new file mode 100644 index 0000000..4877675 --- /dev/null +++ b/SPECS/mcelog.spec @@ -0,0 +1,296 @@ +%define last_tar_git_commit d2e13bf0 +%define last_git_commit 94d853b2ea81 + +Summary: Tool to translate x86-64 CPU Machine Check Exception data +Name: mcelog +Version: 144 +Release: 10.%{last_git_commit}%{?dist} +Epoch: 3 +Group: System Environment/Base +License: GPLv2 +Source0: mcelog-%{last_tar_git_commit}.tar.bz2 +# note that this source OVERRIDES the one on the tarball above!
+Source1: mcelog.conf +Source2: mcelog.service +Source10: mcelog.setup +Patch0: mcelog-fix-trigger-path-and-cacheing.patch +# BZ 1039183: Add Haswell and correct Ivy Bridge +Patch1: mcelog-update-2577aeb.patch +Patch2: mcelog-update-f30da3d.patch +# BZ 1138319: Add additional Haswell support (see patch for additional info) +Patch3: mcelog-haswell-support.patch +Patch4: mcelog-update-9de4924.patch +Patch5: mcelog-update-e7e0ac1.patch +Patch6: mcelog-patch-1bd2984.patch +Patch7: mcelog-update-e4aca63.patch +Patch8: mcelog-update-94d853b2ea81.patch +Patch9: mcelog-patch-e9aeed03f3d1.patch +Patch10: mcelog-patch-cfa11588ad8b.patch +# Patches 11-15 below can be removed on the next full code update. +Patch11: mcelog-patch-commit-916015663906.patch +Patch12: mcelog-patch-0755b55af.patch +Patch13: mcelog-patch-59b8cab3f.patch +Patch14: mcelog-patch-f8f1490cb.patch +Patch15: mcelog-patch-595a2dcfe.patch +Patch16: mcelog-patch-6ed93e30f835.patch +Patch17: mcelog-patch-d1f37aae14d4.patch +URL: https://github.com/andikleen/mcelog.git +Buildroot: %{_tmppath}/%{name}-%{version}-root +ExclusiveArch: i686 x86_64 +Requires(post): systemd-units +Requires(preun): systemd-units +Requires(postun): systemd-units +BuildRequires: systemd + +%description +mcelog is a utility that collects and decodes Machine Check Exception data +on x86-32 and x86-64 systems. It can be run either as a daemon, or by cron. 
+ +%prep +%setup -q -n %{name}-%{last_tar_git_commit} +%patch0 -p1 +%patch1 -p1 +%patch2 -p1 +%patch3 -p1 +%patch4 -p1 +%patch5 -p1 +%patch6 -p1 +%patch7 -p1 +%patch8 -p1 +%patch9 -p1 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 +%patch14 -p1 +%patch15 -p1 +%patch16 -p1 +%patch17 -p1 + +%build +mkdir -p $RPM_BUILD_ROOT/%{_sysconfdir} +mkdir -p $RPM_BUILD_ROOT/%{_sbindir} +mkdir -p $RPM_BUILD_ROOT/%{_mandir} + +# Make sure mcelog --version and 'rpm -q mcelog' are consistent +echo "%{name}-%{version}-%{release}" > .os_version + +make CFLAGS="$RPM_OPT_FLAGS -Wl,-z,relro,-z,now -fpie" LDFLAGS="-Wl,-z,relro,-z,now -fpie -pie" + +%install +rm -rf $RPM_BUILD_ROOT +mkdir -p $RPM_BUILD_ROOT/%{_mandir}/man{1,5,8} +mkdir -p $RPM_BUILD_ROOT/%{_sysconfdir}/mcelog +mkdir -p $RPM_BUILD_ROOT/%{_sysconfdir}/mcelog/triggers +mkdir -p $RPM_BUILD_ROOT/%{_unitdir} +mkdir -p $RPM_BUILD_ROOT/%{_sysconfdir}/cron.hourly +mkdir -p $RPM_BUILD_ROOT/%{_sbindir} +install -p -m755 mcelog $RPM_BUILD_ROOT/%{_sbindir}/mcelog +install -p -m644 %{SOURCE1} $RPM_BUILD_ROOT/%{_sysconfdir}/mcelog/mcelog.conf +install -p -m755 %{SOURCE10} $RPM_BUILD_ROOT/%{_sysconfdir}/mcelog/mcelog.setup +install -p -m755 triggers/cache-error-trigger $RPM_BUILD_ROOT/%{_sysconfdir}/mcelog/triggers/cache-error-trigger +install -p -m755 triggers/dimm-error-trigger $RPM_BUILD_ROOT/%{_sysconfdir}/mcelog/triggers/dimm-error-trigger +install -p -m755 triggers/page-error-trigger $RPM_BUILD_ROOT/%{_sysconfdir}/mcelog/triggers/page-error-trigger +install -p -m755 triggers/socket-memory-error-trigger $RPM_BUILD_ROOT/%{_sysconfdir}/mcelog/triggers/socket-memory-error-trigger +install -p -m755 mcelog.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.hourly/mcelog.cron +install -p -m644 %{SOURCE2} $RPM_BUILD_ROOT%{_unitdir}/mcelog.service +install -p -m644 mcelog.8 $RPM_BUILD_ROOT/%{_mandir}/man8 +install -p -m644 mcelog.conf.5 $RPM_BUILD_ROOT/%{_mandir}/man5 +install -p -m644 mcelog.triggers.5 $RPM_BUILD_ROOT/%{_mandir}/man5 + 
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%post
+case "$1" in
+    1) # Initial installation: enable the unit; '|| :' keeps the scriptlet exit status at zero
+        systemctl enable mcelog.service > /dev/null 2>&1 || systemctl daemon-reload > /dev/null 2>&1 || :
+        ;;
+    2) # This is an upgrade - don't reactivate the service again
+        :
+        ;;
+esac
+
+%preun
+# Handle removing mcelog ($1 == 0); '|| :' keeps a systemctl failure from aborting the erase
+if [ "$1" -eq 0 ]; then
+    systemctl disable mcelog.service > /dev/null 2>&1 || :
+    systemctl stop mcelog.service > /dev/null 2>&1 || :
+fi
+
+%postun
+# Handle upgrading mcelog ($1 >= 1); '|| :' keeps the scriptlet exit status at zero
+if [ "$1" -ge 1 ]; then
+    systemctl try-restart mcelog.service > /dev/null 2>&1 || :
+fi
+
+%files
+%defattr(-,root,root,-)
+%doc README.md CHANGES
+%{_sbindir}/mcelog
+%dir %{_sysconfdir}/mcelog
+%{_sysconfdir}/mcelog/triggers
+%config(noreplace) %{_sysconfdir}/mcelog/mcelog.conf
+%{_sysconfdir}/mcelog/mcelog.setup
+%{_sysconfdir}/cron.hourly/mcelog.cron
+%{_unitdir}/mcelog.service
+%attr(0644,root,root) %{_mandir}/*/*
+
+%changelog
+* Mon Feb 25 2019 Prarit Bhargava - 3:144.10.94d853b2ea81
+- mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems [1641043]
+- mcelog: Add decoding for Optane DC persistent memory mode [1645345]
+* Fri Jun 29 2018 Prarit Bhargava - 3:144.9.94d853b2ea81
+- Print microcode version when the kernel provides it [1593109]
+* Tue Oct 17 2017 Prarit Bhargava - 3:144.8.94d853b2ea81
+- Fix typo in spec file for .os_version[1454419]
+* Tue Oct 17 2017 Prarit Bhargava - 3:144.7.94d853b2ea81
+- Fix mcelog --version [1454419]
+* Mon Oct 16 2017 Prarit Bhargava - 3:144.6.94d853b2ea81
+- Fix return value from 'mcelog --help' [1481421]
+* Thu Oct 5 2017 Prarit Bhargava - 3:144.5.94d853b2ea81
+- Fix mcelog.service file enable/disable after install & upgrade [1413284]
+* Tue Sep 19 2017 Prarit Bhargava - 3:144.4.94d853b2ea81
+- Cleanup spec and patch files [1493151]
+* Thu Apr 27 2017 Prarit Bhargava - 3:144.3.94d853b2ea81
+- Fix "warning: 16 bytes ignored in each record" warning [1445809]
+* Thu Feb 2 2017 Prarit Bhargava - 3:144.2.94d853b2ea81
+- mcelog: is_cpu_supported() error message must 
be printed Eprintf [1406626] +* Wed Nov 30 2016 Prarit Bhargava - 3:144.1.94d853b2ea81 +- update NVR to 144 to match upstream +- add Denverton SoC support [1273768] +- add Kabylake U/Y support, 0x8E [1310954] +- add Kabylake H/S support, 0x9E [1310955] +- add Knights Mill support [1381316] +- mcelog didn't remove /var/run/mcelog-client when exiting [1362123] +* Mon Oct 24 2016 Prarit Bhargava - 3:136.2.e4aca63 +- fix post-uninstall script warning during upgrade [1257116] +* Fri May 13 2016 Prarit Bhargava - 3:136-1.e4aca63 +- update NVR to 136 to match upstream [1336431] +- additional general fixes [1336431] +- Skylake Client support (6,94) (6,78) [1255571] +- Broadwell SoC/DE, EP & EX support (6,79) [1255572] +* Mon Sep 21 2015 Prarit Bhargava - 3:120-3.e7e0ac1 +- Fix server restart when /var/run/mcelog-client socket exists [1256714] +* Fri Jun 12 2015 Prarit Bhargava - 3:120-2.e7e0ac1 +- add RELRO and PIE [1092567] +* Fri Jun 12 2015 Prarit Bhargava - 3:120-1.e7e0ac1 +- Add Broadwell-U, Broadwell-DE, and Knights Landing/Xeon Phi Support +- additional general fixes +- add mcelog.conf and mcelog.triggers man pages +- update NVR to 120 to match upstream +* Mon Oct 27 2014 Prarit Bhargava - 3:101-3.9de4924 +- Update with latest minor fixes, no new support [1157683] + +* Mon Sep 8 2014 Prarit Bhargava - 3:101-2.f30da3d +- Additional Haswell Support [1138319] + +* Thu Sep 4 2014 Prarit Bhargava - 3:101-1.f30da3d +- Update to upstream NVR (101) [1136989] + +* Wed Sep 3 2014 Prarit Bhargava - 2:1.0-0.13.f30da3d +- Update to upstream commit f30da3d, minor fixes, no features [1085134] +- Add /var/log/mcelog file [1098864] +- remove .src.rpm file [1038755] + +* Wed Jan 22 2014 Prarit Bhargava - 2:1.0-0.12.2577aeb +- Add Haswell client cpuids, identify Ivy Bridge properly, and fix issues + on Ivy Bridge + +* Fri Dec 27 2013 Daniel Mach - 2:1.0-0.11.d2e13bf0 +- Mass rebuild 2013-12-27 + +* Tue Dec 3 2013 Prarit Bhargava 2:1.0-0.10.d2e13bf0 +- Fix prebuilt binaries issue in 
tarball [1037730] + +* Thu Nov 21 2013 Prarit Bhargava 2:1.0-0.9.d2e13bf0 +- disable extended logging support [1028645] + +* Wed May 15 2013 Prarit Bhargava 2:1.0-0.8.d2e13bf0 +- update to commit d2e13bf0 [963287] + +* Thu Feb 14 2013 Fedora Release Engineering - 2:1.0-0.7.6e4e2a00 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_19_Mass_Rebuild + +* Thu Jul 19 2012 Fedora Release Engineering - 2:1.0-0.6.6e4e2a00 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_18_Mass_Rebuild + +* Thu Apr 26 2012 Jon Ciesla - 2:1.0-0.5.6e4e2a00 +- Merge review fixes, BZ 226132. + +* Fri Jan 13 2012 Fedora Release Engineering - 2:1.0-0.4.6e4e2a00 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_17_Mass_Rebuild + +* Thu Nov 17 2011 Prarit Bhargava 2:1.0-0.3.6e4e2a00 +- Updated sources to deal with various warning issues [701083] [704302] +- Update URL for new location of Andi's mcelog tree +- Update n-v-r to include latest git hash + +* Tue Feb 08 2011 Fedora Release Engineering - 2:1.0-0.3.pre3 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_15_Mass_Rebuild + +* Wed Nov 10 2010 Jon Masters 2:1.0-0.2.pre3 +- Rework mcelog to use daemon mode and systemd. + +* Tue Nov 09 2010 Jon Masters 2:1.0-0.1.pre3 +- Bump epoch and use standard Fedora Packaging Guidelines for NVR. +- Switch to using signed bz2 source and remove dead patch. + +* Fri Sep 17 2010 Dave Jones 1:1.0pre3-0.1 +- Update to upstream mcelog-1.0pre3 + +* Mon Oct 05 2009 Orion Poplawski - 1:0.9pre1-0.1 +- Update to 0.9pre1 +- Update URL +- Add patch to update mcelog kernel record length (bug #507026) + +* Tue Aug 04 2009 Adam Jackson 0.7-5 +- Fix %%install for new buildroot cleanout. 
+ +* Sat Jul 25 2009 Fedora Release Engineering - 1:0.7-4 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_12_Mass_Rebuild + +* Wed Feb 25 2009 Fedora Release Engineering - 1:0.7-3 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_11_Mass_Rebuild + +* Thu Aug 7 2008 Tom "spot" Callaway - 1:0.7-2 +- fix license tag +- clean this package up + +* Tue Feb 19 2008 Fedora Release Engineering - 1:0.7-1.22 +- Autorebuild for GCC 4.3 + +* Mon Jul 17 2006 Jesse Keating +- Rebuild. + +* Fri Jun 30 2006 Dave Jones +- Rebuild. (#197385) + +* Wed May 17 2006 Dave Jones +- Update to upstream 0.7 +- Change frequency to hourly instead of daily. + +* Thu Feb 09 2006 Dave Jones +- rebuild. + +* Wed Feb 8 2006 Dave Jones +- Update to upstream 0.6 + +* Mon Dec 19 2005 Dave Jones +- Update to upstream 0.5 + +* Fri Dec 16 2005 Jesse Keating +- rebuilt for new gcj + +* Fri Dec 09 2005 Jesse Keating +- rebuilt + +* Tue Mar 1 2005 Dave Jones +- Rebuild for gcc4 + +* Wed Feb 9 2005 Dave Jones +- Update to upstream 0.4 + +* Thu Jan 27 2005 Dave Jones +- Initial packaging. +