diff --git a/SOURCES/mcelog-haswell-support.patch b/SOURCES/mcelog-haswell-support.patch new file mode 100644 index 0000000..8c05ce7 --- /dev/null +++ b/SOURCES/mcelog-haswell-support.patch @@ -0,0 +1,348 @@ +The patches were in the process of being committed to Andi's upstream mcelog +tree when they were applied to the RHEL source. The patch subjects are + + Add better decoding support for Haswell server processors + More compact data structures for reporting SNB/IVB memory controller errors + +and were provided early by Tony Luck @ Intel. + +diff -urNp mcelog-d2e13bf0.orig/haswell.c mcelog-d2e13bf0/haswell.c +--- mcelog-d2e13bf0.orig/haswell.c 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/haswell.c 2014-09-08 09:59:52.998327718 -0400 +@@ -0,0 +1,150 @@ ++/* Copyright (C) 2013 Intel Corporation ++ Decode Intel Haswell specific machine check errors. ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. 
++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system; if not, write to the Free Software Foundation, ++ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ Author: Tony Luck ++*/ ++ ++#include "mcelog.h" ++#include "bitfield.h" ++#include "haswell.h" ++#include "memdb.h" ++ ++/* See IA32 SDM Vol3B Table 16-20 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT", ++ [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT", ++ [0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT", ++ [0x13] = "MC_DMI_TRAINING_TIMEOUT", ++ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX", ++ [0x25] = "MC_SVID_COMMAN_TIMEOUT", ++ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID", ++ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF", ++ [0x44] = "MC_CRITICAL_VR_FAILED", ++ [0x45] = "MC_ICC_MAX_NOTSUPPORTED", ++ [0x46] = "MC_VID_RAMP_DOWN_FAILED", ++ [0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP", ++ [0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED", ++ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED", ++ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0", ++ [0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1", ++ [0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2", ++ [0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3", ++ [0x4F] = "MC_SVID_COMMAND_ERROR", ++ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT", ++ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT", ++ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED", ++ [0x58] = "MC_SVID_IMON_REQUEST_FAILED", ++ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED", ++ [0x60] = "MC_INVALID_PKGS_REQ_PCH", ++ [0x61] = "MC_INVALID_PKGS_REQ_QPI", ++ [0x62] = "MC_INVALID_PKGS_RSP_QPI", ++ [0x63] = "MC_INVALID_PKGS_RSP_PCH", ++ [0x64] = "MC_INVALID_PKG_STATE_CONFIG", ++ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT", ++ [0x68] = "MC_IMC_RW_SMBUS_TIMEOUT", ++ [0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED", ++ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE", ++ 
[0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER", ++ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ", ++ [0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT", ++ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT" ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-21 */ ++ ++static char *qpi[] = { ++ [0x02] = "Intel QPI physical layer detected drift buffer alarm", ++ [0x03] = "Intel QPI physical layer detected latency buffer rollover", ++ [0x10] = "Intel QPI link layer detected control error from R3QPI", ++ [0x11] = "Rx entered LLR abort state on CRC error", ++ [0x12] = "Unsupported or undefined packet", ++ [0x13] = "Intel QPI link layer control error", ++ [0x15] = "RBT used un-initialized value", ++ [0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization", ++ [0x21] = "Link failover data self healing", ++ [0x22] = "Phy detected in-band reset (no width change)", ++ [0x23] = "Link failover clock failover", ++ [0x30] = "Rx detected CRC error - successful LLR after Phy re-init", ++ [0x31] = "Rx detected CRC error - successful LLR wihout Phy re-init", ++}; ++ ++static struct field qpi_mc[] = { ++ FIELD(16, qpi), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-22 */ ++ ++static struct field memctrl_mc9[] = { ++ SBITFIELD(16, "DDR3 address parity error"), ++ SBITFIELD(17, "Uncorrected HA write data error"), ++ SBITFIELD(18, "Uncorrected HA data byte enable error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ SBITFIELD(23, "Corrected memory read error"), ++ SBITFIELD(24, "iMC write data buffer parity error"), ++ SBITFIELD(25, "DDR4 command address parity error"), ++ {} ++}; ++ ++void hsw_decode_model(int cputype, int bank, u64 status, u64 misc) ++{ ++ switch (bank) { ++ case 4: ++ Wprintf("PCU: "); ++ switch 
(EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ Wprintf("Internal errors "); ++ break; ++ case 0x406: ++ Wprintf("Intel TXT errors "); ++ break; ++ case 0x407: ++ Wprintf("Other UBOX Internal errors "); ++ break; ++ } ++ if (EXTRACT(status, 16, 19)) ++ Wprintf("PCU internal error "); ++ decode_bitfield(status, pcu_mc4); ++ break; ++ case 5: ++ case 20: ++ case 21: ++ Wprintf("QPI: "); ++ decode_bitfield(status, qpi_mc); ++ break; ++ case 9: case 10: case 11: case 12: ++ case 13: case 14: case 15: case 16: ++ Wprintf("MemCtrl: "); ++ decode_bitfield(status, memctrl_mc9); ++ break; ++ } ++} +diff -urNp mcelog-d2e13bf0.orig/haswell.h mcelog-d2e13bf0/haswell.h +--- mcelog-d2e13bf0.orig/haswell.h 1969-12-31 19:00:00.000000000 -0500 ++++ mcelog-d2e13bf0/haswell.h 2014-09-08 09:59:52.998327718 -0400 +@@ -0,0 +1,2 @@ ++void hsw_decode_model(int cputype, int bank, u64 status, u64 misc); ++void haswell_ep_memerr_misc(struct mce *m, int *channel, int *dimm); +diff -urNp mcelog-d2e13bf0.orig/intel.c mcelog-d2e13bf0/intel.c +--- mcelog-d2e13bf0.orig/intel.c 2014-09-08 09:59:39.622699389 -0400 ++++ mcelog-d2e13bf0/intel.c 2014-09-08 09:59:52.998327718 -0400 +@@ -24,6 +24,7 @@ + #include "page.h" + #include "sandy-bridge.h" + #include "ivy-bridge.h" ++#include "haswell.h" + #include "xeon75xx.h" + + int memory_error_support; +@@ -33,7 +34,7 @@ void intel_cpu_init(enum cputype cpu) + if (cpu == CPU_NEHALEM || cpu == CPU_XEON75XX || cpu == CPU_INTEL || + cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP || + cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX || +- cpu == CPU_HASWELL) ++ cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX) + memory_error_support = 1; + } + +@@ -67,9 +68,10 @@ enum cputype select_intel_cputype(int fa + return CPU_IVY_BRIDGE; + else if (model == 0x3e) + return CPU_IVY_BRIDGE_EPEX; +- else if (model == 0x3c || model == 0x3f || model == 0x45 || +- model == 0x46) ++ else if (model == 0x3c || model == 0x45 || model == 0x46) + 
return CPU_HASWELL; ++ else if (model == 0x3f) ++ return CPU_HASWELL_EPEX; + if (model > 0x1a) { + Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n", + model); +diff -urNp mcelog-d2e13bf0.orig/intel.h mcelog-d2e13bf0/intel.h +--- mcelog-d2e13bf0.orig/intel.h 2014-09-08 09:59:39.621699344 -0400 ++++ mcelog-d2e13bf0/intel.h 2014-09-08 09:59:52.998327718 -0400 +@@ -18,5 +18,6 @@ extern int memory_error_support; + case CPU_SANDY_BRIDGE: \ + case CPU_IVY_BRIDGE: \ + case CPU_IVY_BRIDGE_EPEX: \ +- case CPU_HASWELL ++ case CPU_HASWELL: \ ++ case CPU_HASWELL_EPEX + +diff -urNp mcelog-d2e13bf0.orig/ivy-bridge.c mcelog-d2e13bf0/ivy-bridge.c +--- mcelog-d2e13bf0.orig/ivy-bridge.c 2014-09-08 09:59:39.621699344 -0400 ++++ mcelog-d2e13bf0/ivy-bridge.c 2014-09-08 09:59:56.033470497 -0400 +@@ -68,20 +68,16 @@ static struct field pcu_mc4[] = { + + /* See IA32 SDM Vol3B Table 16-18 */ + +-static char *memctrl_1[] = { +- [0x001] = "Address parity error", +- [0x002] = "HA Wrt buffer Data parity error", +- [0x004] = "HA Wrt byte enable parity error", +- [0x008] = "Corrected patrol scrub error", +- [0x010] = "Uncorrected patrol scrub error", +- [0x020] = "Corrected spare error", +- [0x040] = "Uncorrected spare error", +- [0x080] = "Corrected memory read error", +- [0x100] = "iMC, WDB, parity errors", +-}; +- + static struct field memctrl_mc9[] = { +- FIELD(16, memctrl_1), ++ SBITFIELD(16, "Address parity error"), ++ SBITFIELD(17, "HA Wrt buffer Data parity error"), ++ SBITFIELD(18, "HA Wrt byte enable parity error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ SBITFIELD(23, "Corrected memory read error"), ++ SBITFIELD(24, "iMC, WDB, parity errors"), + {} + }; + +diff -urNp mcelog-d2e13bf0.orig/Makefile mcelog-d2e13bf0/Makefile +--- mcelog-d2e13bf0.orig/Makefile 2014-09-08 09:59:39.610698703 -0400 ++++ 
mcelog-d2e13bf0/Makefile 2014-09-08 09:59:52.998327718 -0400 +@@ -32,7 +32,7 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o co + nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \ + eventloop.o leaky-bucket.o memdb.o server.o trigger.o \ + client.o cache.o sysfs.o yellow.o page.o rbtree.o \ +- xeon75xx.o sandy-bridge.o ivy-bridge.o msr.o ++ xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o + DISKDB_OBJ := diskdb.o dimm.o db.o + CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} + DOC := mce.pdf +diff -urNp mcelog-d2e13bf0.orig/mcelog.c mcelog-d2e13bf0/mcelog.c +--- mcelog-d2e13bf0.orig/mcelog.c 2014-09-08 09:59:39.622699389 -0400 ++++ mcelog-d2e13bf0/mcelog.c 2014-09-08 09:59:52.999327768 -0400 +@@ -228,6 +228,7 @@ static char *cputype_name[] = { + [CPU_IVY_BRIDGE] = "Ivy Bridge", /* Fill in better name */ + [CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */ + [CPU_HASWELL] = "Haswell", /* Fill in better name */ ++ [CPU_HASWELL_EPEX] = "Haswell EP/EX", /* Fill in better name */ + }; + + static struct config_choice cpu_choices[] = { +@@ -264,6 +265,8 @@ static struct config_choice cpu_choices[ + { "ivybridge-ep", CPU_IVY_BRIDGE_EPEX }, /* Fill in better name */ + { "ivybridge-ex", CPU_IVY_BRIDGE_EPEX }, /* Fill in better name */ + { "haswell", CPU_HASWELL }, /* Fill in better name */ ++ { "haswell-ep", CPU_HASWELL_EPEX }, /* Fill in better name */ ++ { "haswell-ex", CPU_HASWELL_EPEX }, /* Fill in better name */ + {} + }; + +@@ -424,7 +427,8 @@ static void dump_mce(struct mce *m, unsi + fam, + mod); + } +- if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX) ++ if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX && ++ cputype != CPU_HASWELL_EPEX) + resolveaddr(m->addr); + if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) { + diskdb_resolve_addr(m->addr); +diff -urNp mcelog-d2e13bf0.orig/mcelog.h mcelog-d2e13bf0/mcelog.h +--- mcelog-d2e13bf0.orig/mcelog.h 2014-09-08 
09:59:39.621699344 -0400 ++++ mcelog-d2e13bf0/mcelog.h 2014-09-08 09:59:52.999327768 -0400 +@@ -118,6 +118,7 @@ enum cputype { + CPU_IVY_BRIDGE, + CPU_IVY_BRIDGE_EPEX, + CPU_HASWELL, ++ CPU_HASWELL_EPEX, + }; + + enum option_ranges { +diff -urNp mcelog-d2e13bf0.orig/p4.c mcelog-d2e13bf0/p4.c +--- mcelog-d2e13bf0.orig/p4.c 2014-09-08 09:59:39.621699344 -0400 ++++ mcelog-d2e13bf0/p4.c 2014-09-08 09:59:52.999327768 -0400 +@@ -33,6 +33,7 @@ + #include "bitfield.h" + #include "sandy-bridge.h" + #include "ivy-bridge.h" ++#include "haswell.h" + + /* decode mce for P4/Xeon and Core2 family */ + +@@ -360,6 +361,9 @@ void decode_intel_mc(struct mce *log, in + case CPU_IVY_BRIDGE_EPEX: + ivb_decode_model(cputype, log->bank, log->status, log->misc); + break; ++ case CPU_HASWELL_EPEX: ++ hsw_decode_model(cputype, log->bank, log->status, log->misc); ++ break; + } + + /* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values +diff -urNp mcelog-d2e13bf0.orig/sandy-bridge.c mcelog-d2e13bf0/sandy-bridge.c +--- mcelog-d2e13bf0.orig/sandy-bridge.c 2014-09-08 09:59:39.616699077 -0400 ++++ mcelog-d2e13bf0/sandy-bridge.c 2014-09-08 09:59:56.033470497 -0400 +@@ -63,18 +63,14 @@ static struct field pcu_mc4[] = { + {} + }; + +-static char *memctrl_1[] = { +- [0x001] = "Address parity error", +- [0x002] = "HA Wrt buffer Data parity error", +- [0x004] = "HA Wrt byte enable parity error", +- [0x008] = "Corrected patrol scrub error", +- [0x010] = "Uncorrected patrol scrub error", +- [0x020] = "Corrected spare error", +- [0x040] = "Uncorrected spare error", +-}; +- + static struct field memctrl_mc8[] = { +- FIELD(16, memctrl_1), ++ SBITFIELD(16, "Address parity error"), ++ SBITFIELD(17, "HA Wrt buffer Data parity error"), ++ SBITFIELD(18, "HA Wrt byte enable parity error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), + {} + }; + 
diff --git a/SOURCES/mcelog-update-9de4924.patch b/SOURCES/mcelog-update-9de4924.patch new file mode 100644 index 0000000..00f6812 --- /dev/null +++ b/SOURCES/mcelog-update-9de4924.patch @@ -0,0 +1,687 @@ +diff --git a/Makefile b/Makefile +index a91950c..f8199f6 100644 +--- a/Makefile ++++ b/Makefile +@@ -22,7 +22,10 @@ WARNINGS := -Wall -Wextra -Wno-missing-field-initializers -Wno-unused-parameter + # CONFIG_DISKDB = 1 + + TRIGGERS=cache-error-trigger dimm-error-trigger page-error-trigger \ +- socket-memory-error-trigger ++ socket-memory-error-trigger \ ++ bus-error-trigger \ ++ iomca-error-trigger \ ++ unknown-error-trigger + + all: mcelog + +@@ -32,7 +35,7 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o core2.o bitfield.o intel.o \ + nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \ + eventloop.o leaky-bucket.o memdb.o server.o trigger.o \ + client.o cache.o sysfs.o yellow.o page.o rbtree.o \ +- xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o ++ xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o bus.o unknown.o + DISKDB_OBJ := diskdb.o dimm.o db.o + CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} + DOC := mce.pdf +diff --git a/bus.c b/bus.c +new file mode 100644 +index 0000000..f48bc38 +--- /dev/null ++++ b/bus.c +@@ -0,0 +1,129 @@ ++/* Copyright (C) 2014 Intel Corporation ++ Author: Rui Wang ++ Handle 'Bus and Interconnect' error threshold indications. ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system. 
*/ ++#define _GNU_SOURCE 1 ++#include <stdlib.h> ++#include <stdio.h> ++#include <string.h> ++#include <assert.h> ++#include <unistd.h> ++#include "memutil.h" ++#include "mcelog.h" ++#include "config.h" ++#include "trigger.h" ++#include "bus.h" ++ ++static char *bus_trigger, *iomca_trigger; ++ ++enum { ++ MAX_ENV = 20, ++}; ++ ++void bus_setup(void) ++{ ++ bus_trigger = config_string("socket", "bus-uc-threshold-trigger"); ++ if (bus_trigger && trigger_check(bus_trigger) < 0) { ++ SYSERRprintf("Cannot access bus threshold trigger `%s'", ++ bus_trigger); ++ exit(1); ++ } ++ ++ iomca_trigger = config_string("socket", "iomca-threshold-trigger"); ++ if (iomca_trigger && trigger_check(iomca_trigger) < 0) { ++ SYSERRprintf("Cannot access iomca threshold trigger `%s'", ++ iomca_trigger); ++ exit(1); ++ } ++} ++ ++void run_bus_trigger(int socket, int cpu, char *level, char *pp, char *rrrr, ++ char *ii, char *timeout) ++{ ++ int ei = 0; ++ char *env[MAX_ENV]; ++ int i; ++ char *msg; ++ char *location; ++ ++ if (socket >= 0) ++ asprintf(&location, "CPU %d on socket %d", cpu, socket); ++ else ++ asprintf(&location, "CPU %d", cpu); ++ asprintf(&msg, "%s received Bus and Interconnect Errors in %s", ++ location, ii); ++ asprintf(&env[ei++], "LOCATION=%s", location); ++ free(location); ++ ++ if (!bus_trigger) ++ goto out; ++ ++ if (socket >= 0) ++ asprintf(&env[ei++], "SOCKETID=%d", socket); ++ asprintf(&env[ei++], "MESSAGE=%s", msg); ++ asprintf(&env[ei++], "CPU=%d", cpu); ++ asprintf(&env[ei++], "LEVEL=%s", level); ++ asprintf(&env[ei++], "PARTICIPATION=%s", pp); ++ asprintf(&env[ei++], "REQUEST=%s", rrrr); ++ asprintf(&env[ei++], "ORIGIN=%s", ii); ++ asprintf(&env[ei++], "TIMEOUT=%s", timeout); ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(bus_trigger, NULL, env); ++out: /* also reached with no trigger configured: env[0] (LOCATION) is already allocated and must be freed */ ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++ free(msg); ++} ++ ++void run_iomca_trigger(int socket, int cpu, int seg, int bus, int dev, int fn) ++{ ++ int ei = 0; ++ char *env[MAX_ENV]; ++ int i; ++ char *msg; ++ char *location; ++ ++ if (socket 
>= 0) ++ asprintf(&location, "CPU %d on socket %d", cpu, socket); ++ else ++ asprintf(&location, "CPU %d", cpu); ++ asprintf(&msg, "%s received IO MCA Errors from %x:%02x:%02x.%x", ++ location, seg, bus, dev, fn); ++ asprintf(&env[ei++], "LOCATION=%s", location); ++ free(location); ++ ++ if (!iomca_trigger) ++ goto out; ++ ++ if (socket >= 0) ++ asprintf(&env[ei++], "SOCKETID=%d", socket); ++ asprintf(&env[ei++], "MESSAGE=%s", msg); ++ asprintf(&env[ei++], "CPU=%d", cpu); ++ asprintf(&env[ei++], "SEG=%x", seg); ++ asprintf(&env[ei++], "BUS=%02x", bus); ++ asprintf(&env[ei++], "DEVICE=%02x", dev); ++ asprintf(&env[ei++], "FUNCTION=%x", fn); ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(iomca_trigger, NULL, env); ++out: /* also reached with no trigger configured: env[0] (LOCATION) is already allocated and must be freed */ ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++ free(msg); ++ ++} +diff --git a/bus.h b/bus.h +new file mode 100644 +index 0000000..37ac592 +--- /dev/null ++++ b/bus.h +@@ -0,0 +1,4 @@ ++void bus_setup(void); ++void run_bus_trigger(int socket, int cpu, char *level, char *pp, char *rrrr, ++ char *ii, char *timeout); ++void run_iomca_trigger(int socket, int cpu, int seg, int bus, int dev, int fn); +diff --git a/input/iomca b/input/iomca +new file mode 100644 +index 0000000..9a1e27d +--- /dev/null ++++ b/input/iomca +@@ -0,0 +1,4 @@ ++CPU 0 BANK 1 ++STATUS 0x9c00000000000e0b ++MISC 0xabcdef ++ADDR 0xabcd +diff --git a/input/unknown b/input/unknown +new file mode 100644 +index 0000000..29a2436 +--- /dev/null ++++ b/input/unknown +@@ -0,0 +1,4 @@ ++CPU 0 BANK 1 ++STATUS 0x9c0000000000040b ++MISC 0xabcdef ++ADDR 0xabcd +diff --git a/mcelog.c b/mcelog.c +index 89bb537..95a913f 100644 +--- a/mcelog.c ++++ b/mcelog.c +@@ -58,6 +58,8 @@ + #include "msg.h" + #include "yellow.h" + #include "page.h" ++#include "bus.h" ++#include "unknown.h" + + enum cputype cputype = CPU_GENERIC; + +@@ -567,6 +569,12 @@ static char *skipgunk(char *s) + if (*s == ']') + ++s; + } ++ ++ s = skipspace(s); ++ ++ if (strncmp(s, "mce: [Hardware Error]:", 22) == 0) ++ s += 
22; ++ + return skipspace(s); + } + +@@ -1153,6 +1161,8 @@ static void general_setup(void) + { + trigger_setup(); + yellow_setup(); ++ bus_setup(); ++ unknown_setup(); + config_cred("global", "run-credentials", &runcred); + if (config_bool("global", "filter-memory-errors") == 1) + filter_memory_errors = 1; +diff --git a/mcelog.conf b/mcelog.conf +index 1bab3ee..6a2be26 100644 +--- a/mcelog.conf ++++ b/mcelog.conf +@@ -127,6 +127,9 @@ mem-ce-error-threshold = 100 / 24h + # Log socket error threshold explicitely? + mem-ce-error-log = yes + ++bus-uc-threshold-trigger = bus-error-trigger ++iomca-threshold-trigger = iomca-error-trigger ++unknown-threshold-trigger = unknown-error-trigger + + [cache] + # Processing of cache error thresholds reported by Intel CPUs +diff --git a/msr.c b/msr.c +index 2eef9d2..665cac3 100644 +--- a/msr.c ++++ b/msr.c +@@ -36,10 +36,8 @@ static void domsr(int cpu, int msr, int bit) + SYSERRprintf("Cannot re-read MSR_ERROR_CONTROL from %s\n", fpath); + exit(1); + } +- if ((data & bit) == 0) { +- SYSERRprintf("Failed to set imc_log on cpu %d\n", cpu); +- exit(1); +- } ++ if ((data & bit) == 0) ++ Lprintf("No DIMM detection available on cpu %d (normal in virtual environments)\n", cpu); + close(fd); + } + +@@ -54,6 +52,8 @@ void set_imc_log(int cputype) + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; ++ default: ++ return; + } + + for (cpu = 0; cpu < ncpus; cpu++) +diff --git a/p4.c b/p4.c +index 8a3b5a6..f938196 100644 +--- a/p4.c ++++ b/p4.c +@@ -30,6 +30,8 @@ + #include "tulsa.h" + #include "intel.h" + #include "yellow.h" ++#include "bus.h" ++#include "unknown.h" + #include "bitfield.h" + #include "sandy-bridge.h" + #include "ivy-bridge.h" +@@ -116,7 +118,7 @@ static char* get_II_str(__u8 i) + return II[i]; + } + +-static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) ++static int decode_mca(u64 status, u64 misc, u64 track, int cpu, int *ismemerr, int socket) + { + #define 
TLB_LL_MASK 0x3 /*bit 0, bit 1*/ + #define TLB_LL_SHIFT 0x0 +@@ -141,6 +143,8 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) + #define BUS_PP_MASK 0x600 /*bit 9, bit 10*/ + #define BUS_PP_SHIFT 0x9 + ++ u32 mca; ++ int ret = 0; + static char *msg[] = { + [0] = "No Error", + [1] = "Unclassified", +@@ -151,6 +155,7 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) + [6] = "SMM Handler Code Access Violation", + }; + ++ mca = status & 0xffff; + if (mca & (1UL << 12)) { + Wprintf("corrected filtering (some unreported errors in same region)\n"); + mca &= ~(1UL << 12); +@@ -158,16 +163,27 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) + + if (mca < NELE(msg)) { + Wprintf("%s\n", msg[mca]); +- return; ++ return ret; + } + + if ((mca >> 2) == 3) { +- Wprintf("%s Generic memory hierarchy error\n", get_LL_str(mca & 3)); ++ unsigned levelnum; ++ char *level; ++ levelnum = mca & 3; ++ level = get_LL_str(levelnum); ++ Wprintf("%s Generic cache hierarchy error\n", level); ++ if (track == 2) ++ run_yellow_trigger(cpu, -1, levelnum, "unknown", level, socket); + } else if (test_prefix(4, mca)) { +- Wprintf("%s TLB %s Error\n", +- get_TT_str((mca & TLB_TT_MASK) >> TLB_TT_SHIFT), +- get_LL_str((mca & TLB_LL_MASK) >> +- TLB_LL_SHIFT)); ++ unsigned levelnum, typenum; ++ char *level, *type; ++ typenum = (mca & TLB_TT_MASK) >> TLB_TT_SHIFT; ++ type = get_TT_str(typenum); ++ levelnum = (mca & TLB_LL_MASK) >> TLB_LL_SHIFT; ++ level = get_LL_str(levelnum); ++ Wprintf("%s TLB %s Error\n", type, level); ++ if (track == 2) ++ run_yellow_trigger(cpu, typenum, levelnum, type, level, socket); + } else if (test_prefix(8, mca)) { + unsigned typenum = (mca & CACHE_TT_MASK) >> CACHE_TT_SHIFT; + unsigned levelnum = (mca & CACHE_LL_MASK) >> CACHE_LL_SHIFT; +@@ -177,25 +193,51 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) + get_RRRR_str((mca & CACHE_RRRR_MASK) 
>> + CACHE_RRRR_SHIFT)); + if (track == 2) +- run_yellow_trigger(cpu, typenum, levelnum, type, level, socket); ++ run_yellow_trigger(cpu, typenum, levelnum, type, level,socket); + } else if (test_prefix(10, mca)) { + if (mca == 0x400) + Wprintf("Internal Timer error\n"); + else + Wprintf("Internal unclassified error: %x\n", mca & 0xffff); ++ ++ ret = 1; + } else if (test_prefix(11, mca)) { +- Wprintf("BUS %s %s %s %s %s Error\n", +- get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT), +- get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT), +- get_RRRR_str((mca & BUS_RRRR_MASK) >> +- BUS_RRRR_SHIFT), +- get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT), +- get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT)); ++ char *level, *pp, *rrrr, *ii, *timeout; ++ ++ level = get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT); ++ pp = get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT); ++ rrrr = get_RRRR_str((mca & BUS_RRRR_MASK) >> BUS_RRRR_SHIFT); ++ ii = get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT); ++ timeout = get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT); ++ ++ Wprintf("BUS error: %d %d %s %s %s %s %s\n", socket, cpu, ++ level, pp, rrrr, ii, timeout); ++ run_bus_trigger(socket, cpu, level, pp, rrrr, ii, timeout); ++ /* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values ++ * and MISCV set. MISC register points to root port that reported the error ++ * need to cross check with AER logs for more details. 
++ * See: http://www.intel.com/content/www/us/en/architecture-and-technology/enhanced-mca-logging-xeon-paper.html ++ */ ++ if ((status & MCI_STATUS_MISCV) && ++ (status & 0xefff) == 0x0e0b) { ++ int seg, bus, dev, fn; ++ ++ seg = EXTRACT(misc, 32, 39); ++ bus = EXTRACT(misc, 24, 31); ++ dev = EXTRACT(misc, 19, 23); ++ fn = EXTRACT(misc, 16, 18); ++ Wprintf("IO MCA reported by root port %x:%02x:%02x.%x\n", ++ seg, bus, dev, fn); ++ run_iomca_trigger(socket, cpu, seg, bus, dev, fn); ++ } + } else if (test_prefix(7, mca)) { + decode_memory_controller(mca); + *ismemerr = 1; +- } else ++ } else { + Wprintf("Unknown Error %x\n", mca); ++ ret = 1; ++ } ++ return ret; + } + + static void p4_decode_model(__u32 model) +@@ -243,7 +285,7 @@ static const char *arstate[4] = { + [3] = "SRAR" + }; + +-static void decode_mci(__u64 status, int cpu, unsigned mcgcap, int *ismemerr, ++static int decode_mci(__u64 status, __u64 misc, int cpu, unsigned mcgcap, int *ismemerr, + int socket) + { + u64 track = 0; +@@ -280,7 +322,7 @@ static void decode_mci(__u64 status, int cpu, unsigned mcgcap, int *ismemerr, + decode_tracking(track); + } + Wprintf("MCA: "); +- decode_mca(status & 0xffffL, track, cpu, ismemerr, socket); ++ return decode_mca(status, misc, track, cpu, ismemerr, socket); + } + + static void decode_mcg(__u64 mcgstatus) +@@ -314,11 +356,14 @@ void decode_intel_mc(struct mce *log, int cputype, int *ismemerr, unsigned size) + + if (log->bank == MCE_THERMAL_BANK) { + decode_thermal(log, cpu); ++ run_unknown_trigger(socket, cpu, log); + return; + } + + decode_mcg(log->mcgstatus); +- decode_mci(log->status, cpu, log->mcgcap, ismemerr, socket); ++ if (decode_mci(log->status, log->misc, cpu, log->mcgcap, ismemerr, ++ socket)) ++ run_unknown_trigger(socket, cpu, log); + + if (test_prefix(11, (log->status & 0xffffL))) { + switch (cputype) { +@@ -365,23 +410,6 @@ void decode_intel_mc(struct mce *log, int cputype, int *ismemerr, unsigned size) + hsw_decode_model(cputype, log->bank, 
log->status, log->misc); + break; + } +- +- /* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values +- * and MISCV set. MISC register points to root port that reported the error +- * need to cross check with AER logs for more details. +- * See: http://www.intel.com/content/www/us/en/architecture-and-technology/enhanced-mca-logging-xeon-paper.html +- */ +- if ((log->status & MCI_STATUS_MISCV) && +- (log->status & 0xefff) == 0x0e0b) { +- int seg, bus, dev, fn; +- +- seg = EXTRACT(log->misc, 32, 39); +- bus = EXTRACT(log->misc, 24, 31); +- dev = EXTRACT(log->misc, 19, 23); +- fn = EXTRACT(log->misc, 16, 18); +- Wprintf("IO MCA reported by root port %x:%02x:%02x.%x\n", +- seg, bus, dev, fn); +- } + } + + char *intel_bank_name(int num) +diff --git a/tests/unknown/inject b/tests/unknown/inject +new file mode 100755 +index 0000000..7be39a7 +--- /dev/null ++++ b/tests/unknown/inject +@@ -0,0 +1,8 @@ ++#!/bin/sh ++ ++B=$(pwd)/../.. ++ ++PATH=$PATH:$B/../mce-inject ++ ++mce-inject $B/input/iomca ++mce-inject $B/input/unknown +diff --git a/tests/unknown/unknown.conf b/tests/unknown/unknown.conf +new file mode 100644 +index 0000000..4b86db7 +--- /dev/null ++++ b/tests/unknown/unknown.conf +@@ -0,0 +1,11 @@ ++# trigger: 3 ++ ++num-errors = 2 ++ ++[socket] ++bus-uc-threshold-trigger = ../trigger ++iomca-threshold-trigger = ../trigger ++unknown-threshold-trigger = ../trigger ++ ++[trigger] ++directory = . 
+diff --git a/triggers/bus-error-trigger b/triggers/bus-error-trigger +new file mode 100644 +index 0000000..c996001 +--- /dev/null ++++ b/triggers/bus-error-trigger +@@ -0,0 +1,23 @@ ++#!/bin/sh ++# This shell script can be executed by mcelog in daemon mode when a socket ++# receives Bus and Interconnect errors ++# ++# environment: ++# MESSAGE Human readable consolidated error message ++# LOCATION Consolidated location as a single string ++# SOCKETID Socket ID of the CPU that received the error (set only when known) ++# LEVEL Interconnect level ++# PARTICIPATION Processor Participation (Originator, Responder or Observer) ++# REQUEST Request type (read, write, prefetch, etc.) ++# ORIGIN Memory or IO ++# TIMEOUT The request timed out or not ++# ++# note: will run as mcelog configured user ++# this can be changed in mcelog.conf ++ ++logger -s -p daemon.err -t mcelog "$MESSAGE" ++logger -s -p daemon.err -t mcelog "Location: $LOCATION" ++ ++[ -x ./bus-error-trigger.local ] && . ./bus-error-trigger.local ++ ++exit 0 +diff --git a/triggers/iomca-error-trigger b/triggers/iomca-error-trigger +new file mode 100644 +index 0000000..3888461 +--- /dev/null ++++ b/triggers/iomca-error-trigger +@@ -0,0 +1,23 @@ ++#!/bin/sh ++# This shell script can be executed by mcelog in daemon mode when a socket ++# receives IO MCA errors ++# ++# environment: ++# MESSAGE Human readable consolidated error message ++# LOCATION Consolidated location as a single string ++# SOCKETID Socket ID of the CPU that received the error (set only when known) ++# CPU Linux CPU number that triggered the error ++# SEG PCI segment number ++# BUS PCI bus number ++# DEVICE PCI device number ++# FUNCTION PCI function number ++# ++# note: will run as mcelog configured user ++# this can be changed in mcelog.conf ++ ++logger -s -p daemon.err -t mcelog "$MESSAGE" ++logger -s -p daemon.err -t mcelog "Location: $LOCATION" ++ ++[ -x ./iomca-error-trigger.local ] && . 
./iomca-error-trigger.local ++ ++exit 0 +diff --git a/triggers/unknown-error-trigger b/triggers/unknown-error-trigger +new file mode 100644 +index 0000000..b924a0e +--- /dev/null ++++ b/triggers/unknown-error-trigger +@@ -0,0 +1,26 @@ ++#!/bin/sh ++# This shell script is executed by mcelog in daemon mode when ++# a not otherwise handled machine check error happens. ++# ++# environment: ++# MESSAGE Human readable consolidated error message ++# LOCATION Consolidated location as a single string ++# SOCKETID Socket ID of the CPU that received the error (set only when known) ++# CPU Linux CPU number that triggered the error ++# STATUS IA32_MCi_STATUS register value ++# ADDR IA32_MCi_ADDR register value ++# MISC IA32_MCi_MISC register value ++# MCGSTATUS IA32_MCG_STATUS register value ++# MCGCAP IA32_MCG_CAP register value ++# For details on the register layout please see the Intel SDM http://www.intel.com/sdm ++# volume 3, chapter 15 ++# ++# note: will run as mcelog configured user ++# this can be changed in mcelog.conf ++ ++logger -s -p daemon.err -t mcelog "$MESSAGE" ++logger -s -p daemon.err -t mcelog "Location: $LOCATION" ++ ++[ -x ./unknown-error-trigger.local ] && . ./unknown-error-trigger.local ++ ++exit 0 +diff --git a/unknown.c b/unknown.c +new file mode 100644 +index 0000000..482c29e +--- /dev/null ++++ b/unknown.c +@@ -0,0 +1,82 @@ ++/* Copyright (C) 2014 Intel Corporation ++ Author: Rui Wang ++ Handle all other unknown error requests. ++ ++ mcelog is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public ++ License as published by the Free Software Foundation; version ++ 2. ++ ++ mcelog is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. 
++ ++ You should find a copy of v2 of the GNU General Public License somewhere ++ on your Linux system. */ ++#define _GNU_SOURCE 1 ++#include <stdlib.h> ++#include <stdio.h> ++#include <string.h> ++#include <assert.h> ++#include <unistd.h> ++#include "memutil.h" ++#include "mcelog.h" ++#include "config.h" ++#include "trigger.h" ++#include "unknown.h" ++ ++static char *unknown_trigger; ++ ++enum { ++ MAX_ENV = 20, ++}; ++ ++void unknown_setup(void) ++{ ++ unknown_trigger = config_string("socket", "unknown-threshold-trigger"); ++ if (unknown_trigger && trigger_check(unknown_trigger) < 0) { ++ SYSERRprintf("Cannot access unknown threshold trigger `%s'", ++ unknown_trigger); ++ exit(1); ++ } ++} ++ ++void run_unknown_trigger(int socket, int cpu, struct mce *log) ++{ ++ int ei = 0; ++ char *env[MAX_ENV]; ++ int i; ++ char *msg; ++ char *location; ++ ++ if (socket >= 0) ++ asprintf(&location, "CPU %d on socket %d", cpu, socket); ++ else ++ asprintf(&location, "CPU %d", cpu); ++ asprintf(&msg, "%s received unknown error", location); ++ asprintf(&env[ei++], "LOCATION=%s", location); ++ free(location); ++ ++ if (!unknown_trigger) ++ goto out; ++ ++ if (socket >= 0) ++ asprintf(&env[ei++], "SOCKETID=%d", socket); ++ asprintf(&env[ei++], "MESSAGE=%s", msg); ++ asprintf(&env[ei++], "CPU=%d", cpu); ++ asprintf(&env[ei++], "STATUS=%llx", log->status); ++ asprintf(&env[ei++], "MISC=%llx", log->misc); ++ asprintf(&env[ei++], "ADDR=%llx", log->addr); ++ asprintf(&env[ei++], "MCGSTATUS=%llx", log->mcgstatus); ++ asprintf(&env[ei++], "MCGCAP=%llx", log->mcgcap); ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(unknown_trigger, NULL, env); ++out: /* also reached with no trigger configured: env[0] (LOCATION) is already allocated and must be freed */ ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++ free(msg); ++} ++ +diff --git a/unknown.h b/unknown.h +new file mode 100644 +index 0000000..0c6d876 +--- /dev/null ++++ b/unknown.h +@@ -0,0 +1,2 @@ ++void unknown_setup(void); ++void run_unknown_trigger(int socket, int cpu, struct mce *log); +diff --git a/yellow.c b/yellow.c +index 0f8ccd0..57978ee 100644 +--- a/yellow.c ++++ 
b/yellow.c +@@ -90,6 +90,8 @@ void run_yellow_trigger(int cpu, int tnum, int lnum, char *ts, char *ls, int soc + asprintf(&env[ei++], "TYPE=%s", ts); + if (cache_to_cpus(cpu, lnum, tnum, &cpumasklen, &cpumask) >= 0) + env[ei++] = cpulist("AFFECTED_CPUS=", cpumask, cpumasklen); ++ else ++ asprintf(&env[ei++], "AFFECTED_CPUS=unknown"); + env[ei] = NULL; + assert(ei < MAX_ENV); + diff --git a/SOURCES/mcelog-update-f30da3d.patch b/SOURCES/mcelog-update-f30da3d.patch new file mode 100644 index 0000000..1cd0a25 --- /dev/null +++ b/SOURCES/mcelog-update-f30da3d.patch @@ -0,0 +1,212 @@ +diff --git a/dmi.c b/dmi.c +index 373837e..290a053 100644 +--- a/dmi.c ++++ b/dmi.c +@@ -174,8 +174,10 @@ check_symbol: + if (fclose(efi_systab) != 0) + perror(filename); + +- if (!ret) +- Eprintf("%s: SMBIOS entry point missing", filename); ++ if (!ret || !*address){ ++ Lprintf("No valid SMBIOS entry point: Continue without DMI decoding"); ++ return 0; ++ } + + if (verbose) + printf("%s: SMBIOS entry point at 0x%08lx\n", filename, +@@ -224,6 +226,8 @@ int opendmi(void) + } + a = (struct anchor*)((char*)abase + (entry_point_addr - addr_start)); + goto fill_entries; ++ } else { ++ return -1; + } + + legacy: +diff --git a/input/GENCACHE b/input/GENCACHE +index 71f1d1a..bcf689d 100755 +--- a/input/GENCACHE ++++ b/input/GENCACHE +@@ -1,4 +1,4 @@ +-#!/bin/sh ++#!/bin/bash + # GENCACHE cpu level type track + # generate a memory error. All fields are optional. + # see SDM 3a chapter 15 for details +diff --git a/input/GENMEM b/input/GENMEM +index c0a4c53..caa61b9 100755 +--- a/input/GENMEM ++++ b/input/GENMEM +@@ -1,4 +1,4 @@ +-#!/bin/sh ++#!/bin/bash + # GENMEM socketid channel dimm corr-err-cnt uc-flag + # generate a memory error. All fields are optional. 
+ # suitable to be fed into mce-inject or mcelog --ascii +diff --git a/input/GENPAGE b/input/GENPAGE +index c63d607..14c20ba 100755 +--- a/input/GENPAGE ++++ b/input/GENPAGE +@@ -1,4 +1,4 @@ +-#!/bin/sh ++#!/bin/bash + # GENMPAGE pfn socketid channel dimm corr-err-cnt + # generate a memory error on a page. All fields are optional. + # dimm/channel can be out of sync with the address +diff --git a/intel.c b/intel.c +index ba353c2..0f5abac 100644 +--- a/intel.c ++++ b/intel.c +@@ -67,7 +67,8 @@ enum cputype select_intel_cputype(int family, int model) + return CPU_IVY_BRIDGE; + else if (model == 0x3e) + return CPU_IVY_BRIDGE_EPEX; +- else if (model == 0x3c || model == 0x45 || model == 0x46) ++ else if (model == 0x3c || model == 0x3f || model == 0x45 || ++ model == 0x46) + return CPU_HASWELL; + if (model > 0x1a) { + Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n", +diff --git a/mcelog.8 b/mcelog.8 +index fa923e7..f8a77c4 100644 +--- a/mcelog.8 ++++ b/mcelog.8 +@@ -181,7 +181,13 @@ With the + .B \-\-daemon + option mcelog will run in the background. This gives the fastest reaction + time and is the recommended operating mode. +-This option implies ++If an output option isn't selected ( ++.I \-\-logfile ++or ++.I \-\-syslog ++or ++.I \-\-syslog-error ++), this option implies + .I \-\-logfile=/var/log/mcelog. 
+ Important messages will be logged as one-liner summaries to syslog + unless +diff --git a/mcelog.c b/mcelog.c +index 7ceb43d..d90589f 100644 +--- a/mcelog.c ++++ b/mcelog.c +@@ -508,11 +508,12 @@ int is_cpu_supported(void) + } + if (seen == ALL) { + if (!strcmp(vendor,"AuthenticAMD")) { +- if (family == 15) ++ if (family == 15) { + cputype = CPU_K8; +- if (family >= 15) +- SYSERRprintf("AMD Processor family %d: Please load edac_mce_amd module.\n", family); +- return 0; ++ } else if (family >= 16) { ++ SYSERRprintf("AMD Processor family %d: Please use the edac_mce_amd module instead.\n", family); ++ return 0; ++ } + } else if (!strcmp(vendor,"GenuineIntel")) + cputype = select_intel_cputype(family, model); + /* Add checks for other CPUs here */ +@@ -1069,11 +1070,8 @@ static int modifier(int opt) + break; + case O_DAEMON: + daemon_mode = 1; +- if (!logfile && !foreground) +- logfile = logfile_default; + if (!(syslog_opt & SYSLOG_FORCE)) + syslog_opt = SYSLOG_REMARK|SYSLOG_ERROR; +- + break; + case O_FILE: + inputfile = optarg; +@@ -1082,8 +1080,6 @@ static int modifier(int opt) + foreground = 1; + if (!(syslog_opt & SYSLOG_FORCE)) + syslog_opt = SYSLOG_FORCE; +- if (logfile == logfile_default) +- logfile = NULL; + break; + case O_NUMERRORS: + numerrors = atoi(optarg); +@@ -1110,6 +1106,9 @@ static int modifier(int opt) + + static void modifier_finish(void) + { ++ if(!foreground && daemon_mode && !logfile && !(syslog_opt & SYSLOG_LOG)) { ++ logfile = logfile_default; ++ } + if (logfile) { + if (open_logfile(logfile) < 0) { + if (daemon_mode && !(syslog_opt & SYSLOG_FORCE)) +@@ -1174,8 +1173,8 @@ static void drop_cred(void) + static void process(int fd, unsigned recordlen, unsigned loglen, char *buf) + { + int i; +- int len; +- int finish = 0; ++ int len, count; ++ int finish = 0, flags; + + if (recordlen == 0) { + Wprintf("no data in mce record\n"); +@@ -1188,7 +1187,14 @@ static void process(int fd, unsigned recordlen, unsigned loglen, char *buf) + return; + } + 
+- for (i = 0; (i < len / (int)recordlen) && !finish; i++) { ++ count = len / (int)recordlen; ++ if (count == (int)loglen) { ++ if ((ioctl(fd, MCE_GETCLEAR_FLAGS, &flags) == 0) && ++ (flags & (1 << MCE_OVERFLOW))) ++ Eprintf("Warning: MCE buffer is overflowed.\n"); ++ } ++ ++ for (i = 0; (i < count) && !finish; i++) { + struct mce *mce = (struct mce *)(buf + i*recordlen); + mce_prepare(mce); + if (numerrors > 0 && --numerrors == 0) +diff --git a/p4.c b/p4.c +index 93b59f3..86e7dc5 100644 +--- a/p4.c ++++ b/p4.c +@@ -147,6 +147,7 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) + [3] = "External error", + [4] = "FRC error", + [5] = "Internal parity error", ++ [6] = "SMM Handler Code Access Violation", + }; + + if (mca & (1UL << 12)) { +diff --git a/tests/test b/tests/test +index c673eb2..35bebd2 100755 +--- a/tests/test ++++ b/tests/test +@@ -1,4 +1,4 @@ +-#!/bin/sh ++#!/bin/bash + # simple test harness for mcelog daemon trigger test cases + # ./test subdir [debugger] + # run mcelog test in specific sub directory +diff --git a/triggers/cache-error-trigger b/triggers/cache-error-trigger +index e32bfd6..beb5f07 100755 +--- a/triggers/cache-error-trigger ++++ b/triggers/cache-error-trigger +@@ -15,16 +15,11 @@ + # this can be changed in mcelog.conf + + # +-# offline the CPUs (except CPU #0) sharing the affected cache ++# offline the CPUs sharing the affected cache + # + EXIT=0 + + for i in $AFFECTED_CPUS ; do +- if [ $i = 0 ] ; then +- logger -s -p daemon.warn -t mcelog "Not offlining CPU 0" +- EXIT=1 +- continue +- fi + logger -s -p daemon.crit -t mcelog "Offlining CPU $i due to cache error threshold" + F=$(printf "/sys/devices/system/cpu/cpu%d/online" $i) + echo 0 > $F diff --git a/SOURCES/mcelog.service b/SOURCES/mcelog.service index 62d8cbe..13e32e9 100644 --- a/SOURCES/mcelog.service +++ b/SOURCES/mcelog.service @@ -9,8 +9,9 @@ After=syslog.target # is allowed and expected to fail without aborting the daemon. 
[Service] +Type=forking ExecStartPre=/etc/mcelog/mcelog.setup -ExecStart=/usr/sbin/mcelog --ignorenodev --daemon --foreground +ExecStart=/usr/sbin/mcelog --ignorenodev --daemon --syslog StandardOutput=syslog [Install] diff --git a/SPECS/mcelog.spec b/SPECS/mcelog.spec index 19d15b2..4d73022 100644 --- a/SPECS/mcelog.spec +++ b/SPECS/mcelog.spec @@ -1,11 +1,11 @@ %define last_tar_git_commit d2e13bf0 -%define last_git_commit 2577aeb +%define last_git_commit 9de4924 Summary: Tool to translate x86-64 CPU Machine Check Exception data Name: mcelog -Version: 1.0 -Release: 0.12.%{last_git_commit}%{?dist} -Epoch: 2 +Version: 101 +Release: 3.%{last_git_commit}%{?dist} +Epoch: 3 Group: System Environment/Base License: GPLv2 Source0: mcelog-%{last_tar_git_commit}.tar.bz2 @@ -16,6 +16,10 @@ Source10: mcelog.setup Patch0: mcelog-fix-trigger-path-and-cacheing.patch # BZ 1039183: Add Haswell and correct Ivy Bridge Patch1: mcelog-update-2577aeb.patch +Patch2: mcelog-update-f30da3d.patch +# BZ 1138319: Add additional Haswell support (see patch for additional info) +Patch3: mcelog-haswell-support.patch +Patch4: mcelog-update-9de4924.patch URL: https://github.com/andikleen/mcelog.git Buildroot: %{_tmppath}/%{name}-%{version}-root ExclusiveArch: i686 x86_64 @@ -32,6 +36,9 @@ on x86-32 and x86-64 systems. It can be run either as a daemon, or by cron. 
%setup -q -n %{name}-%{last_tar_git_commit} %patch0 -p1 -b .fix-triggers-and-cacheing %patch1 -p1 -b .mcelog-update-2577aeb +%patch2 -p1 -b .mcelog-update-f30da3d +%patch3 -p1 -b .mcelog-haswell-support +%patch4 -p1 -b .mcelog-update-9de4924 %build mkdir -p $RPM_BUILD_ROOT/%{_sysconfdir} @@ -92,6 +99,20 @@ fi %attr(0644,root,root) %{_mandir}/*/* %changelog +* Mon Oct 27 2014 Prarit Bhargava - 3:101-3.9de4924 +- Update with latest minor fixes, no new support [1157683] + +* Mon Sep 8 2014 Prarit Bhargava - 3:101-2.f30da3d +- Additional Haswell Support [1138319] + +* Thu Sep 4 2014 Prarit Bhargava - 3:101-1.f30da3d +- Update to upstream NVR (101) [1136989] + +* Wed Sep 3 2014 Prarit Bhargava - 2:1.0-0.13.f30da3d +- Update to upstream commit f30da3d, minor fixes, no features [1085134] +- Add /var/log/mcelog file [1098864] +- remove .src.rpm file [1038755] + * Wed Jan 22 2014 Prarit Bhargava - 2:1.0-0.12.2577aeb - Add Haswell client cpuids, identify Ivy Bridge properly, and fix issues on Ivy Bridge