diff --git a/Makefile b/Makefile index a91950c..f8199f6 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,10 @@ WARNINGS := -Wall -Wextra -Wno-missing-field-initializers -Wno-unused-parameter # CONFIG_DISKDB = 1 TRIGGERS=cache-error-trigger dimm-error-trigger page-error-trigger \ - socket-memory-error-trigger + socket-memory-error-trigger \ + bus-error-trigger \ + iomca-error-trigger \ + unknown-error-trigger all: mcelog @@ -32,7 +35,7 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o core2.o bitfield.o intel.o \ nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \ eventloop.o leaky-bucket.o memdb.o server.o trigger.o \ client.o cache.o sysfs.o yellow.o page.o rbtree.o \ - xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o + xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o bus.o unknown.o DISKDB_OBJ := diskdb.o dimm.o db.o CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} DOC := mce.pdf diff --git a/bus.c b/bus.c new file mode 100644 index 0000000..f48bc38 --- /dev/null +++ b/bus.c @@ -0,0 +1,129 @@ +/* Copyright (C) 2014 Intel Corporation + Author: Rui Wang + Handle 'Bus and Interconnect' error threshold indications. + + mcelog is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; version + 2. + + mcelog is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should find a copy of v2 of the GNU General Public License somewhere + on your Linux system. 
*/ +#define _GNU_SOURCE 1 +#include +#include +#include +#include +#include +#include "memutil.h" +#include "mcelog.h" +#include "config.h" +#include "trigger.h" +#include "bus.h" + +static char *bus_trigger, *iomca_trigger; + +enum { + MAX_ENV = 20, +}; + +void bus_setup(void) +{ + bus_trigger = config_string("socket", "bus-uc-threshold-trigger"); + if (bus_trigger && trigger_check(bus_trigger) < 0) { + SYSERRprintf("Cannot access bus threshold trigger `%s'", + bus_trigger); + exit(1); + } + + iomca_trigger = config_string("socket", "iomca-threshold-trigger"); + if (iomca_trigger && trigger_check(iomca_trigger) < 0) { + SYSERRprintf("Cannot access iomca threshold trigger `%s'", + iomca_trigger); + exit(1); + } +} + +void run_bus_trigger(int socket, int cpu, char *level, char *pp, char *rrrr, + char *ii, char *timeout) +{ + int ei = 0; + char *env[MAX_ENV]; + int i; + char *msg; + char *location; + + if (socket >= 0) + asprintf(&location, "CPU %d on socket %d", cpu, socket); + else + asprintf(&location, "CPU %d", cpu); + asprintf(&msg, "%s received Bus and Interconnect Errors in %s", + location, ii); + asprintf(&env[ei++], "LOCATION=%s", location); + free(location); + + if (!bus_trigger) + goto out; + + if (socket >= 0) + asprintf(&env[ei++], "SOCKETID=%d", socket); + asprintf(&env[ei++], "MESSAGE=%s", msg); + asprintf(&env[ei++], "CPU=%d", cpu); + asprintf(&env[ei++], "LEVEL=%s", level); + asprintf(&env[ei++], "PARTICIPATION=%s", pp); + asprintf(&env[ei++], "REQUEST=%s", rrrr); + asprintf(&env[ei++], "ORIGIN=%s", ii); + asprintf(&env[ei++], "TIMEOUT=%s", timeout); + env[ei] = NULL; + assert(ei < MAX_ENV); + + run_trigger(bus_trigger, NULL, env); + for (i = 0; i < ei; i++) + free(env[i]); +out: + free(msg); +} + +void run_iomca_trigger(int socket, int cpu, int seg, int bus, int dev, int fn) +{ + int ei = 0; + char *env[MAX_ENV]; + int i; + char *msg; + char *location; + + if (socket >= 0) + asprintf(&location, "CPU %d on socket %d", cpu, socket); + else + 
asprintf(&location, "CPU %d", cpu); + asprintf(&msg, "%s received IO MCA Errors from %x:%02x:%02x.%x", + location, seg, bus, dev, fn); + asprintf(&env[ei++], "LOCATION=%s", location); + free(location); + + if (!iomca_trigger) + goto out; + + if (socket >= 0) + asprintf(&env[ei++], "SOCKETID=%d", socket); + asprintf(&env[ei++], "MESSAGE=%s", msg); + asprintf(&env[ei++], "CPU=%d", cpu); + asprintf(&env[ei++], "SEG=%x", seg); + asprintf(&env[ei++], "BUS=%02x", bus); + asprintf(&env[ei++], "DEVICE=%02x", dev); + asprintf(&env[ei++], "FUNCTION=%x", fn); + env[ei] = NULL; + assert(ei < MAX_ENV); + + run_trigger(iomca_trigger, NULL, env); + for (i = 0; i < ei; i++) + free(env[i]); +out: + free(msg); + +} diff --git a/bus.h b/bus.h new file mode 100644 index 0000000..37ac592 --- /dev/null +++ b/bus.h @@ -0,0 +1,4 @@ +void bus_setup(void); +void run_bus_trigger(int socket, int cpu, char *level, char *pp, char *rrrr, + char *ii, char *timeout); +void run_iomca_trigger(int socket, int cpu, int seg, int bus, int dev, int fn); diff --git a/input/iomca b/input/iomca new file mode 100644 index 0000000..9a1e27d --- /dev/null +++ b/input/iomca @@ -0,0 +1,4 @@ +CPU 0 BANK 1 +STATUS 0x9c00000000000e0b +MISC 0xabcdef +ADDR 0xabcd diff --git a/input/unknown b/input/unknown new file mode 100644 index 0000000..29a2436 --- /dev/null +++ b/input/unknown @@ -0,0 +1,4 @@ +CPU 0 BANK 1 +STATUS 0x9c0000000000040b +MISC 0xabcdef +ADDR 0xabcd diff --git a/mcelog.c b/mcelog.c index 89bb537..95a913f 100644 --- a/mcelog.c +++ b/mcelog.c @@ -58,6 +58,8 @@ #include "msg.h" #include "yellow.h" #include "page.h" +#include "bus.h" +#include "unknown.h" enum cputype cputype = CPU_GENERIC; @@ -567,6 +569,12 @@ static char *skipgunk(char *s) if (*s == ']') ++s; } + + s = skipspace(s); + + if (strncmp(s, "mce: [Hardware Error]:", 22) == 0) + s += 22; + return skipspace(s); } @@ -1153,6 +1161,8 @@ static void general_setup(void) { trigger_setup(); yellow_setup(); + bus_setup(); + unknown_setup(); 
config_cred("global", "run-credentials", &runcred); if (config_bool("global", "filter-memory-errors") == 1) filter_memory_errors = 1; diff --git a/mcelog.conf b/mcelog.conf index 1bab3ee..6a2be26 100644 --- a/mcelog.conf +++ b/mcelog.conf @@ -127,6 +127,9 @@ mem-ce-error-threshold = 100 / 24h # Log socket error threshold explicitely? mem-ce-error-log = yes +bus-uc-threshold-trigger = bus-error-trigger +iomca-threshold-trigger = iomca-error-trigger +unknown-threshold-trigger = unknown-error-trigger [cache] # Processing of cache error thresholds reported by Intel CPUs diff --git a/msr.c b/msr.c index 2eef9d2..665cac3 100644 --- a/msr.c +++ b/msr.c @@ -36,10 +36,8 @@ static void domsr(int cpu, int msr, int bit) SYSERRprintf("Cannot re-read MSR_ERROR_CONTROL from %s\n", fpath); exit(1); } - if ((data & bit) == 0) { - SYSERRprintf("Failed to set imc_log on cpu %d\n", cpu); - exit(1); - } + if ((data & bit) == 0) + Lprintf("No DIMM detection available on cpu %d (normal in virtual environments)\n", cpu); close(fd); } @@ -54,6 +52,8 @@ void set_imc_log(int cputype) msr = 0x17f; /* MSR_ERROR_CONTROL */ bit = 0x2; /* MemError Log Enable */ break; + default: + return; } for (cpu = 0; cpu < ncpus; cpu++) diff --git a/p4.c b/p4.c index 8a3b5a6..f938196 100644 --- a/p4.c +++ b/p4.c @@ -30,6 +30,8 @@ #include "tulsa.h" #include "intel.h" #include "yellow.h" +#include "bus.h" +#include "unknown.h" #include "bitfield.h" #include "sandy-bridge.h" #include "ivy-bridge.h" @@ -116,7 +118,7 @@ static char* get_II_str(__u8 i) return II[i]; } -static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) +static int decode_mca(u64 status, u64 misc, u64 track, int cpu, int *ismemerr, int socket) { #define TLB_LL_MASK 0x3 /*bit 0, bit 1*/ #define TLB_LL_SHIFT 0x0 @@ -141,6 +143,8 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) #define BUS_PP_MASK 0x600 /*bit 9, bit 10*/ #define BUS_PP_SHIFT 0x9 + u32 mca; + int ret = 0; static char 
*msg[] = { [0] = "No Error", [1] = "Unclassified", @@ -151,6 +155,7 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) [6] = "SMM Handler Code Access Violation", }; + mca = status & 0xffff; if (mca & (1UL << 12)) { Wprintf("corrected filtering (some unreported errors in same region)\n"); mca &= ~(1UL << 12); @@ -158,16 +163,27 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) if (mca < NELE(msg)) { Wprintf("%s\n", msg[mca]); - return; + return ret; } if ((mca >> 2) == 3) { - Wprintf("%s Generic memory hierarchy error\n", get_LL_str(mca & 3)); + unsigned levelnum; + char *level; + levelnum = mca & 3; + level = get_LL_str(levelnum); + Wprintf("%s Generic cache hierarchy error\n", level); + if (track == 2) + run_yellow_trigger(cpu, -1, levelnum, "unknown", level, socket); } else if (test_prefix(4, mca)) { - Wprintf("%s TLB %s Error\n", - get_TT_str((mca & TLB_TT_MASK) >> TLB_TT_SHIFT), - get_LL_str((mca & TLB_LL_MASK) >> - TLB_LL_SHIFT)); + unsigned levelnum, typenum; + char *level, *type; + typenum = (mca & TLB_TT_MASK) >> TLB_TT_SHIFT; + type = get_TT_str(typenum); + levelnum = (mca & TLB_LL_MASK) >> TLB_LL_SHIFT; + level = get_LL_str(levelnum); + Wprintf("%s TLB %s Error\n", type, level); + if (track == 2) + run_yellow_trigger(cpu, typenum, levelnum, type, level, socket); } else if (test_prefix(8, mca)) { unsigned typenum = (mca & CACHE_TT_MASK) >> CACHE_TT_SHIFT; unsigned levelnum = (mca & CACHE_LL_MASK) >> CACHE_LL_SHIFT; @@ -177,25 +193,51 @@ static void decode_mca(__u32 mca, u64 track, int cpu, int *ismemerr, int socket) get_RRRR_str((mca & CACHE_RRRR_MASK) >> CACHE_RRRR_SHIFT)); if (track == 2) - run_yellow_trigger(cpu, typenum, levelnum, type, level, socket); + run_yellow_trigger(cpu, typenum, levelnum, type, level,socket); } else if (test_prefix(10, mca)) { if (mca == 0x400) Wprintf("Internal Timer error\n"); else Wprintf("Internal unclassified error: %x\n", mca & 0xffff); + + ret = 1; } else 
if (test_prefix(11, mca)) { - Wprintf("BUS %s %s %s %s %s Error\n", - get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT), - get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT), - get_RRRR_str((mca & BUS_RRRR_MASK) >> - BUS_RRRR_SHIFT), - get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT), - get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT)); + char *level, *pp, *rrrr, *ii, *timeout; + + level = get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT); + pp = get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT); + rrrr = get_RRRR_str((mca & BUS_RRRR_MASK) >> BUS_RRRR_SHIFT); + ii = get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT); + timeout = get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT); + + Wprintf("BUS error: %d %d %s %s %s %s %s\n", socket, cpu, + level, pp, rrrr, ii, timeout); + run_bus_trigger(socket, cpu, level, pp, rrrr, ii, timeout); + /* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values + * and MISCV set. MISC register points to root port that reported the error + * need to cross check with AER logs for more details. 
+ * See: http://www.intel.com/content/www/us/en/architecture-and-technology/enhanced-mca-logging-xeon-paper.html + */ + if ((status & MCI_STATUS_MISCV) && + (status & 0xefff) == 0x0e0b) { + int seg, bus, dev, fn; + + seg = EXTRACT(misc, 32, 39); + bus = EXTRACT(misc, 24, 31); + dev = EXTRACT(misc, 19, 23); + fn = EXTRACT(misc, 16, 18); + Wprintf("IO MCA reported by root port %x:%02x:%02x.%x\n", + seg, bus, dev, fn); + run_iomca_trigger(socket, cpu, seg, bus, dev, fn); + } } else if (test_prefix(7, mca)) { decode_memory_controller(mca); *ismemerr = 1; - } else + } else { Wprintf("Unknown Error %x\n", mca); + ret = 1; + } + return ret; } static void p4_decode_model(__u32 model) @@ -243,7 +285,7 @@ static const char *arstate[4] = { [3] = "SRAR" }; -static void decode_mci(__u64 status, int cpu, unsigned mcgcap, int *ismemerr, +static int decode_mci(__u64 status, __u64 misc, int cpu, unsigned mcgcap, int *ismemerr, int socket) { u64 track = 0; @@ -280,7 +322,7 @@ static void decode_mci(__u64 status, int cpu, unsigned mcgcap, int *ismemerr, decode_tracking(track); } Wprintf("MCA: "); - decode_mca(status & 0xffffL, track, cpu, ismemerr, socket); + return decode_mca(status, misc, track, cpu, ismemerr, socket); } static void decode_mcg(__u64 mcgstatus) @@ -314,11 +356,14 @@ void decode_intel_mc(struct mce *log, int cputype, int *ismemerr, unsigned size) if (log->bank == MCE_THERMAL_BANK) { decode_thermal(log, cpu); + run_unknown_trigger(socket, cpu, log); return; } decode_mcg(log->mcgstatus); - decode_mci(log->status, cpu, log->mcgcap, ismemerr, socket); + if (decode_mci(log->status, log->misc, cpu, log->mcgcap, ismemerr, + socket)) + run_unknown_trigger(socket, cpu, log); if (test_prefix(11, (log->status & 0xffffL))) { switch (cputype) { @@ -365,23 +410,6 @@ void decode_intel_mc(struct mce *log, int cputype, int *ismemerr, unsigned size) hsw_decode_model(cputype, log->bank, log->status, log->misc); break; } - - /* IO MCA - reported as bus/interconnect with specific 
PP,T,RRRR,II,LL values - * and MISCV set. MISC register points to root port that reported the error - * need to cross check with AER logs for more details. - * See: http://www.intel.com/content/www/us/en/architecture-and-technology/enhanced-mca-logging-xeon-paper.html - */ - if ((log->status & MCI_STATUS_MISCV) && - (log->status & 0xefff) == 0x0e0b) { - int seg, bus, dev, fn; - - seg = EXTRACT(log->misc, 32, 39); - bus = EXTRACT(log->misc, 24, 31); - dev = EXTRACT(log->misc, 19, 23); - fn = EXTRACT(log->misc, 16, 18); - Wprintf("IO MCA reported by root port %x:%02x:%02x.%x\n", - seg, bus, dev, fn); - } } char *intel_bank_name(int num) diff --git a/tests/unknown/inject b/tests/unknown/inject new file mode 100755 index 0000000..7be39a7 --- /dev/null +++ b/tests/unknown/inject @@ -0,0 +1,8 @@ +#!/bin/sh + +B=$(pwd)/../.. + +PATH=$PATH:$B/../mce-inject + +mce-inject $B/input/iomca +mce-inject $B/input/unknown diff --git a/tests/unknown/unknown.conf b/tests/unknown/unknown.conf new file mode 100644 index 0000000..4b86db7 --- /dev/null +++ b/tests/unknown/unknown.conf @@ -0,0 +1,11 @@ +# trigger: 3 + +num-errors = 2 + +[socket] +bus-uc-threshold-trigger = ../trigger +iomca-threshold-trigger = ../trigger +unknown-threshold-trigger = ../trigger + +[trigger] +directory = . diff --git a/triggers/bus-error-trigger b/triggers/bus-error-trigger new file mode 100644 index 0000000..c996001 --- /dev/null +++ b/triggers/bus-error-trigger @@ -0,0 +1,23 @@ +#!/bin/sh +# This shell script can be executed by mcelog in daemon mode when a socket +# receives Bus and Interconnect errors +# +# environment: +# MESSAGE Human readable consolidated error message +# LOCATION Consolidated location as a single string +# SOCKETID Socket ID of the CPU that received the error +# LEVEL Interconnect level +# PARTICIPATION Processor Participation (Originator, Responder or Observer) +# REQUEST Request type (read, write, prefetch, etc.) 
+# ORIGIN Memory or IO +# TIMEOUT The request timed out or not +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./bus-error-trigger.local ] && . ./bus-error-trigger.local + +exit 0 diff --git a/triggers/iomca-error-trigger b/triggers/iomca-error-trigger new file mode 100644 index 0000000..3888461 --- /dev/null +++ b/triggers/iomca-error-trigger @@ -0,0 +1,23 @@ +#!/bin/sh +# This shell script can be executed by mcelog in daemon mode when a socket +# receives IO MCA errors +# +# environment: +# MESSAGE Human readable consolidated error message +# LOCATION Consolidated location as a single string +# SOCKETID Socket ID of the CPU that received the error +# CPU Linux CPU number that triggered the error +# SEG PCI segment number +# BUS PCI bus number +# DEVICE PCI device number +# FUNCTION PCI function number +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./iomca-error-trigger.local ] && . ./iomca-error-trigger.local + +exit 0 diff --git a/triggers/unknown-error-trigger b/triggers/unknown-error-trigger new file mode 100644 index 0000000..b924a0e --- /dev/null +++ b/triggers/unknown-error-trigger @@ -0,0 +1,26 @@ +#!/bin/sh +# This shell script is executed by mcelog in daemon mode when +# a not otherwise handled machine check error happens. 
+# +# environment: +# MESSAGE Human readable consolidated error message +# LOCATION Consolidated location as a single string +# SOCKETID Socket ID of the CPU that received the error +# CPU Linux CPU number that triggered the error +# STATUS IA32_MCi_STATUS register value +# ADDR IA32_MCi_ADDR register value +# MISC IA32_MCi_MISC register value +# MCGSTATUS IA32_MCG_STATUS register value +# MCGCAP IA32_MCG_CAP register value +# For details on the register layout please see the Intel SDM http://www.intel.com/sdm +# volume 3, chapter 15 +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./unknown-error-trigger.local ] && . ./unknown-error-trigger.local + +exit 0 diff --git a/unknown.c b/unknown.c new file mode 100644 index 0000000..482c29e --- /dev/null +++ b/unknown.c @@ -0,0 +1,82 @@ +/* Copyright (C) 2014 Intel Corporation + Author: Rui Wang + Handle all other unknown error requests. + + mcelog is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; version + 2. + + mcelog is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should find a copy of v2 of the GNU General Public License somewhere + on your Linux system. 
*/ +#define _GNU_SOURCE 1 +#include +#include +#include +#include +#include +#include "memutil.h" +#include "mcelog.h" +#include "config.h" +#include "trigger.h" +#include "unknown.h" + +static char *unknown_trigger; + +enum { + MAX_ENV = 20, +}; + +void unknown_setup(void) +{ + unknown_trigger = config_string("socket", "unknown-threshold-trigger"); + if (unknown_trigger && trigger_check(unknown_trigger) < 0) { + SYSERRprintf("Cannot access unknown threshold trigger `%s'", + unknown_trigger); + exit(1); + } +} + +void run_unknown_trigger(int socket, int cpu, struct mce *log) +{ + int ei = 0; + char *env[MAX_ENV]; + int i; + char *msg; + char *location; + + if (socket >= 0) + asprintf(&location, "CPU %d on socket %d", cpu, socket); + else + asprintf(&location, "CPU %d", cpu); + asprintf(&msg, "%s received unknown error", location); + asprintf(&env[ei++], "LOCATION=%s", location); + free(location); + + if (!unknown_trigger) + goto out; + + if (socket >= 0) + asprintf(&env[ei++], "SOCKETID=%d", socket); + asprintf(&env[ei++], "MESSAGE=%s", msg); + asprintf(&env[ei++], "CPU=%d", cpu); + asprintf(&env[ei++], "STATUS=%llx", log->status); + asprintf(&env[ei++], "MISC=%llx", log->misc); + asprintf(&env[ei++], "ADDR=%llx", log->addr); + asprintf(&env[ei++], "MCGSTATUS=%llx", log->mcgstatus); + asprintf(&env[ei++], "MCGCAP=%llx", log->mcgcap); + env[ei] = NULL; + assert(ei < MAX_ENV); + + run_trigger(unknown_trigger, NULL, env); + for (i = 0; i < ei; i++) + free(env[i]); +out: + free(msg); +} + diff --git a/unknown.h b/unknown.h new file mode 100644 index 0000000..0c6d876 --- /dev/null +++ b/unknown.h @@ -0,0 +1,2 @@ +void unknown_setup(void); +void run_unknown_trigger(int socket, int cpu, struct mce *log); diff --git a/yellow.c b/yellow.c index 0f8ccd0..57978ee 100644 --- a/yellow.c +++ b/yellow.c @@ -90,6 +90,8 @@ void run_yellow_trigger(int cpu, int tnum, int lnum, char *ts, char *ls, int soc asprintf(&env[ei++], "TYPE=%s", ts); if (cache_to_cpus(cpu, lnum, tnum, 
&cpumasklen, &cpumask) >= 0) env[ei++] = cpulist("AFFECTED_CPUS=", cpumask, cpumasklen); + else + asprintf(&env[ei++], "AFFECTED_CPUS=unknown"); env[ei] = NULL; assert(ei < MAX_ENV);