diff --git a/SOURCES/0041-rasdaemon-add-support-for-Haswell.patch b/SOURCES/0041-rasdaemon-add-support-for-Haswell.patch new file mode 100644 index 0000000..0344103 --- /dev/null +++ b/SOURCES/0041-rasdaemon-add-support-for-Haswell.patch @@ -0,0 +1,295 @@ +From 108b124a09512d44cd810d1ef6b823c9d029d5d6 Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Mon, 18 May 2015 14:19:28 -0300 +Subject: [PATCH 01/13] rasdaemon: add support for Haswell + +Based on mcelog code. + +Acked-by: Tony Luck +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + Makefile.am | 2 +- + mce-intel-haswell.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + mce-intel.c | 2 + + ras-mce-handler.c | 8 +++ + ras-mce-handler.h | 3 + + 5 files changed, 208 insertions(+), 1 deletion(-) + create mode 100644 mce-intel-haswell.c + +diff --git a/Makefile.am b/Makefile.am +index 9c5f007..a6bf18f 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -28,7 +28,7 @@ if WITH_MCE + rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \ + mce-intel-p4-p6.c mce-intel-nehalem.c \ + mce-intel-dunnington.c mce-intel-tulsa.c \ +- mce-intel-sb.c mce-intel-ivb.c ++ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c + endif + if WITH_EXTLOG + rasdaemon_SOURCES += ras-extlog-handler.c +diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c +new file mode 100644 +index 0000000..c32704c +--- /dev/null ++++ b/mce-intel-haswell.c +@@ -0,0 +1,194 @@ ++/* ++ * The code below came from Tony Luck mcelog code, ++ * released under GNU Public General License, v.2 ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++ ++#include ++#include ++ ++#include "ras-mce-handler.h" ++#include "bitfield.h" ++ ++ ++/* See IA32 SDM Vol3B Table 16-20 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT", ++ [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT", ++ [0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT", ++ [0x13] = "MC_DMI_TRAINING_TIMEOUT", ++ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX", ++ [0x25] = "MC_SVID_COMMAN_TIMEOUT", ++ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID", ++ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF", ++ [0x44] = "MC_CRITICAL_VR_FAILED", ++ [0x45] = "MC_ICC_MAX_NOTSUPPORTED", ++ [0x46] = "MC_VID_RAMP_DOWN_FAILED", ++ [0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP", ++ [0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED", ++ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED", ++ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0", ++ [0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1", ++ [0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2", ++ [0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3", ++ [0x4F] = "MC_SVID_COMMAND_ERROR", ++ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT", ++ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT", ++ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED", ++ [0x58] = "MC_SVID_IMON_REQUEST_FAILED", ++ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED", ++ [0x60] = "MC_INVALID_PKGS_REQ_PCH", ++ [0x61] = "MC_INVALID_PKGS_REQ_QPI", ++ [0x62] = "MC_INVALID_PKGS_RSP_QPI", ++ [0x63] = "MC_INVALID_PKGS_RSP_PCH", ++ [0x64] = "MC_INVALID_PKG_STATE_CONFIG", ++ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT", ++ [0x68] = "MC_IMC_RW_SMBUS_TIMEOUT", ++ [0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED", ++ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE", ++ [0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER", ++ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ", ++ [0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT", ++ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT" ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-21 */ ++ ++static char *qpi[] = { ++ [0x02] = "Intel QPI physical layer detected drift buffer alarm", ++ [0x03] = "Intel QPI physical layer detected latency buffer rollover", ++ [0x10] = "Intel QPI link layer detected control error from R3QPI", ++ [0x11] = "Rx entered LLR abort state on CRC error", ++ [0x12] = "Unsupported or undefined packet", ++ [0x13] = "Intel QPI link layer control error", ++ [0x15] = "RBT used un-initialized value", ++ [0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization", ++ [0x21] = "Link failover data self healing", ++ [0x22] = "Phy detected in-band reset (no width change)", ++ [0x23] = "Link failover clock failover", ++ [0x30] = "Rx detected CRC error - successful LLR after Phy re-init", ++ [0x31] = "Rx detected CRC error - successful LLR wihout Phy re-init", ++}; ++ ++static struct field qpi_mc[] = { ++ FIELD(16, qpi), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-22 */ ++ ++static struct field memctrl_mc9[] = { ++ SBITFIELD(16, "DDR3 address parity error"), ++ SBITFIELD(17, "Uncorrected HA write data error"), ++ SBITFIELD(18, "Uncorrected HA data byte enable error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ SBITFIELD(23, "Corrected memory read error"), ++ SBITFIELD(24, "iMC write data buffer parity error"), ++ SBITFIELD(25, "DDR4 command address parity error"), ++ {} ++}; ++ ++void hsw_decode_model(struct ras_events *ras, struct mce_event *e) ++{ ++ uint64_t status = e->status; ++ uint32_t mca = status & 0xffff; ++ unsigned rank0 = -1, rank1 = -1, chan; ++ ++ switch (e->bank) { ++ case 4: ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ /* Internal errors */ ++ break; ++ case 0x406: ++ /* Intel TXT errors */ ++ break; ++ case 0x407: ++ /* Other UBOX Internal errors */ ++ break; ++ } ++ if (EXTRACT(status, 16, 19)) ++ /* PCU internal error */ ++ decode_bitfield(e, status, pcu_mc4); ++ break; ++ case 5: ++ case 20: ++ case 21: ++ decode_bitfield(e, status, qpi_mc); ++ break; ++ case 9: case 10: case 11: case 12: ++ case 13: case 14: case 15: case 16: ++ decode_bitfield(e, status, memctrl_mc9); ++ break; ++ } ++ ++ /* ++ * Memory error specific code. Returns if the error is not a MC one ++ */ ++ ++ /* Check if the error is at the memory controller */ ++ if ((mca >> 7) != 1) ++ return; ++ ++ /* Ignore unless this is an corrected extended error from an iMC bank */ ++ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || ++ !test_prefix(7, status & 0xefff)) ++ return; ++ ++ /* ++ * Parse the reported channel and ranks ++ */ ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ return; ++ ++ mce_snprintf(e->mc_location, "memory_channel=%d", chan); ++ ++ if (EXTRACT(e->misc, 62, 62)) ++ rank0 = EXTRACT(e->misc, 46, 50); ++ ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ ++ /* ++ * FIXME: The conversion from rank to dimm requires to parse the ++ * DMI tables and call failrank2dimm(). ++ */ ++ if (rank0 >= 0 && rank1 >= 0) ++ mce_snprintf(e->mc_location, "ranks=%d and %d", ++ rank0, rank1); ++ else if (rank0 >= 0) ++ mce_snprintf(e->mc_location, "rank=%d", rank0); ++ else ++ mce_snprintf(e->mc_location, "rank=%d", rank1); ++} ++ +diff --git a/mce-intel.c b/mce-intel.c +index 427b98e..1546a1d 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -392,6 +392,8 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e) + case CPU_IVY_BRIDGE_EPEX: + ivb_decode_model(ras, e); + break; ++ case CPU_HASWELL_EPEX: ++ hsw_decode_model(ras, e); + default: + break; + } +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index a1d0b5d..d2de096 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -47,6 +47,8 @@ static char *cputype_name[] = { + [CPU_SANDY_BRIDGE_EP] = "Sandy Bridge EP", /* Fill in better name */ + [CPU_IVY_BRIDGE] = "Ivy Bridge", /* Fill in better name */ + [CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */ ++ [CPU_HASWELL] = "Haswell", ++ [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", + }; + + static enum cputype select_intel_cputype(struct ras_events *ras) +@@ -81,6 +83,12 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_IVY_BRIDGE; + else if (mce->model == 0x3e) + return CPU_IVY_BRIDGE_EPEX; ++ else if (mce->model == 0x3c || mce->model == 0x45 || ++ mce->model == 0x46) ++ return CPU_HASWELL; ++ else if (mce->model == 0x3f) ++ return CPU_HASWELL_EPEX; ++ + if (mce->model > 0x1a) { + log(ALL, LOG_INFO, + "Family 6 Model %x CPU: only decoding architectural errors\n", +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 80e9769..b8b3d4f 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -42,6 +42,8 @@ enum cputype { + CPU_SANDY_BRIDGE_EP, + CPU_IVY_BRIDGE, + CPU_IVY_BRIDGE_EPEX, ++ CPU_HASWELL, ++ CPU_HASWELL_EPEX, + }; + + struct mce_event { +@@ -114,6 +116,7 @@ void xeon75xx_decode_model(struct mce_event *e); + void dunnington_decode_model(struct mce_event *e); + void snb_decode_model(struct ras_events *ras, struct mce_event *e); + void ivb_decode_model(struct ras_events *ras, struct mce_event *e); ++void hsw_decode_model(struct ras_events *ras, struct mce_event *e); + void tulsa_decode_model(struct mce_event *e); + + /* Software defined banks */ +-- +1.8.3.1 + diff --git a/SOURCES/0042-rasdaemon-decode-new-simple-error-code-number-6.patch b/SOURCES/0042-rasdaemon-decode-new-simple-error-code-number-6.patch new file mode 100644 index 0000000..0691768 --- /dev/null +++ b/SOURCES/0042-rasdaemon-decode-new-simple-error-code-number-6.patch @@ -0,0 +1,40 @@ +From 85a2ead8f2d6e380be8d8234ba752a558e8027ed Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Mon, 18 May 2015 14:19:29 -0300 +Subject: [PATCH 02/13] rasdaemon: decode new simple error code number 6 + +This patch was based on fa313dd0144596dfa140bd66805367250d6eae9b +(mcelog) + + mcelog: Decode new simple error code number 6 + + Edition 050 of the Intel SDM released in late February 2014 + includes a new simple error code in "Table 15-8. IA32_MCi_Status + [15:0] Simple Error Code Encoding". Code 6 (0000 0000 0000 0110) + has been allocated for the reporting of cases where the BIOS SMM + code attempts to execute code outside of the protected SMRR area. + + Signed-off-by: Tony Luck + Signed-off-by: Andi Kleen + +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + mce-intel.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/mce-intel.c b/mce-intel.c +index 1546a1d..69ea00e 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -115,6 +115,7 @@ static char *mca_msg[] = { + [3] = "External error", + [4] = "FRC error", + [5] = "Internal parity error", ++ [6] = "SMM Handler Code Access Violation", + }; + + static char *tracking_msg[] = { +-- +1.8.3.1 + diff --git a/SOURCES/0043-rasdaemon-Add-missing-entry-to-Ivy-Bridge-memory-con.patch b/SOURCES/0043-rasdaemon-Add-missing-entry-to-Ivy-Bridge-memory-con.patch new file mode 100644 index 0000000..0cb3df2 --- /dev/null +++ b/SOURCES/0043-rasdaemon-Add-missing-entry-to-Ivy-Bridge-memory-con.patch @@ -0,0 +1,38 @@ +From 064a74b1202e529b5e16a54218fc17974906af2d Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Mon, 18 May 2015 14:19:30 -0300 +Subject: [PATCH 03/13] rasdaemon: Add missing entry to Ivy Bridge memory + controller decode table + +This patch is based on 2577aeb662374cb87169ee675b2e37c06f1aed99 (mcelog) + + mcelog: Add missing entry to Ivy Bridge memory controller decode table + + September 2013 edition of the software developer manual added an + entry that had been inadvertently omitted from earlier editions. + Add the 0x80 entry for "Corrected memory read error". + + Signed-off-by: Tony Luck + Signed-off-by: Andi Kleen + +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + mce-intel-ivb.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/mce-intel-ivb.c b/mce-intel-ivb.c +index f2a133a..0c5bebc 100644 +--- a/mce-intel-ivb.c ++++ b/mce-intel-ivb.c +@@ -76,6 +76,7 @@ static char *memctrl_1[] = { + [0x010] = "Uncorrected patrol scrub error", + [0x020] = "Corrected spare error", + [0x040] = "Uncorrected spare error", ++ [0x080] = "Corrected memory read error", + [0x100] = "iMC, WDB, parity errors", + }; + +-- +1.8.3.1 + diff --git a/SOURCES/0044-rasdaemon-Identify-Ivy-Bridge-properly.patch b/SOURCES/0044-rasdaemon-Identify-Ivy-Bridge-properly.patch new file mode 100644 index 0000000..27aee96 --- /dev/null +++ b/SOURCES/0044-rasdaemon-Identify-Ivy-Bridge-properly.patch @@ -0,0 +1,38 @@ +From 66021c20c92b5df16b5c8dae4fb664788fa40376 Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Mon, 18 May 2015 14:19:31 -0300 +Subject: [PATCH 04/13] rasdaemon: Identify Ivy Bridge properly + +This patch is based on b29cc4d615cead87cbc163ada0645b10c5b1217d (mcelog) + mcelog: Identify Ivy Bridge properly + + Uniquely identify Ivy Bridge even though the machine checks are the same + for Sandy Bridge and Ivy Bridge. This makes the output for the processor + display "Ivy Bridge". + + Signed-off-by: Prarit Bhargava + Cc: tony.luck@intel.com + Signed-off-by: Andi Kleen + +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + ras-mce-handler.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index d2de096..07e298f 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -75,7 +75,7 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_NEHALEM; + else if (mce->model == 0x2e || mce->model == 0x2f) + return CPU_XEON75XX; +- else if (mce->model == 0x2a || mce->model == 0x3a) ++ else if (mce->model == 0x2a) + return CPU_SANDY_BRIDGE; + else if (mce->model == 0x2d) + return CPU_SANDY_BRIDGE_EP; +-- +1.8.3.1 + diff --git a/SOURCES/0045-rasdaemon-add-support-for-Broadwell.patch b/SOURCES/0045-rasdaemon-add-support-for-Broadwell.patch new file mode 100644 index 0000000..ce568d3 --- /dev/null +++ b/SOURCES/0045-rasdaemon-add-support-for-Broadwell.patch @@ -0,0 +1,52 @@ +From a9810094cf838e03102f95333db7ddfe810ccabd Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Mon, 18 May 2015 14:19:32 -0300 +Subject: [PATCH 05/13] rasdaemon: add support for Broadwell + +Only basic support for now. + +Based on mcelog code. + +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + ras-mce-handler.c | 3 +++ + ras-mce-handler.h | 1 + + 2 files changed, 4 insertions(+) + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 07e298f..e059b92 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -49,6 +49,7 @@ static char *cputype_name[] = { + [CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */ + [CPU_HASWELL] = "Haswell", + [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", ++ [CPU_BROADWELL] = "Broadwell", + }; + + static enum cputype select_intel_cputype(struct ras_events *ras) +@@ -88,6 +89,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_HASWELL; + else if (mce->model == 0x3f) + return CPU_HASWELL_EPEX; ++ else if (mce->model == 0x3d) ++ return CPU_BROADWELL; + + if (mce->model > 0x1a) { + log(ALL, LOG_INFO, +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index b8b3d4f..ba01f55 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -44,6 +44,7 @@ enum cputype { + CPU_IVY_BRIDGE_EPEX, + CPU_HASWELL, + CPU_HASWELL_EPEX, ++ CPU_BROADWELL, + }; + + struct mce_event { +-- +1.8.3.1 + diff --git a/SOURCES/0046-rasdaemon-add-support-for-Knights-Landing.patch b/SOURCES/0046-rasdaemon-add-support-for-Knights-Landing.patch new file mode 100644 index 0000000..a6f4367 --- /dev/null +++ b/SOURCES/0046-rasdaemon-add-support-for-Knights-Landing.patch @@ -0,0 +1,50 @@ +From bd6c78d89f4e934fafb1136a15efc0d6df4635ed Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Mon, 18 May 2015 14:19:33 -0300 +Subject: [PATCH 06/13] rasdaemon: add support for Knights Landing + +Patch based on mcelog. + +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + ras-mce-handler.c | 3 +++ + ras-mce-handler.h | 1 + + 2 files changed, 4 insertions(+) + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index e059b92..63f14fd 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -50,6 +50,7 @@ static char *cputype_name[] = { + [CPU_HASWELL] = "Haswell", + [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", + [CPU_BROADWELL] = "Broadwell", ++ [CPU_KNIGHTS_LANDING] = "Knights Landing", + }; + + static enum cputype select_intel_cputype(struct ras_events *ras) +@@ -91,6 +92,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_HASWELL_EPEX; + else if (mce->model == 0x3d) + return CPU_BROADWELL; ++ else if (mce->model == 0x57) ++ return CPU_KNIGHTS_LANDING; + + if (mce->model > 0x1a) { + log(ALL, LOG_INFO, +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index ba01f55..28aad00 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -45,6 +45,7 @@ enum cputype { + CPU_HASWELL, + CPU_HASWELL_EPEX, + CPU_BROADWELL, ++ CPU_KNIGHTS_LANDING, + }; + + struct mce_event { +-- +1.8.3.1 + diff --git a/SOURCES/0047-rasdaemon-properly-pring-message-strings-in-decode_b.patch b/SOURCES/0047-rasdaemon-properly-pring-message-strings-in-decode_b.patch new file mode 100644 index 0000000..12d58d2 --- /dev/null +++ b/SOURCES/0047-rasdaemon-properly-pring-message-strings-in-decode_b.patch @@ -0,0 +1,33 @@ +From 5dd11c60b84294a3c6ce5ccb0db726b3dce35b10 Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi +Date: Tue, 26 May 2015 11:59:36 -0300 +Subject: [PATCH 07/13] rasdaemon: properly pring message strings in + decode_bitfield() + +Fix decode_bitfield() so that it does print message strings from the struct +field table. + +Signed-off-by: Seiichi Ikarashi +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + bitfield.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/bitfield.c b/bitfield.c +index 1dda30d..d6931c9 100644 +--- a/bitfield.c ++++ b/bitfield.c +@@ -84,7 +84,8 @@ void decode_bitfield(struct mce_event *e, uint64_t status, + continue; + mce_snprintf(e->error_msg, "<%u:%llx>", + f->start_bit, (long long)v); +- } ++ } else ++ mce_snprintf(e->error_msg, "%s", s); + } + } + +-- +1.8.3.1 + diff --git a/SOURCES/0048-rasdaemon-add-missing-semicolon-in-hsw_decode_model.patch b/SOURCES/0048-rasdaemon-add-missing-semicolon-in-hsw_decode_model.patch new file mode 100644 index 0000000..b956655 --- /dev/null +++ b/SOURCES/0048-rasdaemon-add-missing-semicolon-in-hsw_decode_model.patch @@ -0,0 +1,31 @@ +From abf36efe909c4022260cb4016c54d1ec3ec18cb8 Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi +Date: Tue, 26 May 2015 11:59:37 -0300 +Subject: [PATCH 08/13] rasdaemon: add missing semicolon in hsw_decode_model() + +hsw_decode_model() tries to skip decode_bitfield() if IA32_MC4_STATUS indicates +some internal errors. Unfortunately, here behaves opposite to the intention +because a semicolon is missing. + +Signed-off-by: Seiichi Ikarashi +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + mce-intel-haswell.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c +index c32704c..3ac12f2 100644 +--- a/mce-intel-haswell.c ++++ b/mce-intel-haswell.c +@@ -137,6 +137,7 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e) + } + if (EXTRACT(status, 16, 19)) + /* PCU internal error */ ++ ; + decode_bitfield(e, status, pcu_mc4); + break; + case 5: +-- +1.8.3.1 + diff --git a/SOURCES/0049-rasdaemon-enable-IMC-status-usage-for-Haswell-E.patch b/SOURCES/0049-rasdaemon-enable-IMC-status-usage-for-Haswell-E.patch new file mode 100644 index 0000000..24ec908 --- /dev/null +++ b/SOURCES/0049-rasdaemon-enable-IMC-status-usage-for-Haswell-E.patch @@ -0,0 +1,43 @@ +From f892a390c55c0b350c57cda9d166a9cf331aa36f Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi +Date: Tue, 26 May 2015 11:59:38 -0300 +Subject: [PATCH 09/13] rasdaemon: enable IMC status usage for Haswell-E + +Enable IMC status bank for Haswell-E, as described in Intel SDM Vol.3C +Table 35-27. + +Signed-off-by: Seiichi Ikarashi +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + mce-intel.c | 1 + + ras-mce-handler.c | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/mce-intel.c b/mce-intel.c +index 69ea00e..3684602 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -457,6 +457,7 @@ int set_intel_imc_log(enum cputype cputype, unsigned ncpus) + switch (cputype) { + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: ++ case CPU_HASWELL_EPEX: + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 63f14fd..fb6db8a 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -221,6 +221,7 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus) + switch (mce->cputype) { + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: ++ case CPU_HASWELL_EPEX: + set_intel_imc_log(mce->cputype, ncpus); + default: + break; +-- +1.8.3.1 + diff --git a/SOURCES/0050-rasdaemon-make-sure-the-error-is-valid-before-handli.patch b/SOURCES/0050-rasdaemon-make-sure-the-error-is-valid-before-handli.patch new file mode 100644 index 0000000..9c57427 --- /dev/null +++ b/SOURCES/0050-rasdaemon-make-sure-the-error-is-valid-before-handli.patch @@ -0,0 +1,54 @@ +From 56913e2f2a5a6ddf8ab684c8d528e9ef1d55cfba Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi +Date: Tue, 26 May 2015 11:59:39 -0300 +Subject: [PATCH 10/13] rasdaemon: make sure the error is valid before handling + ranks + +Fix "rank" handling according to the Bit 63 description in Intel SDM Vol.3C +Table 16-23, that says "... Use this information only after there is valid +first error info indicated by bit 62". +Also fix invalid comparisons of unsigned variables "rank0" and "rank1". + +Signed-off-by: Seiichi Ikarashi +Signed-off-by: Aristeu Rozanski +Signed-off-by: Mauro Carvalho Chehab +--- + mce-intel-haswell.c | 14 ++++++-------- + 1 file changed, 6 insertions(+), 8 deletions(-) + +diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c +index 3ac12f2..0a817bf 100644 +--- a/mce-intel-haswell.c ++++ b/mce-intel-haswell.c +@@ -174,22 +174,20 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e) + + mce_snprintf(e->mc_location, "memory_channel=%d", chan); + +- if (EXTRACT(e->misc, 62, 62)) ++ if (EXTRACT(e->misc, 62, 62)) { + rank0 = EXTRACT(e->misc, 46, 50); +- +- if (EXTRACT(e->misc, 63, 63)) +- rank1 = EXTRACT(e->misc, 51, 55); ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ } + + /* + * FIXME: The conversion from rank to dimm requires to parse the + * DMI tables and call failrank2dimm(). + */ +- if (rank0 >= 0 && rank1 >= 0) ++ if (rank0 != -1 && rank1 != -1) + mce_snprintf(e->mc_location, "ranks=%d and %d", + rank0, rank1); +- else if (rank0 >= 0) ++ else if (rank0 != -1) + mce_snprintf(e->mc_location, "rank=%d", rank0); +- else +- mce_snprintf(e->mc_location, "rank=%d", rank1); + } + +-- +1.8.3.1 + diff --git a/SPECS/rasdaemon.spec b/SPECS/rasdaemon.spec index f3c59de..c070011 100644 --- a/SPECS/rasdaemon.spec +++ b/SPECS/rasdaemon.spec @@ -2,7 +2,7 @@ Name: rasdaemon Version: 0.4.1 -Release: 14%{?dist} +Release: 14.1%{?dist} Summary: Utility to receive RAS error tracings Group: Applications/System License: GPLv2 @@ -53,6 +53,17 @@ Patch27: 0037-rasdaemon-sqlite-truncates-some-MCE-fields-to-32-bit.patch Patch28: 0038-rasdaemon-fix-mce-numfield-decoded-error.patch Patch29: 0039-rasdaemon-do-not-assume-dimmX-directories-will-be-pr.patch Patch30: 0040-rasdaemon-add-more-dell-labels.patch +Patch31: 0041-rasdaemon-add-support-for-Haswell.patch +Patch32: 0042-rasdaemon-decode-new-simple-error-code-number-6.patch +Patch33: 0043-rasdaemon-Add-missing-entry-to-Ivy-Bridge-memory-con.patch +Patch34: 0044-rasdaemon-Identify-Ivy-Bridge-properly.patch +Patch35: 0045-rasdaemon-add-support-for-Broadwell.patch +Patch36: 0046-rasdaemon-add-support-for-Knights-Landing.patch +Patch37: 0047-rasdaemon-properly-pring-message-strings-in-decode_b.patch +Patch38: 0048-rasdaemon-add-missing-semicolon-in-hsw_decode_model.patch +Patch39: 0049-rasdaemon-enable-IMC-status-usage-for-Haswell-E.patch +Patch40: 0050-rasdaemon-make-sure-the-error-is-valid-before-handli.patch +#Patch41: 0051-rasdaemon-add-support-to-match-the-machine-by-system.patch %description @@ -97,6 +108,17 @@ an utility for reporting current error counts from the EDAC sysfs files. %patch28 -p1 %patch29 -p1 %patch30 -p1 +%patch31 -p1 +%patch32 -p1 +%patch33 -p1 +%patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 +%patch38 -p1 +%patch39 -p1 +%patch40 -p1 +#%patch41 -p1 %build autoreconf -vfi @@ -124,6 +146,9 @@ rm -rf %{buildroot} %{_sysconfdir}/ras/dimm_labels.d %changelog +* Wed Jun 03 2015 Aristeu Rozanski 0.4.1-14.1.el7 +- add support to Haswell and newer processors [1230243] + * Tue Dec 16 2014 Aristeu Rozanski 0.4.1-14.el7 - properly install the labels so it can be packaged [1073090]