Blame SOURCES/mcelog-patch-6ed93e30f835.patch

a6d7b5
From: Prarit Bhargava <prarit@redhat.com>
a6d7b5
a6d7b5
Subject: mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems
a6d7b5
a6d7b5
commit 6ed93e30f83519b0ab71f8ecd156b8ff0b2912b6
a6d7b5
Author: Tony Luck <tony.luck@intel.com>
a6d7b5
Date:   Mon Sep 24 11:14:45 2018 -0700
a6d7b5
a6d7b5
    mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems
a6d7b5
    
a6d7b5
    Ivy Bridge was the last system that gave us enough information
a6d7b5
    to figure out the exact DIMM that is the source of a memory error.
a6d7b5
    We gave up on DIMM logging at that point.
a6d7b5
    
a6d7b5
    But we can still figure out the socket, memory controller and channel.
a6d7b5
    
a6d7b5
    Signed-off-by: Tony Luck <tony.luck@intel.com>
a6d7b5
    Signed-off-by: Andi Kleen <ak@linux.intel.com>
a6d7b5
a6d7b5
diff --git a/haswell.c b/haswell.c
a6d7b5
index 892ebc7248e808248798f21506b54faca147db9b..4eccbeb21a281467495e024b376d81be96b2183e 100644
a6d7b5
--- a/haswell.c
a6d7b5
+++ b/haswell.c
a6d7b5
@@ -148,3 +148,45 @@ void hsw_decode_model(int cputype, int bank, u64 status, u64 misc)
a6d7b5
 		break;
a6d7b5
 	}
a6d7b5
 }
a6d7b5
+
a6d7b5
+/*
a6d7b5
+ * There isn't enough information to identify the DIMM, so *dimm is never
a6d7b5
+ * written. The channel comes from status (banks 7-8) or from the bank
a6d7b5
+ * number (banks 9-16); second-controller channels are 4, 5, 6, 7.
a6d7b5
+ * Non-memory errors, channel 0xf and other banks leave *channel untouched.
a6d7b5
+ */
a6d7b5
+void haswell_memerr_misc(struct mce *m, int *channel, int *dimm)
a6d7b5
+{
a6d7b5
+	u64 status = m->status;
a6d7b5
+	unsigned	chan;
a6d7b5
+
a6d7b5
+	/* Check this is a memory error; anything else is ignored */
a6d7b5
+	if (!test_prefix(7, status & 0xefff))
a6d7b5
+		return;
a6d7b5
+
a6d7b5
+	chan = EXTRACT(status, 0, 3);
a6d7b5
+	if (chan == 0xf)
a6d7b5
+		return;
a6d7b5
+
a6d7b5
+	switch (m->bank) {
a6d7b5
+	case 7:
a6d7b5
+		/* Home agent 0: use the status-derived channel as is */
a6d7b5
+		break;
a6d7b5
+	case 8:
a6d7b5
+		/* Home agent 1: status-derived channel plus 4 */
a6d7b5
+		chan += 4;
a6d7b5
+		break;
a6d7b5
+	case 9: case 10: case 11: case 12:
a6d7b5
+		/* Memory controller 0: banks 9-12 map to channels 0-3 */
a6d7b5
+		chan = m->bank - 9;
a6d7b5
+		break;
a6d7b5
+	case 13: case 14: case 15: case 16:
a6d7b5
+		/* Memory controller 1: banks 13-16 map to channels 4-7 */
a6d7b5
+		chan = (m->bank - 13) + 4;
a6d7b5
+		break;
a6d7b5
+	default:
a6d7b5
+		return;
a6d7b5
+	}
a6d7b5
+
a6d7b5
+	channel[0] = chan;
a6d7b5
+}
a6d7b5
diff --git a/haswell.h b/haswell.h
a6d7b5
index ba3fb1c3c985aec0ac1a0a271dca3c3afd18874c..712c8eb66d50a1bf63a7dbd67382fe775b59d69b 100644
a6d7b5
--- a/haswell.h
a6d7b5
+++ b/haswell.h
a6d7b5
@@ -1,2 +1,3 @@
a6d7b5
 void hsw_decode_model(int cputype, int bank, u64 status, u64 misc);
a6d7b5
 void haswell_ep_memerr_misc(struct mce *m, int *channel, int *dimm);
a6d7b5
+void haswell_memerr_misc(struct mce *m, int *channel, int *dimm);
a6d7b5
diff --git a/intel.c b/intel.c
a6d7b5
index 20d2acdc12daa1128d72471d53639aebf82f4854..b655c4162f8980d5d826640fa4375c7ba6b1e97d 100644
a6d7b5
--- a/intel.c
a6d7b5
+++ b/intel.c
a6d7b5
@@ -25,6 +25,7 @@
a6d7b5
 #include "sandy-bridge.h"
a6d7b5
 #include "ivy-bridge.h"
a6d7b5
 #include "haswell.h"
a6d7b5
+#include "skylake_xeon.h"
a6d7b5
 
a6d7b5
 int memory_error_support;
a6d7b5
 
a6d7b5
@@ -140,6 +141,13 @@ static int intel_memory_error(struct mce *m, unsigned recordlen)
a6d7b5
 		case CPU_IVY_BRIDGE_EPEX:
a6d7b5
 			ivy_bridge_ep_memerr_misc(m, channel, dimm);
a6d7b5
 			break;
a6d7b5
+		case CPU_HASWELL_EPEX:
a6d7b5
+		case CPU_BROADWELL_EPEX:
a6d7b5
+			haswell_memerr_misc(m, channel, dimm);
a6d7b5
+			break;
a6d7b5
+		case CPU_SKYLAKE_XEON:
a6d7b5
+			skylake_memerr_misc(m, channel, dimm);
a6d7b5
+			break;
a6d7b5
 		default:
a6d7b5
 			break;
a6d7b5
 		} 
a6d7b5
diff --git a/skylake_xeon.c b/skylake_xeon.c
a6d7b5
index 16c6181987f0126d377b64a8f5d4a96a01bfa1c4..b02f8acd806e2a64ed1653f44349fd3e9abf374e 100644
a6d7b5
--- a/skylake_xeon.c
a6d7b5
+++ b/skylake_xeon.c
a6d7b5
@@ -228,3 +228,45 @@ int skylake_s_ce_type(int bank, u64 status, u64 misc)
a6d7b5
 
a6d7b5
 	return 0;
a6d7b5
 }
a6d7b5
+
a6d7b5
+/*
a6d7b5
+ * There isn't enough information to identify the DIMM, so *dimm is never
a6d7b5
+ * written. The channel comes from status (banks 7-8) or from the bank
a6d7b5
+ * number (banks 13-18); second-controller channels are 3, 4, 5.
a6d7b5
+ * Non-memory errors, channel 0xf and other banks leave *channel untouched.
a6d7b5
+ */
a6d7b5
+void skylake_memerr_misc(struct mce *m, int *channel, int *dimm)
a6d7b5
+{
a6d7b5
+	u64 status = m->status;
a6d7b5
+	unsigned	chan;
a6d7b5
+
a6d7b5
+	/* Check this is a memory error; anything else is ignored */
a6d7b5
+	if (!test_prefix(7, status & 0xefff))
a6d7b5
+		return;
a6d7b5
+
a6d7b5
+	chan = EXTRACT(status, 0, 3);
a6d7b5
+	if (chan == 0xf)
a6d7b5
+		return;
a6d7b5
+
a6d7b5
+	switch (m->bank) {
a6d7b5
+	case 7:
a6d7b5
+		/* Home agent 0: use the status-derived channel as is */
a6d7b5
+		break;
a6d7b5
+	case 8:
a6d7b5
+		/* Home agent 1: status-derived channel plus 3 */
a6d7b5
+		chan += 3;
a6d7b5
+		break;
a6d7b5
+	case 13: case 14: case 15:
a6d7b5
+		/* Memory controller 0: banks 13-15 map to channels 0-2 */
a6d7b5
+		chan = m->bank - 13;
a6d7b5
+		break;
a6d7b5
+	case 16: case 17: case 18:
a6d7b5
+		/* Memory controller 1: banks 16-18 map to channels 3-5 */
a6d7b5
+		chan = (m->bank - 16) + 3;
a6d7b5
+		break;
a6d7b5
+	default:
a6d7b5
+		return;
a6d7b5
+	}
a6d7b5
+
a6d7b5
+	channel[0] = chan;
a6d7b5
+}
a6d7b5
diff --git a/skylake_xeon.h b/skylake_xeon.h
a6d7b5
index edcd9c030fa70f10ac23f2df9be948b10c73f4a1..098e6fa0e3eaff1b1d7e3040eddfb9187dabd7dd 100644
a6d7b5
--- a/skylake_xeon.h
a6d7b5
+++ b/skylake_xeon.h
a6d7b5
@@ -1,2 +1,3 @@
a6d7b5
 void skylake_s_decode_model(int cputype, int bank, u64 status, u64 misc);
a6d7b5
 int skylake_s_ce_type(int bank, u64 status, u64 misc);
a6d7b5
+void skylake_memerr_misc(struct mce *m, int *channel, int *dimm);