Blob Blame History Raw
From: Prarit Bhargava <prarit@redhat.com>

Subject: mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems

commit 6ed93e30f83519b0ab71f8ecd156b8ff0b2912b6
Author: Tony Luck <tony.luck@intel.com>
Date:   Mon Sep 24 11:14:45 2018 -0700

    mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems
    
    Ivy Bridge was the last system that gave us enough information
    to figure out the exact DIMM that is the source of a memory error.
    We gave up on DIMM logging at that point.
    
    But we can still figure out the socket, memory controller and channel.
    
    Signed-off-by: Tony Luck <tony.luck@intel.com>
    Signed-off-by: Andi Kleen <ak@linux.intel.com>

diff --git a/haswell.c b/haswell.c
index 892ebc7248e808248798f21506b54faca147db9b..4eccbeb21a281467495e024b376d81be96b2183e 100644
--- a/haswell.c
+++ b/haswell.c
@@ -148,3 +148,45 @@ void hsw_decode_model(int cputype, int bank, u64 status, u64 misc)
 		break;
 	}
 }
+
+/*
+ * There isn't enough information to identify the DIMM, so "dimm" is unused;
+ * only channel[0] is set. Banks 7/8 (home agents) take the channel from the
+ * low status bits, banks 9-16 from the bank number (second agent/controller:
+ * 4, 5, 6, 7). Non-memory errors, a 0xf channel field and other banks: no-op.
+ */
+void haswell_memerr_misc(struct mce *m, int *channel, int *dimm)
+{
+	u64 status = m->status;
+	unsigned	chan;
+
+	/* Check this is a memory error */
+	if (!test_prefix(7, status & 0xefff))
+		return;
+
+	chan = EXTRACT(status, 0, 3);
+	if (chan == 0xf)
+		return;
+
+	switch (m->bank) {
+	case 7:
+		/* Home agent 0 */
+		break;
+	case 8:
+		/* Home agent 1 */
+		chan += 4;
+		break;
+	case 9: case 10: case 11: case 12:
+		/* Memory controller 0 */
+		chan = m->bank - 9;
+		break;
+	case 13: case 14: case 15: case 16:
+		/* Memory controller 1 */
+		chan = (m->bank - 13) + 4;
+		break;
+	default:
+		return;
+	}
+
+	channel[0] = chan;
+}
diff --git a/haswell.h b/haswell.h
index ba3fb1c3c985aec0ac1a0a271dca3c3afd18874c..712c8eb66d50a1bf63a7dbd67382fe775b59d69b 100644
--- a/haswell.h
+++ b/haswell.h
@@ -1,2 +1,3 @@
 void hsw_decode_model(int cputype, int bank, u64 status, u64 misc);
 void haswell_ep_memerr_misc(struct mce *m, int *channel, int *dimm);
+void haswell_memerr_misc(struct mce *m, int *channel, int *dimm);
diff --git a/intel.c b/intel.c
index 20d2acdc12daa1128d72471d53639aebf82f4854..b655c4162f8980d5d826640fa4375c7ba6b1e97d 100644
--- a/intel.c
+++ b/intel.c
@@ -25,6 +25,7 @@
 #include "sandy-bridge.h"
 #include "ivy-bridge.h"
 #include "haswell.h"
+#include "skylake_xeon.h"
 
 int memory_error_support;
 
@@ -140,6 +141,13 @@ static int intel_memory_error(struct mce *m, unsigned recordlen)
 		case CPU_IVY_BRIDGE_EPEX:
 			ivy_bridge_ep_memerr_misc(m, channel, dimm);
 			break;
+		case CPU_HASWELL_EPEX:
+		case CPU_BROADWELL_EPEX:
+			haswell_memerr_misc(m, channel, dimm);
+			break;
+		case CPU_SKYLAKE_XEON:
+			skylake_memerr_misc(m, channel, dimm);
+			break;
 		default:
 			break;
 		} 
diff --git a/skylake_xeon.c b/skylake_xeon.c
index 16c6181987f0126d377b64a8f5d4a96a01bfa1c4..b02f8acd806e2a64ed1653f44349fd3e9abf374e 100644
--- a/skylake_xeon.c
+++ b/skylake_xeon.c
@@ -228,3 +228,45 @@ int skylake_s_ce_type(int bank, u64 status, u64 misc)
 
 	return 0;
 }
+
+/*
+ * There isn't enough information to identify the DIMM, so "dimm" is unused;
+ * only channel[0] is set. Banks 7/8 (home agents) take the channel from the
+ * low status bits, banks 13-18 from the bank number (second agent/controller:
+ * 3, 4, 5). Non-memory errors, a 0xf channel field and other banks: no-op.
+ */
+void skylake_memerr_misc(struct mce *m, int *channel, int *dimm)
+{
+	u64 status = m->status;
+	unsigned	chan;
+
+	/* Check this is a memory error */
+	if (!test_prefix(7, status & 0xefff))
+		return;
+
+	chan = EXTRACT(status, 0, 3);
+	if (chan == 0xf)
+		return;
+
+	switch (m->bank) {
+	case 7:
+		/* Home agent 0 */
+		break;
+	case 8:
+		/* Home agent 1 */
+		chan += 3;
+		break;
+	case 13: case 14: case 15:
+		/* Memory controller 0 */
+		chan = m->bank - 13;
+		break;
+	case 16: case 17: case 18:
+		/* Memory controller 1 */
+		chan = (m->bank - 16) + 3;
+		break;
+	default:
+		return;
+	}
+
+	channel[0] = chan;
+}
diff --git a/skylake_xeon.h b/skylake_xeon.h
index edcd9c030fa70f10ac23f2df9be948b10c73f4a1..098e6fa0e3eaff1b1d7e3040eddfb9187dabd7dd 100644
--- a/skylake_xeon.h
+++ b/skylake_xeon.h
@@ -1,2 +1,3 @@
 void skylake_s_decode_model(int cputype, int bank, u64 status, u64 misc);
 int skylake_s_ce_type(int bank, u64 status, u64 misc);
+void skylake_memerr_misc(struct mce *m, int *channel, int *dimm);