|
|
b404a9 |
From 744e4c6a6cbbb9ba0569bf8e3ab50171e974b2e3 Mon Sep 17 00:00:00 2001
|
|
|
b404a9 |
From: Denys Vlasenko <dvlasenk@redhat.com>
|
|
|
b404a9 |
Date: Mon, 6 Jan 2014 17:18:31 +0100
|
|
|
b404a9 |
Subject: [ABRT PATCH 19/27] Fix handling of Machine Check Exceptions.
|
|
|
b404a9 |
|
|
|
b404a9 |
Closes #764.
|
|
|
b404a9 |
|
|
|
b404a9 |
If non-fatal MCE is seen, abrt will detect it as an oops
|
|
|
b404a9 |
and alert user in a usual manner. When user opens this
|
|
|
b404a9 |
abrt problem for reporting, he will see that "comment"
|
|
|
b404a9 |
field is pre-filled with a text.
|
|
|
b404a9 |
What it says depends on whether mcelog tool is installed.
|
|
|
b404a9 |
If mcelog is installed, the text will say that hardware errors
|
|
|
b404a9 |
were detected, and will show the tail of either /var/log/mcelog
|
|
|
b404a9 |
or syslog.
|
|
|
b404a9 |
Otherwise the text will say that hardware errors
|
|
|
b404a9 |
were detected, but they can't be usefully diagnosed,
|
|
|
b404a9 |
and user is strongly advised to install mcelog tool.
|
|
|
b404a9 |
|
|
|
b404a9 |
If fatal MCE is encountered, kernel always panics,
|
|
|
b404a9 |
(abrt has no chance of catching the oops),
|
|
|
b404a9 |
kdump kicks in, and then after reboot abrt says that new vmcore
|
|
|
b404a9 |
is found. When user generates backtrace, he will see oops text
|
|
|
b404a9 |
which starts with
|
|
|
b404a9 |
"Machine Check Exception: BANK nnn ..." and (hopefully)
|
|
|
b404a9 |
is already explanatory enough.
|
|
|
b404a9 |
|
|
|
b404a9 |
(Yes, it's weird that kernel shows human-readable error messages
|
|
|
b404a9 |
on fatal MCEs but doesn't do that for non-fatal ones.
|
|
|
b404a9 |
This makes fetching MCE info significantly different...
|
|
|
b404a9 |
I wish kernel would show human-readable MCEs in both cases,
|
|
|
b404a9 |
we wouldn't need mcelog then... oh well.)
|
|
|
b404a9 |
|
|
|
b404a9 |
In order to generate a meaningful hash for MCEs,
|
|
|
b404a9 |
oops hashing was extended for oopses without backtraces.
|
|
|
b404a9 |
|
|
|
b404a9 |
Since MCEs, unlike regular oopses, don't contain kernel version,
|
|
|
b404a9 |
additional magic is added to extract kernel version
|
|
|
b404a9 |
in vmcore event handling.
|
|
|
b404a9 |
|
|
|
b404a9 |
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
|
|
|
b404a9 |
|
|
|
b404a9 |
Related to rhbz#1032077
|
|
|
b404a9 |
|
|
|
b404a9 |
Signed-off-by: Jakub Filak <jfilak@redhat.com>
|
|
|
b404a9 |
---
|
|
|
b404a9 |
src/lib/kernel.c | 31 +++++++++++++++++++++++++
|
|
|
b404a9 |
src/plugins/koops_event.conf | 54 +++++++++++++++++++++++++++++++++++++++++++
|
|
|
b404a9 |
src/plugins/vmcore_event.conf | 18 ++++++++++++++-
|
|
|
b404a9 |
3 files changed, 102 insertions(+), 1 deletion(-)
|
|
|
b404a9 |
|
|
|
b404a9 |
diff --git a/src/lib/kernel.c b/src/lib/kernel.c
|
|
|
b404a9 |
index ce8815b..340ec39 100644
|
|
|
b404a9 |
--- a/src/lib/kernel.c
|
|
|
b404a9 |
+++ b/src/lib/kernel.c
|
|
|
b404a9 |
@@ -115,8 +115,29 @@ static const char *const s_koops_suspicious_strings[] = {
|
|
|
b404a9 |
* arch/x86/kernel/cpu/mcheck/p5.c: "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
|
|
|
b404a9 |
* arch/x86/kernel/cpu/mcheck/mce.c: pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
|
|
|
b404a9 |
* drivers/edac/sb_edac.c: printk("CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
|
|
|
b404a9 |
+ *
|
|
|
b404a9 |
+ * MCEs can be fatal (they panic kernel) or not.
|
|
|
b404a9 |
+ * Fatal MCEs are delivered as exception#18 to the CPU.
|
|
|
b404a9 |
+ * Non-fatal ones sometimes are delivered as exception#18;
|
|
|
b404a9 |
+ * other times they are silently recorded in magic MSRs, CPU is not alerted.
|
|
|
b404a9 |
+ * Linux kernel periodically (up to 5 mins interval) reads those MSRs
|
|
|
b404a9 |
+ * and if MCE is seen there, it is piped in binary form through
|
|
|
b404a9 |
+ * /dev/mcelog to whoever listens on it. (Such as mcelog tool in --daemon
|
|
|
b404a9 |
+ * mode; but cat </dev/mcelog would do too).
|
|
|
b404a9 |
+ *
|
|
|
b404a9 |
+ * "Machine Check Exception:" message is printed *only*
|
|
|
b404a9 |
+ * by fatal MCEs (so far, future kernels may be different).
|
|
|
b404a9 |
+ * It will be caught as vmcore if kdump is configured.
|
|
|
b404a9 |
+ *
|
|
|
b404a9 |
+ * Non-fatal MCEs have "[Hardware Error]: Machine check events logged"
|
|
|
b404a9 |
+ * message in kernel log.
|
|
|
b404a9 |
+ * When /dev/mcelog is read, *no additional kernel log messages appear*:
|
|
|
b404a9 |
+ * if we want more readable data, we must rely on other tools
|
|
|
b404a9 |
+ * (such as mcelog daemon consuming binary /dev/mcelog and writing
|
|
|
b404a9 |
+ * human-readable /var/log/mcelog).
|
|
|
b404a9 |
*/
|
|
|
b404a9 |
"Machine Check Exception:",
|
|
|
b404a9 |
+ "Machine check events logged",
|
|
|
b404a9 |
|
|
|
b404a9 |
/* X86 TRAPs */
|
|
|
b404a9 |
"divide error:",
|
|
|
b404a9 |
@@ -299,6 +320,16 @@ next_line:
|
|
|
b404a9 |
if (strcasestr(curline, "Call Trace:")) /* yes, it must be case-insensitive */
|
|
|
b404a9 |
inbacktrace = 1;
|
|
|
b404a9 |
else
|
|
|
b404a9 |
+ /* Fatal MCE's have a few lines of useful information between
|
|
|
b404a9 |
+ * first "Machine Check Exception:" line and the final "Kernel panic"
|
|
|
b404a9 |
+ * line. Such oops, of course, is only detectable in kdumps (tested)
|
|
|
b404a9 |
+ * or possibly pstore-saved logs (I did not try this yet).
|
|
|
b404a9 |
+ * In order to capture all these lines, we treat final line
|
|
|
b404a9 |
+ * as "backtrace" (which is admittedly a hack):
|
|
|
b404a9 |
+ */
|
|
|
b404a9 |
+ if (strstr(curline, "Kernel panic - not syncing"))
|
|
|
b404a9 |
+ inbacktrace = 1;
|
|
|
b404a9 |
+ else
|
|
|
b404a9 |
if (strnlen(curline, 9) > 8
|
|
|
b404a9 |
&& ( (curline[0] == '(' && curline[1] == '[' && curline[2] == '<')
|
|
|
b404a9 |
|| (curline[0] == '[' && curline[1] == '<'))
|
|
|
b404a9 |
diff --git a/src/plugins/koops_event.conf b/src/plugins/koops_event.conf
|
|
|
b404a9 |
index c0277c8..7dfbe36 100644
|
|
|
b404a9 |
--- a/src/plugins/koops_event.conf
|
|
|
b404a9 |
+++ b/src/plugins/koops_event.conf
|
|
|
b404a9 |
@@ -4,6 +4,60 @@ EVENT=post-create analyzer=Kerneloops
|
|
|
b404a9 |
abrt-action-analyze-oops &&
|
|
|
b404a9 |
dmesg >>dmesg &&
|
|
|
b404a9 |
abrt-action-save-kernel-data
|
|
|
b404a9 |
+ abrt-action-save-kernel-data || exit $?
|
|
|
b404a9 |
+ #
|
|
|
b404a9 |
+ # If it exists, we can save a copy of MCE log here:
|
|
|
b404a9 |
+ #test -f /var/log/mcelog && cp /var/log/mcelog .
|
|
|
b404a9 |
+ # but in current config, sosreport already does that.
|
|
|
b404a9 |
+ #
|
|
|
b404a9 |
+ # See if MCEs were seen but mcelog isn't installed or running
|
|
|
b404a9 |
+ grep -qFi 'Machine check events logged' dmesg || exit 0
|
|
|
b404a9 |
+ #
|
|
|
b404a9 |
+ # There was an MCE. IOW: it's not a bug, it's a HW error.
|
|
|
b404a9 |
+ # Did mcelog log it to /var/log/mcelog
|
|
|
b404a9 |
+ # (RHEL6 by default does this)?
|
|
|
b404a9 |
+ test -f /var/log/mcelog &&
|
|
|
b404a9 |
+ {
|
|
|
b404a9 |
+ # (Ab)use user comment field to inform user about it.
|
|
|
b404a9 |
+ echo "The kernel log indicates that hardware errors were detected."
|
|
|
b404a9 |
+ echo "/var/log/mcelog file may have more information."
|
|
|
b404a9 |
+ echo "The last 20 lines of /var/log/mcelog are:"
|
|
|
b404a9 |
+ echo "========================================="
|
|
|
b404a9 |
+ # Redirecting stderr in case SELinux makes it unreadable
|
|
|
b404a9 |
+ # (annoying anyway, but at least user knows what's going on):
|
|
|
b404a9 |
+ tail -n20 /var/log/mcelog 2>&1
|
|
|
b404a9 |
+ exit 0
|
|
|
b404a9 |
+ } >comment
|
|
|
b404a9 |
+ #
|
|
|
b404a9 |
+ # On RHEL7, mcelog is run so that its output ends up in syslog.
|
|
|
b404a9 |
+ # Do we see that?
|
|
|
b404a9 |
+ grep -qFi 'mcelog: Hardware event' /var/log/messages &&
|
|
|
b404a9 |
+ {
|
|
|
b404a9 |
+ echo "The kernel log indicates that hardware errors were detected."
|
|
|
b404a9 |
+ echo "System log may have more information."
|
|
|
b404a9 |
+ echo "The last 20 mcelog lines of system log are:"
|
|
|
b404a9 |
+ echo "========================================="
|
|
|
b404a9 |
+ # Redirecting stderr in case SELinux makes it unreadable
|
|
|
b404a9 |
+ # (annoying anyway, but at least user knows what's going on):
|
|
|
b404a9 |
+ grep -Fi 'mcelog:' /var/log/messages | tail -n20 2>&1
|
|
|
b404a9 |
+ exit 0
|
|
|
b404a9 |
+ } >comment
|
|
|
b404a9 |
+ #
|
|
|
b404a9 |
+ # Apparently, there is no running mcelog daemon!
|
|
|
b404a9 |
+ # Let user know that he needs one.
|
|
|
b404a9 |
+ {
|
|
|
b404a9 |
+ echo "The kernel log indicates that hardware errors were detected."
|
|
|
b404a9 |
+ echo "The data was saved by kernel for processing by the mcelog tool."
|
|
|
b404a9 |
+ echo "However, neither /var/log/mcelog nor system log contain mcelog messages."
|
|
|
b404a9 |
+ echo "Most likely reason is that mcelog is not installed or not configured"
|
|
|
b404a9 |
+ echo "to be started during boot."
|
|
|
b404a9 |
+ echo "Without this tool running, the binary data saved by kernel"
|
|
|
b404a9 |
+ echo "is of limited usefulness."
|
|
|
b404a9 |
+ echo "(You can save this data anyway by running 'cat </dev/mcelog >FILE')."
|
|
|
b404a9 |
+ echo "The recommended course of action is to install mcelog."
|
|
|
b404a9 |
+ echo "If another hardware error would occur, a user-readable description"
|
|
|
b404a9 |
+ echo "of it will be saved in system log or /var/log/mcelog."
|
|
|
b404a9 |
+ } >comment
|
|
|
b404a9 |
|
|
|
b404a9 |
# If you want behavior similar to one provided by kerneloops daemon
|
|
|
b404a9 |
# distributed by kerneloops.org - that is, if you want
|
|
|
b404a9 |
diff --git a/src/plugins/vmcore_event.conf b/src/plugins/vmcore_event.conf
|
|
|
b404a9 |
index f8de3c5..655d842 100644
|
|
|
b404a9 |
--- a/src/plugins/vmcore_event.conf
|
|
|
b404a9 |
+++ b/src/plugins/vmcore_event.conf
|
|
|
b404a9 |
@@ -1,6 +1,22 @@
|
|
|
b404a9 |
# analyze
|
|
|
b404a9 |
EVENT=analyze_VMcore analyzer=vmcore
|
|
|
b404a9 |
- abrt-action-analyze-vmcore &&
|
|
|
b404a9 |
+ # If kdump machinery already extracted dmesg...
|
|
|
b404a9 |
+ if test -f vmcore-dmesg.txt; then
|
|
|
b404a9 |
+ # ...use that
|
|
|
b404a9 |
+ abrt-dump-oops -o vmcore-dmesg.txt >backtrace || exit $?
|
|
|
b404a9 |
+ #
|
|
|
b404a9 |
+ # Does "kernel" element exist?
|
|
|
b404a9 |
+ test -f kernel && exit 0
|
|
|
b404a9 |
+ #
|
|
|
b404a9 |
+ # Try creating it from vmcore-dmesg.txt:
|
|
|
b404a9 |
+ # MCE oopses don't have kernel version in them,
|
|
|
b404a9 |
+ # but it should be specified earlier in the log.
|
|
|
b404a9 |
+ k=`sed -n '/Linux version/ s/.*Linux version \([^ ]*\) .*/\1/p' vmcore-dmesg.txt | tail -n1`
|
|
|
b404a9 |
+ test "$k" != "" && printf "%s" "$k" >kernel
|
|
|
b404a9 |
+ else
|
|
|
b404a9 |
+ # No vmcore-dmesg.txt, do it the hard way:
|
|
|
b404a9 |
+ abrt-action-analyze-vmcore
|
|
|
b404a9 |
+ fi &&
|
|
|
b404a9 |
abrt-action-analyze-oops &&
|
|
|
b404a9 |
abrt-action-save-kernel-data
|
|
|
b404a9 |
|
|
|
b404a9 |
--
|
|
|
b404a9 |
1.8.3.1
|
|
|
b404a9 |
|