|
|
ace748 |
From 3ce2235abbd28cefcef348ad7895e2529810ad7c Mon Sep 17 00:00:00 2001
|
|
|
ace748 |
From: Xunlei Pang <xlpang@redhat.com>
|
|
|
ace748 |
Date: Thu, 3 Nov 2016 19:30:41 +0800
|
|
|
ace748 |
Subject: [PATCH] 99base: add memtrace-ko.sh to debug kernel module large
|
|
|
ace748 |
memory consumption
|
|
|
ace748 |
|
|
|
ace748 |
The current method for memory debug is to use "rd.memdebug=[0-3]",
|
|
|
ace748 |
it is not enough for debugging kernel modules. For example, when we
|
|
|
ace748 |
want to find out which kernel module consumes a large amount of memory,
|
|
|
ace748 |
"rd.memdebug=[0-3]" won't help too much.
|
|
|
ace748 |
|
|
|
ace748 |
A better way is needed to achieve this requirement, this is useful for
|
|
|
ace748 |
kdump OOM debugging.
|
|
|
ace748 |
|
|
|
ace748 |
The principle of this patch is to use kernel trace to track slab and
|
|
|
ace748 |
buddy allocation calls during kernel module loading(module_init), thus
|
|
|
ace748 |
we can analyze all the trace data and get the total memory consumption.
|
|
|
ace748 |
As for large slab allocation, it will probably fall into buddy allocation,
|
|
|
ace748 |
thus tracing "mm_page_alloc" alone should be enough for the purpose(this
|
|
|
ace748 |
saves quite some trace buffer memory, also large free is quite unlikey
|
|
|
ace748 |
during module loading, we neglect those memory free events).
|
|
|
ace748 |
|
|
|
ace748 |
The trace events include memory calls under "tracing/events/":
|
|
|
ace748 |
kmem/mm_page_alloc
|
|
|
ace748 |
|
|
|
ace748 |
We also inpect the following events to detect the module loading:
|
|
|
ace748 |
module/module_load
|
|
|
ace748 |
module/module_put
|
|
|
ace748 |
|
|
|
ace748 |
Since we use filters to trace events, the final trace data size won't
|
|
|
ace748 |
be too big. Users can adjust the trace buffer size via "trace_buf_size"
|
|
|
ace748 |
kernel boot command line as needed.
|
|
|
ace748 |
|
|
|
ace748 |
We can get the module name and task pid from "module_load" event which
|
|
|
ace748 |
also mark the beginning of the loading, and module_put called by the
|
|
|
ace748 |
same task pid implies the end of the loading. So the memory events
|
|
|
ace748 |
recorded in between by the same task pid are consumed by this module
|
|
|
ace748 |
during loading(i.e. modprobe or module_init()).
|
|
|
ace748 |
|
|
|
ace748 |
With these information, we can record the rough total memory(the larger,
|
|
|
ace748 |
the more precise the result will be) consumption involved by each kernel
|
|
|
ace748 |
module loading.
|
|
|
ace748 |
|
|
|
ace748 |
Thus we introduce this shell script to find out which kernel module
|
|
|
ace748 |
consumes a large amount of memory during loading. Use "rd.memdebug=4"
|
|
|
ace748 |
as the tigger.
|
|
|
ace748 |
|
|
|
ace748 |
After applying this patch and specifying "rd.memdebug=4", during booting
|
|
|
ace748 |
it will print out something extra like below:
|
|
|
ace748 |
0 pages consumed by "pata_acpi"
|
|
|
ace748 |
0 pages consumed by "ata_generic"
|
|
|
ace748 |
1 pages consumed by "drm"
|
|
|
ace748 |
0 pages consumed by "ttm"
|
|
|
ace748 |
0 pages consumed by "drm_kms_helper"
|
|
|
ace748 |
835 pages consumed by "qxl"
|
|
|
ace748 |
0 pages consumed by "mii"
|
|
|
ace748 |
6 pages consumed by "8139cp"
|
|
|
ace748 |
0 pages consumed by "virtio"
|
|
|
ace748 |
0 pages consumed by "virtio_ring"
|
|
|
ace748 |
9 pages consumed by "virtio_pci"
|
|
|
ace748 |
1 pages consumed by "8139too"
|
|
|
ace748 |
0 pages consumed by "serio_raw"
|
|
|
ace748 |
0 pages consumed by "crc32c_intel"
|
|
|
ace748 |
199 pages consumed by "virtio_console"
|
|
|
ace748 |
0 pages consumed by "libcrc32c"
|
|
|
ace748 |
9 pages consumed by "xfs"
|
|
|
ace748 |
|
|
|
ace748 |
From the print, we see clearly that "qxl" consumed the most memory.
|
|
|
ace748 |
|
|
|
ace748 |
This file will be installed as a separate executable named "tracekomem"
|
|
|
ace748 |
in the following patch.
|
|
|
ace748 |
|
|
|
ace748 |
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
|
|
|
ace748 |
---
|
|
|
49e6a8 |
modules.d/99base/memtrace-ko.sh | 191 ++++++++++++++++++++++++++++++++
|
|
|
ace748 |
1 file changed, 191 insertions(+)
|
|
|
ace748 |
create mode 100755 modules.d/99base/memtrace-ko.sh
|
|
|
ace748 |
|
|
|
ace748 |
diff --git a/modules.d/99base/memtrace-ko.sh b/modules.d/99base/memtrace-ko.sh
|
|
|
ace748 |
new file mode 100755
|
|
|
1755ca |
index 00000000..ee035e15
|
|
|
ace748 |
--- /dev/null
|
|
|
ace748 |
+++ b/modules.d/99base/memtrace-ko.sh
|
|
|
ace748 |
@@ -0,0 +1,191 @@
|
|
|
ace748 |
+#!/bin/sh
|
|
|
ace748 |
+
|
|
|
ace748 |
+# Try to find out kernel modules with large total memory allocation during loading.
|
|
|
ace748 |
+# For large slab allocation, it will fall into buddy, also not trace "mm_page_free"
|
|
|
ace748 |
+# considering large free is quite rare for module_init, thus saving tons of events
|
|
|
ace748 |
+# to avoid trace data overwritten.
|
|
|
ace748 |
+#
|
|
|
ace748 |
+# Therefore, tracing "mm_page_alloc"alone should be enough for the purpose.
|
|
|
ace748 |
+
|
|
|
ace748 |
+# "sys/kernel/tracing" has the priority if exists.
|
|
|
ace748 |
+get_trace_base() {
|
|
|
ace748 |
+ # trace access through debugfs would be obsolete if "/sys/kernel/tracing" is available.
|
|
|
ace748 |
+ if [ -d "/sys/kernel/tracing" ]; then
|
|
|
ace748 |
+ echo "/sys/kernel"
|
|
|
ace748 |
+ else
|
|
|
ace748 |
+ echo "/sys/kernel/debug"
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+}
|
|
|
ace748 |
+
|
|
|
ace748 |
+# We want to enable these trace events.
|
|
|
ace748 |
+get_want_events() {
|
|
|
ace748 |
+ echo "module:module_put module:module_load kmem:mm_page_alloc"
|
|
|
ace748 |
+}
|
|
|
ace748 |
+
|
|
|
ace748 |
+get_event_filter() {
|
|
|
ace748 |
+ echo "comm == systemd-udevd || comm == modprobe || comm == insmod"
|
|
|
ace748 |
+}
|
|
|
ace748 |
+
|
|
|
ace748 |
+is_trace_ready() {
|
|
|
ace748 |
+ local trace_base want_events current_events
|
|
|
ace748 |
+
|
|
|
ace748 |
+ trace_base=$(get_trace_base)
|
|
|
ace748 |
+ ! [ -f "$trace_base/tracing/trace" ] && return 1
|
|
|
ace748 |
+
|
|
|
ace748 |
+ [ "$(cat $trace_base/tracing/tracing_on)" -eq 0 ] && return 1
|
|
|
ace748 |
+
|
|
|
ace748 |
+ # Also check if trace events were properly setup.
|
|
|
ace748 |
+ want_events=$(get_want_events)
|
|
|
ace748 |
+ current_events=$(echo $(cat $trace_base/tracing/set_event))
|
|
|
ace748 |
+ [ "$current_events" != "$want_events" ] && return 1
|
|
|
ace748 |
+
|
|
|
ace748 |
+ return 0
|
|
|
ace748 |
+}
|
|
|
ace748 |
+
|
|
|
ace748 |
+prepare_trace() {
|
|
|
ace748 |
+ local trace_base
|
|
|
ace748 |
+
|
|
|
ace748 |
+ trace_base=$(get_trace_base)
|
|
|
ace748 |
+ # old debugfs interface case.
|
|
|
ace748 |
+ if ! [ -d "$trace_base/tracing" ]; then
|
|
|
ace748 |
+ mount none -t debugfs $trace_base
|
|
|
ace748 |
+ # new tracefs interface case.
|
|
|
ace748 |
+ elif ! [ -f "$trace_base/tracing/trace" ]; then
|
|
|
ace748 |
+ mount none -t tracefs "$trace_base/tracing"
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+
|
|
|
ace748 |
+ if ! [ -f "$trace_base/tracing/trace" ]; then
|
|
|
ace748 |
+ echo "WARN: Mount trace failed for kernel module memory analyzing."
|
|
|
ace748 |
+ return 1
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+
|
|
|
ace748 |
+ # Active all the wanted trace events.
|
|
|
ace748 |
+ echo "$(get_want_events)" > $trace_base/tracing/set_event
|
|
|
ace748 |
+
|
|
|
ace748 |
+ # There are three kinds of known applications for module loading:
|
|
|
ace748 |
+ # "systemd-udevd", "modprobe" and "insmod".
|
|
|
ace748 |
+ # Set them as the global events filter.
|
|
|
ace748 |
+ # NOTE: Some kernel may not support this format of filter, anyway
|
|
|
ace748 |
+ # the operation will fail and it doesn't matter.
|
|
|
ace748 |
+ echo "$(get_event_filter)" > $trace_base/tracing/events/kmem/filter 2>&1
|
|
|
ace748 |
+ echo "$(get_event_filter)" > $trace_base/tracing/events/module/filter 2>&1
|
|
|
ace748 |
+
|
|
|
ace748 |
+ # Set the number of comm-pid if supported.
|
|
|
ace748 |
+ if [ -f "$trace_base/tracing/saved_cmdlines_size" ]; then
|
|
|
ace748 |
+ # Thanks to filters, 4096 is big enough(also well supported).
|
|
|
ace748 |
+ echo 4096 > $trace_base/tracing/saved_cmdlines_size
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+
|
|
|
ace748 |
+ # Enable and clear trace data for the first time.
|
|
|
ace748 |
+ echo 1 > $trace_base/tracing/tracing_on
|
|
|
ace748 |
+ echo > $trace_base/tracing/trace
|
|
|
ace748 |
+ echo "Prepare trace success."
|
|
|
ace748 |
+ return 0
|
|
|
ace748 |
+}
|
|
|
ace748 |
+
|
|
|
ace748 |
+order_to_pages()
|
|
|
ace748 |
+{
|
|
|
ace748 |
+ local pages=1
|
|
|
ace748 |
+ local order=$1
|
|
|
ace748 |
+
|
|
|
ace748 |
+ while [ "$order" != 0 ]; do
|
|
|
ace748 |
+ order=$((order-1))
|
|
|
ace748 |
+ pages=$(($pages*2))
|
|
|
ace748 |
+ done
|
|
|
ace748 |
+
|
|
|
ace748 |
+ echo $pages
|
|
|
ace748 |
+}
|
|
|
ace748 |
+
|
|
|
ace748 |
+parse_trace_data() {
|
|
|
ace748 |
+ local module_name tmp_eval pages
|
|
|
ace748 |
+
|
|
|
ace748 |
+ cat "$(get_trace_base)/tracing/trace" | while read pid cpu flags ts function args
|
|
|
ace748 |
+ do
|
|
|
ace748 |
+ # Skip comment lines
|
|
|
ace748 |
+ if [ "$pid" = "#" ]; then
|
|
|
ace748 |
+ continue
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+
|
|
|
ace748 |
+ pid=${pid##*-}
|
|
|
ace748 |
+ function=${function%:}
|
|
|
ace748 |
+ if [ "$function" = "module_load" ]; then
|
|
|
ace748 |
+ # One module is being loaded, save the task pid for tracking.
|
|
|
ace748 |
+ # Remove the trailing after whitespace, there may be the module flags.
|
|
|
ace748 |
+ module_name=${args%% *}
|
|
|
ace748 |
+ # Mark current_module to track the task.
|
|
|
ace748 |
+ eval current_module_$pid="$module_name"
|
|
|
ace748 |
+ tmp_eval=$(eval echo '${module_loaded_'${module_name}'}')
|
|
|
ace748 |
+ if [ -n "$tmp_eval" ]; then
|
|
|
ace748 |
+ echo "WARN: \"$module_name\" was loaded multiple times!"
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+ eval unset module_loaded_$module_name
|
|
|
ace748 |
+ eval nr_alloc_pages_$module_name=0
|
|
|
ace748 |
+ continue
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+
|
|
|
ace748 |
+ module_name=$(eval echo '${current_module_'${pid}'}')
|
|
|
ace748 |
+ if [ -z "$module_name" ]; then
|
|
|
ace748 |
+ continue
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+
|
|
|
ace748 |
+ # Once we get here, the task is being tracked(is loading a module).
|
|
|
ace748 |
+ if [ "$function" = "module_put" ]; then
|
|
|
ace748 |
+ # Mark the module as loaded when the first module_put event happens after module_load.
|
|
|
ace748 |
+ tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
|
|
|
ace748 |
+ echo "$tmp_eval pages consumed by \"$module_name\""
|
|
|
ace748 |
+ eval module_loaded_$module_name=1
|
|
|
ace748 |
+ # Module loading finished, so untrack the task.
|
|
|
ace748 |
+ eval unset current_module_$pid
|
|
|
ace748 |
+ eval unset nr_alloc_pages_$module_name
|
|
|
ace748 |
+ continue
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+
|
|
|
ace748 |
+ if [ "$function" = "mm_page_alloc" ]; then
|
|
|
ace748 |
+ # Get order first, then convert to actual pages.
|
|
|
ace748 |
+ pages=$(echo $args | sed -e 's/.*order=\([0-9]*\) .*/\1/')
|
|
|
ace748 |
+ pages=$(order_to_pages "$pages")
|
|
|
ace748 |
+ tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
|
|
|
ace748 |
+ eval nr_alloc_pages_$module_name="$(($tmp_eval+$pages))"
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+ done
|
|
|
ace748 |
+}
|
|
|
ace748 |
+
|
|
|
ace748 |
+cleanup_trace() {
|
|
|
ace748 |
+ local trace_base
|
|
|
ace748 |
+
|
|
|
ace748 |
+ if is_trace_ready; then
|
|
|
ace748 |
+ trace_base=$(get_trace_base)
|
|
|
ace748 |
+ echo 0 > $trace_base/tracing/tracing_on
|
|
|
ace748 |
+ echo > $trace_base/tracing/trace
|
|
|
ace748 |
+ echo > $trace_base/tracing/set_event
|
|
|
ace748 |
+ echo 0 > $trace_base/tracing/events/kmem/filter
|
|
|
ace748 |
+ echo 0 > $trace_base/tracing/events/module/filter
|
|
|
ace748 |
+ fi
|
|
|
ace748 |
+}
|
|
|
ace748 |
+
|
|
|
ace748 |
+show_usage() {
|
|
|
ace748 |
+ echo "Find out kernel modules with large memory consumption during loading based on trace."
|
|
|
ace748 |
+ echo "Usage:"
|
|
|
ace748 |
+ echo "1) run it first to setup trace."
|
|
|
ace748 |
+ echo "2) run again to parse the trace data if any."
|
|
|
ace748 |
+ echo "3) run with \"--cleanup\" option to cleanup trace after use."
|
|
|
ace748 |
+}
|
|
|
ace748 |
+
|
|
|
ace748 |
+if [ "$1" = "--help" ]; then
|
|
|
ace748 |
+ show_usage
|
|
|
ace748 |
+ exit 0
|
|
|
ace748 |
+fi
|
|
|
ace748 |
+
|
|
|
ace748 |
+if [ "$1" = "--cleanup" ]; then
|
|
|
ace748 |
+ cleanup_trace
|
|
|
ace748 |
+ exit 0
|
|
|
ace748 |
+fi
|
|
|
ace748 |
+
|
|
|
ace748 |
+if is_trace_ready ; then
|
|
|
ace748 |
+ echo "tracekomem - Rough memory consumption by loading kernel modules (larger value with better accuracy)"
|
|
|
ace748 |
+ parse_trace_data
|
|
|
ace748 |
+else
|
|
|
ace748 |
+ prepare_trace
|
|
|
ace748 |
+fi
|
|
|
ace748 |
+
|
|
|
ace748 |
+exit $?
|