Blame SOURCES/0487-99base-add-memtrace-ko.sh-to-debug-kernel-module-lar.patch

ace748
From 3ce2235abbd28cefcef348ad7895e2529810ad7c Mon Sep 17 00:00:00 2001
ace748
From: Xunlei Pang <xlpang@redhat.com>
ace748
Date: Thu, 3 Nov 2016 19:30:41 +0800
ace748
Subject: [PATCH] 99base: add memtrace-ko.sh to debug kernel module large
ace748
 memory consumption
ace748
ace748
The current method for memory debug is to use "rd.memdebug=[0-3]",
ace748
it is not enough for debugging kernel modules. For example, when we
ace748
want to find out which kernel module consumes a large amount of memory,
ace748
"rd.memdebug=[0-3]" won't help too much.
ace748
ace748
A better way is needed to achieve this requirement, this is useful for
ace748
kdump OOM debugging.
ace748
ace748
The principle of this patch is to use kernel trace to track slab and
ace748
buddy allocation calls during kernel module loading(module_init), thus
ace748
we can analyze all the trace data and get the total memory consumption.
ace748
As for large slab allocation, it will probably fall into buddy allocation,
ace748
thus tracing "mm_page_alloc" alone should be enough for the purpose(this
ace748
saves quite some trace buffer memory, also large free is quite unlikey
ace748
during module loading, we neglect those memory free events).
ace748
ace748
The trace events include memory calls under "tracing/events/":
ace748
  kmem/mm_page_alloc
ace748
ace748
We also inpect the following events to detect the module loading:
ace748
  module/module_load
ace748
  module/module_put
ace748
ace748
Since we use filters to trace events, the final trace data size won't
ace748
be too big. Users can adjust the trace buffer size via "trace_buf_size"
ace748
kernel boot command line as needed.
ace748
ace748
We can get the module name and task pid from "module_load" event which
ace748
also mark the beginning of the loading, and module_put called by the
ace748
same task pid implies the end of the loading. So the memory events
ace748
recorded in between by the same task pid are consumed by this module
ace748
during loading(i.e. modprobe or module_init()).
ace748
ace748
With these information, we can record the rough total memory(the larger,
ace748
the more precise the result will be) consumption involved by each kernel
ace748
module loading.
ace748
ace748
Thus we introduce this shell script to find out which kernel module
ace748
consumes a large amount of memory during loading. Use "rd.memdebug=4"
ace748
as the tigger.
ace748
ace748
After applying this patch and specifying "rd.memdebug=4", during booting
ace748
it will print out something extra like below:
ace748
0 pages consumed by "pata_acpi"
ace748
0 pages consumed by "ata_generic"
ace748
1 pages consumed by "drm"
ace748
0 pages consumed by "ttm"
ace748
0 pages consumed by "drm_kms_helper"
ace748
835 pages consumed by "qxl"
ace748
0 pages consumed by "mii"
ace748
6 pages consumed by "8139cp"
ace748
0 pages consumed by "virtio"
ace748
0 pages consumed by "virtio_ring"
ace748
9 pages consumed by "virtio_pci"
ace748
1 pages consumed by "8139too"
ace748
0 pages consumed by "serio_raw"
ace748
0 pages consumed by "crc32c_intel"
ace748
199 pages consumed by "virtio_console"
ace748
0 pages consumed by "libcrc32c"
ace748
9 pages consumed by "xfs"
ace748
ace748
From the print, we see clearly that "qxl" consumed the most memory.
ace748
ace748
This file will be installed as a separate executable named "tracekomem"
ace748
in the following patch.
ace748
ace748
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
ace748
---
ace748
 modules.d/99base/memtrace-ko.sh | 191 ++++++++++++++++++++++++++++++++++++++++
ace748
 1 file changed, 191 insertions(+)
ace748
 create mode 100755 modules.d/99base/memtrace-ko.sh
ace748
ace748
diff --git a/modules.d/99base/memtrace-ko.sh b/modules.d/99base/memtrace-ko.sh
ace748
new file mode 100755
1755ca
index 00000000..ee035e15
ace748
--- /dev/null
ace748
+++ b/modules.d/99base/memtrace-ko.sh
ace748
@@ -0,0 +1,191 @@
ace748
+#!/bin/sh
ace748
+
ace748
+# Try to find out kernel modules with large total memory allocation during loading.
ace748
+# For large slab allocation, it will fall into buddy, also not trace "mm_page_free"
ace748
+# considering large free is quite rare for module_init, thus saving tons of events
ace748
+# to avoid trace data overwritten.
ace748
+#
ace748
+# Therefore, tracing "mm_page_alloc"alone should be enough for the purpose.
ace748
+
ace748
+# "sys/kernel/tracing" has the priority if exists.
ace748
+get_trace_base() {
ace748
+    # trace access through debugfs would be obsolete if "/sys/kernel/tracing" is available.
ace748
+    if [ -d "/sys/kernel/tracing" ]; then
ace748
+        echo "/sys/kernel"
ace748
+    else
ace748
+        echo "/sys/kernel/debug"
ace748
+    fi
ace748
+}
ace748
+
ace748
+# We want to enable these trace events.
ace748
+get_want_events() {
ace748
+    echo "module:module_put module:module_load kmem:mm_page_alloc"
ace748
+}
ace748
+
ace748
+get_event_filter() {
ace748
+    echo "comm == systemd-udevd || comm == modprobe || comm == insmod"
ace748
+}
ace748
+
ace748
+is_trace_ready() {
ace748
+    local trace_base want_events current_events
ace748
+
ace748
+    trace_base=$(get_trace_base)
ace748
+    ! [ -f "$trace_base/tracing/trace" ] && return 1
ace748
+
ace748
+    [ "$(cat $trace_base/tracing/tracing_on)" -eq 0 ] && return 1
ace748
+
ace748
+    # Also check if trace events were properly setup.
ace748
+    want_events=$(get_want_events)
ace748
+    current_events=$(echo $(cat $trace_base/tracing/set_event))
ace748
+    [ "$current_events" != "$want_events" ] && return 1
ace748
+
ace748
+    return 0
ace748
+}
ace748
+
ace748
+prepare_trace() {
ace748
+    local trace_base
ace748
+
ace748
+    trace_base=$(get_trace_base)
ace748
+    # old debugfs interface case.
ace748
+    if ! [ -d "$trace_base/tracing" ]; then
ace748
+        mount none -t debugfs $trace_base
ace748
+    # new tracefs interface case.
ace748
+    elif ! [ -f "$trace_base/tracing/trace" ]; then
ace748
+        mount none -t tracefs "$trace_base/tracing"
ace748
+    fi
ace748
+
ace748
+    if ! [ -f "$trace_base/tracing/trace" ]; then
ace748
+        echo "WARN: Mount trace failed for kernel module memory analyzing."
ace748
+        return 1
ace748
+    fi
ace748
+
ace748
+    # Active all the wanted trace events.
ace748
+    echo "$(get_want_events)" > $trace_base/tracing/set_event
ace748
+
ace748
+    # There are three kinds of known applications for module loading:
ace748
+    # "systemd-udevd", "modprobe" and "insmod".
ace748
+    # Set them as the global events filter.
ace748
+    # NOTE: Some kernel may not support this format of filter, anyway
ace748
+    #       the operation will fail and it doesn't matter.
ace748
+    echo "$(get_event_filter)" > $trace_base/tracing/events/kmem/filter 2>&1
ace748
+    echo "$(get_event_filter)" > $trace_base/tracing/events/module/filter 2>&1
ace748
+
ace748
+    # Set the number of comm-pid if supported.
ace748
+    if [ -f "$trace_base/tracing/saved_cmdlines_size" ]; then
ace748
+        # Thanks to filters, 4096 is big enough(also well supported).
ace748
+        echo 4096 > $trace_base/tracing/saved_cmdlines_size
ace748
+    fi
ace748
+
ace748
+    # Enable and clear trace data for the first time.
ace748
+    echo 1 > $trace_base/tracing/tracing_on
ace748
+    echo > $trace_base/tracing/trace
ace748
+    echo "Prepare trace success."
ace748
+    return 0
ace748
+}
ace748
+
ace748
+order_to_pages()
ace748
+{
ace748
+    local pages=1
ace748
+    local order=$1
ace748
+
ace748
+    while [ "$order" != 0 ]; do
ace748
+        order=$((order-1))
ace748
+        pages=$(($pages*2))
ace748
+	done
ace748
+
ace748
+    echo $pages
ace748
+}
ace748
+
ace748
+parse_trace_data() {
ace748
+    local module_name tmp_eval pages
ace748
+
ace748
+    cat "$(get_trace_base)/tracing/trace" | while read pid cpu flags ts function args
ace748
+    do
ace748
+        # Skip comment lines
ace748
+        if [ "$pid" = "#" ]; then
ace748
+            continue
ace748
+        fi
ace748
+
ace748
+        pid=${pid##*-}
ace748
+        function=${function%:}
ace748
+        if [ "$function" = "module_load" ]; then
ace748
+            # One module is being loaded, save the task pid for tracking.
ace748
+            # Remove the trailing after whitespace, there may be the module flags.
ace748
+            module_name=${args%% *}
ace748
+            # Mark current_module to track the task.
ace748
+            eval current_module_$pid="$module_name"
ace748
+            tmp_eval=$(eval echo '${module_loaded_'${module_name}'}')
ace748
+            if [ -n "$tmp_eval" ]; then
ace748
+                echo "WARN: \"$module_name\" was loaded multiple times!"
ace748
+            fi
ace748
+            eval unset module_loaded_$module_name
ace748
+            eval nr_alloc_pages_$module_name=0
ace748
+            continue
ace748
+        fi
ace748
+
ace748
+        module_name=$(eval echo '${current_module_'${pid}'}')
ace748
+        if [ -z "$module_name" ]; then
ace748
+            continue
ace748
+        fi
ace748
+
ace748
+        # Once we get here, the task is being tracked(is loading a module).
ace748
+        if [ "$function" = "module_put" ]; then
ace748
+            # Mark the module as loaded when the first module_put event happens after module_load.
ace748
+            tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
ace748
+            echo "$tmp_eval pages consumed by \"$module_name\""
ace748
+            eval module_loaded_$module_name=1
ace748
+            # Module loading finished, so untrack the task.
ace748
+            eval unset current_module_$pid
ace748
+            eval unset nr_alloc_pages_$module_name
ace748
+            continue
ace748
+        fi
ace748
+
ace748
+        if [ "$function" = "mm_page_alloc" ]; then
ace748
+            # Get order first, then convert to actual pages.
ace748
+            pages=$(echo $args | sed -e 's/.*order=\([0-9]*\) .*/\1/')
ace748
+            pages=$(order_to_pages "$pages")
ace748
+            tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
ace748
+            eval nr_alloc_pages_$module_name="$(($tmp_eval+$pages))"
ace748
+        fi
ace748
+    done
ace748
+}
ace748
+
ace748
+cleanup_trace() {
ace748
+    local trace_base
ace748
+
ace748
+    if is_trace_ready; then
ace748
+        trace_base=$(get_trace_base)
ace748
+        echo 0 > $trace_base/tracing/tracing_on
ace748
+        echo > $trace_base/tracing/trace
ace748
+        echo > $trace_base/tracing/set_event
ace748
+        echo 0 > $trace_base/tracing/events/kmem/filter
ace748
+        echo 0 > $trace_base/tracing/events/module/filter
ace748
+    fi
ace748
+}
ace748
+
ace748
+show_usage() {
ace748
+    echo "Find out kernel modules with large memory consumption during loading based on trace."
ace748
+    echo "Usage:"
ace748
+    echo "1) run it first to setup trace."
ace748
+    echo "2) run again to parse the trace data if any."
ace748
+    echo "3) run with \"--cleanup\" option to cleanup trace after use."
ace748
+}
ace748
+
ace748
+if [ "$1" = "--help" ]; then
ace748
+    show_usage
ace748
+    exit 0
ace748
+fi
ace748
+
ace748
+if [ "$1" = "--cleanup" ]; then
ace748
+    cleanup_trace
ace748
+    exit 0
ace748
+fi
ace748
+
ace748
+if is_trace_ready ; then
ace748
+    echo "tracekomem - Rough memory consumption by loading kernel modules (larger value with better accuracy)"
ace748
+    parse_trace_data
ace748
+else
ace748
+    prepare_trace
ace748
+fi
ace748
+
ace748
+exit $?