712866
From 3ce2235abbd28cefcef348ad7895e2529810ad7c Mon Sep 17 00:00:00 2001
712866
From: Xunlei Pang <xlpang@redhat.com>
712866
Date: Thu, 3 Nov 2016 19:30:41 +0800
712866
Subject: [PATCH] 99base: add memtrace-ko.sh to debug kernel module large
712866
 memory consumption
712866
712866
The current method for memory debug is to use "rd.memdebug=[0-3]",
712866
it is not enough for debugging kernel modules. For example, when we
712866
want to find out which kernel module consumes a large amount of memory,
712866
"rd.memdebug=[0-3]" won't help too much.
712866
712866
A better way is needed to achieve this requirement, this is useful for
712866
kdump OOM debugging.
712866
712866
The principle of this patch is to use kernel trace to track slab and
712866
buddy allocation calls during kernel module loading(module_init), thus
712866
we can analyze all the trace data and get the total memory consumption.
712866
As for large slab allocation, it will probably fall into buddy allocation,
712866
thus tracing "mm_page_alloc" alone should be enough for the purpose(this
712866
saves quite some trace buffer memory, also large free is quite unlikey
712866
during module loading, we neglect those memory free events).
712866
712866
The trace events include memory calls under "tracing/events/":
712866
  kmem/mm_page_alloc
712866
712866
We also inpect the following events to detect the module loading:
712866
  module/module_load
712866
  module/module_put
712866
712866
Since we use filters to trace events, the final trace data size won't
712866
be too big. Users can adjust the trace buffer size via "trace_buf_size"
712866
kernel boot command line as needed.
712866
712866
We can get the module name and task pid from "module_load" event which
712866
also mark the beginning of the loading, and module_put called by the
712866
same task pid implies the end of the loading. So the memory events
712866
recorded in between by the same task pid are consumed by this module
712866
during loading(i.e. modprobe or module_init()).
712866
712866
With these information, we can record the rough total memory(the larger,
712866
the more precise the result will be) consumption involved by each kernel
712866
module loading.
712866
712866
Thus we introduce this shell script to find out which kernel module
712866
consumes a large amount of memory during loading. Use "rd.memdebug=4"
712866
as the tigger.
712866
712866
After applying this patch and specifying "rd.memdebug=4", during booting
712866
it will print out something extra like below:
712866
0 pages consumed by "pata_acpi"
712866
0 pages consumed by "ata_generic"
712866
1 pages consumed by "drm"
712866
0 pages consumed by "ttm"
712866
0 pages consumed by "drm_kms_helper"
712866
835 pages consumed by "qxl"
712866
0 pages consumed by "mii"
712866
6 pages consumed by "8139cp"
712866
0 pages consumed by "virtio"
712866
0 pages consumed by "virtio_ring"
712866
9 pages consumed by "virtio_pci"
712866
1 pages consumed by "8139too"
712866
0 pages consumed by "serio_raw"
712866
0 pages consumed by "crc32c_intel"
712866
199 pages consumed by "virtio_console"
712866
0 pages consumed by "libcrc32c"
712866
9 pages consumed by "xfs"
712866
712866
From the print, we see clearly that "qxl" consumed the most memory.
712866
712866
This file will be installed as a separate executable named "tracekomem"
712866
in the following patch.
712866
712866
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
712866
---
712866
 modules.d/99base/memtrace-ko.sh | 191 ++++++++++++++++++++++++++++++++++++++++
712866
 1 file changed, 191 insertions(+)
712866
 create mode 100755 modules.d/99base/memtrace-ko.sh
712866
712866
diff --git a/modules.d/99base/memtrace-ko.sh b/modules.d/99base/memtrace-ko.sh
712866
new file mode 100755
5c6c2a
index 00000000..ee035e15
712866
--- /dev/null
712866
+++ b/modules.d/99base/memtrace-ko.sh
712866
@@ -0,0 +1,191 @@
712866
+#!/bin/sh
712866
+
712866
+# Try to find out kernel modules with large total memory allocation during loading.
712866
+# For large slab allocation, it will fall into buddy, also not trace "mm_page_free"
712866
+# considering large free is quite rare for module_init, thus saving tons of events
712866
+# to avoid trace data overwritten.
712866
+#
712866
+# Therefore, tracing "mm_page_alloc"alone should be enough for the purpose.
712866
+
712866
+# "sys/kernel/tracing" has the priority if exists.
712866
+get_trace_base() {
712866
+    # trace access through debugfs would be obsolete if "/sys/kernel/tracing" is available.
712866
+    if [ -d "/sys/kernel/tracing" ]; then
712866
+        echo "/sys/kernel"
712866
+    else
712866
+        echo "/sys/kernel/debug"
712866
+    fi
712866
+}
712866
+
712866
+# We want to enable these trace events.
712866
+get_want_events() {
712866
+    echo "module:module_put module:module_load kmem:mm_page_alloc"
712866
+}
712866
+
712866
+get_event_filter() {
712866
+    echo "comm == systemd-udevd || comm == modprobe || comm == insmod"
712866
+}
712866
+
712866
+is_trace_ready() {
712866
+    local trace_base want_events current_events
712866
+
712866
+    trace_base=$(get_trace_base)
712866
+    ! [ -f "$trace_base/tracing/trace" ] && return 1
712866
+
712866
+    [ "$(cat $trace_base/tracing/tracing_on)" -eq 0 ] && return 1
712866
+
712866
+    # Also check if trace events were properly setup.
712866
+    want_events=$(get_want_events)
712866
+    current_events=$(echo $(cat $trace_base/tracing/set_event))
712866
+    [ "$current_events" != "$want_events" ] && return 1
712866
+
712866
+    return 0
712866
+}
712866
+
712866
+prepare_trace() {
712866
+    local trace_base
712866
+
712866
+    trace_base=$(get_trace_base)
712866
+    # old debugfs interface case.
712866
+    if ! [ -d "$trace_base/tracing" ]; then
712866
+        mount none -t debugfs $trace_base
712866
+    # new tracefs interface case.
712866
+    elif ! [ -f "$trace_base/tracing/trace" ]; then
712866
+        mount none -t tracefs "$trace_base/tracing"
712866
+    fi
712866
+
712866
+    if ! [ -f "$trace_base/tracing/trace" ]; then
712866
+        echo "WARN: Mount trace failed for kernel module memory analyzing."
712866
+        return 1
712866
+    fi
712866
+
712866
+    # Active all the wanted trace events.
712866
+    echo "$(get_want_events)" > $trace_base/tracing/set_event
712866
+
712866
+    # There are three kinds of known applications for module loading:
712866
+    # "systemd-udevd", "modprobe" and "insmod".
712866
+    # Set them as the global events filter.
712866
+    # NOTE: Some kernel may not support this format of filter, anyway
712866
+    #       the operation will fail and it doesn't matter.
712866
+    echo "$(get_event_filter)" > $trace_base/tracing/events/kmem/filter 2>&1
712866
+    echo "$(get_event_filter)" > $trace_base/tracing/events/module/filter 2>&1
712866
+
712866
+    # Set the number of comm-pid if supported.
712866
+    if [ -f "$trace_base/tracing/saved_cmdlines_size" ]; then
712866
+        # Thanks to filters, 4096 is big enough(also well supported).
712866
+        echo 4096 > $trace_base/tracing/saved_cmdlines_size
712866
+    fi
712866
+
712866
+    # Enable and clear trace data for the first time.
712866
+    echo 1 > $trace_base/tracing/tracing_on
712866
+    echo > $trace_base/tracing/trace
712866
+    echo "Prepare trace success."
712866
+    return 0
712866
+}
712866
+
712866
+order_to_pages()
712866
+{
712866
+    local pages=1
712866
+    local order=$1
712866
+
712866
+    while [ "$order" != 0 ]; do
712866
+        order=$((order-1))
712866
+        pages=$(($pages*2))
712866
+	done
712866
+
712866
+    echo $pages
712866
+}
712866
+
712866
+parse_trace_data() {
712866
+    local module_name tmp_eval pages
712866
+
712866
+    cat "$(get_trace_base)/tracing/trace" | while read pid cpu flags ts function args
712866
+    do
712866
+        # Skip comment lines
712866
+        if [ "$pid" = "#" ]; then
712866
+            continue
712866
+        fi
712866
+
712866
+        pid=${pid##*-}
712866
+        function=${function%:}
712866
+        if [ "$function" = "module_load" ]; then
712866
+            # One module is being loaded, save the task pid for tracking.
712866
+            # Remove the trailing after whitespace, there may be the module flags.
712866
+            module_name=${args%% *}
712866
+            # Mark current_module to track the task.
712866
+            eval current_module_$pid="$module_name"
712866
+            tmp_eval=$(eval echo '${module_loaded_'${module_name}'}')
712866
+            if [ -n "$tmp_eval" ]; then
712866
+                echo "WARN: \"$module_name\" was loaded multiple times!"
712866
+            fi
712866
+            eval unset module_loaded_$module_name
712866
+            eval nr_alloc_pages_$module_name=0
712866
+            continue
712866
+        fi
712866
+
712866
+        module_name=$(eval echo '${current_module_'${pid}'}')
712866
+        if [ -z "$module_name" ]; then
712866
+            continue
712866
+        fi
712866
+
712866
+        # Once we get here, the task is being tracked(is loading a module).
712866
+        if [ "$function" = "module_put" ]; then
712866
+            # Mark the module as loaded when the first module_put event happens after module_load.
712866
+            tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
712866
+            echo "$tmp_eval pages consumed by \"$module_name\""
712866
+            eval module_loaded_$module_name=1
712866
+            # Module loading finished, so untrack the task.
712866
+            eval unset current_module_$pid
712866
+            eval unset nr_alloc_pages_$module_name
712866
+            continue
712866
+        fi
712866
+
712866
+        if [ "$function" = "mm_page_alloc" ]; then
712866
+            # Get order first, then convert to actual pages.
712866
+            pages=$(echo $args | sed -e 's/.*order=\([0-9]*\) .*/\1/')
712866
+            pages=$(order_to_pages "$pages")
712866
+            tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
712866
+            eval nr_alloc_pages_$module_name="$(($tmp_eval+$pages))"
712866
+        fi
712866
+    done
712866
+}
712866
+
712866
+cleanup_trace() {
712866
+    local trace_base
712866
+
712866
+    if is_trace_ready; then
712866
+        trace_base=$(get_trace_base)
712866
+        echo 0 > $trace_base/tracing/tracing_on
712866
+        echo > $trace_base/tracing/trace
712866
+        echo > $trace_base/tracing/set_event
712866
+        echo 0 > $trace_base/tracing/events/kmem/filter
712866
+        echo 0 > $trace_base/tracing/events/module/filter
712866
+    fi
712866
+}
712866
+
712866
+show_usage() {
712866
+    echo "Find out kernel modules with large memory consumption during loading based on trace."
712866
+    echo "Usage:"
712866
+    echo "1) run it first to setup trace."
712866
+    echo "2) run again to parse the trace data if any."
712866
+    echo "3) run with \"--cleanup\" option to cleanup trace after use."
712866
+}
712866
+
712866
+if [ "$1" = "--help" ]; then
712866
+    show_usage
712866
+    exit 0
712866
+fi
712866
+
712866
+if [ "$1" = "--cleanup" ]; then
712866
+    cleanup_trace
712866
+    exit 0
712866
+fi
712866
+
712866
+if is_trace_ready ; then
712866
+    echo "tracekomem - Rough memory consumption by loading kernel modules (larger value with better accuracy)"
712866
+    parse_trace_data
712866
+else
712866
+    prepare_trace
712866
+fi
712866
+
712866
+exit $?