d2787b
From e66ab728426e147bf4fc594109137ebfb1f2dda6 Mon Sep 17 00:00:00 2001
d2787b
From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com>
d2787b
Date: Mon, 23 Nov 2020 08:09:44 +0530
d2787b
Subject: [PATCH 566/584] enhancement/debug: Option to generate core dump
d2787b
 without killing the process
d2787b
d2787b
Comments and idea proposed by Xavi Hernandez (jahernan@redhat.com):
d2787b
d2787b
On production systems sometimes we see a log message saying that an assertion
d2787b
has failed. But it's hard to track why it failed without additional information
d2787b
(on debug builds, a GF_ASSERT() generates a core dump and kills the process,
d2787b
so it can be used to debug the issue, but many times we are only able to
d2787b
reproduce assertion failures on production systems, where GF_ASSERT() only logs
d2787b
a message and continues).
d2787b
d2787b
In other cases we may have a core dump caused by a bug, but the core dump doesn't
d2787b
necessarily happen when the bug has happened. Sometimes the crash happens so much
d2787b
later that the causes that triggered the bug are lost. In these cases we can add
d2787b
more assertions to the places that touch the potential candidates to cause the bug,
d2787b
but the only thing we'll get is a log message, which may not be enough.
d2787b
d2787b
One solution would be to always generate a core dump in case of assertion failure,
d2787b
but this was already discussed and it was decided that it was too drastic. If a
d2787b
core dump was really needed, a new macro was created to do so: GF_ABORT(),
d2787b
but GF_ASSERT() would continue to not kill the process on production systems.
d2787b
d2787b
I'm proposing to modify GF_ASSERT() on production builds so that it conditionally
d2787b
triggers a signal when a debugger is attached. When this happens, the debugger
d2787b
will generate a core dump and continue the process as if nothing had happened.
d2787b
If there's no debugger attached, GF_ASSERT() will behave as always.
d2787b
d2787b
The idea I have is to use SIGCONT to do that. This signal is harmless, so we can
d2787b
unmask it (we currently mask all unneeded signals) and raise it inside a GF_ASSERT()
d2787b
when some global variable is set to true.
d2787b
d2787b
To produce the core dump, run the script extras/debug/gfcore.py in another
d2787b
terminal. gdb then stops and produces a core dump whenever GF_ASSERT is hit.
d2787b
d2787b
The script is copied from #1810, which was written by Xavi Hernandez (jahernan@redhat.com).
d2787b
d2787b
Backport of:
d2787b
> Upstream-patch: https://github.com/gluster/glusterfs/pull/1814
d2787b
> Fixes: #1810
d2787b
> Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53
d2787b
> Signed-off-by: Vinayakswami Hariharmath <vharihar@redhat.com>
d2787b
d2787b
BUG: 1927640
d2787b
Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53
d2787b
Signed-off-by: Vinayakswami Hariharmath <vharihar@redhat.com>
d2787b
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244960
d2787b
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d2787b
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
d2787b
---
d2787b
 extras/debug/gfcore.py                    | 77 +++++++++++++++++++++++++++++++
d2787b
 libglusterfs/src/common-utils.c           | 11 +++++
d2787b
 libglusterfs/src/glusterfs/common-utils.h | 10 +++-
d2787b
 libglusterfs/src/libglusterfs.sym         | 16 +++++++
d2787b
 4 files changed, 112 insertions(+), 2 deletions(-)
d2787b
 create mode 100755 extras/debug/gfcore.py
d2787b
d2787b
diff --git a/extras/debug/gfcore.py b/extras/debug/gfcore.py
d2787b
new file mode 100755
d2787b
index 0000000..9f097f0
d2787b
--- /dev/null
d2787b
+++ b/extras/debug/gfcore.py
d2787b
@@ -0,0 +1,77 @@
d2787b
+#!/usr/bin/env python3
d2787b
+
d2787b
+def launch():
d2787b
+    if len(sys.argv) < 3:
d2787b
+        sys.stderr.write("Syntax: {} <pid> <count> [<dir>]\n".format(os.path.basename(sys.argv[0])))
d2787b
+        sys.exit(1)
d2787b
+
d2787b
+    pid = int(sys.argv[1])
d2787b
+    count = int(sys.argv[2])
d2787b
+    base = os.getcwd()
d2787b
+    if len(sys.argv) > 3:
d2787b
+        base = sys.argv[3]
d2787b
+    base = os.path.realpath(base)
d2787b
+
d2787b
+    subprocess.run([
d2787b
+        "gdb", "-batch",
d2787b
+        "-p", str(pid),
d2787b
+        "-ex", "py arg_count = {}".format(count),
d2787b
+        "-ex", "py arg_dir = '{}'".format(base),
d2787b
+        "-x", __file__
d2787b
+    ])
d2787b
+
d2787b
+class GFCore(object):
d2787b
+    def __init__(self, count, base):
d2787b
+        self.count = count
d2787b
+        self.base = base
d2787b
+        gdb.execute('set pagination off')
d2787b
+        gdb.execute('set gf_signal_on_assert = 1')
d2787b
+        gdb.events.stop.connect(self.gf_stop)
d2787b
+
d2787b
+        self.cont()
d2787b
+
d2787b
+    def cont(self, quit = False):
d2787b
+        if not(quit) and (self.count > 0):
d2787b
+            gdb.execute('continue')
d2787b
+        else:
d2787b
+            gdb.execute('set gf_signal_on_assert = 0')
d2787b
+            gdb.execute('quit')
d2787b
+
d2787b
+    def gf_stop(self, event):
d2787b
+        quit = False
d2787b
+
d2787b
+        if isinstance(event, gdb.SignalEvent):
d2787b
+            if event.stop_signal == 'SIGCONT':
d2787b
+                now = datetime.utcnow().isoformat()
d2787b
+                pid = gdb.selected_inferior().pid
d2787b
+                name = "{}/gfcore.{}.{}".format(self.base, pid, now)
d2787b
+                print("Generating coredump '{}'".format(name))
d2787b
+                gdb.execute('gcore {}'.format(name))
d2787b
+                self.count -= 1
d2787b
+
d2787b
+            elif event.stop_signal == 'SIGINT':
d2787b
+                print("SIGINT received. Exiting")
d2787b
+                quit = True
d2787b
+
d2787b
+            else:
d2787b
+                print("Ignoring signal {}".format(event.stop_signal))
d2787b
+        else:
d2787b
+            print("Unexpected event {}".format(type(event)))
d2787b
+
d2787b
+        self.cont(quit)
d2787b
+
d2787b
+# Module 'gdb' is not available when running outside gdb.
d2787b
+try:
d2787b
+    import gdb
d2787b
+    from datetime import datetime
d2787b
+
d2787b
+    GFCore(arg_count, arg_dir)
d2787b
+except ModuleNotFoundError:
d2787b
+    import sys
d2787b
+    import os
d2787b
+    import subprocess
d2787b
+
d2787b
+    try:
d2787b
+        launch()
d2787b
+    except KeyboardInterrupt:
d2787b
+        pass
d2787b
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
d2787b
index 70d5d21..d351b93 100644
d2787b
--- a/libglusterfs/src/common-utils.c
d2787b
+++ b/libglusterfs/src/common-utils.c
d2787b
@@ -77,9 +77,19 @@ char *vol_type_str[] = {
d2787b
     "Distributed-Disperse",
d2787b
 };
d2787b
 
d2787b
+gf_boolean_t gf_signal_on_assert = false;
d2787b
+
d2787b
 typedef int32_t (*rw_op_t)(int32_t fd, char *buf, int32_t size);
d2787b
 typedef int32_t (*rwv_op_t)(int32_t fd, const struct iovec *buf, int32_t size);
d2787b
 
d2787b
+void gf_assert(void)
d2787b
+{
d2787b
+    if (gf_signal_on_assert) {
d2787b
+        raise(SIGCONT);
d2787b
+    }
d2787b
+
d2787b
+}
d2787b
+
d2787b
 void
d2787b
 gf_xxh64_wrapper(const unsigned char *data, size_t const len,
d2787b
                  unsigned long long const seed, char *xxh64)
d2787b
@@ -4021,6 +4031,7 @@ gf_thread_vcreate(pthread_t *thread, const pthread_attr_t *attr,
d2787b
     sigdelset(&set, SIGSYS);
d2787b
     sigdelset(&set, SIGFPE);
d2787b
     sigdelset(&set, SIGABRT);
d2787b
+    sigdelset(&set, SIGCONT);
d2787b
 
d2787b
     pthread_sigmask(SIG_BLOCK, &set, &old);
d2787b
 
d2787b
diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h
d2787b
index f0a0a41..604afd0 100644
d2787b
--- a/libglusterfs/src/glusterfs/common-utils.h
d2787b
+++ b/libglusterfs/src/glusterfs/common-utils.h
d2787b
@@ -25,6 +25,7 @@
d2787b
 #include <limits.h>
d2787b
 #include <fnmatch.h>
d2787b
 #include <uuid/uuid.h>
d2787b
+#include <urcu/compiler.h>
d2787b
 
d2787b
 #ifndef ffsll
d2787b
 #define ffsll(x) __builtin_ffsll(x)
d2787b
@@ -431,14 +432,19 @@ BIT_VALUE(unsigned char *array, unsigned int index)
d2787b
 #define GF_FILE_CONTENT_REQUESTED(_xattr_req, _content_limit)                  \
d2787b
     (dict_get_uint64(_xattr_req, "glusterfs.content", _content_limit) == 0)
d2787b
 
d2787b
+void gf_assert(void);
d2787b
+
d2787b
 #ifdef DEBUG
d2787b
 #define GF_ASSERT(x) assert(x);
d2787b
 #else
d2787b
 #define GF_ASSERT(x)                                                           \
d2787b
     do {                                                                       \
d2787b
-        if (!(x)) {                                                            \
d2787b
+        if (caa_unlikely(!(x))) {                                              \
d2787b
+            gf_assert();                                                       \
d2787b
             gf_msg_callingfn("", GF_LOG_ERROR, 0, LG_MSG_ASSERTION_FAILED,     \
d2787b
-                             "Assertion failed: " #x);                         \
d2787b
+                             "Assertion failed: To attach gdb and coredump,"   \
d2787b
+                             " Run the script under "                          \
d2787b
+                             "\"glusterfs/extras/debug/gfcore.py\"");          \
d2787b
         }                                                                      \
d2787b
     } while (0)
d2787b
 #endif
d2787b
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
d2787b
index 0a0862e..9072afa 100644
d2787b
--- a/libglusterfs/src/libglusterfs.sym
d2787b
+++ b/libglusterfs/src/libglusterfs.sym
d2787b
@@ -1167,3 +1167,19 @@ gf_changelog_register_generic
d2787b
 gf_gfid_generate_from_xxh64
d2787b
 find_xlator_option_in_cmd_args_t
d2787b
 gf_d_type_from_ia_type
d2787b
+glusterfs_graph_fini
d2787b
+glusterfs_process_svc_attach_volfp
d2787b
+glusterfs_mux_volfile_reconfigure
d2787b
+glusterfs_process_svc_detach
d2787b
+mgmt_is_multiplexed_daemon
d2787b
+xlator_is_cleanup_starting
d2787b
+gf_nanosleep
d2787b
+gf_syncfs
d2787b
+graph_total_client_xlator
d2787b
+get_xattrs_to_heal
d2787b
+gf_latency_statedump_and_reset
d2787b
+gf_latency_new
d2787b
+gf_latency_reset
d2787b
+gf_latency_update
d2787b
+gf_frame_latency_update
d2787b
+gf_assert
d2787b
-- 
d2787b
1.8.3.1
d2787b