|
|
45cd8f |
From 45617b727e280cac384a28ae3d96145e066e6197 Mon Sep 17 00:00:00 2001
|
|
|
45cd8f |
From: Reid Wahl <nrwahl@protonmail.com>
|
|
|
45cd8f |
Date: Fri, 3 Feb 2023 12:08:57 -0800
|
|
|
45cd8f |
Subject: [PATCH 01/02] Fix: fencer: Prevent double g_source_remove of op_timer_one
|
|
|
45cd8f |
|
|
|
45cd8f |
QE observed a rarely reproducible core dump in the fencer during
|
|
|
45cd8f |
Pacemaker shutdown, in which we try to g_source_remove() an op timer
|
|
|
45cd8f |
that's already been removed.
|
|
|
45cd8f |
|
|
|
45cd8f |
free_stonith_remote_op_list()
|
|
|
45cd8f |
-> g_hash_table_destroy()
|
|
|
45cd8f |
-> g_hash_table_remove_all_nodes()
|
|
|
45cd8f |
-> clear_remote_op_timers()
|
|
|
45cd8f |
-> g_source_remove()
|
|
|
45cd8f |
-> crm_glib_handler()
|
|
|
45cd8f |
-> "Source ID 190 was not found when attempting to remove it"
|
|
|
45cd8f |
|
|
|
45cd8f |
The likely cause is that request_peer_fencing() doesn't set
|
|
|
45cd8f |
op->op_timer_one to 0 after calling g_source_remove() on it, so if that
|
|
|
45cd8f |
op is still in the stonith_remote_op_list at shutdown with the same
|
|
|
45cd8f |
timer, clear_remote_op_timers() tries to remove the source for
|
|
|
45cd8f |
op_timer_one again.
|
|
|
45cd8f |
|
|
|
45cd8f |
There are only five locations that call g_source_remove() on a
|
|
|
45cd8f |
remote_fencing_op_t timer.
|
|
|
45cd8f |
* Three of them are in clear_remote_op_timers(), which first 0-checks
|
|
|
45cd8f |
the timer and then sets it to 0 after g_source_remove().
|
|
|
45cd8f |
* One is in remote_op_query_timeout(), which does the same.
|
|
|
45cd8f |
* The last is the one we fix here in request_peer_fencing().
|
|
|
45cd8f |
|
|
|
45cd8f |
I don't know all the conditions of QE's test scenario at this point.
|
|
|
45cd8f |
What I do know:
|
|
|
45cd8f |
* have-watchdog=true
|
|
|
45cd8f |
* stonith-watchdog-timeout=10
|
|
|
45cd8f |
* no explicit topology
|
|
|
45cd8f |
* fence agent script is missing for the configured fence device
|
|
|
45cd8f |
* requested fencing of one node
|
|
|
45cd8f |
* cluster shutdown
|
|
|
45cd8f |
|
|
|
45cd8f |
Fixes RHBZ2166967
|
|
|
45cd8f |
|
|
|
45cd8f |
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
|
|
|
45cd8f |
---
|
|
|
45cd8f |
daemons/fenced/fenced_remote.c | 1 +
|
|
|
45cd8f |
1 file changed, 1 insertion(+)
|
|
|
45cd8f |
|
|
|
45cd8f |
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
|
|
|
45cd8f |
index d61b5bd..b7426ff 100644
|
|
|
45cd8f |
--- a/daemons/fenced/fenced_remote.c
|
|
|
45cd8f |
+++ b/daemons/fenced/fenced_remote.c
|
|
|
45cd8f |
@@ -1825,6 +1825,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
|
|
|
45cd8f |
op->state = st_exec;
|
|
|
45cd8f |
if (op->op_timer_one) {
|
|
|
45cd8f |
g_source_remove(op->op_timer_one);
|
|
|
45cd8f |
+ op->op_timer_one = 0;
|
|
|
45cd8f |
}
|
|
|
45cd8f |
|
|
|
45cd8f |
if (!((stonith_watchdog_timeout_ms > 0)
|
|
|
45cd8f |
--
|
|
|
45cd8f |
2.31.1
|
|
|
45cd8f |
|
|
|
45cd8f |
From 0291db4750322ec7f01ae6a4a2a30abca9d8e19e Mon Sep 17 00:00:00 2001
|
|
|
45cd8f |
From: Reid Wahl <nrwahl@protonmail.com>
|
|
|
45cd8f |
Date: Wed, 15 Feb 2023 22:30:27 -0800
|
|
|
45cd8f |
Subject: [PATCH 02/02] Fix: fencer: Avoid double source remove of op_timer_total
|
|
|
45cd8f |
|
|
|
45cd8f |
remote_op_timeout() returns G_SOURCE_REMOVE, which tells GLib to remove
|
|
|
45cd8f |
the source from the main loop after returning. Currently this function
|
|
|
45cd8f |
is used as the callback only when creating op->op_timer_total.
|
|
|
45cd8f |
|
|
|
45cd8f |
If we don't set op->op_timer_total to 0 before returning from
|
|
|
45cd8f |
remote_op_timeout(), then we can get an assertion and core dump from
|
|
|
45cd8f |
GLib when the op's timers are being cleared (either during op
|
|
|
45cd8f |
finalization or during fencer shutdown). This is because
|
|
|
45cd8f |
clear_remote_op_timers() sees that op->op_timer_total != 0 and tries to
|
|
|
45cd8f |
remove the source, but the source has already been removed.
|
|
|
45cd8f |
|
|
|
45cd8f |
Note that we're already (correctly) zeroing op->op_timer_one and
|
|
|
45cd8f |
op->query_timeout as appropriate in their respective callback functions.
|
|
|
45cd8f |
|
|
|
45cd8f |
Fortunately, GLib doesn't care whether the source has already been
|
|
|
45cd8f |
removed before we return G_SOURCE_REMOVE from a callback. So it's safe
|
|
|
45cd8f |
to call finalize_op() (which removes all the op's timer sources) from
|
|
|
45cd8f |
within a callback.
|
|
|
45cd8f |
|
|
|
45cd8f |
Fixes RHBZ#2166967
|
|
|
45cd8f |
|
|
|
45cd8f |
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
|
|
|
45cd8f |
---
|
|
|
45cd8f |
daemons/fenced/fenced_remote.c | 2 ++
|
|
|
45cd8f |
1 file changed, 2 insertions(+)
|
|
|
45cd8f |
|
|
|
45cd8f |
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
|
|
|
45cd8f |
index b7426ff88..adea3d7d8 100644
|
|
|
45cd8f |
--- a/daemons/fenced/fenced_remote.c
|
|
|
45cd8f |
+++ b/daemons/fenced/fenced_remote.c
|
|
|
45cd8f |
@@ -718,6 +718,8 @@ remote_op_timeout(gpointer userdata)
|
|
|
45cd8f |
{
|
|
|
45cd8f |
remote_fencing_op_t *op = userdata;
|
|
|
45cd8f |
|
|
|
45cd8f |
+ op->op_timer_total = 0;
|
|
|
45cd8f |
+
|
|
|
45cd8f |
if (op->state == st_done) {
|
|
|
45cd8f |
crm_debug("Action '%s' targeting %s for client %s already completed "
|
|
|
45cd8f |
CRM_XS " id=%.8s",
|
|
|
45cd8f |
--
|
|
|
45cd8f |
2.39.0
|