|
 |
5a92bc |
From e7a82370a7b5d3ca342d5e42e25763fa2c938739 Mon Sep 17 00:00:00 2001
|
|
 |
5a92bc |
From: Jan Friesse <jfriesse@redhat.com>
|
|
 |
5a92bc |
Date: Tue, 26 Oct 2021 18:17:59 +0200
|
|
 |
5a92bc |
Subject: [PATCH] totemsrp: Switch totempg buffers at the right time
|
|
 |
5a92bc |
|
|
 |
5a92bc |
Commit 92e0f9c7bb9b4b6a0da8d64bdf3b2e47ae55b1cc added switching of
|
|
 |
5a92bc |
totempg buffers in sync phase. But because buffers got switch too early
|
|
 |
5a92bc |
there was a problem when delivering recovered messages (messages got
|
|
 |
5a92bc |
corrupted and/or lost). Solution is to switch buffers after recovered
|
|
 |
5a92bc |
messages got delivered.
|
|
 |
5a92bc |
|
|
 |
5a92bc |
I think it is worth to describe complete history with reproducers so it
|
|
 |
5a92bc |
doesn't get lost.
|
|
 |
5a92bc |
|
|
 |
5a92bc |
It all started with 402638929e5045ef520a7339696c687fbed0b31b (more info
|
|
 |
5a92bc |
about original problem is described in
|
|
 |
5a92bc |
https://bugzilla.redhat.com/show_bug.cgi?id=820821). This patch
|
|
 |
5a92bc |
solves problem which is way to be reproduced with following reproducer:
|
|
 |
5a92bc |
- 2 nodes
|
|
 |
5a92bc |
- Both nodes running corosync and testcpg
|
|
 |
5a92bc |
- Pause node 1 (SIGSTOP of corosync)
|
|
 |
5a92bc |
- On node 1, send some messages by testcpg
|
|
 |
5a92bc |
(it's not answering but this doesn't matter). Simply hit ENTER key
|
|
 |
5a92bc |
few times is enough)
|
|
 |
5a92bc |
- Wait till node 2 detects that node 1 left
|
|
 |
5a92bc |
- Unpause node 1 (SIGCONT of corosync)
|
|
 |
5a92bc |
|
|
 |
5a92bc |
and on node 1 newly mcasted cpg messages got sent before sync barrier,
|
|
 |
5a92bc |
so node 2 logs "Unknown node -> we will not deliver message".
|
|
 |
5a92bc |
|
|
 |
5a92bc |
Solution was to add switch of totemsrp new messages buffer.
|
|
 |
5a92bc |
|
|
 |
5a92bc |
This patch was not enough so new one
|
|
 |
5a92bc |
(92e0f9c7bb9b4b6a0da8d64bdf3b2e47ae55b1cc) was created. Reproducer of
|
|
 |
5a92bc |
problem was similar, just cpgverify was used instead of testcpg.
|
|
 |
5a92bc |
Occasionally when node 1 was unpaused it hang in sync phase because
|
|
 |
5a92bc |
there was a partial message in totempg buffers. New sync message had
|
|
 |
5a92bc |
different frag cont so it was thrown away and never delivered.
|
|
 |
5a92bc |
|
|
 |
5a92bc |
After many years problem was found which is solved by this patch
|
|
 |
5a92bc |
(original issue describe in
|
|
 |
5a92bc |
https://github.com/corosync/corosync/issues/660).
|
|
 |
5a92bc |
Reproducer is more complex:
|
|
 |
5a92bc |
- 2 nodes
|
|
 |
5a92bc |
- Node 1 is rate-limited (used script on the hypervisor side):
|
|
 |
5a92bc |
```
|
|
 |
5a92bc |
iface=tapXXXX
|
|
 |
5a92bc |
# ~0.1MB/s in bit/s
|
|
 |
5a92bc |
rate=838856
|
|
 |
5a92bc |
# 1mb/s
|
|
 |
5a92bc |
burst=1048576
|
|
 |
5a92bc |
tc qdisc add dev $iface root handle 1: htb default 1
|
|
 |
5a92bc |
tc class add dev $iface parent 1: classid 1:1 htb rate ${rate}bps \
|
|
 |
5a92bc |
burst ${burst}b
|
|
 |
5a92bc |
tc qdisc add dev $iface handle ffff: ingress
|
|
 |
5a92bc |
tc filter add dev $iface parent ffff: prio 50 basic police rate \
|
|
 |
5a92bc |
${rate}bps burst ${burst}b mtu 64kb "drop"
|
|
 |
5a92bc |
```
|
|
 |
5a92bc |
- Node 2 is running corosync and cpgverify
|
|
 |
5a92bc |
- Node 1 keeps restarting of corosync and running cpgverify in cycle
|
|
 |
5a92bc |
- Console 1: while true; do corosync; sleep 20; \
|
|
 |
5a92bc |
kill $(pidof corosync); sleep 20; done
|
|
 |
5a92bc |
- Console 2: while true; do ./cpgverify;done
|
|
 |
5a92bc |
|
|
 |
5a92bc |
And from time to time (reproduced usually in less than 5 minutes)
|
|
 |
5a92bc |
cpgverify reports corrupted message.
|
|
 |
5a92bc |
|
|
 |
5a92bc |
Signed-off-by: Jan Friesse <jfriesse@redhat.com>
|
|
 |
5a92bc |
Reviewed-by: Fabio M. Di Nitto <fdinitto@redhat.com>
|
|
 |
5a92bc |
---
|
|
 |
5a92bc |
exec/totemsrp.c | 16 +++++++++++++++-
|
|
 |
5a92bc |
1 file changed, 15 insertions(+), 1 deletion(-)
|
|
 |
5a92bc |
|
|
 |
5a92bc |
diff --git a/exec/totemsrp.c b/exec/totemsrp.c
|
|
 |
5a92bc |
index d24b11fa..fd71771b 100644
|
|
 |
5a92bc |
--- a/exec/totemsrp.c
|
|
 |
5a92bc |
+++ b/exec/totemsrp.c
|
|
 |
5a92bc |
@@ -1989,13 +1989,27 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
|
|
 |
5a92bc |
trans_memb_list_totemip, instance->my_trans_memb_entries,
|
|
 |
5a92bc |
left_list, instance->my_left_memb_entries,
|
|
 |
5a92bc |
0, 0, &instance->my_ring_id);
|
|
 |
5a92bc |
+ /*
|
|
 |
5a92bc |
+ * Switch new totemsrp messages queue. Messages sent from now on are stored
|
|
 |
5a92bc |
+ * in different queue so synchronization messages are delivered first. Totempg
|
|
 |
5a92bc |
+ * buffers will be switched later.
|
|
 |
5a92bc |
+ */
|
|
 |
5a92bc |
instance->waiting_trans_ack = 1;
|
|
 |
5a92bc |
- instance->totemsrp_waiting_trans_ack_cb_fn (1);
|
|
 |
5a92bc |
|
|
 |
5a92bc |
// TODO we need to filter to ensure we only deliver those
|
|
 |
5a92bc |
// messages which are part of instance->my_deliver_memb
|
|
 |
5a92bc |
messages_deliver_to_app (instance, 1, instance->old_ring_state_high_seq_received);
|
|
 |
5a92bc |
|
|
 |
5a92bc |
+ /*
|
|
 |
5a92bc |
+ * Switch totempg buffers. This used to be right after
|
|
 |
5a92bc |
+ * instance->waiting_trans_ack = 1;
|
|
 |
5a92bc |
+ * line. This was causing problem, because there may be not yet
|
|
 |
5a92bc |
+ * processed parts of messages in totempg buffers.
|
|
 |
5a92bc |
+ * So when buffers were switched and recovered messages
|
|
 |
5a92bc |
+ * got delivered it was not possible to assemble them.
|
|
 |
5a92bc |
+ */
|
|
 |
5a92bc |
+ instance->totemsrp_waiting_trans_ack_cb_fn (1);
|
|
 |
5a92bc |
+
|
|
 |
5a92bc |
instance->my_aru = aru_save;
|
|
 |
5a92bc |
|
|
 |
5a92bc |
/*
|
|
 |
5a92bc |
--
|
|
 |
5a92bc |
2.27.0
|
|
 |
5a92bc |
|