diff --git a/SOURCES/bz1679792-1-votequorum-set-wfa-status-only-on-startup.patch b/SOURCES/bz1679792-1-votequorum-set-wfa-status-only-on-startup.patch new file mode 100644 index 0000000..c21ba76 --- /dev/null +++ b/SOURCES/bz1679792-1-votequorum-set-wfa-status-only-on-startup.patch @@ -0,0 +1,68 @@ +From 6894792d76b1e8932bc822bb040933ae17e1a0c7 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Tue, 10 Mar 2020 17:49:27 +0100 +Subject: [PATCH] votequorum: set wfa status only on startup + +Previously, reload of configuration with wait_for_all enabled resulted in +setting wait_for_all_status, which set cluster_is_quorate to 0 but didn't +inform the quorum service, so votequorum and quorum information may get +out of sync. + +Example is a 1 node cluster which is extended to 3 nodes. Quorum service +reports the cluster as quorate (incorrect) and votequorum as not-quorate +(correct). Similar behavior happens when extending cluster in general, +but some configurations are less incorrect (3->4). + +Discussed solution was to inform quorum service but that would mean +every reload would cause loss of quorum until all nodes would be seen +again. + +Such behaviour is consistent but seems to be a bit too strict. + +Proposed solution sets wait_for_all_status only on startup and +doesn't touch it during reload. + +This solution fulfills requirement of "cluster will be quorate for +the first time only after all nodes have been visible at least +once at the same time." because node clears wait_for_all_status only +after it sees all other nodes or joins cluster which is quorate. It also +solves problem with extending cluster, because when cluster becomes +unquorate (1->3) wait_for_all_status is set. + +The added assert is only to ensure that I haven't missed any case when +quorate cluster may become unquorate. 
+ +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +(cherry picked from commit ca320beac25f82c0c555799e647a47975a333c28) +--- + exec/votequorum.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/exec/votequorum.c b/exec/votequorum.c +index 1cbbe37..8b96199 100644 +--- a/exec/votequorum.c ++++ b/exec/votequorum.c +@@ -1016,7 +1016,7 @@ static void are_we_quorate(unsigned int total_votes) + "Waiting for all cluster members. " + "Current votes: %d expected_votes: %d", + total_votes, us->expected_votes); +- cluster_is_quorate = 0; ++ assert(!cluster_is_quorate); + return; + } + update_wait_for_all_status(0); +@@ -1548,7 +1548,9 @@ static char *votequorum_readconfig(int runtime) + update_ev_barrier(us->expected_votes); + update_two_node(); + if (wait_for_all) { +- update_wait_for_all_status(1); ++ if (!runtime) { ++ update_wait_for_all_status(1); ++ } + } else if (wait_for_all_autoset && wait_for_all_status) { + /* + * Reset wait for all status for consistency when wfa is auto-unset by 2node. +-- +1.8.3.1 + diff --git a/SOURCES/bz1780134-1-votequorum-Ignore-the-icmap_get_-return-value.patch b/SOURCES/bz1780134-1-votequorum-Ignore-the-icmap_get_-return-value.patch new file mode 100644 index 0000000..abac158 --- /dev/null +++ b/SOURCES/bz1780134-1-votequorum-Ignore-the-icmap_get_-return-value.patch @@ -0,0 +1,74 @@ +From 8ad3c6bbb4556332c5a6b7fecdab73310c045b24 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Mon, 25 Nov 2019 18:21:52 +0100 +Subject: [PATCH] votequorum: Ignore the icmap_get_* return value + +Express intention to ignore icmap_get_* return +value and rely on default behavior of not changing the output +parameter on error. 
+ +Signed-off-by: Jan Friesse +(cherry picked from commit cddd62f972bca276c934e58f08da84071cec1ddb) +--- + exec/votequorum.c | 22 +++++++++++----------- + 1 file changed, 11 insertions(+), 11 deletions(-) + +diff --git a/exec/votequorum.c b/exec/votequorum.c +index 49bcdfa..e3cb7eb 100644 +--- a/exec/votequorum.c ++++ b/exec/votequorum.c +@@ -1272,10 +1272,10 @@ static char *votequorum_readconfig(int runtime) + /* + * gather basic data here + */ +- icmap_get_uint32("quorum.expected_votes", &expected_votes); ++ (void)icmap_get_uint32("quorum.expected_votes", &expected_votes); + have_nodelist = votequorum_read_nodelist_configuration(&node_votes, &node_count, &node_expected_votes); + have_qdevice = votequorum_qdevice_is_configured(&qdevice_votes); +- icmap_get_uint8("quorum.two_node", &two_node); ++ (void)icmap_get_uint8("quorum.two_node", &two_node); + + /* + * do config verification and enablement +@@ -1320,13 +1320,13 @@ static char *votequorum_readconfig(int runtime) + wait_for_all = 1; + } + +- icmap_get_uint8("quorum.allow_downscale", &allow_downscale); +- icmap_get_uint8("quorum.wait_for_all", &wait_for_all); +- icmap_get_uint8("quorum.last_man_standing", &last_man_standing); +- icmap_get_uint32("quorum.last_man_standing_window", &last_man_standing_window); +- icmap_get_uint8("quorum.expected_votes_tracking", &ev_tracking); +- icmap_get_uint8("quorum.auto_tie_breaker", &atb); +- icmap_get_string("quorum.auto_tie_breaker_node", &atb_string); ++ (void)icmap_get_uint8("quorum.allow_downscale", &allow_downscale); ++ (void)icmap_get_uint8("quorum.wait_for_all", &wait_for_all); ++ (void)icmap_get_uint8("quorum.last_man_standing", &last_man_standing); ++ (void)icmap_get_uint32("quorum.last_man_standing_window", &last_man_standing_window); ++ (void)icmap_get_uint8("quorum.expected_votes_tracking", &ev_tracking); ++ (void)icmap_get_uint8("quorum.auto_tie_breaker", &atb); ++ (void)icmap_get_string("quorum.auto_tie_breaker_node", &atb_string); + + /* auto_tie_breaker 
defaults to LOWEST */ + if (atb) { +@@ -1518,7 +1518,7 @@ static char *votequorum_readconfig(int runtime) + us->expected_votes = node_expected_votes; + } else { + us->votes = 1; +- icmap_get_uint32("quorum.votes", &us->votes); ++ (void)icmap_get_uint32("quorum.votes", &us->votes); + } + + if (expected_votes) { +@@ -1569,7 +1569,7 @@ static void votequorum_refresh_config( + return ; + } + +- icmap_get_uint8("quorum.cancel_wait_for_all", &cancel_wfa); ++ (void)icmap_get_uint8("quorum.cancel_wait_for_all", &cancel_wfa); + if (strcmp(key_name, "quorum.cancel_wait_for_all") == 0 && + cancel_wfa >= 1) { + icmap_set_uint8("quorum.cancel_wait_for_all", 0); +-- +1.8.3.1 + diff --git a/SOURCES/bz1780134-2-votequorum-Reflect-runtime-change-of-2Node-to-WFA.patch b/SOURCES/bz1780134-2-votequorum-Reflect-runtime-change-of-2Node-to-WFA.patch new file mode 100644 index 0000000..a377c83 --- /dev/null +++ b/SOURCES/bz1780134-2-votequorum-Reflect-runtime-change-of-2Node-to-WFA.patch @@ -0,0 +1,81 @@ +From bfbed8c320b0c0c5d3db48630f3de77e5fd62b75 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Thu, 16 Jan 2020 15:43:59 +0100 +Subject: [PATCH] votequorum: Reflect runtime change of 2Node to WFA + +When 2Node mode is set, WFA is also set unless WFA is configured +explicitly. This behavior was not reflected on runtime change, so +restarted corosync behavior was different (WFA not set). Also when +cluster is reduced from 3 nodes to 2 nodes during runtime, WFA was not +set, what may result in two quorate partitions. + +Solution is to set WFA depending on 2Node when WFA +is not explicitly configured. 
+ +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +(cherry picked from commit 8ce65bf951bc1e5b2d64b60ea027fbdc551d4fc8) +--- + exec/votequorum.c | 24 +++++++++++++++++++----- + 1 file changed, 19 insertions(+), 5 deletions(-) + +diff --git a/exec/votequorum.c b/exec/votequorum.c +index 2fb5db9..d87b6fd 100644 +--- a/exec/votequorum.c ++++ b/exec/votequorum.c +@@ -80,6 +80,7 @@ static uint8_t two_node = 0; + + static uint8_t wait_for_all = 0; + static uint8_t wait_for_all_status = 0; ++static uint8_t wait_for_all_autoset = 0; /* Wait for all is not set explicitly and follows two_node */ + + static enum {ATB_NONE, ATB_LOWEST, ATB_HIGHEST, ATB_LIST} auto_tie_breaker = ATB_NONE, initial_auto_tie_breaker = ATB_NONE; + static int lowest_node_id = -1; +@@ -1316,12 +1317,10 @@ static char *votequorum_readconfig(int runtime) + * Enable special features + */ + if (!runtime) { +- if (two_node) { +- wait_for_all = 1; +- } +- + (void)icmap_get_uint8("quorum.allow_downscale", &allow_downscale); +- (void)icmap_get_uint8("quorum.wait_for_all", &wait_for_all); ++ if (icmap_get_uint8("quorum.wait_for_all", &wait_for_all) != CS_OK) { ++ wait_for_all_autoset = 1; ++ } + (void)icmap_get_uint8("quorum.last_man_standing", &last_man_standing); + (void)icmap_get_uint32("quorum.last_man_standing_window", &last_man_standing_window); + (void)icmap_get_uint8("quorum.expected_votes_tracking", &ev_tracking); +@@ -1362,6 +1361,15 @@ static char *votequorum_readconfig(int runtime) + + } + ++ /* ++ * Changing of wait_for_all during runtime is not supported, but changing of two_node is ++ * and two_node may set wfa if not configured explicitly. It is safe to unset it ++ * (or set it back) when two_node changes. 
++ */ ++ if (wait_for_all_autoset) { ++ wait_for_all = two_node; ++ } ++ + /* two_node and auto_tie_breaker are not compatible as two_node uses + * a fence race to decide quorum whereas ATB decides based on node id + */ +@@ -1541,6 +1549,12 @@ static char *votequorum_readconfig(int runtime) + update_two_node(); + if (wait_for_all) { + update_wait_for_all_status(1); ++ } else if (wait_for_all_autoset && wait_for_all_status) { ++ /* ++ * Reset wait for all status for consistency when wfa is auto-unset by 2node. ++ * wait_for_all_status would be ignored by are_we_quorate anyway. ++ */ ++ update_wait_for_all_status(0); + } + + out: +-- +1.8.3.1 + diff --git a/SOURCES/bz1835885-1-stats-Add-basic-schedule-miss-stats-to-needle.patch b/SOURCES/bz1835885-1-stats-Add-basic-schedule-miss-stats-to-needle.patch new file mode 100644 index 0000000..5f9cdfb --- /dev/null +++ b/SOURCES/bz1835885-1-stats-Add-basic-schedule-miss-stats-to-needle.patch @@ -0,0 +1,56 @@ +From 274fda334a84253222e01b779349784ec552921b Mon Sep 17 00:00:00 2001 +From: Christine Caulfield +Date: Fri, 21 Feb 2020 09:20:29 +0000 +Subject: [PATCH] stats: Add basic schedule-miss stats to needle + +In camelback (48b6894ef41e9a06ccbb696d062d86ef60dc2c4b) we have +a much more comprehensive system for recording +schedule misses because it has a 'stats' map. This is much more basic +and just writes the last event into cmap. You can still query and track +the value though. + +Signed-off-by: Christine Caulfield +Reviewed-by: Jan Friesse +--- + exec/main.c | 3 +++ + man/cmap_keys.8 | 10 ++++++++++ + 2 files changed, 13 insertions(+) + +diff --git a/exec/main.c b/exec/main.c +index 787d5c9..204abc8 100644 +--- a/exec/main.c ++++ b/exec/main.c +@@ -852,6 +852,9 @@ static void timer_function_scheduler_timeout (void *data) + log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled for %0.4f ms " + "(threshold is %0.4f ms). 
Consider token timeout increase.", + (float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC); ++ ++ icmap_set_float("runtime.schedmiss.delay", (float)tv_diff / QB_TIME_NS_IN_MSEC); ++ icmap_set_uint64("runtime.schedmiss.timestamp", qb_util_nano_from_epoch_get() / QB_TIME_NS_IN_MSEC); + } + + /* +diff --git a/man/cmap_keys.8 b/man/cmap_keys.8 +index b0cd721..1045c65 100644 +--- a/man/cmap_keys.8 ++++ b/man/cmap_keys.8 +@@ -256,6 +256,16 @@ Status of the processor. Can be one of joined and left. + Config version of the member node. + + .TP ++runtime.schedmiss.timestamp ++The timestamp of the last time when corosync failed to be scheduled ++for the required amount of time. The even is warned in syslog but this ++is easier to find. The time is milli-seconds since the epoch. ++ ++.B ++runtime.schedmiss.delay ++The amount of time (milliseconds as a float) that corosync was delayed. ++ ++.TP + resources.process.PID.* + Prefix created by applications using SAM with CMAP integration. + It contains the following keys: +-- +1.8.3.1 + diff --git a/SOURCES/bz1835885-2-main-Add-schedmiss-timestamp-into-message.patch b/SOURCES/bz1835885-2-main-Add-schedmiss-timestamp-into-message.patch new file mode 100644 index 0000000..c69dec3 --- /dev/null +++ b/SOURCES/bz1835885-2-main-Add-schedmiss-timestamp-into-message.patch @@ -0,0 +1,44 @@ +From 3166a87749fa4817d90ed335f3c5843fc38e7304 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Mon, 24 Feb 2020 14:58:45 +0100 +Subject: [PATCH] main: Add schedmiss timestamp into message + +This is useful for matching schedmiss event in stats map with logged +event. 
+ +(backported from master 35662dd0ec53f456445c30c0ef92892f47b25aa2) + +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +--- + exec/main.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/exec/main.c b/exec/main.c +index 204abc8..545c123 100644 +--- a/exec/main.c ++++ b/exec/main.c +@@ -834,6 +834,7 @@ static void timer_function_scheduler_timeout (void *data) + struct scheduler_pause_timeout_data *timeout_data = (struct scheduler_pause_timeout_data *)data; + unsigned long long tv_current; + unsigned long long tv_diff; ++ uint64_t schedmiss_event_tstamp; + + tv_current = qb_util_nano_current_get (); + +@@ -849,8 +850,11 @@ static void timer_function_scheduler_timeout (void *data) + timeout_data->tv_prev = tv_current; + + if (tv_diff > timeout_data->max_tv_diff) { +- log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled for %0.4f ms " ++ schedmiss_event_tstamp = qb_util_nano_from_epoch_get() / QB_TIME_NS_IN_MSEC; ++ ++ log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled (@%" PRIu64 ") for %0.4f ms " + "(threshold is %0.4f ms). Consider token timeout increase.", ++ schedmiss_event_tstamp, + (float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC); + + icmap_set_float("runtime.schedmiss.delay", (float)tv_diff / QB_TIME_NS_IN_MSEC); +-- +1.8.3.1 + diff --git a/SOURCES/bz1835885-3-main-Make-schedmiss-in-cmap-and-log-equal.patch b/SOURCES/bz1835885-3-main-Make-schedmiss-in-cmap-and-log-equal.patch new file mode 100644 index 0000000..8a6badd --- /dev/null +++ b/SOURCES/bz1835885-3-main-Make-schedmiss-in-cmap-and-log-equal.patch @@ -0,0 +1,60 @@ +From 44c1c8ea31f981bdd7856d4eb8f4ac49f95a85e3 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Fri, 5 Jun 2020 14:42:26 +0200 +Subject: [PATCH] main: Make schedmiss in cmap and log equal + +Second call of qb_util_nano_from_epoch_get may differ a bit. 
Solution is +to use previously stored timestamp (similarly as in master branch). + +Also fix man page to follow similar style as other keys. + +Thanks Patrik Hagara for reporting the problem. + +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +--- + exec/main.c | 2 +- + man/cmap_keys.8 | 12 +++++++----- + 2 files changed, 8 insertions(+), 6 deletions(-) + +diff --git a/exec/main.c b/exec/main.c +index 545c123d..5d05d573 100644 +--- a/exec/main.c ++++ b/exec/main.c +@@ -858,7 +858,7 @@ static void timer_function_scheduler_timeout (void *data) + (float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC); + + icmap_set_float("runtime.schedmiss.delay", (float)tv_diff / QB_TIME_NS_IN_MSEC); +- icmap_set_uint64("runtime.schedmiss.timestamp", qb_util_nano_from_epoch_get() / QB_TIME_NS_IN_MSEC); ++ icmap_set_uint64("runtime.schedmiss.timestamp", schedmiss_event_tstamp); + } + + /* +diff --git a/man/cmap_keys.8 b/man/cmap_keys.8 +index 1045c65e..d9e512d6 100644 +--- a/man/cmap_keys.8 ++++ b/man/cmap_keys.8 +@@ -256,13 +256,15 @@ Status of the processor. Can be one of joined and left. + Config version of the member node. + + .TP +-runtime.schedmiss.timestamp ++runtime.schedmiss.* ++If corosync is not scheduled after the required period of time it will ++log this event and also write an entry to cmap under following keys: ++ ++.B timestamp + The timestamp of the last time when corosync failed to be scheduled +-for the required amount of time. The even is warned in syslog but this +-is easier to find. The time is milli-seconds since the epoch. ++for the required amount of time. The time is milli-seconds since the epoch. + +-.B +-runtime.schedmiss.delay ++.B delay + The amount of time (milliseconds as a float) that corosync was delayed. 
+ + .TP +-- +2.18.2 + diff --git a/SPECS/corosync.spec b/SPECS/corosync.spec index 40e24fb..3d8e1ca 100644 --- a/SPECS/corosync.spec +++ b/SPECS/corosync.spec @@ -29,7 +29,7 @@ Name: corosync Summary: The Corosync Cluster Engine and Application Programming Interfaces Version: 2.4.5 -Release: 4%{?gitver}%{?dist} +Release: 7%{?gitver}%{?dist} License: BSD Group: System Environment/Base URL: http://corosync.github.io/corosync/ @@ -43,9 +43,15 @@ Source1: https://github.com/jfriesse/spausedd/releases/download/%{spausedd_versi %endif Patch0: bz1656492-1-totem-Increase-ring_id-seq-after-load.patch +Patch1: bz1780134-1-votequorum-Ignore-the-icmap_get_-return-value.patch +Patch2: bz1780134-2-votequorum-Reflect-runtime-change-of-2Node-to-WFA.patch +Patch3: bz1679792-1-votequorum-set-wfa-status-only-on-startup.patch +Patch4: bz1835885-1-stats-Add-basic-schedule-miss-stats-to-needle.patch +Patch5: bz1835885-2-main-Add-schedmiss-timestamp-into-message.patch +Patch6: bz1835885-3-main-Make-schedmiss-in-cmap-and-log-equal.patch %if 0%{?rhel} -ExclusiveArch: i686 x86_64 s390x ppc64le aarch64 %{arm} +ExclusiveArch: i686 x86_64 s390x ppc64le aarch64 %endif # Runtime bits @@ -111,6 +117,12 @@ BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) %endif %patch0 -p1 -b .bz1656492-1 +%patch1 -p1 -b .bz1780134-1 +%patch2 -p1 -b .bz1780134-2 +%patch3 -p1 -b .bz1679792-1 +%patch4 -p1 -b .bz1835885-1 +%patch5 -p1 -b .bz1835885-2 +%patch6 -p1 -b .bz1835885-3 %build %if %{with runautogen} @@ -619,6 +631,31 @@ fi %endif %changelog +* Fri Jun 5 2020 Jan Friesse 2.4.5-7 +- Related: rhbz#1835885 + +- main: Make schedmiss in cmap and log equal (rhbz#1835885) +- merge upstream commit 44c1c8ea31f981bdd7856d4eb8f4ac49f95a85e3 (rhbz#1835885) + +* Thu May 28 2020 Jan Friesse 2.4.5-6 +- Resolves: rhbz#1835885 + +- stats: Add basic schedule-miss stats to needle (rhbz#1835885) +- merge upstream commit 274fda334a84253222e01b779349784ec552921b (rhbz#1835885) +- main: Add schedmiss 
timestamp into message (rhbz#1835885) +- merge upstream commit 3166a87749fa4817d90ed335f3c5843fc38e7304 (rhbz#1835885) + +* Tue Mar 24 2020 Jan Friesse 2.4.5-5 +- Resolves: rhbz#1679792 +- Resolves: rhbz#1780134 + +- votequorum: Ignore the icmap_get_* return value (rhbz#1780134) +- merge upstream commit 8ad3c6bbb4556332c5a6b7fecdab73310c045b24 (rhbz#1780134) +- votequorum: Reflect runtime change of 2Node to WFA (rhbz#1780134) +- merge upstream commit bfbed8c320b0c0c5d3db48630f3de77e5fd62b75 (rhbz#1780134) +- votequorum: set wfa status only on startup (rhbz#1679792) +- merge upstream commit 6894792d76b1e8932bc822bb040933ae17e1a0c7 (rhbz#1679792) + * Wed Aug 07 2019 Jan Friesse - 2.4.5-4 - Related: rhbz#1737884