From 8e629246fead7a5dfee78ca45627a0cfd7fef4b1 Mon Sep 17 00:00:00 2001 From: Mark Michelson Date: Wed, 10 Jun 2020 14:50:06 -0400 Subject: [PATCH 20/22] Add northd and ovn-controller cluster status reset commands. During the course of debugging a clustered DB environment, all members of the southbound database cluster were destroyed (i.e. the .db files were removed from disk) and then restarted. Once this happened, ovn-northd and ovn-controller could not interact with the southbound database because they both detected all members of the cluster as having "stale" data. The only course of action was to reset ovn-northd and all ovn-controllers. It is possible to have this happen with the northbound database as well if it is clustered. This patch offers new ovn-appctl commands for ovn-northd and ovn-controller that allow each daemon to reset its cluster status. This allows the daemon to interact with the database successfully after a cluster teardown and restart. Signed-off-by: Mark Michelson Acked-by: Han Zhou Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1829109 (cherry-picked from upstream ovn master commit 512b884dea3f85791eca44fd1d92956e8282be6d) Change-Id: I3e7d844d6f79552fd53a018c74b80def6069edcb --- controller/ovn-controller.8.xml | 16 ++++++++++++++++ controller/ovn-controller.c | 30 ++++++++++++++++++++++++++--- northd/ovn-northd.8.xml | 28 +++++++++++++++++++++++++++ northd/ovn-northd.c | 34 +++++++++++++++++++++++++++++++++ 4 files changed, 105 insertions(+), 3 deletions(-) diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml index 92e0a6e43..66877314c 100644 --- a/controller/ovn-controller.8.xml +++ b/controller/ovn-controller.8.xml @@ -491,6 +491,22 @@ recomputes are cpu intensive.

+ +
sb-cluster-state-reset
+
+

+ Reset southbound database cluster status when databases are destroyed + and rebuilt. +

+

+ If all databases in a clustered southbound database are removed from + disk, then the stored index of all databases will be reset to zero. + This will cause ovn-controller to be unable to read or write to the + southbound database, because it will always detect the data as stale. + In such a case, run this command so that ovn-controller will reset its + local index so that it can interact with the southbound database again. +

+

diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index 85e58d04f..fe6048153 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -72,6 +72,7 @@ static unixctl_cb_func ct_zone_list; static unixctl_cb_func extend_table_list; static unixctl_cb_func inject_pkt; static unixctl_cb_func engine_recompute_cmd; +static unixctl_cb_func cluster_state_reset_cmd; #define DEFAULT_BRIDGE_NAME "br-int" #define DEFAULT_PROBE_INTERVAL_MSEC 5000 @@ -445,7 +446,7 @@ get_ofctrl_probe_interval(struct ovsdb_idl *ovs_idl) * updates 'sbdb_idl' with that pointer. */ static void update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl, - bool *monitor_all_p) + bool *monitor_all_p, bool *reset_ovnsb_idl_min_index) { const struct ovsrec_open_vswitch *cfg = ovsrec_open_vswitch_first(ovs_idl); if (!cfg) { @@ -475,6 +476,12 @@ update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl, if (monitor_all_p) { *monitor_all_p = monitor_all; } + if (reset_ovnsb_idl_min_index && *reset_ovnsb_idl_min_index) { + VLOG_INFO("Resetting southbound database cluster state"); + engine_set_force_recompute(true); + ovsdb_idl_reset_min_index(ovnsb_idl); + *reset_ovnsb_idl_min_index = false; + } } static void @@ -2287,6 +2294,11 @@ main(int argc, char *argv[]) unixctl_command_register("recompute", "", 0, 0, engine_recompute_cmd, NULL); + bool reset_ovnsb_idl_min_index = false; + unixctl_command_register("sb-cluster-state-reset", "", 0, 0, + cluster_state_reset_cmd, + &reset_ovnsb_idl_min_index); + unsigned int ovs_cond_seqno = UINT_MAX; unsigned int ovnsb_cond_seqno = UINT_MAX; @@ -2308,7 +2320,8 @@ main(int argc, char *argv[]) ovs_cond_seqno = new_ovs_cond_seqno; } - update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all); + update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all, + &reset_ovnsb_idl_min_index); update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl)); ofctrl_set_probe_interval(get_ofctrl_probe_interval(ovs_idl_loop.idl)); @@ -2558,7 
+2571,7 @@ main(int argc, char *argv[]) if (!restart) { bool done = !ovsdb_idl_has_ever_connected(ovnsb_idl_loop.idl); while (!done) { - update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL); + update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL, NULL); update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl)); struct ovsdb_idl_txn *ovs_idl_txn @@ -2780,3 +2793,14 @@ engine_recompute_cmd(struct unixctl_conn *conn OVS_UNUSED, int argc OVS_UNUSED, poll_immediate_wake(); unixctl_command_reply(conn, NULL); } + +static void +cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, void *idl_reset_) +{ + bool *idl_reset = idl_reset_; + + *idl_reset = true; + poll_immediate_wake(); + unixctl_command_reply(conn, NULL); +} diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml index e45d494e8..989e3643b 100644 --- a/northd/ovn-northd.8.xml +++ b/northd/ovn-northd.8.xml @@ -96,6 +96,34 @@ acquired OVSDB lock on SB DB, "standby" if it has not or "paused" if this instance is paused. + +
sb-cluster-state-reset
+
+

+ Reset southbound database cluster status when databases are destroyed + and rebuilt. +

+

+ If all databases in a clustered southbound database are removed from + disk, then the stored index of all databases will be reset to zero. + This will cause ovn-northd to be unable to read or write to the + southbound database, because it will always detect the data as stale. + In such a case, run this command so that ovn-northd will reset its + local index so that it can interact with the southbound database again. +

+
+ +
nb-cluster-state-reset
+
+

+ Reset northbound database cluster status when databases are destroyed + and rebuilt. +

+

+ This performs the same task as sb-cluster-state-reset + except for the northbound database client. +

+

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c index 5f0abeee1..fc05accde 100644 --- a/northd/ovn-northd.c +++ b/northd/ovn-northd.c @@ -56,6 +56,7 @@ static unixctl_cb_func ovn_northd_pause; static unixctl_cb_func ovn_northd_resume; static unixctl_cb_func ovn_northd_is_paused; static unixctl_cb_func ovn_northd_status; +static unixctl_cb_func cluster_state_reset_cmd; struct northd_context { struct ovsdb_idl *ovnnb_idl; @@ -12393,6 +12394,16 @@ main(int argc, char *argv[]) &state); unixctl_command_register("status", "", 0, 0, ovn_northd_status, &state); + bool reset_ovnsb_idl_min_index = false; + unixctl_command_register("sb-cluster-state-reset", "", 0, 0, + cluster_state_reset_cmd, + &reset_ovnsb_idl_min_index); + + bool reset_ovnnb_idl_min_index = false; + unixctl_command_register("nb-cluster-state-reset", "", 0, 0, + cluster_state_reset_cmd, + &reset_ovnnb_idl_min_index); + daemonize_complete(); /* We want to detect (almost) all changes to the ovn-nb db. */ @@ -12684,6 +12695,18 @@ main(int argc, char *argv[]) ovsdb_idl_set_probe_interval(ovnnb_idl_loop.idl, northd_probe_interval); ovsdb_idl_set_probe_interval(ovnsb_idl_loop.idl, northd_probe_interval); + if (reset_ovnsb_idl_min_index) { + VLOG_INFO("Resetting southbound database cluster state"); + ovsdb_idl_reset_min_index(ovnsb_idl_loop.idl); + reset_ovnsb_idl_min_index = false; + } + + if (reset_ovnnb_idl_min_index) { + VLOG_INFO("Resetting northbound database cluster state"); + ovsdb_idl_reset_min_index(ovnnb_idl_loop.idl); + reset_ovnnb_idl_min_index = false; + } + poll_block(); if (should_service_stop()) { exiting = true; @@ -12762,3 +12785,14 @@ ovn_northd_status(struct unixctl_conn *conn, int argc OVS_UNUSED, unixctl_command_reply(conn, ds_cstr(&s)); ds_destroy(&s); } + +static void +cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, void *idl_reset_) +{ + bool *idl_reset = idl_reset_; + + *idl_reset = true; + poll_immediate_wake(); + 
unixctl_command_reply(conn, NULL); +} -- 2.26.2