|
|
9219d1 |
From 8e629246fead7a5dfee78ca45627a0cfd7fef4b1 Mon Sep 17 00:00:00 2001
|
|
|
9219d1 |
From: Mark Michelson <mmichels@redhat.com>
|
|
|
9219d1 |
Date: Wed, 10 Jun 2020 14:50:06 -0400
|
|
|
9219d1 |
Subject: [PATCH 20/22] Add northd and ovn-controller cluster status reset
|
|
|
9219d1 |
commands.
|
|
|
9219d1 |
|
|
|
9219d1 |
During the course of debugging a clustered DB environment, all members
|
|
|
9219d1 |
of the southbound database cluster were destroyed (i.e. the .db files
|
|
|
9219d1 |
were removed from disk) and then restarted. Once this happened,
|
|
|
9219d1 |
ovn-northd and ovn-controller could not interact with the southbound
|
|
|
9219d1 |
database because they both detected all members of the cluster as having
|
|
|
9219d1 |
"stale" data. The only course of action was to reset ovn-northd and all
|
|
|
9219d1 |
ovn-controllers. It is possible to have this happen with the northbound
|
|
|
9219d1 |
database as well if it is clustered.
|
|
|
9219d1 |
|
|
|
9219d1 |
This patch offers new ovn-appctl commands for ovn-northd and
|
|
|
9219d1 |
ovn-controller that allow them to reset their clustered status. This
|
|
|
9219d1 |
allows them to interact with the database successfully after a cluster
|
|
|
9219d1 |
teardown and restart.
|
|
|
9219d1 |
|
|
|
9219d1 |
Signed-off-by: Mark Michelson <mmichels@redhat.com>
|
|
|
9219d1 |
Acked-by: Han Zhou <hzhou@ovn.org>
|
|
|
9219d1 |
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1829109
|
|
|
9219d1 |
|
|
|
9219d1 |
(cherry-picked from upstream ovn master commit 512b884dea3f85791eca44fd1d92956e8282be6d)
|
|
|
9219d1 |
|
|
|
9219d1 |
Change-Id: I3e7d844d6f79552fd53a018c74b80def6069edcb
|
|
|
9219d1 |
---
|
|
|
9219d1 |
controller/ovn-controller.8.xml | 16 ++++++++++++++++
|
|
|
9219d1 |
controller/ovn-controller.c | 30 ++++++++++++++++++++++++++---
|
|
|
9219d1 |
northd/ovn-northd.8.xml | 28 +++++++++++++++++++++++++++
|
|
|
9219d1 |
northd/ovn-northd.c | 34 +++++++++++++++++++++++++++++++++
|
|
|
9219d1 |
4 files changed, 105 insertions(+), 3 deletions(-)
|
|
|
9219d1 |
|
|
|
9219d1 |
diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
|
|
|
9219d1 |
index 92e0a6e43..66877314c 100644
|
|
|
9219d1 |
--- a/controller/ovn-controller.8.xml
|
|
|
9219d1 |
+++ b/controller/ovn-controller.8.xml
|
|
|
9219d1 |
@@ -491,6 +491,22 @@
|
|
|
9219d1 |
recomputes are cpu intensive.
|
|
|
9219d1 |
|
|
|
9219d1 |
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ sb-cluster-state-reset
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ Reset southbound database cluster status when databases are destroyed
|
|
|
9219d1 |
+ and rebuilt.
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ If all databases in a clustered southbound database are removed from
|
|
|
9219d1 |
+ disk, then the stored index of all databases will be reset to zero.
|
|
|
9219d1 |
+ This will cause ovn-controller to be unable to read or write to the
|
|
|
9219d1 |
+ southbound database, because it will always detect the data as stale.
|
|
|
9219d1 |
+ In such a case, run this command so that ovn-controller will reset its
|
|
|
9219d1 |
+ local index so that it can interact with the southbound database again.
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+
|
|
|
9219d1 |
|
|
|
9219d1 |
|
|
|
9219d1 |
|
|
|
9219d1 |
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
|
|
|
9219d1 |
index 85e58d04f..fe6048153 100644
|
|
|
9219d1 |
--- a/controller/ovn-controller.c
|
|
|
9219d1 |
+++ b/controller/ovn-controller.c
|
|
|
9219d1 |
@@ -72,6 +72,7 @@ static unixctl_cb_func ct_zone_list;
|
|
|
9219d1 |
static unixctl_cb_func extend_table_list;
|
|
|
9219d1 |
static unixctl_cb_func inject_pkt;
|
|
|
9219d1 |
static unixctl_cb_func engine_recompute_cmd;
|
|
|
9219d1 |
+static unixctl_cb_func cluster_state_reset_cmd;
|
|
|
9219d1 |
|
|
|
9219d1 |
#define DEFAULT_BRIDGE_NAME "br-int"
|
|
|
9219d1 |
#define DEFAULT_PROBE_INTERVAL_MSEC 5000
|
|
|
9219d1 |
@@ -445,7 +446,7 @@ get_ofctrl_probe_interval(struct ovsdb_idl *ovs_idl)
|
|
|
9219d1 |
* updates 'sbdb_idl' with that pointer. */
|
|
|
9219d1 |
static void
|
|
|
9219d1 |
update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl,
|
|
|
9219d1 |
- bool *monitor_all_p)
|
|
|
9219d1 |
+ bool *monitor_all_p, bool *reset_ovnsb_idl_min_index)
|
|
|
9219d1 |
{
|
|
|
9219d1 |
const struct ovsrec_open_vswitch *cfg = ovsrec_open_vswitch_first(ovs_idl);
|
|
|
9219d1 |
if (!cfg) {
|
|
|
9219d1 |
@@ -475,6 +476,12 @@ update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl,
|
|
|
9219d1 |
if (monitor_all_p) {
|
|
|
9219d1 |
*monitor_all_p = monitor_all;
|
|
|
9219d1 |
}
|
|
|
9219d1 |
+ if (reset_ovnsb_idl_min_index && *reset_ovnsb_idl_min_index) {
|
|
|
9219d1 |
+ VLOG_INFO("Resetting southbound database cluster state");
|
|
|
9219d1 |
+ engine_set_force_recompute(true);
|
|
|
9219d1 |
+ ovsdb_idl_reset_min_index(ovnsb_idl);
|
|
|
9219d1 |
+ *reset_ovnsb_idl_min_index = false;
|
|
|
9219d1 |
+ }
|
|
|
9219d1 |
}
|
|
|
9219d1 |
|
|
|
9219d1 |
static void
|
|
|
9219d1 |
@@ -2287,6 +2294,11 @@ main(int argc, char *argv[])
|
|
|
9219d1 |
unixctl_command_register("recompute", "", 0, 0, engine_recompute_cmd,
|
|
|
9219d1 |
NULL);
|
|
|
9219d1 |
|
|
|
9219d1 |
+ bool reset_ovnsb_idl_min_index = false;
|
|
|
9219d1 |
+ unixctl_command_register("sb-cluster-state-reset", "", 0, 0,
|
|
|
9219d1 |
+ cluster_state_reset_cmd,
|
|
|
9219d1 |
+ &reset_ovnsb_idl_min_index);
|
|
|
9219d1 |
+
|
|
|
9219d1 |
unsigned int ovs_cond_seqno = UINT_MAX;
|
|
|
9219d1 |
unsigned int ovnsb_cond_seqno = UINT_MAX;
|
|
|
9219d1 |
|
|
|
9219d1 |
@@ -2308,7 +2320,8 @@ main(int argc, char *argv[])
|
|
|
9219d1 |
ovs_cond_seqno = new_ovs_cond_seqno;
|
|
|
9219d1 |
}
|
|
|
9219d1 |
|
|
|
9219d1 |
- update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all);
|
|
|
9219d1 |
+ update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all,
|
|
|
9219d1 |
+ &reset_ovnsb_idl_min_index);
|
|
|
9219d1 |
update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl));
|
|
|
9219d1 |
ofctrl_set_probe_interval(get_ofctrl_probe_interval(ovs_idl_loop.idl));
|
|
|
9219d1 |
|
|
|
9219d1 |
@@ -2558,7 +2571,7 @@ main(int argc, char *argv[])
|
|
|
9219d1 |
if (!restart) {
|
|
|
9219d1 |
bool done = !ovsdb_idl_has_ever_connected(ovnsb_idl_loop.idl);
|
|
|
9219d1 |
while (!done) {
|
|
|
9219d1 |
- update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL);
|
|
|
9219d1 |
+ update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL, false);
|
|
|
9219d1 |
update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl));
|
|
|
9219d1 |
|
|
|
9219d1 |
struct ovsdb_idl_txn *ovs_idl_txn
|
|
|
9219d1 |
@@ -2780,3 +2793,14 @@ engine_recompute_cmd(struct unixctl_conn *conn OVS_UNUSED, int argc OVS_UNUSED,
|
|
|
9219d1 |
poll_immediate_wake();
|
|
|
9219d1 |
unixctl_command_reply(conn, NULL);
|
|
|
9219d1 |
}
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+static void
|
|
|
9219d1 |
+cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED,
|
|
|
9219d1 |
+ const char *argv[] OVS_UNUSED, void *idl_reset_)
|
|
|
9219d1 |
+{
|
|
|
9219d1 |
+ bool *idl_reset = idl_reset_;
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ *idl_reset = true;
|
|
|
9219d1 |
+ poll_immediate_wake();
|
|
|
9219d1 |
+ unixctl_command_reply(conn, NULL);
|
|
|
9219d1 |
+}
|
|
|
9219d1 |
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
|
|
|
9219d1 |
index e45d494e8..989e3643b 100644
|
|
|
9219d1 |
--- a/northd/ovn-northd.8.xml
|
|
|
9219d1 |
+++ b/northd/ovn-northd.8.xml
|
|
|
9219d1 |
@@ -96,6 +96,34 @@
|
|
|
9219d1 |
acquired OVSDB lock on SB DB, "standby" if it has not or "paused" if
|
|
|
9219d1 |
this instance is paused.
|
|
|
9219d1 |
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ sb-cluster-state-reset
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ Reset southbound database cluster status when databases are destroyed
|
|
|
9219d1 |
+ and rebuilt.
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ If all databases in a clustered southbound database are removed from
|
|
|
9219d1 |
+ disk, then the stored index of all databases will be reset to zero.
|
|
|
9219d1 |
+ This will cause ovn-northd to be unable to read or write to the
|
|
|
9219d1 |
+ southbound database, because it will always detect the data as stale.
|
|
|
9219d1 |
+ In such a case, run this command so that ovn-northd will reset its
|
|
|
9219d1 |
+ local index so that it can interact with the southbound database again.
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ nb-cluster-state-reset
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ Reset northbound database cluster status when databases are destroyed
|
|
|
9219d1 |
+ and rebuilt.
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ This performs the same task as sb-cluster-state-reset
|
|
|
9219d1 |
+ except for the northbound database client.
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+
|
|
|
9219d1 |
|
|
|
9219d1 |
|
|
|
9219d1 |
|
|
|
9219d1 |
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
|
|
|
9219d1 |
index 5f0abeee1..fc05accde 100644
|
|
|
9219d1 |
--- a/northd/ovn-northd.c
|
|
|
9219d1 |
+++ b/northd/ovn-northd.c
|
|
|
9219d1 |
@@ -56,6 +56,7 @@ static unixctl_cb_func ovn_northd_pause;
|
|
|
9219d1 |
static unixctl_cb_func ovn_northd_resume;
|
|
|
9219d1 |
static unixctl_cb_func ovn_northd_is_paused;
|
|
|
9219d1 |
static unixctl_cb_func ovn_northd_status;
|
|
|
9219d1 |
+static unixctl_cb_func cluster_state_reset_cmd;
|
|
|
9219d1 |
|
|
|
9219d1 |
struct northd_context {
|
|
|
9219d1 |
struct ovsdb_idl *ovnnb_idl;
|
|
|
9219d1 |
@@ -12393,6 +12394,16 @@ main(int argc, char *argv[])
|
|
|
9219d1 |
&state);
|
|
|
9219d1 |
unixctl_command_register("status", "", 0, 0, ovn_northd_status, &state);
|
|
|
9219d1 |
|
|
|
9219d1 |
+ bool reset_ovnsb_idl_min_index = false;
|
|
|
9219d1 |
+ unixctl_command_register("sb-cluster-state-reset", "", 0, 0,
|
|
|
9219d1 |
+ cluster_state_reset_cmd,
|
|
|
9219d1 |
+ &reset_ovnsb_idl_min_index);
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ bool reset_ovnnb_idl_min_index = false;
|
|
|
9219d1 |
+ unixctl_command_register("nb-cluster-state-reset", "", 0, 0,
|
|
|
9219d1 |
+ cluster_state_reset_cmd,
|
|
|
9219d1 |
+ &reset_ovnnb_idl_min_index);
|
|
|
9219d1 |
+
|
|
|
9219d1 |
daemonize_complete();
|
|
|
9219d1 |
|
|
|
9219d1 |
/* We want to detect (almost) all changes to the ovn-nb db. */
|
|
|
9219d1 |
@@ -12684,6 +12695,18 @@ main(int argc, char *argv[])
|
|
|
9219d1 |
ovsdb_idl_set_probe_interval(ovnnb_idl_loop.idl, northd_probe_interval);
|
|
|
9219d1 |
ovsdb_idl_set_probe_interval(ovnsb_idl_loop.idl, northd_probe_interval);
|
|
|
9219d1 |
|
|
|
9219d1 |
+ if (reset_ovnsb_idl_min_index) {
|
|
|
9219d1 |
+ VLOG_INFO("Resetting southbound database cluster state");
|
|
|
9219d1 |
+ ovsdb_idl_reset_min_index(ovnsb_idl_loop.idl);
|
|
|
9219d1 |
+ reset_ovnsb_idl_min_index = false;
|
|
|
9219d1 |
+ }
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ if (reset_ovnnb_idl_min_index) {
|
|
|
9219d1 |
+ VLOG_INFO("Resetting northbound database cluster state");
|
|
|
9219d1 |
+ ovsdb_idl_reset_min_index(ovnnb_idl_loop.idl);
|
|
|
9219d1 |
+ reset_ovnnb_idl_min_index = false;
|
|
|
9219d1 |
+ }
|
|
|
9219d1 |
+
|
|
|
9219d1 |
poll_block();
|
|
|
9219d1 |
if (should_service_stop()) {
|
|
|
9219d1 |
exiting = true;
|
|
|
9219d1 |
@@ -12762,3 +12785,14 @@ ovn_northd_status(struct unixctl_conn *conn, int argc OVS_UNUSED,
|
|
|
9219d1 |
unixctl_command_reply(conn, ds_cstr(&s));
|
|
|
9219d1 |
ds_destroy(&s);
|
|
|
9219d1 |
}
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+static void
|
|
|
9219d1 |
+cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED,
|
|
|
9219d1 |
+ const char *argv[] OVS_UNUSED, void *idl_reset_)
|
|
|
9219d1 |
+{
|
|
|
9219d1 |
+ bool *idl_reset = idl_reset_;
|
|
|
9219d1 |
+
|
|
|
9219d1 |
+ *idl_reset = true;
|
|
|
9219d1 |
+ poll_immediate_wake();
|
|
|
9219d1 |
+ unixctl_command_reply(conn, NULL);
|
|
|
9219d1 |
+}
|
|
|
9219d1 |
--
|
|
|
9219d1 |
2.26.2
|
|
|
9219d1 |
|