9219d1
From 8e629246fead7a5dfee78ca45627a0cfd7fef4b1 Mon Sep 17 00:00:00 2001
9219d1
From: Mark Michelson <mmichels@redhat.com>
9219d1
Date: Wed, 10 Jun 2020 14:50:06 -0400
9219d1
Subject: [PATCH 20/22] Add northd and ovn-controller cluster status reset
9219d1
 commands.
9219d1
9219d1
During the course of debugging a clustered DB environment, all members
9219d1
of the southbound database cluster were destroyed (i.e. the .db files
9219d1
were removed from disk) and then restarted. Once this happened,
9219d1
ovn-northd and ovn-controller could not interact with the southbound
9219d1
database because they both detected all members of the cluster as having
9219d1
"stale" data. The only course of action was to reset ovn-northd and all
9219d1
ovn-controllers. It is possible to have this happen with the northbound
9219d1
database as well if it is clustered.
9219d1
9219d1
This patch offers new ovn-appctl commands for ovn-northd and
9219d1
ovn-controller that allow each daemon to reset its clustered status. This
9219d1
allows the daemon to interact with the database successfully after a cluster
9219d1
teardown and restart.
9219d1
9219d1
Signed-off-by: Mark Michelson <mmichels@redhat.com>
9219d1
Acked-by: Han Zhou <hzhou@ovn.org>
9219d1
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1829109
9219d1
9219d1
(cherry-picked from upstream ovn master commit 512b884dea3f85791eca44fd1d92956e8282be6d)
9219d1
9219d1
Change-Id: I3e7d844d6f79552fd53a018c74b80def6069edcb
9219d1
---
9219d1
 controller/ovn-controller.8.xml | 16 ++++++++++++++++
9219d1
 controller/ovn-controller.c     | 30 ++++++++++++++++++++++++++---
9219d1
 northd/ovn-northd.8.xml         | 28 +++++++++++++++++++++++++++
9219d1
 northd/ovn-northd.c             | 34 +++++++++++++++++++++++++++++++++
9219d1
 4 files changed, 105 insertions(+), 3 deletions(-)
9219d1
9219d1
diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
9219d1
index 92e0a6e43..66877314c 100644
9219d1
--- a/controller/ovn-controller.8.xml
9219d1
+++ b/controller/ovn-controller.8.xml
9219d1
@@ -491,6 +491,22 @@
9219d1
         recomputes are cpu intensive.
9219d1
       

9219d1
       
9219d1
+
9219d1
+      
sb-cluster-state-reset
9219d1
+      
9219d1
+      

9219d1
+        Reset southbound database cluster status when databases are destroyed
9219d1
+        and rebuilt.
9219d1
+      

9219d1
+      

9219d1
+        If all databases in a clustered southbound database are removed from
9219d1
+        disk, then the stored index of all databases will be reset to zero.
9219d1
+        This will cause ovn-controller to be unable to read or write to the
9219d1
+        southbound database, because it will always detect the data as stale.
9219d1
+        In such a case, run this command so that ovn-controller will reset its
9219d1
+        local index so that it can interact with the southbound database again.
9219d1
+      

9219d1
+      
9219d1
       
9219d1
     

9219d1
 
9219d1
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
9219d1
index 85e58d04f..fe6048153 100644
9219d1
--- a/controller/ovn-controller.c
9219d1
+++ b/controller/ovn-controller.c
9219d1
@@ -72,6 +72,7 @@ static unixctl_cb_func ct_zone_list;
9219d1
 static unixctl_cb_func extend_table_list;
9219d1
 static unixctl_cb_func inject_pkt;
9219d1
 static unixctl_cb_func engine_recompute_cmd;
9219d1
+static unixctl_cb_func cluster_state_reset_cmd;
9219d1
 
9219d1
 #define DEFAULT_BRIDGE_NAME "br-int"
9219d1
 #define DEFAULT_PROBE_INTERVAL_MSEC 5000
9219d1
@@ -445,7 +446,7 @@ get_ofctrl_probe_interval(struct ovsdb_idl *ovs_idl)
9219d1
  * updates 'sbdb_idl' with that pointer. */
9219d1
 static void
9219d1
 update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl,
9219d1
-             bool *monitor_all_p)
9219d1
+             bool *monitor_all_p, bool *reset_ovnsb_idl_min_index)
9219d1
 {
9219d1
     const struct ovsrec_open_vswitch *cfg = ovsrec_open_vswitch_first(ovs_idl);
9219d1
     if (!cfg) {
9219d1
@@ -475,6 +476,12 @@ update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl,
9219d1
     if (monitor_all_p) {
9219d1
         *monitor_all_p = monitor_all;
9219d1
     }
9219d1
+    if (*reset_ovnsb_idl_min_index) {
9219d1
+        VLOG_INFO("Resetting southbound database cluster state");
9219d1
+        engine_set_force_recompute(true);
9219d1
+        ovsdb_idl_reset_min_index(ovnsb_idl);
9219d1
+        *reset_ovnsb_idl_min_index = false;
9219d1
+    }
9219d1
 }
9219d1
 
9219d1
 static void
9219d1
@@ -2287,6 +2294,11 @@ main(int argc, char *argv[])
9219d1
     unixctl_command_register("recompute", "", 0, 0, engine_recompute_cmd,
9219d1
                              NULL);
9219d1
 
9219d1
+    bool reset_ovnsb_idl_min_index = false;
9219d1
+    unixctl_command_register("sb-cluster-state-reset", "", 0, 0,
9219d1
+                             cluster_state_reset_cmd,
9219d1
+                             &reset_ovnsb_idl_min_index);
9219d1
+
9219d1
     unsigned int ovs_cond_seqno = UINT_MAX;
9219d1
     unsigned int ovnsb_cond_seqno = UINT_MAX;
9219d1
 
9219d1
@@ -2308,7 +2320,8 @@ main(int argc, char *argv[])
9219d1
             ovs_cond_seqno = new_ovs_cond_seqno;
9219d1
         }
9219d1
 
9219d1
-        update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all);
9219d1
+        update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all,
9219d1
+                     &reset_ovnsb_idl_min_index);
9219d1
         update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl));
9219d1
         ofctrl_set_probe_interval(get_ofctrl_probe_interval(ovs_idl_loop.idl));
9219d1
 
9219d1
@@ -2558,7 +2571,7 @@ main(int argc, char *argv[])
9219d1
     if (!restart) {
9219d1
         bool done = !ovsdb_idl_has_ever_connected(ovnsb_idl_loop.idl);
9219d1
         while (!done) {
9219d1
-            update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL);
9219d1
+            update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL, false);
9219d1
             update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl));
9219d1
 
9219d1
             struct ovsdb_idl_txn *ovs_idl_txn
9219d1
@@ -2780,3 +2793,14 @@ engine_recompute_cmd(struct unixctl_conn *conn OVS_UNUSED, int argc OVS_UNUSED,
9219d1
     poll_immediate_wake();
9219d1
     unixctl_command_reply(conn, NULL);
9219d1
 }
9219d1
+
9219d1
+static void
9219d1
+cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED,
9219d1
+               const char *argv[] OVS_UNUSED, void *idl_reset_)
9219d1
+{
9219d1
+    bool *idl_reset = idl_reset_;
9219d1
+
9219d1
+    *idl_reset = true;
9219d1
+    poll_immediate_wake();
9219d1
+    unixctl_command_reply(conn, NULL);
9219d1
+}
9219d1
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
9219d1
index e45d494e8..989e3643b 100644
9219d1
--- a/northd/ovn-northd.8.xml
9219d1
+++ b/northd/ovn-northd.8.xml
9219d1
@@ -96,6 +96,34 @@
9219d1
         acquired OVSDB lock on SB DB, "standby" if it has not or "paused" if
9219d1
         this instance is paused.
9219d1
       
9219d1
+
9219d1
+      
sb-cluster-state-reset
9219d1
+      
9219d1
+      

9219d1
+        Reset southbound database cluster status when databases are destroyed
9219d1
+        and rebuilt.
9219d1
+      

9219d1
+      

9219d1
+        If all databases in a clustered southbound database are removed from
9219d1
+        disk, then the stored index of all databases will be reset to zero.
9219d1
+        This will cause ovn-northd to be unable to read or write to the
9219d1
+        southbound database, because it will always detect the data as stale.
9219d1
+        In such a case, run this command so that ovn-northd will reset its
9219d1
+        local index so that it can interact with the southbound database again.
9219d1
+      

9219d1
+      
9219d1
+
9219d1
+      
nb-cluster-state-reset
9219d1
+      
9219d1
+      

9219d1
+        Reset northbound database cluster status when databases are destroyed
9219d1
+        and rebuilt.
9219d1
+      

9219d1
+      

9219d1
+        This performs the same task as sb-cluster-state-reset
9219d1
+        except for the northbound database client.
9219d1
+      

9219d1
+      
9219d1
       
9219d1
     

9219d1
 
9219d1
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
9219d1
index 5f0abeee1..fc05accde 100644
9219d1
--- a/northd/ovn-northd.c
9219d1
+++ b/northd/ovn-northd.c
9219d1
@@ -56,6 +56,7 @@ static unixctl_cb_func ovn_northd_pause;
9219d1
 static unixctl_cb_func ovn_northd_resume;
9219d1
 static unixctl_cb_func ovn_northd_is_paused;
9219d1
 static unixctl_cb_func ovn_northd_status;
9219d1
+static unixctl_cb_func cluster_state_reset_cmd;
9219d1
 
9219d1
 struct northd_context {
9219d1
     struct ovsdb_idl *ovnnb_idl;
9219d1
@@ -12393,6 +12394,16 @@ main(int argc, char *argv[])
9219d1
                              &state);
9219d1
     unixctl_command_register("status", "", 0, 0, ovn_northd_status, &state);
9219d1
 
9219d1
+    bool reset_ovnsb_idl_min_index = false;
9219d1
+    unixctl_command_register("sb-cluster-state-reset", "", 0, 0,
9219d1
+                             cluster_state_reset_cmd,
9219d1
+                             &reset_ovnsb_idl_min_index);
9219d1
+
9219d1
+    bool reset_ovnnb_idl_min_index = false;
9219d1
+    unixctl_command_register("nb-cluster-state-reset", "", 0, 0,
9219d1
+                             cluster_state_reset_cmd,
9219d1
+                             &reset_ovnnb_idl_min_index);
9219d1
+
9219d1
     daemonize_complete();
9219d1
 
9219d1
     /* We want to detect (almost) all changes to the ovn-nb db. */
9219d1
@@ -12684,6 +12695,18 @@ main(int argc, char *argv[])
9219d1
         ovsdb_idl_set_probe_interval(ovnnb_idl_loop.idl, northd_probe_interval);
9219d1
         ovsdb_idl_set_probe_interval(ovnsb_idl_loop.idl, northd_probe_interval);
9219d1
 
9219d1
+        if (reset_ovnsb_idl_min_index) {
9219d1
+            VLOG_INFO("Resetting southbound database cluster state");
9219d1
+            ovsdb_idl_reset_min_index(ovnsb_idl_loop.idl);
9219d1
+            reset_ovnsb_idl_min_index = false;
9219d1
+        }
9219d1
+
9219d1
+        if (reset_ovnnb_idl_min_index) {
9219d1
+            VLOG_INFO("Resetting northbound database cluster state");
9219d1
+            ovsdb_idl_reset_min_index(ovnnb_idl_loop.idl);
9219d1
+            reset_ovnnb_idl_min_index = false;
9219d1
+        }
9219d1
+
9219d1
         poll_block();
9219d1
         if (should_service_stop()) {
9219d1
             exiting = true;
9219d1
@@ -12762,3 +12785,14 @@ ovn_northd_status(struct unixctl_conn *conn, int argc OVS_UNUSED,
9219d1
     unixctl_command_reply(conn, ds_cstr(&s));
9219d1
     ds_destroy(&s);
9219d1
 }
9219d1
+
9219d1
+static void
9219d1
+cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED,
9219d1
+               const char *argv[] OVS_UNUSED, void *idl_reset_)
9219d1
+{
9219d1
+    bool *idl_reset = idl_reset_;
9219d1
+
9219d1
+    *idl_reset = true;
9219d1
+    poll_immediate_wake();
9219d1
+    unixctl_command_reply(conn, NULL);
9219d1
+}
9219d1
-- 
9219d1
2.26.2
9219d1