Blob Blame History Raw
From 308df38edabc40221b9ce293e73ea4ac71eb965e Mon Sep 17 00:00:00 2001
From: Numan Siddique <numans@ovn.org>
Date: Tue, 5 Nov 2019 23:11:56 +0530
Subject: [PATCH ovn] Fix ha chassis failover issues for stale ha chassis
 entries

If all but one of the HA chassis rows of an HA chassis group become stale,
i.e. their HA_Chassis.chassis column is empty (because ovn-controller is not
running on those chassis), and ha_chassis_group_is_active() is called on the
ovn-controller of the one remaining chassis, it returns false. Ideally that
chassis should become active, since it's the only active chassis in the group.
This patch fixes this issue.

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1762777
Reported-by: Daniel Alvarez <dalvarez@redhat.com>

Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Numan Siddique <numans@ovn.org>
---
 ovn/controller/ha-chassis.c | 25 +++++++++++++++++++++++++
 tests/ovn.at                | 20 +++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/ovn/controller/ha-chassis.c b/ovn/controller/ha-chassis.c
index 6d9426a..d6ec7b6 100644
--- a/ovn/controller/ha-chassis.c
+++ b/ovn/controller/ha-chassis.c
@@ -142,6 +142,27 @@ ha_chassis_destroy_ordered(struct ha_chassis_ordered *ordered_ha_ch)
     }
 }
 
+/* Returns true if exactly one HA chassis in the chassis group is active
+ * (i.e. its HA_Chassis.chassis column is set) and that active HA chassis
+ * is the local chassis.
+ * Returns false otherwise. */
+static bool
+is_local_chassis_only_candidate(const struct sbrec_ha_chassis_group *ha_ch_grp,
+                                const struct sbrec_chassis *local_chassis)
+{
+    size_t n_active_ha_chassis = 0;
+    bool local_chassis_present = false;
+    for (size_t i = 0; i < ha_ch_grp->n_ha_chassis; i++) {
+        if (ha_ch_grp->ha_chassis[i]->chassis) {
+            n_active_ha_chassis++;
+            if (ha_ch_grp->ha_chassis[i]->chassis == local_chassis) {
+                local_chassis_present = true;
+            }
+        }
+    }
+
+    return (local_chassis_present && n_active_ha_chassis == 1);
+}
 
 /* Returns true if the local_chassis is the master of
  * the HA chassis group, false otherwise. */
@@ -159,6 +180,10 @@ ha_chassis_group_is_active(
         return (ha_ch_grp->ha_chassis[0]->chassis == local_chassis);
     }
 
+    if (is_local_chassis_only_candidate(ha_ch_grp, local_chassis)) {
+        return true;
+    }
+
     if (sset_is_empty(active_tunnels)) {
         /* If active tunnel sset is empty, it means it has lost
          * connectivity with other chassis. */
diff --git a/tests/ovn.at b/tests/ovn.at
index 410f4b5..cb7903d 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -13413,7 +13413,25 @@ OVS_WAIT_UNTIL(
 logical_port=ls1-lp_ext1`
     test "$chassis" = "$hv1_uuid"])
 
-OVN_CLEANUP([hv1],[hv2],[hv3])
+# Stop ovn-controllers on hv1 and hv3.
+as hv1 ovs-appctl -t ovn-controller exit
+as hv3 ovs-appctl -t ovn-controller exit
+
+# hv2 should be master and claim ls1-lp_ext1
+OVS_WAIT_UNTIL(
+    [chassis=`ovn-sbctl --bare --columns chassis find port_binding \
+logical_port=ls1-lp_ext1`
+    test "$chassis" = "$hv2_uuid"])
+
+as hv1
+OVS_APP_EXIT_AND_WAIT([ovs-vswitchd])
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as hv3
+OVS_APP_EXIT_AND_WAIT([ovs-vswitchd])
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+OVN_CLEANUP([hv2])
 AT_CLEANUP
 
 AT_SETUP([ovn -- Address Set Incremental Processing])
-- 
1.8.3.1