Blob Blame History Raw
From 803ad90f11eb57221e7805e5cba8c309bafe1de8 Mon Sep 17 00:00:00 2001
From: Gris Ge <fge@redhat.com>
Date: Wed, 13 Jan 2021 23:51:31 +0800
Subject: [PATCH 1/2] nm: Better handling for timeout

When creating 1000 VLAN along with 1000 bridge using each VLAN,
NetworkManager might trigger two timeout:

 * The callback raises `Gio.IOErrorEnum.TIMED_OUT` error.
 * NetworkManager never call callback, nmstate idle check trigger the
   timeout.

To solve above issue:

 * Increase the nmstate idle check timeout to 5 minutes.

 * For actions like add profile and activate profile, we add a
   fallback checker which check whether requested action is already
   finished using `GLib.timeout_source_new()`

 * When `Gio.IOErrorEnum.TIMED_OUT` happens, ignore the failure and wait
   fallback checker.

 * The fallback checker is only started 15 seconds after action started,
   so this does not impact small desire state.

Test results on RHEL 8.3 i7-8665U 2G RAM:

    10m29.212s to create 1000 VLAN and 1000 bridge over each VLAN.

Changed the integration test case to test 500 VLANs + 500 bridges.

Signed-off-by: Gris Ge <fge@redhat.com>
---
 libnmstate/nm/connection.py | 67 ++++++++++++++++++++++++++++++++++++-
 libnmstate/nm/context.py    |  3 +-
 2 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/libnmstate/nm/connection.py b/libnmstate/nm/connection.py
index 1f6c734..374a379 100644
--- a/libnmstate/nm/connection.py
+++ b/libnmstate/nm/connection.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2018-2020 Red Hat, Inc.
+# Copyright (c) 2018-2021 Red Hat, Inc.
 #
 # This file is part of nmstate
 #
@@ -24,11 +24,14 @@ from libnmstate.error import NmstateLibnmError
 from libnmstate.error import NmstateInternalError
 from libnmstate.error import NmstateValueError
 
+from .common import GLib
+from .common import Gio
 from .common import NM
 from . import ipv4
 from . import ipv6
 
 ACTIVATION_TIMEOUT_FOR_BRIDGE = 35  # Bridge STP requires 30 seconds.
+FALLBACK_CHECKER_INTERNAL = 15
 
 
 class ConnectionProfile:
@@ -40,6 +43,7 @@ class ConnectionProfile:
         self._nm_ac = None
         self._ac_handlers = set()
         self._dev_handlers = set()
+        self._fallback_checker = None
 
     def create(self, settings):
         self.profile = NM.SimpleConnection.new()
@@ -102,6 +106,26 @@ class ConnectionProfile:
             self._add_connection2_callback,
             user_data,
         )
+        self._fallback_checker = GLib.timeout_source_new(
+            FALLBACK_CHECKER_INTERNAL * 1000
+        )
+        self._fallback_checker.set_callback(
+            self._profile_add_fallback_checker_callback, action
+        )
+        self._fallback_checker.attach(self._ctx.context)
+
+    def _profile_add_fallback_checker_callback(self, action):
+        for nm_profile in self._ctx.client.get_connections():
+            if nm_profile.get_uuid() == self.profile.get_uuid():
+                self._fallback_checker_cleanup()
+                self._ctx.finish_async(action)
+                return GLib.SOURCE_REMOVE
+        return GLib.SOURCE_CONTINUE
+
+    def _fallback_checker_cleanup(self):
+        if self._fallback_checker:
+            self._fallback_checker.destroy()
+            self._fallback_checker = None
 
     def delete(self):
         if not self.profile:
@@ -152,6 +176,26 @@ class ConnectionProfile:
             self._active_connection_callback,
             user_data,
         )
+        self._fallback_checker = GLib.timeout_source_new(
+            FALLBACK_CHECKER_INTERNAL * 1000
+        )
+        self._fallback_checker.set_callback(
+            self._activation_fallback_checker_callback, action
+        )
+        self._fallback_checker.attach(self._ctx.context)
+
+    def _activation_fallback_checker_callback(self, action):
+        if self.devname:
+            self._nm_dev = self._ctx.get_nm_dev(self.devname)
+            if self._nm_dev:
+                self._activation_progress_check(action)
+            return GLib.SOURCE_CONTINUE
+        else:
+            logging.warn(
+                "Failed to get interface name from profile, "
+                "can not perform flalback check on activation"
+            )
+            return GLib.SOURCE_REMOVE
 
     @property
     def profile(self):
@@ -213,6 +257,18 @@ class ConnectionProfile:
 
         try:
             nm_act_con = src_object.activate_connection_finish(result)
+        except GLib.Error as e:
+            if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
+                logging.debug(
+                    f"{action} timeout on activation, "
+                    "using fallback method to wait activation"
+                )
+                return
+            else:
+                self._ctx.fail(
+                    NmstateLibnmError(f"{action} failed: error={e}")
+                )
+                return
         except Exception as e:
             self._ctx.fail(NmstateLibnmError(f"{action} failed: error={e}"))
             return
@@ -366,6 +422,7 @@ class ConnectionProfile:
     def _activation_clean_up(self):
         self._remove_ac_handlers()
         self._remove_dev_handlers()
+        self._fallback_checker_cleanup()
 
     def _is_activating(self):
         if not self._nm_ac or not self._nm_dev:
@@ -396,6 +453,13 @@ class ConnectionProfile:
         action = user_data
         try:
             profile = src_object.add_connection2_finish(result)[0]
+        except GLib.Error as e:
+            if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
+                logging.debug(
+                    f"{action} timeout, using fallback method to "
+                    "wait profile creation"
+                )
+                return
         except Exception as e:
             self._ctx.fail(
                 NmstateLibnmError(f"{action} failed with error: {e}")
@@ -410,6 +474,7 @@ class ConnectionProfile:
                 )
             )
         else:
+            self._fallback_checker_cleanup()
             self._ctx.finish_async(action)
 
     def _update2_callback(self, src_object, result, user_data):
diff --git a/libnmstate/nm/context.py b/libnmstate/nm/context.py
index 373ffe8..bc5c41c 100644
--- a/libnmstate/nm/context.py
+++ b/libnmstate/nm/context.py
@@ -31,8 +31,7 @@ from .common import Gio
 # last finish async action.
 IDLE_CHECK_INTERNAL = 5
 
-# libnm dbus connection has reply timeout 25 seconds.
-IDLE_TIMEOUT = 25
+IDLE_TIMEOUT = 60 * 5  # 5 minutes
 
 # NetworkManage is using dbus in libnm while the dbus has limitation on
 # maximum number of pending replies per connection.(RHEL/CentOS 8 is 1024)
-- 
2.27.0


From ac82d18f96aa2313583efa1477be441291e2957c Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Sun, 17 Jan 2021 11:18:10 +0800
Subject: [PATCH 2/2] nm: Use fallback checker on profile deactivation and
 delete

When NM is under heave loads, NM might raise timeout error when
try to deactivate or delete a profile, to solve that this patch
introduce the same method in 2407f98
to have a fallback checker on whether profile is deactivated/deleted
every 15 seconds.

No test case required as current `test_lot_of_vlans_with_bridges` test
case has `state: absent` which is good enough for testing this patch.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Gris Ge <fge@redhat.com>
---
 libnmstate/nm/active_connection.py | 61 ++++++++++++++++++++++++------
 libnmstate/nm/connection.py        | 30 +++++++++++++++
 2 files changed, 80 insertions(+), 11 deletions(-)

diff --git a/libnmstate/nm/active_connection.py b/libnmstate/nm/active_connection.py
index 062c78a..b235e8b 100644
--- a/libnmstate/nm/active_connection.py
+++ b/libnmstate/nm/active_connection.py
@@ -21,12 +21,14 @@ import logging
 
 from libnmstate.error import NmstateLibnmError
 
+from .common import Gio
 from .common import GLib
 from .common import GObject
 from .common import NM
 
 
 NM_AC_STATE_CHANGED_SIGNAL = "state-changed"
+FALLBACK_CHECKER_INTERNAL = 15
 
 
 class ActivationError(Exception):
@@ -37,6 +39,8 @@ class ActiveConnection:
     def __init__(self, context=None, nm_ac_con=None):
         self._ctx = context
         self._act_con = nm_ac_con
+        self._signal_handler = None
+        self._fallback_checker = None
 
         nmdevs = None
         if nm_ac_con:
@@ -75,19 +79,35 @@ class ActiveConnection:
 
         action = f"Deactivate profile: {self.devname}"
         self._ctx.register_async(action)
-        handler_id = act_connection.connect(
+        self._signal_handler = act_connection.connect(
             NM_AC_STATE_CHANGED_SIGNAL,
             self._wait_state_changed_callback,
             action,
         )
         if act_connection.props.state != NM.ActiveConnectionState.DEACTIVATING:
-            user_data = (handler_id, action)
+            user_data = action
             self._ctx.client.deactivate_connection_async(
                 act_connection,
                 self._ctx.cancellable,
                 self._deactivate_connection_callback,
                 user_data,
             )
+        self._fallback_checker = GLib.timeout_source_new(
+            FALLBACK_CHECKER_INTERNAL * 1000
+        )
+        self._fallback_checker.set_callback(
+            self._deactivation_fallback_checker_callback, action
+        )
+        self._fallback_checker.attach(self._ctx.context)
+
+    def _clean_up(self):
+        if self._signal_handler:
+            if self._act_con:
+                self._act_con.handler_disconnect(self._signal_handler)
+            self._signal_handler = None
+        if self._fallback_checker:
+            self._fallback_checker.destroy()
+            self._fallback_checker = None
 
     def _wait_state_changed_callback(self, act_con, state, reason, action):
         if self._ctx.is_cancelled():
@@ -96,13 +116,13 @@ class ActiveConnection:
             logging.debug(
                 "Connection deactivation succeeded on %s", self.devname,
             )
+            self._clean_up()
             self._ctx.finish_async(action)
 
     def _deactivate_connection_callback(self, src_object, result, user_data):
-        handler_id, action = user_data
+        action = user_data
         if self._ctx.is_cancelled():
-            if self._act_con:
-                self._act_con.handler_disconnect(handler_id)
+            self._clean_up()
             return
 
         try:
@@ -116,16 +136,20 @@ class ActiveConnection:
                     "Connection is not active on {}, no need to "
                     "deactivate".format(self.devname)
                 )
+            elif e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
+                logging.debug(
+                    f"{action} timeout, using fallback method to wait profile "
+                    "deactivation"
+                )
+                return
             else:
-                if self._act_con:
-                    self._act_con.handler_disconnect(handler_id)
+                self._clean_up()
                 self._ctx.fail(
                     NmstateLibnmError(f"{action} failed: error={e}")
                 )
                 return
         except Exception as e:
-            if self._act_con:
-                self._act_con.handler_disconnect(handler_id)
+            self._clean_up()
             self._ctx.fail(
                 NmstateLibnmError(
                     f"BUG: Unexpected error when activating {self.devname} "
@@ -135,8 +159,7 @@ class ActiveConnection:
             return
 
         if not success:
-            if self._act_con:
-                self._act_con.handler_disconnect(handler_id)
+            self._clean_up()
             self._ctx.fail(
                 NmstateLibnmError(
                     f"{action} failed: error='None returned from "
@@ -144,6 +167,22 @@ class ActiveConnection:
                 )
             )
 
+    def _deactivation_fallback_checker_callback(self, action):
+        if self.devname:
+            self._nmdev = self._ctx.get_nm_dev(self.devname)
+            if self._nmdev:
+                self._act_con = self._nmdev.get_active_connection()
+                if (
+                    self._act_con
+                    and self._act_con.props.state
+                    != NM.ActiveConnectionState.DEACTIVATED
+                ):
+                    return GLib.SOURCE_CONTINUE
+
+        self._clean_up()
+        self._ctx.finish_async(action)
+        return GLib.SOURCE_REMOVE
+
     @property
     def nm_active_connection(self):
         return self._act_con
diff --git a/libnmstate/nm/connection.py b/libnmstate/nm/connection.py
index 374a379..af7296f 100644
--- a/libnmstate/nm/connection.py
+++ b/libnmstate/nm/connection.py
@@ -144,6 +144,13 @@ class ConnectionProfile:
                 self._delete_connection_callback,
                 user_data,
             )
+            self._fallback_checker = GLib.timeout_source_new(
+                FALLBACK_CHECKER_INTERNAL * 1000
+            )
+            self._fallback_checker.set_callback(
+                self._delete_fallback_checker_callback, action
+            )
+            self._fallback_checker.attach(self._ctx.context)
 
     def activate(self):
         if self.con_id:
@@ -504,11 +511,24 @@ class ConnectionProfile:
         action = user_data
         try:
             success = src_object.delete_finish(result)
+        except GLib.Error as e:
+            if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
+                logging.debug(
+                    f"{action} timeout, using fallback method to wait profile "
+                    "deletion"
+                )
+                return
+            else:
+                self._ctx.fail(
+                    NmstateLibnmError(f"{action} failed with error: {e}")
+                )
+                return
         except Exception as e:
             self._ctx.fail(NmstateLibnmError(f"{action} failed: error={e}"))
             return
 
         if success:
+            self._fallback_checker_cleanup()
             self._ctx.finish_async(action)
         else:
             self._ctx.fail(
@@ -518,6 +538,16 @@ class ConnectionProfile:
                 )
             )
 
+    def _delete_fallback_checker_callback(self, action):
+        if self.profile:
+            for nm_profile in self._ctx.client.get_connections():
+                if nm_profile.get_uuid() == self.profile.get_uuid():
+                    return GLib.SOURCE_CONTINUE
+
+        self._fallback_checker_cleanup()
+        self._ctx.finish_async(action)
+        return GLib.SOURCE_REMOVE
+
     def _reset_profile(self):
         self._con_profile = None
 
-- 
2.27.0