From 803ad90f11eb57221e7805e5cba8c309bafe1de8 Mon Sep 17 00:00:00 2001 From: Gris Ge Date: Wed, 13 Jan 2021 23:51:31 +0800 Subject: [PATCH 1/2] nm: Better handling for timeout When creating 1000 VLAN along with 1000 bridge using each VLAN, NetworkManager might trigger two timeout: * The callback raises `Gio.IOErrorEnum.TIMED_OUT` error. * NetworkManager never call callback, nmstate idle check trigger the timeout. To solve above issue: * Increase the nmstate idle check timeout to 5 minutes. * For actions like add profile and activate profile, we add a fallback checker which check whether requested action is already finished using `GLib.timeout_source_new()` * When `Gio.IOErrorEnum.TIMED_OUT` happens, ignore the failure and wait fallback checker. * The fallback checker is only started 15 seconds after action started, so this does not impact small desire state. Test results on RHEL 8.3 i7-8665U 2G RAM: 10m29.212s to create 1000 VLAN and 1000 bridge over each VLAN. Changed the integration test case to test 500 VLANs + 500 bridges. Signed-off-by: Gris Ge --- libnmstate/nm/connection.py | 67 ++++++++++++++++++++++++++++++++++++- libnmstate/nm/context.py | 3 +- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/libnmstate/nm/connection.py b/libnmstate/nm/connection.py index 1f6c734..374a379 100644 --- a/libnmstate/nm/connection.py +++ b/libnmstate/nm/connection.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2018-2020 Red Hat, Inc. +# Copyright (c) 2018-2021 Red Hat, Inc. # # This file is part of nmstate # @@ -24,11 +24,14 @@ from libnmstate.error import NmstateLibnmError from libnmstate.error import NmstateInternalError from libnmstate.error import NmstateValueError +from .common import GLib +from .common import Gio from .common import NM from . import ipv4 from . import ipv6 ACTIVATION_TIMEOUT_FOR_BRIDGE = 35 # Bridge STP requires 30 seconds. +FALLBACK_CHECKER_INTERNAL = 15 class ConnectionProfile: @@ -40,6 +43,7 @@ class ConnectionProfile: self._nm_ac = None self._ac_handlers = set() self._dev_handlers = set() + self._fallback_checker = None def create(self, settings): self.profile = NM.SimpleConnection.new() @@ -102,6 +106,26 @@ class ConnectionProfile: self._add_connection2_callback, user_data, ) + self._fallback_checker = GLib.timeout_source_new( + FALLBACK_CHECKER_INTERNAL * 1000 + ) + self._fallback_checker.set_callback( + self._profile_add_fallback_checker_callback, action + ) + self._fallback_checker.attach(self._ctx.context) + + def _profile_add_fallback_checker_callback(self, action): + for nm_profile in self._ctx.client.get_connections(): + if nm_profile.get_uuid() == self.profile.get_uuid(): + self._fallback_checker_cleanup() + self._ctx.finish_async(action) + return GLib.SOURCE_REMOVE + return GLib.SOURCE_CONTINUE + + def _fallback_checker_cleanup(self): + if self._fallback_checker: + self._fallback_checker.destroy() + self._fallback_checker = None def delete(self): if not self.profile: @@ -152,6 +176,26 @@ class ConnectionProfile: self._active_connection_callback, user_data, ) + self._fallback_checker = GLib.timeout_source_new( + FALLBACK_CHECKER_INTERNAL * 1000 + ) + self._fallback_checker.set_callback( + self._activation_fallback_checker_callback, action + ) + self._fallback_checker.attach(self._ctx.context) + + def _activation_fallback_checker_callback(self, action): + if self.devname: + self._nm_dev = self._ctx.get_nm_dev(self.devname) + if self._nm_dev: + self._activation_progress_check(action) + return GLib.SOURCE_CONTINUE + else: + logging.warn( + "Failed to get interface name from profile, " + "can not perform flalback check on activation" + ) + return GLib.SOURCE_REMOVE @property def profile(self): @@ -213,6 +257,18 @@ class ConnectionProfile: try: nm_act_con = src_object.activate_connection_finish(result) + except GLib.Error as e: + if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT): + logging.debug( + f"{action} timeout on activation, " + "using fallback method to wait activation" + ) + return + else: + self._ctx.fail( + NmstateLibnmError(f"{action} failed: error={e}") + ) + return except Exception as e: self._ctx.fail(NmstateLibnmError(f"{action} failed: error={e}")) return @@ -366,6 +422,7 @@ class ConnectionProfile: def _activation_clean_up(self): self._remove_ac_handlers() self._remove_dev_handlers() + self._fallback_checker_cleanup() def _is_activating(self): if not self._nm_ac or not self._nm_dev: @@ -396,6 +453,13 @@ class ConnectionProfile: action = user_data try: profile = src_object.add_connection2_finish(result)[0] + except GLib.Error as e: + if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT): + logging.debug( + f"{action} timeout, using fallback method to " + "wait profile creation" + ) + return except Exception as e: self._ctx.fail( NmstateLibnmError(f"{action} failed with error: {e}") @@ -410,6 +474,7 @@ class ConnectionProfile: ) ) else: + self._fallback_checker_cleanup() self._ctx.finish_async(action) def _update2_callback(self, src_object, result, user_data): diff --git a/libnmstate/nm/context.py b/libnmstate/nm/context.py index 373ffe8..bc5c41c 100644 --- a/libnmstate/nm/context.py +++ b/libnmstate/nm/context.py @@ -31,8 +31,7 @@ from .common import Gio # last finish async action. IDLE_CHECK_INTERNAL = 5 -# libnm dbus connection has reply timeout 25 seconds. -IDLE_TIMEOUT = 25 +IDLE_TIMEOUT = 60 * 5 # 5 minutes # NetworkManage is using dbus in libnm while the dbus has limitation on # maximum number of pending replies per connection.(RHEL/CentOS 8 is 1024) -- 2.27.0 From ac82d18f96aa2313583efa1477be441291e2957c Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Sun, 17 Jan 2021 11:18:10 +0800 Subject: [PATCH 2/2] nm: Use fallback checker on profile deactivation and delete When NM is under heave loads, NM might raise timeout error when try to deactivate or delete a profile, to solve that this patch introduce the same method in 2407f98 to have a fallback checker on whether profile is deactivated/deleted every 15 seconds. No test case required as current `test_lot_of_vlans_with_bridges` test case has `state: absent` which is good enough for testing this patch. Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Gris Ge --- libnmstate/nm/active_connection.py | 61 ++++++++++++++++++++++++------ libnmstate/nm/connection.py | 30 +++++++++++++++ 2 files changed, 80 insertions(+), 11 deletions(-) diff --git a/libnmstate/nm/active_connection.py b/libnmstate/nm/active_connection.py index 062c78a..b235e8b 100644 --- a/libnmstate/nm/active_connection.py +++ b/libnmstate/nm/active_connection.py @@ -21,12 +21,14 @@ import logging from libnmstate.error import NmstateLibnmError +from .common import Gio from .common import GLib from .common import GObject from .common import NM NM_AC_STATE_CHANGED_SIGNAL = "state-changed" +FALLBACK_CHECKER_INTERNAL = 15 class ActivationError(Exception): @@ -37,6 +39,8 @@ class ActiveConnection: def __init__(self, context=None, nm_ac_con=None): self._ctx = context self._act_con = nm_ac_con + self._signal_handler = None + self._fallback_checker = None nmdevs = None if nm_ac_con: @@ -75,19 +79,35 @@ class ActiveConnection: action = f"Deactivate profile: {self.devname}" self._ctx.register_async(action) - handler_id = act_connection.connect( + self._signal_handler = act_connection.connect( NM_AC_STATE_CHANGED_SIGNAL, self._wait_state_changed_callback, action, ) if act_connection.props.state != NM.ActiveConnectionState.DEACTIVATING: - user_data = (handler_id, action) + user_data = action self._ctx.client.deactivate_connection_async( act_connection, self._ctx.cancellable, self._deactivate_connection_callback, user_data, ) + self._fallback_checker = GLib.timeout_source_new( + FALLBACK_CHECKER_INTERNAL * 1000 + ) + self._fallback_checker.set_callback( + self._deactivation_fallback_checker_callback, action + ) + self._fallback_checker.attach(self._ctx.context) + + def _clean_up(self): + if self._signal_handler: + if self._act_con: + self._act_con.handler_disconnect(self._signal_handler) + self._signal_handler = None + if self._fallback_checker: + self._fallback_checker.destroy() + self._fallback_checker = None def _wait_state_changed_callback(self, act_con, state, reason, action): if self._ctx.is_cancelled(): @@ -96,13 +116,13 @@ class ActiveConnection: logging.debug( "Connection deactivation succeeded on %s", self.devname, ) + self._clean_up() self._ctx.finish_async(action) def _deactivate_connection_callback(self, src_object, result, user_data): - handler_id, action = user_data + action = user_data if self._ctx.is_cancelled(): - if self._act_con: - self._act_con.handler_disconnect(handler_id) + self._clean_up() return try: @@ -116,16 +136,20 @@ class ActiveConnection: "Connection is not active on {}, no need to " "deactivate".format(self.devname) ) + elif e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT): + logging.debug( + f"{action} timeout, using fallback method to wait profile " + "deactivation" + ) + return else: - if self._act_con: - self._act_con.handler_disconnect(handler_id) + self._clean_up() self._ctx.fail( NmstateLibnmError(f"{action} failed: error={e}") ) return except Exception as e: - if self._act_con: - self._act_con.handler_disconnect(handler_id) + self._clean_up() self._ctx.fail( NmstateLibnmError( f"BUG: Unexpected error when activating {self.devname} " @@ -135,8 +159,7 @@ class ActiveConnection: return if not success: - if self._act_con: - self._act_con.handler_disconnect(handler_id) + self._clean_up() self._ctx.fail( NmstateLibnmError( f"{action} failed: error='None returned from " @@ -144,6 +167,22 @@ class ActiveConnection: ) ) + def _deactivation_fallback_checker_callback(self, action): + if self.devname: + self._nmdev = self._ctx.get_nm_dev(self.devname) + if self._nmdev: + self._act_con = self._nmdev.get_active_connection() + if ( + self._act_con + and self._act_con.props.state + != NM.ActiveConnectionState.DEACTIVATED + ): + return GLib.SOURCE_CONTINUE + + self._clean_up() + self._ctx.finish_async(action) + return GLib.SOURCE_REMOVE + @property def nm_active_connection(self): return self._act_con diff --git a/libnmstate/nm/connection.py b/libnmstate/nm/connection.py index 374a379..af7296f 100644 --- a/libnmstate/nm/connection.py +++ b/libnmstate/nm/connection.py @@ -144,6 +144,13 @@ class ConnectionProfile: self._delete_connection_callback, user_data, ) + self._fallback_checker = GLib.timeout_source_new( + FALLBACK_CHECKER_INTERNAL * 1000 + ) + self._fallback_checker.set_callback( + self._delete_fallback_checker_callback, action + ) + self._fallback_checker.attach(self._ctx.context) def activate(self): if self.con_id: @@ -504,11 +511,24 @@ class ConnectionProfile: action = user_data try: success = src_object.delete_finish(result) + except GLib.Error as e: + if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT): + logging.debug( + f"{action} timeout, using fallback method to wait profile " + "deletion" + ) + return + else: + self._ctx.fail( + NmstateLibnmError(f"{action} failed with error: {e}") + ) + return except Exception as e: self._ctx.fail(NmstateLibnmError(f"{action} failed: error={e}")) return if success: + self._fallback_checker_cleanup() self._ctx.finish_async(action) else: self._ctx.fail( @@ -518,6 +538,16 @@ class ConnectionProfile: ) ) + def _delete_fallback_checker_callback(self, action): + if self.profile: + for nm_profile in self._ctx.client.get_connections(): + if nm_profile.get_uuid() == self.profile.get_uuid(): + return GLib.SOURCE_CONTINUE + + self._fallback_checker_cleanup() + self._ctx.finish_async(action) + return GLib.SOURCE_REMOVE + def _reset_profile(self): self._con_profile = None -- 2.27.0