088c30
From ca5b83cee7b45bf56eec258db739cb5fe51b3231 Mon Sep 17 00:00:00 2001
088c30
From: aswinrajamannar <39812128+aswinrajamannar@users.noreply.github.com>
088c30
Date: Mon, 26 Apr 2021 07:28:39 -0700
088c30
Subject: [PATCH 6/7] Azure: Retry net metadata during nic attach for
088c30
 non-timeout errs (#878)
088c30
088c30
RH-Author: Eduardo Otubo <otubo@redhat.com>
088c30
RH-MergeRequest: 45: Add support for userdata on Azure from IMDS
088c30
RH-Commit: [6/7] 4e6e44f017d5ffcb72ac8959a94f80c71fef9560
088c30
RH-Bugzilla: 2023940
088c30
RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
088c30
RH-Acked-by: Mohamed Gamal Morsy <mmorsy@redhat.com>
088c30
088c30
When network interfaces are hot-attached to the VM, attempting to get
088c30
network metadata might return 410 (or 500, 503 etc) because the info
088c30
is not yet available. In those cases, we retry getting the metadata
088c30
before giving up. The only case where we can move on to wait for more
088c30
nic attach events is if the call times out despite retries, which
088c30
means the interface is not likely a primary interface, and we should
088c30
try for more nic attach events.
088c30
---
088c30
 cloudinit/sources/DataSourceAzure.py          | 65 +++++++++++--
088c30
 tests/unittests/test_datasource/test_azure.py | 95 ++++++++++++++++---
088c30
 2 files changed, 140 insertions(+), 20 deletions(-)
088c30
088c30
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
088c30
index 6d7954ee..d0be6d84 100755
088c30
--- a/cloudinit/sources/DataSourceAzure.py
088c30
+++ b/cloudinit/sources/DataSourceAzure.py
088c30
@@ -17,6 +17,7 @@ from time import sleep
088c30
 from xml.dom import minidom
088c30
 import xml.etree.ElementTree as ET
088c30
 from enum import Enum
088c30
+import requests
088c30
 
088c30
 from cloudinit import dmi
088c30
 from cloudinit import log as logging
088c30
@@ -665,7 +666,9 @@ class DataSourceAzure(sources.DataSource):
088c30
             self,
088c30
             fallback_nic,
088c30
             retries,
088c30
-            md_type=metadata_type.compute):
088c30
+            md_type=metadata_type.compute,
088c30
+            exc_cb=retry_on_url_exc,
088c30
+            infinite=False):
088c30
         """
088c30
         Wrapper for get_metadata_from_imds so that we can have flexibility
088c30
         in which IMDS api-version we use. If a particular instance of IMDS
088c30
@@ -685,7 +688,8 @@ class DataSourceAzure(sources.DataSource):
088c30
                         fallback_nic=fallback_nic,
088c30
                         retries=0,
088c30
                         md_type=md_type,
088c30
-                        api_version=IMDS_VER_WANT
088c30
+                        api_version=IMDS_VER_WANT,
088c30
+                        exc_cb=exc_cb
088c30
                     )
088c30
                 except UrlError as err:
088c30
                     LOG.info(
088c30
@@ -708,7 +712,9 @@ class DataSourceAzure(sources.DataSource):
088c30
             fallback_nic=fallback_nic,
088c30
             retries=retries,
088c30
             md_type=md_type,
088c30
-            api_version=IMDS_VER_MIN
088c30
+            api_version=IMDS_VER_MIN,
088c30
+            exc_cb=exc_cb,
088c30
+            infinite=infinite
088c30
         )
088c30
 
088c30
     def device_name_to_device(self, name):
088c30
@@ -938,6 +944,9 @@ class DataSourceAzure(sources.DataSource):
088c30
         is_primary = False
088c30
         expected_nic_count = -1
088c30
         imds_md = None
088c30
+        metadata_poll_count = 0
088c30
+        metadata_logging_threshold = 1
088c30
+        metadata_timeout_count = 0
088c30
 
088c30
         # For now, only a VM's primary NIC can contact IMDS and WireServer. If
088c30
         # DHCP fails for a NIC, we have no mechanism to determine if the NIC is
088c30
@@ -962,14 +971,48 @@ class DataSourceAzure(sources.DataSource):
088c30
                                     % (ifname, e), logger_func=LOG.error)
088c30
             raise
088c30
 
088c30
+        # Retry polling network metadata for a limited duration only when the
088c30
+        # calls fail due to timeout. This is because the platform drops packets
088c30
+        # going towards IMDS when it is not a primary nic. If the calls fail
088c30
+        # due to other issues like 410, 503 etc, then it means we are primary
088c30
+        # but IMDS service is unavailable at the moment. Retry indefinitely in
088c30
+        # those cases since we cannot move on without the network metadata.
088c30
+        def network_metadata_exc_cb(msg, exc):
088c30
+            nonlocal metadata_timeout_count, metadata_poll_count
088c30
+            nonlocal metadata_logging_threshold
088c30
+
088c30
+            metadata_poll_count = metadata_poll_count + 1
088c30
+
088c30
+            # Log when needed but back off exponentially to avoid exploding
088c30
+            # the log file.
088c30
+            if metadata_poll_count >= metadata_logging_threshold:
088c30
+                metadata_logging_threshold *= 2
088c30
+                report_diagnostic_event(
088c30
+                    "Ran into exception when attempting to reach %s "
088c30
+                    "after %d polls." % (msg, metadata_poll_count),
088c30
+                    logger_func=LOG.error)
088c30
+
088c30
+                if isinstance(exc, UrlError):
088c30
+                    report_diagnostic_event("poll IMDS with %s failed. "
088c30
+                                            "Exception: %s and code: %s" %
088c30
+                                            (msg, exc.cause, exc.code),
088c30
+                                            logger_func=LOG.error)
088c30
+
088c30
+            if exc.cause and isinstance(exc.cause, requests.Timeout):
088c30
+                metadata_timeout_count = metadata_timeout_count + 1
088c30
+                return (metadata_timeout_count <= 10)
088c30
+            return True
088c30
+
088c30
         # Primary nic detection will be optimized in the future. The fact that
088c30
         # primary nic is being attached first helps here. Otherwise each nic
088c30
         # could add several seconds of delay.
088c30
         try:
088c30
             imds_md = self.get_imds_data_with_api_fallback(
088c30
                 ifname,
088c30
-                5,
088c30
-                metadata_type.network
088c30
+                0,
088c30
+                metadata_type.network,
088c30
+                network_metadata_exc_cb,
088c30
+                True
088c30
             )
088c30
         except Exception as e:
088c30
             LOG.warning(
088c30
@@ -2139,7 +2182,9 @@ def _generate_network_config_from_fallback_config() -> dict:
088c30
 def get_metadata_from_imds(fallback_nic,
088c30
                            retries,
088c30
                            md_type=metadata_type.compute,
088c30
-                           api_version=IMDS_VER_MIN):
088c30
+                           api_version=IMDS_VER_MIN,
088c30
+                           exc_cb=retry_on_url_exc,
088c30
+                           infinite=False):
088c30
     """Query Azure's instance metadata service, returning a dictionary.
088c30
 
088c30
     If network is not up, setup ephemeral dhcp on fallback_nic to talk to the
088c30
@@ -2158,7 +2203,7 @@ def get_metadata_from_imds(fallback_nic,
088c30
     kwargs = {'logfunc': LOG.debug,
088c30
               'msg': 'Crawl of Azure Instance Metadata Service (IMDS)',
088c30
               'func': _get_metadata_from_imds,
088c30
-              'args': (retries, md_type, api_version,)}
088c30
+              'args': (retries, exc_cb, md_type, api_version, infinite)}
088c30
     if net.is_up(fallback_nic):
088c30
         return util.log_time(**kwargs)
088c30
     else:
088c30
@@ -2176,14 +2221,16 @@ def get_metadata_from_imds(fallback_nic,
088c30
 @azure_ds_telemetry_reporter
088c30
 def _get_metadata_from_imds(
088c30
         retries,
088c30
+        exc_cb,
088c30
         md_type=metadata_type.compute,
088c30
-        api_version=IMDS_VER_MIN):
088c30
+        api_version=IMDS_VER_MIN,
088c30
+        infinite=False):
088c30
     url = "{}?api-version={}".format(md_type.value, api_version)
088c30
     headers = {"Metadata": "true"}
088c30
     try:
088c30
         response = readurl(
088c30
             url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers,
088c30
-            retries=retries, exception_cb=retry_on_url_exc)
088c30
+            retries=retries, exception_cb=exc_cb, infinite=infinite)
088c30
     except Exception as e:
088c30
         # pylint:disable=no-member
088c30
         if isinstance(e, UrlError) and e.code == 400:
088c30
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
088c30
index d9817d84..c4a8e08d 100644
088c30
--- a/tests/unittests/test_datasource/test_azure.py
088c30
+++ b/tests/unittests/test_datasource/test_azure.py
088c30
@@ -448,7 +448,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
088c30
             "http://169.254.169.254/metadata/instance?api-version="
088c30
             "2019-06-01", exception_cb=mock.ANY,
088c30
             headers=mock.ANY, retries=mock.ANY,
088c30
-            timeout=mock.ANY)
088c30
+            timeout=mock.ANY, infinite=False)
088c30
 
088c30
     @mock.patch(MOCKPATH + 'readurl', autospec=True)
088c30
     @mock.patch(MOCKPATH + 'EphemeralDHCPv4')
088c30
@@ -467,7 +467,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
088c30
             "http://169.254.169.254/metadata/instance/network?api-version="
088c30
             "2019-06-01", exception_cb=mock.ANY,
088c30
             headers=mock.ANY, retries=mock.ANY,
088c30
-            timeout=mock.ANY)
088c30
+            timeout=mock.ANY, infinite=False)
088c30
 
088c30
     @mock.patch(MOCKPATH + 'readurl', autospec=True)
088c30
     @mock.patch(MOCKPATH + 'EphemeralDHCPv4')
088c30
@@ -486,7 +486,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
088c30
             "http://169.254.169.254/metadata/instance?api-version="
088c30
             "2019-06-01", exception_cb=mock.ANY,
088c30
             headers=mock.ANY, retries=mock.ANY,
088c30
-            timeout=mock.ANY)
088c30
+            timeout=mock.ANY, infinite=False)
088c30
 
088c30
     @mock.patch(MOCKPATH + 'readurl', autospec=True)
088c30
     @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting', autospec=True)
088c30
@@ -511,7 +511,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
088c30
         m_readurl.assert_called_with(
088c30
             self.network_md_url, exception_cb=mock.ANY,
088c30
             headers={'Metadata': 'true'}, retries=2,
088c30
-            timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS)
088c30
+            timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS, infinite=False)
088c30
 
088c30
     @mock.patch('cloudinit.url_helper.time.sleep')
088c30
     @mock.patch(MOCKPATH + 'net.is_up', autospec=True)
088c30
@@ -2694,15 +2694,22 @@ class TestPreprovisioningHotAttachNics(CiTestCase):
088c30
 
088c30
         def nic_attach_ret(nl_sock, nics_found):
088c30
             nonlocal m_attach_call_count
088c30
-            if m_attach_call_count == 0:
088c30
-                m_attach_call_count = m_attach_call_count + 1
088c30
+            m_attach_call_count = m_attach_call_count + 1
088c30
+            if m_attach_call_count == 1:
088c30
                 return "eth0"
088c30
-            return "eth1"
088c30
+            elif m_attach_call_count == 2:
088c30
+                return "eth1"
088c30
+            raise RuntimeError("Must have found primary nic by now.")
088c30
+
088c30
+        # Simulate two NICs by adding the same one twice.
088c30
+        md = {
088c30
+            "interface": [
088c30
+                IMDS_NETWORK_METADATA['interface'][0],
088c30
+                IMDS_NETWORK_METADATA['interface'][0]
088c30
+            ]
088c30
+        }
088c30
 
088c30
-        def network_metadata_ret(ifname, retries, type):
088c30
-            # Simulate two NICs by adding the same one twice.
088c30
-            md = IMDS_NETWORK_METADATA
088c30
-            md['interface'].append(md['interface'][0])
088c30
+        def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
088c30
             if ifname == "eth0":
088c30
                 return md
088c30
             raise requests.Timeout('Fake connection timeout')
088c30
@@ -2724,6 +2731,72 @@ class TestPreprovisioningHotAttachNics(CiTestCase):
088c30
         self.assertEqual(1, m_imds.call_count)
088c30
         self.assertEqual(2, m_link_up.call_count)
088c30
 
088c30
+    @mock.patch(MOCKPATH + 'DataSourceAzure.get_imds_data_with_api_fallback')
088c30
+    @mock.patch(MOCKPATH + 'EphemeralDHCPv4')
088c30
+    def test_check_if_nic_is_primary_retries_on_failures(
088c30
+            self, m_dhcpv4, m_imds):
088c30
+        """Retry polling for network metadata on all failures except timeout"""
088c30
+        dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths)
088c30
+        lease = {
088c30
+            'interface': 'eth9', 'fixed-address': '192.168.2.9',
088c30
+            'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0',
088c30
+            'unknown-245': '624c3620'}
088c30
+
088c30
+        eth0Retries = []
088c30
+        eth1Retries = []
088c30
+        # Simulate two NICs by adding the same one twice.
088c30
+        md = {
088c30
+            "interface": [
088c30
+                IMDS_NETWORK_METADATA['interface'][0],
088c30
+                IMDS_NETWORK_METADATA['interface'][0]
088c30
+            ]
088c30
+        }
088c30
+
088c30
+        def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
088c30
+            nonlocal eth0Retries, eth1Retries
088c30
+
088c30
+            # Simulate readurl functionality with retries and
088c30
+            # exception callbacks so that the callback logic can be
088c30
+            # validated.
088c30
+            if ifname == "eth0":
088c30
+                cause = requests.HTTPError()
088c30
+                for _ in range(0, 15):
088c30
+                    error = url_helper.UrlError(cause=cause, code=410)
088c30
+                    eth0Retries.append(exc_cb("No goal state.", error))
088c30
+            else:
088c30
+                cause = requests.Timeout('Fake connection timeout')
088c30
+                for _ in range(0, 10):
088c30
+                    error = url_helper.UrlError(cause=cause)
088c30
+                    eth1Retries.append(exc_cb("Connection timeout", error))
088c30
+                # Should stop retrying after 10 retries
088c30
+                eth1Retries.append(exc_cb("Connection timeout", error))
088c30
+                raise cause
088c30
+            return md
088c30
+
088c30
+        m_imds.side_effect = network_metadata_ret
088c30
+
088c30
+        dhcp_ctx = mock.MagicMock(lease=lease)
088c30
+        dhcp_ctx.obtain_lease.return_value = lease
088c30
+        m_dhcpv4.return_value = dhcp_ctx
088c30
+
088c30
+        is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth0")
088c30
+        self.assertEqual(True, is_primary)
088c30
+        self.assertEqual(2, expected_nic_count)
088c30
+
088c30
+        # All Eth0 errors are non-timeout errors. So we should have been
088c30
+        # retrying indefinitely until success.
088c30
+        for i in eth0Retries:
088c30
+            self.assertTrue(i)
088c30
+
088c30
+        is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth1")
088c30
+        self.assertEqual(False, is_primary)
088c30
+
088c30
+        # All Eth1 errors are timeout errors. Retry happens for a max of 10 and
088c30
+        # then we should have moved on assuming it is not the primary nic.
088c30
+        for i in range(0, 10):
088c30
+            self.assertTrue(eth1Retries[i])
088c30
+        self.assertFalse(eth1Retries[10])
088c30
+
088c30
     @mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up')
088c30
     def test_wait_for_link_up_returns_if_already_up(
088c30
             self, m_is_link_up):
088c30
-- 
088c30
2.27.0
088c30