Blob Blame History Raw
From 83e17432645b9e959c82ffe9c86d20fa183bc5ef Mon Sep 17 00:00:00 2001
From: Daniel Watkins <oddbloke@ubuntu.com>
Date: Mon, 8 Mar 2021 12:50:57 -0500
Subject: [PATCH 2/2] net: exclude OVS internal interfaces in get_interfaces
 (#829)

RH-Author: Eduardo Otubo (otubo)
RH-MergeRequest: 6: Patch series to fix "Bug 1957135 - Intermittent failure to start cloud-init due to failure to detect macs"
RH-Commit: [2/2] d401dc64a7ceeecb091a792aa24de334940a3750 (otubo/cloud-init)
RH-Bugzilla: 1957135
RH-Acked-by: Mohammed Gamal <mmorsy@redhat.com>
RH-Acked-by: Emanuele Giuseppe Esposito <[eesposit@redhat.com](mailto:eesposit@redhat.com>

commit 121bc04cdf0e6732fe143b7419131dc250c13384
Author: Daniel Watkins <oddbloke@ubuntu.com>
Date:   Mon Mar 8 12:50:57 2021 -0500

    net: exclude OVS internal interfaces in get_interfaces (#829)

    `get_interfaces` is used to in two ways, broadly: firstly, to determine
    the available interfaces when converting cloud network configuration
    formats to cloud-init's network configuration formats; and, secondly, to
    ensure that any interfaces which are specified in network configuration
    are (a) available, and (b) named correctly.  The first of these is
    unaffected by this commit, as no clouds support Open vSwitch
    configuration in their network configuration formats.

    For the second, we check that MAC addresses of physical devices are
    unique.  In some OVS configurations, there are OVS-created devices which
    have duplicate MAC addresses, either with each other or with physical
    devices.  As these interfaces are created by OVS, we can be confident
    that (a) they will be available when appropriate, and (b) that OVS will
    name them correctly.  As such, this commit excludes any OVS-internal
    interfaces from the set of interfaces returned by `get_interfaces`.

    LP: #1912844

Signed-off-by: Eduardo Otubo <otubo@redhat.com>
---
 cloudinit/net/__init__.py                     |  62 +++++++++
 cloudinit/net/tests/test_init.py              | 119 ++++++++++++++++++
 .../sources/helpers/tests/test_openstack.py   |   5 +
 cloudinit/sources/tests/test_oracle.py        |   4 +
 .../integration_tests/bugs/test_lp1912844.py  | 103 +++++++++++++++
 .../test_datasource/test_configdrive.py       |   8 ++
 tests/unittests/test_net.py                   |  20 +++
 7 files changed, 321 insertions(+)
 create mode 100644 tests/integration_tests/bugs/test_lp1912844.py

diff --git a/cloudinit/net/__init__.py b/cloudinit/net/__init__.py
index 0aa58b27..2ff770e1 100644
--- a/cloudinit/net/__init__.py
+++ b/cloudinit/net/__init__.py
@@ -6,6 +6,7 @@
 # This file is part of cloud-init. See LICENSE file for license information.
 
 import errno
+import functools
 import ipaddress
 import logging
 import os
@@ -19,6 +20,19 @@ from cloudinit.url_helper import UrlError, readurl
 LOG = logging.getLogger(__name__)
 SYS_CLASS_NET = "/sys/class/net/"
 DEFAULT_PRIMARY_INTERFACE = 'eth0'
+OVS_INTERNAL_INTERFACE_LOOKUP_CMD = [
+    "ovs-vsctl",
+    "--format",
+    "csv",
+    "--no-headings",
+    "--timeout",
+    "10",
+    "--columns",
+    "name",
+    "find",
+    "interface",
+    "type=internal",
+]
 
 
 def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
@@ -133,6 +147,52 @@ def master_is_openvswitch(devname):
     return os.path.exists(ovs_path)
 
 
+@functools.lru_cache(maxsize=None)
+def openvswitch_is_installed() -> bool:
+    """Return a bool indicating if Open vSwitch is installed in the system."""
+    ret = bool(subp.which("ovs-vsctl"))
+    if not ret:
+        LOG.debug(
+            "ovs-vsctl not in PATH; not detecting Open vSwitch interfaces"
+        )
+    return ret
+
+
+@functools.lru_cache(maxsize=None)
+def get_ovs_internal_interfaces() -> list:
+    """Return a list of the names of OVS internal interfaces on the system.
+
+    These will all be strings, and are used to exclude OVS-specific interface
+    from cloud-init's network configuration handling.
+    """
+    try:
+        out, _err = subp.subp(OVS_INTERNAL_INTERFACE_LOOKUP_CMD)
+    except subp.ProcessExecutionError as exc:
+        if "database connection failed" in exc.stderr:
+            LOG.info(
+                "Open vSwitch is not yet up; no interfaces will be detected as"
+                " OVS-internal"
+            )
+            return []
+        raise
+    else:
+        return out.splitlines()
+
+
+def is_openvswitch_internal_interface(devname: str) -> bool:
+    """Returns True if this is an OVS internal interface.
+
+    If OVS is not installed or not yet running, this will return False.
+    """
+    if not openvswitch_is_installed():
+        return False
+    ovs_bridges = get_ovs_internal_interfaces()
+    if devname in ovs_bridges:
+        LOG.debug("Detected %s as an OVS interface", devname)
+        return True
+    return False
+
+
 def is_netfailover(devname, driver=None):
     """ netfailover driver uses 3 nics, master, primary and standby.
         this returns True if the device is either the primary or standby
@@ -877,6 +937,8 @@ def get_interfaces():
         # skip nics that have no mac (00:00....)
         if name != 'lo' and mac == zero_mac[:len(mac)]:
             continue
+        if is_openvswitch_internal_interface(name):
+            continue
         ret.append((name, mac, device_driver(name), device_devid(name)))
     return ret
 
diff --git a/cloudinit/net/tests/test_init.py b/cloudinit/net/tests/test_init.py
index 0535387a..946f8ee2 100644
--- a/cloudinit/net/tests/test_init.py
+++ b/cloudinit/net/tests/test_init.py
@@ -391,6 +391,10 @@ class TestGetDeviceList(CiTestCase):
         self.assertCountEqual(['eth0', 'eth1'], net.get_devicelist())
 
 
+@mock.patch(
+    "cloudinit.net.is_openvswitch_internal_interface",
+    mock.Mock(return_value=False),
+)
 class TestGetInterfaceMAC(CiTestCase):
 
     def setUp(self):
@@ -1224,6 +1228,121 @@ class TestNetFailOver(CiTestCase):
         self.assertFalse(net.is_netfailover(devname, driver))
 
 
+class TestOpenvswitchIsInstalled:
+    """Test cloudinit.net.openvswitch_is_installed.
+
+    Uses the ``clear_lru_cache`` local autouse fixture to allow us to test
+    despite the ``lru_cache`` decorator on the unit under test.
+    """
+
+    @pytest.fixture(autouse=True)
+    def clear_lru_cache(self):
+        net.openvswitch_is_installed.cache_clear()
+
+    @pytest.mark.parametrize(
+        "expected,which_return", [(True, "/some/path"), (False, None)]
+    )
+    @mock.patch("cloudinit.net.subp.which")
+    def test_mirrors_which_result(self, m_which, expected, which_return):
+        m_which.return_value = which_return
+        assert expected == net.openvswitch_is_installed()
+
+    @mock.patch("cloudinit.net.subp.which")
+    def test_only_calls_which_once(self, m_which):
+        net.openvswitch_is_installed()
+        net.openvswitch_is_installed()
+        assert 1 == m_which.call_count
+
+
+@mock.patch("cloudinit.net.subp.subp", return_value=("", ""))
+class TestGetOVSInternalInterfaces:
+    """Test cloudinit.net.get_ovs_internal_interfaces.
+
+    Uses the ``clear_lru_cache`` local autouse fixture to allow us to test
+    despite the ``lru_cache`` decorator on the unit under test.
+    """
+    @pytest.fixture(autouse=True)
+    def clear_lru_cache(self):
+        net.get_ovs_internal_interfaces.cache_clear()
+
+    def test_command_used(self, m_subp):
+        """Test we use the correct command when we call subp"""
+        net.get_ovs_internal_interfaces()
+
+        assert [
+            mock.call(net.OVS_INTERNAL_INTERFACE_LOOKUP_CMD)
+        ] == m_subp.call_args_list
+
+    def test_subp_contents_split_and_returned(self, m_subp):
+        """Test that the command output is appropriately mangled."""
+        stdout = "iface1\niface2\niface3\n"
+        m_subp.return_value = (stdout, "")
+
+        assert [
+            "iface1",
+            "iface2",
+            "iface3",
+        ] == net.get_ovs_internal_interfaces()
+
+    def test_database_connection_error_handled_gracefully(self, m_subp):
+        """Test that the error indicating OVS is down is handled gracefully."""
+        m_subp.side_effect = ProcessExecutionError(
+            stderr="database connection failed"
+        )
+
+        assert [] == net.get_ovs_internal_interfaces()
+
+    def test_other_errors_raised(self, m_subp):
+        """Test that only database connection errors are handled."""
+        m_subp.side_effect = ProcessExecutionError()
+
+        with pytest.raises(ProcessExecutionError):
+            net.get_ovs_internal_interfaces()
+
+    def test_only_runs_once(self, m_subp):
+        """Test that we cache the value."""
+        net.get_ovs_internal_interfaces()
+        net.get_ovs_internal_interfaces()
+
+        assert 1 == m_subp.call_count
+
+
+@mock.patch("cloudinit.net.get_ovs_internal_interfaces")
+@mock.patch("cloudinit.net.openvswitch_is_installed")
+class TestIsOpenVSwitchInternalInterface:
+    def test_false_if_ovs_not_installed(
+        self, m_openvswitch_is_installed, _m_get_ovs_internal_interfaces
+    ):
+        """Test that OVS' absence returns False."""
+        m_openvswitch_is_installed.return_value = False
+
+        assert not net.is_openvswitch_internal_interface("devname")
+
+    @pytest.mark.parametrize(
+        "detected_interfaces,devname,expected_return",
+        [
+            ([], "devname", False),
+            (["notdevname"], "devname", False),
+            (["devname"], "devname", True),
+            (["some", "other", "devices", "and", "ours"], "ours", True),
+        ],
+    )
+    def test_return_value_based_on_detected_interfaces(
+        self,
+        m_openvswitch_is_installed,
+        m_get_ovs_internal_interfaces,
+        detected_interfaces,
+        devname,
+        expected_return,
+    ):
+        """Test that the detected interfaces are used correctly."""
+        m_openvswitch_is_installed.return_value = True
+        m_get_ovs_internal_interfaces.return_value = detected_interfaces
+        assert expected_return == net.is_openvswitch_internal_interface(
+            devname
+        )
+
+
 class TestIsIpAddress:
     """Tests for net.is_ip_address.
 
diff --git a/cloudinit/sources/helpers/tests/test_openstack.py b/cloudinit/sources/helpers/tests/test_openstack.py
index 2bde1e3f..95fb9743 100644
--- a/cloudinit/sources/helpers/tests/test_openstack.py
+++ b/cloudinit/sources/helpers/tests/test_openstack.py
@@ -1,10 +1,15 @@
 # This file is part of cloud-init. See LICENSE file for license information.
 # ./cloudinit/sources/helpers/tests/test_openstack.py
+from unittest import mock
 
 from cloudinit.sources.helpers import openstack
 from cloudinit.tests import helpers as test_helpers
 
 
+@mock.patch(
+    "cloudinit.net.is_openvswitch_internal_interface",
+    mock.Mock(return_value=False)
+)
 class TestConvertNetJson(test_helpers.CiTestCase):
 
     def test_phy_types(self):
diff --git a/cloudinit/sources/tests/test_oracle.py b/cloudinit/sources/tests/test_oracle.py
index 7bd23813..902d1e40 100644
--- a/cloudinit/sources/tests/test_oracle.py
+++ b/cloudinit/sources/tests/test_oracle.py
@@ -173,6 +173,10 @@ class TestIsPlatformViable(test_helpers.CiTestCase):
         m_read_dmi_data.assert_has_calls([mock.call('chassis-asset-tag')])
 
 
+@mock.patch(
+    "cloudinit.net.is_openvswitch_internal_interface",
+    mock.Mock(return_value=False)
+)
 class TestNetworkConfigFromOpcImds:
     def test_no_secondary_nics_does_not_mutate_input(self, oracle_ds):
         oracle_ds._vnics_data = [{}]
diff --git a/tests/integration_tests/bugs/test_lp1912844.py b/tests/integration_tests/bugs/test_lp1912844.py
new file mode 100644
index 00000000..efafae50
--- /dev/null
+++ b/tests/integration_tests/bugs/test_lp1912844.py
@@ -0,0 +1,103 @@
+"""Integration test for LP: #1912844
+
+cloud-init should ignore OVS-internal interfaces when performing its own
+interface determination: these interfaces are handled fully by OVS, so
+cloud-init should never need to touch them.
+
+This test is a semi-synthetic reproducer for the bug.  It uses a similar
+network configuration, tweaked slightly to DHCP in a way that will succeed even
+on "failed" boots.  The exact bug doesn't reproduce with the NoCloud
+datasource, because it runs at init-local time (whereas the MAAS datasource,
+from the report, runs only at init (network) time): this means that the
+networking code runs before OVS creates its interfaces (which happens after
+init-local but, of course, before networking is up), and so doesn't generate
+the traceback that they cause.  We work around this by calling
+``get_interfaces_by_mac` directly in the test code.
+"""
+import pytest
+
+from tests.integration_tests import random_mac_address
+
+MAC_ADDRESS = random_mac_address()
+
+NETWORK_CONFIG = """\
+bonds:
+    bond0:
+        interfaces:
+            - enp5s0
+        macaddress: {0}
+        mtu: 1500
+bridges:
+        ovs-br:
+            interfaces:
+            - bond0
+            macaddress: {0}
+            mtu: 1500
+            openvswitch: {{}}
+            dhcp4: true
+ethernets:
+    enp5s0:
+      mtu: 1500
+      set-name: enp5s0
+      match:
+          macaddress: {0}
+version: 2
+vlans:
+  ovs-br.100:
+    id: 100
+    link: ovs-br
+    mtu: 1500
+  ovs-br.200:
+    id: 200
+    link: ovs-br
+    mtu: 1500
+""".format(MAC_ADDRESS)
+
+
+SETUP_USER_DATA = """\
+#cloud-config
+packages:
+- openvswitch-switch
+"""
+
+
+@pytest.fixture
+def ovs_enabled_session_cloud(session_cloud):
+    """A session_cloud wrapper, to use an OVS-enabled image for tests.
+
+    This implementation is complicated by wanting to use ``session_cloud``s
+    snapshot cleanup/retention logic, to avoid having to reimplement that here.
+    """
+    old_snapshot_id = session_cloud.snapshot_id
+    with session_cloud.launch(
+        user_data=SETUP_USER_DATA,
+    ) as instance:
+        instance.instance.clean()
+        session_cloud.snapshot_id = instance.snapshot()
+
+    yield session_cloud
+
+    try:
+        session_cloud.delete_snapshot()
+    finally:
+        session_cloud.snapshot_id = old_snapshot_id
+
+
+@pytest.mark.lxd_vm
+def test_get_interfaces_by_mac_doesnt_traceback(ovs_enabled_session_cloud):
+    """Launch our OVS-enabled image and confirm the bug doesn't reproduce."""
+    launch_kwargs = {
+        "config_dict": {
+            "user.network-config": NETWORK_CONFIG,
+            "volatile.eth0.hwaddr": MAC_ADDRESS,
+        },
+    }
+    with ovs_enabled_session_cloud.launch(
+        launch_kwargs=launch_kwargs,
+    ) as client:
+        result = client.execute(
+            "python3 -c"
+            "'from cloudinit.net import get_interfaces_by_mac;"
+            "get_interfaces_by_mac()'"
+        )
+        assert result.ok
diff --git a/tests/unittests/test_datasource/test_configdrive.py b/tests/unittests/test_datasource/test_configdrive.py
index 6f830cc6..2e2b7847 100644
--- a/tests/unittests/test_datasource/test_configdrive.py
+++ b/tests/unittests/test_datasource/test_configdrive.py
@@ -494,6 +494,10 @@ class TestConfigDriveDataSource(CiTestCase):
         self.assertEqual('config-disk (/dev/anything)', cfg_ds.subplatform)
 
 
+@mock.patch(
+    "cloudinit.net.is_openvswitch_internal_interface",
+    mock.Mock(return_value=False)
+)
 class TestNetJson(CiTestCase):
     def setUp(self):
         super(TestNetJson, self).setUp()
@@ -654,6 +658,10 @@ class TestNetJson(CiTestCase):
             self.assertEqual(out_data, conv_data)
 
 
+@mock.patch(
+    "cloudinit.net.is_openvswitch_internal_interface",
+    mock.Mock(return_value=False)
+)
 class TestConvertNetworkData(CiTestCase):
 
     with_logs = True
diff --git a/tests/unittests/test_net.py b/tests/unittests/test_net.py
index 844d5ba8..3607c5e3 100644
--- a/tests/unittests/test_net.py
+++ b/tests/unittests/test_net.py
@@ -2825,6 +2825,10 @@ iface eth1 inet dhcp
         self.assertEqual(0, mock_settle.call_count)
 
 
+@mock.patch(
+    "cloudinit.net.is_openvswitch_internal_interface",
+    mock.Mock(return_value=False)
+)
 class TestRhelSysConfigRendering(CiTestCase):
 
     with_logs = True
@@ -3495,6 +3499,10 @@ USERCTL=no
                 expected, self._render_and_read(network_config=v2data))
 
 
+@mock.patch(
+    "cloudinit.net.is_openvswitch_internal_interface",
+    mock.Mock(return_value=False)
+)
 class TestOpenSuseSysConfigRendering(CiTestCase):
 
     with_logs = True
@@ -4859,6 +4867,10 @@ class TestNetRenderers(CiTestCase):
             self.assertTrue(result)
 
 
+@mock.patch(
+    "cloudinit.net.is_openvswitch_internal_interface",
+    mock.Mock(return_value=False)
+)
 class TestGetInterfaces(CiTestCase):
     _data = {'bonds': ['bond1'],
              'bridges': ['bridge1'],
@@ -5008,6 +5020,10 @@ class TestInterfaceHasOwnMac(CiTestCase):
         self.assertFalse(interface_has_own_mac("eth0"))
 
 
+@mock.patch(
+    "cloudinit.net.is_openvswitch_internal_interface",
+    mock.Mock(return_value=False)
+)
 class TestGetInterfacesByMac(CiTestCase):
     _data = {'bonds': ['bond1'],
              'bridges': ['bridge1'],
@@ -5164,6 +5180,10 @@ class TestInterfacesSorting(CiTestCase):
             ['enp0s3', 'enp0s8', 'enp0s13', 'enp1s2', 'enp2s0', 'enp2s3'])
 
 
+@mock.patch(
+    "cloudinit.net.is_openvswitch_internal_interface",
+    mock.Mock(return_value=False)
+)
 class TestGetIBHwaddrsByInterface(CiTestCase):
 
     _ib_addr = '80:00:00:28:fe:80:00:00:00:00:00:00:00:11:22:03:00:33:44:56'
-- 
2.27.0