738010
From c1c6d0b586e5556a37b9b813afbb4e6a24921adf Mon Sep 17 00:00:00 2001
738010
From: Eduardo Otubo <otubo@redhat.com>
738010
Date: Wed, 23 Jan 2019 12:30:21 +0100
738010
Subject: net: Wait for dhclient to daemonize before reading lease file
738010
738010
RH-Author: Eduardo Otubo <otubo@redhat.com>
738010
Message-id: <20190123123021.32708-1-otubo@redhat.com>
738010
Patchwork-id: 84095
738010
O-Subject: [RHEL-7.7 cloud-init PATCH] net: Wait for dhclient to daemonize before reading lease file
738010
Bugzilla: 1632967
738010
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
738010
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
738010
738010
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1632967
738010
Brew: https://bugzilla.redhat.com/show_bug.cgi?id=1632967
738010
Tested: Me and upstream
738010
738010
commit fdadcb5fae51f4e6799314ab98e3aec56c79b17c
738010
Author: Jason Zions <jasonzio@microsoft.com>
738010
Date:   Tue Jan 15 21:37:17 2019 +0000
738010
738010
    net: Wait for dhclient to daemonize before reading lease file
738010
738010
    cloud-init uses dhclient to fetch the DHCP lease so it can extract
738010
    DHCP options.  dhclient creates the leasefile, then writes to it;
738010
    simply waiting for the leasefile to appear creates a race between
738010
    dhclient and cloud-init. Instead, wait for dhclient to be parented by
738010
    init. At that point, we know it has written to the leasefile, so it's
738010
    safe to copy the file and kill the process.
738010
738010
    cloud-init creates a temporary directory in which to execute dhclient,
738010
    and deletes that directory after it has killed the process. If
738010
    cloud-init abandons waiting for dhclient to daemonize, it will still
738010
    attempt to delete the temporary directory, but will not report an
738010
    exception should that attempt fail.
738010
738010
    LP: #1794399
738010
738010
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
738010
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
738010
---
738010
 cloudinit/net/dhcp.py              | 44 +++++++++++++++++++++++++++-----------
738010
 cloudinit/net/tests/test_dhcp.py   | 15 ++++++++++---
738010
 cloudinit/temp_utils.py            |  4 ++--
738010
 cloudinit/tests/test_temp_utils.py | 18 +++++++++++++++-
738010
 cloudinit/util.py                  | 16 +++++++++++++-
738010
 tests/unittests/test_util.py       |  6 ++++++
738010
 6 files changed, 83 insertions(+), 20 deletions(-)
738010
738010
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py
738010
index 0db991d..c98a97c 100644
738010
--- a/cloudinit/net/dhcp.py
738010
+++ b/cloudinit/net/dhcp.py
738010
@@ -9,6 +9,7 @@ import logging
738010
 import os
738010
 import re
738010
 import signal
738010
+import time
738010
 
738010
 from cloudinit.net import (
738010
     EphemeralIPv4Network, find_fallback_nic, get_devicelist,
738010
@@ -127,7 +128,9 @@ def maybe_perform_dhcp_discovery(nic=None):
738010
     if not dhclient_path:
738010
         LOG.debug('Skip dhclient configuration: No dhclient command found.')
738010
         return []
738010
-    with temp_utils.tempdir(prefix='cloud-init-dhcp-', needs_exe=True) as tdir:
738010
+    with temp_utils.tempdir(rmtree_ignore_errors=True,
738010
+                            prefix='cloud-init-dhcp-',
738010
+                            needs_exe=True) as tdir:
738010
         # Use /var/tmp because /run/cloud-init/tmp is mounted noexec
738010
         return dhcp_discovery(dhclient_path, nic, tdir)
738010
 
738010
@@ -195,24 +198,39 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
738010
            '-pf', pid_file, interface, '-sf', '/bin/true']
738010
     util.subp(cmd, capture=True)
738010
 
738010
-    # dhclient doesn't write a pid file until after it forks when it gets a
738010
-    # proper lease response. Since cleandir is a temp directory that gets
738010
-    # removed, we need to wait for that pidfile creation before the
738010
-    # cleandir is removed, otherwise we get FileNotFound errors.
738010
+    # Wait for pid file and lease file to appear, and for the process
738010
+    # named by the pid file to daemonize (have pid 1 as its parent). If we
738010
+    # try to read the lease file before daemonization happens, we might try
738010
+    # to read it before the dhclient has actually written it. We also have
738010
+    # to wait until the dhclient has become a daemon so we can be sure to
738010
+    # kill the correct process, thus freeing cleandir to be deleted back
738010
+    # up the callstack.
738010
     missing = util.wait_for_files(
738010
         [pid_file, lease_file], maxwait=5, naplen=0.01)
738010
     if missing:
738010
         LOG.warning("dhclient did not produce expected files: %s",
738010
                     ', '.join(os.path.basename(f) for f in missing))
738010
         return []
738010
-    pid_content = util.load_file(pid_file).strip()
738010
-    try:
738010
-        pid = int(pid_content)
738010
-    except ValueError:
738010
-        LOG.debug(
738010
-            "pid file contains non-integer content '%s'", pid_content)
738010
-    else:
738010
-        os.kill(pid, signal.SIGKILL)
738010
+
738010
+    ppid = 'unknown'
738010
+    for _ in range(0, 1000):
738010
+        pid_content = util.load_file(pid_file).strip()
738010
+        try:
738010
+            pid = int(pid_content)
738010
+        except ValueError:
738010
+            pass
738010
+        else:
738010
+            ppid = util.get_proc_ppid(pid)
738010
+            if ppid == 1:
738010
+                LOG.debug('killing dhclient with pid=%s', pid)
738010
+                os.kill(pid, signal.SIGKILL)
738010
+                return parse_dhcp_lease_file(lease_file)
738010
+        time.sleep(0.01)
738010
+
738010
+    LOG.error(
738010
+        'dhclient(pid=%s, parentpid=%s) failed to daemonize after %s seconds',
738010
+        pid_content, ppid, 0.01 * 1000
738010
+    )
738010
     return parse_dhcp_lease_file(lease_file)
738010
 
738010
 
738010
diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py
738010
index cd3e732..79e8842 100644
738010
--- a/cloudinit/net/tests/test_dhcp.py
738010
+++ b/cloudinit/net/tests/test_dhcp.py
738010
@@ -145,16 +145,20 @@ class TestDHCPDiscoveryClean(CiTestCase):
738010
               'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}],
738010
             dhcp_discovery(dhclient_script, 'eth9', tmpdir))
738010
         self.assertIn(
738010
-            "pid file contains non-integer content ''", self.logs.getvalue())
738010
+            "dhclient(pid=, parentpid=unknown) failed "
738010
+            "to daemonize after 10.0 seconds",
738010
+            self.logs.getvalue())
738010
         m_kill.assert_not_called()
738010
 
738010
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
738010
     @mock.patch('cloudinit.net.dhcp.os.kill')
738010
     @mock.patch('cloudinit.net.dhcp.util.wait_for_files')
738010
     @mock.patch('cloudinit.net.dhcp.util.subp')
738010
     def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
738010
                                                                   m_subp,
738010
                                                                   m_wait,
738010
-                                                                  m_kill):
738010
+                                                                  m_kill,
738010
+                                                                  m_getppid):
738010
         """dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
738010
         tmpdir = self.tmp_dir()
738010
         dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
738010
@@ -164,6 +168,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
738010
         pidfile = self.tmp_path('dhclient.pid', tmpdir)
738010
         leasefile = self.tmp_path('dhcp.leases', tmpdir)
738010
         m_wait.return_value = [pidfile]  # Return the missing pidfile wait for
738010
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
738010
         self.assertEqual([], dhcp_discovery(dhclient_script, 'eth9', tmpdir))
738010
         self.assertEqual(
738010
             mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
738010
@@ -173,9 +178,10 @@ class TestDHCPDiscoveryClean(CiTestCase):
738010
             self.logs.getvalue())
738010
         m_kill.assert_not_called()
738010
 
738010
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
738010
     @mock.patch('cloudinit.net.dhcp.os.kill')
738010
     @mock.patch('cloudinit.net.dhcp.util.subp')
738010
-    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
738010
+    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill, m_getppid):
738010
         """dhcp_discovery brings up the interface and runs dhclient.
738010
 
738010
         It also returns the parsed dhcp.leases file generated in the sandbox.
738010
@@ -197,6 +203,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
738010
         pid_file = os.path.join(tmpdir, 'dhclient.pid')
738010
         my_pid = 1
738010
         write_file(pid_file, "%d\n" % my_pid)
738010
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
738010
 
738010
         self.assertItemsEqual(
738010
             [{'interface': 'eth9', 'fixed-address': '192.168.2.74',
738010
@@ -355,3 +362,5 @@ class TestEphemeralDhcpNoNetworkSetup(HttprettyTestCase):
738010
             self.assertEqual(fake_lease, lease)
738010
         # Ensure that dhcp discovery occurs
738010
         m_dhcp.called_once_with()
738010
+
738010
+# vi: ts=4 expandtab
738010
diff --git a/cloudinit/temp_utils.py b/cloudinit/temp_utils.py
738010
index c98a1b5..346276e 100644
738010
--- a/cloudinit/temp_utils.py
738010
+++ b/cloudinit/temp_utils.py
738010
@@ -81,7 +81,7 @@ def ExtendedTemporaryFile(**kwargs):
738010
 
738010
 
738010
 @contextlib.contextmanager
738010
-def tempdir(**kwargs):
738010
+def tempdir(rmtree_ignore_errors=False, **kwargs):
738010
     # This seems like it was only added in python 3.2
738010
     # Make it since its useful...
738010
     # See: http://bugs.python.org/file12970/tempdir.patch
738010
@@ -89,7 +89,7 @@ def tempdir(**kwargs):
738010
     try:
738010
         yield tdir
738010
     finally:
738010
-        shutil.rmtree(tdir)
738010
+        shutil.rmtree(tdir, ignore_errors=rmtree_ignore_errors)
738010
 
738010
 
738010
 def mkdtemp(**kwargs):
738010
diff --git a/cloudinit/tests/test_temp_utils.py b/cloudinit/tests/test_temp_utils.py
738010
index ffbb92c..4a52ef8 100644
738010
--- a/cloudinit/tests/test_temp_utils.py
738010
+++ b/cloudinit/tests/test_temp_utils.py
738010
@@ -2,8 +2,9 @@
738010
 
738010
 """Tests for cloudinit.temp_utils"""
738010
 
738010
-from cloudinit.temp_utils import mkdtemp, mkstemp
738010
+from cloudinit.temp_utils import mkdtemp, mkstemp, tempdir
738010
 from cloudinit.tests.helpers import CiTestCase, wrap_and_call
738010
+import os
738010
 
738010
 
738010
 class TestTempUtils(CiTestCase):
738010
@@ -98,4 +99,19 @@ class TestTempUtils(CiTestCase):
738010
         self.assertEqual('/fake/return/path', retval)
738010
         self.assertEqual([{'dir': '/run/cloud-init/tmp'}], calls)
738010
 
738010
+    def test_tempdir_error_suppression(self):
738010
+        """test tempdir suppresses errors during directory removal."""
738010
+
738010
+        with self.assertRaises(OSError):
738010
+            with tempdir(prefix='cloud-init-dhcp-') as tdir:
738010
+                os.rmdir(tdir)
738010
+                # As a result, the directory is already gone,
738010
+                # so shutil.rmtree should raise OSError
738010
+
738010
+        with tempdir(rmtree_ignore_errors=True,
738010
+                     prefix='cloud-init-dhcp-') as tdir:
738010
+            os.rmdir(tdir)
738010
+            # Since the directory is already gone, shutil.rmtree would raise
738010
+            # OSError, but we suppress that
738010
+
738010
 # vi: ts=4 expandtab
738010
diff --git a/cloudinit/util.py b/cloudinit/util.py
738010
index 7800f7b..a84112a 100644
738010
--- a/cloudinit/util.py
738010
+++ b/cloudinit/util.py
738010
@@ -2861,7 +2861,6 @@ def mount_is_read_write(mount_point):
738010
     mount_opts = result[-1].split(',')
738010
     return mount_opts[0] == 'rw'
738010
 
738010
-
738010
 def udevadm_settle(exists=None, timeout=None):
738010
     """Invoke udevadm settle with optional exists and timeout parameters"""
738010
     settle_cmd = ["udevadm", "settle"]
738010
@@ -2875,5 +2874,20 @@ def udevadm_settle(exists=None, timeout=None):
738010
 
738010
     return subp(settle_cmd)
738010
 
738010
+def get_proc_ppid(pid):
738010
+    """
738010
+    Return the parent pid of a process.
738010
+    """
738010
+    ppid = 0
738010
+    try:
738010
+        contents = load_file("/proc/%s/stat" % pid, quiet=True)
738010
+    except IOError as e:
738010
+        LOG.warning('Failed to load /proc/%s/stat. %s', pid, e)
738010
+    if contents:
738010
+        parts = contents.split(" ", 4)
738010
+        # man proc says
738010
+        #  ppid %d     (4) The PID of the parent.
738010
+        ppid = int(parts[3])
738010
+    return ppid
738010
 
738010
 # vi: ts=4 expandtab
738010
diff --git a/tests/unittests/test_util.py b/tests/unittests/test_util.py
738010
index 5a14479..8aebcd6 100644
738010
--- a/tests/unittests/test_util.py
738010
+++ b/tests/unittests/test_util.py
738010
@@ -1114,6 +1114,12 @@ class TestLoadShellContent(helpers.TestCase):
738010
                 'key3="val3 #tricky"',
738010
                 ''])))
738010
 
738010
+    def test_get_proc_ppid(self):
738010
+        """get_proc_ppid returns correct parent pid value."""
738010
+        my_pid = os.getpid()
738010
+        my_ppid = os.getppid()
738010
+        self.assertEqual(my_ppid, util.get_proc_ppid(my_pid))
738010
+
738010
 
738010
 class TestGetProcEnv(helpers.TestCase):
738010
     """test get_proc_env."""
738010
-- 
738010
1.8.3.1
738010