18322d
From 2e070086275341dfceb6d5b1e12f06f22e7bbfcd Mon Sep 17 00:00:00 2001
18322d
From: Eduardo Otubo <otubo@redhat.com>
18322d
Date: Wed, 23 Jan 2019 12:30:21 +0100
18322d
Subject: net: Wait for dhclient to daemonize before reading lease file
18322d
18322d
RH-Author: Eduardo Otubo <otubo@redhat.com>
18322d
Message-id: <20190123123021.32708-1-otubo@redhat.com>
18322d
Patchwork-id: 84095
18322d
O-Subject: [RHEL-7.7 cloud-init PATCH] net: Wait for dhclient to daemonize before reading lease file
18322d
Bugzilla: 1632967
18322d
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
18322d
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
18322d
18322d
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1632967
18322d
Brew: https://bugzilla.redhat.com/show_bug.cgi?id=1632967
18322d
Tested: Me and upstream
18322d
18322d
commit fdadcb5fae51f4e6799314ab98e3aec56c79b17c
18322d
Author: Jason Zions <jasonzio@microsoft.com>
18322d
Date:   Tue Jan 15 21:37:17 2019 +0000
18322d
18322d
    net: Wait for dhclient to daemonize before reading lease file
18322d
18322d
    cloud-init uses dhclient to fetch the DHCP lease so it can extract
18322d
    DHCP options.  dhclient creates the leasefile, then writes to it;
18322d
    simply waiting for the leasefile to appear creates a race between
18322d
    dhclient and cloud-init. Instead, wait for dhclient to be parented by
18322d
    init. At that point, we know it has written to the leasefile, so it's
18322d
    safe to copy the file and kill the process.
18322d
18322d
    cloud-init creates a temporary directory in which to execute dhclient,
18322d
    and deletes that directory after it has killed the process. If
18322d
    cloud-init abandons waiting for dhclient to daemonize, it will still
18322d
    attempt to delete the temporary directory, but will not report an
18322d
    exception should that attempt fail.
18322d
18322d
    LP: #1794399
18322d
18322d
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
18322d
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
18322d
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
18322d
---
18322d
 cloudinit/net/dhcp.py              | 44 +++++++++++++++++++++---------
18322d
 cloudinit/net/tests/test_dhcp.py   | 15 ++++++++--
18322d
 cloudinit/temp_utils.py            |  4 +--
18322d
 cloudinit/tests/test_temp_utils.py | 18 +++++++++++-
18322d
 cloudinit/util.py                  | 16 ++++++++++-
18322d
 tests/unittests/test_util.py       |  6 ++++
18322d
 6 files changed, 83 insertions(+), 20 deletions(-)
18322d
18322d
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py
18322d
index 0db991db..c98a97cd 100644
18322d
--- a/cloudinit/net/dhcp.py
18322d
+++ b/cloudinit/net/dhcp.py
18322d
@@ -9,6 +9,7 @@ import logging
18322d
 import os
18322d
 import re
18322d
 import signal
18322d
+import time
18322d
 
18322d
 from cloudinit.net import (
18322d
     EphemeralIPv4Network, find_fallback_nic, get_devicelist,
18322d
@@ -127,7 +128,9 @@ def maybe_perform_dhcp_discovery(nic=None):
18322d
     if not dhclient_path:
18322d
         LOG.debug('Skip dhclient configuration: No dhclient command found.')
18322d
         return []
18322d
-    with temp_utils.tempdir(prefix='cloud-init-dhcp-', needs_exe=True) as tdir:
18322d
+    with temp_utils.tempdir(rmtree_ignore_errors=True,
18322d
+                            prefix='cloud-init-dhcp-',
18322d
+                            needs_exe=True) as tdir:
18322d
         # Use /var/tmp because /run/cloud-init/tmp is mounted noexec
18322d
         return dhcp_discovery(dhclient_path, nic, tdir)
18322d
 
18322d
@@ -195,24 +198,39 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
18322d
            '-pf', pid_file, interface, '-sf', '/bin/true']
18322d
     util.subp(cmd, capture=True)
18322d
 
18322d
-    # dhclient doesn't write a pid file until after it forks when it gets a
18322d
-    # proper lease response. Since cleandir is a temp directory that gets
18322d
-    # removed, we need to wait for that pidfile creation before the
18322d
-    # cleandir is removed, otherwise we get FileNotFound errors.
18322d
+    # Wait for pid file and lease file to appear, and for the process
18322d
+    # named by the pid file to daemonize (have pid 1 as its parent). If we
18322d
+    # try to read the lease file before daemonization happens, we might try
18322d
+    # to read it before the dhclient has actually written it. We also have
18322d
+    # to wait until the dhclient has become a daemon so we can be sure to
18322d
+    # kill the correct process, thus freeing cleandir to be deleted back
18322d
+    # up the callstack.
18322d
     missing = util.wait_for_files(
18322d
         [pid_file, lease_file], maxwait=5, naplen=0.01)
18322d
     if missing:
18322d
         LOG.warning("dhclient did not produce expected files: %s",
18322d
                     ', '.join(os.path.basename(f) for f in missing))
18322d
         return []
18322d
-    pid_content = util.load_file(pid_file).strip()
18322d
-    try:
18322d
-        pid = int(pid_content)
18322d
-    except ValueError:
18322d
-        LOG.debug(
18322d
-            "pid file contains non-integer content '%s'", pid_content)
18322d
-    else:
18322d
-        os.kill(pid, signal.SIGKILL)
18322d
+
18322d
+    ppid = 'unknown'
18322d
+    for _ in range(0, 1000):
18322d
+        pid_content = util.load_file(pid_file).strip()
18322d
+        try:
18322d
+            pid = int(pid_content)
18322d
+        except ValueError:
18322d
+            pass
18322d
+        else:
18322d
+            ppid = util.get_proc_ppid(pid)
18322d
+            if ppid == 1:
18322d
+                LOG.debug('killing dhclient with pid=%s', pid)
18322d
+                os.kill(pid, signal.SIGKILL)
18322d
+                return parse_dhcp_lease_file(lease_file)
18322d
+        time.sleep(0.01)
18322d
+
18322d
+    LOG.error(
18322d
+        'dhclient(pid=%s, parentpid=%s) failed to daemonize after %s seconds',
18322d
+        pid_content, ppid, 0.01 * 1000
18322d
+    )
18322d
     return parse_dhcp_lease_file(lease_file)
18322d
 
18322d
 
18322d
diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py
18322d
index cd3e7328..79e8842f 100644
18322d
--- a/cloudinit/net/tests/test_dhcp.py
18322d
+++ b/cloudinit/net/tests/test_dhcp.py
18322d
@@ -145,16 +145,20 @@ class TestDHCPDiscoveryClean(CiTestCase):
18322d
               'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}],
18322d
             dhcp_discovery(dhclient_script, 'eth9', tmpdir))
18322d
         self.assertIn(
18322d
-            "pid file contains non-integer content ''", self.logs.getvalue())
18322d
+            "dhclient(pid=, parentpid=unknown) failed "
18322d
+            "to daemonize after 10.0 seconds",
18322d
+            self.logs.getvalue())
18322d
         m_kill.assert_not_called()
18322d
 
18322d
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
18322d
     @mock.patch('cloudinit.net.dhcp.os.kill')
18322d
     @mock.patch('cloudinit.net.dhcp.util.wait_for_files')
18322d
     @mock.patch('cloudinit.net.dhcp.util.subp')
18322d
     def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
18322d
                                                                   m_subp,
18322d
                                                                   m_wait,
18322d
-                                                                  m_kill):
18322d
+                                                                  m_kill,
18322d
+                                                                  m_getppid):
18322d
         """dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
18322d
         tmpdir = self.tmp_dir()
18322d
         dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
18322d
@@ -164,6 +168,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
18322d
         pidfile = self.tmp_path('dhclient.pid', tmpdir)
18322d
         leasefile = self.tmp_path('dhcp.leases', tmpdir)
18322d
         m_wait.return_value = [pidfile]  # Return the missing pidfile wait for
18322d
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
18322d
         self.assertEqual([], dhcp_discovery(dhclient_script, 'eth9', tmpdir))
18322d
         self.assertEqual(
18322d
             mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
18322d
@@ -173,9 +178,10 @@ class TestDHCPDiscoveryClean(CiTestCase):
18322d
             self.logs.getvalue())
18322d
         m_kill.assert_not_called()
18322d
 
18322d
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
18322d
     @mock.patch('cloudinit.net.dhcp.os.kill')
18322d
     @mock.patch('cloudinit.net.dhcp.util.subp')
18322d
-    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
18322d
+    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill, m_getppid):
18322d
         """dhcp_discovery brings up the interface and runs dhclient.
18322d
 
18322d
         It also returns the parsed dhcp.leases file generated in the sandbox.
18322d
@@ -197,6 +203,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
18322d
         pid_file = os.path.join(tmpdir, 'dhclient.pid')
18322d
         my_pid = 1
18322d
         write_file(pid_file, "%d\n" % my_pid)
18322d
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
18322d
 
18322d
         self.assertItemsEqual(
18322d
             [{'interface': 'eth9', 'fixed-address': '192.168.2.74',
18322d
@@ -355,3 +362,5 @@ class TestEphemeralDhcpNoNetworkSetup(HttprettyTestCase):
18322d
             self.assertEqual(fake_lease, lease)
18322d
         # Ensure that dhcp discovery occurs
18322d
         m_dhcp.called_once_with()
18322d
+
18322d
+# vi: ts=4 expandtab
18322d
diff --git a/cloudinit/temp_utils.py b/cloudinit/temp_utils.py
18322d
index c98a1b53..346276ec 100644
18322d
--- a/cloudinit/temp_utils.py
18322d
+++ b/cloudinit/temp_utils.py
18322d
@@ -81,7 +81,7 @@ def ExtendedTemporaryFile(**kwargs):
18322d
 
18322d
 
18322d
 @contextlib.contextmanager
18322d
-def tempdir(**kwargs):
18322d
+def tempdir(rmtree_ignore_errors=False, **kwargs):
18322d
     # This seems like it was only added in python 3.2
18322d
     # Make it since its useful...
18322d
     # See: http://bugs.python.org/file12970/tempdir.patch
18322d
@@ -89,7 +89,7 @@ def tempdir(**kwargs):
18322d
     try:
18322d
         yield tdir
18322d
     finally:
18322d
-        shutil.rmtree(tdir)
18322d
+        shutil.rmtree(tdir, ignore_errors=rmtree_ignore_errors)
18322d
 
18322d
 
18322d
 def mkdtemp(**kwargs):
18322d
diff --git a/cloudinit/tests/test_temp_utils.py b/cloudinit/tests/test_temp_utils.py
18322d
index ffbb92cd..4a52ef89 100644
18322d
--- a/cloudinit/tests/test_temp_utils.py
18322d
+++ b/cloudinit/tests/test_temp_utils.py
18322d
@@ -2,8 +2,9 @@
18322d
 
18322d
 """Tests for cloudinit.temp_utils"""
18322d
 
18322d
-from cloudinit.temp_utils import mkdtemp, mkstemp
18322d
+from cloudinit.temp_utils import mkdtemp, mkstemp, tempdir
18322d
 from cloudinit.tests.helpers import CiTestCase, wrap_and_call
18322d
+import os
18322d
 
18322d
 
18322d
 class TestTempUtils(CiTestCase):
18322d
@@ -98,4 +99,19 @@ class TestTempUtils(CiTestCase):
18322d
         self.assertEqual('/fake/return/path', retval)
18322d
         self.assertEqual([{'dir': '/run/cloud-init/tmp'}], calls)
18322d
 
18322d
+    def test_tempdir_error_suppression(self):
18322d
+        """test tempdir suppresses errors during directory removal."""
18322d
+
18322d
+        with self.assertRaises(OSError):
18322d
+            with tempdir(prefix='cloud-init-dhcp-') as tdir:
18322d
+                os.rmdir(tdir)
18322d
+                # As a result, the directory is already gone,
18322d
+                # so shutil.rmtree should raise OSError
18322d
+
18322d
+        with tempdir(rmtree_ignore_errors=True,
18322d
+                     prefix='cloud-init-dhcp-') as tdir:
18322d
+            os.rmdir(tdir)
18322d
+            # Since the directory is already gone, shutil.rmtree would raise
18322d
+            # OSError, but we suppress that
18322d
+
18322d
 # vi: ts=4 expandtab
18322d
diff --git a/cloudinit/util.py b/cloudinit/util.py
18322d
index 7800f7bc..a84112a9 100644
18322d
--- a/cloudinit/util.py
18322d
+++ b/cloudinit/util.py
18322d
@@ -2861,7 +2861,6 @@ def mount_is_read_write(mount_point):
18322d
     mount_opts = result[-1].split(',')
18322d
     return mount_opts[0] == 'rw'
18322d
 
18322d
-
18322d
 def udevadm_settle(exists=None, timeout=None):
18322d
     """Invoke udevadm settle with optional exists and timeout parameters"""
18322d
     settle_cmd = ["udevadm", "settle"]
18322d
@@ -2875,5 +2874,20 @@ def udevadm_settle(exists=None, timeout=None):
18322d
 
18322d
     return subp(settle_cmd)
18322d
 
18322d
+def get_proc_ppid(pid):
18322d
+    """
18322d
+    Return the parent pid of a process.
18322d
+    """
18322d
+    ppid = 0
18322d
+    try:
18322d
+        contents = load_file("/proc/%s/stat" % pid, quiet=True)
18322d
+    except IOError as e:
18322d
+        LOG.warning('Failed to load /proc/%s/stat. %s', pid, e)
18322d
+    if contents:
18322d
+        parts = contents.split(" ", 4)
18322d
+        # man proc says
18322d
+        #  ppid %d     (4) The PID of the parent.
18322d
+        ppid = int(parts[3])
18322d
+    return ppid
18322d
 
18322d
 # vi: ts=4 expandtab
18322d
diff --git a/tests/unittests/test_util.py b/tests/unittests/test_util.py
18322d
index 5a14479a..8aebcd62 100644
18322d
--- a/tests/unittests/test_util.py
18322d
+++ b/tests/unittests/test_util.py
18322d
@@ -1114,6 +1114,12 @@ class TestLoadShellContent(helpers.TestCase):
18322d
                 'key3="val3 #tricky"',
18322d
                 ''])))
18322d
 
18322d
+    def test_get_proc_ppid(self):
18322d
+        """get_proc_ppid returns correct parent pid value."""
18322d
+        my_pid = os.getpid()
18322d
+        my_ppid = os.getppid()
18322d
+        self.assertEqual(my_ppid, util.get_proc_ppid(my_pid))
18322d
+
18322d
 
18322d
 class TestGetProcEnv(helpers.TestCase):
18322d
     """test get_proc_env."""
18322d
-- 
18322d
2.20.1
18322d