17dafa
From c1c6d0b586e5556a37b9b813afbb4e6a24921adf Mon Sep 17 00:00:00 2001
17dafa
From: Eduardo Otubo <otubo@redhat.com>
17dafa
Date: Wed, 23 Jan 2019 12:30:21 +0100
17dafa
Subject: net: Wait for dhclient to daemonize before reading lease file
17dafa
17dafa
RH-Author: Eduardo Otubo <otubo@redhat.com>
17dafa
Message-id: <20190123123021.32708-1-otubo@redhat.com>
17dafa
Patchwork-id: 84095
17dafa
O-Subject: [RHEL-7.7 cloud-init PATCH] net: Wait for dhclient to daemonize before reading lease file
17dafa
Bugzilla: 1632967
17dafa
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
17dafa
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
17dafa
17dafa
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1632967
17dafa
Brew: https://bugzilla.redhat.com/show_bug.cgi?id=1632967
17dafa
Tested: Me and upstream
17dafa
17dafa
commit fdadcb5fae51f4e6799314ab98e3aec56c79b17c
17dafa
Author: Jason Zions <jasonzio@microsoft.com>
17dafa
Date:   Tue Jan 15 21:37:17 2019 +0000
17dafa
17dafa
    net: Wait for dhclient to daemonize before reading lease file
17dafa
17dafa
    cloud-init uses dhclient to fetch the DHCP lease so it can extract
17dafa
    DHCP options.  dhclient creates the leasefile, then writes to it;
17dafa
    simply waiting for the leasefile to appear creates a race between
17dafa
    dhclient and cloud-init. Instead, wait for dhclient to be parented by
17dafa
    init. At that point, we know it has written to the leasefile, so it's
17dafa
    safe to copy the file and kill the process.
17dafa
17dafa
    cloud-init creates a temporary directory in which to execute dhclient,
17dafa
    and deletes that directory after it has killed the process. If
17dafa
    cloud-init abandons waiting for dhclient to daemonize, it will still
17dafa
    attempt to delete the temporary directory, but will not report an
17dafa
    exception should that attempt fail.
17dafa
17dafa
    LP: #1794399
17dafa
17dafa
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
17dafa
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
17dafa
---
17dafa
 cloudinit/net/dhcp.py              | 44 +++++++++++++++++++++++++++-----------
17dafa
 cloudinit/net/tests/test_dhcp.py   | 15 ++++++++++---
17dafa
 cloudinit/temp_utils.py            |  4 ++--
17dafa
 cloudinit/tests/test_temp_utils.py | 18 +++++++++++++++-
17dafa
 cloudinit/util.py                  | 16 +++++++++++++-
17dafa
 tests/unittests/test_util.py       |  6 ++++++
17dafa
 6 files changed, 83 insertions(+), 20 deletions(-)
17dafa
17dafa
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py
17dafa
index 0db991d..c98a97c 100644
17dafa
--- a/cloudinit/net/dhcp.py
17dafa
+++ b/cloudinit/net/dhcp.py
17dafa
@@ -9,6 +9,7 @@ import logging
17dafa
 import os
17dafa
 import re
17dafa
 import signal
17dafa
+import time
17dafa
 
17dafa
 from cloudinit.net import (
17dafa
     EphemeralIPv4Network, find_fallback_nic, get_devicelist,
17dafa
@@ -127,7 +128,9 @@ def maybe_perform_dhcp_discovery(nic=None):
17dafa
     if not dhclient_path:
17dafa
         LOG.debug('Skip dhclient configuration: No dhclient command found.')
17dafa
         return []
17dafa
-    with temp_utils.tempdir(prefix='cloud-init-dhcp-', needs_exe=True) as tdir:
17dafa
+    with temp_utils.tempdir(rmtree_ignore_errors=True,
17dafa
+                            prefix='cloud-init-dhcp-',
17dafa
+                            needs_exe=True) as tdir:
17dafa
         # Use /var/tmp because /run/cloud-init/tmp is mounted noexec
17dafa
         return dhcp_discovery(dhclient_path, nic, tdir)
17dafa
 
17dafa
@@ -195,24 +198,39 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
17dafa
            '-pf', pid_file, interface, '-sf', '/bin/true']
17dafa
     util.subp(cmd, capture=True)
17dafa
 
17dafa
-    # dhclient doesn't write a pid file until after it forks when it gets a
17dafa
-    # proper lease response. Since cleandir is a temp directory that gets
17dafa
-    # removed, we need to wait for that pidfile creation before the
17dafa
-    # cleandir is removed, otherwise we get FileNotFound errors.
17dafa
+    # Wait for pid file and lease file to appear, and for the process
17dafa
+    # named by the pid file to daemonize (have pid 1 as its parent). If we
17dafa
+    # try to read the lease file before daemonization happens, we might try
17dafa
+    # to read it before the dhclient has actually written it. We also have
17dafa
+    # to wait until the dhclient has become a daemon so we can be sure to
17dafa
+    # kill the correct process, thus freeing cleandir to be deleted back
17dafa
+    # up the callstack.
17dafa
     missing = util.wait_for_files(
17dafa
         [pid_file, lease_file], maxwait=5, naplen=0.01)
17dafa
     if missing:
17dafa
         LOG.warning("dhclient did not produce expected files: %s",
17dafa
                     ', '.join(os.path.basename(f) for f in missing))
17dafa
         return []
17dafa
-    pid_content = util.load_file(pid_file).strip()
17dafa
-    try:
17dafa
-        pid = int(pid_content)
17dafa
-    except ValueError:
17dafa
-        LOG.debug(
17dafa
-            "pid file contains non-integer content '%s'", pid_content)
17dafa
-    else:
17dafa
-        os.kill(pid, signal.SIGKILL)
17dafa
+
17dafa
+    ppid = 'unknown'
17dafa
+    for _ in range(0, 1000):
17dafa
+        pid_content = util.load_file(pid_file).strip()
17dafa
+        try:
17dafa
+            pid = int(pid_content)
17dafa
+        except ValueError:
17dafa
+            pass
17dafa
+        else:
17dafa
+            ppid = util.get_proc_ppid(pid)
17dafa
+            if ppid == 1:
17dafa
+                LOG.debug('killing dhclient with pid=%s', pid)
17dafa
+                os.kill(pid, signal.SIGKILL)
17dafa
+                return parse_dhcp_lease_file(lease_file)
17dafa
+        time.sleep(0.01)
17dafa
+
17dafa
+    LOG.error(
17dafa
+        'dhclient(pid=%s, parentpid=%s) failed to daemonize after %s seconds',
17dafa
+        pid_content, ppid, 0.01 * 1000
17dafa
+    )
17dafa
     return parse_dhcp_lease_file(lease_file)
17dafa
 
17dafa
 
17dafa
diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py
17dafa
index cd3e732..79e8842 100644
17dafa
--- a/cloudinit/net/tests/test_dhcp.py
17dafa
+++ b/cloudinit/net/tests/test_dhcp.py
17dafa
@@ -145,16 +145,20 @@ class TestDHCPDiscoveryClean(CiTestCase):
17dafa
               'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}],
17dafa
             dhcp_discovery(dhclient_script, 'eth9', tmpdir))
17dafa
         self.assertIn(
17dafa
-            "pid file contains non-integer content ''", self.logs.getvalue())
17dafa
+            "dhclient(pid=, parentpid=unknown) failed "
17dafa
+            "to daemonize after 10.0 seconds",
17dafa
+            self.logs.getvalue())
17dafa
         m_kill.assert_not_called()
17dafa
 
17dafa
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
17dafa
     @mock.patch('cloudinit.net.dhcp.os.kill')
17dafa
     @mock.patch('cloudinit.net.dhcp.util.wait_for_files')
17dafa
     @mock.patch('cloudinit.net.dhcp.util.subp')
17dafa
     def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
17dafa
                                                                   m_subp,
17dafa
                                                                   m_wait,
17dafa
-                                                                  m_kill):
17dafa
+                                                                  m_kill,
17dafa
+                                                                  m_getppid):
17dafa
         """dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
17dafa
         tmpdir = self.tmp_dir()
17dafa
         dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
17dafa
@@ -164,6 +168,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
17dafa
         pidfile = self.tmp_path('dhclient.pid', tmpdir)
17dafa
         leasefile = self.tmp_path('dhcp.leases', tmpdir)
17dafa
         m_wait.return_value = [pidfile]  # Return the missing pidfile wait for
17dafa
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
17dafa
         self.assertEqual([], dhcp_discovery(dhclient_script, 'eth9', tmpdir))
17dafa
         self.assertEqual(
17dafa
             mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
17dafa
@@ -173,9 +178,10 @@ class TestDHCPDiscoveryClean(CiTestCase):
17dafa
             self.logs.getvalue())
17dafa
         m_kill.assert_not_called()
17dafa
 
17dafa
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
17dafa
     @mock.patch('cloudinit.net.dhcp.os.kill')
17dafa
     @mock.patch('cloudinit.net.dhcp.util.subp')
17dafa
-    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
17dafa
+    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill, m_getppid):
17dafa
         """dhcp_discovery brings up the interface and runs dhclient.
17dafa
 
17dafa
         It also returns the parsed dhcp.leases file generated in the sandbox.
17dafa
@@ -197,6 +203,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
17dafa
         pid_file = os.path.join(tmpdir, 'dhclient.pid')
17dafa
         my_pid = 1
17dafa
         write_file(pid_file, "%d\n" % my_pid)
17dafa
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
17dafa
 
17dafa
         self.assertItemsEqual(
17dafa
             [{'interface': 'eth9', 'fixed-address': '192.168.2.74',
17dafa
@@ -355,3 +362,5 @@ class TestEphemeralDhcpNoNetworkSetup(HttprettyTestCase):
17dafa
             self.assertEqual(fake_lease, lease)
17dafa
         # Ensure that dhcp discovery occurs
17dafa
         m_dhcp.called_once_with()
17dafa
+
17dafa
+# vi: ts=4 expandtab
17dafa
diff --git a/cloudinit/temp_utils.py b/cloudinit/temp_utils.py
17dafa
index c98a1b5..346276e 100644
17dafa
--- a/cloudinit/temp_utils.py
17dafa
+++ b/cloudinit/temp_utils.py
17dafa
@@ -81,7 +81,7 @@ def ExtendedTemporaryFile(**kwargs):
17dafa
 
17dafa
 
17dafa
 @contextlib.contextmanager
17dafa
-def tempdir(**kwargs):
17dafa
+def tempdir(rmtree_ignore_errors=False, **kwargs):
17dafa
     # This seems like it was only added in python 3.2
17dafa
     # Make it since its useful...
17dafa
     # See: http://bugs.python.org/file12970/tempdir.patch
17dafa
@@ -89,7 +89,7 @@ def tempdir(**kwargs):
17dafa
     try:
17dafa
         yield tdir
17dafa
     finally:
17dafa
-        shutil.rmtree(tdir)
17dafa
+        shutil.rmtree(tdir, ignore_errors=rmtree_ignore_errors)
17dafa
 
17dafa
 
17dafa
 def mkdtemp(**kwargs):
17dafa
diff --git a/cloudinit/tests/test_temp_utils.py b/cloudinit/tests/test_temp_utils.py
17dafa
index ffbb92c..4a52ef8 100644
17dafa
--- a/cloudinit/tests/test_temp_utils.py
17dafa
+++ b/cloudinit/tests/test_temp_utils.py
17dafa
@@ -2,8 +2,9 @@
17dafa
 
17dafa
 """Tests for cloudinit.temp_utils"""
17dafa
 
17dafa
-from cloudinit.temp_utils import mkdtemp, mkstemp
17dafa
+from cloudinit.temp_utils import mkdtemp, mkstemp, tempdir
17dafa
 from cloudinit.tests.helpers import CiTestCase, wrap_and_call
17dafa
+import os
17dafa
 
17dafa
 
17dafa
 class TestTempUtils(CiTestCase):
17dafa
@@ -98,4 +99,19 @@ class TestTempUtils(CiTestCase):
17dafa
         self.assertEqual('/fake/return/path', retval)
17dafa
         self.assertEqual([{'dir': '/run/cloud-init/tmp'}], calls)
17dafa
 
17dafa
+    def test_tempdir_error_suppression(self):
17dafa
+        """test tempdir suppresses errors during directory removal."""
17dafa
+
17dafa
+        with self.assertRaises(OSError):
17dafa
+            with tempdir(prefix='cloud-init-dhcp-') as tdir:
17dafa
+                os.rmdir(tdir)
17dafa
+                # As a result, the directory is already gone,
17dafa
+                # so shutil.rmtree should raise OSError
17dafa
+
17dafa
+        with tempdir(rmtree_ignore_errors=True,
17dafa
+                     prefix='cloud-init-dhcp-') as tdir:
17dafa
+            os.rmdir(tdir)
17dafa
+            # Since the directory is already gone, shutil.rmtree would raise
17dafa
+            # OSError, but we suppress that
17dafa
+
17dafa
 # vi: ts=4 expandtab
17dafa
diff --git a/cloudinit/util.py b/cloudinit/util.py
17dafa
index 7800f7b..a84112a 100644
17dafa
--- a/cloudinit/util.py
17dafa
+++ b/cloudinit/util.py
17dafa
@@ -2861,7 +2861,6 @@ def mount_is_read_write(mount_point):
17dafa
     mount_opts = result[-1].split(',')
17dafa
     return mount_opts[0] == 'rw'
17dafa
 
17dafa
-
17dafa
 def udevadm_settle(exists=None, timeout=None):
17dafa
     """Invoke udevadm settle with optional exists and timeout parameters"""
17dafa
     settle_cmd = ["udevadm", "settle"]
17dafa
@@ -2875,5 +2874,20 @@ def udevadm_settle(exists=None, timeout=None):
17dafa
 
17dafa
     return subp(settle_cmd)
17dafa
 
17dafa
+def get_proc_ppid(pid):
17dafa
+    """
17dafa
+    Return the parent pid of a process.
17dafa
+    """
17dafa
+    ppid = 0
17dafa
+    try:
17dafa
+        contents = load_file("/proc/%s/stat" % pid, quiet=True)
17dafa
+    except IOError as e:
17dafa
+        LOG.warning('Failed to load /proc/%s/stat. %s', pid, e)
17dafa
+    if contents:
17dafa
+        parts = contents.split(" ", 4)
17dafa
+        # man proc says
17dafa
+        #  ppid %d     (4) The PID of the parent.
17dafa
+        ppid = int(parts[3])
17dafa
+    return ppid
17dafa
 
17dafa
 # vi: ts=4 expandtab
17dafa
diff --git a/tests/unittests/test_util.py b/tests/unittests/test_util.py
17dafa
index 5a14479..8aebcd6 100644
17dafa
--- a/tests/unittests/test_util.py
17dafa
+++ b/tests/unittests/test_util.py
17dafa
@@ -1114,6 +1114,12 @@ class TestLoadShellContent(helpers.TestCase):
17dafa
                 'key3="val3 #tricky"',
17dafa
                 ''])))
17dafa
 
17dafa
+    def test_get_proc_ppid(self):
17dafa
+        """get_proc_ppid returns correct parent pid value."""
17dafa
+        my_pid = os.getpid()
17dafa
+        my_ppid = os.getppid()
17dafa
+        self.assertEqual(my_ppid, util.get_proc_ppid(my_pid))
17dafa
+
17dafa
 
17dafa
 class TestGetProcEnv(helpers.TestCase):
17dafa
     """test get_proc_env."""
17dafa
-- 
17dafa
1.8.3.1
17dafa