From 4e4e73df5ce7a819f48bed0ae6d2f0b1fb7e243b Mon Sep 17 00:00:00 2001 From: Eduardo Otubo Date: Wed, 29 May 2019 13:41:46 +0200 Subject: [PATCH 2/5] DatasourceAzure: add additional logging for azure datasource RH-Author: Eduardo Otubo Message-id: <20190529134149.842-3-otubo@redhat.com> Patchwork-id: 88268 O-Subject: [RHEL-8.0.1/RHEL-8.1.0 cloud-init PATCHv2 2/5] DatasourceAzure: add additional logging for azure datasource Bugzilla: 1691986 RH-Acked-by: Vitaly Kuznetsov RH-Acked-by: Cathy Avery From: Anh Vo commit 0d8c88393b51db6454491a379dcc2e691551217a Author: Anh Vo Date: Wed Apr 3 18:23:18 2019 +0000 DatasourceAzure: add additional logging for azure datasource Create an Azure logging decorator and use additional ReportEventStack context managers to provide additional logging details. Signed-off-by: Eduardo Otubo Signed-off-by: Miroslav Rezanina --- cloudinit/sources/DataSourceAzure.py | 231 ++++++++++++++++++++++------------- cloudinit/sources/helpers/azure.py | 31 +++++ 2 files changed, 179 insertions(+), 83 deletions(-) mode change 100644 => 100755 cloudinit/sources/DataSourceAzure.py mode change 100644 => 100755 cloudinit/sources/helpers/azure.py diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py old mode 100644 new mode 100755 index a768b2c..c827816 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -21,10 +21,14 @@ from cloudinit import net from cloudinit.event import EventType from cloudinit.net.dhcp import EphemeralDHCPv4 from cloudinit import sources -from cloudinit.sources.helpers.azure import get_metadata_from_fabric from cloudinit.sources.helpers import netlink from cloudinit.url_helper import UrlError, readurl, retry_on_url_exc from cloudinit import util +from cloudinit.reporting import events + +from cloudinit.sources.helpers.azure import (azure_ds_reporter, + azure_ds_telemetry_reporter, + get_metadata_from_fabric) LOG = logging.getLogger(__name__) @@ -244,6 +248,7 @@ def set_hostname(hostname, hostname_command='hostname'): util.subp(['hostnamectl', 'set-hostname', str(hostname)]) +@azure_ds_telemetry_reporter @contextlib.contextmanager def temporary_hostname(temp_hostname, cfg, hostname_command='hostname'): """ @@ -290,6 +295,7 @@ class DataSourceAzure(sources.DataSource): root = sources.DataSource.__str__(self) return "%s [seed=%s]" % (root, self.seed) + @azure_ds_telemetry_reporter def bounce_network_with_azure_hostname(self): # When using cloud-init to provision, we have to set the hostname from # the metadata and "bounce" the network to force DDNS to update via @@ -315,6 +321,7 @@ class DataSourceAzure(sources.DataSource): util.logexc(LOG, "handling set_hostname failed") return False + @azure_ds_telemetry_reporter def get_metadata_from_agent(self): temp_hostname = self.metadata.get('local-hostname') agent_cmd = self.ds_cfg['agent_command'] @@ -344,15 +351,18 @@ class DataSourceAzure(sources.DataSource): LOG.debug("ssh authentication: " "using fingerprint from fabirc") - # wait very long for public SSH keys to arrive - # https://bugs.launchpad.net/cloud-init/+bug/1717611 - missing = util.log_time(logfunc=LOG.debug, - msg="waiting for SSH public key files", - func=util.wait_for_files, - args=(fp_files, 900)) - - if len(missing): - LOG.warning("Did not find files, but going on: %s", missing) + with events.ReportEventStack( + name="waiting-for-ssh-public-key", + description="wait for agents to retrieve ssh keys", + parent=azure_ds_reporter): + # wait very long for public SSH keys to arrive + # https://bugs.launchpad.net/cloud-init/+bug/1717611 + missing = util.log_time(logfunc=LOG.debug, + msg="waiting for SSH public key files", + func=util.wait_for_files, + args=(fp_files, 900)) + if len(missing): + LOG.warning("Did not find files, but going on: %s", missing) metadata = {} metadata['public-keys'] = key_value or pubkeys_from_crt_files(fp_files) @@ -366,6 +376,7 @@ class DataSourceAzure(sources.DataSource): subplatform_type = 'seed-dir' return '%s (%s)' % (subplatform_type, self.seed) + @azure_ds_telemetry_reporter def crawl_metadata(self): """Walk all instance metadata sources returning a dict on success. @@ -467,6 +478,7 @@ class DataSourceAzure(sources.DataSource): super(DataSourceAzure, self).clear_cached_attrs(attr_defaults) self._metadata_imds = sources.UNSET + @azure_ds_telemetry_reporter def _get_data(self): """Crawl and process datasource metadata caching metadata as attrs. @@ -513,6 +525,7 @@ class DataSourceAzure(sources.DataSource): # quickly (local check only) if self.instance_id is still valid return sources.instance_id_matches_system_uuid(self.get_instance_id()) + @azure_ds_telemetry_reporter def setup(self, is_new_instance): if self._negotiated is False: LOG.debug("negotiating for %s (new_instance=%s)", @@ -580,6 +593,7 @@ class DataSourceAzure(sources.DataSource): if nl_sock: nl_sock.close() + @azure_ds_telemetry_reporter def _report_ready(self, lease): """Tells the fabric provisioning has completed """ try: @@ -617,9 +631,14 @@ class DataSourceAzure(sources.DataSource): def _reprovision(self): """Initiate the reprovisioning workflow.""" contents = self._poll_imds() - md, ud, cfg = read_azure_ovf(contents) - return (md, ud, cfg, {'ovf-env.xml': contents}) - + with events.ReportEventStack( + name="reprovisioning-read-azure-ovf", + description="read azure ovf during reprovisioning", + parent=azure_ds_reporter): + md, ud, cfg = read_azure_ovf(contents) + return (md, ud, cfg, {'ovf-env.xml': contents}) + + @azure_ds_telemetry_reporter def _negotiate(self): """Negotiate with fabric and return data from it. @@ -652,6 +671,7 @@ class DataSourceAzure(sources.DataSource): util.del_file(REPROVISION_MARKER_FILE) return fabric_data + @azure_ds_telemetry_reporter def activate(self, cfg, is_new_instance): address_ephemeral_resize(is_new_instance=is_new_instance, preserve_ntfs=self.ds_cfg.get( @@ -690,12 +710,14 @@ def _partitions_on_device(devpath, maxnum=16): return [] +@azure_ds_telemetry_reporter def _has_ntfs_filesystem(devpath): ntfs_devices = util.find_devs_with("TYPE=ntfs", no_cache=True) LOG.debug('ntfs_devices found = %s', ntfs_devices) return os.path.realpath(devpath) in ntfs_devices +@azure_ds_telemetry_reporter def can_dev_be_reformatted(devpath, preserve_ntfs): """Determine if the ephemeral drive at devpath should be reformatted. @@ -744,43 +766,59 @@ def can_dev_be_reformatted(devpath, preserve_ntfs): (cand_part, cand_path, devpath)) return False, msg + @azure_ds_telemetry_reporter def count_files(mp): ignored = set(['dataloss_warning_readme.txt']) return len([f for f in os.listdir(mp) if f.lower() not in ignored]) bmsg = ('partition %s (%s) on device %s was ntfs formatted' % (cand_part, cand_path, devpath)) - try: - file_count = util.mount_cb(cand_path, count_files, mtype="ntfs", - update_env_for_mount={'LANG': 'C'}) - except util.MountFailedError as e: - if "unknown filesystem type 'ntfs'" in str(e): - return True, (bmsg + ' but this system cannot mount NTFS,' - ' assuming there are no important files.' - ' Formatting allowed.') - return False, bmsg + ' but mount of %s failed: %s' % (cand_part, e) - - if file_count != 0: - LOG.warning("it looks like you're using NTFS on the ephemeral disk, " - 'to ensure that filesystem does not get wiped, set ' - '%s.%s in config', '.'.join(DS_CFG_PATH), - DS_CFG_KEY_PRESERVE_NTFS) - return False, bmsg + ' but had %d files on it.' % file_count + + with events.ReportEventStack( + name="mount-ntfs-and-count", + description="mount-ntfs-and-count", + parent=azure_ds_reporter) as evt: + try: + file_count = util.mount_cb(cand_path, count_files, mtype="ntfs", + update_env_for_mount={'LANG': 'C'}) + except util.MountFailedError as e: + evt.description = "cannot mount ntfs" + if "unknown filesystem type 'ntfs'" in str(e): + return True, (bmsg + ' but this system cannot mount NTFS,' + ' assuming there are no important files.' + ' Formatting allowed.') + return False, bmsg + ' but mount of %s failed: %s' % (cand_part, e) + + if file_count != 0: + evt.description = "mounted and counted %d files" % file_count + LOG.warning("it looks like you're using NTFS on the ephemeral" + " disk, to ensure that filesystem does not get wiped," + " set %s.%s in config", '.'.join(DS_CFG_PATH), + DS_CFG_KEY_PRESERVE_NTFS) + return False, bmsg + ' but had %d files on it.' % file_count return True, bmsg + ' and had no important files. Safe for reformatting.' +@azure_ds_telemetry_reporter def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120, is_new_instance=False, preserve_ntfs=False): # wait for ephemeral disk to come up naplen = .2 - missing = util.wait_for_files([devpath], maxwait=maxwait, naplen=naplen, - log_pre="Azure ephemeral disk: ") - - if missing: - LOG.warning("ephemeral device '%s' did not appear after %d seconds.", - devpath, maxwait) - return + with events.ReportEventStack( + name="wait-for-ephemeral-disk", + description="wait for ephemeral disk", + parent=azure_ds_reporter): + missing = util.wait_for_files([devpath], + maxwait=maxwait, + naplen=naplen, + log_pre="Azure ephemeral disk: ") + + if missing: + LOG.warning("ephemeral device '%s' did" + " not appear after %d seconds.", + devpath, maxwait) + return result = False msg = None @@ -808,6 +846,7 @@ def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120, return +@azure_ds_telemetry_reporter def perform_hostname_bounce(hostname, cfg, prev_hostname): # set the hostname to 'hostname' if it is not already set to that. # then, if policy is not off, bounce the interface using command @@ -843,6 +882,7 @@ def perform_hostname_bounce(hostname, cfg, prev_hostname): return True +@azure_ds_telemetry_reporter def crtfile_to_pubkey(fname, data=None): pipeline = ('openssl x509 -noout -pubkey < "$0" |' 'ssh-keygen -i -m PKCS8 -f /dev/stdin') @@ -851,6 +891,7 @@ def crtfile_to_pubkey(fname, data=None): return out.rstrip() +@azure_ds_telemetry_reporter def pubkeys_from_crt_files(flist): pubkeys = [] errors = [] @@ -866,6 +907,7 @@ def pubkeys_from_crt_files(flist): return pubkeys +@azure_ds_telemetry_reporter def write_files(datadir, files, dirmode=None): def _redact_password(cnt, fname): @@ -893,6 +935,7 @@ def write_files(datadir, files, dirmode=None): util.write_file(filename=fname, content=content, mode=0o600) +@azure_ds_telemetry_reporter def invoke_agent(cmd): # this is a function itself to simplify patching it for test if cmd: @@ -912,6 +955,7 @@ def find_child(node, filter_func): return ret +@azure_ds_telemetry_reporter def load_azure_ovf_pubkeys(sshnode): # This parses a 'SSH' node formatted like below, and returns # an array of dicts. @@ -964,6 +1008,7 @@ def load_azure_ovf_pubkeys(sshnode): return found +@azure_ds_telemetry_reporter def read_azure_ovf(contents): try: dom = minidom.parseString(contents) @@ -1064,6 +1109,7 @@ def read_azure_ovf(contents): return (md, ud, cfg) +@azure_ds_telemetry_reporter def _extract_preprovisioned_vm_setting(dom): """Read the preprovision flag from the ovf. It should not exist unless true.""" @@ -1092,6 +1138,7 @@ def encrypt_pass(password, salt_id="$6$"): return crypt.crypt(password, salt_id + util.rand_str(strlen=16)) +@azure_ds_telemetry_reporter def _check_freebsd_cdrom(cdrom_dev): """Return boolean indicating path to cdrom device has content.""" try: @@ -1103,6 +1150,7 @@ def _check_freebsd_cdrom(cdrom_dev): return False +@azure_ds_telemetry_reporter def _get_random_seed(source=PLATFORM_ENTROPY_SOURCE): """Return content random seed file if available, otherwise, return None.""" @@ -1126,6 +1174,7 @@ def _get_random_seed(source=PLATFORM_ENTROPY_SOURCE): return seed +@azure_ds_telemetry_reporter def list_possible_azure_ds_devs(): devlist = [] if util.is_FreeBSD(): @@ -1140,6 +1189,7 @@ def list_possible_azure_ds_devs(): return devlist +@azure_ds_telemetry_reporter def load_azure_ds_dir(source_dir): ovf_file = os.path.join(source_dir, "ovf-env.xml") @@ -1162,47 +1212,54 @@ def parse_network_config(imds_metadata): @param: imds_metadata: Dict of content read from IMDS network service. @return: Dictionary containing network version 2 standard configuration. """ - if imds_metadata != sources.UNSET and imds_metadata: - netconfig = {'version': 2, 'ethernets': {}} - LOG.debug('Azure: generating network configuration from IMDS') - network_metadata = imds_metadata['network'] - for idx, intf in enumerate(network_metadata['interface']): - nicname = 'eth{idx}'.format(idx=idx) - dev_config = {} - for addr4 in intf['ipv4']['ipAddress']: - privateIpv4 = addr4['privateIpAddress'] - if privateIpv4: - if dev_config.get('dhcp4', False): - # Append static address config for nic > 1 - netPrefix = intf['ipv4']['subnet'][0].get( - 'prefix', '24') - if not dev_config.get('addresses'): - dev_config['addresses'] = [] - dev_config['addresses'].append( - '{ip}/{prefix}'.format( - ip=privateIpv4, prefix=netPrefix)) - else: - dev_config['dhcp4'] = True - for addr6 in intf['ipv6']['ipAddress']: - privateIpv6 = addr6['privateIpAddress'] - if privateIpv6: - dev_config['dhcp6'] = True - break - if dev_config: - mac = ':'.join(re.findall(r'..', intf['macAddress'])) - dev_config.update( - {'match': {'macaddress': mac.lower()}, - 'set-name': nicname}) - netconfig['ethernets'][nicname] = dev_config - else: - blacklist = ['mlx4_core'] - LOG.debug('Azure: generating fallback configuration') - # generate a network config, blacklist picking mlx4_core devs - netconfig = net.generate_fallback_config( - blacklist_drivers=blacklist, config_driver=True) - return netconfig + with events.ReportEventStack( + name="parse_network_config", + description="", + parent=azure_ds_reporter) as evt: + if imds_metadata != sources.UNSET and imds_metadata: + netconfig = {'version': 2, 'ethernets': {}} + LOG.debug('Azure: generating network configuration from IMDS') + network_metadata = imds_metadata['network'] + for idx, intf in enumerate(network_metadata['interface']): + nicname = 'eth{idx}'.format(idx=idx) + dev_config = {} + for addr4 in intf['ipv4']['ipAddress']: + privateIpv4 = addr4['privateIpAddress'] + if privateIpv4: + if dev_config.get('dhcp4', False): + # Append static address config for nic > 1 + netPrefix = intf['ipv4']['subnet'][0].get( + 'prefix', '24') + if not dev_config.get('addresses'): + dev_config['addresses'] = [] + dev_config['addresses'].append( + '{ip}/{prefix}'.format( + ip=privateIpv4, prefix=netPrefix)) + else: + dev_config['dhcp4'] = True + for addr6 in intf['ipv6']['ipAddress']: + privateIpv6 = addr6['privateIpAddress'] + if privateIpv6: + dev_config['dhcp6'] = True + break + if dev_config: + mac = ':'.join(re.findall(r'..', intf['macAddress'])) + dev_config.update( + {'match': {'macaddress': mac.lower()}, + 'set-name': nicname}) + netconfig['ethernets'][nicname] = dev_config + evt.description = "network config from imds" + else: + blacklist = ['mlx4_core'] + LOG.debug('Azure: generating fallback configuration') + # generate a network config, blacklist picking mlx4_core devs + netconfig = net.generate_fallback_config( + blacklist_drivers=blacklist, config_driver=True) + evt.description = "network config from fallback" + return netconfig +@azure_ds_telemetry_reporter def get_metadata_from_imds(fallback_nic, retries): """Query Azure's network metadata service, returning a dictionary. @@ -1227,6 +1284,7 @@ def get_metadata_from_imds(fallback_nic, retries): return util.log_time(**kwargs) +@azure_ds_telemetry_reporter def _get_metadata_from_imds(retries): url = IMDS_URL + "instance?api-version=2017-12-01" @@ -1246,6 +1304,7 @@ def _get_metadata_from_imds(retries): return {} +@azure_ds_telemetry_reporter def maybe_remove_ubuntu_network_config_scripts(paths=None): """Remove Azure-specific ubuntu network config for non-primary nics. @@ -1283,14 +1342,20 @@ def maybe_remove_ubuntu_network_config_scripts(paths=None): def _is_platform_viable(seed_dir): - """Check platform environment to report if this datasource may run.""" - asset_tag = util.read_dmi_data('chassis-asset-tag') - if asset_tag == AZURE_CHASSIS_ASSET_TAG: - return True - LOG.debug("Non-Azure DMI asset tag '%s' discovered.", asset_tag) - if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')): - return True - return False + with events.ReportEventStack( + name="check-platform-viability", + description="found azure asset tag", + parent=azure_ds_reporter) as evt: + + """Check platform environment to report if this datasource may run.""" + asset_tag = util.read_dmi_data('chassis-asset-tag') + if asset_tag == AZURE_CHASSIS_ASSET_TAG: + return True + LOG.debug("Non-Azure DMI asset tag '%s' discovered.", asset_tag) + evt.description = "Non-Azure DMI asset tag '%s' discovered.", asset_tag + if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')): + return True + return False class BrokenAzureDataSource(Exception): diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py old mode 100644 new mode 100755 index 2829dd2..d3af05e --- a/cloudinit/sources/helpers/azure.py +++ b/cloudinit/sources/helpers/azure.py @@ -16,10 +16,27 @@ from xml.etree import ElementTree from cloudinit import url_helper from cloudinit import util +from cloudinit.reporting import events LOG = logging.getLogger(__name__) +azure_ds_reporter = events.ReportEventStack( + name="azure-ds", + description="initialize reporter for azure ds", + reporting_enabled=True) + + +def azure_ds_telemetry_reporter(func): + def impl(*args, **kwargs): + with events.ReportEventStack( + name=func.__name__, + description=func.__name__, + parent=azure_ds_reporter): + return func(*args, **kwargs) + return impl + + @contextmanager def cd(newdir): prevdir = os.getcwd() @@ -119,6 +136,7 @@ class OpenSSLManager(object): def clean_up(self): util.del_dir(self.tmpdir) + @azure_ds_telemetry_reporter def generate_certificate(self): LOG.debug('Generating certificate for communication with fabric...') if self.certificate is not None: @@ -139,17 +157,20 @@ class OpenSSLManager(object): LOG.debug('New certificate generated.') @staticmethod + @azure_ds_telemetry_reporter def _run_x509_action(action, cert): cmd = ['openssl', 'x509', '-noout', action] result, _ = util.subp(cmd, data=cert) return result + @azure_ds_telemetry_reporter def _get_ssh_key_from_cert(self, certificate): pub_key = self._run_x509_action('-pubkey', certificate) keygen_cmd = ['ssh-keygen', '-i', '-m', 'PKCS8', '-f', '/dev/stdin'] ssh_key, _ = util.subp(keygen_cmd, data=pub_key) return ssh_key + @azure_ds_telemetry_reporter def _get_fingerprint_from_cert(self, certificate): """openssl x509 formats fingerprints as so: 'SHA1 Fingerprint=07:3E:19:D1:4D:1C:79:92:24:C6:A0:FD:8D:DA:\ @@ -163,6 +184,7 @@ class OpenSSLManager(object): octets = raw_fp[eq+1:-1].split(':') return ''.join(octets) + @azure_ds_telemetry_reporter def _decrypt_certs_from_xml(self, certificates_xml): """Decrypt the certificates XML document using the our private key; return the list of certs and private keys contained in the doc. @@ -185,6 +207,7 @@ class OpenSSLManager(object): shell=True, data=b'\n'.join(lines)) return out + @azure_ds_telemetry_reporter def parse_certificates(self, certificates_xml): """Given the Certificates XML document, return a dictionary of fingerprints and associated SSH keys derived from the certs.""" @@ -265,11 +288,13 @@ class WALinuxAgentShim(object): return socket.inet_ntoa(packed_bytes) @staticmethod + @azure_ds_telemetry_reporter def _networkd_get_value_from_leases(leases_d=None): return dhcp.networkd_get_option_from_leases( 'OPTION_245', leases_d=leases_d) @staticmethod + @azure_ds_telemetry_reporter def _get_value_from_leases_file(fallback_lease_file): leases = [] content = util.load_file(fallback_lease_file) @@ -287,6 +312,7 @@ class WALinuxAgentShim(object): return leases[-1] @staticmethod + @azure_ds_telemetry_reporter def _load_dhclient_json(): dhcp_options = {} hooks_dir = WALinuxAgentShim._get_hooks_dir() @@ -305,6 +331,7 @@ class WALinuxAgentShim(object): return dhcp_options @staticmethod + @azure_ds_telemetry_reporter def _get_value_from_dhcpoptions(dhcp_options): if dhcp_options is None: return None @@ -318,6 +345,7 @@ class WALinuxAgentShim(object): return _value @staticmethod + @azure_ds_telemetry_reporter def find_endpoint(fallback_lease_file=None, dhcp245=None): value = None if dhcp245 is not None: @@ -352,6 +380,7 @@ class WALinuxAgentShim(object): LOG.debug('Azure endpoint found at %s', endpoint_ip_address) return endpoint_ip_address + @azure_ds_telemetry_reporter def register_with_azure_and_fetch_data(self, pubkey_info=None): if self.openssl_manager is None: self.openssl_manager = OpenSSLManager() @@ -404,6 +433,7 @@ class WALinuxAgentShim(object): return keys + @azure_ds_telemetry_reporter def _report_ready(self, goal_state, http_client): LOG.debug('Reporting ready to Azure fabric.') document = self.REPORT_READY_XML_TEMPLATE.format( @@ -419,6 +449,7 @@ class WALinuxAgentShim(object): LOG.info('Reported ready to Azure fabric.') +@azure_ds_telemetry_reporter def get_metadata_from_fabric(fallback_lease_file=None, dhcp_opts=None, pubkey_info=None): shim = WALinuxAgentShim(fallback_lease_file=fallback_lease_file, -- 1.8.3.1