Blame SOURCES/pyproject_save_files.py

838e4d
import argparse
838e4d
import fnmatch
838e4d
import json
838e4d
import os
838e4d
838e4d
from collections import defaultdict
838e4d
from pathlib import PosixPath, PurePosixPath
838e4d
from importlib.metadata import Distribution
838e4d
838e4d
838e4d
# Characters that RPM treats as separators between entries of a %files line.
# Taken from the delim argument of strtokWithQuotes in RPM's build/files.c.
RPM_FILES_DELIMETERS = ' \n\t'
838e4d
838e4d
838e4d
class BuildrootPath(PurePosixPath):
    """
    This path represents a path in a buildroot.
    When absolute, it is "relative" to a buildroot.

    E.g. /usr/lib means %{buildroot}/usr/lib
    The object carries no buildroot information.
    """

    @staticmethod
    def from_real(realpath, *, root):
        """
        For a given real disk path, return a BuildrootPath in the given root.

        realpath must be located inside root, otherwise relative_to() raises
        ValueError.

        For example::

            >>> BuildrootPath.from_real(PosixPath('/tmp/buildroot/foo'), root=PosixPath('/tmp/buildroot'))
            BuildrootPath('/foo')
        """
        # Strip the buildroot prefix and re-anchor the remainder at "/"
        return BuildrootPath("/") / realpath.relative_to(root)

    def to_real(self, root):
        """
        Return a real PosixPath in the given root

        The BuildrootPath must be absolute, otherwise relative_to("/") raises
        ValueError.

        For example::

            >>> BuildrootPath('/foo').to_real(PosixPath('/tmp/buildroot'))
            PosixPath('/tmp/buildroot/foo')
        """
        # Drop the leading "/" so the join does not discard root
        return root / self.relative_to("/")

    def normpath(self):
        """
        Normalize all the potential /../ parts of the path without touching real files.

        PurePaths don't have .resolve().
        Paths have .resolve() but it touches real files.
        This is an alternative. It assumes there are no symbolic links.

        Example:

            >>> BuildrootPath('/usr/lib/python/../pypy').normpath()
            BuildrootPath('/usr/lib/pypy')
        """
        # type(self) keeps the result a BuildrootPath (or a subclass of it)
        return type(self)(os.path.normpath(self))
838e4d
838e4d
838e4d
def pycache_dir(script):
    """
    For a script BuildrootPath, return a BuildrootPath of its __pycache__ directory.

    The directory is the "__pycache__" sibling of the script, i.e. it lives
    in the script's parent directory.

    Example:

        >>> pycache_dir(BuildrootPath('/whatever/bar.py'))
        BuildrootPath('/whatever/__pycache__')

        >>> pycache_dir(BuildrootPath('/opt/python3.10/foo.py'))
        BuildrootPath('/opt/python3.10/__pycache__')
    """
    return script.parent / "__pycache__"
838e4d
838e4d
838e4d
def pycached(script, python_version):
    """
    For a script BuildrootPath, return a list with that path and its bytecode glob.
    Like the %pycached macro.

    The glob is represented as a BuildrootPath.

    Only the first two components of python_version are used (e.g. '3.10'
    becomes '310'). The script must have the .py suffix.

    Examples:

        >>> pycached(BuildrootPath('/whatever/bar.py'), '3.8')
        [BuildrootPath('/whatever/bar.py'), BuildrootPath('/whatever/__pycache__/bar.cpython-38{,.opt-?}.pyc')]

        >>> pycached(BuildrootPath('/opt/python3.10/foo.py'), '3.10')
        [BuildrootPath('/opt/python3.10/foo.py'), BuildrootPath('/opt/python3.10/__pycache__/foo.cpython-310{,.opt-?}.pyc')]
    """
    assert script.suffix == ".py"
    # "3.10" -> "310"; anything after major.minor is ignored
    pyver = "".join(python_version.split(".")[:2])
    # The doubled braces produce a literal {,.opt-?} glob in the f-string
    pycname = f"{script.stem}.cpython-{pyver}{{,.opt-?}}.pyc"
    pyc = pycache_dir(script) / pycname
    return [script, pyc]
838e4d
838e4d
838e4d
def add_file_to_module(paths, module_name, module_type, files_dirs, *files):
    """
    Helper procedure, adds given files to the module_name of a given module_type

    paths["modules"][module_name] is searched for an entry of module_type.
    If one exists, the files are appended to its files_dirs list ("files" or
    "dirs"), unless the first given file is already listed there (only the
    first file is checked). If none exists, a new entry is created.
    """
    for module in paths["modules"][module_name]:
        if module["type"] == module_type:
            # Guard the index so that a call without any files is a no-op
            # instead of an IndexError
            if files and files[0] not in module[files_dirs]:
                module[files_dirs].extend(files)
            break
    else:
        # No entry of this type yet; files_dirs overrides one of the empty lists
        paths["modules"][module_name].append(
            {"type": module_type, "files": [], "dirs": [], files_dirs: list(files)}
        )
838e4d
838e4d
838e4d
def add_py_file_to_module(paths, module_name, module_type, path, python_version,
                          *, include_pycache_dir):
    """
    Helper procedure, adds given .py file to the module_name of a given module_type
    Always also adds the bytecode cache.
    If include_pycache_dir is set, also include the __pycache__ directory.
    """
    # pycached() yields the .py file itself plus the glob of its .pyc files
    add_file_to_module(paths, module_name, module_type, "files", *pycached(path, python_version))
    if include_pycache_dir:
        add_file_to_module(paths, module_name, module_type, "dirs", pycache_dir(path))
838e4d
838e4d
838e4d
def add_lang_to_module(paths, module_name, path):
    """
    Helper procedure, divides lang files by language and adds them to the module_name

    The language is taken from the name of the directory directly below
    the closest parent directory called "locale".

    Returns True if the language code detection was successful
    """
    for i, parent in enumerate(path.parents):
        # i > 0: the file's own directory cannot be the "locale" directory,
        # there must be a language directory in between
        if i > 0 and parent.name == 'locale':
            lang_country_code = path.parents[i-1].name
            break
    else:
        return False
    # convert potential en_US to plain en
    lang_code = lang_country_code.partition('_')[0]
    paths["lang"].setdefault(module_name, defaultdict(list))[lang_code].append(path)
    return True
838e4d
838e4d
838e4d
def classify_paths(
    record_path, parsed_record_content, metadata, sitedirs, python_version
):
    """
    For each BuildrootPath in parsed_record_content classify it to a dict structure
    that allows to filter the files for the %files section easier.

    For the dict structure, look at the beginning of this function's code.

    Each "module" is a dict with "type" ("package", "script", "extension"), and "files" and "dirs".

    record_path is the BuildrootPath of the RECORD file; its parent is taken
    as the dist-info directory. metadata only needs to provide
    get_all('License-File'). sitedirs are tried in the given order and the
    first one that contains the path wins.
    """
    distinfo = record_path.parent
    paths = {
        "metadata": {
            "files": [],  # regular %file entries with dist-info content
            "dirs": [distinfo],  # %dir %file entries with dist-info directory
            "docs": [],  # to be used once there is upstream way to recognize READMEs
            "licenses": [],  # %license entries parsed from dist-info METADATA file
        },
        "lang": {},  # %lang entries: [module_name or None][language_code] lists of .mo files
        "modules": defaultdict(list),  # each importable module (directory, .py, .so)
        "other": {"files": []},  # regular %file entries we could not parse :(
    }

    # In RECORDs generated by pip, there are no directories, only files.
    # The example RECORD from PEP 376 does not contain directories either.
    # Hence, we'll only assume files, but TODO get it officially documented.
    license_files = metadata.get_all('License-File')
    for path in parsed_record_content:
        if path.suffix == ".pyc":
            # we handle bytecode separately
            continue

        if path.parent == distinfo:
            if path.name in ("RECORD", "REQUESTED"):
                # RECORD and REQUESTED files are removed in %pyproject_install
                # See PEP 627
                continue
            # license_files may be None when there is no License-File field
            if license_files and path.name in license_files:
                paths["metadata"]["licenses"].append(path)
            else:
                paths["metadata"]["files"].append(path)
            continue

        for sitedir in sitedirs:
            if sitedir in path.parents:
                if path.parent == sitedir:
                    if path.suffix == ".so":
                        # extension modules can have 2 suffixes
                        name = BuildrootPath(path.stem).stem
                        add_file_to_module(paths, name, "extension", "files", path)
                    elif path.suffix == ".py":
                        name = path.stem
                        # we add the .pyc files, but not top-level __pycache__
                        add_py_file_to_module(
                            paths, name, "script", path, python_version,
                            include_pycache_dir=False
                        )
                    else:
                        paths["other"]["files"].append(path)
                else:
                    # this file is inside a dir, we add all dirs upwards until sitedir
                    index = path.parents.index(sitedir)
                    # the directory directly below sitedir names the module
                    module_dir = path.parents[index - 1]
                    for parent in list(path.parents)[:index]:  # no direct slice until Python 3.10
                        add_file_to_module(paths, module_dir.name, "package", "dirs", parent)
                    is_lang = False
                    if path.suffix == ".mo":
                        is_lang = add_lang_to_module(paths, module_dir.name, path)
                    if not is_lang:
                        if path.suffix == ".py":
                            # we add the .pyc files, and their __pycache__
                            add_py_file_to_module(
                                paths, module_dir.name, "package", path, python_version,
                                include_pycache_dir=True
                            )
                        else:
                            add_file_to_module(paths, module_dir.name, "package", "files", path)
                break
        else:
            # The path is not inside any sitedir. A .mo file is recorded as
            # a module-less %lang entry when its language can be detected,
            # everything else ends up in "other".
            if path.suffix != ".mo" or not add_lang_to_module(paths, None, path):
                paths["other"]["files"].append(path)

    return paths
838e4d
838e4d
838e4d
def escape_rpm_path(path):
    """
    Escape special characters in string-paths or BuildrootPaths

    E.g. a space in path otherwise makes RPM think it's multiple paths,
    unless we put it in "quotes".
    Or a literal % symbol in path might be expanded as a macro if not escaped.

    Due to limitations in RPM, paths with spaces and double quotes are not supported.

    Always returns a string. Raises NotImplementedError for a path that
    contains both a delimiter (space, newline, tab) and a double quote.

    Examples:

        >>> escape_rpm_path(BuildrootPath('/usr/lib/python3.9/site-packages/setuptools'))
        '/usr/lib/python3.9/site-packages/setuptools'

        >>> escape_rpm_path('/usr/lib/python3.9/site-packages/setuptools/script (dev).tmpl')
        '"/usr/lib/python3.9/site-packages/setuptools/script (dev).tmpl"'

        >>> escape_rpm_path('/usr/share/data/100%valid.path')
        '/usr/share/data/100%%%%%%%%valid.path'

        >>> escape_rpm_path('/usr/share/data/100 % valid.path')
        '"/usr/share/data/100 %%%%%%%% valid.path"'

        >>> escape_rpm_path('/usr/share/data/1000 %% valid.path')
        '"/usr/share/data/1000 %%%%%%%%%%%%%%%% valid.path"'

        >>> escape_rpm_path('/usr/share/data/spaces and "quotes"')
        Traceback (most recent call last):
          ...
        NotImplementedError: ...
    """
    # orig_path is kept unescaped for the error message below
    orig_path = path = str(path)
    if "%" in path:
        # Escaping by 8 %s has been verified in RPM 4.16 and 4.17, but probably not stable
        # See this thread http://lists.rpm.org/pipermail/rpm-list/2021-June/002048.html
        # On the CI, we build tests/escape_percentages.spec to verify this assumption
        path = path.replace("%", "%" * 8)
    if any(symbol in path for symbol in RPM_FILES_DELIMETERS):
        if '"' in path:
            # As far as we know, RPM cannot list such file individually
            # See this thread http://lists.rpm.org/pipermail/rpm-list/2021-June/002048.html
            raise NotImplementedError(f'" symbol in path with spaces is not supported by %pyproject_save_files: {orig_path!r}')
        return f'"{path}"'
    return path
838e4d
838e4d
838e4d
def generate_file_list(paths_dict, module_globs, include_others=False):
    """
    This function takes the classified paths_dict and turns it into lines
    for the %files section. Returns list with text lines, no Path objects.

    Only includes files from modules that match module_globs, metadata and
    optionally all other files.

    module_globs may be any iterable of glob strings; duplicates are ignored.
    The returned list is sorted and contains no duplicate lines.

    It asserts that all globs match at least one module, raises ValueError otherwise.
    Multiple globs matching identical module(s) are OK.
    """
    # Materialize once: the globs are iterated below and also compared as a set
    module_globs = set(module_globs)
    lang_by_module = paths_dict.get("lang", {})

    def lang_lines(module_name):
        # %lang lines of one module (None means files outside any module);
        # a module without any language files yields nothing
        by_language = lang_by_module.get(module_name, {})
        return [
            f"%lang({lang_code}) {escape_rpm_path(p)}"
            for lang_code, lang_paths in by_language.items()
            for p in lang_paths
        ]

    files = set()

    if include_others:
        files.update(f"{escape_rpm_path(p)}" for p in paths_dict["other"]["files"])
        files.update(lang_lines(None))

    files.update(f"{escape_rpm_path(p)}" for p in paths_dict["metadata"]["files"])
    for macro in "dir", "doc", "license":
        files.update(f"%{macro} {escape_rpm_path(p)}" for p in paths_dict["metadata"][f"{macro}s"])

    modules = paths_dict["modules"]
    done_modules = set()
    done_globs = set()

    for glob in module_globs:
        for name in modules:
            if fnmatch.fnmatchcase(name, glob):
                # a module matched by several globs is only processed once
                if name not in done_modules:
                    files.update(lang_lines(name))
                    for module in modules[name]:
                        files.update(f"%dir {escape_rpm_path(p)}" for p in module["dirs"])
                        files.update(f"{escape_rpm_path(p)}" for p in module["files"])
                    done_modules.add(name)
                done_globs.add(glob)

    missed = module_globs - done_globs
    if missed:
        missed_text = ", ".join(sorted(missed))
        raise ValueError(f"Globs did not match any module: {missed_text}")

    return sorted(files)
838e4d
838e4d
838e4d
def parse_varargs(varargs):
    """
    Parse varargs from the %pyproject_save_files macro

    Arguments starting with + are treated as a flags, everything else is a glob

    Returns as set of globs, boolean flag whether to include all the other files

    Raises ValueError for unknown flags and globs with dots (namespace packages).
    The arguments are checked in order, the first invalid one is reported.

    Good examples:

        >>> parse_varargs(['*'])
        ({'*'}, False)

        >>> mods, auto = parse_varargs(['requests*', 'kerberos', '+auto'])
        >>> auto
        True
        >>> sorted(mods)
        ['kerberos', 'requests*']

        >>> mods, auto = parse_varargs(['tldr', 'tensorf*'])
        >>> auto
        False
        >>> sorted(mods)
        ['tensorf*', 'tldr']

        >>> parse_varargs(['+auto'])
        (set(), True)

    Bad examples:

        >>> parse_varargs(['+kinkdir'])
        Traceback (most recent call last):
          ...
        ValueError: Invalid argument: +kinkdir

        >>> parse_varargs(['good', '+bad', '*ugly*'])
        Traceback (most recent call last):
          ...
        ValueError: Invalid argument: +bad

        >>> parse_varargs(['+bad', 'my.bad'])
        Traceback (most recent call last):
          ...
        ValueError: Invalid argument: +bad

        >>> parse_varargs(['mod', 'mod.*'])
        Traceback (most recent call last):
          ...
        ValueError: Attempted to use a namespaced package with dot in the glob: mod.*. ...

        >>> parse_varargs(['my.bad', '+bad'])
        Traceback (most recent call last):
          ...
        ValueError: Attempted to use a namespaced package with dot in the glob: my.bad. ...
    """
    include_auto = False
    globs = set()

    for arg in varargs:
        if arg.startswith("+"):
            # +auto is the only known flag
            if arg == "+auto":
                include_auto = True
            else:
                raise ValueError(f"Invalid argument: {arg}")
        elif "." in arg:
            # suggest the part before the first dot as the replacement glob
            top, *_ = arg.partition(".")
            msg = (
                f"Attempted to use a namespaced package with dot in the glob: {arg}. "
                f"That is not (yet) supported. Use {top} instead and/or file a Bugzilla explaining your use case."
            )
            raise ValueError(msg)
        else:
            globs.add(arg)

    return globs, include_auto
838e4d
838e4d
838e4d
def load_parsed_record(pyproject_record):
    """
    Load the JSON file at pyproject_record and convert its content to BuildrootPaths

    The JSON is expected to be an object mapping a RECORD file path to a list
    of file paths. Returns a dict with the same structure where every path is
    a BuildrootPath.

    Raises FileExistsError when the JSON contains more than one RECORD.
    """
    parsed_record = {}
    # The encoding is explicit so that reading does not depend on the locale
    with open(pyproject_record, encoding="utf-8") as pyproject_record_file:
        content = json.load(pyproject_record_file)

    if len(content) > 1:
        raise FileExistsError("%pyproject install has found more than one *.dist-info/RECORD file. "
                              "Currently, %pyproject_save_files supports only one wheel → one file list mapping. "
                              "Feel free to open a bugzilla for pyproject-rpm-macros and describe your usecase.")

    # Redefine strings stored in JSON to BuildRootPaths
    for record_path, files in content.items():
        parsed_record[BuildrootPath(record_path)] = [BuildrootPath(f) for f in files]

    return parsed_record
838e4d
838e4d
838e4d
def dist_metadata(buildroot, record_path):
    """
    Returns distribution metadata (email.message.EmailMessage), possibly empty

    buildroot is the real path of the buildroot, record_path is the
    BuildrootPath of the RECORD file; the metadata is read from the
    directory that contains it.
    """
    real_dist_path = record_path.parent.to_real(buildroot)
    dist = Distribution.at(real_dist_path)
    return dist.metadata
838e4d
838e4d
def pyproject_save_files(buildroot, sitelib, sitearch, python_version, pyproject_record, varargs):
    """
    Takes arguments from the %{pyproject_save_files} macro

    Returns list of paths for the %files section
    """
    # On 32 bit architectures, sitelib equals to sitearch
    # This saves us browsing one directory twice
    sitedirs = sorted({sitelib, sitearch})

    globs, include_auto = parse_varargs(varargs)
    parsed_records = load_parsed_record(pyproject_record)

    final_file_list = []

    # Classify the content of each RECORD and collect the resulting lines
    for record_path, files in parsed_records.items():
        metadata = dist_metadata(buildroot, record_path)
        paths_dict = classify_paths(
            record_path, files, metadata, sitedirs, python_version
        )

        final_file_list.extend(
            generate_file_list(paths_dict, globs, include_auto)
        )

    return final_file_list
838e4d
838e4d
838e4d
def main(cli_args):
    """
    Generate the %files lines from the parsed command line arguments
    and write them to cli_args.output, one per line, encoded as UTF-8.
    """
    file_section = pyproject_save_files(
        cli_args.buildroot,
        cli_args.sitelib,
        cli_args.sitearch,
        cli_args.python_version,
        cli_args.pyproject_record,
        cli_args.varargs,
    )

    # The output always ends with a newline, even when there are no lines
    cli_args.output.write_text("\n".join(file_section) + "\n", encoding="utf-8")
838e4d
838e4d
838e4d
def argparser():
    """
    Return the argparse.ArgumentParser for the command line interface

    All the options are required; at least one positional argument
    (glob or +flag) must be given, they are collected as varargs.
    """
    parser = argparse.ArgumentParser()
    # A dedicated group, so that --help lists the options as required
    r = parser.add_argument_group("required arguments")
    r.add_argument("--output", type=PosixPath, required=True)
    r.add_argument("--buildroot", type=PosixPath, required=True)
    r.add_argument("--sitelib", type=BuildrootPath, required=True)
    r.add_argument("--sitearch", type=BuildrootPath, required=True)
    r.add_argument("--python-version", type=str, required=True)
    r.add_argument("--pyproject-record", type=PosixPath, required=True)
    parser.add_argument("varargs", nargs="+")
    return parser
838e4d
838e4d
838e4d
if __name__ == "__main__":
    # Script entry point: parse the command line and write the file list
    cli_args = argparser().parse_args()
    main(cli_args)