"""Functions that are run from the CLI to create, or edit a data package.
"""
from datetime import datetime
from itertools import chain
from pathlib import Path
import sys
from typing import Any, Dict, Iterable, List
from frictionless import Package
from glom import glom, Iter
from friendly_data import logger_config
from friendly_data._types import _license_t, _path_t
from friendly_data.converters import to_df
from friendly_data.dpkg import entry_from_res
from friendly_data.dpkg import idxpath_from_pkgpath
from friendly_data.dpkg import pkg_from_files
from friendly_data.dpkg import pkgindex
from friendly_data.dpkg import read_pkg
from friendly_data.dpkg import set_idxcols
from friendly_data.dpkg import write_pkg
from friendly_data.helpers import consume, filter_dict
from friendly_data.helpers import is_windows
from friendly_data.helpers import sanitise
from friendly_data.io import copy_files
from friendly_data.io import dwim_file
from friendly_data.io import path_not_in
from friendly_data.io import outoftree_paths
from friendly_data.metatools import _fetch_license
from friendly_data.metatools import check_license
from friendly_data.metatools import get_license
from friendly_data.metatools import lic_metadata
from friendly_data.metatools import resolve_licenses
from friendly_data.registry import config_ctx
from friendly_data.doc import get_template, page
logger = logger_config(fmt="{name}: {levelname}: {message}")
def list_licenses() -> str:
    """Return an ASCII table of commonly used licenses

    NOTE: for Python API users, not to be confused with
    :func:`metatools.list_licenses`.

    Returns
    -------
    str
        ASCII table with commonly used licenses

    """
    from tabulate import tabulate

    columns = ("domain", "id", "maintainer", "title")
    rows = lic_metadata(columns)
    return tabulate(rows, headers="keys")
def license_info(lic: str) -> Dict:
    """Give detailed metadata about a license

    Parameters
    ----------
    lic : str
        License ID as listed in the output of ``friendly_data list-licenses``

    Returns
    -------
    Dict
        License metadata

    """
    fields = ("domain", "id", "maintainer", "title", "url")
    matches = lic_metadata(fields, lambda entry: entry["id"] == lic)
    if matches:
        return matches[0]
    # unknown license id: report and abort the CLI
    logger.error(f"no matching license with id: {lic}")
    sys.exit(1)
def license_prompt() -> _license_t:  # pragma: no cover, interactive function
    """Prompt for a license on the terminal (with completion)."""
    licenses = _fetch_license("all")

    def complete(text, state):
        # readline completer protocol: return the state-th license id
        # matching the typed prefix, or None when exhausted
        matching = (lic for lic in licenses if lic.startswith(text))
        for pos, candidate in enumerate(matching):
            if pos == state:
                return candidate
        return None

    if not is_windows():
        import readline

        readline.parse_and_bind("tab: complete")
        readline.set_completer(complete)
    return check_license(licenses[input("license: ")])
def _metadata(
    mandatory: List[str],
    *,
    name: str = "",
    title: str = "",
    licenses: str = "",
    description: str = "",
    keywords: str = "",
    config: _path_t = "",
) -> Dict:
    """Metadata from the config file is overriden by keyword arguments"""
    meta: Dict[str, Any] = {}
    if config:
        try:
            meta = dwim_file(config)["metadata"]  # type: ignore[call-overload]
        except KeyError as err:
            logger.warning(f"{err}: section missing from {config}")
        else:
            meta = resolve_licenses(meta)
    overrides: Dict[str, Any] = {
        "name": name if name else sanitise(title),
        "title": title,
        "description": description,
        "keywords": keywords.split(),
    }
    if licenses:
        overrides["licenses"] = [get_license(licenses)]
    elif "licenses" in mandatory and "licenses" not in meta:
        overrides["licenses"] = [license_prompt()]  # pragma: no cover
    # flags override the config file; empty values are dropped
    meta.update({k: v for k, v in overrides.items() if v})
    missing = [k for k in mandatory if k not in meta]  # mandatory fields
    if missing:
        logger.error(f"{missing}: mandatory metadata missing")
        if "license" in meta:
            logger.error("'license': should be plural!")
        sys.exit(1)
    return meta
# TODO: ability to add datasets from arbitrary paths
# - flag to provide destination directory for out of tree datasets
# - normalise relative path w.r.t. index entries
# - for files not in the index, normalise relative path w.r.t. pkgdir
# add similar ability for update(..)
def _create(
    meta: Dict,
    pkgpath: _path_t,
    fpaths: Iterable[_path_t],
    *,
    export: _path_t,
) -> List[Path]:
    """Create a data package from ``meta`` and the files in ``fpaths``.

    When ``export`` is non-empty, the index file and all datasets (both
    those listed in the index and the extra ``fpaths``) are copied into
    the export directory first, and the package is created there;
    otherwise the package is created at ``pkgpath`` in place.
    Returns the list of files written by ``write_pkg``.
    """
    if export:
        pkgpath, export = Path(pkgpath), Path(export)
        # locate the index: pkgpath is either the package directory that
        # contains an index file, or the index file itself
        idxp = idxpath_from_pkgpath(pkgpath) if pkgpath.is_dir() else pkgpath
        # glom spec: resolve each index entry's "path" relative to the index
        spec = Iter("path").map(lambda p: idxp.parent / p)  # type: ignore[union-attr]
        if idxp:  # create a uniquified list of files
            files = chain(
                [idxp],
                set(chain(glom(pkgindex.from_file(idxp), spec), map(Path, fpaths))),
            )
        else:
            files = fpaths  # type: ignore[assignment]
        # NOTE: if idxpath was found, first of the returned files is the index
        # file that was copied in the export directory, extract it to pkgpath
        fpaths = copy_files(files, export, pkgpath)
        if idxp:
            pkgpath, *fpaths = fpaths
        else:  # if no index was found, set export directory to new pkgpath
            pkgpath = export
    pkgdir, pkg, _ = pkg_from_files(meta, pkgpath, fpaths)
    return write_pkg(pkg, pkgdir)
def create(
    idxpath: str,
    *fpaths: str,
    name: str = "",
    title: str = "",
    licenses: str = "",
    description: str = "",
    keywords: str = "",
    inplace: bool = False,
    export: str = "",
    config: str = "",
):
    """Create a package from an index file and other files

    Package metadata provided with command line flags override metadata from
    the config file.

    Parameters
    ----------
    idxpath : str
        Path to the index file or package directory with the index file. Note
        the index file has to be at the top level directory of the datapackage.
    fpaths : Tuple[str]
        List of datasets/resources not in the index. If any of them point to a
        dataset already present in the index, it is ignored.
    name : str
        Package name (no spaces or special characters)
    title : str
        Package title
    licenses : str
        License
    description : str
        Package description
    keywords : str
        A space separated list of keywords: 'renewable energy model' ->
        ['renewable', 'energy', 'model']
    inplace : bool
        Whether to create the data package by only adding metadata to the
        current directory. NOTE: one of inplace/export must be chosen
    export : str
        Create the data package in the provided directory instead of the
        current directory
    config : str
        Config file in YAML format with metadata and custom registry. The
        metadata should be under a "metadata" section, and the custom registry
        under a "registry" section.

    """
    # exactly one of `inplace`/`export` must be in effect; `export` wins
    if (not export) and (not inplace):
        logger.error("you must explicitly choose between `inplace` or `export`")
        sys.exit(1)
    elif export and inplace:
        # FIX: dropped the stray trailing backtick from the warning message
        logger.warning(
            "both `inplace` and `export` present, `inplace` will be ignored"
        )
    meta = {
        "name": name,
        "title": title,
        "licenses": licenses,
        "description": description,
        "keywords": keywords,
        "config": config,
    }
    # name & licenses are mandatory; prompt/fail happens inside _metadata
    meta = _metadata(["name", "licenses"], **meta)  # type: ignore[arg-type]
    with config_ctx(conffile=config):
        files = _create(meta, idxpath, fpaths, export=export)
    return f"Package metadata: {files[0]}"
def _update(pkg: Dict, pkgpath: _path_t, fpaths: Iterable[_path_t]):
    """Replace/add the datasets in ``fpaths`` in the package at ``pkgpath``.

    Out-of-tree datasets are copied into the package directory first; any
    resources matching the updated paths are pruned before re-adding them.
    """
    intree, outoftree = outoftree_paths(pkgpath, fpaths)
    copied = copy_files(outoftree, pkgpath)
    all_paths = intree + copied
    pruned = _rm_from_pkg(pkg, pkgpath, all_paths)
    return _create(pruned, pkgpath, all_paths, export="")
# TODO: option to update files in index
def update(
    pkgpath: str,
    *fpaths: str,
    name: str = "",
    title: str = "",
    licenses: str = "",
    description: str = "",
    keywords: str = "",
    config: str = "",
):
    """Update metadata and datasets in a package.

    Parameters
    ----------
    pkgpath : str
        Path to the package.
    fpaths : Tuple[str]
        List of datasets/resources; they could be new datasets or datasets with
        updated index entries.
    name : str
        Package name (no spaces or special characters)
    title : str
        Package title
    description : str
        Package description
    keywords : str
        A space separated list of keywords: 'renewable energy model' ->
        ['renewable', 'energy', 'model']
    licenses : str
        License
    config : str
        Config file in YAML format with metadata and custom registry. The
        metadata should be under a "metadata" section, and the custom registry
        under a "registry" section.

    """
    flags = dict(
        name=name,
        title=title,
        licenses=licenses,
        description=description,
        keywords=keywords,
        config=config,
    )
    # no mandatory fields when updating an existing package
    meta = _metadata([], **flags)  # type: ignore[arg-type]
    pkg = read_pkg(pkgpath)
    pkg.update(meta)
    if fpaths:
        with config_ctx(conffile=config):
            files = _update(pkg, pkgpath, fpaths)
    else:  # metadata-only update
        files = write_pkg(pkg, pkgpath)
    return f"Package metadata: {files[0]}"
def _rm_paths_spec(pkgpath: _path_t, fpaths: Iterable[_path_t]):
    """glom spec: keep only entries whose resolved path is not in ``fpaths``"""
    root = Path(pkgpath)

    def keep(entry):
        return path_not_in(fpaths, root / entry["path"])

    return Iter().filter(keep).all()
def _rm_from_pkg(pkg: Dict, pkgpath: _path_t, fpaths: Iterable[_path_t]):
    """Drop resources matching ``fpaths`` from the package descriptor."""
    before = len(pkg["resources"])
    pkg["resources"] = glom(pkg["resources"], _rm_paths_spec(pkgpath, fpaths))
    if len(pkg["resources"]) == before:
        # nothing matched: the paths are all new additions
        logger.info("no resources to update/remove in package")
    return pkg
def _rm_from_idx(pkgpath: _path_t, fpaths: Iterable[_path_t]) -> pkgindex:
    """Return the package index with entries matching ``fpaths`` removed."""
    idxfile = idxpath_from_pkgpath(pkgpath)
    entries = pkgindex.from_file(idxfile)
    return glom(entries, _rm_paths_spec(pkgpath, fpaths))
def _rm_from_disk(fpaths: Iterable[_path_t]):
    """Permanently delete every file in ``fpaths`` from disk."""
    for fpath in fpaths:
        Path(fpath).unlink()
def remove(pkgpath: str, *fpaths: str, rm_from_disk: bool = False) -> str:
    """Remove datasets from the package

    Parameters
    ----------
    pkgpath : str
        Path to the package directory
    fpaths : Tuple[str]
        List of datasets/resources to be removed from the package. The index is
        updated accordingly.
    rm_from_disk : bool (default: False)
        Permanently delete the files from disk

    """
    pkg = read_pkg(pkgpath)
    pkg = _rm_from_pkg(pkg, pkgpath, fpaths)
    idx = _rm_from_idx(pkgpath, fpaths)
    fmeta, fidx = write_pkg(pkg, pkgpath, idx=idx)
    if rm_from_disk:
        _rm_from_disk(fpaths)
    return "\n".join([f"Package metadata: {fmeta}", f"Package index: {fidx}"])
def generate_index_file(idxpath: str, *fpaths: str, config: str = ""):
    """Generate an index file from a set of dataset files

    Parameters
    ----------
    idxpath : str
        Path where the index file (YAML format) should be written
    fpaths : Tuple[str]
        List of datasets/resources to include in the index
    config : str
        Config file in YAML format with custom registry. It should be defined
        under a "registry" section.

    """
    # index-column detection needs the (possibly custom) registry in scope
    with config_ctx(conffile=config):
        entries = []
        for fpath in fpaths:
            entries.append(entry_from_res(set_idxcols(fpath)))
    dwim_file(idxpath, entries)
def to_iamc(config: str, idxpath: str, iamcpath: str, *, wide: bool = False):
    """Aggregate datasets into an IAMC dataset

    Parameters
    ----------
    config : str
        Config file
    idxpath : str
        Index file
    iamcpath : str
        IAMC dataset
    wide : bool (default: False)
        Enable wide IAMC format

    """
    from friendly_data.iamc import IAMconv

    converter = IAMconv.from_file(config, idxpath)
    paths = converter.res_idx.get("path")
    converter.to_csv(paths, output=iamcpath, wide=wide)
    return f"{', '.join(paths)} -> {iamcpath}"
def reports(pkg: Package, report_dir: str):
    """Write HTML reports summarising all resources in the package

    One profiling report is written per resource, plus an ``index.html``
    linking them all.

    Parameters
    ----------
    pkg : Package
    report_dir : str
        Directory where reports are written

    Returns
    -------
    int
        Bytes written (index.html)

    """
    # fail fast if the optional profiling dependency is missing
    import pandas_profiling as _  # noqa: F401

    _dir = Path(report_dir)
    _dir.mkdir(parents=True, exist_ok=True)
    title = pkg.get("title", pkg["name"])
    # template context for index.html; one entry per resource is appended below
    res = {"title": title, "date": datetime.now().isoformat(), "resources": []}
    for _res in pkg.resources:
        df = to_df(_res, noexcept=True)
        # report file mirrors the dataset path, with an .html suffix
        html = Path(_res["path"]).with_suffix(".html")
        report = df.profile_report(
            title=title,
            dataset=filter_dict(_res, ["description"]),  # FIXME: add pkg url
            variables={
                # column descriptions keyed by column name, skipping
                # schema fields that have no description
                "descriptions": glom(
                    _res,
                    (
                        "schema.fields",
                        Iter()
                        .filter(lambda i: "description" in i)
                        .map(lambda i: (i["name"], i["description"]))
                        .all(),
                        dict,
                    ),
                )
            },
        )
        report.config.html.minify_html = False
        report.to_file(_dir / html)
        res["resources"].append({"path": html, "name": _res["name"]})
    tmpl = get_template("index.html.template")
    return (_dir / "index.html").write_text(tmpl.render(res))
def describe(pkgpath: str, config: str = "", report_dir: str = ""):
    """Give a summary of the data package

    Parameters
    ----------
    pkgpath : str
        Path to the data package
    report_dir : str (default: empty string)
        If not empty, generate an HTML report and write to the directory
        ``report``. The directory will have an ``index.html`` file, and one
        HTML file for each dataset.
    config : str
        Config file in YAML format with custom registry. It should be defined
        under a "registry" section.

    """
    try:
        pkg = read_pkg(pkgpath)
    except (ValueError, FileNotFoundError):
        sys.exit(1)
    meta_f = ("name", "title", "description", "keywords", "licenses")
    # collect non-empty metadata; licenses are reduced to their names
    res = {
        k: (glom(v, ["name"]) if k == "licenses" else v)
        for k, v in pkg.items()
        if k in meta_f and v
    }
    if report_dir:
        res["report_dir"] = report_dir
        with config_ctx(conffile=config):
            reports(pkg, report_dir)
    res["resources"] = glom(
        pkg["resources"], [{"fields": ("schema.fields", ["name"]), "path": "path"}]
    )
    tmpl = get_template("dpkg_describe.template")
    return tmpl.render(res)
def describe_registry(column_type: str = ""):
    """Describe columns defined in the registry

    Parameters
    ----------
    column_type : str (default: empty string → all)
        Column type to list; one of: "cols", or "idxcols". If nothing is
        provided (default), columns of both types are listed.

    """
    from rich.console import Console
    from rich.markdown import Markdown

    rendered = page(markup="md", col_t=column_type)
    Console().print(Markdown(rendered))
def main():  # pragma: no cover, CLI entry point
    """Entry point for console scripts"""
    import os

    import fire

    # use a pass-through pager so help output is not paginated
    os.environ["PAGER"] = "cat"
    commands = {
        "create": create,
        "update": update,
        "remove": remove,
        "describe-registry": describe_registry,
        "list-licenses": list_licenses,
        "license-info": license_info,
        "generate-index-file": generate_index_file,
        "to-iamc": to_iamc,
        "describe": describe,
    }
    fire.Fire(commands)