"""Functions that are run from the CLI to create, or edit a data package.
"""
from datetime import datetime
from itertools import chain
from pathlib import Path
import sys
from typing import Any, Dict, Iterable, List
from frictionless import Package
from glom import glom, Iter
from friendly_data import logger_config
from friendly_data._types import _license_t, _path_t
from friendly_data.converters import to_df
from friendly_data.dpkg import entry_from_res
from friendly_data.dpkg import idxpath_from_pkgpath
from friendly_data.dpkg import pkg_from_files
from friendly_data.dpkg import pkgindex
from friendly_data.dpkg import read_pkg
from friendly_data.dpkg import set_idxcols
from friendly_data.dpkg import write_pkg
from friendly_data.helpers import consume, filter_dict
from friendly_data.helpers import is_windows
from friendly_data.helpers import sanitise
from friendly_data.io import copy_files
from friendly_data.io import dwim_file
from friendly_data.io import path_not_in
from friendly_data.io import outoftree_paths
from friendly_data.metatools import _fetch_license
from friendly_data.metatools import check_license
from friendly_data.metatools import get_license
from friendly_data.metatools import lic_metadata
from friendly_data.metatools import resolve_licenses
from friendly_data.registry import config_ctx
from friendly_data.doc import get_template, page
logger = logger_config(fmt="{name}: {levelname}: {message}")
def list_licenses() -> str:
    """Return an ASCII table of commonly used licenses

    NOTE: for Python API users, not to be confused with
    :func:`metatools.list_licenses`.

    Returns
    -------
    str
        ASCII table with commonly used licenses

    """
    from tabulate import tabulate

    columns = ("domain", "id", "maintainer", "title")
    rows = lic_metadata(columns)
    return tabulate(rows, headers="keys")
def license_info(lic: str) -> Dict:
    """Give detailed metadata about a license

    Parameters
    ----------
    lic : str
        License ID as listed in the output of ``friendly_data list-licenses``

    Returns
    -------
    Dict
        License metadata

    """
    fields = ("domain", "id", "maintainer", "title", "url")
    matches = lic_metadata(fields, lambda entry: entry["id"] == lic)
    if matches:
        return matches[0]
    # unknown license id: report and abort the CLI
    logger.error(f"no matching license with id: {lic}")
    sys.exit(1)
def license_prompt() -> _license_t:  # pragma: no cover, interactive function
    """Prompt for a license on the terminal (with completion)."""
    licenses = _fetch_license("all")

    def complete(text, state):
        # readline completer protocol: return the state-th license id
        # matching the typed prefix, or None when exhausted
        matching = (lic for lic in licenses if lic.startswith(text))
        for pos, candidate in enumerate(matching):
            if pos == state:
                return candidate
        return None

    if not is_windows():
        import readline

        readline.parse_and_bind("tab: complete")
        readline.set_completer(complete)
    return check_license(licenses[input("license: ")])
def _metadata(
    mandatory: List[str],
    *,
    name: str = "",
    title: str = "",
    licenses: str = "",
    description: str = "",
    keywords: str = "",
    config: _path_t = "",
) -> Dict:
    """Metadata from the config file is overriden by keyword arguments"""
    meta: Dict[str, Any] = {}
    if config:
        try:
            meta = dwim_file(config)["metadata"]  # type: ignore[call-overload]
        except KeyError as err:
            logger.warning(f"{err}: section missing from {config}")
        else:
            meta = resolve_licenses(meta)
    overrides: Dict[str, Any] = {
        "name": name if name else sanitise(title),
        "title": title,
        "description": description,
        "keywords": keywords.split(),
    }
    if licenses:
        overrides["licenses"] = [get_license(licenses)]
    elif "licenses" in mandatory and "licenses" not in meta:
        overrides["licenses"] = [license_prompt()]  # pragma: no cover
    # flags override the config file; empty values are dropped
    meta.update({k: v for k, v in overrides.items() if v})
    missing = [k for k in mandatory if k not in meta]  # mandatory fields
    if missing:
        logger.error(f"{missing}: mandatory metadata missing")
        if "license" in meta:
            logger.error("'license': should be plural!")
        sys.exit(1)
    return meta
# TODO: ability to add datasets from arbitrary paths
# - flag to provide destination directory for out of tree datasets
# - normalise relative path w.r.t. index entries
# - for files not in the index, normalise relative path w.r.t. pkgdir
# add similar ability for update(..)
def _create(
    meta: Dict,
    pkgpath: _path_t,
    fpaths: Iterable[_path_t],
    *,
    export: _path_t,
) -> List[Path]:
    """Create a data package from ``meta`` and the files in ``fpaths``.

    When ``export`` is non-empty, the index file and all datasets (both
    those listed in the index and the extra ``fpaths``) are copied into
    the export directory first, and the package is created there;
    otherwise the package is created at ``pkgpath`` in place.
    Returns the list of files written by ``write_pkg``.
    """
    if export:
        pkgpath, export = Path(pkgpath), Path(export)
        # locate the index: pkgpath is either the package directory that
        # contains an index file, or the index file itself
        idxp = idxpath_from_pkgpath(pkgpath) if pkgpath.is_dir() else pkgpath
        # glom spec: resolve each index entry's "path" relative to the index
        spec = Iter("path").map(lambda p: idxp.parent / p)  # type: ignore[union-attr]
        if idxp:  # create a uniquified list of files
            files = chain(
                [idxp],
                set(chain(glom(pkgindex.from_file(idxp), spec), map(Path, fpaths))),
            )
        else:
            files = fpaths  # type: ignore[assignment]
        # NOTE: if idxpath was found, first of the returned files is the index
        # file that was copied in the export directory, extract it to pkgpath
        fpaths = copy_files(files, export, pkgpath)
        if idxp:
            pkgpath, *fpaths = fpaths
        else:  # if no index was found, set export directory to new pkgpath
            pkgpath = export
    pkgdir, pkg, _ = pkg_from_files(meta, pkgpath, fpaths)
    return write_pkg(pkg, pkgdir)
def create(
    idxpath: str,
    *fpaths: str,
    name: str = "",
    title: str = "",
    licenses: str = "",
    description: str = "",
    keywords: str = "",
    inplace: bool = False,
    export: str = "",
    config: str = "",
):
    """Create a package from an index file and other files

    Package metadata provided with command line flags override metadata from
    the config file.

    Parameters
    ----------
    idxpath : str
        Path to the index file or package directory with the index file. Note
        the index file has to be at the top level directory of the datapackage.
    fpaths : Tuple[str]
        List of datasets/resources not in the index. If any of them point to a
        dataset already present in the index, it is ignored.
    name : str
        Package name (no spaces or special characters)
    title : str
        Package title
    licenses : str
        License
    description : str
        Package description
    keywords : str
        A space separated list of keywords: 'renewable energy model' ->
        ['renewable', 'energy', 'model']
    inplace : bool
        Whether to create the data package by only adding metadata to the
        current directory. NOTE: one of inplace/export must be chosen
    export : str
        Create the data package in the provided directory instead of the
        current directory
    config : str
        Config file in YAML format with metadata and custom registry. The
        metadata should be under a "metadata" section, and the custom registry
        under a "registry" section.

    """
    # exactly one of `inplace`/`export` must be in effect; `export` wins
    if (not export) and (not inplace):
        logger.error("you must explicitly choose between `inplace` or `export`")
        sys.exit(1)
    elif export and inplace:
        # FIX: dropped the stray trailing backtick from the warning message
        logger.warning(
            "both `inplace` and `export` present, `inplace` will be ignored"
        )
    meta = {
        "name": name,
        "title": title,
        "licenses": licenses,
        "description": description,
        "keywords": keywords,
        "config": config,
    }
    # name & licenses are mandatory; prompt/fail happens inside _metadata
    meta = _metadata(["name", "licenses"], **meta)  # type: ignore[arg-type]
    with config_ctx(conffile=config):
        files = _create(meta, idxpath, fpaths, export=export)
    return f"Package metadata: {files[0]}"
def _update(pkg: Dict, pkgpath: _path_t, fpaths: Iterable[_path_t]):
    """Replace/add the datasets in ``fpaths`` in the package at ``pkgpath``.

    Out-of-tree datasets are copied into the package directory first; any
    resources matching the updated paths are pruned before re-adding them.
    """
    intree, outoftree = outoftree_paths(pkgpath, fpaths)
    copied = copy_files(outoftree, pkgpath)
    all_paths = intree + copied
    pruned = _rm_from_pkg(pkg, pkgpath, all_paths)
    return _create(pruned, pkgpath, all_paths, export="")
# TODO: option to update files in index
def update(
    pkgpath: str,
    *fpaths: str,
    name: str = "",
    title: str = "",
    licenses: str = "",
    description: str = "",
    keywords: str = "",
    config: str = "",
):
    """Update metadata and datasets in a package.

    Parameters
    ----------
    pkgpath : str
        Path to the package.
    fpaths : Tuple[str]
        List of datasets/resources; they could be new datasets or datasets with
        updated index entries.
    name : str
        Package name (no spaces or special characters)
    title : str
        Package title
    description : str
        Package description
    keywords : str
        A space separated list of keywords: 'renewable energy model' ->
        ['renewable', 'energy', 'model']
    licenses : str
        License
    config : str
        Config file in YAML format with metadata and custom registry. The
        metadata should be under a "metadata" section, and the custom registry
        under a "registry" section.

    """
    flags = dict(
        name=name,
        title=title,
        licenses=licenses,
        description=description,
        keywords=keywords,
        config=config,
    )
    # no mandatory fields when updating an existing package
    meta = _metadata([], **flags)  # type: ignore[arg-type]
    pkg = read_pkg(pkgpath)
    pkg.update(meta)
    if fpaths:
        with config_ctx(conffile=config):
            files = _update(pkg, pkgpath, fpaths)
    else:  # metadata-only update
        files = write_pkg(pkg, pkgpath)
    return f"Package metadata: {files[0]}"
def _rm_paths_spec(pkgpath: _path_t, fpaths: Iterable[_path_t]):
    """glom spec: keep only entries whose resolved path is not in ``fpaths``"""
    root = Path(pkgpath)

    def keep(entry):
        return path_not_in(fpaths, root / entry["path"])

    return Iter().filter(keep).all()
def _rm_from_pkg(pkg: Dict, pkgpath: _path_t, fpaths: Iterable[_path_t]):
    """Drop resources matching ``fpaths`` from the package descriptor."""
    before = len(pkg["resources"])
    pkg["resources"] = glom(pkg["resources"], _rm_paths_spec(pkgpath, fpaths))
    if len(pkg["resources"]) == before:
        # nothing matched: the paths are all new additions
        logger.info("no resources to update/remove in package")
    return pkg
def _rm_from_idx(pkgpath: _path_t, fpaths: Iterable[_path_t]) -> pkgindex:
    """Return the package index with entries matching ``fpaths`` removed."""
    idxfile = idxpath_from_pkgpath(pkgpath)
    entries = pkgindex.from_file(idxfile)
    return glom(entries, _rm_paths_spec(pkgpath, fpaths))
def _rm_from_disk(fpaths: Iterable[_path_t]):
    """Permanently delete every file in ``fpaths`` from disk."""
    for fpath in fpaths:
        Path(fpath).unlink()
def remove(pkgpath: str, *fpaths: str, rm_from_disk: bool = False) -> str:
    """Remove datasets from the package

    Parameters
    ----------
    pkgpath : str
        Path to the package directory
    fpaths : Tuple[str]
        List of datasets/resources to be removed from the package. The index is
        updated accordingly.
    rm_from_disk : bool (default: False)
        Permanently delete the files from disk

    """
    pkg = read_pkg(pkgpath)
    pkg = _rm_from_pkg(pkg, pkgpath, fpaths)
    idx = _rm_from_idx(pkgpath, fpaths)
    fmeta, fidx = write_pkg(pkg, pkgpath, idx=idx)
    if rm_from_disk:
        _rm_from_disk(fpaths)
    return "\n".join([f"Package metadata: {fmeta}", f"Package index: {fidx}"])
def generate_index_file(idxpath: str, *fpaths: str, config: str = ""):
    """Generate an index file from a set of dataset files

    Parameters
    ----------
    idxpath : str
        Path where the index file (YAML format) should be written
    fpaths : Tuple[str]
        List of datasets/resources to include in the index
    config : str
        Config file in YAML format with custom registry. It should be defined
        under a "registry" section.

    """
    # index-column detection needs the (possibly custom) registry in scope
    with config_ctx(conffile=config):
        entries = []
        for fpath in fpaths:
            entries.append(entry_from_res(set_idxcols(fpath)))
    dwim_file(idxpath, entries)
def to_iamc(config: str, idxpath: str, iamcpath: str, *, wide: bool = False):
    """Aggregate datasets into an IAMC dataset

    Parameters
    ----------
    config : str
        Config file
    idxpath : str
        Index file
    iamcpath : str
        IAMC dataset
    wide : bool (default: False)
        Enable wide IAMC format

    """
    from friendly_data.iamc import IAMconv

    converter = IAMconv.from_file(config, idxpath)
    paths = converter.res_idx.get("path")
    converter.to_csv(paths, output=iamcpath, wide=wide)
    return f"{', '.join(paths)} -> {iamcpath}"
def reports(pkg: Package, report_dir: str):
    """Write HTML reports summarising all resources in the package

    One profiling report is written per resource, plus an ``index.html``
    linking them all.

    Parameters
    ----------
    pkg : Package
    report_dir : str
        Directory where reports are written

    Returns
    -------
    int
        Bytes written (index.html)

    """
    # fail fast if the optional profiling dependency is missing
    import pandas_profiling as _  # noqa: F401

    _dir = Path(report_dir)
    _dir.mkdir(parents=True, exist_ok=True)
    title = pkg.get("title", pkg["name"])
    # template context for index.html; one entry per resource is appended below
    res = {"title": title, "date": datetime.now().isoformat(), "resources": []}
    for _res in pkg.resources:
        df = to_df(_res, noexcept=True)
        # report file mirrors the dataset path, with an .html suffix
        html = Path(_res["path"]).with_suffix(".html")
        report = df.profile_report(
            title=title,
            dataset=filter_dict(_res, ["description"]),  # FIXME: add pkg url
            variables={
                # column descriptions keyed by column name, skipping
                # schema fields that have no description
                "descriptions": glom(
                    _res,
                    (
                        "schema.fields",
                        Iter()
                        .filter(lambda i: "description" in i)
                        .map(lambda i: (i["name"], i["description"]))
                        .all(),
                        dict,
                    ),
                )
            },
        )
        report.config.html.minify_html = False
        report.to_file(_dir / html)
        res["resources"].append({"path": html, "name": _res["name"]})
    tmpl = get_template("index.html.template")
    return (_dir / "index.html").write_text(tmpl.render(res))
def describe(pkgpath: str, config: str = "", report_dir: str = ""):
    """Give a summary of the data package

    Parameters
    ----------
    pkgpath : str
        Path to the data package
    report_dir : str (default: empty string)
        If not empty, generate an HTML report and write to the directory
        ``report``. The directory will have an ``index.html`` file, and one
        HTML file for each dataset.
    config : str
        Config file in YAML format with custom registry. It should be defined
        under a "registry" section.

    """
    try:
        pkg = read_pkg(pkgpath)
    except (ValueError, FileNotFoundError):
        sys.exit(1)
    meta_f = ("name", "title", "description", "keywords", "licenses")
    # collect non-empty metadata; licenses are reduced to their names
    res = {
        k: (glom(v, ["name"]) if k == "licenses" else v)
        for k, v in pkg.items()
        if k in meta_f and v
    }
    if report_dir:
        res["report_dir"] = report_dir
        with config_ctx(conffile=config):
            reports(pkg, report_dir)
    res["resources"] = glom(
        pkg["resources"], [{"fields": ("schema.fields", ["name"]), "path": "path"}]
    )
    tmpl = get_template("dpkg_describe.template")
    return tmpl.render(res)
def describe_registry(column_type: str = ""):
    """Describe columns defined in the registry

    Parameters
    ----------
    column_type : str (default: empty string → all)
        Column type to list; one of: "cols", or "idxcols". If nothing is
        provided (default), columns of both types are listed.

    """
    from rich.console import Console
    from rich.markdown import Markdown

    rendered = page(markup="md", col_t=column_type)
    Console().print(Markdown(rendered))
def main():  # pragma: no cover, CLI entry point
    """Entry point for console scripts"""
    import os

    import fire

    # use a pass-through pager so help output is not paginated
    os.environ["PAGER"] = "cat"
    commands = {
        "create": create,
        "update": update,
        "remove": remove,
        "describe-registry": describe_registry,
        "list-licenses": list_licenses,
        "license-info": license_info,
        "generate-index-file": generate_index_file,
        "to-iamc": to_iamc,
        "describe": describe,
    }
    fire.Fire(commands)