"""The Friendly data schema registry

This module provides getter functions to retrieve the schema of an individual
column, :func:`get`, or the whole registry, :func:`getall`.  The remaining
utility functions and classes are used internally by the module.

"""

from itertools import chain
import json
from logging import getLogger
from pathlib import Path
from typing import cast, Dict, List, Union

from glom import glom, Match, Optional as optmatch, Or
from pkg_resources import resource_filename

import yaml

# Module-level logger, registered under the name "friendly_data._registry"
logger = getLogger("friendly_data._registry")
# Type alias for arguments that accept a filesystem path as a str or a Path
_path_t = Union[str, Path]

class schschemaema(Dict):
    """Validated column schema of a registry entry.

    Creating an instance matches the given mapping against :attr:`_schema`
    using ``glom``; the dictionary is then populated with the matched result.

    Parameters
    ----------
    schema : dict
        Column schema to validate

    Raises
    ------
    TypeMatchError
        When the column schema has a type mismatch
    MatchError
        Other mismatches like, an incorrectly named key

    """

    # Accepted layout of a column schema; keys wrapped in ``optmatch`` may be
    # left out, the bare string keys are matched as-is.
    _schema = {
        "name": str,
        "type": str,
        optmatch("format"): str,
        optmatch("constraints"): {
            optmatch("enum"): list,
            optmatch("maximum"): Or(int, float),
            optmatch("minimum"): Or(int, float),
            optmatch("pattern"): str,
        },
        optmatch("title"): str,
        optmatch("description"): str,
        optmatch("alias"): [{"name": str, "description": str}],
    }

    def __init__(self, schema: dict):
        # validation happens here: glom raises on any mismatch, so the
        # dictionary is only ever filled with a schema that matched
        validated = glom(schema, Match(self._schema))
        super().__init__(validated)
def read_file(fpath: "_path_t") -> Union[Dict, List]:
    """Read a JSON or YAML file; the file type is guessed from the extension

    Parameters
    ----------
    fpath : Union[str, Path]
        Path to the file; the suffix has to be one of: ``.json``, ``.yaml``,
        or ``.yml``

    Returns
    -------
    Union[Dict, List]
        The deserialised contents of the file

    Raises
    ------
    RuntimeError
        When the file suffix is not recognised as JSON or YAML

    """
    fpath = Path(fpath)
    if fpath.suffix in (".yaml", ".yml"):
        reader = yaml.safe_load
    elif fpath.suffix == ".json":
        reader = json.loads
    else:
        raise RuntimeError(f"{fpath}: not a JSON or YAML file")
    # Decode explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere, and would garble non-ASCII text in the registry files.
    return reader(fpath.read_text(encoding="utf-8"))
# FIXME: can't use Literal until we drop 3.7
def get(col: str, col_t: str) -> Dict:
    """Retrieve the column schema from column schema registry:
    `friendly_data_registry`

    Parameters
    ----------
    col : str
        Column name to look for

    col_t : Literal["cols", "idxcols"]
        A literal string specifying the kind of column; one of: "cols", or
        "idxcols"

    Returns
    -------
    Dict
        Column schema with the documentation only key ("alias") removed; an
        empty dictionary is returned in case there are no matches

    Raises
    ------
    ValueError
        When `col_t` is not one of: "cols", or "idxcols"
    RuntimeError
        When more than one matches are found

    """
    if col_t not in ("cols", "idxcols"):
        raise ValueError(f"{col_t}: unknown column type")

    curdir = Path(resource_filename("friendly_data_registry", col_t))
    schema = list(
        chain.from_iterable(curdir.glob(f"{col}.{fmt}") for fmt in ("json", "yaml"))
    )
    if not schema:
        logger.warning(f"{col_t}/{col}: not in registry")
        return {}  # no match, unregistered column
    if len(schema) > 1:  # pragma: no cover, bad registry
        raise RuntimeError(f"{schema}: multiple matches, duplicates in registry")

    # The paths returned by glob already start with `curdir`; joining them
    # onto `curdir` again would duplicate the prefix when it is relative.
    res = cast(Dict, read_file(schema[0]))
    for key in ("alias",):
        res.pop(key, None)  # strip doc only keys
    return res
def getall(with_file: bool = False) -> Dict[str, List[Dict]]:
    """Get all columns from registry, primarily to generate documentation

    Parameters
    ----------
    with_file : bool (default: False)
        When true, every entry is a tuple of the column schema and the path
        of its schema file, relative to the parent of the column type
        directory; otherwise every entry is just the column schema

    Returns
    -------
    Dict[str, List]
        The returned value is separated by column type::

          {
              "idxcols": [
                  {..}  # column schemas
              ],
              "cols": [
                  {..}  # column schemas
              ],
          }

    Raises
    ------
    RuntimeError
        When two schema files of the same column type share a file stem

    """
    registry = {}
    for kind in ("cols", "idxcols"):
        kind_dir = Path(resource_filename("friendly_data_registry", kind))
        seen_stems = set()
        entries = []
        # JSON files are visited first, then YAML
        for fmt in ("json", "yaml"):
            for schema_file in kind_dir.glob(f"*.{fmt}"):
                # same stem twice means the column is defined more than once
                if schema_file.stem in seen_stems:
                    raise RuntimeError(f"{schema_file}: duplicate schema in registry")
                seen_stems.add(schema_file.stem)
                contents = read_file(schema_file)
                if with_file:
                    rel_path = str(schema_file.relative_to(kind_dir.parent))
                    entries.append((contents, rel_path))
                else:
                    entries.append(contents)
        registry[kind] = entries
    return registry