Source code for friendly_data.validate

"""Functions useful to validate a data package or parts of its schema.

"""

from typing import Callable, Dict, List, Set, Tuple

from frictionless import validate_package as validate
from glom import Coalesce, glom, Iter, T
import pandas as pd

from friendly_data.helpers import select


def check_pkg(pkg) -> List[Dict]:
    """Validate all resources in a datapackage for common errors.

    Typical errors that are checked:

    - ``blank-header``,
    - ``extra-label``,
    - ``missing-label``,
    - ``blank-label``,
    - ``duplicate-label``,
    - ``incorrect-label``,
    - ``blank-row``,
    - ``primary-key-error``,
    - ``foreign-key-error``,
    - ``extra-cell``,
    - ``missing-cell``,
    - ``type-error``,
    - ``constraint-error``,
    - ``unique-error``

    Parameters
    ----------
    pkg : frictionless.Package
        The datapackage descriptor dictionary

    Returns
    -------
    List[Dict]
        A list of dictionaries, one per resource with errors, summarising
        the validation checks.

    """
    # noinfer -> original in newer versions
    report = validate(pkg, basepath=pkg.basepath, noinfer=True)
    count = glom(report, "stats.errors")
    if not count:
        return list()
    # for every resource (task) with errors, collect the resource path, the
    # error positions (row/column), and the error codes with remarks
    res = glom(
        report,
        (
            "tasks",
            Iter()
            .filter(T["stats"]["errors"])
            .map(
                {
                    "path": T["resource"]["path"],
                    "position": (
                        T["errors"],
                        [
                            {
                                "row": T["rowNumber"],
                                "col": Coalesce(T["fieldName"], default=""),
                            }
                        ],
                    ),
                    "errors": (
                        T["errors"],
                        [
                            {
                                "error": T["code"],
                                "remark": T["note"],
                            }
                        ],
                    ),
                }
            )
            .all(),
        ),
    )
    return res
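# Example usage (illustrative sketch, not part of the module): validate a
# package and inspect the error summary.  ``Package`` is the frictionless
# entry point; "datapackage.json" is an assumed example path.
#
#     from frictionless import Package
#
#     pkg = Package("datapackage.json")
#     report = check_pkg(pkg)
#     for issue in report:  # empty list -> all resources validated cleanly
#         print(issue["path"], issue["errors"])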
def summarise_errors(report: List[Dict]) -> pd.DataFrame:
    """Summarise the dict/json error report as a `pandas.DataFrame`

    Parameters
    ----------
    report : List[Dict]
        List of errors as returned by :func:`check_pkg`

    Returns
    -------
    pandas.DataFrame
        Summary dataframe; example::

            filename  row  col  error       remark
          0  bad.csv   12       extra-cell  ...
          1  bad.csv   22  SRB  type-error  ...

    """
    df = pd.DataFrame(report)
    # flatten the per-resource error lists: one row per error, with each
    # error dict expanded into "error" and "remark" columns
    errors: pd.DataFrame = df["errors"].explode(ignore_index=True).apply(pd.Series)
    df = df.explode("position").reset_index(drop=True).drop("errors", axis=1)
    # keep only the file name component of the resource path
    fnames: pd.Series = df["path"].str.rsplit("/").apply(pd.Series).iloc[:, -1]
    fnames.name = "filename"
    # expand the position dicts into "row" and "col" columns
    position: pd.DataFrame = df["position"].apply(pd.Series)
    return pd.concat([fnames, position, errors], axis=1)
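# Example usage (sketch, continuing the ``check_pkg`` example above):
#
#     report = check_pkg(pkg)
#     if report:
#         df = summarise_errors(report)
#         print(df.to_string(index=False))  # filename, row, col, error, remark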
def check_schema(
    ref: Dict[str, str], dst: Dict[str, str], *, remap: Dict[str, str] = None
) -> Tuple[bool, Set[str], Dict[str, Tuple[str, str]], List[Tuple]]:
    """Compare a schema with a reference.

    The reference schema is a minimal set, meaning any additional fields in
    the compared schema are accepted, but omissions are not.  Name
    comparisons are case-sensitive.

    TODO: maybe also compare constraints?

    Parameters
    ----------
    ref : Dict[str, str]
        Reference schema dictionary

    dst : Dict[str, str]
        Schema dictionary from the dataset being validated

    remap : Dict[str, str] (optional)
        Column/field names that are to be remapped before checking.

    Returns
    -------
    result : Tuple[bool, Set[str], Dict[str, Tuple[str, str]], List[Tuple]]
        Result tuple:

        - Boolean flag indicating if it passed the checks or not
        - If checks failed, set of missing columns from the minimal set
        - If checks failed, columns with mismatching types.  It is a
          dictionary with the column name as key, and the reference type
          and the actual type in a tuple as value. ::

              {
                  'col_x': ('integer', 'number'),
                  'col_y': ('datetime', 'string'),
              }

        - If primary keys are different, a list of tuples with the diff.
          The first element of each tuple is the index where the two
          differ, and the two subsequent elements are the corresponding
          elements from the reference and dataset primary key lists:
          ``(index, ref_col, dst_col)``

    """
    # extract columns
    ref_: List[Dict[str, str]]
    dst_: List[Dict[str, str]]
    ref_, dst_ = glom((ref, dst), Iter("fields").all())
    if remap:
        dst_ = [
            {**i, "name": remap[i["name"]] if i["name"] in remap else i["name"]}
            for i in dst_
        ]
    # column names
    ref_set = glom(ref_, (["name"], set))
    dst_set = glom(dst_, (["name"], set))
    # missing columns
    missing = ref_set - dst_set
    # mismatched types, FIXME: horrible mess
    common = ref_set.intersection(dst_set)
    mismatch = {}
    for col in dst_:
        if col["name"] not in common:
            continue
        ref_col, *_ = glom(ref_, [select("name", equal_to=col["name"])])
        if ref_col["type"] != col["type"]:
            mismatch[col["name"]] = (ref_col["type"], col["type"])
    # metadata: ignore missing values
    pri_ref = ref.get("primaryKey", [])  # type: ignore
    pri_dst = dst.get("primaryKey", [])  # type: ignore
    if isinstance(pri_ref, str):
        pri_ref = [pri_ref]
    if isinstance(pri_dst, str):
        pri_dst = [pri_dst]

    def pair(i: List[str], j: List[str]) -> Callable[[], Tuple]:
        # pairwise iterator over the two key lists; the shorter list is
        # padded with None
        iitr, jitr = iter(i), iter(j)

        def _pair() -> Tuple:
            return next(iitr, None), next(jitr, None)

        return _pair

    pri_diff = []
    if pri_ref != pri_dst:
        pairs = iter(pair(pri_ref, pri_dst), (None, None))
        pri_diff = [(i, j, k) for i, (j, k) in enumerate(pairs) if j != k]

    check_pass = not (missing or mismatch or pri_diff)
    return (check_pass, missing, mismatch, pri_diff)  # type: ignore
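# Example usage (illustrative sketch; the schemas below are made-up minimal
# examples in frictionless table-schema form):
#
#     ref = {"fields": [{"name": "year", "type": "integer"}], "primaryKey": ["year"]}
#     dst = {"fields": [{"name": "year", "type": "number"}], "primaryKey": ["year"]}
#     ok, missing, mismatch, pri_diff = check_schema(ref, dst)
#     # ok is False, mismatch == {"year": ("integer", "number")}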
def summarise_diff(
    diff: Tuple[bool, Set[str], Dict[str, Tuple[str, str]], List[Tuple]]
) -> str:
    """Summarise the schema diff from :func:`check_schema` results as a
    string (tables are rendered with ``pandas.DataFrame``).

    """
    status, missing, mismatch, pri = diff
    report = ""
    if status:
        return report
    if missing:
        report += f"missing column names: {missing}\n"
    if mismatch:
        df = pd.DataFrame(
            [(col, *col_ts) for col, col_ts in mismatch.items()],
            columns=["column", "reference_type", "current_type"],
        )
        report += "mismatched column types:\n"
        report += str(df.to_string(header=True, index=False))
    if pri:
        df = pd.DataFrame(pri, columns=["level", "reference_col", "current_col"])
        report += "mismatched index levels/cols:\n"
        report += str(df.to_string(header=True, index=False))
    return report
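# Example usage (sketch, continuing the ``check_schema`` example above; the
# exact table layout comes from ``DataFrame.to_string``, so the output below
# is approximate):
#
#     print(summarise_diff(check_schema(ref, dst)))
#     # mismatched column types:
#     # column reference_type current_type
#     #   year        integer       number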