Source code for friendly_data.io

"""Functions useful for I/O and file manipulation

"""

from hashlib import sha256
import json
from pathlib import Path
import shutil
import tempfile
import time
from typing import Any, Dict, Iterable, List, overload, Tuple, Union

import requests
import yaml

from friendly_data._types import _path_t


def copy_files(
    src: Iterable[_path_t], dest: _path_t, anchor: _path_t = ""
) -> List[Path]:
    """Copy a collection of files into a destination directory

    Without an anchor, every source file is copied directly into the root
    of the destination directory; with an anchor, the path of each source
    file relative to the anchor is recreated under the destination,
    creating any required subdirectories.

    Parameters
    ----------
    src : Iterable[Union[str, Path]]
        List of files to be copied
    dest : Union[str, Path]
        Destination directory
    anchor : Union[str, Path] (default: empty string)
        Top-level directory for anchoring, provide if you want the
        relative paths between the source files to be maintained with
        respect to this directory.

    Returns
    -------
    List[Path]
        List of files that were copied

    """
    destdir = Path(dest)
    destdir.mkdir(parents=True, exist_ok=True)
    base = Path(anchor) if anchor else None
    if base is not None and not base.is_dir():
        # a file was passed as anchor; anchor on its containing directory
        base = base.parent
    copied: List[Path] = []
    for item in map(Path, src):
        target = destdir / (item.relative_to(base) if base else item.name)
        target.parent.mkdir(parents=True, exist_ok=True)
        # copy2 preserves metadata; copying into the parent keeps the name
        shutil.copy2(item, target.parent)
        copied.append(target)
    return copied
def relpaths(basepath: _path_t, pattern: Union[str, Iterable[_path_t]]) -> List[str]:
    """Convert a list of paths to relative paths

    Parameters
    ----------
    basepath : Union[str, Path]
        Path to use as the reference when calculating relative paths
    pattern : Union[str, Iterable[Union[str, Path]]]
        Either a pattern relative to ``basepath`` to generate a list of
        paths, or a list of paths to convert.

    Returns
    -------
    List[str]
        List of relative paths (as ``str``-s)

    """
    if isinstance(pattern, str):
        # a glob pattern: expand it relative to basepath
        base = Path(basepath)
        return [str(match.relative_to(base)) for match in base.glob(pattern)]
    # otherwise: an iterable of path-like objects
    return [str(Path(p).relative_to(basepath)) for p in pattern]
def outoftree_paths(
    basepath: _path_t, fpaths: Iterable[_path_t]
) -> Tuple[List[Path], List[Path]]:
    """Separate a list of paths into in tree and out of tree.

    Parameters
    ----------
    basepath : Union[str, Path]
        Path to use as the reference when identifying in/out of tree paths.
    fpaths : Iterable[Union[str, Path]]
        List of paths.

    Returns
    -------
    Tuple[List[str], List[Path]]
        A pair of list of in tree and out of tree paths

    """

    def _in_tree(path: Path) -> bool:
        # Path.is_relative_to needs Python >= 3.9; emulate it via the
        # ValueError raised by relative_to
        try:
            path.relative_to(basepath)
        except ValueError:
            return False
        return True

    intree: List[Path] = []
    outoftree: List[Path] = []
    for path in map(Path, fpaths):
        (intree if _in_tree(path) else outoftree).append(path)
    return intree, outoftree
def path_in(fpaths: Iterable[_path_t], testfile: _path_t) -> bool:
    """Function to test if a path is in a list of paths.

    The test checks if they are the same physical files or not, so the
    testfile needs to exist on disk.

    Parameters
    ----------
    fpaths : Iterable[Union[str, Path]]
        List of paths to check
    testfile : Union[str, Path]
        Test file (must exist on disk)

    Returns
    -------
    bool

    """
    reference = Path(testfile)
    # samefile compares the underlying files (follows symlinks/hardlinks),
    # not the path strings
    return any(reference.samefile(fp) for fp in fpaths)
def path_not_in(fpaths: Iterable[_path_t], testfile: _path_t) -> bool:
    """Function to test if a path is absent from a list of paths.

    Opposite of :func:`path_in`.

    Parameters
    ----------
    fpaths : Iterable[Union[str, Path]]
        List of paths to check
    testfile : Union[str, Path]
        Test file (must exist on disk)

    Returns
    -------
    bool

    """
    # testfile matches none of the paths when no samefile comparison succeeds
    reference = Path(testfile)
    return not any(map(reference.samefile, fpaths))
def posixpathstr(fpath: _path_t) -> str:
    """Given a path object, return a POSIX compatible path string

    Parameters
    ----------
    fpath : Union[str, Path]
        Path object

    Returns
    -------
    str

    """
    # Path.as_posix() already returns ``str``; the original wrapped it in a
    # redundant str() call
    return Path(fpath).as_posix()
# Typed overloads for :func:`dwim_file` below: called with only a path it
# reads the file and returns the parsed contents; called with ``data`` it
# writes the file and returns ``None``.
@overload
def dwim_file(fpath: _path_t) -> Union[Dict, List]:
    ...  # pragma: no cover, overload


@overload
def dwim_file(fpath: _path_t, data: Any) -> None:
    ...  # pragma: no cover, overload
def dwim_file(fpath, data=None):
    """Do What I Mean with file

    Depending on the function arguments, either read the contents of a
    file, or write data to the file.  The file type is guessed from the
    extension; supported formats: JSON and YAML.

    Parameters
    ----------
    fpath : Union[str, Path]
        File path to read or write to
    data : Union[None, Any]
        Data, when writing to a file.

    Returns
    -------
    Union[None, Union[Dict, List]]
        - If writing to a file, nothing (``None``) is returned
        - If reading from a file, depending on the contents, either a list
          or dictionary are returned

    """
    fpath = Path(fpath)
    reading = data is None  # no data to write => read the file
    mode = "r" if reading else "w"
    suffix = fpath.suffix
    if suffix in (".yaml", ".yml"):
        with open(fpath, mode=mode) as stream:
            if reading:
                return yaml.safe_load(stream)
            yaml.safe_dump(data, stream)
    elif suffix == ".json":
        with open(fpath, mode=mode) as stream:
            if reading:
                return json.load(stream)
            json.dump(data, stream, indent=2)
    else:
        raise RuntimeError(f"{fpath}: not a JSON or YAML file")
def get_cachedir() -> Path:
    """Create the directory ``$TMPDIR/friendly_data_cache`` and return the Path"""
    # idempotent: an existing cache directory is reused as-is
    path = Path(tempfile.gettempdir(), "friendly_data_cache")
    path.mkdir(exist_ok=True)
    return path
class HttpCache:
    """An HTTP cache

    It accepts a URL template which accepts parameters:
    ``https://www.example.com/path/{}.json``, the parameters can be
    provided later at fetch time.  No checks are made if the number of
    parameters passed are compatible with the URL template.

    After fetching a resource, it is cached in a file under
    ``$TMPDIR/friendly_data_cache/``.  The file name is of the form
    ``http-<checksum-of-url-template>-<checksum-of-url>``.

    The cache is updated every 24 hours.  A user may also force a cache
    cleanup by calling :meth:`remove`.

    Parameters
    ----------
    url_t : str
        URL template, e.g. ``https://www.example.com/path/{}.json``

    Attributes
    ----------
    cachedir : pathlib.Path
        Path object pointing to the cache directory

    """

    # shared across instances; resolved once at class-creation time
    cachedir: Path = get_cachedir()

    def __init__(self, url_t: str):
        self.url_t = url_t
        # the template's digest namespaces this cache's files in cachedir
        self.url_t_hex = sha256(url_t.encode("utf8")).hexdigest()

    def cachefile(self, arg: str, *args: str) -> Tuple[Path, str]:
        """Return the cache file, and the corresponding URL

        Parameters
        ----------
        arg : str
            parameters for the URL template (one mandatory)
        *args : str, optional
            more parameters (optional)

        Returns
        -------
        Tuple[pathlib.Path, str]
            Tuple of Path object pointing to the cache file and the URL string

        """
        url = self.url_t.format(arg, *args)
        url_digest = sha256(url.encode("utf8")).hexdigest()
        cache = self.cachedir / f"http-{self.url_t_hex}-{url_digest}"
        return cache, url

    def remove(self, *args: str):
        """Remove cache files

        - Remove all files associated with this cache (w/o arguments).
        - Remove only the files associated with the URL formed from the args.

        Parameters
        ----------
        *args : str, optional
            parameters for the URL template

        Raises
        ------
        FileNotFoundError
            If an argument is provided to remove a specific cache file, but
            the cache file does not exist.

        """
        if args:
            # a single specific cache file; unlink raises FileNotFoundError
            # when it does not exist
            targets = [self.cachefile(*args)[0]]
        else:
            # every cache file belonging to this URL template
            targets = list(self.cachedir.glob(f"http-{self.url_t_hex}-*"))
        for path in targets:
            path.unlink()

    def get(self, arg: str, *args: str) -> bytes:
        """Get the URL contents

        If a valid cache exists, return the contents from there, otherwise
        fetch again.

        Parameters
        ----------
        arg : str
            parameters for the URL template (one mandatory)
        *args : str, optional
            more parameters (optional)

        Returns
        -------
        bytes
            bytes array of the contents

        Raises
        ------
        ValueError
            If the URL is incorrect
        requests.ConnectionError
            If there is no network connection

        """
        cache, url = self.cachefile(arg, *args)
        # refetch when there is no cache file, or it is older than 24 hours
        stale = (
            not cache.exists()
            or time.time() - cache.stat().st_ctime > 24 * 3600
        )
        if stale:
            cache.write_bytes(self.fetch(url))
        return cache.read_bytes()

    def fetch(self, url: str) -> bytes:
        """Fetch the URL

        Parameters
        ----------
        url : str
            URL to fetch

        Returns
        -------
        bytes
            bytes array of the contents that was fetched

        Raises
        ------
        ValueError
            If the URL is incorrect

        """
        response = requests.get(url)
        if not response.ok:
            raise ValueError(f"error: {response.url} responded {response.reason}")
        return response.content