# Source code for friendly_data.tseries

"""Convenience functions useful to ingest different kinds of differently shaped
time series data into the standard 1-D shape supported by the data package
specification.

"""

from datetime import datetime
from logging import getLogger
from pathlib import Path
from typing import List, TextIO, TypeVar, Union
import warnings

import pandas as pd

# Module-level logger, named after this module's import path.
logger = getLogger(__name__)

# Constrained type variables used in the annotations of the readers below.
# A file argument is a path (as ``str`` or ``Path``) or an open text stream.
_file_t = TypeVar("_file_t", str, Path, TextIO)
# A column reference is an ``int`` or a ``str`` (presumably a column position
# or a column name, as understood by the Pandas reader -- confirm).
_col_t = TypeVar("_col_t", int, str)


def read_timeseries(
    fpath: _file_t,
    *,
    date_cols: Union[List[_col_t], None] = None,
    col_units: Union[str, None] = None,
    zero_idx: bool = False,
    row_fmt: str = "",
    source_t: str = "",
    **kwargs,
):
    """Read a time series from a file, whatever its on-disk layout.

    The canonical layout of a time series is a datetime index followed by
    one or more value columns.  Two other layouts are common in practice,
    and this function can normalise both.

    **Tabular layout** (``source_t="table"``): every row is a date (or a
    year), and every column is a sub-period (hour, month) of that row:

    ===========  ===  ===  =====  ====  ====
     date         1    2    ...    23    24
    ===========  ===  ===  =====  ====  ====
     1/1/2016      0   10   ...    2.3   5.1
     4/1/2016      3   11   ...    4.3   9.1
    ===========  ===  ===  =====  ====  ====

    The table is flattened into a series, and the row and column labels are
    merged into a single datetime index.

    **Split datetime layout** (``source_t="multicol"``): the datetime is
    spread over several columns:

    ===========  ======  ======  ======
      date        time    col1    col2
    ===========  ======  ======  ======
     1/1/2016     10:00    42.0    foo
     4/1/2016     11:00    3.14    bar
    ===========  ======  ======  ======

    The indicated columns are combined into datetime values while reading,
    and the result is used as the index.

    **Default** (``source_t=""``): the options specific to this function
    are ignored, and the remaining keyword arguments are handed to
    `pandas.read_csv` unchanged.  Any other, unrecognised, value of
    `source_t` is treated the same way, after logging a warning.

    Parameters
    ----------
    fpath : Union[str, Path, TextIO]
        Path to the dataset file, or an open text stream

    date_cols : List[Union[int, str]] ("multicol" mode only)
        Columns that are combined to construct the datetime values

    col_units : str ("table" mode only)
        Time unit represented by the columns.  Accepted values: "month",
        "hour".

    zero_idx : bool ("table" mode only, default: False)
        Whether the column labels start counting at 0.  Hours or months are
        frequently labelled as the "nth" hour/month, i.e. counting starts
        at 1; leave this as False in that case.

    row_fmt : str ("table" mode only, default: empty string)
        Format of the datetime column, as a strftime format string (see:
        `man 3 strftime`).  When empty, parsing is left to the Pandas
        default.

    source_t : str (default: empty string)
        Layout of the data.  Accepted values: "table", "multicol", or an
        empty string.

    **kwargs : Dict
        Other keyword arguments, passed on to the reader backend.  Options
        given here take precedence over values inferred from the arguments
        above.

    Returns
    -------
    ts : Series/DataFrame
        A series or a dataframe, depending on the number of value columns
        that are present.

    Raises
    ------
    ValueError
        If `col_units` is missing in "table" mode, or `date_cols` is
        missing in "multicol" mode.

    Examples
    --------

    Rows with bad or missing data can be skipped with the `skiprows`
    option, a list-like of row numbers.

    >>> read_timeseries("mydata.csv", source_t="table", col_units="hour",
    ...     skiprows=range(1522, 5480))  # doctest: +SKIP

    The call above skips rows 1522 up to, and including, 5479.

    The data type of the values can be set with the `dtype` option.  A
    single `numpy.dtype` applies to all values, which is most useful in
    "table" mode.  In "multicol" mode a dictionary mapping column names to
    `numpy.dtype` gives per-column control.

    """
    # FIXME: parse_dates & index_col assumes input is oriented as portrait
    if source_t == "multicol":
        if date_cols is None:
            raise ValueError("date_cols: missing list of datetime columns")
        return from_multicol(fpath, date_cols=date_cols, **kwargs)

    if source_t == "table":
        if col_units is None:
            raise ValueError("col_units: missing time unit for columns")
        table_opts = {
            "col_units": col_units,
            "zero_idx": zero_idx,
            "row_fmt": row_fmt,
        }
        return from_table(fpath, **table_opts, **kwargs)

    # anything else is read as a regular CSV; complain only when the caller
    # explicitly asked for a layout that is not recognised
    if source_t:
        logger.warning(f"{source_t}: unsupported source, falling back to default")
    return pd.read_csv(fpath, **kwargs)


def from_table(
    fpath: _file_t,
    *,
    col_units: str,
    zero_idx: bool,
    row_fmt: str = "",
    **kwargs,
):
    """Read a time series from a tabular file.

    Every row of the file is expected to start with a date (or a year),
    followed by one value per sub-period; the column labels are the
    sub-period numbers.  The table is flattened into a series, and the row
    and column labels are merged into a single datetime index.

    Parameters
    ----------
    fpath : Union[str, Path, TextIO]
        Path to the dataset file, or an open text stream

    col_units : str
        Time unit represented by the columns: "month" or "hour".

    zero_idx : bool
        Whether the column labels start counting at 0 (True) or at 1
        (False).

    row_fmt : str (default: empty string)
        strftime format of the first (datetime) column.  When empty, the
        Pandas default parser is used, except for monthly columns, where it
        defaults to "%Y".

    **kwargs : Dict
        Other keyword arguments for `pandas.read_csv`; these override the
        options inferred here (`parse_dates`, `index_col`, and the date
        format/parser).

    Returns
    -------
    ts : Series
        The flattened values, indexed by the merged datetime values.

    Raises
    ------
    ValueError
        If `col_units` is not a supported time unit.

    See Also
    --------
    read_timeseries : see for full documentation, main entrypoint for users

    """
    # local import: only needed to probe the installed Pandas reader API
    import inspect

    # NOTE: allow for plural forms, as it is quite common, but the allowance is
    # undocumented, hence not guaranteed to work.
    is_monthly = "month" in col_units
    if is_monthly:
        offset = pd.tseries.offsets.MonthBegin()
    elif "hour" in col_units:
        offset = pd.Timedelta(1, unit="hour")
    else:
        raise ValueError(f"{col_units}: unsupported column units")

    # NOTE: assumption: input is oriented as portrait
    opts = {"parse_dates": [0], "index_col": 0}
    # NOTE: for date-hour, it's okay to use the default parser for date,
    # unless otherwise specified, however for year-month it gets confused and
    # the format string needs to be explicitly set to YYYY.  Use the same
    # (plural tolerant) test as above, so that "months" is not left out.
    if is_monthly and not row_fmt:
        row_fmt = "%Y"
    # NOTE: an explicit parser/format passed by the caller always wins, so
    # only infer one when neither is present among the keyword arguments
    if row_fmt and not {"date_parser", "date_format"} & kwargs.keys():
        if "date_format" in inspect.signature(pd.read_csv).parameters:
            # `date_format` supersedes the deprecated `date_parser` option
            opts["date_format"] = row_fmt
        else:
            # older Pandas: only the callable based option is available
            opts["date_parser"] = lambda dt: datetime.strptime(dt, row_fmt)
    # NOTE: "parse_dates", and "index_col" maybe overidden by the keyword
    # arguments so that the user has the option to ignore the inferred values;
    # it's a wild world, can't think of everything ;)
    opts.update(kwargs)
    ts = pd.read_csv(fpath, **opts).stack()

    # merge indices: level 0 holds the row datetime, level 1 the column label
    idx_lvls = [ts.index.get_level_values(i) for i in (0, 1)]
    # labels counted from 1 are shifted so that the first column adds nothing
    ts_delta = (idx_lvls[1].astype(int) - int(not zero_idx)) * offset
    with warnings.catch_warnings():
        # adding an index of month offsets is not vectorised, and Pandas
        # warns about it; that is expected here
        warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
        ts.index = idx_lvls[0] + ts_delta
    return ts


def from_multicol(fpath: _file_t, *, date_cols: List[_col_t], **kwargs):
    """Read a time series whose datetime values span several columns.

    Parameters
    ----------
    fpath : Union[str, Path, TextIO]
        Path to the dataset file, or an open text stream

    date_cols : List[Union[int, str]]
        Columns that are combined to construct the datetime values

    **kwargs : Dict
        Other keyword arguments for `pandas.read_csv`

    Returns
    -------
    df : DataFrame
        The dataset as returned by `pandas.read_csv`, with its first
        column (the combined datetime column) as the index.

    See Also
    --------
    read_timeseries : see for full documentation, main entrypoint for users

    """
    # NOTE(review): nested lists in parse_dates look to be deprecated in
    # recent Pandas releases -- confirm the supported Pandas version range
    reader_opts = {
        # the columns are nested in a list so that they are parsed together
        "parse_dates": [date_cols],
        # NOTE: columns parsed as dates always end up in the front, hence 0
        "index_col": 0,
    }
    return pd.read_csv(fpath, **reader_opts, **kwargs)