Source code for friendly_data.tseries

"""Convenience functions useful to ingest different kinds of differently shaped
time series data into the standard 1-D shape supported by the data package
specification.

"""

from datetime import datetime
from logging import getLogger
from pathlib import Path
from typing import List, TextIO, TypeVar, Union
import warnings

import pandas as pd

logger = getLogger(__name__)

_file_t = TypeVar("_file_t", str, Path, TextIO)
_col_t = TypeVar("_col_t", int, str)


def read_timeseries(
    fpath: _file_t,
    *,
    date_cols: Union[List[_col_t], None] = None,
    col_units: Union[str, None] = None,
    zero_idx: bool = False,
    row_fmt: str = "",
    source_t: str = "",
    **kwargs,
):
    """Read a time series from a file.

    While the natural way to structure a time series dataset is with the
    index column as datetime values, and subsequent columns holding other
    values, a few other structures are frequently used.  The first is to
    structure it as a table:

    =========== === === ===== ==== ====
    date        1   2   ...   23   24
    =========== === === ===== ==== ====
    1/1/2016    0   10  ...   2.3  5.1
    4/1/2016    3   11  ...   4.3  9.1
    =========== === === ===== ==== ====

    When `source_t` is set to "table", this function reads a tabular dataset
    like the one above, flattens it into a series, and sets the appropriate
    datetime values as its index.

    The other common structure is to split the datetime values into multiple
    columns of the table:

    =========== ====== ====== ======
    date        time   col1   col2
    =========== ====== ====== ======
    1/1/2016    10:00  42.0   foo
    4/1/2016    11:00  3.14   bar
    =========== ====== ====== ======

    When `source_t` is set to "multicol", as the table is read, the indicated
    columns are combined to construct the datetime values, which are then set
    as the index.

    If `source_t` is not specified (or set to an empty string), options
    specific to this function are ignored, and all other keyword options are
    passed on to the backend transparently; when reading a CSV with Pandas,
    that means all valid keywords for `pandas.read_csv` are accepted.

    Parameters
    ----------
    fpath : Union[str, Path, TextIO]
        Path to the dataset file
    date_cols : List[Union[int, str]] (for "multicol" mode)
        List of columns to be combined to construct the datetime values
    col_units : str (for "table" mode)
        Time units for the columns.  Accepted values: "month", "hour".
    zero_idx : bool (for "table" mode, default: False)
        Whether the columns are zero-indexed.  When the columns represent
        hours or minutes, it is common to number them as the nth hour,
        i.e. counting starts at 1 instead of 0.  Set this to False in that
        case.
    row_fmt : str (for "table" mode, default: empty string)
        Format of the datetime column (use strftime format strings, see:
        `man 3 strftime`).  If this is left empty, the reader tries to guess
        a format using the `dateutil` module (the Pandas default).
    source_t : str (default: empty string)
        Mode of reading the data.  Accepted values: "table", "multicol", or
        an empty string.
    **kwargs : Dict
        Other keyword arguments passed on to the reader backend.  Any options
        passed here take precedence and overwrite values inferred from the
        earlier keyword arguments.

    Returns
    -------
    ts : Series/DataFrame
        The time series is returned as a series or a dataframe depending on
        the number of other columns present.

    Examples
    --------
    To skip specific rows, maybe because they have bad data or are empty, you
    may use the `skiprows` option.  It can be set to a list-like where the
    entries are row indices (numbers).

    >>> read_timeseries("mydata.csv", source_t="table", col_units="hour",
    ...                 skiprows=range(1522, 5480))  # doctest: +SKIP

    The above example skips rows 1522-5479 (the end of a `range` is
    exclusive).  Similarly, the data type of the column values can be
    controlled with the `dtype` option.  When set to a `numpy.dtype`, all
    values are read as that type, which is probably most relevant for the
    "table" mode.

    In the "multicol" mode, the types of the values can be controlled at the
    column level by setting it to a dictionary, where the key matches a
    column name, and the value is a valid `numpy.dtype`.

    """
    # FIXME: parse_dates & index_col assume the input is oriented as portrait
    if source_t == "table":
        if col_units is None:
            raise ValueError("col_units: missing time unit for columns")
        ts = from_table(
            fpath,
            col_units=col_units,
            zero_idx=zero_idx,
            row_fmt=row_fmt,
            **kwargs,
        )
    elif source_t == "multicol":
        if date_cols is None:
            raise ValueError("date_cols: missing list of datetime columns")
        ts = from_multicol(fpath, date_cols=date_cols, **kwargs)
    else:
        if source_t:
            logger.warning(f"{source_t}: unsupported source, falling back to default")
        ts = pd.read_csv(fpath, **kwargs)
    return ts


def from_table(
    fpath: _file_t,
    *,
    col_units: str,
    zero_idx: bool,
    row_fmt: str = "",
    **kwargs,
):
    """Read a time series from a tabular file.

    See Also
    --------
    read_timeseries : see for full documentation, main entrypoint for users

    """
    # NOTE: allow for plural forms, as they are quite common, but the
    # allowance is undocumented, hence not guaranteed to work
    if "month" in col_units:
        offset = pd.tseries.offsets.MonthBegin()
    elif "hour" in col_units:
        offset = pd.Timedelta(1, unit="hour")
    else:
        raise ValueError(f"{col_units}: unsupported column units")
    # NOTE: assumption: input is oriented as portrait
    opts = {"parse_dates": [0], "index_col": 0}
    # NOTE: for date-hour, it is okay to use the default dateutil parser for
    # the date, unless otherwise specified; however, for year-month it gets
    # confused, and the format string needs to be explicitly set to YYYY
    if col_units == "month" and row_fmt == "":
        row_fmt = "%Y"
    if row_fmt:
        opts.update(date_parser=lambda dt: datetime.strptime(dt, row_fmt))
    # NOTE: "parse_dates" and "index_col" may be overridden by the keyword
    # arguments so that the user has the option to ignore the inferred
    # values; it's a wild world, can't think of everything ;)
    opts.update(kwargs)
    ts = pd.read_csv(fpath, **opts).stack()
    # merge the two index levels (date rows and time-unit columns) into a
    # single datetime index
    idx_lvls = [ts.index.get_level_values(i) for i in (0, 1)]
    ts_delta = (idx_lvls[1].astype(int) - int(not zero_idx)) * offset
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
        ts.index = idx_lvls[0] + ts_delta
    return ts
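

# Usage sketch (illustrative, not part of the original module): monthly
# columns with a year-only date column, which exercises the implicit
# row_fmt="%Y" branch above.  The file content and the helper name
# `_example_from_table_months` are made up.
def _example_from_table_months():
    from io import StringIO

    csv = StringIO("year,1,2,3\n2016,1.0,2.0,3.0\n2017,4.0,5.0,6.0\n")
    # months are counted from 1 (January), hence zero_idx=False
    return from_table(csv, col_units="month", zero_idx=False)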


def from_multicol(fpath: _file_t, *, date_cols: List[_col_t], **kwargs):
    """Read a time series where datetime values are in multiple columns.

    See Also
    --------
    read_timeseries : see for full documentation, main entrypoint for users

    """
    # NOTE: index_col=0 b/c columns parsed as dates always end up in the front
    df = pd.read_csv(fpath, parse_dates=[date_cols], index_col=0, **kwargs)
    return df
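

# Usage sketch (illustrative, not part of the original module): the
# "multicol" example table from the `read_timeseries` docstring, with the
# "date" and "time" columns combined into a single datetime index.  The file
# content and the helper name `_example_from_multicol` are made up.
def _example_from_multicol():
    from io import StringIO

    csv = StringIO(
        "date,time,col1,col2\n"
        "1/1/2016,10:00,42.0,foo\n"
        "4/1/2016,11:00,3.14,bar\n"
    )
    return from_multicol(csv, date_cols=["date", "time"])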