carlculator/timeseriesx

View on GitHub
timeseriesx/mixins/frequency.py

Summary

Maintainability
B
4 hrs
Test Coverage
B
88%
import copy

import numpy as np
import pandas as pd

from timeseriesx.mixins import BaseMixin
from timeseriesx.validation.frequency import (
    coerce_freq,
    infer_freq,
)


class FrequencyMixin(BaseMixin):
    """
    mixin that attaches a frequency to a time series and offers
    frequency-dependent operations (gap detection, gap filling, resampling)

    The mixin reads and writes `self._series` and stores the frequency in
    `self._freq`. If the host object exposes a `time_zone` attribute, it is
    used when building the expected timestamps.
    """

    def __init__(self, *args, **kwargs):
        """
        initialize the frequency from the keyword argument `freq`

        :param args: positional arguments, passed on to the parent initializer
        :param kwargs: keyword arguments, passed on to the parent initializer.
            The key `freq` is read here: if it equals "infer", the frequency
            is determined via `infer_freq` from the series, otherwise the
            value is coerced via `coerce_freq`. Defaults to None
        :raises ValueError: if the frequency does not conform to the timestamps
        """
        super().__init__(*args, **kwargs)
        self._freq = kwargs.get('freq', None)
        if self._freq == 'infer':
            self._freq = infer_freq(self._series)
        self._validate_freq()

    @property
    def freq(self):
        """
        the frequency of the series (read-only)
        """
        return self._freq

    def fill_gaps(self, start=None, end=None, value=np.nan):
        """
        fill all gaps between `start` and `end` in a series with a frequency with a
        constant value

        The series is modified in place: it is reindexed to the timestamps
        expected between `start` and `end`, and every null value in the
        result (newly created or already present) is replaced by `value`.

        :param datetime.datetime start: the start timestamps of the period that will be
            investigated (included). If None, then the first timestamp in the
            time series is considered as start. Defaults to None
        :param datetime.datetime end: the end timestamps of the period that will be
            investigated (included). If None, then the last timestamp in the
            time series is considered as end. Defaults to None
        :param float/int/np.float value: the constant fill value,
            defaults to NaN
        :return: return the series with filled gaps
        :rtype: BaseTimeSeries
        :raises ValueError: if no frequency is set, if the series is empty and
            `start` or `end` is missing, or if the time zone of `start` or
            `end` does not match the time zone of the series
        """
        if not self._freq:
            raise ValueError('cannot determine gaps when freq is not set')
        if (not start or not end) and self._series.empty:
            raise ValueError('cannot fill the gaps for empty series '
                             'without parameters providing start and end.')

        start = start or self._series.index[0].to_pydatetime()
        end = end or self._series.index[-1].to_pydatetime()

        try:
            expected_index = pd.date_range(
                start, end, freq=self._freq, tz=self._get_time_zone())
        except (AssertionError, TypeError) as err:
            # pandas rejects boundaries whose time zone conflicts with `tz`;
            # keep the original error as the cause for easier debugging
            raise ValueError('time zone of parameter start or end does not match '
                             'the time zone of the series') from err
        # a right join keeps exactly the expected timestamps, so the series
        # is both extended by the missing ones and cut to [start, end]
        self._series = self._series.reindex(
            self._series.index.join(expected_index, how='right'))
        self._series.loc[self._series.isnull()] = value
        return self

    def get_gaps(self, start=None, end=None):
        """
        get all timestamps between `start` and `end` from a series with a frequency,
        where the value is missing or NaN

        The series itself is left untouched, the gaps are determined on a
        deep copy.

        :param datetime.datetime start: the start timestamps of the period that will be
            investigated (included). If None, then the first timestamp in the
            time series is considered as start. Defaults to None
        :param datetime.datetime end: the end timestamps of the period that will be
            investigated (included). If None, then the last timestamp in the
            time series is considered as end. Defaults to None
        :return: list of timestamps
        :rtype: list of datetime.datetime
        :raises ValueError: if no frequency is set, or if the series is empty
            and `start` or `end` is missing
        """
        if not self._freq:
            raise ValueError('cannot determine gaps when freq is not set')
        if (not start or not end) and self._series.empty:
            raise ValueError('cannot determine the gaps from empty series '
                             'without parameters providing start and end.')

        tmp_series = copy.deepcopy(self)
        start = start or self._series.index[0].to_pydatetime()
        end = end or self._series.index[-1].to_pydatetime()

        # filling with the default value (NaN) marks every gap as null
        tmp_series.fill_gaps(start, end)
        gap_series = tmp_series._series[tmp_series._series.isnull()]
        return gap_series.index.to_pydatetime().tolist()

    def resample(self, freq, method):
        """
        resample the series to a smaller frequency, aggregate the values

        The series is modified in place and its frequency is updated.

        :param str/datetime.timedelta/pandas.Offset/pandas.Timedelta freq:
            the new frequency, has to be smaller than the current frequency
            (greater offset)
        :param str/Callable method: aggregation method, currently supported
            are "all", "any", "min", "max", "sum", "mean", "median", or a function
            that takes a collection (e.g. pandas.Series or list) of numeric
            values as its argument and returns a scalar
        :return: the resampled time series
        :rtype: BaseTimeSeries
        :raises ValueError: if no frequency is set or if the new frequency is
            not smaller than the current one
        """
        if not self._freq:
            raise ValueError('cannot resample when freq is not set')
        freq = coerce_freq(freq)
        if self._freq >= freq:
            raise ValueError(
                'can only resample to smaller frequencies (larger offsets)'
            )
        # perform the aggregation on a tmp_series to avoid
        # potential problems with units. Issue with unit aggregations has been
        # reported to pint-pandas https://github.com/hgrecco/pint-pandas/issues/117
        tmp_series = self.as_pd_series()
        tmp_series = tmp_series.resample(freq).aggregate(method)
        # restore the dtype of the original series after the aggregation
        self._series = tmp_series.astype(self._series.dtype)
        self._freq = freq
        return self

    def _validate_freq(self):
        """
        coerce the frequency and check that it conforms to the timestamps

        :raises ValueError: if the index of the series rejects the frequency
        """
        self._freq = coerce_freq(self._freq)
        try:
            # assigning the frequency to the index lets pandas verify it
            self._series.index.freq = self._freq
        except ValueError as err:
            raise ValueError('frequency does not conform to timestamps') from err

    def _validate_all(self):
        """
        run the validations of the parent classes, then validate the frequency
        """
        super()._validate_all()
        self._validate_freq()

    def _get_time_zone(self):
        """
        get the time zone of the series, if there is any

        :return: the value of the attribute `time_zone` or None, if the
            object has no such attribute
        """
        return getattr(self, 'time_zone', None)