Garve/scikit-bonus

View on GitHub
skbonus/pandas/time_utils.py

Summary

Maintainability
B
4 hrs
Test Coverage
"""Deal with time features in dataframes."""

import numpy as np
import pandas as pd


def explode_date(
    df: pd.DataFrame,
    start_column: str,
    end_column: str,
    result_column: str = "Date",
    frequency: str = "d",
    drop: bool = True,
) -> pd.DataFrame:
    """
    Transform a pandas dataframe with columns (*, start_date, end_date) into a longer format with columns (*, date).

    This is useful if you deal with datasets that contain special time periods per row, but you need a single date per row.
    See the examples for more details.

    Parameters
    ----------
    df : pd.DataFrame
        The input dataframe with a column containing start dates and a column containing end dates.
        It is not modified.

    start_column : str
        Name of the column holding the start date of the period.

    end_column : str
        Name of the column holding the end date of the period.

    result_column : str, default="Date"
        Name of the new output date column.

    frequency : str, default="d" (for day)
        A pandas time frequency, passed as `freq` to `pd.date_range`. Can take values like "d" for day
        or "m" for month. A full list can be found on
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases.

    drop : bool, default=True
        Whether to drop the `start_column` and `end_column` in the output.

    Returns
    -------
    pd.DataFrame
        A longer dataframe with one date per row. The index labels of the input rows are repeated
        for every date generated from that row. An input without rows yields an output without rows.

    Raises
    ------
    KeyError
        If `start_column` or `end_column` is not a column of `df`.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...    "Data": ["a", "b", "c"],
    ...    "Start": pd.date_range("2020-01-01", periods=3),
    ...    "End": pd.date_range("2020-01-03", periods=3)
    ... })
    >>> df
      Data      Start        End
    0    a 2020-01-01 2020-01-03
    1    b 2020-01-02 2020-01-04
    2    c 2020-01-03 2020-01-05

    >>> explode_date(df, start_column="Start", end_column="End", result_column="output_date", frequency="d")
      Data output_date
    0    a  2020-01-01
    0    a  2020-01-02
    0    a  2020-01-03
    1    b  2020-01-02
    1    b  2020-01-03
    1    b  2020-01-04
    2    c  2020-01-03
    2    c  2020-01-04
    2    c  2020-01-05
    """
    # Look the columns up first so a wrong column name fails the same way for empty and non-empty input.
    starts = df[start_column]
    ends = df[end_column]

    if len(df) == 0:
        # There is nothing to explode; just add an empty datetime column.
        no_dates = pd.Series(index=df.index, dtype="datetime64[ns]")
        exploded = df.assign(**{result_column: no_dates})
    else:
        # One DatetimeIndex per input row; iterating the two columns directly
        # avoids the overhead of a row-wise DataFrame.apply.
        ranges = [
            pd.date_range(start=start, end=end, freq=frequency)
            for start, end in zip(starts, ends)
        ]
        # dtype=object keeps each DatetimeIndex as a single cell that explode() can unpack.
        per_row_dates = pd.Series(ranges, index=df.index, dtype=object)
        exploded = df.assign(**{result_column: per_row_dates}).explode(result_column)

    if drop:
        exploded = exploded.drop(columns=[start_column, end_column])
    return exploded


def add_date_indicators(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Enrich a pandas dataframe with new columns indicating if there is a special date.

    For every keyword argument a new column is added. It contains a one for each index entry that is
    among the dates given for that keyword, zero otherwise.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with a DateTime index. It is not modified.

    kwargs : List[str]*
        As many inputs as you want of the form date_name=[date_1, date_2, ...], i.e. christmas=['2020-12-24'].
        See the example below for more information.

    Returns
    -------
    pd.DataFrame
        A dataframe with one additional integer indicator column per keyword argument.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": range(7)}, index=pd.date_range(start="2019-12-29", periods=7))
    >>> add_date_indicators(
    ...     df,
    ...     around_new_year_2020=["2019-12-31", "2020-01-01", "2020-01-02"],
    ...     other_date_1=["2019-12-29"],
    ...     other_date_2=["2018-01-01"]
    ... )
                A  around_new_year_2020  other_date_1  other_date_2
    2019-12-29  0                     0             1             0
    2019-12-30  1                     0             0             0
    2019-12-31  2                     1             0             0
    2020-01-01  3                     1             0             0
    2020-01-02  4                     1             0             0
    2020-01-03  5                     0             0             0
    2020-01-04  6                     0             0             0
    """
    index_is_datetime = isinstance(df.index, pd.DatetimeIndex)

    def _comparable(dates):
        """Return `dates` in a form that can be matched against the index of `df`."""
        if not index_is_datetime:
            return dates
        # Parse explicitly instead of relying on isin() casting strings to datetimes
        # implicitly, which newer pandas versions deprecate.
        try:
            return pd.DatetimeIndex(dates)
        except (TypeError, ValueError):
            # Keep the previous best-effort behavior for values that cannot be parsed.
            return dates

    return df.assign(
        **{
            name: df.index.isin(_comparable(dates)).astype(int)
            for name, dates in kwargs.items()
        }
    )


def add_time_features(
    df: pd.DataFrame,
    second: bool = False,
    minute: bool = False,
    hour: bool = False,
    day_of_week: bool = False,
    day_of_month: bool = False,
    day_of_year: bool = False,
    week_of_month: bool = False,
    week_of_year: bool = False,
    month: bool = False,
    year: bool = False,
) -> pd.DataFrame:
    """
    Add columns to a dataframe that are simple derivations of its DatetimeIndex, such as the day of week or the month.

    Each enabled flag adds one column named like the flag. The columns are added in the order
    in which the flags appear in the signature.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with a DateTime index.

    second : bool, default=False
        Whether to extract the second from the index and add it as a new column.

    minute : bool, default=False
        Whether to extract the minute from the index and add it as a new column.

    hour : bool, default=False
        Whether to extract the hour from the index and add it as a new column.

    day_of_week : bool, default=False
        Whether to extract the day of week from the index and add it as a new column.
        The value is the index's `weekday` plus one.

    day_of_month : bool, default=False
        Whether to extract the day of month from the index and add it as a new column.

    day_of_year : bool, default=False
        Whether to extract the day of year from the index and add it as a new column.

    week_of_month : bool, default=False
        Whether to extract the week of month from the index and add it as a new column.
        The value is the day of month divided by seven, rounded up.

    week_of_year : bool, default=False
        Whether to extract the week of year from the index and add it as a new column.
        The value is the `week` field of the index's `isocalendar()`.

    month : bool, default=False
        Whether to extract the month from the index and add it as a new column.

    year : bool, default=False
        Whether to extract the year from the index and add it as a new column.

    Returns
    -------
    pd.DataFrame
        A dataframe with one extra column per enabled flag. If no flag is enabled,
        the input dataframe itself is returned.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame(
    ...     {"A": ["a", "b", "c"]},
    ...     index=[
    ...         pd.Timestamp("1988-08-08"),
    ...         pd.Timestamp("2000-01-01"),
    ...         pd.Timestamp("1950-12-31"),
    ...     ])
    >>> add_time_features(df, day_of_month=True, month=True, year=True)
                A  day_of_month  month  year
    1988-08-08  a             8      8  1988
    2000-01-01  b             1      1  2000
    1950-12-31  c            31     12  1950
    """
    # (column name, requested?, extractor working on the index), listed in output column order.
    feature_table = (
        ("second", second, lambda idx: idx.second),
        ("minute", minute, lambda idx: idx.minute),
        ("hour", hour, lambda idx: idx.hour),
        ("day_of_week", day_of_week, lambda idx: idx.weekday + 1),
        ("day_of_month", day_of_month, lambda idx: idx.day),
        ("day_of_year", day_of_year, lambda idx: idx.dayofyear),
        (
            "week_of_month",
            week_of_month,
            lambda idx: np.ceil(idx.day / 7).astype(int),
        ),
        ("week_of_year", week_of_year, lambda idx: idx.isocalendar().week),
        ("month", month, lambda idx: idx.month),
        ("year", year, lambda idx: idx.year),
    )

    enriched = df
    for column, requested, extract in feature_table:
        if requested:
            # Extractors run only for requested features, one assign per feature.
            enriched = enriched.assign(**{column: extract(enriched.index)})
    return enriched