WenjieDu/PyPOTS

View on GitHub
pypots/data/load_specific_datasets.py

Summary

Maintainability
A
0 mins
Test Coverage
"""
Functions to load supported open-source time-series datasets.
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause


from benchpots.datasets import preprocess_physionet2012

from ..utils.logging import logger

# preprocessing functions of the supported datasets, keyed by dataset name
PREPROCESSING_FUNC = {"physionet_2012": preprocess_physionet2012}

# currently supported datasets, i.e. every name that has a preprocessing function registered above
SUPPORTED_DATASETS = list(PREPROCESSING_FUNC)


def list_supported_datasets() -> list:
    """List the names of the datasets that PyPOTS can currently load natively.

    Returns
    -------
    SUPPORTED_DATASETS :
        The module-level list holding the names of all supported datasets.
        Note that the list object itself is returned, not a copy.

    """
    supported = SUPPORTED_DATASETS
    return supported


def load_specific_dataset(dataset_name: str, use_cache: bool = True) -> dict:
    """Load specific datasets supported by PyPOTS.

    Different from tsdb.load(), which produces only raw data,
    load_specific_dataset here also does some preprocessing operations,
    like truncating time series to generate samples with the same length.

    Parameters
    ----------
    dataset_name :
        The name of the dataset to be loaded, which should be supported, i.e. in SUPPORTED_DATASETS.

    use_cache :
        Whether to use cache. Kept for backward compatibility of the signature;
        the current implementation does not forward it to the preprocessing function.

    Returns
    -------
    data :
        A dict contains the preprocessed dataset.
        Users only need to continue the preprocessing steps to generate the data they want,
        e.g. standardizing and splitting.

    Raises
    ------
    AssertionError
        If ``dataset_name`` is not in SUPPORTED_DATASETS.

    """
    logger.info(f"Loading the dataset {dataset_name} with TSDB (https://github.com/WenjieDu/Time_Series_Data_Beans)...")
    # Raise explicitly rather than using an `assert` statement: asserts are stripped under
    # `python -O`, which would let an unsupported name fall through to a bare KeyError below.
    # AssertionError is kept as the exception type so existing callers are unaffected.
    if dataset_name not in SUPPORTED_DATASETS:
        raise AssertionError(
            f"Dataset {dataset_name} is not supported. "
            f"If you believe this dataset is valuable to be supported by PyPOTS, "
            f"please create an issue on GitHub "
            f"https://github.com/WenjieDu/PyPOTS/issues"
        )
    logger.info(f"Starting preprocessing {dataset_name}...")
    # NOTE(review): the positional arguments ("all", 0.1) are presumably the subset to load and
    # the artificial missing rate expected by the BenchPOTS preprocessing function — confirm.
    data = PREPROCESSING_FUNC[dataset_name]("all", 0.1)
    logger.warning(
        "⚠️ load_specific_dataset() will be deprecated in the near future. Data preprocessing functions "
        "are moved to BenchPOTS, which now supports processing 170+ public time-series datasets."
    )
    return data