gwosc/locate.py

Summary

Maintainability
B
4 hrs
Test Coverage
# -*- coding: utf-8 -*-
# Copyright (C) Cardiff University (2018-2021)
# SPDX-License-Identifier: MIT

"""
`gwosc.locate` provides functions to determine the file URLs containing
data for a specific dataset.

You can search for remote data URLS based on the event name:

>>> from gwosc.locate import get_event_urls
>>> get_event_urls('GW150914')
['https://gwosc.org/eventapi/json/GWTC-1-confident/GW150914/v3/H-H1_GWOSC_4KHZ_R1-1126259447-32.hdf5',
 'https://gwosc.org/eventapi/json/GWTC-1-confident/GW150914/v3/H-H1_GWOSC_4KHZ_R1-1126257415-4096.hdf5',
 'https://gwosc.org/eventapi/json/GWTC-1-confident/GW150914/v3/L-L1_GWOSC_4KHZ_R1-1126259447-32.hdf5',
 'https://gwosc.org/eventapi/json/GWTC-1-confident/GW150914/v3/L-L1_GWOSC_4KHZ_R1-1126257415-4096.hdf5']

You can down-select the URLs using keyword arguments:

>>> get_event_urls('GW150914', detector='L1', duration=32)
['https://gwosc.org/eventapi/json/GWTC-1-confident/GW150914/v3/L-L1_GWOSC_4KHZ_R1-1126259447-32.hdf5']

You can search for remote data URLs based on the GPS time interval as
follows:

>>> from gwosc.locate import get_urls
>>> get_urls('L1', 968650000, 968660000)
['https://gwosc.org/archive/data/S6/967835648/L-L1_LOSC_4_V1-968646656-4096.hdf5',
 'https://gwosc.org/archive/data/S6/967835648/L-L1_LOSC_4_V1-968650752-4096.hdf5',
 'https://gwosc.org/archive/data/S6/967835648/L-L1_LOSC_4_V1-968654848-4096.hdf5',
 'https://gwosc.org/archive/data/S6/967835648/L-L1_LOSC_4_V1-968658944-4096.hdf5']

By default, this method will return the paths to HDF5 files for the 4 kHz
sample-rate data, these can be specified as keyword arguments.
For full information, see :func:`get_urls`.
"""  # noqa: E501

from . import (
    api,
    datasets,
    urls as lurls,
    utils,
)

__all__ = ['get_urls', 'get_run_urls', 'get_event_urls']
__author__ = 'Duncan Macleod <duncan.macleod@ligo.org>'


def get_urls(
        detector, start, end,
        dataset=None,
        version=None,
        sample_rate=4096,
        format='hdf5',
        host=api.DEFAULT_URL,
):
    """Fetch the URLs from GWOSC regarding a given GPS interval

    Parameters
    ----------
    detector : `str`
        the prefix of the relevant GW detector

    start : `int`
        the GPS start time of your query

    end : `int`
        the GPS end time of your query

    dataset : `str`, `None`, optional
        the name of the dataset to query, e.g. ``'GW150914'``

    version : `int`, `None`, optional
        the data-release version for the selected datasets

    sample_rate : `int`, optional, default : ``4096``
        the sampling rate (Hz) of files you want to find

    format : `str`, optional, default: ``'hdf5'``
        the file format (extension) you want to find

    host : `str`, optional
        the URL of the remote GWOSC server

    Returns
    -------
    urls : `list` of `str`
        the list of remote file URLs that contain data matching the
        relevant parameters
    """
    start = int(start)
    end = int(end)

    for dstype in ("event", "run"):
        dsets = datasets._iter_datasets(
            match=dataset,
            type=dstype,
            detector=detector,
            segment=(start, end),
            version=version,
            host=host,
        )

        for dst in dsets:
            # get URL list for this dataset
            if dstype == "run":
                urls = get_run_urls(
                    dst,
                    detector,
                    start,
                    end,
                    host=host,
                    sample_rate=sample_rate,
                    format=format,
                )
            else:
                urls = get_event_urls(
                    dst,
                    detector=detector,
                    start=start,
                    end=end,
                    format=format,
                    sample_rate=sample_rate,
                    version=version,
                    host=host,
                )

            # if full span covered, return now
            if utils.full_coverage(urls, (start, end)):
                return urls

    raise ValueError("Cannot find a GWOSC dataset for %s covering [%d, %d)"
                     % (detector, start, end))


def get_run_urls(
        run,
        detector,
        start,
        end,
        format='hdf5',
        sample_rate=4096,
        **match
):
    """Fetch the URLs from GWOSC regarding a given event

    Parameters
    ----------
    event : `str`
        the ID of the event

    detector : `str`, optional
        the detector for files you want to find

    start : `int`
        the GPS start time of your query

    end : `int`
        the GPS end time of your query

    format : `str`, optional, default: ``'hdf5'``
        the file format (extension) you want to find

    sample_rate : `int`, optional, default : ``4096``
        the sampling rate (Hz) of files you want to find

    host : `str`, optional
        the URL of the remote GWOSC server

    version : `int`, `None`, optional
        the data-release version for the selected datasets

    Returns
    -------
    urls : `list` of `str`
        the list of remote file URLs that contain data matching the
        relevant parameters
    """
    # get the URL file list for this run segment
    meta = api.fetch_run_json(
        run,
        detector,
        gpsstart=start,
        gpsend=end,
        host=match.pop('host', api.DEFAULT_URL),
    )

    # filter
    return _urls(lurls.sieve(
        meta["strain"],
        format=format,
        sampling_rate=sample_rate,
        **match
    ))


def get_event_urls(
        event,
        catalog=None,
        version=None,
        detector=None,
        start=None,
        end=None,
        format='hdf5',
        sample_rate=4096,
        host=api.DEFAULT_URL,
        **match
):
    """Fetch the URLs from GWOSC regarding a given event

    Parameters
    ----------
    event : `str`
        the ID of the event

    detector : `str`, optional
        the detector for files you want to find

    format : `str`, optional, default: ``'hdf5'``
        the file format (extension) you want to find

    sample_rate : `int`, optional, default : ``4096``
        the sampling rate (Hz) of files you want to find

    host : `str`, optional
        the URL of the remote GWOSC server

    start : `int`
        the GPS start time of your query

    end : `int`
        the GPS end time of your query

    version : `int`, `None`, optional
        the data-release version for the selected datasets

    Returns
    -------
    urls : `list` of `str`
        the list of remote file URLs that contain data matching the
        relevant parameters
    """
    # get the URL file list for this event
    meta = list(api._fetch_allevents_event_json(
        event,
        catalog=catalog,
        version=version,
        full=True,
        host=host,
    )["events"].values())[0]
    strain = meta["strain"]

    # format start and end as a segment
    if start is None and end is None:
        segment = None
    else:
        segment = (
            -float('inf') if start is None else start,
            +float('inf') if end is None else end,
        )

    # then filter on everything els
    return _urls(lurls.sieve(
        strain,
        detector=detector,
        format=format,
        sampling_rate=sample_rate,
        segment=segment,
        **match
    ))


def _urls(strain):
    return [u["url"] for u in strain]