gwosc/api.py

Summary

Maintainability
C
7 hrs
Test Coverage
# -*- coding: utf-8 -*-
# Copyright (C) Cardiff University (2018-2021)
# SPDX-License-Identifier: MIT

"""
:mod:`gwosc.api` provides the low-level interface functions
that handle direct requests to the GWOSC host.
"""

import logging
import os
import re
from urllib.parse import urlencode
from . import __version__

import requests

# Module-level logger.  A stream handler using the stock ``logging`` format
# is attached at import time, and the level is taken from the
# GWOSC_LOG_LEVEL environment variable, which must hold an integer
# (e.g. ``10`` for DEBUG); when unset the level stays at NOTSET.
logger = logging.getLogger("gwosc.api")
_loghandler = logging.StreamHandler()
_loghandler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
logger.addHandler(_loghandler)
logger.setLevel(int(os.getenv("GWOSC_LOG_LEVEL", logging.NOTSET)))

#: Default ``gpsend`` used for science-run queries
_MAX_GPS = 99999999999

#: The default GWOSC host URL
DEFAULT_URL = "https://gwosc.org"

#: Cache of downloaded blobs, keyed on the URL they were fetched from
JSON_CACHE = {}

#: Comparison operators accepted in event-selection strings
_ALLOWED_OPS = {">=", "=>", "<=", "=<"}

# -- JSON handling ------------------------------------------------------------


def fetch_json(url, **kwargs):
    """Fetch JSON data from a remote URL

    Responses are stored in the module-level ``JSON_CACHE`` dictionary,
    keyed on ``url`` alone; a later call for the same ``url`` returns the
    cached object without issuing a request (``kwargs`` are not consulted
    on a cache hit).

    Parameters
    ----------
    url : `str`
        the remote URL to fetch

    **kwargs
        other keyword arguments are passed directly to :func:`requests.get`;
        a ``headers`` mapping is merged with the default ``User-Agent``
        header, with caller-supplied values taking precedence

    Returns
    -------
    data : `dict` or `list`
        the data fetched from ``url`` as parsed by
        :meth:`requests.Response.json`

    Raises
    ------
    requests.HTTPError
        if the response carries an error status code
        (see :meth:`requests.Response.raise_for_status`)

    See also
    --------
    json.loads
        for details of the JSON parsing
    """
    try:
        return JSON_CACHE[url]
    except KeyError:
        # cache miss: fall through and download.  The request is made
        # outside this handler so that network errors are not reported
        # as having happened "during handling of" the KeyError.
        pass

    logger.debug("fetching %s", url)

    # Merge caller headers with our default User-Agent instead of passing
    # ``headers`` twice (which would raise TypeError).
    headers = dict(kwargs.pop("headers", None) or {})
    if not any(str(key).lower() == "user-agent" for key in headers):
        headers["User-Agent"] = f"python-gwosc/{__version__}"

    resp = requests.get(url, headers=headers, **kwargs)
    resp.raise_for_status()
    return JSON_CACHE.setdefault(url, resp.json())


# -- Run datasets -------------------------------------------------------------

def _dataset_url(start, end, host=DEFAULT_URL):
    """Build the URL of the JSON dataset listing for a GPS interval

    ``start`` and ``end`` are rendered with the ``d`` format code, so they
    must be integers.
    """
    template = "{}/archive/{:d}/{:d}/json/"
    return template.format(host, start, end)


def fetch_dataset_json(gpsstart, gpsend, host=DEFAULT_URL):
    """Fetch the JSON metadata for all datasets matching a GPS interval

    Parameters
    ----------
    gpsstart : `int`
        the GPS start of the desired interval

    gpsend : `int`
        the GPS end of the desired interval

    host : `str`, optional
        the URL of the GWOSC host to query, defaults to
        https://gwosc.org

    Returns
    -------
    data : `dict` or `list`
        the parsed JSON data retrieved from GWOSC via :func:`fetch_json`
    """
    url = _dataset_url(gpsstart, gpsend, host=host)
    return fetch_json(url)


def _run_url(run, detector, start, end, host=DEFAULT_URL):
    """Build the URL of the JSON file listing for a run and detector

    ``start`` and ``end`` are rendered with the ``d`` format code, so they
    must be integers.
    """
    template = "{}/archive/links/{}/{}/{:d}/{:d}/json/"
    return template.format(host, run, detector, start, end)


def fetch_run_json(run, detector, gpsstart=0, gpsend=_MAX_GPS,
                   host=DEFAULT_URL):
    """Fetch the JSON metadata for the given science run parameters

    Parameters
    ----------
    run : `str`
        the name of the science run, e.g. ``'O1'``

    detector : `str`
        the prefix of the GW detector, e.g. ``'L1'``

    gpsstart : `int`, optional
        the GPS start of the desired interval, defaults to ``0``

    gpsend : `int`, optional
        the GPS end of the desired interval, defaults to ``_MAX_GPS``

    host : `str`, optional
        the URL of the GWOSC host to query, defaults to
        https://gwosc.org

    Returns
    -------
    data : `dict` or `list`
        the parsed JSON data retrieved from GWOSC via :func:`fetch_json`
    """
    url = _run_url(run, detector, gpsstart, gpsend, host=host)
    return fetch_json(url)


# -- EventAPI catalogs -------------------------------------------------------

def _allowed_params_url(host=DEFAULT_URL):
    """Build the URL of the EventAPI ``query/params`` JSON endpoint"""
    template = "{}/eventapi/json/query/params/"
    return template.format(host)


def fetch_allowed_params_json(host=DEFAULT_URL):
    """Fetch the JSON response of the EventAPI ``query/params`` endpoint

    Parameters
    ----------
    host : `str`, optional
        the URL of the GWOSC host to query, defaults to
        https://gwosc.org

    Returns
    -------
    data : `dict` or `list`
        the parsed JSON data retrieved from GWOSC via :func:`fetch_json`
    """
    url = _allowed_params_url(host=host)
    return fetch_json(url)


def _eventapi_url(full=False, host=DEFAULT_URL):
    """Build the base EventAPI URL

    A truthy ``full`` selects the ``jsonfull`` endpoint, anything else
    selects ``json``.
    """
    if full:
        flavour = "jsonfull"
    else:
        flavour = "json"
    return "{}/eventapi/{}/".format(host, flavour)


def fetch_cataloglist_json(host=DEFAULT_URL):
    """Fetch the JSON metadata for the catalogue list

    Parameters
    ----------
    host : `str`, optional
        the URL of the GWOSC host to query, defaults to
        https://gwosc.org

    Returns
    -------
    data : `dict` or `list`
        the JSON data retrieved from GWOSC and returned by
        :meth:`requests.Response.json`
    """
    url = _eventapi_url(host=host)
    return fetch_json(url)


def _catalog_url(catalog, host=DEFAULT_URL):
    """Build the EventAPI URL for a named catalogue"""
    base = _eventapi_url(host=host)
    return "{}{}/".format(base, catalog)


def fetch_catalog_json(catalog, host=DEFAULT_URL):
    """Returns the JSON metadata for the given catalogue

    Parameters
    ----------
    catalog : `str`
        the name of the event catalog, e.g. ``'GWTC-1-confident'``

    host : `str`, optional
        the URL of the GWOSC host to query, defaults to
        https://gwosc.org

    Returns
    -------
    data : `dict` or `list`
        the JSON data retrieved from GWOSC and returned by
        :meth:`requests.Response.json`
    """
    url = _catalog_url(catalog, host=host)
    return fetch_json(url)


# -- EventAPI events ---------------------------------------------------------

def _allevents_url(full=False, host=DEFAULT_URL):
    """Build the EventAPI ``allevents`` URL (``jsonfull`` when ``full``)"""
    base = _eventapi_url(full=full, host=host)
    return "{}allevents/".format(base)


def _has_jsonfull_allevents(host=DEFAULT_URL):
    """Return `True` if the ``jsonfull`` allevents blob is in ``JSON_CACHE``"""
    full_url = _allevents_url(full=True, host=host)
    return full_url in JSON_CACHE


def fetch_allevents_json(full=False, host=DEFAULT_URL):
    """Returns the JSON metadata for the allevents API

    Parameters
    ----------
    full : `bool` or `None`, optional
        if `True`, query the ``jsonfull`` endpoint; if `False` (default),
        query the ``json`` endpoint; if `None`, use the ``jsonfull`` data
        only when they are already in the cache, otherwise query the
        ``json`` endpoint

    host : `str`, optional
        the URL of the GWOSC host to query, defaults to https://gwosc.org

    Returns
    -------
    data : `dict` or `list`
        the JSON data retrieved from GWOSC and returned by
        :meth:`requests.Response.json`
    """
    if full is None:
        # opportunistic mode: reuse the 'jsonfull' blob if it was already
        # downloaded, but never trigger that download ourselves
        full = _has_jsonfull_allevents(host=host)
    return fetch_json(_allevents_url(full=full, host=host))


def _fetch_allevents_event_json(
        event,
        catalog=None,
        version=None,
        full=False,
        host=DEFAULT_URL,
):
    """Returns the JSON metadata from the allevents view for a specific event

    The raw JSON data are packaged to look the same as if they came from
    a full event API query, i.e. nested under ``'events'``.

    Parameters
    ----------
    event : `str`
        the dataset key, the full ``commonName``, or the part of the
        ``commonName`` before the first underscore

    catalog : `str`, optional
        only consider entries whose ``catalog.shortName`` equals this value

    version : `int`, optional
        only consider entries whose ``version`` equals this value

    full : `bool` or `None`, optional
        passed to :func:`fetch_allevents_json`

    host : `str`, optional
        the URL of the GWOSC host to query

    Returns
    -------
    data : `dict`
        ``{"events": {key: metadata}}`` for the matching entry with the
        highest ``version``

    Raises
    ------
    ValueError
        if no entry matches, or if entries with more than one distinct
        ``commonName`` match
    """
    allevents = fetch_allevents_json(full=full, host=host)["events"]

    def _match(keyvalue):
        dset, metadata = keyvalue
        common = metadata["commonName"]
        # accept the dataset key, the full name, or the GWYYMMDD prefix
        if event not in {dset, common, common.split("_", 1)[0]}:
            return False
        if version is not None and metadata["version"] != version:
            return False
        if catalog is not None and metadata["catalog.shortName"] != catalog:
            return False
        return True

    # match datasets
    matched = [item for item in allevents.items() if _match(item)]
    names = {meta["commonName"] for _, meta in matched}

    # one dataset has an exact name match, so discard everything else
    if event in names:
        matched = [item for item in matched if item[1]["commonName"] == event]
        names = {event}

    # we have a winner! (highest version; the last one in case of a tie)
    if len(names) == 1:
        key, meta = sorted(matched, key=lambda x: x[1]["version"])[-1]
        return {"events": {key: meta}}

    # raise error with the right message
    if len(names) > 1:
        raise ValueError(
            "multiple events matched for {!r}: '{}'".format(
                event,
                # sorted so that the message is reproducible
                "', '".join(sorted(names)),
            ),
        )
    msg = "failed to identify {} for event '{}'"
    if catalog is None:
        msg = msg.format("catalog", event)
        if version is not None:
            msg += " at version {}".format(version)
        raise ValueError(msg)
    # a catalog was given at this point, so always report it
    msg = msg.format("version", event)
    msg += " in catalog '{}'".format(catalog)
    raise ValueError(msg)


def _parse_two_ops(compiled_m, host=DEFAULT_URL):
    """Translate a ``val1 op1 param op2 val2`` match into query pairs

    Parameters
    ----------
    compiled_m : `re.Match`
        a match object with named groups ``val1``, ``op1``, ``param``,
        ``op2`` and ``val2``

    host : `str`, optional
        the URL of the GWOSC host used to look up the allowed parameters

    Returns
    -------
    queries : `list` of `tuple`
        two ``(key, value)`` pairs, with keys ``min-<param>`` and
        ``max-<param>`` assigned according to the operator directions

    Raises
    ------
    ValueError
        if either operator is not in ``_ALLOWED_OPS`` or the parameter is
        not among the allowed parameters
    """
    allowed_params = fetch_allowed_params_json(host=host)
    groups = compiled_m.groupdict()

    op1 = groups["op1"]
    op2 = groups["op2"]
    if not {op1, op2}.issubset(_ALLOWED_OPS):
        raise ValueError(
            "Could not parse select string.\n"
            "Unknown operators."
        )

    param = groups["param"]
    val1 = groups["val1"]
    val2 = groups["val2"]
    if param not in allowed_params:
        raise ValueError(
            "Could not parse select string.\n"
            f"Unrecognized parameter: {param}.\n"
            f"Use one of:\n{allowed_params}"
        )

    # Every allowed operator contains exactly one of '>' or '<', so each
    # value maps onto exactly one bound.  The value on the left is an
    # upper bound when the operator points away from it (``val1 >= param``).
    lower_key = f"min-{param}"
    upper_key = f"max-{param}"
    return [
        (upper_key if ">" in op1 else lower_key, val1),
        (lower_key if ">" in op2 else upper_key, val2),
    ]


def _parse_one_op(compiled_m, host=DEFAULT_URL):
    """Translate a ``param op val`` match into a query pair

    Parameters
    ----------
    compiled_m : `re.Match`
        a match object with named groups ``param``, ``op`` and ``val``

    host : `str`, optional
        the URL of the GWOSC host used to look up the allowed parameters

    Returns
    -------
    queries : `list` of `tuple`
        a single ``(key, value)`` pair, keyed ``min-<param>`` for a
        "greater than" operator and ``max-<param>`` for a "less than" one

    Raises
    ------
    ValueError
        if the operator is not in ``_ALLOWED_OPS`` or the parameter is
        not among the allowed parameters
    """
    allowed_params = fetch_allowed_params_json(host=host)
    groups = compiled_m.groupdict()

    op = groups["op"]
    if not {op}.issubset(_ALLOWED_OPS):
        raise ValueError(
            "Could not parse select string.\n"
            "Unknown operator."
        )

    param = groups["param"]
    val = groups["val"]
    if param not in allowed_params:
        raise ValueError(
            "Could not parse select string.\n"
            f"Unrecognized parameter: {param}.\n"
            f"Use one of:\n{allowed_params}"
        )

    # Every allowed operator contains exactly one of '>' or '<'.
    key = f"min-{param}" if ">" in op else f"max-{param}"
    return [(key, val)]


# Patterns for event-selection strings.  Inside the numeric character
# class the hyphen is escaped: an unescaped ``+-e`` is read as the range
# '+'..'e', which also matches '<', '=', '>', letters and punctuation and
# lets the one-operator pattern swallow inputs such as ``1<=dec<=5``.

# Captures strings of the form `1.44 <= param <= 5.0`
_SELECT_TWO_OPS = re.compile(
    r"^\s*(?P<val1>[\d.+\-eE]+)\s*(?P<op1>[<>=]{2})\s*(?P<param>[\w-]+)"
    r"\s*(?P<op2>[<>=]+)\s*(?P<val2>[\d.+\-eE]+)\s*$"
)
# Captures strings of the form `param <= 5.0`
_SELECT_ONE_OP = re.compile(
    r"^\s*(?P<param>[\w-]+)\s*(?P<op>[<>=]{2})\s*(?P<val>[\d.+\-eE]+)\s*$"
)


def _select_to_query(select, host=DEFAULT_URL):
    """Parse select strings and translate them into URL GET parameters

    Parameters
    ----------
    select : iterable of `str`
        range constraints, each of the form ``param <= 5.0`` or
        ``1.44 <= param <= 5.0``

    host : `str`, optional
        the URL of the GWOSC host used to look up the allowed parameters

    Returns
    -------
    query : `str`
        the URL-encoded query string (empty if ``select`` is empty)

    Raises
    ------
    ValueError
        if a select string matches neither form, or is rejected by the
        corresponding parser
    """
    parsers = (
        (_SELECT_ONE_OP, _parse_one_op),
        (_SELECT_TWO_OPS, _parse_two_ops),
    )

    queries = []
    for s in select:
        for regex, parse in parsers:
            m = regex.match(s)
            if m is not None:
                queries.extend(parse(m, host=host))
                break
        else:
            # neither pattern matched this string
            raise ValueError(f"Could not parse select string: {s}")
    return urlencode(queries)


def _query_events_url(select, host=DEFAULT_URL):
    """Build the EventAPI ``query/show`` URL for the given select strings"""
    query = _select_to_query(select, host=host)
    return "{}/eventapi/json/query/show?{}".format(host, query)


def fetch_filtered_events_json(select, host=DEFAULT_URL):
    """Return the JSON metadata for the events constrained by select

    Parameters
    ----------
    select : `list-like`
        a list of range constraints for the events.
        All ranges should have inclusive ends (<= and >= operators).

    host : `str`, optional
        the URL of the GWOSC host to query, defaults to
        https://gwosc.org

    Returns
    -------
    data : `dict` or `list`
        the JSON data retrieved from GWOSC and returned by
        :meth:`requests.Response.json`

    Raises
    ------
    ValueError
        if any of the ``select`` strings cannot be parsed

    Examples
    --------
    >>> fetch_filtered_events_json(  # doctest: +SKIP
    ...     select=[
    ...         "mass-1-source <= 5",
    ...         "mass-2-source =< 10",
    ...         "10 <= luminosity-distance <= 100",
    ...     ]
    ... )
    """
    url = _query_events_url(select, host=host)
    return fetch_json(url)


def _event_url(
        event,
        catalog=None,
        version=None,
        host=DEFAULT_URL,
):
    """Look up the ``jsonurl`` of an event from the allevents listing

    The listing is fetched with ``full=None``, i.e. the ``jsonfull`` data
    are used only if they have already been cached.
    """
    payload = _fetch_allevents_event_json(
        event,
        catalog=catalog,
        version=version,
        full=None,
        host=host,
    )
    entries = list(payload["events"].values())
    return entries[0]["jsonurl"]


def fetch_event_json(
        event,
        catalog=None,
        version=None,
        host=DEFAULT_URL,
):
    """Fetch the JSON metadata for the given event

    Unless ``version`` and/or ``catalog`` are given, all catalogs and all
    data-release versions are searched and the highest available version
    is returned.

    Parameters
    ----------
    event : `str`
        the name of the event to query

    catalog : `str`, optional
        name of catalogue that hosts this event

    version : `int`, `None`, optional
        restrict query to a given data-release version

    host : `str`, optional
        the URL of the GWOSC host to query, defaults to
        https://gwosc.org

    Returns
    -------
    data : `dict` or `list`
        the parsed JSON data retrieved from GWOSC via :func:`fetch_json`
    """
    url = _event_url(event, catalog=catalog, version=version, host=host)
    return fetch_json(url)