datalad/datalad-container

View on GitHub
datalad_container/containers_add.py

Summary

Maintainability
C
1 day
Test Coverage
"""Add a container environment to a dataset"""

__docformat__ = 'restructuredtext'

import json
import logging
import os
import os.path as op
import re
from pathlib import (
    Path,
    PurePosixPath,
)
from shutil import copyfile

from datalad.cmd import WitlessRunner
from datalad.distribution.dataset import (
    EnsureDataset,
    datasetmethod,
    require_dataset,
)
from datalad.interface.base import (
    Interface,
    build_doc,
    eval_results,
)
from datalad.interface.results import get_status_dict
from datalad.support.constraints import (
    EnsureNone,
    EnsureStr,
)
from datalad.support.exceptions import InsufficientArgumentsError
from datalad.support.param import Parameter

from .utils import get_container_configuration

lgr = logging.getLogger("datalad.containers.containers_add")

# DataLad's special remote can deal with Singularity Hub (shub://) URLs on its
# own when the shub downloader module is importable. Probe for it once at
# import time and remember the outcome; without it, shub:// URLs are resolved
# by custom code in this module.
try:
    import datalad.downloaders.shub
except ImportError:
    _HAS_SHUB_DOWNLOADER = False
    lgr.debug("DataLad's shub downloader not found. "
              "Custom handling for shub:// will be used")
else:
    _HAS_SHUB_DOWNLOADER = True


def _resolve_img_url(url):
    """Resolve a URL to an actual download URL that `annex addurl` can handle

    Only ``shub://`` URLs are rewritten, and only when DataLad's own shub
    downloader is not available (``_HAS_SHUB_DOWNLOADER`` is false). Any
    other URL is returned unchanged.

    Parameters
    ----------
    url : str
      URL as given by the caller.

    Returns
    -------
    str
      The value of the ``image`` field reported by the Singularity Hub API
      for a ``shub://`` URL that needed resolving, otherwise `url` itself.

    Raises
    ------
    requests.exceptions.RequestException
      If the Singularity Hub query times out, fails, or answers with an
      HTTP error status.
    KeyError
      If the API response carries no ``image`` field.
    """
    shub_prefix = 'shub://'
    if _HAS_SHUB_DOWNLOADER or not url.startswith(shub_prefix):
        return url

    # TODO: Remove this handling once the minimum DataLad version is at
    # least 0.14.
    lgr.debug('Query singularity-hub for image download URL')
    import requests
    req = requests.get(
        'https://www.singularity-hub.org/api/container/{}'.format(
            url[len(shub_prefix):]),
        # without a timeout an unresponsive server would block forever
        timeout=30)
    # report HTTP-level failures as such, instead of tripping over an
    # error page while decoding JSON or looking up the 'image' key
    req.raise_for_status()
    shub_info = json.loads(req.text)
    return shub_info['image']


def _guess_call_fmt(ds, name, url):
    """Guess a container call format from the image source location

    Parameters
    ----------
    ds
      Dataset the container is added to. Not consulted at present.
    name
      Name of the container. Not consulted at present.
    url : str or None
      Source URL of the container image; only its scheme is inspected.

    Returns
    -------
    str or None
      A Singularity call format for 'shub://' and 'docker://' URLs, a call
      format using the Docker adapter for 'dhub://' URLs, and `None` if no
      URL was given or no guess can be made.
    """
    if url is None:
        return None
    if url.startswith(('shub://', 'docker://')):
        return 'singularity exec {img} {cmd}'
    if url.startswith('dhub://'):
        # {python} is replaced with sys.executable on *execute*
        return '{python} -m datalad_container.adapters.docker run {img} {cmd}'
    return None


def _ensure_datalad_remote(repo):
    """Make sure a datalad special remote is initialized and enabled

    The special remotes known to `repo` are searched for one whose
    'externaltype' is "datalad". If there is none, a new one is initialized
    with autoenable turned on. If there is one, it is enabled unless
    `repo.is_special_annex_remote()` already reports it.

    Parameters
    ----------
    repo
      Repository object providing `get_special_remotes()`,
      `is_special_annex_remote()` and `enable_remote()`.
    """
    remote_name = next(
        (props["name"]
         for props in repo.get_special_remotes().values()
         if props.get("externaltype") == "datalad"),
        None)

    if remote_name:
        if repo.is_special_annex_remote(remote_name, check_if_known=False):
            lgr.debug("datalad special remote '%s' is already enabled",
                      remote_name)
            return
        lgr.debug("datalad special remote '%s' found. Enabling",
                  remote_name)
        repo.enable_remote(remote_name)
        return

    # nothing configured yet -> set up a fresh one
    from datalad.consts import DATALAD_SPECIAL_REMOTE
    from datalad.customremotes.base import init_datalad_remote

    init_datalad_remote(repo, DATALAD_SPECIAL_REMOTE, autoenable=True)


@build_doc
# all commands must be derived from Interface
class ContainersAdd(Interface):
    # first docstring line is used as a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """Add a container to a dataset
    """

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to add the container to. If no dataset is
            given, an attempt is made to identify the dataset based on the
            current working directory""",
            constraints=EnsureDataset() | EnsureNone()
        ),
        name=Parameter(
            args=("name",),
            doc="""The name to register the container under. This also
                determines the default location of the container image
                within the dataset.""",
            metavar="NAME",
            constraints=EnsureStr(),
        ),
        url=Parameter(
            args=("-u", "--url"),
            doc="""A URL (or local path) to get the container image from. If
            the URL scheme is one recognized by Singularity (e.g.,
            'shub://neurodebian/dcm2niix:latest' or
            'docker://debian:stable-slim'), a command format string for
            Singularity-based execution will be auto-configured when
            [CMD: --call-fmt CMD][PY: call_fmt PY] is not specified.
            For Docker-based container execution with the URL scheme 'dhub://',
            the rest of the URL will be interpreted as the argument to
            'docker pull', the image will be saved to a location
            specified by `name`, and the call format will be auto-configured
            to run docker, unless overwritten. The auto-configured call to docker 
            run mounts the CWD to '/tmp' and sets the working directory to '/tmp'.""",
            metavar="URL",
            constraints=EnsureStr() | EnsureNone(),
        ),

        # TODO: The "prepared command" stuff should ultimately go somewhere
        # else (probably datalad-run). But first figure out, how exactly to
        # address container datasets
        call_fmt=Parameter(
            args=("--call-fmt",),
            doc="""Command format string indicating how to execute a command in
            this container, e.g. "singularity exec {img} {cmd}". Where '{img}'
            is a placeholder for the path to the container image and '{cmd}' is
            replaced with the desired command. Additional placeholders:
            '{img_dspath}' is relative path to the dataset containing the image,
            '{img_dirpath}' is the directory containing the '{img}'.
            '{python}' expands to the path of the Python executable that is
            running the respective DataLad session, for example a
            'datalad containers-run' command.
            """,
            metavar="FORMAT",
            constraints=EnsureStr() | EnsureNone(),
        ),
        extra_input=Parameter(
            args=("--extra-input",),
            doc="""Additional file the container invocation depends on (e.g.
            overlays used in --call-fmt). Can be specified multiple times.
            Similar to --call-fmt, the placeholders {img_dspath} and
            {img_dirpath} are available. Will be stored in the dataset config and
            later added alongside the container image to the `extra_inputs`
            field in the run-record and thus automatically be fetched when
            needed.
            """,
            action="append",
            default=[],
            metavar="FILE",
            # Can't use EnsureListOf(str) yet as it handles strings as iterables...
            # See this PR: https://github.com/datalad/datalad/pull/7267
            # constraints=EnsureListOf(str) | EnsureNone(),
        ),
        image=Parameter(
            args=("-i", "--image"),
            doc="""Relative path of the container image within the dataset. If not
                given, a default location will be determined using the
                `name` argument.""",
            metavar="IMAGE",
            constraints=EnsureStr() | EnsureNone(),

        ),
        update=Parameter(
            args=("--update",),
            action="store_true",
            doc="""Update the existing container for `name`. If no other
            options are specified, URL will be set to 'updateurl', if
            configured. If a container with `name` does not already exist, this
            option is ignored."""
        )
    )

    @staticmethod
    @datasetmethod(name='containers_add')
    @eval_results
    def __call__(name, url=None, dataset=None, call_fmt=None, image=None,
                 update=False, extra_input=None):
        # Overall flow:
        #   1. validate `name` and look up any existing configuration for it
        #   2. determine the image location and the call format
        #   3. validate --extra-input values
        #   4. obtain the image, if a URL was given
        #   5. write the container configuration and save
        # This is a generator yielding result records. Deliberately no
        # docstring here: documentation is generated from `_params_` and the
        # class docstring.
        if not name:
            raise InsufficientArgumentsError("`name` argument is required")

        ds = require_dataset(dataset, check_installed=True,
                             purpose='add container')
        runner = WitlessRunner()

        # prevent madness in the config file
        # fullmatch() rather than match() with '$': the latter would also
        # accept a name with a trailing newline
        if not re.fullmatch(r'[0-9a-zA-Z-]+', name):
            raise ValueError(
                "Container names can only contain alphanumeric characters "
                "and '-', got: '{}'".format(name))

        container_cfg = get_container_configuration(ds, name)
        if 'image' in container_cfg:
            # a container of that name is configured already
            if not update:
                yield get_status_dict(
                    action="containers_add", ds=ds, logger=lgr,
                    status="impossible",
                    message=("Container named %r already exists. "
                             "Use --update to reconfigure.",
                             name))
                return

            if not (url or image or call_fmt):
                # No updated values were provided. See if an update url is
                # configured (currently relevant only for Singularity Hub).
                url = container_cfg.get("updateurl")
                if not url:
                    yield get_status_dict(
                        action="containers_add", ds=ds, logger=lgr,
                        status="impossible",
                        message="No values to update specified")
                    return

            # anything not given explicitly is carried over from the
            # existing configuration
            call_fmt = call_fmt or container_cfg.get("cmdexec")
            image = image or container_cfg.get("image")

        if not image:
            loc_cfg_var = "datalad.containers.location"
            container_loc = \
                ds.config.obtain(
                    loc_cfg_var,
                    # if not False it would actually modify the
                    # dataset config file -- undesirable
                    store=False,
                )
            image = op.join(ds.path, container_loc, name, 'image')
        else:
            image = op.join(ds.path, image)

        result = get_status_dict(
            action="containers_add",
            path=image,
            type="file",
            logger=lgr,
        )

        if call_fmt is None:
            # maybe built in knowledge can help
            call_fmt = _guess_call_fmt(ds, name, url)

        # --extra-input sanity check. Done before anything is obtained,
        # removed, or written to the dataset config, so that a malformed
        # value cannot leave unsaved modifications behind.
        # TODO: might also want to do that for --call-fmt?
        extra_input_placeholders = dict(img_dirpath="", img_dspath="")
        for xi in (extra_input or []):
            try:
                xi.format(**extra_input_placeholders)
            except KeyError as exc:
                yield get_status_dict(
                    action="containers_add", ds=ds, logger=lgr,
                    status="error",
                    # %r quotes the value already, no additional repr()
                    message=("--extra-input %r contains unknown placeholder %s. "
                             "Available placeholders: %s",
                             xi, exc, ', '.join(extra_input_placeholders)))
                return

        # collect bits for a final and single save() call
        to_save = []
        imgurl = url
        was_updated = False
        if url:
            if update and op.lexists(image):
                was_updated = True
                # XXX: reckless='availability' is used to avoid the
                # availability check when removing the image. It should use
                # drop=False if remove() gets such an option (see DataLad's
                # gh-2673).
                for r in ds.remove(image, reckless='availability',
                                   return_type="generator"):
                    yield r

            imgurl = _resolve_img_url(url)
            lgr.debug('Attempt to obtain container image from: %s', imgurl)
            if url.startswith("dhub://"):
                from .adapters import docker

                docker_image = url[len("dhub://"):]

                lgr.debug(
                    "Running 'docker pull %s' and saving image to %s",
                    docker_image, image)
                runner.run(["docker", "pull", docker_image])
                docker.save(docker_image, image)
            elif url.startswith("docker://"):
                image_dir, image_basename = op.split(image)
                if not image_basename:
                    raise ValueError("No basename in path {}".format(image))
                if image_dir and not op.exists(image_dir):
                    os.makedirs(image_dir)

                lgr.info("Building Singularity image for %s "
                         "(this may take some time)",
                         url)
                # build inside the target directory, addressing the image by
                # its basename
                runner.run(["singularity", "build", image_basename, url],
                           cwd=image_dir or None)
            elif op.exists(url):
                lgr.info("Copying local file %s to %s", url, image)
                image_dir = op.dirname(image)
                if image_dir and not op.exists(image_dir):
                    os.makedirs(image_dir)
                copyfile(url, image)
            else:
                if _HAS_SHUB_DOWNLOADER and url.startswith('shub://'):
                    _ensure_datalad_remote(ds.repo)

                try:
                    ds.repo.add_url_to_file(image, imgurl)
                except Exception as e:
                    # report on a copy, so the error message does not stick
                    # to `result`, which is reused for later reporting
                    yield dict(result, status="error", message=str(e))
            # TODO do we have to take care of making the image executable
            # if --call_fmt is not provided?
            to_save.append(image)
        # no point in configuring a container without an image
        if not op.lexists(image):
            result["status"] = "error"
            result["message"] = ('no image at %s', image)
            yield result
            return

        # store configs
        cfgbasevar = "datalad.containers.{}".format(name)
        if imgurl != url:
            # store originally given URL, as it resolves to something
            # different and maybe can be used to update the container
            # at a later point in time
            ds.config.set("{}.updateurl".format(cfgbasevar), url)
        # force store the image, and prevent multiple entries
        ds.config.set(
            "{}.image".format(cfgbasevar),
            # always store a POSIX path, relative to dataset root
            str(PurePosixPath(Path(image).relative_to(ds.pathobj))),
            force=True)
        if call_fmt:
            ds.config.set(
                "{}.cmdexec".format(cfgbasevar),
                call_fmt,
                force=True)

        # actually setting --extra-input config (values were validated
        # above); any previous setting is replaced, not extended
        cfgextravar = "{}.extra-input".format(cfgbasevar)
        if ds.config.get(cfgextravar) is not None:
            ds.config.unset(cfgextravar)
        for xi in (extra_input or []):
            ds.config.add(cfgextravar, xi)

        # store changes
        to_save.append(op.join(".datalad", "config"))
        for r in ds.save(
                path=to_save,
                message="[DATALAD] {do} containerized environment '{name}'".format(
                    do="Update" if was_updated else "Configure",
                    name=name)):
            yield r
        result["status"] = "ok"
        yield result