datalad/datalad-container

View on GitHub
datalad_container/containers_run.py

Summary

Maintainability
B
5 hrs
Test Coverage
"""Drop-in replacement for `datalad run` for command execution in a container"""

__docformat__ = 'restructuredtext'

import logging
import os.path as op
import sys

from datalad.core.local.run import (
    Run,
    get_command_pwds,
    normalize_command,
    run_command,
)
from datalad.distribution.dataset import (
    datasetmethod,
    require_dataset,
)
from datalad.interface.base import (
    Interface,
    build_doc,
    eval_results,
)
from datalad.interface.results import get_status_dict
from datalad.support.param import Parameter
from datalad.utils import ensure_iter

from datalad_container.find_container import find_container_

# Module-level logger for the containers-run command.
lgr = logging.getLogger("datalad.containers.containers_run")

# Name of the environment variable that is set while the containerized
# command executes, so that underlying shim scripts can learn the original
# name of the container.
CONTAINER_NAME_ENVVAR = 'DATALAD_CONTAINER_NAME'

# Parameter table of this command: everything `run` accepts, plus the
# container selector. Start from a copy so `Run._params_` is never mutated.
_run_params = dict(Run._params_)
_run_params['container_name'] = Parameter(
    args=('-n', '--container-name',),
    metavar="NAME",
    # Spelled as explicit pieces so the embedded newline and the
    # surrounding whitespace of the help text stay exactly as they were.
    doc=(
        "Specify the name of or a path to a known container to use \n"
        "        for execution, in case multiple containers are configured."
    ),
)


@build_doc
# all commands must be derived from Interface
class ContainersRun(Interface):
    # first docstring line is used as a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """Drop-in replacement of 'run' to perform containerized command execution

    Container(s) need to be configured beforehand (see containers-add). If no
    container is specified and only one container is configured in the current
    dataset, it will be selected automatically. If more than one container is
    registered in the current dataset or to access containers from subdatasets,
    the container has to be specified.

    A command is generated based on the input arguments such that the
    container image itself will be recorded as an input dependency of
    the command execution in the `run` record in the git history.

    During execution the environment variable {name_envvar} is set to the
    name of the used container.
    """

    # Values substituted into the `{...}` placeholders of the docstring above.
    _docs_ = dict(
        name_envvar=CONTAINER_NAME_ENVVAR
    )

    # Same parameters as `run`, plus `container_name`.
    _params_ = _run_params

    # Analogous to 'run' command - stop on first error
    on_failure = 'stop'

    @staticmethod
    @datasetmethod(name='containers_run')
    @eval_results
    def __call__(cmd, container_name=None, dataset=None,
                 inputs=None, outputs=None, message=None, expand=None,
                 explicit=False, sidecar=None):
        # NOTE(review): documented with comments rather than a docstring,
        # presumably @build_doc generates the docstring -- confirm.
        #
        # Run `cmd` inside a configured container and record the execution.
        #
        # Parameters
        #   cmd            -- command to execute; normalized via
        #                     normalize_command() before being embedded into
        #                     the container call.
        #   container_name -- name of (or path to) the container to use;
        #                     handed to find_container_().
        #   dataset        -- dataset to operate on; resolved with
        #                     require_dataset().
        #   inputs, outputs, message, expand, explicit, sidecar
        #                  -- forwarded unchanged to run_command().
        #
        # Yields
        #   - any result of find_container_() that is not the container
        #     record itself,
        #   - an 'error' status record (and then stops) if the `cmdexec` or
        #     an `extra-input` template uses an unknown placeholder,
        #   - otherwise every result produced by run_command().
        #
        # Raises
        #   ValueError -- if `cmdexec` is stored in the old JSON-list format.
        from unittest.mock import \
            patch  # delayed, since takes long (~600ms for yoh)
        pwd, _ = get_command_pwds(dataset)
        ds = require_dataset(dataset, check_installed=True,
                             purpose='run a containerized command execution')

        # this following block locates the target container. this involves a
        # configuration look-up. This is not using
        # get_container_configuration(), because it needs to account for a
        # wide range of scenarios, including the installation of the dataset(s)
        # that will eventually provide (the configuration) for the container.
        # However, internally this is calling `containers_list()`, which is
        # using get_container_configuration(), so any normalization of
        # configuration on-read can still be implemented in this helper.
        container = None
        for res in find_container_(ds, container_name):
            if res.get("action") == "containers":
                container = res
            else:
                # pass through everything that is not the container record
                yield res
        assert container, "bug: container should always be defined here"

        image_path = op.relpath(container["path"], pwd)
        # container record would contain path to the (sub)dataset containing
        # it.  If not - take current dataset, as it must be coming from it
        image_dspath = op.relpath(container.get('parentds', ds.path), pwd)
        # directory holding the image, relative to pwd; "." when the image
        # sits directly in pwd (dirname would be an empty string then)
        image_dirpath = op.dirname(image_path) or "."

        # sure we could check whether the container image is present,
        # but it might live in a subdataset that isn't even installed yet
        # let's leave all this business to `get` that is called by `run`

        cmd = normalize_command(cmd)
        # expand the command with container execution
        if 'cmdexec' in container:
            callspec = container['cmdexec']

            # Temporary kludge to give a more helpful message
            if callspec.startswith("["):
                import json
                try:
                    json.loads(callspec)
                except json.JSONDecodeError:
                    pass  # Never mind, false positive.
                else:
                    raise ValueError(
                        'cmdexec {!r} is in an old, unsupported format. '
                        'Convert it to a plain string.'.format(callspec))

            cmd_kwargs = dict(
                # point to the python installation that runs *this* code
                # we know that it would have things like the docker
                # adaptor installed with this extension package
                python=sys.executable,
                img=image_path,
                cmd=cmd,
                img_dspath=image_dspath,
                img_dirpath=image_dirpath,
            )
            try:
                cmd = callspec.format(**cmd_kwargs)
            except (KeyError, IndexError) as exc:
                # KeyError: unknown named placeholder.
                # IndexError: positional placeholder ("{}" / "{0}"); no
                # positional values are ever supplied, so it is equally
                # unrecognized and reported the same way instead of crashing.
                yield get_status_dict(
                    'run',
                    ds=ds,
                    status='error',
                    message=(
                        'Unrecognized cmdexec placeholder: %s. '
                        'See containers-add for information on known ones: %s',
                        exc,
                        ", ".join(cmd_kwargs)))
                return
        else:
            # just prepend and pray
            cmd = container['path'] + ' ' + cmd

        # placeholders available to `extra-input` templates; identical for
        # every template, hence built once outside the loop
        xi_kwargs = dict(
            img_dspath=image_dspath,
            img_dirpath=image_dirpath,
        )
        extra_inputs = []
        for extra_input in ensure_iter(container.get("extra-input", []), set):
            try:
                extra_inputs.append(extra_input.format(**xi_kwargs))
            except (KeyError, IndexError) as exc:
                # same reasoning as for the cmdexec template above
                yield get_status_dict(
                    'run',
                    ds=ds,
                    status='error',
                    message=(
                        'Unrecognized extra_input placeholder: %s. '
                        'See containers-add for information on known ones: %s',
                        exc,
                        ", ".join(xi_kwargs)))
                return

        lgr.debug("extra_inputs = %r", extra_inputs)

        # expose the container name to the executed command (and any shim
        # script) for the duration of the run only
        with patch.dict('os.environ',
                        {CONTAINER_NAME_ENVVAR: container['name']}):
            # fire!
            yield from run_command(
                cmd=cmd,
                # without an explicit dataset, only pass the resolved one
                # when it is located at the current working directory
                dataset=dataset or (ds if ds.path == pwd else None),
                inputs=inputs,
                # the image itself is always recorded as an input
                extra_inputs=[image_path] + extra_inputs,
                outputs=outputs,
                message=message,
                expand=expand,
                explicit=explicit,
                sidecar=sidecar)