datalad/datalad-container

View on GitHub
datalad_container/adapters/docker.py

Summary

Maintainability
A
1 hr
Test Coverage
"""Work with Docker images as local paths.

This module provides support for saving a Docker image in a local directory and
then loading it on-the-fly before calling `docker run ...`. The motivation for
this is that it allows the components of an image to be tracked as objects in a
DataLad dataset.

Run `python -m datalad_container.adapters.docker --help` for details about the
command-line interface.
"""

import hashlib
import json
import logging
import os
import os.path as op
import subprocess as sp
import sys
import tarfile
import tempfile

from datalad.utils import on_windows

lgr = logging.getLogger("datalad.containers.adapters.docker")

# Note: A dockerpy dependency probably isn't worth it in the current
# state but is worth thinking about if this module gets more
# complicated.

# FIXME: These functions assume that there is a "docker" on the path
# that can be managed by a non-root user.  At the least, this should
# be documented somewhere.


def save(image, path):
    """Save and extract a docker image to a directory.

    Parameters
    ----------
    image : str
        A unique identifier for a docker image.
    path : str
        A directory to extract the image to.
    """
    # Use a temporary file because docker save (or actually tar underneath)
    # complains that stdout needs to be redirected if we use Popen and PIPE.
    if ":" not in image:
        image = f"{image}:latest"
    with tempfile.NamedTemporaryFile() as stream:
        # Windows can't write to an already opened file
        stream.close()
        sp.check_call(["docker", "save", "-o", stream.name, image])
        with tarfile.open(stream.name, mode="r:") as tar:
            if not op.exists(path):
                lgr.debug("Creating new directory at %s", path)
                os.makedirs(path)
            elif os.listdir(path):
                raise OSError("Directory {} is not empty".format(path))
            def is_within_directory(directory, target):
                
                abs_directory = os.path.abspath(directory)
                abs_target = os.path.abspath(target)
            
                prefix = os.path.commonprefix([abs_directory, abs_target])
                
                return prefix == abs_directory
            
            def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
            
                for member in tar.getmembers():
                    member_path = os.path.join(path, member.name)
                    if not is_within_directory(path, member_path):
                        raise Exception("Attempted Path Traversal in Tar File")
            
                tar.extractall(path, members, numeric_owner=numeric_owner) 
                
            
            safe_extract(tar, path=path)
            lgr.info("Saved %s to %s", image, path)


def _list_images():
    out = sp.check_output(
        ["docker", "images", "--all", "--quiet", "--no-trunc"])
    return out.decode().splitlines()


def get_image(path, repo_tag=None, config=None):
    """Return the image ID of the image extracted at `path`.
    """
    manifest_path = op.join(path, "manifest.json")
    with open(manifest_path) as fp:
        manifest = json.load(fp)
    if repo_tag is not None:
        manifest = [img for img in manifest if repo_tag in (img.get("RepoTags") or [])]
    if config is not None:
        manifest = [img for img in manifest if img["Config"].startswith(config)]
    if len(manifest) == 0:
        raise ValueError(f"No matching images found in {manifest_path}")
    elif len(manifest) > 1:
        raise ValueError(
            f"Multiple images found in {manifest_path}; disambiguate with"
            " --repo-tag or --config"
        )

    with open(op.join(path, manifest[0]["Config"]), "rb") as stream:
        return hashlib.sha256(stream.read()).hexdigest()


def load(path, repo_tag, config):
    """Load the Docker image from `path`.

    Parameters
    ----------
    path : str
        A directory with an extracted tar archive.
    repo_tag : str or None
        `image:tag` of image to load
    config : str or None
        "Config" value or prefix of image to load

    Returns
    -------
    The image ID (str)
    """
    # FIXME: If we load a dataset, it may overwrite the current tag. Say that
    # (1) a dataset has a saved neurodebian:latest from a month ago, (2) a
    # newer neurodebian:latest has been pulled, and (3) the old image have been
    # deleted (e.g., with 'docker image prune --all'). Given all three of these
    # things, loading the image from the dataset will tag the old neurodebian
    # image as the latest.
    image_id = "sha256:" + get_image(path, repo_tag, config)
    if image_id not in _list_images():
        lgr.debug("Loading %s", image_id)
        cmd = ["docker", "load"]
        p = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
        with tarfile.open(fileobj=p.stdin, mode="w|", dereference=True) as tar:
            tar.add(path, arcname="")
        out, err = p.communicate()
        return_code = p.poll()
        if return_code:
            lgr.warning("Running %r failed: %s", cmd, err.decode())
            raise sp.CalledProcessError(return_code, cmd, output=out)
    else:
        lgr.debug("Image %s is already present", image_id)

    if image_id not in _list_images():
        raise RuntimeError(
            "docker image {} was not successfully loaded".format(image_id))
    return image_id


# Command-line


def cli_save(namespace):
    save(namespace.image, namespace.path)


def cli_run(namespace):
    image_id = load(namespace.path, namespace.repo_tag, namespace.config)
    prefix = ["docker", "run",
              # FIXME: The -v/-w settings are convenient for testing, but they
              # should be configurable.
              "-v", "{}:/tmp".format(os.getcwd()),
              "-w", "/tmp",
              "--rm",
              "--interactive"]
    if not on_windows:
        # Make it possible for the output files to be added to the
        # dataset without the user needing to manually adjust the
        # permissions.
        prefix.extend(["-u", "{}:{}".format(os.getuid(), os.getgid())])

    if sys.stdin.isatty():
        prefix.append("--tty")
    prefix.append(image_id)
    cmd = prefix + namespace.cmd
    lgr.debug("Running %r", cmd)
    sp.check_call(cmd)


def main(args):
    import argparse

    parser = argparse.ArgumentParser(
        prog="python -m datalad_container.adapters.docker",
        description="Work with Docker images as local paths")
    parser.add_argument(
        "-v", "--verbose",
        action="store_true")

    subparsers = parser.add_subparsers(title="subcommands")
    # Don't continue without a subcommand.
    subparsers.required = True
    subparsers.dest = "command"

    parser_save = subparsers.add_parser(
        "save",
        help="save and extract a Docker image to a directory")
    parser_save.add_argument(
        "image", metavar="NAME",
        help="image to save")
    parser_save.add_argument(
        "path", metavar="PATH",
        help="directory to save image in")
    parser_save.set_defaults(func=cli_save)
    # TODO: Add command for updating an archive directory.

    parser_run = subparsers.add_parser(
        "run",
        help="run a command with a directory's image")
    parser_run.add_argument(
        "--repo-tag", metavar="IMAGE:TAG", help="Tag of image to load"
    )
    parser_run.add_argument(
        "--config",
        metavar="IDPREFIX",
        help="Config value or prefix of image to load"
    )
    parser_run.add_argument(
        "path", metavar="PATH",
        help="run the image in this directory")
    parser_run.add_argument(
        "cmd", metavar="CMD", nargs=argparse.REMAINDER,
        help="command to execute")
    parser_run.set_defaults(func=cli_run)

    namespace = parser.parse_args(args[1:])

    logging.basicConfig(
        level=logging.DEBUG if namespace.verbose else logging.INFO,
        format="%(message)s")

    namespace.func(namespace)


if __name__ == "__main__":
    try:
        main(sys.argv)
    except Exception as exc:
        lgr.exception("Failed to execute %s", sys.argv)
        if isinstance(exc, sp.CalledProcessError):
            excode = exc.returncode
        else:
            excode = 1
        sys.exit(excode)