iterative/dvc

View on GitHub
dvc/repo/diff.py

Summary

Maintainability
A
3 hrs
Test Coverage
import errno
import os
from collections import defaultdict
from typing import Optional

from dvc.log import logger
from dvc.repo import locked
from dvc.ui import ui

logger = logger.getChild(__name__)


def _path(entry):
    if entry and entry.meta and entry.meta.isdir:
        return os.path.join(*entry.key, "")
    return os.path.join(*entry.key)


def _hash(entry):
    if entry and entry.hash_info:
        return entry.hash_info.value
    return None


def _diff(old, new, data_keys, with_missing=False):
    from dvc_data.index.diff import ADD, DELETE, MODIFY, RENAME
    from dvc_data.index.diff import diff as idiff

    ret: "dict[str, list[dict]]" = {
        "added": [],
        "deleted": [],
        "modified": [],
        "renamed": [],
        "not in cache": [],
    }

    def meta_cmp_key(meta):
        if not meta:
            return meta
        return meta.isdir

    for change in idiff(
        old,
        new,
        with_renames=True,
        meta_cmp_key=meta_cmp_key,
        roots=data_keys,
    ):
        if (change.old and change.old.isdir and not change.old.hash_info) or (
            change.new and change.new.isdir and not change.new.hash_info
        ):
            continue

        if change.typ == ADD:
            ret["added"].append({"path": _path(change.new), "hash": _hash(change.new)})
        elif change.typ == DELETE:
            ret["deleted"].append(
                {"path": _path(change.old), "hash": _hash(change.old)}
            )
        elif change.typ == MODIFY:
            ret["modified"].append(
                {
                    "path": _path(change.old),
                    "hash": {"old": _hash(change.old), "new": _hash(change.new)},
                }
            )
        elif change.typ == RENAME:
            ret["renamed"].append(
                {
                    "path": {"old": _path(change.old), "new": _path(change.new)},
                    "hash": _hash(change.old),
                }
            )

        if (
            with_missing
            and change.old
            and change.old.hash_info
            and not old.storage_map.cache_exists(change.old)
        ):
            ret["not in cache"].append(
                {"path": _path(change.old), "hash": _hash(change.old)}
            )

    return ret if any(ret.values()) else {}


@locked
def diff(
    self,
    a_rev: str = "HEAD",
    b_rev: Optional[str] = None,
    targets: Optional[list[str]] = None,
    recursive: bool = False,
):
    """
    By default, it compares the workspace with the last commit's fs.

    This implementation differs from `git diff` since DVC doesn't have
    the concept of `index`, but it keeps the same interface, thus,
    `dvc diff` would be the same as `dvc diff HEAD`.
    """
    if self.scm.no_commits:
        return {}

    indexes = {}
    missing_targets = defaultdict(set)
    with_missing = False
    if not b_rev:
        b_rev = "workspace"
        with_missing = True

    data_keys = set()
    for rev in self.brancher(revs=[a_rev, b_rev]):
        if rev == "workspace" and b_rev != "workspace":
            # brancher always returns workspace, but we only need to compute
            # workspace paths/checksums if b_rev was None
            continue

        def onerror(target, _exc):
            missing_targets[rev].add(target)  # noqa: B023

        view = self.index.targets_view(targets, onerror=onerror, recursive=recursive)

        data_keys.update(view.data_keys.get("repo", set()))

        if rev == "workspace":
            from .index import build_data_index

            with ui.status("Building workspace index"):
                data = build_data_index(view, self.root_dir, self.fs, compute_hash=True)
        else:
            data = view.data["repo"]

        assert rev not in indexes
        indexes[rev] = data

    if targets:
        old_missing = missing_targets.get(a_rev, set())
        new_missing = missing_targets.get(b_rev, set())

        # check for overlapping missing targets between a_rev and b_rev
        for target in old_missing & new_missing:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), target)

    if len(indexes.keys()) == 1:
        # both a_rev and b_rev point to the same sha, nothing to compare
        old = None
        new = None
    else:
        old = indexes[a_rev]
        new = indexes[b_rev]

    with ui.status("Calculating diff"):
        return _diff(old, new, data_keys, with_missing=with_missing)