iterative/dvc

View on GitHub
dvc/commands/diff.py

Summary

Maintainability
B
6 hrs
Test Coverage
import os

from dvc.cli import completion, formatter
from dvc.cli.command import CmdBase
from dvc.cli.utils import append_doc_link
from dvc.log import logger
from dvc.ui import ui

logger = logger.getChild(__name__)


def _digest(checksum):
    if isinstance(checksum, str):
        return checksum[0:8]
    return "{}..{}".format(checksum["old"][0:8], checksum["new"][0:8])


def _show_markdown(diff, show_hash=False, hide_missing=False):
    headers = ["Status", "Hash", "Path"] if show_hash else ["Status", "Path"]
    rows = []
    statuses = ["added", "deleted", "renamed", "modified"]
    if not hide_missing:
        statuses.append("not in cache")

    for status in statuses:
        entries = diff.get(status, [])
        if not entries:
            continue
        for entry in entries:
            path = entry["path"]
            if isinstance(path, dict):
                path = f"{path['old']} -> {path['new']}"
            if show_hash:
                check_sum = _digest(entry.get("hash", ""))
                rows.append([status, check_sum, path])
            else:
                rows.append([status, path])

    ui.table(rows, headers=headers, markdown=True)


class CmdDiff(CmdBase):
    @staticmethod
    def _show_diff(diff, hide_missing=False):
        """
        Given a diff structure, generate a string of paths separated
        by new lines and grouped together by their state.

        A group's header is colored to enhance readability, for example:

            Added:
                another_file.txt
                backup.tar
                dir/
                dir/1

        An example of a diff formatted when entries contain hash:

            Added:
                d3b07384 foo

            Modified:
                c157a790..f98bf6f1 bar

        If a group has no entries, it won't be included in the result.

        At the bottom, include a summary with the number of files per state.
        """

        colors = {
            "added": "green",
            "modified": "yellow",
            "deleted": "red",
            "renamed": "green",
            "not in cache": "yellow",
        }

        summary = {}

        states = ["added", "deleted", "renamed", "modified"]
        if not hide_missing:
            states.append("not in cache")
        for state in states:
            summary[state] = 0
            entries = diff[state]

            if not entries:
                continue

            header = state.capitalize()
            ui.write(f"[{colors[state]}]{header}[/]:", styled=True)

            for entry in entries:
                path = entry["path"]
                if isinstance(path, dict):
                    path = f"{path['old']} -> {path['new']}"
                checksum = entry.get("hash")
                summary[state] += 1 if not path.endswith(os.sep) else 0
                ui.write(
                    "{space}{checksum}{separator}{path}".format(
                        space="    ",
                        checksum=_digest(checksum) if checksum else "",
                        separator="  " if checksum else "",
                        path=path,
                    )
                )

            ui.write()

        if not sum(summary.values()):
            return

        states_summary = ", ".join(
            f"{summary[state]} {state}" for state in states if summary[state] > 0
        )
        ui.write("files summary:", states_summary)

    def run(self):
        from dvc.exceptions import DvcException

        try:
            diff = self.repo.diff(self.args.a_rev, self.args.b_rev, self.args.targets)
            show_hash = self.args.show_hash
            hide_missing = self.args.b_rev or self.args.hide_missing
            if hide_missing:
                diff.pop("not in cache", None)

            for key, entries in diff.items():
                entries = sorted(
                    entries,
                    key=lambda entry: (
                        entry["path"]["old"]
                        if isinstance(entry["path"], dict)
                        else entry["path"]
                    ),
                )
                if not show_hash:
                    for entry in entries:
                        del entry["hash"]
                diff[key] = entries

            if self.args.json:
                ui.write_json(diff)
            elif self.args.markdown:
                _show_markdown(diff, show_hash, hide_missing)
            elif diff:
                self._show_diff(diff, hide_missing)

        except DvcException:
            logger.exception("failed to get diff")
            return 1
        return 0


def add_parser(subparsers, parent_parser):
    DIFF_DESCRIPTION = (
        "Show added, modified, or deleted data between commits in the DVC"
        " repository, or between a commit and the workspace."
    )
    diff_parser = subparsers.add_parser(
        "diff",
        parents=[parent_parser],
        description=append_doc_link(DIFF_DESCRIPTION, "diff"),
        help=DIFF_DESCRIPTION,
        formatter_class=formatter.RawDescriptionHelpFormatter,
    )
    diff_parser.add_argument(
        "--targets",
        nargs="*",
        help="Specific DVC-tracked files to compare. Accepts one or more file paths.",
        metavar="<paths>",
    ).complete = completion.FILE
    diff_parser.add_argument(
        "a_rev",
        help="Old Git commit to compare (defaults to HEAD)",
        nargs="?",
        default="HEAD",
    )
    diff_parser.add_argument(
        "b_rev",
        help="New Git commit to compare (defaults to the current workspace)",
        nargs="?",
    )
    diff_parser.add_argument(
        "--json",
        help="Format the output into a JSON",
        action="store_true",
        default=False,
    )
    diff_parser.add_argument(
        "--show-hash",
        help="Display hash value for each entry",
        action="store_true",
        default=False,
    )
    diff_parser.add_argument(
        "--md",
        help="Show tabulated output in the Markdown format (GFM).",
        action="store_true",
        dest="markdown",
        default=False,
    )
    diff_parser.add_argument(
        "--hide-missing",
        help="Hide missing cache file status.",
        action="store_true",
    )
    diff_parser.set_defaults(func=CmdDiff)