KarrLab/wc_utils

View on GitHub
wc_utils/util/git.py

Summary

Maintainability
B
5 hrs
Test Coverage
A
99%
""" Git utilities for obtaining repo metadata

:Author: Arthur Goldberg <Arthur.Goldberg@mssm.edu>
:Author: Jonathan Karr <jonrkarr@gmail.com>
:Date: 2017-05-24
:Copyright: 2017-2019, Karr Lab
:License: MIT
"""

from dataclasses import dataclass
from enum import Enum, auto
from github.GithubException import UnknownObjectException
from pathlib import Path
from wc_utils.config import core
from wc_utils.util.misc import obj_to_str
import git
import github
import itertools
import os


def get_repo(path='.', search_parent_directories=True):
    """ Get a Git repository given the path to a file it contains

    Args:
        path (:obj:`str`): path to file or directory in a Git repository; if `path` doesn't exist
            or is a file then its directory is used
        search_parent_directories (:obj:`bool`, optional): if :obj:`True` have :obj:`git.Repo` search
            for the root of the repository among the parent directories of :obj:`path`; otherwise,
            this method iterates over the parent directories itself

    Returns:
        :obj:`git.Repo`: a `GitPython` repository

    Raises:
        :obj:`ValueError`: if obj:`path` is not a path to a Git repository
    """
    repo = None
    resolved_path = Path(path).expanduser().resolve()
    if not resolved_path.exists() or resolved_path.is_file():
        resolved_path = resolved_path.parent
    dirnames = itertools.chain([str(resolved_path)], resolved_path.parents)
    if search_parent_directories:
        dirnames = [str(resolved_path)]
    for parent_dirname in dirnames:
        try:
            repo = git.Repo(str(parent_dirname), search_parent_directories=search_parent_directories)
            break
        except (git.exc.InvalidGitRepositoryError, git.exc.NoSuchPathError) as e:
            pass

    if not repo:
        raise ValueError('"{}" is not in a Git repository'.format(path))
    return repo


class RepoMetadataCollectionType(Enum):
    """ Type of Git repo being queried for metadata that's stored in a data file """
    DATA_REPO = auto()
    SCHEMA_REPO = auto()


# todo: automatically determine branch of repo & use it instead of 'master'
def repo_suitability(repo, repo_type, data_file=None):
    """ Evaluate whether a repo is a suitable source for git metadata

    Determine whether `repo` is in a state that's suitable for collecting immutable metadata.
    It cannot be ahead of the remote, because commits must have been pushed to
    the server so they can be later retrieved.
    If the `repo_type` is `RepoMetadataCollectionType.SCHEMA_REPO`, then there cannot be any differences
    between the index and the working tree because the schema should be synched with the origin.
    If the`repo_type` is `RepoMetadataCollectionType.DATA_REPO` then the repo can contain changes,
    but the data file should not depend on them. The caller is responsible for determining this.

    Args:
        repo (:obj:`git.Repo`): a `GitPython` repository
        repo_type (:obj:`RepoMetadataCollectionType`): repo type having status determined
        data_file (:obj:`str`, optional): pathname of a data file in the repo; must be provided if
            `repo_type` is `RepoMetadataCollectionType.DATA_REPO`

    Returns:
        :obj:`list` of :obj:`str`: list of reasons, if any, that the repo is in a state that's not
            suitable for collecting metadata; an empty list indicates that the repo can be used to
            collect metadata

    Raises:
        :obj:`ValueError`: if obj:`data_file` is not a path in a Git repository, or
            if `repo_type` is `RepoMetadataCollectionType.DATA_REPO` and `data_file` is not provided, or
            if `repo_type` is not a `RepoMetadataCollectionType`
    """
    unsuitable_changes = []
    commits_ahead = list(repo.iter_commits('origin..HEAD'))
    if commits_ahead:
        unsuitable_changes.append('commits ahead of origin')

    # diff between the index and the commit tree HEAD points to
    diff_index = repo.index.diff(repo.head.commit)
    if repo_type is RepoMetadataCollectionType.DATA_REPO:

        if not data_file:
            raise ValueError("data_file must be provided if repo_type is "
                "RepoMetadataCollectionType.DATA_REPO")

        # ensure that data_file exists in repo
        resolved_data_file = Path(data_file).expanduser().resolve()
        repo_root = Path(repo.git_dir).parent
        try:
            resolved_data_file.relative_to(str(repo_root))
        except ValueError:
            raise ValueError("data_file '{}' must be in the repo that's in '{}'".format(
                data_file, str(repo_root)))

        # ideally, the git repo storing a data file should only have changes in the data file so that
        # it depends on the prior commits; but this may be difficult to satisfy, so other differences
        # should be reported as a warning
        for change_type in diff_index.change_type:
            for diff in diff_index.iter_change_type(change_type):
                resolved_a_rawpath = repo_root.joinpath(diff.a_rawpath.decode())
                resolved_b_rawpath = repo_root.joinpath(diff.b_rawpath.decode())
                if (resolved_a_rawpath != resolved_data_file or
                    resolved_b_rawpath != resolved_data_file):
                    unsuitable_changes.append('modified path(s) are not data_file path')

        for untracked_file in repo.untracked_files:
            if repo_root.joinpath(untracked_file) != resolved_data_file:
                unsuitable_changes.append("untracked file '{}' is not data file: '{}'".format(
                    repo_root.joinpath(untracked_file), resolved_data_file))

    elif repo_type is RepoMetadataCollectionType.SCHEMA_REPO:

        # a schema repo that has any differences between the index and the working tree
        # isn't suitable for collecting metadata
        for change_type in diff_index.change_type:
            if list(diff_index.iter_change_type(change_type)):
                unsuitable_changes.append('changes present')
        if repo.untracked_files:
            unsuitable_changes.append('untracked files present')

    else:
        raise ValueError("Invalid RepoMetadataCollectionType: '{}'".format(repo_type))

    return unsuitable_changes


def get_repo_metadata(path='.', search_parent_directories=True, repo_type=None, data_file=None):
    """ Get metadata about a Git repository

    Args:
        path (:obj:`str`): path to file or directory in a Git repository
        search_parent_directories (:obj:`bool`, optional): if :obj:`True`, have `GitPython` search for
            the root of the repository among the parent directories of :obj:`path`
        repo_type (:obj:`RepoMetadataCollectionType`, optional): repo type having metadata collected
        data_file (:obj:`str`, optional): pathname of a data file in the repo; must be provided if
            `repo_type` is `RepoMetadataCollectionType.DATA_REPO`

    Returns:
        :obj:`tuple`: of :obj:`RepositoryMetadata`:, :obj:`list` of :obj:`str`: repository metadata,
            and, if `repo_type` is provided, changes in the repository that make it unsuitable
    """
    repo = get_repo(path=path, search_parent_directories=search_parent_directories)
    unsuitable_changes = None
    if repo_type:
        unsuitable_changes = repo_suitability(repo, repo_type, data_file=data_file)

    url = str(repo.remote('origin').url)
    branch = str(repo.active_branch.name)
    revision = str(repo.head.commit.hexsha)
    return RepositoryMetadata(url, branch, revision), unsuitable_changes


@dataclass
class RepositoryMetadata(object):
    """ Represents metadata about a Git repository

    Attributes:
        url (:obj:`str`): URL
        branch (:obj:`str`): branch
        revision (:obj:`str`): revision
    """

    url: str
    branch: str
    revision: str


class GitHubRepoForTests(object):
    """ Functions for managing test GitHub repos """

    @staticmethod
    def get_github_api_token():
        config = core.get_config()['wc_utils']['github']
        if config['github_api_token'] == '':
            raise ValueError(f"'github_api_token' is not provided by config")
        return config['github_api_token']

    def __init__(self, name, organization='KarrLab'):
        """ Manage a test GitHub repository

        Args:
            name (:obj:`str`): name of the repo
            organization (:obj:`str`): GitHub organization home for the repo; default='KarrLab'
        """
        self.api_token = self.get_github_api_token()
        self.name = name
        self.organization = organization

    def make_test_repo(self, dirname=None):
        """ Create a test GitHub repository

        Args:
            dirname (:obj:`str`, optional): a directory name; if present, clone the repo into it

        Returns:
            :obj:`obj`: if `dirname` is provided, a `gitpython` reference to a local clone of the test
                GitHub repository; otherwise, the URL of the test GitHub repository
        """
        # delete test repo in case it wasn't deleted previously
        self.delete_test_repo()
        g = github.Github(self.api_token)
        org = g.get_organization(self.organization)
        org.create_repo(name=self.name, private=False, auto_init=True)
        repo_url = 'https://github.com/{}/{}.git'.format(self.organization, self.name)
        if dirname:
            # clone from GitHub
            self.repo = git.Repo.clone_from(repo_url, dirname)
            return self.repo
        return repo_url

    def delete_test_repo(self):
        g = github.Github(self.api_token)
        try:
            repo = g.get_repo("{}/{}".format(self.organization, self.name))
            repo.delete()
        except UnknownObjectException:
            # ignore exception that occurs when delete does not find the repo
            pass
        except Exception:   # pragma: no cover; cannot deliberately raise an other exception
            # re-raise all other exceptions
            raise