pygcgen/fetcher.py from topic2k/pygcgen

pygcgen/fetcher.py
Summary

Maintainability

2 days
Test Coverage

Issues
# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function

import os
import re
import sys
import subprocess
import threading
if sys.version_info.major == 3:
    from builtins import object, range

from agithub.GitHub import GitHub

from .pygcgen_exceptions import GithubApiError


# git config keys that are queried (in this order) for a GitHub token
# when no token was passed in via the options.
GH_CFG_VARS = ["github.pygcgen.token", "github.token"]
# Page size sent as ``per_page`` with every paginated GitHub request.
PER_PAGE_NUMBER = 100
# Name of the environment variable used as the last token fallback.
CHANGELOG_GITHUB_TOKEN = "CHANGELOG_GITHUB_TOKEN"
# Error text used when a 403 response reports no remaining rate limit.
GH_RATE_LIMIT_EXCEEDED_MSG = \
    "GitHub API rate limit exceeded, change log may be missing some issues. " \
    "Please provide a token with -t option or in git config."
# Warning printed when no token could be found anywhere.
# NOTE(review): GitHub's documentation states 60 unauthenticated requests
# per hour; the "50" in the text below may be outdated -- confirm.
NO_TOKEN_PROVIDED = \
    "Warning: No token provided. Neither -t option, git config or variable " \
    "$CHANGELOG_GITHUB_TOKEN found. This script can make only " \
    "50 requests to GitHub API per hour without token!"
# Pseudo tag name returned together with the repository creation date.
REPO_CREATED_TAG_NAME = "repo_created_at"


class Fetcher(object):
    """
    A Fetcher is responsible for all requests to GitHub and all basic
    manipulation with related data (such as filtering, validating, etc.).

    All requests go through one shared ``GitHub`` client instance that is
    created in :meth:`__init__` and stored in ``self.github``.
    """

    def __init__(self, options):
        """
        Resolve the GitHub token and create the API client.

        :param options: option object; the attributes ``user``, ``project``,
                        ``token`` and ``github_endpoint`` are read here,
                        further attributes are read by the other methods.
        """
        self.options = options
        self.first_issue = None
        self.events_cnt = 0
        self.fetch_github_token()
        # Values coming from the command line or from a subprocess may be
        # byte strings; normalise them to text.
        for attr in ("user", "project", "token"):
            value = getattr(self.options, attr)
            if isinstance(value, bytes):
                setattr(self.options, attr, value.decode("utf8"))
        if options.token:
            self.github = GitHub(
                token=options.token,
                api_url=options.github_endpoint
            )
        else:
            self.github = GitHub(api_url=options.github_endpoint)

    def fetch_github_token(self):
        """
        Fetch GitHub token. First try to use variable provided
        by --token option, otherwise try to fetch it from git config
        and last CHANGELOG_GITHUB_TOKEN env variable.

        The result is stored in ``self.options.token``. If no token can be
        found a warning is printed.

        :returns: Nothing
        """

        if not self.options.token:
            try:
                for v in GH_CFG_VARS:
                    cmd = ['git', 'config', '--get', v]
                    self.options.token = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE).communicate()[0].strip()
                    if self.options.token:
                        break
            except (subprocess.CalledProcessError, OSError):
                # Best effort only: git may not be installed. OSError is
                # caught instead of WindowsError because the latter name
                # only exists on Windows (where it is an OSError anyway).
                pass
        if not self.options.token:
            self.options.token = os.environ.get(CHANGELOG_GITHUB_TOKEN)
        if not self.options.token:
            print(NO_TOKEN_PROVIDED)

    def get_all_tags(self):
        """
        Fetch all tags for repository from Github.

        If no tags are found and the ``quiet`` option is not set, a warning
        is printed and the program exits.

        :return: tags in repository
        :rtype: list
        :raises GithubApiError: if a request does not return status 200
        """

        verbose = self.options.verbose
        gh = self.github
        user = self.options.user
        repo = self.options.project
        if verbose:
            print("Fetching tags...")

        tags = []
        page = 1
        while page > 0:
            if verbose > 2:
                print(".", end="")
            rc, data = gh.repos[user][repo].tags.get(
                page=page, per_page=PER_PAGE_NUMBER)
            if rc == 200:
                tags.extend(data)
            else:
                self.raise_GitHubError(rc, data, gh.getheaders())
            page = NextPage(gh)
        if verbose > 2:
            print(".")

        if not tags:
            if not self.options.quiet:
                print("Warning: Can't find any tags in repo. Make sure, that "
                      "you push tags to remote repo via 'git push --tags'")
                # sys.exit() instead of the site-provided exit(), which is
                # not guaranteed to be available in every interpreter setup.
                sys.exit()
        if verbose > 1:
            print("Found {} tag(s)".format(len(tags)))
        return tags

    def fetch_closed_issues_and_pr(self):
        """
        This method fetches all closed issues and separate them to
        pull requests and pure issues (pull request is kind of issue
        in term of GitHub).

        Fetching stops early as soon as at least ``options.max_issues``
        entries were received. The last entry of the last fetched page is
        stored in ``self.first_issue`` (an empty list if that page was
        empty).

        :rtype: list, list
        :return: issues, pull-requests
        :raises GithubApiError: if a request does not return status 200
        """

        verbose = self.options.verbose
        gh = self.github
        user = self.options.user
        repo = self.options.project
        if verbose:
            print("Fetching closed issues and pull requests...")

        issues = []
        data = []
        page = 1
        while page > 0:
            if verbose > 2:
                print(".", end="")
            rc, data = gh.repos[user][repo].issues.get(
                page=page, per_page=PER_PAGE_NUMBER,
                state='closed', filter='all'
            )
            if rc == 200:
                issues.extend(data)
            else:
                self.raise_GitHubError(rc, data, gh.getheaders())
            if len(issues) >= self.options.max_issues:
                break
            page = NextPage(gh)
        self.first_issue = data[-1] if len(data) > 0 else []
        if verbose > 2:
            print(".")

        # separate arrays of issues and pull requests:
        prs = []
        iss = []
        for i in issues:
            if "pull_request" in i:
                prs.append(i)
            else:
                iss.append(i)
        if verbose > 1:
            print("\treceived {} issues and  {} pull requests.".format(
                len(iss), len(prs))
            )
        return iss, prs

    def fetch_closed_pull_requests(self):
        """
        Fetch all pull requests. We need them to detect "merged_at" parameter

        If ``options.release_branch`` is set, only pull requests with that
        base branch are requested.

        :rtype: list
        :return: all pull requests
        :raises GithubApiError: if a request does not return status 200
        """

        pull_requests = []
        verbose = self.options.verbose
        gh = self.github
        user = self.options.user
        repo = self.options.project
        if verbose:
            print("Fetching closed pull requests...")
        page = 1
        while page > 0:
            if verbose > 2:
                print(".", end="")

            if self.options.release_branch:
                rc, data = gh.repos[user][repo].pulls.get(
                    page=page, per_page=PER_PAGE_NUMBER, state='closed',
                    base=self.options.release_branch
                )
            else:
                rc, data = gh.repos[user][repo].pulls.get(
                    page=page, per_page=PER_PAGE_NUMBER, state='closed',
                )

            if rc == 200:
                pull_requests.extend(data)
            else:
                self.raise_GitHubError(rc, data, gh.getheaders())
            page = NextPage(gh)
        if verbose > 2:
            print(".")
        if verbose > 1:
            print("\tfetched {} closed pull requests.".format(
                len(pull_requests))
            )
        return pull_requests

    def fetch_repo_creation_date(self):
        """
        Get the creation date of the repository from GitHub.

        :rtype: str, str
        :return: special tag name, creation date as ISO date string
        :raises GithubApiError: if the request does not return status 200
        """
        gh = self.github
        user = self.options.user
        repo = self.options.project
        rc, data = gh.repos[user][repo].get()
        if rc == 200:
            return REPO_CREATED_TAG_NAME, data["created_at"]
        self.raise_GitHubError(rc, data, gh.getheaders())
        # Only reached if raise_GitHubError is overridden not to raise.
        return None, None

    def fetch_events_async(self, issues, tag_name):
        """
        Fetch events for all issues and store them in each issue
        under the key ``'events'``.

        The issues are processed in batches of
        ``options.max_simultaneous_requests`` threads; a batch is joined
        before the next one is started. The total number of fetched events
        is accumulated in ``self.events_cnt``.

        :param list issues: all issues
        :param str tag_name: name of the tag to fetch events for
        :returns: the unchanged (empty) input if there are no issues,
                  otherwise nothing
        """

        if not issues:
            return issues

        max_simultaneous_requests = self.options.max_simultaneous_requests
        verbose = self.options.verbose
        gh = self.github
        user = self.options.user
        repo = self.options.project
        self.events_cnt = 0
        if verbose:
            print("fetching events for {} {}... ".format(
                len(issues), tag_name)
            )

        # "+=" on an attribute is a read-modify-write and is executed by
        # several worker threads, so it has to be serialised.
        counter_lock = threading.Lock()

        def worker(issue):
            # NOTE(review): all workers share the one client ``gh``; the
            # headers read by NextPage()/getheaders() may belong to another
            # thread's response -- confirm whether the client is thread-safe.
            page = 1
            issue['events'] = []
            while page > 0:
                rc, data = gh.repos[user][repo].issues[
                    issue['number']].events.get(
                    page=page, per_page=PER_PAGE_NUMBER)
                if rc == 200:
                    issue['events'].extend(data)
                    with counter_lock:
                        self.events_cnt += len(data)
                else:
                    self.raise_GitHubError(rc, data, gh.getheaders())
                page = NextPage(gh)

        cnt = len(issues)
        for start in range(0, cnt, max_simultaneous_requests):
            stop = min(start + max_simultaneous_requests, cnt)
            # Track only the threads of the current batch so that already
            # finished threads are not joined again and again.
            batch = []
            for idx in range(start, stop):
                t = threading.Thread(target=worker, args=(issues[idx],))
                batch.append(t)
                t.start()
                if verbose > 2:
                    print(".", end="")
                    if not idx % PER_PAGE_NUMBER:
                        print("")
            for t in batch:
                t.join()
        if verbose > 2:
            print(".")

    def fetch_date_of_tag(self, tag):
        """
        Fetch time for tag from repository.

        :param dict tag: dictionary with tag information; ``tag["name"]``
                         and ``tag["commit"]["sha"]`` are read
        :rtype: str
        :return: time of specified tag as ISO date string
        :raises GithubApiError: if the request does not return status 200
        """

        if self.options.verbose > 1:
            print("\tFetching date for tag {}".format(tag["name"]))
        gh = self.github
        user = self.options.user
        repo = self.options.project

        rc, data = gh.repos[user][repo].git.commits[
            tag["commit"]["sha"]].get()
        if rc == 200:
            return data["committer"]["date"]
        self.raise_GitHubError(rc, data, gh.getheaders())

    def fetch_commit(self, event):
        """
        Fetch commit data for specified event.

        :param dict event: dictionary with event information;
                           ``event["commit_id"]`` is read
        :rtype: dict
        :return: dictionary with commit data
        :raises GithubApiError: if the request does not return status 200
        """

        gh = self.github
        user = self.options.user
        repo = self.options.project

        rc, data = gh.repos[user][repo].git.commits[
            event["commit_id"]].get()
        if rc == 200:
            return data
        self.raise_GitHubError(rc, data, gh.getheaders())

    @staticmethod
    def raise_GitHubError(rc, data, header):
        """
        Raise a GithubApiError describing a failed request.

        :param int rc: HTTP status code of the response
        :param data: response body; normally a dict with a ``"message"``
                     entry, but any other object is accepted as well
        :param header: response headers as dict or as iterable of
                       (name, value) pairs
        :raises GithubApiError: always
        """
        # HTTP header names are case-insensitive and the spelling differs
        # between servers / Python versions, so compare in lower case.
        hdr = dict(
            (str(name).lower(), value)
            for name, value in dict(header or ()).items()
        )
        remaining = str(hdr.get("x-ratelimit-remaining")).strip()
        if rc == 403 and remaining == '0':
            # TODO: add auto-retry
            raise GithubApiError(GH_RATE_LIMIT_EXCEEDED_MSG)
        # The body is not guaranteed to be a dict with a "message" key
        # (e.g. a plain text or HTML error page). Fall back to the raw body
        # so that the API error is not hidden behind a KeyError/TypeError.
        try:
            message = data["message"]
        except (KeyError, TypeError, IndexError):
            message = data
        raise GithubApiError("({0}) {1}".format(rc, message))


def NextPage(gh):
    """
    Checks if a GitHub call returned multiple pages of data.

    The ``Link`` header of the last response is searched for the entry
    with ``rel="next"`` and the value of its ``page`` query parameter is
    returned.

    :param gh: GitHub() instance; only ``gh.getheaders()`` is used
    :rtype: int
    :return: number of next page or 0 if no next page
    """
    link = None
    # Header names are case-insensitive ("Link" vs. "link").
    for name, value in dict(gh.getheaders()).items():
        if str(name).lower() == 'link':
            link = value
            break
    if not link:
        return 0

    for part in link.split(','):
        target, _, params = part.partition(';')
        is_next = False
        for param in params.split(';'):
            key, _, val = param.partition('=')
            if key.strip().lower() != 'rel':
                continue
            if 'next' in val.strip().strip('"').split():
                is_next = True
                break
        if not is_next:
            continue
        # Anchor on "?" / "&" so that "per_page=100" is never mistaken
        # for the "page" parameter, whatever the parameter order is.
        match = re.search(r'[?&]page=(\d+)', target)
        if match:
            return int(match.group(1))
    return 0