unixorn/git-extra-commands

View on GitHub
bin/git-restore-mtime

Summary

Maintainability
Test Coverage
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# git-restore-mtime - Change mtime of files based on commit date of last change
#
#    Copyright (C) 2012 Rodrigo Silva (MestreLion) <linux@rodrigosilva.com>
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program. See <http://www.gnu.org/licenses/gpl.html>
#
# Change the modification time (mtime) of all files in work tree, based on the
# date of the most recent commit that modified the file.
#
# Useful prior to generating release tarballs, so each file is archived with a
# date that resembles the date when the file was actually last modified.
# (assuming the actual modification date and its commit date are close)

# By default ignores all ignored and untracked files, and also refuses to work
# on trees with uncommitted changes.

# This file is a command-line script: refuse to be imported as a module.
if __name__ != "__main__":
    message = "%s should not be used as a module." % __name__
    raise ImportError(message)

import subprocess, shlex
import sys, os.path
import logging as logger
import argparse
import time

def text(s):
    """Return *s* as text.

    On Python 3 the argument is decoded as UTF-8; on Python 2 it is
    returned unchanged.
    """
    if sys.version_info[:2] < (3, 0):
        return s
    return s.decode('utf-8')

# Command-line interface: boolean switches first, then the positional
# PATHSPEC list and the two repository-location overrides.
parser = argparse.ArgumentParser(
    description=(
        'Restore original modification time of files based on '
        'the date of the most recent commit that modified them. '
        'Useful when generating release tarballs.'
    ),
)

parser.add_argument(
    '--quiet', '-q',
    action='store_true',
    help='suppress informative messages and summary statistics.',
)

parser.add_argument(
    '--verbose', '-v',
    action='store_true',
    help=(
        'print additional information for each processed file. '
        'Overwrites --quiet.'
    ),
)

parser.add_argument(
    '--force', '-f',
    action='store_true',
    help='force execution on trees with uncommitted changes.',
)

parser.add_argument(
    '--merge', '-m',
    action='store_true',
    help=(
        'include merge commits. Leads to more recent mtimes '
        'and more files per commit, thus with the same mtime '
        '(which may or may not be what you want). Including '
        'merge commits may lead to less commits being evaluated '
        '(all files are found sooner), which improves performance, '
        'sometimes substantially. But since merge commits are '
        'usually huge, processing them may also take longer, '
        'sometimes substantially. By default merge logs are only '
        'used for files missing from regular commit logs.'
    ),
)

parser.add_argument(
    '--first-parent',
    action='store_true',
    help=(
        'pass --first-parent to git whatchanged to hide the '
        'second parent from the merge commit logs. Only has any '
        'effect if --merge is also specified or --skip-missing '
        'is not specified and there were files not found in regular '
        'commit logs.'
    ),
)

# Stored inverted: args.missing stays True unless --skip-missing is given.
parser.add_argument(
    '--skip-missing', '-s',
    dest='missing',
    action='store_false',
    default=True,
    help=(
        'do not try to find missing files. If some files were '
        'not found in regular commit logs, by default it re-tries '
        'using merge commit logs for these files (if --merge was '
        'not used already). This option disables this behavior, '
        'which may slightly improve performance, but files '
        'found only in merge commits will not be updated.'
    ),
)

# Stored inverted: args.dirs stays True unless --no-directories is given.
parser.add_argument(
    '--no-directories', '-D',
    dest='dirs',
    action='store_false',
    default=True,
    help=(
        'do not update directory mtime for files created, '
        'renamed or deleted in it. Note: just modifying a file '
        'will not update its directory mtime.'
    ),
)

parser.add_argument(
    '--test', '-t',
    dest='test',
    action='store_true',
    default=False,
    help='test run: do not actually update any file',
)

# args.timeformat holds the git pretty-format placeholder for the timestamp:
# '%at' by default, '%ct' when --commit-time is given.
parser.add_argument(
    '--commit-time', '-c',
    dest='timeformat',
    action='store_const',
    const='%ct',
    default='%at',
    help='use commit time instead of author time',
)

parser.add_argument(
    'pathspec',
    nargs='*',
    default=[os.path.curdir],
    help=(
        'only modify paths (dirs or files) matching PATHSPEC, '
        'relative to current directory. '
        'Default is to modify all non-ignored, tracked files.'
    ),
)

parser.add_argument(
    '--work-tree',
    dest='workdir',
    help=(
        'specify where the work tree is. '
        'Default for most repositories is current directory.'
    ),
)

parser.add_argument(
    '--git-dir',
    dest='gitdir',
    help=(
        'specify where the git repository is. '
        'Default for most repositories <work-tree>/.git'
    ),
)

args = parser.parse_args()

# Base git command line; the optional repository locations given on the
# command line are appended as git's own global options.
gitcmd = ['git']
if args.workdir:
    gitcmd += ["--work-tree=%s" % args.workdir]
if args.gitdir:
    gitcmd += ["--git-dir=%s" % args.gitdir]

# --verbose is checked first, so it wins when both flags are given.
if args.verbose:
    level = logger.DEBUG
else:
    level = logger.WARN if args.quiet else logger.INFO

logger.basicConfig(level=level, format='%(message)s')


# UI done, it's show time!
# Wall time on purpose: CPU time is not realistic for users.
start = time.time()

# Running counters, all starting at zero.
lines = 0
loglines = 0
commits = 0
totalfiles = 0
ignoredfiles = 0
files = 0
touches = 0
errors = 0

# Number of still-missing files handed to each merge-log retry.
stepmissing = 100


# First things first: Where and Who are we?
# A single rev-parse call prints the work tree top level on the first line
# and the git directory on the second.
try:
    revparse_output = subprocess.check_output(
        gitcmd + shlex.split('rev-parse --show-toplevel --git-dir'))

except subprocess.CalledProcessError as e:
    # rev-parse couldn't find git repo, and already informed user.
    # So we just leave with the same exit status.
    sys.exit(e.returncode)

else:
    output_lines = text(revparse_output).split('\n')
    workdir, gitdir = output_lines[:2]

    workdir = os.path.abspath(workdir)
    gitdir = os.path.abspath(gitdir)


# Get the files managed by git
# `ls-files --full-name` prints one path per line, relative to the work tree
# root, whatever the current directory is.
lsfileslist = set()
gitobj = subprocess.Popen(gitcmd + shlex.split('ls-files --full-name') +
                          ['--'] + args.pathspec,
                          stdout=subprocess.PIPE)
for line in gitobj.stdout:
    # Decode first: on Python 3 the pipe yields bytes, and os.path refuses
    # to mix bytes with str paths.
    # The path is already relative to the work tree root, so it only needs
    # its separators normalized. Resolving it against the current directory
    # (as os.path.relpath does) is wrong when run from a subdirectory.
    lsfileslist.add(os.path.normpath(text(line.strip())))

# Output fully consumed: release the pipe and reap the child process.
gitobj.stdout.close()
gitobj.wait()

# List files matching user pathspec, relative to current directory
# git commands always print paths relative to work tree root
filelist = set()
dirlist  = set()

# Work tree path with a trailing separator, so that the containment test
# below compares whole path components rather than characters.
workdir_prefix = os.path.join(workdir, '')

for path in args.pathspec:

    # Normalize user input so ./doc = doc/ = doc/../doc/. = doc
    path = os.path.normpath(path)

    # Is path inside the work tree?
    # (a plain character prefix would also accept siblings of the work tree
    # whose name merely starts with the same characters, e.g. "<workdir>-old")
    abspath = os.path.abspath(path)
    if abspath != workdir and not abspath.startswith(workdir_prefix):
        logger.warning("WARNING: Skipping pathspec outside work tree: %s", path)
        continue

    # git does not care if it's a broken symlink, hence lexists
    if not os.path.lexists(path):
        logger.warning("WARNING: Skipping non-existing pathspec: %s", path)
        continue

    # file or symlink (to file, to dir or broken - git handles the same way)
    if os.path.isfile(path) or os.path.islink(path):
        # Always add them relative to worktree root
        filelist.add(os.path.relpath(path, workdir))

    # dir
    else:
        for root, subdirs, filenames in os.walk(path):
            # Do not descend into the git directory itself
            if gitdir in [os.path.abspath(os.path.join(root, subdir))
                          for subdir in subdirs]:
                subdirs.remove(os.path.basename(gitdir))

            # A '.git' entry may also be a plain file at the work tree root
            if os.path.abspath(root) == workdir and '.git' in filenames:
                filenames.remove('.git')

            if args.dirs:
                # The work tree root itself is stored as the empty string
                reldir = os.path.relpath(root, workdir)
                if reldir == '.':
                    reldir = ''
                dirlist.add(reldir)

            for filename in filenames:
                # Always add them relative to worktree root
                filelist.add(os.path.relpath(os.path.join(root, filename),
                                             workdir))

# Keep only the files git actually tracks
filelist &= lsfileslist

totaldirs  = dirs  = len(dirlist)
totalfiles = files = len(filelist)
logger.info("{:,} files to be processed in work dir".format(totalfiles))


# Discard untracked and ignored files
ignoredlist = set()
gitobj = subprocess.Popen(
    gitcmd + shlex.split('status --porcelain --ignored') + ['--'] + args.pathspec,
    stdout=subprocess.PIPE)
for rawline in gitobj.stdout:
    entry = text(rawline.strip())
    # Two status characters, one separator character, then the path
    status, filespec = entry[:2], entry[3:]

    # Make sure the slash matches the os; for Windows we need a backslash
    if os.sep != '/':
        filespec = filespec.replace('/', os.sep)

    # Any status other than '??' / '!!' counts as a local change
    # (also safe to ignore: 'A ', ' M')
    if status not in ('??', '!!'):
        if args.force:
            continue
        logger.critical(
            "ERROR: There are local changes in the working directory.\n"
            "This could lead to undesirable results for modified files.\n"
            "Please, commit your changes (or use --force) and try again.\n"
            "Aborting")
        gitobj.kill()
        sys.exit(1)

    # filespec can be a dir, so we must iterate on filelist
    isdirspec = filespec.endswith(os.sep)
    for candidate in filelist:
        if candidate == filespec or (isdirspec and candidate.startswith(filespec)):
            logger.debug("Ignoring: %s", candidate)
            ignoredlist.add(candidate)
            files -= 1
            ignoredfiles += 1

if ignoredfiles:
    filelist.difference_update(ignoredlist)
    logger.info(
        "{:,} files to process after ignoring {:,}".format(files, ignoredfiles))


# Process the log until all files are 'touched'
logger.debug("Line #\tLog #\tFiles\tmtime\tFile")

def _unquote_path(path):
    """Return the path encoded by the double-quoted, backslash-escaped *path*."""
    inner = path[1:-1]  # drop the enclosing double quotes
    if sys.version_info[0] < 3:
        return inner.decode("string-escape")
    # Python 3 has no "string-escape" codec and str has no decode():
    # go through bytes, let 'unicode_escape' resolve the backslash escapes
    # (each escape yields one byte value), map those 1:1 back to bytes with
    # latin-1, and finally decode the resulting bytes as UTF-8.
    return (inner.encode('utf-8')
                 .decode('unicode_escape')
                 .encode('latin-1')
                 .decode('utf-8'))

def parselog(merge=False, filterlist=None):
    """Read `git whatchanged` output and set the mtime of pending paths.

    Each timestamp line of the log sets the current mtime; each following
    file line whose path is still in the global `filelist` gets that mtime
    applied (unless --test was given) and is removed from `filelist`.
    When directory updates are enabled, the parent directory of an added or
    deleted file is handled the same way through the global `dirlist`.
    Reading stops as soon as no file is left to process.

    merge:      pass -m to git, so merge commits list their files too.
    filterlist: list of paths the log is restricted to (default: no paths).

    Updates the global counters loglines, commits, files, dirs, touches
    and errors. Failures of os.utime() are logged and counted, not raised.
    """
    global loglines, dirs, files, touches, errors, commits

    # None instead of a shared mutable [] default
    if filterlist is None:
        filterlist = []

    gitobj = subprocess.Popen(gitcmd + shlex.split('whatchanged --pretty={}'.format(args.timeformat)) +
                              (['-m'] if merge else []) +
                              (['--first-parent'] if args.first_parent else []) +
                              ['--'] + filterlist,
                              stdout=subprocess.PIPE)
    for line in gitobj.stdout:
        loglines += 1
        line = text(line.strip())

        # Blank line between Date and list of files
        if not line: continue

        # File line
        if line.startswith(':'):
            # If line describes a rename, linetok has three tokens, otherwise
            # two.
            linetok = line.split('\t')
            status = linetok[0]
            file = linetok[-1]

            # Unquote first: converting '/' to a backslash beforehand would
            # corrupt the escape sequences of a quoted path.
            if file.startswith('"'):
                file = _unquote_path(file)

            # Make sure the slash matches the os; for Windows we need a backslash
            if not os.sep == '/':
                file = file.replace('/',os.sep)

            if file in filelist:
                logger.debug("%d\t%d\t%d\t%s\t%s",
                             loglines, commits, files,
                             time.ctime(mtime), file)
                filelist.remove(file)
                files -= 1
                try:
                    if not args.test:
                        os.utime(os.path.join(workdir, file), (mtime, mtime))
                    touches += 1
                except Exception as e:
                    logger.error("ERROR: %s\n", e)
                    errors += 1

            if args.dirs:
                dir = os.path.dirname(file)
                # Only a status ending in A or D updates the directory
                if status[-1] in ['A', 'D'] and dir in dirlist:
                    logger.debug("%d\t%d\t-\t%s\t%s",
                                 loglines, commits,
                                 time.ctime(mtime), dir)
                    dirlist.remove(dir)
                    try:
                        if not args.test:
                            os.utime(os.path.join(workdir, dir), (mtime, mtime))
                        dirs -= 1
                    except Exception as e:
                        logger.error("ERROR: %s\n", e)

        # Date line
        else:
            commits += 1
            mtime = int(line)

        # All files done?
        if not files:
            break

    # git may still be writing after an early break: stop it, then release
    # the pipe and reap the child so it does not linger as a zombie.
    try:
        gitobj.terminate()
    except OSError:
        pass
    gitobj.stdout.close()
    gitobj.wait()


# Regular pass over the log, restricted to the user's pathspec
parselog(args.merge, args.pathspec)

# Missing files
if filelist:

    # Try to find them in merge logs, if not done already
    # (usually HUGE, thus MUCH slower!)
    if args.missing and not args.merge:
        # Snapshot: parselog() removes entries from filelist as it finds them
        filterlist = list(filelist)
        for i in range(0, len(filterlist), stepmissing):
            # Everything found already: no point in spawning git again
            if not filelist:
                break
            parselog(merge=True, filterlist=filterlist[i:i+stepmissing])

    # Still missing some?
    for file in filelist:
        logger.warning("WARNING: not found in log: %s", file)


# Final statistics
# TODO: use git-log --before=mtime to brag about skipped log entries
elapsed = time.time() - start
logger.info(
    "Statistics:\n"
    "{:13,.2f} seconds\n"
    "{:13,} log lines processed\n"
    "{:13,} commits evaluated"
    "".format(elapsed, loglines, commits))

# Lines printed only when their condition holds: (condition, count, template)
optional_lines = (
    (touches != totalfiles, totalfiles,   "{:13,} total files"),
    (ignoredfiles,          ignoredfiles, "{:13,} ignored files"),
    (files,                 files,        "{:13,} missing files"),
    (errors,                errors,       "{:13,} update errors"),
)
for wanted, count, template in optional_lines:
    if wanted:
        logger.info(template.format(count))

logger.info("{:13,} updated files".format(touches))

if args.dirs:
    updated_dirs = totaldirs - dirs
    logger.info("{:13,} updated directories".format(updated_dirs))

if args.test:
    logger.info("TEST RUN - No files modified!")

# Statistics for some large projects

#bash
#         0.27 seconds
#        5,750 log lines processed
#           62 commits evaluated
#        1,155 updated files

#git
#         3.71 seconds
#       96,702 log lines processed
#       24,217 commits evaluated
#        2,495 updated files

#git (--merge)
#         0.19 seconds
#        2,586 log lines processed
#            3 commits evaluated
#        2,495 updated files

#wine
#        13.53 seconds
#      443,979 log lines processed
#       91,703 commits evaluated
#        6,005 updated files

#linux kernel
#        59.11 seconds
#    1,484,567 log lines processed
#      313,164 commits evaluated
#       40,902 updated files

#linux kernel (--skip-missing)
#WARNING: not found in log: arch/arm/mach-sa1100/include/mach/reset.h
#WARNING: not found in log: lib/raid6/unroll.awk
#Statistics:
#        51.88 seconds
#    1,484,558 log lines processed
#      313,161 commits evaluated
#       40,902 total files
#            2 missing files
#       40,900 updated files

#linux kernel (--merge)
#       501.17 seconds
#   34,495,300 log lines processed
#      238,711 commits evaluated
#       40,902 updated files