emory-libraries/eulfedora

View on GitHub
scripts/repo-cp

Summary

Maintainability
Test Coverage
#!/usr/bin/env python

# file scripts/repo-cp
#
#   Copyright 2015 Emory University Libraries & IT Services
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import argparse
import base64
import binascii
from copy import copy
from six.moves import configparser
import hashlib
import logging.config
from lxml import etree
import math
import glob
import os
import re

from eulxml.xmlmap import load_xmlobject_from_string
from eulfedora.server import Repository
from eulfedora.models import DigitalObject
from eulfedora.util import ChecksumMismatch, PermissionDenied, \
    RequestFailed
from eulfedora.xml import FoxmlDigitalObject
from eulfedora.syncutil import estimate_object_size, ArchiveExport, \
    humanize_file_size, sync_object


def _build_arg_parser():
    """Build the command-line argument parser for this script."""
    parser = argparse.ArgumentParser()

    # config file options
    cfg_args = parser.add_argument_group('Config file options')
    cfg_args.add_argument(
        '--config', '-c', default='$HOME/.repocpcfg',
        help='Load the specified config file (default: %(default)s)')
    cfg_args.add_argument(
        'source',
        help='Source repository for content to be copied')
    cfg_args.add_argument(
        'dest',
        help='Destination repository for content to be copied')

    # list of pids
    parser.add_argument(
        'pids', metavar='PID', nargs='*',
        help='list of pids to copy')
    parser.add_argument(
        '--file', '-f', required=False,
        help='Input file with a list of pids, one pid per line')
    parser.add_argument(
        '--progress', '-p',
        help='Show progress (only available for archive exports)',
        action='store_true', default=False)
    parser.add_argument(
        '--requires-auth',
        help='Datastream content urls require authentication',
        action='store_true', default=False)
    parser.add_argument(
        '--export-format', '-e',
        choices=['migrate', 'archive', 'archive-xml'], default='migrate',
        help='Fedora export format to use.  Use archive if migrate exports ' +
             'fail with checksum errors or if content URLs are not accessible' +
             ' to the destination server. (default: %(default)s)')
    parser.add_argument(
        '--omit-checksums', default=False, action='store_true',
        help='Omit checksums from datastreams (default: %(default)s)',)
    parser.add_argument(
        '--archive', '-a', action='store_const',
        const='archive', dest='export_format',
        help='Use archive export format (equivalent to --export-format archive)')
    parser.add_argument(
        '--archive-xml', action='store_const',
        const='archive-xml', dest='export_format',
        help='Use archival export for xml datastreams only; use fedora ' +
             'datastream dissemination urls for other content.  Can be used ' +
             'with --requires-auth. ' +
             '(Equivalent to --export-format archive-xml)')
    parser.add_argument(
        '--verify', action='store_true', default=False,
        help='Verify checksums as datastreams are decoded (default: %(default)s)')
    parser.add_argument(
        '--verbosity', '-v', default='WARN',
        choices=['ERROR', 'WARN', 'INFO', 'DEBUG'],
        help='Output verbosity (default: %(default)s)')
    return parser


def _load_repo(cfg, name, role):
    """Return the airlock or repository identified by ``name``.

    :param cfg: loaded config parser
    :param name: config section name, or path to an airlock directory
    :param role: label used in the error message ('Source' or 'Destination')
    :returns: an :class:`Airlock`, a :class:`Repository`, or None (after
        printing a message) when ``name`` is neither an airlock directory
        nor a configured section
    """
    if is_airlock(cfg, name):
        return Airlock(name)
    if not cfg.has_section(name):
        print('%s repository %s is not configured' % (role, name))
        return None
    return Repository(cfg.get(name, 'fedora_root'),
                      cfg.get(name, 'fedora_user'),
                      cfg.get(name, 'fedora_password'))


def _read_pid_file(path):
    """Read pids from ``path``, one per line, ignoring blank lines."""
    with open(path) as pidlistfile:
        # allow whitespace on front or end of pid, for convenience
        stripped = [p.strip() for p in pidlistfile.read().splitlines()]
    # skip empty lines (e.g. a trailing newline) so they are not
    # treated as pids
    return [p for p in stripped if p]


def repo_copy():
    """Copy objects from one repository (or airlock directory) to another.

    Command-line entry point: parses ``sys.argv``, loads the config file,
    resolves the source and destination, and then processes each pid in
    turn.  When the source is an airlock, every ``*.xml`` file in that
    directory is ingested into the destination.  When the destination is
    an airlock, each object's export is written to ``<pid>.xml`` in that
    directory.  Otherwise objects are copied with ``sync_object``.

    Errors for an individual pid are reported on stdout and do not stop
    processing of the remaining pids.  Returns None.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    configfile_path = args.config
    if '$HOME' in configfile_path:
        # only look up the home directory when it is actually needed, and
        # don't fail with a KeyError when HOME is not set
        home_dir = os.environ.get('HOME', os.path.expanduser('~'))
        configfile_path = configfile_path.replace('$HOME', home_dir)

    cfg = configparser.ConfigParser()
    # readfp was removed in Python 3.12; Python 2 only provides readfp
    read_config = getattr(cfg, 'read_file', None) or cfg.readfp
    with open(configfile_path) as cfgfile:
        read_config(cfgfile)

    src_repo = _load_repo(cfg, args.source, 'Source')
    if src_repo is None:
        return
    dest_repo = _load_repo(cfg, args.dest, 'Destination')
    if dest_repo is None:
        return

    src_is_airlock = isinstance(src_repo, Airlock)
    dest_is_airlock = isinstance(dest_repo, Airlock)

    if src_is_airlock:
        # special case: if source is an "airlock", then the "pids" to
        # sync are any xml files in the directory
        pids = glob.glob(os.path.join(src_repo.path, '*.xml'))
    elif args.pids:
        pids = args.pids
    elif args.file:
        pids = _read_pid_file(args.file)
    else:
        print('Specify either one or more pids or a file with a list of pids')
        parser.print_help()
        return

    allow_overwrite = cfg.has_option(args.dest, 'allow_overwrite') and \
        cfg.getboolean(args.dest, 'allow_overwrite')

    # configure logging based on verbosity level requested
    logging.config.dictConfig(get_logging_config(args.verbosity))

    for pid in pids:
        try:
            # if source is an airlock, "pid" is actually a file
            if src_is_airlock:
                with open(pid, 'rb') as export:
                    result = dest_repo.ingest(export)
                if result:
                    print('%s copied' % result)
                else:
                    # false means not copied but no error; currently means
                    # skipped because object exists and overwrite not allowed
                    print('%s skipped' % pid)
                continue

            src_obj = src_repo.get_object(pid)

            # special case: if destination is an airlock
            # save the fedora export to a file in that airlock path
            if dest_is_airlock:
                export_filename = os.path.join(dest_repo.path,
                                               '%s.xml' % src_obj.pid)
                # request the export before creating the file, so a failed
                # request does not leave an empty xml file in the airlock
                response = src_obj.api.export(
                    src_obj, context=args.export_format, stream=True)
                with open(export_filename, 'wb') as export:
                    for chunk in response.iter_content(4096*1024):
                        export.write(chunk)
                print('Exported %s to %s' % (src_obj.pid, export_filename))

            else:
                result = sync_object(
                    src_obj, dest_repo, export_context=args.export_format,
                    overwrite=allow_overwrite, show_progress=args.progress,
                    requires_auth=args.requires_auth,
                    omit_checksums=args.omit_checksums,
                    verify=args.verify)
                if result:
                    print('%s copied' % result)
                else:
                    # false means not copied but no error; currently means
                    # skipped because object exists and overwrite not allowed
                    print('%s skipped' % pid)

        except ChecksumMismatch:
            print('ChecksumMismatch on %s' % pid)

        except RequestFailed as err:
            err_type = 'Error'
            if isinstance(err, PermissionDenied):
                err_type = 'Permission denied'
            print('%s importing %s to %s: %s' %
                  (err_type, pid, args.dest, err))

        except Exception as err:
            # deliberate best-effort: report the failure and move on to
            # the next pid rather than aborting the whole run
            print('Error copying %s: %s' % (pid, err))


def get_logging_config(level):
    """Return a ``logging.config.dictConfig`` dictionary for this script.

    The configuration defines one console stream handler with a
    message-only format and attaches it to the ``eulfedora.syncutil``
    logger, which is set to the requested ``level`` and does not propagate.

    :param level: log level name for the ``eulfedora.syncutil`` logger
    :returns: dict in the logging dictConfig schema (version 1)
    """
    # output only the log message, with no level or timestamp
    simple_formatter = {
        'format': '%(message)s',
    }
    # the handler passes everything through; filtering is done by the
    # logger level below
    console_handler = {
        'level': 'DEBUG',
        'class': 'logging.StreamHandler',
        'formatter': 'simple'
    }
    syncutil_logger = {
        'handlers': ['console'],
        'level': level,
        'propagate': False,
    }

    config = {
        'version': 1,
        'disable_existing_loggers': True,
    }
    config['formatters'] = {'simple': simple_formatter}
    config['handlers'] = {'console': console_handler}
    config['loggers'] = {'eulfedora.syncutil': syncutil_logger}
    return config


def is_airlock(cfg, name):
    """Return True if ``name`` should be treated as an airlock directory.

    A name counts as an airlock when it is not a section in the config
    file and it is an existing directory on disk.

    :param cfg: config parser to check for a section called ``name``
    :param name: repository name or filesystem path
    :returns: bool
    """
    # a configured section always wins over a directory of the same name
    return not cfg.has_section(name) and os.path.isdir(name)


class Airlock(object):
    """Marker object for a directory used in place of a repository.

    Holds only the directory path; the script checks for this type with
    ``isinstance`` to decide whether to read or write export files in
    ``path`` rather than talking to a repository.
    """

    def __init__(self, path):
        """Store the airlock directory ``path`` unchanged."""
        # directory where exported object xml files are read or written
        self.path = path


# Run the copy only when this file is executed as a script.
if __name__ == '__main__':
    repo_copy()
    # To profile a run, replace the call above with:
    # import profile
    # profile.run('repo_copy()')