scripts/repo-cp
#!/usr/bin/env python
# file scripts/repo-cp
#
# Copyright 2015 Emory University Libraries & IT Services
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import binascii
from copy import copy
from six.moves import configparser
import hashlib
import logging.config
from lxml import etree
import math
import glob
import os
import re
from eulxml.xmlmap import load_xmlobject_from_string
from eulfedora.server import Repository
from eulfedora.models import DigitalObject
from eulfedora.util import ChecksumMismatch, PermissionDenied, \
RequestFailed
from eulfedora.xml import FoxmlDigitalObject
from eulfedora.syncutil import estimate_object_size, ArchiveExport, \
humanize_file_size, sync_object
def repo_copy():
    """Command-line entry point: copy objects between Fedora repositories.

    Parses the command line, loads the repository config file, and then
    copies each requested object from the source to the destination.

    The ``source`` and ``dest`` arguments are each either the name of a
    section in the config file (providing ``fedora_root``, ``fedora_user``
    and ``fedora_password``) or, when no such section exists, the path of
    an existing directory, which is treated as an "airlock":

    * airlock source: every ``*.xml`` file in the directory is ingested
      into the destination repository;
    * airlock destination: each object is exported from the source
      repository and written to ``<pid>.xml`` in the directory;
    * otherwise objects are copied with
      :func:`eulfedora.syncutil.sync_object`.

    Problems with the configuration or arguments are reported on stdout
    and end the run early.  Errors on an individual object are reported
    on stdout and processing continues with the next object.

    :returns: None
    """
    parser = argparse.ArgumentParser()

    # config file options
    cfg_args = parser.add_argument_group('Config file options')
    cfg_args.add_argument('--config', '-c',
                          default='$HOME/.repocpcfg',
                          help='Load the specified config file (default: %(default)s)')
    cfg_args.add_argument('source',
                          help='Source repository for content to be copied')
    cfg_args.add_argument('dest',
                          help='Destination repository for content to be copied')

    # list of pids
    parser.add_argument(
        'pids', metavar='PID', nargs='*',
        help='list of pids to copy')
    parser.add_argument(
        '--file', '-f', required=False,
        help='Input file with a list of pids, one pid per line')
    parser.add_argument(
        '--progress', '-p',
        help='Show progress (only available for archive exports)',
        action='store_true', default=False)
    parser.add_argument(
        '--requires-auth',
        help='Datastream content urls require authentication',
        action='store_true', default=False)
    parser.add_argument(
        '--export-format', '-e',
        choices=['migrate', 'archive', 'archive-xml'], default='migrate',
        help='Fedora export format to use. Use archive if migrate exports ' +
             'fail with checksum errors or if content URLs are not accessible' +
             ' to the destination server. (default: %(default)s)')
    parser.add_argument(
        '--omit-checksums', default=False, action='store_true',
        help='Omit checksums from datastreams (default: %(default)s)',)
    # --archive and --archive-xml are shortcuts that store into the same
    # destination as --export-format
    parser.add_argument(
        '--archive', '-a', action='store_const',
        const='archive', dest='export_format',
        help='Use archive export format (equivalent to --export-format archive)')
    parser.add_argument(
        '--archive-xml', action='store_const',
        const='archive-xml', dest='export_format',
        help='Use archival export for xml datastreams only; use fedora ' +
             'datastream dissemination urls for other content. Can be used ' +
             'with --requires-auth. ' +
             '(Equivalent to --export-format archive-xml)')
    parser.add_argument(
        '--verify', action='store_true', default=False,
        help='Verify checksums as datastreams are decoded (default: %(default)s)')
    parser.add_argument(
        '--verbosity', '-v', default='WARN',
        choices=['ERROR', 'WARN', 'INFO', 'DEBUG'],
        help='Output verbosity (default: %(default)s)')

    args = parser.parse_args()

    cfg = configparser.ConfigParser()
    # expanduser resolves the home directory even when the HOME
    # environment variable is not set (os.environ['HOME'] would raise)
    configfile_path = args.config.replace('$HOME', os.path.expanduser('~'))
    try:
        with open(configfile_path) as cfgfile:
            # readfp was removed in Python 3.12; read_file does not
            # exist on the Python 2 ConfigParser
            if hasattr(cfg, 'read_file'):
                cfg.read_file(cfgfile)
            else:
                cfg.readfp(cfgfile)
    except (IOError, OSError) as err:
        print('Unable to read config file %s: %s' % (configfile_path, err))
        return

    if is_airlock(cfg, args.source):
        src_repo = Airlock(args.source)
    elif not cfg.has_section(args.source):
        print('Source repository %s is not configured' % args.source)
        return
    else:
        src_repo = Repository(cfg.get(args.source, 'fedora_root'),
                              cfg.get(args.source, 'fedora_user'),
                              cfg.get(args.source, 'fedora_password'))

    if is_airlock(cfg, args.dest):
        dest_repo = Airlock(args.dest)
    elif not cfg.has_section(args.dest):
        print('Destination repository %s is not configured' % args.dest)
        return
    else:
        dest_repo = Repository(cfg.get(args.dest, 'fedora_root'),
                               cfg.get(args.dest, 'fedora_user'),
                               cfg.get(args.dest, 'fedora_password'))

    # an airlock only holds a path and cannot ingest content, so a
    # directory-to-directory copy would fail on every file; stop early
    if isinstance(src_repo, Airlock) and isinstance(dest_repo, Airlock):
        print('Copying from one airlock directory to another is not supported')
        return

    # special case
    if isinstance(src_repo, Airlock):
        # if source is an "airlock", then the "pids" to sync are any
        # xml files in the directory
        pids = glob.glob(os.path.join(src_repo.path, '*.xml'))
    elif args.pids:
        pids = args.pids
    elif args.file:
        with open(args.file) as pidlistfile:
            # allow whitespace on front or end of pid, for convenience
            stripped = (p.strip() for p in pidlistfile.read().splitlines())
            # ignore blank lines rather than treating them as pids
            pids = [p for p in stripped if p]
    else:
        print('Specify either one or more pids or a file with a list of pids')
        parser.print_help()
        return

    allow_overwrite = cfg.has_option(args.dest, 'allow_overwrite') and \
        cfg.getboolean(args.dest, 'allow_overwrite')

    # configure logging based on verbosity level requested
    logging.config.dictConfig(get_logging_config(args.verbosity))

    for pid in pids:
        try:
            # if source is an airlock, "pid" is actually the path of an
            # export file to be ingested
            if isinstance(src_repo, Airlock):
                with open(pid, 'rb') as export:
                    result = dest_repo.ingest(export)
                if result:
                    print('%s copied' % result)
                else:
                    # false means not copied but no error; currently means
                    # skipped because object exists and overwrite not allowed
                    print('%s skipped' % pid)
                continue

            src_obj = src_repo.get_object(pid)

            # special case: if destination is an airlock
            # save the fedora export to a file in that airlock path
            if isinstance(dest_repo, Airlock):
                export_filename = os.path.join(dest_repo.path,
                                               '%s.xml' % src_obj.pid)
                with open(export_filename, 'wb') as export:
                    response = src_obj.api.export(
                        src_obj, context=args.export_format, stream=True)
                    # write the export to disk in 4MB chunks
                    for chunk in response.iter_content(4096*1024):
                        export.write(chunk)
                print('Exported %s to %s' % (src_obj.pid, export_filename))
            else:
                result = sync_object(
                    src_obj, dest_repo, export_context=args.export_format,
                    overwrite=allow_overwrite, show_progress=args.progress,
                    requires_auth=args.requires_auth,
                    omit_checksums=args.omit_checksums,
                    verify=args.verify)
                if result:
                    print('%s copied' % result)
                else:
                    # false means not copied but no error; currently means
                    # skipped because object exists and overwrite not allowed
                    print('%s skipped' % pid)

        except ChecksumMismatch:
            print('ChecksumMismatch on %s' % pid)
        except RequestFailed as err:
            err_type = 'Error'
            if isinstance(err, PermissionDenied):
                err_type = 'Permission denied'
            print('%s importing %s to %s: %s' %
                  (err_type, pid, args.dest, err))
        except Exception as err:
            # top-level boundary: report and move on to the next object
            print('Error copying %s: %s' % (pid, err))
def get_logging_config(level):
    """Build a dictionary suitable for ``logging.config.dictConfig``.

    The configuration defines a single ``console`` stream handler that
    prints the bare log message, and attaches it to the
    ``eulfedora.syncutil`` logger without propagation.

    :param level: log level applied to the ``eulfedora.syncutil`` logger
    :returns: logging configuration dictionary (schema version 1)
    """
    message_only = {'format': '%(message)s'}
    console_handler = {
        'level': 'DEBUG',
        'class': 'logging.StreamHandler',
        'formatter': 'simple',
    }
    syncutil_logger = {
        'handlers': ['console'],
        'level': level,
        'propagate': False,
    }
    return dict(
        version=1,
        disable_existing_loggers=True,
        formatters={'simple': message_only},
        handlers={'console': console_handler},
        loggers={'eulfedora.syncutil': syncutil_logger},
    )
def is_airlock(cfg, name):
    """Report whether ``name`` should be treated as an airlock directory.

    A name counts as an airlock only when it is not a section in the
    config file and it is the path of an existing directory.

    :param cfg: loaded config parser, checked with ``has_section``
    :param name: repository name or directory path from the command line
    :returns: True if ``name`` is an airlock, False otherwise
    """
    # a configured section always wins over a directory of the same name
    return not cfg.has_section(name) and os.path.isdir(name)
class Airlock(object):
    """Stand-in for a repository that is really a directory on disk.

    Holds only the directory path; used where a repository object would
    otherwise be, so callers can tell the two apart with ``isinstance``.
    """

    def __init__(self, path):
        """Remember the airlock directory.

        :param path: path of the directory used as the airlock
        """
        # directory path, stored exactly as given
        self.path = path
# Run the copy only when this file is executed as a script.
if __name__ == "__main__":
    repo_copy()
    # To profile a run, swap the call above for the standard-library
    # profiler:  import profile; profile.run('repo_copy()')