emory-libraries/eulxml

View on GitHub
eulxml/catalog.py

Summary

Maintainability
A
1 hr
Test Coverage
# file eulxml/catalog.py
#
#   Copyright 2016 Emory University Libraries
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

'''
Logic for downloading local copies of schemas and generating an
`XML catalog <http://lxml.de/resolvers.html#xml-catalogs>`_ for use in
resolving schemas locally instead of downloading them every time validation
is required.

Catalog generation is available via the setup.py custom command xmlcatalog,
and a generated catalog and corresponding schema files should be included
in packaged releases of eulxml.

For more information about setting up and testing XML catalogs, see the
`libxml2 documentation <http://xmlsoft.org/catalog.html>`_.
'''

import os
import logging
from datetime import date
from lxml import etree
import sys

from eulxml import xmlmap, __version__, XMLCATALOG_DIR, XMLCATALOG_FILE

# requests is an optional dependency, handle gracefully if not present
try:
    import requests
except ImportError:
    requests = None


# module-level logger, named after this module
logger = logging.getLogger(__name__)

# notice written to stderr when the optional requests dependency is missing
req_requests_msg = (
    'Please install requests to download schemas '
    '(pip install requests)\n'
)


#: default list of schema URIs; :meth:`generate_catalog` downloads these
#: when it is called without an explicit ``xsd_schemas`` argument
XSD_SCHEMAS = [
    'http://www.loc.gov/standards/mods/mods.xsd',
    'http://www.loc.gov/standards/mods/v3/mods-3-4.xsd',
    'http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
    'http://www.loc.gov/standards/xlink/xlink.xsd',
    'http://www.loc.gov/standards/premis/premis.xsd',
    'http://www.loc.gov/standards/premis/v2/premis-v2-1.xsd',
    'http://www.tei-c.org/release/xml/tei/custom/schema/xsd/tei_all.xsd',
    'http://www.history.ncdcr.gov/SHRAB/ar/emailpreservation/mail-account/mail-account.xsd',
    'http://www.loc.gov/ead/ead.xsd'
]
# NOTE(review): disabled list entry kept for reference; it looks like an
# alternate location for the mail-account schema -- confirm before
# re-enabling or removing it.
# , 'http://www.archives.ncdcr.gov/mail-account.xsd'


class Uri(xmlmap.XmlObject):
    """:class:`xmlmap.XmlObject` class for Catalog URIs

    Represents a single ``uri`` entry in an XML catalog, pairing a schema
    identifier (``name``) with the location to load it from (``uri``).
    """
    # root element name and namespace for a catalog uri entry
    ROOT_NAME = 'uri'
    ROOT_NS = "urn:oasis:names:tc:entity:xmlns:xml:catalog"
    #: name, i.e. schema URI
    name = xmlmap.StringField('@name')
    #: uri, i.e. path to load the schema locally
    uri = xmlmap.StringField('@uri')


class Catalog(xmlmap.XmlObject):
    """:class:`xmlmap.XmlObject` class for generating XML Catalogs

    Holds a list of :class:`Uri` entries under a ``catalog`` root element.
    """
    # root element name and namespace for the catalog document
    ROOT_NAME = 'catalog'
    ROOT_NS = "urn:oasis:names:tc:entity:xmlns:xml:catalog"
    # the ``c`` prefix is bound to the catalog namespace for the xpath below
    ROOT_NAMESPACES = {'c': ROOT_NS}
    #: list of uris, as instance of :class:`Uri`
    uri_list = xmlmap.NodeListField('c:uri', Uri)


def download_schema(uri, path, comment=None):
    """Download a schema from a specified URI and save it locally.

    :param uri: url where the schema should be downloaded
    :param path: local file path where the schema should be saved
    :param comment: optional comment; if specified, will be added to
        the downloaded schema
    :returns: true on success, false if there was an error and the
        schema failed to download (including when requests is not
        installed)
    """
    # if requests isn't available, warn and bail out; return an explicit
    # False so the result matches the documented true/false contract
    if requests is None:
        sys.stderr.write(req_requests_msg)
        return False

    # short-hand name of the schema, based on uri
    schema = os.path.basename(uri)
    try:
        req = requests.get(uri, stream=True)
        try:
            req.raise_for_status()
            with open(path, 'wb') as schema_download:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        schema_download.write(chunk)
        finally:
            # a streamed response keeps its connection checked out until
            # the body is fully read or the response is closed; close it
            # explicitly so the error path does not leak the connection
            req.close()

    except requests.exceptions.HTTPError as err:
        # logger.warn is a deprecated alias (removed in Python 3.13)
        logger.warning('Failed to download schema %s (error codes %s)',
                       schema, err.response.status_code)
        return False

    # if a comment is specified, add it to the locally saved schema
    if comment is not None:
        tree = etree.parse(path)
        tree.getroot().append(etree.Comment(comment))
        with open(path, 'wb') as xml_catalog:
            xml_catalog.write(etree.tostring(tree, pretty_print=True,
                xml_declaration=True, encoding="UTF-8"))

    # log every successful download, whether or not a comment was added
    logger.debug('Downloaded schema %s', schema)
    return True


def generate_catalog(xsd_schemas=None, xmlcatalog_dir=None, xmlcatalog_file=None):
    """Generate an XML catalog for use in resolving schemas

    Creates the XML Catalog directory (including any missing parent
    directories) if it doesn't already exist.
    Uses :meth:`download_schema` to save local copies of schemas,
    adding a comment indicating the date downloaded by eulxml.

    Generates a new catalog.xml file, with entries for all schemas
    that downloaded successfully.  If no schemas downloaded, the catalog
    file is not written.

    .. Note::

        Currently this method overwrites any existing schema and catalog
        files, without checking if they are present or need to be
        updated.

    :param xsd_schemas: list of schema URIs to download; defaults to
        :attr:`XSD_SCHEMAS`
    :param xmlcatalog_dir: directory where downloaded schemas are saved;
        defaults to ``XMLCATALOG_DIR``
    :param xmlcatalog_file: path of the catalog file to write; defaults
        to ``XMLCATALOG_FILE``
    :returns: the :class:`Catalog` that was built (possibly with no
        entries), or None if requests is not available
    """
    # if requests isn't available, warn and bail out
    if requests is None:
        sys.stderr.write(req_requests_msg)
        return

    logger.debug("Generating a new XML catalog")
    if xsd_schemas is None:
        xsd_schemas = XSD_SCHEMAS

    if xmlcatalog_file is None:
        xmlcatalog_file = XMLCATALOG_FILE

    if xmlcatalog_dir is None:
        xmlcatalog_dir = XMLCATALOG_DIR
    # if the catalog dir doesn't exist, create it; makedirs also creates
    # missing parent directories, where mkdir would raise an error
    if not os.path.isdir(xmlcatalog_dir):
        os.makedirs(xmlcatalog_dir)

    # new xml catalog to be populated with saved schemas
    catalog = Catalog()

    # comment string to be added to locally-saved schemas
    comment = 'Downloaded by eulxml %s on %s' % \
        (__version__, date.today().isoformat())

    for schema_uri in xsd_schemas:
        filename = os.path.basename(schema_uri)
        schema_path = os.path.join(xmlcatalog_dir, filename)
        saved = download_schema(schema_uri, schema_path, comment)
        if saved:
            # if download succeeded, add to our catalog.
            # - name is the schema identifier (uri)
            # - uri is the local path to load
            # NOTE: using path relative to catalog file; this presumably
            # expects the catalog file to live in xmlcatalog_dir
            catalog.uri_list.append(Uri(name=schema_uri, uri=filename))

    # if we have any uris in our catalog, write it out
    if catalog.uri_list:
        with open(xmlcatalog_file, 'wb') as xml_catalog:
            catalog.serializeDocument(xml_catalog, pretty=True)
    return catalog