CenterForOpenScience/scrapi

scrapi/base/__init__.py

# Classes for scrAPI Harvesters
from __future__ import unicode_literals

import abc
import json
import logging
from datetime import timedelta, date

import six
from furl import furl
from lxml import etree

from scrapi import registry
from scrapi import settings
from scrapi.base.schemas import OAISCHEMA
from scrapi.linter.document import RawDocument, NormalizedDocument
from scrapi.base.transformer import XMLTransformer, JSONTransformer
from scrapi.base.helpers import (
    updated_schema,
    build_properties,
    oai_get_records_and_token,
    compose,
    datetime_formatter,
    null_on_error,
    coerce_to_list
)

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)

etree.set_default_parser(etree.XMLParser(recover=True))


class HarvesterMeta(abc.ABCMeta):
    def __init__(cls, name, bases, dct):
        super(HarvesterMeta, cls).__init__(name, bases, dct)
        if len(cls.__abstractmethods__) == 0 and cls.short_name not in settings.disabled:
            registry[cls.short_name] = cls()
        else:
            logger.info('Class {} not added to registry'.format(cls.__name__))


@six.add_metaclass(HarvesterMeta)
class BaseHarvester(object):
    """ This is a base class that all harvesters should inheret from

    Defines the copy to unicode method, which is useful for getting standard
    unicode out of xml results.
    """

    @abc.abstractproperty
    def short_name(self):
        raise NotImplementedError

    @abc.abstractproperty
    def long_name(self):
        raise NotImplementedError

    @abc.abstractproperty
    def url(self):
        raise NotImplementedError

    @abc.abstractproperty
    def file_format(self):
        raise NotImplementedError

    @abc.abstractmethod
    def harvest(self, start_date=None, end_date=None):
        raise NotImplementedError

    @abc.abstractmethod
    def normalize(self, raw_doc):
        raise NotImplementedError

    @property
    def run_at(self):
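        """Default schedule for this harvester's periodic runs.

        The keys mirror crontab-style scheduling arguments (hour, minute,
        day_of_week) and are assumed to be consumed by scrapi's scheduler.
        """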
        return {
            'hour': 22,
            'minute': 59,
            'day_of_week': 'mon-sun',
        }


class JSONHarvester(BaseHarvester, JSONTransformer):
    file_format = 'json'

    def normalize(self, raw_doc):
        transformed = self.transform(json.loads(raw_doc['doc']), fail=settings.RAISE_IN_TRANSFORMER)
        transformed['shareProperties'] = {
            'source': self.short_name,
            'docID': raw_doc['docID'],
            'filetype': raw_doc['filetype']
        }
        return NormalizedDocument(transformed, clean=True)


class XMLHarvester(BaseHarvester, XMLTransformer):
    file_format = 'xml'

    def normalize(self, raw_doc):
        transformed = self.transform(etree.XML(raw_doc['doc']), fail=settings.RAISE_IN_TRANSFORMER)
        transformed['shareProperties'] = {
            'source': self.short_name,
            'docID': raw_doc['docID'],
            'filetype': raw_doc['filetype']
        }
        return NormalizedDocument(transformed, clean=True)
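
# Typical harvest/normalize flow (illustrative sketch; 'example' stands in for
# a registered harvester's short_name):
#
#     harvester = registry['example']
#     raw_docs = harvester.harvest()                     # list of RawDocument
#     normalized = [harvester.normalize(doc) for doc in raw_docs]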


class OAIHarvester(XMLHarvester):
    """ Create a harvester with a oai_dc namespace, that will harvest
    documents within a certain date range

    Contains functions for harvesting from an OAI provider, normalizing,
    and outputting in a way that scrapi can understand, in the most
    generic terms possible.

    For more information, see the OAI PMH specification:
    http://www.openarchives.org/OAI/openarchivesprotocol.html
    """
    record_encoding = None
    DEFAULT_ENCODING = 'UTF-8'
    RESUMPTION = '&resumptionToken='
    RECORDS_URL = '?verb=ListRecords'
    META_PREFIX_DATE = '&metadataPrefix=oai_dc&from={}&until={}'

    # Override these variables as required
    namespaces = {
        'dc': 'http://purl.org/dc/elements/1.1/',
        'ns0': 'http://www.openarchives.org/OAI/2.0/',
        'oai_dc': 'http://www.openarchives.org/OAI/2.0/',
    }

    timeout = 0.5
    approved_sets = None
    timezone_granularity = False
    property_list = ['date', 'type']
    force_request_update = False
    verify = True

    @property
    def schema(self):
        return self._schema

    @property
    def _schema(self):
        return updated_schema(OAISCHEMA, self.formatted_properties)

    @property
    def formatted_properties(self):
        return {
            'otherProperties': build_properties(*list(map(self.format_property, self.property_list)))
        }

    def format_property(self, property_name):
        if property_name == 'date':
            fn = compose(lambda x: list(map(null_on_error(datetime_formatter), x)), coerce_to_list, self.resolve_property)
        else:
            fn = self.resolve_property
        return (property_name, (
            '//dc:{}/node()'.format(property_name),
            '//ns0:{}/node()'.format(property_name),
            fn)
        )

    def resolve_property(self, dc, ns0):
        ret = dc + ns0
        return ret[0] if len(ret) == 1 else ret
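
    # For example (illustrative), format_property('type') returns
    #     ('type', ('//dc:type/node()', '//ns0:type/node()', self.resolve_property))
    # build_properties folds these pairs into the otherProperties schema; the
    # two XPaths are evaluated against each record and resolve_property merges
    # the dc: and ns0: matches, unwrapping single-element results.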

    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        if self.timezone_granularity:
            start_date += 'T00:00:00Z'
            end_date += 'T00:00:00Z'

        url = furl(self.base_url)
        url.args['verb'] = 'ListRecords'
        url.args['metadataPrefix'] = 'oai_dc'
        url.args['from'] = start_date
        url.args['until'] = end_date
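        # The assembled request is roughly:
        #   <base_url>?verb=ListRecords&metadataPrefix=oai_dc&from=<start_date>&until=<end_date>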

        records = self.get_records(url.url, start_date, end_date)

        return [
            RawDocument({
                'doc': etree.tostring(record, encoding=self.record_encoding),
                'source': self.short_name,
                'docID': record.xpath('ns0:header/ns0:identifier', namespaces=self.namespaces)[0].text,
                'filetype': 'xml'
            }) for record in records
        ]

    def get_records(self, url, start_date, end_date=None):
        url = furl(url)
        all_records, token = oai_get_records_and_token(url.url, self.timeout, self.force_request_update, self.namespaces, self.verify)

        while token:
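            # A non-empty resumptionToken means more pages remain; per OAI-PMH,
            # the follow-up request carries only the token, so from/until/
            # metadataPrefix are removed before re-requesting.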
            url.remove('from')
            url.remove('until')
            url.remove('metadataPrefix')
            url.args['resumptionToken'] = token[0]
            records, token = oai_get_records_and_token(url.url, self.timeout, self.force_request_update, self.namespaces, self.verify)
            all_records += records

        return all_records

    def normalize(self, raw_doc):
        str_result = raw_doc.get('doc')
        parser = etree.XMLParser(recover=True, encoding=self.DEFAULT_ENCODING)
        result = etree.XML(str_result, parser=parser)

        if self.approved_sets:
            set_spec = result.xpath(
                'ns0:header/ns0:setSpec/node()',
                namespaces=self.namespaces
            )
            # check if there's an intersection between the approved sets and the
            # setSpec list provided in the record. If there isn't, don't normalize.
            if not {x.replace('publication:', '') for x in set_spec}.intersection(self.approved_sets):
                logger.info('Series {} not in approved list'.format(set_spec))
                return None

        status = result.xpath('ns0:header/@status', namespaces=self.namespaces)
        if status and status[0] == 'deleted':
            logger.info('Deleted record, not normalizing {}'.format(raw_doc['docID']))
            return None

        return super(OAIHarvester, self).normalize(raw_doc)
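

# Illustrative sketch of a provider-specific OAI harvester (hypothetical names;
# real harvesters live under scrapi/harvesters/). With every abstract member
# filled in by class attributes, HarvesterMeta registers it automatically and
# the inherited harvest()/normalize() do the rest:
#
#     class ExampleOAIHarvester(OAIHarvester):
#         short_name = 'example'
#         long_name = 'Example OAI Repository'
#         url = 'http://repository.example.org'
#         base_url = 'http://repository.example.org/oai/request'
#         property_list = ['date', 'identifier', 'type']
#         approved_sets = ['articles', 'theses']
#         timezone_granularity = True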