CenterForOpenScience/scrapi

View on GitHub
scrapi/base/schemas.py

Summary

Maintainability
A
1 hr
Test Coverage
from __future__ import unicode_literals

from .helpers import (
    compose,
    non_string,
    format_tags,
    single_result,
    language_codes,
    element_to_dict,
    oai_process_uris,
    build_properties,
    datetime_formatter,
    doe_process_contributors,
    oai_process_contributors,
    dif_process_contributors
)


def _strip(text):
    """Return the result of calling ``strip()`` on *text*."""
    return text.strip()


def _split_on_semicolons(names):
    """Return the pieces of *names* obtained by splitting on ``';'``."""
    return names.split(';')


def _doi_to_url(doi):
    """Return *doi* prefixed with the ``http://dx.doi.org/`` address."""
    # NOTE(review): if single_result hands back an empty string when the
    # record carries no DOI, this yields a bare resolver URL -- confirm
    # against the helpers module.
    return 'http://dx.doi.org/' + doi


# Field map for DOE records.  Each leaf is an (xpath, transform) tuple; the
# dict/list nesting mirrors the shape of the output document.
# NOTE(review): compose() appears to apply its arguments right-to-left
# (single_result first) -- confirm against the helpers module.
DOESCHEMA = {
    "description": (
        '//dc:description/node()',
        compose(_strip, single_result),
    ),
    "contributors": (
        '//dc:creator/node()',
        compose(doe_process_contributors, _split_on_semicolons, single_result),
    ),
    "title": (
        '//dc:title/node()',
        compose(_strip, single_result),
    ),
    "providerUpdatedDateTime": (
        '//dc:dateEntry/node()',
        compose(datetime_formatter, single_result),
    ),
    "uris": {
        "canonicalUri": (
            '//dcq:identifier-citation/node()',
            compose(_strip, single_result),
        ),
        "objectUris": [
            ('//dc:doi/node()', compose(_doi_to_url, single_result)),
        ],
    },
    "languages": ("//dc:language/text()", language_codes),
    "publisher": {
        "name": ("//dcq:publisher/node()", single_result),
    },
    "sponsorships": [
        {
            "sponsor": {
                "sponsorName": ("//dcq:publisherSponsor/node()", single_result),
            },
        },
    ],
    # Remaining fields are handed to build_properties as (name, xpath) pairs.
    "otherProperties": build_properties(
        ('coverage', '//dc:coverage/node()'),
        ('date', '//dc:date/node()'),
        ('format', '//dc:format/node()'),
        ('identifier', '//dc:identifier/node()'),
        ('identifierDOEcontract', '//dcq:identifierDOEcontract/node()'),
        ('identifierOther', '//dc:identifierOther/node()'),
        ('identifier-purl', '//dc:identifier-purl/node()'),
        ('identifierReport', '//dc:identifierReport/node()'),
        ('publisherAvailability', '//dcq:publisherAvailability/node()'),
        ('publisherCountry', '//dcq:publisherCountry/node()'),
        ('publisherResearch', '//dcq:publisherResearch/node()'),
        ('relation', '//dc:relation/node()'),
        ('rights', '//dc:rights/node()'),
        ('type', '//dc:type/node()'),
        ('typeQualifier', '//dc:typeQualifier/node()')
    ),
}

# Transform used for the OAI header datestamp, bound to a name so the schema
# body below stays one-entry-per-field.
_oai_datestamp = compose(datetime_formatter, single_result)

# Field map for OAI Dublin Core records.  Each leaf tuple lists one or more
# XPath expressions followed by the callable that receives their results.
OAISCHEMA = {
    "contributors": (
        '//dc:creator/node()',
        '//dc:contributor/node()',
        oai_process_contributors,
    ),
    "uris": (
        '//dc:doi/node()',
        '//dc:identifier/node()',
        oai_process_uris,
    ),
    'providerUpdatedDateTime': (
        '//ns0:header/ns0:datestamp/node()',
        _oai_datestamp,
    ),
    'title': (
        '//dc:title/node()',
        single_result,
    ),
    'description': (
        '//dc:description/node()',
        single_result,
    ),
    'subjects': (
        '//dc:subject/node()',
        format_tags,
    ),
    'publisher': {
        'name': (
            '//dc:publisher/node()',
            single_result,
        ),
    },
    'languages': (
        '//dc:language/text()',
        language_codes,
    ),
}


def _keep_non_strings(nodes):
    """Return *nodes* filtered through ``non_string``."""
    return filter(non_string, nodes)


def _as_dicts(elements):
    """Return ``element_to_dict`` mapped over *elements*."""
    return map(element_to_dict, elements)


# Field map for DIF records.  Each leaf tuple lists one or more XPath
# expressions followed by the callable that receives their results.
DIFSCHEMA = {
    "abstract": (
        '//dif:Summary/dif:Abstract/node()',
        single_result,
    ),
    "uris": (
        '//dif:URL/node()',
        oai_process_uris,
    ),
    "title": (
        '//dif:Entry_Title/node()',
        single_result,
    ),
    'providerUpdatedDateTime': (
        '//OAI-PMH:header/OAI-PMH:datestamp/node()',
        compose(datetime_formatter, single_result),
    ),
    "contributors": (
        '//dif:Personnel/dif:First_Name/node()',
        '//dif:Personnel/dif:Last_Name/node()',
        dif_process_contributors,
    ),
    "otherProperties": build_properties(
        ('metadataName', '//dif:Metadata_Name/node()'),
        ('metadataVersion', '//dif:Metadata_Version/node()'),
        ('lastDIFRevisionDate', '//dif:Last_DIF_Revision_Date/node()'),
        # The two structured properties share the same steps: filter the
        # nodes through non_string, map element_to_dict over what is left,
        # and materialise the result as a list.
        (
            'dataCenter',
            (
                '//dif:Data_Center/node()',
                compose(list, _as_dicts, _keep_non_strings),
            ),
        ),
        (
            'relatedUrl',
            (
                '//dif:Related_URL/node()',
                compose(list, _as_dicts, _keep_non_strings),
            ),
        ),
    ),
}