eulfedora/models.py
# file eulfedora/models.py
#
# Copyright 2010,2011 Emory University Libraries
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import unicode_literals
import hashlib
import logging
import requests
from rdflib import URIRef, Graph as RdfGraph, Literal
from lxml import etree
from lxml.builder import ElementMaker
import six
from eulxml import xmlmap
from eulfedora.api import ResourceIndex
from eulfedora.rdfns import model as modelns, relsext as relsextns, fedora_rels
from eulfedora.util import parse_xml_object, parse_rdf, RequestFailed, \
datetime_to_fedoratime, force_bytes, force_text
from eulfedora.xml import ObjectDatastreams, ObjectProfile, DatastreamProfile, \
NewPids, ObjectHistory, ObjectMethods, DsCompositeModel, FoxmlDigitalObject, \
DatastreamHistory
from eulxml.xmlmap.dc import DublinCore
logger = logging.getLogger(__name__)
class DatastreamObject(object):
    """Object to ease accessing and updating a datastream belonging to a Fedora
    object.  Handles datastream content as well as datastream profile
    information.  Content and datastream info are only pulled from Fedora when
    the content and info fields are accessed.

    Intended to be used with :class:`DigitalObject` and initialized
    via :class:`Datastream`.

    Initialization parameters:
        :param obj: the :class:`DigitalObject` that this datastream belongs to.
        :param id: datastream id
        :param label: default datastream label
        :param mimetype: default datastream mimetype
        :param versionable: default configuration for datastream versioning
        :param state: default configuration for datastream state
            (default: A [active])
        :param format: default configuration for datastream format URI
        :param control_group: default configuration for datastream control group
            (default: M [managed])
        :param checksum: default configuration for datastream checksum
        :param checksum_type: default configuration for datastream checksum type
            (default: MD5)
        :param as_of_date: load a historical version of this datastream as of
            a particular date time.  (Note that historical datastream versions
            are for access only, and cannot be saved.)
    """

    default_mimetype = "application/octet-stream"

    ds_location = None
    '''Datastream content location: set this attribute to a URI that
    Fedora can resolve (e.g., http:// or file://) in order to add or
    update datastream content from a known, accessible location,
    rather than posting via :attr:`content`.  If :attr:`ds_location`
    is set, it takes precedence over :attr:`content`.'''

    as_of_date = None
    'optional datetime for accessing a historical datastream version'

    def __init__(self, obj, id, label, mimetype=None, versionable=False,
                 state='A', format=None, control_group='M', checksum=None,
                 checksum_type="MD5", as_of_date=None):
        self.obj = obj
        self.id = id
        self.as_of_date = as_of_date

        if mimetype is None:
            mimetype = self.default_mimetype

        # configured defaults, used to bootstrap the profile of a datastream
        # that does not exist in Fedora yet
        self.defaults = {
            'label': label,
            'mimetype': mimetype,
            'versionable': versionable,
            'state': state,
            'format': format,
            'control_group': control_group,
            'checksum': checksum,
            'checksumType': checksum_type,
        }
        self._info = None
        self._content = None
        # for unversioned datastreams, store a copy of data pulled
        # from fedora in case undo save is required
        self._info_backup = None
        self._content_backup = None

        self.info_modified = False
        self.digest = None
        self.checksum_modified = False

        # Flag to indicate whether this datastream exists in fedora.
        # Assume false until/unless we can confirm otherwise.
        self.exists = False

        if not self.obj._create:
            # If this datastream belongs to an existing object, check
            # to see if the datastream actually exists.
            if id in self.obj.ds_list:
                self.exists = True

    @property
    def info(self):
        """Datastream profile; loaded (or bootstrapped from the configured
        defaults, for a datastream that does not exist yet) on first access
        and cached afterwards."""
        if self._info is None:
            if not self.exists:
                self._info = self._bootstrap_info()
            else:
                self._info = self.obj.getDatastreamProfile(
                    self.id, date=self.as_of_date)
        return self._info

    def _bootstrap_info(self):
        # build a datastream profile from the configured defaults
        profile = DatastreamProfile()
        profile.state = self.defaults['state']
        profile.mimetype = self.defaults['mimetype']
        profile.control_group = self.defaults['control_group']
        profile.versionable = self.defaults['versionable']
        profile.checksum_type = self.defaults['checksumType']
        # label and format are optional: only set them when configured
        if self.defaults.get('label', None):
            profile.label = self.defaults['label']
        if self.defaults.get('format', None):
            profile.format = self.defaults['format']
        return profile

    def _get_content(self):
        # Pull datastream content from Fedora and return it as a string, but
        # only when accessed. Note that this will load the entire datastream
        # contents into memory as a string. This is probably a bad idea for
        # large files. Thus:
        # TODO: Once we have an eulfedora.api call that returns
        # iterable chunks of datastream content, we need to either update
        # this property or add another to expose that iterable chunk
        # functionality at this layer.
        if self._content is None:
            if not self.exists:
                self._content = self._bootstrap_content()
            else:
                r = self.obj.api.getDatastreamDissemination(
                    self.obj.pid, self.id, asOfDateTime=self.as_of_date)
                self._content = self._convert_content(r.content, r.url)
                # calculate and store a digest of the current datastream
                # text content, so later changes can be detected
                self.digest = self._content_digest()
        return self._content

    def _set_content(self, val):
        # if datastream is not versionable, grab contents before updating
        if not self.versionable:
            self._get_content()
        self._content = val

    content = property(
        _get_content, _set_content, None,
        '''contents of the datastream; for existing datastreams,
        content is only pulled from Fedora when first requested, and
        cached after first access; can be used to set or update
        datastream contents.

        For an alternate method to set datastream content, see
        :attr:`ds_location`.''')

    def _convert_content(self, data, url):
        # convert output of getDatastreamDissemination into the expected
        # content type; the base implementation returns the data unchanged
        return data

    def _bootstrap_content(self):
        # initial content for a datastream that does not exist yet
        return None

    def _content_as_node(self):
        # used for serializing inline xml datastreams at ingest
        return None

    def _raw_content(self):
        # return datastream content in the appropriate format to be saved to
        # Fedora (normally, either a string or a file); used for serializing
        # managed datastreams for ingest and save and generating a hash
        # NOTE: if you override so this does not return a string, you may
        # also need to override _content_digest and/or isModified
        if self.content is None:
            return None
        if hasattr(self.content, 'serialize'):
            return force_bytes(self.content.serialize())
        return force_bytes(self.content)

    def isModified(self):
        """Check if either the datastream content or profile fields have changed
        and should be saved to Fedora.

        :rtype: boolean
        """
        # NOTE: only check content digest if locally cached content is set
        # (content already pulled or new content set); otherwise this
        # results in pulling content down to checksum it !
        # bool() keeps the documented return type: the bare and/or chain
        # would otherwise hand back None or the content object itself.
        return bool(
            self.info_modified
            or (self._content and self._content_digest() != self.digest))

    def _content_digest(self):
        # generate a hash of the content so we can easily check if it has
        # changed and should be saved
        raw = self._raw_content()
        # handle case where datastream is empty or does not yet exist
        # (implicitly returns None)
        if raw is not None:
            return hashlib.sha1(force_bytes(raw)).hexdigest()

    ### access to datastream profile fields; tracks if changes are made for saving to Fedora

    def _get_label(self):
        return self.info.label

    def _set_label(self, val):
        self.info.label = val
        self.info_modified = True

    label = property(_get_label, _set_label, None, "datastream label")

    def _get_mimetype(self):
        return self.info.mimetype

    def _set_mimetype(self, val):
        self.info.mimetype = val
        self.info_modified = True

    mimetype = property(_get_mimetype, _set_mimetype, None,
                        "datastream mimetype")

    def _get_versionable(self):
        return self.info.versionable

    def _set_versionable(self, val):
        self.info.versionable = val
        self.info_modified = True

    versionable = property(
        _get_versionable, _set_versionable, None,
        "boolean; indicates if Fedora is configured to version the datastream")

    def _get_state(self):
        return self.info.state

    def _set_state(self, val):
        self.info.state = val
        self.info_modified = True

    state = property(_get_state, _set_state, None,
                     "datastream state (Active/Inactive/Deleted)")

    def _get_format(self):
        return self.info.format

    def _set_format(self, val):
        self.info.format = val
        self.info_modified = True

    # NOTE: the third positional argument of property() is the deleter, so
    # None must be passed explicitly for the doc string to land in `doc`.
    format = property(_get_format, _set_format, None,
                      "datastream format URI")

    def _get_checksum(self):
        return self.info.checksum

    def _set_checksum(self, val):
        self.info.checksum = val
        self.info_modified = True
        self.checksum_modified = True

    checksum = property(_get_checksum, _set_checksum, None,
                        "datastream checksum")

    def _get_checksumType(self):
        return self.info.checksum_type

    def _set_checksumType(self, val):
        self.info.checksum_type = val
        self.info_modified = True

    checksum_type = property(_get_checksumType, _set_checksumType, None,
                             "datastream checksumType")

    # read-only info properties

    @property
    def control_group(self):
        return self.info.control_group

    @property
    def created(self):
        return self.info.created

    @property
    def size(self):
        'Size of the datastream content'
        return self.info.size

    @property
    def modified(self):
        # FIXME: not actually available in datastreamProfile !!
        return self.info.modified

    def last_modified(self):
        # NOTE: last_modified may actually be the 'created' date for
        # the current version of the datastream.
        # FIXME: **preliminary** actual last-modified, since the above does not
        # actually work - should probably cache ds history...
        return self.history().versions[0].created  # fedora returns most recent first

    def history(self):
        '''Get history/version information for this datastream and
        return as an instance of
        :class:`~eulfedora.xml.DatastreamHistory`.'''
        r = self.obj.api.getDatastreamHistory(self.obj.pid, self.id,
                                              format='xml')
        return parse_xml_object(DatastreamHistory, r.content, r.url)

    def save(self, logmessage=None):
        """Save datastream content and any changed datastream profile
        information to Fedora.

        :param logmessage: optional log message passed on to Fedora
        :raises RuntimeError: if this is a historical datastream version
            (``as_of_date`` is set)
        :rtype: boolean for success
        """
        if self.as_of_date is not None:
            raise RuntimeError('Saving is not implemented for datastream versions')

        save_opts = {}
        if self.info_modified:
            if self.label:
                save_opts['dsLabel'] = self.label
            if self.mimetype:
                save_opts['mimeType'] = self.mimetype
            if self.versionable is not None:
                save_opts['versionable'] = self.versionable
            if self.state:
                save_opts['dsState'] = self.state
            if self.format:
                save_opts['formatURI'] = self.format
            # only send a checksum the caller explicitly set
            if self.checksum:
                if self.checksum_modified:
                    save_opts['checksum'] = self.checksum
            if self.checksum_type:
                save_opts['checksumType'] = self.checksum_type
            # FIXME: should be able to handle checksums

        # NOTE: as of Fedora 3.2, updating content without specifying mimetype fails (Fedora bug?)
        if 'mimeType' not in save_opts:
            # if datastreamProfile has not been pulled from fedora, use configured default mimetype
            if self._info is not None:
                save_opts['mimeType'] = self.mimetype
            else:
                save_opts['mimeType'] = self.defaults['mimetype']

        # if datastream location has been set, use that for content
        # otherwise, use local content (if any)
        if self.ds_location is not None:
            save_opts['dsLocation'] = self.ds_location
        else:
            save_opts['content'] = self._raw_content()

        if self.exists:
            # if not versionable, make a backup to back out changes if object save fails
            if not self.versionable:
                self._backup()
            # if this datastream already exists, use modifyDatastream API call
            r = self.obj.api.modifyDatastream(self.obj.pid, self.id,
                                              logMessage=logmessage,
                                              **save_opts)
            # expects 200 ok
            success = (r.status_code == requests.codes.ok)
        else:
            # if this datastream does not yet exist, add it
            r = self.obj.api.addDatastream(
                self.obj.pid, self.id,
                controlGroup=self.defaults['control_group'],
                logMessage=logmessage, **save_opts)
            # expects 201 created
            success = (r.status_code == requests.codes.created)
            # clean-up required for object info after adding a new datastream
            if success:
                # update exists flag - if add succeeded, the datastream exists now
                self.exists = True

        # if the datastream content is a file-like object, clear it out
        # (we don't want to attempt to save the current file contents again,
        # particularly since the file is not guaranteed to still be open)
        if 'content' in save_opts and hasattr(save_opts['content'], 'read'):
            self._content = None
            self._content_modified = False

        if success:
            # update modification indicators
            self.info_modified = False
            self.checksum_modified = False
            self.digest = self._content_digest()
            # clear out ds location
            self.ds_location = None

        return success  # msg ?

    def _backup(self):
        # snapshot the profile and content currently stored in Fedora so an
        # unversioned datastream can be restored by undo_last_save
        info = self.obj.getDatastreamProfile(self.id)
        self._info_backup = {'dsLabel': info.label,
                             'mimeType': info.mimetype,
                             'versionable': info.versionable,
                             'dsState': info.state,
                             'formatURI': info.format,
                             'checksumType': info.checksum_type,
                             'checksum': info.checksum}

        r = self.obj.api.getDatastreamDissemination(self.obj.pid, self.id)
        self._content_backup = r.content

    def undo_last_save(self, logMessage=None):
        """Undo the last change made to the datastream content and profile.

        For a versioned datastream, this will purge the most recent datastream
        version.  For an unversioned datastream, this will overwrite the last
        changes with a cached version of any content and/or info pulled from
        Fedora.

        :param logMessage: optional log message passed on to Fedora
        :rtype: boolean for success
        """
        # NOTE: currently not clearing any of the object caches and backups
        # of fedora content and datastream info, as it is unclear what (if anything)
        # should be cleared

        if self.versionable:
            # if this is a versioned datastream, get datastream history
            # and purge the most recent version
            last_save = self.history().versions[0].created  # fedora returns most recent first
            r = self.obj.api.purgeDatastream(self.obj.pid, self.id,
                                             datetime_to_fedoratime(last_save),
                                             logMessage=logMessage)
            return r.status_code == requests.codes.ok
        else:
            # for an unversioned datastream, update with any content and info
            # backups that were pulled from Fedora before any modifications were made
            args = {}
            if self._content_backup is not None:
                args['content'] = self._content_backup
            if self._info_backup is not None:
                args.update(self._info_backup)
            r = self.obj.api.modifyDatastream(self.obj.pid, self.id,
                                              logMessage=logMessage, **args)
            return r.status_code == requests.codes.ok

    def get_chunked_content(self, chunksize=4096):
        '''Generator that returns the datastream content in chunks, so
        larger datastreams can be used without reading the entire
        contents into memory.

        :param chunksize: size passed to the response's ``iter_content``
        '''
        # get the datastream dissemination, but return the actual http response
        r = self.obj.api.getDatastreamDissemination(
            self.obj.pid, self.id, stream=True, asOfDateTime=self.as_of_date)
        # read and yield the response in chunks
        for chunk in r.iter_content(chunksize):
            yield chunk

    def validate_checksum(self, date=None):
        '''Check if this datastream has a valid checksum in Fedora, by
        running the :meth:`REST_API.compareDatastreamChecksum` API
        call.  Returns a boolean based on the checksum valid
        response from Fedora.

        :param date: (optional) check the datastream validity at a
            particular date/time (e.g., for versionable datastreams)
        '''
        r = self.obj.api.compareDatastreamChecksum(self.obj.pid, self.id,
                                                   asOfDateTime=date)
        dsprofile = parse_xml_object(DatastreamProfile, r.content, r.url)
        return dsprofile.checksum_valid
class Datastream(object):
    """Datastream descriptor to simplify configuration and access to datastreams
    that belong to a particular :class:`DigitalObject`.

    When accessed, will initialize a :class:`DatastreamObject` and cache it on
    the :class:`DigitalObject` that it belongs to.

    Example usage::

        class MyDigitalObject(DigitalObject):
            text = Datastream("TEXT", "Text content", defaults={'mimetype': 'text/plain'})

    All other configuration defaults are passed on to the :class:`DatastreamObject`.

    :param id: datastream id
    :param label: default datastream label
    :param defaults: optional dictionary of keyword arguments passed on to
        the datastream object class when it is initialized
    """

    _datastreamClass = DatastreamObject

    def __init__(self, id, label, defaults=None):
        self.id = id
        self.label = label
        # Work on a private copy: the arguments may be extended after
        # initialization, and that must never leak back into a dictionary
        # the caller might share between several descriptors.
        self.datastream_args = dict(defaults) if defaults else {}

    def __get__(self, obj, objtype):
        # class-level access returns the descriptor itself
        if obj is None:
            return self
        # initialize the datastream object once per digital object, then
        # serve it from the object's datastream cache
        if obj.dscache.get(self.id, None) is None:
            obj.dscache[self.id] = self._datastreamClass(
                obj, self.id, self.label, **self.datastream_args)
        return obj.dscache[self.id]

    @property
    def default_mimetype(self):
        """Configured default mimetype, falling back to the default of the
        datastream object class."""
        mimetype = self.datastream_args.get('mimetype', None)
        if mimetype:
            return mimetype
        ds_cls = self._datastreamClass
        return ds_cls.default_mimetype

    @property
    def default_format_uri(self):
        """Configured default format URI, or None if not configured."""
        return self.datastream_args.get('format', None)

    # set and delete not implemented on datastream descriptor
    # - delete would only make sense for optional datastreams, not yet needed
    # - saving updated content to fedora handled by datastream object
class XmlDatastreamObject(DatastreamObject):
    """Extends :class:`DatastreamObject` in order to initialize datastream content
    as an instance of a specified :class:`~eulxml.xmlmap.XmlObject`.

    See :class:`DatastreamObject` for more details.  Has one additional parameter:

    :param objtype: xml object type to use for datastream content; if not specified
        (or explicitly ``None``), defaults to :class:`~eulxml.xmlmap.XmlObject`
    """

    default_mimetype = "text/xml"

    def __init__(self, obj, id, label, objtype=xmlmap.XmlObject, **kwargs):
        # An explicit None (e.g. forwarded by a descriptor with no type
        # configured) falls back to the generic XmlObject; otherwise
        # _bootstrap_content would try to call None.
        self.objtype = xmlmap.XmlObject if objtype is None else objtype
        super(XmlDatastreamObject, self).__init__(obj, id, label, **kwargs)

    # FIXME: override _set_content to handle setting full xml content?

    def _convert_content(self, data, url):
        # parse the content returned by Fedora into the configured xml object type
        return parse_xml_object(self.objtype, data, url)

    def _bootstrap_content(self):
        # new datastreams start out as an empty instance of the xml object type
        return self.objtype()

    def _content_as_node(self):
        # xml node for the content, used when serializing inline xml at ingest
        return self.content.node

    def _raw_content(self):
        # special case for xml datastreams:
        # self.content is automatically bootstrapped as a new XmlObject
        # - consider the datastream to have no content if the xml is empty
        #   (which, by default, means no attributes and no text content)
        if self.content is None or \
           (hasattr(self.content, 'is_empty') and self.content.is_empty()):
            return None
        return super(XmlDatastreamObject, self)._raw_content()
class XmlDatastream(Datastream):
    """XML-specific version of :class:`Datastream`.  Datastreams are initialized
    as instances of :class:`XmlDatastreamObject`.  An additional, optional
    parameter ``objtype`` is passed to the Datastream object to configure the
    type of :class:`eulxml.xmlmap.XmlObject` that should be used for datastream
    content; when omitted, the generic :class:`eulxml.xmlmap.XmlObject` is used.

    Example usage::

        from eulxml.xmlmap.dc import DublinCore

        class MyDigitalObject(DigitalObject):
            extra_dc = XmlDatastream("EXTRA_DC", "Dublin Core", DublinCore)

        my_obj = repo.get_object("example:1234", type=MyDigitalObject)
        my_obj.extra_dc.content.title = "Example object"
        my_obj.save(logMessage="automatically setting dc title")
    """

    _datastreamClass = XmlDatastreamObject

    def __init__(self, id, label, objtype=None, defaults=None):
        # Pass a copy of the defaults: 'objtype' is added below, and that
        # must not modify a dictionary the caller may reuse elsewhere.
        super(XmlDatastream, self).__init__(id, label, dict(defaults or {}))
        # Never forward objtype=None: the datastream object instantiates
        # this type to bootstrap content for new datastreams.
        self.datastream_args['objtype'] = \
            xmlmap.XmlObject if objtype is None else objtype
class RdfDatastreamObject(DatastreamObject):
    """:class:`DatastreamObject` variant whose content is exposed as an
    `rdflib <http://pypi.python.org/pypi/rdflib/>`_ RDF graph.
    """

    default_mimetype = "application/rdf+xml"

    # prefixes for namespaces expected to be used in RELS-EXT
    default_namespaces = {
        'fedora-model': 'info:fedora/fedora-system:def/model#',
        'fedora-rels-ext': 'info:fedora/fedora-system:def/relations-external#',
        'oai': 'http://www.openarchives.org/OAI/2.0/'
    }

    # FIXME: override _set_content to handle setting content?

    def _convert_content(self, data, url):
        # parse what Fedora returned into a graph, then register the prefixes
        parsed = parse_rdf(data, url)
        return self._bind_prefixes(parsed)

    def _bootstrap_content(self):
        # a datastream with no content yet starts as an empty graph
        empty = RdfGraph()
        return self._bind_prefixes(empty)

    def _bind_prefixes(self, graph):
        # register the default prefixes so serialized xml stays human-readable
        for short_name, uri in six.iteritems(self.default_namespaces):
            graph.bind(short_name, uri)
        return graph

    def _content_as_node(self):
        # serialize the graph and re-load it as xml to get at the root node
        serialized = self.content.serialize()
        return xmlmap.load_xmlobject_from_string(serialized).node

    def replace_uri(self, src, dest):
        """Swap every occurrence of the uri reference ``src`` in the graph
        for ``dest``.

        The reference may be the subject, predicate, or object of a
        statement, so the three positions are handled one after another:
        each statement using ``src`` in that position is removed and
        re-added with ``dest`` in its place.
        """
        # NB: A statement <src> <src> <src> is rewritten once per pass:
        # the subject pass turns it into <dest> <src> <src>, the predicate
        # pass into <dest> <dest> <src>, and the object pass into
        # <dest> <dest> <dest>.
        # NB2: The matches of each pass are collected into a list before
        # the graph is touched. .triples() is a generator that walks the
        # graph lazily, so changing the graph while it is still being
        # consumed risks invalidating it.
        graph = self.content
        for position in range(3):  # 0: subject, 1: predicate, 2: object
            pattern = [None, None, None]
            pattern[position] = src
            for match in list(graph.triples(tuple(pattern))):
                stale = list(match)
                stale[position] = src
                fresh = list(match)
                fresh[position] = dest
                graph.remove(tuple(stale))
                graph.add(tuple(fresh))

    def _prepare_ingest(self):
        """Replace the default dummy uriref with the real object uriref
        before the object is ingested, in case the RDF datastream refers
        to the object by the dummy reference."""
        # see also commentary on DigitalObject.DUMMY_URIREF
        owner = self.obj
        self.replace_uri(owner.DUMMY_URIREF, owner.uriref)
class RdfDatastream(Datastream):
    """:class:`Datastream` descriptor for RDF content.  The datastream is
    initialized as a :class:`RdfDatastreamObject`, so its content is
    available as an `rdflib <http://pypi.python.org/pypi/rdflib/>`_
    RDF graph.

    Example usage::

        from rdflib import RDFS, Literal

        class MyDigitalObject(DigitalObject):
            extra_rdf = RdfDatastream("EXTRA_RDF", "an RDF graph of stuff")

        my_obj = repo.get_object("example:4321", type=MyDigitalObject)
        my_obj.extra_rdf.content.add((my_obj.uriref, RDFS.comment,
                                      Literal("This is an example object.")))
        my_obj.save(logMessage="automatically setting rdf comment")
    """

    # datastream object class instantiated by the descriptor
    _datastreamClass = RdfDatastreamObject
class FileDatastreamObject(DatastreamObject):
    """:class:`DatastreamObject` variant that reads and writes datastream
    content as a file.  Assign a new file object to the content property
    to replace the contents.  For example::

        class ImageObject(DigitalObject):
            image = FileDatastream('IMAGE', 'image datastream', defaults={
                'mimetype': 'image/png'
            })

    Then, with an instance of ImageObject::

        obj.image.content = open('/path/to/my/file')
        obj.save()
    """

    # set when new content is assigned; file content is never hashed,
    # so this flag is how modification is detected
    _content_modified = False

    def _raw_content(self):
        # Content in the form needed to save to Fedora: the file object
        # itself (handled by the upload/save API calls), or None when no
        # content has been loaded or set (nothing to save).
        if self._content is not None:
            return self.content
        return None

    def _convert_content(self, data, url):
        # wrap the downloaded bytes in an in-memory file-like object
        # NOTE: will require changes (here and in APIs) to handle large files
        return six.BytesIO(data)

    # redefine content property to override set_content to set a flag when modified
    def _get_content(self):
        # let the base class load and cache the content, then hand it back
        super(FileDatastreamObject, self)._get_content()
        return self._content

    def _set_content(self, val):
        super(FileDatastreamObject, self)._set_content(val)
        # remember the assignment so isModified reports the change
        self._content_modified = True

    content = property(
        _get_content, _set_content, None,
        "contents of the datastream; only pulled from Fedora when accessed, cached after first access")

    def _content_digest(self):
        # file content is deliberately not checksummed
        return None

    def isModified(self):
        # modified if either the profile changed or new content was assigned
        return self.info_modified or self._content_modified
class FileDatastream(Datastream):
    """:class:`Datastream` descriptor for file-based content.  The
    datastream is initialized as a :class:`FileDatastreamObject`.
    """

    # datastream object class instantiated by the descriptor
    _datastreamClass = FileDatastreamObject
### Descriptors for dealing with object relations
# TODO: Relation objects should probably have an intro section with a
# more user-friendly introduction...
# Relation (list variant still TODO)
# ReverseRelation
class Relation(object):
    '''This descriptor is intended for use with
    :class:`~eulfedora.models.DigitalObject` RELS-EXT relations, and
    provides get, set, and delete functionality for a single related
    :class:`DigitalObject` instance or literal value in the RELS-EXT
    of an individual object.

    Example use for a related object: a :class:`Relation` should be
    initialized with a predicate URI and optionally a subclass of
    :class:`~eulfedora.models.DigitalObject` that should be returned::

        class Page(DigitalObject):
            volume = Relation(relsext.isConstituentOf, type=Volume)

    When a :class:`Relation` is created with a type that references a
    :class:`DigitalObject` subclass, a corresponding
    :class:`ReverseRelation` will automatically be added to the
    related subclass.  For the example above, the fictional ``Volume``
    class would automatically get a ``page_set`` attribute configured
    with the same URI and a class of ``Page``.  Reverse property names
    can be customized using the ``related_name`` parameter, which is
    documented below and follows the basic conventions of Django's
    :class:`~django.db.models.ForeignKey` model field (to which
    :class:`Relation` is roughly analogous).

    .. Note::

        Currently, auto-generated :class:`ReverseRelation` properties
        will always be initialized with ``multiple=True``, since that
        is the most common pattern for Fedora object relations (one to
        many).  Other variants may be added later, if and when use
        cases arise.

    :class:`~eulfedora.models.Relation` also supports configuring the
    RDF type and namespace prefixes that should be used for
    serialization; for example::

        from rdflib import XSD, URIRef
        from rdflib.namespace import Namespace

        MYNS = Namespace(URIRef("http://example.com/ns/2011/my-test-namespace/#"))

        class MyObj(DigitalObject):
            total = Relation(MYNS.count, ns_prefix={"my": MYNS}, rdf_type=XSD.int)

    This would allow us to access ``total`` as an integer on a MyObj
    object, e.g.::

        myobj.total = 3

    and when the RELS-EXT is serialized it will use the
    configured namespace prefix, e.g.:

    .. code-block:: xml

        <rdf:RDF xmlns:my="http://example.com/ns/2011/my-test-namespace/#"
                 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
          <rdf:Description rdf:about="info:fedora/myobj:1">
            <my:count rdf:datatype="http://www.w3.org/2001/XMLSchema#int">3</my:count>
          </rdf:Description>
        </rdf:RDF>

    .. Note::

        If a namespace prefix is not specified, :mod:`rdflib` will
        automatically generate a namespace to produce valid output,
        but it may be less readable than a custom namespace.

    Initialization options:

    :param relation: the RDF predicate URI as a :class:`rdflib.URIRef`
    :param type: optional :class:`~eulfedora.models.DigitalObject`
        subclass to initialize (for object relations); use
        ``type="self"`` to specify that the current DigitalObject
        class should be used (currently no reverse relation will be
        created for recursive relations).
    :param ns_prefix: optional dictionary to configure namespace
        prefixes to be used for serialization; key should be the
        desired prefix, value should be an instance of
        :class:`rdflib.namespace.Namespace`
    :param rdf_type: optional rdf type for literal values (passed
        to :class:`rdflib.Literal` as the datatype option)
    :param related_name: optional name for the auto-generated
        :class:`ReverseRelation` property, when the relation is to a
        subclass of :class:`DigitalObject`; if not specified, the
        related name will be ``classname_set``; a value of ``+``
        indicates no :class:`ReverseRelation` should be created
    :param related_order: optional URI for sorting related objects
        in the auto-generated :class:`ReverseRelation` property.
    '''

    def __init__(self, relation, type=None, ns_prefix=None, rdf_type=None,
                 related_name=None, related_order=None):
        self.relation = relation
        self.object_type = type
        self.ns_prefix = ns_prefix or {}
        self.rdf_type = rdf_type
        self.related_name = related_name
        self.related_order = related_order
        # Most recently seen value.  Kept for backward compatibility only:
        # a descriptor is shared by every instance of the owning class, so
        # this attribute is never used to look anything up.
        self.uri_val = None

    def __get__(self, obj, objtype):
        # class-level access returns the descriptor itself
        if obj is None:
            return self

        # Always ask this object's RELS-EXT for the current value; the
        # per-object cache is then consulted under that value, so a changed
        # relation can never be answered from a stale entry.
        uri_val = obj.rels_ext.content.value(subject=obj.uriref,
                                             predicate=self.relation)
        self.uri_val = uri_val

        if uri_val is not None:
            cached = obj.relcache.get(uri_val, None)
            if cached is not None:
                return cached

        if uri_val and self.object_type:  # don't init new object if val is None
            # special case: if object_type is the string 'self', use the
            # class of the object being accessed; resolved on every access
            # so that subclasses sharing this descriptor each get their
            # own class
            object_type = self.object_type
            if object_type == 'self':
                object_type = obj.__class__
            # need get_object wrapper method on digital object
            result = obj.get_object(uri_val, type=object_type)
        # if the value has 'toPython' method (e.g., rdflib.Literal),
        # return the result of that conversion
        elif hasattr(uri_val, 'toPython'):
            result = uri_val.toPython()
        else:
            result = uri_val

        if uri_val is not None:
            obj.relcache[uri_val] = result
        return result

    def __set__(self, obj, subject):
        # if any namespace prefixes were specified, bind them before adding the tuple
        for prefix, ns in six.iteritems(self.ns_prefix):
            obj.rels_ext.content.bind(prefix, ns)

        # TODO: do we need to check that subject matches self.object_type (if any)?
        if isinstance(subject, URIRef):
            subject_uri = subject
        elif hasattr(subject, 'uriref'):
            subject_uri = subject.uriref
        elif self.rdf_type:
            subject_uri = Literal(subject, datatype=self.rdf_type)
        else:
            subject_uri = Literal(subject)

        # set the property in the rels-ext, removing any existing
        # value for that property (single-value relation only, for now)
        obj.rels_ext.content.set((
            obj.uriref,
            self.relation,
            subject_uri
        ))
        # store updated value in cache, under the value just written so the
        # next read finds it
        self.uri_val = subject_uri
        obj.relcache[subject_uri] = subject

    def __delete__(self, obj):
        # find the subject uri and remove from rels-ext
        uris = list(obj.rels_ext.content.objects(obj.uriref, self.relation))
        if uris:
            obj.rels_ext.content.remove((
                obj.uriref,
                self.relation,
                uris[0]
            ))
            # if related object has been cached, delete that as well
            obj.relcache.pop(uris[0], None)
class ReverseRelation(object):
    '''Read-only descriptor for
    :class:`~eulfedora.models.DigitalObject` RELS-EXT reverse
    relations: the owning object is the RDF **object** of the
    predicate, and the related item is the RDF **subject**.  On access,
    the Fedora :class:`~eulfedora.api.ResourceIndex` is queried for
    subjects of the configured predicate and the resulting items are
    returned.

    There is no support for setting or deleting reverse-related
    objects through this descriptor.

    The recommended approach is to declare a :class:`Relation` and rely
    on the :class:`ReverseRelation` that is generated automatically for
    it.

    Example use::

        class Volume(DigitalObject):
            pages = ReverseRelation(relsext.isConstituentOf, type=Page, multiple=True)

    :param relation: RDF relation to be used for querying to find the items
    :param type: object type for the related item or items
    :param multiple: set to true if there are multiple related items, which
        will be returned as a list (defaults to false)
    :param order_by: RDF predicate to be used for sorting multiple items
        (must be available for query in the RIsearch, as a property of
        the items being returned)
    '''

    def __init__(self, relation, type=None, multiple=False, order_by=None):
        self.relation = relation
        self.object_type = type
        self.multiple = multiple
        self.order_by = order_by

    def __get__(self, obj, objtype):
        # class-level access returns the descriptor itself
        if obj is None:
            return self

        if self.order_by is None:
            # no sorting requested: a simple SPO search finds the subjects
            uris = list(obj.risearch.get_subjects(self.relation, obj.uriref))
        else:
            # sorting requested: use sparql to find *and* sort the subjects
            template = ('SELECT ?pid ?order\n'
                        'WHERE {\n'
                        '?pid <%(rel)s> <%(uri)s> .\n'
                        '?pid <%(sort_rel)s> ?order\n'
                        '} ORDER BY ?order')
            query = template % {
                'rel': self.relation,
                'uri': obj.uriref,
                'sort_rel': self.order_by
            }
            uris = [row['pid'] for row in obj.risearch.sparql_query(query)]

        if self.multiple:
            return [self._init_val(obj, uri) for uri in uris]
        if uris:
            # single-valued: only the first match is used
            return self._init_val(obj, uris[0])
        return None

    def _init_val(self, obj, val):
        # initialize the desired return type, based on configuration
        # would any reverse relation not be an object?
        if not self.object_type:
            return val
        return obj.get_object(val, type=self.object_type)
class DigitalObjectType(type):
    """Metaclass for :class:`DigitalObject`.

    When a class is created, :class:`Datastream` attributes inherited
    from the bases and declared on the class itself are gathered into a
    ``_defined_datastreams`` dictionary (locally declared ones are also
    kept in ``_local_datastreams``), the class is recorded in a registry
    keyed by ``module.ClassName``, and a :class:`ReverseRelation` is
    attached to the target class of each eligible :class:`Relation`.
    """

    _registry = {}

    def __new__(cls, name, bases, defined_attrs):
        # datastreams inherited from the bases, merged in base order
        all_datastreams = {}
        for parent in bases:
            parent_ds = getattr(parent, '_defined_datastreams', None)
            if parent_ds:
                all_datastreams.update(parent_ds)

        own_datastreams = {}
        to_reverse = {}
        for key, value in defined_attrs.items():
            if isinstance(value, Datastream):
                own_datastreams[key] = value
                continue
            if not isinstance(value, Relation):
                continue
            # only relations to subclasses of DigitalObject are reversed;
            # as in Django, related_name '+' means "no reverse relation"
            target = value.object_type
            if target and target != DigitalObject \
                    and value.related_name != '+':
                to_reverse[key] = value

        all_datastreams.update(own_datastreams)
        class_attrs = defined_attrs.copy()
        class_attrs['_local_datastreams'] = own_datastreams
        class_attrs['_defined_datastreams'] = all_datastreams

        new_class = super(DigitalObjectType, cls).__new__(
            cls, name, bases, class_attrs)
        registry_key = '%s.%s' % (new_class.__module__, new_class.__name__)
        DigitalObjectType._registry[registry_key] = new_class

        # attach reverse relations to the related classes; for now all
        # reverse relations are treated as multiple
        for rel in six.itervalues(to_reverse):
            # relations whose target is given as a string are skipped
            # TODO: look into handling this the way django handles
            # recursive relationships
            if isinstance(rel.object_type, six.string_types):
                continue
            reverse_name = rel.related_name
            if reverse_name is None:
                # Django-style default: classname_set
                reverse_name = '%s_set' % new_class.__name__.lower()
            reverse = ReverseRelation(rel.relation, type=new_class,
                                      multiple=True,
                                      order_by=rel.related_order)
            setattr(rel.object_type, reverse_name, reverse)

        return new_class

    @property
    def defined_types(self):
        # copy, so callers cannot alter the shared registry
        return DigitalObjectType._registry.copy()
class DigitalObject(six.with_metaclass(DigitalObjectType, object)):
"""
A single digital object in a Fedora repository, with methods and
properties to ease creating, accessing, and updating a Fedora
object or any of its component parts, with pre-defined datastream
mappings for the standard Fedora Dublin Core
(:attr:`~eulfedora.models.DigitalObject.dc`) and RELS-EXT
(:attr:`~eulfedora.models.DigitalObject.rels_ext`) datastreams.
.. Note::
If you want idiomatic access to other datastreams, consider
extending :class:`DigitalObject` and defining your own
datastreams using :class:`XmlDatastream`,
:class:`RdfDatastream`, or :class:`FileDatastream` as
appropriate.
"""
# Class-level default; the constructor overrides it per instance when
# a default_pidspace argument is supplied.
default_pidspace = None
"""Default namespace to use when generating new PIDs in
:meth:`get_default_pid` (by default, calls Fedora getNextPid,
which will use Fedora-configured namespace if default_pidspace
is not set)."""
# Delimiter used when splitting and truncating the owner value.
OWNER_ID_SEPARATOR = ','
'''Owner ID separator for multiple owners. Should match the
OWNER-ID-SEPARATOR configured in Fedora.
For more detail, see https://jira.duraspace.org/browse/FCREPO-82
'''
# Control group 'X' is serialized as inline xmlContent when the ingest
# FOXML is built.
dc = XmlDatastream("DC", "Dublin Core", DublinCore, defaults={
    'control_group': 'X',
    'format': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
})
''':class:`XmlDatastream` for the required Fedora **DC** datastream;
datastream content will be automatically loaded as an instance of
:class:`eulxml.xmlmap.dc.DublinCore`'''
# New objects get their CONTENT_MODELS statements written to this
# datastream at initialization.
rels_ext = RdfDatastream("RELS-EXT", "External Relations", defaults={
    'control_group': 'X',
    'format': 'info:fedora/fedora-system:FedoraRELSExt-1.0',
})
''':class:`RdfDatastream` for the standard Fedora **RELS-EXT** datastream'''
def __init__(self, api, pid=None, create=False, default_pidspace=None):
    """Set up local state for a (possibly not yet ingested) object.

    :param api: API connection stored on the instance and used for all
        repository calls
    :param pid: pid string, ``info:fedora/`` URI (the prefix is
        stripped), or a callable that returns a pid when one is
        needed; ``None`` selects :meth:`get_default_pid`
    :param create: true if the object should be ingested on first
        :meth:`save`; forced to true whenever the pid is a callable
    :param default_pidspace: optional pid namespace to store on the
        instance for default pid generation
    """
    self.api = api
    self.dscache = {}  # accessed by DatastreamDescriptor to store and cache datastreams
    self.relcache = {}  # used by Relation to store and cache related objects
    self._risearch = None
    # per-object ad-hoc datastreams parallel to per-class _defined_datastreams
    self._adhoc_datastreams = {}

    if default_pidspace:
        try:
            self.default_pidspace = default_pidspace
        except AttributeError:
            # allow extending classes to make default_pidspace a custom property,
            # but warn if there is case of conflict
            if default_pidspace != getattr(self, 'default_pidspace', None):
                # Logger.warn is a deprecated alias; use warning()
                logger.warning("Failed to set requested default_pidspace %s (using %s instead)",
                               default_pidspace, self.default_pidspace)

    # cache object profile, track if it is modified and needs to be saved
    self._info = None
    self.info_modified = False
    # datastream list from fedora
    self._ds_list = None
    # object history
    self._history = None
    self._methods = None
    # object foxml
    self._object_xml = None

    # pid = None signals to create a new object, using a default pid
    # generation function.
    if pid is None:
        # the bound method is stored uncalled, so pid generation is
        # deferred until a pid is actually needed
        pid = self.get_default_pid
    elif isinstance(pid, six.string_types) and \
            pid.startswith('info:fedora/'):  # passed a uri
        pid = pid[len('info:fedora/'):]

    # callable(pid) signals a function to call to obtain a pid if and
    # when one is needed
    if callable(pid):
        create = True

    self.pid = pid

    # self._create is True when we should create (ingest) this object in
    # fedora on first save(), False if we should assume it's already
    # there. Note that if pid is callable, create is always True (for
    # which see above)
    self._create = bool(create)

    if create:
        self._init_as_new_object()
def _init_as_new_object(self):
    """Add a ``hasModel`` RELS-EXT statement for each entry in the
    optional ``CONTENT_MODELS`` attribute."""
    content_models = getattr(self, 'CONTENT_MODELS', ())
    for model_uri in content_models:
        statement = (self.uriref, modelns.hasModel, URIRef(model_uri))
        self.rels_ext.content.add(statement)
@property
def risearch(self):
    "Instance of :class:`eulfedora.api.ResourceIndex`, with the same root url and credentials"
    # built on first access from the api connection details, then reused
    if self._risearch is not None:
        return self._risearch
    connection = self.api
    self._risearch = ResourceIndex(connection.base_url,
                                   connection.username,
                                   connection.password)
    return self._risearch
def get_object(self, pid, type=None):
    '''Create another object that shares this object's API connection.

    The requested class (or, when ``type`` is omitted, the class of
    the current object) is called with ``self.api`` and the given pid.

    :param pid: pid of the object to return
    :param type: (optional) :class:`~eulfedora.models.DigitalObject`
        type to initialize and return
    '''
    object_class = self.__class__ if type is None else type
    return object_class(self.api, pid)
def __str__(self):
    # a callable pid means no pid has been generated yet
    if callable(self.pid):
        return '(generated pid; uningested)'
    if not self._create:
        return self.pid
    return self.pid + ' (uningested)'
def __repr__(self):
    # class name followed by the text form of the object
    class_name = self.__class__.__name__
    description = force_text(self)
    return '<%s %s>' % (class_name, description)
def get_default_pid(self):
    '''Request a new pid from the repository.

    Calls ``getNextPID`` on the API connection, passing
    :attr:`default_pidspace` as the namespace when it is set, and
    returns the first pid from the parsed response.

    Projects that need custom pid logic (for example pids issued by
    an external generator) should extend DigitalObject and override
    this method.'''
    # NOTE: __init__ stores this method as the default pid generator,
    # so an override has to stay callable without arguments.
    namespace = self.default_pidspace
    options = {} if namespace is None else {'namespace': namespace}
    response = self.api.getNextPID(**options)
    parsed = parse_xml_object(NewPids, response.content, response.url)
    return parsed.pids[0]
@property
def pidspace(self):
    "Fedora pidspace of this object"
    current = self.pid
    # no pidspace until a real pid has been generated
    if callable(current):
        return None
    # everything before the first colon
    namespace, _sep, _rest = current.partition(':')
    return namespace
# This dummy pid stuff is ugly. I'd rather not need it. Every now and
# then, though, something needs a PID or URI for a brand-new object
# (i.e., with a callable self.pid) before we've even had a chance to
# generate one. In particular, if we want to add statements to an
# object's RELS-EXT, then the object URI needs to be the subject of
# those statements. We can't just generate the PID early because we get
# PIDs from ARKs, and those things stick around. Also, most objects get
# RELS-EXT statements right as we create them anyway (see references to
# CONTENT_MODELS in _init_as_new_object()), so calling self.pid as soon
# as we need a uri would be essentially equivalent to "at object
# creation," which negates the whole point of lazy callable pids.
#
# So anyway, this DUMMY_PID gives us something we can use as a pid for
# new objects, with the understanding that we have to clean it up to use
# the real pid in obj._prepare_ingest(), which is called after we've
# committed to trying to ingest the object, and thus after self.pid has
# been called and replaced with a real string pid. DatastreamObject
# subclasses can do this in their own _prepare_ingest() methods.
# RELS-EXT (and all other RDF datastreams for that matter) get that
# implemented in RdfDatastreamObject above.
DUMMY_PID = 'TEMP:DUMMY_PID'
DUMMY_URIREF = URIRef('info:fedora/' + DUMMY_PID)
@property
def uri(self):
    "Fedora URI for this object (``info:fedora/foo:###`` form of object pid) "
    current = self.pid
    # fall back to the placeholder pid while the real pid is still a callable
    pid_text = self.DUMMY_PID if callable(current) else current
    return 'info:fedora/' + pid_text
@property
def uriref(self):
    "Fedora URI for this object, as an :class:`rdflib.URIRef` URI object"
    fedora_uri = self.uri
    return URIRef(fedora_uri)
@property
def info(self):
    # object profile, fetched via getProfile() on first access and
    # kept in self._info afterwards
    if self._info is not None:
        return self._info
    self._info = self.getProfile()
    return self._info
# object info properties
label_max_size = 255
'maximum label size allowed by fedora'


def _get_label(self):
    # label as recorded in the object profile
    profile = self.info
    return profile.label


def _set_label(self, val):
    # values longer than label_max_size are truncated, with a warning
    limit = self.label_max_size
    if len(val) > limit:
        logger.warning('Attempting to set object label for %s to a value longer than %d character max (%d); truncating',
                       self.pid, limit, len(val))
        val = val[:limit]

    # only flag the profile as modified when the label really changes
    if self.info.label != val:
        self.info_modified = True
    self.info.label = val


label = property(_get_label, _set_label, None, "object label")
owner_max_size = 64
'maximum owner size allowed by fedora'


def _get_owner(self):
    # owner value as recorded in the object profile
    return self.info.owner


def _set_owner(self, val):
    """Set the object owner, truncating values over ``owner_max_size``.

    A delimited value is cut at the last :attr:`OWNER_ID_SEPARATOR`
    that still fits, so no owner id is split in half.  If no separator
    lies within the limit, the value is cut at ``owner_max_size``
    characters.  The profile is always flagged as modified.

    :param val: new owner value (one or more delimited owner ids)
    """
    if len(val) > self.owner_max_size:
        logger.warning('Attempting to set object owner for %s to a value longer than %d character max (%d); truncating',
                       self.pid, self.owner_max_size, len(val))
        # find the last delimiter under the max size
        end = -1
        if self.OWNER_ID_SEPARATOR in val:
            end = val.rfind(self.OWNER_ID_SEPARATOR, 0, self.owner_max_size + 1)
        if end == -1:
            # no delimiter within the limit: slicing with -1 would drop
            # just one character, so cut at the limit instead
            val = val[0:self.owner_max_size]
        else:
            # truncate to the last full value
            val = val[0:end]
    self.info.owner = val
    self.info_modified = True


owner = property(_get_owner, _set_owner, None, "object owner")
@property
def owners(self):
    '''Read-only list of object owners: the :attr:`owner` value split
    on the configured :attr:`OWNER_ID_SEPARATOR`, with surrounding
    whitespace removed from each entry.'''
    pieces = self.owner.split(self.OWNER_ID_SEPARATOR)
    return [piece.strip() for piece in pieces]
def _get_state(self):
    # state as recorded in the object profile
    profile = self.info
    return profile.state


def _set_state(self, val):
    # store the new state and flag the profile for saving
    profile = self.info
    profile.state = val
    self.info_modified = True


state = property(_get_state, _set_state, None, "object state (Active/Inactive/Deleted)")
# read-only info properties
@property
def created(self):
    # read-only: creation value from the object profile
    profile = self.info
    return profile.created
@property
def modified(self):
    # read-only: modification value from the object profile
    profile = self.info
    return profile.modified
@property
def exists(self):
    """:type: bool

    True when the object actually exists (and can be accessed by
    the current user) in Fedora
    """
    # an object still waiting to be ingested is treated as not existing
    if self._create:
        return False
    # being able to load a profile, whatever it contains, means the
    # object is there; a failed request means it is not
    try:
        self.getProfile()
    except RequestFailed:
        return False
    return True
@property
def has_requisite_content_models(self):
    ''':type: bool

    True when :meth:`has_model` succeeds for every entry in the
    ``CONTENT_MODELS`` attribute of the class this object was
    initialized as (trivially true when there are none).'''
    expected = getattr(self, 'CONTENT_MODELS', ())
    return all(self.has_model(cmodel) for cmodel in expected)
def getDatastreamProfile(self, dsid, date=None):
    """Get information about a particular datastream belonging to this object.

    Returns None for an object that has not been ingested yet.

    :param dsid: datastream id
    :param date: optional value passed to the API as ``asOfDateTime``
    :rtype: :class:`DatastreamProfile`
    """
    # NOTE: used by DatastreamObject
    if not self._create:
        response = self.api.getDatastream(self.pid, dsid, asOfDateTime=date)
        return parse_xml_object(DatastreamProfile, response.content, response.url)
    return None
@property
def history(self):
    # getHistory() fills self._history as a side effect
    if self._history is not None:
        return self._history
    self.getHistory()
    return self._history
def getHistory(self):
    # nothing to load for an object that has not been ingested yet
    if self._create:
        return None
    response = self.api.getObjectHistory(self.pid)
    parsed = parse_xml_object(ObjectHistory, response.content, response.url)
    # keep the changed entries for the history property
    self._history = list(parsed.changed)
    return parsed
@property
def object_xml(self):
    '''Fedora object XML as an instance of :class:`FoxmlDigitalObject`,
    loaded through :meth:`getObjectXml` on first access.
    '''
    # getObjectXml() fills self._object_xml as a side effect
    if self._object_xml is not None:
        return self._object_xml
    self.getObjectXml()
    return self._object_xml
def getObjectXml(self):
    # nothing to load for an object that has not been ingested yet
    if self._create:
        return None
    response = self.api.getObjectXML(self.pid)
    self._object_xml = parse_xml_object(FoxmlDigitalObject,
                                        response.content, response.url)
    return self._object_xml
@property
def audit_trail(self):
    '''Fedora audit trail as an instance of :class:`eulfedora.xml.AuditTrail`

    .. Note::

        Since Fedora (as of 3.5) does not make the audit trail
        available via an API call or as a datastream, the audit
        trail is read from the foxml for the object. If an object
        has large, versioned XML datastreams this may be slow.
    '''
    # NOTE: exposing the audit trail like other datastreams (pseudo
    # or read-only DatastreamObject?) would be nice, but so far the
    # overhead has not seemed worth the possible benefits.
    # Fedora may eventually expose the AUDIT info more directly:
    # https://jira.duraspace.org/browse/FCREPO-635
    foxml = self.object_xml
    if not foxml:
        return None
    return foxml.audit_trail
@property
def ingest_user(self):
    '''Username responsible for ingesting this object into the repository,
    as recorded in the :attr:`audit_trail`, if available.'''
    trail = self.audit_trail
    # requires a trail with at least one record
    if not trail or not trail.records:
        return None
    first_record = trail.records[0]
    # only reported when the first recorded action is the ingest
    if first_record.action == 'ingest':
        return first_record.user
    return None
@property
def audit_trail_users(self):
    '''A set of all usernames recorded in the :attr:`audit_trail`,
    or an empty set when no audit trail is available.'''
    if not self.audit_trail:
        return set()
    return {record.user for record in self.audit_trail.records}
_profile = None


def getProfile(self):
    """Get information about this object (label, owner, date created, etc.).

    An object that has not been ingested gets a new, empty profile on
    every call; otherwise the profile is requested once and reused.

    :rtype: :class:`ObjectProfile`
    """
    if self._create:
        return ObjectProfile()
    if self._profile is None:
        response = self.api.getObjectProfile(self.pid)
        self._profile = parse_xml_object(ObjectProfile, response.content, response.url)
    return self._profile
def _saveProfile(self, logMessage=None):
    # Push label, owner and state to Fedora; returns True on an OK
    # response (also clearing info_modified), False otherwise.
    if self._create:
        raise Exception("can't save profile information for a new object before it's ingested.")

    response = self.api.modifyObject(self.pid, self.label, self.owner, self.state, logMessage)
    if response.status_code != requests.codes.ok:
        return False
    # profile info is no longer different than what is in Fedora
    self.info_modified = False
    return True
def save(self, logMessage=None):
    """Save this object to Fedora.

    A new object is prepared and ingested; profile information and
    datastream content or properties changed beforehand are included
    in the ingest. Datastreams start from sensible defaults (XML
    objects from their default constructor, RDF graphs empty) and
    datastream profile information from the defaults given in the
    :class:`Datastream` declaration.

    For an object that is already in Fedora, only the parts that have
    been modified are saved: changed datastreams and object profile
    attributes such as :attr:`label`, :attr:`owner`, or :attr:`state`.
    If saving any part fails, the changes made so far are backed out
    and a :class:`DigitalObjectSaveFailure` is raised with information
    about where the failure occurred and whether or not it was
    recoverable.

    :param logMessage: optional log message passed on to the save calls
    :returns: True when no error was raised
    """
    if not self._create:
        self._save_existing(logMessage)
    else:
        self._prepare_ingest()
        self._ingest(logMessage)
    # no errors, so report success
    return True
def _save_existing(self, logMessage):
    """Save the modified parts of an object that is already in Fedora.

    Modified datastreams from ``self.dscache`` are saved first, then
    the object profile if it was changed. If any save fails, the
    datastreams saved so far are rolled back through
    :meth:`_undo_save` and the failure is reported.

    :param logMessage: log message passed to each save call
    :raises DigitalObjectSaveFailure: when a datastream or the object
        profile could not be saved
    """
    # - list of datastreams that should be saved
    to_save = [ds for ds, dsobj in self.dscache.items() if dsobj.isModified()]
    # - track successfully saved datastreams, in case roll-back is necessary
    saved = []
    # save modified datastreams
    for ds in to_save:
        # in eulfedora 0.16 and before, add/modify datastream returned True/False
        # in later versions, it throws an exception
        try:
            ds_saved = self.dscache[ds].save(logMessage)
        except RequestFailed:
            logger.error('Failed to save %s/%s', self.pid, ds)
            ds_saved = False

        if ds_saved:
            saved.append(ds)
        else:
            # save datastream failed - back out any changes that have been made
            cleaned = self._undo_save(saved,
                                      "failed saving %s, rolling back changes" % ds)
            raise DigitalObjectSaveFailure(self.pid, ds, to_save, saved, cleaned)

    # NOTE: to_save list in exception will never include profile; should it?
    # FIXME: catch exceptions on save, treat same as failure to save (?)

    # save object profile (if needed) after all modified datastreams have been successfully saved
    profile_saved = False
    if self.info_modified:
        try:
            profile_saved = self._saveProfile(logMessage)
        except RequestFailed:
            logger.error('Failed to save object profile for %s', self.pid)
            profile_saved = False

        if not profile_saved:
            cleaned = self._undo_save(saved, "failed to save object profile, rolling back changes")
            raise DigitalObjectSaveFailure(self.pid, "object profile", to_save, saved, cleaned)

    # _saveProfile resets info_modified on success, so the flag cannot
    # be consulted here; rely on the recorded outcome instead
    if saved or profile_saved:
        # clear out any cached object info that is now out of date
        self._history = None
        self._object_xml = None
def _undo_save(self, datastreams, logMessage=None):
    """Run ``undo_last_save`` on each listed datastream and return the
    ids of those where the undo reported success.

    :param datastreams: list of datastream ids (should be in self.dscache)
    :param logMessage: optional log message
    """
    reverted = []
    for dsid in datastreams:
        if self.dscache[dsid].undo_last_save(logMessage):
            reverted.append(dsid)
    return reverted
def _prepare_ingest(self):
    # Only meant for newly-created objects, immediately before ingest:
    # smooths over whatever is left from building the object locally
    # instead of loading it from the repo. See the comments by
    # DigitalObject.DUMMY_PID for the main case.
    pid_source = self.pid
    if callable(pid_source):
        # time to commit to a real pid
        self.pid = pid_source()

    # let each defined datastream do its own clean-up, if it has any
    for dsname in six.iterkeys(self._defined_datastreams):
        dsobj = getattr(self, dsname)
        if not hasattr(dsobj, '_prepare_ingest'):
            continue
        dsobj._prepare_ingest()
def _ingest(self, logMessage):
    # Build the foxml, send it to Fedora, and switch the local object
    # over to "already ingested" state on success.
    # NOTE: the foxml used to be decoded from utf-8 here, which caused
    # problems for content with unicode; it is now passed through
    # as-is, trusting users to encode their strings correctly.
    foxml = self._build_foxml_for_ingest()
    resp = self.api.ingest(foxml, logMessage)

    ingested = resp.status_code == requests.codes.created \
        and resp.text == self.pid
    if not ingested:
        msg = ('fedora returned unexpected pid "%s" when trying to '
               'ingest object with pid "%s" (status code: %s)') % \
            (resp.content, self.pid, resp.status_code)
        raise Exception(msg)

    # from here on self is dealing with an ingested object
    self._create = False
    self._info = None
    self.info_modified = False
    self.dscache = {}  # cache for datastreams that have been retrieved
    self.relcache = {}  # cache for related objects that have been retrieved
def _build_foxml_for_ingest(self, pretty=False):
    # serialize the ingest foxml as UTF-8; pretty=True eases debugging
    serialize_opts = {'encoding': 'UTF-8'}
    if pretty:
        serialize_opts['pretty_print'] = True
    foxml_doc = self._build_foxml_doc()
    return etree.tostring(foxml_doc, **serialize_opts)
FOXML_NS = 'info:fedora/fedora-system:def/foxml#'
def _build_foxml_doc(self):
    # lxml element builder - default namespace is foxml, display with foxml prefix
    E = ElementMaker(namespace=self.FOXML_NS, nsmap={'foxml': self.FOXML_NS})
    doc = E('digitalObject')
    doc.set('VERSION', '1.1')
    doc.set('PID', self.pid)
    doc.append(self._build_foxml_properties(E))

    # datastream definitions declared in code come first, followed by
    # the ad-hoc ones added on this instance
    for ds_registry in (self._defined_datastreams, self._adhoc_datastreams):
        for attr_name, ds in ds_registry.items():
            dsobj = getattr(self, attr_name)
            dsnode = self._build_foxml_datastream(E, ds.id, dsobj)
            # datastreams without content produce no node
            if dsnode is not None:
                doc.append(dsnode)
    return doc
def _build_foxml_properties(self, E):
    # objectProperties element: state always (defaulting to 'A'),
    # label and owner only when they have a value
    def make_property(name, value):
        node = E('property')
        node.set('NAME', name)
        node.set('VALUE', value)
        return node

    props = E('objectProperties')
    props.append(make_property('info:fedora/fedora-system:def/model#state',
                               self.state or 'A'))
    if self.label:
        props.append(make_property('info:fedora/fedora-system:def/model#label',
                                   self.label))
    if self.owner:
        props.append(make_property('info:fedora/fedora-system:def/model#ownerId',
                                   self.owner))
    return props
def _build_foxml_datastream(self, E, dsid, dsobj):
    """Build the foxml ``datastream`` element for one datastream.

    :param E: element builder used to create the foxml nodes
    :param dsid: datastream id
    :param dsobj: datastream object supplying content and properties
    :returns: the ``datastream`` element, or None when no content node
        could be built (no content, or a control group other than
        ``X`` or ``M``)
    """
    # if we can't construct a content node then bail before constructing
    # any other nodes
    content_node = None
    if dsobj.control_group == 'X':
        content_node = self._build_foxml_inline_content(E, dsobj)
    elif dsobj.control_group == 'M':
        content_node = self._build_foxml_managed_content(E, dsobj)
    if content_node is None:
        return None

    ds_xml = E('datastream')
    ds_xml.set('ID', dsid)
    ds_xml.set('CONTROL_GROUP', dsobj.control_group)
    ds_xml.set('STATE', dsobj.state)
    ds_xml.set('VERSIONABLE', force_text(dsobj.versionable).lower())

    ver_xml = E('datastreamVersion')
    ver_xml.set('ID', dsid + '.0')
    ver_xml.set('MIMETYPE', dsobj.mimetype)
    if dsobj.format:
        ver_xml.set('FORMAT_URI', dsobj.format)
    if dsobj.label:
        ver_xml.set('LABEL', dsobj.label)

    # If BOTH checksum and checksum type are specified, set contentDigest.
    # NOTE: this is a change from eulfedora 1.1 and previous, where
    # a checksum type could be passed and Fedora would automatically calculate
    # a checksum value of the requested type; that does *not* work in
    # Fedora 3.8, so instead we rely on the auto-checksumming functionality.
    # (But also note that auto-checksumming on ingest was broken in Fedora
    # until Fedora 3.7 - https://jira.duraspace.org/browse/FCREPO-1047)
    if dsobj.checksum and dsobj.checksum_type:
        # both values are known to be set here, so no default type
        # or second checksum test is needed
        digest_xml = E('contentDigest')
        digest_xml.set('TYPE', dsobj.checksum_type)
        digest_xml.set('DIGEST', dsobj.checksum)
        ver_xml.append(digest_xml)
    elif hasattr(dsobj._raw_content(), 'read'):
        # Content exists, but no checksum, so log a warning.
        # FIXME: probably need a better way to check this.
        logger.warning("Datastream ingested without a passed checksum or checksum type: %s/%s.",
                       self.pid, dsid)

    ds_xml.append(ver_xml)
    ver_xml.append(content_node)
    return ds_xml
def _build_foxml_inline_content(self, E, dsobj):
    # wrap the datastream's content node in an xmlContent element;
    # no content node means nothing to build
    payload = dsobj._content_as_node()
    if payload is None:
        return None
    wrapper = E('xmlContent')
    wrapper.append(payload)
    return wrapper
def _build_foxml_managed_content(self, E, dsobj):
    # contentLocation element for managed content: an explicit
    # datastream location wins; otherwise local content is uploaded
    # and referenced by the id the upload returns
    location = dsobj.ds_location
    if location:
        ref, ref_type = location, 'URL'
    else:
        raw = dsobj._raw_content()
        if raw is None:
            # no content was found in either place
            return None
        ref, ref_type = self.api.upload(raw), 'INTERNAL_ID'

    if ref is None:
        return None

    content_location = E('contentLocation')
    content_location.set('REF', ref)
    content_location.set('TYPE', ref_type)
    return content_location
def _get_datastreams(self):
    """
    Get all datastreams that belong to this object.

    Returns a dictionary; key is datastream id, value is an
    :class:`ObjectDatastream` for that datastream. An object that has
    not been ingested yet yields an empty dictionary.

    :rtype: dictionary
    """
    if self._create:
        # FIXME: should we default to the datastreams defined in code?
        return {}
    # NOTE: can be accessed as a cached class property via ds_list
    response = self.api.listDatastreams(self.pid)
    listing = parse_xml_object(ObjectDatastreams, response.content, response.url)
    return {ds.dsid: ds for ds in listing.datastreams}
@property
def ds_list(self):  # NOTE: how to name to distinguish from locally configured datastream objects?
    """
    Dictionary of all datastreams that belong to this object in Fedora.
    Key is datastream id, value is an :class:`ObjectDatastream` for that
    datastream.

    Loaded through :meth:`_get_datastreams` on first access and kept
    for later accesses.
    """
    # FIXME: how to make access to a versioned ds_list ?
    if self._ds_list is not None:
        return self._ds_list
    self._ds_list = self._get_datastreams()
    return self._ds_list
@property
def methods(self):
    # get_methods() fills self._methods as a side effect
    if self._methods is not None:
        return self._methods
    self.get_methods()
    return self._methods
def get_methods(self):
    # map each service definition pid to its methods; an object that
    # has not been ingested yet has none
    if self._create:
        return {}
    response = self.api.listMethods(self.pid)
    listing = parse_xml_object(ObjectMethods, response.content, response.url)
    by_service = dict((sdef.pid, sdef.methods)
                      for sdef in listing.service_definitions)
    self._methods = by_service
    return self._methods
def getDissemination(self, service_pid, method, params=None):
    # hand the call to the API for this object's pid; missing or
    # empty params are sent as an empty dict
    method_params = params or {}
    return self.api.getDissemination(self.pid, service_pid, method,
                                     method_params=method_params)
def getDatastreamObject(self, dsid, dsobj_type=None, as_of_date=None):
    '''Get any datastream on this object as a :class:`DatastreamObject`
    **or** add a new datastream.

    A datastream previously added through this method is returned
    as-is. For a datastream that exists in Fedora, a datastream object
    is created using the label and mimetype Fedora reports; when no
    type is given, the class is picked from the mimetype (RDF and
    XML), with RELS-EXT always loaded as RDF. Any other id creates a
    new ad-hoc datastream that uses the id as its label.

    Note that if you use this method to add new datastreams you should
    be sure to set all datastream metadata appropriately for your content
    (i.e., label, mimetype, control group, etc).

    :param dsid: datastream id
    :param dsobj_type: optional :class:`DatastreamObject` type to
        be returned
    :param as_of_date: optional datetime, used to load a historical
        version of the requested datastream
    '''
    # NOTE: returning the entry from self._defined_datastreams for a
    # defined datastream is deliberately not done, because that is a
    # Datastream instead of a DatastreamObject, which is unexpected

    if dsid in self._adhoc_datastreams:
        return self._adhoc_datastreams[dsid]

    if dsid not in self.ds_list:
        new_type = DatastreamObject if dsobj_type is None else dsobj_type
        logger.info('Adding new datastream %s to %r', dsid, self)
        # NOTE: label is required to initialize a new datastream object;
        # using dsid as label since we don't have anything else.
        dsobj = new_type(self, dsid, dsid)
        # add to the adhoc datastreams so it will get ingested to fedora (if new)
        self._adhoc_datastreams[dsid] = dsobj
        # make available like a defined datastream object so ingest will work
        setattr(self, dsid, dsobj)
        # add to dscache so new datastream will be saved on existing object
        self.dscache[dsid] = dsobj
        # default modification logic should be appropriate here since the
        # calling program should be setting content, label, etc
        return dsobj

    ds_info = self.ds_list[dsid]
    # FIXME: can we take advantage of Datastream descriptor? or at least use dscache ?
    if dsobj_type is None:
        # match the mimetype against the base datastream object classes;
        # special case: rels-ext should always be loaded as rdf
        if ds_info.mimeType == RdfDatastreamObject.default_mimetype \
                or dsid == 'RELS-EXT':
            dsobj_type = RdfDatastreamObject
        elif ds_info.mimeType == XmlDatastreamObject.default_mimetype:
            dsobj_type = XmlDatastreamObject
        else:
            # default to base datastream object class
            dsobj_type = DatastreamObject

    dsobj = dsobj_type(self, dsid, label=ds_info.label,
                       mimetype=ds_info.mimeType, as_of_date=as_of_date)
    # add to dscache so modifications will be saved on existing object
    self.dscache[dsid] = dsobj
    setattr(self, dsid, dsobj)
    return dsobj
def add_relationship(self, rel_uri, obj):
    """
    Add a new relationship to the RELS-EXT for this object.
    Calls :meth:`API_M.addRelationship`.

    Example usage::

        isMemberOfCollection = 'info:fedora/fedora-system:def/relations-external#isMemberOfCollection'
        collection_uri = 'info:fedora/foo:456'
        object.add_relationship(isMemberOfCollection, collection_uri)

    :param rel_uri: URI for the new relationship
    :param obj: related object; can be :class:`DigitalObject` or string; if
        string begins with info:fedora/ it will be treated as
        a resource, otherwise it will be treated as a literal
    :returns: the result of the API call
    """
    if isinstance(rel_uri, URIRef):
        rel_uri = force_text(rel_uri)

    if isinstance(obj, DigitalObject):
        obj = obj.uri
        obj_is_literal = False
    else:
        is_text = isinstance(obj, str) or isinstance(obj, six.string_types)
        obj_is_literal = not (is_text and obj.startswith('info:fedora/'))

    # this call will change RELS-EXT, possibly creating it if it's
    # missing. remove any cached info we have for that datastream.
    self.dscache.pop('RELS-EXT', None)
    self._ds_list = None

    return self.api.addRelationship(self.pid, self.uri, rel_uri, obj, obj_is_literal)
def purge_relationship(self, rel_uri, obj):
    """
    Purge a relationship from RELS-EXT for this object.
    Calls :meth:`API_M.purgeRelationship`.

    Example usage::

        isMemberOfCollection = 'info:fedora/fedora-system:def/relations-external#isMemberOfCollection'
        collection_uri = 'info:fedora/foo:789'
        object.purge_relationship(isMemberOfCollection, collection_uri)

    :param rel_uri: URI for the existing relationship
    :param obj: related object; can be :class:`DigitalObject` or string; if
        string begins with info:fedora/ it will be treated as
        a resource, otherwise it will be treated as a literal
    :returns: the result of the API call
    """
    if isinstance(rel_uri, URIRef):
        rel_uri = force_text(rel_uri)

    if isinstance(obj, DigitalObject):
        obj = obj.uri
        obj_is_literal = False
    else:
        is_text = isinstance(obj, str) or isinstance(obj, six.string_types)
        obj_is_literal = not (is_text and obj.startswith('info:fedora/'))

    # this call will change RELS-EXT, possibly creating it if it's
    # missing. remove any cached info we have for that datastream.
    self.dscache.pop('RELS-EXT', None)
    self._ds_list = None

    return self.api.purgeRelationship(self.pid, self.uri, rel_uri, obj, obj_is_literal)
def modify_relationship(self, rel_uri, old_object, new_object):
    """
    Replace the target of an existing RELS-EXT relationship on this
    object.  The Fedora API-M has no native "modifyRelationship", so
    this is done as a purge of the old statement followed by an add of
    the new one, both using the same predicate.

    Calls :meth:`API_M.purgeRelationship`, :meth:`API_M.addRelationship`

    Example usage::

        predicate = 'info:fedora/fedora-system:def/relations-external#isMemberOfCollection'
        old_object = 'info:fedora/foo:456'
        new_object = 'info:fedora/foo:789'
        object.modify_relationship(predicate, old_object, new_object)

    :param rel_uri: URI for the existing relationship
    :param old_object: current target of the relationship; can be
        :class:`DigitalObject` or string; a string beginning with
        info:fedora/ is treated as a resource, anything else as a literal
    :param new_object: replacement target for the relationship; can be
        :class:`DigitalObject` or string; a string beginning with
        info:fedora/ is treated as a resource, anything else as a literal
    :rtype: boolean -- True only when both the purge and the add
        succeed; False when the purge fails (nothing is added) or when
        the add fails (the old relationship is then re-added)
    """
    def _as_target(target):
        # Return ``(value, is_literal)`` in the form the API calls expect.
        if isinstance(target, DigitalObject):
            return target.uri, False
        if isinstance(target, six.string_types) \
                and target.startswith('info:fedora/'):
            return target, False
        return target, True

    if isinstance(rel_uri, URIRef):
        rel_uri = force_text(rel_uri)

    old_object, old_is_literal = _as_target(old_object)
    new_object, new_is_literal = _as_target(new_object)

    # RELS-EXT is about to change on the server (and may be created if
    # it was missing), so drop whatever is cached locally for it.
    if 'RELS-EXT' in self.dscache:
        del self.dscache['RELS-EXT']
    self._ds_list = None

    # step 1: remove the old statement; give up if that does not succeed
    purged = self.api.purgeRelationship(self.pid, self.uri, rel_uri,
                                        old_object, old_is_literal)
    if purged is not True:
        return False

    # step 2: add the replacement statement
    added = self.api.addRelationship(self.pid, self.uri, rel_uri,
                                     new_object, new_is_literal)
    if added is True:
        return True

    # the add failed: put the old statement back so the relationship is
    # not lost (the result of this rollback call is not checked)
    self.api.addRelationship(self.pid, self.uri, rel_uri, old_object,
                             old_is_literal)
    return False
def has_model(self, model):
    """
    Report whether this object's RELS-EXT asserts the given content
    model.

    :param model: URI for the content model, as a string
        (currently only accepted in ``info:fedora/foo:###`` format)
    :rtype: boolean
    """
    # TODO:
    # - accept DigitalObject for model?
    # - convert model pid to info:fedora/ form if not passed in that way?
    try:
        rels_graph = self.rels_ext.content
    except RequestFailed:
        # A failed request is only acceptable when the object has no
        # RELS-EXT datastream at all; in that case it cannot subscribe
        # to any content model.  Otherwise the error is a real one.
        if "RELS-EXT" in self.ds_list:
            raise
        return False

    return (self.uriref, modelns.hasModel, URIRef(model)) in rels_graph
def get_models(self):
    """
    Return the content models this object subscribes to, as a list of
    the ``hasModel`` targets found in its RELS-EXT.  An object that has
    no RELS-EXT datastream yields an empty list.
    """
    try:
        rels_graph = self.rels_ext.content
    except RequestFailed:
        # Only swallow the failure when there is no RELS-EXT datastream
        # (so there are no content models); anything else is re-raised.
        if "RELS-EXT" in self.ds_list:
            raise
        return []

    return list(rels_graph.objects(self.uriref, modelns.hasModel))
def index_data(self):
    '''Build and return a dictionary of the default fields to index
    for searching (e.g., in Solr): top-level object properties,
    Content Model URIs, Dublin Core fields, and Fedora relations.

    Meant to be customized and extended so that a project can easily
    adjust which fields are indexed for a particular type of object.
    The returned data should be serializable as JSON (the current
    implementation uses :mod:`django.utils.simplejson`).

    This method was designed for use with :mod:`eulfedora.indexdata`.
    '''
    data = {
        'pid': self.pid,
        'label': self.label,
        'owner': self.owners,
        'state': self.state,
        # content models come back as URIRefs; index them as plain strings
        'content_model': [force_text(cmodel) for cmodel in self.get_models()],
    }

    # Dates and the datastream list are only available once the object
    # actually exists in Fedora (normally true for anything being
    # indexed, except in tests).
    if self.exists:
        # last_modified and created are configured as date type in the
        # sample solr schema; isoformat keeps them JSON-serializable
        data['last_modified'] = self.modified.isoformat()
        data['created'] = self.created.isoformat()
        # datastream ids
        data['dsids'] = list(six.iterkeys(self.ds_list))

    data.update(self.index_data_descriptive())
    data.update(self.index_data_relations())
    return data
def index_data_descriptive(self):
    '''Descriptive data contributed to :meth:`index_data` output.
    This implementation returns every Dublin Core field that has at
    least one value; extend or override it as appropriate for custom
    :class:`~eulfedora.models.DigitalObject` classes.'''
    dc_fields = ('title', 'contributor', 'coverage', 'creator', 'date',
                 'description', 'format', 'identifier', 'language',
                 'publisher', 'relation', 'rights', 'source', 'subject',
                 'type')
    # look up the "<field>_list" attribute on the DC content for each field
    field_values = ((name, getattr(self.dc.content, '%s_list' % name))
                    for name in dc_fields)
    # skip empty fields; copy xmlmap lists into plain lists so that
    # simplejson can handle them
    return {name: list(values) for name, values in field_values if values}
def index_data_relations(self):
    '''Standard Fedora relations contributed to :meth:`index_data`
    output.  This implementation looks up every relation named in
    ``fedora_rels`` in this object's RELS-EXT and includes those that
    have at least one value; extend or override it as appropriate for
    custom :class:`~eulfedora.models.DigitalObject` classes.'''
    # NOTE: hasModel relation is handled with top-level object properties
    # in index_data; other model rels (service bindings) are currently
    # not indexed
    relations = {}
    for rel_name in fedora_rels:
        targets = [
            force_text(target)
            for target in self.rels_ext.content.objects(self.uriref,
                                                        relsextns[rel_name])
        ]
        if targets:
            relations[rel_name] = targets
    return relations
class ContentModel(DigitalObject):
    """Fedora CModel object"""

    CONTENT_MODELS = ['info:fedora/fedora-system:ContentModel-3.0']

    ds_composite_model = XmlDatastream(
        'DS-COMPOSITE-MODEL',
        'Datastream Composite Model',
        DsCompositeModel,
        defaults={
            'format': 'info:fedora/fedora-system:FedoraDSCompositeModel-1.0',
            'control_group': 'X',
            'versionable': True,
        })

    @staticmethod
    def for_class(digobj, repo):
        '''Return a ContentModel object for the given
        :class:`DigitalObject` class, creating and saving it in the
        specified repository when it does not exist there yet.

        Returns ``None`` when the class declares no ``CONTENT_MODELS``;
        raises :class:`ValueError` when it declares more than one.'''
        full_name = '%s.%s' % (digobj.__module__, digobj.__name__)
        cmodels = getattr(digobj, 'CONTENT_MODELS', None)

        if not cmodels:
            logger.debug('%s has no content models', full_name)
            return None

        model_count = len(cmodels)
        if model_count > 1:
            logger.debug('%s has %d content models', full_name, model_count)
            message = ('Cannot construct ContentModel object for '
                       '%s, which has %d CONTENT_MODELS (only 1 is '
                       'supported)') % (full_name, model_count)
            raise ValueError(message)

        cmodel_uri = cmodels[0]
        logger.debug('cmodel for %s is %s', full_name, cmodel_uri)

        # first see whether the content model is already in the repository
        existing = repo.get_object(cmodel_uri, type=ContentModel,
                                   create=False)
        if existing.exists:
            logger.debug('%s already exists', cmodel_uri)
            return existing

        # it isn't there yet, so build it from the class definition
        logger.debug('creating %s from %s', cmodel_uri, full_name)
        cmodel_obj = repo.get_object(cmodel_uri, type=ContentModel,
                                     create=True)
        # XXX: should this use _defined_datastreams instead?
        for datastream in digobj._local_datastreams.values():
            composite = cmodel_obj.ds_composite_model.content
            type_model = composite.get_type_model(datastream.id, create=True)
            type_model.mimetype = datastream.default_mimetype
            # only record a format URI when the datastream declares one
            if datastream.default_format_uri:
                type_model.format_uri = datastream.default_format_uri

        cmodel_obj.save()
        return cmodel_obj
class DigitalObjectSaveFailure(Exception):
    """Raised when saving a :class:`DigitalObject` fails part-way
    through.  The exception carries enough detail to tell where the save
    failed and whether the changes saved before the failure were
    successfully rolled back.

    These properties are available:
     * obj_pid - pid of the :class:`DigitalObject` instance that failed to save
     * failure - string indicating where the failure occurred (either a datastream ID or 'object profile')
     * to_be_saved - list of datastreams that were modified and should have been saved
     * saved - list of datastreams that were successfully saved before failure occurred
     * cleaned - list of saved datastreams that were successfully rolled back
     * not_cleaned - saved datastreams that were not rolled back
     * recovered - boolean, True indicates all saved datastreams were rolled back
    """

    def __init__(self, pid, failure, to_be_saved, saved, cleaned):
        self.obj_pid = pid
        self.failure = force_text(failure)
        self.to_be_saved = to_be_saved
        self.saved = saved
        self.cleaned = cleaned
        # anything that was saved before the failure but could *not* be
        # rolled back afterwards
        self.not_cleaned = [dsid for dsid in saved if dsid not in cleaned]
        # fully recovered only when nothing was left behind
        self.recovered = not self.not_cleaned

    def __str__(self):
        saved_ids = ', '.join(self.saved)
        cleaned_ids = ', '.join(self.cleaned)
        return "Error saving %s - failed to save %s; saved %s; successfully backed out %s" \
            % (self.obj_pid, self.failure, saved_ids, cleaned_ids)