eulfedora/views.py from emory-libraries/eulfedora

eulfedora/views.py
Summary

Maintainability

5 days
Test Coverage

Issues
# file eulfedora/views.py
#
#   Copyright 2010,2011 Emory University Libraries
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

'''Generic, re-usable views for use with Fedora-based Django projects.
Intended to be analogous to `Django's generic views
<http://docs.djangoproject.com/en/1.2/topics/generic-views/>`_ .

Using these views (in the simpler cases) should be as easy as::

    from django.conf.urls import *
    from eulfedora.views import raw_datastream, raw_audit_trail

    urlpatterns = patterns('',
        url(r'^(?P<pid>[^/]+)/(?P<dsid>(MODS|RELS-EXT|DC))/$', raw_datastream),
        url(r'^(?P<pid>[^/]+)/AUDIT/$', raw_audit_trail),
    )

'''

from __future__ import unicode_literals
import logging

from django.contrib.auth import authenticate, login, REDIRECT_FIELD_NAME
from django.http import HttpResponse, Http404, HttpResponseBadRequest, \
    StreamingHttpResponse, HttpResponseRedirect
from django.views.decorators.http import require_http_methods, condition
from django.views.generic import View
import six

from eulfedora.cryptutil import encrypt
from eulfedora.server import Repository, FEDORA_PASSWORD_SESSION_KEY
from eulfedora.util import RequestFailed, parse_xml_object
from eulfedora.xml import DatastreamProfile


logger = logging.getLogger(__name__)


class HttpResponseRangeNotSatisfiable(HttpResponseBadRequest):
    '''Custom version of :class:`~django.http.HttpResponseBadRequest`
    to return a 416 response when a requested range cannot be satisfied.'''
    status_code = 416


def datastream_etag(request, pid, dsid, repo=None,
    as_of_date=None, **kwargs):
    '''Method suitable for use as an etag function with
    :class:`django.views.decorators.http.condition`.  Takes the same
    arguments as :meth:`~eulfedora.views.raw_datastream`.
    '''
    # if a range is requested and it is not for the entire file,
    # do *NOT* return an etag

    # NOTE: using api directly here instead of object/ds models
    # to avoid making unneeded api calls

    try:
        if repo is None:
            repo = Repository()
        resp = repo.api.getDatastream(pid, dsid, asOfDateTime=as_of_date)
        dsprofile = parse_xml_object(DatastreamProfile, resp.content, resp.url)
        if dsprofile and dsprofile.checksum_type != 'DISABLED':
            return dsprofile.checksum
    except RequestFailed:
        pass

    return None

def datastream_lastmodified(request, pid, dsid, repo=None,
    as_of_date=None, *args, **kwargs):
    '''Method suitable for use as a a last-modified function with
    :class:`django.views.decorators.http.condition`.  Takes basically
    the same arguments as :meth:`~eulfedora.views.raw_datastream`.
    '''
    try:
        if repo is None:
            repo = Repository()
        resp = repo.api.getDatastream(pid, dsid, asOfDateTime=as_of_date)
        dsprofile = parse_xml_object(DatastreamProfile, resp.content, resp.url)
        if dsprofile:
            return dsprofile.created
    except RequestFailed:
        pass


@condition(etag_func=datastream_etag, last_modified_func=datastream_lastmodified)
@require_http_methods(['GET', 'HEAD'])
def raw_datastream(request, pid, dsid, repo=None, headers=None,
       as_of_date=None):
    '''
    Access raw datastream content from a Fedora object.
    Returns :class:`~django.http.HttpResponse` for HEAD requests,
    :class:`~django.http.StreamingHttpResponse` for GET requests.  The
    headers and status code from Fedora response are set on the
    django response; any headers specified in the parameters will
    override Fedora headers.  If an HTTP_RANGE header is present on the
    request, it is passed through to Fedora.

    This view method is wrapped with ETag and last modified conditionals.

    :param request: HttpRequest
    :param pid: Fedora object PID
    :param dsid: datastream ID
    :param repo: :class:`~eulcore.django.fedora.server.Repository` instance to use,
        in case your application requires custom repository initialization (optional)
    :param headers: dictionary of additional headers to include in the response
    :param as_of_date: access a historical version of the datastream
    '''
    return _raw_datastream(request, pid, dsid, repo=repo, headers=headers,
       as_of_date=as_of_date)

def _raw_datastream(request, pid, dsid, repo=None, headers=None,
       as_of_date=None):
    '''Version of :meth:`raw_datastream` without conditionals, for use
    in class-based views or elsewhere.'''
    if repo is None:
        repo = Repository()

    # if a range request is present, pass it through to fedora
    rqst_headers = {}
    if 'HTTP_RANGE' in request.META:
        rqst_headers['RANGE'] = request.META['HTTP_RANGE']

    try:
        if request.method == 'HEAD':
            response = repo.api.getDatastreamDissemination(pid, dsid, asOfDateTime=as_of_date,
                head=True, rqst_headers=rqst_headers)
            dj_response = HttpResponse()
        else:
            response = repo.api.getDatastreamDissemination(pid, dsid, asOfDateTime=as_of_date,
                stream=True, rqst_headers=rqst_headers)
            dj_response = StreamingHttpResponse(response.iter_content(4096))
    except RequestFailed as rf:
        # if error is object not found, raise generic django 404
        if rf.code == 404:
            raise Http404

        # otherwise, raise the error
        raise

    # make sure django response code matches fedora code
    # e.g. error code or 206 partial content for range requests
    dj_response.status_code = response.status_code

    # copy fedora response headers to the django response
    resp_headers = response.headers
    # any headers passed in should take precedence
    if headers is not None:
        resp_headers.update(headers)
    # etag needn't always be content md5, but for fedora datastreams it is
    if 'ETag' in resp_headers:
        resp_headers['Content-MD5'] = resp_headers['ETag']

    for header, value in six.iteritems(resp_headers):
        dj_response[header] = value

    return dj_response


@condition(etag_func=datastream_etag)
@require_http_methods(['GET', 'HEAD'])
def raw_datastream_old(request, pid, dsid, type=None, repo=None,
                       headers=None, accept_range_request=False,
                       as_of_date=None, streaming=False):
    '''
    .. NOTE::

        This version of :meth:`raw_datastream` is deprecated, and you
        should update to the new :meth:`raw_datastream`.  This version
        is still available if you are using a version of Fedora
        prior to 3.7 and need the additional functionality.

    View to display a raw datastream that belongs to a Fedora Object.
    Returns an :class:`~django.http.HttpResponse` with the response content
    populated with the content of the datastream.  The following HTTP headers
    may be included in all the responses:

    - Content-Type: mimetype of the datastream in Fedora
    - ETag: datastream checksum, as long as the checksum type is not 'DISABLED'

    The following HTTP headers may be set if the appropriate content is included
    in the datastream metadata:

    - Content-MD5: MD5 checksum of the datastream in Fedora, if available
    - Content-Length: size of the datastream in Fedora

    If either the datastream or object are not found, raises an
    :class:`~django.http.Http404` .  For any other errors (e.g., permission
    denied by Fedora), the exception is re-raised and should be handled elsewhere.

    :param request: HttpRequest
    :param pid: Fedora object PID
    :param dsid: datastream ID to be returned
    :param type: custom object type (should extend
        :class:`~eulcore.fedora.models.DigitalObject`) (optional)
    :param repo: :class:`~eulcore.django.fedora.server.Repository` instance to use,
        in case your application requires custom repository initialization (optional)
    :param headers: dictionary of additional headers to include in the response
    :param accept_range_request: enable HTTP Range requests (disabled by default)
    :param as_of_date: access a historical version of the datastream
    :param streaming: if True, response will be returned as an instance of
        :class:`django.http.StreamingHttpResponse` instead of
        :class:`django.http.HttpResponse`; intended for use with large
        datastreams, defaults to False.
    '''

    if repo is None:
        repo = Repository()
    if headers is None:
        headers = {}

    get_obj_opts = {}
    if type is not None:
        get_obj_opts['type'] = type
    obj = repo.get_object(pid, **get_obj_opts)

    range_request = False
    partial_request = False

    try:
        # NOTE: we could test that pid is actually the requested
        # obj.has_requisite_content_models but that would mean
        # an extra API call for every datastream but RELS-EXT
        # Leaving out for now, for efficiency

        ds = obj.getDatastreamObject(dsid, as_of_date=as_of_date)

        if ds and ds.exists:
            # because retrieving the content is expensive and checking
            # headers can be useful, explicitly support HEAD requests
            if request.method == 'HEAD':
                content = ''

            elif accept_range_request and request.META.get('HTTP_RANGE', None) is not None:
                rng = request.META['HTTP_RANGE']
                logger.debug('HTTP Range request: %s', rng)
                range_request = True
                kind, numbers = rng.split('=')
                if kind != 'bytes':
                    return HttpResponseRangeNotSatisfiable()

                try:
                    start, end = numbers.split('-')
                    # NOTE: could potentially be complicated stuff like
                    # this: 0-999,1002-9999,1-9999
                    # for now, only support the simple case of a single range
                except ValueError:
                    return HttpResponseRangeNotSatisfiable()

                start = int(start)
                if not end:
                    end = ds.info.size - 1
                else:
                    end = int(end)

                # ignore requests where end is before start
                if end < start:
                    return HttpResponseRangeNotSatisfiable()

                if start == end:  # safari sends this (weird?); don't 500
                    partial_length = 0
                    partial_request = True
                    content = ''

                # special case for bytes=0-
                elif start == 0 and end == (ds.info.size - 1):
                    # set chunksize and end so range headers can be set on response
                    # partial_length= ds.info.size
                    partial_length = end - start

                    content = ds.get_chunked_content()

                # range with *NOT* full content requested
                elif start != 0 or end != (ds.info.size - 1):
                    partial_request = True
                    partial_length = end - start
                    # chunksize = min(end - start, 4096)
                    # sample chunk 370726-3005759
                    content = get_range_content(ds, start, end)

            else:
                # get the datastream content in chunks, to handle larger datastreams
                content = ds.get_chunked_content()
                # not using serialize(pretty=True) for XML/RDF datastreams, since
                # we actually want the raw datastream content.

            http_response_class = HttpResponse
            if streaming:
                http_response_class = StreamingHttpResponse
            response = http_response_class(content, content_type=ds.mimetype)
            # NOTE: might want to use StreamingHttpResponse here, at least
            # over some size threshold or for range requests

            # if we have a checksum, use it as an ETag
            # (but checksum not valid when sending partial content)
            if ds.checksum_type != 'DISABLED' and not partial_request:
                response['ETag'] = ds.checksum
            # ds.created is the creation date of this *version* of the datastream,
            # so it is effectively our last-modified date
            response['Last-Modified'] = ds.created

            # Where available, set content length & MD5 checksum in response headers.
            # (but checksum not valid when sending partial content)
            if ds.checksum_type == 'MD5' and not partial_request:
                response['Content-MD5'] = ds.checksum
            if ds.info.size and not range_request:
                response['Content-Length'] = ds.info.size
            if ds.info.size and accept_range_request:
                response['Accept-Ranges'] = 'bytes'
                # response['Content-Range'] = '0,%d/%d' % (ds.info.size, ds.info.size)

            # if partial request, status should be 206 (even for whole file?)
            if range_request:
                response.status_code = 206
                if partial_request:
                    response['Content-Length'] = partial_length
                else:
                    response['Content-Length'] = ds.info.size
                cont_range = 'bytes %d-%d/%d' % (start, end, ds.info.size)
                response['Content-Range'] = cont_range
                logger.debug('Content-Length=%s Content-Range=%s',
                             partial_length, cont_range)

            # set any user-specified headers that were passed in
            for header, val in six.iteritems(headers):
                response[header] = val

            # Fix for old Fedora data bug where the `Content-Length`
            # was -1. IF it is -1 we're just going to get rid of it.
            # Setting the value to an arbitrary value led to issues.
            if int(response['Content-Length']) < 0:
                del response['Content-Length']

            return response
        else:
            raise Http404

    except RequestFailed as rf:
        # if object is not the speficied type or if either the object
        # or the requested datastream doesn't exist, 404
        if rf.code == 404 or \
            (type is not None and not obj.has_requisite_content_models) or \
                not getattr(obj, dsid).exists or not obj.exists:
            raise Http404

        # for anything else, re-raise & let Django's default 500 logic handle it
        raise

def get_range_content(ds, start, end):
    '''Generator for range-requested datastream content.  Iterates over
    datastream content in chunks, and yields the chunks (or partial chunks)
    that are part of the requested range.'''
    if not end or end > ds.info.size:
        end = ds.info.size - 1
    chunksize = 4096

    content_chunks = ds.get_chunked_content(chunksize=chunksize)
    length = 0
    for i in range(int(end/chunksize) + 10):
        chunk_start = chunksize  * i
        chunk_end = chunk_start + chunksize

        # probably shouldn't run out of data, but in case data doesn't
        # match datastream metadata size in fedora...
        try:
            content = next(content_chunks)
        except StopIteration:
            break

        real_chunksize = len(content)

        if chunk_start <= start < chunk_end:
            # start of range is somewhere in the current chunk
            # get the section of requested content at start index
            content = content[start - chunk_start:]

            # range could also *end* in same chunk where it starts
            if chunk_start < end <= chunk_end:

                # trim based on *actual* size of current chunk (before any
                # start trim), since last chunk may not be fullsize
                end_trim = -(chunk_start + real_chunksize - end)
                if end_trim:
                    content = content[:end_trim]
                length += len(content)
                yield content

                # stop - hit the end of the range
                break
            else:
                length += len(content)
                yield content

        elif chunk_start < end <= chunk_end:
            # end of range is in this chunk; trim if necessary, then stop

            # trim based on *actual* size of current chunk (before any
            # start trimming), since last chunk may not be fullsize
            content = content[:-(chunk_start + real_chunksize - end)]

            length += len(content)
            yield content
            # stop - hit the end of the range
            break

        elif chunk_start > start  and chunk_end < end:
            # chunk is somewhere in the range of start - end
            length += len(content)
            yield content

    logger.debug('total content length returned is %d', length)


@require_http_methods(['GET'])
def raw_audit_trail(request, pid, type=None, repo=None):
    '''View to display the raw xml audit trail for a Fedora Object.
    Returns an :class:`~django.http.HttpResponse` with the response content
    populated with the content of the audit trial.

    If the object is not found or does not have an audit trail, raises
    an :class:`~django.http.Http404` .  For any other errors (e.g.,
    permission denied by Fedora), the exception is not caught and
    should be handled elsewhere.

    :param request: HttpRequest
    :param pid: Fedora object PID
    :param repo: :class:`~eulcore.django.fedora.server.Repository` instance to use,
        in case your application requires custom repository initialization (optional)

    .. Note::

      Fedora does not make checksums, size, or other attributes
      available for the audit trail (since it is internal and not a
      true datastream), so the additional headers included in
      :meth:`raw_datastream` cannot be added here.

    '''

    if repo is None:
        repo = Repository()
    # no special options are *needed* to access audit trail, since it
    # is available on any DigitalObject; but a particular view may be
    # restricted to a certain type of object
    get_obj_opts = {}
    if type is not None:
        get_obj_opts['type'] = type
    obj = repo.get_object(pid, **get_obj_opts)
    # object exists and has a non-empty audit trail
    if obj.exists and obj.has_requisite_content_models and obj.audit_trail:
        response = HttpResponse(obj.audit_trail.serialize(),
                            content_type='text/xml')
        # audit trail is updated every time the object gets modified
        response['Last-Modified'] = obj.modified
        return response

    else:
        raise Http404

    # any other errors should be caught elsewhere


def login_and_store_credentials_in_session(request, *args, **kwargs):
    '''Custom login view.  Calls the standard Django authentication,
    but on successful login, stores encrypted user credentials in
    order to allow accessing the Fedora repository with the
    credentials of the currently-logged in user (e.g., when the
    application and Fedora share a common authentication system, such
    as LDAP).

    In order for :class:`~eulcore.django.fedora.server.Repository` to
    pick up user credentials, you must pass the request object in (so
    it will have access to the session).  Example::

        from eulcore.django.fedora.server import Repository

        def my_view(rqst):
            repo = Repository(request=rqst)


    Any arguments supported by :meth:`django.contrib.auth.views.login`
    can be specified and they will be passed along for the standard
    login functionality.

    **This is not a terribly secure.  Do NOT use this method unless
    you need the functionality.**

    '''
    redirect_to = request.POST.get(REDIRECT_FIELD_NAME, request.GET.get(REDIRECT_FIELD_NAME, ''))
    if request.method == "POST":
        username = request.POST.get('username')
        password = request.POST.get('password')
        user = authenticate(request, username=username, password=password)
        if user is not None:
            login(request, user)
            response = HttpResponseRedirect(redirect_to)
        else:
            response = HttpResponse('401 Unauthorized', status=401)

    if request.method == "POST" and request.user.is_authenticated:
        # on successful login, encrypt and store user's password to use for fedora access
        request.session[FEDORA_PASSWORD_SESSION_KEY] = encrypt(request.POST.get('password'))
    return response



# class-based views

class RawDatastreamViewOld(View):
    '''Class-based view for serving out datastream content from Fedora.
    (View-based wrapper for :meth:`raw_datastream_old`.)
    '''
    #: subclass of DigitalObject, if needed
    object_type = None
    #: datastream id
    datastream_id = ''
    #: Enable range requests (default: False)
    accept_range_request = False
    #: url kwarg term for retrieving object pid (default: pid)
    pid_url_kwarg = 'pid'
    #: url kwarg term for retrieving date time, if used (default: date)
    as_of_date_url_kwarg = 'date'
    #: streaming response option (default: False)
    streaming = False
    #: Repository class to use, if needed
    repository_class = Repository
    #: extra http headers to include
    headers = {}

    @classmethod
    def etag(cls, request, *args, **kwargs):
        '''Class method to generate an ETag for use with
        conditional processing; calls :meth:`datastream_etag` with
        class configuration.'''
        pid = kwargs[cls.pid_url_kwarg]
        date = kwargs.get(cls.as_of_date_url_kwarg, None)
        return datastream_etag(
            request, pid, cls.datastream_id,
            type=cls.object_type, repo=cls.repository_class(request=request),
            accept_range_request=cls.accept_range_request, as_of_date=date)

    @classmethod
    def last_modified(cls, request, *args, **kwargs):
        '''Class method to generate last-modified header for use with
        conditional processing; calls :meth:`datastream_lastmodified` with
        class configuration.'''
        pid = kwargs[cls.pid_url_kwarg]
        date = kwargs.get(cls.as_of_date_url_kwarg, None)
        return datastream_lastmodified(
            request, pid, cls.datastream_id,
            type=cls.object_type, repo=cls.repository_class(request=request),
            accept_range_request=cls.accept_range_request, as_of_date=date)

    @classmethod
    def as_view(cls, **initkwargs):
        view = super(RawDatastreamViewOld, cls).as_view(**initkwargs)
        # wrap view with conditional decorator for etag/last-modified
        return condition(etag_func=cls.etag,
            last_modified_func=cls.last_modified)(view)

    def get_datastream_id(self):
        return self.datastream_id

    def get_repository(self):
        '''Initialize and return the configured repository class,
        passing in the current request.'''
        return self.repository_class(request=self.request)

    def get_headers(self):
        '''Return headers to be included when generating the datastream
        content response.  Default implementation is to return
        :attr:`headers`.'''
        return self.headers

    def head(self, request, *args, **kwargs):
        # raw_datastream method handles both head and get
        return self.get(request, *args, **kwargs)

    def get(self, request, *args, **kwargs):
        pid = kwargs[self.pid_url_kwarg]
        date = kwargs.get(self.as_of_date_url_kwarg, None)
        return raw_datastream_old(request, pid, self.get_datastream_id(),
            type=self.object_type, repo=self.get_repository(),
            headers=self.get_headers(),
            accept_range_request=self.accept_range_request,
            as_of_date=date, streaming=self.streaming)


class RawDatastreamView(View):
    '''Class-based view for serving out datastream content from Fedora.
    (View-based wrapper for :meth:`raw_datastream`.)
    '''
    #: datastream id
    datastream_id = ''
    #: url kwarg term for retrieving object pid (default: pid)
    pid_url_kwarg = 'pid'
    #: url kwarg term for retrieving date time, if used (default: date)
    as_of_date_url_kwarg = 'date'
    #: Repository class to use, if needed
    repository_class = Repository
    #: extra http headers to include
    headers = {}

    @classmethod
    def etag(cls, request, *args, **kwargs):
        '''Class method to generate an ETag for use with
        conditional processing; calls :meth:`datastream_etag` with
        class configuration.'''
        pid = kwargs[cls.pid_url_kwarg]
        date = kwargs.get(cls.as_of_date_url_kwarg, None)
        return datastream_etag(request, pid, cls.datastream_id,
            repo=cls.repository_class(request=request),
            as_of_date=date)

    @classmethod
    def last_modified(cls, request, *args, **kwargs):
        '''Class method to generate last-modified header for use with
        conditional processing; calls :meth:`datastream_lastmodified` with
        class configuration.'''
        pid = kwargs[cls.pid_url_kwarg]
        date = kwargs.get(cls.as_of_date_url_kwarg, None)
        return datastream_lastmodified(request, pid, cls.datastream_id,
            repo=cls.repository_class(request=request),
            as_of_date=date)

    @classmethod
    def as_view(cls, **initkwargs):
        view = super(RawDatastreamView, cls).as_view(**initkwargs)
        # wrap view with conditional decorator for etag/last-modified
        return condition(etag_func=cls.etag,
            last_modified_func=cls.last_modified)(view)

    def get_datastream_id(self):
        return self.datastream_id

    def get_repository(self):
        '''Initialize and return the configured repository class,
        passing in the current request.'''
        return self.repository_class(request=self.request)

    def get_headers(self):
        '''Return headers to be included when generating the datastream
        content response.  Default implementation is to return
        :attr:`headers`.'''
        return self.headers

    def head(self, request, *args, **kwargs):
        # raw_datastream method handles both head and get
        return self.get(request, *args, **kwargs)

    def get(self, request, *args, **kwargs):
        pid = kwargs[self.pid_url_kwarg]
        date = kwargs.get(self.as_of_date_url_kwarg, None)
        return _raw_datastream(request, pid, self.get_datastream_id(),
            repo=self.get_repository(), headers=self.get_headers(),
            as_of_date=date)