CenterForOpenScience/waterbutler

View on GitHub
waterbutler/core/metadata.py

Summary

Maintainability
A
3 hrs
Test Coverage
import abc
import typing
import hashlib

import furl

from waterbutler.core import utils
from waterbutler.server import settings


class BaseMetadata(metaclass=abc.ABCMeta):
    """The BaseMetadata object provides the base structure for all metadata returned via
    WaterButler.  It also implements the API serialization methods to turn metadata objects
    into primitive data structures suitable for serializing.

    The basic metadata response looks like::

        {
          "path": "",
          "name": "",
          "kind": "",
          "provider": "",
          "materialized": "",
          "provider": "",
          "etag": "",
          "extra": {}
        }
    """

    def __init__(self, raw: dict) -> None:
        self.raw = raw

    def serialized(self) -> dict:
        """Returns a dict of primitives suitable for serializing into JSON.

        .. note::

            This method determines the output of API v0 and v1.

        :rtype: dict
        """
        return {
            'extra': self.extra,
            'kind': self.kind,
            'name': self.name,
            'path': self.path,
            'provider': self.provider,
            'materialized': self.materialized_path,
            'etag': hashlib.sha256('{}::{}'.format(self.provider, self.etag).encode('utf-8')).hexdigest(),
        }

    def json_api_serialized(self, resource: str) -> dict:
        """Returns a dict of primitives suitable for serializing into a JSON-API -compliant
        response.  Sets the `id` and `type` attributes required by JSON-API, and stores the
        metadata under the `attributes` key.  A `links` object provides a dict of actions and the
        urls where those actions can be performed.

        .. note::

            This method determines the output of API v1.

        :rtype: dict
        """
        json_api = {
            'id': self.provider + self.path,
            'type': 'files',
            'attributes': self.serialized(),
            'links': self._json_api_links(resource),
        }
        # Typing: skip "unsupported target for indexed assignment" errors for nested dict from method
        json_api['attributes']['resource'] = resource  # type: ignore
        return json_api

    def _json_api_links(self, resource: str) -> dict:
        """ Returns a dict of action names and the endpoints where those actions are performed.

        :rtype: dict
        """
        entity_url = self._entity_url(resource)
        actions = {
            'move': entity_url,
            'upload': entity_url + '?kind=file',
            'delete': entity_url,
        }

        return actions

    def _entity_url(self, resource: str) -> str:
        """ Utility method for constructing the base url for actions. """
        url = furl.furl(settings.DOMAIN)
        segments = ['v1', 'resources', resource, 'providers', self.provider]
        # If self is a folder, path ends with a slash which must be preserved. However, furl
        # percent-encodes the trailing slash. Instead, turn folders into a list of (path_id, ''),
        # and let furl add the slash for us.  The [1:] is because path always begins with a slash,
        # meaning the first entry is always ''.
        segments += self.path.split('/')[1:]
        url.path.segments.extend(segments)

        return url.url

    def build_path(self, path) -> str:
        if not path.startswith('/'):
            path = '/' + path
        if self.kind == 'folder' and not path.endswith('/'):
            path += '/'
        return path

    @property
    def is_folder(self) -> bool:
        """ Does this object describe a folder?

        :rtype: bool
        """
        return self.kind == 'folder'

    @property
    def is_file(self) -> bool:
        """ Does this object describe a file?

        :rtype: bool
        """
        return self.kind == 'file'

    @property
    @abc.abstractmethod
    def provider(self) -> str:
        """ The provider from which this resource originated. """
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def kind(self) -> str:
        """ `file` or `folder` """
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def name(self) -> str:
        """ The user-facing name of the entity, excluding parent folder(s).
        ::

            /bar/foo.txt -> foo.txt
            /<someid> -> whatever.png
        """
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def path(self) -> str:
        """ The canonical string representation of a waterbutler file or folder.  For providers
        that track entities with unique IDs, this will be the ID.  For providers that do not, this
        will usually be the full unix-style path of the file or folder.

        .. note::

            All paths MUST start with a `/`
            All folder entities MUST end with a `/`
            File entities MUST never end with a `/`
        """
        raise NotImplementedError

    @property
    def etag(self) -> str:
        raise NotImplementedError

    @property
    def materialized_path(self) -> str:
        """ The unix-style path of the file relative the the root of the provider.  Encoded
        entities should be decoded.

        e.g.::

            path              -> /313c57f9a9edeb87139b205beaed
            name              -> Foo.txt
            materialized_path -> /Parent Folder/Foo.txt

        .. note::

            All materialized_paths MUST start with a `/`
            All folder entities MUST end with a `/`
            File entities MUST never end with a `/`

        .. note::

            Defaults to `self.path`
        """
        return self.path

    @property
    def extra(self) -> dict:
        """A dict of optional metadata from the provider.  Non-mandatory metadata should be stored
        in this property.

        While this field is not explicitly structured, the `hashes` key should be reserved for the
        following.  If the provider supplies MD5 or SHA256 hashes, those should be saved in a dict
        under the `hashes` key, with `md5`, `sha256` as the canonical key names for the hashes.
        """
        return {}

    def __eq__(self, other: 'BaseMetadata') -> bool:  # type: ignore
        return isinstance(other, self.__class__) and self.serialized() == other.serialized()


class BaseFileMetadata(BaseMetadata):
    """ The base class for WaterButler metadata objects for files.  In addition to the properties
    required by `BaseMetadata`, `BaseFileMetadata` requires the consumer to implement the
    `content_type`, `modified`, and `size` properties.  The `etag` may be added in a subclass.
    """

    def serialized(self) -> dict:
        """ Returns a dict representing the file's metadata suitable to be serialized into JSON.

        :rtype: dict
        """
        return dict(super().serialized(), **{
            'contentType': self.content_type,
            'modified': self.modified,
            'modified_utc': self.modified_utc,
            'created_utc': self.created_utc,
            'size': self.size,
            'sizeInt': self.size_as_int,
        })

    def _json_api_links(self, resource: str) -> dict:
        """ Adds the `download` link to the JSON-API repsonse `links` field.

        :rtype: dict
        """
        ret = super()._json_api_links(resource)
        ret['download'] = self._entity_url(resource)
        return ret

    @property
    def kind(self) -> str:
        """ File objects have `kind == 'file'` """
        return 'file'

    @property
    @abc.abstractmethod
    def content_type(self) -> str:
        """ MIME-type of the file (if available) """
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def modified(self) -> str:
        """ Date the file was last modified, as reported by the provider, in
        the format used by the provider. """
        raise NotImplementedError

    @property
    def modified_utc(self) -> str:
        """ Date the file was last modified, as reported by the provider,
        converted to UTC, in format (YYYY-MM-DDTHH:MM:SS+00:00). """
        return utils.normalize_datetime(self.modified)

    @property
    def created_utc(self) -> str:
        """ Date the file was created, as reported by the provider,
        converted to UTC, in format (YYYY-MM-DDTHH:MM:SS+00:00). """
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def size(self) -> typing.Union[int, str]:
        """ Size of the file in bytes. Should be a int, but some providers return a string and WB
        never casted it.  The `size_as_int` property was added to enforce this without breaking
        exisiting code and workarounds.
        """
        raise NotImplementedError

    @property
    def size_as_int(self) -> int:
        """ Size of the file in bytes.  Always an `int` or `None`. Some providers report size as a
        `str`. Both exist to maintain backwards compatibility.
        """
        try:
            size_as_int = int(self.size)
        except (TypeError, ValueError):
            size_as_int = None
        return size_as_int


class BaseFileRevisionMetadata(metaclass=abc.ABCMeta):

    def __init__(self, raw: dict) -> None:
        self.raw = raw

    def serialized(self) -> dict:
        return {
            'extra': self.extra,
            'version': self.version,
            'modified': self.modified,
            'modified_utc': self.modified_utc,
            'versionIdentifier': self.version_identifier,
        }

    def json_api_serialized(self) -> dict:
        """The JSON API serialization of revision metadata from WaterButler.

        .. note::

            This method determines the output of API v1
        """
        return {
            'id': self.version,
            'type': 'file_versions',
            'attributes': self.serialized(),
        }

    @property
    @abc.abstractmethod
    def modified(self) -> str:
        raise NotImplementedError

    @property
    def modified_utc(self) -> str:
        """ Date the revision was last modified, as reported by the provider,
        converted to UTC, in format (YYYY-MM-DDTHH:MM:SS+00:00). """
        return utils.normalize_datetime(self.modified)

    @property
    @abc.abstractmethod
    def version(self) -> str:
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def version_identifier(self) -> str:
        raise NotImplementedError

    @property
    def extra(self) -> dict:
        return {}

    def __eq__(self, other: 'BaseFileRevisionMetadata') -> bool:  # type: ignore
        return isinstance(other, self.__class__) and self.serialized() == other.serialized()


class BaseFolderMetadata(BaseMetadata):
    """ Defines the metadata structure for folders, auto defines :func:`kind`.   In addition
    to the properties required by `BaseMetadata`, `BaseFolderMetadata` does not require any
    additional properties beyond those required by `BaseMetadata`.  Provides an `etag` property
    that defaults to `None`. It also extends `BaseMetadata` to provide an accessor/mutator for
    children, which should be a list of file metadata objects that inherit from `BaseFileMetadata`.
    """

    def __init__(self, raw: dict) -> None:
        super().__init__(raw)
        self._children = None  # type: list

    def serialized(self) -> dict:
        """ Returns a dict representing the folder's metadata suitable to be serialized
        into JSON. If the `children` property has not been set, it will be excluded from
        the dict.

        :rtype: dict
        """
        ret = super().serialized()
        if self.children is not None:
            ret['children'] = [c.serialized() for c in self.children]
        return ret

    def json_api_serialized(self, resource: str) -> dict:
        """ Return a JSON-API compliant serializable dict, suitable for the WB v1 API.  Sets the
        `size` attribute to `None`, as folders do no have a size.

        :rtype: dict
        """
        ret = super().json_api_serialized(resource)
        ret['attributes']['size'] = None
        ret['attributes']['sizeInt'] = None
        return ret

    def _json_api_links(self, resource: str) -> dict:
        """ Adds the `new_folder` link to the JSON-API repsonse `links` field.

        :rtype: dict
        """
        ret = super()._json_api_links(resource)
        ret['new_folder'] = self._entity_url(resource) + '?kind=folder'
        return ret

    @property
    def children(self) -> typing.List[BaseMetadata]:
        """ (Optional) A list of child entities of the folder.  Each entity should be either a
        file or folder metadata object.  Will be `None` if the presence of children is unknown.

        :rtype: None or list of Metadata objects
        """
        return self._children

    @children.setter
    def children(self, kids: typing.List[BaseMetadata]):
        """ Assigns the given list to the children property.  The affirmative absence of child
        entities should be indicated by passing an empty list.

        :param list kids: list of children of the folder.
        """
        self._children = kids

    @property
    def kind(self) -> str:
        """ Folder metadata objects have `kind == 'folder'` """
        return 'folder'

    @property
    def etag(self) -> typing.Union[str, None]:
        """ FIXME: An etag? """
        return None