uchicago-sg/caravel
caravel/model/full_text.py

from google.appengine.ext import ndb
from google.appengine.api import memcache
from caravel.model import utils
import json
import itertools

FTS = "fts:"
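# Cache layout: each indexed word maps to a memcache entry keyed "fts:<word>"
# whose value is a JSON list of urlsafe() entity keys, e.g.
# "fts:bike" -> '["<urlsafe key>", "<urlsafe key>", ...]' (illustrative).
# matching() below intersects these per-word lists to answer multi-word queries.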


def tokenize(value):
    """Parses the given string into words."""
    return value.lower().split()


def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return itertools.izip_longest(fillvalue=fillvalue, *args)
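
# In matching() below, grouper() batches datastore keys into chunks of 12 so
# that utils.get_keys() can fetch them a chunk at a time.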


class AvoidRereadingFromPBProperty(ndb.ComputedProperty):
    """
    Since ComputedProperty values are overwritten on writes anyway, we might
    as well skip the expensive deserialization step when reading.
    """

    def _deserialize(self, ent, property, depth=1): pass


class FullTextMixin(ndb.Model):
    keywords = AvoidRereadingFromPBProperty(
        lambda self: list(set(self._keywords())), repeated=True)

    @classmethod
    def get_by_id(klass, key_name):
        """Avoid the NDB cache since it sux."""
        return utils.get_keys([ndb.Key(klass.__name__, key_name)])[0]

    @classmethod
    def matching(klass, query, offset=0, limit=None):
        """Returns entities matching the given search query."""
        # Note: offset and limit are currently unused.

        # With no search terms, stream every indexed entity, newest first.
        words = tokenize(query)
        if not words:
            keys = klass.query().order(-klass.posted_at).iter(keys_only=True)
            for chunk in grouper(keys, n=12):
                for entity in utils.get_keys([key for key in chunk if key]):
                    if entity and entity._keywords():
                        yield entity
            return

        # Look up each word's cached key list.
        results = memcache.get_multi(words, key_prefix=FTS)
        matches = None
        writeback = {}

        # Fill in anything missing from the cache and intersect per-word key sets.
        for word in words:
            if word in results:
                keys = [ndb.Key(urlsafe=val) for val in json.loads(results[word])]
            else:
                subquery = klass.query(klass.keywords == word)
                keys = subquery.order(-klass.posted_at).fetch(keys_only=True)
                writeback[word] = json.dumps([key.urlsafe() for key in keys])

            matches = set(keys) if matches is None else (matches & set(keys))

        # Write back any key lists that were missing from the cache.
        memcache.set_multi(writeback, key_prefix=FTS, time=(3600 * 24 * 7))

        # Keep the last word's posted_at ordering, restricted to the keys that
        # matched every word.
        keys = [key for key in keys if key in matches]

        for chunk in grouper(keys, n=12):
            for entity in utils.get_keys([key for key in chunk if key]):
                # Re-check keywords so potentially stale cache entries are elided.
                if entity and set(words).issubset(set(entity.keywords)):
                    yield entity

    def _post_put_hook(self, future):
        """Clear the cache of entries maching these keywords."""

        memcache.delete_multi(self.keywords, key_prefix=FTS)
        utils.invalidate_keys([self.key])

        return super(FullTextMixin, self)._post_put_hook(future)

    def _tokenize(self, field_name, value):
        """Generates a list of keywords for this entity."""

        words = tokenize(value)
        result = []
        for word in words:
            result.extend([field_name + ":" + word, word])
        return result

    def _keywords(self):
        """Override to decide what keywords to use for this entity."""

        return []
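

# Usage sketch (illustrative; not part of the original module). A hypothetical
# "Listing" model mixes in FullTextMixin; the concrete model must define a
# posted_at property (matching() orders on it) and override _keywords(). The
# model and field names below are assumptions made for this example.
#
#   class Listing(FullTextMixin):
#       title = ndb.StringProperty()
#       description = ndb.TextProperty()
#       posted_at = ndb.DateTimeProperty(auto_now_add=True)
#
#       def _keywords(self):
#           return (self._tokenize("title", self.title or "") +
#                   self._tokenize("description", self.description or ""))
#
#   # Then, e.g. in a request handler (App Engine Python 2.7 runtime assumed):
#   for listing in Listing.matching("road bike"):
#       ...  # render or otherwise use each matching listing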