eulexistdb/query.py
# file eulexistdb/query.py
#
# Copyright 2010,2011 Emory University Libraries
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Developer Note: how to add a new "special field" to the queryset:
# 1. Near the top of the class Xquery, add the field name to the
# special_fields list variable
# 2. In the getQuery function in the section "define any special
# fields that have been requested" add an elif case for your field
# 3. It is possible that you will need to add an elif case in the
# _create_return_class function to return the correct type
# 4. Make sure the the correct fieldType is imported
"""Provide a prettier, more Pythonic approach to eXist-db access.
This module provides :class:`QuerySet` modeled after `Django QuerySet`_
objects. It's not dependent on Django at all, but it aims to function as a
stand-in replacement in any context that expects one.
.. _Django QuerySet: http://docs.djangoproject.com/en/1.1/ref/models/querysets/
"""
from lxml import etree
from lxml.builder import ElementMaker
import re
from eulxml import xmlmap
from eulxml.xmlmap import load_xmlobject_from_string
from eulxml.xmlmap.core import XmlObjectType
from eulxml.xpath import ast, parse, serialize
from eulexistdb import db
from eulexistdb.exceptions import DoesNotExist, ReturnedMultiple
import logging
logger = logging.getLogger(__name__)
__all__ = ['QuerySet', 'Xquery']
# TODO: update field info (currently only name/xpath?) passed to Query
# object to include field type (e.g., StringField, NodeField) so that we
# can handle Node and List field types more intelligently.
# Note that any changes in the return structure for NodeFields will
# most likely require a corresponding change in the _create_return_class function
class QuerySet(object):
"""Lazy eXist database lookup for a set of objects.
:param model: the type of object to return from :meth:`__getitem__`. If
set, the resulting xml nodes will be wrapped in objects of
this type. Some methods, like :meth:`filter` and
:meth:`only` only make sense if this is set. While this
argument can be any callable object, it is typically a
subclass of :class:`~eulxml.xmlmap.XmlObject`.
:param xpath: an XPath_ expression where this `QuerySet` will begin
filtering. Typically this is left out, beginning with an
unfiltered collection: Filtering is then added with
:meth:`filter`.
:param using: The :class:`~eulexistdb.db.ExistDB` to query against.
:param collection: If set, search only within a particular eXist-db
collection. Otherwise search all collections.
:param xquery: Override the entire :class:`Xquery` object used for
internal query serialization. Most code will leave this
unset, which uses a default :class:`Xquery`.
:param fulltext_options: optional dictionary of fulltext options to be used
as settings for any full-text queries. See
http://demo.exist-db.org/lucene.xml#N1047C for available options.
Requires a version of eXist that supports this feature.
.. _XPath: http://www.w3.org/TR/xpath/
"""
# pre-compile regular expression for pulling sort flags off beginning of sort field
_sort_field_re = re.compile(r'^(?P<flags>[~-]*)(?P<field>.*)$')
default_chunk_size = 100
def __init__(self, model=None, xpath=None, using=None, collection=None,
xquery=None, fulltext_options=None):
self.model = model
self._db = using
if fulltext_options is None:
fulltext_options = {}
# remove leading / from collection name if present
collection = collection.lstrip('/') if collection is not None else None
if xquery:
self.query = xquery
else:
xq_opts = {'xpath': xpath, 'collection': collection,
'fulltext_options': fulltext_options}
if model and hasattr(model, 'ROOT_NAMESPACES'):
xq_opts['namespaces'] = model.ROOT_NAMESPACES
self.query = Xquery(**xq_opts)
self._result_id = None
self.partial_fields = {}
self.additional_fields = {}
self._count = None
self._result_cache = {}
self._start = 0
self._stop = None
self._return_type = None
self._highlight_matches = False
def __del__(self):
# release any queries in eXist
if self._result_id is not None:
self._release_query_result()
def _release_query_result(self):
# tell eXist we are done with this result set
self._db.query(release=self._result_id)
self._result_id = None
@property
def result_id(self):
"""Return the cached server result id, executing the query first if
it has not yet executed."""
if self._result_id is None:
self._runQuery()
return self._result_id
def count(self):
"""Return the cached query hit count, executing the query first if
it has not yet executed."""
# if we are in a sliced subset of the query result with a defined end,
# return the slice length
if self._stop is not None:
return self._stop - self._start
if self._count is None:
self._runQuery()
# self._count = self._db.getHits(self.result_id)
return self._count - self._start
# NOTE: for now queryTime is only available in xmlrpc, not REST api;
# but future versions of eXist may include it as an attribute on
# the exist result
#
# def queryTime(self):
# """Return the time (in milliseconds) it took for eXist to run the
# query, running the query first if it has not yet executed."""
# # FIXME: should summary be cached ?
# summary = self._db.querySummary(self.result_id)
# return summary['queryTime']
def _getCopy(self):
"""Get a clone of the current QuerySet for modification via
:meth:`filter`, :meth:`order`, etc."""
# copy current queryset - for modification via filter/order/etc
copy = QuerySet(model=self.model, xquery=self.query.getCopy(), using=self._db)
copy.partial_fields = self.partial_fields.copy()
copy.additional_fields = self.additional_fields.copy()
copy._highlight_matches = self._highlight_matches
# reset result cache, session id, and count if set,
# because any filters will change them
copy._result_cache = {}
copy._result_id = None
copy._count = None
return copy
def filter(self, combine='AND', **kwargs):
"""Filter the QuerySet to return a subset of the documents.
Arguments take the form ``lookuptype`` or ``field__lookuptype``,
where ``field`` is the name of a field in the QuerySet's :attr:`model`
and ``lookuptype`` is one of:
* ``exact`` -- The field or object matches the argument value.
* ``contains`` -- The field or object contains the argument value.
* ``startswith`` -- The field or object starts with the argument
value.
* ``fulltext_terms`` -- the field or object contains any of the the argument
terms anywhere in the full text; requires a properly configured lucene index.
By default, highlighting is enabled when this filter is used. To turn it off,
specify an additional filter of highlight=False.
Recommend using ``fulltext_score`` for ordering, in return fields.
* ``highlight`` - highlight search terms; when used with ``fulltext_terms``,
should be specified as a boolean (enabled by default); when used separately,
takes a string using the same search format as ``fulltext_terms``, but
content will be returned even if it does not include the search terms.
Requires a properly configured lucene index.
* ``in`` - field or object is present in a list of values
* ``exists`` - field or object is or is not present in the document;
if True, field must be present; if False, must not be present.
* ``document_path`` - restrict the query to a single document;
this must be a document path as returned by eXist, with full db path
* ``gt``, ``gte``, ``lt``, ``lte`` - greater than, greater than or equal,
less than, less than or equal
Field may be in the format of field__subfield when field is an NodeField
or NodeListField and subfield is a configured element on that object.
Field may also be one of the prefined 'special' fields; see :meth:`only`
for the list of fields.
Any number of these filter arguments may be passed. This method
returns an updated copy of the QuerySet: It does not modify the
original.
:param combine: optional; specify how the filters should be combined.
Defaults to ``AND``; also supports ``OR``.
"""
# possible future lookup types:
# gt/gte,lt/lte, endswith, range, date, isnull (?), regex (?)
# search (full-text search with full-text indexing - like contains but faster)
qscopy = self._getCopy()
for arg, value in kwargs.iteritems():
fields, rest = _split_fielddef(arg, self.model)
if rest and rest not in qscopy.query.available_filters:
# check if xpath portion is actually an xquery predefined field
parts = _extract_fieldpart(arg)
if parts[0] in qscopy.query.special_fields and \
parts[1] in qscopy.query.available_filters:
xpath = parts[0]
lookuptype = parts[1]
else:
# there's leftover stuff and it's not a filter we recognize.
# assume the entire arg is actually one big xpath.
xpath = arg
lookuptype = 'exact'
else:
# valid filter, or no filter at all
xpath = _join_field_xpath(fields) or '.'
lookuptype = rest or 'exact'
if lookuptype == 'document_path':
qscopy.query.set_document(value)
# highlighting is only an xquery filter when passed as a string
elif lookuptype != 'highlight' or \
lookuptype == 'highlight' and not isinstance(value, bool):
qscopy.query.add_filter(xpath, lookuptype, value, combine)
# enable highlighting when a full-text query is used
if lookuptype == 'fulltext_terms':
# boolean highlight setting overrides default
if 'highlight' in kwargs and isinstance(kwargs['highlight'], bool):
qscopy._highlight_matches = kwargs['highlight']
qscopy.query.highlight = kwargs['highlight']
else:
qscopy._highlight_matches = True
qscopy.query.highlight = True
if lookuptype == 'highlight':
qscopy.query.highlight = value
if isinstance(value, bool):
# boolean - only triggers eXist highlighting in xml return
qscopy._highlight_matches = value
else:
# terms to highlight - enable highlighting in xml return
qscopy._highlight_matches = True
# return copy query string so additional filters can be added or get() called
return qscopy
def or_filter(self, **kwargs):
"""Filter the QuerySet to return a subset of the documents, but combine
the filters with OR instead of AND. Uses the same syntax and allows
for the same filters as :meth:`filter` with the exception that currently
predefined special fields (see :meth:`only`) are not supported.
"""
return self.filter(combine='OR', **kwargs)
def order_by(self, field):
"""Order results returned according to a specified field. By default,
all sorting is case-sensitive and in ascending order.
:param field: the name (a string) of a field in the QuerySet's
:attr:`model`. If the field is prefixed with '-', results
will be sorted in descending order. If the field is
prefixed with '~', results will use a case-insensitive
sort. The flags '-' and '~' may be combined in any order.
Example usage::
queryset.filter(fulltext_terms='foo').order_by('-fulltext_score')
queryset.order_by('~name')
This method returns an updated copy of the QuerySet. It does not
modify the original.
"""
sort_opts = {}
# use a regular expression to pull off any sort flags
match = self._sort_field_re.match(field)
field_parts = match.groupdict()
field = field_parts['field']
sort_flags = field_parts['flags']
# convert sort flags into options for xquery sort method
sort_opts = {}
if '-' in sort_flags:
sort_opts['ascending'] = False
if '~' in sort_flags:
sort_opts['case_insensitive'] = True
# TODO: allow multiple fields
xpath = _simple_fielddef_to_xpath(field, self.model) or field
qscopy = self._getCopy()
qscopy.query.sort(xpath, **sort_opts)
return qscopy
def order_by_raw(self, xpath, ascending=True):
"""Order results returned by a raw XPath.
:param xpath: the xpath to be used
This method returns an updated copy of the QuerySet. It does not
modify the original.
Example usage::
qs.order_by_raw('min(%(xq_var)s//date/string())')
"""
qscopy = self._getCopy()
qscopy.query.sort_raw(xpath, ascending=ascending)
return qscopy
def only(self, *fields):
"""Limit results to include only specified fields.
:param fields: names of fields in the QuerySet's :attr:`model`
This method returns an updated copy of the QuerySet: It does not
modify the original. When results are returned from the updated
copy, they will contain only the specified fields.
Special fields available:
* ``fulltext_score`` - lucene query; should only be used when a fulltext
query has been used
* ``document_name``, ``collection_name`` - document or collection name
where xml content is stored in eXist
* ``hash`` - generate and return a SHA-1 checksum of the root element being queried
* ``last_modified`` - :class:`~eulxml.xmlmap.fields.DateTimeField` for the date
the document the xml element belongs to was last modified
**NOTE:** Be aware that this will result in an XQuery with a constructed return.
For large queries, this may have a significant impact on performance.
For more details, see http://exist.sourceforge.net/tuning.html#N103A2 .
"""
field_objs = {}
field_xpath = {}
for f in fields:
fieldlist, rest = _split_fielddef(f, self.model)
if fieldlist and not rest:
field_objs[f] = fieldlist
field_xpath[f] = _join_field_xpath(fieldlist)
else:
field_objs[f] = f
field_xpath[f] = f
qscopy = self._getCopy()
qscopy.partial_fields.update(field_objs)
qscopy.query.return_only(field_xpath)
return qscopy
def also(self, *fields):
"""Return additional data in results.
:param fields: names of fields in the QuerySet's :attr:`model`
This method returns an updated copy of the QuerySet: It does not
modify the original. When results are returned from the updated
copy, they will contain the specified additional fields.
For special fields available, see :meth:`only`.
For performance considerations, see note on :meth:`only`.
"""
field_objs = {}
field_xpath = {}
for f in fields:
fieldlist, rest = _split_fielddef(f, self.model)
if fieldlist and not rest:
field_objs[f] = fieldlist
field_xpath[f] = _join_field_xpath(fieldlist)
else:
field_objs[f] = f
field_xpath[f] = f
qscopy = self._getCopy()
qscopy.additional_fields.update(field_objs)
qscopy.query.return_also(field_xpath)
return qscopy
def also_raw(self, **fields):
'''Return an additional field by raw xpath. Similar to (and can be
combined with) :meth:`also`, but xpath is not pulled from the model.
Use this when you want to retrieve a field with a different xpath
than the one configured in your model. See :meth:`Xquery.return_only`
for details on specifying xpaths in raw mode.
:param fields: field name and xpath in keyword-args notation. If
**field** is the name of a field on the associated model, the result
of the raw xpath should be accessible on the return object as the
normal property.
:param xpath: xpath for retrieving the specified field
Can be combined with :meth:`also`.
Example usage::
qs.also_raw(field_matches='count(util:expand(%(xq_var)s//field)//exist:match)')
'''
return self._raw_field(also=True, **fields)
def only_raw(self, **fields):
'''Limit results to include only specified fields, and return the specified
field by xpath. Similar to (and can be combined with) :meth:`only`. See
:meth:`Xquery.return_only` for details on specifying xpaths in raw mode.
See :meth:`also_raw` for more details and usage example.
'''
return self._raw_field(only=True, **fields)
def _raw_field(self, only=False, also=False, **fields):
'Common functionality for :meth:`also_raw` and :meth:`only_raw`.'
field_objs = {}
field_xpath = {}
for field, xpath in fields.iteritems():
field_xpath[field] = xpath
fieldlist, rest = _split_fielddef(field, self.model)
if fieldlist and not rest:
field_objs[field] = fieldlist
else:
field_objs[field] = field
qscopy = self._getCopy()
if only:
qscopy.partial_fields.update(field_objs)
qscopy.query.return_only(field_xpath, raw=True)
elif also:
qscopy.additional_fields.update(field_objs)
qscopy.query.return_also(field_xpath, raw=True)
return qscopy
def distinct(self):
"""Return distinct results.
This method returns an updated copy of the QuerySet: It does not
modify the original. When results are returned from the updated
copy, they will contain only distinct results.
"""
qscopy = self._getCopy()
qscopy.query.distinct()
return qscopy
def all(self):
"""Return all results.
This method returns an identical copy of the QuerySet.
"""
return self._getCopy()
def exclude(self, **kwargs):
"""Filter the QuerySet to return a subset of the documents that
do **not** contain any of the filters. Uses the same syntax and allows
for the same filters as :meth:`filter`.
"""
return self.filter(combine='NOT', **kwargs)
def using(self, collection):
'''Specify the eXist collection to be queried.
If you are using an :class:`eulexistdb.models.XmlModel` to generate queries against an eXist
collection other than the one defined in ``settings.EXISTDB_ROOT_COLLECTION``, you should use this function.
'''
qscopy = self._getCopy()
qscopy.query.set_collection(collection)
return qscopy
def reset(self):
"""Reset filters and cached results on the QuerySet.
This modifies the current query set, removing all filters, and
resetting cached results."""
self.query.clear_filters()
# if a query has been made to eXist - release result & reset result id
if self._result_id is not None:
self._release_query_result()
self._result_id = None
self._count = None # clear any count based on this result set
def get(self, **kwargs):
"""Get a single result identified by filter arguments.
Takes any number of :meth:`filter` arguments. Unlike :meth:`filter`,
though, this method returns exactly one item. If the filter
expressions match no items, or if they match more than one, this
method throws an exception.
Raises a :class:`eulexistdb.exceptions.DoesNotExist` exception if
no matches are found; raises a :class:`eulexistdb.exceptions.ReturnedMultiple`
exception if more than one match is found.
"""
fqs = self.filter(**kwargs)
if fqs.count() == 1:
# use regular get item logic to retrieve the first (only) item
obj = fqs[0]
if self.model is not None and not self.query._distinct:
# when single object object is deleted, release this query set
setattr(obj, '__del__', self._release_query_result)
# disabled for now
# make queryTime method available on the single item
# setattr(obj, 'queryTime', self.queryTime)
return obj
# NOTE: behaves like django - throws a DoesNotExist or a MultipleObjectsReturned
elif fqs.count() == 0:
raise DoesNotExist("no match found with params %s" % kwargs)
else:
raise ReturnedMultiple("returned %s with params %s" % (fqs.count(), kwargs))
def __getitem__(self, k):
"""Return a single result or slice of results from the query."""
if not isinstance(k, (slice, int, long)):
raise TypeError
if isinstance(k, slice):
qs = self._getCopy()
# if start was specified, use it; otherwise retain current start
if k.start is not None:
qs._start = int(k.start)
# if a slice bigger than available results is requested, cap it at
# actual max
qs._stop = min(k.stop, self.count())
# because the slicing is done within the result cache,
# share the same cache across subsets of this queryset
qs._result_cache = self._result_cache
return qs
# check that index is in range
# for now, not handling any fancy python indexing
if k < 0 or k >= self.count():
raise IndexError
# calculate the actual index for retrieval from eXist and storage result
# cache based on the start of the current slice
i = k + self._start
max_items = self.default_chunk_size
if self._stop is not None:
max_items = self._stop - self._start + 1
# retrieve results in a chunk and cache them for individual item access
if not self._result_cache:
self._runQuery(self._start + 1, max_items=max_items)
# if for some reason the requested item is not available
# retrieve it individually (this should not generally be used)
if i not in self._result_cache:
# if the requested item has not yet been retrieved, get it
# exist start index is 1-based instead of zero, so +1
self._runQuery(start=i + 1, max_items=max_items)
return self._result_cache[i]
@property
def return_type(self):
"""Return type that will be used for initializing results returned from
eXist queries. Either the subclass of :class:`~eulxml.xmlmap.XmlObject`
passed in to the constructor as model, or, if :meth:`only` or :meth:`also`
has been used, a dynamically created instance of :class:`~eulxml.xmlmap.XmlObject`
with the xpaths modified based on the constructed xml return.
"""
if self._return_type is None:
self._return_type = self.model
# if there are additional/partial fields that need to override defined fields,
# define a new class derived from the XmlObject model and map those fields
if self.partial_fields or self.additional_fields:
# extra fields should include all partial OR additional fields,
# since both options can be used together
extra_fields = self.partial_fields.copy()
extra_fields.update(self.additional_fields)
self._return_type = _create_return_class(self.model, extra_fields,
override_xpaths=self.query.get_return_xpaths())
return self._return_type
@property
def query_result_type(self):
'''Custom query result return type used to access a batch of results
wrapped in an exist result as returned by the REST API. Extends
:class:`eulexistdb.db.QueryResult` to add an item-level result mapping
based, using :attr:`return_type` if appropriate.
'''
classname = "QuerySetResult"
if self.model is not None:
classname += self.model.__name__
fields = {}
if self.model is None or self.query._distinct:
# distinct values returns content, not nodes; string is
# probably reasonable here (although possibly unexpected results if
# querying for distinct numerical values)
fields['items'] = xmlmap.StringListField('*')
else:
xpath = '*'
if self.additional_fields:
# instead of root element, use first child node to xmlobject
# so additional fields can be referenced
xpath = '*/*[1]'
fields['items'] = xmlmap.NodeListField(xpath, self.return_type)
return XmlObjectType(classname, (db.QueryResult,), fields)
def _init_item(self, data):
# when there are additional fields, the main node is the first node under returned xml
if self.additional_fields:
# basically do the simplest form of what load_xmlobject_from_string does
element = etree.fromstring(data)
# instead of root element, pass in first child node to xmlobject constructor
return_type = self.return_type
return return_type(element[0])
else:
return load_xmlobject_from_string(data, self.return_type)
def __iter__(self):
"""Iterate through available results."""
# rudimentary iterator (django queryset one much more complicated...)
for i in range(self.count()):
yield self[i]
def __len__(self):
# FIXME: is this sufficient?
# in django, calling len() populates the cache...
return self.count()
def _runQuery(self, start=None, max_items=None):
"""Execute the currently configured query."""
if max_items is None:
max_items = self.default_chunk_size
# exist start begins at 1, not 0
if start is None:
start = self._start + 1
# if we don't yet have a session, request one; if we do, use it
session_opts = {}
if self._result_id is None:
session_opts['cache'] = True
else:
session_opts['session'] = self._result_id
result = self._db.query(self.query.getQuery(), start=start,
how_many=max_items,
result_type=self.query_result_type,
**session_opts)
# store the session id if a new one was requested
if self._result_id is None:
self._result_id = result.session
# store total count for the query
self._count = result.hits
# if items were retrieved, cache them
if max_items != 0:
self._result_cache = dict(enumerate(result.items,
start=start - 1))
return result
def getDocument(self, docname):
"""Get a single document from the server by filename."""
data = self._db.getDocument('/'.join([self.query.collection, docname]))
# getDocument returns unicode instead of string-- need to decode before handing off to parseString
return load_xmlobject_from_string(data.encode('utf_8'), self.model)
def _create_return_class(baseclass, override_fields, xpath_prefix=None,
override_xpaths=None):
"""
Define a new return class which extends the specified baseclass and
overrides the specified fields.
:param baseclass: the baseclass to be extended; expected to be an instance of XmlObject
:param override_fields: dictionary of field, list of nodefields - in the format of partial_fields
or additional_fields, as genreated by QuerySet.only or QuerySet.also
:param xpath_prefix: optional, should only be used when recursing. By default, the xpath
for a constructed node is assumed to be the same as the field name; for sub-object fields,
this parameter is used to pass the prefix in for creating the sub-object class.
:param override_xpaths: dictionary of field name and xpaths to use, based on
the constructed xml being returned; most likely generated by
:meth:`Xquery.get_return_xpaths`.
"""
# NOTE: this class is tested indirectly via the QuerySet also and only functions,
# but it is *not* tested directly.
classname = "Partial%s" % baseclass.__name__
class_fields = {}
if override_xpaths is None:
override_xpaths = {}
# collect names of subobjects, with information needed to create additional return classes
subclasses = {}
subclass_fields = {}
for name, fields in override_fields.iteritems():
# nested object fields are indicated by basename__subname
if '__' in name:
basename, remainder = name.split('__', 1)
subclasses[basename] = fields[0] # list of field types - first type is basename
if basename not in subclass_fields:
subclass_fields[basename] = {}
subclass_fields[basename][remainder] = fields[1:]
else:
# field with the same type as the original model field, but with xpath of the variable
# name, to match how additional field results are constructed by Xquery object
if name == 'last_modified': # special case field
field_type = xmlmap.DateTimeField
elif name == 'match_count':
field_type = xmlmap.IntegerField
elif fields is None or isinstance(fields, basestring):
field_type = xmlmap.StringField # handle special cases like fulltext score
else:
field_type = type(fields[-1])
# by default, assume xpath is field name
xpath = name
fieldname = name
if xpath_prefix:
xpath = "__".join((xpath_prefix, name))
fieldname = "__".join((xpath_prefix, name))
# if an override xpath is specified for this field, use that
if fieldname in override_xpaths:
xpath = override_xpaths[fieldname]
# special case for following/preceding queries
# if an override path contains a sibling query, it won't
# work in the result, but the field index should be sufficient.
# strip out following, preceding, and following/preceding-siblings
pattern = re.compile(r'(following|preceding)(-sibling)?::([^\[]*)(\[\d+\])?$')
xpath = pattern.sub(r'\3', xpath)
# TODO: create a clone function for nodefield that takes an xpath
# (this should make field-type instantiation more reliable and flexible)
if isinstance(fields[-1], xmlmap.NodeField) or \
isinstance(fields[-1], xmlmap.NodeListField):
class_fields[name] = field_type(xpath, fields[-1]._get_node_class())
else:
class_fields[name] = field_type(xpath)
# create subclasses and add to current class fields
for subclass_name, nodefield in subclasses.iteritems():
# create a new class derived from the configured nodefield class, with subclass fields
prefix = subclass_name
if xpath_prefix:
prefix = "__".join((xpath_prefix, prefix))
# new subclass type
subclass = _create_return_class(nodefield._get_node_class(), subclass_fields[subclass_name],
xpath_prefix=prefix, override_xpaths=override_xpaths)
# field type (e.g. NodeField or NodeListField), to be instanced as new subclass
class_fields[subclass_name] = type(nodefield)(".", subclass)
# create the new class and set it as the return type to be initialized
return XmlObjectType(classname, (baseclass,), class_fields)
def escape_string(s):
'Escape a string as a literal value for use in an Xquery expression.'
return s.replace('"', '""').replace('&', '&')
def _quote_as_string_literal(s):
# special case: do nothing to queries constructed using
# XmlQuery class
if (isinstance(s, XmlQuery)):
return s
return '"' + escape_string(s) + '"'
class Xquery(object):
"""
Xpath/Xquery object.
Init parameters:
:param xpath: base xpath to use when building the query, optional
:param collection: optional collection; if specified, query will be limited
to the collection using eXist-db query syntax ``collection('/db/foo')//node``.
:param fulltext_options: optional dictionary of fulltext options that should
be used for any full-text queries. See http://demo.exist-db.org/lucene.xml#N1047C
for available options.
"""
xpath = '/node()' # default generic xpath
xq_var = '$n' # xquery variable to use when constructing flowr query
ft_option_xqvar = '$ft_options' # xquery variable for fulltext options, if needed
available_filters = ['contains', 'startswith', 'exact', 'fulltext_terms',
'highlight', 'in', 'document_path', 'exists',
'gt', 'gte', 'lt', 'lte']
special_fields = ['fulltext_score', 'last_modified', 'hash',
'document_name', 'collection_name', 'match_count']
_raw_prefix = 'r_' # field-name prefix to distinguish raw field returns
def __init__(self, xpath=None, collection=None, document=None,
namespaces=None, fulltext_options=None):
if xpath is not None:
self.xpath = xpath
if fulltext_options is None:
fulltext_options = {}
# remove leading / from collection name (if any)
self.set_collection(collection)
self.document = document
self.namespaces = namespaces
self.filters = []
self.or_filters = []
self.not_filters = []
# info for filters that use special fields & require let/where in xquery
self.where_filters = []
self.where_fields = []
# sort information - field to sort on, ascending/descending
self.order_by = None
self.order_by_rawxpath = False
self.order_mode = None
# also/only fields
self.return_fields = {}
self.additional_return_fields = {}
# list of field names (in return or additional return) where raw xpath should be used
self.raw_fields = []
# start/end values for subsequence
self.start = 0
self.end = None
self._distinct = False
# return field / xpath details for constructed xquery return
self.return_xpaths = []
self._return_field_count = 1
# optional configuration for fulltext queries
self.fulltext_options = fulltext_options
self.ft_query = False # flag for if the current xquery includes a fulltext query
self.highlight = None
def __str__(self):
return self.getQuery()
def set_collection(self, collection):
'''Set or update the collection to be used when constructing the xquery. Setting to ``None`` will remove
any collection filter from the generated XQuery.'''
if collection is not None:
collection = collection.lstrip('/')
self.collection = collection
def set_document(self, document):
self.document = document
def getCopy(self):
xq = Xquery(xpath=self.xpath, collection=self.collection,
document=self.document, namespaces=self.namespaces)
xq.filters += self.filters
xq.where_filters += self.where_filters
xq.or_filters += self.or_filters
xq.not_filters += self.not_filters
xq.where_fields = self.where_fields
xq.order_by = self.order_by
xq.order_by_rawxpath = self.order_by_rawxpath
xq.order_mode = self.order_mode
xq._distinct = self._distinct
# return *copies* of dictionaries, not references to the ones in this object!
xq.return_fields = self.return_fields.copy()
xq.additional_return_fields = self.additional_return_fields.copy()
xq.raw_fields = self.raw_fields
xq.return_xpaths = self.return_xpaths
xq._return_field_count = self._return_field_count
xq.fulltext_options = self.fulltext_options.copy()
xq.ft_query = self.ft_query
xq.highlight = self.highlight
return xq
def getQuery(self):
"""
Generate and return xquery based on configured filters, sorting, return fields.
Returns xpath or FLOWR XQuery if required based on sorting and return
"""
declarations = None
if self.namespaces:
declarations = '\n'.join('''declare namespace %s='%s';''' % (prefix, urn)
for prefix, urn in self.namespaces.iteritems())
xpath_parts = []
if self.document is not None:
# if a document is specified, add it it to the top-level query xpath
# -- document takes precedence over collection
document_xquery = 'doc("%s")' % self.document
xpath_parts.append(self.prep_xpath(self.xpath, context=document_xquery))
elif self.collection is not None:
# if a collection is specified, add it it to the top-level query xpath
# -- prep_xpath handles logic for top-level xpath with multiple components, e.g. foo|bar
collection_xquery = 'collection("/db/%s")' % self.collection
xpath_parts.append(self.prep_xpath(self.xpath, context=collection_xquery))
else:
xpath_parts.append(self.xpath)
xpath_parts += ['[%s]' % (f,) for f in self.filters]
if self.or_filters:
xpath_parts.append('[%s]' % (' or '.join(self.or_filters)))
if self.not_filters:
xpath_parts.append('[%s]' % (' and '.join(['not(%s)' % f for f in self.not_filters])))
xpath = ''.join(xpath_parts)
# add search terms for highlighting if requested
if self.highlight is not None:
if not isinstance(self.highlight, bool):
# Highlighting results efficiently in eXist is a bit tricky. We need to run a full-text search so
# eXist will enable match highlighting in the result, but we want to return the result even if there are
# no matches present. What we're doing here is telling eXist to take the first available version of the
# constructed xpath, either one that contains the fulltext search terms (if it exists),
# or (as a fallback) the one without them.
xpath = '(%(xp)s[ft:query(., %(val)s)]|%(xp)s)' % {'xp': xpath,
'val': _quote_as_string_literal(self.highlight)}
# requires FLOWR instead of just XQuery (sort, customized return, etc.)
if self.order_by or self.return_fields or self.additional_return_fields \
or self.where_filters or (self.ft_query and self.fulltext_options):
# some let statements must come at the beginning of a FLOWR query
if self.ft_query and self.fulltext_options:
# construct xml option configuration for fulltext query
E = ElementMaker()
opts = E('options')
for field, value in self.fulltext_options.iteritems():
opts.append(E(field, value))
flowr_pre = 'let %s := %s' % (self.ft_option_xqvar, etree.tostring(opts))
else:
flowr_pre = ''
# NOTE: using constructed xpath, with collection filter (if collection specified)
flowr_for = 'for %s in %s' % (self.xq_var, xpath)
# define any special fields that have been requested
let = []
for field in self.special_fields:
if field == self.order_by or field in self.return_fields \
or field in self.additional_return_fields \
or field in self.where_fields:
# determine how to calculate the value of the requested field
if field == 'fulltext_score':
val = 'ft:score(%s)' % self.xq_var
elif field == 'hash':
val = 'util:hash(%s, "SHA-1")' % self.xq_var
elif field == 'document_name':
val = 'util:document-name(%s)' % self.xq_var
elif field == 'collection_name':
val = 'util:collection-name(%s)' % self.xq_var
elif field == 'document_name':
val = 'util:document-name(%s)' % self.xq_var
elif field == 'last_modified':
val = 'xmldb:last-modified(util:collection-name(%(var)s), util:document-name(%(var)s))' % \
{'var': self.xq_var}
elif field == 'match_count':
val = 'count(util:expand(%(var)s)//exist:match)' % {'var': self.xq_var}
# define an xquery variable with the same name as the special field
let.append('let $%s := %s' % (field, val))
flowr_let = '\n'.join(let)
# if any where filters are present, combine them with 'and',
# prepend with one 'where' statement
flowr_where = ''
if self.where_filters:
flowr_where = 'where ' + '\n and '.join(self.where_filters)
# for now, assume sort relative to root element
if self.order_by:
if self.order_by in self.special_fields:
order_field = '$%s' % self.order_by
elif self.order_by_rawxpath:
# if order raw xpath flag is set, do not do any futher prep
# but insert xquery variable if referenced
order_field = self.order_by % {'xq_var': self.xq_var}
else:
order_field = self.prep_xpath(self.order_by)
flowr_order = 'order by %s %s' % (order_field, self.order_mode)
else:
flowr_order = ''
flowr_return = self._constructReturn()
query = '\n'.join(part for part in [flowr_pre, flowr_for, flowr_let, flowr_where, flowr_order,
flowr_return] if part) # don't generate blank lines in xqueries
else:
# if FLOWR is not required, just use plain xpath
# if highlighting is requested (boolean value OR search term)
# enable it here by wrapping around the entire xpath
if self.highlight:
xpath = 'util:expand(%s)' % xpath
query = xpath
if self._distinct:
query = "distinct-values(%s)" % (query,)
if declarations:
query = '\n'.join([declarations, query])
# if either start or end is specified, only retrieve the specified set of results
# limits need to be done after any sorting or filtering, so subsequencing entire query
if self.start or self.end is not None:
# subsequence takes nodeset, starting position, number of records to return
# note: xquery starts counting at 1 instead of 0
if self.end is None:
end = '' # no limit
else:
end = self.end - self.start # number to return
query = "subsequence(%s, %i, %s)" % (query, self.start + 1, end)
return query
def sort(self, field, ascending=True, case_insensitive=False):
'''Add ordering to xquery; sort field is assumed relative to base xpath.
:param field: xpath to sort on OR one of the special pre-defined named fields
:param ascending: defaults to True, set to False for reverse sort
:param case_insensitive: defaults to False, set to True to get
a case-insensitive sort (uses fn:lower-case conversion)
'''
# TODO: support multiple sort fields
# any field preparation for use in xpath must be handled when query
# is constructed instead of here, so that special fields can be
# recognized and defined before use in sorting
if case_insensitive:
field = 'fn:lower-case(%s)' % field
self.order_by = field
self.order_mode = 'ascending' if ascending else 'descending'
def sort_raw(self, xpath, ascending=True):
self.order_by = xpath
self.order_by_rawxpath = True # set flag to indicate no further prep should be done
self.order_mode = 'ascending' if ascending else 'descending'
def distinct(self):
self._distinct = True
def add_filter(self, xpath, type, value, mode=None):
"""
Add a filter to the xpath. Takes xpath, type of filter, and value.
Filter types currently implemented:
* contains
* startswith
* exact
* fulltext_terms - full-text query; requires lucene index configured in exist
* highlight - run a full-text query, but return even if no matches
* in - value is present in a list
* exists - element is present or not present in the document
* gt,gte,lt,lte : >, >=, <, <=
By default, all filters are ANDed together. Specifying a ``mode`` of **OR**
will OR together all filters added with a mode of OR.
"""
# possibilities to be added:
# gt/gte,lt/lte, endswith, range, date, isnull (?), regex (?)
# search (full-text search with full-text indexing - like contains but faster)
gtlt_ops = {'gt': '>', 'gte': '>=', 'lt': '<', 'lte': '<='}
if type not in self.available_filters:
raise TypeError(repr(type) + ' is not a supported filter type')
_xpath = xpath
if xpath in self.special_fields:
# filters on pre-defined 'special' fields need a little extra handling
# add to list of 'where' fields to ensure special field is defined as xq variable
# - can't know if user wants a return only or a return also
self.where_fields.append(xpath)
# - adjust filter xpath to use $, to reference xq variable for special field
_xpath = '$%s' % xpath
# if there are fulltext options specified, the method is called differently
if self.fulltext_options:
ft_query_template = 'ft:query(%%s, %%s, %s)' % self.ft_option_xqvar
else:
ft_query_template = 'ft:query(%s, %s)'
xq_functions = ['contains'] # TODO: min, max
if type in xq_functions:
filter = '%s(%s, %s)' % (type, _xpath, _quote_as_string_literal(value))
if type == 'startswith':
filter = 'starts-with(%s, %s)' % (_xpath, _quote_as_string_literal(value))
if type == 'exact':
filter = '%s = %s' % (_xpath, _quote_as_string_literal(value))
if type == 'exists':
if value is True:
filter = _xpath
else:
filter = 'not(%s)' % _xpath
if type == 'fulltext_terms':
filter = ft_query_template % (_xpath, _quote_as_string_literal(value))
self.ft_query = True
if type == 'highlight':
# highlight is a special case; it has to be handled after the initial xpath
# is constructed, in getQuery, so just store the value here
self.highlight = value
# FIXME: should we allow highlight multiple times? should specifying highlight overwrite or append here?
filter = None
self.ft_query = True
# Highlighting a specific xpath (not the query node) is currently not supported;
# just issue a warning and highlight the whole response for now.
if xpath != '.':
logger.warn('Highlighting is only supported on the entire return result; xpath of %s was requested' %
xpath + ', but the entire result will be highlighted')
if type == 'in':
filter = ' or '.join(['%s=%s' % (_xpath, _quote_as_string_literal(v))
for v in value])
# filter = 'contains((%s), %s)' % (','.join(_quote_as_string_literal(v)
# for v in value),
# _xpath)
# greater than / less than operations
if type in gtlt_ops:
# differentiate between actual numbers and numeric strings,
# since they will be compared differently
# if already numeric, use as is without any conversion
if isinstance(value, (int, long, float)):
val = value
# otherwise, treat it as a string
else:
val = _quote_as_string_literal(value)
# NOTE: using xq variable because these will be added as where filters
# filter = '%s/%s %s %s' % (self.xq_var, _xpath, gtlt_ops[type], val)
filter = '%s %s %s' % (_xpath, gtlt_ops[type], val)
if filter is not None:
# if xpath in self.special_fields or type in gtlt_ops:
if xpath in self.special_fields:
# filters on pre-defined fields must occur in 'where' section, after
# relevant xquery variable has been defined
self.where_filters.append(filter)
elif mode == 'OR':
self.or_filters.append(filter)
elif mode == 'NOT':
self.not_filters.append(filter)
else:
self.filters.append(filter)
def return_only(self, fields, raw=False):
"""Only return the specified fields.
When specifying xpaths in raw mode, use ``%(xq_var)s`` if some portion
of the xpath should be made relative to the main xquery variable. To
include a plain % in a raw xpath, it **must** be escaped as ``%%``.
Not compatible with :meth:`return_also`.
:param fields: dictionary of {'field name' : 'xpath'}.
:param raw: when True, minimal processing will be done on the xpath.
"""
self.return_fields.update(fields)
if raw:
self.raw_fields.extend(fields.keys())
def return_also(self, fields, raw=False):
"""Return additional specified fields. See :meth:`return_only` for
syntax of xpaths in raw mode.
Not compatible with :meth:`return_only`.
:param fields: dictionary of {'field name' : 'xpath'}.
:param raw: when True, minimal processing will be done on the xpath.
"""
self.additional_return_fields.update(fields)
if raw:
self.raw_fields.extend(fields.keys())
def _constructReturn(self):
"""Construct the return portion of a FLOWR xquery."""
if self.return_fields or self.additional_return_fields:
# constructed return result with partial or additional content
# get a return element name to wrap the results
return_el = self._return_name_from_xpath(parse(self.xpath))
# reset any return fields that have been previously calculated
self._return_field_count = 1
self.return_xpaths = []
# returns for only/also fields are constructed almost exactly the same
if self.return_fields:
rblocks = []
elif self.additional_return_fields:
rblocks = ["{%s}" % self.xq_var] # return entire node
fields = dict(self.return_fields, **self.additional_return_fields)
for name, xpath in fields.iteritems():
# special cases
if name in self.special_fields:
# reference any special fields requested as xquery variables
rblocks.append('<%(name)s>{$%(name)s}</%(name)s>' % {'name': name})
elif name in self.raw_fields:
xpath = xpath % {'xq_var': self.xq_var}
rblocks.append('<%(prefix)s%(name)s>{%(xpath)s}</%(prefix)s%(name)s>' % \
{'prefix': self._raw_prefix, 'name': name, 'xpath': xpath})
else:
rblocks.append(self.prep_xpath(xpath, return_field=True))
return_el = '<%s>\n ' % (return_el) + '\n '.join(rblocks) + '\n</%s>' % (return_el)
else:
# return entire node, no constructed return
return_el = self.xq_var
if self.highlight:
# if highlighting is requested, use util:expand from exist kwic
# to turn on exist:match search term tagging
return_el = 'util:expand(%s)' % return_el
return 'return %s' % return_el
def _return_name_from_xpath(self, parsed_xpath):
"Generate a top-level return element name based on the xpath."
if isinstance(parsed_xpath, ast.Step):
# if this is a step, just use the node test
# special cases: node tests that can't be used as return element
if str(parsed_xpath.node_test) in ['node()', '*']:
return 'node'
return parsed_xpath.node_test
elif isinstance(parsed_xpath, ast.BinaryExpression):
# binary expression like node()|node() - recurse on right hand portion
return self._return_name_from_xpath(parsed_xpath.right)
elif isinstance(parsed_xpath, ast.AbsolutePath):
# absolute path like //a - recurse on relative portion
return self._return_name_from_xpath(parsed_xpath.relative)
else:
# for types we don't handle yet, don't just return nothing!
return 'node' # generic node name should work for most cases
def clear_filters(self):
self.filters = []
def set_limits(self, low=None, high=None):
"""
Adjusts the limits on the results to be retrieved.
Any limits passed in here are applied relative to the existing
constraints. So low is added to the current low value and both will be
clamped to any existing high value.
"""
# based on set_limits from django.db.models.sql.query
if high is not None:
if self.end is not None:
self.end = min(self.end, self.start + high)
else:
self.end = (self.start or 0) + high
if low is not None:
if self.end is not None:
self.start = min(self.end, self.start + low)
else:
self.start = (self.start or 0) + low
def clear_limits(self):
"Clear any existing limits"
self.start = 0
self.end = None
def prep_xpath(self, xpath, context=None, return_field=False):
"""Prepare an xpath for use in an xquery.
:param xpath: xpath as string or parsed by :meth:`eulxml.xpath.parse`
:param context: optional context to add to xpaths; by default, the current
xquery variable will be used
:param return_field: xpath will be used as a return field; it will have
additional node wrapping, and a return-field xpath will be calculated
and stored for use in :meth:`get_return_xpaths`
:rtype: string
"""
# common xpath clean-up before handing off to exist
# if the xpath passed in is a basestring, it has not yet been parsed
if isinstance(xpath, basestring):
parsed_xpath = parse(xpath)
else:
parsed_xpath = xpath
# parsed_xpath = xpath if parsed else parse(xpath)
if context is None:
context = self.xq_var
if isinstance(parsed_xpath, ast.BinaryExpression) and parsed_xpath.op == '|':
# binary OR expression - prep the two expressions and put them back together
xpath_str = '%(left)s%(op)s%(right)s' % {
'op': parsed_xpath.op,
'left': self.prep_xpath(parsed_xpath.left, context=context),
'right': self.prep_xpath(parsed_xpath.right, context=context),
}
# xquery context variable has been added to individual portions and
# should not be added again
context_path = None
# determine context needed relative to xquery variable
elif isinstance(parsed_xpath, ast.AbsolutePath):
# for an absolute path, (e.g., //node or /node), we need $n(xpath)
context_path = context
elif isinstance(parsed_xpath, ast.FunctionCall):
# function call - the function itself needs no context, but
# any arguments that are node tests should be prepped
context_path = ''
for i in range(len(parsed_xpath.args)):
arg = parsed_xpath.args[i]
if isinstance(arg, ast.AbbreviatedStep) or \
isinstance(arg, ast.Step) or \
isinstance(arg, ast.FunctionCall): # nested function call
# prep_xpath returns string, but function arg needs to be parsed
parsed_xpath.args[i] = parse(self.prep_xpath(arg))
# xpath like .//name needs to be made relative to xquery variable
elif isinstance(arg, ast.BinaryExpression) and arg.op == '//':
xpath_str = '%(left)s%(op)s%(right)s' % {
'op': arg.op,
'left': self.prep_xpath(arg.left, context=context),
# only the first portion needs xquery variable context
'right': serialize(arg.right)
}
parsed_xpath.args[i] = parse(xpath_str)
# xpath like xpath1|xpath1 needs both parts made relative to xquery variable
elif isinstance(arg, ast.BinaryExpression) and arg.op == '|':
xpath_str = '%(left)s%(op)s%(right)s' % {
'op': arg.op,
'left': self.prep_xpath(arg.left, context=context),
'right': self.prep_xpath(arg.right, context=context),
}
parsed_xpath.args[i] = parse(xpath_str)
else:
# for a relative path, we need $n/(xpath)
context_path = "%s/" % context
# FIXME: other possible cases?
if context_path is not None:
xpath_str = "%(context)s%(xpath)s" % {'context': context_path,
'xpath': serialize(parsed_xpath)}
if return_field:
xpath_str = "<field>{%s}</field>" % xpath_str
# get xpath for field as it will be returned
self.return_xpaths.append(self._return_field_xpath(parsed_xpath))
self._return_field_count += 1
return xpath_str
def _return_field_xpath(self, xpath):
if isinstance(xpath, ast.Step):
# FIXME: should predicates be removed here?
# field[x] should be sufficient to identify return node, and predicates may not match
return "field[%d]/%s" % (self._return_field_count, serialize(xpath))
elif isinstance(xpath, ast.BinaryExpression):
if xpath.op == '|':
return "%(left)s|%(right)s" % {
'left': self._return_field_xpath(xpath.left),
'right': self._return_field_xpath(xpath.right)
}
if xpath.op in ('/', '//'):
return self._return_field_xpath(xpath.right)
elif isinstance(xpath, ast.FunctionCall):
# for a function call, the field itself should be all the xpath needed
return "field[%d]" % self._return_field_count
elif isinstance(xpath, ast.AbsolutePath):
return self._return_field_xpath(xpath.relative)
# FIXME: other cases?
return None # FIXME: is there any sane fall-back return?
def get_return_xpaths(self):
"""Generate a dictionary of xpaths to match the results as they will be
returned in a constructed return result (when return fields have
been specified by :meth:`return_also` or :meth:`return_only`).
:returns: dictionary keyed on field names from argument passed to
:meth:`return_only` or :meth:`return_also`
:rtype: dict
"""
fields = dict(self.return_fields, **self.additional_return_fields)
xpaths = {}
i = 0
prefix = ''
if self.additional_return_fields:
# when there are additional return fields, all extra fields
# are one step up relative to main node
prefix = '../'
for name in fields.keys():
if name in self.special_fields:
# for predefined fields, xpath is the name of the field
xpaths[name] = prefix + name
elif name in self.raw_fields:
# for raw fields, xpath is raw prefix + name of the field
# match all nodes directly under the raw field wrapper, excluding empty text nodes
xpaths[name] = ''.join([prefix, self._raw_prefix, name]) + '/node()[not(normalize-space(.)="")]'
else:
xpaths[name] = prefix + self.return_xpaths[i]
i += 1
return xpaths
class XmlQuery(xmlmap.XmlObject):
''':class:`~eulxml.xmlmap.XmlObject` class to allow describing
`queries in xml <http://exist-db.org/exist/apps/doc/lucene.xml?q=query&field=all&id=D2.2.5.9#D2.2.5.9>`_.
'''
ROOT_NAME = 'query'
#: single search term
term = xmlmap.StringField('term')
#: phrase search
phrase = xmlmap.StringField('phrase')
#: near
near = xmlmap.StringField('near')
#: unordered near
near_unorderted = xmlmap.StringField('near[@ordered = "no"]')
# also available
# <bool><term>nation</term><term>miserable</term></bool>
# <bool><term>nation</term><wildcard>miser*</wildcard></bool>
# <bool><term>nation</term><regex>miser.*</regex></bool>
# <bool><term occur="must">boil</term><term occur="should">bubble</term
def __unicode__(self):
# serialize unquoted xml for use in an exist full-text xquery
return self.serialize()
# some helpers for handling '__'-separated field names:
def _simple_fielddef_to_xpath(fielddef, cls):
"""Convert a foo__bar__baz field definition to the XPath to that node"""
fields, rest = _split_fielddef(fielddef, cls)
if fields and not rest:
return _join_field_xpath(fields)
def _split_fielddef(fielddef, cls):
"""Split a field definition into a list of field objects and any
leftover bits."""
field_parts = []
while fielddef and cls:
field_name, rest = _extract_fieldpart(fielddef)
field = cls._fields.get(field_name, None)
if field is None:
# the field_name was invalid. leave it in fielddef as remainder
break
fielddef = rest
field_parts.append(field)
cls = getattr(field, 'node_class', None)
# if no node_class then keep the field, but everything else is
# remainder.
return field_parts, fielddef
def _extract_fieldpart(s):
"""Split a field definition into exactly two __-separated parts. If
there are no __ in the field definition, leave the second part empty."""
idx = s.find('__')
if idx < 0:
return s, ''
else:
return s[:idx], s[idx + 2:]
def _join_field_xpath(fields):
return '/'.join(f.xpath for f in fields)