# app/models/pqf_query.rb
# used for translating Kete search queries to
# PQF queries that our ZoomDb understands
class PqfQuery
# relevance attribute spec says, in essence
# sort by dynamic relevance ranking (based on query)
# and match partial words (truncated on either the left or right, i.e. both)
# and do fuzzy matching (any one character in term may be replaced to match in search)
# i.e.
# add the dynamic relevance ranking
# allowing for incomplete search terms
# and fuzzy (one misspelled character)
# relevance relies on our zoom dbs having it configured
# kete zebra servers should be configured properly to use it
# we may need to adjust when querying non-kete zoom_dbs (koha for example)
# see comment above about current_basket
# see #{RAILS_ROOT}zebradb/conf/cql2pqf.txt for details
# PQF attribute specs based on
# customized bib1 attribute set
# found in #{RAILS_ROOT}zebradb/tab/bib1.att
# see #{RAILS_ROOT}zebradb/conf/oai2index.xsl
# for mappings of oai dc xml elements to specific indexes
# PQF attribute fragments that qualify how a term is matched or compared.
# Each value ends with a space (except 'sort_stub', which is completed with a
# direction value by #to_s) so fragments can simply be concatenated.
# The hash is frozen: it is shared, read-only lookup data.
QUALIFYING_ATTRIBUTE_SPECS = {
  'relevance' => '@attr 2=102 @attr 5=3 ', # we specify @attr 5=103, or fuzzy, separately now
  'exact' => '@attr 4=3 ', # this is meant for exact matches against key indexes
  'complete' => '@attr 6=3 ', # this is like exact, except to find exact matches against word or phrase indexes
  'partial' => '@attr 5=3 ',
  'fuzzy_regexp' => '@attr 5=103 ',
  'datetime' => '@attr 4=5 ',
  'exact_url' => '@attr 4=104 ',
  # relation attributes used for datetime comparisons
  'lt' => '@attr 2=1 ',
  'le' => '@attr 2=2 ',
  'eq' => '@attr 2=3 ',
  'ge' => '@attr 2=4 ',
  'gt' => '@attr 2=5 ',
  'sort_stub' => '@attr 7='
}.freeze unless defined?(QUALIFYING_ATTRIBUTE_SPECS)
# PQF use-attribute fragments (@attr 1=...) keyed by the index they search.
# 'last_modified' and 'date' additionally carry the 'datetime' qualifier;
# the '*_sort' variants are the same use attributes without that qualifier
# and are what #to_s uses when sorting.
# The hash is frozen: it is shared, read-only lookup data.
ATTRIBUTE_SPECS = {
  'oai_identifier' => '@attr 1=12 ',
  'oai_setspec' => '@attr 1=20 ',
  'description' => '@attr 1=1010 ',
  'relations' => '@attr 1=1026 ',
  'subjects' => '@attr 1=21 ',
  'creators' => '@attr 1=1003 ',
  'contributors' => '@attr 1=1020 ',
  'title' => '@attr 1=4 ',
  'coverage' => '@attr 1=29 ',
  'any_text' => '@attr 1=1016 ',
  'last_modified' => "@attr 1=1012 #{QUALIFYING_ATTRIBUTE_SPECS['datetime']}",
  'date' => "@attr 1=30 #{QUALIFYING_ATTRIBUTE_SPECS['datetime']}",
  'last_modified_sort' => '@attr 1=1012 ',
  'date_sort' => '@attr 1=30 '
}.freeze unless defined?(ATTRIBUTE_SPECS)
# TODO: my hash_fu is failing me, DRY this up
# Attribute specs that datetime comparison methods are generated for.
# 'oai_datestamp' is a second name for the 'last_modified' spec.
# Frozen because it is read-only lookup data.
DATETIME_SPECS = {
  'oai_datestamp' => ATTRIBUTE_SPECS['last_modified'],
  'last_modified' => ATTRIBUTE_SPECS['last_modified'],
  'date' => ATTRIBUTE_SPECS['date']
}.freeze unless defined?(DATETIME_SPECS)
# Maps the comparison word used in generated method names
# (e.g. last_modified_on_or_after) to its relation attribute fragment.
# Frozen because it is read-only lookup data.
DATETIME_COMPARISON_SPECS = {
  'before' => QUALIFYING_ATTRIBUTE_SPECS['lt'],
  'after' => QUALIFYING_ATTRIBUTE_SPECS['gt'],
  'on' => QUALIFYING_ATTRIBUTE_SPECS['eq'],
  'on_or_before' => QUALIFYING_ATTRIBUTE_SPECS['le'],
  'on_or_after' => QUALIFYING_ATTRIBUTE_SPECS['ge']
}.freeze unless defined?(DATETIME_COMPARISON_SPECS)
# all ATTRIBUTE_SPECS will have ..._include and ..._equals_completely methods
# created for them, except the keys collected here:
# any spec with "sort" in its key is skipped
# (frozen: read-only lookup data)
DO_NOT_AUTO_DEF_INCLUDE_METHODS_FOR = ATTRIBUTE_SPECS.keys.grep(/sort/).freeze unless defined?(DO_NOT_AUTO_DEF_INCLUDE_METHODS_FOR)
# Readable/writable state of the query being built.
# The web-links flag lives in @should_search_web_links_too, which is the
# variable that initialize, to_s and add_web_link_specific_query use.
attr_accessor :query_parts, :operators,
              :title_or_any_text_query_string, :title_or_any_text_operators_string,
              :direction_value, :sort_spec, :should_search_web_links_too
# The accessor was previously declared with a misspelled name
# (should_search_web_links_to), which read and wrote an instance variable
# nothing else used. Keep that name working, now backed by the real flag.
alias_method :should_search_web_links_to, :should_search_web_links_too
alias_method :should_search_web_links_to=, :should_search_web_links_too=
# Dynamically define a query method for an attribute spec.
#
# method_name    - name of the instance method to create
# attribute_spec - PQF attribute string the generated method searches with
#
# The generated method has the signature (term_or_terms, options = {}):
#   * term_or_terms is normalised through terms_as_array
#   * options[:operator] defaults to '@and'; the value 'none' is passed on
#     as nil
#   * everything else in options is handed on to create_query_part
# and it returns whatever create_query_part returns.
def self.define_query_method_for(method_name, attribute_spec)
  define_method(method_name) do |term_or_terms, *options|
    # Work on a copy: writing the operator into the caller's hash would turn
    # a reused { operator: 'none' } into { operator: nil }, which the next
    # call would then treat as "unspecified" and default to '@and'.
    options = (options.first || {}).dup

    requested_operator = options[:operator]
    options[:operator] =
      if requested_operator.nil?
        '@and' # default operator, if unspecified
      elsif requested_operator == 'none'
        nil # explicit request for no operator
      else
        requested_operator
      end

    create_query_part(options.merge({
                                      attribute_spec: attribute_spec,
                                      term_or_terms: terms_as_array(term_or_terms)
                                    }))
  end
end
# Set up an empty query: no parts or operators collected, no title/any-text
# search remembered, no sort requested, and web-link searching switched off.
def initialize
  @query_parts, @operators = [], []
  @title_or_any_text_query_string, @title_or_any_text_operators_string = '', ''
  @direction_value = 1
  @sort_spec = nil
  @should_search_web_links_too = false
end
# Combine operators and query_parts, add any special aspects to the query
# if required, and return the complete query as a string suitable to be
# passed to ZOOM::Connection#search.
#
# Side effect: when @sort_spec names one of Search.date_types it is rewritten
# to its "..._sort" key, because date specs used for sorting have a slightly
# different format from the ones used for matching.
#
# Returns the query String.
def to_s
  # handle the query as specified in standard ways so far
  full_query = @operators.join(' ') + ' ' + @query_parts.join(' ') + ' '

  # add special handling of searching URLs within dc subject
  if @should_search_web_links_too
    # We know the pattern "@or @attr 1=4 " marks the title-or-any-text part:
    # put an extra @or in front of it and append the subject query at the end.
    prepend_at_pattern = "@or #{ATTRIBUTE_SPECS['title']}"

    # Limit the split to two pieces so that anything following a second
    # occurrence of the pattern is kept instead of being silently dropped.
    before_title, after_title = full_query.split(prepend_at_pattern, 2)

    full_query = before_title.to_s + '@or ' + prepend_at_pattern + after_title.to_s +
                 QUALIFYING_ATTRIBUTE_SPECS['exact_url'] + ' ' + ATTRIBUTE_SPECS['subjects'] +
                 @title_or_any_text_operators_string +
                 @title_or_any_text_query_string + ' '
  end

  # add sorting if specified
  unless @sort_spec.nil?
    # date specs when doing a non-sorting query have a slightly different
    # format (structure of date normalized as @attr 4=5);
    # grab the correct spec for sorting
    @sort_spec += '_sort' if Search.date_types.include?(@sort_spec) && !@sort_spec.include?('_sort')
    full_query = '@or ' + full_query + QUALIFYING_ATTRIBUTE_SPECS['sort_stub'] + @direction_value.to_s + ' ' + ATTRIBUTE_SPECS[@sort_spec] + ' 0 '
  end

  full_query
end
# For every attribute spec that is not excluded, generate two query methods:
#   <spec>_equals_completely - exact match for the whole field value
#                              ('complete' qualifier appended)
#   <spec>_include           - forgiving match where the terms may be
#                              contained partially ('partial' qualifier appended)
ATTRIBUTE_SPECS.each do |spec_key, spec_value|
  next if DO_NOT_AUTO_DEF_INCLUDE_METHODS_FOR.include?(spec_key)

  [%w[_equals_completely complete], %w[_include partial]].each do |suffix, qualifier|
    define_query_method_for(spec_key + suffix, spec_value + QUALIFYING_ATTRIBUTE_SPECS[qualifier])
  end
end
# TODO: make this more concise via singleton method?
# Guarantee that an array is handed down to create_query_part:
# an Array is returned untouched, anything else is wrapped via terms_to_a.
def terms_as_array(terms)
  terms.is_a?(Array) ? terms : terms_to_a(terms)
end
# Collect all arguments given into one array
# (a single argument becomes a one-element array).
def terms_to_a(*collected)
  collected
end
# An oai_identifier has the format
#   oai:site:basket:Class:id
# To match one element exactly, each term is wrapped in colons (":term:")
# before being handed to oai_identifier_include.
#
# term_or_terms - a single term or an array of terms
# options       - optional hash, passed through to oai_identifier_include
def exact_match_for_part_of_oai_identifier(term_or_terms, *options)
  settings = options.first || {}
  wrapped = terms_as_array(term_or_terms).map { |term| ":#{term}:" }

  # "oai" is the special case: it opens the identifier, so nothing can
  # precede it; swap the wrapped form for the proper one when present
  wrapped.push('oai:') unless wrapped.delete(':oai:').nil?

  oai_identifier_include(wrapped, settings)
end
# Generate one query method per datetime spec and comparison,
# named <spec>_<comparison> (e.g. last_modified_on_or_before).
# The generated methods expect term_or_terms to be strings holding
# db normalized datetimes; these can (and probably should) include
# the utc offset, i.e. "1999-12-31 23:59:59+00:00"
DATETIME_SPECS.each_pair do |spec_key, spec_value|
  DATETIME_COMPARISON_SPECS.each_pair do |comparison_name, comparison_spec|
    define_query_method_for("#{spec_key}_#{comparison_name}", comparison_spec + spec_value)
  end
end
# Build the query part matching records whose oai_datestamp is on or after
# options[:beginning] AND on or before options[:ending].
#
# Unless options[:only_return_as_string] is set, the part is also recorded
# through push_to_appropriate_variables (together with the given options).
# NOTE(review): no default outer operator is added here, unlike the
# generated one-sided methods which default to '@and' - confirm intended.
#
# Returns the query part String.
def oai_datestamp_between(options = {})
  # every inner call gets its own freshly merged hash
  as_plain_string = -> { options.merge({ only_return_as_string: true, operator: 'none' }) }

  lower_bound = oai_datestamp_on_or_after(options[:beginning], as_plain_string.call)
  upper_bound = oai_datestamp_on_or_before(options[:ending], as_plain_string.call)
  query_part = '@and ' + lower_bound + ' ' + upper_bound

  unless options[:only_return_as_string]
    push_to_appropriate_variables(options.merge(query_part: query_part))
  end
  query_part
end
# A wrapper that sets up the correct oai_datestamp query depending on which
# of options[:beginning] / options[:ending] are present (non-blank):
#   both           -> oai_datestamp_between(options)
#   only beginning -> oai_datestamp_on_or_after(beginning, options minus :beginning)
#   only ending    -> oai_datestamp_on_or_before(ending, options minus :ending)
#   neither        -> nothing is queried and nil is returned
#
# The caller's options hash is left untouched; the bound that is consumed is
# removed from a copy.
def oai_datestamp_comparison(options = {})
  beginning = options[:beginning].blank? ? nil : options[:beginning]
  ending = options[:ending].blank? ? nil : options[:ending]

  if beginning && ending
    oai_datestamp_between(options)
  elsif beginning
    oai_datestamp_on_or_after(beginning, options.dup.tap { |rest| rest.delete(:beginning) })
  elsif ending
    oai_datestamp_on_or_before(ending, options.dup.tap { |rest| rest.delete(:ending) })
  end
end
# Build the query part matching term_or_terms (partially) against either the
# creators or the contributors index.
#
# Unless options[:only_return_as_string] is set, the part is recorded through
# push_to_appropriate_variables with the operator '@and'.
#
# Returns the query part String.
def creators_or_contributors_include(term_or_terms, options = {})
  # every inner call gets its own freshly merged hash
  as_plain_string = -> { options.merge({ only_return_as_string: true, operator: 'none' }) }

  by_creator = creators_include(term_or_terms, as_plain_string.call)
  by_contributor = contributors_include(term_or_terms, as_plain_string.call)
  query_part = '@or ' + by_creator + ' ' + by_contributor

  unless options[:only_return_as_string]
    push_to_appropriate_variables(options.merge(query_part: query_part, operator: '@and'))
  end
  query_part
end
# Build the query part matching term_or_terms as the complete field value of
# either the creators or the contributors index.
#
# Unless options[:only_return_as_string] is set, the part is recorded through
# push_to_appropriate_variables with the operator '@and'.
#
# Returns the query part String.
def creators_or_contributors_equals_completely(term_or_terms, options = {})
  # every inner call gets its own freshly merged hash
  as_plain_string = -> { options.merge({ only_return_as_string: true, operator: 'none' }) }

  by_creator = creators_equals_completely(term_or_terms, as_plain_string.call)
  by_contributor = contributors_equals_completely(term_or_terms, as_plain_string.call)
  query_part = '@or ' + by_creator + ' ' + by_contributor

  unless options[:only_return_as_string]
    push_to_appropriate_variables(options.merge(query_part: query_part, operator: '@and'))
  end
  query_part
end
# This is the standard full text query of the entire record.
# By adding the query for title first we give matches against title
# higher relevance; sorting by dynamic relevance is included by default.
#
# terms - the raw search string; it is split into individual terms by
#         pqf_format. Within a multi-term search:
#           * a leading "not" makes the operator pushed for this part '@not'
#             instead of '@and' (the term right after it is always treated
#             as a plain word)
#           * later terms found in Search.boolean_operators become PQF
#             operators; plain terms without one are joined with '@and'
#
# Side effects: remembers the quoted terms and the operators in
# @title_or_any_text_query_string / @title_or_any_text_operators_string
# (to_s reuses them for the web-link query) and records the part through
# push_to_appropriate_variables.
#
# Returns the query part String.
def title_or_any_text_includes(terms)
  query_part = QUALIFYING_ATTRIBUTE_SPECS['relevance']
  operator = '@and'
  terms = pqf_format(terms)
  title_query = '@or ' + ATTRIBUTE_SPECS['title'] + ' '
  all_content_query = ATTRIBUTE_SPECS['any_text'] + ' '

  unless terms.blank?
    if terms.size > 1
      terms_array = []
      operators_array = []
      query_starts_with_not = false
      last_term_an_operator = false

      terms.each_with_index do |term, index|
        if index.zero?
          # only "not" is meaningful as the very first term;
          # all other boolean operators are treated as normal words here
          if term.casecmp('not').zero?
            query_starts_with_not = true
          else
            terms_array << term
          end
        elsif query_starts_with_not && index == 1
          # directly after a leading "not" even a boolean operator is
          # meaningless as an operator, so treat it as a regular word
          terms_array << term
        elsif Search.boolean_operators.include?(term.downcase)
          # we got ourselves an operator
          operators_array << "@#{term.downcase}"
          last_term_an_operator = true
        else
          # just a plain term; assume "and" since no operator was specified
          operators_array << '@and ' unless last_term_an_operator
          terms_array << term
          last_term_an_operator = false
        end
      end

      # handle the case where the user has entered two or more operators in
      # a row (or a trailing one): drop operators from the front until there
      # is one fewer operator than terms
      operators_array.shift while !operators_array.empty? && operators_array.size >= terms_array.size

      # always overwrite, so that operators remembered from an earlier call
      # cannot leak into this query
      @title_or_any_text_operators_string = operators_array.empty? ? '' : operators_array.join(' ') + ' '
      title_query += @title_or_any_text_operators_string
      all_content_query += @title_or_any_text_operators_string

      # a leading "not" replaces the default @and for this part of the query
      # (appending it would produce the invalid operator "@and@not")
      operator = '@not' if query_starts_with_not

      @title_or_any_text_query_string = '"' + terms_array.join('" "') + '" '
      title_query += @title_or_any_text_query_string
      all_content_query += @title_or_any_text_query_string
      query_part += title_query + all_content_query
    else
      # @and will break query if only single term
      @title_or_any_text_operators_string = ''
      @title_or_any_text_query_string = '"' + terms.join('" "') + '" '
      query_part += "#{title_query} #{@title_or_any_text_query_string} #{all_content_query} #{@title_or_any_text_query_string} "
    end
  end

  push_to_appropriate_variables({ query_part: query_part, operator: operator })
  query_part
end
# Switch on the web-link handling in to_s: when this flag is set, to_s
# appends an exact-URL query against the subjects index that reuses the
# terms remembered from the title-or-any-text search.
def add_web_link_specific_query
@should_search_web_links_too = true
end
# aliases for readability's sake
# oai_datestamp_include: another name for the generated last_modified_include
alias oai_datestamp_include last_modified_include
# kind_is / within: both are exact_match_for_part_of_oai_identifier, i.e.
# they match one colon-delimited element of the oai_identifier exactly
alias kind_is exact_match_for_part_of_oai_identifier
alias within exact_match_for_part_of_oai_identifier
private
# Prepare a raw search string for use in a PQF query by splitting it into
# search terms via Topic.split_to_search_terms.
# terms - the search string as entered (tr is called on it, so a String)
# Returns whatever Topic.split_to_search_terms returns; presumably an array
# of terms, since the caller uses size/each/join on it - confirm.
def pqf_format(terms)
# handles case were someone is searching for a url
# there may be other special characters to handle
# but this seems to do the trick
# NOTE(review): in a double-quoted Ruby string "\/" is the same as "/",
# so this tr call returns an unchanged copy of terms. Confirm whether real
# escaping of slashes was intended or the line can simply be removed.
terms = terms.tr('/', "\/")
# this is sort of cheating
# we know that Topic class has the acts_as_zoom instance methods...
terms = Topic.split_to_search_terms(terms)
terms
end
# Record a finished query part and its operator on the instance.
#
# options[:operator]   - pushed onto @operators unless blank or 'none'
# options[:query_part] - pushed onto @query_parts unless
#                        options[:only_return_as_string] is set
#
# NOTE(review): the operator is recorded even when only_return_as_string is
# set; the callers in this class pair that flag with operator 'none' -
# confirm that is intended.
def push_to_appropriate_variables(options = {})
  operator = options[:operator]
  @operators.push(operator) unless operator.blank? || operator == 'none'
  @query_parts.push(options[:query_part]) unless options[:only_return_as_string]
end
# Build one PQF query part from an attribute spec and its terms.
#
# options[:attribute_spec]  - PQF attribute string the part starts with
# options[:term_or_terms]   - a single string or an array of strings
# options[:should_be_exact] - when truthy, the 'partial' qualifier is
#                             stripped from the spec and 'exact' appended
# options[:inner_operator]  - operator joining multiple terms (default '@or')
#
# The finished part is handed to push_to_appropriate_variables together with
# the given options. Returns the query part String.
def create_query_part(options = {})
  query_part = options[:attribute_spec]
  # normally already an array by the time it gets here;
  # Array() also accepts the single string the contract allows
  terms = Array(options[:term_or_terms])
  inner_operator = options[:inner_operator] || '@or'

  if options[:should_be_exact]
    # strip out the partial qualifying attribute spec if it is in there,
    # then add in the exact qualifying attribute spec
    query_part = query_part.gsub(QUALIFYING_ATTRIBUTE_SPECS['partial'], '') +
                 QUALIFYING_ATTRIBUTE_SPECS['exact']
  end

  if terms.size == 1
    # quote the term itself; interpolating the whole array would embed its
    # inspect form (["term"]) in the query
    query_part += "\"#{terms.first}\""
  else
    # prefix notation needs one inner operator per additional term
    # (never fewer than one, as before)
    operator_count = [terms.size - 1, 1].max
    operators_string = ([inner_operator] * operator_count).join(' ')
    # we always quote since it won't hurt when quotes aren't needed
    query_part += "#{operators_string} \"" + terms.join('" "') + '"'
  end

  push_to_appropriate_variables(options.merge(query_part: query_part))
  query_part
end
end