lib/mongoid/full_text_search.rb from artsy/mongoid_fulltext

lib/mongoid/full_text_search.rb
Summary

Maintainability

3 days
Test Coverage

Issues
require 'mongoid'
require 'mongoid/compatibility'

if Mongoid::Compatibility::Version.mongoid3?
  require 'mongoid/full_text_search/indexes'
else
  require 'mongoid/full_text_search/indexable'
end

require 'unicode_utils'
require 'cgi'

module Mongoid::FullTextSearch
  extend ActiveSupport::Concern

  included do
    cattr_accessor :mongoid_fulltext_config
  end

  class UnspecifiedIndexError < StandardError; end
  class UnknownFilterQueryOperator < StandardError; end

  module ClassMethods
    def fulltext_search_in(*args)
      self.mongoid_fulltext_config = {} if mongoid_fulltext_config.nil?
      options = args.last.is_a?(Hash) ? args.pop : {}
      index_name = if options.key?(:index_name)
                     options[:index_name]
                   else
                     'mongoid_fulltext.index_%s_%s' % [name.downcase, mongoid_fulltext_config.count]
                   end

      config = {
        alphabet: 'abcdefghijklmnopqrstuvwxyz0123456789 ',
        word_separators: "-_ \n\t",
        ngram_width: 3,
        max_ngrams_to_search: 6,
        apply_prefix_scoring_to_all_words: true,
        index_full_words: true,
        index_short_prefixes: false,
        max_candidate_set_size: 1000,
        remove_accents: true,
        reindex_immediately: true,
        stop_words: Hash[%w[i a s t me my we he it am is be do an if
                            or as of at by to up in on no so our you him
                            his she her its who are was has had did the and
                            but for out off why how all any few nor not own
                            too can don now ours your hers they them what whom
                            this that were been have does with into from down over
                            then once here when both each more most some such only
                            same than very will just yours their which these those
                            being doing until while about after above below under
                            again there where other myself itself theirs having during
                            before should himself herself because against between through
                            further yourself ourselves yourselves themselves].map { |x| [x, true] }]
      }

      config.update(options)

      args = [:to_s] if args.empty?
      config[:ngram_fields] = args
      config[:alphabet] = Hash[config[:alphabet].split('').map { |ch| [ch, ch] }]
      config[:word_separators] = Hash[config[:word_separators].split('').map { |ch| [ch, ch] }]
      mongoid_fulltext_config[index_name] = config

      before_save(:update_ngram_index) if config[:reindex_immediately]
      before_destroy :remove_from_ngram_index
    end

    def create_fulltext_indexes
      return unless mongoid_fulltext_config
      mongoid_fulltext_config.each_pair do |index_name, fulltext_config|
        fulltext_search_ensure_indexes(index_name, fulltext_config)
      end
    end

    def fulltext_search_ensure_indexes(index_name, config)
      db = collection.database
      coll = db[index_name]

      # The order of filters matters when the same index is used from two or more collections.
      filter_indexes = (config[:filters] || []).map do |key, _value|
        ["filter_values.#{key}", 1]
      end.sort_by { |filter_index| filter_index[0] }

      index_definition = [['ngram', 1], ['score', -1]].concat(filter_indexes)

      # Since the definition of the index could have changed, we'll clean up by
      # removing any indexes that aren't on the exact.
      correct_keys = index_definition.map { |field_def| field_def[0] }
      all_filter_keys = filter_indexes.map { |field_def| field_def[0] }
      coll.indexes.each do |idef|
        keys = idef['key'].keys
        next unless keys.member?('ngram')
        all_filter_keys |= keys.find_all { |key| key.starts_with?('filter_values.') }
        next unless keys & correct_keys != correct_keys
        Mongoid.logger.info "Dropping #{idef['name']} [#{keys & correct_keys} <=> #{correct_keys}]" if Mongoid.logger
        if Mongoid::Compatibility::Version.mongoid5_or_newer?
          coll.indexes.drop_one(idef['key'])
        else
          coll.indexes.drop(idef['key'])
        end
      end

      if all_filter_keys.length > filter_indexes.length
        filter_indexes = all_filter_keys.map { |key| [key, 1] }.sort_by { |filter_index| filter_index[0] }
        index_definition = [['ngram', 1], ['score', -1]].concat(filter_indexes)
      end

      Mongoid.logger.info "Ensuring fts_index on #{coll.name}: #{index_definition}" if Mongoid.logger
      if Mongoid::Compatibility::Version.mongoid5_or_newer?
        coll.indexes.create_one(Hash[index_definition], name: 'fts_index')
      else
        coll.indexes.create(Hash[index_definition], name: 'fts_index')
      end

      Mongoid.logger.info "Ensuring document_id index on #{coll.name}" if Mongoid.logger
      if Mongoid::Compatibility::Version.mongoid5_or_newer?
        coll.indexes.create_one('document_id' => 1) # to make removes fast
      else
        coll.indexes.create('document_id' => 1) # to make removes fast
      end
    end

    def fulltext_search(query_string, options = {})
      max_results = options.key?(:max_results) ? options.delete(:max_results) : 10
      return_scores = options.key?(:return_scores) ? options.delete(:return_scores) : false
      if mongoid_fulltext_config.count > 1 && !options.key?(:index)
        error_message = '%s is indexed by multiple full-text indexes. You must specify one by passing an :index_name parameter'
        raise UnspecifiedIndexError, error_message % name, caller
      end
      index_name = options.key?(:index) ? options.delete(:index) : mongoid_fulltext_config.keys.first

      # Options hash should only contain filters after this point

      ngrams = all_ngrams(query_string, mongoid_fulltext_config[index_name])
      return [] if ngrams.empty?

      # For each ngram, construct the query we'll use to pull index documents and
      # get a count of the number of index documents containing that n-gram
      ordering = { 'score' => -1 }
      limit = mongoid_fulltext_config[index_name][:max_candidate_set_size]
      coll = collection.database[index_name]
      cursors = ngrams.map do |ngram|
        query = { 'ngram' => ngram[0] }
        query.update(map_query_filters(options))
        count = coll.find(query).count
        { ngram: ngram, count: count, query: query }
      end.sort! { |record1, record2| record1[:count] <=> record2[:count] }

      # Using the queries we just constructed and the n-gram frequency counts we
      # just computed, pull in about *:max_candidate_set_size* candidates by
      # considering the n-grams in order of increasing frequency. When we've
      # spent all *:max_candidate_set_size* candidates, pull the top-scoring
      # *max_results* candidates for each remaining n-gram.
      results_so_far = 0
      candidates_list = cursors.map do |doc|
        next if doc[:count] == 0
        query_result = coll.find(doc[:query])
        if results_so_far >= limit
          query_result = query_result.sort(ordering).limit(max_results)
        elsif doc[:count] > limit - results_so_far
          query_result = query_result.sort(ordering).limit(limit - results_so_far)
        end
        results_so_far += doc[:count]
        ngram_score = ngrams[doc[:ngram][0]]
        Hash[query_result.map do |candidate|
          [candidate['document_id'],
           { clazz: candidate['class'], score: candidate['score'] * ngram_score }]
        end]
      end.compact

      # Finally, score all candidates by matching them up with other candidates that are
      # associated with the same document. This is similar to how you might process a
      # boolean AND query, except that with an AND query, you'd stop after considering
      # the first candidate list and matching its candidates up with candidates from other
      # lists, whereas here we want the search to be a little fuzzier so we'll run through
      # all candidate lists, removing candidates as we match them up.
      all_scores = []
      until candidates_list.empty?
        candidates = candidates_list.pop
        scores = candidates.map do |candidate_id, data|
          { id: candidate_id,
            clazz: data[:clazz],
            score: data[:score] + candidates_list.map { |others| (others.delete(candidate_id) || { score: 0 })[:score] }.sum }
        end
        all_scores.concat(scores)
      end
      all_scores.sort! { |document1, document2| -document1[:score] <=> -document2[:score] }
      instantiate_mapreduce_results(all_scores[0..max_results - 1], return_scores: return_scores)
    end

    def instantiate_mapreduce_result(result)
      result[:clazz].constantize.find(result[:id])
    end

    def instantiate_mapreduce_results(results, options)
      if options[:return_scores]
        results.map { |result| [instantiate_mapreduce_result(result), result[:score]] }.find_all { |result| !result[0].nil? }
      else
        results.map { |result| instantiate_mapreduce_result(result) }.compact
      end
    end

    def all_ngrams(str, config, bound_number_returned = true)
      return {} if str.nil?

      if config[:remove_accents]
        if defined?(UnicodeUtils)
          str = UnicodeUtils.nfkd(str)
        elsif defined?(DiacriticsFu)
          str = DiacriticsFu.escape(str)
        end
      end

      # Remove any characters that aren't in the alphabet and aren't word separators
      filtered_str = str.mb_chars.downcase.to_s.split('').find_all { |ch| config[:alphabet][ch] || config[:word_separators][ch] }.join('')

      # Figure out how many ngrams to extract from the string. If we can't afford to extract all ngrams,
      # step over the string in evenly spaced strides to extract ngrams. For example, to extract 3 3-letter
      # ngrams from 'abcdefghijk', we'd want to extract 'abc', 'efg', and 'ijk'.
      step_size = if bound_number_returned
                    [((filtered_str.length - config[:ngram_width]).to_f / config[:max_ngrams_to_search]).ceil, 1].max
                  else
                    1
                  end

      # Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the
      # input string using the step size that we just computed. Let score(x,y) be the score of string x
      # compared with string y - assigning scores to ngrams with the square root-based scoring function
      # below and multiplying scores of matching ngrams together yields a score function that has the
      # property that score(x,y) > score(x,z) for any string z containing y and score(x,y) > score(x,z)
      # for any string z contained in y.
      ngram_array = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
        score = if i == 0 || (config[:apply_prefix_scoring_to_all_words] && \
                      config[:word_separators].key?(filtered_str[i - 1].chr))
                  Math.sqrt(1 + 1.0 / filtered_str.length)
                else
                  Math.sqrt(2.0 / filtered_str.length)
                end
        { ngram: filtered_str[i..i + config[:ngram_width] - 1], score: score }
      end

      # If an ngram appears multiple times in the query string, keep the max score
      ngram_array = ngram_array.group_by { |h| h[:ngram] }.map { |key, values| { ngram: key, score: values.map { |v| v[:score] }.max } }

      if config[:index_short_prefixes] || config[:index_full_words]
        split_regex_def = config[:word_separators].keys.map { |k| Regexp.escape(k) }.join
        split_regex = Regexp.compile("[#{split_regex_def}]")
        all_words = filtered_str.split(split_regex)
      end

      # Add 'short prefix' records to the array: prefixes of the string that are length (ngram_width - 1)
      if config[:index_short_prefixes]
        prefixes_seen = {}
        all_words.each do |word|
          next if word.length < config[:ngram_width] - 1
          prefix = word[0...config[:ngram_width] - 1]
          if prefixes_seen[prefix].nil? && (config[:stop_words][word].nil? || word == filtered_str)
            ngram_array << { ngram: prefix, score: 1 + 1.0 / filtered_str.length }
            prefixes_seen[prefix] = true
          end
        end
      end

      # Add records to the array of ngrams for each full word in the string that isn't a stop word
      if config[:index_full_words]
        full_words_seen = {}
        all_words.each do |word|
          if word.length > 1 && full_words_seen[word].nil? && (config[:stop_words][word].nil? || word == filtered_str)
            ngram_array << { ngram: word, score: 1 + 1.0 / filtered_str.length }
            full_words_seen[word] = true
          end
        end
      end

      # If an ngram appears as any combination of full word, short prefix, and ngram, keep the sum of the two scores
      Hash[ngram_array.group_by { |h| h[:ngram] }.map { |key, values| [key, values.map { |v| v[:score] }.sum] }]
    end

    def remove_from_ngram_index
      mongoid_fulltext_config.each_pair do |index_name, _fulltext_config|
        coll = collection.database[index_name]
        if Mongoid::Compatibility::Version.mongoid5_or_newer?
          coll.find('class' => name).delete_many
        else
          coll.find('class' => name).remove_all
        end
      end
    end

    def update_ngram_index(options = { timeout: true })
      items = all
      items = items.no_timeout if options.key?(:timeout) && !options[:timeout]
      items.each(&:update_ngram_index)
    end

    private

    # Take a list of filters to be mapped so they can update the query
    # used upon the fulltext search of the ngrams
    def map_query_filters(filters)
      Hash[filters.map do |key, value|
        case value
        when Hash then
          if value.key? :any then format_query_filter('$in', key, value[:any])
          elsif value.key? :all then format_query_filter('$all', key, value[:all])
          else raise UnknownFilterQueryOperator, value.keys.join(','), caller end
        else format_query_filter('$all', key, value)
        end
      end]
    end

    def format_query_filter(operator, key, value)
      ['filter_values.%s' % key, { operator => [value].flatten }]
    end
  end

  def update_ngram_index
    mongoid_fulltext_config.each_pair do |index_name, fulltext_config|
      if condition = fulltext_config[:update_if]
        case condition
        when Symbol then  next unless send condition
        when String then  next unless instance_eval condition
        when Proc then    next unless condition.call self
        else; next
        end
      end

      # remove existing ngrams from external index
      coll = collection.database[index_name.to_sym]
      if Mongoid::Compatibility::Version.mongoid5_or_newer?
        coll.find('document_id' => _id).delete_many
      else
        coll.find('document_id' => _id).remove_all
      end
      # extract ngrams from fields
      field_values = fulltext_config[:ngram_fields].map { |field| send(field) }
      ngrams = field_values.inject({}) { |accum, item| accum.update(self.class.all_ngrams(item, fulltext_config, false)) }
      return if ngrams.empty?
      # apply filters, if necessary
      filter_values = nil
      if fulltext_config.key?(:filters)
        filter_values = Hash[fulltext_config[:filters].map do |key, value|
          begin
            [key, value.call(self)]
          rescue StandardError
            # Suppress any exceptions caused by filters
          end
        end.compact]
      end
      # insert new ngrams in external index
      ngrams.each_pair do |ngram, score|
        index_document = { 'ngram' => ngram, 'document_id' => _id, 'score' => score, 'class' => self.class.name }
        index_document['filter_values'] = filter_values if fulltext_config.key?(:filters)
        if Mongoid::Compatibility::Version.mongoid5_or_newer?
          coll.insert_one(index_document)
        else
          coll.insert(index_document)
        end
      end
    end
  end

  def remove_from_ngram_index
    mongoid_fulltext_config.each_pair do |index_name, _fulltext_config|
      coll = collection.database[index_name]
      if Mongoid::Compatibility::Version.mongoid5_or_newer?
        coll.find('document_id' => _id).delete_many
      else
        coll.find('document_id' => _id).remove_all
      end
    end
  end
end