SpeciesFileGroup/taxonworks

View on GitHub
lib/queries/otu/autocomplete.rb

Summary

Maintainability
A
0 mins
Test Coverage
require Rails.root.to_s + '/lib/queries/taxon_name/autocomplete'

module Queries
  module Otu

    # See Query::Autocomplete for optimization strategy per name.
    # There are 4 classes of name, each which has the same strategy: OTU name, Original TaxonName, TaxonName, CommonName
    # We then apply a global priority pulling the best names from each sub-strategy
    # to the top.
    #
    class Autocomplete < Query::Autocomplete

      # @return Boolean, nil
      #   true - only return Otus with `name` = nil
      #   false,nil - no effect
      attr_accessor :having_taxon_name_only

      # @return Boolean, nil
      #   true - OTU must have taxon name
      #   false - OTU must not have taxon name
      #   nil - ignored
      attr_accessor :with_taxon_name

      # @return [Boolean]
      #   &exact=<"true"|"false">
      #   if 'true' then only #name = query_string results are returned (no fuzzy matching)
      attr_accessor :exact

      # Keys are method names. Existence of method is checked
      # before requesting the query
      QUERIES = {
        # OTU
        otu_name_exact: {priority: 1},
        autocomplete_exact_id: {priority: 1},
        autocomplete_identifier_cached_exact: {priority: 1},
        otu_name_start_match: {priority: 200},
        otu_name_similarity: {priority: 220},

        # TaxonName
        autocomplete_taxon_name: {priority: nil}, # Priority is slotted from 10 .. 20
        # These are all approximately covered in the blanket taxon_name autocomplete
        # taxon_name_name_exact: {priority: 10},
        # taxon_name_identifier_exact: {priority: 10},
        # taxon_name_name_start_match: {priority: 100},
        # taxon_name_name_high_cuttoff: {priority: 200},

        # CommonName
        # These should all be covered/moved to common_name_autocomplete,
        autocomplete_common_name_exact: {priority: 100},
        autocomplete_common_name_like: {priority: 1000}
        # common_name_identifier_exact: {priority: 10},
        # common_name_name_start_match: {priority: 100},
        # common_name_name_similarity: {priority: 200},
      }.freeze

      def initialize(string, project_id: nil, having_taxon_name_only: false, with_taxon_name: nil, exact: 'false')
        super(string, project_id:)
        @having_taxon_name_only = boolean_param({having_taxon_name_only:}, :having_taxon_name_only)
        @with_taxon_name = boolean_param({with_taxon_name:}, :with_taxon_name)

        # TODO: move to mode
        @exact = boolean_param({exact:}, :exact)
      end

      def base_query
        q = ::Otu.all
        q = q.where(project_id:) if project_id.any?
        q
      end

      def otu_name_exact
        base_query.where(otus: {name: query_string})
      end

      def otu_name_start_match
        base_query.where('otus.name ilike ?', query_string + '%')
      end

      # All records that meet the similarity cuttoff
      # - this is intended as a generic replacement for wildcarded results
      #
      # Observations:
      #   - was similarity(), experimenting with word_similarity
      #   - 3 letter matches are going to be low probability, matches kick in at 4
      #
      def otu_name_similarity
        base_query
          .where('otus.name % ?', query_string)
          .where( ApplicationRecord.sanitize_sql_array(["word_similarity('%s', otus.name) > 0.33", query_string]))
          .order('otus.name, length(otus.name)')
      end

      # @return [Scope]
      #   Pull the result of a TaxonName autocomplete. Maintain the order returned, and
      #   re-cast the result in terms of an OTU query. Expensive but maintaining order is key.
      def autocomplete_taxon_name
        taxon_names = Queries::TaxonName::Autocomplete.new(query_string, exact:, project_id:).autocomplete # an array, not a query

        ids = taxon_names.collect{|n| n.is_combination? ? n.cached_valid_taxon_name_id : n.id} # TODO: Experiment with :cached_valid_taxon_name_id) # We assume we want to land on Valid OTUs, but see #
        return nil if ids.empty?

        min = 10.0
        max = 20.0
        scale = (max - min) / ids.count.to_f

        # TODO: optimize *
        base_query.select("otus.*, ((#{min} + row_number() OVER ())::float * #{scale}) as priority") # small incrementing numbers for priority
          .joins("INNER JOIN ( SELECT unnest(ARRAY[#{ids.join(',')}]) AS id, row_number() OVER () AS row_num ) AS id_order ON otus.taxon_name_id = id_order.id")
          .order('id_order.row_num')
      end

      # Maintains valid_taxon_name_id needed for API.
      #
      # Considerations:
      #   otus -> taxon names -> valid taxon name_id <- otu can return more OTUs than the original query
      #      because there can be multiple OTUs for the valid name of an invalid original result.
      #      right now we pick the first valid OTU for the name with distinct on()
      #
      def api_autocomplete
        @with_taxon_name = true

        # This limit() has more impact now. Since all
        # names are loaded large matches can swamp exact names
        # before priority ordering is applied. May require tuning.
        otus = compact_priorities( autocomplete_base.limit(30) )

        otu_order = otus.map(&:id).uniq

        f = ::Otu.where(id: otu_order)
          .joins('left join taxon_names t1 on otus.taxon_name_id = t1.id')
          .joins('left join otus o2 on t1.cached_valid_taxon_name_id = o2.taxon_name_id')
          .select('distinct on (otus.id) otus.id, otus.name, otus.taxon_name_id, COALESCE(o2.id, otus.id) as otu_valid_id')

        f.sort_by.with_index { |item, idx| [(otu_order.index(item.id) || 999), (idx || 999)] }
      end


      def autocomplete_taxon_name_extended
        taxon_names = Queries::TaxonName::Autocomplete.new(query_string, exact:, project_id:).autocomplete # an array, not a query

        ids = taxon_names.collect{|n|
          [
            (n.is_combination? ? n.cached_valid_taxon_name_id : n.id), # Points to the OTU target, if there is one
            n.id,  # points to the label target
          ]
        }

        return ::Otu.none if ids.empty?

        ids.uniq!

        min = 10.0
        max = 20.0
        scale = (max - min) / ids.count.to_f

        # TODO: optimize *
        otus = base_query.select("otus.*, label_target_taxon_name_id, ((#{min} + row_number() OVER ())::float * #{scale}) as priority") # small incrementing numbers for priority
          .joins("INNER JOIN ( SELECT unnest(ARRAY[#{ids.map(&:first).join(',')}]) AS id, unnest(ARRAY[#{ids.map(&:last).join(',')}]) AS label_target_taxon_name_id, row_number() OVER () AS row_num ) AS id_order ON otus.taxon_name_id = id_order.id")
          .order('id_order.row_num')

        otus = scope_autocomplete(otus).includes(:taxon_name)

        otus
      end

      # An autocomplete result that permits displaying the TaxonName as originally matched.
      # @return [Array] of
      #    { otu:,  label_target:, otu_valid_id: } 
      #
      # Note that otu: is really only useful when displaying otus without &having_taxon_name_only=true.  We don't, for example make use 
      # of this element there.
      def api_autocomplete_extended
        otu_queries = QUERIES.dup
        otu_queries.delete(:autocomplete_taxon_name)

        base_otus = autocomplete_base(otu_queries).limit(30)
        taxon_name_otus = autocomplete_taxon_name_extended

        r = []

        base_otus.each do |o|
          r.push({
            otu: o, # contains priority
            label_target: o
          })
        end

        taxon_name_otus.each do |o|
          r.push({
            otu: o,
            label_target: (o.label_target_taxon_name_id ? ::TaxonName.find(o.label_target_taxon_name_id) : o.taxon_name )  # is o.taxon_name true?!
          })
        end

        # Keep a unique set of otu + label (to render)
        seen = Set.new

        # The compacted result
        compact = []

        r.each do |h|
          g = h[:label_target].id.to_s + h[:label_target].class.name
          m = [ h[:otu].id, g ]
          next if seen.include?( m )
          seen << m
          compact.push h
        end

        compact.sort!{|c,d| (c[:otu].priority || 999) <=> (d[:otu].priority || 999 )}

        # TODO: Refactor to remove extra query and assignment of otu_valid_id.  This is ugly.
        otu_order = compact.collect{|d| d[:otu].id}

        # Extra query is painful.
        f = ::Otu.where(id: otu_order)
          .joins('left join taxon_names t1 on otus.taxon_name_id = t1.id')
          .joins('left join otus o2 on t1.cached_valid_taxon_name_id = o2.taxon_name_id')
          .select('distinct on (otus.id) otus.id, otus.name, otus.taxon_name_id, COALESCE(o2.id, otus.id) as otu_valid_id')

        compact.each do |h|
          h[:otu_valid_id] = f.select{|j| j.id == h[:otu].id}.first.otu_valid_id
        end

        compact
      end

      #
      # Doesn't work for extended, as we can have the same OTU with different labels
      #
      def compact_priorities(otus)
        # Mmmmarg!
        # We may have the same name at different priorities, strike all but the highest/first.
        r = []
        i = {}
        otus.each do |o|
          next if i[o.id]
          r.push o
          i[o.id] = true
        end
        r
      end

      def autocomplete
        compact_priorities( autocomplete_base.limit(40) )
      end

      def autocomplete_base(targets = QUERIES)
        queries = []

        targets.each do |q, p|
          if self.respond_to?(q)

            a = send(q)
            next if a.nil? # query has returned nil

            y = p[:priority]

            a = scope_autocomplete(a)

            a = a.select("otus.*, #{y} as priority") unless y.nil?

            queries.push a
          end
        end

        queries.compact!
        referenced_klass_union(queries).order('priority')
      end

      def scope_autocomplete(query)
        query = query.joins(:taxon_name) if with_taxon_name
        query = query.where.missing(:taxon_name) if with_taxon_name == false
        query = query.joins(:taxon_name).where(otus: {name: nil}) if having_taxon_name_only
        query
      end

      # # @return [Array]
      # def autocomplete
      #   result = []
      #   base_queries.each do |q|
      #     result += q.to_a
      #     result.uniq!
      #     break if result.count > 39
      #   end
      #   result[0..39].uniq
      # end

    end
  end
end