# SpeciesFileGroup/taxonworks
#
# View on GitHub
# lib/queries/taxon_name/autocomplete.rb
#
# Summary
#
# Maintainability
# A
# 0 mins
# Test Coverage
# TaxonNameAutocompleteQuery
module Queries
  module TaxonName
    class Autocomplete < Query::Autocomplete

      # @return [Array]
      #   &nomenclature_group[]=<<Iczn|Icnp|Icn>::<Higher|Family|Genus|Species>>
      #   The reader is overridden further down and returns each value wrapped
      #   as a "%::<value>%" pattern.
      attr_accessor :nomenclature_group

      # @return [Boolean, nil]
      #  &valid=<"true"|"false">
      #     if 'true'  then id == cached_valid_taxon_name_id
      #     if 'false' then id != cached_valid_taxon_name_id
      #     if nil   then no check made, i.e. all names
      #  The incoming string is converted with `boolean_param` in #initialize.
      attr_accessor :valid

      # @return [Array]
      #   &type[]=<Protonym, Combination, Hybrid, etc.>&type[]=<other type> etc.
      #   The reader is overridden further down (flattened, nil-free, unique).
      attr_accessor :type

      # @return [Array]
      #   &parent_id[]=<int>&parent_id[]=<other_int> etc.
      #   The reader is overridden further down (flattened, nil-free, unique).
      attr_accessor :parent_id

      # TODO: this should move to 'mode'

      # @return [Boolean]
      #   &exact=<"true"|"false">
      #   if 'true' then only #name = query_string results are returned (no fuzzy matching)
      #   In #autocomplete this selects #exact_autocomplete over #comprehensive_autocomplete.
      attr_accessor :exact

      # @return [Boolean]
      #   &no_leaves=<"true"|"false">
      #     if 'true' then only names with descendants will be returned
      attr_accessor :no_leaves

      # As determined by GlobalNames parser.
      # The reader is overridden further down; it parses query_string lazily.
      attr_accessor :authorship

      # Stores the taxon-name specific filters, then defers to the superclass
      # initializer with the same arguments (bare `super`).
      #
      # @param string forwarded to the superclass (presumably the search term)
      # @param params [Hash] keyword options; the keys read here are
      #   :nomenclature_group, :valid, :type, :parent_id, :no_leaves and :exact
      def initialize(string, **params)
        # Values kept as given; the overridden readers normalize them to Arrays.
        @nomenclature_group, @type, @parent_id =
          params.values_at(:nomenclature_group, :type, :parent_id)

        # "true"/"false" style flags go through boolean_param.
        @valid = boolean_param(params, :valid)
        @no_leaves = boolean_param(params, :no_leaves)

        # TODO: move to mode
        @exact = boolean_param(params, :exact)

        super
      end

      # @return [Array<String>]
      #   the requested groups (flattened, nil-free, unique), each wrapped as
      #   a "%::<group>%" pattern
      def nomenclature_group
        groups = [@nomenclature_group].flatten.compact.uniq
        groups.map { |group| "%::#{group}%" }
      end

      # @return [Array]
      #   the requested types, flattened, without nils or duplicates
      def type
        requested = [@type].flatten # fresh Array, safe to mutate below
        requested.compact!
        requested.uniq!
        requested
      end

      # @return [Array]
      #   the requested parent ids, flattened, without nils or duplicates
      def parent_id
        ids = [@parent_id].flatten
        ids.compact.uniq
      end

      # Combines every active filter clause into a single AND-ed expression.
      #
      # @return [Arel::Nodes, nil]
      #   nil when none of the individual filters is active
      def and_clauses
        clauses = [
          valid_state,
          is_type,
          with_parent_id,
          with_nomenclature_group
        ].compact

        # `compact` always returns an Array, so emptiness (not nil-ness) is
        # the "no filters" condition.
        return nil if clauses.empty?

        clauses.reduce { |combined, clause| combined.and(clause) }
      end

      # and clause: compares id with cached_valid_taxon_name_id
      # @return [Arel::Nodes::<>, nil]
      #   nil when no validity filter was requested (@valid is nil)
      def valid_state
        return nil if @valid.nil?

        id = table[:id]
        valid_id = table[:cached_valid_taxon_name_id]

        if valid
          id.eq(valid_id)
        else
          id.not_eq(valid_id)
        end
      end

      # and clause: restricts the `type` column to the requested types
      # @return [Arel::Nodes::<>, nil]
      #   nil when no types were requested
      def is_type
        types = type
        types.empty? ? nil : table[:type].in(types)
      end

      # and clause, limit to ancestors or [ids]
      # @return [Arel::Nodes::<>, nil]
      #   nil when no parent ids were requested
      def with_parent_id
        ids = parent_id
        return nil if ids.empty?

        ancestor = taxon_name_hierarchies_table[:ancestor_id]
        ancestor.in(ids)
      end

      # and clause: rank_class must match any of the requested group patterns
      # @return [Arel::Nodes::Grouping, nil]
      #   nil when no nomenclature group was requested
      def with_nomenclature_group
        patterns = nomenclature_group
        patterns.empty? ? nil : table[:rank_class].matches_any(patterns)
      end

      # `cached` equals the query string exactly.
      # @return [Scope] at most 20 rows, ordered by cached_author_year
      def autocomplete_exact_cached
        condition = table[:cached].eq(query_string).to_sql

        base_query
          .where(condition)
          .order('cached_author_year ASC')
          .limit(20)
      end

      # `cached_original_combination` equals the query string exactly.
      # @return [Scope] at most 20 rows, ordered by cached_author_year
      def autocomplete_exact_cached_original_combination
        condition = table[:cached_original_combination].eq(query_string).to_sql

        base_query
          .where(condition)
          .order('cached_author_year ASC')
          .limit(20)
      end

      # `cached_original_combination` matches the `wildcard_pieces` pattern.
      # @return [Scope] at most 20 rows, ordered by cached_author_year
      def autocomplete_wildcard_cached_original_combination
        condition = table[:cached_original_combination].matches(wildcard_pieces).to_sql

        base_query
          .where(condition)
          .order('cached_author_year ASC')
          .limit(20)
      end

      # Exact `name` plus a year: applies only when the query holds exactly one
      # digit-free alphabetic string and at least one year.
      # @return [Scope, nil] at most 10 rows; nil when the query does not have that shape
      def autocomplete_exact_name_and_year
        names = alphabetic_strings.reject { |s| s =~ /\d/ }
        found_years = years
        return nil if names.size != 1 || found_years.empty?

        name_is = table[:name].eq(names.first)
        year_like = table[:cached_author_year].matches_any(wildcard_wrapped_years)
        base_query.where(name_is.and(year_like).to_sql).limit(10)
      end

      # `name` equals the query string exactly.
      # @return [Scope] at most 20 rows, ordered by cached_author_year
      def autocomplete_exact_name
        condition = table[:name].eq(query_string).to_sql

        base_query
          .where(condition)
          .order('cached_author_year ASC')
          .limit(20)
      end

      # First row whose `cached` starts with the query string.
      # @return [Scope] at most 1 row
      def autocomplete_top_cached
        prefix_pattern = "#{query_string}%"
        condition = table[:cached].matches(prefix_pattern)
        base_query.where(condition.to_sql).limit(1)
      end

      # `cached` starts with the query string; backslashes are removed from
      # the query before the pattern is built.
      # @return [Scope] at most 20 rows
      def autocomplete_cached_end_wildcard
        prefix = query_string.delete('\\')
        condition = table[:cached].matches("#{prefix}%")
        base_query.where(condition.to_sql).limit(20)
      end

      # First row whose `cached` ends with "(<query string>)".
      # @return [Scope] at most 1 row
      def autocomplete_top_cached_subgenus
        subgenus_pattern = "%(#{query_string})"
        condition = table[:cached].matches(subgenus_pattern)
        base_query.where(condition.to_sql).limit(1)
      end

      # `cached` matches the given pattern as-is.
      # @param [String, nil] result a LIKE pattern, e.g. the value of #genus_species
      # @return [Scope, nil] at most 8 rows; nil when result is nil
      def autocomplete_genus_species1(result)
        return if result.nil?

        condition = table[:cached].matches(result).to_sql
        base_query.where(condition).order('type DESC, cached ASC').limit(8)
      end

      # `cached` matches the given pattern followed by a trailing wildcard.
      # @param [String, nil] result a LIKE pattern, e.g. the value of #genus_species
      # @return [Scope, nil] at most 8 rows; nil when result is nil
      def autocomplete_genus_species2(result)
        return if result.nil?

        condition = table[:cached].matches("#{result}%").to_sql
        base_query.where(condition).order('type DESC, cached ASC').limit(8)
      end

      # `name` starts with the query string.
      # @return [Scope] at most 20 rows
      def autocomplete_cached_name_end_wildcard
        prefix_pattern = "#{query_string}%"
        condition = table[:name].matches(prefix_pattern)
        base_query.where(condition.to_sql).limit(20)
      end

      # `cached` matches the query string after ". " is collapsed to a space
      # and every whitespace character or backslash is replaced by `%`.
      # @return [Scope] at most 20 rows
      def autocomplete_cached_wildcard_whitespace
        without_abbreviation_dots = query_string.gsub('. ', ' ')
        pattern = without_abbreviation_dots.gsub(/[\s\\]/, '%')

        condition = table[:cached].matches(pattern)
        base_query.where(condition.to_sql).limit(20)
      end

      # When `fragments` has exactly two entries, the first is matched against
      # `name` and the second against `cached_author_year`.
      # @return [Scope, nil] at most 20 rows; nil for any other fragment count
      def autocomplete_name_author_year_fragment
        parts = fragments
        return nil unless parts.size == 2

        name_like = table[:name].matches(parts[0])
        author_year_like = table[:cached_author_year].matches(parts[1])
        base_query.where(name_like.and(author_year_like).to_sql).limit(20)
      end

      # `cached_author_year` contains all `pieces`, in order, with anything in between.
      # @return [Scope, nil] at most 20 rows ordered by cached; nil when there are no pieces
      def autocomplete_wildcard_author_year_joined_pieces
        return nil if pieces.empty?

        pattern = "%#{pieces.join('%')}%"
        condition = table[:cached_author_year].matches(pattern)
        base_query.where(condition.to_sql).order('cached ASC').limit(20)
      end

      # `cached` contains all `alphabetic_strings`, in order, with anything in between.
      # @return [Scope, nil] at most 10 rows; nil when there are no alphabetic strings
      def autocomplete_wildcard_joined_strings
        return nil if alphabetic_strings.empty?

        pattern = "%#{alphabetic_strings.join('%')}%"
        condition = table[:cached].matches(pattern)
        base_query.where(condition.to_sql).limit(10)
      end

      # `cached_author_year` matches the authorship extracted from the query.
      # @return [Scope, nil] at most 10 rows; nil when #authorship is nil
      def autocomplete_taxon_name_author_year_matches
        author_year = authorship
        return nil if author_year.nil?

        condition = table[:cached_author_year].matches(author_year)
        base_query.where(condition.to_sql).limit(10)
      end

      #    def autocomplete_cached
      #      base_query.where(table[:cached].eq(query_term))
      #    end

      # ---- gin methods
      # Consider word_similarity()

      # Similarity search on `cached` using PostgreSQL `similarity()` and the
      # `%` operator, limited to rows with the current project_id.
      # Each row carries the score as `sml`; only scores above 0.6 are kept.
      # @return [Scope] ordered by descending similarity, then cached
      def autocomplete_cached
        scored_columns = ApplicationRecord.sanitize_sql(
          ['taxon_names.*, similarity(?, taxon_names.cached) AS sml', query_string]
        )
        above_cutoff = ApplicationRecord.sanitize_sql_array(
          ["similarity('%s', taxon_names.cached) > 0.6", query_string]
        )

        ::TaxonName
          .where(project_id:)
          .select(scored_columns)
          .where('taxon_names.cached % ?', query_string) # `%` in where means nothing < 0.3 (internal PG similarity value)
          .where(above_cutoff)
          .order('sml DESC, taxon_names.cached')
      end

      # Similarity search on `cached_original_combination` using PostgreSQL
      # `similarity()` and the `%` operator. Each row carries the score as
      # `sml`; only scores above 0.6 are kept. No project filter is applied here.
      # @return [Scope] ordered by descending similarity, then cached_original_combination
      def autocomplete_original_combination
        scored_columns = ApplicationRecord.sanitize_sql(
          ['taxon_names.*, similarity(?, taxon_names.cached_original_combination) AS sml', query_string]
        )
        above_cutoff = ApplicationRecord.sanitize_sql_array(
          ["similarity('%s', taxon_names.cached_original_combination) > 0.6", query_string]
        )

        ::TaxonName
          .select(scored_columns)
          .where('taxon_names.cached_original_combination % ?', query_string)
          .where(above_cutoff)
          .order('sml DESC, taxon_names.cached_original_combination')
      end

      # Similarity search on `cached_author_year` using PostgreSQL
      # `similarity()` and the `%` operator. Each row carries the score as
      # `sml`; only scores above 0.6 are kept. No project filter is applied here.
      # @return [Scope] ordered by descending similarity, then cached_author_year
      def autocomplete_cached_author_year
        scored_columns = ApplicationRecord.sanitize_sql(
          ['taxon_names.*, similarity(?, taxon_names.cached_author_year) AS sml', query_string]
        )
        above_cutoff = ApplicationRecord.sanitize_sql(
          ["similarity('%s', taxon_names.cached_author_year) > 0.6", query_string]
        )

        ::TaxonName
          .select(scored_columns)
          .where('taxon_names.cached_author_year % ?', query_string)
          .where(above_cutoff)
          .order('sml DESC, taxon_names.cached_author_year')
      end

      # Weights. The theory (using the term loosely) is that a larger weight
      # proportionally increases the importance, in the result list, of the
      # corresponding element. The tradeoff is subtle, but seemed to work at first try.
      #
      # In #autocomplete_combined_gin each weight multiplies one similarity
      # score (name, cached_author_year, cached, cached_original_combination
      # respectively) before the products are summed into `sml_tn`.
      CACHED_NAME_WEIGHT = 8.0
      CACHED_AUTHOR_YEAR_WEIGHT = 6.0
      CACHED_WEIGHT = 4.0
      CACHED_ORIGINAL_COMBINATION_WEIGHT = 2.0

      # Used in /otus/api/v1/autocomplete
      #
      # Builds one ranked query from four similarity scores: name, cached_author_year,
      # cached and cached_original_combination. The scores are combined using the
      # *_WEIGHT constants of this class into `sml_tn`, which drives the ordering.
      #
      # @return [Scope] distinct taxon names with an extra `sml_t` column, highest score first
      def autocomplete_combined_gin
        # Step 1: candidate rows with their four raw similarity scores.
        # Note the author/year score is computed against #authorship, the
        # other three against query_string. Candidates are those where
        # cached_author_year, cached_original_combination or cached satisfy `%`.
        a = ::TaxonName.select(ApplicationRecord.sanitize_sql(
          ['taxon_names.*, similarity(?, name) AS sml_n, similarity(?, taxon_names.cached_author_year) AS sml_cay, similarity(?, cached) AS sml_c, similarity(?, taxon_names.cached_original_combination) AS sml_coc',
           query_string, authorship, query_string, query_string])
                              ).where('taxon_names.cached_author_year % ? OR taxon_names.cached_original_combination % ? OR cached % ?', query_string, query_string, query_string)

        # Step 2: wrap the candidates in a CTE (`tns`) and compute the weighted
        # sum; COALESCE turns a NULL score into 0.
        s = 'WITH tns AS (' + a.to_sql + ') ' +
          ::TaxonName
          .select(Arel.sql("taxon_names.*, (( COALESCE(tns1.sml_n,0) * #{CACHED_NAME_WEIGHT} + \
                                                  COALESCE(tns1.sml_cay,0) * #{CACHED_AUTHOR_YEAR_WEIGHT} + \
                                                  COALESCE(tns1.sml_c,0) * #{CACHED_WEIGHT} + \
                                                  COALESCE(tns1.sml_coc,0) * #{CACHED_ORIGINAL_COMBINATION_WEIGHT} \
                                                )) sml_tn"))
          .joins('JOIN tns as tns1  on tns1.id = taxon_names.id')
          .to_sql

        # Step 3: select from the composed SQL as if it were the taxon_names
        # table, so the result is a regular TaxonName scope.
        ::TaxonName.select('taxon_names.*, sml_tn as sml_t').from('(' + s + ') as taxon_names').order('sml_tn DESC').distinct
      end

      # Used in New taxon name task, for example
      #  TODO: what is intent?
      #
      # Query list used by #autocomplete when `exact` is set. Order matters:
      # #autocomplete runs the entries in sequence and stops once it has
      # enough rows.
      # @return [Array] scopes; entries may be nil
      def exact_autocomplete
        exact_matches = [
          autocomplete_exact_id,
          autocomplete_exact_cached,
          autocomplete_exact_cached_original_combination,
          autocomplete_identifier_cached_exact,
          autocomplete_identifier_identifier_exact,
          autocomplete_exact_name_and_year
        ]

        wildcard_matches = [
          autocomplete_cached_end_wildcard,
          autocomplete_cached_wildcard_whitespace,
          autocomplete_name_author_year_fragment,
          autocomplete_taxon_name_author_year_matches,
          autocomplete_wildcard_joined_strings,
          autocomplete_wildcard_author_year_joined_pieces,
          autocomplete_wildcard_cached_original_combination,
          autocomplete_exact_name # not exact enough, want the whole thing?
          # autocomplete_top_cached, # not exact at all
        ]

        exact_matches + wildcard_matches
      end

      # TODO: Refactor to OTU approach?
      #
      # Query list used by #autocomplete when `exact` is not set. Order matters:
      # #autocomplete runs the entries in sequence and stops once it has
      # enough rows.
      # @return [Array] scopes; entries may be nil
      def comprehensive_autocomplete
        # Parsed once and shared by both genus/species queries.
        genus_species_pattern = genus_species

        [
          autocomplete_exact_cached,
          autocomplete_exact_cached_original_combination,
          autocomplete_exact_name_and_year,
          autocomplete_exact_name,

          autocomplete_exact_id,
          autocomplete_identifier_cached_exact,
          autocomplete_identifier_identifier_exact,

          # All exact should be before these?
          #
          # These are left in, but the cutoff
          # is now 2x as high, i.e. more like wildcard matches we
          # were originally used to.
          autocomplete_cached, # sim
          autocomplete_original_combination, # sim
          autocomplete_cached_author_year, # sim

          # Specialized results
          autocomplete_genus_species1(genus_species_pattern),    # not tested
          autocomplete_genus_species2(genus_species_pattern),    # not tested
          autocomplete_top_cached_subgenus,  # not tested

          # autocomplete_top_cached, # Wildcard end
          # autocomplete_cached_end_wildcard,
          # autocomplete_cached_name_end_wildcard,
          # autocomplete_cached_wildcard_whitespace,
          # autocomplete_name_author_year_fragment,
          # autocomplete_taxon_name_author_year_matches,
          autocomplete_wildcard_joined_strings,
          autocomplete_wildcard_author_year_joined_pieces,
          autocomplete_wildcard_cached_original_combination
        ]
      end

      # Short query list: exact id, the combined similarity query, then exact
      # identifier match.
      # @return [Array] scopes, in the order they should be run
      def unified_autocomplete
        by_id = autocomplete_exact_id
        by_similarity = autocomplete_combined_gin
        by_identifier = autocomplete_identifier_cached_exact

        [by_id, by_similarity, by_identifier]
      end

      # Runs the selected query list in order, applying the shared filters to
      # every query, and accumulates rows until at least 20 have been collected.
      #
      # NOTE(review): the 20-row check happens before duplicates are removed,
      # so the returned Array can hold fewer than 20 unique rows, or more than
      # 20 rows overall. This matches the previous behaviour.
      #
      # @return [Array] unique records, in the order the queries produced them
      def autocomplete
        # exact, unified, comprehensive
        queries = (exact ? exact_autocomplete : comprehensive_autocomplete).compact

        # These do not change between queries; compute them once.
        project = project_id
        clauses = and_clauses
        ancestor_ids = parent_id

        result = []

        queries.each do |q|
          a = q
          a = a.where(project_id: project) if project.present?
          a = a.where(clauses.to_sql) if clauses
          a = a.descendants_of(::TaxonName.where(id: ancestor_ids)) unless ancestor_ids.empty?
          a = a.not_leaves if no_leaves

          result += a.limit(20).to_a
          break if result.count > 19
        end

        # result[0..19]
        result.uniq
      end

      # Parses the query string and keeps only what is assumed to be the
      # genus and species, joined by a `%` wildcard.
      #
      # @return [String, nil]
      #   "<genus>%<species>", or nil unless the parser yields both parts
      def genus_species
        parser = Vendor::Biodiversity::Result.new
        parser.name = query_string
        parser.parse # called for its effect on the parser; the return value is not needed

        genus = parser.genus
        species = parser.species
        return nil unless genus && species

        "#{genus}%#{species}"
      end

      # Shared starting scope for the non-gin queries: all taxon name columns
      # plus char_length(cached), ancestor hierarchies included, shortest
      # `cached` first.
      # TODO: this should deprecate for gin based approaches.
      # @return [Scope]
      def base_query
        columns = 'taxon_names.*, char_length(taxon_names.cached)'
        shortest_first = Arel.sql('char_length(taxon_names.cached), taxon_names.cached ASC')

        ::TaxonName
          .select(columns)
          .includes(:ancestor_hierarchies)
          .order(shortest_first)
      end

      # @return [Arel::Table]
      #   a new Arel handle on the taxon_name_hierarchies table
      def taxon_name_hierarchies_table
        table_name = 'taxon_name_hierarchies'
        Arel::Table.new(table_name)
      end

      # cached_author_year must match any of `terms`.
      # @return [Arel::Nodes] the result of `matches_any`
      def with_cached_author_year
        author_year = table[:cached_author_year]
        author_year.matches_any(terms)
      end

      # Authorship portion of the query string, memoized once a truthy value
      # has been found.
      #
      # @return [String, nil]
      #   the parser's normalized authorship when the query parses (nil when
      #   the parse result carries none); otherwise whatever follows a leading
      #   lowercase word, or '' when the fallback pattern does not match
      def authorship
        return @authorship if @authorship

        parsed = ::Biodiversity::Parser.parse(query_string)

        @authorship =
          if parsed.dig(:parsed)
            parsed.dig(:authorship, :normalized)
          elsif (fallback = query_string.match(/\A[a-z]+\s*\,?\s*(.*)\Z/))
            # Gnparser doesn't parse names like `aus Jones`; quick and dirty
            # check for things like `foo Jones`. Trailing backslashes are dropped.
            fallback[1].gsub(/\\+\z/, '')
          else
            ''
          end
      end

      # Note this overwrites the commonly used Geo parent/child!
      # def parent_child_where
      #   b,a = query_string.split(/\s+/, 2)
      #   return table[:id].eq('-1') if a.nil? || b.nil?
      #   table[:name].matches("#{a}%").and(parent[:name].matches("#{b}%"))
      # end

    end
  end
end