lib/queries/taxon_name/tabular.rb from SpeciesFileGroup/taxonworks

lib/queries/taxon_name/tabular.rb
Summary

Maintainability

4 hrs
Test Coverage

Issues
module Queries
  module TaxonName

    # Summarize data across a nomenclatural hierarchy.
    #
    # Results are Arrays, not AR objects.
    #
    # The approach is to use OVER/PARTITION to build a ranked list, then to summarize only those ranked data requested.
    # When we select without preserving all ranks, then the data are not sortable according to their overall nesting.
    #
    # The approach only succeeds on the hierarchical data because we cache the current valid id for every name, this
    # allows us to build an ordered, nested by hierarchy list across not only valid, but also combination and invalid names.
    #
    # These were useful references:
    #   * https://sonnym.github.io/2017/06/05/common-table-expressions-in-activerecord-a-case-study-of-quantiles/
    #   * https://blog.codeship.com/folding-postgres-window-functions-into-rails/
    class Tabular < Query::Autocomplete

      include Queries::Helpers

      # @return [Integer]
      #   required, the id scoping the result set
      attr_accessor :taxon_name_id

      # @return [Array]
      #   like 'family', 'genus', 'species'
      # ORDER matters
      attr_accessor :ranks

      # @return [Array]
      #   like 'family', 'genus', 'species'
      # ORDER MATTERS
      attr_accessor :rank_data

      # !! CAREFUL, too small limits will not properly nest all names !!
      #   defaults to 1000
      attr_accessor :limit

      # @return [Boolean]
      #   if true than also include Combinations in counts
      attr_accessor :combinations

      # @return [Array of Strings]
      #  named sets of columns to include
      #  valid values are
      #    - nomenclatural_stats
      #    - observations
      attr_accessor :fieldsets

      # @return [Boolean]
      #   if true then only include valid names
      attr_accessor :validity

      ### Internal

      # @return [ the build query ]
      attr_accessor :query

      # @return [Protonym, nil]
      attr_accessor :ancestor

      attr_accessor :rank_id_fields

      # @return [Array]
      #   of hashes, with name of column, and SQL field projection definition
      #      { header: 'foo'
      #       projected: 'taxon_names.id'  }
      attr_accessor :columns

      # @param params [Hash]
      #   keys are symbols
      def initialize(params = {})
        super(nil, project_id: params[:project_id]) # We don't actually use project_id here

        @taxon_name_id = params[:taxon_name_id]
        @combinations = boolean_param(params, :combinations)
        @limit = params[:limit]
        @ranks = params[:ranks]
        @rank_data = params[:rank_data]
        @validity = boolean_param(params, :validity)
        @fieldsets = params[:fieldsets]
        @rank_id_fields = []

        initialize_columns
        build_query
      end

      # These are generic headers used for existing queries, ultimately subclass this with
      # a reference to core, if it can be abstracted as such
      def initialize_columns
        @columns = []
        valid_table = table.alias('valid_taxon_names')

        @columns.push({ header: 'ro', projected: rank_over(table, valid_table) })

        @columns.push({header: 'taxon_name_id', projected: table[:id].as('taxon_name_id') } )
        @columns.push({header: 'cached_valid_taxon_name_id', projected: table[:cached_valid_taxon_name_id].as('cached_valid_taxon_name_id') } )
        @columns.push({header: 'cached', projected: table[:cached].as('cached') } )
        @columns.push({header: 'cached_author_year', projected: table[:cached_author_year].as('cached_author_year') } )
        @columns.push({header: 'cached_is_valid', projected: table[:cached_is_valid].as('cached_is_valid') } )

        if fieldsets.include?('observations')
          @columns.push({header: 'otu_id', projected: otu_table[:id].as('otu_id')  } )
          @columns.push({header: 'otu_name', projected:  otu_table[:name].as('otu_name')} )
        end
      end

      def fieldsets=(array)
        @fieldsets = array
        initialize_columns
      end

      def fieldsets
        [@fieldsets].flatten.compact
      end

      def column_headers
        columns.collect{|c| c[:header]}
      end

      def projected_columns
        columns.collect{|c| c[:projected]}
      end

      def ranks
        [@ranks].flatten.compact
      end

      def rank_data
        [@rank_data].flatten.compact
      end

      def ancestor
        return nil unless taxon_name_id
        @ancestor ||= ::Protonym.find(taxon_name_id)
      end

      def limit
        @limit ||= 1000
      end

      def hierarchy_table
        Arel::Table.new(:taxon_name_hierarchies)
      end

      def otu_table
        ::Otu.arel_table
      end

      def base_query
        q = table
        h = hierarchy_table

        # Scope all names in the result
        a = table[:id].eq(h[:descendant_id])
          .and(h[:ancestor_id].eq(taxon_name_id) )
        a = a.and(table[:cached_is_valid].eq(true)) if validity
        #a = a.and(table[:cached_valid_taxon_name_id].eq(table[:id])) if validity

        # Start a query
        q = q.join(h, Arel::Nodes::InnerJoin).on(a)

        # TODO: not always true now
        # All results can reference an otu
        q.join(otu_table, Arel::Nodes::OuterJoin).on(
          otu_table[:taxon_name_id].eq( table[:id] ).and( otu_table[:id].not_eq(nil)))
      end

      def build_query
        q = base_query

        # Add a join to the valid parent_id
        valid_table = table.alias('valid_taxon_names')
        q = q.join(valid_table, Arel::Nodes::InnerJoin).on(
          table[:cached_valid_taxon_name_id].eq(valid_table[:id])
        )

        # Setup the basic joins for each requested rank,
        # these are re-used in data gathering queries.
        ranks.each_with_index do |r, i|
          l = table.alias("j_#{i}")

          a = l[:id].eq(hierarchy_table[:descendant_id])
          a = a.and( l[:rank_class].eq( Ranks.lookup(ancestor.nomenclatural_code, ranks[i])) )

          q = q.join(l, Arel::Nodes::OuterJoin).on(a)

          @columns.push( {header: r, projected:  l[:name].as( ranks[i] )  } )

          @rank_id_fields.push l[:id]
        end

        # add individual sets of data columns
        fieldsets.each do |f|
          q = send(f + '_set', q)
        end

        # TODO: rank_classes getter should merge nil when combinations,
        # i.e. we likely don't need nil
        w = table[:rank_class].in(rank_classes)
        w = w.or(table[:rank_class].eq(nil)) if combinations

        q = q.project(*projected_columns)
        q = q.from(table).where(w).distinct
        q = q.take(limit.to_i) if limit

        @query = table.project( Arel::Nodes::SqlLiteral.new('*') ).from(q.as('p')).order( 'cached_valid_taxon_name_id', 'ro' ) # if every field had a value this would work
      end

      def rank_classes
        ranks.collect{|r| Ranks.lookup(ancestor.nomenclatural_code, r)}
      end

      # @return t[:id] for the finest rank, t is aliased
      def finest_rank_id
        coalesce_ranks(rank_id_fields.reverse)
      end

      def coalesce_ranks(fields)
        Arel::Nodes::NamedFunction.new('coalesce', [fields])
      end

      # Lets us generate a sort order across valid, invalid, and combinations
      def rank_over(source_table, valid_source_table)
        Arel::Nodes::Over.new(
          Arel::Nodes::SqlLiteral.new('rank()'),
          Arel::Nodes::Window.new.partition(
            [ valid_source_table[:parent_id] ]
          ).order( source_table[:cached_valid_taxon_name_id], source_table[:cached], source_table[:name] )
        ).as('ro')
      end

      def all
        ApplicationRecord.connection.execute(@query.to_sql)
      end

      # Only calculates for OTUs, not through to
      # get CollectionObjects, nor Extracts
      def observations_set(query)
        o = ::Observation.arel_table

        # Descriptors on OTUs scored
        f = 'descriptors_scored_for_otus'
        fa = 'fs_d1'
        @columns.push({header: f, projected: '"' + fa + '"."' + f + '" as ' + f } )

        z = o.project(
          o[:observation_object_id],
          o[:observation_object_type],
          o[:descriptor_id].count(true).as(f) # count(true) == distinct
        ).group(
          o[:observation_object_id],
          o[:observation_object_type],
        ).as(fa)

        query.join(z, Arel::Nodes::OuterJoin).on(
          z[:observation_object_id].eq(otu_table[:id])
          .and(z[:observation_object_type].eq('Otu'))
        )

        # Observations on OTUs
        f = 'otu_observation_count'
        fa = 'fs_o1'
        @columns.push({header: f, projected: '"' + fa +  '"."' + f + '" as ' + f })

        x = o.project(
          o[:observation_object_id],
          o[:observation_object_type],
          o[:id].count.as(f)
        ).group(o[:observation_object_id], o[:observation_object_type]).as(fa)

        query.join(x, Arel::Nodes::OuterJoin).on(x[:observation_object_id].eq(otu_table[:id]).and( x[:observation_object_type].eq('Otu')  ))

        # Depictions on observations on OTUs
        f = 'otu_observation_depictions'
        fa = 'fs_o2'
        @columns.push( {header: f, projected: '"' + fa + '"."' + f + '" as ' + f } )

        p = ::Depiction.arel_table
        y = o.join(p, Arel::Nodes::InnerJoin).on(
          o[:id].eq(p[:depiction_object_id]).and(
            p[:depiction_object_type].eq('Observation')
          )
        ).project(
          o[:observation_object_id], # an OTU id
          p[:depiction_object_type],
          p[:id].count.as(f)
        ).group(
          o[:observation_object_id],
          p[:depiction_object_type]
        ).as(fa)

        query.join(y, Arel::Nodes::OuterJoin).on(
          y[:observation_object_id].eq(otu_table[:id])
        )
        query
      end

      def nomenclatural_stats_set(query)
        c = rank_data

        i = 0
        c.each do |r|
          %w{valid invalid}.each do |v|
            nomenclature_stats_column(query, r, v, i)
            i += 1
          end
        end

        if combinations
          nomenclature_stats_column(query, 'combination', nil, i)
        end

        query
      end

      def nomenclature_stats_column(query, rank, valid = nil, index = 0)
        i = index
        v = valid
        r = rank

        h = hierarchy_table

        # May be alias?
        t = table

        s = [v, r].compact.join('_')
        a = ['ns_o', r, i, v].join('_')
        @columns.push({header: s, projected: "\"#{a}\".\"#{s}\" as #{s}" })

        x = t.where(
          # h[:generations].gt(0)
          t[:cached_is_valid].eq( v == 'valid' ? true : false )
          .and( t[:rank_class].eq( Ranks.lookup(ancestor.nomenclatural_code, r) ) )
        ).join(h, Arel::Nodes::InnerJoin).on(

          h[:descendant_id].eq( t[:cached_valid_taxon_name_id] )  ) # h[:descendant_id].eq( t[:id] ) )
          .project(
            h[:ancestor_id],
            t[:id].count.as( s )
          ).group(h[:ancestor_id]) # TODO: Might be wrong
            .as(a)

          query.join(x, Arel::Nodes::OuterJoin).on( x[:ancestor_id].eq( finest_rank_id  ))
          query
      end

    end
  end
end

# An alternate approach was explored, but it's overkill given our cached valid id:
#
# WITH RECURSIVE subordinates AS (
#    SELECT
#       id,
#       parent_id,
#       rank_class,
#       name,
#       cached_valid_taxon_name_id,
#       cached
#     FROM
#       taxon_names
#    WHERE
#       id = 373809
#    UNION
#       SELECT
#          e.id,
#          e.parent_id,
#          e.rank_class,
#          e.name,
#          e.cached_valid_taxon_name_id,
#          e.cached
#       FROM
#          taxon_names e
#       INNER JOIN subordinates s ON s.id = e.parent_id
# ) SELECT
#    *
# FROM
#    subordinates
# order by cached_valid_taxon_name_id, cached, name