app/classes/document_query.rb

Summary

Maintainability
D
2 days
Test Coverage
A
100%
# frozen_string_literal: true

class DocumentQuery
  include Elasticsearch::DSL

  HIGHLIGHT_OPTIONS = {
    pre_tags: ["\ue000"],
    post_tags: ["\ue001"]
  }.freeze

  DEFAULT_STOPWORDS = %w[
    a an and are as at be but by for if in into is it
    no not of on or such that the their then there these
    they this to was will with
  ].freeze

  FILTERABLE_TEXT_FIELDS = %i[audience
                              content_type
                              mime_type
                              searchgov_custom1
                              searchgov_custom2
                              searchgov_custom3
                              tags].freeze

  FILTERABLE_DATE_FIELDS = %i[created
                              changed].freeze

  attr_reader :audience,
              :content_type,
              :date_range,
              :date_range_created,
              :excluded_sites,
              :ignore_tags,
              :thumbnail_url,
              :included_sites,
              :language,
              :mime_type,
              :searchgov_custom1,
              :searchgov_custom2,
              :searchgov_custom3,
              :site_filters,
              :tags
  attr_accessor :query,
                :search

  def initialize(options)
    @options = options
    @date_range = { gte: @options[:min_timestamp], lt: @options[:max_timestamp] }
    @date_range_created = { gte: @options[:min_timestamp_created], lt: @options[:max_timestamp_created] }
    @excluded_sites = []
    @ignore_tags = options[:ignore_tags]
    @included_sites = []
    @search = Search.new
    parse_filters
    parse_query(options[:query]) if options[:query]
  end

  def body
    search.source source_fields
    search.sort { by :changed, order: 'desc' } if @options[:sort_by_date]
    if query.present?
      query_options
    end
    build_search_query
    search.explain true if Rails.logger.debug? # scoring details
    search
  end

  def query_options
    set_highlight_options
    search.suggest(:suggestion, suggestion_hash)
    FILTERABLE_TEXT_FIELDS.each do |facet|
      search.aggregation(facet, aggregation_hash(facet))
    end
    FILTERABLE_DATE_FIELDS.each do |date_facet|
      search.aggregation(date_facet, date_aggregation_hash(date_facet))
    end
  end

  def full_text_fields
    @full_text_fields ||= begin
      %w[title description content].index_with { |field| suffixed(field) }
    end
  end

  def common_terms_hash
    {
      query: query,
      cutoff_frequency: 0.05,
      minimum_should_match: { low_freq: '3<90%', high_freq: '2<90%' }
    }
  end

  def source_fields
    default_fields = %w[title path created changed thumbnail_url]
    fields = (@options[:include] || default_fields).push('language')
    fields.map { |field| full_text_fields[field] || field }
  end

  def timestamp_filters_present?
    @options[:min_timestamp].present? or @options[:max_timestamp].present?
  end

  def created_timestamp_filters_present?
    @options[:min_timestamp_created].present? or @options[:max_timestamp_created].present?
  end

  def boosted_fields
    full_text_fields.values.map do |field|
      if /title/ === field
        "#{field}^2"
      elsif /description/ === field
        "#{field}^1.5"
      else
        field.to_s
      end
    end
  end

  def functions
    [
      # Prefer more recent documents
      {
        gauss: {
          changed: { origin: 'now', scale: '1825d', offset: '30d', decay: 0.3 }
        }
      },

      # Avoid pdfs, etc.
      {
        filter: {
          terms: {
            extension: %w[doc docx pdf ppt pptx xls xlsx]
          }
        },
        weight: '.75'
      },

      # Prefer documents that have been clicked more often
      {
        field_value_factor: {
          field: 'click_count', modifier: 'log1p', factor: 2, missing: 1
        }
      }
    ]
  end

  private

  def suffixed(field)
    [field, language].compact.join('_')
  end

  def parse_query(query)
    site_params_parser = QueryParser.new(query)
    @site_filters = site_params_parser.site_filters
    @included_sites = @site_filters[:included_sites]
    @excluded_sites = @site_filters[:excluded_sites]
    @query = site_params_parser.stripped_query
  end

  def parse_filters
    @audience = @options[:audience]
    @content_type = @options[:content_type]
    @language = @options[:language] || 'en'
    @mime_type = @options[:mime_type]
    @searchgov_custom1 = @options[:searchgov_custom1]
    @searchgov_custom2 = @options[:searchgov_custom2]
    @searchgov_custom3 = @options[:searchgov_custom3]
    @tags = @options[:tags]
  end

  def set_highlight_options
    highlight_fields = highlight_fields_hash
    search.highlight do
      pre_tags HIGHLIGHT_OPTIONS[:pre_tags]
      post_tags HIGHLIGHT_OPTIONS[:post_tags]
      fields highlight_fields
    end
  end

  def aggregation_hash(facet_field)
    {
      terms: {
        field: facet_field
      }
    }
  end

  def date_aggregation_hash(date_facet_field)
    {
      date_range: {
        field: date_facet_field,
        format: '8M/d/u',
        ranges: [
          {
            key: 'Last Week',
            from: 'now-1w',
            to: 'now'
          },
          {
            key: 'Last Month',
            from: 'now-1M',
            to: 'now'
          },
          {
            key: 'Last Year',
            from: 'now-12M',
            to: 'now'
          }
        ]
      }
    }
  end

  def suggestion_hash
    { text: query_without_stopwords,
      phrase: {
        field: 'bigrams',
        size: 1,
        highlight: suggestion_highlight,
        collate: { query: { source: { multi_match: { query: '{{suggestion}}',
                                                     type: 'phrase',
                                                     fields: "*_#{language}" } } } }
      } }
  end

  def highlight_fields_hash
    {
      full_text_fields['title'] => {
        number_of_fragments: 0,
        type: 'fvh'
      },
      full_text_fields['description'] => {
        fragment_size: 75,
        number_of_fragments: 2,
        type: 'fvh'
      },
      full_text_fields['content'] => {
        fragment_size: 75,
        number_of_fragments: 2,
        type: 'fvh'
      }
    }
  end

  def suggestion_highlight
    {
      pre_tag: HIGHLIGHT_OPTIONS[:pre_tags].first,
      post_tag: HIGHLIGHT_OPTIONS[:post_tags].first
    }
  end

  # Temporary fix for https://github.com/elastic/elasticsearch/issues/34282
  def query_without_stopwords
    (query.downcase.split(/ +/) - DEFAULT_STOPWORDS).join(' ')
  end

  # Disabling length-related cops, as this method is intended to mimic the structure
  # of a complex Elasticsearch query using the Elasticsearch DSL
  # https://github.com/elastic/elasticsearch-ruby/tree/master/elasticsearch-dsl
  # rubocop:disable Metrics/MethodLength, Metrics/BlockLength
  def build_search_query
    doc_query = self

    search.query do
      function_score do
        functions doc_query.functions

        query do
          bool do
            if doc_query.query.present?
              must do
                bool do
                  # prefer bigram matches
                  should { match bigrams: { operator: 'and', query: doc_query.query } }
                  should { term  promote: true }

                  # prefer_word_form_matches
                  must do
                    bool do
                      should do
                        bool do
                          must do
                            simple_query_string do
                              query doc_query.query
                              fields doc_query.boosted_fields
                            end
                          end

                          unless doc_query.query.match(/".*"/)
                            must do
                              bool do
                                doc_query.full_text_fields.values.each do |field|
                                  should { common({ field => doc_query.common_terms_hash }) }
                                end
                              end
                            end
                          end
                        end
                      end

                      should { match(audience: { operator: 'and', query: doc_query.query }) }
                      should { match(basename: { operator: 'and', query: doc_query.query }) }
                      should { match(searchgov_custom1: { operator: 'and', query: doc_query.query.downcase }) }
                      should { match(searchgov_custom2: { operator: 'and', query: doc_query.query.downcase }) }
                      should { match(searchgov_custom3: { operator: 'and', query: doc_query.query.downcase }) }
                      should { match(tags: { operator: 'and', query: doc_query.query.downcase }) }
                    end
                  end
                end
              end
            end

            filter do
              bool do
                must { term language: doc_query.language } if doc_query.language.present?

                minimum_should_match '100%'
                should do
                  bool do
                    if doc_query.included_sites.any?
                      minimum_should_match 1

                      doc_query.included_sites.each do |site_filter|
                        should do
                          bool do
                            must { term domain_name: site_filter.domain_name }
                            must { term url_path: site_filter.url_path } if site_filter.url_path.present?
                          end
                        end
                      end
                    end
                  end
                end

                FILTERABLE_TEXT_FIELDS.each do |field|
                  next if doc_query.send(field).blank?

                  should do
                    bool do
                      doc_query.send(field).each do |field_value|
                        minimum_should_match 1
                        should { term "#{field}": field_value.downcase }
                      end
                    end
                  end
                end

                must { range changed: doc_query.date_range } if doc_query.timestamp_filters_present?
                must { range created: doc_query.date_range_created } if doc_query.created_timestamp_filters_present?

                if doc_query.ignore_tags.present?
                  must_not do
                    terms tags: doc_query.ignore_tags
                  end
                end

                doc_query.excluded_sites.each do |site_filter|
                  if site_filter.url_path.present?
                    must_not { regexp path: { value: "https?:\/\/#{site_filter.domain_name}#{site_filter.url_path}/.*" } }
                  else
                    must_not { term domain_name: site_filter.domain_name }
                  end
                end
              end
            end
          end
        end
      end
    end
  end
  # rubocop:enable Metrics/MethodLength, Metrics/BlockLength
end