supercaracal/lsolr

View on GitHub
lib/lsolr.rb

Summary

Maintainability
C
1 day
Test Coverage
# frozen_string_literal: true

require 'date'

# A query builder of Apache Solr standard Lucene type query for Ruby.
#
# @example How to use. Part 1:
#   LSolr.build(field1: 'hoge', field2: true).to_s
#   #=> 'field1:hoge AND field2:true'
#
# @example How to use. Part 2:
#    params = {
#      field01: 'hoge',
#      field02: :fuga,
#      field03: 14,
#      field04: 7.3,
#      field05: true,
#      field06: false,
#      field07: Date.new(7000, 7, 1),
#      field08: DateTime.new(6000, 5, 31, 6, 31, 43),
#      field09: Time.new(5000, 6, 30, 12, 59, 3),
#      field10: LSolr.new(:field10).fuzzy_match('foo'),
#      field11: [1, 2, 3],
#      field12: 1..10,
#      field13: 20...40,
#      field14: Date.new(3000, 1, 1)..Date.new(4000, 12, 31),
#      field15: (3.0..4.0).step(0.1)
#    }
#
#    LSolr.build(params).to_s
#    #=> 'field01:hoge AND
#    #    field02:fuga AND
#    #    field03:14 AND
#    #    field04:7.3 AND
#    #    field05:true AND
#    #    field06:false AND
#    #    field07:"7000-07-01T00:00:00Z" AND
#    #    field08:"6000-05-31T06:31:43Z" AND
#    #    field09:"5000-06-30T12:59:03Z" AND
#    #    field10:foo~2.0 AND
#    #    field11:(1 2 3) AND
#    #    field12:[1 TO 10] AND
#    #    field13:[20 TO 40} AND
#    #    field14:[3000-01-01T00:00:00Z TO 4000-12-31T00:00:00Z] AND
#    #    field15:[3.0 TO 4.0]'
#
# @example How to use. Part 3:
#    bool1 = LSolr.new(:bool_field).match(true)
#    bool2 = LSolr.new(:bool_field).match(false)
#    date1 = LSolr.new(:date_field1).greater_than_or_equal_to('*').less_than_or_equal_to(Time.new(2000, 6, 30, 23, 59, 59))
#    date2 = LSolr.new(:date_field2).greater_than(Time.new(2000, 7, 1, 0, 0, 0)).less_than(Time.new(2001, 1, 1, 0, 0, 0))
#
#    left = bool1.and(date1).and(date2).wrap
#    right = bool2.and(date1.or(date2).wrap).wrap
#
#    left.or(right).to_s
#    #=> '(bool_field:true AND date_field1:[* TO 2000-06-30T23:59:59Z] AND date_field2:{2000-07-01T00:00:00Z TO 2001-01-01T00:00:00Z})
#    #    OR (bool_field:false AND (date_field1:[* TO 2000-06-30T23:59:59Z] OR date_field2:{2000-07-01T00:00:00Z TO 2001-01-01T00:00:00Z}))'
#
# @example How to use. Part 4:
#    %w[a b c].map { |v| LSolr.new(:field).prefix_match("#{v}*") }.reduce { |a, e| a.or(e) }.wrap.not.to_s
#    #=> 'NOT (field:a* OR field:b* OR field:c*)'
#
# @example How to use. Part 5:
#    LSolr.build('a:1').and(b: 2).to_s
#    #=> 'a:1 AND b:2'
class LSolr
  ArgumentError = Class.new(::ArgumentError)
  TypeError = Class.new(::TypeError)
  IncompleteQueryError = Class.new(StandardError)

  NOT = 'NOT'
  AND = 'AND'
  OR = 'OR'
  TO = 'TO'

  GREATER_THAN = '{'
  LESS_THAN = '}'
  GREATER_THAN_OR_EQUAL_TO = '['
  LESS_THAN_OR_EQUAL_TO = ']'

  WILD_CARD = '*'
  PROXIMITY = '~'
  BOOST = '^'
  CONSTANT_SCORE = '^='

  DELIMITER_SPACE = ' '

  RANGE_FUZZY_MATCH_DISTANCE = (0.0..2.0).freeze

  FORMAT_DATE_TIME = '%Y-%m-%dT%H:%M:%SZ'
  # NOTE: FORMAT_MILLISECOND_FOR_DATE_TYPE and FORMAT_SECOND are no longer used by
  # this class; they are kept so that external references to them keep working.
  FORMAT_MILLISECOND_FOR_DATE_TYPE = '%Q'
  FORMAT_MILLISECOND_FOR_TIME_TYPE = '%L'
  FORMAT_SECOND = '%s'
  FORMAT_INSPECT = '#<%<class>s:%<object>#018x `%<query>s`>'

  PARENTHESIS_LEFT = '('
  PARENTHESIS_RIGHT = ')'

  # The backslash entry is written as an escaped backslash because the joined
  # list is handed to String#tr, where a lone backslash would escape the next char.
  RESERVED_SYMBOLS = %w(- + & | ! ( ) { } [ ] ^ " ~ * ? : \\\\ /).freeze
  RESERVED_WORDS = /(AND|OR|NOT)/.freeze
  REPLACEMENT_CHAR = ' '

  attr_accessor :prev, :operator, :left_parentheses, :right_parentheses, :expr_not

  class << self
    # Builds composite query and returns builder instance.
    #
    # Hash entries are converted to terms and joined with `AND`. An empty Hash or
    # an empty String produces a blank builder (see {#blank?}).
    #
    # @param params [Hash{Symbol => String, Symbol, Integer, Float, true, false, Range, Date, Time, Array<String, Symbol, Integer>}, String] query terms or a raw query
    #
    # @return [LSolr] an instance
    #
    # @raise [LSolr::ArgumentError] if specified parameters have a not supported type value
    def build(params)
      case params
      when Hash
        # `reduce` without a seed returns nil for an empty collection; fall back
        # to a blank builder so that the documented return type always holds.
        params.map { |f, v| build_query(f, v) }.reduce { |a, e| a.and(e) } || new
      when String then build_raw_query(params)
      else raise TypeError, "Could not build solr query. Please specify a Hash or String value. `#{params}` given."
      end
    rescue TypeError => e
      # TypeError resolves to LSolr::TypeError here; it is re-raised as LSolr::ArgumentError.
      raise ArgumentError, "#{e.message} It is not a supported type."
    end

    private

    # Converts one field/value pair to a term, dispatching on the value's class.
    def build_query(field, value) # rubocop:disable Metrics/CyclomaticComplexity
      case value
      when String, Symbol, Integer, Float, true, false then new(field).match(value)
      when Date, Time then new(field).date_time_match(value)
      when LSolr then value
      when Array then build_array_query(field, value)
      when Range then build_range_query(field, value)
      when Enumerator then build_enumerator_query(field, value)
      else raise TypeError, "Could not build solr query. field: `#{field}`, value: `#{value}` given."
      end
    end

    # An empty array yields a blank term, which is skipped when linked.
    def build_array_query(field, values)
      values.empty? ? new(field) : new(field).match_in(values)
    end

    # Converts a Range to a Solr range term. A beginless or endless bound is
    # rendered as the wildcard `*`.
    def build_range_query(field, value)
      first = value.begin.nil? ? WILD_CARD : value.begin
      last = value.end.nil? ? WILD_CARD : value.end
      query = new(field).greater_than_or_equal_to(first)
      value.exclude_end? ? query.less_than(last) : query.less_than_or_equal_to(last)
    end

    # Converts an Enumerator to an inclusive range term from its first to its last element.
    def build_enumerator_query(field, values)
      last = nil
      # Enumerator has no generic #last, so the sequence is walked to its end.
      values.each { |v| last = v }
      new(field).greater_than_or_equal_to(values.first).less_than_or_equal_to(last)
    end

    def build_raw_query(query)
      query.empty? ? new : new.raw(query)
    end
  end

  # Create a new query builder instance.
  #
  # @param field_name [String, Symbol] a field name
  # @return [LSolr] an instance
  #
  # @raise [LSolr::ArgumentError] if a non-nil field name is not a non-empty String or Symbol
  def initialize(field_name = nil)
    if field_name.nil?
      @field = ''
    else
      field(field_name)
    end

    @expr_not = @value = @range_first = @range_last = @boost = @constant_score = @raw = ''
    @left_parentheses = []
    @right_parentheses = []
  end

  # Returns Apache Solr standard lucene type query string.
  #
  # @return [String] a stringified query
  #
  # @raise [LSolr::IncompleteQueryError] if the query is incomplete
  def to_s
    raise IncompleteQueryError, 'Please specify a term of search.' if blank?

    decorate_linked_expressions_if_needed(build_expression)
  end

  alias to_str to_s

  # Returns instance information.
  #
  # @return [String] instance information
  def inspect
    format(FORMAT_INSPECT, class: self.class.name,
                           object: object_id << 1,
                           query: present? ? to_s : '')
  end

  # A query is blank if term is incomplete in expression.
  #
  # A term is complete when it has a raw query, or a field together with either
  # a value or both range bounds.
  #
  # @return [true, false]
  def blank?
    managed_query_absence = @field.empty? || (@value.empty? && (@range_first.empty? || @range_last.empty?))
    managed_query_absence && @raw.empty?
  end

  # A query is present if it's not blank.
  #
  # @return [true, false]
  def present?
    !blank?
  end

  # Sets a field name.
  #
  # @param name [String, Symbol] a field name
  #
  # @return [LSolr] self instance
  #
  # @raise [LSolr::ArgumentError] if specified field name is empty
  def field(name)
    raise ArgumentError, "The field name must be a not empty string value. `#{name}` given." unless present_string?(name)

    @field = name.to_s
    self
  end

  # Sets a raw query.
  #
  # @param query [String] a raw query string
  #
  # @return [LSolr] self instance
  #
  # @raise [LSolr::ArgumentError] if specified raw query string is empty
  def raw(query)
    raise ArgumentError, "The raw query must be a not empty string value. `#{query}` given." unless present_string?(query)

    @raw = query.to_s
    self
  end

  # Adds parentheses to query expression.
  #
  # The opening parenthesis goes on the first term of the chain and the closing
  # one on this (last) term, so the whole linked expression is grouped.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#grouping-terms-to-form-sub-queries Grouping Terms to Form Sub-Queries
  #
  # @return [LSolr] copied self instance
  def wrap
    this = dup
    this.head.left_parentheses << PARENTHESIS_LEFT
    this.right_parentheses << PARENTHESIS_RIGHT
    this
  end

  # Adds the boolean operator `NOT` to query expression.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#the-boolean-operator-not The Boolean Operator NOT ("!")
  #
  # @return [LSolr] copied self instance
  def not
    this = dup
    this.head.expr_not = "#{NOT} "
    this
  end

  # Boosts a query expression.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#boosting-a-term-with Boosting a Term with "^"
  #
  # @param factor [Float] a boost factor number
  #
  # @return [LSolr] self instance
  #
  # @raise [LSolr::ArgumentError] if specified boost factor is invalid
  def boost(factor)
    raise ArgumentError, "The boost factor must be a positive number. `#{factor}` given." unless valid_boost_factor?(factor)

    @boost = "#{BOOST}#{factor}"
    self
  end

  # Specifies scoring result in expression.
  #
  # When both a constant score and a boost are set, the constant score wins.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#constant-score-with Constant Score with "^="
  #
  # @param score [Float] a constant score
  #
  # @return [LSolr] self instance
  #
  # @raise [LSolr::ArgumentError] if specified score number is invalid
  def constant_score(score)
    raise ArgumentError, "The constant score must be a number. `#{score}` given." unless valid_score?(score)

    @constant_score = "#{CONSTANT_SCORE}#{score}"
    self
  end

  # Builds a normal query expression.
  #
  # Arrays are delegated to {#match_in} and dates/times to {#date_time_match}.
  # A value that splits into several words becomes a phrase query.
  #
  # @param value [String, Integer, true, false] a search word or a filter value
  #
  # @return [LSolr] self instance
  #
  # @raise [LSolr::ArgumentError] if specified value is empty
  def match(value)
    raise ArgumentError, "`#{value}` given. It must be a not empty value." unless present_string?(value.to_s)
    return match_in(value) if value.is_a?(Array)
    return date_time_match(value) if value.is_a?(Date) || value.is_a?(Time)

    values = clean(value).split

    if values.size > 1
      phrase_match(values)
    else
      @value = values.join
      self
    end
  end

  # Builds a normal multi value query expression.
  #
  # @param values [Array<String, Symbol, Integer>] search words or filter values
  #
  # @return [LSolr] self instance
  #
  # @raise [LSolr::ArgumentError] if specified value is an empty array or not an array
  def match_in(values)
    raise ArgumentError, "`#{values}` given. It must be a not empty array." unless present_array?(values)
    return match(values.first) if values.size == 1

    values = values.map { |v| clean(v) }
    @value = "(#{values.join(DELIMITER_SPACE)})"
    self
  end

  # Builds a normal query expression with dates and times.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/working-with-dates.html Working with Dates
  #
  # @param value [String, Date, Time] a filter value
  #
  # @return [LSolr] self instance
  def date_time_match(value)
    value = stringify(value, symbols: RESERVED_SYMBOLS - %w[- : . / +])
    @value = %("#{value}")
    self
  end

  # Builds a prefix search query expression.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#wildcard-searches Wildcard Searches
  #
  # @param value [String] a search word
  #
  # @return [LSolr] self instance
  def prefix_match(value)
    @value = clean(value, symbols: RESERVED_SYMBOLS - %w[* ?]).split.join(WILD_CARD)
    self
  end

  # Builds a phrase or proximity search query expression.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#grouping-clauses-within-a-field Grouping Clauses within a Field
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#proximity-searches Proximity Searches
  #
  # @param values [Array<String>] search words
  # @param distance [Integer] proximity distance; ignored unless greater than zero
  #
  # @return [LSolr] self instance
  def phrase_match(values, distance: 0)
    value = values.flat_map { |v| clean(v).split }.join(DELIMITER_SPACE)
    proximity_match = distance.to_s.to_i.positive? ? "#{PROXIMITY}#{distance}" : ''
    @value = %("#{value}"#{proximity_match})
    self
  end

  # Builds a fuzzy search query expression.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#fuzzy-searches Fuzzy Searches
  #
  # @param value [String] a search word
  # @param distance [Float] a proximity distance
  #
  # @return [LSolr] self instance
  #
  # @raise [LSolr::ArgumentError] if specified distance is out of range
  def fuzzy_match(value, distance: 2.0)
    raise ArgumentError, "Out of #{RANGE_FUZZY_MATCH_DISTANCE}. `#{distance}` given." unless valid_fuzzy_match_distance?(distance)

    @value = "#{clean(value).split.join}#{PROXIMITY}#{distance}"
    self
  end

  # Builds a range search query expression with an exclusive lower bound.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#range-searches Range Searches
  #
  # @param value [String, Integer, Date, Time] a filter value
  #
  # @return [LSolr] self instance
  def greater_than(value)
    @range_first = "#{GREATER_THAN}#{stringify(value)}"
    self
  end

  # Builds a range search query expression with an exclusive upper bound.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#range-searches Range Searches
  #
  # @param value [String, Integer, Date, Time] a filter value
  #
  # @return [LSolr] self instance
  def less_than(value)
    @range_last = "#{stringify(value)}#{LESS_THAN}"
    self
  end

  # Builds a range search query expression with an inclusive lower bound.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#range-searches Range Searches
  #
  # @param value [String, Integer, Date, Time] a filter value
  #
  # @return [LSolr] self instance
  def greater_than_or_equal_to(value)
    @range_first = "#{GREATER_THAN_OR_EQUAL_TO}#{stringify(value)}"
    self
  end

  # Builds a range search query expression with an inclusive upper bound.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#range-searches Range Searches
  #
  # @param value [String, Integer, Date, Time] a filter value
  #
  # @return [LSolr] self instance
  def less_than_or_equal_to(value)
    @range_last = "#{stringify(value)}#{LESS_THAN_OR_EQUAL_TO}"
    self
  end

  # Builds a composite query expression.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#the-boolean-operator-and The Boolean Operator AND ("&&")
  #
  # @param another [LSolr, Hash, String] another query builder instance or query params or raw query string
  #
  # @return [LSolr] a copy of the other query linked to a copy of this one, or self if the other query is blank
  def and(another)
    link(another, AND)
  end

  # Builds a composite query expression.
  #
  # @see https://lucene.apache.org/solr/guide/7_2/the-standard-query-parser.html#boolean-operators-supported-by-the-standard-query-parser Boolean Operators Supported by the Standard Query Parser
  #
  # @param another [LSolr, Hash, String] another query builder instance or query params or raw query string
  #
  # @return [LSolr] a copy of the other query linked to a copy of this one, or self if the other query is blank
  def or(another)
    link(another, OR)
  end

  # Returns a first term of query.
  #
  # @return [LSolr] a first term of query.
  def head
    if present_query?(prev)
      prev.head
    else
      self
    end
  end

  private

  # Called on the new copy by `dup`/`clone` with the original as argument.
  # Gives the copy its own chain of previous terms and its own parenthesis
  # arrays so that mutating the copy never touches the source object.
  def initialize_copy(source)
    super
    self.prev = source.prev.dup if present_query?(source.prev)
    self.left_parentheses = source.left_parentheses.dup
    self.right_parentheses = source.right_parentheses.dup
  end

  def range_search?
    @value.empty? && !@range_first.empty? && !@range_last.empty?
  end

  def raw?
    !@raw.empty?
  end

  def present_string?(val)
    !val.nil? && (val.is_a?(String) || val.is_a?(Symbol)) && !val.empty?
  end

  # True for a non-empty Array in which no element stringifies to an empty string
  # (so nil elements are rejected as well).
  def present_array?(val)
    val.is_a?(Array) && !val.empty? && val.none? { |v| v.to_s.empty? }
  end

  def present_query?(val)
    !val.nil? && val.present?
  end

  def valid_boost_factor?(val)
    (val.is_a?(Float) || val.is_a?(Integer)) && val > 0
  end

  def valid_score?(val)
    val.is_a?(Float) || val.is_a?(Integer)
  end

  def valid_fuzzy_match_distance?(val)
    (val.is_a?(Float) || val.is_a?(Integer)) && RANGE_FUZZY_MATCH_DISTANCE.member?(val)
  end

  # Replaces the given reserved symbols with spaces and backslash-escapes the
  # boolean operator words.
  def clean(value, symbols: RESERVED_SYMBOLS)
    value.to_s
         .tr(symbols.join, REPLACEMENT_CHAR)
         .gsub(RESERVED_WORDS) { |match| "\\#{match}" }
  end

  # Formats dates/times for Solr; every other value is cleaned while keeping the
  # characters that are legitimate inside range bounds.
  def stringify(value, symbols: RESERVED_SYMBOLS - %w[- : . / + *])
    if value.is_a?(Date) || value.is_a?(Time)
      format_date(value)
    else
      clean(value, symbols: symbols)
    end
  end

  # Formats a Date, DateTime or Time as `YYYY-MM-DDThh:mm:ssZ`, appending the
  # millisecond part only when it is non-zero. No time zone conversion is done.
  def format_date(date)
    msec_str = case date
               # `%L` is supported by Date, DateTime and Time alike and always
               # yields exactly three digits.
               when Date, Time then date.strftime(FORMAT_MILLISECOND_FOR_TIME_TYPE)
               else raise TypeError, "Could not format dates or times. `#{date}` given."
               end

    return date.strftime(FORMAT_DATE_TIME) if msec_str == '000'

    "#{date.strftime('%Y-%m-%dT%H:%M:%S')}.#{msec_str}Z"
  end

  # Links a copy of `another` after a copy of this query using `operator`.
  # Returns self unchanged when `another` is blank or of an unsupported type.
  def link(another, operator)
    another = build_instance_if_needed(another)
    return self unless present_query?(another)

    another = another.dup
    head = another.head
    head.prev = dup
    head.operator = operator
    another
  end

  def build_instance_if_needed(another)
    case another
    when self.class then another
    when Hash, String then self.class.build(another)
    end
  end

  def build_expression
    if raw?
      @raw
    elsif range_search?
      "#{@field}:#{@range_first} #{TO} #{@range_last}"
    else
      "#{@field}:#{@value}"
    end
  end

  # Applies NOT/parentheses to this term, prepends the previous terms of the
  # chain, and appends the scoring suffix (constant score takes precedence over boost).
  def decorate_linked_expressions_if_needed(expr)
    expr = "#{expr_not}#{left_parentheses.join}#{expr}#{right_parentheses.join}"
    expr = "#{prev} #{operator} #{expr}" if present_query?(prev)
    scoring = present_string?(@constant_score) ? @constant_score : @boost
    "#{expr}#{scoring}"
  end
end