npolar/api.npolar.no

View on GitHub
lib/npolar/rack/solrizer.rb

Summary

Maintainability
F
4 days
Test Coverage
# encoding: utf-8
require "time"
module Npolar

  module Rack

    # Solr search middleware
    #
    # Given a search (q=) Solrizer returns OpenSearch-like JSON feeds.
    #
    # Solrizer's behavior is highly pluggable through lambda functions, see config parameters
    # :feed, :q, and :fq below.
    #
    # Usage in Rack config.ru:
    #
    # use Npolar::Rack::Solrizer, { core: "http://localhost:8983/solr/tracking",
    #   facets => ["list", "of", "field", "facets"],
    #   "dates": ["positioned", "measured", "updated", "created", "edited"],
    #   "range_facets": [
    #    {
    #        "field": "measured",
    #        "gap": "+1YEAR",
    #        "start": "NOW/YEAR-100YEARS",
    #        "end": "NOW"
    #    },
    #    {
    #        "field": "latitude",
    #        "gap": 10,
    #        "start": -90,
    #        "end": 90
    #    },
    #    {
    #        "field": "longitude",
    #        "gap": 20,
    #        "start": -180,
    #        "end": 180
    #    },
    #    {
    #        "field": "altitude",
    #        "gap": 1000,
    #        "start": -1000,
    #        "end": 9000
    #    }
    #   ]
    # }
    #
    # https://github.com/npolar/api.npolar.no/blob/master/lib/npolar/rack/solrizer.rb

    class Solrizer < Npolar::Rack::Middleware

      YEAR_REGEX = /^([-])?\d{4,}/
      DATE_REGEX = /(-)?(\d{4,})-(0[1-9]|1[0-2])-([12]\d|0[1-9]|3[01])/
      ISODATE_REGEX = /^#{DATE_REGEX}$/
      MONTH_REGEX = /^(-)?(\d{4,})\-(0[1-9]|1[0-2])$/
      DATETIME_REGEX = /T\d\d[:]\d\d[:]\d\dZ$/
      INTEGER_REGEX = /^[-+]?[0-9]+$/
      FLOAT_REGEX = /^[-+]?[0-9\.]+$/ # may hit x.y.z

      # Default condition lambda (see CONFIG)
      def self.query_or_save_json
        
        lambda {|request|
          if ["GET", "HEAD"].include? request.request_method and not request["q"].nil?
            true
          elsif ["POST","PUT", "DELETE"].include? request.request_method
            true
          else
            false
          end 
        }
      end

      # Use this CONFIG condition lambda for a Search-only Solrizer
      def self.searcher
        lambda {|request| ["GET", "HEAD", "DELETE"].include? request.request_method }
      end

      # Set default Solr URI
      def self.uri=uri
        if uri.respond_to? :gsub
          uri = uri.gsub(/[\/]$/, "")
        end
        @@uri=uri
      end

      # Get default Solr URI
      def self.uri
        @@uri ||= "http://localhost:8983/solr"
      end

      # Middleware config
      CONFIG = {
        :core => nil,
        :condition => self.searcher,
        :facets => nil,
        :group => nil,
        :geojson => nil,
        :model => nil,
        :range_facets => nil,
        :select => nil,
        :fl => Npolar::Api::SolrQuery.fields,
        :q => lambda {|request| Npolar::Api::SolrQuery.q(request)}, # # q=title:foo OR exact:foo OR text:foo*
        :dates => Npolar::Api::SolrQuery.dates,
        :force => nil,
        :fq => lambda {|request|
          [] + request.params.select {|p|
            p =~ /^filter-.{1,}/}.select{|k,v| v =~ /.{1}/}.map {|k,v| "#{k.gsub(/^filter-/, "")}:#{v}"
          }
        },
        :feed => lambda { |response, request| Npolar::Api::SolrFeedWriter.feed(response, request)},
        :log => Npolar::Api.log,
        :save => lambda {|request| raise "Not implemented" },
        :summary => lambda {|doc| doc["summary"]},
        :rows => Npolar::Api::SolrQuery.rows,
        :wt => :ruby,
        :path => "/",
        :to_solr => lambda {|doc|doc},
      }
      
      def condition?(request)
        # Only trigger on the mapped path
        if config[:path].gsub(/\/$/, "") !~ Regexp.new(Regexp.quote(request.path.gsub(/\/$/, "")))
          return false
        end 
        config[:condition].call(request)
      end

      def core
        @core ||= config[:core]
      end

      def core=core
        @core = core
      end

      def force
        config[:force] ||= {}
      end

      def handle(request)
        
        @request = request
        log.info "#{request.request_method} #{request.path} [#{self.class.name}]"
        
        if request["q"] and "POST" == request.request_method
          return handle_search(request)
        end
        case request.request_method
          when "DELETE" then handle_delete(request)
          when "POST", "PUT" then handle_save(request)
          else handle_search(request)
        end
      end

      def handle_delete(request)
        log.debug self.class.name+" DELETE #{request.id}"

        rsolr.delete_by_id request.id

        @app.call(request.env)
      end

      def rsolr
        @rsolr ||= RSolr.connect(:url => uri, :update_format => :json)
      end

      def rsolr=rsolr
        @rsolr=rsolr
      end

      # Save to Solr index
      def handle_save(request)
        
        begin
          t0 = Time.now
          
          response = config[:save].call(request)
          
          elapsed = Time.now-t0

          log.debug "Solr indexing time: #{elapsed} seconds"

          response
          
        rescue => e

          json_error_from_exception(e)
        end
      end
      
      # Solr search
      # Search request handler that returns a JSON feed (default) or CSV, Solr Ruby Hash, Solr XML
      # @return Rack-style HTTP triplet
      def handle_search(request)
        log.debug self.class.name+"#handle_search #{uri} #{solr_params}"
        begin
          
          #fq_bbox = []
          #if params["bbox"]
          #  #w,s,e,n = bbox = params["bbox"].split(" ").map {|c|c.to_f}
          #  #fq_bbox = ["north:[#{s} TO #{n}]", "east:[#{w} TO #{e}]"]
          #end

          response = search

          # if bulk=true in query, proceed
          if request["bulk"] and request["bulk"] == "true"

            # collect id's from search results
            ids = []
            response["response"]["docs"].each do |doc|
              ids << doc["id"]
            end

            log.debug("Bulk-query for ids: #{ids}")

            # prepare to POST ids to upstream storage to bulk-fetch the actual documents
            post_env = request.env
            post_env["REQUEST_METHOD"] = "POST"
            post_env["rack.input"] = StringIO.new({ "keys" => ids }.to_json)
            post_env["CONTENT_TYPE"] = "application/json"

            # POST to upstream storage and return
            resp = @app.call(post_env)
            return [200, headers("json"), resp.body]
          end
          
          if ("geojson" == request.format) or ("json" == request.format and request["variant"]=~ /^geo(\+)?(json)?$/)
            [200, headers("geojson"), [geojson(response).to_json]]
          elsif ["solr"].include? request.format
            [200, headers("json"), [response.to_json]]
          elsif ["csv", "xml"].include? request.format
            #http://wiki.apache.org/solr/CSVResponseWriter
            [200, headers(request.format), [response]]
          elsif ("array" == request.format) or ("json" == request.format and request["variant"]== "array")
            [200, headers("json"), [response["response"]["docs"].to_json]]
          else
            
            #["html", "json", "", nil].include? request.format
            status = response["responseHeader"]["status"]
            qtime = response["responseHeader"]["QTime"]
            hits = response["response"]["numFound"]
            log.debug "Solr hits=#{hits} status=#{status} qtime=#{qtime}"

            [200, headers("json"), [feed(response).to_json]]
          end

        rescue ::RSolr::Error::Http => e
          log.debug self.class.name+"#handle_search raised RSolr::Error::Http"
          json_error_from_exception(e)
        end
      end

      # Performs search using rsolr
      # @return Hash
      def search(params=nil)
        rsolr.get select, :params => solr_params
      end

      # Converts incoming JSON (or other document) to Solr-style JSON Hash
      def to_solr(body)
        config[:to_solr].call(body)
      end

      # Solr core URI
      def uri
        if core =~ /^http(s)?:\/\//
          core.gsub(/[\/]$/, "")
        elsif self.class.uri.is_a? String and self.class.uri =~ /^http(s)?:\/\// 
          self.class.uri.gsub(/[\/]$/, "") + "/#{core}"
        end
      end

      protected

      def feed(response)
        config[:feed].call(response, request)
      end
      
      def geojson(response)
        if request["geometry"]
          geometry = request["geometry"]
        else
          geometry = config[:geojson][:geometry]||="Point"
        end
        latitude = config[:geojson][:latitude]||"latitude"
        longitude = config[:geojson][:longitude]||"longitude"
        Npolar::Api::SolrFeedWriter.geojson_feature_collection(response, request, latitude,longitude, geometry)
      end

      def facets
        unless request["facets"].nil?
          if request["facets"] =~ /,/
            request_facets = request["facets"].split(",")
          else
            return false if /^false$/ =~ request["facets"]
            request_facets = [request["facets"]]
          end
        end
        
        # todo facets vs. multifacets
        if request_facets.respond_to? :"+="
          facets = request_facets += config[:facets]
        else
          facets = config[:facets]
        end
        
        unless facets.nil?
          facets = facets.uniq
          #facets = facets.map {|f|
          #  #"{!ex=multi_#{f}}#{f}"
          #  "#{f}"
          #}
        end
        facets

      end
      
      # Group (by field)
      def group_params
        gp = { :group => true,
          :"group.field" => request["group"],
          :"group.facet" => true,
          :"group.main" => true,
          :"group.limit" => request["group.limit"]||1,
          :"rows" => request["rows"]||-1,
          :"group.sort" => request["group.sort"]
        }
        if config[:group] != []
          if not config[:group].nil? and not config[:group][:sort].nil?
            gp[:"group.sort"] =  config[:search][:group][:sort]
          end
        end
        gp
      end

      def json_error_from_exception(e)
          log.error e
          status = e.response[:status].to_i
          body = e.response[:body]
          explanation = body
          if body =~ /<b>message<\/b>/ 
            explanation = body.split("<b>message</b>")[1]
          elsif body =~ /\'error\'\=\>\{\'msg\'/
            explanation = body.split("'error'=>{'msg'=>'")[1].split("',")[0]
          end

          error = { "error" => { "status" => status, "explanation" => "Solr request failed: #{explanation}"  } }
          [status, headers("json"), [error.to_json]]
      end

      # @return Array fq (filter queries)
      # @todo not(x)
      def fq

        # config[:fq] should contain a lambda that will extract fq from a request
        if config[:fq].respond_to?(:call)
          config_fq = config[:fq].call(request)
        elsif config[:fq].is_a? Array
          config_fq = config[:fq]
        else
          config_fq=[]
        end

        # Merge fq's in the request with filter-field= extracted above
        fq = (request.multi("fq") + config_fq + force.map {|k,v| "#{k}:#{v}" }).uniq.map {|fq|

          if fq =~ /(.*):(.*)/
            
            k,v = fq.split(":",2)

            
            if v =~ /true|false/
              
              "#{k}:#{v}"
              
            elsif v =~ /^(∅|%E2%88%85|null|)$/ui
              "-#{k}:[\"\" TO *]"
            elsif v =~ /(.*)?[.][.](.*)?/ or config[:dates].include? k
  
              # Trick all date queries into range query
              # This makes snazzy shortened year and date queries work for a single year or a single date
              if config[:dates].include? k           
                unless v =~ /(.*)?[.][.](.*)?/
                  v = "#{v}..#{v}"
                end 
              end
  
              from,to = v.split("..")
  
              if from.nil? or "" == from
                from = "*"
              end
              if to.nil? or "" == to
                to = "*"
              end
              
              # Lucene needs the smallest number/date first in a range, so we convert these from string
              if from =~ INTEGER_REGEX and to =~ INTEGER_REGEX
                from = from.to_i
                to = to.to_i
              end
              if from =~ FLOAT_REGEX and to =~ FLOAT_REGEX
                from = from.to_f
                to = to.to_f
              end
              
              if (from != "*" and to != "*" and from.respond_to?(:<) and to.respond_to?(:>))
               
                if from > to
                  from,to = to,from
                end
              end
              from = from.to_s
              to = to.to_s
              
              if config[:dates].include? k

                if from =~ DATETIME_REGEX or to =~ DATETIME_REGEX 
                  from = (from =~ YEAR_REGEX) ? from : "*"
                  to = (to =~ YEAR_REGEX) ? to : "*"
                elsif from =~ DATE_REGEX or to =~ DATE_REGEX
                  from,to = solr_date_range(from,to)
                  # At least one year (from or to)
                elsif from =~ MONTH_REGEX or to =~ MONTH_REGEX
                  from,to = solr_month_range(from,to)
                elsif from =~ YEAR_REGEX or to =~ YEAR_REGEX
                  from,to = solr_year_range(from,to)
                end
  
                
              end 
              "#{k}:[#{from} TO #{to}]"
              # End of Date range

            else


              "#{k}:\"#{CGI.unescape(v.gsub(/(%20|\+)/ui, " "))}\""
            end
          else # fq does not contain :
            fq
          end
      }



        #.map {|k,v| [k, v.)]}
        #qstar = CGI::unescape(qstar)
        # todo facets multifacets
        #unless fq.nil?
        #  fq = fq.map {|q|
        #    k,v = q.split(":")
        #    #"{!tag=multi_#{k}}#{q}"
        #    "#{q}"
        #  }
        #end

      fq
      end

      def fl
        fl = request.params["fields"] ||= config[:fl]
      end

      def log
        @log ||= config[:log]
      end

      def model
        @model ||= config[:model]
      end

      def model=model
        @model=model
      end

      def solr_year_range(from,to)
        unless "*" == from
          from = DateTime.new(from[0..3].to_i, 1, 1).to_time.utc.iso8601
        end

        unless "*" == to
          to = DateTime.new(to[0..3].to_i, 12, 31, 23, 59, 59.999999).to_time.utc.iso8601(6)
        end
    
        [from,to]
      end

      def solr_date_range(from,to)
        # Get the years, or even *
        from_year,to_year=solr_year_range(from,to)
        if "*" == from_year
          from = "*"
        else
          from = DateTime.new(from_year.to_i, from[5..6].to_i, from[8..9].to_i).to_time.utc.iso8601
        end
        if "*" == to_year
          to = "*"
        else

          to = DateTime.new(to_year.to_i, to[5..6].to_i, to[8..9].to_i, 23, 59, 59.999999).to_time.utc.iso8601(6)
        end
        [from,to]
      end

      def solr_month_range(from,to)
        # Get the years, or even *
        from_year,to_year=solr_year_range(from,to)
        if "*" == from_year
          from = "*"
        else
          from = DateTime.new(from_year.to_i, from[5..6].to_i).to_time.utc.iso8601
        end
        if "*" == to_year
          to = "*"
        else
          to = DateTime.new(to_year.to_i, to[5..6].to_i, -1, 23, 59, 59.999999).to_time.utc.iso8601(6)
        end
        [from,to]
      end

      def solr_params
        start = request["start"] ||= 0
        rows  = request["limit"]  ||= config[:rows]
        # Merge with user provided parameters, except some that shouldn't be repeated or has a different external meaning (group is a field in the API and bool in Solr)
        params = request.params.reject {|k,v| k =~ /limit|fields|start|group|facet\.sort|facet\.mincount|filter-/ }.merge ({
            :q=>q, :start => start, :rows => rows,
            :fq => fq,
            :facet => facets.nil? ? false : true,
            :"facet.field" => facets,
            :"facet.mincount" => request["facet.mincount"] ||= 1,
            :"facet.limit" => request["facet.limit"] ||= 100, # -1 == all
            :fl => fl,
            :wt => wt,
            :defType => "edismax",
            :sort => request["sort"] ||= nil,
            :"facet.sort" => request["facet.sort"] ||= "count",
            :qf => "text"
          })
        
        params = params.merge(range_facets)
        
        if request.format == "csv"
          params = params.merge({
            :"csv.separator" => "\t",
            :"csv.mv.separator" => "|",
            :"csv.encapsulator" => '"'
          })
        end
        
        if request["group"] =~ /\w+/
          params = params.merge(group_params)
        end
      
        
        params
      end
      
      def q
        config[:q].call(request)
      end
      
      def range_facets
        if config[:range_facets].nil?
          return {}
        end
        
        facet_range = config[:range_facets].map {|rf|
          if rf.field =~ /^[{][!]}(.)$/
            rf.field.split("}")[1]
          else
            rf.field
          end
        }
        rp = {}
        
        config[:range_facets].each { |rf|
          field = (rf.field =~ /^[{][!]}(.)$/) ? rf.field.split("}")[1] : rf.field
  
          rp.merge!({:"f.#{field}.facet.range.start" => rf.start,
            :"f.#{field}.facet.range.end" => rf.end,
            :"f.#{field}.facet.range.gap" => rf.gap })
        }
        rp[:"facet.range"] = facet_range        
        rp
      end

      # Returns the Solr serch (select) handler (defeult is "select")
      def select
        select = config[:select] ||= "select"
        if  select =~ /^[\/]/
          select = select.gsub(/^[\/]/, "")
        end
        select
      end

      def self.summary(doc)
        config[:summary].call(doc)
      end

      def wt
        wt = case request.format
          when "json" then :ruby
          when "csv" then :csv
          when "xml" then :xml
          else config[:wt]
        end
      end

    end
  end
end