team-umlaut/umlaut

View on GitHub
app/models/hip3/bib_searcher.rb

Summary

Maintainability
B
4 hrs
Test Coverage
#!/usr/bin/ruby


require 'net/http'
require 'nokogiri'

#  Hip3 Module has been written for JHU's HIP3 installation. It may not work
# quite right with other installations, I'm almost certain it needs to be
# abstracted and parameterized better to be more generic.
module Hip3


  # Searches a HIP3 catalog for bibs.
  #
  # If multiple search criteria are supplied, they are 'or'ed together to find
  # bibs matching ANY of the criteria. Keywords should be an array, and are
  # 'and'ed with each other.
  #
  # Searches using the HIP3 XML 'interface', which means it may be sensitive to
  # HIP display changes that change the XML.
  # It finds bib numbers, and creates Hip3::Bib objects based on them. Apart
  # from the bib number (and, for multi-hit results, the title) nothing else
  # is taken from the actual search response, but it could be, to pre-load
  # the bib object.
    class BibSearcher
      ISSN_KW_INDEX = '.IS'
      ISBN_KW_INDEX = '.IB'

      GEN_KW_INDEX = '.GW'
      TITLE_KW_INDEX = '.TW'
      SERIAL_TITLE_KW_INDEX = '.ST'
      AUTHOR_KW_INDEX = '.AW'

      BIBNUM_INDEX = 'BIB'

      SUDOC_KW_INDEX = '.SD'

      # Indexes whose search_hash terms are sent as-is instead of being
      # wrapped in double quotes.
      UNQUOTED_INDEXES = [BIBNUM_INDEX, ISSN_KW_INDEX, ISBN_KW_INDEX, AUTHOR_KW_INDEX].freeze

      # Fixed query parameters that start every search request.
      BASE_QUERY = 'menu=search&aspect=power&npp=30&ipp=20&spp=20&profile=general&ri=2'.freeze

      attr_accessor :httpSession
      attr_accessor :hip_base_url_str, :hip_base_url
      attr_reader :issn, :isbn # writers are defined explicitly below
      attr_accessor :sudoc, :bibnum
      attr_reader :keywords

      # arg_base_path::    base URL of the HIP installation, as a string.
      # arg_http_session:: optional object used to issue requests (it must
      #                    respond to #get(path, headers)). You can pass in a
      #                    Net::HTTP, for instance to keep a persistent
      #                    connection open, but Hip3::HTTPSession is advised
      #                    for its error handling. When omitted, a
      #                    Hip3::HTTPSession for the base URL's host is created.
      def initialize(arg_base_path, arg_http_session=nil)
        self.hip_base_url_str = arg_base_path
        self.hip_base_url = URI.parse(hip_base_url_str)
        self.httpSession = arg_http_session || Hip3::HTTPSession.create(hip_base_url.host)
        self.keywords = []
      end

      # Sets the ISSN to search for. Checks basic well-formedness (seven
      # digits followed by a digit or 'X'; the checksum is not verified) and
      # stores the value WITH a hyphen, because our HIP needs it to search.
      #
      # nil or an empty string clears the ISSN. The argument itself is never
      # modified, so frozen strings are accepted.
      #
      # Raises ArgumentError when no ISSN-shaped run of characters is found.
      def issn=(argIssn)
        if argIssn.nil? || argIssn.empty?
          @issn = nil
          return
        end

        # Drop hyphens on a copy to normalize, then pick out the ISSN itself
        # so that the hyphen is re-inserted relative to the matched digits.
        normalized = argIssn.delete('-')
        digits = normalized[/\d{7}[\dX]/]
        raise ArgumentError, "Malformed issn: #{normalized}" unless digits

        @issn = "#{digits[0, 4]}-#{digits[4, 4]}"
      end

      # Sets the ISBN to search for; nil or an empty string clears it.
      def isbn=(arg_isbn)
        @isbn = (arg_isbn.nil? || arg_isbn.empty?) ? nil : arg_isbn
      end

      # Yet another way to specify search criteria.
      # Hash, where the key is the name of a HIP keyword index (use the
      # constants in this class if possible), and the value is an array of
      # keywords. Everything is "anded" together.
      def search_hash=(hash)
        @search_hash = hash
      end

      # Sets the keywords, searched in the general keyword index.
      def keywords=(arg_kw)
        set_keywords(arg_kw)
      end

      # Sets the keywords (nil means none) and the index they are searched in.
      # args[:index] may be :title or :serial_title; anything else, including
      # no value at all, selects the general keyword index. The args hash is
      # not modified.
      def set_keywords(arg_kw, args={})
        @keywords = arg_kw || []
        @keyword_index =
          case args[:index]
          when :title then TITLE_KW_INDEX
          when :serial_title then SERIAL_TITLE_KW_INDEX
          else GEN_KW_INDEX
          end
      end

      # Returns the URL, starting from /, that specifies the search criteria
      # to HIP. args[:xml] (default true) controls whether the XML version of
      # the response is requested. The args hash is not modified.
      def searchPath(args = {})
        want_xml = args[:xml].nil? ? true : args[:xml]

        path = hip_base_url.path + '?' + BASE_QUERY

        # Need to do search_hash first, to make sure bibnum and isbn search
        # come LAST, for HIP.
        unless @search_hash.blank?
          manual_criteria = @search_hash.collect do |index, kws|
            kws.collect do |kw|
              term = UNQUOTED_INDEXES.include?(index) ? kw : %("#{kw}")
              "&index=#{index}&term=#{escape_term(term)}"
            end
          end
          path << manual_criteria.join("&oper=and") << "&oper=or"
        end

        criteria = []
        criteria << "&index=#{SUDOC_KW_INDEX}&term=#{escape_term(%("#{sudoc}"))}" unless sudoc.blank?
        criteria << "&index=#{ISSN_KW_INDEX}&term=#{escape_term(issn)}" unless issn.nil?
        # For some reason ISBN must be LAST in order, and bibnum must be right
        # before, or HIP doesn't like it. Go figure.
        criteria << "&index=#{BIBNUM_INDEX}&term=#{escape_term(bibnum)}" unless bibnum.blank?
        criteria << "&index=#{ISBN_KW_INDEX}&term=#{escape_term(isbn)}" unless isbn.nil?

        # The keyword criterion is appended even when it is empty, so the
        # query can end in a dangling "&oper=or". That is the request shape
        # this class has always sent.
        # NOTE(review): confirm HIP does not rely on it before removing.
        criteria << keyword_url_args
        path << criteria.join("&oper=or")

        path << "&x=0&y=0&aspect=power"
        path << "&GetXML=1" if want_xml

        path
      end

      # Returns the query-string fragment for the keywords: one quoted,
      # escaped term per keyword in the current keyword index, joined with
      # "and". Empty string when there are no keywords.
      def keyword_url_args
        keywords.collect do |k|
          "&index=#{@keyword_index}&term=#{escape_term(%("#{k}"))}"
        end.join("&oper=and")
      end

      # Returns the number of hits -- does not cache anything: calling this
      # method causes a request to HIP, and calling #search causes another
      # one. Returns 0 without any request when no criteria are set.
      def count
        return 0 if insufficient_query

        response = httpSession.get(searchPath, nil)
        doc = Nokogiri::XML(response.body)

        # Confusingly, sometimes this gives us a search results page, and
        # sometimes it gives us a single bib.
        return 1 if doc.at('searchresponse/fullnonmarc/searchresults/results/row/key')

        # Multiple, get the count
        hits = doc.at('searchresponse/yoursearch/hits')
        hits ? hits.inner_text.to_i : 0
      end

      # Returns an array of Hip3::Bib objects, empty when no criteria are set.
      # Result rows without a key are skipped.
      def search
        return [] if insufficient_query

        response = httpSession.get(searchPath, nil)
        doc = Nokogiri::XML(response.body)

        # Confusingly, sometimes this gives us a search results page, and
        # sometimes it gives us a single bib.
        if (bib_num = doc.at('searchresponse/fullnonmarc/searchresults/results/row/key'))
          # Single bib: hand over the parsed document along with the number.
          return [Hip3::Bib.new(bib_num.inner_text, hip_base_url,
                                :http_session => httpSession,
                                :bib_xml_doc => doc)]
        end

        # Multi-response: get bib number and title for each result.
        rows = doc.search('searchresponse/summary/searchresults/results/row')
        bibs = rows.collect do |row|
          key = row.at('key')
          next unless key

          # Find a title from the summary xml, removing a possible author
          # after a '/' char. That's how HIP rolls. A row may have no title.
          title_el = row.at('TITLE/data/text')
          title = title_el && title_el.inner_text.sub(/\/.*$/, '')

          Hip3::Bib.new(key.inner_text, hip_base_url, :http_session => httpSession, :title => title)
        end
        bibs.compact
      end

      # True when no search criteria at all have been supplied; we have to
      # have some criteria to search.
      def insufficient_query
        issn.nil? && isbn.nil? && sudoc.blank? && bibnum.blank? && keywords.blank? && @search_hash.blank?
      end

      # Returns the absolute URL of the non-XML version of the search.
      # searchPath already begins with the base URL's path and '?', so only
      # scheme, host and (non-default) port are prepended here.
      def search_url
        origin = "#{hip_base_url.scheme}://#{hip_base_url.host}"
        origin << ":#{hip_base_url.port}" unless hip_base_url.port == hip_base_url.default_port
        origin + searchPath(:xml => false)
      end

      private

      # Escapes a term for use as a query-string value, including characters
      # such as '&' and '=' that would otherwise split the parameter.
      def escape_term(term)
        URI.encode_www_form_component(term.to_s)
      end
    end

    
    # A Net::HTTP whose #get follows redirects and raises on any response
    # that is not a 2xx.
    class HTTPSession < Net::HTTP
      # Seconds allowed for opening the connection and for each read.
      @@timeout = 5

      # Upper bound on requests made by one #get call: the initial request
      # plus the redirects followed.
      MAX_REQUESTS = 6

      # Returns a new, not yet started session for a_host/a_port with the
      # read and open timeouts applied.
      def HTTPSession.create(a_host, a_port = 80)
        http = HTTPSession.new(a_host, a_port)
        http.read_timeout = @@timeout
        http.open_timeout = @@timeout
        return http
      end

      # Gets path from this session's host, following redirects, and returns
      # the final response. A block, if given, is passed on to Net::HTTP#get
      # for the initial request.
      #
      # Redirect targets are fetched with Net::HTTP.get_response, i.e. on a
      # separate connection. A relative Location header is resolved against
      # the URL that produced it.
      #
      # Raises (via Net::HTTPResponse#value) when the final response is not a
      # 2xx, which includes a redirect that could not be followed or that is
      # still pending once MAX_REQUESTS requests have been made.
      def get(path, headers=nil, &block)
        response = super(path, headers, &block)
        requests = 1
        base = nil

        while response.kind_of?(Net::HTTPRedirection) && requests < MAX_REQUESTS
          location = response['location']
          # Without a target there is nothing to follow; fall through so that
          # #value reports the redirect status.
          break if location.nil? || location.empty?

          base ||= redirect_base(path)
          target = URI.parse(location)
          target = base.merge(target) if target.relative?

          response = Net::HTTP.get_response(target)
          base = target
          requests += 1
        end

        # This method raises if not 2xx response status.
        # No idea why such a method is called 'value'.
        response.value

        return response
      end

      # Does a get whether or not the connection is already open;
      # if it wasn't already open, makes sure to leave it closed again.
      def self.safe_get(httpObj, path, headers=nil)
        return httpObj.get(path, headers) if httpObj.started?

        # With a block, start will close the connection when we're done.
        httpObj.start { |h| h.get(path, headers) }
      end

      private

      # URL of the request for path on this session, used to resolve relative
      # redirect targets. Falls back to the bare scheme/host/port when path
      # cannot be parsed as a URI reference.
      def redirect_base(path)
        origin = (use_ssl? ? URI::HTTPS : URI::HTTP).build(:host => address, :port => port)
        begin
          origin.merge(path.to_s)
        rescue URI::Error
          origin
        end
      end
    end
    

    
end