# app/models/hip3/bib_searcher.rb
#!/usr/bin/ruby
require 'net/http'
require 'nokogiri'
# The Hip3 module was written for JHU's HIP3 installation. It may not work
# quite right with other installations; it almost certainly needs to be
# abstracted and parameterized further to be more generic.
module Hip3
# Searches a HIP3 catalog for bibs. If multiple search criteria are supplied,
# they are 'or'ed together to find bibs matching ANY criterion.
# keywords should be an array; its members are 'and'ed together.
# Searches using the HIP3 XML 'interface', which means it may be sensitive to
# HIP display changes that change the XML.
# It finds bib numbers and creates Hip3::Bib objects based on each bib number.
# Besides the bib number, it passes along the title (from a multi-result
# page) or the whole XML document (from a single-bib response) to the
# Hip3::Bib constructor; more of the response could be used to pre-load the
# bib object.
class BibSearcher
  ISSN_KW_INDEX = '.IS'
  ISBN_KW_INDEX = '.IB'
  GEN_KW_INDEX = '.GW'
  TITLE_KW_INDEX = '.TW'
  SERIAL_TITLE_KW_INDEX = '.ST'
  AUTHOR_KW_INDEX = '.AW'
  BIBNUM_INDEX = 'BIB'
  SUDOC_KW_INDEX = '.SD'

  # Indexes whose search_hash terms are sent without surrounding double quotes.
  UNQUOTED_INDEXES = [BIBNUM_INDEX, ISSN_KW_INDEX, ISBN_KW_INDEX, AUTHOR_KW_INDEX].freeze
  # Fixed query-string prefix sent with every search.
  BASE_QUERY = 'menu=search&aspect=power&npp=30&ipp=20&spp=20&profile=general&ri=2'

  attr_accessor :httpSession
  attr_accessor :hip_base_url_str, :hip_base_url
  attr_reader :issn, :isbn # writers provided concretely
  attr_accessor :sudoc, :bibnum
  attr_reader :keywords

  # arg_base_path is the HIP base URL as a string.
  #
  # You can pass in a Net::HTTP as the second argument, if you'd for instance
  # like to keep open a persistent connection. You are advised to use our
  # special Hip3::HTTPSession, for its error handling. Or better yet, just
  # leave the second argument empty, and we'll create one for you (using the
  # host, port and scheme of the base URL).
  def initialize(arg_base_path, arg_http_session = nil)
    self.hip_base_url_str = arg_base_path
    self.hip_base_url = URI.parse(hip_base_url_str)
    self.httpSession = arg_http_session || default_http_session
    @search_hash = nil
    self.keywords = []
  end

  # Checks for basic well-formedness (doesn't actually check the checksum),
  # and adds the hyphen if necessary, because our HIP needs it to search. Bah!
  #
  # nil or an empty string clears the ISSN. The argument itself is never
  # modified. Raises ArgumentError unless the value, once hyphens and
  # surrounding whitespace are removed, is exactly seven digits followed by a
  # digit or X (either case).
  def issn=(argIssn)
    if argIssn.nil? || argIssn.empty?
      @issn = nil
      return
    end

    # first remove hyphen to normalize (on a copy, not the caller's string)
    normalized = argIssn.delete('-').strip.upcase

    # now check for basic well-formedness; anchored so that extra leading or
    # trailing characters are rejected rather than silently sliced off
    unless normalized =~ /\A\d{7}[\dX]\z/
      raise ArgumentError.new("Malformed issn: #{argIssn}")
    end

    # now put the hyphen back, sadly
    @issn = normalized[0..3] + '-' + normalized[4..7]
  end

  # Sets the ISBN criterion; nil or an empty string clears it.
  def isbn=(arg_isbn)
    @isbn = (arg_isbn.nil? || arg_isbn.empty?) ? nil : arg_isbn
  end

  # Yet another way to specify search criteria.
  # Hash, where the key is the name of a HIP keyword index (use
  # constants in this class if possible), and the value is an array of
  # keywords. Everything is "anded" together.
  def search_hash=(hash)
    @search_hash = hash
  end

  # Sets the keywords, searched in the general keyword index.
  def keywords=(arg_kw)
    set_keywords(arg_kw)
  end

  # Sets the keywords (an array; nil means none) and the index they are
  # searched in. args[:index] may be :title or :serial_title; anything else
  # selects the general keyword index. The args hash is not modified.
  def set_keywords(arg_kw, args = {})
    @keywords = arg_kw.nil? ? [] : arg_kw
    @keyword_index =
      case args[:index]
      when :title then TITLE_KW_INDEX
      when :serial_title then SERIAL_TITLE_KW_INDEX
      else GEN_KW_INDEX
      end
  end

  # Returns the URL starting from / that specifies the search criteria to
  # HIP. Pass :xml => false to leave off the GetXML parameter; it is included
  # by default. The args hash is not modified.
  def searchPath(args = {})
    want_xml = args[:xml].nil? ? true : args[:xml]

    # Each entry is one 'or'ed criterion. Order matters to HIP:
    # search_hash must come first, so that bibnum and isbn come LAST.
    groups = []

    unless @search_hash.blank?
      manual_criteria = @search_hash.collect do |index, kws|
        quoted = !UNQUOTED_INDEXES.include?(index)
        Array(kws).collect { |kw| index_term(index, kw, quoted) }
      end
      groups << manual_criteria.flatten.join('&oper=and')
    end

    groups << index_term(SUDOC_KW_INDEX, sudoc, true) unless sudoc.blank?
    groups << index_term(ISSN_KW_INDEX, issn, false) unless issn.nil?
    # For some reason ISBN must be LAST in order, and bibnum must be right
    # before, or HIP doesn't like it.
    groups << index_term(BIBNUM_INDEX, bibnum, false) unless bibnum.blank?
    # Go figure. I hate you, HIP.
    groups << index_term(ISBN_KW_INDEX, isbn, false) unless isbn.nil?
    groups << keyword_url_args

    path = "#{hip_base_url.path}?#{BASE_QUERY}"
    # Empty groups are dropped so no operator is left dangling without a
    # following index/term pair.
    path << groups.reject(&:empty?).join('&oper=or')
    path << '&x=0&y=0&aspect=power'
    path << '&GetXML=1' if want_xml
    path
  end

  # Returns the query-string fragment for the keywords, each one quoted and
  # 'and'ed with the others; an empty string when there are no keywords.
  def keyword_url_args
    keywords.collect { |k| index_term(@keyword_index, k, true) }.join('&oper=and')
  end

  # Returns the number of hits (0 when no search criteria are set). Does not
  # cache anything: calling this method will cause a trip to the db, and
  # calling search will cause another one.
  def count
    return 0 if insufficient_query

    http_resp = httpSession.get(searchPath, nil)
    doc = Nokogiri::XML(http_resp.body)

    # Confusingly, sometimes this gives us a search results page, and
    # sometimes it gives us a single bib.
    return 1 if doc.at('searchresponse/fullnonmarc/searchresults/results/row/key')

    # Multiple, get the count
    hits = doc.at('searchresponse/yoursearch/hits')
    hits ? hits.inner_text.to_s.to_i : 0
  end

  # Returns an array of bib objects (empty when no search criteria are set).
  def search
    return [] if insufficient_query

    http_resp = httpSession.get(searchPath, nil)
    doc = Nokogiri::XML(http_resp.body)

    # Confusingly, sometimes this gives us a search results page, and
    # sometimes it gives us a single bib.
    single_key = doc.at('searchresponse/fullnonmarc/searchresults/results/row/key')
    if single_key
      return [Hip3::Bib.new(single_key.inner_text, hip_base_url,
                            :http_session => httpSession,
                            :bib_xml_doc => doc)]
    end

    # Multi-response: get bib number and title for each result row.
    rows = doc.search('searchresponse/summary/searchresults/results/row')
    bibs = rows.collect do |row|
      key = row.at('key')
      next unless key

      # Find a title from the summary xml, and remove a possible author on
      # there, after a '/' char. That's how HIP rolls. A row may have no title.
      title_el = row.at('TITLE/data/text')
      title = title_el ? title_el.inner_text.sub(/\/.*$/, '') : nil

      Hip3::Bib.new(key.inner_text, hip_base_url, :http_session => httpSession, :title => title)
    end
    # Rows without a key produced nil above; don't hand those to callers.
    bibs.compact
  end

  # True when no search criteria at all have been set; have to have some
  # search criteria to search.
  def insufficient_query
    issn.nil? && isbn.nil? && sudoc.blank? && bibnum.blank? && keywords.blank? && @search_hash.blank?
  end

  # Returns the absolute URL of the same search, without the GetXML parameter.
  def search_url
    # searchPath already begins with the base URL's path and '?', so only
    # the part of the base URL before the path is prepended.
    origin = hip_base_url.dup
    origin.path = ''
    origin.query = nil
    origin.fragment = nil
    origin.to_s + searchPath(:xml => false)
  end

  private

  # Creates a Hip3::HTTPSession for the host and port of the base URL,
  # switching on SSL when the base URL is https.
  def default_http_session
    session = Hip3::HTTPSession.create(hip_base_url.host, hip_base_url.port || 80)
    session.use_ssl = true if hip_base_url.scheme == 'https'
    session
  end

  # Builds one "&index=...&term=..." fragment, wrapping the term in double
  # quotes when asked to. The term is escaped for use as a query-string value.
  def index_term(index, term, quoted)
    term = term.to_s
    term = '"' + term + '"' if quoted
    "&index=#{index}&term=#{CGI.escape(term)}"
  end
end
# A Net::HTTP that follows redirects on #get and raises when the final
# response is not a 2xx.
class HTTPSession < Net::HTTP
  # Seconds allowed for opening a connection, and for reading a response.
  TIMEOUT_SECONDS = 5
  # Most requests a single #get will make, counting the initial one.
  MAX_REQUESTS = 6

  # Returns a new, not yet started, session for the given host and port,
  # with read and open timeouts set.
  def self.create(a_host, a_port = 80)
    http = new(a_host, a_port)
    http.read_timeout = TIMEOUT_SECONDS
    http.open_timeout = TIMEOUT_SECONDS
    http
  end

  # Like Net::HTTP#get, but follows redirects (at most MAX_REQUESTS requests
  # in total) and raises if the final response does not have a 2xx status.
  # The block, if any, is handed to Net::HTTP#get for the initial request.
  # Redirects are followed with Net::HTTP.get_response, so they do not use
  # this session's connection, the given headers, or the block.
  def get(path, headers = nil, &block)
    response = super(path, headers, &block)
    requests = 1
    current_url = nil

    # follow redirects; one without a Location header ends the loop and is
    # reported by response.value below
    while response.kind_of?(Net::HTTPRedirection) && response['location'] && requests < MAX_REQUESTS
      target = URI.parse(response['location'])
      if target.relative?
        # A relative Location is resolved against the URL just requested.
        current_url ||= request_url(path)
        target = current_url.merge(target)
      end
      current_url = target
      response = Net::HTTP.get_response(current_url)
      requests += 1
    end

    # This method raises if not 2xx response status.
    # No idea why such a method is called 'value'
    response.value
    response
  end

  # Does a get whether or not the connection is already open,
  # if it wasn't already open, will make sure to leave it closed again.
  def self.safe_get(httpObj, path, headers = nil)
    return httpObj.get(path, headers) if httpObj.started?

    # With a block, will close the connection when we're done.
    httpObj.start { |h| h.get(path, headers) }
  end

  private

  # Absolute URL of a path requested on this session.
  def request_url(path)
    scheme = use_ssl? ? 'https' : 'http'
    # An IPv6 literal has to be bracketed inside a URL.
    host = address.include?(':') ? "[#{address}]" : address
    URI.join("#{scheme}://#{host}:#{port}/", path)
  end
end
end