# app/service_adaptors/blacklight.rb
require 'nokogiri'
require 'open-uri'
require 'base64'
require 'marc'
require 'cgi'
# Searches a Blacklight catalog instance with the CQL search extension installed.
#
# Params include:
# [base_url]
#     Required. Complete URL to catalog.atom action. Eg "https://blacklight.mse.jhu.edu/catalog.atom"
# [bl_fields]
#     Required, with at least some entries, if you want this to do anything.
#     Maps the semantic fields below to the index field names in your BL instance.
#     * issn
#     * isbn
#     * lccn
#     * oclcnum
#     * id (defaults to 'id')
#     * title
#     * author
#     * serials_limit_clause => not an index name, but a full URL clause for a limit to apply to known-serials searches, for instance "f[format][]=Serial"
# [identifier_search]
#     Do catalog search on issn/isbn/oclcnum/lccn/bibId. Default true.
# [keyword_search]
#     Do catalog search on title/author keywords where applicable. Generally only used when identifier_search finds no hits, if identifier_search is on. Default true.
# [keyword_per_page]
#     How many records to fetch from blacklight when doing keyword searches. Default 10.
# [exclude_holdings]
#     Can be used to exclude certain 'dummy' holdings that have certain collection, location, or other values. Eg:
#       exclude_holdings:
#         collection_str:
#           - World Wide Web
#           - Internet
# [rft_id_bibnum_prefixes]
#     Array of URI prefixes in an rft_id that indicate that the actual solr id comes next. For instance, if your blacklight will send "http://blacklight.com/catalog/some_id" in an rft_id, then include "http://blacklight.com/catalog/". Optional.
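#
# Example configuration (a hypothetical services.yml entry; Solr field names
# like "isbn_t" are illustrative, substitute whatever your Blacklight instance
# actually indexes):
#
#   blacklight:
#     type: Blacklight
#     display_name: Our Library Catalog
#     base_url: https://blacklight.example.edu/catalog.atom
#     bl_fields:
#       isbn: isbn_t
#       issn: issn_t
#       lccn: lccn_t
#       oclcnum: oclcnum_t
#       title: title_t
#       author: author_t
#     rft_id_bibnum_prefixes:
#       - https://blacklight.example.edu/catalog/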
class Blacklight < Service
required_config_params :base_url, :display_name
attr_reader :base_url, :cql_search_field
attr_reader :bl_fields, :issn
include UmlautHttp
include MetadataHelper
include MarcHelper
include XmlSchemaHelper
def initialize(config)
# defaults
    # If you are sending an OpenURL from a library service, you may
    # have the HIP bibnum, and include it in the OpenURL as, eg,
    # rft_id=http://catalog.library.jhu.edu/bib/343434 (except URL-encoded).
    # Then you'd include "http://catalog.library.jhu.edu/bib/" in
    # rft_id_bibnum_prefixes.
@rft_id_bibnum_prefixes = []
@cql_search_field = "cql"
@keyword_per_page = 10
@identifier_search = true
@keyword_search = true
    @link_to_search = true
    @exclude_holdings = {}
    super(config)
    @bl_fields = { "id" => "id" }.merge(@bl_fields || {})
end
# Standard method, used by background service updater. See Service docs.
def service_types_generated
types = [ ServiceTypeValue[:fulltext], ServiceTypeValue[:holding], ServiceTypeValue[:table_of_contents], ServiceTypeValue[:relevant_link] ]
return types
end
def handle(request)
ids_processed = []
holdings_added = 0
if (@identifier_search && url = blacklight_precise_search_url(request) )
doc = Nokogiri::XML( http_fetch(url).body )
ids_processed.concat( bib_ids_from_atom_entries( doc.xpath("atom:feed/atom:entry", xml_ns) ) )
# namespaces make xpath harder than it should be, but css
# selector still easy, thanks nokogiri! Grab the marc from our
# results.
marc_matches = doc.xpath("atom:feed/atom:entry/atom:content[@type='application/marc']", xml_ns).collect do |encoded_marc21|
MARC::Reader.decode( Base64.decode64(encoded_marc21.text).force_encoding("UTF-8") )
end
add_856_links(request, marc_matches )
      # Have to make a second fetch for dlf_expanded info, because BL doesn't
      # (yet) let us ask for more than one content format at once.
holdings_url = blacklight_precise_search_url( request, "dlf_expanded" )
      holdings_added += add_holdings(request, holdings_url) if holdings_url
end
    # Keyword search.
if (@keyword_search &&
url = blacklight_keyword_search_url(request))
doc = Nokogiri::XML( http_fetch(url).body )
# filter out matches whose titles don't really match at all, or
# which have already been seen in identifier search.
entries = filter_keyword_entries(request, doc.xpath("atom:feed/atom:entry", xml_ns) , :exclude_ids => ids_processed, :remove_subtitle => (! title_is_serial?(request.referent)) )
marc_by_atom_id = {}
# Grab the marc from our entries. Important not to do a // xpath
# search, or we'll wind up matching parent elements not actually
# included in our 'entries' list.
marc_matches = entries.xpath("atom:content[@type='application/marc']", xml_ns).collect do |encoded_marc21|
marc = MARC::Reader.decode( Base64.decode64(encoded_marc21.text).force_encoding("UTF-8") )
marc_by_atom_id[ encoded_marc21.at_xpath("ancestor::atom:entry/atom:id/text()", xml_ns).to_s ] = marc
marc
end
# We've filtered out those we consider just plain bad
# matches, everything else we're going to call
# an approximate match. Sort so that those with
# a date close to our request date are first.
if ( year = get_year(request.referent))
marc_matches = marc_matches.partition {|marc| get_years(marc).include?( year )}.flatten
end
# And add in the 856's
add_856_links(request, marc_matches, :match_reliability => ServiceResponse::MatchUnsure)
# Fetch and add in the holdings
url = blacklight_url_for_ids(bib_ids_from_atom_entries(entries))
      holdings_added += add_holdings(request, url, :match_reliability => ServiceResponse::MatchUnsure, :marc_data => marc_by_atom_id) if url
      if (@link_to_search && holdings_added == 0)
hit_count = doc.at_xpath("atom:feed/opensearch:totalResults/text()", xml_ns).to_s.to_i
html_result_url = doc.at_xpath("atom:feed/atom:link[@rel='alternate'][@type='text/html']/attribute::href", xml_ns).to_s
if hit_count > 0
request.add_service_response(
:service => self,
:source_name => @display_name,
:count => hit_count,
            :display_text => "#{hit_count} possible #{hit_count > 1 ? 'matches' : 'match'} in #{@display_name}",
:url => html_result_url,
:service_type_value => :holding_search )
end
end
end
return request.dispatched(self, true)
end
  # Send a CQL request for any identifiers present.
  # Ask for an atom response with embedded marc21 back.
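  #
  # Eg, with a hypothetical configured field "isbn_t" and an incoming ISBN,
  # the generated URL looks like:
  #   https://blacklight.example.edu/catalog.atom?search_field=cql&content_format=marc&q=isbn_t+%3D+%229780262033848%22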
def blacklight_precise_search_url(request, format = "marc")
# Add search clauses for our identifiers, if we have them and have a configured search field for them.
clauses = []
added = []
["lccn", "isbn", "oclcnum"].each do |key|
if bl_fields[key] && request.referent.send(key)
clauses.push( "#{bl_fields[key]} = \"#{request.referent.send(key)}\"")
added << key
end
end
# Only add ISSN if we don't have an ISBN, reduces false matches
if ( !added.include?("isbn") &&
bl_fields["issn"] &&
request.referent.issn)
clauses.push("#{bl_fields["issn"]} = \"#{request.referent.issn}\"")
end
# Add Solr document identifier if we can get one from the URL
if (id = get_solr_id(request.referent))
clauses.push("#{bl_fields['id']} = \"#{id}\"")
end
# if we have nothing, we can do no search.
return nil if clauses.length == 0
cql = clauses.join(" OR ")
return base_url + "?search_field=#{@cql_search_field}&content_format=#{format}&q=#{CGI.escape(cql)}"
end
# Construct a CQL search against blacklight for author and title,
# possibly with serial limit. Ask for Atom with embedded MARC back.
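  #
  # Eg, with hypothetical configured fields "title_t" and "author_t", the
  # CQL sent (before URL-escaping) looks like:
  #   title_t = "\"The Dispossessed\"" AND author_t = "Le Guin"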
def blacklight_keyword_search_url(request, options = {})
options[:format] ||= "atom"
options[:content_format] ||= "marc"
clauses = []
    # We need both title and author to search keyword style, or
    # we get too many false positives. Except for serials, where
    # we'll do title only. Sigh, logic tree.
    # Also need to use the appropriate 'container' title if available,
    # not the article title.
metadata = request.referent.metadata
title = metadata['jtitle']
title = metadata['btitle'] if title.blank?
title = metadata['title'] if title.blank?
# remove sub-title for better search
title.gsub!(/\:.*\Z/, '') if title
author = get_top_level_creator(request.referent)
return nil unless title && (author || (@bl_fields["serials_limit_clause"] && title_is_serial?(request.referent)))
# phrase search for title, just raw dismax for author
# Embed quotes inside the quoted value, need to backslash-quote for CQL,
# and backslash the backslashes for ruby literal.
clauses.push("#{@bl_fields["title"]} = \"\\\"#{escape_for_cql_double_quotes title}\\\"\"")
clauses.push("#{@bl_fields["author"]} = \"#{escape_for_cql_double_quotes author}\"") if author
url = base_url + "?search_field=#{@cql_search_field}&content_format=#{options[:content_format]}&q=#{CGI.escape(clauses.join(" AND "))}"
if (@bl_fields["serials_limit_clause"] &&
title_is_serial?(request.referent))
url += "&" + @bl_fields["serials_limit_clause"]
end
return url
end
  # We're putting a value inside of CQL double quotes. What if
  # it has a double-quote literal in it already? It would be a CQL
  # syntax error if we did nothing. Can we escape it somehow? CQL is
  # really unclear; we're ALREADY backslash-escaping the phrase quotes
  # themselves! We just replace double quotes with a space, which
  # should work for our actual indexing.
  #
  # Single quotes (apostrophes) need to be escaped with an apostrophe itself,
  # `''`, apparently. http://mail-archives.apache.org/mod_mbox/cassandra-user/201108.mbox/%3C20110803152250.294300@gmx.net%3E
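  #
  # Eg:
  #   escape_for_cql_double_quotes(%q{The "Best" of O'Brien})
  #   # => "The  Best  of O''Brien"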
def escape_for_cql_double_quotes(str)
str = str.gsub('"', " ")
str = str.gsub("'", "''")
return str
end
  # Takes the umlaut request plus a url that will return an atom response
  # of dlf_expanded content. Adds Umlaut "holding" ServiceResponses for
  # dlf_expanded, as appropriate. Returns the number of holdings added.
  def add_holdings(request, holdings_url, options = {})
options[:match_reliability] ||= ServiceResponse::MatchExact
options[:marc_data] ||= {}
atom = Nokogiri::XML( http_fetch(holdings_url).body )
content_entries = atom.search("/atom:feed/atom:entry/atom:content", xml_ns)
    # For each atom entry, find the dlf_expanded record. For each dlf_expanded
    # record, take all of its holdingsrec's if it has them, or all of its
    # items if it doesn't, and add them to the list. We wind up with a list
    # of mixed holdingsrec's and items.
holdings_xml = content_entries.collect do |dlf_expanded|
copies = dlf_expanded.xpath("dlf:record/dlf:holdings/dlf:holdingset/dlf:holdingsrec", xml_ns)
copies.length > 0 ? copies : dlf_expanded.xpath("dlf:record/dlf:items/dlf:item", xml_ns)
end.flatten
service_data = holdings_xml.collect do | xml_metadata |
atom_entry = xml_metadata.at_xpath("ancestor::atom:entry", xml_ns)
atom_id = atom_entry.at_xpath("atom:id/text()", xml_ns).to_s
edition_str = edition_statement(options[:marc_data][atom_id])
url = atom_entry.at_xpath("atom:link[@rel='alternate'][@type='text/html']/attribute::href", xml_ns).to_s
xml_to_holdings( xml_metadata ).merge(
:service => self,
:match_reliability => options[:match_reliability],
:edition_str => edition_str,
:url => url
)
end
    # strip out holdings that aren't really holdings
    service_data.delete_if do |data|
      @exclude_holdings.any? do |key, values|
        values.include?(data[key.to_sym])
      end
    end
    # Sort by "collection"
    service_data = service_data.sort do |a, b|
      a[:collection_str] <=> b[:collection_str]
    end
service_data.each do |data|
request.add_service_response(data.merge(:service => self, :service_type_value =>"holding"))
end
return service_data.length
end
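  # Keep only atom entries whose title matches the request title in at
  # least one normalized form, and whose bib ids aren't in
  # options[:exclude_ids] (ie, already handled by the identifier search).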
def filter_keyword_entries(request, atom_entries, options = {})
options[:exclude_ids] ||= []
    options[:remove_subtitle] = true if options[:remove_subtitle].nil?
title = request.referent['jtitle']
title = request.referent['btitle'] if title.blank?
title = request.referent['title'] if title.blank?
request_title_forms = [
title.downcase,
normalize_title( title )
]
request_title_forms << normalize_title( title, :remove_subtitle => true) if options[:remove_subtitle]
request_title_forms = request_title_forms.compact.uniq
# Only keep entries with title match, and that aren't in the
# exclude_ids list.
good_entries = atom_entries.find_all do |atom_entry|
title = atom_entry.xpath("atom:title/text()", xml_ns).text
entry_title_forms = [
title.downcase,
normalize_title(title)
]
entry_title_forms << normalize_title(title, :remove_subtitle=>true) if options[:remove_subtitle]
entry_title_forms = entry_title_forms.compact.uniq
((entry_title_forms & request_title_forms).length > 0 &&
(bib_ids_from_atom_entries(atom_entry) & options[:exclude_ids]).length == 0)
end
return Nokogiri::XML::NodeSet.new( atom_entries.document, good_entries)
end
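  # Extract the bib id from each atom:id, which we assume ends in the solr
  # document id. Eg (hypothetical):
  #   "https://blacklight.example.edu/catalog/bib_123" => "bib_123"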
def bib_ids_from_atom_entries(entries)
entries.xpath("atom:id/text()", xml_ns).to_a.collect do |atom_id|
atom_id.to_s =~ /([^\/]+)$/
$1
end.compact
end
def blacklight_url_for_ids(ids, format="dlf_expanded")
return nil unless ids.length > 0
return base_url + "?search_field=#{@cql_search_field}&content_format=#{format}&q=" + CGI.escape("#{@bl_fields["id"]} any \"#{ids.join(" ")}\"")
end
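  # If any rft_id begins with one of the configured rft_id_bibnum_prefixes,
  # return the rest of that rft_id as the solr document id.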
  def get_solr_id(rft)
    rft.identifiers.each do |id|
      @rft_id_bibnum_prefixes.each do |prefix|
        # Strip the configured prefix to get the bare solr document id.
        return id[prefix.length..-1] if id.start_with?(prefix)
      end
    end
    return nil
  end
end