# app/service_adaptors/blacklight.rb
require 'nokogiri'
require 'open-uri'
require 'base64'
require 'marc'
require 'cgi'
# Searches a Blacklight catalog instance with the CQL search extension installed.
#
# Params include:
# [base_url]
#     Required. Complete URL to catalog.atom action. Eg "https://blacklight.mse.jhu.edu/catalog.atom"
# [bl_fields]
#     Required, with at least some entries, if you want this to do anything.
#     Maps the semantic fields below to the index field names in your BL instance.
#     * issn
#     * isbn
#     * lccn
#     * oclcnum
#     * id (defaults to 'id')
#     * title
#     * author
#     * serials_limit_clause => not an index name, but a full URL clause for a limit to apply to known-serials searches, for instance "f[format][]=Serial"
# [identifier_search]
#     Do catalog search on issn/isbn/oclcnum/lccn/bibId. Default true.
# [keyword_search]
#     Do catalog search on title/author keywords where applicable. Generally only used when identifier_search finds no hits, if identifier_search is on. Default true.
# [keyword_per_page]
#     How many records to fetch from blacklight when doing keyword searches. Default 10.
# [exclude_holdings]
#     Can be used to exclude certain 'dummy' holdings that have certain collection, location, or other values. Eg:
#       exclude_holdings:
#         collection_str:
#           - World Wide Web
#           - Internet
# [rft_id_bibnum_prefixes]
#     Array of URI prefixes in an rft_id that indicate that the actual solr id comes next. For instance, if your blacklight will send "http://blacklight.com/catalog/some_id" in an rft_id, then include "http://blacklight.com/catalog/". Optional.
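#
# Example configuration (a hypothetical services.yml entry; Solr field names
# like "isbn_t" are illustrative, substitute whatever your Blacklight instance
# actually indexes):
#
#   blacklight:
#     type: Blacklight
#     display_name: Our Library Catalog
#     base_url: https://blacklight.example.edu/catalog.atom
#     bl_fields:
#       isbn: isbn_t
#       issn: issn_t
#       lccn: lccn_t
#       oclcnum: oclcnum_t
#       title: title_t
#       author: author_t
#     rft_id_bibnum_prefixes:
#       - https://blacklight.example.edu/catalog/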
class Blacklight < Service
required_config_params :base_url, :display_name
attr_reader :base_url, :cql_search_field
attr_reader :bl_fields, :issn
include UmlautHttp
include MetadataHelper
include MarcHelper
include XmlSchemaHelper
def initialize(config)
# defaults
    # If you are sending an OpenURL from a library service, you may
    # have the HIP bibnum, and include it in the OpenURL as, eg,
    # rft_id=http://catalog.library.jhu.edu/bib/343434 (except URL-encoded).
    # Then you'd include "http://catalog.library.jhu.edu/bib/" in
    # rft_id_bibnum_prefixes.
@rft_id_bibnum_prefixes = []
@cql_search_field = "cql"
@keyword_per_page = 10
@identifier_search = true
@keyword_search = true
    @link_to_search = true
    @exclude_holdings = {}
    super(config)
    @bl_fields = { "id" => "id" }.merge(@bl_fields || {})
end
# Standard method, used by background service updater. See Service docs.
def service_types_generated
types = [ ServiceTypeValue[:fulltext], ServiceTypeValue[:holding], ServiceTypeValue[:table_of_contents], ServiceTypeValue[:relevant_link] ]
return types
end
def handle(request)
ids_processed = []
holdings_added = 0
if (@identifier_search && url = blacklight_precise_search_url(request) )
doc = Nokogiri::XML( http_fetch(url).body )
ids_processed.concat( bib_ids_from_atom_entries( doc.xpath("atom:feed/atom:entry", xml_ns) ) )
# namespaces make xpath harder than it should be, but css
# selector still easy, thanks nokogiri! Grab the marc from our
# results.
marc_matches = doc.xpath("atom:feed/atom:entry/atom:content[@type='application/marc']", xml_ns).collect do |encoded_marc21|
MARC::Reader.decode( Base64.decode64(encoded_marc21.text).force_encoding("UTF-8") )
end
add_856_links(request, marc_matches )
      # Have to make a second fetch for dlf_expanded info, because BL doesn't
      # (yet) let us ask for more than one content format at once.
holdings_url = blacklight_precise_search_url( request, "dlf_expanded" )
      holdings_added += add_holdings(request, holdings_url) if holdings_url
end
    # Keyword search.
if (@keyword_search &&
url = blacklight_keyword_search_url(request))
doc = Nokogiri::XML( http_fetch(url).body )
# filter out matches whose titles don't really match at all, or
# which have already been seen in identifier search.
entries = filter_keyword_entries(request, doc.xpath("atom:feed/atom:entry", xml_ns) , :exclude_ids => ids_processed, :remove_subtitle => (! title_is_serial?(request.referent)) )
marc_by_atom_id = {}
# Grab the marc from our entries. Important not to do a // xpath
# search, or we'll wind up matching parent elements not actually
# included in our 'entries' list.
marc_matches = entries.xpath("atom:content[@type='application/marc']", xml_ns).collect do |encoded_marc21|
marc = MARC::Reader.decode( Base64.decode64(encoded_marc21.text).force_encoding("UTF-8") )
marc_by_atom_id[ encoded_marc21.at_xpath("ancestor::atom:entry/atom:id/text()", xml_ns).to_s ] = marc
marc
end
# We've filtered out those we consider just plain bad
# matches, everything else we're going to call
# an approximate match. Sort so that those with
# a date close to our request date are first.
if ( year = get_year(request.referent))
marc_matches = marc_matches.partition {|marc| get_years(marc).include?( year )}.flatten
end
# And add in the 856's
add_856_links(request, marc_matches, :match_reliability => ServiceResponse::MatchUnsure)
# Fetch and add in the holdings
url = blacklight_url_for_ids(bib_ids_from_atom_entries(entries))
      holdings_added += add_holdings(request, url, :match_reliability => ServiceResponse::MatchUnsure, :marc_data => marc_by_atom_id) if url
      if (@link_to_search && holdings_added == 0)
hit_count = doc.at_xpath("atom:feed/opensearch:totalResults/text()", xml_ns).to_s.to_i
html_result_url = doc.at_xpath("atom:feed/atom:link[@rel='alternate'][@type='text/html']/attribute::href", xml_ns).to_s
if hit_count > 0
request.add_service_response(
:service => self,
:source_name => @display_name,
:count => hit_count,
            :display_text => "#{hit_count} possible #{hit_count > 1 ? 'matches' : 'match'} in #{@display_name}",
:url => html_result_url,
:service_type_value => :holding_search )
end
end
end
return request.dispatched(self, true)
end
  # Send a CQL request for any identifiers present.
  # Ask for an atom response with embedded marc21 back.
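  #
  # Eg, with a hypothetical configured field "isbn_t" and an incoming ISBN,
  # the generated URL looks like:
  #   https://blacklight.example.edu/catalog.atom?search_field=cql&content_format=marc&q=isbn_t+%3D+%229780262033848%22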
def blacklight_precise_search_url(request, format = "marc")
# Add search clauses for our identifiers, if we have them and have a configured search field for them.
clauses = []
added = []
["lccn", "isbn", "oclcnum"].each do |key|
if bl_fields[key] && request.referent.send(key)
clauses.push( "#{bl_fields[key]} = \"#{request.referent.send(key)}\"")
added << key
end
end
# Only add ISSN if we don't have an ISBN, reduces false matches
if ( !added.include?("isbn") &&
bl_fields["issn"] &&
request.referent.issn)
clauses.push("#{bl_fields["issn"]} = \"#{request.referent.issn}\"")
end
# Add Solr document identifier if we can get one from the URL
if (id = get_solr_id(request.referent))
clauses.push("#{bl_fields['id']} = \"#{id}\"")
end
# if we have nothing, we can do no search.
return nil if clauses.length == 0
cql = clauses.join(" OR ")
return base_url + "?search_field=#{@cql_search_field}&content_format=#{format}&q=#{CGI.escape(cql)}"
end
# Construct a CQL search against blacklight for author and title,
# possibly with serial limit. Ask for Atom with embedded MARC back.
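  #
  # Eg, with hypothetical configured fields "title_t" and "author_t", the
  # CQL sent (before URL-escaping) looks like:
  #   title_t = "\"The Dispossessed\"" AND author_t = "Le Guin"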
def blacklight_keyword_search_url(request, options = {})
options[:format] ||= "atom"
options[:content_format] ||= "marc"
clauses = []
    # We need both title and author to search keyword style, or
    # we get too many false positives. Except for serials, where
    # we'll do title only. Sigh, logic tree.
    # Also need to use the appropriate 'container' title if available,
    # not the article title.
metadata = request.referent.metadata
title = metadata['jtitle']
title = metadata['btitle'] if title.blank?
title = metadata['title'] if title.blank?
# remove sub-title for better search
title.gsub!(/\:.*\Z/, '') if title
author = get_top_level_creator(request.referent)
return nil unless title && (author || (@bl_fields["serials_limit_clause"] && title_is_serial?(request.referent)))
# phrase search for title, just raw dismax for author
# Embed quotes inside the quoted value, need to backslash-quote for CQL,
# and backslash the backslashes for ruby literal.
clauses.push("#{@bl_fields["title"]} = \"\\\"#{escape_for_cql_double_quotes title}\\\"\"")
clauses.push("#{@bl_fields["author"]} = \"#{escape_for_cql_double_quotes author}\"") if author
url = base_url + "?search_field=#{@cql_search_field}&content_format=#{options[:content_format]}&q=#{CGI.escape(clauses.join(" AND "))}"
if (@bl_fields["serials_limit_clause"] &&
title_is_serial?(request.referent))
url += "&" + @bl_fields["serials_limit_clause"]
end
return url
end
  # We're putting a value inside of CQL double quotes. What if
  # it has a double-quote literal in it already? It would be a CQL
  # syntax error if we did nothing. Can we escape it somehow? CQL is
  # really unclear; we're ALREADY backslash-escaping the phrase quotes
  # themselves! We just replace double quotes with a space, which
  # should work for our actual indexing.
  #
  # Single quotes (apostrophes) need to be escaped with an apostrophe itself,
  # `''`, apparently. http://mail-archives.apache.org/mod_mbox/cassandra-user/201108.mbox/%3C20110803152250.294300@gmx.net%3E
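  #
  # Eg:
  #   escape_for_cql_double_quotes(%q{The "Best" of O'Brien})
  #   # => "The  Best  of O''Brien"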
def escape_for_cql_double_quotes(str)
str = str.gsub('"', " ")
str = str.gsub("'", "''")
return str
end
  # Takes the umlaut request plus a url that will return an atom response
  # of dlf_expanded content. Adds Umlaut "holding" ServiceResponses for
  # dlf_expanded, as appropriate. Returns the number of holdings added.
  def add_holdings(request, holdings_url, options = {})
options[:match_reliability] ||= ServiceResponse::MatchExact
options[:marc_data] ||= {}
atom = Nokogiri::XML( http_fetch(holdings_url).body )
content_entries = atom.search("/atom:feed/atom:entry/atom:content", xml_ns)
    # For each atom entry, find the dlf_expanded record. For each dlf_expanded
    # record, take all of its holdingsrec's if it has them, or all of its
    # items if it doesn't, and add them to the list. We wind up with a list
    # of mixed holdingsrec's and items.
holdings_xml = content_entries.collect do |dlf_expanded|
copies = dlf_expanded.xpath("dlf:record/dlf:holdings/dlf:holdingset/dlf:holdingsrec", xml_ns)
copies.length > 0 ? copies : dlf_expanded.xpath("dlf:record/dlf:items/dlf:item", xml_ns)
end.flatten
service_data = holdings_xml.collect do | xml_metadata |
atom_entry = xml_metadata.at_xpath("ancestor::atom:entry", xml_ns)
atom_id = atom_entry.at_xpath("atom:id/text()", xml_ns).to_s
edition_str = edition_statement(options[:marc_data][atom_id])
url = atom_entry.at_xpath("atom:link[@rel='alternate'][@type='text/html']/attribute::href", xml_ns).to_s
xml_to_holdings( xml_metadata ).merge(
:service => self,
:match_reliability => options[:match_reliability],
:edition_str => edition_str,
:url => url
)
end
    # strip out holdings that aren't really holdings
    service_data.delete_if do |data|
      @exclude_holdings.any? do |key, values|
        values.include?(data[key.to_sym])
      end
    end
    # Sort by "collection"
    service_data = service_data.sort do |a, b|
      a[:collection_str] <=> b[:collection_str]
    end
service_data.each do |data|
request.add_service_response(data.merge(:service => self, :service_type_value =>"holding"))
end
return service_data.length
end
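  # Keep only atom entries whose title matches the request title in at
  # least one normalized form, and whose bib ids aren't in
  # options[:exclude_ids] (ie, already handled by the identifier search).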
def filter_keyword_entries(request, atom_entries, options = {})
options[:exclude_ids] ||= []
    options[:remove_subtitle] = true if options[:remove_subtitle].nil?
title = request.referent['jtitle']
title = request.referent['btitle'] if title.blank?
title = request.referent['title'] if title.blank?
request_title_forms = [
title.downcase,
normalize_title( title )
]
request_title_forms << normalize_title( title, :remove_subtitle => true) if options[:remove_subtitle]
request_title_forms = request_title_forms.compact.uniq
# Only keep entries with title match, and that aren't in the
# exclude_ids list.
good_entries = atom_entries.find_all do |atom_entry|
title = atom_entry.xpath("atom:title/text()", xml_ns).text
entry_title_forms = [
title.downcase,
normalize_title(title)
]
entry_title_forms << normalize_title(title, :remove_subtitle=>true) if options[:remove_subtitle]
entry_title_forms = entry_title_forms.compact.uniq
((entry_title_forms & request_title_forms).length > 0 &&
(bib_ids_from_atom_entries(atom_entry) & options[:exclude_ids]).length == 0)
end
return Nokogiri::XML::NodeSet.new( atom_entries.document, good_entries)
end
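  # Extract the bib id from each atom:id, which we assume ends in the solr
  # document id. Eg (hypothetical):
  #   "https://blacklight.example.edu/catalog/bib_123" => "bib_123"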
def bib_ids_from_atom_entries(entries)
entries.xpath("atom:id/text()", xml_ns).to_a.collect do |atom_id|
atom_id.to_s =~ /([^\/]+)$/
$1
end.compact
end
def blacklight_url_for_ids(ids, format="dlf_expanded")
return nil unless ids.length > 0
return base_url + "?search_field=#{@cql_search_field}&content_format=#{format}&q=" + CGI.escape("#{@bl_fields["id"]} any \"#{ids.join(" ")}\"")
end
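  # If any rft_id begins with one of the configured rft_id_bibnum_prefixes,
  # return the rest of that rft_id as the solr document id.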
  def get_solr_id(rft)
    rft.identifiers.each do |id|
      @rft_id_bibnum_prefixes.each do |prefix|
        # Strip the configured prefix to get the bare solr document id.
        return id[prefix.length..-1] if id.start_with?(prefix)
      end
    end
    return nil
  end
end