team-umlaut/umlaut

View on GitHub
app/service_adaptors/hathi_trust.rb

Summary

Maintainability
B
6 hrs
Test Coverage
require 'open-uri'
require 'multi_json'
require 'cgi'

# Service that searches HathiTrust from the University of Michigan
#
# Supports full text links, and search inside. 
#
# We link to HathiTrust using a direct babel.hathitrust.org URL instead
# of the handle.net redirection, for two reasons:
# 1) Can't use the handle.net redirection for the "direct link to search
#    results for user-entered query" feature. 
# 2) Some may want to force a Shibboleth login on HT links. Can't do that
#    with the handle.net redirection either. If you do want to do that,
#    possibly in concert with an EZProxy mediated WAYFless login,
#    set direct_link_base in your services.yml to:
#    "https://babel.hathitrust.org/shcgi/"
#
# Many (but not all) HT books will also be in Google Books (and vice versa)
# However, HT was more generous in deciding what books are public domain than GBS.
# Therefore the main expected use case is to use with Google Books, with
# HT being a lower priority, using preempted_by config.  
#
# Some may prefer HT search inside interface to Google, so search inside
# is not suppressed with presence of google. You can turn off HT
# search inside entirely if you like.
#
# For HT records representing one volume of several, a :excerpts type
# response will be added if full text is avail for some. Or a :highlighted_link
# if only search inside is available for some.
# Or set config show_multi_volume=false to prevent this and ignore partial
# volumes. 
# 
# Two possibilities are available for sdr rights "full" or "searchonly".
# The third possibility is that sdr will be null.
#
# An ISBN with search-only: 0195101464
class HathiTrust < Service  
  include MetadataHelper
  
  attr_reader :url, :display_name, :note
  
  # Lists the service type values this adaptor may generate. Fulltext is
  # always listed; excerpts and highlighted_link only when
  # @show_multi_volume is set; search_inside only when @show_search_inside
  # is set.
  def service_types_generated
    generated = [ServiceTypeValue[:fulltext]]
    if @show_multi_volume
      generated << ServiceTypeValue[:excerpts]
      generated << ServiceTypeValue[:highlighted_link]
    end
    generated << ServiceTypeValue[:search_inside] if @show_search_inside
    generated
  end
  
  # Sets this adaptor's defaults, then hands config to the superclass
  # initializer.
  # NOTE(review): presumably Service#initialize applies the configured
  # values over these defaults -- confirm in Service.
  def initialize(config)
    # Labels shown to the user
    @display_name = 'HathiTrust'
    @note = '' # 'Fulltext books from the University of Michigan'
    @credits = { "HathiTrust" => "http://www.hathitrust.org" }

    # HathiTrust endpoints
    @api_url = 'http://catalog.hathitrust.org/api/volumes'
    # Use 'https://babel.hathitrust.org/shcgi/' instead to force a
    # Shibboleth login, possibly in concert with EZProxy providing
    # WAYFLess login.
    @direct_link_base = 'http://babel.hathitrust.org/cgi/'

    # Feature switches
    @num_full_views = 1 # max num full view links to include
    @show_search_inside = true
    @show_multi_volume = true

    super(config)
  end
  
  # Looks the request's referent up in HathiTrust and adds the applicable
  # service responses to the request:
  # * fulltext links, unless preempted_by reports pre-emption for "fulltext"
  # * partial (multi-volume) responses, when @show_multi_volume is set
  # * a search inside response, when @show_search_inside is set
  #
  # Every path finishes by calling request.dispatched(self, true), including
  # the cases where no usable identifiers or no usable HathiTrust data were
  # found.
  def handle(request)
    params = get_parameters(request.referent)
    return request.dispatched(self, true) if params.blank?

    ht_json = do_query(params)
    return request.dispatched(self, true) if ht_json.nil?

    # Extract the "items" list from the first result group of the response.
    first_group = ht_json.values.first
    items = first_group && first_group["items"]
    # A response with no result group or no "items" key has nothing we can
    # link to; finish cleanly instead of raising on nil.
    return request.dispatched(self, true) if items.nil?

    # Only add fulltext if we're not skipping due to GBS
    if preempted_by(request, "fulltext")
      Rails.logger.debug("#{self.class}: Skipping due to pre-emption")
    else
      create_fulltext_service_response(request, items)
    end

    # possibly partial volumes
    create_partial_volume_responses(request, ht_json) if @show_multi_volume

    # Honor the same switch service_types_generated uses, so search inside
    # can actually be turned off by configuration.
    create_search_inside(request, items) if @show_search_inside

    request.dispatched(self, true)
  end
  
  # just a wrapper around get_bibkey_parameters
  # Builds the id portion of the HathiTrust API URL for a referent: the
  # CGI-escaped keys, each prefixed with its type ("oclc:", "lccn:",
  # "isbn:") and joined with ";". Returns nil when no key is available.
  def get_parameters(rft)
    # API supports oclcnum, isbn, or lccn, and can provide more than one of each.
    get_bibkey_parameters(rft) do |isbn, lccn, oclcnum|
      keys = []
      keys << "oclc:#{CGI.escape(oclcnum)}" unless oclcnum.blank?
      keys << "lccn:#{CGI.escape(lccn)}" unless lccn.blank?

      # ISBN is a fallback only, used when neither oclc nor lccn is present
      # (Bill Dueber's advice for best matching): the HT api will only match
      # if ALL the id's we supply match.
      keys << "isbn:#{CGI.escape(isbn)}" if keys.empty? && !isbn.blank?

      # Non-local return: leaves get_parameters itself with the result.
      return keys.empty? ? nil : keys.join(";")
    end
  end
  
  # method that takes a referent and a block for parameter creation
  # The block receives isbn, lccn, oclcnum and is responsible for formatting
  # the parameters for the particular service
  # FIXME consider moving this into metadata_helper
  # Yields (isbn, lccn, oclcnum) for the referent and returns whatever the
  # block returns. The oclcnum and lccn have hyphens and square brackets
  # removed first; the isbn is passed along as get_isbn returned it.
  def get_bibkey_parameters(rft)
    # Filter out special chars that ought not to be in there anyway, and
    # that HathiTrust barfs on. Blank values are passed through untouched.
    scrub = lambda { |id| id.blank? ? id : id.gsub(/[\-\[\]]/, '') }

    isbn = get_isbn(rft)
    oclcnum = scrub.call(get_identifier(:info, "oclcnum", rft))
    lccn = scrub.call(get_lccn(rft))

    yield(isbn, lccn, oclcnum)
  end
  
  # conducts query and parses the JSON
  # Fetches <@api_url>/brief/json/<params> and returns the parsed JSON.
  #
  # params is the already-escaped id spec produced by get_parameters.
  # Network and JSON parsing errors are not rescued here.
  def do_query(params)
    link = @api_url + "/brief/json/" + params
    # Go through URI#open (mixed in by open-uri) rather than Kernel#open:
    # Kernel#open no longer opens URLs as of Ruby 3.0. The block form also
    # closes the response IO once the body has been read.
    URI.parse(link).open { |response| MultiJson.load(response.read) }
  end
  
    
  # Adds a :fulltext service response for each item that is in full view
  # and is not a serial part, stopping once @num_full_views responses have
  # been added.
  #
  # Returns the number of responses added, or nil when items is empty.
  def create_fulltext_service_response(request, items)
    return nil if items.empty?

    added = 0
    items.each do |item|
      if !is_serial_part?(item) && full_view?(item)
        request.add_service_response(
          :service => self,
          :display_text => @display_name,
          :display_text_i18n => "display_name",
          :url => direct_url_to(item),
          :add_i18n_notes => "single_volume", # signal for transform_view_data
          :source_for_i18n => item['orig'],
          :service_type_value => :fulltext
        )
        added += 1
        break if added == @num_full_views
      end
    end
    added
  end
  
  
  # If HT has partial serial volumes, include a link to that. 
  # Need to pass in complete HT json response
  # Works from the first result group of the complete HT json response.
  #
  # For each distinct record that has a serial-part item in full view, adds
  # an :excerpts response linking to the record's recordURL. Only when no
  # such record exists, adds a :highlighted_link response ("Search inside
  # some volumes") for each distinct record that has any serial-part item.
  # Records that are missing, or that lack a recordURL, are skipped.
  def create_partial_volume_responses(request, ht_json)
    result_group = ht_json.values.first
    records = result_group["records"]
    serial_items = result_group["items"].select { |item| is_serial_part?(item) }

    full_ids = serial_items.select { |item| full_view?(item) }.
                            map { |item| item["fromRecord"] }.compact.uniq

    full_ids.each do |record_id|
      record = records[record_id]
      next unless record && record["recordURL"]

      titles = record["titles"]
      record_title = titles.kind_of?(Array) ? titles.first : nil

      request.add_service_response(
        :service => self,
        :display_text => @display_name,
        :display_text_i18n => "display_name",
        :url => record["recordURL"],
        :add_i18n_notes => "partial_volume", # signal for transform_view_data
        :title_for_i18n => record_title,
        :service_type_value => :excerpts
      )
    end

    # Search-only links are a fallback for when nothing is in full view.
    return unless full_ids.empty?

    search_ids = serial_items.map { |item| item["fromRecord"] }.compact.uniq
    search_ids.each do |record_id|
      record = records[record_id]
      next unless record && record["recordURL"]

      request.add_service_response(
        :service => self,
        :display_text => "Search inside some volumes",
        :display_text_i18n => "search_inside_some_vols",
        :url => record["recordURL"],
        :service_type_value => :highlighted_link
      )
    end
  end
  
  # Adds a :search_inside response pointing at the first item's search URL.
  # Adds nothing when items is empty, when the first item is a serial part,
  # or when search_url_to gives no URL.
  def create_search_inside(request, items)
    return if items.empty?

    # Can only include search from the first one.
    # There's search inside for _any_ HT item. We think.
    first_item = items.first

    # Searching inside just one part of a serial isn't useful, forget it.
    return if is_serial_part?(first_item)

    search_url = search_url_to(first_item)
    if search_url
      request.add_service_response(
        :service => self,
        :display_text => @display_name,
        :display_text_i18n => "display_name",
        :url => search_url,
        :service_type_value => :search_inside
      )
    end
  end
  
  # Returns the URL to link to for a single HT item.
  #
  # With @direct_link_base set, builds "<base>pt?id=<escaped htid>".
  # Without it, falls back to the item's own 'itemURL' value.
  def direct_url_to(item_json)
    # Fallback branch: read from item_json (the original referenced an
    # undefined local here and raised NameError).
    return item_json['itemURL'] unless @direct_link_base

    # we're constructing our own link because we need our EZProxy
    # to recognize it for WAYFLess login, which it won't if we use
    # the handle.net url, sorry.
    # We also need direct link for direct link to search results.
    @direct_link_base + "pt?id=" + CGI.escape(item_json['htid'])
  end

  # Fills in hash[:notes] with a translated note when hash[:add_i18n_notes]
  # carries one of the signals set when the response was created
  # ("single_volume" or "partial_volume"). Any other value leaves the hash
  # untouched. Returns the same hash.
  def transform_view_data(hash)
    case hash[:add_i18n_notes]
    when "single_volume"
      source = hash[:source_for_i18n] || ""
      hash[:notes] = translate("note_for_single_vol", :source => source)
    when "partial_volume"
      title = hash[:title_for_i18n] || ""
      hash[:notes] = translate("note_for_multi_vol", :title => title)
    end

    hash
  end
  
  
  # Truthy when the item carries an 'enumcron' value. Such an item is just
  # one part of a serial, so we don't want to claim the serial title as a
  # whole has full text or can be searched. The raw 'enumcron' value is
  # returned, not a strict boolean.
  def is_serial_part?(item)
    item['enumcron']
  end
  
  # True only when the item's "usRightsString" is exactly "Full view".
  def full_view?(item)
    rights = item["usRightsString"]
    rights == "Full view"
  end
  
  # Builds the "<base>ptsearch?id=<escaped htid>" URL for an item.
  # Returns nil when @direct_link_base is not set.
  def search_url_to(item_json)
    return nil unless @direct_link_base

    @direct_link_base + "ptsearch?id=" + CGI.escape(item_json['htid'])
  end


  
  
  # Handle search_inside
  # For a "search_inside" response, returns the response's :url with the
  # CGI-escaped submitted "query" parameter (empty string when absent)
  # appended as "&q1=...". Every other service type is delegated to super.
  def response_url(service_response, submitted_params)
    if service_response.service_type_value.name == "search_inside"
      link = service_response[:url]
      link + "&q1=" + CGI.escape(submitted_params["query"] || "")
    else
      super(service_response, submitted_params)
    end
  end
  
  # sample OCLCnums with appropriate results showing that we can pick up other
  #   resources by using this service
  # 02029914  MBooks: full, GBS: info with search inside
  # 01635828  MBooks: full, GBS: snippet
  # 55517975  MBooks: search, GBS: limited preview
  # 02299399  MBooks: full, GBS: snippet
  # 16857172  MBooks: full, GBS: info
  
  # Example of a serial with some full text volumes:
  # JAMA, lccn:07037314
  #
  # Example of a multi-volume with search-only, split across
  # two HT records. 
  # Handbook of biochemistry and molecular biology lccn: 75029514
  
end