app/service_adaptors/google_book_search.rb from team-umlaut/umlaut

app/service_adaptors/google_book_search.rb
Summary

Maintainability

1 day
Test Coverage

Issues
# Service that searches Google Book Search to determine viewability.
# It searches by ISBN, OCLCNUM and/or LCCN. 
#
# Uses Google Books API, http://code.google.com/apis/books/docs/v1/getting_started.html 
# http://code.google.com/apis/books/docs/v1/using.html
# 
# If a full view is available it returns a fulltext service response.
# If partial view is available, return as "limited experts". 
# If no view at all, still includes a link in highlighted_links, to pay
#   lip service to google branding requirements. 
# Unfortunately there is no way tell which of the noview 
# books provide search, although some do -- search is advertised if full or 
# partial view is available. 
# 
# If a thumbnail_url is returned in the responses, a cover image is displayed.
#
# Can also enhances with an abstract, if available. -- off by default, set `abstract: true` to turn on.  
#
# And fleshes out bibliographic details from an identifier -- if all you had was an
# ISBN, will fill in title, author, etc in referent from GBS response.  
#
# = Google API Key
# 
# Setting an api key in :api_key STRONGLY recommended, or you'll
# probably get rate limited (not clear what the limit is with no api
# key supplied). You may have to ask for higher rate limit for your api
# key than the default 1000/day, which you can do through the google 
# api console:
# https://code.google.com/apis/console
#
# I requested 50k with this message, and was quickly approved with no questions
# "Services for academic library (Johns Hopkins Libraries) web applications  to match Google Books availability to items presented by our catalog, OpenURL link resolver,  and other software. "
#
# Recommend setting your 'per user limit' to something crazy high, as well
# as requesting more quota. 
class GoogleBookSearch < Service
  require 'multi_json'
  
  
  # Identifiers used in API response to indicate viewability level
  ViewFullValue = 'ALL_PAGES'
  ViewPartialValue = 'PARTIAL'
  # None might also be 'snippet', but Google doesn't want to distinguish
  ViewNoneValue = 'NO_PAGES'
  ViewUnknownValue = 'UNKNOWN'
      
  
  
  include MetadataHelper
  include UmlautHttp
  
  # required params
  
  # attr_reader is important for tests
  attr_reader :url, :display_name, :num_full_views 
  
  def service_types_generated
    types= []

    if @web_links
      types.push ServiceTypeValue[:highlighted_link]
      types.push ServiceTypeValue[:excerpts]
    end
    types.push(ServiceTypeValue[:search_inside]) if @search_inside
    types.push(ServiceTypeValue[:fulltext]) if @fulltext
    types.push(ServiceTypeValue[:cover_image]) if @cover_image
    types.push(ServiceTypeValue[:referent_enhance]) if @referent_enhance
    types.push(ServiceTypeValue[:abstract]) if @abstract

    return types
  end
  
  def initialize(config)    
    @url = 'https://www.googleapis.com/books/v1/volumes?q='
    
    @display_name = 'Google Books'
    
    # number of full views to show
    @num_full_views = 1
    
    # default on, to enhance our metadata with stuff from google
    @referent_enhance = true

    # default OFF, add description/abstract from GBS
    @abstract = false

    # Other responses on by default but can be turned off
    @cover_image   = true
    @fulltext      = true
    @search_inside = true
    @web_links     = true # to partial view :excerpts or :fulltext

    # google api key strongly recommended, otherwise you'll
    # probably get rate limited. 
    @api_key = nil
    
    @credits = {
      "Google Books" => "http://books.google.com/"
    }
    # While you can theoretically look up by LCCN on Google Books,
    # we have found FREQUENT false positives. There's no longer any
    # way to even report these to Google. By default, don't lookup
    # by LCCN. 
    @lookup_by_lccn = false
    
    super(config)
  end
  
  def handle(request)

    bibkeys = get_bibkeys(request.referent)
    return request.dispatched(self, true) if bibkeys.nil?

    data = do_query(bibkeys, request)
    
    
    if data.blank? || data["error"]
      # fail fatal
      return request.dispatched(self, false)
    end
    
    # 0 hits, return. 
    return request.dispatched(self, true) if data["totalItems"] == 0
    
    enhance_referent(request, data) if @referent_enhance

    add_abstract(request, data) if @abstract
    
    #return full views first
    if @fulltext
      full_views_shown = create_fulltext_service_response(request, data)
    end
    
    if @search_inside
      # Add search_inside link if appropriate
      add_search_inside(request, data)
    end
    
    # only if no full view is shown, add links for partial view or noview
    unless full_views_shown
      do_web_links(request, data)
    end
    
    if @cover_image
      thumbnail_url = find_thumbnail_url(data)
      if thumbnail_url
        add_cover_image(request, thumbnail_url)    
      end
    end

    return request.dispatched(self, true)
  end

  # Take the FIRST hit from google, and use it's values to enhance
  # our metadata. Will NOT overwrite existing data. 
  def enhance_referent(request, data)
    
    entry = data["items"].first
    

    if (volumeInfo = entry["volumeInfo"])
      
      title = volumeInfo["title"]
      title += ": #{volumeInfo["subtitle"]}" if (title && volumeInfo["subtitle"])
      
      element_enhance(request, "title", title)
      element_enhance(request, "au", volumeInfo["authors"].first) if volumeInfo["authors"]
      element_enhance(request, "pub", volumeInfo["publisher"])
      
      element_enhance(request, "tpages", volumeInfo["pageCount"])
      
      if (date = volumeInfo["publishedDate"]) && date =~ /^(\d\d\d\d)/
        element_enhance(request, "date", $1)
      end
      
      # LCCN is only rarely included, but is sometimes, eg:
      # "industryIdentifiers"=>[{"type"=>"OTHER", "identifier"=>"LCCN:72627172"}],          
      # Also "LCCN:76630875"
      #
      # And sometimes OCLC number like:
      # "industryIdentifiers"=>[{"type"=>"OTHER", "identifier"=>"OCLC:12345678"}],
      #        
      (volumeInfo["industryIdentifiers"] || []).each do |hash|
        
        if hash["type"] == "ISBN_13"
          element_enhance(request, "isbn", hash["identifier"])
          
        elsif hash["type"] == "OTHER" && hash["identifier"].starts_with?("LCCN:")
          lccn = normalize_lccn(  hash["identifier"].slice(5, hash["identifier"].length)  )
          request.referent.add_identifier("info:lccn/#{lccn}")
          
        elsif hash["type"] == "OTHER" && hash["identifier"].starts_with?("OCLC:")
          oclcnum = normalize_lccn(  hash["identifier"].slice(5, hash["identifier"].length)  )
          request.referent.add_identifier("info:oclcnum/#{oclcnum}")
        end
      
      end              
    end            
  end

  def add_abstract(request, data)
    info = data["items"].first.try {|h| h["volumeInfo"]}
    if description = info["description"]

      url = info["infoLink"]
      request.add_service_response(
          :service => self, 
          :display_text => "Description from Google Books", 
          :display_text_i18n => "description",
          :url => remove_query_context(url),
          :service_type_value =>  :abstract  
      )
    end
  end

  # Will not over-write existing referent values. 
  def element_enhance(request, rft_key, value)
    if (value)
      request.referent.enhance_referent(rft_key, value.to_s, true, false, :overwrite => false)
    end
  end

  
  # returns nil or escaped string of bibkeys
  # to increase the chances of good hit, we send all available bibkeys 
  # and later dedupe by id.
  # FIXME Assumes we only have one of each kind of identifier.
  def get_bibkeys(rft)
    isbn = get_identifier(:urn, "isbn", rft)
    oclcnum = get_identifier(:info, "oclcnum", rft)
    lccn = get_lccn(rft)

    # Google doesn't officially support oclc/lccn search, but does
    # index as token with prefix smashed up right with identifier
    # eg http://books.google.com/books/feeds/volumes?q=OCLC32012617
    #
    # Except turns out doing it as a phrase search is important! Or
    # google's normalization/tokenization does odd things. 
    keys = []
    keys << ('isbn:' + isbn) if isbn
    keys << ('"' + "OCLC" + oclcnum + '"') if oclcnum
    # Only use LCCN if we've got nothing else, and we're allowing it. 
    # it returns many false positives. 
    if @lookup_by_lccn && lccn && keys.length == 0
      keys << ('"' + 'LCCN' + lccn + '"')
    end
    
    return nil if keys.empty?
    keys = CGI.escape( keys.join(' OR ') )
    return keys
  end
  
  def do_query(bibkeys, request)    
    headers = build_headers(request)
    link = @url + bibkeys
    if @api_key
      link += "&key=#{@api_key}"
    end
    
    # Add on limit to only request books, not magazines. 
    link += "&printType=books"

    Rails.logger.debug("GoogleBookSearch requesting: #{link}")        
    response = http_fetch(link, :headers => headers, :raise_on_http_error_code => false)        
    data = MultiJson.load(response.body)
    
    # If Google gives us an error cause it says it can't geo-locate, 
    # remove the IP, log warning, and try again. 
    
    if (data["error"] && data["error"]["errors"] &&
        data["error"]["errors"].find {|h| h["reason"] == "unknownLocation"} )
      Rails.logger.warn("GoogleBookSearch: geo-locate error, retrying without X-Forwarded-For: '#{link}' headers: #{headers.inspect} #{response.inspect}\n    #{data.inspect}")
      
      response = http_fetch(link, :raise_on_http_error_code => false)        
      data = MultiJson.load(response.body)
        
    end
    
    
    if (! response.kind_of?(Net::HTTPSuccess)) || data["error"]      
      Rails.logger.error("GoogleBookSearch error: '#{link}' headers: #{headers.inspect} #{response.inspect}\n    #{data.inspect}")
    end
        
    return data
  end
  
  # We don't need to fake a proxy request anymore, but we still
  # include X-Forwarded-For so google can return location-appropriate
  # availability. If there's an existing X-Forwarded-For, we respect
  # it and add on to it. 
  def build_headers(request)
    original_forwarded_for = nil
    if (request.http_env && request.http_env['HTTP_X_FORWARDED_FOR'])
      original_forwarded_for = request.http_env['HTTP_X_FORWARDED_FOR']                                  
    end

    # we used to prepare a comma seperated list in x-forwarded-for if
    # we had multiple requests, as per the x-forwarded-for spec, but I
    # think Google doesn't like it. 
    
    ip_address = (original_forwarded_for ?
        original_forwarded_for  :
        request.client_ip_addr.to_s)
    
    return {} if ip_address.blank?

    # If we've got a comma-seperated list from an X-Forwarded-For, we
    # can't send it on to google, google won't accept that, just take
    # the first one in the list, which is actually the ultimate client
    # IP. split returns the whole string if seperator isn't found, convenient.
    ip_address = ip_address.split(",").first
    
    # If all we have is an internal/private IP from the internal network,
    # do NOT send that to Google, or Google will give you a 503 error
    # and refuse to process your request, as of 7 sep 2011. sigh.
    # Also if it doesn't look like an IP at all, forget it, don't send it.     
    if ((! ip_address =~ /^\d+\.\d+\.\d+\/\d$/) || 
       ip_address.start_with?("10.") || 
       ip_address.start_with?("172.16") || 
       ip_address.start_with?("192.168"))
       return {}
    else    
      return {'X-Forwarded-For' => ip_address }
    end
  end
  
  def find_entries(gbs_response, viewabilities)
    unless (viewabilities.kind_of?(Array))
      viewabilities = [viewabilities]
    end

    entries = gbs_response["items"].find_all do |entry|
      viewability = entry["accessInfo"]["viewability"]
      (viewability && viewabilities.include?(viewability))           
    end

    return entries
  end
  
  
  # We only create a fulltext service response if we have a full view.
  # We create only as many full views as are specified in config.
  def create_fulltext_service_response(request, data)
    full_views = find_entries(data, ViewFullValue)
    return nil if full_views.empty?
    
    count = 0
    full_views.each do |fv|
      
      uri = fv["volumeInfo"]["previewLink"]
          
      request.add_service_response(
          :service => self, 
          :display_text => @display_name, 
          :display_text_i18n => "display_name",
          :url => remove_query_context(uri),           
          :service_type_value =>  :fulltext  
      )
      count += 1
      break if count == @num_full_views
    end   
    return true
  end

  def add_search_inside(request, data)
    # Just take the first one we find, if multiple
    searchable_view = find_entries(data, [ViewFullValue, ViewPartialValue])[0]        
    
    if ( searchable_view )
      url = searchable_view["volumeInfo"]["infoLink"]
      
      request.add_service_response( 
        :service => self,
        :display_text=>@display_name,
        :display_text_i18n => "display_name",
        :url=> remove_query_context(url),
        :service_type_value => :search_inside
       )                  
    end
    
  end
  
  # create highlighted_link service response for partial and noview
  # Only show one web link. prefer a partial view over a noview.
  # Some noviews have a snippet/search, but we have no way to tell. 
  def do_web_links(request, data)

    # some noview items will have a snippet view, but we have no way to tell
    info_views = find_entries(data, ViewPartialValue)
    viewability = ViewPartialValue
    
    if info_views.blank?
      info_views = find_entries(data, ViewNoneValue)
      viewability = ViewNoneValue  
    end
    
    # Shouldn't ever get to this point, but just in case
    return nil if info_views.blank?
    
    url = ''
    iv = info_views.first
    type = nil
    if (viewability == ViewPartialValue && 
        url = iv["volumeInfo"]["previewLink"])
      url = fix_pg_gbs_link(url)
      display_text = @display_name
      display_text_i18n = "display_name"
      type = ServiceTypeValue[:excerpts]
    else
      url = iv["volumeInfo"]["infoLink"]
      url = fix_pg_gbs_link(url)
      display_text = "Book Information"
      display_text_i18n = "book_information"
      type = ServiceTypeValue[:highlighted_link]
    end


    request.add_service_response( 
        :service=>self,    
        :url=> remove_query_context(url),
        :display_text=>display_text,
        :display_text_i18n => display_text_i18n,
        :service_type_value => type    
     )
  end
  
  # google books direct links do weird things with linking to
  # internal pages, perhaps intending to be based on our
  # search criteria, which pages matched, but we're not
  # using it like that for links to excerpts or full page. 
  # reverse engineer it to go to full page. 
  def fix_pg_gbs_link(url)
    url.sub(/([\?\;\&])(pg=[^;&]+)/, '\1pg=1')
  end

  
 
  # Not all responses have a thumbnail_url. We look for them and return the 1st.
  def find_thumbnail_url(data)
    entries = data["items"].collect do |entry|      
      entry["volumeInfo"]["imageLinks"]["thumbnail"] if entry["volumeInfo"] && entry["volumeInfo"]["imageLinks"]      
    end
    
    # removenill values
    entries.compact!    
    
    # pick the first of the available thumbnails, or nil
    return entries[0]
  end
  

  def add_cover_image(request, url)
    zoom_url = url.clone
    
    # if we're sent to a page other than the frontcover then strip out the
    # page number and insert front cover
    zoom_url.sub!(/&pg=.*?&/, '&printsec=frontcover&')
    
    # hack out the 'curl' if we can
    zoom_url.sub!('&edge=curl', '')
    
    request.add_service_response(
        :service=>self, 
        :display_text => 'Cover Image',
        :url => zoom_url, 
        :size => "medium",
        :service_type_value => :cover_image
    )     
  end
  
  # Google gives us URL to the book that contains a 'dq' param
  # with the original query, which for us is an ISSN/LCCN/OCLCnum query,
  # which we don't actually want to leave in there. 
  def remove_query_context(url)
    url.sub(/&dq=[^&]+/, '')    
  end

  # Catch url_for call for search_inside, because we're going to redirect
  def response_url(service_response, submitted_params)
    if ( ! (service_response.service_type_value.name == "search_inside" ))
      return super(service_response, submitted_params)
    else
      # search inside!
      base = service_response[:url]
      query = CGI.escape(submitted_params["query"] || "")
      # attempting to reverse engineer a bit to get 'snippet'
      # style results instead of 'onepage' style results. 
      # snippet seem more user friendly, and are what google's own
      # interface seems to give you by default. but 'onepage' is the
      # default from our deep link, but if we copy the JS hash data,
      # it looks like we can get Google to 'snippet'.       
      url = base + "&q=#{query}#v=snippet&q=#{query}&f=false"
      return url
    end
  end
  
end

# Important to quote search, see: "OCLC1246014"

# Test WorldCat links
# FIXME: This produces two 'noview' links because the ids don't match.
#   This might be as good as we can do though, unless we want to only ever show
#   one 'noview' link. Notice that the metadata does differ between the two.
# http://localhost:3000/resolve?url_ver=Z39.88-2004&rfr_id=info%3Asid%2Fworldcat.org%3Aworldcat&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&req_dat=%3Csessionid%3E&rft_id=info%3Aoclcnum%2F34576818&rft_id=urn%3AISBN%3A9780195101386&rft_id=urn%3AISSN%3A&rft.aulast=Twain&rft.aufirst=Mark&rft.auinitm=&rft.btitle=The+prince+and+the+pauper&rft.atitle=&rft.date=1996&rft.tpages=&rft.isbn=9780195101386&rft.aucorp=&rft.place=New+York&rft.pub=Oxford+University+Press&rft.edition=&rft.series=&rft.genre=book&url_ver=Z39.88-2004
#
# Snippet view returns noview through the API
# http://localhost:3000/resolve?rft.isbn=0155374656
#
# full view example, LCCN 07020699  ; OCLC: 1246014