team-umlaut/umlaut

View on GitHub
app/service_adaptors/worldcat_identities.rb

Summary

Maintainability
D
2 days
Test Coverage
# Service that uses available metadata to try to find an exact match to a 
# WorldCat Identity. 
#
# Requires sufficient author information and/or an oclcnumber to have enough
# info to try and find a match. Best to run AFTER services that may enhance
# metadata with this info (such as Amazon). 
# 
# See: http://outgoing.typepad.com/outgoing/2008/06/linking-to-worl.html
# 
# Creates a highlighted_link
# Even though the WorldCat Identities API is built on top of SRU we use 
# open-uri to simply fetch the "html" page (which is really XML with a stylesheet)
# and look at the XML directly. 

# SRU was too slow and was timing out the background service
# (because SRU parses the response with REXML? We don't want to have anything
#  to do with REXML!). Even so, retrieving the large XML file and traversing
#  it with Hpricot (what we used to use) was still rather slow. The suggestion
#  is to enable only a few note_types and then constrain the number shown. The
#  defaults are hopefully sane in this regard. note_types can be set to false
#  to turn them off.
#  
#  Also can create an optional link to Wikipedia.
#  
#  There's probably a lot more we could pull out of these identities pages if
#  we wanted to. If more of these are used they might warrant their own 
#  service type and part of the page for better layout.

class WorldcatIdentities < Service
  require 'open-uri' # SRU is too slow even though we use an SRU-like link
  require 'nokogiri'
  include MetadataHelper
  
  # Read-only accessors for the adaptor's settings. Most of the backing
  # instance variables get a default in #initialize; @num_of_subject_headings
  # is not assigned there, so its reader returns nil unless something else
  # sets it (presumably the config passed to super -- confirm).
  attr_reader :url, :note_types, :display_name, :wikipedia_link, :openurl_base,
    :require_identifier,
    # below starts the note_types which can be restrained
    :num_of_roles, :num_of_subject_headings, :num_of_works, :num_of_genres
  
  # Lists the service types this adaptor can produce: only highlighted links.
  def service_types_generated
    [ServiceTypeValue[:highlighted_link]]
  end
  
  # Assigns the adaptor's default settings and then passes the supplied
  # config on to the Service superclass.
  # NOTE(review): presumably super(config) lets config keys override the
  # defaults below -- confirm against Service#initialize.
  def initialize(config)
    # Where to search, and how the service is labelled and credited.
    @url = 'http://worldcat.org/identities/search/'
    @display_name = "WorldCat Identities"
    @credits = { "WorldCat Identities" => "http://www.worldcat.org/identities/" }

    # An OCLC number is used when available but is not demanded by default.
    @require_identifier = false

    # Notes to show with the main link; any plural note_types can be
    # restrained with a num_of_* limit. No limit is set here for subject
    # headings.
    @note_types = ["combined_counts"]
    @num_of_roles = 5
    @num_of_works = 1
    @num_of_genres = 5

    # Optional extra links.
    @wikipedia_link = true
    @openurl_widely_held = true
    @worldcat_widely_held = false
    @openurl_base = '/resolve'

    super(config)
  end
  
  # Entry point for a request: builds the Identities query from the request's
  # referent, runs it when a query could be built, and then marks the request
  # as dispatched for this service.
  #
  # Returns whatever request.dispatched(self, true) returns.
  def handle(request)
    index, query = define_query(request.referent)
    do_query(request, index, query) unless query.blank?
    request.dispatched(self, true)
  end
  
  # Builds the WorldCat Identities search for a referent.
  #
  # rft - the referent; its metadata hash (aulast, aufirst, au, aucorp) and
  #       the oclcnum identifier returned by get_identifier are consulted.
  #
  # Returns [index, query]: the Identities index to search and the URL-ready
  # query string (with the sortKeys parameter appended). Returns nil when
  # there is not enough information to search on, or when an identifier is
  # required (@require_identifier) but missing.
  def define_query(rft)
    oclcnum = get_identifier(:info, "oclcnum", rft)
    metadata = rft.metadata

    # Every field is tested with blank? so that empty strings are treated the
    # same as missing values in all of the decisions below.
    has_oclcnum = !oclcnum.blank?
    has_aulast  = !metadata['aulast'].blank?
    has_aufirst = !metadata['aufirst'].blank?
    has_au      = !metadata['au'].blank?
    has_aucorp  = !metadata['aucorp'].blank?
    has_author  = (has_aulast && has_aufirst) || has_au || has_aucorp

    # Do we have enough info to do a query with sufficient precision?
    # We are choosing better recall in exchange for lower precision.
    # We'll search with oclcnum if we have it, but not require it; we'll
    # search fuzzily on various parts of the name if necessary.
    if !has_oclcnum && (!has_author || @require_identifier)
      Rails.logger.debug("Worldcat Identities Service Adaptor: Skipped: Insufficient metadata for lookup")
      return nil
    end

    # Instead of searching across all indexes we target the one we want.
    name_part = nil
    name = nil
    name_operator = "%3D" # URL-encoded "=" (exact match)
    if has_aulast && has_oclcnum
      # Just last name is enough, we have an oclcnum.
      index = 'PersonalIdentities'
      name_part = 'FamilyName'
      name = clean_name(metadata['aulast'])
    elsif has_au
      # Next choice, undivided author string.
      index = "PersonalIdentities"
      name_part = 'Name'
      name = clean_name(metadata['au'])
      name_operator = "all"
    elsif has_aulast && has_aufirst
      # Combine first and last name.
      index = "PersonalIdentities"
      name_part = 'Name'
      name = clean_name(metadata['aufirst'] + ' ' + metadata['aulast'])
      name_operator = "all"
    elsif has_aucorp
      # Corporate name.
      index = 'CorporateIdentities'
      name_part = 'Name'
      name = clean_name(metadata['aucorp'])
    else
      # oclcnum but no usable author information at all! Might still work...
      index = "Identities"
    end

    query_conditions = []
    query_conditions << "local.#{name_part}+#{name_operator}+%22#{name}%22" if name
    query_conditions << "local.OCLCNumber+%3D+%22#{CGI.escape(oclcnum)}%22" if has_oclcnum

    # Sort keys is important when we don't have an oclcnumber, and doesn't
    # hurt when we do.
    query = query_conditions.join("+and+") + "&sortKeys=holdingscount"
    return index, query
  end
  
  # Prepares a name for use in the query URL: the value is CGI-escaped and
  # all periods are then dropped. More characters may need removing later.
  def clean_name(name)
    escaped = CGI.escape(name)
    escaped.delete('.')
  end
  
  # Fetches the Identities search result for the given index and query and,
  # when at least one record is reported, adds the configured service
  # responses (main link, Wikipedia link, widely-held links) to the request.
  #
  # request - the request that receives the service responses.
  # index   - name of the Identities index, appended to @url.
  # query   - URL-ready query string as built by define_query.
  #
  # Returns nil when the response reports zero records or carries no
  # numberOfRecords element at all.
  def do_query(request, index, query)
    # Since we're only doing exact matching with last name and OCLCnum
    # we only request 1 record to hopefully speed things up.
    link = @url + index + '?query=' + query + "&maximumRecords=1"

    # Kernel#open no longer opens URLs on current Rubies; URI#open (added by
    # open-uri) does, and the block form closes the stream once it is read.
    result = URI.parse(link).open("Accept" => "text/xml") { |io| io.read }
    xml = Nokogiri::XML(result)

    # Identities namespaces are all over the place, it's too hard
    # to interrogate with namespaces, ask nokogiri to remove them all
    # instead.
    xml.remove_namespaces!

    record_count = xml.at("numberOfRecords")
    return nil if record_count.nil? || record_count.inner_text == '0'

    create_link(request, xml)
    create_wikipedia_link(request, xml) if @wikipedia_link
    create_openurl_widely_held(request, xml) if @openurl_widely_held
    create_worldcat_widely_held(request, xml) if @worldcat_widely_held
  end
  
  # Adds the main "About <name>" link to the identity page.
  #
  # request - the request that receives the service response.
  # xml     - parsed Identities document.
  #
  # Notes are only extracted when @note_types is set. When the document
  # yields no display name the service's own @display_name is used as the
  # link text instead of raising on a nil concatenation.
  def create_link(request, xml)
    name = extract_display_name(xml)
    display_name = name ? "About #{name}" : @display_name
    extracted_notes = extract_notes(xml) if @note_types
    url = extract_url(xml)
    create_service_response(request, display_name, url, extracted_notes)
  end
    
  # Builds the note text for the main link. For every entry in @note_types
  # the matching extract_<note_type> method is called with the document;
  # nil answers are skipped and the rest are joined with ' | '.
  #
  # Returns the joined String, or nil when no note produced any text.
  def extract_notes(xml)
    pieces = @note_types.map do |note_type|
      send(("extract_" + note_type).to_sym, xml)
    end.compact
    return nil if pieces.blank?
    pieces.join(' | ')
  end
  
  # Reads the identity's name from the nameInfo/rawName element by joining
  # the text of each of its child nodes with a single space.
  #
  # Returns the name String, or nil when the element is missing or has no
  # children.
  def extract_display_name(doc)
    raw_name = doc.at("nameInfo/rawName")
    parts = raw_name ? raw_name.children.map { |child| child.inner_text } : []
    parts.blank? ? nil : parts.join(' ')
  end
    
  # Collects the text of the biogSH elements, stopping once
  # @num_of_subject_headings of them have been gathered (every heading is
  # taken when that limit never equals the running count, e.g. when it is
  # nil).
  #
  # Returns "subject headings: a; b; ..." or nil when there are none.
  def extract_subject_headings(doc)
    limit = @num_of_subject_headings
    headings = []
    doc.search("biogSH").each_with_index do |node, idx|
      headings.push(node.inner_text)
      break if limit == idx.succ
    end
    headings.blank? ? nil : "subject headings: " + headings.join('; ')
  end
  
  # Lists the roles (author, editor, ...) recorded for the identity, using
  # the code attribute of up to @num_of_roles relators/relator elements and
  # translating each code through RELATOR_CODES.
  #
  # Codes without an entry in RELATOR_CODES are left out.
  #
  # Returns "roles: Author, Editor" or nil when no known role is found.
  def extract_roles(doc)
    codes = []
    doc.search("relators/relator").each_with_index do |relator, i|
      # Node#[] yields the attribute's String value. Node#attributes would
      # hand back attribute objects, which never match RELATOR_CODES' String
      # keys.
      codes << relator['code']
      break if @num_of_roles == i + 1
    end

    # Labels are stripped so that padded table entries do not leak stray
    # spaces into the joined text.
    roles = codes.map { |code| RELATOR_CODES[code.to_s] }.compact.map { |label| label.strip }
    return nil if roles.empty?
    "roles: " + roles.join(', ')
  end
  
  # FIXME a lot more could be done with "by citations". Identities gives
  # summaries of the most popular works as well as other descriptive
  # information like subject headings. This might be usable for enhancing
  # metadata.
  #
  # Collects the by/citation/title texts, stopping once @num_of_works titles
  # have been gathered.
  #
  # Returns "most widely held work(s): ..." or nil when there are no titles.
  def extract_works(doc)
    limit = @num_of_works
    titles = []
    doc.search("by/citation/title").each_with_index do |title_node, idx|
      titles.push(title_node.inner_text)
      break if limit == idx.succ
    end
    return nil if titles.blank?

    noun = titles.length == 1 ? "work" : "works"
    "most widely held #{noun}: #{titles.join('; ')}"
  end
  
  # Collects the genres/genre texts, stopping once @num_of_genres of them
  # have been gathered.
  #
  # Returns "genres: a, b, ..." or nil when the document lists none.
  def extract_genres(doc)
    limit = @num_of_genres
    names = []
    doc.search("genres/genre").each_with_index do |genre_node, idx|
      names.push(genre_node.inner_text)
      break if limit == idx.succ
    end
    names.blank? ? nil : "genres: " + names.join(', ')
  end
  
  # Combines the work, publication and holdings counts into one sentence of
  # the form "<works> in <publications> with <holdings>".
  def extract_combined_counts(doc)
    works = extract_work_count(doc)
    publications = extract_publications_count(doc)
    holdings = extract_holdings_count(doc)
    "#{works} in #{publications} with #{holdings}"
  end
  
  # Returns the workCount element's text, comma-formatted by insert_commas,
  # followed by " works".
  def extract_work_count(doc)
    raw_count = doc.at("workCount").inner_text
    "#{insert_commas(raw_count)} works"
  end
  
  # Returns the totalHoldings element's text, comma-formatted by
  # insert_commas, followed by " total holdings in WorldCat".
  def extract_holdings_count(doc)
    raw_count = doc.at("totalHoldings").inner_text
    "#{insert_commas(raw_count)} total holdings in WorldCat"
  end
  
  # Returns the recordCount element's text, comma-formatted by
  # insert_commas, followed by " publications".
  def extract_publications_count(doc)
    raw_count = doc.at("recordCount").inner_text
    "#{insert_commas(raw_count)} publications"
  end
  
  # Builds the identity page URL by appending the pnkey element's text to
  # the WorldCat Identities base address.
  def extract_url(doc)
    key = doc.at("pnkey").inner_text
    "http://worldcat.org/identities/#{key}"
  end
  
  # Adds thousands separators to a numeric String, e.g. "1234567" becomes
  # "1,234,567". The string is reversed so that digits can be grouped in
  # threes from the right-hand side, then reversed back after joining.
  def insert_commas(n)
    reversed_groups = n.reverse.scan(/(?:\d*\.)?\d{1,3}-?/)
    reversed_groups.join(',').reverse
  end
  
  # Registers a highlighted_link service response on the request.
  #
  # request         - receives the response through add_service_response.
  # display_name    - link text.
  # url             - link target.
  # extracted_notes - note text shown with the link (may be nil).
  def create_service_response(request, display_name, url, extracted_notes)
    request.add_service_response(
      service: self,
      url: url,
      display_text: display_name,
      notes: extracted_notes,
      service_type_value: :highlighted_link
    )
  end
  
  # Adds a highlighted link to a Wikipedia search for the identity, using
  # the text of the document's wikiLink element as the search term.
  #
  # Returns nil without adding anything when there is no wikiLink element.
  def create_wikipedia_link(request, xml)
    wiki_node = xml.at("wikiLink")
    return nil unless wiki_node

    page = wiki_node.inner_text
    request.add_service_response(
      service: self,
      # This is the base link that worldcat identities uses so we use the same
      url: "http://en.wikipedia.org/wiki/Special:Search?search=#{page}",
      display_text: "About " + page.titlecase,
      notes: '',
      source: 'Wikipedia',
      service_type_value: :highlighted_link
    )
  end
  
  # Adds a highlighted link that sends the author's most widely held work
  # back through the resolver as a new OpenURL request.
  #
  # Returns nil without adding anything when circular_link? reports that the
  # work is the one already being requested.
  def create_openurl_widely_held(request, xml)
    top_work = get_widely_held_info(xml)
    # try to remove circular links
    return nil if circular_link?(request, top_work)

    request.add_service_response(
      service: self,
      url: create_openurl(request, top_work),
      display_text: top_work['title'],
      notes: "This author's most widely held work.",
      service_type_value: :highlighted_link
    )
  end
  
  # Tells whether a citation points at the item the request is already
  # about, so that a link to it would just lead back to the same record.
  #
  # request       - the current request; its referent supplies the OCLC
  #                 number (get_identifier) and title (get_search_title).
  # citation_info - Hash with 'oclcnum' and 'title' keys.
  #
  # Returns true when the OCLC numbers are equal or the whitespace-stripped
  # titles are equal, false otherwise. A missing title on either side never
  # counts as a match.
  def circular_link?(request, citation_info)
    rft = request.referent
    request_oclcnum = get_identifier(:info, "oclcnum", rft)
    return true if citation_info['oclcnum'] == request_oclcnum

    request_title = get_search_title(rft)
    citation_title = citation_info['title']
    return false if request_title.nil? || citation_title.nil?

    # Further cleaning might be necessary for titles to be good matches.
    citation_title.strip == request_title.strip
  end
  
  # Creates a minimal OpenURL for making a new request to Umlaut.
  #
  # request - the current request; any aulast, aufirst and aucorp values in
  #           its referent metadata are carried over.
  # wh      - Hash describing the target work ('record_type', 'oclcnum',
  #           'title').
  #
  # Returns @openurl_base followed by '?' and the context object's KEV
  # string.
  def create_openurl(request, wh)
    source = request.referent.metadata
    context = OpenURL::ContextObject.new
    target = context.referent

    target.set_format(wh['record_type'])
    target.add_identifier("info:oclcnum/#{wh['oclcnum']}")
    %w[aulast aufirst aucorp].each do |key|
      target.set_metadata(key, source[key]) if source[key]
    end
    target.set_metadata('title', wh['title'])

    @openurl_base + '?' + context.kev
  end
  
  # We just link to worldcat using the oclc number provided.
  # FIXME this might need a special partial if we incorporate a cover image.
  #
  # Adds a highlighted link to the WorldCat record of the author's most
  # widely held work. When extract_cover_image_link supplies a cover URL the
  # link text is an <img> tag and the title moves into the notes; otherwise
  # the link text is the title.
  #
  # Returns nil without adding anything when circular_link? reports that the
  # work is the one already being requested.
  def create_worldcat_widely_held(request, xml)
    info = get_widely_held_info(xml)
    # try to prevent circular links
    return nil if circular_link?(request, info)

    title = info['title']
    cover_url = extract_cover_image_link(request, info['most'])
    base_note = "this author's most widely held work in WorldCat"

    if cover_url
      display_text = "<img src=\"#{cover_url}\" style=\"width:75px;\"/>"
      notes = "#{title} is #{base_note}"
    else
      display_text = title
      notes = base_note
    end

    # Link format per http://www.worldcat.org/links/
    request.add_service_response(
      service: self,
      url: "http://www.worldcat.org/oclc/#{info['oclcnum']}",
      display_text: display_text,
      notes: notes,
      service_type_value: :highlighted_link
    )
  end
  
  # Gathers details of the first by/citation element in the document.
  #
  # Returns a Hash with these keys:
  #   'most'        - the citation node itself
  #   'oclcnum'     - the citation's oclcnum text, passed through clean_oclcnum
  #   'title'       - the citation's title text
  #   'record_type' - the citation's recordType text
  def get_widely_held_info(xml)
    citation = xml.at("by/citation")
    {
      'most' => citation,
      'oclcnum' => clean_oclcnum(citation.at("oclcnum").inner_text),
      'title' => citation.at("title").inner_text,
      'record_type' => citation.at('recordType').inner_text
    }
  end
  
  # Works out a WorldCat cover image URL for a citation.
  #
  # request  - the current request; its referent metadata ('oclcnum',
  #            'isbn') is used to avoid repeating a cover that is probably
  #            already on show.
  # citation - citation node whose cover element is inspected.
  #
  # Returns the image URL String, or nil when the citation has no cover
  # element, the cover belongs to the requested item (same OCLC number or
  # same ISBN), or the cover's type is not 'isbn'.
  def extract_cover_image_link(request, citation)
    cover = citation.at("cover")
    return nil unless cover

    metadata = request.referent.metadata

    # We try not to show a cover if we already probably have the same cover
    # showing. Node#[] returns the attribute value as a String (or nil);
    # both numbers are cleaned and compared for equality.
    cover_oclc = clean_oclcnum(cover['oclc'])
    request_oclc = metadata['oclcnum']
    if request_oclc && cover_oclc && clean_oclcnum(request_oclc) == cover_oclc
      return nil
    end

    cover_number = cover.inner_text
    return nil if metadata['isbn'] && metadata['isbn'] == cover_number

    return nil unless cover['type'] == 'isbn'
    "http://www.worldcat.org/wcpa/servlet/DCARead?standardNoType=1&standardNo=" + cover_number
  end
  
  # Normalises an OCLC number by removing a leading "(OCoLC)" marker, a
  # leading ocn/ocm/on/ocl7 prefix and any zeros that pad the front of the
  # number.
  #
  # The pattern is anchored to the start of the string so that zeros inside
  # the number are preserved ("1200345" stays "1200345").
  #
  # num - the raw value; anything that is not a String (e.g. nil) is
  #       returned untouched.
  #
  # Returns the cleaned String.
  def clean_oclcnum(num)
    return num unless num.is_a?(String)
    num.sub(/\A(?:\(OCoLC\))?(?:ocn|ocm|on|ocl7)?0*/, '')
  end
  
  # Maps three-letter relator codes to their display labels.
  # Relator codes are from http://worldcat.org/identities/relators.xml which
  # was referenced from http://worldcat.org/identities/Identities.xsl
  # Labels carry no padding or stray spaces around hyphens, and the table is
  # frozen so it cannot be modified at runtime.
  RELATOR_CODES = {
    "act" => "Actor",
    "adp" => "Adapter",
    "aft" => "Author of afterword, colophon, etc.",
    "anm" => "Animator",
    "ann" => "Annotator",
    "ant" => "Bibliographic antecedent",
    "app" => "Applicant",
    "aqt" => "Author in quotations or text abstracts",
    "arc" => "Architect",
    "arr" => "Arranger",
    "art" => "Artist",
    "asg" => "Assignee",
    "asn" => "Associated name",
    "att" => "Attributed name",
    "auc" => "Auctioneer",
    "aud" => "Author of dialog",
    "aui" => "Author of introduction",
    "aus" => "Author of screenplay",
    "aut" => "Author",
    "bdd" => "Binding designer",
    "bjd" => "Bookjacket designer",
    "bkd" => "Book designer",
    "bkp" => "Book producer",
    "bnd" => "Binder",
    "bpd" => "Bookplate designer",
    "bsl" => "Bookseller",
    "ccp" => "Conceptor",
    "chr" => "Choreographer",
    "clb" => "Collaborator",
    "cli" => "Client",
    "cll" => "Calligrapher",
    "clt" => "Collotyper",
    "cmm" => "Commentator",
    "cmp" => "Composer",
    "cmt" => "Compositor",
    "cng" => "Cinematographer",
    "cnd" => "Conductor",
    "cns" => "Censor",
    "coe" => "Contestant-appellee",
    "col" => "Collector",
    "com" => "Compiler",
    "cos" => "Contestant",
    "cot" => "Contestant-appellant",
    "cov" => "Cover designer",
    "cpc" => "Copyright claimant",
    "cpe" => "Complainant-appellee",
    "cph" => "Copyright holder",
    "cpl" => "Complainant",
    "cpt" => "Complainant-appellant",
    "cre" => "Creator",
    "crp" => "Correspondent",
    "crr" => "Corrector",
    "csl" => "Consultant",
    "csp" => "Consultant to a project",
    "cst" => "Costume designer",
    "ctb" => "Contributor",
    "cte" => "Contestee-appellee",
    "ctg" => "Cartographer",
    "ctr" => "Contractor",
    "cts" => "Contestee",
    "ctt" => "Contestee-appellant",
    "cur" => "Curator",
    "cwt" => "Commentator for written text",
    "dfd" => "Defendant",
    "dfe" => "Defendant-appellee",
    "dft" => "Defendant-appellant",
    "dgg" => "Degree grantor",
    "dis" => "Dissertant",
    "dln" => "Delineator",
    "dnc" => "Dancer",
    "dnr" => "Donor",
    "dpc" => "Depicted",
    "dpt" => "Depositor",
    "drm" => "Draftsman",
    "drt" => "Director",
    "dsr" => "Designer",
    "dst" => "Distributor",
    "dte" => "Dedicatee",
    "dto" => "Dedicator",
    "dub" => "Dubious author",
    "edt" => "Editor",
    "egr" => "Engraver",
    "elt" => "Electrotyper",
    "eng" => "Engineer",
    "etr" => "Etcher",
    "exp" => "Expert",
    "fac" => "Facsimilist",
    "flm" => "Film editor",
    "fmo" => "Former owner",
    "fpy" => "First party",
    "fnd" => "Funder",
    "frg" => "Forger",
    "grt" => "Graphic technician",
    "hnr" => "Honoree",
    "hst" => "Host",
    "ill" => "Illustrator",
    "ilu" => "Illuminator",
    "ins" => "Inscriber",
    "inv" => "Inventor",
    "itr" => "Instrumentalist",
    "ive" => "Interviewee",
    "ivr" => "Interviewer",
    "lbt" => "Librettist",
    "lee" => "Libelee-appellee",
    "lel" => "Libelee",
    "len" => "Lender",
    "let" => "Libelee-appellant",
    "lgd" => "Lighting designer",
    "lie" => "Libelant-appellee",
    "lil" => "Libelant",
    "lit" => "Libelant-appellant",
    "lsa" => "Landscape architect",
    "lse" => "Licensee",
    "lso" => "Licensor",
    "ltg" => "Lithographer",
    "lyr" => "Lyricist",
    "mfr" => "Manufacturer",
    "mdc" => "Metadata contact",
    "mod" => "Moderator",
    "mon" => "Monitor",
    "mrk" => "Markup editor",
    "mte" => "Metal-engraver",
    "mus" => "Musician",
    "nrt" => "Narrator",
    "opn" => "Opponent",
    "org" => "Originator",
    "orm" => "Organizer of meeting",
    "oth" => "Other",
    "own" => "Owner",
    "pat" => "Patron",
    "pbd" => "Publishing director",
    "pbl" => "Publisher",
    "pfr" => "Proofreader",
    "pht" => "Photographer",
    "plt" => "Platemaker",
    "pop" => "Printer of plates",
    "ppm" => "Papermaker",
    "ppt" => "Puppeteer",
    "prc" => "Process contact",
    "prd" => "Production personnel",
    "prf" => "Performer",
    "prg" => "Programmer",
    "prm" => "Printmaker",
    "pro" => "Producer",
    "prt" => "Printer",
    "pta" => "Patent applicant",
    "pte" => "Plaintiff-appellee",
    "ptf" => "Plaintiff",
    "pth" => "Patent holder",
    "ptt" => "Plaintiff-appellant",
    "rbr" => "Rubricator",
    "rce" => "Recording engineer",
    "rcp" => "Recipient",
    "red" => "Redactor",
    "ren" => "Renderer",
    "res" => "Researcher",
    "rev" => "Reviewer",
    "rpt" => "Reporter",
    "rpy" => "Responsible party",
    "rse" => "Respondent-appellee",
    "rsg" => "Restager",
    "rsp" => "Respondent",
    "rst" => "Respondent-appellant",
    "rth" => "Research team head",
    "rtm" => "Research team member",
    "sad" => "Scientific advisor",
    "sce" => "Scenarist",
    "scl" => "Sculptor",
    "scr" => "Scribe",
    "sec" => "Secretary",
    "sgn" => "Signer",
    "sng" => "Singer",
    "spk" => "Speaker",
    "spn" => "Sponsor",
    "spy" => "Second party",
    "srv" => "Surveyor",
    "std" => "Set designer",
    "stl" => "Storyteller",
    "stn" => "Standards body",
    "str" => "Stereotyper",
    "tch" => "Teacher",
    "ths" => "Thesis advisor",
    "trc" => "Transcriber",
    "trl" => "Translator",
    "tyd" => "Type designer",
    "tyg" => "Typographer",
    "vdg" => "Videographer",
    "voc" => "Vocalist",
    "wam" => "Writer of accompanying material",
    "wdc" => "Woodcutter",
    "wde" => "Wood-engraver",
    "wit" => "Witness"
  }.freeze
  
end