app/mixin_logic/metadata_helper.rb from team-umlaut/umlaut

app/mixin_logic/metadata_helper.rb
Summary

Maintainability

6 hrs
Test Coverage

Issues
# Helper module for getting keyword-searchable terms out of an OpenURL's
# author and title metadata.
#
# OpenURLs have some commonly agreed upon metadata elements. This module is
# meant to simplify things by sorting through that metadata and extracting
# what we need behind a simpler interface. The values are specifically
# constructed from the citation to work well as keyword searches in other
# services.
#
# Also includes some helper methods for getting identifiers out in a form that
# is convenient to work with, regardless of the non-standard ways they may
# have been stored.

module MetadataHelper
  include MarcHelper # for strip gmd functionality

  # DEPRECATED: not flexible enough; you really need to custom fit the search
  # terms to your given target.
  #
  # Convenience wrapper around get_search_title and get_search_creator that
  # bundles both results for a referent into a single hash.
  #
  # rft:: the referent, handed unchanged to both helpers.
  #
  # Returns a Hash with the keys :title and :creator. A hash is returned so
  # that more keys can be added later without breaking services that use
  # this module.
  def get_search_terms(rft)
    {
      :title   => get_search_title(rft),
      :creator => get_search_creator(rft)
    }
  end


  # A utility method to 'normalize' a title, for use when trying to match a
  # title from one place with records in another database.
  #
  # Lowercases and removes punctuation, but also strips out a bunch of other
  # things that may otherwise result in false negatives. Exactly what works
  # best depends on the particular data you are working with; you need to
  # experiment. Much of this takes account of titles that may have been
  # generated from MARC.
  #
  # arg_title:: the title to normalize; it is never modified (a copy is
  #             edited, so frozen strings are fine).
  # options::   Hash of switches; the hash itself is never modified. A key
  #             that is absent or nil falls back to its default, an explicit
  #             false turns the step off.
  #             :rstrip_parens         (default true)  drop a trailing "(...)"
  #             :remove_all_parens     (default true)  drop every "(...)"
  #             :strip_gmd             (default true)  run strip_gmd on the title
  #             :subtitle_on_semicolon (default false) turn the first ';' into ':'
  #             :remove_subtitle       (default false) drop everything from the
  #                                                    first ':' onwards
  #             :normalize_ampersand   (default true)  '&' becomes ' and '
  #             :remove_punctuation    (default true)  non-alphanumerics become
  #                                                    spaces
  #             :keep_apostrophes      (default false) keep apostrophes even
  #                                                    when removing punctuation
  #
  # Will never return the empty string; returns nil when the input is nil or
  # blank, or when nothing is left after normalization.
  def normalize_title(arg_title, options = {})
    defaults = {
      :rstrip_parens         => true,
      :remove_all_parens     => true,
      :strip_gmd             => true,
      :subtitle_on_semicolon => false,
      :remove_subtitle       => false,
      :normalize_ampersand   => true,
      :remove_punctuation    => true,
      # Even if you're removing other punctuation, keep the apostrophes?
      :keep_apostrophes      => false
    }
    # Merge into a fresh hash rather than using `options[:x] ||= true`: that
    # idiom turned an explicit false back into true (so default-on steps could
    # never be disabled) and wrote into the caller's hash. A nil value still
    # means "use the default".
    options = defaults.merge(options) do |_key, default, given|
      given.nil? ? default : given
    end

    return nil if arg_title.nil?
    # dup, not clone: clone would carry over a frozen flag and make the
    # in-place edits below raise.
    title = arg_title.dup

    return nil if title.blank?

    # Sometimes titles given in the OpenURL have some additional stuff
    # in parens at the end, that messes up the search and isn't really
    # part of the title. Eliminate!
    title.gsub!(/\([^)]*\)\s*$/, '') if options[:rstrip_parens]
    # Or, not even just at the end, but anywhere!
    title.gsub!(/\([^)]*\)/, '') if options[:remove_all_parens]

    # Strip an AACR2 GMD that has made it into the title. strip_gmd is not
    # defined in this module (the module includes MarcHelper for it).
    title = strip_gmd(title) if options[:strip_gmd]

    # There seems to be some cataloging/metadata disagreement about when to
    # use ';' for a subtitle instead of ':'. Normalize to ':' (first one only).
    title.sub!(/[\;]/, ':') if options[:subtitle_on_semicolon]

    title.sub!(/\:(.*)$/, '') if options[:remove_subtitle]

    # Change ampersands to 'and' for consistency, we see it both ways.
    title.gsub!(/\&/, ' and ') if options[:normalize_ampersand]

    # Replace non-alphanumerics with a space, leaving apostrophes alone for now.
    title.gsub!(/[^[[:alnum:]][[:space:]]\']/, ' ') if options[:remove_punctuation]

    # Apostrophes do not become a space, they are just eaten.
    title.gsub!(/[\']/, '') if options[:remove_punctuation] && !options[:keep_apostrophes]

    # Compress whitespace.
    title.strip!
    title.gsub!(/\s+/, ' ')

    title.downcase!

    title.blank? ? nil : title
  end

  # Picks a title out of an OpenURL referent from the best element available;
  # no normalization is applied.
  #
  # Selection order:
  # * format 'journal' with an 'atitle' present: the 'atitle'
  # * format 'book': 'btitle' unless blank, otherwise 'title'
  # * anything else (including a journal without 'atitle'): the first of
  #   'btitle', 'title', 'jtitle' that is set
  #
  # The last case covers referents whose format we don't know but which do
  # carry a title, e.g. when we only had an ISBN to start with and then
  # enhanced. Open question left from the original: should enhancing services
  # also set a format, or should this method care less about format?
  #
  # Returns the chosen title, or nil when none is available.
  def raw_search_title(rft)
    # Just make one call to create the metadata hash.
    fields = rft.metadata

    return fields['atitle'] if rft.format == 'journal' && fields['atitle']

    if rft.format == 'book'
      return fields['btitle'].blank? ? fields['title'] : fields['btitle']
    end

    fields['btitle'] || fields['title'] || fields['jtitle'] || nil
  end
  
  # Chooses the best available title for the referent's format (via
  # raw_search_title) and normalizes it (via normalize_title).
  #
  # rft::     the referent to take the title from.
  # options:: normalize_title options; they are merged over this method's own
  #           defaults, which switch on :remove_all_parens,
  #           :subtitle_on_semicolon, :remove_subtitle and :remove_punctuation.
  #
  # Returns whatever normalize_title returns for the chosen title.
  def get_search_title(rft, options = {})
    search_defaults = {
      :remove_all_parens     => true,
      :subtitle_on_semicolon => true,
      :remove_subtitle       => true,
      :remove_punctuation    => true
    }
    effective_options = search_defaults.merge(options)

    normalize_title(raw_search_title(rft), effective_options)
  end
  
  # Chooses the best available creator for use in a dc.creator style query.
  #
  # Prefers 'aulast' alone when it is available, and otherwise falls back to
  # 'au'.
  #
  # Returns the chosen value, or nil when both are blank.
  #
  # FIXME if capital letters are next to each other should we insert a space?
  #   Should we assume capitals next to each other are initials?
  #   Maybe only if we use au?
  #   Logic like this makes refactoring to use Referent.to_citation less useful.
  #
  # FIXME strip out commas from creator if we use au?
  def get_search_creator(rft)
    # Just make one call to create the metadata hash.
    fields = rft.metadata

    [fields['aulast'], fields['au']].find { |name| !name.blank? }
  end

  # Returns the creator of the top-level work, or nil when the creator we
  # have may only belong to a part of it.
  #
  # For a non-journal thing this is get_search_creator's result (aulast
  # preferred, else au). Nothing is returned for journals, nor for a book
  # _part_ (format "book" with an 'atitle'), since in that case the name
  # _might_ just be the author of the part, not of the book.
  def get_top_level_creator(rft)
    return nil if rft.format == "journal"
    return nil if rft.format == "book" && !rft.metadata['atitle'].blank?

    get_search_creator(rft)
  end
  
  # Finds an identifier across the multiple places it may have been stored.
  #
  # oclcnum and lccn are _supposed_ to be stored as identifiers with an info:
  # uri (info:oclcnum/#, info:lccn/#), but SFX sometimes stores them in the
  # referent metadata instead (rft.lccn, rft.oclcnum). isbn and issn, on the
  # other hand, can legitimately be included in referent metadata or as a urn.
  #
  # type::       :urn or :info, selecting the "urn:<sub_scheme>:" or
  #              "info:<sub_scheme>/" prefix to look for.
  # sub_scheme:: "lccn", "oclcnum", "isbn", "issn", or anything else that could
  #              be found in a urn, an info uri or the referent metadata.
  # referent::   an umlaut Referent object; only its identifiers and metadata
  #              are read.
  # options::    :multiple => true returns every match as an Array instead of
  #              just the first one. The hash is not modified.
  #
  # The referent's identifiers are searched first. Only when none match, and
  # sub_scheme is one of lccn, oclcnum, isbn, issn, doi or pmid, is the
  # referent metadata element of that name used instead.
  #
  # Returns the bare identifier with the prefix removed (not formatted into a
  # urn/uri), or nil when none is found. With :multiple, returns a possibly
  # empty Array of bare identifiers.
  #
  # Raises ArgumentError when type is neither :urn nor :info.
  def get_identifier(type, sub_scheme, referent, options = {} )
    unless type == :urn || type == :info
      raise ArgumentError, "type must be :urn or :info"
    end

    prefix = (type == :info) ? "info:#{sub_scheme}/" : "urn:#{sub_scheme}:"
    # Escape the prefix so regexp metacharacters in sub_scheme are taken
    # literally, and anchor at the start of the string rather than of a line.
    pattern = /\A#{Regexp.escape(prefix)}(.*)/

    identifiers = referent.identifiers.map do |id|
      Regexp.last_match(1) if id =~ pattern
    end.compact

    metadata_schemes = ['lccn', 'oclcnum', 'isbn', 'issn', 'doi', 'pmid']
    if identifiers.blank? && metadata_schemes.include?(sub_scheme)
      # Nothing among the identifiers: try the referent metadata.
      from_rft = referent.metadata[sub_scheme]
      identifiers = [from_rft] unless from_rft.blank?
    end

    return identifiers if options[:multiple]

    first_found = identifiers[0]
    first_found.blank? ? nil : first_found
  end

  # Finds an LCCN for the referent (info:lccn scheme, looked up through
  # get_identifier) and normalizes it with normalize_lccn. If multiple LCCNs
  # are in the record, the first one is used.
  def get_lccn(rft)
    normalize_lccn(get_identifier(:info, "lccn", rft))
  end

  # Gets the ISSN from the referent metadata, returning it only when it looks
  # like an ISSN: four digits, an optional hyphen, three digits and a final
  # digit or 'X'. Never returns the empty string; returns nil when there is
  # no such value.
  #
  # The pattern is not anchored, so a longer value merely containing an
  # ISSN-shaped run also passes, and is returned unchanged.
  def get_issn(rft)
    candidate = rft.metadata['issn']
    candidate =~ /\d{4}(-)?\d{3}(\d|X)/ ? candidate : nil
  end

  # Some normalization of an LCCN. See:
  # http://info-uri.info/registry/OAIHandler?verb=GetRecord&metadataPrefix=reg&identifier=info:lccn/
  #
  # Steps, in order:
  # 1. remove all whitespace
  # 2. remove the first forward slash and anything after it
  # 3. at the first hyphen, left-pad what follows with zeros to six
  #    characters if it is shorter, then drop the hyphen
  #
  # Doesn't validate right now, only normalizes.
  # TBD: raise an exception if the string is invalid.
  #
  # Returns the normalized string; a nil (or false) argument is handed back
  # as is.
  def normalize_lccn(lccn)
    return lccn unless lccn

    lccn.gsub(/\s/, '').sub(/\/.*$/, '').sub(/-(.*)/) do
      Regexp.last_match(1).rjust(6, "0")
    end
  end

  # Gets the ISBN (urn:isbn scheme, looked up through get_identifier), also
  # removing any weird stuff sometimes included as part of the 'isbn' that
  # isn't part of the ISBN itself, like "(paperback)" and such: every
  # character other than a digit or an uppercase 'X' is dropped.
  #
  # Returns the cleaned ISBN, or nil when there is none or nothing is left
  # after cleaning.
  def get_isbn(rft)
    raw = get_identifier(:urn, "isbn", rft)
    cleaned = raw ? raw.gsub(/[^\dX]/, '') : raw

    cleaned.blank? ? nil : cleaned
  end

  # Looks up the referent's OCLC number (info:oclcnum scheme) through
  # get_identifier and returns its result unchanged.
  def get_oclcnum(rft)
    get_identifier(:info, "oclcnum", rft)
  end

  # Looks up the referent's DOI (info:doi scheme) through get_identifier and
  # returns its result unchanged.
  def get_doi(rft)
    get_identifier(:info, "doi", rft)
  end

  # Looks up the referent's PubMed id (info:pmid scheme) through
  # get_identifier and returns its result unchanged.
  def get_pmid(rft)
    get_identifier(:info, "pmid", rft)
  end

  # Collects the GPO item numbers of a referent, which are carried in a
  # technically illegal, but used by OCLC, info:gpo uri.
  #
  # Returns an Array of item numbers, possibly empty.
  def get_gpo_item_nums(rft)
    gpo_ids = get_identifier(:info, "gpo", rft, :multiple => true)

    # Remove the uri part from any value that still carries it.
    gpo_ids.map { |item| item.sub(/^info:gpo\//, '') }
  end

  # Finds a SuDoc number for the referent, un-escaping it, since it was
  # escaped to make it a uri. Two places are tried, in order:
  #
  # 1. a technically illegal, but oh well, info:sudoc uri (looked up through
  #    get_identifier)
  # 2. rsinger's purl for sudoc among the referent's identifiers, see
  #    http://dilettantes.code4lib.org/2009/03/a-uri-scheme-for-sudocs/
  #
  # Returns the un-escaped SuDoc, or nil when neither form is present.
  def get_sudoc(rft)
    from_info_uri = get_identifier(:info, "sudoc", rft)
    return CGI.unescape(from_info_uri) if from_info_uri

    purl_matches = rft.identifiers.collect do |id|
      $1 if id =~ /^http:\/\/purl.org\/NET\/sudoc\/(.*)$/
    end
    from_purl = purl_matches.compact.first

    from_purl ? CGI.unescape(from_purl) : nil
  end

  # Returns the first four characters of the referent's 'date', or of its
  # 'year' when there is no 'date' (some link generators use an illegal
  # 'year' parameter). Returns nil when neither is set.
  #
  # NOTE(review): the values are read with rft[...] here, whereas the sibling
  # helpers read rft.metadata[...] -- confirm the referent's [] accessor
  # exposes these values.
  def get_year(rft)
    date = rft['date'] || rft['year']
    date ? date[0, 4] : nil
  end

  # Returns the month of the referent as a String, or nil when unknown.
  #
  # The month is taken from the one or two digits following "YYYY-" in the
  # 'date' metadata element. When 'date' has no such part, the 'month'
  # element is returned as is (some link generators use an illegal 'month'
  # parameter).
  def get_month(rft)
    fields = rft.metadata

    return Regexp.last_match(1) if fields['date'] =~ /\d\d\d\d\-(\d\d?)/

    fields['month'] || nil
  end

  # Start page of the referent: uses `spage`, or else tries to parse `pages`.
  #
  # * `spage` is returned when present
  # * otherwise, when `pages` contains a hyphen, the part before the first
  #   hyphen is returned, without surrounding spaces
  # * otherwise `pages` itself is returned when present
  #
  # Returns nil when none of these apply.
  def get_spage(rft)
    start_page = rft.metadata['spage']
    return start_page if start_page.present?

    page_range = rft.metadata['pages']
    return Regexp.last_match(1) if page_range =~ /\A *(.*?) *\-.*\Z/

    page_range.present? ? page_range : nil
  end

  # End page of the referent: uses `epage`, or else tries to parse `pages`.
  #
  # * `epage` is returned when present
  # * otherwise, when `pages` contains a hyphen, the part after the last
  #   hyphen is returned, without leading spaces
  # * otherwise `pages` itself is returned when present
  #
  # Returns nil when none of these apply.
  def get_epage(rft)
    end_page = rft.metadata['epage']
    return end_page if end_page.present?

    page_range = rft.metadata['pages']
    return Regexp.last_match(1) if page_range =~ /\A.*\- *(.*) *\Z/

    page_range.present? ? page_range : nil
  end

  # Looks at weird bad OpenURLs and uses heuristics to decide whether the
  # 'title' probably represents a journal rather than a book. A guess at
  # best, based on the bad data we've seen, sigh.
  #
  # The answer is true only when all of these hold:
  # * the format is neither "book" nor "dissertation"
  # * there is no 'btitle'
  # * the genre is "journal" or "article", or there is a 'jtitle', or there
  #   is an 'issn' and no genre at all
  def title_is_serial?(rft)
    return false if rft.format == "book" || rft.format == "dissertation"
    return false unless rft.metadata["btitle"].blank?

    genre = rft.metadata["genre"]
    return true if %w{journal article}.include?(genre)
    return true if rft.metadata['jtitle'].present?

    genre.blank? && rft.metadata["issn"].present?
  end
  # Mark it a module function, so that besides being mixed in it can also be
  # called directly on the module as a utility:
  #
  #   MetadataHelper.title_is_serial?(referent)
  #
  # Note that module_function also makes the mixed-in instance-method copy
  # private.
  module_function :title_is_serial?
  
end