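# Source: gaurav/biburi (viewed on GitHub)
# File:   lib/biburi/driver/coins.rb
#
# Code-quality summary captured with this listing:
#   Maintainability: D (estimated remediation time: 1 day)
#   Test Coverage:   (no value captured)
#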
# A driver to read BibTeX from web pages by reading
# in COinS on that page. Find out more about COinS
# at: http://ocoins.info/
#
# Author:: Gaurav Vaidya
# Copyright:: Copyright (c) 2013 Gaurav Vaidya
# License:: MIT

require 'biburi'
require 'biburi/driver'
require 'net/http'
require 'json'
require 'bibtex'
require 'uri'
require 'nokogiri'
require 'cgi'

# This driver reads COinS citations in an HTML
# page and returns them. It defaults to returning
# a single citation, but can be used to return all
# citations.

module BibURI::Driver::COinS
  include BibURI::Driver

  # An identifier is supported exactly when self.canonical can
  # produce a canonical form for it, i.e. returns something
  # other than nil.
  def self.supported?(id)
    !canonical(id).nil?
  end

  # Returns the canonical form of +id+: the string form of the
  # parsed URI when its scheme is 'http' or 'https', and nil
  # otherwise. Input that cannot be parsed as a URI at all
  # also yields nil.
  def self.canonical(id)
    parsed = URI.parse(id)
    %w[http https].include?(parsed.scheme) ? parsed.to_s : nil
  rescue URI::InvalidURIError
    # Something that is not a URI is not an identifier for this driver.
    nil
  end

  # Looks up the provided id (a URL) and returns a single
  # citation: the first entry that self.lookup_all() yields.
  #
  # Pages such as Mendeley's also carry COinS for 'related to'
  # citations; stopping at the first entry avoids pulling those
  # in. Use self.lookup_all() when every entry is wanted.
  #
  # If self.lookup_all() yields nothing, its own return value is
  # passed through unchanged.
  def self.lookup(id)
    lookup_all(id) { |entry| return entry }
  end
   
  # Returns ALL COinS found on the page at +id+ (a URL). For
  # Mendeley, this includes the page's own COinS as well as all
  # related papers. Use self.lookup() to find a single one.
  #
  # Two calling conventions are supported:
  # * with a block, each entry is yielded as it is built and the
  #   method returns nil;
  # * without a block, an array of all entries is returned.
  #
  # Returns nil without fetching anything when +id+ has no
  # canonical form. Each entry is produced by
  # self.coins_to_bibtex, so any error it raises (for example
  # for an unexpected ctx_ver) propagates to the caller.
  def self.lookup_all(id)
    # Calculate the canonical identifier.
    url = canonical(id)
    return nil if url.nil?

    # Retrieve the HTML; COinS live in <span class="Z3988"> elements.
    content = Net::HTTP.get(URI(url))
    doc = Nokogiri::HTML(content)
    spans = doc.css('span.Z3988')

    # Only collect into an array when the caller gave no block.
    results = [] unless block_given?

    spans.each do |span|
      coins = span['title']

      # A Z3988 span without a title attribute carries no COinS
      # data; skip it instead of failing inside the parser.
      next if coins.nil?

      bibentry = coins_to_bibtex(coins)

      # Set identifiers so we know where this came from.
      bibentry[:url] = url

      identifiers = bibentry[:identifiers].split("\n")
      identifiers.push(url)
      bibentry.add(:identifiers, identifiers.join("\n"))

      # See if we have a DOI. Accepts the resolver over http or
      # https, with or without the legacy 'dx.' host prefix, as
      # well as the 'doi:' and 'info:doi/' forms. When several
      # identifiers match, the last one wins.
      identifiers.each do |identifier|
        match = identifier.match(%r{^(?:https?://(?:dx\.)?doi\.org/|doi:|info:doi/)(.*)$}i)
        bibentry[:doi] = match[1] if match
      end

      # Yield values or collect them.
      if block_given?
        yield(bibentry)
      else
        results.push(bibentry)
      end
    end

    # The array we built, or nil when entries were yielded instead.
    results
  end

  # Converts a COinS string to a BibTeX entry.
  def self.coins_to_bibtex(coins)
    # Create a BibTeX entry to store these values.
    bibentry = BibTeX::Entry.new

    # If we have COinS data, we have a lot more data.
    coins_kv = CGI::parse(coins)
    metadata = {}
    coins_kv.each do |key, val|
        if val.length == 1 then
            metadata[key] = val[0]
        else
            metadata[key] = val
        end
    end

    # COinS values are explained at http://ocoins.info/cobg.html
    # and in http://ocoins.info/cobgbook.html.

    # If we're not Z3988-2004, skip out.
    ctx_ver = metadata['ctx_ver']
    if ctx_ver != 'Z39.88-2004' then
        ctx_ver = "" if ctx_ver.nil?
        raise "ctx_ver is: '#{ctx_ver}'"
    end

    # Add ALL the identifiers.
    bibentry[:identifiers] = ""
    if metadata.key?('rft_id') then
        if metadata['rft_id'].kind_of?(Array) then
            bibentry[:identifiers] = metadata['rft_id'].join("\n")
        else
            bibentry[:identifiers] = metadata['rft_id']
        end
    end

    # COinS supports some types
    genre = metadata['rft.genre']
    if genre == 'article' then
        bibentry.type = "article"
    elsif genre == 'book' then
        bibentry.type = "book"
    elsif genre == 'bookitem' then
        bibentry.type = "inbook"
    elsif genre == 'proceeding' then
        bibentry.type = "proceedings"
    elsif genre == 'conference' then
        bibentry.type = "inproceedings"
    elsif genre == 'report' then
        bibentry.type = "techreport"
    else
        # Default to misc.
        # There is a COinS genre called 'unknown'
        # which comes here, too.
        bibentry.type = "misc"
    end 
    
    # Journal title: title, jtitle
    journal_title = metadata['rft.title']       # The old-style journal title.
    journal_title ||= metadata['rft.stitle']    # An abbreviated title.
    journal_title ||= metadata['rft.jtitle']    # Complete title.
    bibentry[:journal] = journal_title

    # Book title: btitle
    if metadata.key?('rft.btitle')
        if journal_title
            bibentry[:booktitle] = metadata['rft.btitle']
        else
            bibentry[:title] = metadata['rft.btitle']
        end
    end

    # Pages: spage, epage
    pages = metadata['rft.pages']       # If only pages are provided

    # Expand a single dash to a BibTeX-y double-dash.
    pages.gsub!(/([^\-])\-([^\-])/, '\1--\2') unless pages.nil?

    pages ||= metadata['rft.spage'] + "--" + metadata['rft.epage']
                                    # If we have start and end pages
    bibentry[:pages] = pages

    # Authors are all in 'rft.au'
    authors = []
    metadata['rft.au'] = [ metadata['rft.au'] ] unless metadata['rft.au'].kind_of?(Array)
    metadata['rft.au'].each do |author|
        authors.push(BibTeX::Name.parse(author)) unless author.nil?
    end

    # However! Sometimes a name is in aufirst/aulast
    # and also in au; and sometimes it's only in aufirst/aulast.
    first_author = BibTeX::Name.new
    first_author.last = metadata['rft.aulast']
    first_author.suffix = metadata['rft.ausuffix'] if metadata.key?('rft.ausuffix')
    if metadata.key?('rft.aufirst') then
        first_author.first = metadata['rft.aufirst']
    elsif metadata.key?('rft.auinit') then
        first_author.first = metadata['rft.auinit']
    elsif 
        first_author.first = metadata['rft.auinit1']
        first_author.first += " " + metadata['rftinitm'] if metadata.key?('rftinitm')
    end
    if !authors.include?(first_author) then
        authors.unshift(first_author)
    end

    bibentry[:author] = BibTeX::Names.new(authors)

    # Dates. 
    date = metadata['rft.date']
    bibentry[:date] = date

    # Citeulike dates are easy to parse. 
    unless date.nil? then
        if match = date.match(/^(\d{4})$/) then
            bibentry[:year] = match[1] 

        elsif match = date.match(/^(\d{4})-(\d{1,2})$/) then
            bibentry[:year] = match[1] 
            bibentry[:month] = match[2] 

        elsif match = date.match(/^(\d{4})-(\d{1,2})-(\d{1,2})$/) then
            bibentry[:year] = match[1] 
            bibentry[:month] = match[2] 
            bibentry[:day] = match[3] 

        end
    end

    # Map remaining fields to BibTeX.
    standard_mappings = {
        "rft.atitle" =>     "title",
        "rft.volume" =>     "volume",
        "rft.issue" =>      "number",
        "rft.artnum" =>     "article_number",
        "rft.issn" =>       "issn",
        "rft.eissn" =>      "eissn",
        "rft.isbn" =>       "isbn",
        "rft.coden" =>      "CODEN",
        "rft.sici" =>       "SICI",
        "rft.chron" =>      "chronology",
        "rft.ssn" =>        "season",
        "rft.quarter" =>    "quarter",
        "rft.part" =>       "part",

        "rft.place" =>      "address",
        "rft.pub" =>        "publisher",
        "rft.edition" =>    "edition",
        "rft.tpages" =>     "total_pages",
        "rft.series" =>     "series",
        "rft.bici" =>       "bici"
    }

    standard_mappings.keys.each do |field|
        if metadata.key?(field) then
            bibentry[standard_mappings[field]] = metadata[field]
        end
    end

    return bibentry
  end
end