lib/relaton_omg/scraper.rb from relaton/relaton-omg

lib/relaton_omg/scraper.rb
Summary

Maintainability

1 hr
Test Coverage

Issues
require "nokogiri"

module RelatonOmg
  # Fetches a specification page from omg.org and extracts the attributes
  # that are passed to OmgBibliographicItem.new.
  class Scraper
    URL_PATTERN = "https://www.omg.org/spec/".freeze

    # Grammar of an accepted reference: the literal "OMG ", an acronym, then
    # an optional version (digits and dots, optionally followed by "beta" and
    # a single digit) and an optional spec name, each introduced by either a
    # whitespace character or a slash.
    REF_PATTERN = %r{^OMG (?<acronym>[^\s]+)(?:[\s/](?<version>[\d.]+(?:\sbeta(?:\s\d)?)?))?(?:[\s/](?<spec>\w+))?$}.freeze

    # @param acronym [String] specification acronym, appended to URL_PATTERN
    # @param version [String, nil] version; spaces are turned into URL slashes
    # @param spec [String, nil] name of a document within the specification
    def initialize(acronym, version = nil, spec = nil)
      @acronym = acronym
      @version = version
      @spec = spec
    end

    # Parses a reference, downloads the matching page and builds an item.
    #
    # @param ref [String] reference such as "OMG AMI4CCM 1.0"
    # @return [OmgBibliographicItem, nil] nil when the reference does not
    #   match REF_PATTERN, the page could not be loaded, or no link was found
    # @raise [RelatonBib::RequestError] see {#get_doc}
    def self.scrape_page(ref)
      parts = REF_PATTERN.match(ref)
      return unless parts

      scraper = new(parts[:acronym], parts[:version], parts[:spec])
      return if scraper.get_doc.nil? || scraper.fetch_link.empty?

      OmgBibliographicItem.new(**scraper.item)
    end

    # Builds the page URL from the acronym and version, downloads it and
    # stores the parsed document for the fetch_* methods.
    #
    # @return [Object, nil] the parsed document, or nil when the URL is
    #   invalid or the server answers with status 404
    # @raise [RelatonBib::RequestError] on any other HTTP error status or
    #   when opening the connection times out
    def get_doc
      @url = "#{URL_PATTERN}#{@acronym}/"
      @url += @version.tr(" ", "/") if @version
      @doc = Nokogiri::HTML OpenURI.open_uri(@url, open_timeout: 10)
    rescue URI::InvalidURIError
      nil
    rescue OpenURI::HTTPError => e
      return if e.io.status[0] == "404"

      raise RelatonBib::RequestError, "Unable to access #{@url} (#{e.io.status.join(' ')})"
    rescue Net::OpenTimeout => e
      # A timeout carries no HTTP response (no `io`), so report its own message.
      raise RelatonBib::RequestError, "Unable to access #{@url} (#{e.message})"
    end

    # @return [Hash] keyword arguments for OmgBibliographicItem.new
    def item
      {
        id: fetch_id,
        fetched: Date.today.to_s,
        docid: fetch_docid,
        title: fetch_title,
        abstract: fetch_abstract,
        version: fetch_version,
        date: fetch_date,
        docstatus: fetch_status,
        link: fetch_link,
        relation: fetch_relation,
        keyword: fetch_keyword,
        license: fetch_license,
      }
    end

    # @return [String] acronym, page version and spec name concatenated
    def fetch_id
      "#{@acronym}#{doc_version}#{@spec}"
    end

    # @return [Array<RelatonBib::TypedTitleString>] a single "main" title; the
    #   spec name is appended after ": " when one was requested
    def fetch_title
      content = @doc.at('//dt[.="Title:"]/following-sibling::dd').text
      content = "#{content}: #{@spec}" if @spec
      title = RelatonBib::FormattedString.new content: content, language: "en", script: "Latn"
      [RelatonBib::TypedTitleString.new(type: "main", title: title)]
    end

    # @return [Array<RelatonBib::DocumentIdentifier>] a single primary "OMG"
    #   identifier made of "OMG", the acronym and, when present, the page
    #   version and the spec name, joined by spaces
    def fetch_docid
      id = ["OMG", @acronym]
      id << doc_version if doc_version
      id << @spec if @spec
      [RelatonBib::DocumentIdentifier.new(id: id.join(" "), type: "OMG", primary: true)]
    end

    # @return [Array<Hash>] a single abstract hash with content, language
    #   and script keys
    def fetch_abstract
      content = @doc.at('//section[@id="document-metadata"]/div/div/p').text
      [{ content: content, language: "en", script: "Latn" }]
    end

    # @return [Array<RelatonBib::BibliographicItem::Version>]
    def fetch_version
      [RelatonBib::BibliographicItem::Version.new(pub_date, doc_version)]
    end

    # Version text shown on the page (memoized once found).
    #
    # @return [String, nil] nil when the page has no matching version element
    def doc_version
      @doc_version ||= @doc.at('//dt[.="Version:"]/following-sibling::dd/p/span')&.text
    end

    # @return [Array<Hash>] a single "published" date hash
    def fetch_date
      [{ type: "published", on: pub_date.to_s }]
    end

    # Publication date parsed from the page (memoized).
    #
    # @return [Date]
    def pub_date
      @pub_date ||= Date.parse(
        @doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text.strip,
      )
    end

    # @return [RelatonBib::DocumentStatus] status whose stage is the first
    #   run of word characters in the "Document Status:" entry
    def fetch_status
      status = @doc.at('//dt[.="Document Status:"]/following-sibling::dd')
      stage = status.text.strip.match(/\w+/).to_s
      RelatonBib::DocumentStatus.new(stage: stage)
    end

    # Collects the document links (memoized, so the page is queried once).
    #
    # With a spec name only the anchor pointing at "<url>/<spec>/PDF" is
    # looked up; otherwise the "This Document:" link and the download link.
    #
    # @return [Array<Hash>] hashes with :type ("src" or "pdf") and :content;
    #   empty when no matching anchor exists
    def fetch_link
      return @links if @links

      @links = []
      if @spec
        a = @doc.at("//a[@href='#{@url}/#{@spec}/PDF']")
        @links << { type: "src", content: a[:href] } if a
      else
        a = @doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
        @links << { type: "src", content: a[:href] } if a
        pdf = @doc.at('//a[@class="download-document"]')
        @links << { type: "pdf", content: pdf[:href] } if pdf
      end
      @links
    end

    # Builds an "obsoletes" relation for every row of the History table
    # whose version differs from the version of this page.
    #
    # @return [Array<Hash>] hashes with :type and :bibitem keys
    def fetch_relation
      rows = @doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
      rows.each_with_object([]) do |row, relations|
        ver = row.at("td").text
        next if ver == doc_version

        # Fifth "/"-separated segment of the link; for URLs shaped like
        # URL_PATTERN that is the acronym following "/spec/".
        acronym = row.at("td[3]/a")[:href].split("/")[4]
        fref = RelatonBib::FormattedRef.new content: "OMG #{acronym} #{ver}"
        bibitem = OmgBibliographicItem.new formattedref: fref
        relations << { type: "obsoletes", bibitem: bibitem }
      end
    end

    # @return [Array<String>] text of every entry listed under "Categories:"
    def fetch_keyword
      @doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em').map(&:text)
    end

    # @return [Array<String>] for each "IPR Mode" entry, the first run of
    #   word characters, whitespace and hyphens, stripped
    def fetch_license
      @doc.xpath(
        '//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span',
      ).map { |l| l.text.match(/[\w\s-]+/).to_s.strip }
    end
  end
end