relaton/relaton-doi

View on GitHub
lib/relaton_doi/parser.rb

Summary

Maintainability
D
1 day
Test Coverage
module RelatonDoi
  class Parser
    COUNTRIES = %w[USA].freeze

    TYPES = {
      "book-chapter" => "inbook",
      "book-part" => "inbook",
      "book-section" => "inbook",
      "book-series" => "book",
      "book-set" => "book",
      "book-track" => "inbook",
      "component" => "misc",
      "database" => "dataset",
      "dissertation" => "thesis",
      "edited-book" => "book",
      "grant" => "misc",
      "journal-article" => "article",
      "journal-issue" => "article",
      "journal-volume" => "journal",
      "monograph" => "book",
      "other" => "misc",
      "peer-review" => "article",
      "posted-content" => "dataset",
      "proceedings-article" => "inproceedings",
      "proceedings-series" => "proceedings",
      "reference-book" => "book",
      "reference-entry" => "inbook",
      "report-component" => "techreport",
      "report-series" => "techreport",
      "report" => "techreport",
    }.freeze

    REALATION_TYPES = {
      "is-cited-by" => "isCitedIn",
      "belongs-to" => "related",
      "is-child-of" => "includedIn",
      "is-expression-of" => "expressionOf",
      "has-expression" => "hasExpression",
      "is-manifestation-of" => "manifestationOf",
      "is-manuscript-of" => "draftOf",
      "has-manuscript" => "hasDraft",
      "is-preprint-of" => "draftOf",
      "has-preprint" => "hasDraft",
      "is-replaced-by" => "obsoletedBy",
      "replaces" => "obsoletes",
      "is-translation-of" => "translatedFrom",
      "has-translation" => "hasTranslation",
      "is-version-of" => "editionOf",
      "has-version" => "hasEdition",
      "is-based-on" => "updates",
      "is-basis-for" => "updatedBy",
      "is-comment-on" => "commentaryOf",
      "has-comment" => "hasCommentary",
      "is-continued-by" => "hasSuccessor",
      "continues" => "successorOf",
      "is-derived-from" => "derives",
      "has-derivation" => "derivedFrom",
      "is-documented-by" => "describedBy",
      "documents" => "describes",
      "is-part-of" => "partOf",
      "has-part" => "hasPart",
      "is-review-of" => "reviewOf",
      "has-review" => "hasReview",
      "references" => "cites",
      "is-referenced-by" => "isCitedIn",
      "requires" => "hasComplement",
      "is-required-by" => "complementOf",
      "is-supplement-to" => "complementOf",
      "is-supplemented-by" => "hasComplement",
    }.freeze

    ATTRS = %i[type fetched title docid date link abstract contributor place
               doctype relation extent series medium].freeze

    #
    # Initialize instance.
    #
    # @param [Hash] src The source hash.
    #
    def initialize(src)
      @src = src
      @item = {}
    end

    #
    # Initialize instance and parse the source hash.
    #
    # @param [Hash] src The source hash.
    #
    # @return [RelatonBib::BibliographicItem, RelatonIetf::IetfBibliographicItem,
    #   RelatonBipm::BipmBibliographicItem, RelatonIeee::IeeeBibliographicItem,
    #   RelatonNist::NistBibliographicItem] The bibitem.
    #
    def self.parse(src)
      new(src).parse
    end

    #
    # Parse the source hash.
    #
    # @return [RelatonBib::BibliographicItem, RelatonIetf::IetfBibliographicItem,
    #   RelatonBipm::BipmBibliographicItem, RelatonIeee::IeeeBibliographicItem,
    #   RelatonNist::NistBibliographicItem] The bibitem.
    #
    def parse
      ATTRS.each { |m| @item[m] = send "parse_#{m}" }
      create_bibitem @src["DOI"], @item
    end

    #
    # Create a bibitem from the bibitem hash.
    #
    # @param [String] doi The DOI.
    # @param [Hash] bibitem The bibitem hash.
    #
    # @return [RelatonBib::BibliographicItem, RelatonIetf::IetfBibliographicItem,
    #   RelatonBipm::BipmBibliographicItem, RelatonIeee::IeeeBibliographicItem,
    #   RelatonNist::NistBibliographicItem] The bibitem.
    #
    def create_bibitem(doi, bibitem) # rubocop:disable Metrics/CyclomaticComplexity
      case doi
      when /\/nist/ then RelatonNist::NistBibliographicItem.new(**bibitem)
      when /\/rfc\d+/ then RelatonIetf::IetfBibliographicItem.new(**bibitem)
      when /\/0026-1394\// then RelatonBipm::BipmBibliographicItem.new(**bibitem)
      when /\/ieee/ then RelatonIeee::IeeeBibliographicItem.new(**bibitem)
      else RelatonBib::BibliographicItem.new(**bibitem)
      end
    end

    #
    # Parse the type.
    #
    # @return [String] The type.
    #
    def parse_type
      TYPES[@src["type"]] || @src["type"]
    end

    #
    # Parse the document type
    #
    # @return [String] The document type.
    #
    def parse_doctype
      RelatonBib::DocumentType.new type: @src["type"]
    end

    #
    # Parse the fetched date.
    #
    # @return [String] The fetched date.
    #
    def parse_fetched
      Date.today.to_s
    end

    #
    # Parse titles from the source hash.
    #
    # @return [Array<Hash>] The titles.
    #
    def parse_title # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      if @src["title"].is_a?(Array) && @src["title"].any?
        main_sub_titles
      elsif @src["project"].is_a?(Array) && @src["project"].any?
        project_titles
      elsif @src["container-title"].is_a?(Array) && @src["container-title"].size > 1
        @src["container-title"][0..-2].map { |t| create_title t }
      else []
      end
    end

    #
    # Parse main and subtitle from the source hash.
    #
    # @return [Array<Hash>] The titles.
    #
    def main_sub_titles
      title = @src["title"].map { |t| create_title t }
      RelatonBib.array(@src["subtitle"]).each { |t| title << create_title(t, "subtitle") }
      RelatonBib.array(@src["short-title"]).each { |t| title << create_title(t, "short") }
      title
    end

    #
    # Fetch titles from the projects.
    #
    # @return [Array<Hash>] The titles.
    #
    def project_titles
      RelatonBib.array(@src["project"]).reduce([]) do |memo, proj|
        memo + RelatonBib.array(proj["project-title"]).map { |t| create_title t["title"] }
      end
    end

    #
    # Create a title from the title and type.
    #
    # @param [String] title The title content.
    # @param [String] type The title type. Defaults to "main".
    #
    # @return [RelatonBib::TypedTitleString] The title.
    #
    def create_title(title, type = "main")
      cnt = str_cleanup title
      RelatonBib::TypedTitleString.new type: type, content: cnt, script: "Latn"
    end

    #
    # Parse a docid from the source hash.
    #
    # @return [Array<RelatonBib::DocumentIdentifier>] The docid.
    #
    def parse_docid
      %w[DOI ISBN ISSN].each_with_object([]) do |type, obj|
        prm = type == "DOI"
        RelatonBib.array(@src[type]).each do |id|
          t = issn_type(type, id)
          obj << RelatonBib::DocumentIdentifier.new(type: t, id: id, primary: prm)
        end
      end
    end

    #
    # Create an ISSN type if it's an ISSN ID.
    #
    # @param [String] type identifier type
    # @param [String] id identifier
    #
    # @return [String] identifier type
    #
    def issn_type(type, id)
      return type unless type == "ISSN"

      t = @src["issn-type"]&.find { |it| it["value"] == id }&.dig("type")
      t ? "issn.#{t}" : type.downcase
    end

    #
    # Parce dates from the source hash.
    #
    # @return [Array<RelatonBib::BibliographicDate>] The dates.
    #
    def parse_date # rubocop:disable Metrics/CyclomaticComplexity
      dates = %w[issued published approved].each_with_object([]) do |type, obj|
        next unless @src.dig(type, "date-parts")&.first&.compact&.any?

        obj << RelatonBib::BibliographicDate.new(type: type, on: date_type(type))
      end
      if dates.none?
        dates << RelatonBib::BibliographicDate.new(type: "created", on: date_type("created"))
      end
      dates
    end

    #
    # Join date parts into a string.
    #
    # @param [String] type The date type.
    #
    # @return [String] The date string.
    #
    def date_type(type)
      @src[type]["date-parts"][0].map { |d| d.to_s.rjust(2, "0") }.join "-"
    end

    #
    # Parse links from the source hash.
    #
    # @return [Array<RelatonBib::TypedUri>] The links.
    #
    def parse_link # rubocop:disable Metrics/MethodLength, Metrics/AbcSize, Metrics/CyclomaticComplexity
      disprefered_links = %w[similarity-checking text-mining]
      links = []
      if @src["URL"]
        links << RelatonBib::TypedUri.new(type: "DOI", content: @src["URL"])
      end
      [@src["link"], @src.dig("resource", "primary")].flatten.compact.each do |l|
        next if disprefered_links.include? l["intended-application"]

        type =  case l["URL"]
                when /\.pdf$/ then "pdf"
                # when /\/rfc\d+$|iopscience\.iop\.org|ieeexplore\.ieee\.org/
                else "src"
                end
        links << RelatonBib::TypedUri.new(type: type, content: l["URL"]) # if type
      end
      links
    end

    #
    # Parse abstract from the source hash.
    #
    # @return [Array<RelatonBib::FormattedString>] The abstract.
    #
    def parse_abstract
      return [] unless @src["abstract"]

      content = @src["abstract"]
      abstract = RelatonBib::FormattedString.new(
        content: content, language: "en", script: "Latn", format: "text/html",
      )
      [abstract]
    end

    #
    # Parse contributors from the source hash.
    #
    # @return [Array<RelatonBib::ContributionInfo>] The contributors.
    #
    def parse_contributor
      contribs = author_investigators
      contribs += authors_editors_translators
      contribs += contribs_from_parent(contribs)
      contribs << contributor(org_publisher, "publisher")
      contribs += org_aurhorizer
      contribs + org_enabler
    end

    #
    # Create authors investigators from the source hash.
    #
    # @return [Array<RelatonBib::ContributionInfo>] The authors investigators.
    #
    def author_investigators
      RelatonBib.array(@src["project"]).reduce([]) do |memo, proj|
        memo + create_investigators(proj, "lead-investigator") +
          create_investigators(proj, "investigator")
      end
    end

    #
    # Create investigators from the project.
    #
    # @param [Hash] project The project hash.
    # @param [String] type The investigator type. "lead-investigator" or "investigator".
    #
    # @return [Array<RelatonBib::ContributionInfo>] The investigators.
    #
    def create_investigators(project, type)
      description = type.gsub("-", " ")
      RelatonBib.array(project[type]).map do |inv|
        contributor(create_person(inv), "author", description)
      end
    end

    #
    # Create authors editors translators from the source hash.
    #
    # @return [Array<RelatonBib::ContributionInfo>] The authors editors translators.
    #
    def authors_editors_translators
      %w[author editor translator].each_with_object([]) do |type, a|
        @src[type]&.each do |c|
          contrib = if c["family"]
                      create_person(c)
                    else
                      RelatonBib::Organization.new(name: str_cleanup(c["name"]))
                    end
          a << contributor(contrib, type)
        end
      end
    end

    #
    # Fetch authors and editors from parent if they are not present in the book part.
    #
    # @param [Array<RelatonBib::ContributionInfo>] contribs present contributors
    #
    # @return [Array<RelatonBib::ContributionInfo>] contributors with authors and editors from parent
    #
    def contribs_from_parent(contribs) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      return [] unless %w[inbook inproceedings dataset].include?(parse_type) && @src["container-title"]

      has_authors = contribs.any? { |c| c.role&.any? { |r| r.type == "author" } }
      has_editors = contribs.any? { |c| c.role&.any? { |r| r.type == "editor" } }
      return [] if has_authors && has_editors

      create_authors_editors(has_authors, "author")
    end

    #
    # Fetch parent item from Crossref.
    #
    # @return [Hash, nil] parent item
    #
    def parent_item # rubocop:disable Metrics/AbcSize
      @parent_item ||= begin
        query = [@src["container-title"][0], fetch_year].compact.join "+"
        filter = "type:#{%w[book book-set edited-book monograph reference-book].join ',type:'}"
        resp = Faraday.get "https://api.crossref.org/works?query=#{query}&filter=#{filter}"
        json = JSON.parse resp.body
        json["message"]["items"].detect { |i| i["title"].include? @src["container-title"][0] }
      end
    end

    #
    # Create authors and editors from parent item.
    #
    # @param [Boolean] has true if authors or editors are present in the book part
    # @param [String] type "author" or "editor"
    #
    # @return [Array<RelatonBib::ContributionInfo>] authors or editors
    #
    def create_authors_editors(has, type)
      return [] if has || !parent_item

      RelatonBib.array(parent_item[type]).map { |a| contributor(create_person(a), type) }
    end

    #
    # Cerate an organization publisher from the source hash.
    #
    # @return [RelatonBib::Organization] The organization.
    #
    def org_publisher
      pbr = @src["institution"]&.detect do |i|
        @src["publisher"].include?(i["name"]) ||
          i["name"].include?(@src["publisher"])
      end
      a = pbr["acronym"]&.first if pbr
      RelatonBib::Organization.new name: str_cleanup(@src["publisher"]), abbreviation: a
    end

    #
    # Clean up trailing punctuation and whitespace from a string.
    #
    # @param [String] str The string to clean up.
    #
    # @return [String] The cleaned up string.
    #
    def str_cleanup(str)
      str.strip.sub(/[,\/\s]+$/, "").sub(/\s:$/, "")
    end

    #
    # Parse authorizer contributor from the source hash.
    #
    # @return [Array<RelatonBib::ContributionInfo>] The authorizer contributor.
    #
    def org_aurhorizer
      return [] unless @src["standards-body"]

      name, acronym = @src["standards-body"].values_at("name", "acronym")
      org = RelatonBib::Organization.new name: name, abbreviation: acronym
      [contributor(org, "authorizer")]
    end

    #
    # Parse enabler contributor from the source hash.
    #
    # @return [Array<RelatonBib::ContributionInfo>] The enabler contributor.
    #
    def org_enabler
      RelatonBib.array(@src["project"]).each_with_object([]) do |proj, memo|
        proj["funding"].each do |f|
          memo << create_enabler(f.dig("funder", "name"))
        end
      end + RelatonBib.array(@src["funder"]).map { |f| create_enabler f["name"] }
    end

    #
    # Create enabler contributor with type "enabler".
    #
    # @param [String] name <description>
    #
    # @return [RelatonBib::ContributionInfo] The enabler contributor.
    #
    def create_enabler(name)
      org = RelatonBib::Organization.new name: name
      contributor(org, "enabler")
    end

    #
    # Create contributor from an entity and a role type.
    #
    # @param [RelatonBib::Person, RelatonBib::Organization] entity The entity.
    # @param [String] type The role type.
    #
    # @return [RelatonBib::ContributionInfo] The contributor.
    #
    def contributor(entity, type, descriprion = nil)
      role = { type: type }
      role[:description] = [descriprion] if descriprion
      RelatonBib::ContributionInfo.new(entity: entity, role: [role])
    end

    #
    # Create a person from a person hash.
    #
    # @param [Hash] person The person hash.
    #
    # @return [RelatonBib::Person] The person.
    #
    def create_person(person)
      RelatonBib::Person.new(
        name: create_person_name(person),
        affiliation: create_affiliation(person),
        identifier: person_id(person),
      )
    end

    #
    # Create person affiliations from a person hash.
    #
    # @param [Hash] person The person hash.
    #
    # @return [Array<RelatonBib::Affiliation>] The affiliations.
    #
    def create_affiliation(person)
      (person["affiliation"] || []).map do |a|
        org = RelatonBib::Organization.new(name: a["name"])
        RelatonBib::Affiliation.new organization: org
      end
    end

    #
    # Create a person full name from a person hash.
    #
    # @param [Hash] person The person hash.
    #
    # @return [RelatonBib::FullName] The full name.
    #
    def create_person_name(person)
      surname = titlecase(person["family"])
      sn = RelatonBib::LocalizedString.new(surname, "en", "Latn")
      RelatonBib::FullName.new(
        surname: sn, forename: forename(person), addition: nameaddition(person),
        completename: completename(person), prefix: nameprefix(person)
      )
    end

    #
    # Capitalize the first letter of each word in a string except for words that
    #   are 2 letters or less.
    #
    # @param [<Type>] str <description>
    #
    # @return [<Type>] <description>
    #
    def titlecase(str)
      str.split.map do |s|
        if s.size > 2 && s.upcase == s && !/\.&/.match?(s)
          s.capitalize
        else
          s
        end
      end.join " "
    end

    #
    # Create a person name prefix from a person hash.
    #
    # @param [Hash] person The person hash.
    #
    # @return [Array<RelatonBib::LocalizedString>] The name prefix.
    #
    def nameprefix(person)
      return [] unless person["prefix"]

      [RelatonBib::LocalizedString.new(person["prefix"], "en", "Latn")]
    end

    #
    # Create a complete name from a person hash.
    #
    # @param [Hash] person The person hash.
    #
    # @return [RelatonBib::LocalizedString] The complete name.
    #
    def completename(person)
      return unless person["name"]

      RelatonBib::LocalizedString.new(person["name"], "en", "Latn")
    end

    #
    # Create a forename from a person hash.
    #
    # @param [Hash] person The person hash.
    #
    # @return [Array<RelatonBib::LocalizedString>] The forename.
    #
    def forename(person)
      return [] unless person["given"]

      fname = titlecase(person["given"])
      [RelatonBib::Forename.new(content: fname, language: "en", script: "Latn")]
    end

    #
    # Create an addition from a person hash.
    #
    # @param [Hash] person The person hash.
    #
    # @return [Array<RelatonBib::LocalizedString>] The addition.
    #
    def nameaddition(person)
      return [] unless person["suffix"]

      [RelatonBib::LocalizedString.new(person["suffix"], "en", "Latn")]
    end

    #
    # Create a person identifier from a person hash.
    #
    # @param [Hash] person The person hash.
    #
    # @return [Array<RelatonBib::PersonIdentifier>] The person identifier.
    #
    def person_id(person)
      return [] unless person["ORCID"]

      [RelatonBib::PersonIdentifier.new("orcid", person["ORCID"])]
    end

    #
    # Parse a place from the source hash.
    #
    # @return [Array<RelatonBib::Place>] The place.
    #
    def parse_place # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/AbcSize
      pub_location = @src["publisher-location"] || fetch_location
      return [] unless pub_location

      pls1, pls2 = pub_location.split(", ")
      pls1 = str_cleanup pls1
      pls2 &&= str_cleanup pls2
      if COUNTRIES.include? pls2
        country = RelatonBib::Place::RegionType.new(name: pls2)
        [RelatonBib::Place.new(city: pls1, country: [country])]
      elsif pls2 && pls2 == pls2&.upcase
        region = RelatonBib::Place::RegionType.new(name: pls2)
        [RelatonBib::Place.new(city: pls1, region: [region])]
      elsif pls1 == pls2 || pls2.nil? || pls2.empty?
        [RelatonBib::Place.new(city: pls1)]
      else
        [RelatonBib::Place.new(city: pls1), RelatonBib::Place.new(city: pls2)]
      end
    end

    #
    # Fetch location from container.
    #
    # @return [String, nil] The location.
    #
    def fetch_location # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
      title = @item[:title].first&.title&.content
      qparts = [title, fetch_year, @src["publisher"]]
      query = CGI.escape qparts.compact.join("+").gsub(" ", "+")
      filter = "type:#{%w[book-chapter book-part book-section book-track].join(',type:')}"
      resp = Faraday.get "https://api.crossref.org/works?query=#{query}&filter=#{filter}"
      json = JSON.parse resp.body
      json["message"]["items"].detect do |i|
        i["publisher-location"] && i["container-title"].include?(title)
      end&.dig("publisher-location")
    end

    #
    # Parse relations from the source hash.
    #
    # @return [Array<RelatonBib::DocumentRelation>] The relations.
    #
    def parse_relation # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
      rels = included_in_relation
      @src["relation"].each_with_object(rels) do |(k, v), a|
        type, desc = relation_type k
        RelatonBib.array(v).each do |r|
          rel_item = Crossref.get_by_id r["id"]
          title = rel_item["title"].map { |t| create_title t }
          docid = RelatonBib::DocumentIdentifier.new(id: r["id"], type: "DOI")
          bib = create_bibitem r["id"], title: title, docid: [docid]
          a << RelatonBib::DocumentRelation.new(type: type, description: desc, bibitem: bib)
        end
      end
    end

    #
    # Transform crossref relation type to relaton relation type.
    #
    # @param [String] crtype The crossref relation type.
    #
    # @return [Array<String>] The relaton relation type and description.
    #
    def relation_type(crtype)
      type = REALATION_TYPES[crtype] || begin
        desc = RelatonBib::FormattedString.new(content: crtype)
        "related"
      end
      [type, desc]
    end

    #
    # Create included in relation.
    #
    # @return [Array<RelatonBib::DocumentRelation>] The relations.
    #
    def included_in_relation
      types = %w[
        book book-chapter book-part book-section book-track dataset journal-issue
        journal-value proceedings-article reference-entry report-component
      ]
      return [] unless @src["container-title"] && types.include?(@src["type"])

      @src["container-title"].map do |ct|
        contrib = create_authors_editors false, "editor"
        bib = RelatonBib::BibliographicItem.new(title: [content: ct], contributor: contrib)
        RelatonBib::DocumentRelation.new(type: "includedIn", bibitem: bib)
      end
    end

    #
    # Fetch year from the source hash.
    #
    # @return [String] The year.
    #
    def fetch_year
      d = @src["published"] || @src["approved"] || @src["created"]
      d["date-parts"][0][0]
    end

    #
    # Parse an extent from the source hash.
    #
    # @return [Array<RelatonBib::Locality>] The extent.
    #
    def parse_extent # rubocop:disable Metrics/AbcSize
      extent = []
      extent << RelatonBib::Locality.new("volume", @src["volume"]) if @src["volume"]
      extent << RelatonBib::Locality.new("issue", @src["issue"]) if @src["issue"]
      if @src["page"]
        from, to = @src["page"].split("-")
        extent << RelatonBib::Locality.new("page", from, to)
      end
      extent.any? ? [RelatonBib::LocalityStack.new(extent)] : []
    end

    #
    # Parse a series from the source hash.
    #
    # @return [Arrey<RelatonBib::Series>] The series.
    #
    def parse_series # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      types = %w[inbook incollection inproceedings]
      return [] if !@src["container-title"] || types.include?(@item[:type]) || @src["type"] == "report-component"

      con_ttl = if main_sub_titles.any? || project_titles.any?
                  @src["container-title"]
                elsif @src["container-title"].size > 1
                  sct = @src["short-container-title"]&.last
                  abbrev = RelatonBib::LocalizedString.new sct if sct
                  @src["container-title"][-1..-1]
                else []
                end
      con_ttl.map do |ct|
        title = RelatonBib::TypedTitleString.new content: ct
        RelatonBib::Series.new title: title, abbreviation: abbrev
      end
    end

    #
    # Parse a medium from the source hash.
    #
    # @return [RelatonBib::Mediub, nil] The medium.
    #
    def parse_medium
      genre = @src["degree"]&.first
      return unless genre

      RelatonBib::Medium.new genre: genre
    end
  end
end