datacite/levriero

View on GitHub
app/models/concerns/importable.rb

Summary

Maintainability
C
1 day
Test Coverage
C
77%
module Importable
  extend ActiveSupport::Concern

  included do
    # Applies +attributes+ to this record through +update+ and logs the outcome.
    # A plain attributes hash is used because strong_parameters throws an error.
    #
    # @param attributes [Hash] values handed straight to +update+
    # @return the return value of the logger call that was made
    def update_record(attributes)
      saved = update(attributes)
      label = "#{self.class.name} #{id}"
      return Rails.logger.debug("#{label} updated.") if saved

      Rails.logger.error("#{label} not updated: #{errors.to_a.inspect}")
    end

    # Destroys this record (calling +destroy+ with +refresh: true+) and logs
    # whether that succeeded.
    #
    # @return the return value of the logger call that was made
    def delete_record
      return Rails.logger.debug("#{self.class.name} record deleted.") if destroy(refresh: true)

      Rails.logger.error("#{self.class.name} record not deleted: #{errors.to_a.inspect}")
    end
  end

  module ClassMethods
    # Asks https://doi.org/ra/ which registration agency (RA) a DOI prefix
    # belongs to.
    #
    # @param prefix [String, nil] DOI prefix to look up
    # @return the "RA" value of the first "data" entry; the "errors" payload of
    #   the response when one is present; nil when +prefix+ is blank
    def get_doi_ra(prefix)
      return nil if prefix.blank?

      body = Maremma.get("https://doi.org/ra/#{prefix}").body
      errors = body.fetch("errors", nil)
      return errors if errors.present?

      body.dig("data", 0, "RA")
    end

    # Extracts the bare DOI from a plain DOI, a "doi:"-prefixed DOI or a
    # (dx.)doi.org URL.
    #
    # The resolver host is matched literally: the dot in "doi.org" is escaped,
    # so look-alike hosts such as "doiXorg" are rejected.
    #
    # @param doi [String, nil] candidate value
    # @return [String, nil] the DOI ("10.NNNN/suffix"), or nil when the value
    #   does not match
    def validate_doi(doi)
      pattern = %r{\A(?:https?://(?:dx\.)?doi\.org/)?(?:doi:)?(?<doi>10\.\d{4,5}/.+)\z}
      pattern.match(doi)&.[](:doi)
    end

    # Extracts the DOI prefix ("10.NNNN") from a plain DOI, a "doi:"-prefixed
    # DOI or a (dx.)doi.org URL. A suffix after the slash is required.
    #
    # The resolver host is matched literally: the dot in "doi.org" is escaped,
    # so look-alike hosts such as "doiXorg" are rejected.
    #
    # @param doi [String, nil] candidate value
    # @return [String, nil] the prefix, or nil when the value does not match
    def validate_prefix(doi)
      pattern = %r{\A(?:https?://(?:dx\.)?doi\.org/)?(?:doi:)?(?<prefix>10\.\d{4,5})/.+\z}
      pattern.match(doi)&.[](:prefix)
    end

    # Turns a DOI (in any form accepted by +validate_doi+) into a lowercase
    # https://doi.org/ URL.
    #
    # @param doi [String, nil] candidate value
    # @return [String, nil] the URL, or nil when +validate_doi+ yields nothing
    def normalize_doi(doi)
      bare_doi = validate_doi(doi)
      return nil if bare_doi.blank?

      # drop zero-width spaces (U+200B), then lowercase
      cleaned = bare_doi.delete("\u200B").downcase

      # escape unsafe characters while building the resolver URL
      escaped = Addressable::URI.encode(cleaned)
      "https://doi.org/#{escaped}"
    end

    # Cleans up a URL with PostRank::URI.clean, provided it has a host and one
    # of the supported schemes (http, https, ftp, plus s3 for AWS S3 and gs for
    # Google Cloud Storage).
    #
    # @param id [String, nil] candidate URL
    # @return the cleaned URL, or nil when +id+ is blank, unsupported or
    #   cannot be parsed
    def normalize_url(id)
      return nil if id.blank?

      parsed = Addressable::URI.parse(id)
      supported = parsed&.host && %w[http https ftp s3 gs].include?(parsed.scheme)
      supported ? PostRank::URI.clean(id) : nil
    rescue Addressable::URI::InvalidURIError
      nil
    end

    # Normalizes an arXiv identifier to a cleaned, lowercase URL. Values
    # starting with "arxiv:" (case-insensitive) are expanded to
    # https://arxiv.org/abs/...; anything else must already be an http(s) URL.
    #
    # @param id [String, nil] arXiv identifier or URL
    # @return the cleaned URL, or nil when +id+ is blank, not an http(s) URL
    #   with a host, or cannot be parsed
    def normalize_arxiv(id)
      return nil if id.blank?

      candidate = id.downcase
      if candidate.start_with?("arxiv:")
        candidate = "https://arxiv.org/abs/#{candidate.delete_prefix('arxiv:')}"
      end

      parsed = Addressable::URI.parse(candidate)
      web_url = parsed&.host && %w[http https].include?(parsed.scheme)
      web_url ? PostRank::URI.clean(candidate) : nil
    rescue Addressable::URI::InvalidURIError
      nil
    end

    # Normalizes an IGSN to a cleaned, lowercase URL. Values that do not start
    # with "http" are placed under https://hdl.handle.net/10273/.
    #
    # @param id [String, nil] IGSN or URL
    # @return the cleaned URL, or nil when +id+ is blank, not an http(s) URL
    #   with a host, or cannot be parsed
    def normalize_igsn(id)
      return nil if id.blank?

      candidate = id.downcase
      candidate = "https://hdl.handle.net/10273/#{candidate}" unless candidate.start_with?("http")

      parsed = Addressable::URI.parse(candidate)
      return nil unless parsed&.host && %w[http https].include?(parsed.scheme)

      # don't use the IGSN resolver as it has no support for ssl; whatever
      # follows the "http://igsn.org" prefix is appended unchanged
      if candidate.start_with?("http://igsn.org")
        candidate = "https://hdl.handle.net/10273/#{candidate.delete_prefix('http://igsn.org')}"
      end

      # candidate is already lowercase at this point
      PostRank::URI.clean(candidate)
    rescue Addressable::URI::InvalidURIError
      nil
    end

    # Normalizes a handle to a cleaned, lowercase URL. Values that do not start
    # with "http" are placed under https://hdl.handle.net/.
    #
    # @param id [String, nil] handle or URL
    # @return the cleaned URL, or nil when +id+ is blank, not an http(s) URL
    #   with a host, or cannot be parsed
    def normalize_handle(id)
      return nil if id.blank?

      candidate = id.downcase
      candidate = "https://hdl.handle.net/#{candidate}" unless candidate.start_with?("http")

      parsed = Addressable::URI.parse(candidate)
      resolvable = parsed&.host && %w[http https].include?(parsed.scheme)
      return nil unless resolvable

      # candidate is already lowercase at this point
      PostRank::URI.clean(candidate)
    rescue Addressable::URI::InvalidURIError
      nil
    end

    # Normalizes a PubMed ID to a cleaned, lowercase URL. A leading "pmid:"
    # (case-insensitive) is removed, and values that do not start with "http"
    # are turned into https://identifiers.org/pubmed:<id>.
    #
    # @param id [String, nil] PMID or URL
    # @return the cleaned URL, or nil when +id+ is blank, not an http(s) URL
    #   with a host, or cannot be parsed
    def normalize_pmid(id)
      return nil if id.blank?

      # strip pmid prefix
      candidate = id.downcase.delete_prefix("pmid:")

      # turn PMID into a URL if needed
      candidate = "https://identifiers.org/pubmed:#{candidate}" unless candidate.start_with?("http")

      parsed = Addressable::URI.parse(candidate)
      return nil unless parsed&.host && %w[http https].include?(parsed.scheme)

      # candidate is already lowercase at this point
      PostRank::URI.clean(candidate)
    rescue Addressable::URI::InvalidURIError
      nil
    end

    # Returns everything after "http(s)://orcid.org/" in +url+.
    #
    # @param url [String, nil] ORCID URL
    # @return [String, nil] the part after the host, or nil when +url+ does not
    #   start with http(s)://orcid.org/ followed by at least one character
    def orcid_from_url(url)
      %r{\Ahttps?://orcid\.org/(?<orcid>.+)}.match(url)&.[](:orcid)
    end

    # Prefixes an ORCID iD with https://orcid.org/.
    #
    # @param orcid [String, nil] ORCID iD; it is not validated here
    # @return [String, nil] the URL, or nil when +orcid+ is blank
    def orcid_as_url(orcid)
      return nil if orcid.blank?

      "https://orcid.org/#{orcid}"
    end

    # Extracts an ORCID iD from a bare identifier or an http(s)://(www.)orcid.org/
    # URL and returns it in hyphenated form.
    #
    # The four groups may be separated by hyphens or whitespace; whitespace is
    # replaced by hyphens in the result. The last group must be exactly three
    # digits followed by a single check character (0-9 or X), so over-long
    # values are rejected.
    #
    # @param orcid [String, nil] candidate value
    # @return [String, nil] the hyphenated ORCID iD, or nil when it does not match
    def validate_orcid(orcid)
      separator = "[[:space:]-]"
      pattern = %r{
        \A(?:https?://(?:www\.)?orcid\.org/)?
        (?<orcid>\d{4}#{separator}\d{4}#{separator}\d{4}#{separator}\d{3}[0-9X])\z
      }x
      pattern.match(orcid)&.[](:orcid)&.gsub(/[[:space:]]/, "-")
    end

    # Turns an ORCID iD (in any form accepted by +validate_orcid+) into an
    # https://orcid.org/ URL.
    #
    # @param orcid [String, nil] candidate value
    # @return [String, nil] the URL, or nil when +validate_orcid+ yields nothing
    def normalize_orcid(orcid)
      bare_orcid = validate_orcid(orcid)
      return nil if bare_orcid.blank?

      escaped = Addressable::URI.encode(bare_orcid)
      "https://orcid.org/#{escaped}"
    end

    # Extracts "ror.org/<id>" from a ROR identifier given with or without an
    # http(s):// scheme. The id part must be "0", six word characters and two
    # digits.
    #
    # @param ror_id [String, nil] candidate value
    # @return [String, nil] "ror.org/<id>" without the scheme, or nil when the
    #   value does not match
    def validate_ror(ror_id)
      %r{\A(?:https?://)?(?<ror>ror\.org/0\w{6}\d{2})\z}.match(ror_id)&.[](:ror)
    end

    # Turns a ROR identifier (in any form accepted by +validate_ror+) into an
    # https:// URL.
    #
    # @param ror_id [String, nil] candidate value
    # @return [String, nil] the URL, or nil when +validate_ror+ yields nothing
    def normalize_ror(ror_id)
      bare_ror = validate_ror(ror_id)
      bare_ror.blank? ? nil : "https://#{Addressable::URI.encode(bare_ror)}"
    end

    # Pages through "<API_URL>/<downcased class name>s" and enqueues one
    # ImportJob per record returned.
    #
    # For the "Client" class, the provider id found under
    # relationships.provider.data.id is copied into the record's attributes as
    # "provider_id" before the "relationships" key is dropped.
    #
    # The page counter is advanced locally rather than read back from
    # meta.page, so a response without that key cannot restart the loop at
    # page 1. Pagination ends once the counter exceeds meta."total-pages"
    # (treated as 0 when absent, e.g. on an error response).
    #
    # @return the last meta.total reported by the API (0 if none was reported)
    # @raise [KeyError] when the API_URL environment variable is not set
    def import_from_api
      route = "#{name.downcase}s"
      page_size = 100
      page_number = 1
      total_pages = 1
      total = 0

      # paginate through API results
      while page_number <= total_pages
        params = { "page[number]" => page_number, "page[size]" => page_size }
        url = "#{ENV.fetch('API_URL')}/#{route}?#{URI.encode_www_form(params)}"

        body = Maremma.get(url, content_type: "application/vnd.api+json").body
        errors = body.fetch("errors", nil)
        Rails.logger.error errors.inspect if errors.present?

        records = body.fetch("data", [])
        records.each do |data|
          if name == "Client"
            data["attributes"]["provider_id"] = data.dig("relationships", "provider", "data", "id")
          end

          ImportJob.perform_later(data.except("relationships"))
        end

        processed = (page_number - 1) * page_size + records.size
        Rails.logger.info "#{processed} #{route} processed."

        total = body.dig("meta", "total") || total
        total_pages = body.dig("meta", "total-pages") || 0
        page_number += 1
      end

      total
    end

    # Fetches the DataCite JSON for a DOI via +get_datacite_json+ and pushes the
    # record to each importer whose kind of link it contains:
    #
    # * RelatedIdentifier     - a related identifier of type "DOI"
    # * RelatedUrl            - a related identifier of type "URL"
    # * FunderIdentifier      - a funding reference with a "Crossref Funder ID"
    # * NameIdentifier        - a creator with an ORCID name identifier
    # * AffiliationIdentifier - a creator with both a ROR affiliation and an ORCID
    # * OrcidAffiliation      - a creator with a ROR affiliation
    #
    # Each selection is made immediately before its push, in the order above.
    # A summary line is logged for every non-empty selection (the
    # orcid_affiliations line is tied to its own selection), and
    # "No events found" is logged only when all selections are empty.
    #
    # @param sqs_msg accepted but not used by this method
    # @param data [Hash] must respond to +[]+ with an "id" key holding the DOI
    # @return [Array<Hash>] related identifiers, ORCID creators, funding
    #   references, ROR+ORCID creators and ROR creators, concatenated in that order
    def parse_record(sqs_msg: nil, data: nil)
      doi = data["id"]
      response = get_datacite_json("https://doi.org/#{doi}")

      # a fresh item hash per push; related identifiers use "id", the rest "doi"
      build_item = lambda do |id_key|
        { id_key => doi, "type" => "dois", "attributes" => response }
      end
      has_orcid = lambda do |creator|
        Array.wrap(creator.fetch("nameIdentifiers", nil)).any? do |name_id|
          name_id["nameIdentifierScheme"] == "ORCID"
        end
      end
      has_ror = lambda do |creator|
        Array.wrap(creator.fetch("affiliation", nil)).any? do |affiliation|
          affiliation["affiliationIdentifierScheme"] == "ROR"
        end
      end
      creators = -> { Array.wrap(response.fetch("creators", nil)) }

      related_identifiers = Array.wrap(response.fetch("relatedIdentifiers", nil)).select do |related|
        %w[DOI URL].include?(related["relatedIdentifierType"])
      end
      if related_identifiers.any? { |related| related["relatedIdentifierType"] == "DOI" }
        RelatedIdentifier.push_item(build_item.call("id"))
      end
      if related_identifiers.any? { |related| related["relatedIdentifierType"] == "URL" }
        RelatedUrl.push_item(build_item.call("id"))
      end

      funding_references = Array.wrap(response.fetch("fundingReferences", nil)).select do |funding|
        funding.fetch("funderIdentifierType", nil) == "Crossref Funder ID"
      end
      FunderIdentifier.push_item(build_item.call("doi")) if funding_references.present?

      name_identifiers = creators.call.select { |creator| has_orcid.call(creator) }
      NameIdentifier.push_item(build_item.call("doi")) if name_identifiers.present?

      affiliation_identifiers = creators.call.select do |creator|
        has_ror.call(creator) && has_orcid.call(creator)
      end
      AffiliationIdentifier.push_item(build_item.call("doi")) if affiliation_identifiers.present?

      orcid_affiliation = creators.call.select { |creator| has_ror.call(creator) }
      OrcidAffiliation.push_item(build_item.call("doi")) if orcid_affiliation.present?

      {
        "related_identifiers" => related_identifiers,
        "name_identifiers" => name_identifiers,
        "affiliation_identifiers" => affiliation_identifiers,
        "orcid_affiliations" => orcid_affiliation,
        "funding_references" => funding_references,
      }.each do |label, found|
        Rails.logger.info "[Event Data] #{found.length} #{label} found for DOI #{doi}" if found.present?
      end

      events = related_identifiers + name_identifiers + funding_references +
        affiliation_identifiers + orcid_affiliation
      Rails.logger.info "No events found for DOI #{doi}" if events.blank?

      events
    end

    # Builds (without saving) a new instance from +attributes+, keeping only
    # the keys listed by +safe_params+.
    #
    # @param attributes [Hash] raw attributes, wrapped in ActionController::Parameters
    # @return a new instance of the including class
    def create_record(attributes)
      permitted = ActionController::Parameters.new(attributes).permit(safe_params)
      new(permitted)
    end

    # Returns a copy of +hsh+ whose keys are strings passed through
    # +String#underscore+. NOTE(review): despite the method name, the keys are
    # underscored, not dasherized - confirm this is what callers expect.
    #
    # @param hsh [Hash] hash with string or symbol keys
    # @return [Hash] new hash; +hsh+ itself is not modified
    def to_kebab_case(hsh)
      string_keyed = hsh.stringify_keys
      string_keyed.transform_keys!(&:underscore)
    end
  end
end