relaton/relaton-gb

View on GitHub
lib/relaton_gb/scrapper.rb

Summary

Maintainability
A
0 mins
Test Coverage
# encoding: UTF-8
# frozen_string_literal: true

require "yaml"
require "gb_agencies"

module RelatonGb
  # Common scraping methods.
  module Scrapper
    # Maps the status label found on the scraped page to the stage name
    # handed to RelatonBib::DocumentStatus by #get_status.
    STAGES = {
      "即将实施" => "published",
      "现行" => "activated",
      "废止" => "obsoleted",
      "被代替" => "replaced",
    }.freeze

    # NOTE(review): this initialises the ivar on the Scrapper module object
    # itself; #prefix memoizes into @prefixes of whatever object it runs on.
    # Confirm this initialiser is still needed.
    @prefixes = nil

    # Gathers everything scraped for one standard into a single hash.
    #
    # @param doc [Nokogiri::HTML::Document] page the values are read from
    # @param src [String] stored as the content of the "src" link
    # @param hit [RelatonGb::Hit] supplies #docref and #status
    # @return [Hash] keys are inserted in the order they are assigned below
    def scrapped_data(doc, src, hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
      data = {}
      data[:fetched] = Date.today.to_s
      data[:committee] = get_committee(doc, hit.docref)
      data[:docid] = get_docid(hit.docref)
      data[:title] = get_titles(doc)
      data[:contributor] = get_contributors(doc, hit.docref)
      data[:doctype] = get_type
      data[:docstatus] = get_status(doc, hit.status)
      data[:gbtype] = get_gbtype(doc, hit.docref)
      data[:ccs] = get_ccs(doc)
      data[:ics] = get_ics(doc)
      data[:link] = [{ type: "src", content: src }]
      data[:date] = get_dates(doc)
      data[:language] = ["zh"]
      data[:script] = ["Hans"]
      data[:structuredidentifier] = fetch_structuredidentifier(hit.docref)
      data
    end

    # Wraps the document reference in a single primary identifier of type
    # "Chinese Standard".
    #
    # @param docref [String] used verbatim as the identifier id
    # @return [Array<RelatonBib::DocumentIdentifier>] one-element array
    def get_docid(docref)
      identifier = RelatonBib::DocumentIdentifier.new(
        id: docref,
        type: "Chinese Standard",
        primary: true
      )
      [identifier]
    end

    # Splits the document reference into project and part numbers.
    #
    # The project number is everything up to and including the first run of
    # digits that precedes a dash or dot; the part number is the digits
    # right after a following ".", or an empty string when there is none.
    # A reference the pattern does not match raises NoMethodError.
    #
    # @param docref [String]
    # @return [RelatonIsoBib::StructuredIdentifier]
    def fetch_structuredidentifier(docref)
      project, part = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/).captures
      RelatonIsoBib::StructuredIdentifier.new(
        type: "Chinese Standard", id: docref, prefix: nil,
        project_number: project, part_number: part
      )
    end

    # Builds the publisher contributor of a standard.
    #
    # The agency code is the first whitespace-delimited token of the
    # reference. Unless that token starts with "GB", a trailing "/T" or "/Z"
    # is dropped before the agency lookup.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @param docref [String]
    # @return [Array<Hash>] one { entity:, role: } entry, or [] when no
    #   agency name is found in either language
    def get_contributors(doc, docref)
      name = docref[/^[^\s]+/].to_s
      # Non-destructive sub: nil.to_s is frozen on Ruby >= 2.7, so sub! would
      # raise FrozenError for a reference without a leading token.
      name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
      gbtype = get_gbtype(doc, docref)
      orgs = %w[en zh].map { |lang| org(lang, name, gbtype) }.compact
      return [] if orgs.empty?

      entity = RelatonBib::Organization.new name: orgs
      [{ entity: entity, role: [type: "publisher"] }]
    end

    # Asks GbAgencies for the agency name in one language.
    #
    # @param lang [String] language passed to GbAgencies::Agencies
    # @param name [String] passed as the second argument of standard_agency1
    # @param gbtype [Hash] read for :scope and :mandate
    # @return [Hash, nil] { language:, content: }, or nil when GbAgencies
    #   returns no name
    def org(lang, name, gbtype)
      agencies = GbAgencies::Agencies.new(lang, {}, "")
      agency_name = agencies.standard_agency1(gbtype[:scope], name, gbtype[:mandate])
      { language: lang, content: agency_name } if agency_name
    end

    # Reads the Chinese title and, when present, the English title.
    #
    # The Chinese title is the text of the <b> inside the cell containing
    # '中文标准名称'. The English title is the first run of word/whitespace
    # characters in the text of the cell containing '英文标准名称'; when that
    # run is empty only the Chinese titles are returned.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @return [Array<RelatonBib::TypedTitleString>]
    def get_titles(doc)
      zh_text = doc.at("//td[contains(text(), '中文标准名称')]/b").text
      zh_titles = RelatonBib::TypedTitleString.from_string(zh_text, "zh", "Hans")

      en_cell = doc.at("//td[contains(text(), '英文标准名称')]").text
      en_text = en_cell[/[\w\s]+/].to_s
      if en_text.empty?
        zh_titles
      else
        zh_titles + RelatonBib::TypedTitleString.from_string(en_text, "en", "Latn")
      end
    end

    # @return [DocumentType] document type created with type "standard"
    def get_type
      DocumentType.new(type: "standard")
    end

    # Translates a status label into a document status.
    #
    # When no status is given, the label is read (stripped) from the <span>
    # inside the cell containing '标准状态'.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @param status [String, NilClass] label to use instead of the page's
    # @return [RelatonBib::DocumentStatus, nil] nil when the label has no
    #   entry in STAGES
    def get_status(doc, status = nil)
      label = status || doc.at("//td[contains(., '标准状态')]/span")&.text&.strip
      stage = STAGES[label]
      RelatonBib::DocumentStatus.new(stage: stage) if stage
    end

    private

    # Assembles the GB type descriptor of a standard.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @param ref [String] document reference
    # @return [Hash]
    #   * :scope [String, nil]
    #   * :prefix [String, nil] nil when the prefix lookup finds no entry
    #   * :mandate [String]
    #   * :topic [String] always "other"
    def get_gbtype(doc, ref)
      # get_prefix returns nil for a prefix missing from the prefix table;
      # safe navigation keeps that from raising NoMethodError.
      { scope: get_scope(doc), prefix: get_prefix(ref)&.dig("prefix"),
        mandate: get_mandate(ref), topic: "other" }
    end

    # Reads the CCS code: the text of the div following the one containing
    # '中国标准分类号', with CR, LF and tab characters removed.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @return [Array<String>] one-element array, or [] when the page has no
    #   such element (same convention as #get_ics)
    def get_ccs(doc)
      node = doc.at("//div[contains(text(), '中国标准分类号')]/following-sibling::div")
      return [] unless node

      [node.text.delete("\r\n\t\t")]
    end

    # Reads the ICS code from the element following the '国际标准分类号' label
    # (div/div or dt/dd markup) and splits it on ".".
    #
    # @param doc [Nokogiri::HTML::Document]
    # @return [Array<Hash>] [] when the element is missing or its code is
    #   blank, otherwise one hash with
    #   * :field [String]
    #   * :group [String, nil] right-padded with "0" to three characters
    #   * :subgroup [String, nil]
    def get_ics(doc)
      ics = doc.at("//div[contains(text(), '国际标准分类号')]/following-sibling::div"\
                   " | //dt[contains(text(), '国际标准分类号')]/following-sibling::dd")
      return [] unless ics

      field, group, subgroup = ics.text.delete("\r\n\t\t").split(".")
      # A blank cell yields no field at all: report "no ICS" instead of
      # failing on the missing group below.
      return [] if field.to_s.empty?

      # group is nil for a field-only code, so pad only when it is present.
      [{ field: field, group: group&.ljust(3, "0"), subgroup: subgroup }]
    end

    # Derives the scope from the text of the div following the one that
    # contains '发布单位'.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @return [String, nil] "national" when the text contains '国家标准',
    #   "sector" when a line of it starts with '行业标准', nil otherwise
    #   (including when the element is missing)
    def get_scope(doc)
      issuer = doc.at("//div[contains(., '发布单位')]/following-sibling::div")&.text
      return unless issuer
      return "national" if issuer =~ /国家标准/

      "sector" if issuer =~ /^行业标准/
    end

    # Looks up the prefix-table entry for a reference: takes the first
    # whitespace-delimited token, keeps the part before the first "/", and
    # passes it to #prefix.
    #
    # @param ref [String]
    # @return [Hash{String=>String}, nil] whatever #prefix returns
    def get_prefix(ref)
      leading_token = ref.match(/^[^\s]+/).to_s
      code = leading_token.split("/").first
      prefix(code)
    end

    # Returns the entry for a prefix code from yaml/prefixes.yaml.
    #
    # The file (resolved relative to this file's directory) is loaded on
    # first use and kept in @prefixes for later calls.
    #
    # @param pref [String]
    # @return [Hash{String=>String}]
    def prefix(pref)
      @prefixes ||= begin
        path = File.join(__dir__, "yaml/prefixes.yaml")
        YAML.load_file(path)
      end
      @prefixes[pref]
    end

    # Derives the mandate from the non-whitespace run that follows the first
    # "/" of the reference: "T" means recommended, "Z" means guidelines, and
    # anything else (including no "/" at all) means mandatory.
    #
    # @param ref [String]
    # @return [String] "recommended", "guidelines" or "mandatory"
    def get_mandate(ref)
      suffix = ref[%r{(?<=\/)[^\s]+}].to_s
      { "T" => "recommended", "Z" => "guidelines" }.fetch(suffix, "mandatory")
    end

    # Reads the publication date from the element following the '发布日期'
    # label (div/div or dt/dd markup), with CR, LF and tab characters
    # removed from its text.
    #
    # @param doc [Nokogiri::HTML::Document]
    # @return [Array<Hash>] [] when the page has no such element (same
    #   convention as #get_ics), otherwise one hash with
    #   * :type [String] always "published"
    #   * :on [String] date text as found on the page
    def get_dates(doc)
      date = doc.at("//div[contains(text(), '发布日期')]/following-sibling::div"\
                    " | //dt[contains(text(), '发布日期')]/following-sibling::dd")
      return [] unless date

      [{ type: "published", on: date.text.delete("\r\n\t\t") }]
    end
  end
end