relaton/relaton-gb

View on GitHub
lib/relaton_gb/gb_bibliography.rb

Summary

Maintainability
A
1 hr
Test Coverage
# frozen_string_literal: true

require "relaton_gb/gb_bibliographic_item"
require "relaton_gb/gb_standard_type"
require "relaton_gb/hit_collection"
require "relaton_gb/hit"

# GB bib module.
module RelatonGb
  # GB entry point class.
  class GbBibliography
    class << self
      # rubocop:disable Metrics/MethodLength
      # @param text [Strin] code of standard for search
      # @return [RelatonGb::HitCollection]
      def search(text)
        case text
        when /^(GB|GJ|GS)/
          # Scrape national standards.
          Util.warn "(#{text}) Fetching from openstd.samr.gov.cn ..."
          require "relaton_gb/gb_scrapper"
          GbScrapper.scrape_page text
        # when /^ZB/
          # Scrape proffesional.
        # when /^DB/
          # Scrape local standard.
        # when %r{^Q/}
          # Enterprise standard
        when %r{^T/[^\s]{2,6}\s}
          # Scrape social standard.
          Util.warn "(#{text}) Fetching from www.ttbz.org.cn ..."
          require "relaton_gb/t_scrapper"
          TScrapper.scrape_page text
        else
          # Scrape sector standard.
          # require "relaton_gb/sec_scrapper"
          # SecScrapper.scrape_page text
          []
        end
      end
      # rubocop:enable Metrics/MethodLength

      # @param code [String] the GB standard Code to look up (e..g "GB/T 20223")
      # @param year [String] the year the standard was published (optional)
      # @param opts [Hash] options; restricted to :all_parts if all-parts reference is required
      # @return [String] Relaton XML serialisation of reference
      def get(code, year = nil, opts = {}) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity
        if year.nil?
          /^(?<code1>[^-]+)-(?<year1>[^-]+)$/ =~ code
          unless code1.nil?
            code = code1
            year = year1
          end
        end

        code += ".1" if opts[:all_parts]
        code, year = code.split("-", 2) if code.include?("-")
        ret = get1(code, year, opts)
        return nil if ret.nil?

        ret = ret.to_most_recent_reference unless year
        ret = ret.to_all_parts if opts[:all_parts]
        ret
      end

      private

      def fetch_ref_err(code, year, missed_years) # rubocop:disable Metrics/MethodLength
        # id = year ? "#{code}:#{year}" : code
        # Util.warn "WARNING: No match found on the GB website for `#{id}`. " \
        #           "The code must be exactly like it is on the website."
        unless missed_years.empty?
          Util.warn "(There was no match for `#{year}`, though there " \
                    "were matches found for `#{missed_years.join('`, `')}`.)"
        end
        if /\d-\d/.match? code
          Util.warn "The provided document part may not exist, or " \
                    "the document may no longer be published in parts."
        else
          Util.warn "If you wanted to cite all document parts for the " \
                    "reference, use `#{code} (all parts)`.\nIf the document " \
                    "is not a standard, use its document type abbreviation " \
                    "(TS, TR, PAS, Guide)."
        end
        nil
      end

      def get1(code, year, _opts)
        # search must include year whenever available
        searchcode = code + (year.nil? ? "" : "-#{year}")
        result = search_filter(searchcode) || return
        ret = results_filter(result, year)
        if ret[:ret]
          Util.warn "(#{searchcode}) Found: `#{ret[:ret].docidentifier.first.id}`"
          ret[:ret]
        else
          Util.warn "(#{searchcode}) Not found."
          fetch_ref_err(code, year, ret[:years])
        end
      end

      def search_filter(code)
        # search filter needs to incorporate year
        docidrx = %r{^[^\s]+\s[\d.-]+}
        result = search(code)
        result.select do |hit|
          hit.docref && hit.docref.match(docidrx).to_s.include?(code)
        end
      end

      # Sort through the results from Isobib, fetching them three at a time,
      # and return the first result that matches the code,
      # matches the year (if provided), and which # has a title (amendments do
      # not).
      # Only expects the first page of results to be populated.
      # Does not match corrigenda etc (e.g. ISO 3166-1:2006/Cor 1:2007)
      # If no match, returns any years which caused mismatch, for error
      # reporting
      def results_filter(result, year) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength
        missed_years = []
        result.each_slice(3) do |s| # ISO website only allows 3 connections
          fetch_pages(s, 3).each do |r|
            return { ret: r } if !year

            r.date.select { |d| d.type == "published" }.each do |d|
              return { ret: r } if year.to_i == d.on(:year)

              missed_years << d.on(:year)
            end
          end
        end
        { years: missed_years }
      end

      # @param hits [RelatonBib::HitCollection<RelatonBib::Hit>]
      # @param threads [Integer]
      # @return [Array<RelatonBib::GbBibliographicItem>]
      def fetch_pages(hits, threads)
        workers = RelatonBib::WorkersPool.new threads
        workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
        hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
        workers.end
        workers.result.sort_by { |x| x[:i] }.map { |x| x[:hit] }
      end
    end
  end
end