relaton/relaton-gb
lib/relaton_gb/t_scrapper.rb

# encoding: UTF-8
# frozen_string_literal: true

require "cgi"
require "open-uri"
require "net/http"
require "nokogiri"
require "relaton_gb/scrapper"
require "relaton_gb/gb_bibliographic_item"
require "relaton_gb/hit_collection"
require "relaton_gb/hit"

module RelatonGb
  # Social standard scraper.
  module TScrapper
    extend Scrapper

    class << self
      # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
      # @param text [String]
      # @return [RelatonGb::HitCollection]
      def scrape_page(text)
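        # The site indexes T/ standards with an em dash (U+2014), so ASCII
        # hyphens in the query are converted before escaping.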
        search_html = OpenURI.open_uri(
          "http://www.ttbz.org.cn/Home/Standard?searchType=2&key=" \
          "#{CGI.escape(text.tr('-', [8212].pack('U')))}",
        ).read
        header = Nokogiri::HTML search_html
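        # Each result row links from an anchor cell; the fourth preceding cell
        # holds the document reference and the first preceding cell the status.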
        xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
        t_xpath = "../preceding-sibling::td[4]"
        hits = header.xpath(xpath).map do |h|
          docref = h.at(t_xpath).text.gsub(/—/, "-")
          status = h.at("../preceding-sibling::td[1]").text.delete "\r\n"
          pid = h[:href].sub(%r{/$}, "")
          Hit.new pid: pid, docref: docref, status: status, scrapper: self
        end
        HitCollection.new hits
      rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError, Net::OpenTimeout
        raise RelatonBib::RequestError, "Cannot access http://www.ttbz.org.cn/Home/Standard"
      end
      # rubocop:enable Metrics/MethodLength, Metrics/AbcSize

      # @param hit [RelatonGb::Hit] standard's page path
      # @return [RelatonGb::GbBibliographicItem]
      def scrape_doc(hit)
        src = "http://www.ttbz.org.cn#{hit.pid}"
        doc = Nokogiri::HTML OpenURI.open_uri(src), nil, Encoding::UTF_8.to_s
        GbBibliographicItem.new(**scrapped_data(doc, src, hit))
      rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError, Net::OpenTimeout
        raise RelatonBib::RequestError, "Cannot access #{src}"
      end

      private

      # rubocop:disable Metrics/MethodLength
      # @param doc [Nokogiri::HTML::Document]
      # @param src [String]
      # @param hit [RelatonGb::Hit]
      # @return [Hash]
      def scrapped_data(doc, src, hit)
        # docid_xpt  = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
        # status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
        {
          committee: get_committee(doc, hit.docref),
          docid: get_docid(hit.docref),
          title: get_titles(doc),
          doctype: get_type,
          docstatus: get_status(doc, hit.status),
          gbtype: gbtype,
          ccs: get_ccs(doc),
          ics: get_ics(doc),
          link: [{ type: "src", content: src }],
          date: get_dates(doc),
          language: ["zh"],
          script: ["Hans"],
          structuredidentifier: fetch_structuredidentifier(hit.docref),
        }
      end
      # rubocop:enable Metrics/MethodLength

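      # Fetch the issuing technical committee (the social group name).
      # @param doc [Nokogiri::HTML::Document]
      # @param _ref [String] document reference, unused
      # @return [Hash]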
      def get_committee(doc, _ref)
        {
          name: doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text,
          type: "technical",
        }
      end

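      # Fetch the Chinese title and, when present, the English title.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<RelatonBib::TypedTitleString>]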
      def get_titles(doc)
        xpz = '//td[contains(.,"中文标题")]/following-sibling::td[1]'
        titles = RelatonBib::TypedTitleString.from_string(
          doc.at(xpz).text, "zh", "Hans"
        )
        xpe = '//td[contains(.,"英文标题")]/following-sibling::td[1]'
        ten = doc.xpath(xpe).text
        return titles if ten.empty?

        titles + RelatonBib::TypedTitleString.from_string(ten, "en", "Latn")
      end

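      # @return [Hash] GB standard type attributes for a social group standard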
      def gbtype
        { scope: "social-group", prefix: "T", mandate: "mandatory",
          topic: "other" }
      end

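      # Fetch the Chinese Classification for Standards (CCS) code.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<String>]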
      def get_ccs(doc)
        [doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
          .text.gsub(/[\r\n]/, "").strip.match(/^[^\s]+/).to_s]
      end

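      # Fetch the International Classification for Standards (ICS) code.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>]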
      def get_ics(doc)
        xpath = '//td[contains(.,"国际标准分类号")]/following-sibling::td[1]/span'
        ics = doc.xpath(xpath).text.match(/^[^\s]+/).to_s
        field, group, subgroup = ics.split "."
        [{ field: field, group: group.ljust(3, "0"), subgroup: subgroup }]
      end

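      # Fetch the publication date.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>]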
      def get_dates(doc)
        d = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span')
          .text.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
        [{ type: "published", on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
      end
    end
  end
end