fairplaysk/datacamp

View on GitHub
lib/etl/regis_extraction_old.rb

Summary

Maintainability
B
4 hrs
Test Coverage
# -*- encoding : utf-8 -*-

require 'fileutils'

module Etl
  # Extraction job that scrapes one organisation detail page of the
  # statistics.sk register (see #document_url), turns the labelled table rows
  # into an attribute hash (#digest) and stores it through
  # Staging::StaRegisMain (#save).
  #
  # NOTE(review): `id` is not defined in this class; it is presumably the
  # current document id supplied by Etl::Extraction -- confirm against the
  # parent class.
  class RegisExtractionOld < Etl::Extraction
    # Label pattern => attribute for rows of the "tabid" table. The label is
    # read from the first cell of a row and the value from the third cell.
    # Order matters: the first pattern that matches a row's label wins.
    IDENTITY_LABELS = [
      [/i(Č|č|c)o/i,      :ico],
      [/meno/i,           :name],
      [/forma/i,          :legal_form],
      [/vzniku/i,         :date_start],
      [/z(a|á|Á)niku/i,   :date_end],
      [/adresa/i,         :address],
      [/okres/i,          :region]
    ].freeze

    # Label pattern => attribute for rows of the "tablist" table. The label is
    # read from the first cell of a row and the value from the second cell.
    CLASSIFICATION_LABELS = [
      [/SK NACE/i,              :activity1],
      [/OKE(Č|č|c)/i,           :activity2],
      [/sektor/i,               :account_sector],
      [/vlastn(i|í|Í)ctva/i,    :ownership],
      [/ve(l|ľ|Ľ)kosti/i,       :size]
    ].freeze

    # Builds the URL of the detail page for the organisation with the given id.
    #
    # @param id [#to_s] value interpolated into the `wxidorg` query parameter
    # @return [String]
    def document_url(id)
      "http://www.statistics.sk/pls/wregis/detail?wxidorg=#{id}"
    end

    # Tells whether the downloaded document looks like a detail page, i.e.
    # whether it contains at least one <div class="telo"> element.
    #
    # @param document [#xpath] parsed document
    # @return [Boolean]
    def is_acceptable?(document)
      document.xpath("//div[@class='telo']").present?
    end

    # Memoized EtlConfiguration record named 'regis_extraction'.
    #
    # NOTE(review): this "old" extraction reads the same configuration name as
    # the job enqueued in #enque_job -- presumably intentional; confirm.
    def config
      @configuration ||= EtlConfiguration.find_by_name('regis_extraction')
    end

    # Extracts the organisation attributes from a parsed detail page.
    #
    # Attributes whose label row is missing are returned as empty strings;
    # dates that are missing or cannot be parsed are returned as nil.
    #
    # @param doc [#xpath] parsed document
    # @return [Hash{Symbol => Object}] attributes in the shape passed to #save
    def digest(doc)
      identity = regis_labelled_values(doc, "//table[@class='tabid']//tr", ".//td[3]", IDENTITY_LABELS)
      classification = regis_labelled_values(doc, "//table[@class='tablist']//tr", ".//td[2]", CLASSIFICATION_LABELS)

      # Only the part before the first '-' is kept. `first.to_s` guards
      # against an empty cell (or a bare "-"), where `split` returns [] and
      # indexing would yield nil.
      legal_form = identity[:legal_form].split('-').first.to_s.strip

      { :doc_id => id,
        :ico => identity[:ico],
        :name => identity[:name],
        :legal_form => legal_form,
        :date_start => regis_parse_date(identity[:date_start]),
        :date_end => regis_parse_date(identity[:date_end]),
        :address => identity[:address].strip,
        :region => identity[:region].strip,
        :activity1 => classification[:activity1],
        :activity2 => classification[:activity2],
        :account_sector => classification[:account_sector],
        :ownership => classification[:ownership],
        :size => classification[:size],
        :date_created => Time.now,
        :source_url => document_url(id).to_s }
    end

    # Persists one digested record.
    #
    # @param procurement_hash [Hash] attributes as produced by #digest
    # @return [Object] whatever Staging::StaRegisMain.create returns
    def save(procurement_hash)
      Staging::StaRegisMain.create(procurement_hash)
    end

    # Enqueues the follow-up job, starting at the id after the current one.
    #
    # NOTE(review): the enqueued job is Etl::RegisExtraction, not this class;
    # kept as in the original -- confirm that this is intended.
    #
    # @param document_id [Object] passed through as the third constructor argument
    def enque_job(document_id)
      Delayed::Job.enqueue Etl::RegisExtraction.new(id + 1, config.batch_limit, document_id)
    end

    private

    # Collects raw cell texts from the rows selected by `rows_xpath`.
    # For every row the label (first cell) is read once and matched against
    # `labels` in order; on the first match the text at `value_xpath` is
    # stored under the mapped attribute. A later matching row overwrites an
    # earlier one. Every attribute in `labels` defaults to ''.
    def regis_labelled_values(doc, rows_xpath, value_xpath, labels)
      values = {}
      labels.each { |_pattern, field| values[field] = '' }

      doc.xpath(rows_xpath).each do |row|
        label = row.xpath(".//td[1]").inner_text
        _pattern, field = labels.find { |pattern, _field| label =~ pattern }
        next unless field

        values[field] = row.xpath(value_xpath).inner_text
      end

      values
    end

    # Parses a date cell; returns nil when the text is empty or is not a
    # recognisable date (Date.parse signals both with ArgumentError).
    def regis_parse_date(text)
      Date.parse(text)
    rescue ArgumentError
      nil
    end
  end
end