fairplaysk/datacamp

View on GitHub
lib/etl/foundation_extraction.rb

Summary

Maintainability
C
1 day
Test Coverage
# -*- encoding : utf-8 -*-
require 'etl/page_loader'

module Etl
  class FoundationExtraction
    # static
    def self.update_last_run_time
      EtlConfiguration.find_by_name('foundation_extraction').update_attribute(:last_run_time, Time.now)
      DatasetDescription.where(identifier: %w{foundations foundation_founders foundation_trustees foundation_liquidators}).update_all(data_updated_at: Time.zone.now)
    end

    attr_reader :doc_id, :document_url, :page_loader

    def initialize(doc_id, params = {})
      @doc_id = doc_id
      @page_loader = params.fetch(:page_loader) { Etl::PageLoader }
      @document_url = params.fetch(:document_url) { "http://www.ives.sk/registre/detailrnd.do?action=uplny&formular=&id=#{doc_id}&locale=sk" }
    end

    def perform
      save(page.foundation.merge(record_status: Dataset::RecordStatus.find(:published)))
    end

    def page
      @page ||= page_loader.load_by_get(document_url, Etl::FoundationExtraction::FoundationPage, document_url: document_url, doc_id: doc_id)
    end

    def save(foundation_hash)
      return unless foundation_hash.present?
      current_foundation = Kernel::DsFoundation.find_or_initialize_by_ives_id(foundation_hash[:ives_id])

      unless current_foundation.new_record?
        foundation_hash[:ds_foundation_founders] = create_resources(:founder, current_foundation.ds_foundation_founders, foundation_hash.delete(:ds_foundation_founders_attributes))
        foundation_hash[:ds_foundation_trustees] = create_resources(:trustee, current_foundation.ds_foundation_trustees, foundation_hash.delete(:ds_foundation_trustees_attributes))
        foundation_hash[:ds_foundation_liquidators] = create_resources(:liquidator, current_foundation.ds_foundation_liquidators, foundation_hash.delete(:ds_foundation_liquidators_attributes))
      end

      current_foundation.update_attributes(foundation_hash)
    end

    def create_resources(resource_type, resources, attributes)
      all_ids = resources.map(&:_record_id)
      active_objects = []
      attributes.each do |obj_hash|
        case resource_type
          when :founder
            obj = resources.find_or_initialize_by_name_and_address_and_identification_number_and_contribution_value_and_contribution_currency_and_contribution_subject(obj_hash[:name], obj_hash[:address], obj_hash[:identification_number], obj_hash[:contribution_value], obj_hash[:contribution_currency], obj_hash[:contribution_subject])
          when :trustee
            obj = resources.find_or_initialize_by_name_and_address_and_trustee_from_and_trustee_to(obj_hash[:name], obj_hash[:address], obj_hash[:trustee_from], obj_hash[:trustee_to])
          when :liquidator
            obj = resources.find_or_initialize_by_name_and_address_and_liquidator_from_and_liquidator_to(obj_hash[:name], obj_hash[:address], obj_hash[:liquidator_from], obj_hash[:liquidator_to])
        end
        # publish resource
        obj.update_attributes(obj_hash.merge(record_status: Dataset::RecordStatus.find(:published)))
        active_objects << obj
      end
      # deleted ids
      active_ids = active_objects.map(&:_record_id)
      old_ids = all_ids - active_ids
      resource_class = case resource_type
                         when :founder
                           Kernel::DsFoundationFounder
                         when :trustee
                           Kernel::DsFoundationTrustee
                         when :liquidator
                           Kernel::DsFoundationLiquidator
                       end
      # Suspend which deleted resources
      resource_class.where(_record_id: old_ids).update_all(record_status: Dataset::RecordStatus.find(:suspended))
      active_objects
    end

    class FoundationPage
      attr_accessor :document_url, :doc_id

      attr_reader :document

      def initialize(document, params = {})
        @document = document
        @params = params
        @document_url = params.fetch(:document_url)
        @doc_id = params.fetch(:doc_id)
      end

      def foundation
        registration_number = identification_number = name = address = objective = ''
        date_start = date_end = date_liquidation = date_registration = ''
        assets_value = assets_currency = nil

        founders = []
        trustees = []
        liquidators = []

        document.xpath("//div[@class='divForm']//table//tr").each do |row|
          if row.xpath(".//td[1]").inner_text.match(/Registra(č|c)n(e|é) (c|č)(í|i)slo/i)
            registration_number = row.xpath(".//td[2]").inner_text
          elsif row.xpath(".//td[1]").inner_text.match(/N(á|a)zov nad(á|a)cie/i)
            name = row.xpath(".//td[2]").inner_text.gsub('"','')
          elsif row.xpath(".//td[1]").inner_text.match(/S(í|i)dlo/i)
            address = row.xpath(".//td[2]").inner_text
          elsif row.xpath(".//td[1]").inner_text.match(/^I(Č|C)O$/i)
            identification_number = row.xpath(".//td[2]").inner_text
          elsif row.xpath(".//td[1]").inner_text.match(/D(á|a)tum vzniku/i)
            date_start = row.xpath(".//td[2]").inner_text
          elsif row.xpath(".//td[1]").inner_text.match(/D(á|a)tum z(á|a)pisu/i)
            date_registration = row.xpath(".//td[2]").inner_text
          elsif row.xpath(".//td[1]").inner_text.match(/D(á|a)tum z(á|a)niku/i)
            date_end = row.xpath(".//td[2]").inner_text
          elsif row.xpath(".//td[1]").inner_text.match(/D(á|a)tum vstupu do likvid(á|a)cie/i)
            date_liquidation = row.xpath(".//td[2]").inner_text
          elsif row.xpath(".//td[1]").inner_text.match(/(Ú|U)(č|c)el nad(á|a)cie/i)
            objective = row.xpath(".//td[2]").inner_text.strip
          elsif row.xpath(".//td[1]").inner_text.match(/Aktu(á|a)lna hodnota nada(č|c)n(é|e)ho imania/i)
            assets = row.xpath(".//td[2]").inner_text[1..-2].strip.split(' ')
            if assets.size > 0
              assets_value = assets.first.to_f
              assets_currency = assets.last
            else
              assets_value = assets_currency = nil
            end
          elsif row.xpath(".//td[1]").inner_text.match(/Zakladatelia/i)
            founders = digest_founders row.xpath(".//td[2]//table")
          elsif row.xpath(".//td[1]").inner_text.match(/(Š|S)tatut(á|a)rny organ:(.)Spr(á|a)vca/i)
            trustees = digest_trustees_or_liquidators row.xpath(".//td[2]//table"), :trustee_from, :trustee_to
          elsif row.xpath(".//td[1]").inner_text.match(/Likvid(á|a)tori/i)
            liquidators = digest_trustees_or_liquidators row.xpath(".//td[2]//table"), :liquidator_from, :liquidator_to
          end
        end

        # Date parsers
        date_start = Date.parse(date_start) rescue nil
        date_registration = Date.parse(date_registration) rescue nil
        date_liquidation = Date.parse(date_liquidation) rescue nil
        date_end = Date.parse(date_end) rescue nil

        url = document_url.to_s

        { ives_id: doc_id,
          name: name,
          identification_number: identification_number,
          registration_number: registration_number,
          address: address,
          date_start: date_start,
          date_end: date_end,
          date_liquidation: date_liquidation,
          date_registration: date_registration,
          objective: objective,
          assets_currency: assets_currency,
          assets_value: assets_value,
          url: url,
          ds_foundation_founders_attributes: founders,
          ds_foundation_trustees_attributes: trustees,
          ds_foundation_liquidators_attributes: liquidators
        }
      end

      def digest_founders(table)
        founders = []
        table.xpath(".//tr").each do |founder_row|
          founder_identification = founder_row.xpath(".//td[1]").inner_text.strip.split("\n")
          founder_contribution = founder_row.xpath(".//td[3]").inner_text[1..-2].strip.split(' ')

          founder_name, founder_address = founder_identification[0].strip, founder_identification[1].strip
          if founder_identification.size > 2
            founder_identification_number = founder_identification[2].scan(/I(Č|C)O:(.)(.*)/).first.last
          else
            founder_identification_number = ''
          end

          if founder_contribution.size > 0
            founder_contribution_value = founder_contribution.first.to_f
            founder_contribution_currency = founder_contribution.last
          else
            founder_contribution_value = founder_contribution_currency = nil
          end
          founder_contribution_subject = founder_row.xpath(".//td[2]").inner_text.split("\n").last.strip.scan(/Predmet:(.)(.*)/).first.last

          founders << {
              name: founder_name,
              address: founder_address,
              identification_number: founder_identification_number,
              contribution_value: founder_contribution_value,
              contribution_currency: founder_contribution_currency,
              contribution_subject: founder_contribution_subject
          }
        end
        founders
      end

      def digest_trustees_or_liquidators(table, name_from, name_to)
        trustees_or_liquidators = []
        table.xpath(".//tr").each do |row|
          identification = row.xpath(".//td[1]").inner_text.strip.split("\n")
          name = identification[0].strip
          address = identification[1].strip
          date_from = row.xpath(".//td[3]").inner_text
          date_to = row.xpath(".//td[5]").inner_text

          date_from = Date.parse(date_from) rescue nil
          date_to = Date.parse(date_to) rescue nil

          trustees_or_liquidator = {
              name: name,
              address: address
          }
          trustees_or_liquidator[name_from] = date_from
          trustees_or_liquidator[name_to] = date_to
          trustees_or_liquidators << trustees_or_liquidator
        end
        trustees_or_liquidators
      end
    end
  end
end