fairplaysk/datacamp

View on GitHub
app/models/parsers/donations_parser/parser.rb

Summary

Maintainability
B
6 hrs
Test Coverage
# -*- encoding : utf-8 -*-
require 'csv'

module Parsers
  module DonationsParser
    class Parser
      ATTRIBUTES = [
                      :year,
                      :name,
                      :address,
                      :town,
                      :zip_code,
                      :district,
                      :region,
                      :in,
                      :number,
                      :contract_date,
                      :decision_date,
                      :project_name,
                      :program_number,
                      :requested,
                      :approved,
                      :currency
                    ]

      def self.parse_location(year)
        "registerkultury.gov.sk/proposals/#{year}.csv"
      end

      def self.parse(html, year)
        ::Parsers::Support.save(parse_location(year), 'csv', bucketize: false)
        CSV.open(::Parsers::Support.get_path(parse_location(year), bucketize: false), "wb", force_quotes: true) do |csv|
          csv << ATTRIBUTES

          doc = Nokogiri::HTML(html)
          doc.css('table table table table table tr, table.mensiepismo tr').each do |tr| # mensiepismo for 2007
            tds =  tr.css('td')
            if tds[0] and tds[0].text =~ /^\s*\d+/
              proposal = {}
              proposal[:year] = year

              if year < 2013
                proposal.merge! parse_recipient_2007_2012(tds[1])
              else
                proposal.merge! parse_recipient_2013(tds[1])
              end

              town, zip_code = parse_town_and_zip_code(proposal[:address])
              proposal[:town] = town
              proposal[:zip_code] = zip_code

              if proposal[:project_name].present? # different columns for 2007
                proposal[:program_number] = clean_string(tds[2].text)
                proposal[:requested], proposal[:requested_currency] = parse_donation(tds[3])
                proposal[:approved], proposal[:approved_currency] = parse_donation(tds[4])
              else
                proposal[:project_name] = clean_string(tds[2].text)
                proposal[:program_number] = clean_string(tds[3].text)
                proposal[:requested], proposal[:requested_currency] = parse_donation(tds[4])
                proposal[:approved], proposal[:approved_currency] = parse_donation(tds[5])
              end

              proposal[:currency] = proposal[:requested_currency] || proposal[:approved_currency]

              csv << ATTRIBUTES.map{ |a| proposal.fetch(a, nil) }
            end
          end
        end
      end

      private
        def self.parse_town_and_zip_code(address)
          town_and_zip_code = address.split(',').last
          zip_code = town_and_zip_code[0..5].gsub(' ','').strip
          town = town_and_zip_code[6..-1].strip
          return town, zip_code
        end

        def self.parse_recipient_2013(td)
          proposal = {}
          e = td.css('span.cervene').first
          return proposal unless e
          proposal[:name] = clean_string(e.text)
          e = td.children[3]
          proposal[:address] = clean_string(e.children[0].text, :remove_trailing_comma => true)
          it = 1
          while it < e.children.count
            case e.children[it].text.strip
              when 'okres:'
                proposal[:district] = clean_string(e.children[it+1].text)
                it+= 1
              when 'kraj:'
                proposal[:region] = clean_string(e.children[it+1].text)
                it+= 2
              when 'IČO:'
                proposal[:in] = clean_string(e.children[it+1].text, :remove_spaces => true)
                it+= 2
              when "evid. číslo žiadosti:"
                proposal[:number] = clean_string(e.children[it+1].text)
                it+= 2
              when "dátum rozhodnutia:"
                proposal[:decision_date] = clean_string(e.children[it+1].text)
                it+= 2
              when 'dátum uzatvorenia zmluvy:'
                proposal[:contract_date] = clean_string(e.children[it+1].text)
                it+= 2
            end
            it+= 1
          end
          proposal
        end

        def self.parse_recipient_2007_2012(td)
          proposal = {}
          e = td.css('span.cervene').first
          return proposal unless e
          proposal[:name] = clean_string(e.text)
          e = e.next.next
          proposal[:address] = clean_string(e.text, :remove_trailing_comma => false)
          if proposal[:address][-1] == ','
            e = e.next.next
            proposal[:address] << clean_string(e.text)
          end
          while e.next
            e = e.next
            case e.text.strip
              when 'okres:'
                proposal[:district] = clean_string(e.next.text)
              when 'kraj:'
                proposal[:region] = clean_string(e.next.text)
              when 'IČO:'
                proposal[:in] = clean_string(e.next.text, :remove_spaces => true)
              when "evidenčné číslo žiadosti:"
                proposal[:number] = clean_string(e.next.text)
              when 'dátum uzatvorenia zmluvy:'
                proposal[:contract_date] = clean_string(e.next.text)
              when /^Projekt: (.*)/ # Projekt: for 2007
                proposal[:project_name] = clean_string($1)
            end
          end
          proposal
        end

        def self.parse_donation(e)
          amount = e.children.first.text.gsub(/(\d)\D(\d)/, '\1\2').to_i
          currency = nil
          if amount > 0
            currency = e.children.first.text.gsub(/(\d{1,3}.)+(\w+)/, '\2').strip
            currency = (currency == 'Eur' ? 'EUR' : 'SKK')
          end
          [amount, currency]
        end

        def self.clean_string(str, options = {})
          options.reverse_merge!({
            :remove_trailing_comma => true,
            :remove_spaces => false
          })
          str = str.strip.squeeze(' ').tr("\n\t\r", '')
          str = str.sub(/,\Z/, '') if options[:remove_trailing_comma]
          str = str.tr(' ', '') if options[:remove_spaces]
          str
        end
    end
  end
end