fairplaysk/datacamp

View on GitHub
lib/etl/vvo_bulletin_downloader.rb

Summary

Maintainability
A
3 hrs
Test Coverage
# -*- encoding : utf-8 -*-

module Etl
  class VvoBulletinDownloader
    include Etl::Shared::VvoIncludes

    attr_reader :document_url

    def initialize(document_url)
      @document_url = document_url
    end

    def config
      @configuration ||= EtlConfiguration.find_by_name('vvo_extraction')
    end

    def download
      Nokogiri::HTML(Typhoeus::Request.get(document_url).body)
    end

    def document
      @document ||= download
    end

    # Oznámenia o výsledku verejného obstarávania
    # V = 21751

    # Informácie o uzavretí zmluvy
    # IP = 2937

    # Informácia o plnení zmluvy (5885)
    # IZ = 3888
    # ID = 1997

    # 30573

    def perform
      document_ids = []
      css_links = document.css('#layout-column_column-2 .portlet-body a')

      any_valid_link = false
      css_links.each do |css_link|
        valid_link = css_link.inner_text.match(/(\d*)(\ )(\-)(\ )(...)(\ )(\:)(\ )(\D*)/u)

        any_valid_link = true if valid_link

        if valid_link && (valid_link[5].start_with?('V') || valid_link[5].start_with?('IP') || valid_link[5].start_with?('IZ') || valid_link[5].start_with?('ID'))
          href = css_link.attributes['href'].text
          document_id = href.match(/(\D*)(\/)(\d*)(\D*)/u)[3].to_i
          document_ids << document_id
        end

      end

      unless any_valid_link
        Delayed::Job.enqueue Etl::VvoBulletinDownloader.new(document_url)
      end

      document_ids.each do |document_id|
        Etl::VvoBulletinDownloader.delay.download_document(document_id)
      end
    end

    def after(job)
    end

    # ----

    MIN_SIZE = 1000.0

    def self.download_document(id)
      url = "http://www2.uvo.gov.sk/sk/evestnik/-/vestnik/#{id}"

      file_path = get_path("vvo/procurements/#{id}.html")
      return if File.exist?(file_path) && File.size(file_path) > MIN_SIZE

      html = Typhoeus::Request.get(url).body

      if html.size < 1000
        Etl::VvoBulletinDownloader.delay.download_document(id)
        return
      end

      save("vvo/procurements/#{id}.html", html)
    end

    def self.save(name, html, options = {})
      filename = get_path(name, options)
      FileUtils.mkdir_p(File.dirname(filename))
      f = File.open(filename, "wb")
      begin
        f.write html
      ensure
        f.close
      end
    end

    def self.get_path(path, options = {})
      defaults = {
          :bucketize => true
      }
      options = defaults.merge(options)

      if options[:bucketize]
        path_chunks = path.split('/')
        output = []
        path_chunks.each do |chunk|
          numbers = chunk.scan(/[0-9]{3,}/).collect do |part|
            part.to_i / 1000 * 1000
          end
          output += numbers if numbers.any?
          output << chunk
        end
        "#{Rails.root}/data/#{Rails.env}/#{output.join('/')}"
      else
        "#{Rails.root}/data/#{Rails.env}/#{path}"
      end
    end

    # ----

    def self.config
      @configuration ||= EtlConfiguration.find_by_name('vvo_extraction')
    end

    def self.download_all_bulletins(year)
      all_bulletins_in_year(year).each do |bulletin_url|
        Delayed::Job.enqueue Etl::VvoBulletinDownloader.new(bulletin_url)
      end
    end

    def self.all_bulletins_in_year(in_year)
      document_url = "http://www2.uvo.gov.sk/sk/evestnik/-/vestnik/all"
      document = Nokogiri::HTML(Typhoeus::Request.get(document_url).body)

      links = []
      document.css('#layout-column_column-2 .portlet-body a').each_with_index do |css_link|
        if css_link.inner_text.include? '/'
          href = css_link.attributes['href'].text
          bulletin_and_year = css_link.inner_text.gsub(/ /, '').match(/.*?(\d*)\/(\d*)/u)
          year = bulletin_and_year[2].to_i
          links << href if year == in_year
        end
      end

      links
    end

  end
end