fairplaysk/datacamp

View on GitHub
lib/etl/lawyer_partnership_extraction.rb

Summary

Maintainability
A
3 hrs
Test Coverage
# -*- encoding : utf-8 -*-

require 'fileutils'

module Etl
  # Downloads a single lawyer-partnership detail page from www.sak.sk, turns
  # the detail table into an attribute hash and stores it as a
  # Kernel::DsLawyerPartnership record. The same class also walks the paginated
  # listing (see #get_downloads) and builds one extraction per listed link.
  class LawyerPartnershipExtraction
    # Prefix prepended to the relative paths found in listing links.
    BASE_URL = 'https://www.sak.sk/'.freeze

    # Location of the detail table; its presence marks a usable detail page.
    DETAIL_TABLE_XPATH = "//div[@class='section']/table[@class='filter']".freeze

    attr_reader :url, :reset_url, :parent_url

    # url        - detail page URL (or the listing URL prefix when the
    #              instance is used for #get_downloads).
    # reset_url  - URL requested before the detail page, see #download.
    # cookie     - value sent in the 'Cookie' request header.
    # parent_url - listing page the detail link was found on.
    # filter     - stored and handed on to the extractions built from listings.
    def initialize(url, reset_url = nil, cookie = nil, parent_url = nil, filter = nil)
      @url = url
      @reset_url = reset_url
      @cookie = cookie
      @parent_url = parent_url
      @filter = filter
    end

    # Fetches the detail page and returns it as a parsed Nokogiri HTML document.
    # When a parent URL, a cookie and a reset URL are all available, the reset
    # and parent pages are requested first with the same cookie.
    # NOTE(review): presumably this replays the listing navigation so the
    # server-side session accepts the detail request - confirm.
    def download
      if @parent_url.present? && @cookie.present? && @reset_url
        fetch_with_cookie(@reset_url)
        fetch_with_cookie(@parent_url)
      end
      Nokogiri::HTML(fetch_with_cookie(@url).body)
    end

    # True when the document contains the partnership detail table.
    def is_acceptable?(document)
      document.xpath(DETAIL_TABLE_XPATH).present?
    end

    # Downloads the page and, if it is a usable detail page, saves the
    # extracted attributes. Returns nil when the page is not acceptable.
    def perform
      document = download
      return unless is_acceptable?(document)

      save(digest(document))
    end

    # Builds the attribute hash for a partnership from the parsed detail page.
    # Values are read positionally: row N of the detail table, second cell.
    # The lawyers linked from the nested table are looked up by the first run
    # of digits in each link's href.
    def digest(doc)
      table = doc.xpath(DETAIL_TABLE_XPATH).first

      member_links = table.xpath(".//div[@class='section']//table[1]//a")
      member_sak_ids = member_links.map { |link| link.attributes['href'].value[/\d+/] }
      lawyers = Kernel::DsLawyer.find_all_by_sak_id(member_sak_ids)

      {
        name: cell_text(table, 1),
        partnership_type: cell_text(table, 2),
        registry_number: cell_text(table, 3),
        ico: cell_text(table, 4),
        dic: cell_text(table, 5),
        street: cell_text(table, 6),
        # Best effort: the city is stored upper-cased, or nil if that fails.
        city: (cell_text(table, 7).mb_chars.upcase rescue nil),
        # All whitespace is removed from the zip code, not just the edges.
        zip: table.xpath('./tr[8]/td[2]').inner_text.gsub(/\s+/, '').strip,
        phone: cell_text(table, 9),
        fax: cell_text(table, 10),
        cell_phone: cell_text(table, 11),
        email: cell_text(table, 12),
        website: website_from(table),
        sak_id: parse_id,
        ds_lawyers: lawyers,
        url: @url,
        is_part_of_import: true
      }
    end

    # Returns +url+ unchanged when it parses as an HTTP or HTTPS URI and nil
    # otherwise (other schemes, relative references, malformed input).
    def validate_url(url)
      # URI::HTTPS is a subclass of URI::HTTP, so is_a? accepts both schemes;
      # an exact class comparison would silently drop every https:// website.
      URI.parse(url).is_a?(URI::HTTP) ? url : nil
    rescue URI::Error
      nil
    end

    # Creates or updates the DsLawyerPartnership identified by the hash's
    # :sak_id. Uses update_attributes!, so an invalid record raises.
    def save(lawyer_partnership_hash)
      record = Kernel::DsLawyerPartnership.find_or_initialize_by_sak_id(lawyer_partnership_hash[:sak_id])
      record.update_attributes!(lawyer_partnership_hash)
    end

    # Returns the first run of digits in the URL as an Integer, or nil when
    # the URL is missing or contains no digits.
    def parse_id
      digits = url.to_s[/\d+/]
      digits && digits.to_i
    end

    # Maps a list of extractions to their parsed ids (nil where none is found).
    def self.map_ids(downloads)
      downloads.map(&:parse_id)
    end

    # Ids of every partnership linked from the paginated listing.
    def get_ids_from_downloads
      # Uses this class's own map_ids rather than reaching into the sibling
      # Etl::LawyerExtraction class.
      self.class.map_ids(get_downloads)
    end

    # Walks the listing pages (@url followed by an offset growing in steps of
    # 10) and returns one extraction per result link. Paging stops once the
    # dimmed (opacity 0.5) button cell contains the right-arrow image.
    # NOTE(review): a page without that marker never ends the loop, and a
    # response without a Set-Cookie header raises - confirm this is acceptable.
    # NOTE(review): the request option name differs from the one used in
    # #download; left as-is because the Typhoeus version is not visible here.
    def get_downloads
      offset = 0
      downloads = []
      loop do
        page_url = @url + offset.to_s
        response = Typhoeus::Request.get(page_url, disable_ssl_peer_verification: true)
        # Keep only the leading name=value pair of the Set-Cookie header.
        cookie = response.headers_hash['Set-Cookie'].match(/[^ ;]*/)[0]
        doc = Nokogiri::HTML(response.body)
        downloads << parse_for_links(doc, @reset_url, cookie, page_url, @filter)
        offset += 10

        dimmed_buttons = doc.xpath("//div[@class='buttonbar']/table//tr[1]//td[@style='opacity: 0.5']").inner_html
        break if dimmed_buttons.match(/but_arrow_right.gif/)
      end
      downloads.flatten
    end

    # Builds an extraction for every link in the listing's result table. The
    # target path is the single-quoted part of each link's href.
    def parse_for_links(doc, reset_url, cookie, parent_url, filter)
      doc.xpath("//div[@class='result']/table//a").map do |link|
        relative_path = link.attributes['href'].value.match(/'(.*)'/)[1]
        LawyerPartnershipExtraction.new("#{BASE_URL}#{relative_path}", reset_url, cookie, parent_url, filter)
      end
    end

    private

    # Performs a GET on +target+ sending the stored cookie.
    # NOTE(review): the ssl_verifypeer: false option is passed on every
    # request - confirm that skipping certificate verification is intended.
    def fetch_with_cookie(target)
      Typhoeus::Request.get(target, headers: { 'Cookie' => @cookie }, ssl_verifypeer: false)
    end

    # Stripped text of the second cell in the given (1-based) table row.
    def cell_text(table, row)
      table.xpath("./tr[#{row}]/td[2]").inner_text.strip
    end

    # Validated href of the website link in row 13, or nil when the row has
    # no link, the link has no href, or the href is not an HTTP(S) URL.
    def website_from(table)
      link = table.xpath('./tr[13]/td[2]/a').first
      href = link && link.attributes['href']
      href && validate_url(href.value)
    end
  end
end