fairplaysk/datacamp

View on GitHub
lib/etl/vvo_v2/contract_information_parser.rb

Summary

Maintainability
F
1 wk
Test Coverage
# -*- encoding : utf-8 -*-

module Etl
  module VvoV2
    # Mixin that extracts the "ODDIEL II" (section II) fields of a procurement
    # notice into a plain Hash.
    #
    # The including object is expected to provide (none of these are defined
    # in this module):
    #   * document           - parsed document responding to #xpath
    #   * document_format    - :format1, :format2 or anything else
    #   * next_element(node) - following element of +node+, or nil
    #   * strip_last_point(string)
    # The code also calls String#blank? / String#present? (presumably
    # ActiveSupport - TODO confirm).
    module ContractInformationParser

      # XPath, relative to a format2 table row, of the value spans of that row.
      CONTRACT_INFORMATION_VALUE_XPATH = ".//td[2]//span[@class='hodnota']".freeze

      # Parses section II of the notice.
      #
      # Returns a Hash that may contain the keys :project_name, :project_type,
      # :project_category, :place_of_performance, :nuts_code,
      # :general_contract, :project_description, :dictionary_main_subjects,
      # :dictionary_additional_subjects and :gpa_agreement. Keys whose source
      # rows are not found are simply absent. An unrecognised document_format
      # yields an empty Hash.
      def contract_information
        case document_format
        when :format1 then contract_information_from_format1
        when :format2 then contract_information_from_format2
        else {}
        end
      end

      private

      # format1: section II is a <fieldset> whose <legend> names the section;
      # every item is introduced by a <span class="code"> such as "II.1.1".
      def contract_information_from_format1
        info = {}

        document.xpath("//fieldset[@class='fieldset']/legend").each do |legend|
          next unless legend.inner_text.match(/ODDIEL\s+II\W/)

          section = legend.next_sibling.next_sibling
          section.xpath(".//span[@class='code']").each do |code|
            code_text = code.inner_text.strip
            header_text = code.parent.xpath(".//span[@class='title']").inner_text

            if code_text.match(/^II.1.1/)
              # name of the contract
              info[:project_name] = next_element(code.parent).inner_text.strip

            elsif code_text.match(/^II.1.2/) && header_text.match(/Druh zákazky/)
              # contract type, category, place of performance, NUTS code
              contract_information_format1_type_and_place(next_element(code.parent), info)

            elsif code_text.match(/^II.1.3/) && header_text.match(/Informácie o rámcovej dohode/)
              general_contract = code.parent.next_sibling.next_sibling.inner_text.strip
              info[:general_contract] = (general_contract == "Oznámenie zahŕňa uzavretie rámcovej dohody")

            elsif code_text.match(/^II.1.3/) && header_text.match(/Oznámenie zahŕňa/)
              general_contract = next_element(code.parent).inner_text.strip
              info[:general_contract] = (general_contract == "Uzatvorenie rámcovej dohody")

            elsif (code_text.match(/^II.1.2/) || code_text.match(/^II.1.4/)) && header_text.match(/Stručný opis/)
              # the short description appears under II.1.2 (e.g. 187656) or II.1.4
              info[:project_description] = next_element(code.parent).inner_text.strip

            elsif (code_text.match(/^II.1.3/) || code_text.match(/^II.1.5/)) && (header_text.match(/Spoločný slovník obstarávania/) || header_text.match(/CPV/))
              contract_information_format1_cpv(code.parent.next_sibling.next_sibling, info)

            elsif code_text.match(/^II.1.6/) && (header_text.match(/Informácie o rámcovej dohode/) || header_text.match(/GPA/))
              gpa_agreement = code.parent.next_sibling.next_sibling.xpath(".//span")
              info[:gpa_agreement] = !gpa_agreement.inner_text.match(/Nie/)
            end
          end
        end

        info
      end

      # format1, item II.1.2: walks the siblings after +type_element+ to fill
      # :project_type, :project_category, :place_of_performance and :nuts_code.
      # The numbers in the comments are presumably ids of sample notices that
      # exhibit the described layout.
      def contract_information_format1_type_and_place(type_element, info)
        # Special case (237455, 238166): the first line already mentions the
        # service category.
        first_line_category = false
        if type_element.inner_text.match(/Kategória služby číslo/)
          info[:project_type] = type_element.xpath(".//span[1]").inner_text.strip
          first_line_category = true
        else
          info[:project_type] = type_element.inner_text.strip
        end

        # Category. place_is_next marks that the place of performance follows
        # the category element immediately (no blank node to skip).
        place_is_next = false
        if first_line_category
          following = type_element.next_sibling.next_sibling
          if following.inner_text.match(/Kategória služby číslo/)
            # 238166
            category_element = following
            info[:project_category] = category_element.xpath(".//span[2]").inner_text.strip
          else
            # 237455 - no separate category element; :project_category stays unset
            category_element = type_element
          end
        else
          category_element = type_element.next_sibling
          if category_element.inner_text.strip.blank?
            category_element = category_element.next_sibling
          end
          if type_element.inner_text.match(/Tovary/) && category_element.inner_text.match(/Tovary/)
            category_element = next_element(category_element)
            place_is_next = true
          end
          content = category_element.inner_text.strip
          if category_element.inner_text.match(/Kategórie služieb sú uvedené v prílohe C1/)
            content = category_element.xpath(".//span[2]").inner_text.strip
          elsif category_element.inner_text.match(/Kategória služby číslo/)
            category_element = next_element(category_element)
            content = category_element.xpath(".//span[2]").inner_text.strip
          end
          info[:project_category] = content
        end

        # Place of performance: the next sibling, skipping one blank node
        # unless the place is known to follow directly.
        place_element = category_element.next_sibling
        if !place_is_next && place_element.inner_text.strip.blank?
          place_element = place_element.next_sibling
        end

        # 185547 - an explanatory "V prípade zmluvy" element precedes the place
        if place_element.inner_text.match(/V prípade zmluvy/)
          place_element = next_element(place_element)
        end
        header1 = place_element.xpath(".//span[1]").inner_text
        header2 = place_element.inner_text
        if header1.match(/miesto/) || header1.match(/stavenisko/) || header1.match(/lokalita/)
          info[:place_of_performance] = place_element.xpath(".//span[2]").inner_text.strip
        elsif header2.match(/miesto/) || header2.match(/lokalita/)
          info[:place_of_performance] = place_element.xpath(".//span[1]").inner_text.strip
        end

        # NUTS code: either in a nested <div> or in the following element
        nuts_element = next_element(place_element)
        if nuts_element.inner_text.match(/NUTS/)
          if nuts_element.xpath(".//div").any?
            info[:nuts_code] = nuts_element.xpath(".//div").inner_text
          else
            info[:nuts_code] = next_element(nuts_element).inner_text.strip
          end
        end
      end

      # format1, CPV item: reads the main and, when present, the additional
      # subjects out of +dictionary_element+.
      def contract_information_format1_cpv(dictionary_element, info)
        main_subjects = contract_information_main_dictionary(dictionary_element.xpath(".//div[1]//span[2]").inner_text)
        info[:dictionary_main_subjects] = main_subjects if main_subjects

        if dictionary_element.xpath(".//div[2]").inner_text.match(/Doplňujúce predmety/) && dictionary_element.xpath(".//div[3]").any?
          additional_subjects = contract_information_main_dictionary(dictionary_element.xpath(".//div[3]").inner_text)
          info[:dictionary_additional_subjects] = additional_subjects if additional_subjects
        end
      end

      # format2: section II is a <td class="cast"> holding a table whose rows
      # carry the item code (td.kod) and title (span.nazov).
      def contract_information_from_format2
        info = {}

        document.xpath("//table[@class='mainTable']//td[@class='cast']").each do |section|
          next unless section.inner_text.match(/ODDIEL\s+II\W/)

          table = section.xpath(".//table")[0]
          # A section without an inner table has nothing to parse (previously
          # this raised NoMethodError on nil).
          next unless table

          table.xpath(".//tr").each do |tr|
            code_text = tr.xpath(".//td[@class='kod']").inner_text.strip
            header_text = tr.xpath(".//td[2]//span[@class='nazov']").inner_text.strip

            if code_text.match(/^II.1.1/) && header_text.match(/Názov/)
              info[:project_name] = strip_last_point(contract_information_row_values(tr).inner_text.strip)

            elsif code_text.match(/^II.1.2/) && header_text.match(/Druh zákazky/)
              contract_information_format2_type_and_place(tr, info)

            elsif code_text.match(/^II.1.3/) && header_text.match(/Oznámenie zahŕňa/)
              general_contract = strip_last_point(contract_information_row_values(tr).inner_text.strip)
              info[:general_contract] = (general_contract == "Uzatvorenie rámcovej dohody")

            elsif (code_text.match(/^II.1.4/) || code_text.match(/^II.1.2/)) && header_text.match(/Stručný opis/)
              info[:project_description] = contract_information_row_values(tr).inner_text.strip

            elsif (code_text.match(/^II.1.5/) || code_text.match(/^II.1.3/)) && (header_text.match(/Spoločný slovník obstarávania/) || header_text.match(/CPV/))
              contract_information_format2_cpv(tr, info)

            elsif code_text.match(/^II.1.6/) && (header_text.match(/Dohoda o vládnom obstarávaní/) || header_text.match(/GPA/))
              gpa_agreement = contract_information_row_values(tr).inner_text.strip
              info[:gpa_agreement] = !gpa_agreement.match(/Nie/)
            end
          end
        end

        info
      end

      # format2, item II.1.2: fills :project_type, :project_category,
      # :place_of_performance and :nuts_code from +tr+ and the rows after it.
      # Stops quietly as soon as an expected following row is missing, so the
      # fields gathered up to that point are kept.
      def contract_information_format2_type_and_place(tr, info)
        # Project type: in this row or, when it has no value span, the next one
        tr_type = tr
        type_values = tr_type.xpath(".//span[@class='hodnota']")
        if type_values.empty?
          tr_type = next_element(tr_type)
          return unless tr_type
          type_values = tr_type.xpath(".//span[@class='hodnota']")
        end

        project_type = type_values.inner_text.strip
        lettered = project_type.match(/(a|b|c)\) (.*)$/)
        project_type = lettered[2] if lettered
        project_type = strip_last_point(project_type)

        # Values that are really categories, mapped to the type they imply
        # (124203, 125359 are presumably sample notice ids).
        project_type_exceptions = {
          "Uskutočnenie prác" => 'Práce', # 124203
          "Kúpa" => 'Tovary', # 125359
          "Vypracovanie projektovej dokumentácie a uskutočnenie prác" => 'Práce',
          "Projektovanie a uskutočnenie prác" => 'Práce',
          "Vykonanie" => 'Práce'
        }

        # A purely numeric value is a service category number.
        category_number = begin
          Integer(project_type)
        rescue ArgumentError, TypeError
          nil
        end

        if project_type_exceptions.include?(project_type)
          info[:project_type] = project_type_exceptions[project_type]
          info[:project_category] = project_type
          tr_category = tr_type
        elsif category_number
          info[:project_type] = 'Služby'
          info[:project_category] = project_type
          tr_category = tr_type
        else
          info[:project_type] = project_type

          # Category lives in the row after the type
          tr_category = next_element(tr_type)
          return unless tr_category
          category = tr_category.xpath(".//span[@class='hodnota']").inner_text.strip
          info[:project_category] = strip_last_point(category)
        end

        # Place of performance
        tr_place = next_element(tr_category)
        return unless tr_place
        place_header = tr_place.xpath(".//span[@class='podnazov']").inner_text
        if place_header.match(/V prípade zmluvy/)
          # skip the explanatory row; it may be the last row of the table
          tr_place = next_element(tr_place)
          return unless tr_place
          place_header = tr_place.xpath(".//span[@class='podnazov']").inner_text
        end
        if place_header.match(/miesto/) || place_header.match(/stavenisko/)
          info[:place_of_performance] = strip_last_point(tr_place.xpath(".//span[@class='hodnota']").inner_text.strip)
        end

        # NUTS code: a header row followed by the row with the code itself
        tr_nuts = next_element(tr_place)
        return unless tr_nuts
        nuts_header = tr_nuts.xpath(".//span[@class='podnazov']").inner_text
        if nuts_header.match(/NUTS/)
          tr_nuts_content = next_element(tr_nuts)
          return unless tr_nuts_content
          nuts_code = tr_nuts_content.xpath(".//span[@class='hodnota']").inner_text.strip
          info[:nuts_code] = strip_last_point(nuts_code)
        end
      end

      # format2, CPV item: fills :dictionary_main_subjects and, when the row
      # after the subjects row lists them, :dictionary_additional_subjects.
      def contract_information_format2_cpv(tr, info)
        tr_subjects = contract_information_value_row(tr)
        return unless tr_subjects

        subjects = tr_subjects.xpath(CONTRACT_INFORMATION_VALUE_XPATH)
        # The main subject is the third value span; rows with fewer spans
        # carry no main subject (previously this raised NoMethodError on nil).
        if subjects[2]
          info[:dictionary_main_subjects] = strip_last_point(subjects[2].inner_text.strip)

          if subjects.size == 5
            next_main_subjects = strip_last_point(subjects[4].inner_text.strip)
            info[:dictionary_main_subjects] += ', ' + next_main_subjects if next_main_subjects.present?
          end
        end

        tr_additional = next_element(tr_subjects)
        return unless tr_additional
        additional_values = tr_additional.xpath(CONTRACT_INFORMATION_VALUE_XPATH)
        if additional_values.inner_text.match(/Doplňujúce predmety/)
          # the second-to-last child holds the "Hlavný slovník: ..." text
          subject_node = additional_values.children[-2]
          additional_subjects = subject_node && contract_information_main_dictionary(subject_node.inner_text.strip)
          info[:dictionary_additional_subjects] = strip_last_point(additional_subjects) if additional_subjects
        end
      end

      # Returns the format2 row that holds the value spans for +tr+: +tr+
      # itself, or its next element when +tr+ has none (may be nil).
      def contract_information_value_row(tr)
        tr.xpath(CONTRACT_INFORMATION_VALUE_XPATH).empty? ? tr.next_element : tr
      end

      # Returns the value spans for +tr+, looking in the next element when
      # +tr+ has none. When there is no next element the (empty) result for
      # +tr+ is returned, so callers get an empty text instead of an error.
      def contract_information_row_values(tr)
        values = tr.xpath(CONTRACT_INFORMATION_VALUE_XPATH)
        return values unless values.empty?

        following = tr.next_element
        following ? following.xpath(CONTRACT_INFORMATION_VALUE_XPATH) : values
      end

      # Returns the stripped text after "Hlavný slovník: " in +text+, or nil
      # when +text+ does not contain that label.
      def contract_information_main_dictionary(text)
        found = text.match(/Hlavný slovník: (.*)/)
        found && found[1].strip
      end

    end
  end
end