SpeciesFileGroup/taxonworks

View on GitHub
lib/batch_load/import/collection_objects.rb

Summary

Maintainability
A
0 mins
Test Coverage
require 'digest/bubblebabble'

module BatchLoad
  class Import::CollectionObjects < BatchLoad::Import

    # Per-type stashes; #initialize sets each of these to an empty Hash.
    # NOTE(review): nothing in the visible class adds entries to them — confirm
    # whether callers or the superclass still rely on these accessors.
    attr_accessor :collection_objects
    attr_accessor :collecting_events
    attr_accessor :taxon_determinations
    attr_accessor :otus

    # NOTE(review): never assigned or read in the visible class — confirm it is still needed.
    attr_accessor :namespace

    # Starts every per-type stash as an empty Hash, then hands the untouched
    # keyword arguments to the superclass initializer.
    #
    # @param [Hash] args keyword arguments forwarded as-is to the superclass
    def initialize(**args)
      @collection_objects = {}
      @collecting_events = {}
      @taxon_determinations = {}
      @otus = {}
      # bare `super` re-sends the same keyword arguments this method received
      super
    end

    # rubocop:disable Metrics/MethodLength
    # Walks every row of +csv+ and records one BatchLoad::RowParse per row in
    # @processed_rows, keyed by row number (starting at 1). For each row it:
    #
    # 1. resolves the CollectionObject through BatchLoad::ColumnResolver; a row
    #    with no match, or whose object already has a CollectingEvent, gets a
    #    parse error and is skipped;
    # 2. resolves an Otu through BatchLoad::ColumnResolver, falling back to one
    #    already seen in this run and then to a new Otu named by 'otu_name';
    # 3. builds a new TaxonDetermination from that Otu and CollectionObject;
    # 4. looks up or builds a CollectingEvent (with a nested Identifier and,
    #    when 'method' is 'geolocate', a GeoLocate georeference) and, if it is
    #    valid, assigns it to the CollectionObject.
    #
    # This method never calls save itself; objects are only collected in
    # parse_result.objects.
    #
    # @return [Integer] the number of rows visited (also stored in @total_lines)
    def build_collection_objects
      # test_build
      # Objects seen during this run, keyed by a SHA256 digest of their
      # attributes, so later rows with identical attributes reuse the instance.
      build_objects = {}
      i             = 1 # accounting for headers
      # the collecting-event identifier columns are addressed by position
      header5       = csv.headers[5] # should be 'collecting_event_identifier_namespace_short_name'
      header6       = csv.headers[6] # should be 'collecting_event_identifier_identifier'
      header7       = csv.headers[7] # should be 'collecting_event_identifier_type'

      csv.each do |row|
        parse_result = BatchLoad::RowParse.new
        # creation of the possible-objects list
        parse_result.objects.merge!(co: [], otu: [], td: [], ce: [])
        # attach the results to the row
        @processed_rows[i] = parse_result
        # Count the row immediately: every exit path below (including the early
        # `next`s) must advance the counter exactly once.
        i += 1

        # hot-wire the project into the row (this mutates the row itself)
        row['project_id'] = @project_id.to_s if row['project_id'].blank?

        # --- CollectionObject ---
        co_list = BatchLoad::ColumnResolver.collection_object_by_identifier(row)
        if co_list.no_matches? # no namespace to search!
          parse_result.parse_errors.push(co_list.error_messages.first)
          next
        end
        co = co_list.item # there can be only one
        unless co.collecting_event.nil? # if it exists
          parse_result.parse_errors.push('The specified CollectionObject already has a CollectingEvent.')
          next
        end
        parse_result.objects[:co].push(co)

        # --- Otu: resolver result, else one from this run, else a new one ---
        otu_attributes = {name: row['otu_name']}
        otu_match      = Digest::SHA256.digest(otu_attributes.to_s)
        otu_list       = BatchLoad::ColumnResolver.otu(row)
        otu            = otu_list.item if otu_list.resolvable?
        otu            = build_objects[otu_match] if otu.blank?
        otu            = Otu.new(otu_attributes) if otu.blank?
        build_objects[otu_match] = otu
        parse_result.objects[:otu].push(otu)

        # --- TaxonDetermination: always a new, unsaved record ---
        td = TaxonDetermination.new(otu:, taxon_determination_object: co)
        parse_result.objects[:td].push(td)

        # --- CollectingEvent ---
        id_ce         = row[header6]
        vert_loc      = row['verbatim_location']
        long          = row['longitude']
        lat           = row['latitude']
        georef_method = row['method'] # not named `method`, which would shadow Kernel#method
        error         = "#{row['error']} #{row['georeference_error_units']}".strip

        ce_namespace = row[header5]
        ns_ce        = Namespace.where(short_name: ce_namespace).first
        parse_result.parse_errors.push("No available namespace '#{ce_namespace}'.") if ns_ce.nil?

        # Force a verbatim_locality if none is provided. The short name comes
        # from the row (the value the lookup above was keyed on) rather than
        # from ns_ce, which may be nil at this point.
        vert_loc = "#{ce_namespace} #{id_ce}" if vert_loc.blank?

        # Key order matters: the digest below is taken over this hash's #to_s.
        ce_attributes = {verbatim_locality:                vert_loc,
                         verbatim_geolocation_uncertainty: error.presence,
                         start_date_day:                   row['start_date_day'],
                         start_date_month:                 row['start_date_month'],
                         start_date_year:                  row['start_date_year'],
                         end_date_day:                     row['end_date_day'],
                         end_date_month:                   row['end_date_month'],
                         end_date_year:                    row['end_date_year'],
                         verbatim_longitude:               long,
                         verbatim_latitude:                lat,
                         verbatim_method:                  georef_method,
                         geographic_area_id:               nil,
                         minimum_elevation:                nil,
                         maximum_elevation:                nil,
                         elevation_precision:              nil,
                         field_notes:                      nil,
                         verbatim_elevation:               nil,
                         verbatim_habitat:                 nil,
                         verbatim_datum:                   nil,
                         time_start_hour:                  nil,
                         time_start_minute:                nil,
                         time_start_second:                nil,
                         time_end_hour:                    nil,
                         time_end_minute:                  nil,
                         time_end_second:                  nil,
                         verbatim_date:                    row['verbatim_date'],
                         verbatim_trip_identifier:         nil,
                         verbatim_collectors:              nil,
                         verbatim_label:                   nil,
                         document_label:                   nil,
                         print_label:                      nil,
                         project_id:                       @project_id}

        id_attributes    = {namespace:  ns_ce,
                            project_id: @project_id,
                            type:       "Identifier::#{row[header7]}",
                            identifier: id_ce}
        ce_id_attributes = {identifiers_attributes: [id_attributes]}
        ce_key           = ce_attributes.merge(ce_id_attributes)

        # Only the 'geolocate' method contributes a nested georeference.
        gr_attributes = {}
        if georef_method&.downcase == 'geolocate'
          gr_attributes = {geo_locate_georeferences_attributes: [{iframe_response: "#{lat}|#{long}|#{Utilities::Geo.distance_in_meters(error).to_f}|Unavailable"}]}
        end

        ce_match = Digest::SHA256.digest(ce_key.to_s)
        ce       = build_objects[ce_match]
        # NOTE(review): the second hash appears not to constrain the query when
        # the first argument is a Hash — confirm against ActiveRecord's
        # find_by/where and consider matching on identifiers explicitly.
        ce       = CollectingEvent.includes(:identifiers).find_by(ce_attributes, id_attributes) if ce.nil?
        ce       = CollectingEvent.new(ce_key.merge(gr_attributes)) if ce.nil?

        if ce.valid? # various different possible errors.
          co.collecting_event = ce
          parse_result.objects[:ce].push(ce)
          build_objects[ce_match] = ce
        else
          # errors.messages maps each attribute to its list of messages
          details = ce.errors.messages.map do |attribute, messages|
            "#{attribute}: #{Array(messages).join(', ')}"
          end
          parse_result.parse_errors.push("Collecting event problems: #{details.join('; ')}")
        end
      end

      @total_lines = i - 1
    end

    # Development helper (referenced only from a commented-out call in
    # #build_collection_objects): reads a tab-separated fixture and, for each
    # row, builds a Specimen CollectionObject carrying a catalog-number
    # Identifier in the 'PSUC' namespace, saving the object when it is valid.
    #
    # NOTE(review): the '.tsvP' extension and the fixed preparation_type_id of 5
    # look environment-specific — confirm before relying on this helper.
    #
    # @return [CSV::Table] the table that was iterated
    def test_build
      file_name = 'spec/files/batch/collection_object/CollectionObjectTest.tsvP'
      ns_1      = Namespace.find_by(short_name: 'PSUC')
      # Options must be passed as keywords: a positional Hash is rejected under
      # Ruby 3 keyword-argument separation.
      csv1      = CSV.read(file_name, headers: true, header_converters: :downcase, col_sep: "\t", encoding: 'UTF-8')

      csv1.each do |row|
        # the following invocation also creates a valid specimen as a collection_object
        # FactoryBot.create(:valid_identifier, namespace: ns_1, identifier: ident)
        co = CollectionObject.new(type: 'Specimen', total: 1, preparation_type_id: 5)
        # the second column (index 1) supplies the catalog number
        co.identifiers << Identifier.new(namespace:  ns_1,
                                         type:       'Identifier::Local::CatalogNumber',
                                         identifier: row[1])

        co.save if co.valid?
      end
    end

    # rubocop:disable Metrics/MethodLength
    # Earlier variant of the row-processing pass; #build in this class calls
    # #build_collection_objects, not this method.
    #
    # For each row it registers a BatchLoad::RowParse in @processed_rows,
    # resolves the CollectionObject and Otu through BatchLoad::ColumnResolver,
    # and finds or builds a TaxonDetermination and a CollectingEvent Identifier,
    # reusing instances already seen in this run.
    #
    # NOTE(review): no CollectingEvent is ever built here, so `ce` on the last
    # line of the loop is not assigned anywhere in this method — confirm what
    # it was meant to be before calling this.
    #
    # @return [Integer] the number of rows visited (also stored in @total_lines)
    def build_collection_objects_ori
      # here we store the attributes of stuff we want to find later
      build_objects = {}
      i             = 1 # accounting for headers
      # identifier namespace
      # header0 = csv.headers[0] # should be 'collection_object_identifier_namespace_short_name'
      # header1 = csv.headers[1] # should be 'collection_object_identifier_identifier'
      header5       = csv.headers[5] # should be 'collecting_event_identifier_namespace_short_name'
      header6       = csv.headers[6] # should be 'collecting_event_identifier_identifier'
      header7       = csv.headers[7] # should be 'collecting_event_identifier_type'
      csv.each do |row|
        parse_result = BatchLoad::RowParse.new
        # creation of the possible-objects list
        parse_result.objects.merge!(otu: [], co: [], td: [], ce: [], gr: [], ce_id: [])
        # attach the results to the row
        @processed_rows[i] = parse_result
        # Count the row before any early `next`, so a skipped row keeps its own
        # slot instead of being overwritten by the following row.
        i += 1

        # hot-wire the project into the row
        row['project_id'] = @project_id.to_s if row['project_id'].blank?

        co_list  = BatchLoad::ColumnResolver.collection_object_by_identifier(row)
        otu_list = BatchLoad::ColumnResolver.otu(row)
        if co_list.no_matches? # no namespace to search!
          parse_result.parse_errors.push('No CollectionObject found with the specified identifier.')
          next
        end
        co = co_list.item # there can be only one
        unless co.collecting_event.nil? # if it exists
          parse_result.parse_errors.push('The specified CollectionObject already has a CollectingEvent.')
          next
        end
        parse_result.objects[:co].push(co)

        otu = otu_list.item if otu_list.resolvable?
        otu = Otu.new(name: row['otu_name']) if otu.blank?
        parse_result.objects[:otu].push(otu)

        ce_namespace = row[header5]
        ns_ce        = Namespace.where(short_name: ce_namespace).first
        parse_result.parse_errors.push("No available namespace '#{ce_namespace}'.") if ns_ce.nil?

        # processing the TaxonDetermination: reuse from this run, else find, else build
        td_attributes = {otu:, taxon_determination_object: co}
        td_match      = Digest::SHA256.digest(td_attributes.to_s)
        td            = build_objects[td_match]
        td            = TaxonDetermination.find_by(td_attributes) if td.nil?
        td            = TaxonDetermination.new(td_attributes) if td.nil?
        parse_result.objects[:td].push(td)
        build_objects[td_match] = td

        # processing Identifier: reuse from this run, else find, else build
        ce_id_attributes = {namespace:              ns_ce,
                            identifier_object_type: 'CollectingEvent',
                            project_id:             @project_id,
                            type:                   'Identifier::' + row[header7],
                            identifier:             row[header6]}
        ce_id_match      = Digest::SHA256.digest(ce_id_attributes.to_s)
        ce_id            = build_objects[ce_id_match]
        ce_id            = Identifier.find_by(ce_id_attributes) if ce_id.nil?
        ce_id            = Identifier.new(ce_id_attributes) if ce_id.nil?
        parse_result.objects[:ce_id].push(ce_id)
        build_objects[ce_id_match] = ce_id # whichever way we came by it, save the item in our stash

        # NOTE(review): `ce` is not defined in this method (see header note).
        co.collecting_event = ce
      end
      @total_lines = i - 1
    end
    # rubocop:enable Metrics/MethodLength

    # Runs the row-building pass when the import is valid and records that
    # processing has taken place.
    #
    # @return [Boolean, nil] true once the rows were built; nil when #valid? is false
    def build
      return unless valid?

      build_collection_objects
      @processed = true
    end
  end
end