SpeciesFileGroup/taxonworks

View on GitHub
app/models/dataset_record/darwin_core/occurrence.rb

Summary

Maintainability
A
0 mins
Test Coverage
# TODO: There are numerous very long methods here, we really need to break out logical chunks so that we can
#   a) better atomize and test the expectations
#   b) interpret and document the behaviour of the importer
#
class DatasetRecord::DarwinCore::Occurrence < DatasetRecord::DarwinCore

  # genus, subgenus, specificEpithet and infraspecificEpithet are extracted from scientificName
  DWC_CLASSIFICATION_TERMS = %w[
    kingdom phylum class order superfamily family subfamily tribe subtribe
  ].freeze

  PARSE_DETAILS_KEYS = %i[uninomial genus species infraspecies].freeze

  # Attribute names, grouped by target model name, held as frozen Sets for membership checks.
  ACCEPTED_ATTRIBUTES = {
    CollectionObject: Set.new(
      %i[
        buffered_collecting_event buffered_determinations buffered_other_labels
        total
      ]
    ).freeze,

    CollectingEvent: Set.new(
      %i[
        document_label print_label verbatim_label
        field_notes formation
        group
        lithology
        max_ma maximum_elevation member min_ma minimum_elevation elevation_precision
        start_date_day start_date_month start_date_year end_date_day end_date_month end_date_year
        time_end_hour time_end_minute time_end_second time_start_hour time_start_minute time_start_second
        verbatim_collectors verbatim_date verbatim_datum verbatim_elevation verbatim_geolocation_uncertainty verbatim_habitat
        verbatim_latitude verbatim_locality verbatim_longitude verbatim_method verbatim_trip_identifier
      ]
    ).freeze
  }.freeze

  # Strategy object that resolves one parsed name to a Protonym under a given parent.
  #
  # A name is a Hash carrying :name and, optionally, :rank_class, :verbatim_author and :year_of_publication.
  # Use ImportProtonym.create_if_not_exists or ImportProtonym.match_existing to obtain a (memoized) strategy;
  # each of those subclasses provides #exception_args, the error payload raised by #execute when no persisted
  # protonym results.
  class ImportProtonym

    # @return [CreateIfNotExists] memoized strategy that creates the protonym when no match is found
    def self.create_if_not_exists
      @create_if_not_exists ||= CreateIfNotExists.new
    end

    # @return [MatchExisting] memoized strategy that never creates protonyms
    def self.match_existing
      @match_existing ||= MatchExisting.new
    end

    # @param [Hash] origins passed through to #exception_args (which indexes it by name.object_id)
    # @param [Protonym] parent
    # @param [Hash] name
    # @return [Protonym] a persisted protonym
    # @raise [DatasetRecord::DarwinCore::InvalidData] when #get_protonym yields nil or an unpersisted protonym
    def execute(origins, parent, name)
      protonym = get_protonym(parent, name)
      raise DatasetRecord::DarwinCore::InvalidData.new(exception_args(origins, parent, name, protonym)) unless protonym&.persisted?
      protonym
    end

    # rubocop:disable Metrics/MethodLength

    # Looks the name up in the name, masculine_name, feminine_name and neuter_name columns (in that order),
    # stopping at the first column that yields a protonym.
    #
    # @param [Protonym] parent
    # @param [Hash] name
    # @return [Protonym, nil] the matched protonym; the correct spelling when the match is a misspelling whose
    #   correct spelling carries the requested name; +parent+ itself when several candidates exist and there is
    #   nothing to narrow them down with; nil when nothing matched
    # @raise [DatasetRecord::DarwinCore::InvalidData] when homonyms cannot be told apart by author (and year)
    def get_protonym(parent, name)
      name = name.except(:rank_class) if name[:rank_class].nil?

      %i[name masculine_name feminine_name neuter_name].inject(nil) do |protonym, field|
        break protonym unless protonym.nil?

        potential_protonyms = Protonym.where(name.slice(:rank_class).merge({ field => name[:name], :parent => parent }))

        # if multiple potential protonyms, this is a homonym situation
        if potential_protonyms.count > 1
          # verbatim author field (if present) applies to finest name only
          verbatim_author = name[:verbatim_author]

          # for intermediate homonyms, skip it, we don't have any info
          return parent unless verbatim_author

          # parenthesized authorship narrows with is_not_original_name, bare authorship with is_original_name
          narrowed = parenthesized?(verbatim_author) ? potential_protonyms.is_not_original_name : potential_protonyms.is_original_name
          narrowed = narrowed.where(cached_author: strip_surrounding_parentheses(verbatim_author))
          narrowed = narrowed.where(year_of_publication: name[:year_of_publication]) if name[:year_of_publication]

          narrowed_count = narrowed.count
          if narrowed_count == 1
            # if only one result, everything's ok. Safe to take it as the protonym
            potential_protonyms = narrowed
          elsif narrowed_count == 0
            potential_protonym_strings = potential_protonyms.map { |proto| "[id: #{proto.id} #{proto.cached_html_name_and_author_year}]" }.join(', ')
            error_message =
              ["Multiple matches found for name #{name[:name]}, rank #{name[:rank_class]}, parent #{parent.id} #{parent.cached_html_name_and_author_year}: #{potential_protonym_strings}",
               "No names matched author name #{name[:verbatim_author]}#{(', year ' + name[:year_of_publication].to_s) if name[:year_of_publication]}: "]
            raise DatasetRecord::DarwinCore::InvalidData.new({ 'scientificName' => error_message })
          else
            potential_protonym_strings = narrowed.map { |proto| "[id: #{proto.id} #{proto.cached_html_name_and_author_year}]" }.join(', ')
            raise DatasetRecord::DarwinCore::InvalidData.new(
              { 'scientificName' => ["Multiple matches found for name #{name[:name]} and author name #{name[:verbatim_author]}, year #{name[:year_of_publication]}: #{potential_protonym_strings}"] }
            )
          end
        end

        match = potential_protonyms.first

        # Protonym might not exist, or might have intermediate parent not listed in file
        # if it exists, run more expensive query to see if it has an ancestor matching parent name and rank
        if match.nil? && Protonym.where(name.slice(:rank_class).merge({ field => name[:name] })).where(project_id: parent.project_id).exists?
          cached_author = name[:verbatim_author]
          cached_author = strip_surrounding_parentheses(cached_author) if cached_author

          # compact drops the author/year conditions when the name does not carry them
          potential_protonyms = Protonym.where(name.slice(:rank_class, :year_of_publication).merge({ field => name[:name], cached_author: cached_author }).compact).with_ancestor(parent)

          # Several descendants match: not treated as an error, the parent is handed back instead
          return parent if potential_protonyms.count > 1

          match = potential_protonyms.first
          # check parent.cached_valid_taxon_name_id if not valid, can have obsolete subgenus Aus (Aus) bus -> Aus bus, bus won't have ancestor (Aus)
          if match.nil? && !parent.cached_is_valid
            match = Protonym.where(name.slice(:rank_class).merge!({ field => name[:name] })).with_ancestor(parent.valid_taxon_name).first
          end
        end

        if match&.cached_misspelling && match.has_misspelling_relationship?
          correct_spelling = TaxonNameRelationship.where_subject_is_taxon_name(match)
                                                  .with_type_array(TAXON_NAME_RELATIONSHIP_NAMES_MISSPELLING_ONLY)
                                                  .first&.object_taxon_name
          # Safe navigation on BOTH calls: `&.` does not short-circuit the rest of the chain, so a missing
          # correct spelling would otherwise end in nil.include? (NoMethodError)
          if correct_spelling&.values_at(:name, :masculine_name, :feminine_name, :neuter_name)&.include?(name[:name])
            return correct_spelling
          end
        end

        match
      end
    end

    # Strategy that falls back to creating the protonym (with an OTU) when no existing one is found.
    class CreateIfNotExists < ImportProtonym
      # @return [Protonym] existing match, or the result of Protonym.create (which may be unpersisted)
      def get_protonym(parent, name)
        super || Protonym.create({ parent: parent, also_create_otu: true }.merge!(name))
      end

      # @return [Hash] validation messages of the protonym, or a hint to create the name manually when the
      #   rank is not present
      def exception_args(origins, parent, name, protonym)
        messages =
          if name[:rank_class].present?
            protonym.errors.messages.values.flatten
          else
            ["Rank for #{name[:name]} could not be determined. Please create this taxon name manually and retry."]
          end

        { origins[name.object_id] => messages }
      end
    end

    # Strategy that only accepts protonyms already present.
    class MatchExisting < ImportProtonym
      # @return [Hash] a single "not found" message keyed by origins[name.object_id]
      def exception_args(origins, parent, name, protonym)
        {
          origins[name.object_id] =>
          ["Protonym #{name[:name]} not found with that name and/or classification. Importing new names is disabled by import settings."]
        }
      end
    end

    private

    # @param [String] author
    # @return [Boolean] true when author starts with '(' and ends with ')'
    def parenthesized?(author)
      author.start_with?('(') && author.end_with?(')')
    end

    # @param [String] author
    # @return [String] author without its surrounding parentheses (unchanged when not parenthesized)
    def strip_surrounding_parentheses(author)
      return author unless parenthesized?(author)

      author.delete_prefix('(').delete_suffix(')')
    end
  end

  # Imports this record: resolves the taxon name classification, then creates a Specimen together with its
  # catalog number / occurrenceID identifiers, taxon determination, optional type material, and a
  # CollectingEvent (an existing one when eventID matches an identifier, otherwise a new one with
  # georeference and geographic area).
  #
  # All of the above runs inside a nested transaction. The outcome is recorded on the record itself:
  # status 'Imported' on success, 'Errored' for DarwinCore::InvalidData and ActiveRecord::RecordInvalid,
  # 'Failed' for any other StandardError (which is re-raised instead in development). The record is saved
  # in every case.
  #
  # @param [Hash] dwc_data_attributes read under the 'CollectionObject' and 'CollectingEvent' keys and handed
  #   to append_dwc_attributes
  # @return [self]
  def import(dwc_data_attributes = {})
    super
    begin
      DatasetRecord.transaction(requires_new: true) do
        self.metadata.delete('error_data')

        # -- Taxon names: walk the parsed classification from the project root down to the innermost name
        names, origins = parse_taxon_class
        strategy = self.import_dataset.restrict_to_existing_nomenclature? ? ImportProtonym.match_existing : ImportProtonym.create_if_not_exists

        innermost_otu = nil
        innermost_protonym = names.inject(project.root_taxon_name) do |parent, name|
          otu_attributes = name.delete(:otu_attributes)

          unless name[:rank_class] || otu_attributes.present?
            # Predicted ranks are only kept when they are family-group ranks
            name[:rank_class] = parent.predicted_child_rank(name[:name])&.to_s
            name.delete(:rank_class) unless name[:rank_class] && /::FamilyGroup::/ =~ name[:rank_class]
          end

          strategy.execute(origins, parent, name).tap do |protonym|
            innermost_otu = Otu.find_or_create_by!({taxon_name: protonym}.merge!(otu_attributes)) if otu_attributes
          end
        end

        # -- Attributes: each parser contributes a nested hash, later ones override earlier ones
        attributes = parse_record_level_class
        # kept aside because the deep merges below may replace the array
        record_level_biocuration_classifications = attributes.dig(:specimen, :biocuration_classifications)
        attributes.deep_merge!(parse_occurrence_class)
        attributes.deep_merge!(parse_event_class)
        attributes.deep_merge!(parse_location_class)
        attributes.deep_merge!(parse_identification_class(innermost_protonym))

        attributes.deep_merge!(parse_tw_collection_object_data_attributes)
        attributes.deep_merge!(parse_tw_collecting_event_data_attributes)

        attributes.deep_merge!(parse_tw_collection_object_attributes)
        attributes.deep_merge!(parse_tw_collecting_event_attributes)

        append_dwc_attributes(dwc_data_attributes['CollectionObject'], attributes[:specimen])
        append_dwc_attributes(dwc_data_attributes['CollectingEvent'], attributes[:collecting_event])

        Utilities::Hashes::set_unless_nil(attributes[:specimen], :biocuration_classifications,
          (parse_biocuration_group_fields.dig(:specimen, :biocuration_classifications) || []) +
          (record_level_biocuration_classifications || []) +
          (attributes.dig(:specimen, :biocuration_classifications) || [])
        )

        # -- Specimen
        specimen = Specimen.create!({
          no_dwc_occurrence: true
        }.merge!(attributes[:specimen]))

        # -- Type material
        if attributes[:type_material] && (innermost_otu&.name).nil?

          type_material = TypeMaterial.new(
            {
              protonym: innermost_protonym,
              collection_object: specimen,
            }.merge!(attributes[:type_material])) # protoynm can be overwritten in type_materials hash if OC did not match scientific name / innermost_protonym

          if self.import_dataset.require_type_material_success? # raise error if validations fail and it cannot be imported
            type_material.save!
          else
            # Best effort only, import will proceed even if creating the type material fails
            type_material.save
          end
        end

        # -- Catalog number
        if attributes.dig(:catalog_number, :identifier)
          namespace = attributes.dig(:catalog_number, :namespace)
          delete_namespace_prefix!(attributes.dig(:catalog_number, :identifier), namespace)

          identifier = Identifier::Local::CatalogNumber
            .create_with(identifier_object: specimen, annotator_batch_mode: true)
            .find_or_create_by!(attributes[:catalog_number])

          # if desired, ensure that cached CO identifier will match verbatim catalogNumber
          # this ensures that DwC exported records will have identical catalogNumbers as when they were imported
          if self.import_dataset.require_catalog_number_match_verbatim? &&
            identifier.cached != get_field_value(:catalogNumber)

            error_message = "Computed catalog number #{identifier.cached} will not match verbatim #{get_field_value(:catalogNumber)}. "\
                            'Verify the mapped namespace and namespace delimiter are correct.'
            raise DarwinCore::InvalidData.new({'catalogNumber' => [error_message]})
          end

          object = identifier.identifier_object

          # The catalog number already belongs to something else: either fail or containerize, per settings
          unless object == specimen
            raise DarwinCore::InvalidData.new({ 'catalogNumber' => ['Is already in use'] }) unless self.import_dataset.containerize_dup_cat_no?
            if object.is_a?(Container)
              object.add_container_items([specimen])
            else
              identifier.update!(
                identifier_object: Container::Virtual.containerize([object, specimen])
              )
            end
          end
        end

        # -- occurrenceID
        Identifier::Local::Import::Dwc.create!(
          namespace: import_dataset.get_core_record_identifier_namespace,
          identifier_object: specimen,
          identifier: get_field_value(:occurrenceID),
          annotator_batch_mode: true
        ) unless get_field_value(:occurrenceID).nil? || import_dataset.get_core_record_identifier_namespace.nil?

        # -- Taxon determination
        specimen.taxon_determinations.create!({
          otu: innermost_otu || innermost_protonym.otus.find_by(name: nil) || innermost_protonym.otus.first # TODO: Might require select-and-confirm functionality
        }.merge(attributes[:taxon_determination]))

        # -- eventID: look for an existing collecting event carrying this identifier
        event_id = get_field_value(:eventID)
        unless event_id.nil?
          namespace = get_field_value('TW:Namespace:eventID')

          identifier_type = Identifier::Global.descendants.detect { |c| c.name.downcase == namespace.downcase } if namespace
          identifier_attributes = {
            identifier: event_id,
            identifier_object_type: 'CollectingEvent',
            project_id: Current.project_id
          }

          if identifier_type.nil?
            identifier_type = Identifier::Local::TripCode # TODO: Or maybe Identifier::Local::Import?

            using_default_event_id = false
            if namespace.nil?
              namespace = import_dataset.get_event_id_namespace
              using_default_event_id = true
            else
              namespace = Namespace.find_by(Namespace.arel_table[:short_name].matches(namespace)) # Case insensitive match
              raise DarwinCore::InvalidData.new({ 'TW:Namespace:eventID' => ['Namespace not found'] }) unless namespace
            end

            identifier_attributes[:namespace] = namespace

            delete_namespace_prefix!(event_id, namespace)

            if !using_default_event_id && self.import_dataset.require_tripcode_match_verbatim?
              if (cached_identifier = Identifier::Local.build_cached_prefix(namespace) + event_id) != get_field_value(:eventID)
                error_message = "Computed TripCode #{cached_identifier} will not match verbatim #{get_field_value(:eventID)}. "\
                            'Verify the namespace delimiter is correct.' # TODO include link to namespace?
                raise DarwinCore::InvalidData.new({'eventID' => [error_message]})
              end
            end
          end

          collecting_event = identifier_type.find_by(identifier_attributes)&.identifier_object
        end

        # TODO: If all attributes are equal assume it is the same event and share it with other specimens? (eventID is an alternate method to detect duplicates)
        if collecting_event
          # if tags have been specified to be added, update the collecting event
          if attributes[:collecting_event][:tags_attributes]
            # get list of preexisting tags, exclude them from update
            current_tags = collecting_event.tags.pluck(:keyword_id).to_set

            new_tags = attributes[:collecting_event][:tags_attributes].reject { |t| current_tags.member?(t[:keyword].id) }

            # add tags if there were any new ones
            unless new_tags.empty?
              collecting_event.tags.build(new_tags)
              collecting_event.save!
            end
          end

          specimen.update!(collecting_event:)
        else
          collecting_event = CollectingEvent.create!({
            collection_objects: [specimen],
            no_dwc_occurrence: true,
            no_cached: true
          }.merge!(attributes[:collecting_event]))

          # identifier_attributes is nil when the record has no eventID
          identifier_type.create!({
            identifier_object: collecting_event,
            annotator_batch_mode: true
          }.merge!(identifier_attributes)) unless identifier_attributes.nil?

          has_shape = self.import_dataset.metadata.dig('import_settings', 'require_geographic_area_has_shape')
          data_origin = self.import_dataset.metadata.dig('import_settings', 'geographic_area_data_origin')
          disable_recursive_search = self.import_dataset.metadata.dig('import_settings', 'require_geographic_area_exact_match')
          require_ga_found = self.import_dataset.metadata.dig('import_settings', 'require_geographic_area_exists')
          should_check_ga_exists = false
          location_hash = {}  # if requiring geographic area to exist, use hash of inputs for error message

          if collecting_event.verbatim_latitude && collecting_event.verbatim_longitude
            Georeference::VerbatimData.create!({
              collecting_event:,
              error_radius: get_field_value('coordinateUncertaintyInMeters'),
              no_cached: true
            }.merge(attributes[:georeference]))
          end

          county = get_field_value(:county)
          state_province = get_field_value(:stateProvince)
          country = get_field_value(:country)
          country_code = get_field_value(:countryCode)
          if country.blank? && country_code.present?
            # find_by yields nil for an unknown code, so the name is read with safe navigation; the country
            # is then simply left unresolved instead of aborting the import with a NoMethodError
            if country_code.size == 2
              country = GeographicArea.find_by(iso_3166_a2: country_code, data_origin: 'country_names_and_code_elements')&.name
            elsif country_code.size == 3  # there are no GAs with alpha3 presently
              country = GeographicArea.find_by(iso_3166_a3: country_code, data_origin: 'country_names_and_code_elements')&.name
            end
          end

          # finest level first
          location_levels = [county, state_province, country].compact

          if require_ga_found && location_levels.size > 0
            location_hash = {county:, state_province:, country:, country_code:}
            should_check_ga_exists = true
          end

          # try to find geographic areas until no location levels are left
          geographic_areas = []
          if disable_recursive_search
            geographic_areas = GeographicArea.with_name_and_parent_names(location_levels).with_data_origin(data_origin).has_shape(has_shape)
          else
            while location_levels.size > 0 && geographic_areas.size == 0
              geographic_areas = GeographicArea.with_name_and_parent_names(location_levels).with_data_origin(data_origin).has_shape(has_shape)
              # drop the finest remaining level and retry with the coarser ones
              location_levels = location_levels.drop(1)
            end
          end

          if should_check_ga_exists && geographic_areas.size == 0
            levels = location_hash.to_a.filter{|_,v| !v.nil?}.map { |k,v| "#{k}:#{v}"}
            error_message = "GeographicArea with location levels #{levels.join(", ")} not found."
            raise DarwinCore::InvalidData.new({'country, stateProvince, county' => [error_message]})
          end

          collecting_event.geographic_area_id = geographic_areas[0].id if geographic_areas.size > 0
          collecting_event.save!
        end

        DwcOccurrenceUpsertJob.perform_later(specimen)

        self.metadata['imported_objects'] = { collection_object: { id: specimen.id } }
        self.status = 'Imported'
      end
    rescue DarwinCore::InvalidData => invalid
      self.status = 'Errored'
      self.metadata['error_data'] = { messages: invalid.error_data }
    rescue ActiveRecord::RecordInvalid => invalid
      self.status = 'Errored'
      self.metadata['error_data'] = {
        messages: invalid.record.errors.messages
      }
    rescue StandardError => e
      raise if Rails.env.development?
      self.status = 'Failed'
      self.metadata['error_data'] = {
        exception: {
          message: e.message,
          backtrace: e.backtrace
        }
      }
    ensure
      save!
    end

    self
  end

  #rubocop:enable Metrics/MethodLength

  private

  # Re-evaluates readiness of a not-yet-imported record when one of the catalog-number related terms
  # (institutionCode, collectionCode, catalogNumber, basisOfRecord; matched case-insensitively) changes.
  #
  # The record is 'Ready' when it has no catalogNumber or when the import dataset returns a catalog number
  # namespace for its institutionCode/collectionCode; otherwise it is 'NotReady' with an explanatory error.
  # The institutionCode/collectionCode pair is registered with the import dataset and the record is saved.
  #
  # @param [String] name term whose value changed
  # @param value not used by this implementation
  def term_value_changed(name, value)
    watched_terms = %w[institutioncode collectioncode catalognumber basisofrecord]
    return unless watched_terms.include?(name.downcase)
    return unless self.status != 'Imported'

    ready = get_field_value('catalogNumber').blank? ||
            !!import_dataset.get_catalog_number_namespace(get_field_value('institutionCode'), get_field_value('collectionCode'))

    metadata.delete('error_data')
    self.status = ready ? 'Ready' : 'NotReady'
    unless ready
      metadata['error_data'] = { messages: { catalogNumber: ['Record cannot be imported until namespace is set, see "Settings".'] } }
    end

    import_dataset.add_catalog_number_namespace(get_field_value('institutionCode'), get_field_value('collectionCode'))
    import_dataset.add_catalog_number_collection_code_namespace(get_field_value('collectionCode'))

    save!
  end

  # Reads a field and converts it to an Integer.
  #
  # The whole value must be an optionally signed run of digits, surrounding whitespace allowed. Anchoring
  # is on the entire string (\A..\z) so that multi-line values such as "abc\n12" are rejected rather than
  # matched on one of their lines.
  #
  # @param [String, Symbol] field_name
  # @return [Integer, nil] nil when the field is not present
  # @raise [DarwinCore::InvalidData] when the value is present but is not an integer
  def get_integer_field_value(field_name)
    value = get_field_value(field_name)
    return nil unless value.present?

    match = /\A\s*(?<integer>[+-]?\d+)\s*\z/.match(value.to_s)
    raise DarwinCore::InvalidData.new({ field_name => ["'#{value}' is not a valid integer value"] }) unless match

    match[:integer].to_i
  end


  # Search for an Organization by name or alternate name in the given field. If no organization found, find or create a
  # Person::Unvetted, scoped to the import_dataset
  #
  # @param [String, Symbol] field_name Field name (column) to parse for people in
  # @param [Boolean] search_alt_name Search by alternate_name in addition to name
  # @return [Array<Organization, Person::Unvetted>, nil]
  def parse_organizations_and_people(field_name, search_alt_name = false)
    org_name = get_field_value(field_name)

    candidates = Organization.where(name: org_name)
    candidates = candidates.or(Organization.where(alternate_name: org_name)) if search_alt_name

    if candidates.exists?
      total = candidates.count

      # Exactly one organization: that is the answer
      return [candidates.first] if total == 1

      if total > 1
        matching_orgs = candidates.map do |org|
          aka = org.alternate_name.present? ? " (AKA: #{org.alternate_name})" : ''
          "[id:#{org.id} #{org.name}#{aka}]"
        end.join(', ')

        # TODO how should the user disambiguate which organization they are referring to?
        raise DarwinCore::InvalidData.new({ field_name => ["Multiple organizations matched name or alternate name '#{org_name}': #{matching_orgs}"] })
      end
    end

    # No organization matched: treat the field as a list of people instead
    parse_people(field_name)
  end

  # Parse for names in a given field and find or create one or more Person::Unvetted (scoped to the import dataset).
  # @param [String, Symbol] field_name Field name (column) to parse for people in
  # @return [Array<Person::Unvetted>, nil]
  def parse_people(field_name)
    #noinspection RubyMismatchedReturnType
    Person.transaction(requires_new: true) do
      # Clean every parsed name first, then resolve each one to a Person
      cleaned_names = DwcAgent.parse(get_field_value(field_name)).map! { |parsed| DwcAgent.clean(parsed) }

      cleaned_names.map! do |name|
        attributes = {
          last_name: [name[:particle], name[:family]].compact.join(' '),
          first_name: name[:given],
          suffix: name[:suffix],
          prefix: name[:title] || name[:appellation]
        }

        # self.import_dataset.derived_people.merge(Person.where(attributes)).first || # TODO: Doesn't work, fails to detect Person subclasses. Why (besides explanation in Shared::OriginRelationship)?
        # Reuse a person with the same attributes that originates from this import dataset
        existing_person = Person
          .where(attributes)
          .joins(:related_origin_relationships)
          .merge(OriginRelationship.where(old_object: self.import_dataset))
          .first

        existing_person || Person::Unvetted.create!(
          attributes.merge({
            related_origin_relationships: [OriginRelationship.new(old_object: self.import_dataset, annotator_batch_mode: true)]
          })
        )
      end
    end
  end

  # Parse an iso date string from the specified column name
  #
  # The date may be a single date, or an interval of two dates separated by a slash.
  # The second date may omit higher-order elements that are the same as the first date.
  # See https://en.wikipedia.org/wiki/ISO_8601#Time_intervals for more information.
  #
  # @param [String] field_name The column name to get the date string from
  # @return [Array<OpenStruct>] A list of one or two date structs (with year, month, day, hour, minute, second values)
  def parse_iso_date(field_name)
    raw_date = get_field_value(field_name)
    return nil if raw_date.nil?

    parsed = Utilities::Dates.parse_iso_date_str(raw_date)
    return parsed unless parsed.nil?

    # Parser gave up: report against the originating column (key is the field name as a Symbol)
    invalid_date_messages = [
      "Invalid date. Please make sure it conforms to ISO 8601 date format (yyyy-mm-ddThh:mm:ss). If expressing interval separate result with '/'. Examples: 1972-05; 1983-10-25; 2020-09-22T15:30; 2020-11-30/2020-12-04"
    ]
    raise DarwinCore::InvalidData.new({ :"#{field_name}" => invalid_date_messages })
  end



  # Remove the namespace short name and delimiter from start of string.
  #
  # If the namespace has a verbatim_short_name, that is removed instead of the short_name.
  # The delimiter is only removed if the short_name was found in the identifier.
  # @param [String] identifier_str
  # @param [Namespace] namespace
  def delete_namespace_prefix!(identifier_str, namespace)
    return nil unless namespace
    return nil if identifier_str.nil?

    # delete_prefix! returns nil when nothing was removed, which is what keeps the delimiter in place
    # whenever the short name was not at the start of the identifier
    short_name = namespace.verbatim_short_name || namespace.short_name
    without_short_name = identifier_str.delete_prefix!(short_name)
    return nil if without_short_name.nil?

    without_short_name.delete_prefix!(namespace.delimiter || '')
  end

  #rubocop:disable Metrics/MethodLength

  # Maps the DwC Record-level terms handled here (type, institutionCode, collectionCode, basisOfRecord)
  # onto specimen and catalog number attributes.
  #
  # @return [Hash] attributes under :specimen (repository, biocuration_classifications) and :catalog_number
  #   (namespace, project), as returned by Utilities::Hashes::delete_nil_and_empty_hash_values
  # @raise [DarwinCore::InvalidData] for an unsupported type or basisOfRecord, a repository that cannot be
  #   resolved from institutionCode, or a missing fossil biocuration class
  def parse_record_level_class
    res = {
      specimen: {},
      catalog_number: {}
    }
    # type: [Check it is 'PhysicalObject']
    type = get_field_value(:type) || 'PhysicalObject'
    raise DarwinCore::InvalidData.new({ 'type' => ["Only 'PhysicalObject' or empty allowed"] }) if type != 'PhysicalObject'

    # modified: [Not mapped]

    # language: [Not mapped]

    # license: [Not mapped. Possible with Attribution model? To which object(s)?]

    # rightsHolder: [Not mapped. Same questions as license but using roles]

    # accessRights: [Not mapped. Related to license]

    # bibliographicCitation: [Not mapped]

    # references: [Not mapped]

    # institutionID: [Not mapped. Review]

    # collectionID: [Not mapped. Review]

    # datasetID: [Not mapped]

    # institutionCode: [repository.acronym] # TODO: Use mappings like with namespaces here as well? (Although probably attempt guessing)
    # Lookup order: url (only for http/https values), then acronym, then name. Each failed lookup leaves
    # a message behind so the final error lists everything that was tried.
    institution_code = get_field_value(:institutionCode)
    if institution_code
      repository = nil
      error_messages = []

      if institution_code.starts_with?('http://') || institution_code.starts_with?('https://')
        url_repositories = Repository.where(url: institution_code)
        if url_repositories.count == 1
          repository = url_repositories.first
        elsif url_repositories.count > 1
          error_messages << "Multiple repositories with url #{institution_code} found"
        else
          error_messages << "No repositories with url #{institution_code} found"
        end
      end

      unless repository
        acronym_repositories = Repository.where(acronym: institution_code)
        if acronym_repositories.count == 1
          repository = acronym_repositories.first
        elsif acronym_repositories.count > 1
          error_messages << "Multiple repositories with acronym #{institution_code} found."
        else
          error_messages << "No repositories with acronym #{institution_code} found."
        end
      end

      # Some repositories may not have acronyms, in that case search by name as well
      unless repository
        repository_results = Repository.where(Repository.arel_table['name'].matches(Repository.sanitize_sql_like(institution_code)))
        if repository_results.count == 1
          repository = repository_results.first
        elsif repository_results.count > 1
          error_messages << "Multiple repositories match the name #{institution_code}."
        else
          error_messages << "No repositories match the name #{institution_code}"
        end

        unless repository
          # NOTE(review): an Array is always truthy, so the else branch below never runs as written —
          # confirm which headline message is intended when every lookup simply found nothing.
          if error_messages
            error_messages.unshift("Could not disambiguate repository name '#{institution_code}'.")
          else
            error_messages.unshift("Unknown #{institution_code} repository. If valid please register it using '#{institution_code}' as acronym or name.")
          end
          raise DarwinCore::InvalidData.new({ "institutionCode": error_messages })
        end
      end
      Utilities::Hashes::set_unless_nil(res[:specimen], :repository, repository)
    end

    # collectionCode: [catalog_number.namespace]
        # collection_code = get_field_value(:collectionCode)
        # Utilities::Hashes::set_unless_nil(res[:catalog_number], :namespace, Namespace.create_with({
        #     name: "#{institution_code}-#{collection_code} [CREATED FROM DWC-A IMPORT IN #{project.name} PROJECT]",
        #     delimiter: '-'
        # }).find_or_create_by!(short_name: "#{institution_code}-#{collection_code}")) if collection_code
    namespace_id = self.import_dataset.get_catalog_number_namespace(institution_code, get_field_value(:collectionCode))
    if namespace_id
      Utilities::Hashes::set_unless_nil(res[:catalog_number], :namespace, Namespace.find(namespace_id))
      Utilities::Hashes::set_unless_nil(res[:catalog_number], :project, self.project)
    end

    # datasetName: [Not mapped]

    # ownerInstitutionCode: [Not mapped]

    # basisOfRecord: [Check it is 'PreservedSpecimen', 'FossilSpecimen']
    basis = get_field_value(:basisOfRecord)
    # A missing basisOfRecord is allowed (see the check below), so guard against nil before inspecting it
    basis = basis.downcase.camelize if basis&.include?('_') # Reformat GBIF occurrence download basis of records (e.g., PRESERVED_SPECIMEN to PreservedSpecimen)
    if basis && 'FossilSpecimen'.casecmp(basis) == 0
      fossil_biocuration = BiocurationClass.where(project:).find_by(uri: DWC_FOSSIL_URI)

      raise DarwinCore::InvalidData.new(
        { 'basisOfRecord' => ["Biocuration class #{DWC_FOSSIL_URI} is not present in project"] }
      ) if fossil_biocuration.nil?

      Utilities::Hashes::set_unless_nil(res[:specimen], :biocuration_classifications, [BiocurationClassification.new(biocuration_class: fossil_biocuration)])
    else
      raise DarwinCore::InvalidData.new(
        { 'basisOfRecord' => ["Only 'PreservedSpecimen', 'FossilSpecimen' or blank is allowed."] }
      ) unless basis.nil? || 'PreservedSpecimen'.casecmp(basis) == 0
    end

    # informationWithheld: [Not mapped]

    # dataGeneralizations: [Not mapped]

    # dynamicProperties: [Not mapped. Could be ImportAttribute?]

    Utilities::Hashes::delete_nil_and_empty_hash_values(res)
  end

  #rubocop:enable Metrics/MethodLength

  # Parses the Darwin Core Occurrence-class terms of this record.
  #
  # Mapped terms: catalogNumber, recordedBy, individualCount, sex, preparations and occurrenceRemarks.
  # When sex is supplied, a "Sex" biocuration group and/or a biocuration class for the value is created
  # if no existing one is found.
  #
  # @return [Hash] attribute hashes keyed by :catalog_number, :specimen and :collecting_event
  # @raise [DarwinCore::InvalidData] if sex contains whitespace or preparations matches no PreparationType
  def parse_occurrence_class
    res = {
      catalog_number: {},
      specimen: {},
      collecting_event: {}
    }

    # occurrenceID: [Mapped in import method]

    # catalogNumber: [catalog_number.identifier]
    Utilities::Hashes::set_unless_nil(res[:catalog_number], :identifier, get_field_value(:catalogNumber))

    # recordNumber: [Not mapped]

    # recordedBy: [collecting_event.collectors and collecting_event.verbatim_collectors]
    # Parsing people is best-effort: any error leaves only the verbatim value.
    Utilities::Hashes::set_unless_nil(res[:collecting_event], :collectors, (parse_people(:recordedBy) rescue nil))
    Utilities::Hashes::set_unless_nil(res[:collecting_event], :verbatim_collectors, get_field_value(:recordedBy))

    # individualCount: [specimen.total] (defaults to 1 when blank)
    Utilities::Hashes::set_unless_nil(res[:specimen], :total, get_field_value(:individualCount) || 1)

    # organismQuantity: [Not mapped. Check relation with individualCount]

    # organismQuantityType: [Not mapped. Check relation with individualCount]

    # sex: [Find or create by name inside Sex biocuration Group] TODO: Think of duplicates (with and without URI)
    sex = get_field_value(:sex)
    if sex
      raise DarwinCore::InvalidData.new({ "sex": ['Only single-word controlled vocabulary supported at this time.'] }) if sex =~ /\s/

      # Look the group up by URI first, then by name, and only then create it.
      group   = BiocurationGroup.find_by(project_id: Current.project_id, uri: DWC_ATTRIBUTE_URIS[:sex])
      group ||= BiocurationGroup.where(project_id: Current.project_id).where('name ILIKE ?', 'sex').first
      group ||= BiocurationGroup.create!(
        name: 'Sex',
        definition: 'The sex of the individual(s) [CREATED FROM DWC-A IMPORT]',
        uri: DWC_ATTRIBUTE_URIS[:sex]
      )

      # TODO: BiocurationGroup.biocuration_classes not returning AR relation
      sex_biocuration = group.biocuration_classes.detect { |c| c.name.casecmp(sex) == 0 }
      unless sex_biocuration
        sex_biocuration = BiocurationClass.create!(name: sex, definition: "#{sex} individual(s) [CREATED FROM DWC-A IMPORT]")
        Tag.create!(keyword: group, tag_object: sex_biocuration)
      end

      Utilities::Hashes::set_unless_nil(res[:specimen], :biocuration_classifications, [BiocurationClassification.new(biocuration_class: sex_biocuration)])
    end

    # lifeStage: [Not mapped]

    # reproductiveCondition: [Not mapped]

    # behavior: [Not mapped]

    # establishmentMeans: [Not mapped]

    # degreeOfEstablishment [Not mapped]

    # pathway [Not mapped]

    # occurrenceStatus: [Not mapped]

    # preparations: [Match PreparationType by name (case insensitive)]
    preparation_name = get_field_value(:preparations)
    if preparation_name
      # Escape LIKE wildcards so "%" or "_" in the value are matched literally.
      preparation_type = PreparationType.find_by(
        PreparationType.arel_table[:name].matches(ApplicationRecord.sanitize_sql_like(preparation_name))
      )

      raise DarwinCore::InvalidData.new({
        "preparations": ["Unknown preparation \"#{preparation_name}\". If it is correct please add it to preparation types and retry."]
      }) unless preparation_type

      Utilities::Hashes::set_unless_nil(res[:specimen], :preparation_type, preparation_type)
    end

    # NOTE(review): this runs before the occurrenceRemarks note is attached below; presumably it prunes
    # nil/empty groups from res in place -- confirm against Utilities::Hashes.
    Utilities::Hashes::delete_nil_and_empty_hash_values(res)

    # disposition: [Not mapped]

    # associatedMedia: [Not mapped]

    # associatedReferences: [Not mapped]

    # associatedSequences: [Not mapped]

    # associatedTaxa: [Not mapped]

    # otherCatalogNumbers: [Not mapped]

    # occurrenceRemarks: [specimen note]
    note = get_field_value(:occurrenceRemarks)
    Utilities::Hashes::set_unless_nil(res[:specimen], :notes_attributes, [{text: note, annotator_batch_mode: true}]) if note

    res
  end

  #rubocop:disable Metrics/MethodLength

  # Parses the Darwin Core Event-class terms of this record into collecting event attributes.
  #
  # The start date is taken from eventDate, year/month/day and startDayOfYear, which must agree with each
  # other. The end date is taken from endDayOfYear (within the same year) or else from the end of an
  # eventDate range.
  #
  # @return [Hash] { collecting_event: Hash } with the attributes that had a value
  # @raise [DarwinCore::InvalidData] on conflicting, out-of-range or incomplete date information
  def parse_event_class
    event = {}
    put = ->(key, value) { Utilities::Hashes::set_unless_nil(event, key, value) }

    # eventID: [Mapped in import method]

    # parentEventID: [Not mapped]

    # fieldNumber: verbatim_trip_identifier
    put.call(:verbatim_trip_identifier, get_field_value(:fieldNumber))

    start_date, end_date = parse_iso_date(:eventDate)

    year = get_integer_field_value(:year)
    month = get_integer_field_value(:month)
    day = get_integer_field_value(:day)
    start_day_of_year = get_integer_field_value(:startDayOfYear)

    if start_date
      conflicting = (year && start_date.year != year) ||
                    (month && start_date.month != month) ||
                    (day && start_date.day != day)
      if conflicting
        raise DarwinCore::InvalidData.new({ eventDate: ['Conflicting values. Please check year, month, and day match eventDate'] })
      end
    end

    # Whatever the explicit columns left blank is filled in from eventDate.
    year  ||= start_date&.year
    month ||= start_date&.month
    day   ||= start_date&.day

    if start_day_of_year
      raise DarwinCore::InvalidData.new({ startDayOfYear: ['Missing year value'] }) if year.nil?

      ordinal =
        begin
          Date.ordinal(year, start_day_of_year)
        rescue Date::Error
          raise DarwinCore::InvalidData.new({ startDayOfYear: ['Out of range. Please also check year field'] })
        end

      mismatch = (month && ordinal.month != month) || (day && ordinal.day != day)
      raise DarwinCore::InvalidData.new({ startDayOfYear: ['Month and/or day of the event date do not match'] }) if mismatch

      month ||= ordinal.month
      day ||= ordinal.day
    end

    # eventDate | (year+month+day) | (year+startDayOfYear): start_date_*
    put.call(:start_date_year, year)
    put.call(:start_date_month, month)
    put.call(:start_date_day, day)

    # eventTime: time_start_* and time_end_* (groups that did not take part in the match stay unset)
    event_time = %r{^
      (?<start_hour>\d+)(:(?<start_minute>\d+))?(:(?<start_second>\d+))?
      (/(?<end_hour>\d+))?(:(?<end_minute>\d+))?(:(?<end_second>\d+))?
    $}x.match(get_field_value(:eventTime))
    %i[start_hour start_minute start_second end_hour end_minute end_second].each do |part|
      put.call(:"time_#{part}", event_time&.[](part))
    end

    end_day_of_year = get_integer_field_value(:endDayOfYear)

    if end_day_of_year
      raise DarwinCore::InvalidData.new({ endDayOfYear: ['Missing year value'] }) if year.nil?

      ordinal =
        begin
          Date.ordinal(year, end_day_of_year)
        rescue Date::Error
          raise DarwinCore::InvalidData.new({ endDayOfYear: ['Out of range. Please also check year field'] })
        end

      # endDayOfYear is resolved within the start year.
      end_year = year
      end_month = ordinal.month
      end_day = ordinal.day

      if end_date && (end_date.year != end_year || end_date.month != end_month || end_date.day != end_day)
        raise DarwinCore::InvalidData.new({ eventDate: ['Conflicting values. Please check year and endDayOfYear match eventDate'] })
      end
    else
      end_year = end_date&.year
      end_month = end_date&.month
      end_day = end_date&.day
    end

    put.call(:end_date_year, end_year)
    put.call(:end_date_month, end_month)
    put.call(:end_date_day, end_day)

    # verbatimEventDate: verbatim_date
    put.call(:verbatim_date, get_field_value(:verbatimEventDate))

    # habitat: verbatim_habitat
    put.call(:verbatim_habitat, get_field_value(:habitat))

    # samplingProtocol: verbatim_method
    put.call(:verbatim_method, get_field_value(:samplingProtocol))

    # sampleSizeValue: [Not mapped]

    # sampleSizeUnit: [Not mapped]

    # samplingEffort: [Not mapped]

    # fieldNotes: field_notes
    put.call(:field_notes, get_field_value(:fieldNotes))

    # eventRemarks: [collecting event note]
    remarks = get_field_value(:eventRemarks)
    put.call(:notes_attributes, [{text: remarks, annotator_batch_mode: true}]) if remarks

    { collecting_event: event }
  end

  #rubocop:enable Metrics/MethodLength

  # Parses the Darwin Core Location-class terms of this record.
  #
  # Mapped terms: verbatimLocality, minimum/maximumElevationInMeters, verbatimElevation,
  # decimalLatitude/Longitude, geodeticDatum, coordinateUncertaintyInMeters, georeferencedBy and
  # georeferenceRemarks. A "georeferencedBy" predicate is created if the project has none.
  #
  # @return [Hash] { collecting_event: Hash, georeference: Hash }
  # @raise [DarwinCore::InvalidData] if coordinateUncertaintyInMeters is not an integer
  def parse_location_class
    collecting_event = {}
    georeference = {}

    # locationID: [Not mapped]

    # higherGeographyID: [Not mapped]

    # higherGeography: [Not mapped]

    # continent: [Not mapped]

    # waterBody: [Not mapped]

    # islandGroup: [Not mapped]

    # island: [Not mapped]

    # country: [Not mapped]

    # countryCode: [Not mapped]

    # stateProvince: [Not mapped]

    # county: [Not mapped]

    # municipality: [Not mapped]

    # locality: [Not mapped]

    # verbatimLocality: [verbatim_locality]
    Utilities::Hashes::set_unless_nil(collecting_event, :verbatim_locality, get_field_value(:verbatimLocality))

    # minimumElevationInMeters: [minimum_elevation]
    Utilities::Hashes::set_unless_nil(collecting_event, :minimum_elevation, get_field_value(:minimumElevationInMeters))

    # maximumElevationInMeters: [maximum_elevation]
    Utilities::Hashes::set_unless_nil(collecting_event, :maximum_elevation, get_field_value(:maximumElevationInMeters))

    # verbatimElevation: [verbatim_elevation]
    Utilities::Hashes::set_unless_nil(collecting_event, :verbatim_elevation, get_field_value(:verbatimElevation))

    # minimumDepthInMeters: [Not mapped. REVISIT]

    # maximumDepthInMeters: [Not mapped. REVISIT]

    # verbatimDepth: [Not mapped. REVISIT]

    # minimumDistanceAboveSurfaceInMeters: [Not mapped]

    # maximumDistanceAboveSurfaceInMeters: [Not mapped]

    # locationAccordingTo: [Not mapped. REVISIT]

    # locationRemarks: [Not mapped. REVISIT]

    # decimalLatitude: [verbatim_latitude]
    Utilities::Hashes::set_unless_nil(collecting_event, :verbatim_latitude, get_field_value(:decimalLatitude))

    # decimalLongitude: [verbatim_longitude]
    Utilities::Hashes::set_unless_nil(collecting_event, :verbatim_longitude, get_field_value(:decimalLongitude))

    # geodeticDatum: [verbatim_datum]
    Utilities::Hashes::set_unless_nil(collecting_event, :verbatim_datum, get_field_value(:geodeticDatum))

    # coordinateUncertaintyInMeters: [verbatim_geolocation_uncertainty, stored with an "m" suffix]
    uncertainty = get_field_value(:coordinateUncertaintyInMeters)
    unless uncertainty.nil? || uncertainty =~ /\A[+-]?\d+\z/
      raise DarwinCore::InvalidData.new({ "coordinateUncertaintyInMeters": ['Non-integer value'] })
    end
    Utilities::Hashes::set_unless_nil(collecting_event, :verbatim_geolocation_uncertainty, uncertainty && "#{uncertainty}m")

    # coordinatePrecision: [Not mapped. Fail import if claimed precision is incorrect? Round to precision?]

    # pointRadiusSpatialFit: [Not mapped]

    # verbatimCoordinates: [Not mapped]

    # verbatimLatitude: [Not mapped]

    # verbatimLongitude: [Not mapped]

    # verbatimCoordinateSystem: [Not mapped]

    # verbatimSRS: [Not mapped]

    # footprintWKT: [Not mapped]

    # footprintSRS: [Not mapped]

    # footprintSpatialFit: [Not mapped]

    # georeferencedBy: [georeference data attribute using a "georeferencedBy" predicate]
    georeferenced_by = get_field_value(:georeferencedBy)
    if georeferenced_by
      # Find the predicate by URI, then by name, and create it as a last resort.
      predicate_base_props = {uri: 'http://rs.tdwg.org/dwc/terms/georeferencedBy', project: self.project}
      predicate = Predicate.find_by(predicate_base_props)
      predicate ||= Predicate.where(project:).find_by(
        Predicate.arel_table[:name].matches('georeferencedBy')
      )
      predicate ||= Predicate.create!(predicate_base_props.merge(
        {
          name: 'georeferencedBy',
          definition: 'A list (concatenated and separated) of names of people, groups, or organizations who determined the georeference (spatial representation) for the Location.'
        })
      )

      georeference[:data_attributes] = [
        InternalAttribute.new(
          type: 'InternalAttribute',
          predicate:,
          value: georeferenced_by,
          annotator_batch_mode: true
        )
      ]
    end

    # georeferencedDate: [Not mapped]

    # georeferenceProtocol: [Not mapped]

    # georeferenceSources: [Not mapped. REVISIT]

    # georeferenceVerificationStatus: [Not mapped]

    # georeferenceRemarks: [georeference note]
    note = get_field_value(:georeferenceRemarks)
    georeference[:notes_attributes] = [{text: note, annotator_batch_mode: true}] if note

    {
      collecting_event:,
      georeference:
    }
  end

  #rubocop:disable Metric/MethodLength

  # Interprets a typeStatus value and, when needed, finds the protonym the type belongs to.
  #
  # Two shapes are recognised: a single word (e.g. "holotype"), where the type name is assumed to be the
  # record's scientificName, and "<type> of <scientific name>". The type name is compared, in order, with
  # the record's scientificName, original combinations in the project, synonyms of taxon_protonym, and a
  # search that tolerates an optional subgenus.
  #
  # @param [String] type_status
  # @param [Protonym] taxon_protonym
  # @return [Hash{Symbol=>String, TaxonName}, nil] :type_type, plus :protonym when the type name is not
  #   the record's scientificName; nil when there is no name to compare against
  # @raise [DarwinCore::InvalidData] when typeStatus cannot be parsed, is not a legal type for the
  #   nomenclatural code, or the type name cannot be resolved to exactly one protonym
  def parse_typestatus(type_status, taxon_protonym)
    type_status_parsed = type_status&.match(/^(?<type>\w+)$/i) || type_status&.match(/(?<type>\w+)(\s+OF\s+(?<scientificName>.*))/i)
    # only nil if non-alphanumeric entry, or multiple words not matching "\w+ of \w+"
    raise DarwinCore::InvalidData.new({ "typeStatus": ['Unprocessable typeStatus information'] }) unless type_status_parsed && type_status_parsed[:type]
    type_type = type_status_parsed[:type].downcase

    code = get_field_value(:nomenclaturalCode)&.downcase&.to_sym || import_dataset.default_nomenclatural_code
    unless TypeMaterial::legal_type_type(code, type_type)
      raise DarwinCore::InvalidData.new({ "typeStatus": ['could not extract legal type from typeStatus'] })
    end

    scientific_name = get_field_value(:scientificName)&.gsub(/\s+/, ' ')

    # Only the "<type> of <name>" pattern defines the scientificName group; asking the single-word
    # match for it would raise IndexError, so check the group exists first.
    type_name_string = nil
    if type_status_parsed.names.include?('scientificName')
      type_name_string = type_status_parsed[:scientificName]&.gsub(/\s+/, ' ')
    end

    # Run the name through the biodiversity parser to remove authorship info
    parse_results = Biodiversity::Parser.parse(type_name_string || '')

    type_scientific_name = nil
    type_author_name, type_year = nil
    # Only use biodiversity parsed name if it has very high confidence
    if parse_results[:quality] == 1
      type_scientific_name = parse_results.dig(:canonical, :simple)
      # Save authorship info for narrowing down potential protonyms
      type_author_name, type_year = Utilities::Strings.parse_authorship(parse_results.dig(:authorship, :normalized))
    end

    # if typeStatus is single word, assume the user wants the specimen name as the type name
    type_scientific_name ||= scientific_name

    return nil unless scientific_name && type_scientific_name.present?

    # list of messages to help user debug why matching failed
    error_messages = []

    # if type_scientific_name matches the current name of the occurrence, use that
    if type_scientific_name.delete_prefix(scientific_name)&.match(/^\W*$/)
      return {
        type_type:
      }
    end

    # Each name part is escaped so characters such as the parentheses of a subgenus match literally.
    name_pattern = "^#{type_scientific_name.split.map { |n| "#{Regexp.escape(n)}(?: \\[sic\\])?" }.join(' ')}$"
    original_combination_protonyms = Protonym.where('cached_original_combination ~ :pat', pat: name_pattern)
                                             .where(project_id: self.project_id)

    if original_combination_protonyms.count == 1
      oc_protonym = original_combination_protonyms.first
      return {
        type_type:,
        protonym: get_correct_spelling(oc_protonym)
      }
    elsif original_combination_protonyms.count > 1
      potential_protonym_strings = original_combination_protonyms.map { |proto|
        "[id: #{proto.id} #{proto.cached_original_combination_html}]"
      }.join(', ')
      error_messages << "Multiple matches found for name #{type_scientific_name}: #{potential_protonym_strings}"
    else
      error_messages << 'Could not find exact original combination match for typeStatus'
    end

    # See if name matches a synonym of taxon name (ie any name linked to current taxon name)
    matching_synonyms = Set[]
    taxon_protonym.synonyms.each do |s|
      possible_names = [s.cached, s.cached_original_combination].compact.to_set
      # Try excluding subgenus
      possible_names += possible_names.map { |n| n.sub(/\(\w+\) /, '') }
      # Check for misspellings
      possible_names += possible_names.map { |n| n.gsub(' [sic]', '') }
      next unless possible_names.include?(type_scientific_name)

      matching_synonyms << (s.is_combination? ? s.finest_protonym : s)
    end

    matching_synonyms = matching_synonyms.map { |s| get_correct_spelling(s) }.uniq

    if matching_synonyms.count == 1
      return {
        type_type:,
        protonym: matching_synonyms.first
      }
    elsif matching_synonyms.count > 1
      synonym_strings = matching_synonyms.map { |proto| "[id: #{proto.id} #{proto.cached_original_combination_html}]" }.join(', ')
      error_messages << "Multiple synonym matches found for name #{type_scientific_name}: #{synonym_strings}"
    end

    # Try wildcard match on subgenus if not present
    type_name_elements = type_scientific_name.split
    if type_name_elements.length > 1 && type_name_elements[1].first != '(' && type_name_elements[1].last != ')'
      type_name_elements.map! { |s| Regexp.escape(s) }
      # append subgenus wildcard to genus string
      type_name_elements[0] << '( \(\w+\))?'
      name_pattern = "^#{type_name_elements.join(' ')}$"

      wildcard_original_protonym = Protonym.where('cached_original_combination ~ :pat', pat: name_pattern)
                                           .or(Protonym.where('cached ~ :pat', pat: name_pattern))
                                           .where(project_id: self.project_id)

      if type_author_name.present?
        cached_author = type_author_name
        # Strip the parentheses without mutating the parsed authorship string.
        if cached_author.starts_with?('(') && cached_author.end_with?(')')
          cached_author = cached_author.delete_prefix('(').delete_suffix(')')
        end
        wildcard_original_protonym = wildcard_original_protonym.where(cached_author:)
      end

      if type_year.present?
        wildcard_original_protonym = wildcard_original_protonym.where(year_of_publication: type_year)
      end

      if wildcard_original_protonym.count == 1
        return {
          type_type:,
          protonym: get_correct_spelling(wildcard_original_protonym.first)
        }
      elsif wildcard_original_protonym.count > 1
        matching_protonyms = wildcard_original_protonym.map { |p| "[id: #{p.id} #{p.cached_html_original_name_and_author_year}]" }
                                                       .join(', ')
        error_messages << "Multiple names returned in wildcard search: #{matching_protonyms}"
      else
        error_messages << 'No names returned in subgenus wildcard search'
      end
    end

    # Every path that reaches this point has recorded at least the original combination failure.
    error_messages.unshift "Could not identify or disambiguate name #{type_scientific_name}."
    raise DarwinCore::InvalidData.new({ "typeStatus": error_messages })
  end

  # Gets the correct spelling for a protonym, or returns the protonym if not a misspelling
  # @param [Protonym] protonym the protonym to get correct spelling for
  # @return [TaxonName, nil] the object of the first misspelling relationship, or the protonym itself
  def get_correct_spelling(protonym)
    if protonym.is_protonym? && protonym.has_misspelling_relationship?
      return TaxonNameRelationship.where_subject_is_taxon_name(protonym)
                                  .with_type_array(TAXON_NAME_RELATIONSHIP_NAMES_MISSPELLING_ONLY)
                                  .first&.object_taxon_name
    end
    protonym
  end

  #rubocop:enable Metric/MethodLength

  # Parses the Darwin Core Identification-class terms of this record.
  #
  # @param [Protonym] taxon_protonym passed through to parse_typestatus when typeStatus is present
  # @return [Hash] { taxon_determination: Hash, type_material: Hash or nil }
  # @raise [DarwinCore::InvalidData] if dateIdentified is a range, or if typeStatus yields no type
  #   material while the import dataset requires type material success
  def parse_identification_class(taxon_protonym)
    determination = {}
    type_material = nil

    # identificationID: [Not mapped]

    # identificationQualifier: [Mapped as part of otu name in parse_taxon_class]

    # typeStatus: [Type material only if scientific name matches scientificName and type term is recognized by TW vocabulary]
    type_status = get_field_value(:typeStatus)
    if type_status
      type_material = parse_typestatus(type_status, taxon_protonym)
      # generic error message, nothing more specific provided
      if type_material.nil? && self.import_dataset.require_type_material_success?
        raise DarwinCore::InvalidData.new({ typeStatus: ['Unprocessable typeStatus information'] })
      end
    end

    # identifiedBy: determiners of taxon determination
    determiners =
      if self.import_dataset.enable_organization_determiners?
        parse_organizations_and_people(:identifiedBy, self.import_dataset.enable_organization_determiners_alt_name?)
      else
        parse_people(:identifiedBy)
      end

    unless determiners.nil?
      # The class of the first determiner decides which attribute the whole list is stored under.
      first_determiner = determiners.first
      if first_determiner.is_a?(Person)
        determination[:determiners] = determiners
      elsif first_determiner.is_a?(Organization)
        determination[:determiners_organization] = determiners
      end
    end

    # dateIdentified: {year,month,day}_made of taxon determination
    made_on, range_end = parse_iso_date(:dateIdentified)

    raise DarwinCore::InvalidData.new({ dateIdentified: ['Date range for taxon determination is not supported.'] }) if range_end

    if made_on
      { year_made: made_on.year, month_made: made_on.month, day_made: made_on.day }.each do |attribute, part|
        Utilities::Hashes::set_unless_nil(determination, attribute, part)
      end
    end

    # identificationReferences: [Not mapped. Can they be imported as citations without breaking semantics?]

    # identificationVerificationStatus: [Not mapped]

    # identificationRemarks: Note for taxon determination
    remarks = get_field_value(:identificationRemarks)
    determination[:notes_attributes] = [{text: remarks, annotator_batch_mode: true}] if remarks

    { taxon_determination: determination, type_material: }
  end

  # rubocop:disable Metric/MethodLength

  # Builds the list of taxon names described by the Darwin Core Taxon-class terms of this record.
  #
  # Names are collected from the classification columns (DWC_CLASSIFICATION_TERMS order), then from
  # scientificName (parsed with the biodiversity parser), and finally unranked names found only in
  # higherClassification are spliced in. OTU name, authorship and year are attached to the last name.
  #
  # @return [Array(Array<Hash>, Hash)] the name hashes (:rank_class and :name, plus extra keys on the
  #   last one) and a hash mapping each name hash's object_id to the DwC term it came from
  # @raise [DarwinCore::InvalidData] on an unrecognized nomenclaturalCode, an unparsable
  #   scientificName or an unknown taxonRank
  def parse_taxon_class
    names = []
    otu_names = []
    origins = {}

    # Registers a name and remembers, keyed by the hash's object_id, which DwC term produced it.
    add_name = lambda do |origin, rank_class, name|
      entry = { rank_class:, name: }
      names << entry
      origins[entry.object_id] = origin
    end

    # taxonID: [Not mapped. Usually alias of core id]

    # scientificNameID: [Not mapped. Could be mapped with type detection into LSID identifier or global ID]

    # acceptedNameUsageID: [N/A for occurrences]

    # parentNameUsageID: [N/A for occurrences]

    # originalNameUsageID: [N/A for occurrences]

    # nameAccordingToID: [Not mapped]

    # namePublishedInID: [Not mapped]

    # taxonConceptID: [Not mapped]

    # acceptedNameUsage: [Not mapped. Review]

    # parentNameUsage: [N/A for occurrences]

    # originalNameUsage: [Not mapped. Review]

    # nameAccordingTo: [Not mapped]

    # namePublishedIn: [Not mapped]

    # namePublishedInYear: [Not mapped]

    # nomenclaturalCode: [Selects nomenclature code to pick ranks from]
    code = get_field_value(:nomenclaturalCode)&.downcase&.to_sym || import_dataset.default_nomenclatural_code
    unless Ranks::CODES.include?(code)
      raise DarwinCore::InvalidData.new(
        { "nomenclaturalCode": ["Unrecognized nomenclatural code #{get_field_value(:nomenclaturalCode)}"] }
      )
    end

    # kingdom, phylum, class, order, superfamily, family, subfamily, tribe, subtribe:
    # [One protonym per column, ranked after the column name]
    DWC_CLASSIFICATION_TERMS.each do |term|
      add_name.call(term.to_sym, Ranks.lookup(code, term), get_field_value(term.to_sym))
    end

    # genus: [Not mapped, extracted from scientificName instead]

    # subgenus: [Not mapped, extracted from scientificName instead]

    # specificEpithet: [Not mapped, extracted from scientificName instead]

    # infraspecificEpithet: [Not mapped, extracted from scientificName instead]

    # scientificName: [Parsed with biodiversity and mapped into several protonyms]
    parse_results = Biodiversity::Parser.parse(get_field_value(:scientificName) || '')
    parse_details = parse_results[:details]
    if parse_details
      # Only plain uninomial/genus/species/infraspecies results are used as-is.
      parse_details = (parse_details.keys - PARSE_DETAILS_KEYS).empty? ? parse_details.values.first : nil
    end

    unless (1..3).include?(parse_results[:quality]) && parse_details
      # Low quality or unsupported shape: still take the first detail entry, but name the OTU verbatim.
      parse_details = parse_results[:details]&.values&.first
      otu_names << get_field_value(:scientificName)
    end

    raise DarwinCore::InvalidData.new({
      "scientificName": parse_results[:qualityWarnings] ?
        parse_results[:qualityWarnings].map { |q| q[:warning] } :
        ['Unable to parse scientific name. Please make sure it is correctly spelled.']
    }) unless parse_details.is_a?(Hash)

    if parse_details[:uninomial]
      if parse_details[:parent]
        add_name.call(:scientificName, Ranks.lookup(code, 'genus'), parse_details[:parent])
        add_name.call(
          :scientificName,
          /subgen/.match?(parse_details[:rank]) ? Ranks.lookup(code, 'subgenus') : nil,
          parse_details[:uninomial]
        )
      elsif get_field_value(:genus) == parse_details[:uninomial]
        add_name.call(:scientificName, Ranks.lookup(code, 'genus'), parse_details[:uninomial])
      elsif names.reverse.detect { |n| n[:name] }&.dig(:name) != parse_details[:uninomial]
        # Skipped when the uninomial merely repeats the innermost classification column.
        add_name.call(:scientificName, nil, parse_details[:uninomial])
      end
    else
      add_name.call(:scientificName, Ranks.lookup(code, 'genus'), parse_details[:genus])
      add_name.call(:scientificName, Ranks.lookup(code, 'subgenus'), parse_details[:subgenus])
      add_name.call(:scientificName, Ranks.lookup(code, 'species'), parse_details[:species])
      add_name.call(
        :scientificName,
        Ranks.lookup(code, 'subspecies'),
        parse_details[:infraspecies]&.map { |d| d.dig(:value) }&.join(' ')
      )
    end

    names.reject! { |v| v[:name].nil? }

    # taxonRank: [Rank of innermost protonym]
    rank = get_field_value(:taxonRank)
    if rank && otu_names.empty?
      names.last[:rank_class] = Ranks.lookup(code, rank)
      raise DarwinCore::InvalidData.new({ "taxonRank": ["Unknown #{code.upcase} rank #{rank}"] }) unless names.last[:rank_class]
    end

    # identificationQualifier: [Part of the OTU name attached to the innermost name]
    ident_qualifier = get_field_value(:identificationQualifier)
    unless ident_qualifier.nil?
      if ident_qualifier.match?(/^cf[\.\s]/)
        otu_names << ident_qualifier
      else
        otu_names << "#{get_field_value(:scientificName)} #{ident_qualifier}"
      end
    end
    names.last&.merge!({otu_attributes: {name: otu_names.join(' ')}}) unless otu_names.empty?

    # higherClassification: [Several protonyms with ranks determined automatically when possible. Classification lower or at genus level is ignored and extracted from scientificName instead]
    # Separators are tried in turn until one splits the value into more than one part.
    higher_classification = ['|', ':', ';', ','].inject([]) do |parts, separator|
      break parts if parts.size > 1
      get_field_value(:higherClassification)&.split(separator) || []
    end.map! do |name|
      normalize_value!(name)
      {rank_class: nil, name:}
    end

    # Replace entries that match an already known name (searching left to right) with that name hash.
    curr = 0
    names.each do |name|
      idx = higher_classification[curr..].index { |n| n[:name] == name[:name] }

      if idx
        higher_classification[curr + idx] = name
        curr += idx + 1
      end
    end
    idx = higher_classification.index { |n| n[:rank_class] == Ranks.lookup(code, 'genus') }
    higher_classification = higher_classification.slice(0, idx) if idx

    # Insert the remaining unranked entries right after the last ranked name that precedes them.
    curr = 0
    higher_classification.each do |name|
      if name[:rank_class]
        curr = names.index(name) + 1
      else
        names.insert(curr, name)
        origins[name.object_id] = :higherClassification
        curr += 1
      end
    end

    # verbatimTaxonRank: [Not mapped]

    # scientificNameAuthorship: [verbatim_author of innermost protonym]
    author_name, year = Utilities::Strings.parse_authorship(get_field_value('scientificNameAuthorship'))
    names.last&.merge!({ verbatim_author: author_name, year_of_publication: year })

    # vernacularName: [Not mapped]

    # taxonomicStatus: [Not mapped. Review]

    # nomenclaturalStatus: [Not mapped. Review]

    # taxonRemarks: [Not mapped]

    [names, origins]
  end

  # rubocop:enable Metric/MethodLength

  # Gathers the data attributes and tags declared for CollectionObject in TW-specific columns.
  #
  # @return [Hash] { specimen: { data_attributes_attributes: Array, tags_attributes: Array } }
  def parse_tw_collection_object_data_attributes
    data_attributes = []
    get_tw_data_attribute_fields_for('CollectionObject').each { |field| append_data_attribute(data_attributes, field) }

    tag_list = []
    get_tw_tag_fields_for('CollectionObject').each { |field| append_tag_attribute(tag_list, field) }

    { specimen: { data_attributes_attributes: data_attributes, tags_attributes: tag_list } }
  end

  # Gathers the data attributes and tags declared for CollectingEvent in TW-specific columns.
  #
  # @return [Hash] { collecting_event: { data_attributes_attributes: Array, tags_attributes: Array } }
  def parse_tw_collecting_event_data_attributes
    data_attributes = []
    get_tw_data_attribute_fields_for('CollectingEvent').each { |field| append_data_attribute(data_attributes, field) }

    tag_list = []
    get_tw_tag_fields_for('CollectingEvent').each { |field| append_tag_attribute(tag_list, field) }

    { collecting_event: { data_attributes_attributes: data_attributes, tags_attributes: tag_list } }
  end

  # Appends tag attributes to +tags+ when the tag column of this record holds an affirmative value.
  #
  # Blank values are ignored; "true"/"1" apply the tag; "false"/"0" do not. "true" and "false" are
  # compared case-insensitively.
  #
  # @param [Array<Hash>] tags list the new tag attributes are appended to
  # @param [Hash] tag column mapping; :field is the column read, :selector the keyword URI or name
  # @return [nil]
  # @raise [DarwinCore::InvalidData] if the keyword cannot be found or the value is not recognised
  def append_tag_attribute(tags, tag)
    value = get_field_value(tag[:field])
    return unless value

    # Keyword is looked up by URI first, then by (case-insensitive) name.
    keyword = Keyword.find_by(uri: tag[:selector], project:)
    keyword ||= Keyword.where(project:).find_by(
      Keyword.arel_table[:name].matches(ApplicationRecord.sanitize_sql_like(tag[:selector]))
    )
    raise DarwinCore::InvalidData.new({ tag[:field] => ["Tag with #{tag[:selector]} URI or name not found"] }) unless keyword

    flag = value.downcase
    if flag == 'true' || value == '1'
      tags.append({keyword:, annotator_batch_mode: true})
    elsif flag != 'false' && value != '0'
      raise DarwinCore::InvalidData.new({ tag[:field] => ['Tag value must be "true" or "1" to apply, or blank, "false", or "0", to not apply'] })
    end

    nil
  end

  # Appends InternalAttribute attributes to +attributes+ when the mapped column of this record has a value.
  #
  # The predicate is only looked up when there is a value to store, so blank columns cost no queries.
  #
  # @param [Array<Hash>] attributes list the new data attribute is appended to
  # @param [Hash] attribute column mapping; :field is the column read, :selector the predicate URI or name
  # @return [Array<Hash>, nil] +attributes+ when something was appended, nil for a blank value
  # @raise [DarwinCore::InvalidData] if there is a value but no matching predicate
  def append_data_attribute(attributes, attribute)
    value = get_field_value(attribute[:field])
    return unless value

    # Predicate is looked up by URI first, then by (case-insensitive) name.
    predicate = Predicate.find_by(uri: attribute[:selector], project:)
    predicate ||= Predicate.where(project:).find_by(
      Predicate.arel_table[:name].matches(ApplicationRecord.sanitize_sql_like(attribute[:selector]))
    )
    raise DarwinCore::InvalidData.new({ attribute[:field] => ["Predicate with #{attribute[:selector]} URI or name not found"] }) unless predicate

    attributes << {
      type: 'InternalAttribute',
      predicate:,
      value:,
      annotator_batch_mode: true
    }
  end

  # Resolves every TW biocuration group column of this record, skipping the ones that yield nil.
  #
  # @return [Hash] { specimen: { biocuration_classifications: Array } }
  def parse_biocuration_group_fields
    classifications = get_tw_biocuration_groups.map { |group| parse_biocuration_group_field(group) }

    { specimen: { biocuration_classifications: classifications.compact } }
  end

  # Builds a biocuration classification for one TW biocuration group column of this record.
  #
  # Group and class are each looked up by URI first and then by (case-insensitive) name; the class must
  # be tagged with the group. Nothing is queried when the column is blank.
  #
  # @param [Hash] group column mapping; :field is the column read, :selector the group URI or name
  # @return [BiocurationClassification, nil] a new, unsaved classification, or nil for a blank value
  # @raise [DarwinCore::InvalidData] if the group or the class cannot be found
  def parse_biocuration_group_field(group)
    value = get_field_value(group[:field])
    return unless value

    biocuration_group = BiocurationGroup.find_by(uri: group[:selector], project:)
    biocuration_group ||= BiocurationGroup.where(project:).find_by(
      BiocurationGroup.arel_table[:name].matches(ApplicationRecord.sanitize_sql_like(group[:selector]))
    )
    raise DarwinCore::InvalidData.new({ group[:field] => ["Biocuration group with '#{group[:selector]}' URI or name not found"] }) unless biocuration_group

    # Classes of this project that are tagged with the group.
    classes_in_group = BiocurationClass.where(project:).joins(:tags).merge(
      Tag.where(keyword: biocuration_group)
    )
    biocuration_class = classes_in_group.find_by(uri: value)
    biocuration_class ||= classes_in_group.find_by(
      BiocurationClass.arel_table[:name].matches(ApplicationRecord.sanitize_sql_like(value))
    )
    raise DarwinCore::InvalidData.new({ group[:field] => ["Biocuration class with '#{value}' URI or name not found"] }) unless biocuration_class

    BiocurationClassification.new(biocuration_class:)
  end

  # Reads TW-specific columns that set CollectionObject attributes directly.
  #
  # @return [Hash] { specimen: Hash } holding every accepted attribute that had a value
  # @raise [DarwinCore::InvalidData] if a column with a value names an attribute outside ACCEPTED_ATTRIBUTES
  def parse_tw_collection_object_attributes
    accepted = ACCEPTED_ATTRIBUTES[:CollectionObject]

    specimen = get_tw_fields_for('CollectionObject').each_with_object({}) do |attribute, collected|
      value = get_field_value(attribute[:field])
      next unless value

      unless accepted.include?(attribute[:name])
        raise DarwinCore::InvalidData.new({ attribute[:field] => ["#{attribute[:name]} is not a valid CollectionObject attribute"] })
      end

      collected[attribute[:name]] = value
    end

    { specimen: }
  end

  # Reads TW-specific columns that set CollectingEvent attributes directly.
  #
  # @return [Hash] { collecting_event: Hash } holding every accepted attribute that had a value
  # @raise [DarwinCore::InvalidData] if a column with a value names an attribute outside ACCEPTED_ATTRIBUTES
  def parse_tw_collecting_event_attributes
    accepted = ACCEPTED_ATTRIBUTES[:CollectingEvent]

    collecting_event = get_tw_fields_for('CollectingEvent').each_with_object({}) do |attribute, collected|
      value = get_field_value(attribute[:field])
      next unless value

      unless accepted.include?(attribute[:name])
        raise DarwinCore::InvalidData.new({ attribute[:field] => ["#{attribute[:name]} is not a valid CollectingEvent attribute"] })
      end

      collected[attribute[:name]] = value
    end

    { collecting_event: }
  end

  # Appends InternalAttribute attributes for +predicate+ to +attributes+ unless +value+ is nil or false.
  #
  # @param [Array<Hash>] attributes list the new data attribute is appended to
  # @param predicate the predicate stored under :predicate
  # @param value the value stored under :value
  # @return [Array<Hash>, nil] +attributes+ when something was appended, nil otherwise
  def append_dwc_attribute(attributes, predicate, value)
    return unless value

    attributes << { type: 'InternalAttribute', predicate:, value:, annotator_batch_mode: true }
  end

  # Appends one data attribute to target[:data_attributes_attributes] for every DwC field with a value.
  #
  # @param dwc_attributes pairs of DwC field and predicate to store its value under
  # @param [Hash] target attributes hash holding a :data_attributes_attributes list
  # @return +dwc_attributes+ (the result of iterating it)
  def append_dwc_attributes(dwc_attributes, target)
    dwc_attributes.each do |dwc_field, dwc_predicate|
      append_dwc_attribute(
        target[:data_attributes_attributes],
        dwc_predicate,
        get_field_value(dwc_field)
      )
    end
  end

end