SpeciesFileGroup/taxonworks
lib/tasks/import/sf/sf_supplementary.rake
namespace :tw do
  namespace :project_import do
    namespace :sf_import do
      require 'fileutils'
      require 'logged_task'
      namespace :supplementary do
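        # Supplementary Species File import tasks, run after the main SF import:
        #   * scrutiny_related -- turns tblScrutinies, tblScrutinyAuthors and tblTaxonScrutinies
        #     into Source::Human records plus InternalAttribute data attributes on taxon names.
        #   * taxon_info -- turns tblSupplTaxonInfo rows into ImportAttribute data attributes
        #     on taxon names or their OTUs.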



        # 52;"Person::Vetted";"Schröder";"C."
        # 29;"Person::Vetted";"Tinkham";"Ernest R."
        # 41;"Person::Vetted";"Voisin";"Jean-François"
        # 13;"Person::Vetted";"Blatchley";"W.S."
        #
        # Source::Human.joins(:people).where(person_ids: [1,2,3], year: 1234)


        desc 'time rake tw:project_import:sf_import:supplementary:scrutiny_related user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
        LoggedTask.define scrutiny_related: [:data_directory, :environment, :user_id] do |logger|

          import = Import.find_or_create_by(name: 'SpeciesFileData')
          skipped_file_ids = import.get('SkippedFileIDs')
          excluded_taxa = import.get('ExcludedTaxa')
          # get_sf_file_id = import.get('SFTaxonNameIDToSFFileID')
          get_tw_project_id = import.get('SFFileIDToTWProjectID')
          get_tw_person_id = import.get('SFPersonIDToTWPersonID')
          get_tw_user_id = import.get('SFFileUserIDToTWUserID') # for housekeeping
          get_tw_taxon_name_id = import.get('SFTaxonNameIDToTWTaxonNameID')

          counter = 0

          # first create hash of scrutinies
          get_scrutinies = {} # key = ScrutinyID, value = hash of sf_file_id, year, comment
          logger.info 'Creating scrutinies hash...'
          path = @args[:data_directory] + 'tblScrutinies.txt'
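          # CSV.foreach without a block returns an Enumerator, so each UTF-16, tab-separated SF
          # dump file is streamed row by row rather than loaded into memory all at once.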
          file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
          file.each_with_index do |row, i|
            get_scrutinies[row['ScrutinyID']] = {sf_file_id: row['FileID'], year: row['Year'], comment: row['Comment']}
          end
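          # Resulting shape, e.g. (values stay strings as read from the TSV; IDs here are hypothetical):
          #   get_scrutinies['12'] # => {sf_file_id: '30', year: '1956', comment: 'revised genus'}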

          # next create hash of arrays for scrutiny authors
          # No error handling if there is no TW equivalent person
          get_tw_scrutiny_authors = {} # from tblScrutinyAuthors, key = ScrutinyID, value = array of TW person_ids indexed by SeqNum
          logger.info 'Creating scrutiny_authors hash...'
          path = @args[:data_directory] + 'tblScrutinyAuthors.txt'
          file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
          file.each_with_index do |row, i|
            id = row['ScrutinyID']
            index = row['SeqNum'].to_i
            get_tw_scrutiny_authors[id] ||= [] # first author seen for this ScrutinyID
            get_tw_scrutiny_authors[id][index] = get_tw_person_id[row['PersonID']]
          end
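          # Caution: SeqNum is used directly as the array index, so if SeqNum is 1-based in the
          # SF dump, element 0 of each author array stays nil and is passed on to person_ids
          # below, e.g. get_tw_scrutiny_authors['12'] # => [nil, 255, 871] (hypothetical IDs).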

          import.set('Scrutinies', get_scrutinies)
          import.set('ScrutinyAuthors', get_tw_scrutiny_authors)

          puts 'Scrutinies'
          ap get_scrutinies
          puts 'ScrutinyAuthors'
          ap get_tw_scrutiny_authors

          # finally process tblTaxonScrutinies
          # rows whose SF taxon name has no TW equivalent are logged as errors and skipped
          path = @args[:data_directory] + 'tblTaxonScrutinies.txt'
          file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')

          file.each_with_index do |row, i|
            sf_taxon_name_id = row['TaxonNameID']
            next if excluded_taxa.include? sf_taxon_name_id
            tw_taxon_name_id = get_tw_taxon_name_id[sf_taxon_name_id] # no to_i here: nil.to_i == 0 would mask a missing mapping
            scrutiny_id = row['ScrutinyID']
            sf_file_id = get_scrutinies[scrutiny_id][:sf_file_id]
            next if skipped_file_ids.include? sf_file_id.to_i
            if tw_taxon_name_id.nil?
              logger.error "TW.taxon_name_id is nil: ScrutinyID = #{scrutiny_id}, SF.TaxonNameID #{sf_taxon_name_id}, SF.FileID = #{sf_file_id}"
              next
            end

            seqnum = row['SeqNum']
            project_id = get_tw_project_id[sf_file_id].to_i
            year = get_scrutinies[scrutiny_id][:year]
            comment = get_scrutinies[scrutiny_id][:comment]

            logger.info "Working on ScrutinyID = #{scrutiny_id}, SF.TaxonNameID #{sf_taxon_name_id} = tw.taxon_name_id #{tw_taxon_name_id}, project_id = #{project_id}, counter = #{counter += 1}"

            content = "SeqNum = #{seqnum}, ScrutinyID = #{scrutiny_id}, Year = #{year}, PersonIDs = #{get_tw_scrutiny_authors[scrutiny_id]}, Comment = '#{comment}'"

            sh = Source::Human.create!(stated_year: year, person_ids: get_tw_scrutiny_authors[scrutiny_id])

            scrutiny_predicate = Predicate.find_or_create_by(name: 'Species File scrutiny', definition: 'from tblScrutinies, limit of three scrutinies per taxon name', project_id: project_id)
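            # find_or_create_by means the 'Species File scrutiny' predicate is created once per
            # project and merely found on later iterations, so every scrutiny attribute in a
            # project shares the same controlled vocabulary term.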
            scrutiny = DataAttribute.create!(type: 'InternalAttribute',
                                             controlled_vocabulary_term_id: scrutiny_predicate.id,
                                             attribute_subject_id: tw_taxon_name_id,
                                             attribute_subject_type: 'TaxonName',
                                             value: content,
                                             project_id: project_id,
                                             created_at: row['CreatedOn'],
                                             updated_at: row['LastUpdate'],
                                             created_by_id: get_tw_user_id[row['CreatedBy']],
                                             updated_by_id: get_tw_user_id[row['ModifiedBy']],
                                             citations_attributes: [{source_id: sh.id, project_id: project_id}])
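            # Note: create! raises ActiveRecord::RecordInvalid on failure rather than returning
            # nil, so the guard below is effectively unreachable; citations_attributes builds the
            # Citation to the Source::Human in the same call (whether DataAttribute accepts nested
            # citations may depend on the TaxonWorks version -- the taxon_info task below notes
            # that data_attributes are not citable).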

            if scrutiny.nil?
              logger.error "Error creating TaxonScrutiny: ScrutinyID = #{scrutiny_id}, SF.TaxonNameID #{sf_taxon_name_id} = tw.taxon_name_id #{tw_taxon_name_id}"
            # else
            #   cite = Citation.new(source_id: sh.id, citation_object: scrutiny)
            #   byebug
            #
            end
          end
        end

        desc 'time rake tw:project_import:sf_import:supplementary:taxon_info user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
        LoggedTask.define taxon_info: [:data_directory, :environment, :user_id] do |logger|

          logger.info 'Importing SupplementaryTaxonInformation...'

          import = Import.find_or_create_by(name: 'SpeciesFileData')
          get_sf_taxon_info = import.get('SFTaxonNameIDMiscInfo')
          skipped_file_ids = import.get('SkippedFileIDs')
          excluded_taxa = import.get('ExcludedTaxa')
          get_tw_project_id = import.get('SFFileIDToTWProjectID')
          get_tw_user_id = import.get('SFFileUserIDToTWUserID') # for housekeeping
          get_tw_taxon_name_id = import.get('SFTaxonNameIDToTWTaxonNameID')
          get_tw_otu_id = import.get('SFTaxonNameIDToTWOtuID') # Note this is an OTU associated with an SF.TaxonNameID (probably a bad taxon name)
          get_taxon_name_otu_id = import.get('TWTaxonNameIDToOtuID') # Note this is the OTU officially associated with a real TW.taxon_name_id
          get_tw_source_id = import.get('SFRefIDToTWSourceID')
          # get_sf_verbatim_ref = import.get('RefIDToVerbatimRef') # key is SF.RefID, value is verbatim string

          counter = 0
          # otu_only_counter = 0
          # otu_not_found_array = []

          path = @args[:data_directory] + 'tblSupplTaxonInfo.txt'
          file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')

          file.each_with_index do |row, i|
            sf_taxon_name_id = row['TaxonNameID']
            sf_file_id = get_sf_taxon_info[sf_taxon_name_id]['file_id']
            next if skipped_file_ids.include? sf_file_id.to_i
            next if excluded_taxa.include? sf_taxon_name_id
            tw_taxon_name_id = get_tw_taxon_name_id[sf_taxon_name_id] # no to_i here: nil.to_i == 0 would mask a missing mapping
            project_id = get_tw_project_id[sf_file_id]

            logger.info "Working on SF.TaxonNameID #{sf_taxon_name_id} = tw.taxon_name_id #{tw_taxon_name_id}, project_id = #{project_id}, counter = #{counter += 1}"

            title = row['Title']
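            # Route the attribute to its subject: rows whose SF taxon name was never imported fall
            # back to the OTU created for that SF name (if any); for imported names, etymology
            # titles stay on the TaxonName and everything else goes to the name's OTU.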
            if tw_taxon_name_id.nil?
              if get_tw_otu_id[sf_taxon_name_id]
                attribute_subject_id = get_tw_otu_id[sf_taxon_name_id]
                attribute_subject_type = 'Otu'
              else
                logger.warn "No TW taxon name or OTU found for SF.TaxonNameID = #{sf_taxon_name_id}"
                next
              end
            else
              if title.include? 'etymology'
                attribute_subject_id = tw_taxon_name_id
                attribute_subject_type = 'TaxonName'
              else
                attribute_subject_id = get_taxon_name_otu_id[tw_taxon_name_id]
                attribute_subject_type = 'Otu'
              end
            end

            if row['SourceID'].to_i > 0
              title += ", source_id = #{get_tw_source_id[row['SourceID']]}"
            end

            logger.info "attribute_subject_id = #{attribute_subject_id}, attribute_subject_type = #{attribute_subject_type}, title = #{title}"

            da = DataAttribute.new(type: 'ImportAttribute',
                                   attribute_subject_id: attribute_subject_id,
                                   attribute_subject_type: attribute_subject_type,
                                   import_predicate: title,
                                   value: row['Content'],
                                   project_id: project_id,
                                   created_at: row['CreatedOn'],
                                   updated_at: row['LastUpdate'],
                                   created_by_id: get_tw_user_id[row['CreatedBy']],
                                   updated_by_id: get_tw_user_id[row['ModifiedBy']])
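            # Unlike the scrutiny task above, these rows use ImportAttribute with a free-text
            # import_predicate rather than InternalAttribute tied to a controlled vocabulary term,
            # so no Predicate record is needed here.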

            begin
              da.save!
              # data_attributes are not citable
              # if row['SourceID'].to_i > 0
              #   Citation.create!(source_id: get_tw_source_id[row['SourceID']],
              #                    citation_object: da,
              #                    created_at: row['CreatedOn'],
              #                    updated_at: row['LastUpdate'],
              #                    created_by_id: get_tw_user_id[row['CreatedBy']],
              #                    updated_by_id: get_tw_user_id[row['ModifiedBy']])
              # end
            rescue ActiveRecord::RecordInvalid
              logger.error "DataAttribute ERROR: #{da.errors.full_messages.join('; ')}"
              # typically 'Validation failed: Value has already been taken'
              # next
            end
          end

          #######################################################################################
          `rake tw:db:dump backup_directory=/Users/mbeckman/src/db_backup/18_after_scrutinies/`
          #######################################################################################
        end

      end # namespace :supplementary
    end # namespace :sf_import
  end # namespace :project_import
end # namespace :tw