# lib/tasks/import/sf/sf_start.rake
namespace :tw do
namespace :project_import do
namespace :sf_import do
require 'fileutils'
require 'logged_task'
namespace :start do
# Anyone who runs these tasks: Substitute your id as user_id, not user_id=1
# check out default user_id if SF.FileUserID < 1 (indicates change was made programmatically)
desc 'time rake tw:project_import:sf_import:start:create_source_roles user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Creates person roles on imported sources, in two passes:
#   Pass 1 (sfRefAuthorsOrdered.txt): one SourceAuthor role per ref author row.
#   Pass 2 (tblRefs.txt): for contained refs whose containing ref is a book and
#     whose author acted as editor, copies the containing source's ordered
#     authors onto the contained source as SourceEditor roles.
# Depends on hashes persisted in the 'SpeciesFileData' Import record by earlier
# tasks in this namespace (create_people, create_sources, create_misc_ref_info).
LoggedTask.define create_source_roles: [:data_directory, :environment, :user_id] do |logger|
logger.info 'Running create_source_roles...'
import = Import.find_or_create_by(name: 'SpeciesFileData')
skipped_file_ids = import.get('SkippedFileIDs')
get_tw_person_id = import.get('SFPersonIDToTWPersonID')
get_tw_source_id = import.get('SFRefIDToTWSourceID')
get_tw_user_id = import.get('SFFileUserIDToTWUserID') # for housekeeping
# source_editor_array = import.get('TWSourceEditorList') # if source.id is in array
# get_containing_source_id = import.get('TWSourceIDToContainingSourceID')
ref_file_id = import.get('RefIDsByFileID')
ref_id_containing_id_hash = import.get('RefContainingRefHash')
ref_id_editor_array = import.get('RefIDEditorArray') # author as editor if RefID is in array
ref_id_pub_id_hash = import.get('RefIDPubIDHash')
ref_id_pub_type = import.get('SFPubIDToPubTypeString')
# First pass: Create authors for sources (standalone and containing)
# Second pass: Create editors in contained sources (applies only to books where author acted as editor)
path = @args[:data_directory] + 'sfRefAuthorsOrdered.txt'
file = CSV.read(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
author_error_counter = 0
file.each_with_index do |row, i|
ref_id = row['RefID']
next if skipped_file_ids.include? ref_file_id[ref_id].to_i
source_id = get_tw_source_id[ref_id]
next if source_id.nil? # @todo Should be recorded
# Reloop if TW.source record is verbatim
# next if Source.find(source_id).try(:class) == Source::Verbatim # << Hernán's, Source.find(source_id).type == 'Source::Verbatim'
# next if Source.where(id: source_id).pluck(:type)[0] == 'Source::Verbatim' # faster per Matt # there aren't any?
logger.info "working with SF.RefID = #{row['RefID']}, TW.source_id = #{source_id}, position = #{row['SeqNum']} \n"
role = Role.new(
person_id: get_tw_person_id[row['PersonID']],
type: 'SourceAuthor',
role_object_id: source_id,
role_object_type: 'Source',
created_at: row['CreatedOn'],
updated_at: row['LastUpdate'],
created_by_id: get_tw_user_id[row['CreatedBy']],
updated_by_id: get_tw_user_id[row['ModifiedBy']]
)
begin
role.save!
rescue ActiveRecord::RecordInvalid
# Log and continue; the counter only tracks how many author roles failed.
logger.error "Author role ERROR (#{author_error_counter += 1}): " + role.errors.full_messages.join(';')
end
end
## Second pass: Check tblRefs rather than sfRefAuthorsOrdered
path = @args[:data_directory] + 'tblRefs.txt'
file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
editor_error_counter = 0
file.each_with_index do |row, i|
next if row['ContainingRefID'] == '0' # reloop if no containing ref (eliminate most common attribute first)
next if skipped_file_ids.include? row['FileID'].to_i # ref_file_id[ref_id].to_i
ref_id = row['RefID']
source_id = get_tw_source_id[ref_id]
next if source_id.nil? # @todo Should be recorded
# containing_ref_id = ref_id_containing_id_hash[ref_id]
containing_ref_id = row['ContainingRefID']
next unless ref_id_pub_type[ref_id_pub_id_hash[containing_ref_id]] == 'book' # is the containing ref a book
next unless ref_id_editor_array.include? containing_ref_id # did author act as editor
containing_source_id = get_tw_source_id[containing_ref_id]
next if containing_source_id.nil? # @todo Should be recorded
logger.info "working with SF.RefID = #{ref_id}, SF.ContainingRefID = #{containing_ref_id}, TW.source_id = #{source_id}, TW.containing_source_id = #{containing_source_id} \n"
# NOTE(review): `.each` returns its receiver, so ordered_editors is just the
# plucked person_id array; the inner puts is leftover debug output.
ordered_editors = SourceAuthor.where(role_object_id: containing_source_id).order(:position).pluck(:person_id).each do |person_id|
puts person_id
end
ordered_editors.each do |person_id|
role = SourceEditor.new(
person_id: person_id,
role_object_id: source_id, # the ref in ref's editors, not the contained ref's editors
role_object_type: 'Source',
created_at: row['CreatedOn'],
updated_at: row['LastUpdate'],
created_by_id: get_tw_user_id[row['CreatedBy']],
updated_by_id: get_tw_user_id[row['ModifiedBy']]
)
begin
role.save!
rescue ActiveRecord::RecordInvalid
logger.error "Editor role ERROR person_id = #{person_id} (#{editor_error_counter += 1}): " + role.errors.full_messages.join(';')
end
end
end
#######################################################################################
`rake tw:db:dump backup_directory=/Users/mbeckman/src/db_backup/5_after_source_roles/`
#######################################################################################
end
desc 'time rake tw:project_import:sf_import:start:create_misc_ref_info user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Extracts several SF.RefID lookups from tblRefs/sfRefIDsByFileID and persists
# them in the 'SpeciesFileData' Import record:
#   RefIDEditorArray     - RefIDs whose Flags bit 2 is set (author acted as editor)
#   RefContainingRefHash - RefID => ContainingRefID (only for contained refs)
#   RefIDPubIDHash       - RefID => PubID
#   RefIDsByFileID       - RefID => FileID
LoggedTask.define create_misc_ref_info: [:data_directory, :environment, :user_id] do |logger|
  logger.info 'Running create_misc_ref_info...'
  import = Import.find_or_create_by(name: 'SpeciesFileData')
  skipped_file_ids = import.get('SkippedFileIDs')

  editor_ref_ids = []
  containing_ref_by_ref_id = {} # key = RefID, value = ContainingRefID
  pub_id_by_ref_id = {}         # key = RefID, value = PubID
  file_id_by_ref_id = {}        # key = SF.RefID, value = SF.FileID

  CSV.foreach(@args[:data_directory] + 'tblRefs.txt', col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8') do |row|
    next if skipped_file_ids.include? row['FileID'].to_i
    sf_ref_id = row['RefID']
    containing_ref_id = row['ContainingRefID']
    logger.info "working with SF.RefID = #{sf_ref_id}, SF.ContainingRefID = #{containing_ref_id}, flags = #{row['Flags']} \n"
    editor_ref_ids << sf_ref_id if row['Flags'].to_i & 2 == 2 # is_editor
    containing_ref_by_ref_id[sf_ref_id] = containing_ref_id unless containing_ref_id == '0'
    pub_id_by_ref_id[sf_ref_id] = row['PubID']
  end

  CSV.foreach(@args[:data_directory] + 'sfRefIDsByFileID.txt', col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8') do |row|
    file_id_by_ref_id[row['RefID']] = row['FileID']
  end

  import.set('RefIDEditorArray', editor_ref_ids)
  import.set('RefContainingRefHash', containing_ref_by_ref_id)
  import.set('RefIDPubIDHash', pub_id_by_ref_id)
  import.set('RefIDsByFileID', file_id_by_ref_id)

  puts 'RefIDEditorArray'
  ap editor_ref_ids
  puts 'RefContainingRefHash'
  ap containing_ref_by_ref_id
  puts 'RefIDPubIDHash'
  ap pub_id_by_ref_id
  puts 'RefIDsByFileID'
  ap file_id_by_ref_id
end
desc 'time rake tw:project_import:sf_import:start:create_sources user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Creates TW Source::Bibtex records (plus a ProjectSource per source) from SF
# tblRefs, in two passes over the same CSV:
#   Pass 1: refs with ContainingRefID == 0 (standalone refs and containing books).
#   Pass 2: refs with ContainingRefID > 0 become 'inbook' sources copying most
#     bibliographic fields from their containing book source (created in pass 1).
# Persists SFRefIDToTWSourceID and TWSourceIDToContainingSourceID for later tasks.
LoggedTask.define create_sources: [:data_directory, :environment, :user_id] do |logger|
# @todo: See :create_sf_book_hash and :update_sources_with_book_info above. Should be incorporated here.
logger.info 'Running create_sources...'
=begin Old logic
# tblRefs columns to import: Title, PubID, Series, Volume, Issue, RefPages, ActualYear, StatedYear, LinkID, LastUpdate, ModifiedBy, CreatedOn, CreatedBy
# tblRefs other columns: RefID => Source.identifier, FileID => used when creating ProjectSources, ContainingRefID => sfVerbatimRefs contains full
# RefStrings attached as data_attributes in ProjectSources (no need for ContainingRefID), AccessCode => n/a, Flags => identifies editor
# (use when creating roles and generating author string from tblRefAuthors), Note => attach to ProjectSources, CiteDataStatus => can be derived,
# Verbatim => not used
=end
import = Import.find_or_create_by(name: 'SpeciesFileData')
skipped_file_ids = import.get('SkippedFileIDs')
get_tw_user_id = import.get('SFFileUserIDToTWUserID') # for housekeeping
get_tw_serial_id = import.get('SFPubIDToTWSerialID') # for FK
get_sf_ref_link = import.get('RefIDToRefLink') # key is SF.RefID, value is URL string
get_sf_verbatim_ref = import.get('RefIDToVerbatimRef') # key is SF.RefID, value is verbatim string
get_tw_project_id = import.get('SFFileIDToTWProjectID')
get_sf_booktitle_publisher_address = import.get('SFPubIDTitlePublisherAddress') # key = SF.PubID, value = hash of booktitle, publisher, address
get_sf_pub_type_string = import.get('SFPubIDToPubTypeString')
get_contained_cite_aux_data = import.get('SFContainedCiteAuxData')
get_tw_source_id = {} # key = SF.RefID, value = TW.source_id
get_containing_source_id = {} # key = TW.contained_source_id, value = TW.containing_source_id # use for containing auths/eds
# byebug
# Namespace for Identifier
# source_namespace = Namespace.find_or_create_by(institution: 'Species File', name: 'tblRefs', short_name: 'SF RefID')
count_found = 0
error_counter = 0
contained_error_counter = 0
source_not_found_error = 0
path = @args[:data_directory] + 'tblRefs.txt'
file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
##### First Ref loop: Create sources for SF.tblRefs.ContainingRefID = 0
file.each_with_index do |row, i|
# break if i == 20
next if skipped_file_ids.include? row['FileID'].to_i
next if row['ContainingRefID'].to_i > 0 # Create source in second loop
# Skip refs with no usable bibliographic data at all, and refs with AccessCode 4.
# NOTE(review): assumes these CSV fields are never nil (nil would raise on .empty?) -- confirm export format.
next if (row['Title'].empty? and row['PubID'] == '0' and row['Series'].empty? and row['Volume'].empty? and row['Issue'].empty? and row['ActualYear'].empty? and row['StatedYear'].empty?) or row['AccessCode'] == '4'
ref_id = row['RefID']
logger.info "working with SF.RefID = #{ref_id}, SF.FileID = #{row['FileID']} (count = #{count_found += 1}) \n"
pub_id = row['PubID']
pub_type_string = get_sf_pub_type_string[pub_id]
booktitle = nil
publisher = nil
address = nil
if get_sf_booktitle_publisher_address[pub_id]
booktitle = get_sf_booktitle_publisher_address[pub_id]['booktitle']
publisher = get_sf_booktitle_publisher_address[pub_id]['publisher']
address = get_sf_booktitle_publisher_address[pub_id]['address']
end
# if year range, select min year (record full verbatim ref as data attribute after save)
actual_year = row['ActualYear'].split('-').map(&:to_i).min
stated_year = row['StatedYear'].split('-').map(&:to_i).min
source = Source::Bibtex.new(
bibtex_type: pub_type_string,
title: row['Title'],
booktitle: booktitle.blank? ? nil : booktitle,
publisher: publisher.blank? ? nil : publisher,
address: address.blank? ? nil : address,
serial_id: get_tw_serial_id[row['PubID']],
series: row['Series'],
volume: row['Volume'],
number: row['Issue'],
pages: row['RefPages'],
year: actual_year,
stated_year: stated_year,
url: row['LinkID'].to_i > 0 ? get_sf_ref_link[ref_id] : nil,
created_at: row['CreatedOn'],
updated_at: row['LastUpdate'],
created_by_id: get_tw_user_id[row['CreatedBy']],
updated_by_id: get_tw_user_id[row['ModifiedBy']]
)
# end
begin
source.save!
# Year ranges and unpublished refs keep the full verbatim string as an ImportAttribute.
if row['ActualYear'].include?('-') or row['StatedYear'].include?('-')
source.data_attributes << ImportAttribute.new(import_predicate: 'SF verbatim reference for year range', value: get_sf_verbatim_ref[ref_id])
end
if pub_type_string == 'unpublished'
source.data_attributes << ImportAttribute.new(import_predicate: 'SF verbatim reference for unpublished reference', value: get_sf_verbatim_ref[ref_id])
end
source_id = source.id.to_s
get_tw_source_id[ref_id] = source_id
ProjectSource.create!(
project_id: get_tw_project_id[row['FileID']],
source_id: source.id,
created_at: row['CreatedOn'],
updated_at: row['LastUpdate'],
created_by_id: get_tw_user_id[row['CreatedBy']],
updated_by_id: get_tw_user_id[row['ModifiedBy']]
)
rescue ActiveRecord::RecordInvalid
logger.info "Source (ContainingRefID = 0) ERROR (#{error_counter += 1}): " + source.errors.full_messages.join(';')
# @todo Not found: Slater, J.A. Date unknown. A Catalogue of the Lygaeidae of the world. << RefID = 44058, PubID = 21898
end
end
##### Second Ref loop: Create sources for SF.tblRefs.ContainingRefID > 0
# skip if get_contained_cite_aux_data[sf_ref_id] -- do not create this source stub
file.each_with_index do |row, i|
next if row['ContainingRefID'].to_i == 0 # Creating only contained references in this pass
next if get_contained_cite_aux_data[row['RefID']]
ref_id = row['RefID']
containing_ref_id = row['ContainingRefID']
containing_source_id = get_tw_source_id[containing_ref_id]
logger.info "working with contained SF.RefID = #{ref_id}, SF.ContainingRefID = #{containing_ref_id}, tw.containing_source_id = #{containing_source_id}, SF.FileID = #{row['FileID']} (count = #{count_found += 1}) \n"
begin
containing_source = Source.find(containing_source_id)
rescue ActiveRecord::RecordInvalid, ActiveRecord::RecordNotFound
logger.error "Source ERROR: containing source not found for RefID = #{containing_source_id} (source not found = #{source_not_found_error += 1})"
next
end
# Contained sources are only created inside books; anything else is logged and skipped.
if containing_source.bibtex_type == 'book'
pub_type_string = 'inbook'
else
logger.error "Source ERROR: containing source bibtex_type is not 'book', SF.RefID = #{ref_id}, SF.ContainingRefID = #{containing_ref_id}, TW.containing_source_id = #{containing_source_id}"
next
# pub_type_string = 'misc' # per Matt, parent source is 'article'
end
source = Source::Bibtex.new(
bibtex_type: pub_type_string,
title: row['Title'],
booktitle: containing_source.booktitle,
publisher: containing_source.publisher,
address: containing_source.address,
serial_id: containing_source.serial_id,
series: containing_source.series,
volume: containing_source.volume,
number: containing_source.number,
pages: row['RefPages'],
year: containing_source.year,
stated_year: containing_source.stated_year,
url: row['LinkID'].to_i > 0 ? get_sf_ref_link[ref_id] : nil,
created_at: row['CreatedOn'],
updated_at: row['LastUpdate'],
created_by_id: get_tw_user_id[row['CreatedBy']],
updated_by_id: get_tw_user_id[row['ModifiedBy']]
)
begin
source.save!
source_id = source.id.to_s
get_tw_source_id[ref_id] = source_id
get_containing_source_id[source_id] = containing_source_id
# Also keep db record of containing_source_id for future reference
source.data_attributes << ImportAttribute.new(import_predicate: 'containing_source_id', value: containing_source_id)
rescue ActiveRecord::RecordInvalid
logger.info "Source (Containing_ref_id > 0) ERROR (#{contained_error_counter += 1}): " + source.errors.full_messages.join(';')
end
end
import.set('SFRefIDToTWSourceID', get_tw_source_id)
import.set('TWSourceIDToContainingSourceID', get_containing_source_id)
puts 'SFRefIDToTWSourceID'
ap get_tw_source_id
puts 'TWSourceIDToContainingSourceID'
ap get_containing_source_id
#######################################################################################
`rake tw:db:dump backup_directory=/Users/mbeckman/src/db_backup/4_after_create_sources/`
puts 'dump created'
#######################################################################################
end
desc 'time rake tw:project_import:sf_import:start:contained_cite_aux_data user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Reads sfContainedCiteAuxData.txt and stores, per SF.RefID, the auxiliary data
# (ContainingRefID, RefPages, Note, LinkID) for refs contained in articles.
# No Source records are created for these refs; the data will feed a citation
# note, with the containing ref as the actual record for the citation.
LoggedTask.define contained_cite_aux_data: [:data_directory, :environment, :user_id] do |logger|
  logger.info 'Creating SF contained cite aux data...'
  aux_data_by_ref_id = {} # key = SF.RefID, value = ContainingRefID, RefPages, Note, LinkID
  rows = CSV.read(@args[:data_directory] + 'sfContainedCiteAuxData.txt', col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
  rows.each do |row|
    sf_ref_id = row['RefID']
    logger.info "Working with SF.RefID = '#{sf_ref_id}' \n"
    aux_data_by_ref_id[sf_ref_id] = {containing_ref_id: row['ContainingRefID'], ref_pages: row['RefPages'], note: row['Note'], link_id: row['LinkID']}
  end
  Import.find_or_create_by(name: 'SpeciesFileData').set('SFContainedCiteAuxData', aux_data_by_ref_id)
  puts 'SFContainedCiteAuxData'
  ap aux_data_by_ref_id
end
desc 'time rake tw:project_import:sf_import:start:map_pub_type user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Maps each SF.PubID to a BibTeX publication-type string and persists the
# lookup in the 'SpeciesFileData' Import record under 'SFPubIDToPubTypeString'.
LoggedTask.define map_pub_type: [:data_directory, :environment, :user_id] do |logger|
  # Can be run independently at any time
  logger.info 'Running map_pub_types...'
  import = Import.find_or_create_by(name: 'SpeciesFileData')
  skipped_file_ids = import.get('SkippedFileIDs')

  # SF tblPubs.PubType codes mapped to BibTeX types; any other code is flagged
  # with '**ERROR**' so unmapped publication types remain visible downstream.
  pub_type_map = {'1' => 'article', '3' => 'book', '4' => 'unpublished'}.freeze

  get_sf_pub_type_string = {} # key = SF.PubID, value = BibTeX type string
  path = @args[:data_directory] + 'tblPubs.txt'
  # Was `each_with_index do |row|`, which silently discarded the never-bound
  # index; a plain row iteration is equivalent and not misleading.
  CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8') do |row|
    next if skipped_file_ids.include? row['FileID'].to_i
    get_sf_pub_type_string[row['PubID']] = pub_type_map.fetch(row['PubType'], '**ERROR**')
  end

  import.set('SFPubIDToPubTypeString', get_sf_pub_type_string)
  puts 'SFPubIDToPubTypeString'
  ap get_sf_pub_type_string
  #######################################################################################
  `rake tw:db:dump backup_directory=/Users/mbeckman/src/db_backup/3_after_pub_type`
  #######################################################################################
end
desc 'time rake tw:project_import:sf_import:start:create_sf_book_hash user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# For every SF book publication (tblPubs.PubType == '3'), records booktitle,
# publisher and address keyed by SF.PubID, and persists the hash under
# 'SFPubIDTitlePublisherAddress'.
LoggedTask.define create_sf_book_hash: [:data_directory, :environment, :user_id] do |logger|
  # Can be run independently at any time
  logger.info 'Running create_sf_book_hash...'
  import = Import.find_or_create_by(name: 'SpeciesFileData')
  skipped_file_ids = import.get('SkippedFileIDs')
  book_info_by_pub_id = {} # key = SF.PubID, value = booktitle, publisher, and address from tblPubs
  CSV.foreach(@args[:data_directory] + 'tblPubs.txt', col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8') do |row|
    next if skipped_file_ids.include? row['FileID'].to_i
    next unless row['PubType'] == '3' # book
    logger.info "working with PubID #{row['PubID']}"
    book_info_by_pub_id[row['PubID']] = {booktitle: row['ShortName'], publisher: row['Publisher'], address: row['PlacePublished']}
  end
  import.set('SFPubIDTitlePublisherAddress', book_info_by_pub_id)
  puts 'SFPubIDTitlePublisherAddress'
  ap book_info_by_pub_id
end
desc 'time rake tw:project_import:sf_import:start:create_projects user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Creates one TW Project per non-skipped SF file (tblFiles) and persists the
# SF.FileID => TW.project_id lookup under 'SFFileIDToTWProjectID'.
LoggedTask.define create_projects: [:data_directory, :environment, :user_id] do |logger|
  logger.info 'Running create_projects...'
  get_tw_project_id = {} # key = SF.FileID, value = TW.project_id
  # create mb as project member for each project -- comment out for Sandbox
  user = User.find_by_email('mbeckman@illinois.edu')
  $user_id = user.id # not sure if this is really needed?
  import = Import.find_or_create_by(name: 'SpeciesFileData')
  skipped_file_ids = import.get('SkippedFileIDs')
  # BUGFIX: error_counter was referenced in the failure branch but never
  # initialized, so the first failed save raised NoMethodError instead of logging.
  error_counter = 0
  path = @args[:data_directory] + 'tblFiles.txt'
  file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
  file.each_with_index do |row, i|
    file_id = row['FileID']
    next if file_id == '0'
    next if skipped_file_ids.include? file_id.to_i
    website_name = row['WebsiteName'].downcase # want to be lower case
    project = Project.new(
      name: "#{website_name}_species_file(#{Time.now})"
    )
    if project.save
      get_tw_project_id[file_id] = project.id.to_s
      # comment out project_member for Sandbox use
      ProjectMember.create!(user_id: user.id, project: project, is_project_administrator: true)
    else
      # BUGFIX: was `source.errors` -- `source` is undefined in this task; the
      # failed record is `project`.
      logger.info "ERROR (#{error_counter += 1}): " + project.errors.full_messages.join(';')
      logger.info "FileID: #{file_id}, sf row created by: #{row['CreatedBy']}, sf row updated by: #{row['ModifiedBy']} "
    end
  end
  import.set('SFFileIDToTWProjectID', get_tw_project_id)
  puts 'SFFileIDToTWProjectID'
  ap get_tw_project_id
end
desc 'time rake tw:project_import:sf_import:start:list_verbatim_refs user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Builds the SF.RefID => verbatim reference string lookup from
# sfVerbatimRefs.txt and persists it under 'RefIDToVerbatimRef'.
LoggedTask.define list_verbatim_refs: [:data_directory, :environment, :user_id] do |logger|
  # Can be run independently at any time before referenced
  logger.info 'Running list_verbatim_refs...'
  rows = CSV.read(@args[:data_directory] + 'sfVerbatimRefs.txt', col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
  # key = SF.RefID, value = SF verbatim ref (table generated from a script)
  verbatim_by_ref_id = rows.each_with_object({}) do |row, memo|
    logger.info "working with #{row['RefID']} \n"
    memo[row['RefID']] = row['RefString']
  end
  import = Import.find_or_create_by(name: 'SpeciesFileData')
  import.set('RefIDToVerbatimRef', verbatim_by_ref_id)
  puts 'RefIDToVerbatimRef'
  ap verbatim_by_ref_id
  #######################################################################################
  `rake tw:db:dump backup_directory=/Users/mbeckman/src/db_backup/2_after_verbatim_refs/`
  #######################################################################################
end
desc 'time rake tw:project_import:sf_import:start:map_ref_links user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Builds the SF.RefID => ref-link URL lookup from sfRefLinks.txt and persists
# it under 'RefIDToRefLink'.
LoggedTask.define map_ref_links: [:data_directory, :environment, :user_id] do |logger|
  # Can be run independently at any time before referenced
  logger.info 'Running map_ref_links...'
  rows = CSV.read(@args[:data_directory] + 'sfRefLinks.txt', col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
  # key = SF.RefID, value = SF ref link (table generated from a script)
  ref_link_by_ref_id = rows.each_with_object({}) do |row, memo|
    logger.info "working with #{row['RefID']} \n"
    memo[row['RefID']] = row['RefLink']
  end
  import = Import.find_or_create_by(name: 'SpeciesFileData')
  import.set('RefIDToRefLink', ref_link_by_ref_id)
  puts 'RefIDToRefLink'
  ap ref_link_by_ref_id
end
=begin obsolete
# :create_no_ref_list_array is now created on the fly in :create_sources (data conflicts)
# desc 'make array from no_ref_list'
# task :create_no_ref_list_array => [:data_directory, :environment, :user_id] do
# ### rake tw:project_import:sf_start:create_no_ref_list_array user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/
# sf_no_ref_list = []
#
# path = @args[:data_directory] + 'direct_from_sf/no_ref_list.txt'
# file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-8')
#
# file.each do |row|
# sf_no_ref_list.push(row[0])
# end
#
# i = Import.find_or_create_by(name: 'SpeciesFileData')
# i.set('SFNoRefList', sf_no_ref_list)
#
# puts 'SF no_ref_list'
# ap sf_no_ref_list
#
# end
=end
desc 'time rake tw:project_import:sf_import:start:map_serials user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Builds the SF.PubID => TW.serial_id lookup from the 'SF ID' DataAttributes
# placed on Serials by the serials import, and persists it under
# 'SFPubIDToTWSerialID'.
LoggedTask.define map_serials: [:environment, :user_id] do |logger|
  # Can be run independently at any time before referenced: Why can't the value be cast as string??
  logger.info 'Running map_serials...'
  serial_sf_ids = DataAttribute.where(import_predicate: 'SF ID', attribute_subject_type: 'Serial')
  sf_pub_id_to_serial_id = serial_sf_ids.pluck(:value, :attribute_subject_id).to_h
  import = Import.find_or_create_by(name: 'SpeciesFileData')
  import.set('SFPubIDToTWSerialID', sf_pub_id_to_serial_id)
  puts 'SFPubIDToTWSerialID'
  ap sf_pub_id_to_serial_id
  #######################################################################################
  `rake tw:db:dump backup_directory=/Users/mbeckman/src/db_backup/1_after_serials/`
  #######################################################################################
end
desc 'time rake tw:project_import:sf_import:start:create_people user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Creates TW Person::Vetted records from SF tblPeople in two loops:
#   Loop 1: rows with PrefID == 0 (preferred spellings) become Person records,
#     each annotated with FileID/Role ImportAttributes and an SF PersonID
#     Identifier::Local::Import.
#   Loop 2: rows with PrefID > 0 become AlternateValue::AlternateSpelling
#     records on the preferred person's last_name.
# Persists SFPersonIDToTWPersonID for later tasks.
LoggedTask.define create_people: [:data_directory, :environment, :user_id] do |logger|
logger.info 'Running create_people...'
=begin Thinking through logic:
# Two loops:
# Loop # 1
# loop through entire table (22004 entries)
# process only those rows where row['PrefID'] == 0
# create person, how to assign original housekeeping (save hashes from create_users)?
# save person, validate, etc.
# save PersonID as identifier or data_attribute or ?? << probably identifier/local import identifier
# save Role (bitmap) as data_attribute (?) for later role assignment; Role & 256 = 256 should indicate name is deletable, but it is often incorrectly set!
# make SF.PersonID and TW.person.id hash (for processing in second loop)
#
# Loop # 2
# loop through entire table
# process only those rows where row['PrefID'] > 0
# identify tw.person.id via row['PrefID'] in hash
# create alternate_value for tw.person.id using last_name only
# tblPeople: PersonID, FileID, PrefID, [PersonRegID], FamilyName, GivenName, GivenInitials, Suffix, *Role*, [Status], LastUpdate, ModifiedBy, CreatedOn, CreatedBy
# Identifiers: PersonID; DataAttributes: FileID, Role; Do not import: PersonRegID; GivenName/GivenInitials: If GN is blank, use GI
#
# People: id, type, last_name, first_name, created_at, updated_at, suffix, prefix, created_by_id, updated_by_id, cached
# @project = Project.find_by_name('Orthoptera Species File')
# $project_id = @project.id
=end
import = Import.find_or_create_by(name: 'SpeciesFileData')
get_tw_user_id = import.get('SFFileUserIDToTWUserID') # for housekeeping
get_tw_person_id ||= {} # key = SF.PersonID, value = TW.person_id; make empty hash if doesn't exist (otherwise it would be nil), used in loop 2
# create Namespace for Identifier (used in loop below): Species File, tblPeople, SF PersonID
# 'Key3' => Namespace.find_or_create_by(name: '3i_Source_ID', short_name: '3i_Source_ID') # 'Key3' was key in hash @data.keywords.merge! ??
# auth_user_namespace = Namespace.find_or_create_by(institution: 'Species File', name: 'tblAuthUsers', short_name: 'SF AuthUserID')
person_namespace = Namespace.find_or_create_by(institution: 'Species File', name: 'tblPeople', short_name: 'SF PersonID')
# No longer using InternalAttribute for following import values; using ImportAttribute instead since it doesn't require a project_id
# file_id = Predicate.find_or_create_by(name: 'FileID', definition: 'SpeciesFile.FileID', project_id: $project_id)
# person_roles = Predicate.find_or_create_by(name: 'Roles', definition: 'Bitmap of person roles', project_id: $project_id)
# example of internal attr:
# person.data_attributes << InternalAttribute.new(predicate: person_roles, value: row['Role'])
# person.identifiers.new(type: 'Identifier::Local::Import', namespace: person_namespace, identifier: sf_person_id)
# # probably only writes to memory, to save in db, use <<
path = @args[:data_directory] + 'tblPeople.txt'
file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
# loop 1: Get preferred records only
person_error_counter = 0
file.each_with_index do |row, i|
sf_person_id = row['PersonID']
# next if get_tw_person_id[sf_person_id] # do not create if already exists
pref_id = row['PrefID']
next if pref_id.to_i > 0 # alternate spellings will be handled in second loop
logger.info "working with SF.PersonID: #{sf_person_id} \n"
person = Person::Vetted.new(
# type: 'Person_Vetted',
last_name: row['FamilyName'],
first_name: row['GivenNames'].blank? ? row['GivenInitials'] : row['GivenNames'],
created_at: row['CreatedOn'],
updated_at: row['LastUpdate'],
suffix: row['Suffix'],
# prefix: '',
created_by_id: get_tw_user_id[row['CreatedBy']],
updated_by_id: get_tw_user_id[row['ModifiedBy']]
# cached: '?'
)
begin
person.save!
# Annotations are attached only after a successful save; a failed save skips them.
person.data_attributes << ImportAttribute.new(import_predicate: 'FileID', value: row['FileID'])
person.data_attributes << ImportAttribute.new(import_predicate: 'Role', value: row['Role'])
person.identifiers << Identifier::Local::Import.new(namespace: person_namespace, identifier: sf_person_id)
get_tw_person_id[sf_person_id] = person.id.to_s
rescue ActiveRecord::RecordInvalid
logger.info "Person ERROR (#{person_error_counter += 1}): " + person.errors.full_messages.join(';')
end
end
import.set('SFPersonIDToTWPersonID', get_tw_person_id) # write to db
logger.info 'SFPersonIDToTWPersonID'
ap get_tw_person_id
# loop 2: Get non-preferred records and save as alternate values
added_counter = 0
error_counter = 0
file.each_with_index do |row, i| # uses path & file from loop 1
pref_id = row['PrefID']
next if pref_id.to_i == 0 # handle only non-preferred records
non_pref_family_name = row['FamilyName'] # use the non-preferred person's family name as default alternate name
if get_tw_person_id[pref_id]
puts "working with SF.PrefID: #{pref_id} (from SF.PersonID: #{row['PersonID']}), TW.person_id: #{get_tw_person_id[pref_id]}"
# pref_person.alternate_values.new(value: non_pref_family_name, type: 'AlternateValue::AlternateSpelling', alternate_value_object_attribute: 'last_name')
a = AlternateValue::AlternateSpelling.new(
alternate_value_object_type: 'Person',
alternate_value_object_id: get_tw_person_id[pref_id],
value: non_pref_family_name,
alternate_value_object_attribute: 'last_name'
)
begin
a.save!
logger.info "Attribute added (#{added_counter += 1})"
rescue ActiveRecord::RecordInvalid
logger.info "Attribute ERROR (#{error_counter += 1}): invalid attribute -- " + a.errors.full_messages.join(';')
end
end
end
logger.info "person_error_counter = #{person_error_counter}, added_counter = #{added_counter}, error_counter = #{error_counter}"
end
desc 'time rake tw:project_import:sf_import:start:create_users user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Creates TW User records from SF authorized users (tblAuthUsers) and maps the
# per-file SF FileUserIDs (tblFileUsers) onto the single TW user per AuthUserID.
# Resulting FileUserID => user mappings are stored in the Import table for later tasks.
LoggedTask.define create_users: [:data_directory, :environment, :user_id] do |logger|
  logger.info 'Running create_users...'
=begin
Most logic for authorized users (one login per user) vs. file users (same authorized user for Orthoptera, Plecoptera, and Phasmida but each is different in each SF)
# ProjectMembers: id, project_id, user_id, created_at, updated_at, created_by_id, updated_by_id, is_project_administrator
# * Cannot annotate a project_member
# Users: id, email, password_digest, created_at, updated_at, remember_token, created_by_id, updated_by_id, is_administrator,
# password_reset_token, password_reset_token_date, name, current_sign_in_at, last_sign_in_at, current_sign_in_ip, last_sign_in_ip,
# hub_tab_order, api_access_token, is_flagged_for_password_reset, footprints, sign_in_count,
# * Annotations: FullName, TaxaShowSpecs, CiteShowSpecs, SpmnShowSpecs
# * Since no annotations for project_member, could add notes to Users for (Access, LastLogin, NumLogins, LastEdit, NumEdits) by SF
# tblFileUsers: FileUserID, AuthUserID, FileID, Access, LastLogin, NumLogins, LastEdit, NumEdits, CreatedOn, CreatedBy
# tblAuthUsers: AuthUserID, Name, HashedPassword, FullName, TaxaShowSpecs, CiteShowSpecs, SpmnShowSpecs, LastUpdate, ModifiedBy,
# CreatedOn, CreatedBy
# Fields for potential data attributes
# AuthUserID
# create a ControlledVocabularyTerm of type Predicate (to be used in DataAttribute in User instance below)
# predicates = {
# 'AuthUserID' => Predicate.find_or_create_by(name: 'AuthUserID', definition: 'Unique user name id', project_id: $project_id)
# }
# Now that User is identifiable, we can use an identifier for the unique AuthUserID (vs. FileUserID)
# Create Namespace for Identifier: Species File, tblAuthUsers, SF AuthUserID
# 'Key3' => Namespace.find_or_create_by(name: '3i_Source_ID', short_name: '3i_Source_ID') # 'Key3' was key in hash @data.keywords.merge! in 3i.rake ??
=end
  auth_user_namespace = Namespace.find_or_create_by(institution: 'Species File', name: 'tblAuthUsers', short_name: 'SF AuthUserID')

  import = Import.find_or_create_by(name: 'SpeciesFileData')
  skipped_file_ids = import.get('SkippedFileIDs')

  # Find unique editors/admin, i.e. people getting user accounts in TW.
  unique_auth_users = {} # key = sf.auth_user_id, value = array of sf.file_user_ids; not stored in Import, used only in this task
  sf_file_user_id_to_sf_auth_user_id = {} # not stored in Import; multiple file users map onto same auth user
  get_tw_user_id = {} # key = sf.file_user_id, value = tw.user_id (as string)
  get_sf_file_id = {} # key = sf.file_user_id, value = sf.file_id; for future use when creating projects and project members
  @user_index = {} # key = sf.file_user_id, value = tw user.id (integer)
  project_url = 'speciesfile.org'

  # Pass 1: tblFileUsers — group FileUserIDs by AuthUserID and remember each one's FileID.
  path = @args[:data_directory] + 'tblFileUsers.txt'
  file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
  file.each_with_index do |row, i|
    next if skipped_file_ids.include? row['FileID'].to_i
    au_id = row['AuthUserID']
    fu_id = row['FileUserID']
    # next if [0, 8].freeze.include?(row['Access'].to_i)
    # Access == 8 is skipped; Access == 0 is kept: in some cases, user access has been
    # rescinded after user edited something; keep this user, if no name, use NoName_1, 2, 3, etc.
    next if row['Access'].to_i == 8
    logger.info "WARNING - NON UNIQUE FileUserID: #{fu_id}" if sf_file_user_id_to_sf_auth_user_id[fu_id]
    sf_file_user_id_to_sf_auth_user_id[fu_id] = au_id
    (unique_auth_users[au_id] ||= []) << fu_id
    get_sf_file_id[fu_id] = row['FileID']
  end

  # Pass 2: tblAuthUsers — create one TW User per auth user that has edit+ access.
  path = @args[:data_directory] + 'tblAuthUsers.txt'
  logger.info "Creating users\n"
  raise "file #{path} not found" unless File.exist?(path)
  file = CSV.foreach(path, col_sep: "\t", headers: true, encoding: 'UTF-16:UTF-8')
  error_counter = 0
  no_name_counter = 0
  file.each_with_index do |row, i|
    au_id = row['AuthUserID']
    logger.info "working with AuthUser: #{au_id}"
    user_name = row['Name']
    user_name = "NoName_#{no_name_counter += 1}" if user_name.blank?
    if unique_auth_users[au_id]
      logger.info "is a unique user, creating: #{i}: #{user_name}"
      user = User.new(
        name: user_name,
        password: '12345678',
        email: 'auth_user_id' + au_id.to_s + '_random' + rand(1000).to_s + '@' + project_url
      )
      if user.save
        unique_auth_users[au_id].each do |fu_id|
          get_tw_user_id[fu_id] = user.id.to_s # @ Making user.id into string for consistency of all hash values being strings
          # BUG FIX: previously `@user_index[row['FileUserID']] = user.id` ran once outside
          # this loop, but tblAuthUsers rows have no FileUserID column (see column list above),
          # so every user was filed under the nil key. Map each FileUserID here instead.
          @user_index[fu_id] = user.id # maps multiple FileUserIDs onto single TW user.id
        end
        # create AuthUserID as DataAttribute as InternalAttribute for table users
        # user.data_attributes << InternalAttribute.new(predicate: predicates['AuthUserID'], value: au_id)
        # Now using an identifier for this:
        # NOTE(review): identifiers.new builds but does not persist the identifier on the
        # already-saved user — confirm a later save occurs, or consider identifiers.create.
        user.identifiers.new(type: 'Identifier::Local::Import', namespace: auth_user_namespace, identifier: au_id)
        # Do not create project_members right now; store hash of file_user_id => file_id in Import table
        # ProjectMember.create(user: user, project: @project)
      else
        logger.info "User ERROR (#{error_counter += 1}): " + user.errors.full_messages.join(';')
      end
    else
      logger.info " skipping, public access only\n"
    end
  end

  # Save the file user mappings to the import table for use by subsequent tasks.
  import.set('SFFileUserIDToTWUserID', get_tw_user_id)
  import.set('SFFileUserIDToSFFileID', get_sf_file_id) # will be used when tables containing FileID are imported

  # display user mappings
  puts 'unique authorized users with edit+ access'
  ap unique_auth_users # list of unique authorized users (who may or may not currently have edit+ access via FileUserIDs)
  puts 'multiple FileUserIDs mapped to single AuthUserID'
  ap sf_file_user_id_to_sf_auth_user_id # map multiple FileUserIDs onto single AuthUserID
  puts 'SFFileUserIDToTWUserID'
  ap get_tw_user_id # map multiple FileUserIDs on single TW user.id
  puts 'SFFileUserIDToSFFileID'
  ap get_sf_file_id
end
desc 'time rake tw:project_import:sf_import:start:list_skipped_file_ids user_id=1 data_directory=/Users/mbeckman/src/onedb2tw/working/'
# Records the hard-coded list of SF FileIDs excluded from this import under the
# 'SkippedFileIDs' key of the SpeciesFileData Import, for other tasks to consult.
LoggedTask.define list_skipped_file_ids: [:data_directory, :environment, :user_id] do |logger|
  logger.info 'Running list_skipped_file_ids...'
  # FileID => taxon covered by that Species File (excluded from the import).
  excluded_files = {
    9 => 'Lepidoptera',
    24 => 'Collembola',
    48 => 'Rhyparochromidae',
    54 => 'Heteroptera',
    56 => 'Membracoidea',
    66 => 'Odonata',
    70 => 'Tortricidae',
    77 => 'Erebidae',
    78 => 'Melanoplus',
    80 => 'Pyrgomorphidae',
    81 => 'Ommexechidae',
    82 => 'Carabidae',
    83 => 'Cicadoidea',
    84 => 'Psychodidae',
    85 => 'Megaloptera',
    86 => 'Scutelleridae',
    88 => 'Praxibulini',
    89 => 'Prostoia',
    92 => 'Dysoniini'
  }
  skipped_file_ids = excluded_files.keys
  import = Import.find_or_create_by(name: 'SpeciesFileData')
  import.set('SkippedFileIDs', skipped_file_ids)
  puts 'SkippedFileIDs'
  ap skipped_file_ids
end
end
end
end
end