lib/tasks/development/data/geo/build_geographic_areas.rake
namespace :tw do
namespace :development do
namespace :data do
namespace :geo do
desc 'Rebuilds the closure_tree indexing on GeographicAreas.'
task rebuild_geographic_areas_nesting: [:environment, :geo_dev_init] do
  # Print a start timestamp so the duration of the rebuild can be read off the console.
  puts "\n\n#{Time.now.strftime('%H:%M:%S')}."

  # When no current user is set, use the lowest-id User, creating a placeholder one if none exists.
  if Current.user_id.nil?
    # FactoryBot is not allowed in rake tasks
    rake_user = User.order(:id).first ||
      User.create!(id: 1, email: 'test@example.com', name: 'Rake user', password: '12345678', self_created: true)
    Current.user_id = rake_user.id
  end

  GeographicArea.rebuild!

  # End timestamp.
  puts "\n\n#{Time.now.strftime('%H:%M:%S')}."
end
# !! It is recommended that you start from scratch prior to running this.
# rake db:drop RAILS_ENV=development && rake db:create RAILS_ENV=development && rake db:migrate RAILS_ENV=development && rake db:seed RAILS_ENV=development
#
desc "Builds GeographicAreas, simultaneously builds GeographicAreaTypes if needed and stubs GeographicAreasGeographicItems.\n
rake tw:development:data:geo:build_geographic_areas data_directory=/Users/matt/src/sf/tw/gaz/ user_id=1 database_role=matt NO_GEO_NESTING=1 NO_GEO_VALID=1"
task build_geographic_areas: [:environment, :geo_dev_init, :data_directory, :user_id, :build_temporary_shapefile_tables] do
  @connection = ApplicationRecord.connection

  # make sure the earth record exists and is available
  @earth = build_earth
  @data_index = GeoAreasIndex.new(@earth)

  # Index the country-code list first, then one build_areas_from_<key> method per IMPORT_TABLES key
  # (IMPORT_TABLES is defined outside this task).
  build_areas_from_country_codes
  IMPORT_TABLES.each_key { |source_table| send("build_areas_from_#{source_table}") }

  # Add the missing ancestor entries, then build and save the records.
  @data_index.build_internal_nodes
  @data_index.create_areas

  puts "Collisions: #{@data_index.collision_count}"
  puts "Total geographic areas created: #{@data_index.index.count}"
end
# Returns the first GeographicArea named 'Earth'; when there is none, creates one whose type is the
# GeographicAreaType named 'Planet' (found or created). Uses create rather than create!, so a failed
# validation is not raised here.
def build_earth
  existing = GeographicArea.where(name: 'Earth').first
  return existing if existing

  planet_type = GeographicAreaType.find_or_create_by(name: 'Planet')
  GeographicArea.create(name: 'Earth', geographic_area_type: planet_type)
end
# An index structure for the data.
#
# Keys are TempArea#index hashes; values are RecordLink instances pairing each TempArea with the
# GeographicArea built for it.
class GeoAreasIndex
  attr_accessor :index, :collision_count, :duplicate_names

  # Seeds the index with a root entry named after +earth+ and links that entry to the given record.
  #
  # @param earth [GeographicArea] the root record
  def initialize(earth)
    @index = {}
    @collision_count = 0
    @duplicate_names = []
    root = add_item(
      name: earth.name,
      parent_names: [],
      source_table: 'SFG',
      is_internal_node: true
    )
    @index[root.index].geographic_area = earth
  end

  # @return [Array<TempArea>] every indexed TempArea
  def names
    @index.values.map(&:temp_area)
  end

  # A setter method, all additions to the index must use this.
  #
  # When an entry with the same index key already exists the addition is counted as a collision:
  # the new TempArea is recorded in duplicate_names and only its shape metadata is merged into the
  # existing entry.
  #
  # @param attribute_hash [Hash] TempArea attributes
  # @return [TempArea] the TempArea built from attribute_hash (on collision this is not the stored one)
  def add_item(attribute_hash)
    candidate = TempArea.new(attribute_hash)
    existing = @index[candidate.index]
    if existing
      @collision_count += 1
      @duplicate_names.push(candidate)
      existing.temp_area.add_shape(attribute_hash)
    else
      @index[candidate.index] = RecordLink.new(temp_area: candidate, geographic_area: nil)
    end
    candidate
  end

  # Internal nodes presently do not build corresponding lvl records! Not sure if this is an issue.
  #
  # Iterates over a snapshot of the current entries (names builds a new Array), so entries added
  # while recursing are not themselves revisited.
  def build_internal_nodes
    names.each do |tmp_geo_area|
      recurse_nodes(tmp_geo_area.parent_names, tmp_geo_area)
    end
  end

  # Recursively builds internal nodes: one entry per ancestor in parent_names, excluding the last
  # name in the list.
  def recurse_nodes(parent_names, source_tmp_geo_area)
    return unless parent_names.count > 1

    # Destructuring copies the list, so the caller's parent_names array is never mutated.
    name, *parents = parent_names
    puts "building internal node: #{name} : #{parents} "
    add_item(
      name: name,
      parent_names: parents,
      source_table: source_tmp_geo_area.source_table,
      source_table_gid: source_tmp_geo_area.source_table_gid,
      is_internal_node: true
    )
    recurse_nodes(parents, source_tmp_geo_area)
  end

  # Builds, links and saves a GeographicArea for every index entry.
  def create_areas
    build_individual_areas
    update_parents
    update_levels
    save_all
  end

  # If you add an attribute, it must be included here.
  #
  # Entries already linked to a persisted record (the root entry linked in #initialize) keep that
  # record; replacing it with a new instance would save a second copy of it.
  def build_individual_areas
    @index.each_value do |n|
      next if n.geographic_area&.persisted?

      area = n.temp_area
      type_name = area.geographic_area_type_name.blank? ? 'Unknown' : area.geographic_area_type_name
      n.geographic_area = GeographicArea.new(
        name: area.name, # Clean name at this point if needed.
        data_origin: area.source_table,
        geographic_area_type: GeographicAreaType.find_or_create_by(name: type_name),
        tdwgID: area.tdwgID,
        iso_3166_a2: area.iso_3166_a2,
        iso_3166_a3: area.iso_3166_a3
      )
    end
  end

  # Points every record at the record of its parent entry.
  def update_parents
    @index.each_value do |n|
      next if n.temp_area.name == 'Earth' # This is a bit ugh, but lets us use fewer exceptions?
      n.geographic_area.parent = @index[n.temp_area.parent_index].geographic_area
    end
  end

  # Sets level0/level1/level2 on every record whose TempArea supplies the matching level index
  # (an empty Hash means "no such level").
  def update_levels
    @index.each_value do |n|
      next if n.temp_area.name == 'Earth'
      l0, l1, l2 = n.temp_area.level0_index, n.temp_area.level1_index, n.temp_area.level2_index
      n.geographic_area.level0 = @index[l0].geographic_area if l0 != {}
      n.geographic_area.level1 = @index[l1].geographic_area if l1 != {}
      n.geographic_area.level2 = @index[l2].geographic_area if l2 != {}
    end
  end

  # Saves every GeographicArea, then creates the GeographicAreasGeographicItem records.
  #
  # @raise [RuntimeError] if any record is still unsaved after the save pass
  def save_all
    puts 'Saving..'
    save_geographic_areas
    unsaved = names_not_saved
    raise "#{unsaved.count} geographic area(s) were not saved" unless unsaved.empty?
    create_geographic_areas_geographic_items
    puts "\n...done.\n"
  end

  # Creates one GeographicAreasGeographicItem per shape stored on each TempArea.
  def create_geographic_areas_geographic_items
    @index.each_value do |n|
      n.temp_area.shapes.each do |data_origin, origin_gid, valid_from, valid_to|
        GeographicAreasGeographicItem.create!(
          geographic_area: n.geographic_area,
          data_origin: data_origin,
          origin_gid: origin_gid,
          date_valid_from: valid_from,
          date_valid_to: valid_to
        )
      end
    end
  end

  def save_geographic_areas
    @index.each_value do |n|
      recursively_save(n.geographic_area)
    end
  end

  # Saves unsaved ancestors first, then the record itself.
  def recursively_save(geographic_area)
    parent = geographic_area.parent
    recursively_save(parent) if parent && parent.new_record?
    print "\r#{geographic_area.name} "
    geographic_area.save!
  end

  # The following methods should not be used to handle/parse incoming data,
  # they are for debugging/reporting only.

  def all_name_strings
    names.map(&:name)
  end

  def duplicate_name_strings
    @duplicate_names.map(&:name).sort.uniq
  end

  def internal_names
    names.select(&:is_internal_node)
  end

  def names_with_multiple_shapes
    names.select { |n| n.shapes.count > 1 }
  end

  # These should be empty Arrays

  def names_not_indexed
    names.reject { |n| @index[n.index] }
  end

  def names_without_parent_arrays
    names.select { |n| n.parent_names.nil? || n.parent_names.count == 0 }
  end

  def names_not_saved
    @index.values.map(&:geographic_area).select(&:new_record?)
  end
end
# An instance links indexed data (a TempArea) to its model representation (a GeographicArea).
class RecordLink
  attr_accessor :temp_area, :geographic_area

  # @param params [Hash] read for the :temp_area and :geographic_area keys; other keys are ignored
  def initialize(params)
    @temp_area, @geographic_area = params[:temp_area], params[:geographic_area]
  end
end
# Instances store attributes of the to be created GeographicArea and related records
class TempArea
  attr_accessor :name, :tdwgID, :iso_3166_a2, :source_table, :iso_3166_a3 # Base attributes, source_table is both GeographicArea#data_origin and GeographicAreasGeographicItem#data_origin
  attr_accessor :lvl0, :lvl1, :lvl2, :parent_names, :geographic_area_type_name # Used to build related records
  attr_accessor :source_table_gid, :shapes, :date_valid_from, :date_valid_to # GeographicAreasGeographicItem attributes
  attr_accessor :is_internal_node # internal nodes have no shapes

  # Assigns every key of attribute_hash through the matching writer, then records the shape metadata.
  #
  # @param attribute_hash [Hash] keys must correspond to the accessors declared above
  # @raise [RuntimeError] when :name is nil or blank, or when a non-internal node lacks shape data
  def initialize(attribute_hash)
    @shapes = []
    # to_s lets a nil name fail with the same error as a blank one instead of a NoMethodError.
    raise 'TempArea requires a non-blank :name' if attribute_hash[:name].to_s.strip.blank?
    attribute_hash.each do |attribute, value|
      send("#{attribute}=", value)
    end
    add_shape(attribute_hash)
  end

  # Handle the shape-related metadata
  #
  # @return [false, Array] false for internal nodes (nothing stored), otherwise the shapes Array
  # @raise [RuntimeError] when :source_table or :source_table_gid is blank
  def add_shape(attribute_hash)
    return false if attribute_hash[:is_internal_node]
    if attribute_hash[:source_table].blank? || attribute_hash[:source_table_gid].blank?
      raise 'a shape requires both :source_table and :source_table_gid'
    end
    @shapes.push [attribute_hash[:source_table], attribute_hash[:source_table_gid], attribute_hash[:date_valid_from], attribute_hash[:date_valid_to]]
  end

  # All records are indexed by this hash.
  def index
    {name: @name, parent_names: @parent_names}
  end

  # Index key of the parent entry: the first parent name plus the remaining ancestors.
  # Don't check Earth
  def parent_index
    {name: @parent_names.first, parent_names: @parent_names.drop(1)}
  end

  # All of our data are presently only loaded with @earth.name as parent
  #
  # Each level*_index returns {} when that level is not set; blank strings count as not set so they
  # never produce a lookup key with an empty name.
  def level0_index
    return {} if @lvl0.blank?
    {name: @lvl0, parent_names: [@parent_names.last]}
  end

  def level1_index
    return {} if @lvl1.blank?
    {name: @lvl1, parent_names: [@lvl0, @parent_names.last].reject(&:blank?)}
  end

  def level2_index
    return {} if @lvl2.blank?
    {name: @lvl2, parent_names: [@lvl1, @lvl0, @parent_names.last].reject(&:blank?)}
  end
end # End TempArea
# Some query helpers
# Selects every row of table_name with all of its columns except geom.
# Column names are double-quoted in the generated SQL; the value returned is whatever
# @connection.execute returns.
def all_records(table_name)
  quoted_columns = columns_minus_geom(table_name).map { |column| "\"#{column}\"" }
  @connection.execute("Select #{quoted_columns.join(', ')} from #{table_name}")
end
# Returns an Array of the column names information_schema lists for table_name, without 'geom'.
def columns_minus_geom(table_name)
  rows = @connection.execute("select column_name from information_schema.columns where table_name = '#{table_name}';")
  rows.map { |row| row['column_name'] }.reject { |column| column == 'geom' }
end
# Return a string capitalized except for 'and': the name is downcased and titleized, then every
# whitespace-delimited 'And' is replaced with ' and '.
# Raises (bare raise) when name is blank.
def uncapitalize(name)
  raise if name.blank?

  title_cased = name.downcase.titleize
  title_cased.gsub(/\sAnd\s/, ' and ')
end
# Build methods for individual sources. Each source has its own method.
# Indexes one internal (shape-less) 'Country' entry per row of the semicolon-separated
# country_names_and_code_elements.txt file found under @args[:data_directory].
#
# CSV options are passed as keywords: a positional options Hash is rejected by CSV.read on Ruby 3.
# File.join makes the path independent of whether data_directory ends in a slash.
def build_areas_from_country_codes
  path = File.join(@args[:data_directory], 'data/external/country_names_and_code_elements.txt')
  CSV.read(path, headers: true, col_sep: ';').each do |r|
    country_name = uncapitalize(r['Country Name'])
    @data_index.add_item(
      name: country_name,
      parent_names: [@earth.name],
      lvl0: country_name,
      source_table: 'country_names_and_code_elements',
      geographic_area_type_name: 'Country',
      is_internal_node: true,
      iso_3166_a2: r['ISO 3166-1-alpha-2 code']
    )
  end
end
# Indexes every gadm row that describes a level 0-2 area; rows with any of name_3..name_5 set are skipped.
#
# NOTE(review): no geographic_area_type_name is passed here although
# gadm_geographic_area_type_from_row exists in this file -- confirm whether that is intentional.
def build_areas_from_gadm
  source_table = 'gadm'
  all_records(source_table).each do |r|
    # Only select records in levels 0-2
    next if %w[name_3 name_4 name_5].any? { |column| !r[column].blank? }

    name = gadm_name_from_row(r)
    parent_names = gadm_parent_names_from_row(r)
    puts "#\r #{r['gid']} #{name} : #{parent_names}"
    @data_index.add_item(
      name: name,
      parent_names: parent_names,
      lvl0: r['name_0'],
      lvl1: r['name_1'],
      lvl2: r['name_2'],
      source_table: source_table,
      source_table_gid: r['gid'],
      date_valid_from: gadm_valid_from(r),
      date_valid_to: gadm_valid_to(r)
    )
  end
end
# Returns the validfr_<level> value of the row for its assumed level, or nil for level 0.
# gadm_level returns the level as a String, so the guard compares string forms.
def gadm_valid_from(row)
  level = gadm_level(row)
  return nil if level.to_s == '0'
  row["validfr_#{level}"]
end
# Returns the validto_<level> value of the row for its assumed level, or nil for level 0.
# gadm_level returns the level as a String, so the guard compares string forms.
def gadm_valid_to(row)
  level = gadm_level(row)
  return nil if level.to_s == '0'
  row["validto_#{level}"]
end
# Returns the assumed level the gadm record is for, as a String: the deepest of levels 2 and 1
# whose name_<level> column is non-blank after stripping, otherwise '0'.
def gadm_level(row)
  %w[2 1].find { |level| !row["name_#{level}"].to_s.strip.blank? } || '0'
end
# TODO: Assumes all level0 entities are type Country, likely not fully true?
#
# Returns engtype_2 when name_2 is set, else engtype_1 when name_1 is set, else 'Country'.
def gadm_geographic_area_type_from_row(row)
  if !row['name_2'].blank?
    row['engtype_2']
  elsif !row['name_1'].blank?
    row['engtype_1']
  else
    'Country'
  end
end
# Returns the non-blank names of the row ordered from most specific (name_2) to @earth.name.
def gadm_names_from_row(row)
  candidates = %w[name_2 name_1 name_0].map { |column| row[column] }
  candidates.push(@earth.name)
  candidates.reject(&:blank?)
end
# Returns the row's names without the first (most specific) one, i.e. the ancestors of the area.
def gadm_parent_names_from_row(row)
  gadm_names_from_row(row).drop(1)
end
# Returns the most specific non-blank name of the row (the first entry of gadm_names_from_row).
def gadm_name_from_row(row)
  name, = gadm_names_from_row(row)
  name
end
# Returns the name of the row's immediate parent (the second entry of gadm_names_from_row), or nil.
def gadm_parent_name_from_row(row)
  gadm_names_from_row(row).at(1)
end
# Indexes every tdwg_l1 row as a 'TDWG Level 1' entry whose only parent is @earth.name.
def build_areas_from_tdwg_l1
  puts 'tdwg level 1'
  table = 'tdwg_l1'
  all_records(table).each do |row|
    attributes = {
      name: uncapitalize(row['level1_nam']),
      parent_names: [@earth.name],
      source_table: table,
      source_table_gid: row['gid'],
      geographic_area_type_name: 'TDWG Level 1',
      tdwgID: row['level1_cod']
    }
    @data_index.add_item(attributes)
  end
end
# Indexes every tdwg_l2 row as a 'TDWG Level 2' entry beneath its (uncapitalized) level 1 name.
def build_areas_from_tdwg_l2
  puts 'tdwg level 2'
  table = 'tdwg_l2'
  all_records(table).each do |row|
    attributes = {
      name: row['level2_nam'],
      parent_names: [uncapitalize(row['level1_nam']), @earth.name],
      source_table: table,
      source_table_gid: row['gid'],
      geographic_area_type_name: 'TDWG Level 2',
      tdwgID: row['level2_cod']
    }
    @data_index.add_item(attributes)
  end
end
# Indexes every tdwg_l3 row as a 'TDWG Level 3' entry; the level 2 and level 1 parent names are
# looked up per row, and tdwgID is the level 2 code followed by the level 3 code.
def build_areas_from_tdwg_l3
  puts 'tdwg level 3'
  table = 'tdwg_l3'
  all_records(table).each do |row|
    ancestors = [tdwg_level_2_name(row), tdwg_level_1_name(row), @earth.name]
    @data_index.add_item(
      name: row['level3_nam'],
      parent_names: ancestors,
      source_table: table,
      source_table_gid: row['gid'],
      geographic_area_type_name: 'TDWG Level 3',
      tdwgID: "#{row['level2_cod']}#{row['level3_cod']}"
    )
  end
end
# Indexes every tdwg_l4 row as a 'TDWG Level 4' entry; level 3, 2 and 1 parent names are looked up
# per row, and tdwgID is the level 2 code followed by the level 4 code.
def build_areas_from_tdwg_l4
  puts 'tdwg level 4'
  table = 'tdwg_l4'
  all_records(table).each do |row|
    ancestors = [tdwg_level_3_name(row), tdwg_level_2_name(row), tdwg_level_1_name(row), @earth.name]
    @data_index.add_item(
      name: row['level_4_na'],
      parent_names: ancestors,
      source_table: table,
      source_table_gid: row['gid'],
      tdwgID: "#{row['level2_cod']}#{row['level4_cod']}",
      geographic_area_type_name: 'TDWG Level 4'
    )
  end
end
# Looks up level1_nam in tdwg_l1 by the row's level1_cod and returns it passed through uncapitalize.
# Uses the first row of the result; fails if the query returns no rows.
def tdwg_level_1_name(r)
  sql = "select level1_nam from tdwg_l1 where level1_cod = '#{r['level1_cod']}';"
  match = @connection.execute(sql).first
  uncapitalize(match['level1_nam'])
end
# Looks up level2_nam in tdwg_l2 by the row's level1_cod and level2_cod.
# Uses the first row of the result; fails if the query returns no rows.
def tdwg_level_2_name(r)
  sql = "select level2_nam from tdwg_l2 where level1_cod = '#{r['level1_cod']}' and level2_cod = '#{r['level2_cod']}';"
  match = @connection.execute(sql).first
  match['level2_nam']
end
# Looks up level3_nam in tdwg_l3 by the row's level3_cod and level2_cod.
# Uses the first row of the result; fails if the query returns no rows.
def tdwg_level_3_name(r)
  sql = "select level3_nam from tdwg_l3 where level3_cod = '#{r['level3_cod']}' AND level2_cod = '#{r['level2_cod']}';"
  match = @connection.execute(sql).first
  match['level3_nam']
end
# Indexes every ne_countries row as a 'Country' entry directly beneath @earth.name, using the row's
# name as both the entry name and its level 0 name, and iso_a3 as the ISO 3166 alpha-3 code.
def build_areas_from_ne_countries
  puts 'indexing ne_countries'
  table = 'ne_countries'
  all_records(table).each do |row|
    country_name = row['name']
    @data_index.add_item(
      name: country_name,
      lvl0: country_name,
      parent_names: [@earth.name],
      source_table: table,
      source_table_gid: row['gid'],
      geographic_area_type_name: 'Country',
      iso_3166_a3: row['iso_a3']
    )
  end
end
# Indexes every ne_states row beneath its 'admin' (level 0) name.
#
# The row's 'type' value is used as the area type name; a blank type is passed as nil rather than
# as the Boolean result of blank?, which previously ended up stored as the type name.
def build_areas_from_ne_states
  puts 'indexing ne_states'
  source_table = 'ne_states'
  all_records(source_table).each do |r|
    @data_index.add_item(
      name: r['name'] || 'UNKNOWN', # !@# 3285 has no name
      lvl0: r['admin'],
      parent_names: [r['admin'], @earth.name],
      source_table: source_table,
      source_table_gid: r['gid'],
      geographic_area_type_name: r['type'].presence
    )
  end
end
# The following may need to be reviewed for re-implementation, MAYBE.
#
# @gadm_xlate = YAML.load(File.read(@args[:data_directory] + 'data/external/import_helpers/gadm_translations.yaml'))
# @tdwg_xlate = YAML.load(File.read(@args[:data_directory] + 'data/external/import_helpers/tdwg_translations.yaml'))
#
# def name_fix(name)
# case name
# when /Jõgeva\r (commune)/
# name = 'Jõgeva (commune)'
# when /Põltsamaa\r\rPõltsamaa/
# name = 'Põltsamaa'
# when /Halmahera Tengah\rHalmahera Tengah/
# name = 'Halmahera Tengah'
# end
# name
# end
end
end
end
end