bin/normalize_country
#!/usr/bin/env ruby
require "normalize_country"
require "optparse"
require "tempfile"
require "fileutils"
# Base class for normalizers that rewrite files on disk.
#
# A subclass passes a glob pattern to +initialize+ and must implement
# +process(path)+, which #normalize calls once per file.
class FileSource
  # pattern - glob (relative to a directory) selecting the files to process
  #           when the source given to #normalize is a directory.
  def initialize(pattern)
    @pattern = pattern
  end

  # Expands +path+ into the list of files to work on.
  #
  # A directory yields every entry inside it matching the pattern; anything
  # else (including a path that does not exist) is returned as a
  # single-element list.
  def build_paths(path)
    return [path] unless File.directory?(path)

    Dir.glob(File.join(path, @pattern))
  end

  # Returns a fresh Tempfile for a subclass to write its output into.
  def tmpfile
    Tempfile.new("normalize_country")
  end

  # Runs +process+ on every file resolved from +path+, reporting progress
  # on standard output. Returns the list of processed paths.
  def normalize(path)
    build_paths(path).each do |file|
      print "Processing #{file}... "
      process(file)
      print "done!\n"
    end
  end
end
# Normalizes country names stored in XML files.
#
# Every node selected by the XPath expression (an element's text or an
# attribute's value) is passed through NormalizeCountry and overwritten
# when a conversion is returned.
class NormalizeXML < FileSource
  # to    - conversion format, handed to NormalizeCountry as :to
  # xpath - XPath expression selecting the elements or attributes that
  #         hold country names
  def initialize(to, xpath)
    super "*.xml"
    @to = to
    @xpath = xpath
  end

  # Rewrites the XML document at +path+ with normalized country names.
  #
  # The result is written to a temporary file which is then moved over
  # +path+. Nodes that are empty, whitespace-only, or for which
  # NormalizeCountry returns nothing are left as they are.
  #
  # NOTE(review): the replacement comes from a Tempfile, so the rewritten
  # file may not keep the original file's permissions - confirm whether
  # that matters for callers.
  def process(path)
    # REXML::Document.new reads the whole document, so the source handle
    # can be closed as soon as it returns (and before the file is replaced).
    doc = File.open(path) { |io| REXML::Document.new(io) }

    REXML::XPath.each(doc, @xpath) do |node|
      # element or attribute
      if node.is_a?(REXML::Element)
        raw, writer = node.text, :text=
      else
        raw, writer = node.to_s, :normalized=
      end

      # Element#text is nil when the element has no text, and the string we
      # get back may be one REXML keeps internally - so coerce and strip a
      # copy instead of calling strip! on it.
      value = raw.to_s.strip
      next if value.empty?

      replace = NormalizeCountry(value, :to => @to)
      next unless replace

      node.send(writer, replace)
    end

    tmp = tmpfile
    begin
      doc.write(tmp)
    ensure
      # Flush and release the handle whether or not the write succeeded.
      tmp.close unless tmp.closed?
    end
    FileUtils.mv(tmp.path, path)
  end
end
# Normalizes the country names held in one column of CSV/TSV files.
class NormalizeCSV < FileSource
  # to     - conversion format, handed to NormalizeCountry as :to
  # column - header name of the column containing country names
  def initialize(to, column)
    super "*.[tc]sv"
    @to = to
    @column = column
  end

  # Rewrites the file at +path+, replacing each value in the configured
  # column with its normalized form when NormalizeCountry returns one.
  #
  # Output goes to a temporary file which is then moved over +path+.
  # Files ending in .tsv are read and written tab-separated; everything
  # else uses a comma.
  #
  # Raises RuntimeError when the header row has no column with the
  # configured name.
  #
  # NOTE(review): the replacement comes from a Tempfile, so the rewritten
  # file may not keep the original file's permissions - confirm whether
  # that matters for callers.
  def process(path)
    sep = separator_for(path)
    dest = tmpfile
    begin
      CSV.open dest.path, "wb", :col_sep => sep do |out_csv|
        CSV.foreach path, :headers => true, :return_headers => true, :col_sep => sep do |row|
          if row.header_row?
            raise "#{path} does not include a column named '#@column'" unless row.include?(@column)
          else
            replace = NormalizeCountry(row[@column], :to => @to)
            row[@column] = replace if replace
          end
          # Header and data rows alike are copied to the output.
          out_csv << row
        end
      end
      dest.close
      FileUtils.mv(dest.path, path)
    ensure
      dest.close unless dest.closed?
    end
  end

  private

  # Column separator implied by the file extension: tab for .tsv, comma otherwise.
  def separator_for(path)
    File.extname(path).downcase == ".tsv" ? "\t" : ","
  end
end
# Normalizes the country names stored in one column of a database table,
# using Sequel for database access.
class NormalizeDB
  # to     - conversion format, handed to NormalizeCountry as :to
  # column - "table.column" string naming the column to rewrite
  #
  # Raises RuntimeError when +column+ has no column part after the dot.
  def initialize(to, column)
    @to = to
    @table, @column = column.split(".", 2)
    raise "no database column given" if @column.nil? || @column.empty?
    @table = @table.to_sym
    @column = @column.to_sym
  end

  # Connects to +dsn+ and rewrites every distinct value of the column for
  # which NormalizeCountry returns a different, non-nil name.
  def normalize(dsn)
    rs = connect(dsn)
    # Fetch the distinct values with #all before touching the table:
    # Sequel documents that running further queries inside Dataset#each is
    # not safe on many adapters.
    rs.select(@column).distinct.all.each do |row|
      current = row[@column]
      new_name = NormalizeCountry(current, :to => @to)
      next if !new_name || new_name == current
      rs.where(@column => current).update(@column => new_name)
    end
  end

  # Opens the database at +dsn+ and returns the dataset for the table.
  #
  # Raises RuntimeError when the table or the column does not exist.
  def connect(dsn)
    db = Sequel.connect(dsn)
    raise "database has no table named '#@table'" unless db.table_exists?(@table)
    raise "database has no column named '#@table.#@column'" unless db[@table].columns.include?(@column)
    db[@table]
  end
end
# Command line handling: collect the switches into +settings+; the
# remaining ARGV entry is the SOURCE to normalize.
settings = {}
parser = OptionParser.new
parser.banner = "usage: #{File.basename($0)} [options] SOURCE"

# -h prints the usage text and exits with status 2.
parser.on("-h", "--help", "Show this message") do
  puts parser
  exit 2
end

parser.on("-f", "--format FORMAT", "The format of SOURCE") do |format|
  settings[:format] = format.to_sym
end

parser.on("-t", "--to CONVERSION", "Convert country names to this format (see docs for valid formats)") do |to|
  settings[:to] = to
end

# Passed as the second constructor argument of the chosen normalizer
# (an XPath, a CSV column name or a "table.column" string).
parser.on("-l", "--location LOCATION ", "The location of the conversion") do |source|
  settings[:location] = source
end

parser.on("-v", "--version", "Version") do
  puts "v#{NormalizeCountry::VERSION}"
  exit
end

parser.parse!

# Validation: a SOURCE argument and all three switches are mandatory.
abort "source option required" if ARGV.empty?

absent = [:location, :format, :to].detect { |name| settings[name].nil? }
abort "#{absent} option required" if absent

known_formats = NormalizeCountry.formats
unless known_formats.include?(settings[:to].to_sym)
  abort "unknown conversion format '#{settings[:to]}'"
end

# Pick the normalizer class, loading its supporting library only when
# that format is actually requested.
normalizer =
  case settings[:format]
  when :xml
    require "rexml/document"
    NormalizeXML
  when :csv
    abort "CSV processing requires ruby 1.9" if RUBY_VERSION < "1.9"
    require "csv"
    NormalizeCSV
  when :db
    begin
      require "sequel"
    rescue LoadError
      abort "the db format requires Sequel, you can install it by running `gem install sequel`"
    end
    NormalizeDB
  else
    abort "don't know how to normalize the format '#{settings[:format]}'"
  end

# Run the conversion; any StandardError is reported as a one-line failure.
source = ARGV[0]
begin
  normalizer.new(settings[:to], settings[:location]).normalize(source)
rescue => error
  abort "normalization failed: #{error}"
end