sshaw/normalize_country

View on GitHub
bin/normalize_country

Summary

Maintainability
Test Coverage
#!/usr/bin/env ruby

require "normalize_country"
require "optparse"
require "tempfile"
require "fileutils"

# Base class for file-backed sources. It expands the SOURCE argument into a
# list of files and passes each one to the subclass's +process+ method.
class FileSource
  # pattern - glob, relative to a directory, selecting the files to process.
  def initialize(pattern)
    @pattern = pattern
  end

  # Returns the files to process: the glob matches when +path+ is a
  # directory, otherwise just +path+ itself.
  def build_paths(path)
    return [path] unless File.directory?(path)

    Dir.glob(File.join(path, @pattern))
  end

  # Returns a new Tempfile for subclasses to write their output into.
  def tmpfile
    Tempfile.new("normalize_country")
  end

  # Runs +process+ on every file found for +path+, printing progress to
  # stdout. +process+ is not defined here; subclasses provide it.
  def normalize(path)
    build_paths(path).each do |file|
      print "Processing #{file}... "
      process(file)
      print "done!\n"
    end
  end
end

# Normalizes country names found in XML files.
class NormalizeXML < FileSource
  # to    - conversion format, passed to NormalizeCountry as the :to option.
  # xpath - XPath expression selecting the elements or attributes to rewrite.
  def initialize(to, xpath)
    super "*.xml"
    @to = to
    @xpath = xpath
  end

  # Rewrites +path+ in place. The file is parsed with REXML, every node
  # matched by the XPath whose value NormalizeCountry can convert is replaced,
  # and the document is written to a temp file that is then moved over +path+.
  # Blank values and values NormalizeCountry does not convert are left alone.
  def process(path)
    io = File.open(path)

    begin
      doc = REXML::Document.new(io)
      REXML::XPath.each(doc, @xpath).each do |e|
        # element or attribute
        value, writer = e.is_a?(REXML::Element) ? [ e.text, :text= ] : [ e.to_s, :normalized= ]
        # Element#text is nil when the element has no text node, so coerce
        # before stripping instead of calling strip! on nil.
        value = value.to_s.strip
        next if value.empty?

        replace = NormalizeCountry(value, :to => @to)
        next unless replace

        e.send(writer, replace)
      end

      tmp = tmpfile
      doc.write(tmp)
      tmp.close
      io.close
      # Tempfiles are created with mode 0600; carry over the original file's
      # permission bits so the replacement does not silently lose them.
      File.chmod(File.stat(path).mode & 07777, tmp.path)
      FileUtils.mv(tmp.path, path)
    ensure
      tmp.close if tmp && !tmp.closed?
      io.close unless io.closed?
    end
  end
end

# Normalizes one column of country names in CSV (or TSV) files.
class NormalizeCSV < FileSource
  # to     - conversion format, passed to NormalizeCountry as the :to option.
  # column - header name of the column holding the country names.
  def initialize(to, column)
    super "*.[tc]sv"
    @to = to
    @column = column
  end

  # Rewrites +path+ in place. Rows are copied to a temp file, replacing the
  # column's value whenever NormalizeCountry can convert it, and the temp file
  # is then moved over +path+.
  #
  # Raises RuntimeError if the header row has no column named +column+.
  def process(path)
    dest = tmpfile
    # The directory glob also matches .tsv files; those are tab-delimited, so
    # read and write them with a tab separator instead of the default comma.
    separator = File.extname(path).downcase == ".tsv" ? "\t" : ","

    begin
      CSV.open dest.path, "wb", :col_sep => separator do |out_csv|
        CSV.foreach path, :headers => true, :return_headers => true, :col_sep => separator do |in_csv|
          if in_csv.header_row?
            raise "#{path} does not include a column named '#@column'" unless in_csv.include?(@column)
          else
            replace = NormalizeCountry(in_csv[@column], :to => @to)
            in_csv[@column] = replace if replace
          end

          # The header row and every data row are both copied to the output.
          out_csv << in_csv
        end
      end

      dest.close
      # Tempfiles are created with mode 0600; carry over the original file's
      # permission bits so the replacement does not silently lose them.
      File.chmod(File.stat(path).mode & 07777, dest.path)
      FileUtils.mv(dest.path, path)
    ensure
      dest.close unless dest.closed?
    end
  end
end

# Normalizes country names stored in one column of a database table, using
# Sequel for database access.
class NormalizeDB
  # to     - conversion format, passed to NormalizeCountry as the :to option.
  # column - "table.column" naming the column to normalize.
  #
  # Raises RuntimeError if no column part is given.
  def initialize(to, column)
    @to = to
    @table, @column = column.split(".", 2)
    raise "no database column given" if @column.nil? || @column.empty?
    @table = @table.to_sym
    @column = @column.to_sym
  end

  # Connects to +dsn+ and, for each distinct value in the column that
  # NormalizeCountry converts to something different, updates every row
  # holding that value. The return value is not meaningful.
  def normalize(dsn)
    rs = connect(dsn)
    # Load the distinct values with `all` before updating: Sequel documents
    # Dataset#each as unsafe on many adapters when further queries are run
    # inside its block.
    rs.select(@column).distinct.all.each do |row|
      current = row[@column]
      new_name = NormalizeCountry(current, :to => @to)
      next unless new_name && new_name != current
      rs.where(@column => current).update(@column => new_name)
    end
  end

  # Returns the Sequel dataset for the table.
  #
  # Raises RuntimeError if the table or the column does not exist.
  def connect(dsn)
    db = Sequel.connect(dsn)
    raise "database has no table named '#@table'" unless db.table_exists?(@table)
    raise "database has no column named '#@table.#@column'" unless db[@table].columns.include?(@column)
    db[@table]
  end
end

# Command-line options, collected into a hash keyed by :format, :to and
# :location. -h prints the usage and exits with status 2; -v prints the
# version and exits.
options = {}
parser = OptionParser.new do |opts|
  opts.banner = "usage: #{File.basename($0)} [options] SOURCE"

  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit 2
  end

  opts.on "-f", "--format FORMAT", "The format of SOURCE" do |format|
    options[:format] = format.to_sym
  end

  opts.on "-t", "--to CONVERSION", "Convert country names to this format (see docs for valid formats)" do |to|
    options[:to] = to
  end

  opts.on "-l", "--location LOCATION", "The location of the conversion" do |source|
    options[:location] = source
  end

  opts.on("-v", "--version", "Version") do
    puts "v#{NormalizeCountry::VERSION}"
    exit
  end
end

# An unknown switch or a switch missing its argument raises
# OptionParser::ParseError; report it like every other failure in this script
# instead of letting it escape with a backtrace.
begin
  parser.parse!
rescue OptionParser::ParseError => e
  abort "#{e.message}\n#{parser.banner}"
end

# SOURCE is whatever positional argument remains after option parsing.
abort "source option required" if ARGV.empty?

# All three options are mandatory; report the first absent one, in this order.
missing = [:location, :format, :to].detect { |name| options[name].nil? }
abort "#{missing} option required" unless missing.nil?

# Reject conversion formats that NormalizeCountry does not list.
known_formats = NormalizeCountry.formats
unless known_formats.include?(options[:to].to_sym)
  abort "unknown conversion format '#{options[:to]}'"
end

# Pick the normalizer class for the requested format, loading the library it
# depends on only when that format is actually used.
requested = options[:format]
klass =
  if requested == :csv
    abort "CSV processing requires ruby 1.9" if RUBY_VERSION < "1.9"
    require "csv"
    NormalizeCSV
  elsif requested == :db
    begin
      require "sequel"
    rescue LoadError
      abort "the db format requires Sequel, you can install it by running `gem install sequel`"
    end
    NormalizeDB
  elsif requested == :xml
    require "rexml/document"
    NormalizeXML
  else
    abort "don't know how to normalize the format '#{options[:format]}'"
  end

# Run the chosen normalizer against SOURCE; any StandardError is reported as
# a one-line failure message and the script exits via abort.
begin
  normalizer = klass.new(options[:to], options[:location])
  normalizer.normalize(ARGV.first)
rescue StandardError => error
  abort "normalization failed: #{error}"
end