mezis/blurrily

View on GitHub
bin/bench

Summary

Maintainability
Test Coverage
#!/usr/bin/env ruby
require 'rubygems'
require 'bundler/setup'
require 'blurrily/map'
require 'progressbar'
require 'open-uri'
require 'tempfile'
require 'benchmark/ips'
require 'zlib'


module Blurrily
  # Command-line benchmark driver for Blurrily::Map.
  #
  # For one named data set it downloads a gzipped, tab-separated archive
  # (unless already present in the working directory), imports every row
  # into a Map, saves the map to disk and then times the map operations
  # with benchmark-ips.
  class Benchmark
    # Data set name => URL of the gzipped source archive.
    ARCHIVES = {
      cities:  'http://mezis.s3.amazonaws.com/blurrily-data/cities1000-filtered.txt.gz',
      europe:  'http://mezis.s3.amazonaws.com/blurrily-data/de_fr_es_gb_it-filtered.txt.gz',
      us:      'http://mezis.s3.amazonaws.com/blurrily-data/us-filtered.txt.gz',
      world:   'http://mezis.s3.amazonaws.com/blurrily-data/allCountries-filtered.txt.gz',
      russia:  'http://mezis.s3.amazonaws.com/blurrily-data/ru-filtered.txt.gz',
      asia:    'http://mezis.s3.amazonaws.com/blurrily-data/ru_cn_ir-filtered.txt.gz',
      english: 'http://mezis.s3.amazonaws.com/blurrily-data/english.txt.gz',
    }.freeze

    # Needles used by the timed operations: four correctly spelled city
    # names followed by a misspelled variant of each.
    SEARCH_CITIES = (
      %w(London Paris Rome Luxembourg) +
      %w(Lonndon Pari Roma Luxenbour)
    ).freeze

    # @param key [Symbol] one of the keys of ARCHIVES
    # @raise [ArgumentError] when +key+ does not name a known archive
    def initialize(key)
      @source_url = ARCHIVES[key]
      unless @source_url
        raise ArgumentError,
              "unknown archive #{key.inspect} (expected one of #{ARCHIVES.keys.inspect})"
      end
      @key = key
    end

    # Runs the whole pipeline: download, import, save, benchmark.
    #
    # @return whatever `::Benchmark.ips` returns for the timed section
    def run
      log "Starting benchmark for '#{key}'"
      do_download
      do_import
      do_save
      do_bm
    end

    private

    attr_reader :key, :source_url

    # Fetches the archive into the working directory, unless a file with
    # the expected name is already there.
    #
    # The body is streamed into a binary-mode temporary file first and only
    # copied to its final name once the transfer has finished; the
    # temporary file is removed in every case.
    #
    # @return [nil]
    def do_download
      return if raw_data_path.exist?
      log 'download and save file'
      output = Tempfile.new($PROGRAM_NAME)
      # The payload is gzip data: never let the IO layer translate bytes.
      output.binmode
      begin
        URI.parse(source_url).open do |input|
          # Stream instead of `read`, so the archive is not held in memory.
          IO.copy_stream(input, output)
        end
        output.close
        FileUtils.cp(output.path, raw_data_path.to_s)
      ensure
        # Close (a no-op if already closed) and unlink the temporary file.
        output.close!
      end
      nil
    end

    # Loads every row of the archive into the map.
    #
    # The archive is read twice: a first pass counts the lines so the
    # progress bar knows its total, the second pass does the import. Each
    # line is expected to look like "<index>\t<needle>".
    #
    # @return [nil]
    def do_import
      log "Counting data entries"
      rows = 0
      with_reader { |reader| reader.each_line { rows += 1 } }

      log "Importing data"
      progress = ProgressBar.new(key.to_s, rows)
      with_reader do |reader|
        reader.each_line do |line|
          index, needle = line.strip.split("\t")
          map.put(needle, index.to_i)
          progress.inc
        end
      end
      progress.finish
      puts "#{rows} records imported, #{map.stats[:references]} refs, #{map.stats[:trigrams]} trigrams"
      nil
    end

    # Saves the map to +path+, defaulting to the data set's trigram file.
    #
    # @param path [String, nil] destination; nil selects trigram_data_path
    def do_save(path = nil)
      path ||= trigram_data_path.to_s
      map.save path
    end

    # Replaces the current map with one loaded from the trigram file, then
    # forces a garbage collection.
    def do_load
      # Nothing to close if no map has been built or loaded yet.
      @map.close if @map
      @map = Map.load(trigram_data_path.to_s)
      do_gc
    end

    def do_warm
      # rehearsal, necessary as the hash table of refs will be reconstructed
      map.put 'foo', 123
    end

    # Times find / put / delete / a random mix of the three / save / load /
    # load followed by a warm-up write, using benchmark-ips.
    #
    # @return the value returned by `::Benchmark.ips`
    def do_bm
      log 'Benchmarking'

      # The 'save' report writes scratch files below tmp/; make sure the
      # directory is there before timing starts.
      FileUtils.mkdir_p('tmp')

      ::Benchmark.ips do |x|
        x.report('find') do |times|
          times.times { map.find(random_city) }
        end

        x.report('put') do |times|
          times.times { map.put(random_city, rand(1 << 31)) }
        end

        x.report('delete') do |times|
          times.times { map.delete(rand(1 << 31)) }
        end

        x.report('stress') do |times|
          times.times do
            case rand(3)
            when 0 then map.delete(rand(1 << 31))
            when 1 then map.put(random_city, rand(1 << 31))
            when 2 then map.find(random_city)
            end
          end
        end

        x.report('save') do |times|
          times.times do
            # Process id plus a random number keeps scratch names distinct.
            path = Pathname.new "tmp/#{$$}.#{rand(1 << 32)}.trigrams"
            do_save path.to_s
            path.delete
          end
        end

        x.report('load') do |times|
          times.times { do_load }
        end

        x.report('warm load') do |times|
          times.times { do_load ; do_warm }
        end
      end
    end

    # Yields to other threads, then requests a garbage collection through
    # both ObjectSpace and GC.
    def do_gc
      Thread.pass
      ObjectSpace.garbage_collect
      GC.start
    end

    # @return [Pathname] where the downloaded archive lives (relative path)
    def raw_data_path
      @raw_data_path ||= Pathname.new("#{key}.txt.gz")
    end

    # @return [Pathname] where the saved map lives (relative path)
    def trigram_data_path
      @trigram_data_path ||= Pathname.new("#{key}.trigrams")
    end

    # Opens the downloaded archive for reading.
    #
    # The caller owns the returned reader and must close it; prefer
    # with_reader, which closes it automatically.
    #
    # @return [Zlib::GzipReader]
    def get_reader
      Zlib::GzipReader.open(raw_data_path.to_s)
    end

    # Yields a reader on the downloaded archive and closes it when the
    # block returns or raises.
    def with_reader(&block)
      Zlib::GzipReader.open(raw_data_path.to_s, &block)
    end

    # Writes a timestamped message, prefixed with the program name, to
    # standard error.
    def log(message)
      $stderr.puts "[%s] %s: %s" % [Time.now.strftime('%T.%L'), $0, message]
      $stderr.flush
    end

    # @return the map under test, created on first use
    def map
      @map ||= Map.new
    end

    # @return [String] a random element of SEARCH_CITIES
    def random_city
      SEARCH_CITIES.sample
    end
  end
end

# Name shown in log lines and used as the temporary-file prefix.
$PROGRAM_NAME = 'blurrily:bench'

# Benchmark each data set in turn and print one tab-separated line per
# timed operation: data set, operation label, milliseconds per iteration
# (1e3 / iterations-per-second).
%i(english cities europe russia asia us world).each do |key|
  benchmarks = Blurrily::Benchmark.new(key).run
  rule = '-' * 80
  puts rule
  # `entries` works on a plain Array (via Enumerable) and, as far as the
  # benchmark-ips docs indicate, on the Report object newer releases
  # return, so either result shape is handled here.
  benchmarks.entries.each do |bm|
    puts "%s\t%s\t%1.3e" % [key, bm.label, 1e3 / bm.ips]
  end
  puts rule
end

__END__