josephwilk/creative-machine

View on GitHub
tasks/lexicon.rb

Summary

Maintainability
A
40 mins
Test Coverage
require 'nokogiri'
require 'open-uri'
require 'json'

require 'pronounce'

# Builds lexicon entries for the words of the haiku corpus: the syllable
# split and spelled pronunciations are scraped from the dictionary page at
# URL, and the phonemes come from Pronounce.how_do_i_pronounce.
module LexiconBuilder
  class << self
    URL = 'http://dictionary.reference.com/browse'

    # Returns the words of data/haiku.txt, memoized after the first call.
    #
    # Whitespace-separated tokens containing a hyphen are dropped, the rest
    # are downcased and stripped of every non-word character. Tokens that end
    # up empty (pure punctuation) are discarded so they are never looked up.
    def haiku_words
      @words ||= File.read(File.dirname(__FILE__) + '/../data/haiku.txt').
                      split(' ').
                      reject { |token| token.include?('-') }.
                      map { |token| token.downcase.gsub(/\W/, '') }.
                      reject(&:empty?)
    end

    # Looks up +word+ on the dictionary site.
    #
    # Returns a Hash with the keys :word, :syllables (Array of syllable
    # strings), :pronouncations (Array of Arrays of pronunciation segments)
    # and :phonemes (whatever Pronounce.how_do_i_pronounce returns).
    # Returns nil when the page has no headword node, or when any
    # StandardError is raised; in the latter case the error and its backtrace
    # are printed to stdout.
    def lookup(word)
      doc = fetch_document(word)

      headword = doc.xpath('//h2[@class="me"]').first
      return unless headword

      # The page separates syllables with a middle dot (U+00B7).
      syllables = headword.text.gsub(/\u00B7/, '-')

      # Entries containing ',' or ';' are discarded.
      spellings = doc.xpath('//span[@class="show_spellpr"]/span[@class="pron"]').
                      map { |node| node.text }.
                      reject { |text| text =~ /,|;/ }.
                      uniq

      phonemes = Pronounce.how_do_i_pronounce(word)

      syllables, spellings = correct_any_bad_lookups(word, syllables, spellings)
      spellings = correct_any_bad_pronounications(spellings, syllables)

      {:word => word,
       :syllables => syllables,
       :pronouncations => spellings,
       :phonemes => phonemes}
    rescue => e
      puts "#{word} lookup failed: #{e} #{e.backtrace.join("\n")}"
      nil
    end

    # Keeps only the pronunciations whose number of non-empty, hyphen-separated
    # segments equals the number of entries in +syllables+.
    #
    # pronouncations - Array of hyphen-delimited pronunciation strings.
    # syllables      - Array of syllables.
    #
    # Returns an Array of segment Arrays. Empty segments (from leading or
    # doubled hyphens) are removed from the result as well, so every returned
    # pronunciation has exactly as many segments as there are syllables.
    def correct_any_bad_pronounications(pronouncations, syllables)
      syllables_count = syllables.count
      pronouncations.map { |spelling| spelling.split('-').reject(&:empty?) }.
                     select { |segments| segments.count == syllables_count }
    end

    # Patches up lookups where the dictionary answered with the stem of a
    # plural ("...s") or a gerund ("...ing").
    #
    # word           - the word that was requested.
    # syllables      - hyphen-delimited syllable String of the word found.
    # pronouncations - Array of hyphen-delimited pronunciation Strings.
    #
    # Returns a two-element Array: the syllables split into an Array, and the
    # (possibly suffixed) pronunciation Strings.
    def correct_any_bad_lookups(word, syllables, pronouncations)
      looked_up = syllables.gsub('-', '')

      if failed_match?(looked_up, word)
        # "s" extends the last syllable, "-ing" adds a syllable of its own.
        suffix = if word == "#{looked_up}s"
          's'
        elsif word == "#{looked_up}ing"
          '-ing'
        end

        if suffix
          syllables = syllables + suffix
          pronouncations = pronouncations.map { |spelling| spelling + suffix }
        end
      end

      [syllables.split('-'), pronouncations]
    end

    private

    # True when the looked-up word (hyphens ignored) differs in length from
    # the requested word.
    def failed_match?(syllables, word)
      syllables.gsub('-', '').length != word.length
    end

    # Downloads and parses the dictionary page for +word+.
    #
    # Kernel#open no longer opens URLs and URI.escape is gone on Ruby 3.0+, so
    # URI.open is used when open-uri provides it (older rubies fall back to
    # Kernel#open) and the word is escaped through URI::DEFAULT_PARSER. The
    # block form closes the download handle once parsing is done.
    def fetch_document(word)
      url = "#{URL}/#{URI::DEFAULT_PARSER.escape(word)}"
      if URI.respond_to?(:open)
        URI.open(url) { |io| Nokogiri::HTML(io) }
      else
        open(url) { |io| Nokogiri::HTML(io) }
      end
    end
  end

end

namespace :lexicon do  
  # Looks up every haiku word and writes each result as one JSON document per
  # line into one of three files under ../tmp:
  #   lookup_fails.json      - results without phonemes
  #   lookup_moderation.json - results whose syllables do not add up to the
  #                            word's length, or whose first phoneme entry is
  #                            not an Array
  #   lookup.json            - everything else
  # lookup.json is then copied to ../data and lexicon:format_lookup is invoked.
  desc "map words to syllables/pronuciation/emphasis"
  task :build do
    tmpdir = File.join File.dirname(__FILE__), '..', 'tmp'
    # Dir.exists? is deprecated and was removed in Ruby 3.2.
    Dir.mkdir tmpdir unless Dir.exist? tmpdir

    error_log = File.open File.join(tmpdir, 'lookup_fails.json'), 'w'
    moderation_log = File.open File.join(tmpdir, 'lookup_moderation.json'), 'w'
    word_log = File.open File.join(tmpdir, 'lookup.json'), 'w'

    begin
      seen = {}

      LexiconBuilder.haiku_words.each do |word|
        print '.'
        next if seen[word]
        data = LexiconBuilder.lookup(word)
        # A failed lookup is not marked as seen, so the word is tried again
        # the next time it occurs in the corpus.
        next unless data

        log = if !data[:phonemes]
          error_log
        elsif data[:syllables].join('').length != word.length || data[:phonemes].first.class != Array
          moderation_log
        else
          word_log
        end

        log.write(data.to_json + "\n")
        log.flush

        seen[word] = true
      end
      puts
    ensure
      # Close the logs even when the loop raises, and always before the copy
      # below reads lookup.json.
      [word_log, error_log, moderation_log].each(&:close)
    end

    puts File.expand_path(File.join(tmpdir, 'lookup.json'))

    FileUtils.copy(File.join(tmpdir, 'lookup.json'),
                   File.join(File.dirname(__FILE__) + '/../data/lookup.json'))
    Rake::Task['lexicon:format_lookup'].invoke
  end
  
  # Re-examines the entries of ../tmp/lookup_moderation.json and appends to
  # ../tmp/lookup.json those whose word is the looked-up word plus "s" or
  # "ing", after extending the syllables and pronunciations accordingly.
  # Entries without phonemes or syllables, and entries matching neither
  # suffix, are skipped.
  desc "process moderation"
  task :moderation_process do
    # lexicon:build serialises syllables as an Array of Strings and each
    # pronunciation as an Array of segments, so String-only operations
    # (gsub, + "s") raise on that data. This helper appends a suffix to either
    # shape: Arrays are joined with '-', suffixed and split again, so "s"
    # extends the last element while "-ing" adds a new one. Hyphen-delimited
    # Strings are still accepted and stay Strings.
    append_suffix = lambda do |value, suffix|
      if value.is_a?(Array)
        (value.join('-') + suffix).split('-')
      else
        value + suffix
      end
    end

    # Block forms close both files even when a line fails to parse.
    File.open(File.dirname(__FILE__) + '/../tmp/lookup.json', 'a') do |word_log|
      File.foreach(File.dirname(__FILE__) + '/../tmp/lookup_moderation.json') do |line|
        data = JSON.parse(line)

        next unless data['phonemes'] && data['syllables']

        word = data['word']
        dictionary_looked_up_word = Array(data['syllables']).join.gsub('-', '')

        suffix = if word == "#{dictionary_looked_up_word}s"
          's'
        elsif word == "#{dictionary_looked_up_word}ing"
          '-ing'
        end
        next unless suffix

        data['syllables'] = append_suffix.call(data['syllables'], suffix)
        data['pronouncations'] = (data['pronouncations'] || []).map do |pronunciation|
          append_suffix.call(pronunciation, suffix)
        end

        word_log.write(data.to_json + "\n")
      end
    end
  end
  
  # Reads ../data/lookup.json (one JSON document per line) and writes
  # ../data/lookup_dictionary.json: a single JSON object keyed by each
  # document's "word", whose value is the rest of that document. When a word
  # occurs on several lines, the last one wins.
  desc "format json line segments into a single json hash"
  task :format_lookup do
    source = File.dirname(__FILE__) + '/../data/lookup.json'
    target = File.dirname(__FILE__) + '/../data/lookup_dictionary.json'

    dictionary = {}
    File.read(source).split("\n").each do |line|
      entry = JSON.parse(line)
      # delete returns the word and leaves the remaining fields in entry.
      key = entry.delete('word')
      dictionary[key] = entry
    end

    File.open(target, 'w') { |out| out.write(dictionary.to_json) }
  end
  
end