malavbhavsar/sentimentalizer

View on GitHub
lib/engine/classifier.rb

Summary

Maintainability
A
0 mins
Test Coverage
require "#{File.dirname(__FILE__)}/document"
require "#{File.dirname(__FILE__)}/classification_result"

# Scores the sentiment of a sentence from per-token statistics of two
# corpora: one holding positive examples, one holding negative examples.
#
# Each corpus object must respond to +token_count(token)+ and +entry_count+.
# Every token that is not a stop word gets a probability of being positive;
# the per-token probabilities are then combined into one overall probability,
# which is mapped to a Sentiment value.
class Classifier

  # Weight given to the prior (UNKNOWN_WORD_PROBABILITY) when it is blended
  # with the probability observed in the corpora.
  UNKNOWN_WORD_STRENGTH = 1.0

  # Probability assigned to a token that appears in neither corpus.
  UNKNOWN_WORD_PROBABILITY = 0.5

  # Half-width of the band around UNKNOWN_WORD_PROBABILITY that is reported
  # as neutral.
  TOLERANCE = 0.05

  # Tokens skipped during classification.
  #
  # NOTE: %w splits on whitespace only, so the entries must be separated by
  # spaces. Separating them with commas yields a handful of long
  # comma-joined strings that never match any token.
  STOP_WORDS = %w[
    a able about across after all almost also am among an and
    any are as at be because been but by can cannot could dear
    did do does either else ever every for from get got had has
    have he her hers him his how however i if in into is it its
    just least let like likely may me might most must my neither
    no nor not of off often on only or other our own rather
    really said say says she should since so some than that the
    their them then there these they this tis to too totally twas
    us wants was we were what when where which while who whom why
    will with would yet you your
  ].freeze

  # positive_corpus:: corpus of positive examples
  # negative_corpus:: corpus of negative examples
  def initialize positive_corpus, negative_corpus
    @positive_corpus = positive_corpus
    @negative_corpus = negative_corpus
  end

  # Classifies +sentence+ and returns a ClassificationResult carrying one
  # TokenProbability per scored token, the overall probability and the
  # resulting sentiment.
  #
  # A sentence without any scorable token yields an overall probability of
  # 0.5, which maps to the neutral sentiment.
  def classify sentence
    result = ClassificationResult.new sentence

    # The running products are per-sentence state; start from scratch so an
    # earlier call on the same instance cannot influence this one.
    reset_probabilities

    Document.new(sentence).each_token do |token|
      next if STOP_WORDS.include? token

      positive_count = @positive_corpus.token_count token
      negative_count = @negative_corpus.token_count token

      token_probability = calculate_probability(
        positive_count, @positive_corpus.entry_count,
        negative_count, @negative_corpus.entry_count)

      record_probability token_probability

      result.token_probabilities.push TokenProbability.new(
        token, token_probability, @positive_corpus.entry_count,
        positive_count, @negative_corpus.entry_count,
        negative_count, calculate_sentiment(token_probability)
      )
    end

    result.overall_probability = combine_probabilities
    result.sentiment = calculate_sentiment result.overall_probability

    result
  end

  private

  # Starts both running products at the multiplicative identity.
  def reset_probabilities
    @total_probability = 1.0
    @inverse_total_probability = 1.0
  end

  # Returns the probability that a token is positive.
  #
  # The raw estimate is positive_ratio / (positive_ratio + negative_ratio),
  # where each ratio is the token count divided by the corpus entry count.
  # That estimate is then blended with UNKNOWN_WORD_PROBABILITY: the prior
  # is weighted by UNKNOWN_WORD_STRENGTH and the estimate by the number of
  # times the token was seen, so rarely seen tokens stay close to the prior.
  def calculate_probability positive_count, positive_total, negative_count, negative_total
    total = positive_count + negative_count
    positive_ratio = positive_count.to_f / positive_total
    negative_ratio = negative_count.to_f / negative_total

    probability = positive_ratio / (positive_ratio + negative_ratio)
    # 0.0 / 0.0 is NaN (e.g. the token is in neither corpus); with total == 0
    # the estimate carries no weight below, so any finite stand-in works.
    probability = 0 if probability.nan?

    weighted_prior = UNKNOWN_WORD_STRENGTH * UNKNOWN_WORD_PROBABILITY
    weighted_estimate = total * probability

    (weighted_prior + weighted_estimate) / (UNKNOWN_WORD_STRENGTH + total)
  end

  # Folds one token probability into the running products of p and (1 - p).
  # NaN probabilities are ignored.
  def record_probability probability
    return if probability.nan?

    # A product that has collapsed to zero (e.g. through floating-point
    # underflow) would stay zero forever; restart it instead.
    @total_probability = 1 if @total_probability == 0 || @total_probability.nil?
    @inverse_total_probability = 1 if @inverse_total_probability == 0 || @inverse_total_probability.nil?

    @total_probability *= probability
    @inverse_total_probability *= (1 - probability)
  end

  # Maps a probability to Sentiment::NEGATIVE, Sentiment::POSITIVE or
  # Sentiment::NEUTRAL, using a band of +/- TOLERANCE around
  # UNKNOWN_WORD_PROBABILITY as the neutral zone.
  def calculate_sentiment probability
    return Sentiment::NEGATIVE if probability <= (UNKNOWN_WORD_PROBABILITY - TOLERANCE)
    return Sentiment::POSITIVE if probability >= (UNKNOWN_WORD_PROBABILITY + TOLERANCE)
    Sentiment::NEUTRAL
  end

  # Combines the running products into the overall probability
  # P / (P + Q), where P is the product of the token probabilities and Q the
  # product of their complements. With no recorded token this is 0.5.
  def combine_probabilities
    @total_probability / (@total_probability + @inverse_total_probability)
  end
end