louismullie/treat

View on GitHub
lib/treat/workers/extractors/name_tag/stanford.rb

Summary

Maintainability
A
1 hr
Test Coverage
# Named entity tag extraction using the Stanford NLP
# CRFClassifier (the Stanford Named Entity Recognizer),
# a sequence labeler based on conditional random fields.
#
# Original paper: Jenny Rose Finkel, Trond Grenager,
# Christopher Manning. Incorporating Non-local Information
# into Information Extraction Systems by Gibbs Sampling.
# In Proceedings of the 43rd Annual Meeting of the
# Association for Computational Linguistics (ACL 2005),
# 2005.
class Treat::Workers::Extractors::NameTag::Stanford

  # Invoke the Stanford loader as soon as this class body is evaluated.
  Treat::Loaders::Stanford.load

  # Classifiers that have already been built, keyed by language,
  # so that later calls for the same language reuse them.
  @@classifiers = {}

  # Run the CRF classifier for the entity's language over the
  # entity's tokens and record the resulting name tags.
  #
  # * If +entity+ is itself a Treat::Entities::Token, nothing is
  #   set on it; its tag is returned instead (a downcased string,
  #   or nil when the classifier's answer downcases to 'o').
  # * Otherwise every token whose answer is not 'o' gets its
  #   :name_tag feature set, and the method returns whatever
  #   +each+ returns on the classified sentence.
  #
  # +options+ is accepted but not read by this worker.
  def self.name_tag(entity, options = {})

    lang = entity.language
    Treat::Loaders::Stanford.load(lang)

    single = entity.is_a?(Treat::Entities::Token)
    targets = single ? [entity] : entity.tokens

    ner = @@classifiers[lang]
    unless ner
      model = Treat::Loaders::Stanford.find_model(:ner, lang)
      # Bind the Java class only if it has not been bound yet.
      StanfordCoreNLP.load_class(
        'CRFClassifier', 'edu.stanford.nlp.ie.crf'
      ) unless StanfordCoreNLP.const_defined?('CRFClassifier')
      ner = @@classifiers[lang] =
        StanfordCoreNLP::CRFClassifier.getClassifier(model)
    end

    labelled = ner.classify_sentence(StanfordCoreNLP.get_list(targets))

    # The classified sentence is walked with a plain +each+, so the
    # position of the matching token is tracked by hand.
    position = 0

    labelled.each do |label|
      answer = label.get(:answer).to_s.downcase
      # An answer of 'o' is treated as "no tag"
      # (presumably the classifier's outside-any-entity label).
      answer = nil if answer == 'o'
      # A lone token has exactly one answer of interest: hand it back.
      return answer if single
      targets[position].set(:name_tag, answer) if answer
      position += 1
    end

  end

end