louismullie/treat

View on GitHub
lib/treat/workers/lexicalizers/taggers/stanford.rb

Summary

Maintainability
A
1 hr
Test Coverage
# POS tagging using a maximum entropy model, with (i) 
# explicit use of both preceding and following tag 
# contexts via a dependency network representation, 
# (ii) broad use of lexical features, including jointly 
# conditioning on multiple consecutive words, (iii) 
# effective use of priors in conditional loglinear models, 
# and (iv) fine-grained modeling of unknown word features.
#
# Original paper: Toutanova, Manning, Klein and Singer.
# 2003. Feature-Rich Part-of-Speech Tagging with a
# Cyclic Dependency Network. In Proceedings of the
# Conference of the North American Chapter of the
# Association for Computational Linguistics.
class Treat::Workers::Lexicalizers::Taggers::Stanford

  # Cache of tagger instances, keyed by language symbol.
  @@taggers = {}

  # Options used when the caller does not override them.
  DefaultOptions =  {
    :tagger_model => nil
  }

  # Tag an entity using one of the Stanford taggers.
  #
  # Groups (sentences, phrases, fragments and generic
  # groups) are not sent to the tagger: they receive a
  # one-letter structural tag ('S', 'P', 'F' or 'G').
  # Any other entity has its tokens tagged; for a single
  # token, the raw tag produced by the tagger is returned.
  #
  # Options:
  # - (Object) :tag_set => stored on a group that has no
  #   parent sentence (see the review note below).
  # - (Object) :tagger_model => see get_options.
  def self.tag(entity, options = {})

    # Handle tags for sentences and phrases.
    # NOTE(review): at this point options[:tag_set] is
    # whatever the caller passed (nil when absent); the
    # language tag set is only filled in by get_options
    # further down. Confirm whether this is intended.
    if entity.is_a?(Treat::Entities::Group) &&
      !entity.parent_sentence
      entity.set :tag_set, options[:tag_set]
    end

    # NOTE(review): groups return here, before any of
    # their tokens are tagged. Presumably tokens are
    # tagged by separate calls; verify against callers.
    return 'S' if entity.is_a?(Treat::Entities::Sentence)
    return 'P' if entity.is_a?(Treat::Entities::Phrase)
    return 'F' if entity.is_a?(Treat::Entities::Fragment)
    return 'G' if entity.is_a?(Treat::Entities::Group)

    # Handle options and initialize the tagger. The
    # tagger is created (and its libraries loaded)
    # before get_options, which references constants
    # under StanfordCoreNLP.
    lang = entity.language.intern
    tagger = init_tagger(lang)
    options = get_options(options, lang)
    tokens, t_list = get_token_list(entity)

    # Do the tagging. Only #each is called on the
    # tagger's result, so the position is tracked by
    # hand instead of assuming it is Enumerable.
    isolated_token = entity.is_a?(Treat::Entities::Token)
    i = 0

    tagger.apply(t_list).each do |tok|
      token = tokens[i]
      token.set(:tag, normalize_tag(tok.tag))
      if isolated_token
        token.set(:tag_set, options[:tag_set])
        # The caller receives the raw tagger output.
        return tok.tag
      end
      i += 1
    end

  end

  # Reduce a raw tag to the part before its first hyphen
  # (e.g. 'NN-XX' becomes 'NN'). A tag that itself starts
  # with a hyphen, such as the bracket tags '-LRB-' and
  # '-RRB-', is returned unchanged: splitting it on '-'
  # would yield an empty string as its first element.
  def self.normalize_tag(raw_tag)
    return raw_tag if raw_tag.start_with?('-')
    raw_tag.split('-').first
  end

  # Return the tagger for a language, creating and
  # caching it on first use.
  def self.init_tagger(language)
    @@taggers[language] ||= begin
      Treat::Loaders::Stanford.load(language)
      unless StanfordCoreNLP.const_defined?('MaxentTagger')
        StanfordCoreNLP.load_class('MaxentTagger',
        'edu.stanford.nlp.tagger.maxent')
      end
      model = Treat::Loaders::Stanford.find_model(:pos, language)
      StanfordCoreNLP::MaxentTagger.new(model)
    end
  end

  # Merge the caller's options over DefaultOptions and
  # return the result. When :tagger_model is given it is
  # registered through StanfordCoreNLP.set_model. The
  # :tag_set entry is always replaced by the tag set
  # configured for the language.
  #
  # NOTE(review): tag calls this after init_tagger, so a
  # :tagger_model passed here presumably cannot affect a
  # tagger that is already built; confirm.
  def self.get_options(options, language)
    options = DefaultOptions.merge(options)
    if options[:tagger_model]
      StanfordCoreNLP.set_model('pos.model',
      options[:tagger_model])
    end
    options[:tag_set] =
    StanfordCoreNLP::Config::TagSets[language]
    options
  end

  # Build the tagger input for an entity. Returns a pair:
  # the Treat tokens (the entity itself when it is a
  # token, otherwise entity.tokens) and a
  # StanfordCoreNLP::ArrayList holding one
  # StanfordCoreNLP::Word per token, in the same order.
  def self.get_token_list(entity)
    tokens = entity.is_a?(Treat::Entities::Token) ?
      [entity] : entity.tokens
    list = StanfordCoreNLP::ArrayList.new
    tokens.each do |token|
      list.add(StanfordCoreNLP::Word.new(token.to_s))
    end
    return tokens, list
  end

end