louismullie/treat

View on GitHub
lib/treat/workers/processors/tokenizers/ptb.rb

Summary

Maintainability
A
2 hrs
Test Coverage
# encoding: utf-8
# Tokenization based on the tokenizer developed by
# Robert MacIntyre in 1995 for the Penn Treebank
# project. This tokenizer mostly follows the conventions
# used by the Penn Treebank. N.B. Contrary to the 
# standard PTB tokenization, double quotes (") are 
# NOT changed to doubled single forward- and 
# backward- quotes (`` and '') by default.
#
# Authors: Utiyama Masao (mutiyama@nict.go.jp).
# License: Ruby License.
class Treat::Workers::Processors::Tokenizers::PTB

  # Default options for the tokenizer.
  DefaultOptions = {
    directional_quotes: false
  }

  # Perform tokenization of the entity and add
  # the resulting tokens as its children.
  #
  # Options:
  # - (Boolean) => :directional_quotes whether to
  # replace double quotes by `` and '' or not.
  #
  # Raises Treat::Exception when the entity
  # already has children.
  def self.tokenize(entity, options = {})
    options = DefaultOptions.merge(options)
    # check_hasnt_children is defined outside this
    # file; the explicit check below guarantees the
    # exception regardless of what that helper does.
    entity.check_hasnt_children
    if entity.has_children?
      raise Treat::Exception,
      "Cannot tokenize an #{entity.class} " +
      "that already has children."
    end
    chunks = split(entity.to_s, options)
    chunks.each do |chunk|
      # Defensive only: split treats every kind of
      # whitespace as a separator, so a chunk should
      # never contain any.
      next if chunk =~ /[[:space:]]/
      entity << Treat::Entities::Token.
      from_string(chunk)
    end
  end

  # Split a string into an array of token strings
  # by applying an ordered list of rewrite rules
  # and then cutting on whitespace. The order of
  # the rules matters: later rules rely on the
  # spacing introduced by earlier ones.
  #
  # Options:
  # - (Boolean) => :directional_quotes when false
  # (or absent), `` and '' are turned back into ".
  def self.split(string, options = DefaultOptions)

    # Pad with spaces so that rules anchored on a
    # neighbouring space also apply at both ends.
    s = " " + string + " "

    # Map typographic quotes to their ASCII forms.
    s.gsub!(/‘/,"'")
    s.gsub!(/’/,"'")
    s.gsub!(/“/,"``")
    s.gsub!(/”/,"''")

    # Collapse each whitespace run to a single space.
    # [[:space:]] is used instead of \s because \s
    # only matches ASCII whitespace; a non-breaking
    # space (for instance) would otherwise stay glued
    # inside a chunk, and tokenize would then discard
    # that whole chunk.
    s.gsub!(/[[:space:]]+/," ")

    # `` and '' touching whitespace become a plain ".
    s.gsub!(/(\s+)''/,'\1"')
    s.gsub!(/(\s+)``/,'\1"')
    s.gsub!(/''(\s+)/,'"\1')
    s.gsub!(/``(\s+)/,'"\1')

    # Detach a leading run of ' or ` from the text
    # that follows it, unless a digit comes next.
    s.gsub!(/ (['`]+)([^0-9].+) /,' \1 \2 ')
    # A " after a space or opening bracket opens a quote.
    s.gsub!(/([ (\[{<])"/,'\1 `` ')

    # Isolate ellipses and common punctuation marks.
    s.gsub!(/\.\.\./,' ... ')
    s.gsub!(/[,;:@\#$%&]/,' \& ')
    # Detach a final period (optionally followed by
    # closing brackets or quotes) at the end of the
    # string, provided it is not part of a run of dots.
    s.gsub!(/([^.])([.])([\])}>"']*)[ \t]*$/,'\1 \2\3 ')
    s.gsub!(/[?!]/,' \& ')
    s.gsub!(/[\]\[(){}<>]/,' \& ')
    s.gsub!(/--/,' -- ')

    # Re-pad both ends; the rules below match on
    # a trailing or leading space.
    s.sub!(/$/,' ')
    s.sub!(/^/,' ')

    # Every remaining " is a closing quote.
    s.gsub!(/"/,' \'\' ')
    # Detach a single quote that ends a word.
    s.gsub!(/([^'])' /,'\1 \' ')

    # Split clitics off the word they attach to.
    s.gsub!(/'([sSmMdD]) /,' \'\1 ')
    s.gsub!(/'ll /,' \'ll ')
    s.gsub!(/'re /,' \'re ')
    s.gsub!(/'ve /,' \'ve ')
    s.gsub!(/n't /,' n\'t ')
    s.gsub!(/'LL /,' \'LL ')
    s.gsub!(/'RE /,' \'RE ')
    s.gsub!(/'VE /,' \'VE ')
    s.gsub!(/N'T /,' N\'T ')

    # Split fused word forms into two tokens.
    s.gsub!(/ ([Cc])annot /,' \1an not ')
    s.gsub!(/ ([Dd])'ye /,' \1\' ye ')
    s.gsub!(/ ([Gg])imme /,' \1im me ')
    s.gsub!(/ ([Gg])onna /,' \1on na ')
    s.gsub!(/ ([Gg])otta /,' \1ot ta ')
    s.gsub!(/ ([Ll])emme /,' \1em me ')
    s.gsub!(/ ([Mm])ore'n /,' \1ore \'n ')
    s.gsub!(/ '([Tt])is /,' \'\1 is ')
    s.gsub!(/ '([Tt])was /,' \'\1 was ')
    s.gsub!(/ ([Ww])anna /,' \1an na ')

    # Rejoin numbers that the comma rule pulled apart
    # ("1 , 000" => "1,000"). sub! returns nil once
    # nothing matches, which ends the loop; looping
    # handles several comma groups in one number.
    while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4'); end

    # Isolate slashes, then tidy up the spacing.
    s.gsub!(/\//, ' / ')
    s.gsub!(/\s+/,' ')
    s.strip!

    # Remove directional quotes.
    unless options[:directional_quotes]
      s.gsub!(/``/,'"')
      s.gsub!(/''/,'"')
    end

    s.split(/\s+/)
  end

end