lib/treat/workers/processors/tokenizers/ptb.rb
# encoding: utf-8
# Tokenization based on the tokenizer developed by
# Robert MacIntyre in 1995 for the Penn Treebank
# project. This tokenizer mostly follows the conventions
# used by the Penn Treebank. N.B. Contrary to the
# standard PTB tokenization, double quotes (") are
# NOT changed to doubled single forward- and
# backward- quotes (`` and '') by default.
#
# Authors: Utiyama Masao (mutiyama@nict.go.jp).
# License: Ruby License.
class Treat::Workers::Processors::Tokenizers::PTB
  # Default options for the tokenizer.
  # Frozen so the shared defaults cannot be modified in
  # place; callers obtain their own copy via Hash#merge.
  DefaultOptions = {
    directional_quotes: false
  }.freeze

  # Perform tokenization of the entity and add
  # the resulting tokens as its children.
  #
  # The string form of the entity (entity.to_s) is cut
  # into chunks by ::split, and every chunk is appended
  # to the entity as a Treat::Entities::Token.
  #
  # Options:
  # - (Boolean) => :directional_quotes whether to
  # replace double quotes by `` and '' or not.
  #
  # Raises Treat::Exception if the entity already has
  # children. Returns the array of chunks produced by
  # ::split.
  def self.tokenize(entity, options = {})
    options = DefaultOptions.merge(options)
    # check_hasnt_children is defined outside this file;
    # presumably it raises on its own (TODO confirm). The
    # explicit guard below is kept as a fallback.
    entity.check_hasnt_children
    if entity.has_children?
      raise Treat::Exception,
            "Cannot tokenize an #{entity.class} " +
            "that already has children."
    end
    chunks = split(entity.to_s, options)
    chunks.each do |chunk|
      # NOTE(review): this match is unanchored, so a chunk
      # containing any [[:space:]] character that \s did not
      # split on would be skipped whole - confirm intended.
      next if chunk =~ /([[:space:]]+)/
      entity << Treat::Entities::Token.from_string(chunk)
    end
  end

  # Split a string into an array of token strings by
  # applying an ordered list of regexp substitutions and
  # then splitting on whitespace. The order of the rules
  # matters: later rules rely on the spacing introduced
  # by earlier ones. The input string is not modified.
  #
  # Options:
  # - (Boolean) => :directional_quotes when true, double
  # quotes are returned as `` and ''; otherwise they are
  # returned as plain ". Defaults to a falsy value.
  #
  # Returns an Array of Strings (empty for blank input).
  def self.split(string, options = {})
    # Work on a padded copy so that rules anchored on a
    # surrounding space also apply at both ends.
    s = " " + string + " "

    # Map typographic quotes to their ASCII forms.
    s.gsub!(/‘/,"'")
    s.gsub!(/’/,"'")
    s.gsub!(/“/,"``")
    s.gsub!(/”/,"''")

    # Collapse every whitespace run (newlines included)
    # into a single space.
    s.gsub!(/\s+/," ")

    # Doubled quotes next to whitespace become a plain ",
    # which the quote rules below then classify.
    s.gsub!(/(\s+)''/,'\1"')
    s.gsub!(/(\s+)``/,'\1"')
    s.gsub!(/''(\s+)/,'"\1')
    s.gsub!(/``(\s+)/,'"\1')

    # Detach a leading run of ' or ` from the text that
    # follows it, unless that text starts with a digit.
    s.gsub!(/ (['`]+)([^0-9].+) /,' \1 \2 ')

    # A " right after a space or an opening bracket is an
    # opening quote.
    s.gsub!(/([ (\[{<])"/,'\1 `` ')

    # Put spaces around ellipses and around , ; : @ # $ % &.
    s.gsub!(/\.\.\./,' ... ')
    s.gsub!(/[,;:@\#$%&]/,' \& ')

    # Split off a period at the very end of the text (no
    # newlines remain at this point), keeping any closing
    # brackets or quotes that follow it attached to it.
    s.gsub!(/([^.])([.])([\])}>"']*)[ ]*$/,'\1 \2\3 ')

    # Put spaces around ? and !, brackets, and double dashes.
    s.gsub!(/[?!]/,' \& ')
    s.gsub!(/[\]\[(){}<>]/,' \& ')
    s.gsub!(/--/,' -- ')

    # Pad both ends once more for the rules below.
    s.sub!(/$/,' ')
    s.sub!(/^/,' ')

    # Every remaining " is a closing quote.
    s.gsub!(/"/,' \'\' ')

    # Split off a ' that ends a word (not preceded by ').
    s.gsub!(/([^'])' /,'\1 \' ')

    # Split clitics and contractions off the preceding word.
    s.gsub!(/'([sSmMdD]) /,' \'\1 ')
    s.gsub!(/'ll /,' \'ll ')
    s.gsub!(/'re /,' \'re ')
    s.gsub!(/'ve /,' \'ve ')
    s.gsub!(/n't /,' n\'t ')
    s.gsub!(/'LL /,' \'LL ')
    s.gsub!(/'RE /,' \'RE ')
    s.gsub!(/'VE /,' \'VE ')
    s.gsub!(/N'T /,' N\'T ')

    # Split a fixed list of fused words into two tokens.
    s.gsub!(/ ([Cc])annot /,' \1an not ')
    s.gsub!(/ ([Dd])'ye /,' \1\' ye ')
    s.gsub!(/ ([Gg])imme /,' \1im me ')
    s.gsub!(/ ([Gg])onna /,' \1on na ')
    s.gsub!(/ ([Gg])otta /,' \1ot ta ')
    s.gsub!(/ ([Ll])emme /,' \1em me ')
    s.gsub!(/ ([Mm])ore'n /,' \1ore \'n ')
    s.gsub!(/ '([Tt])is /,' \'\1 is ')
    s.gsub!(/ '([Tt])was /,' \'\1 was ')
    s.gsub!(/ ([Ww])anna /,' \1an na ')

    # Re-join digit groups that the comma rule above pulled
    # apart ("1 , 000" -> "1,000"). The left operand may
    # already contain commas from a previous pass, so that
    # numbers with several groups ("1,000,000") are fully
    # re-joined instead of stopping after the first group.
    # Each successful pass shortens the string, so the loop
    # terminates.
    loop do
      break unless s.sub!(/(\s)([0-9][0-9,]*) , ([0-9]+)(\s)/, '\1\2,\3\4')
    end

    # Put spaces around slashes.
    s.gsub!(/\//, ' / ')

    # Normalize spacing before the final split.
    s.gsub!(/\s+/,' ')
    s.strip!

    # Remove directional quotes.
    unless options[:directional_quotes]
      s.gsub!(/``/,'"')
      s.gsub!(/''/,'"')
    end

    s.split(/\s+/)
  end
end