# lib/tactful_tokenizer.rb
# -*- encoding : utf-8 -*-
# TactfulTokenizer is a Ruby library for high quality sentence
# tokenization. It uses a Naive Bayesian statistical model, and
# is based on Splitta[http://code.google.com/p/splitta/]. But
# has support for '?' and '!' as well as primitive handling of
# XHTML markup. Better support for XHTML parsing is coming shortly.
#
# Example usage:
#
# require "tactful_tokenizer"
# m = TactfulTokenizer::Model.new
# m.tokenize_text("Here in the U.S. Senate we prefer to eat our friends. Is it easier that way, really? Yes.")
# #=> ["Here in the U.S. Senate we prefer to eat our friends.", "Is it easier that way, really?", "Yes."]
#
# The input text is expected to consist of paragraphs delimited
# by line breaks.
#
# Author:: Matthew Bunday (mailto:mkbunday@gmail.com)
# License:: GNU General Public License v3
# Performance TODOs.
# TODO: Use inline C where necessary?
# TODO: Use RE2 regexp extension.
# -*- encoding : utf-8 -*-
require "word_tokenizer.rb"
include WordTokenizer
module TactfulTokenizer
# Basic String extensions.
# Basic String extensions.
String.class_eval do
  # True when the string contains at least one alphabetic *or whitespace*
  # character (this Splitta-derived definition deliberately counts spaces).
  def is_alphabetic?
    !!(self =~ /[[:lower:][:upper:][:space:]]+/u)
  end

  # True when the string equals its own upcased form.
  # Surprisingly faster than a regex in benchmarks.
  # Note: strings with no letters at all (e.g. "123") count as upper case.
  def is_upper_case?
    upcase == self
  end
end
# A model stores normalized probabilities of different features occurring.
class Model
  # Initialize the model. feats, lower_words, and non_abbrs
  # indicate the locations of the respective Marshal dumps.
  #
  # NOTE(review): Marshal.load is unsafe on untrusted data; these paths
  # should only ever point at the gem's own bundled model files.
  def initialize(feats = "#{File.dirname(__FILE__)}/models/features.mar", lower_words = "#{File.dirname(__FILE__)}/models/lower_words.mar", non_abbrs = "#{File.dirname(__FILE__)}/models/non_abbrs.mar")
    @feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
      File.open(file) { |f| Marshal.load(f.read) }
    end
    # Prior probability of a sentence boundary, raised to the 4th power,
    # as in the original Splitta implementation.
    @p0 = @feats["<prior>"] ** 4
  end

  # feats = {feature => normalized probability of feature}
  # lower_words = {token => log count of occurrences in lower case}
  # non_abbrs = {token => log count of occurrences when not an abbrv.}
  attr_accessor :feats, :lower_words, :non_abbrs

  # This function is the only one that'll end up being used externally.
  # m = TactfulTokenizer::Model.new
  # m.tokenize_text("Hey, are these two sentences? I bet they should be.")
  # => ["Hey, are these two sentences?", "I bet they should be."]
  def tokenize_text(text)
    data = Doc.new(text)
    featurize(data)
    classify(data)
    data.segment
  end

  # Assign a prediction (probability, to be precise) to each sentence fragment.
  # For each feature in each fragment we hunt up the normalized probability and
  # multiply. This is a fairly straightforward Bayesian probabilistic algorithm.
  def classify(doc)
    doc.frags.each do |frag|
      probs = @p0
      frag.features.each do |feat|
        probs *= @feats[feat]
      end
      # Normalize the raw product into a probability in (0, 1).
      frag.pred = probs / (probs + 1)
    end
  end

  # Get the features of every fragment.
  def featurize(doc)
    doc.frags.each do |frag|
      get_features(frag, self)
    end
  end

  # Finds the features in a text fragment of the form:
  # ... w1. (sb?) w2 ...
  # Features listed in rough order of importance:
  # * w1: a word that includes a period.
  # * w2: the next word, if it exists.
  # * w1length: the number of alphabetic characters in w1.
  # * both: w1 and w2 taken together.
  # * w1abbr: logarithmic count of w1 occurring without a period.
  # * w2lower: logarithmic count of w2 occurring lowercased.
  def get_features(frag, model)
    w1 = frag.cleaned.last || ''
    w2 = frag.next || ''
    frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]
    unless w2.empty?
      frag.push_w1_features(w1, model)
      frag.push_w2_features(w2, model)
    end
  end
end
# A document represents the input text. It holds a list of fragments generated
# from the text.
# A document represents the input text. It holds a list of fragments generated
# from the text.
class Doc
  # List of fragments.
  attr_accessor :frags

  # Receives a text, which is then broken into fragments.
  # A fragment ends with a period, question mark, or exclamation mark followed
  # possibly by right handed punctuation like quotation marks or closing braces
  # and trailing whitespace. Failing that, it'll accept something like "I hate cheese\n"
  # No, it doesn't have a period, but that's the end of paragraph.
  #
  # Input assumption: Paragraphs delimited by line breaks.
  def initialize(text)
    @frags = []
    text.each_line do |line|
      next if line.strip.empty?
      line.split(/(.*?[.!?](?:[”"')\]}]|(?:<.*>))*[[:space:]])/u).each do |piece|
        next if piece.strip.empty?
        frag = Frag.new(piece)
        # Tell the previous fragment which word follows it — that word is
        # one of the features used for the boundary decision.
        @frags.last.next = frag.cleaned.first unless @frags.empty?
        @frags.push frag
      end
    end
  end

  # Segments the text. More precisely, it reassembles the fragments into sentences.
  # We call something a sentence whenever it is more likely to be a sentence than not.
  def segment
    sents, sent = [], []
    thresh = 0.5
    @frags.each do |frag|
      sent.push(frag.orig)
      if frag.pred && frag.pred > thresh
        # NOTE(review): orig is always set by Frag#initialize, so this break
        # looks unreachable; kept for safety.
        break if frag.orig.nil?
        sents.push(sent.join('').strip)
        sent = []
      end
    end
    sents
  end
end
# A fragment is a potential sentence, but is based only on the existence of a period.
# The text "Here in the U.S. Senate we prefer to devour our friends." will be split
# into "Here in the U.S." and "Senate we prefer to devour our friends."
# A fragment is a potential sentence, but is based only on the existence of a period.
# The text "Here in the U.S. Senate we prefer to devour our friends." will be split
# into "Here in the U.S." and "Senate we prefer to devour our friends."
class Frag
  # orig = The original text of the fragment.
  # next = The next word following the fragment.
  # cleaned = Array of the fragment's words after cleaning.
  # pred = Probability that the fragment is a sentence.
  # features = Array of the fragment's features.
  attr_accessor :orig, :next, :cleaned, :pred, :features

  # Create a new fragment.
  def initialize(orig = '')
    @orig = orig
    clean(orig)
    @next, @pred, @features = nil, nil, nil
  end

  # Normalizes numbers and discards ambiguous punctuation. And then splits into an
  # array, because realistically only the last and first words are ever accessed.
  def clean(s)
    # Copy: the gsub! calls below must not mutate the caller's string.
    @cleaned = String.new(s)
    # WordTokenizer#tokenize, mixed in at file level.
    tokenize(@cleaned)
    @cleaned.gsub!(/[.,\d]*\d/, '<NUM>')
    @cleaned.gsub!(/[^[[:upper:][:lower:]]\d[:space:],!?.;:<>\-'\/$% ]/u, '')
    @cleaned.gsub!('--', ' ')
    @cleaned = @cleaned.split
  end

  # Push features derived from w1 (the word carrying the period):
  # its length capped at 10, and the model's log count of it occurring
  # without a period (i.e. not as an abbreviation).
  def push_w1_features(w1, model)
    if w1.chop.is_alphabetic?
      features.push "w1length_#{[10, w1.length].min}", "w1abbr_#{model.non_abbrs[w1.chop]}"
    end
  end

  # Push features derived from w2 (the word after the candidate boundary):
  # whether its first character is upper case, and the model's log count
  # of it occurring lowercased.
  def push_w2_features(w2, model)
    if w2.chop.is_alphabetic?
      features.push "w2cap_#{w2[0,1].is_upper_case?}", "w2lower_#{model.lower_words[w2.downcase]}"
    end
  end
end
end