lib/greeb/parser.rb from dmchk/greeb

lib/greeb/parser.rb
Summary

Maintainability

3 hrs
Test Coverage

Issues
# encoding: utf-8

# It is often necessary to find different entities in a natural language
# text. These entities are URLs, e-mail addresses, names, etc. This module
# includes several helpers that could help to solve these problems.
#
module Greeb::Parser extend self
  # A URL pattern. Not so precise, but IDN-compatible.
  #
  URL = %r{\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|/)))}i

  # A horrible e-mail pattern. The top-level domain accepts two or more
  # letters, so that long TLDs (".museum", ".online") are matched entirely
  # instead of being cut after the fourth letter.
  #
  EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/i

  # Another horrible pattern. Now for abbreviations: one or more
  # "letter + dot" groups, each optionally preceded by a hyphen and
  # optionally separated by single spaces.
  #
  ABBREV = /\b((-{0,1}\p{L}\.)*|(-{0,1}\p{L}\. )*)-{0,1}\p{L}\./i

  # This pattern matches anything that looks like HTML. Or not. It is
  # simply the shortest run of characters enclosed in angle brackets.
  #
  HTML = /<(.*?)>/i

  # Time pattern: H:MM or HH:MM with an optional :SS part. The digit ranges
  # are deliberately loose and are not validated as a real time of day.
  #
  TIME = /\b(\d|[0-2]\d):[0-6]\d(:[0-6]\d){0,1}\b/i

  # Apostrophe pattern: both the ASCII and the typographic apostrophe.
  #
  APOSTROPHE = /['’]/i

  # Span types that are glued into a single span when they are neighbours.
  # Frozen because it is a shared constant that is only ever read.
  #
  TOGETHER = [:letter, :integer, :apostrophe, :together].freeze

  # Find every fragment of the text that matches the {URL} pattern. Note
  # that URL is an obsolete standard, so this code should eventually be
  # rewritten around the URI concept.
  #
  # @param text [String] text to be searched.
  #
  # @return [Array<Greeb::Span>] spans of the +:url+ type.
  #
  def urls(text)
    pattern = URL
    scan(text, pattern, :url)
  end

  # Find every fragment of the text that matches the {EMAIL} pattern.
  #
  # @param text [String] text to be searched.
  #
  # @return [Array<Greeb::Span>] spans of the +:email+ type.
  #
  def emails(text)
    pattern = EMAIL
    scan(text, pattern, :email)
  end

  # Find every fragment of the text that matches the {ABBREV} pattern.
  #
  # @param text [String] text to be searched.
  #
  # @return [Array<Greeb::Span>] spans of the +:abbrev+ type.
  #
  def abbrevs(text)
    pattern = ABBREV
    scan(text, pattern, :abbrev)
  end

  # Find every fragment of the text that matches the {HTML} pattern, i.e.
  # anything enclosed in angle brackets.
  #
  # @param text [String] text to be searched.
  #
  # @return [Array<Greeb::Span>] spans of the +:html+ type.
  #
  def html(text)
    pattern = HTML
    scan(text, pattern, :html)
  end

  # Find every fragment of the text that matches the {TIME} pattern.
  #
  # @param text [String] text to be searched.
  #
  # @return [Array<Greeb::Span>] spans of the +:time+ type.
  #
  def time(text)
    pattern = TIME
    scan(text, pattern, :time)
  end

  # Retrieve apostrophes from the tokenized text and glue each of them to
  # the letter spans around it. The algorithm may be more optimal.
  #
  # The +spans+ array is modified in place: every matched
  # letter-apostrophe-letter triple is replaced by a single span that keeps
  # the type of the first element of the triple.
  #
  # @param text [String] input text.
  # @param spans [Array<Greeb::Span>] already tokenized text.
  #
  # @return [Array<Greeb::Span>] the merged spans created by this call; an
  #   empty array when the text contains no apostrophes.
  #
  def apostrophes(text, spans)
    apostrophes = scan(text, APOSTROPHE, :apostrophe)
    return [] if apostrophes.empty?

    # Hand every apostrophe span to Greeb.extract_spans together with the
    # token list, then empty the local array so that it can collect the
    # merged spans below.
    # NOTE(review): presumably extract_spans splits +spans+ in place so that
    # each apostrophe becomes an element of its own -- confirm against its
    # definition, which is not visible here.
    apostrophes.each { |s| Greeb.extract_spans(spans, s) }.clear

    # Enumerable#reverse_each first collects all (span, index) triples into
    # a temporary array, so the assignments to +spans+ below do not disturb
    # the iteration, and walking backwards keeps the lower indices valid.
    # NOTE(review): each_cons(3) only yields complete triples, so s3 is never
    # nil here; the `!s3` / `unless s3` handling of a trailing
    # "letter + apostrophe" pair looks unreachable -- confirm.
    # NOTE(review): for overlapping triples (letter ' letter ' letter) the
    # earlier triple still refers to the span that existed before the later
    # merge, so the tail of the later merge appears to be lost -- confirm.
    spans.each_with_index.each_cons(3).reverse_each do |(s1, i), (s2, j), (s3, k)|
      next unless s1 && s1.type == :letter
      next unless s2 && s2.type == :apostrophe
      next unless !s3 || s3 && s3.type == :letter
      s3, k = s2, j unless s3
      apostrophes << Greeb::Span.new(s1.from, s3.to, s1.type)
      # Replace the whole triple in the token list by the merged span.
      spans[i..k] = apostrophes.last
    end

    apostrophes
  end

  # Merge neighbouring spans whose types are listed in {TOGETHER} into a
  # single +:together+ span. A run of any length collapses into one span
  # that starts at the first element and ends at the last one; spans that
  # have no mergeable neighbour are left untouched.
  #
  # The +spans+ array is modified in place and is also the return value.
  #
  # @param spans [Array<Greeb::Span>] already tokenized text.
  #
  # @return [Array<Greeb::Span>] merged spans (the same array object).
  #
  def together(spans)
    merged = spans.each_with_object([]) do |span, result|
      previous = result.last

      if previous && TOGETHER.include?(previous.type) && TOGETHER.include?(span.type)
        # Extend the span built so far instead of replacing it by index:
        # :together is itself a mergeable type, so the run keeps growing
        # and no part of it is dropped.
        result[-1] = Greeb::Span.new(previous.from, span.to, :together)
      else
        result << span
      end
    end

    # Keep the in-place contract the callers rely on.
    spans.replace(merged)
  end

  private
  # Implementation of regexp-based {Greeb::Span} scanner.
  #
  # The text is searched from left to right by position and is never
  # sliced, so context-sensitive constructs (+\b+, look-behind) still see
  # the characters that precede a match. Empty matches are skipped: they
  # carry no text and would otherwise keep the search at the same place
  # forever.
  #
  # @param text [String, nil] input text; +nil+ yields no spans.
  # @param regexp [Regexp] regular expression to be used.
  # @param type [Symbol] type field for the new {Greeb::Span} instances.
  # @param offset [Integer] value added to both boundaries of every span.
  #
  # @return [Array<Greeb::Span>] found entities, in order of appearance.
  #
  def scan(text, regexp, type, offset = 0)
    matches = []
    return matches unless text

    position = 0
    while position <= text.length && (md = text.match(regexp, position))
      start, stop = md.offset(0)

      if stop > start
        matches << Greeb::Span.new(offset + start, offset + stop, type)
        position = stop
      else
        # Zero-length match: step over one character to guarantee progress.
        position = stop + 1
      end
    end

    matches
  end
end