louismullie/treat

View on GitHub
lib/treat/workers/processors/parsers/enju.rb

Summary

Maintainability
C
1 day
Test Coverage
# This class is a wrapper for the Enju syntactic
# parser for English. Given an entity's string value,
# the parser formats it, runs it through Enju, and
# parses the XML output by Enju using the Nokogiri
# XML reader. It creates wrappers for the sentences,
# syntactical phrases and tokens that Enju identified.
#
# Original paper: Takuya M., Yusuke M., and Jun'ichi T.
# 2007. Efficient HPSG Parsing with Supertagging and
# CFG-filtering. In Proceedings of IJCAI 2007.
class Treat::Workers::Processors::Parsers::Enju

  # Require the 'open3' library to connect
  # with the background Enju process.
  require 'open3'
  
  # Require the Nokogiri XML parser.
  require 'nokogiri'
  
  # The most recently started Enju process: the value
  # returned by Open3.popen3 in self.proc. It is nil
  # until self.proc has been called for the first time.
  @@parser = nil
  
  # A hash of Enju cat tags mapped to word categories.
  # cleanup_attributes looks a tok's cat up in it to
  # fill in the :category feature.
  Ectc = Treat.tags.enju.cat_to_category
  
  # A table of Enju cat/xcat pairs mapped to PTB tags.
  # cleanup_attributes reads each entry as a triple of
  # [cat, xcat, tag] and uses the tag of the first entry
  # whose cat and xcat both match.
  Ecxtp = Treat.tags.enju.xcat_to_ptb
  
  # Parse the entity into its syntactical
  # phrases using Enju.
  #
  # The entity's string value is normalized by
  # valid_text, written as one line to the Enju
  # process, and the line Enju writes back is turned
  # by build into a tree whose children replace the
  # entity's own children. Heads and edges are then
  # linked on the resulting tree.
  #
  # Options: none.
  #
  # Prints a warning and returns nil when Enju gave
  # no output or the output produced no tree.
  def self.parse(entity, options = {})

    entity.check_hasnt_children
    val = entity.to_s

    # Reset the per-parse lookup tables that build,
    # link_heads and add_edges share.
    @@id_table = {}
    @@edges_table = {}

    stdin, stdout = proc
    text, remove_last = valid_text(val)
    stdin.puts(text + "\n")

    # gets returns nil at end of file (for instance
    # when the Enju process has exited). Treat that
    # as "no parse" rather than handing nil to the
    # XML reader.
    xml = stdout.gets
    parsed = xml.nil? ? nil : build(xml, remove_last)

    unless parsed
      warn "Warning - Enju couldn't " +
      "parse the text '#{entity.short_value}'."
      return
    end

    entity.remove_all!
    parsed.children.each do |child|
      entity << child
    end

    # Remove the period we added at the end, if the
    # parse actually produced a punctuation to remove.
    if remove_last
      last = entity.punctuations[-1]
      entity.remove!(last) unless last.nil?
    end

    link_heads(entity)
    add_edges(entity)
  end
  
  # Return the process running Enju, starting it
  # on first use and reusing it afterwards.
  #
  # Returns the array produced by Open3.popen3:
  # stdin, stdout, stderr and (where Open3 supplies
  # it) the wait thread of the child process.
  #
  # Raises Treat::Exception if Enju can't be started.
  def self.proc
    if @@parser
      wait_thr = @@parser[3]
      # Keep the existing process as long as it is
      # running. Without a wait thread there is no
      # way to tell, so assume that it still is.
      return @@parser if wait_thr.nil? || wait_thr.alive?
      # The previous process has exited: release its
      # pipes before starting a replacement.
      @@parser[0..2].each do |io|
        io.close unless io.nil? || io.closed?
      end
      @@parser = nil
    end
    begin
      @@parser = ::Open3.popen3("enju -xml -i")
    rescue StandardError => e
      # StandardError covers the Errno::* errors raised
      # when the command can't be spawned, without also
      # trapping signals and interpreter exits.
      raise Treat::Exception,
      "Couldn't initialize Enju: #{e.message}."
    end
    @@parser
  end
  
  # Parses an Enju XML output file using the Nokogiri
  # XML reader and converts that structure into a tree
  # of wrappers for textual entities.
  #
  # xml         - the XML string written by Enju.
  # remove_last - accepted for the caller's convenience;
  #               it is not used while building the tree.
  #
  # Fills @@id_table (Enju id => entity id) and
  # @@edges_table (entity id => edges) as a side effect.
  #
  # Returns the entity created for the <sentence>
  # element, or nil if there was nothing to read or
  # no <sentence> element was found.
  def self.build(xml, remove_last = false)
    # Nothing to read means nothing was parsed.
    return nil if xml.nil? || xml.strip.empty?
    # Read in the XML file.
    reader = Nokogiri::XML::Reader.from_memory(xml)
    root = nil
    entity = nil
    pd = 0
    # The id, attributes and edges of the last <tok>
    # element seen. The word itself is only created
    # when the text node inside the <tok> is read, and
    # that text node carries no attributes of its own.
    tmp_id = nil
    tmp_attributes = {}
    tmp_edges = {}
    # Read the XML file entity by entity.
    while reader.read
      # The depth in the XML tree.
      cd = reader.depth
      # If we are at the end of the
      # children stack, pop up.
      if pd > cd && !entity.nil?
        entity = entity.parent
      end
      # If an end element has been reached,
      # change the depth and pop up on next
      # iteration.
      if reader.node_type ==
        Nokogiri::XML::Reader::TYPE_END_ELEMENT
        pd = cd
        next
      end
      # Get and format attributes and edges.
      attributes = reader.attributes
      id = attributes.delete('id')
      new_attr = {}; edges = {}
      unless attributes.size == 0
        new_attr, edges =
        cleanup_attributes(reader.name, attributes)
      end
      # Create the appropriate entity for the
      # element.
      case reader.name
      when 'sentence'
        entity = Treat::Entities::Sentence.new('')
        root = entity
        @@id_table[id] = entity.id
        @@edges_table[entity.id] = edges
        entity.features = new_attr
      when 'cons'
        entity = entity <<
        Treat::Entities::Phrase.new('')
        @@id_table[id] = entity.id
        @@edges_table[entity.id] = edges
        entity.features = new_attr
      when 'tok'
        # Remember everything about the token until
        # its text node is read.
        tmp_id = id
        tmp_attributes = new_attr
        tmp_edges = edges
      else
        # Any other node is handled as text; a node
        # without a value is treated as empty.
        current_value = reader.value.to_s.gsub(/\s+/, "")
        unless current_value.size == 0
          entity = entity <<
          Treat::Entities::Token.from_string(current_value)
          if entity.is_a?(Treat::Entities::Word)
            entity.features = tmp_attributes
            # Register the word under the id of its
            # <tok> element; the id read for the text
            # node itself is always nil.
            @@id_table[tmp_id] = entity.id
            @@edges_table[entity.id] = tmp_edges
          else
            # Do something useful here
            entity.set :tag, 'SYM'
          end
        end
      end
      pd = cd
    end
    # Return the root rather than the traversal cursor:
    # the cursor is only back on the root if the last
    # node read happened to sit directly beneath it.
    root
  end
  
  # Prepare a text for Enju, which wants a sentence
  # to be terminated by a period.
  #
  # Returns a pair: the text to send to Enju, and a
  # flag telling whether a period was appended that
  # the caller should remove again after parsing.
  def self.valid_text(val)
    # No period anywhere: append one and flag it.
    return [val + '.', true] unless val.include?('.')

    # Otherwise drop every period and terminate the
    # text with a single one, unless it already ends
    # with an exclamation or question mark.
    stripped = val.delete('.')
    stripped += '.' unless stripped.end_with?('!', '?')
    [stripped, false]
  end
  
  # Turn the head and sem_head features of every phrase
  # into links to the entities they designate.
  #
  # For each of the two features a phrase has, the
  # Enju id stored in the feature is translated through
  # @@id_table, a link of the same name is created, and
  # the feature is then removed from the phrase.
  def self.link_heads(entity)
    entity.each_phrase do |phrase|
      ['head', 'sem_head'].each do |label|
        feature = label.to_sym
        next unless phrase.has?(feature)
        target = @@id_table[phrase.send(feature)]
        phrase.link(target, label, true, -1)
        phrase.unset(feature)
      end
    end
  end
  
  # Add edges a posteriori to a parsed entity.
  #
  # Walks @@edges_table once. For every entry whose
  # entity can be found under entity2's root, a link
  # is created to each known target, using the edge
  # type as the name of the link.
  #
  # The table is deliberately not re-walked for every
  # word and phrase of entity2: each entry already
  # names its own source entity, so doing so would
  # only create the same links over and over again.
  def self.add_edges(entity2)

    root = entity2.root

    @@edges_table.each_pair do |id, edges|
      next if edges.nil?
      entity = root.find(id)
      next if entity.nil?
      edges.each_pair do |argument, type|
        # Skip this argument if we
        # don't know the target node.
        next if argument == 'unk'
        target = @@id_table[argument]
        next if target.nil?
        # An edge without a type can't be named.
        next if type.nil?
        entity.link(target, type.intern)
      end
    end

  end
  
  # Helper function to convert Enju attributes to
  # Treat attributes.
  #
  # name       - the name of the XML element the
  #              attributes belong to ('tok' gets
  #              special treatment for 'cat').
  # attributes - the element's attributes; the 'pred'
  #              entry is removed from this hash.
  #
  # Returns a pair: the features to set on the entity,
  # and a hash of edges mapping the value of each
  # arg1/arg2 attribute to the value of 'pred'.
  def self.cleanup_attributes(name, attributes)

    features = {}
    links = {}
    predicate = attributes.delete('pred')

    attributes.each_pair do |raw_key, value|

      key = raw_key.strip

      case key
      when 'arg1', 'arg2'
        # Arguments become edges typed by the predicate.
        links[value] = predicate
      when 'cat'
        features[:cat] = value
        if name == 'tok'
          # A trailing P or X on a token's cat (other
          # than the cat 'PN') is recorded as the
          # :saturated flag and stripped before the
          # category lookup.
          suffix = value[-1]
          if value.length > 1 && value != 'PN' &&
            ['P', 'X'].include?(suffix)
            features[:saturated] = (suffix == 'P')
            value = value[0..-2]
          end
          features[:category] = Ectc[value]
        else
          # Other elements get a tag looked up from
          # their cat/xcat pair, 'FW' if there is none.
          xcat = attributes['xcat']
          matches = Ecxtp.select do |m|
            m[0] == value && m[1] == xcat
          end
          features[:tag] =
          (matches.size == 0) ? 'FW' : matches[0][2]
        end
      else
        features[key.to_sym] = value
      end

    end

    # Handle naming conventions.
    if attributes.has_key?('pos')
      features[:tag] = features.delete(:pos)
      features[:tag_set] = :penn
    end

    if attributes.has_key?('base')
      features[:lemma] = features.delete(:base)
    end

    [features, links]

  end
  
end