doches/rwordnet

View on GitHub
lib/rwordnet/lemma.rb

Summary

Maintainability
A
0 mins
Test Coverage
module WordNet
  # Represents a single word in the WordNet lexicon, which can be used to look up a set of synsets.
  class Lemma
    SPACE = ' '
    POS_SHORTHAND = {:v => :verb, :n => :noun, :a => :adj, :r => :adv}

    # The word this lemma represents
    attr_accessor :word

    # The part of speech (noun, verb, adjective) of this lemma. One of 'n', 'v', 'a' (adjective), or 'r' (adverb)
    attr_accessor :pos

    # The number of times the sense is tagged in various semantic concordance texts. A tagsense_count of 0 indicates that the sense has not been semantically tagged.
    attr_accessor :tagsense_count

    # The offset, in bytes, at which the synsets contained in this lemma are stored in WordNet's internal database.
    attr_accessor :synset_offsets

    # A unique integer id that references this lemma. Used internally within WordNet's database.
    attr_accessor :id

    # An array of valid pointer symbols for this lemma. The list of all valid
    # pointer symbols is defined in pointers.rb.
    attr_accessor :pointer_symbols

    # Create a lemma from a line in an lexicon file. You should not be creating Lemmas by hand; instead,
    # use the WordNet::Lemma.find and WordNet::Lemma.find_all methods to find the Lemma for a word.
    def initialize(lexicon_line, id)
      @id = id
      line = lexicon_line.split(" ")

      @word = line.shift
      @pos = line.shift
      synset_count = line.shift.to_i
      @pointer_symbols = line.slice!(0, line.shift.to_i)
      line.shift # Throw away redundant sense_cnt
      @tagsense_count = line.shift.to_i
      @synset_offsets = line.slice!(0, synset_count).map(&:to_i)
    end

    # Return a list of synsets for this Lemma. Each synset represents a different sense, or meaning, of the word.
    def synsets
      @synset_offsets.map { |offset| Synset.new(@pos, offset) }
    end

    # Returns a compact string representation of this lemma, e.g. "fall, v" for
    # the verb form of the word "fall".
    def to_s
      [@word, @pos].join(",")
    end

    class << self
      @@cache = {}

      # Find all lemmas for this word across all known parts of speech
      def find_all(word)
        [:noun, :verb, :adj, :adv].flat_map do |pos|
          find(word, pos) || []
        end
      end

      # Find a lemma for a given word and pos. Valid parts of speech are:
      # 'adj', 'adv', 'noun', 'verb'. Additionally, you can use the shorthand
      # forms of each of these ('a', 'r', 'n', 'v')/
      def find(word, pos)
        # Map shorthand POS to full forms
        pos = POS_SHORTHAND[pos] || pos

        cache = @@cache[pos] ||= build_cache(pos)
        if found = cache[word]
          Lemma.new(*found)
        end
      end

      private

      def build_cache(pos)
        cache = {}
        DB.open(File.join("dict", "index.#{pos}")).each_line.each_with_index do |line, index|
          word = line.slice(0, line.index(SPACE))
          cache[word] = [line, index+1]
        end
        cache
      end
    end
  end
end