david-mccullars/text_rank

View on GitHub
lib/text_rank/graph_strategy/coocurrence.rb

Summary

Maintainability
A
0 mins
Test Coverage
module TextRank
  module GraphStrategy
    ##
    # The original TextRank algorithm uses co-occurrence in determining how to
    # construct a graph of eligible token keywords and relate them together.  Given a
    # window size of N any other token at most N positions away from a token is
    # considered co-ocurrent, and an edge will be drawn between them.
    #
    # This implementation makes a slight change from the original algorithm by
    # choosing a weight of 1/distance_from_token as the edge weight.
    #
    # = Example
    #   Coocurrence.new(ngram_size: 4).build_graph(%w[what a pessimist you are exclaimed candide], graph)
    #   # graph.add("what", "a", 1.0)
    #   # graph.add("what", "pessimist", 0.5)
    #   # graph.add("what", "you", 0.3333333333333333)
    #   # graph.add("what", "are", 0.25)
    #   # graph.add("a", "what", 1.0)
    #   # graph.add("a", "pessimist", 1.0)
    #   # graph.add("a", "you", 0.5)
    #   # graph.add("a", "are", 0.3333333333333333)
    #   # graph.add("a", "exclaimed", 0.25)
    #   # graph.add("pessimist", "what", 0.5)
    #   # graph.add("pessimist", "a", 1.0)
    #   # graph.add("pessimist", "you", 1.0)
    #   # graph.add("pessimist", "are", 0.5)
    #   # graph.add("pessimist", "exclaimed", 0.3333333333333333)
    #   # graph.add("pessimist", "candide", 0.25)
    #   # graph.add("you", "what", 0.3333333333333333)
    #   # graph.add("you", "a", 0.5)
    #   # graph.add("you", "pessimist", 1.0)
    #   # graph.add("you", "are", 1.0)
    #   # graph.add("you", "exclaimed", 0.5)
    #   # graph.add("you", "candide", 0.3333333333333333)
    #   # graph.add("are", "what", 0.25)
    #   # graph.add("are", "a", 0.3333333333333333)
    #   # graph.add("are", "pessimist", 0.5)
    #   # graph.add("are", "you", 1.0)
    #   # graph.add("are", "exclaimed", 1.0)
    #   # graph.add("are", "candide", 0.5)
    #   # graph.add("exclaimed", "a", 0.25)
    #   # graph.add("exclaimed", "pessimist", 0.3333333333333333)
    #   # graph.add("exclaimed", "you", 0.5)
    #   # graph.add("exclaimed", "are", 1.0)
    #   # graph.add("exclaimed", "candide", 1.0)
    #   # graph.add("candide", "pessimist", 0.25)
    #   # graph.add("candide", "you", 0.3333333333333333)
    #   # graph.add("candide", "are", 0.5)
    #   # graph.add("candide", "exclaimed", 1.0)
    ##
    class Coocurrence

      # @param ngram_size [Fixnum] Window size around a token considered co-occurrence
      def initialize(ngram_size: 3, **_)
        @ngram_size = ngram_size
      end

      # Build a graph for which the PageRank algorithm will be applied
      # @param tokens [Array<String>] filtered tokens from which to build a graph
      # @param graph [PageRank::Base] a PageRank graph into which to add nodes/edges
      # return [nil]
      def build_graph(tokens, graph)
        ngram_window = (@ngram_size * 2) + 1
        tokens.size.times do |i|
          ngram_window.times do |j|
            consider_ngram_window(tokens, graph, i, j)
          end
        end
        nil
      end

      private

      def consider_ngram_window(tokens, graph, idx_i, idx_j)
        return if idx_j == @ngram_size || idx_i + idx_j < @ngram_size

        token_i = tokens[idx_i]
        token_j = tokens[idx_i - @ngram_size + idx_j]

        if token_j
          graph.add(token_i, token_j, weight: 1.0 / (idx_j - @ngram_size).abs)
        end
      end

    end
  end
end