david-mccullars/text_rank

View on GitHub
lib/text_rank/token_filter/part_of_speech.rb

Summary

Maintainability
A
0 mins
Test Coverage
require 'engtagger'

module TextRank
  module TokenFilter
    ##
    # Token filter to keep only a selected set of parts of speech
    #
    # Tokens are tagged one at a time, each tag being assigned in the context
    # of the tag given to the token before it. Every call to +filter!+ starts
    # from the same initial context, so the result of one call never depends
    # on the tokens passed to an earlier call on the same instance.
    #
    # = Example
    #
    #   PartOfSpeech.new(parts_to_keep: %w[nn nns]).filter!(%w[
    #     all men are by nature free
    #   ])
    #   => ["men", "nature"]
    ##
    class PartOfSpeech

      # Tag used as the "previous tag" for the first token of each filter run
      INITIAL_POS_TAG = 'pp'

      # Tag assumed for a token when the tagger fails or returns nothing
      FALLBACK_POS_TAG = 'nn'

      private_constant :INITIAL_POS_TAG, :FALLBACK_POS_TAG

      # @param parts_to_keep [Enumerable<String, Symbol>] list of engtagger parts
      #   of speech to keep; entries are compared as strings, so symbols are
      #   accepted as well
      # @see https://github.com/yohasebe/engtagger#tag-set
      def initialize(parts_to_keep: %w[nn nnp nnps nns jj jjr jjs vb vbd vbg vbn vbp vbz], **_)
        # The tagger output is handled as a String (see #pos_tag), so normalize
        # the wanted tags to strings for the membership test.
        @parts_to_keep = Set.new(parts_to_keep, &:to_s)
        @eng_tagger = EngTagger.new
        @last_pos_tag = INITIAL_POS_TAG
      end

      # Perform the filter, removing (in place) every token whose part of
      # speech is not in the set given to the constructor.
      #
      # @param tokens [Array<String>] tokens to filter; modified in place
      # @return [Array<String>] the same array, holding only the kept tokens
      def filter!(tokens)
        # Forget the context left behind by any previous call; otherwise the
        # tag of the first token here would depend on unrelated earlier input.
        @last_pos_tag = INITIAL_POS_TAG
        tokens.keep_if { |token| @parts_to_keep.include?(pos_tag(token)) }
      end

      private

      # Tag a single token using the previously assigned tag as context, and
      # remember the result as the context for the next token.
      #
      # @param token [String]
      # @return [String] the assigned tag (FALLBACK_POS_TAG if none was produced)
      def pos_tag(token)
        tag =
          begin
            @eng_tagger.assign_tag(@last_pos_tag, token)
          rescue StandardError
            # Best effort: a token the tagger cannot handle gets the fallback tag
            # instead of aborting the whole filter run.
            nil
          end
        tag = FALLBACK_POS_TAG if tag.nil? || tag == ''
        @last_pos_tag = tag
      end

    end
  end
end