nullnull/simstring

View on GitHub
simstring/feature_extractor/word_ngram.py

Summary

Maintainability
A
0 mins
Test Coverage
from .base import BaseFeatureExtractor

# Sentinel token passed to the n-gram builder together with the split words.
# Spelled as an explicit escape for U+00A0 (NO-BREAK SPACE) so the value cannot
# be mistaken for, or silently normalized to, an ordinary space (U+0020).
SENTINAL_CHAR = "\u00a0"  # non breaking space

class WordNgramFeatureExtractor(BaseFeatureExtractor):
    """Feature extractor that works on word n-grams instead of characters.

    The input text is cut into words with ``str.split`` and the resulting
    word list is handed to the inherited ``_words_ngram`` helper together
    with the n-gram size and the module-level ``SENTINAL_CHAR``.
    """

    def __init__(self, n=2, splitter=" "):
        """Remember the n-gram size ``n`` and the word delimiter ``splitter``."""
        self.splitter = splitter
        self.n = n

    def features(self, text):
        """Return the word n-gram features of ``text``.

        Tokenization is a plain ``text.split(self.splitter)``. With the
        default single-space delimiter, consecutive spaces produce empty
        strings in the word list. For smarter tokenization (for example with
        NLTK), implement your own extractor instead.
        """
        tokens = text.split(self.splitter)
        return self._words_ngram(tokens, self.n, SENTINAL_CHAR)