# allay-ds-api/fastapi_app/text_preprocessing.py
"""Process incoming text into a format usable by the ML models.
Contains functions for cleaning the text, creating lemmas, creating the
word2idx sequences for incoming text.
Entrypoints should be the `preprocess_XXX()` methods, which customize preprocessing
for the type of model in use.
Preprocessing should use the same methods used in the train_XX_models.ipynb
notebooks in the <repo-root>/exploration folder.
"""
import os
import re
from pickle import load
from tensorflow.keras.preprocessing.sequence import pad_sequences
from .globals import NLP
# Load the pickled word2idx vocabulary built during model training.
# NOTE: pickle is only safe because this file ships with the app; never
# load a pickle from untrusted input.
dir_path = os.path.dirname(os.path.realpath(__file__))
# os.path.join instead of '/' concatenation for portability
with open(os.path.join(dir_path, 'pickles', 'word2idx.pkl'), 'rb') as f:
    word2idx = load(f)
# Preprocessing for lemmatization
# add / remove stop words, normalize, any text processing as necessary
# documents passed to the make_lemmas function should be processed with
# clean_str first:
# df['cleaned'] = df['tweet'].apply(clean_str)
# df['lemmas'] = make_lemmas(nlp, df['cleaned'])
def clean_str(string):
    """Tokenization/string cleaning for all datasets except for SST.

    Original taken from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    modified to accept '@' and '#' characters.

    :param string: raw input text
    :returns: lowercased string with punctuation padded into separate
        tokens and whitespace collapsed
    """
    # drop every character except letters, digits and a small punctuation set
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`@#]", " ", string)
    # split common English contractions into their own tokens
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    # pad punctuation with spaces so each mark becomes its own token
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # Raw strings below fix the invalid-escape SyntaxWarning of the original
    # plain-string replacements. Behavior is unchanged: re's replacement
    # template keeps an unknown escape like '\(' as the literal two
    # characters, so the emitted tokens ('\(', '\)', '\?') still match the
    # tokens the word2idx vocabulary was trained on.
    string = re.sub(r"\(", r" \( ", string)
    string = re.sub(r"\)", r" \) ", string)
    string = re.sub(r"\?", r" \? ", string)
    # collapse runs of whitespace into a single space
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
# Lemmatization variables
# Additional lemma strings to ignore on top of spaCy's stopword list.
# '-PRON-' is the lemma spaCy v2.x assigns to every pronoun.
STOP_WORDS = ['user', 'amp', '-PRON-']
# matches lemmas that are empty / entirely whitespace
is_empty_pattern = re.compile(r'^\s*$')
# matches lemmas that are entirely numeric (123, 1.23, 1/2, 1,234, 1st, 12th, 9am, etc)
is_numeric_pattern = re.compile(r'^[\d./,]+(th|st|am|pm)?$')
# Matches a single leftover symbol character (a digit, '&', '#', '\', 'u',
# 'd', or ';') — presumably residue of escaped entities / '\ud83d'-style
# surrogate text.
# NOTE(review): there is no '+' quantifier, so this only ever matches a
# one-character lemma. That may be an oversight, but the trained vocab was
# built with this exact pattern — confirm against the training notebook
# before changing it.
is_symbol_pattern = re.compile(r'^[\d&#\\ud;]$')
def make_lemmas(nlp, docs):
    """Lemmatize each document in `docs`, filtering out noise tokens.

    :param nlp: spaCy NLP model to use
    :param docs: list of documents to lemmatize
    :returns: list of lemmatized documents (one list of lemma strings per doc)
    """
    def _keep(token):
        # True when the token's lemma should survive filtering
        lemma = token.lemma_
        if token.is_stop or token.is_punct or token.pos_ == 'PRON':
            return False  # spaCy stopwords, punctuation, pronouns
        if len(lemma) <= 2 or lemma in STOP_WORDS:
            return False  # too short, or a custom stopword
        if lemma.startswith(('@', '#')):
            return False  # twitter handles and hash tags
        return not (
            is_empty_pattern.match(lemma)
            or is_numeric_pattern.match(lemma)
            or is_symbol_pattern.match(lemma)
        )

    return [
        [token.lemma_ for token in doc if _keep(token)]
        for doc in nlp.pipe(docs)
    ]
def to_sequence(index, text):
    """Map each known token in `text` to its integer id in `index`.

    Tokens missing from the vocabulary are silently skipped.

    :param index: word2idx vocab.
    :param text: list of tokens / lemmas / words to be indexed
    :returns: list of indicies
    """
    sequence = []
    for word in text:
        if word in index:
            sequence.append(index[word])
    return sequence
def preprocess_cnn(text: str):
    """Turn raw text into the padded index sequence the CNN model expects.

    Pipeline: clean -> lemmatize -> vocab-index -> pad.

    :param text: string of text to be rated for inappropriateness
    :returns: padded sequence of vocab indicies for use as input on the model
    """
    lemma_docs = make_lemmas(NLP, [clean_str(text)])
    indexed = to_sequence(word2idx, lemma_docs[0])
    # maxlen and value take n from exploration/train_nn_models.ipynb
    return pad_sequences([indexed], maxlen=62, value=5000)