project/app/utils/complexity/squad_score.py from Lambda-School-Labs/Labs26-StorySquad-DS-TeamB

project/app/utils/complexity/squad_score.py
Summary

Maintainability

1 hr
Test Coverage

Issues
from os import path

import joblib
import nltk
import pandas as pd

# Location of the pickled scaler file, resolved relative to this module's
# directory so it does not depend on the current working directory.
# The empty "" component adds nothing to the joined path.
PICKLE_PATH = path.join(path.dirname(__file__), "", "MinMaxScaler.pkl")
# Loaded eagerly, once, when this module is imported.
# NOTE(review): joblib.load unpickles the file, so it must only ever point
# at a trusted artifact shipped with the project.
scaler = joblib.load(PICKLE_PATH)


def metrics(document: str) -> pd.DataFrame:
    """
    Cleans and generates metrics for a single story transcription.

    Called within squad_score async function, so only need be run separately
    if metrics without a complexity metric are required.

    Cleaning steps, in order:
    - strip surrounding whitespace, then any leading/trailing digits,
      slashes and hyphens
    - make sure every period and comma is followed by a space
    - collapse runs of whitespace into single spaces

    Input: story transcription as a string
    Output: a single-row dataframe whose first column, "transcription",
    holds the cleaned text, followed by these metric columns:

    - story_length: length of the cleaned story (in characters)
    - avg_word_len: story_length divided by the number of
      whitespace-separated words (spaces and punctuation therefore count
      toward the length); 0.0 when the cleaned story has no words
    - quotes_num: number of double-quote characters
    - unique_words_num: number of unique words longer than 2 chars
    - adj_num: number of tokens nltk tags as "JJ"
    """

    # Strip surrounding whitespace, then leading/trailing digits, "/" and "-"
    cleaned = document.strip().strip("/-0123456789")

    # Ensure all commas and periods are followed by a space
    cleaned = cleaned.replace(".", ". ").replace(",", ", ")

    # Remove any instances of multiple spaces (also trims the ends again)
    cleaned = " ".join(cleaned.split())

    # Generate single row dataframe from transcription
    df = pd.DataFrame([cleaned], columns=["transcription"])

    # Length of story
    df["story_length"] = df["transcription"].str.len()

    # Average word length
    # An empty transcription has zero words; 0 / 0 would yield NaN and
    # poison every downstream calculation, so report 0.0 in that case.
    word_count = df["transcription"].str.split().str.len()
    df["avg_word_len"] = (df["story_length"] / word_count).where(
        word_count > 0, 0.0
    )

    # Number of quotation marks
    df["quotes_num"] = df["transcription"].str.count('"')

    # Number of unique words, over 2 characters
    def over_two_chars(cleaned_transcription):
        """Returns number of unique words longer than 2 chars."""
        return len(
            {word for word in cleaned_transcription.split() if len(word) > 2}
        )

    df["unique_words_num"] = df["transcription"].apply(over_two_chars)

    # Number of adjectives
    def num_adj(transcription):
        """Returns number of tokens tagged "JJ" (adjective) by nltk."""
        tokens = nltk.word_tokenize(transcription)
        # pos_tag yields (token, tag) pairs; the count starts from zero so
        # a text without adjectives reports 0.
        return sum(1 for _, tag in nltk.pos_tag(tokens) if tag == "JJ")

    df["adj_num"] = df["transcription"].apply(num_adj)

    return df


async def squad_score(document: str, scaler):
    """
    Computes the Squad Score complexity metric for one transcription.

    The transcription is passed through the metrics function; the resulting
    metric values are scaled with the supplied scaler, multiplied by a
    per-metric weight and a common range factor, and summed.

    The scaler is expected to be the pickled MinMaxScaler fitted on the
    training data (see the squad_score_mvp notebook).

    Every weight is 1 in version 0.1; they are kept separate so later
    iterations can tune them after further analysis.

    Input: story transcription as a string, and the pickled scaler
    Output: the summed score, or 0 when the sum comes out negative

    This is an async function. To call it locally as a stand-alone
    function, use:
    asyncio.run(squad_score(document, scaler))
    """

    # Factor applied to every scaled metric to reach the desired output
    # range (~1-100)
    range_scaler = 30

    # Per-metric weights, listed in the same order as the scaled values
    weights = {
        "story_length": 1,
        "avg_word_len": 1,
        "quotes_number": 1,
        "unique_words": 1,
        "adj_num": 1,
    }

    # The metrics dataframe has a single row; take it as a Series
    first_row = metrics(document).iloc[0]

    # Drop the leading transcription column and scale the remaining metrics
    metric_values = first_row[1:]
    scaled = scaler.transform([metric_values])[0]

    # Weighted contribution of each metric, matched to weights by position
    parts = [
        weights[name] * scaled[position] * range_scaler
        for position, name in enumerate(weights)
    ]

    # Add the contributions left to right
    total = parts[0]
    for part in parts[1:]:
        total = total + part

    # A negative total is only expected for 1-2 sentence-long stories or
    # test sentences; report those as 0
    return 0 if total < 0 else total