project/app/utils/clustering/clustering_mvp.py from Lambda-School-Labs/Labs26-StorySquad-DS-TeamB

project/app/utils/clustering/clustering_mvp.py
Summary

Maintainability

3 hrs
Test Coverage

Issues
import json

import pandas as pd


def cluster(cohort_submissions: dict) -> list:
    """
    Splits given dict into clusters of 4 based on their ranked complexity

    The 'remainder problem' of needing to have 4 submissions per cluster,
    regardless of number of submissions, is solved here by duplicating
    submission IDs across clusters to supplement any clusters with less than 4,
    with no clusters containing more than 1 submission_ID that also
    appears in another cluster, unless there are fewer than 3 total
    submissions, or exactly 5 submissions, in the given cohort.

    Input: dictionary of a single cohort containing nested dictionary
    with 'submission_id' as first level key,
    and 'complexity' as one of the inner keys
    Output: Nested list of clusters:
    [[list of submission_ids], [list of submission_ids]]
    """

    # Generate DataFrame from dict
    df = pd.DataFrame.from_dict(cohort_submissions, orient="index")

    # Rank by complexity
    df = df.sort_values(by=["Complexity"], ascending=False)

    # Initial variables
    num_submissions = len(df)
    num_clusters = num_submissions // 4
    remainder = num_submissions % 4
    clusters = []

    # Edge Cases:
    # - less than 4, they are all in one group
    if num_submissions < 4:
        return "Not enough submissions in this cohort to form a full cluster"

    # If the remainder is 3 -> last group will be a group of 3 users
    elif remainder == 3:
        # Cluster submissions until 7 remain
        for i in range(num_clusters - 1):
            # Group by top 4 squad scores
            clusters.append(list(df.index[:4]))
            # Drop stories you have grouped already
            df = df[4:]

        # Manually cluster final two groups to handle remainder problem
        # by duplicating 1 submission so each have 4 submission_ids
        clusters.append(list(df.index[:4]))
        clusters.append(list(df.index[3:]))

    # If the remainder is 2 -> last 2 groups will be groups of 3
    elif remainder == 2:
        if num_submissions == 6:
            clusters.append(list(df.index[:4]))
            clusters.append(list(df.index[2:]))
        else:
            # Cluster submissions until 10 remain
            for i in range(num_clusters - 2):
                # Group by top 4 squad scores
                clusters.append(list(df.index[:4]))
                # Drop stories you have grouped already
                df = df[4:]

            # Manually cluster final three groups to handle remainder problem
            clusters.append(list(df.index[:4]))
            clusters.append(list(df.index[3:7]))
            clusters.append(list(df.index[6:]))

    # If the remainder is 1 -> last 3 groups will be groups of 3
    elif remainder == 1:
        if num_submissions == 5:
            # Overlap both clusters by 3 submissions
            clusters.append(list(df.index[:4]))
            clusters.append(list(df.index[1:]))
        elif num_submissions == 9:
            clusters.append(list(df.index[:4]))
            clusters.append(list(df.index[2:6]))
            clusters.append(list(df.index[5:]))
        else:
            # Cluster submissions until 13 remain
            for i in range(num_clusters - 3):
                # Group by top 4 squad scores
                clusters.append(list(df.index[:4]))
                # Drop stories you have already grouped
                df = df[4:]

            # Manually cluster final three overlapping groups to handle
            # remainder problem
            clusters.append(list(df.index[:4]))
            clusters.append(list(df.index[3:7]))
            clusters.append(list(df.index[6:10]))
            clusters.append(list(df.index[9:]))

    # Else, the remainder is 0. Split evenly by 4
    else:
        for i in range(num_clusters):
            # Group by top 4 squad scores
            clusters.append(list(df.index[:4]))
            # Drop stories you have already grouped
            df = df[4:]

    return clusters


async def batch_cluster(submissions: dict) -> json:
    """
    Generates a return JSON object of clusters for all provided cohorts.

    Input: dictionary of all cohort submissions
    Output: JSON object of nested lists of submission IDs by cluster, by cohort

    To test locally in isolation as an async function, run the following code:
    import asyncio
    asyncio.run(batch_cluster(submissions_json))
    """

    # Initiate cluster dictionary
    cluster_dict = {}

    # Iterate through cohorts to get clusters, and
    # add each to cluster_dict
    for cohort_id in submissions:
        clusters = cluster(submissions[cohort_id])
        cluster_dict[cohort_id] = clusters

    # Convert dict back to JSON
    cluster_json = json.dumps(cluster_dict)

    return cluster_json