DeveloperCAP/MLCAT

View on GitHub
lib/analysis/thread/message_activity.py

Summary

Maintainability
D
1 day
Test Coverage
import plotly as ply
import plotly.graph_objs as plygo
from lib.util.read import *
import json


def generate_weekly_message_activity_heatmap(json_data, filename):
    """
    Generate weekly message activity heatmap from JSON data.

    :param json_data: Data for generating the heatmap.
    :param filename: Heatmap file name.
    """
    heatmap_data = [[0 for x in range(48)] for x in range(7)]
    for msg_uid, json_obj in json_data.items():
        sent_day = json_obj['Time'].weekday()
        bin_number = 2*json_obj['Time'].hour if json_obj['Time'].minute < 30 else 2*json_obj['Time'].hour+1
        heatmap_data[sent_day][bin_number] += 1
    heatmap = [plygo.Heatmap(
        z=heatmap_data,
        x=list(range(48)),
        y=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])]
    ply.offline.plot(heatmap, filename=filename)


def generate_monthly_message_activity_heatmap(json_data, filename):
    """
    Generate monthly message activity heatmap from JSON data.

    :param json_data: Data for generating the heatmap.
    :param filename: Heatmap file name.
    """
    heatmap_data = [[0 for x in range(48)] for x in range(12)]
    for msg_uid, json_obj in json_data.items():
        sent_month = json_obj['Time'].month-1
        bin_number = 2*json_obj['Time'].hour if json_obj['Time'].minute < 30 else 2*json_obj['Time'].hour+1
        heatmap_data[sent_month][bin_number] += 1
    heatmap = [plygo.Heatmap(
        z=heatmap_data,
        x=list(range(48)),
        y=list(range(1,12)))]
    ply.offline.plot(heatmap, filename=filename)


def generate_daily_message_activity_timeline(json_data, filename):
    """
    Generate daily message activity timeline from JSON data.

    :param json_data: Data for generating the timeline.
    :param filename: Timeline file name.
    """
    timeline_data = [0 for x in range(48)]
    for msg_uid, json_obj in json_data.items():
        bin_number = 2*json_obj['Time'].hour if json_obj['Time'].minute < 30 else 2*json_obj['Time'].hour+1
        timeline_data[bin_number] += 1
    trace = plygo.Scatter(
        x=list(range(48)),
        y=timeline_data
    )
    timeline = [trace]
    ply.offline.plot(timeline, filename=filename)


def generate_weekly_message_activity_timeline(json_data, filename):
    """
    Generate weekly message activity timeline from JSON data.

    :param json_data: Data for generating the timeline.
    :param filename: Timeline file name.
    """
    timeline_data = [0 for x in range(7*24)]
    for msg_uid, json_obj in json_data.items():
        bin_number = json_obj['Time'].hour + 24 * json_obj['Time'].weekday()
        timeline_data[bin_number] += 1
    trace = plygo.Scatter(
        x=list(range(7*24)),
        y=timeline_data
    )
    timeline = [trace]
    ply.offline.plot(timeline, filename=filename)


def generate_monthly_message_activity_timeline(json_data, filename):
    """
    Generate monthly message activity timeline from JSON data.

    :param json_data: Data for generating the timeline.
    :param filename: Timeline file name.
    """    
    timeline_data = [0 for x in range(31*24)]
    for msg_uid, json_obj in json_data.items():
        bin_number = json_obj['Time'].hour + 24 * (json_obj['Time'].day - 1)
        timeline_data[bin_number] += 1
    trace = plygo.Scatter(
        x=list(range(31*24)),
        y=timeline_data
    )
    timeline = [trace]
    ply.offline.plot(timeline, filename=filename)


def generate_yearly_message_activity_timeline(json_data, filename):
    """
    Generate yearly message activity timeline from JSON data.

    :param json_data: Data for generating the timeline.
    :param filename: Timeline file name.
    """
    timeline_data = [0 for x in range(366*24)]
    for msg_uid, json_obj in json_data.items():
        bin_number = json_obj['Time'].hour + 24 * (json_obj['Time'].timetuple().tm_yday - 1)
        timeline_data[bin_number] += 1
    trace = plygo.Scatter(
        x=list(range(365*24)),
        y=timeline_data
    )
    timeline = [trace]
    ply.offline.plot(timeline, filename=filename)


def generate_message_activity_heatmaps(clean_headers_filename, foldername, timeline=True):
    """
    Extract header information and call functions to generate various timelines or heatmaps.

    :param clean_headers_filename: The JSON file containing cleaned headers.
    :param foldername: The MBOX folder.
    :param timeline: True for generating timelines,False for heatmaps.
    """
    # Time limit can be specified here in the form of a timestamp in one of the identifiable formats. All messages
    # that have arrived after time_ubound and before time_lbound will be ignored.
    time_ubound = None
    time_lbound = None

    # If ignore_lat is true, then messages that belong to threads that have only a single author are ignored.
    ignore_lat = False

    author_graph = nx.DiGraph()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    json_data = dict()

    if time_ubound is None:
        time_ubound = time.strftime("%a, %d %b %Y %H:%M:%S %z")
    time_ubound = get_datetime_object(time_ubound)

    if time_lbound is None:
        time_lbound = "Sun, 01 Jan 2001 00:00:00 +0000"
    time_lbound = get_datetime_object(time_lbound)

    print("All messages before", time_ubound, "and after", time_lbound,  "are being considered.")

    if not ignore_lat:
        with open(clean_headers_filename, 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                json_obj['Message-ID'] = int(json_obj['Message-ID'])
                json_obj['Time'] = datetime.datetime.strptime(json_obj['Time'], "%a, %d %b %Y %H:%M:%S %z")
                if time_lbound <= json_obj['Time'] < time_ubound:
                    # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                    from_addr = email_re.search(json_obj['From'])
                    json_obj['From'] = from_addr.group(0) if from_addr is not None else json_obj['From']
                    json_obj['To'] = set(email_re.findall(json_obj['To']))
                    json_obj['Cc'] = set(email_re.findall(json_obj['Cc'])) if json_obj['Cc'] is not None else None
                    # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                    json_data[json_obj['Message-ID']] = json_obj
    else:
        lone_author_threads = get_lone_author_threads(save_file=None, nodelist_filename=foldername+"/tables/graph_nodes.csv", edgelist_filename=foldername+"/tables/graph_edges.csv")
        with open(clean_headers_filename, 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                json_obj['Message-ID'] = int(json_obj['Message-ID'])
                if json_obj['Message-ID'] not in lone_author_threads:
                    json_obj['Time'] = datetime.datetime.strptime(json_obj['Time'], "%a, %d %b %Y %H:%M:%S %z")
                    if time_lbound <= json_obj['Time'] < time_ubound:
                        # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                        from_addr = email_re.search(json_obj['From'])
                        json_obj['From'] = from_addr.group(0) if from_addr is not None else json_obj['From']
                        json_obj['To'] = set(email_re.findall(json_obj['To']))
                        json_obj['Cc'] = set(email_re.findall(json_obj['Cc'])) if json_obj['Cc'] is not None else None
                        # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                        json_data[json_obj['Message-ID']] = json_obj
    print("JSON data loaded.")

    if not timeline:
        generate_weekly_message_activity_heatmap(json_data, foldername+'/heatmaps/weekly-message-activity-heatmap.html')
        generate_monthly_message_activity_heatmap(json_data, foldername+'/heatmaps/monthly-message-activity-heatmap.html')
    else:
        generate_daily_message_activity_timeline(json_data, foldername + '/plots/daily-message-activity-timeline.html')
        generate_weekly_message_activity_timeline(json_data, foldername + '/plots/weekly-message-activity-timeline.html')
        generate_monthly_message_activity_timeline(json_data, foldername + '/plots/monthly-message-activity-timeline.html')
        generate_yearly_message_activity_timeline(json_data, foldername + '/plots/yearly-message-activity-timeline.html')