DeveloperCAP/MLCAT

View on GitHub
lib/analysis/author/edge_list.py

Summary

Maintainability
C
1 day
Test Coverage
"""
This module is used to generate graphs that show the interaction between authors in the mailing list. There is an edge
from one author to another if the former sent a message to the latter, either in To or by marking in CC. These graphs
are for the entire mailing list.
"""
import json
from lib.util.read import *


def generate_edge_list(author_nodes, author_edges, graph_nodes,
                       graph_edges, threads_json, author_json, ignore_lat=True):
    """
    Build the directed author interaction graph for the mailing list and write it to disk.

    For every message that is considered, an edge is added from the UID of the sender to the UID of each
    address found in its To and Cc headers. The resulting edge list is written tab-delimited to author_edges
    and every (UID, address) pair of the author UID map is written tab-separated to author_nodes.

    :param author_nodes: Path of the file the author nodes (UID and address per line) are written to.
    :param author_edges: Path of the file the author edge list is written to.
    :param graph_nodes: The csv file containing the nodes; passed to get_lone_author_threads when ignore_lat is true.
    :param graph_edges: The csv file containing the edges; passed to get_lone_author_threads when ignore_lat is true.
    :param threads_json: The JSON file containing the cleaned headers. It is read in chunks of 9 lines, each chunk
                         being parsed as one JSON object.
    :param author_json: The JSON file containing the author UID map (address -> UID).
    :param ignore_lat: If true, then messages that belong to threads that have only a single author are ignored.
    :raises KeyError: If a sender or recipient address is not present in the author UID map.
    """
    # Header timestamps and the default time limit share this format.
    time_format = "%a, %d %b %Y %H:%M:%S %z"
    # Time limit can be specified here in the form of a timestamp in one of the identifiable formats and all messages
    # that have arrived after this timestamp will be ignored.
    time_limit = None
    author_graph = nx.DiGraph()
    with open(author_json, 'r') as author_uid_file:
        author_uid_map = json.load(author_uid_file)
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    json_data = dict()
    if time_limit is None:
        # No explicit limit: fall back to the current time so that every message already received qualifies.
        time_limit = time.strftime(time_format)
    time_limit = get_datetime_object(time_limit)
    print("All messages before", time_limit, "are being considered.")

    # Message IDs to skip. When lone author threads are not ignored nothing is excluded, so both modes can share
    # the single loading loop below.
    if ignore_lat:
        lone_author_threads = get_lone_author_threads(None, graph_nodes, graph_edges)
    else:
        lone_author_threads = ()

    with open(threads_json, 'r') as json_file:
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)
            json_obj['Message-ID'] = int(json_obj['Message-ID'])
            if json_obj['Message-ID'] in lone_author_threads:
                continue
            json_obj['Time'] = datetime.datetime.strptime(json_obj['Time'], time_format)
            if json_obj['Time'] < time_limit:
                # Reduce each header to bare e-mail addresses; keep the raw From value if no address is found.
                from_addr = email_re.search(json_obj['From'])
                json_obj['From'] = from_addr.group(0) if from_addr is not None else json_obj['From']
                # A missing To header is treated as "no direct recipients" instead of crashing the regex.
                json_obj['To'] = set(email_re.findall(json_obj['To'])) if json_obj['To'] is not None else set()
                json_obj['Cc'] = set(email_re.findall(json_obj['Cc'])) if json_obj['Cc'] is not None else None
                json_data[json_obj['Message-ID']] = json_obj
    print("JSON data loaded.")

    for message in json_data.values():
        if message['Cc'] is None:
            addr_list = message['To']
        else:
            addr_list = message['To'] | message['Cc']
        for to_address in addr_list:
            # The sender lookup stays inside the loop so that a message without recipients never touches the map.
            author_graph.add_edge(author_uid_map[message['From']], author_uid_map[to_address])

    nx.write_edgelist(author_graph, author_edges, delimiter="\t")

    with open(author_nodes, 'w') as nodelist_file:
        for author_address, author_uid in author_uid_map.items():
            nodelist_file.write(str(author_uid) + "\t" + author_address + "\n")

    # print("No. of Weakly Connected Components:", nx.number_weakly_connected_components(author_graph))
    # print("No. of Strongly Connected Components:", nx.number_strongly_connected_components(author_graph))
    # print("Nodes:", nx.number_of_nodes(author_graph))
    # print("Edges:", nx.number_of_edges(author_graph))

# generate_edge_list(author_nodes='./data/lkml/tables/author_graph_nodes.txt', author_edges='./data/lkml/tables/author_graph_edges.txt',
#                    graph_nodes='./data/lkml/tables/graph_nodes.csv', graph_edges='./data/lkml/tables/graph_edges.csv',
#                    threads_json="./data/lkml/json/clean_data.json", author_json='./data/lkml/json/author_uid_map.json')