lib/analysis/author/edge_list.py
"""
This module is used to generate graphs that show the interaction between authors in the mailing list. There is an edge
from one author to another if the former sent a message to the latter, either by addressing them in To or by marking
them in Cc. These graphs cover the entire mailing list.
"""
import json
from lib.util.read import *
def _load_messages(threads_json, email_re, time_limit, skipped_ids=None):
    """
    Read the cleaned headers file and return the messages that are to be considered, keyed by integer Message-ID.

    :param threads_json: The JSON file containing the cleaned headers.
    :param email_re: Compiled regular expression used to extract email addresses from the header fields.
    :param time_limit: Only messages whose 'Time' is strictly earlier than this datetime are kept.
    :param skipped_ids: Optional container of Message-IDs to drop before any further parsing; None keeps all.
    :return: A dict mapping Message-ID to the header dict, with 'From' reduced to a bare address where one is found,
             'To' converted to a set of addresses and 'Cc' converted to a set of addresses or left as None.
    """
    messages = dict()
    with open(threads_json, 'r') as json_file:
        # NOTE(review): presumably every JSON object spans 9 lines of the file - confirm against lines_per_n.
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)
            json_obj['Message-ID'] = int(json_obj['Message-ID'])
            # Skipped messages are dropped before their timestamp is parsed, so their 'Time' is never validated.
            if skipped_ids is not None and json_obj['Message-ID'] in skipped_ids:
                continue
            json_obj['Time'] = datetime.datetime.strptime(json_obj['Time'], "%a, %d %b %Y %H:%M:%S %z")
            if json_obj['Time'] >= time_limit:
                continue
            # Keep the raw 'From' header when no address can be extracted from it.
            from_addr = email_re.search(json_obj['From'])
            if from_addr is not None:
                json_obj['From'] = from_addr.group(0)
            # A missing 'To' header is treated as "no direct recipients" instead of crashing in findall().
            raw_to = json_obj['To']
            json_obj['To'] = set(email_re.findall(raw_to)) if raw_to is not None else set()
            raw_cc = json_obj['Cc']
            json_obj['Cc'] = set(email_re.findall(raw_cc)) if raw_cc is not None else None
            messages[json_obj['Message-ID']] = json_obj
    return messages


def generate_edge_list(author_nodes, author_edges, graph_nodes,
                       graph_edges, threads_json, author_json, ignore_lat=True, time_limit=None):
    """
    Build the directed author interaction graph and write its edge list and node list to disk. An edge is added from
    the sender of every considered message to each address found in its To and Cc headers.

    :param author_nodes: Path of the output file that receives one "<uid><TAB><address>" line per known author.
    :param author_edges: Path of the output file that receives the tab-delimited edge list of the author graph.
    :param graph_nodes: The csv file containing the nodes (Lone author threads).
    :param graph_edges: The csv file containing the edges (Lone author threads).
    :param threads_json: The JSON file containing the cleaned headers.
    :param author_json: The JSON file containing the author UID map.
    :param ignore_lat: If true, then messages that belong to threads that have only a single author are ignored.
    :param time_limit: Timestamp in one of the formats accepted by get_datetime_object; messages that arrived at or
                       after it are ignored. Defaults to the current local time.
    :raises KeyError: If a sender or recipient address is absent from the author UID map.
    """
    author_graph = nx.DiGraph()
    with open(author_json, 'r') as author_uid_file:
        author_uid_map = json.load(author_uid_file)
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')

    if time_limit is None:
        time_limit = time.strftime("%a, %d %b %Y %H:%M:%S %z")
    time_limit = get_datetime_object(time_limit)
    print("All messages before", time_limit, "are being considered.")

    # The lone author thread lookup is only needed (and only computed) when such threads are to be ignored.
    skipped_ids = get_lone_author_threads(None, graph_nodes, graph_edges) if ignore_lat else None
    json_data = _load_messages(threads_json, email_re, time_limit, skipped_ids)
    print("JSON data loaded.")

    for message in json_data.values():
        if message['Cc'] is None:
            addr_list = message['To']
        else:
            addr_list = message['To'] | message['Cc']
        # The sender is looked up per recipient on purpose: a message without recipients never touches the UID map.
        for to_address in addr_list:
            author_graph.add_edge(author_uid_map[message['From']], author_uid_map[to_address])

    nx.write_edgelist(author_graph, author_edges, delimiter="\t")
    with open(author_nodes, 'w') as nodelist_file:
        for author_address, author_uid in author_uid_map.items():
            nodelist_file.write(str(author_uid) + "\t" + author_address + "\n")
# print("No. of Weakly Connected Components:", nx.number_weakly_connected_components(author_graph))
# print("No. of Strongly Connected Components:", nx.number_strongly_connected_components(author_graph))
# print("Nodes:", nx.number_of_nodes(author_graph))
# print("Edges:", nx.number_of_edges(author_graph))
# Example invocation:
# generate_edge_list(author_nodes='./data/lkml/tables/author_graph_nodes.txt', author_edges='./data/lkml/tables/author_graph_edges.txt',
# graph_nodes='./data/lkml/tables/graph_nodes.csv', graph_edges='./data/lkml/tables/graph_edges.csv',
# threads_json="./data/lkml/json/clean_data.json", author_json='./data/lkml/json/author_uid_map.json')