DeveloperCAP/MLCAT

View on GitHub
lib/analysis/author/wh_table.py

Summary

Maintainability
D
2 days
Test Coverage
import csv
from lib.util.read import *
 
 
Function `generate_wh_table_authors` has a Cognitive Complexity of 66 (exceeds 5 allowed). Consider refactoring.
Function `generate_wh_table_authors` has 5 arguments (exceeds 4 allowed). Consider refactoring.
def _load_discussion_graph(nodelist_filename, edgelist_filename, time_limit, excluded_nodes):
    """
    Build the discussion graph from the semicolon-separated node and edge lists.

    :param nodelist_filename: File with one ``id;sender;time`` record per line.
    :param edgelist_filename: File with one ``source_id;target_id`` record per line.
    :param time_limit: Only nodes whose time compares strictly less than this value are added.
    :param excluded_nodes: Container of node ids that must not be added to the graph.
    :return: A ``nx.DiGraph`` whose nodes carry ``time``, ``color``, ``style`` and ``sender`` attributes.
    """
    discussion_graph = nx.DiGraph()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    msgs_before_time = set()

    # Add nodes into NetworkX graph by reading from CSV file
    with open(nodelist_filename, "r") as node_file:
        for line in node_file:
            fields = line.split(';', 2)
            timestamp = fields[2].strip()
            if get_datetime_object(timestamp) >= time_limit:
                continue
            node_id = int(fields[0])
            if node_id in excluded_nodes:
                continue
            msgs_before_time.add(node_id)
            sender_field = fields[1].strip()
            address_match = email_re.search(sender_field)
            # Fall back to the raw sender field when it contains no e-mail address.
            from_addr = address_match.group(0) if address_match is not None else sender_field
            discussion_graph.add_node(node_id, time=timestamp, color="#ffffff", style='bold', sender=from_addr)
    print("Nodes added.")

    # Add edges into NetworkX graph by reading from CSV file. Only edges whose two endpoints were
    # both accepted above are kept; every accepted node was added with a 'sender' attribute, so no
    # further attribute check is needed, and excluded nodes never appear in msgs_before_time.
    with open(edgelist_filename, "r") as edge_file:
        for line in edge_file:
            fields = line.split(';')
            source_id = int(fields[0])
            target_id = int(fields[1])
            if source_id in msgs_before_time and target_id in msgs_before_time:
                discussion_graph.add_edge(source_id, target_id)
    print("Edges added.")

    return discussion_graph


def generate_wh_table_authors(nodelist_filename, edgelist_filename, output_filename, ignore_lat=False, time_limit=None):
    """
    Generate the author version of the width height table. The width height table for the authors is a
    representation of the number of total and new authors in a thread aggregated at a given generation.
    The table is built in memory and then written into a CSV file. These tables can be used to decipher
    the basic conversation structure.

    Within a thread (a weakly connected component), the height of a message is the shortest path length
    from the thread's smallest node id to that message, or 1 when no such path exists. An author is "new"
    at the height of the first of their messages when the thread's messages are visited in ascending id order.

    :param nodelist_filename: The csv file containing the nodes.
    :param edgelist_filename: The csv file containing the edges.
    :param output_filename: Stores the width-height table values.
    :param ignore_lat: If true, then lone author threads are ignored.
    :param time_limit: All messages until this time are considered and all messages after this time are ignored.
        Time is specified as a string in one of the recognized formats. Defaults to the current time.
    """
    from collections import Counter

    if time_limit is None:
        time_limit = time.strftime("%a, %d %b %Y %H:%M:%S %z")
    time_limit = get_datetime_object(time_limit)
    print("All messages before", time_limit, "are being considered.")

    if ignore_lat:
        # Nodes that are a part of a thread that has only a single author are left out of the graph.
        excluded_nodes = get_lone_author_threads(nodelist_filename=nodelist_filename,
                                                 edgelist_filename=edgelist_filename)
    else:
        excluded_nodes = frozenset()
    discussion_graph = _load_discussion_graph(nodelist_filename, edgelist_filename, time_limit, excluded_nodes)

    print("No. of Nodes: ", nx.number_of_nodes(discussion_graph))
    print("No. of Edges: ", nx.number_of_edges(discussion_graph))
    print("No. of Weakly Connected Components: ", nx.number_weakly_connected_components(discussion_graph))
    max_height = nx.dag_longest_path_length(discussion_graph)

    # One counter per height: if wh_table[i][j] == n, then n threads have exactly j distinct authors at
    # height i (new_wh_table counts new authors instead). Counters are used rather than fixed-width
    # lists so that a thread that is wide relative to the graph size cannot index past the row's end.
    wh_table = [Counter() for _ in range(max_height + 1)]
    new_wh_table = [Counter() for _ in range(max_height + 1)]
    max_width = 0

    for component in nx.weakly_connected_components(discussion_graph):
        conn_subgraph = discussion_graph.subgraph(component)
        # The following lists of sets store the authors / new authors at each level in the thread.
        authors_at_height = [set() for _ in range(max_height + 1)]
        new_authors_at_height = [set() for _ in range(max_height + 1)]
        thread_authors = set()
        source_node = min(conn_subgraph.nodes())
        for node, attributes in sorted(conn_subgraph.nodes(data=True), key=lambda item: item[0]):
            node_author = attributes['sender']
            try:
                node_height = nx.shortest_path_length(conn_subgraph, source_node, node)
            except nx.NetworkXNoPath:
                # The node is in the thread but not reachable from the source along edge directions.
                node_height = 1
            authors_at_height[node_height].add(node_author)
            if node_author not in thread_authors:
                new_authors_at_height[node_height].add(node_author)
                thread_authors.add(node_author)

        for height in range(max_height + 1):
            wh_table[height][len(authors_at_height[height])] += 1
            new_wh_table[height][len(new_authors_at_height[height])] += 1
        max_width = max(max_width, max(len(authors) for authors in authors_at_height))

    # Interleave the two tables: for every width 1..max_width a row holds the "Total" count followed
    # by the "New" count. Width 0 (heights a thread does not reach) is not reported.
    combined_table = []
    for total_row, new_row in zip(wh_table, new_wh_table):
        combined_row = []
        for width in range(1, max_width + 1):
            combined_row.append(total_row[width])
            combined_row.append(new_row[width])
        combined_table.append(combined_row)

    # newline='' lets the csv module control line endings (avoids blank rows on Windows).
    with open(output_filename, 'w', newline='') as csvfile:
        tablewriter = csv.writer(csvfile)
        tablewriter.writerow(["Height(h)", "Number of authors(i)"])
        # Each width label is followed by a blank cell so that it spans its Total/New column pair.
        width_labels = []
        for width in range(1, max_width + 1):
            width_labels.extend([str(width), " "])
        tablewriter.writerow([" "] + width_labels + ["Subtotal"])
        tablewriter.writerow([" "] + ("Total New " * max_width).split())
        total = 0
        for row_height, row in enumerate(combined_table):
            # The subtotal adds up the "Total" columns only (every other cell, starting at the first).
            subtotal = sum(row[::2])
            tablewriter.writerow([row_height] + row + [subtotal])
            total += subtotal
        tablewriter.writerow(["Total:", total])