lib/analysis/author/wh_table.py
import csv
from collections import Counter

from lib.util.read import *

# Code-quality note: function `generate_wh_table_authors` has a Cognitive Complexity of 66 (exceeds 5 allowed). Consider refactoring.
def _load_discussion_graph(nodelist_filename, edgelist_filename, time_limit, excluded_nodes):
    """Build the directed discussion graph from the node and edge list files.

    Each node line is split on ';' into at most three fields: message id,
    sender and timestamp. Each edge line is split on ';' and its first two
    fields are read as integer message ids. Blank lines are skipped.

    :param nodelist_filename: The csv file containing the nodes.
    :param edgelist_filename: The csv file containing the edges.
    :param time_limit: Value compared against ``get_datetime_object(timestamp)``;
        only messages strictly before it are kept.
    :param excluded_nodes: Container of message ids that must not be added.
    :return: A ``nx.DiGraph`` whose nodes carry ``time``, ``color``, ``style``
        and ``sender`` attributes.
    """
    discussion_graph = nx.DiGraph()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    msgs_before_time = set()

    with open(nodelist_filename, "r") as node_file:
        for line in node_file:
            if not line.strip():
                continue
            fields = line.split(';', 2)
            sent_time = fields[2].strip()
            if not get_datetime_object(sent_time) < time_limit:
                continue
            node_id = int(fields[0])
            if node_id in excluded_nodes:
                continue
            msgs_before_time.add(node_id)
            # Prefer the bare e-mail address; fall back to the raw sender field.
            sender = fields[1].strip()
            address = email_re.search(sender)
            if address is not None:
                sender = address.group(0)
            discussion_graph.add_node(node_id, time=sent_time, color="#ffffff",
                                      style='bold', sender=sender)
    print("Nodes added.")

    with open(edgelist_filename, "r") as edge_file:
        for line in edge_file:
            if not line.strip():
                continue
            fields = line.split(';')
            source, target = int(fields[0]), int(fields[1])
            # Every id in msgs_before_time was added above with a sender, so
            # membership here is enough to know both endpoints are valid nodes.
            if source in msgs_before_time and target in msgs_before_time:
                discussion_graph.add_edge(source, target)
    print("Edges added.")

    return discussion_graph


def _tally_author_widths(discussion_graph, max_height):
    """Count, per height, how many threads have a given number of authors.

    A thread is a weakly connected component; its source is the node with the
    smallest id, and a node's height is its shortest-path distance from that
    source. Nodes that cannot be reached from the source are placed at
    height 1. Within a thread, nodes are visited in ascending id order and an
    author counts as "new" at the height of the first node they appear on.

    :param discussion_graph: The directed discussion graph.
    :param max_height: Largest height index to allocate.
    :return: ``(total_counts, new_counts, max_width)`` where
        ``total_counts[h][w]`` / ``new_counts[h][w]`` is the number of threads
        with exactly ``w`` distinct / new authors at height ``h``, and
        ``max_width`` is the largest number of distinct authors seen at any
        single height of any thread.
    """
    # Counters instead of fixed-width rows: the width index is bounded only by
    # the number of authors in a thread, so a pre-sized row can overflow.
    total_counts = [Counter() for _ in range(max_height + 1)]
    new_counts = [Counter() for _ in range(max_height + 1)]
    max_width = 0

    for component in nx.weakly_connected_components(discussion_graph):
        thread = discussion_graph.subgraph(component)
        source_node = min(thread.nodes())
        # One traversal per thread; dict() also covers NetworkX releases that
        # return an iterator of (node, length) pairs.
        heights = dict(nx.single_source_shortest_path_length(thread, source_node))

        authors_at_height = [set() for _ in range(max_height + 1)]
        new_authors_at_height = [set() for _ in range(max_height + 1)]
        thread_authors = set()

        for node, attributes in sorted(thread.nodes(data=True), key=lambda item: item[0]):
            node_author = attributes['sender']
            node_height = heights.get(node, 1)
            authors_at_height[node_height].add(node_author)
            if node_author not in thread_authors:
                new_authors_at_height[node_height].add(node_author)
                thread_authors.add(node_author)

        for height in range(max_height + 1):
            total_counts[height][len(authors_at_height[height])] += 1
            new_counts[height][len(new_authors_at_height[height])] += 1

        thread_max_width = max(len(authors) for authors in authors_at_height)
        max_width = max(max_width, thread_max_width)

    return total_counts, new_counts, max_width


def _write_wh_table(output_filename, total_counts, new_counts, max_width):
    """Write the width-height table to a CSV file.

    After three header rows, each data row holds the height, then a
    (Total, New) pair of thread counts for every width from 1 to
    ``max_width``, then the subtotal of the Total columns. A final row holds
    the grand total of all subtotals.

    :param output_filename: Path of the CSV file to write.
    :param total_counts: Per-height counters of threads by distinct-author count.
    :param new_counts: Per-height counters of threads by new-author count.
    :param max_width: Largest width to emit a column pair for.
    """
    widths = range(1, max_width + 1)
    # The csv module expects newline='' so it can control line endings itself.
    with open(output_filename, 'w', newline='') as csvfile:
        tablewriter = csv.writer(csvfile)
        tablewriter.writerow(["Height(h)", "Number of authors(i)"])
        # With no widths at all the label row still carries one empty cell.
        width_labels = [str(width) for width in widths] or [""]
        tablewriter.writerow([" "] + width_labels + [" ", "Subtotal"])
        tablewriter.writerow([" "] + ["Total", "New"] * max_width)

        total = 0
        for row_height, (totals, news) in enumerate(zip(total_counts, new_counts)):
            row = []
            for width in widths:
                row.extend((totals[width], news[width]))
            subtotal = sum(row[::2])
            tablewriter.writerow([row_height] + row + [subtotal])
            total += subtotal
        tablewriter.writerow(["Total:", total])


def generate_wh_table_authors(nodelist_filename, edgelist_filename, output_filename, ignore_lat=False, time_limit=None):
    """
    Generate the author version of the width height table.

    The width height table for the authors is a representation of the number
    of total and new authors in a thread aggregated at a given generation
    (height). The table is written into a CSV file and can be used to decipher
    the basic conversation structure.

    :param nodelist_filename: The csv file containing the nodes.
    :param edgelist_filename: The csv file containing the edges.
    :param output_filename: Stores the width-height table values.
    :param ignore_lat: If true, then lone author threads are ignored. The ids
        to skip come from ``get_lone_author_threads``, which is assumed to
        return a container of message ids supporting ``in``.
    :param time_limit: All messages until this time are considered and all
        messages after this time are ignored. Time is specified as a string in
        one of the formats recognized by ``get_datetime_object``. Defaults to
        the current time.
    """
    if time_limit is None:
        time_limit = time.strftime("%a, %d %b %Y %H:%M:%S %z")
    time_limit = get_datetime_object(time_limit)
    print("All messages before", time_limit, "are being considered.")

    if ignore_lat:
        excluded_nodes = get_lone_author_threads(nodelist_filename=nodelist_filename,
                                                 edgelist_filename=edgelist_filename)
    else:
        excluded_nodes = ()

    discussion_graph = _load_discussion_graph(nodelist_filename, edgelist_filename,
                                              time_limit, excluded_nodes)

    print("No. of Nodes: ", nx.number_of_nodes(discussion_graph))
    print("No. of Edges: ", nx.number_of_edges(discussion_graph))
    print("No. of Weakly Connected Components: ", nx.number_weakly_connected_components(discussion_graph))

    # The longest path in the graph bounds the height of any node in a thread.
    max_height = nx.dag_longest_path_length(discussion_graph)

    total_counts, new_counts, max_width = _tally_author_widths(discussion_graph, max_height)
    _write_wh_table(output_filename, total_counts, new_counts, max_width)