# lib/analysis/author/graph/generate.py
"""
This module is used to generate graphs that show the interaction between authors in the mailing list. There is an
edge from one author to another if the former sent a message to the latter either in To or by marking in CC. These
graphs are for the entire mailing list.
"""
import json

# NOTE(review): nx, re, time, datetime, lines_per_n, get_datetime_object and get_lone_author_threads are used in this
# module without an explicit import; presumably they are provided by this star import -- confirm in lib/util/read.py.
from lib.util.read import *


def write_to_pajek(author_graph, filename="author_graph.net"):
    """
    Write the networkx graph in Pajek format to author_graph.net

    The file is first produced by nx.write_pajek and then rewritten in place so that every vertex line holds only
    the vertex id followed by the label in double quotes; all remaining fields on a vertex line are discarded.
    Lines after the vertex section are left untouched.

    :param author_graph: Networkx graph.
    :param filename: Write to Pajek file compatible with the Infomap Community Detection module.
    """
    # Write Pajek file compatible with the Infomap Community Detection module
    nx.write_pajek(author_graph, filename)

    # nx.write_pajek encodes its output as UTF-8 unless told otherwise, so the file is read back (and rewritten
    # below) with the same encoding instead of the platform default, which may not be UTF-8.
    with open(filename, 'r', encoding='utf-8') as pajek_file:
        lines_in_file = pajek_file.readlines()

    # The first line is the vertices header; its second whitespace-separated field is the vertex count.
    num_vertices = int(lines_in_file[0].split()[1])

    for i in range(1, num_vertices + 1):
        fields = lines_in_file[i].split()
        # Keep only the id and the label wrapped in double quotes. The space before the newline is kept on purpose
        # so that the rewritten lines stay byte-identical to what this function has always produced.
        # NOTE(review): fields are split on whitespace, so a label that itself contains whitespace is cut at its
        # first space here.
        lines_in_file[i] = '{} "{}" \n'.format(fields[0], fields[1])

    with open(filename, 'w', encoding='utf-8') as pajek_file:
        pajek_file.writelines(lines_in_file)


def _extract_addresses(email_re, field):
    """
    Return the set of e-mail addresses matched by email_re in a header field; an absent (None) field gives an
    empty set.
    """
    if field is None:
        return set()
    return set(email_re.findall(field))


def _load_messages(clean_data, time_limit, skip_ids=()):
    """
    Read the message records from clean_data and return them as a dict keyed by integer Message-ID.

    Records whose Message-ID is in skip_ids, or whose Time is not earlier than time_limit, are left out. For every
    kept record the From field is reduced to the first e-mail address found in it (or left as is when none is
    found) and the To and Cc fields are replaced by sets of e-mail addresses.

    :param clean_data: Path to clean_data.json file
    :param time_limit: Only messages with a Time earlier than this value are kept.
    :param skip_ids: Container of integer Message-IDs to leave out.
    """
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    json_data = dict()

    with open(clean_data, 'r') as json_file:
        # The file is consumed 9 lines at a time; presumably every JSON record spans 9 lines -- TODO confirm
        # against the writer of clean_data.json.
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)
            json_obj['Message-ID'] = int(json_obj['Message-ID'])
            if json_obj['Message-ID'] in skip_ids:
                continue

            json_obj['Time'] = datetime.datetime.strptime(json_obj['Time'], "%a, %d %b %Y %H:%M:%S %z")
            if json_obj['Time'] < time_limit:
                from_addr = email_re.search(json_obj['From'])
                if from_addr is not None:
                    json_obj['From'] = from_addr.group(0)
                # A missing To is treated like a missing Cc: it simply contributes no recipients.
                json_obj['To'] = _extract_addresses(email_re, json_obj['To'])
                json_obj['Cc'] = _extract_addresses(email_re, json_obj['Cc'])
                json_data[json_obj['Message-ID']] = json_obj

    return json_data


def author_interaction(clean_data, graph_nodes, graph_edges, pajek_file, ignore_lat=True):
    """
    Builds the directed author graph (an edge from the sender of a message to every address in its To and Cc
    fields), writes it to pajek_file through write_to_pajek() and prints the number of strongly connected
    components, weakly connected components, number of nodes and edges from the author graph.

    :param clean_data: Path to clean_data.json file
    :param graph_nodes: Path to graph_nodes.csv file
    :param graph_edges: Path to graph_edges.csv file
    :param pajek_file: Path to author_graph.net(pajek file) which is used by write_to_pajek()
    :param ignore_lat: If true, then messages that belong to threads that have only a single author are ignored.
    """
    # Time limit can be specified here in the form of a timestamp in one of the identifiable formats and all messages
    # that have arrived after this timestamp will be ignored.
    time_limit = None

    if time_limit is None:
        # No limit given: use the current local time so that every message already received is considered.
        time_limit = time.strftime("%a, %d %b %Y %H:%M:%S %z")
    time_limit = get_datetime_object(time_limit)
    print("All messages before", time_limit, "are being considered.")

    if ignore_lat:
        # Message-IDs are tested for membership in this collection; presumably it holds the ids of all messages
        # that belong to single-author threads -- confirm against get_lone_author_threads.
        lone_author_threads = get_lone_author_threads(nodelist_filename=graph_nodes, edgelist_filename=graph_edges)
    else:
        lone_author_threads = ()

    json_data = _load_messages(clean_data, time_limit, lone_author_threads)
    print("JSON data loaded.")

    author_graph = nx.DiGraph()
    for message in json_data.values():
        for to_address in message['To'] | message['Cc']:
            author_graph.add_edge(message['From'], to_address)

    write_to_pajek(author_graph, filename=pajek_file)

    print("No. of Weakly Connected Components:", nx.number_weakly_connected_components(author_graph))
    print("No. of Strongly Connected Components:", nx.number_strongly_connected_components(author_graph))
    print("Nodes:", nx.number_of_nodes(author_graph))
    print("Edges:", nx.number_of_edges(author_graph))