# lib/analysis/author/community.py
"""This module is used to find the community structure of the network according to the Infomap method of Martin Rosvalland Carl T. Bergstrom and returns an appropriate VertexClustering object. This module has been implemented using boththe iGraph package and the Infomap tool from MapEquation.org. The VertexClustering object represents the clustering ofthe vertex set of a graph and also provides some methods for getting the subgraph corresponding to a cluster and such. """import jsonimport subprocessimport sysimport igraphimport numpyimport networkx as nxfrom lib.analysis.author import rankingfrom lib.util.read import * sys.setrecursionlimit(10000) def write_pajek(author_graph, filename="author_graph.net"): """ Writes Pajek file that is compatible with the Infomap Community Detection script for an author graph :param author_graph: NX Graph object that contains the author interaction graph :param filename: Path of the Net file generated by Infomap detection module :return: None """ nx.write_pajek(author_graph, filename) lines_in_file= list() with open(filename, 'r') as pajek_file: for line in pajek_file: lines_in_file.append(line) num_vertices = int(lines_in_file[0].split()[1])Identical blocks of code found in 2 locations. Consider refactoring. for i in range(1, num_vertices+1): line = lines_in_file[i].split() line[1] = "\"" + line[1] + "\"" del line[2:] line.append("\n") lines_in_file[i] = " ".join(line) with open(filename, 'w') as pajek_file: for line in lines_in_file: pajek_file.write(line) print("Written to:", filename) Function `vertex_clustering` has a Cognitive Complexity of 83 (exceeds 5 allowed). Consider refactoring.
# Static-analysis note: function `vertex_clustering` has 26 lines of code (exceeds 25 allowed). Consider refactoring.
def _load_messages(json_filename, time_limit, excluded_ids=()):
    """
    Load the messages of the dataset that were sent before ``time_limit``.

    For every retained message the 'From' header is reduced to the first email
    address found in it (left unchanged when none is found), and 'To' / 'Cc'
    are replaced by the set of email addresses they contain ('Cc' stays None
    when it is None).

    :param json_filename: Path of the JSON file containing the dataset
    :param time_limit: Object that the parsed message time is compared against
        with ``<``
    :param excluded_ids: Container of integer Message-IDs to skip
    :return: Dictionary mapping the integer Message-ID to the message dictionary
    """
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    messages = dict()
    with open(json_filename, 'r') as json_file:
        # presumably lines_per_n yields 9-line chunks that each hold one JSON
        # object -- verify against lib.util.read
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)
            json_obj['Message-ID'] = int(json_obj['Message-ID'])
            if json_obj['Message-ID'] in excluded_ids:
                continue
            json_obj['Time'] = datetime.datetime.strptime(json_obj['Time'], "%a, %d %b %Y %H:%M:%S %z")
            if json_obj['Time'] < time_limit:
                from_addr = email_re.search(json_obj['From'])
                json_obj['From'] = from_addr.group(0) if from_addr is not None else json_obj['From']
                json_obj['To'] = set(email_re.findall(json_obj['To']))
                json_obj['Cc'] = set(email_re.findall(json_obj['Cc'])) if json_obj['Cc'] is not None else None
                messages[json_obj['Message-ID']] = json_obj
    return messages


def _ensure_vertex(author_graph, known_authors, email_addr):
    """Add ``email_addr`` to the graph as a named, labelled vertex unless it was added before."""
    if email_addr not in known_authors:
        known_authors.add(email_addr)
        author_graph.add_vertex(name=email_addr, label=email_addr)


def _link_recipients(author_graph, known_authors, sender, recipients, top_authors):
    """Create or strengthen an edge from ``sender`` to every recipient that is a top author."""
    for to_addr in recipients:
        if to_addr not in top_authors:
            continue
        _ensure_vertex(author_graph, known_authors, to_addr)
        # Indexing an igraph Graph by a pair of vertex names treats the graph
        # as an adjacency matrix: 0 means there is no edge yet, and on a
        # weighted graph the cell holds the edge weight.
        if author_graph[sender, to_addr] == 0:
            author_graph.add_edge(sender, to_addr, weight=1)
        else:
            author_graph[sender, to_addr] += 1


def vertex_clustering(json_filename, nodelist_filename, edgelist_filename, foldername, time_limit=None, ignore_lat=False):
    """
    This function performs vertex clustering on the dataset passed in the
    parameters and saves the dendrogram resulting from the vertex clustering as
    a PDF along with the visualization of the vertex cluster itself. Only the
    first 100 authors returned by the author ranking are included in the graph,
    as the visualization becomes incomprehensible for larger graphs.

    Three files are written: ``vd.pdf`` (dendrogram), ``vc.pdf`` (clustering)
    and ``community_vertex_clustering.txt`` (textual form of the clustering).

    :param json_filename: Path of the JSON file containing the dataset under analysis
    :param nodelist_filename: Path of the CSV file containing the list of nodes
        for the dataset under analysis
    :param edgelist_filename: Path of the CSV file containing the list of edges
        for the dataset under analysis
    :param foldername: Prefix of the output files. It is concatenated directly
        with the file names, so a directory has to end with a path separator.
    :param time_limit: Time limit can be specified here in the form of a
        timestamp in one of the identifiable formats and all messages that have
        arrived after this timestamp will be ignored. Defaults to the current time.
    :param ignore_lat: If true, then messages that belong to threads that have
        only a single author are ignored.
    :return: None
    """
    top_author_count = 100

    if time_limit is None:
        time_limit = time.strftime("%a, %d %b %Y %H:%M:%S %z")
    time_limit = get_datetime_object(time_limit)
    print("All messages before", time_limit, "are being considered.")

    # Lone author threads are only looked up when they have to be filtered out.
    if ignore_lat:
        excluded_ids = get_lone_author_threads(False, nodelist_filename, edgelist_filename)
    else:
        excluded_ids = ()
    json_data = _load_messages(json_filename, time_limit, excluded_ids)
    print("JSON data loaded.")

    author_graph = igraph.Graph()
    # NOTE(review): assigned while the graph has no edges yet; presumably this
    # registers the "weight" edge attribute so that the graph is treated as
    # weighted from the start -- confirm.
    author_graph.es["weight"] = 1.0

    # Keep the first `top_author_count` entries of the ranking; each entry is
    # unpacked as an (email address, score) pair.
    top_authors = set()
    author_scores = ranking.get(json_filename, None, active_score=2, passive_score=1, write_to_file=False)
    for position, (email_addr, _author_score) in enumerate(author_scores, start=1):
        top_authors.add(email_addr)
        if position == top_author_count:
            break

    known_authors = set()
    for node in json_data.values():
        sender = node['From']
        if sender not in top_authors:
            continue
        _ensure_vertex(author_graph, known_authors, sender)
        _link_recipients(author_graph, known_authors, sender, node['To'], top_authors)
        if node['Cc'] is not None:
            _link_recipients(author_graph, known_authors, sender, node['Cc'], top_authors)
    print("Nodes and Edges added to iGraph.")

    vertex_dendogram = author_graph.community_edge_betweenness(clusters=8, directed=True, weights="weight")
    igraph.plot(vertex_dendogram, foldername + "vd.pdf", vertex_label_size=3, bbox=(1200, 1200))
    print("Dendrogram saved as PDF.")

    vertex_clustering_obj = author_graph.community_infomap(edge_weights=author_graph.es["weight"])
    igraph.plot(vertex_clustering_obj, foldername + "vc.pdf", vertex_label_size=10, bbox=(1500, 1500), edge_color="gray")
    print("Vertex Clustering saved as PDF.")

    # The with-statement closes the file; no explicit close() is needed.
    with open(foldername + "community_vertex_clustering.txt", 'w') as output_file:
        output_file.write(str(vertex_clustering_obj))