DeveloperCAP/MLCAT: lib/analysis/author/community.py
"""
This module finds the community structure of the network according to the Infomap method of Martin Rosvall
and Carl T. Bergstrom and returns an appropriate VertexClustering object. It has been implemented using both
the igraph package and the Infomap tool from MapEquation.org. The VertexClustering object represents the
clustering of the vertex set of a graph and provides methods such as retrieving the subgraph corresponding
to a cluster.

"""
import datetime
import json
import re
import subprocess
import sys
import time

import igraph
import networkx as nx
import numpy

from lib.analysis.author import ranking
from lib.util.read import *

sys.setrecursionlimit(10000)
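

# ---------------------------------------------------------------------------
# Illustrative sketch (not called by the functions below): the module
# docstring above refers to igraph's VertexClustering object. This helper
# shows, on igraph's built-in Zachary karate-club graph, the kind of
# operations that object supports, such as reading cluster membership and
# extracting the subgraph of a single cluster. The function name and the
# choice of demo graph are purely illustrative.
def _demo_vertex_clustering_usage():
    """Run Infomap on a small built-in graph and inspect the clustering."""
    graph = igraph.Graph.Famous("Zachary")
    clustering = graph.community_infomap()
    # Number of communities found and the community index of every vertex.
    print("Communities found:", len(clustering))
    print("Membership vector:", clustering.membership)
    # Subgraph induced by the vertices of the first community.
    first_community = clustering.subgraph(0)
    print("Vertices in community 0:", first_community.vcount())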


def write_pajek(author_graph, filename="author_graph.net"):
    """
    Writes a Pajek (.net) file for an author graph that is compatible with the Infomap community detection tool.

    :param author_graph: NetworkX Graph object containing the author interaction graph
    :param filename: Path of the .net file to write; this file is the input passed to the Infomap tool
    :return: None
    """
    nx.write_pajek(author_graph, filename)
    with open(filename, 'r') as pajek_file:
        lines_in_file = pajek_file.readlines()
    # Rewrite the vertex section of the Pajek file: wrap each vertex label in
    # double quotes and drop any trailing fields on the vertex line.
    num_vertices = int(lines_in_file[0].split()[1])
    for i in range(1, num_vertices+1):
        line = lines_in_file[i].split()
        line[1] = "\"" + line[1] + "\""
        del line[2:]
        line.append("\n")
        lines_in_file[i] = " ".join(line)
    with open(filename, 'w') as pajek_file:
        for line in lines_in_file:
            pajek_file.write(line)
    print("Written to:", filename)


def vertex_clustering(json_filename, nodelist_filename, edgelist_filename, foldername, time_limit=None, ignore_lat=False):
    """
    This function performs vertex clustering on the dataset passed in the parameters and saves the dendrogram
    resulting from the vertex clustering as a PDF, along with a visualization of the vertex clustering itself. It is
    recommended to limit these graphs to 200 authors as the visualization becomes incomprehensible beyond that.

    :param json_filename: Path of the JSON file containing the dataset under analysis
    :param nodelist_filename: Path of the CSV file containing the list of nodes for the dataset under analysis
    :param edgelist_filename: Path of the CSV file containing the list of edges for the dataset under analysis
    :param foldername: Path of the folder to which the output PDFs and text files are written
    :param time_limit: Time limit can be specified here in the form of a timestamp in one of the identifiable formats
        and all messages that have arrived after this timestamp will be ignored.
    :param ignore_lat: If true, messages that belong to threads that have only a single author are ignored.
    :return: None
    """

    json_data = dict()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')

    if time_limit is None:
        time_limit = time.strftime("%a, %d %b %Y %H:%M:%S %z")
    msgs_before_time = set()
    time_limit = get_datetime_object(time_limit)
    print("All messages before", time_limit, "are being considered.")

    if not ignore_lat:
        with open(json_filename, 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                json_obj['Message-ID'] = int(json_obj['Message-ID'])
                json_obj['Time'] = datetime.datetime.strptime(json_obj['Time'], "%a, %d %b %Y %H:%M:%S %z")
                if json_obj['Time'] < time_limit:
                    # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                    from_addr = email_re.search(json_obj['From'])
                    json_obj['From'] = from_addr.group(0) if from_addr is not None else json_obj['From']
                    json_obj['To'] = set(email_re.findall(json_obj['To']))
                    json_obj['Cc'] = set(email_re.findall(json_obj['Cc'])) if json_obj['Cc'] is not None else None
                    # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                    json_data[json_obj['Message-ID']] = json_obj
    else:
        lone_author_threads = get_lone_author_threads(False, nodelist_filename, edgelist_filename)
        with open(json_filename, 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                json_obj['Message-ID'] = int(json_obj['Message-ID'])
                if json_obj['Message-ID'] not in lone_author_threads:
                    json_obj['Time'] = datetime.datetime.strptime(json_obj['Time'], "%a, %d %b %Y %H:%M:%S %z")
                    if json_obj['Time'] < time_limit:
                        # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                        from_addr = email_re.search(json_obj['From'])
                        json_obj['From'] = from_addr.group(0) if from_addr is not None else json_obj['From']
                        json_obj['To'] = set(email_re.findall(json_obj['To']))
                        json_obj['Cc'] = set(email_re.findall(json_obj['Cc'])) if json_obj['Cc'] is not None else None
                        # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                        json_data[json_obj['Message-ID']] = json_obj
    print("JSON data loaded.")

    author_graph = igraph.Graph()
    author_graph.es["weight"] = 1.0
    author_map = dict()

    """
    Graphs can also be indexed by strings or pairs of vertex indices or vertex names. When a graph is
    indexed by a string, the operation translates to the retrieval, creation, modification or deletion
    of a graph attribute.

    When a graph is indexed by a pair of vertex indices or names, the graph itself is treated as an
    adjacency matrix and the corresponding cell of the matrix is returned. Assigning values different
    from zero or one to the adjacency matrix will be translated to one, unless the graph is weighted,
    in which case the numbers will be treated as weights.
    """
    top_authors = set()
    author_scores = ranking.get(json_filename, None, active_score=2, passive_score=1, write_to_file=False)
    index = 0
    for email_addr, author_score in author_scores:
        index += 1
        top_authors.add(email_addr)
        if index == 100:
            break

    index = 0
    for msg_id, node in json_data.items():
        if node['From'] in top_authors:
            if node['From'] not in author_map:
                author_map[node['From']] = index
                author_graph.add_vertex(name=node['From'], label=node['From'])
                index += 1
            for to_addr in node['To']:
                if to_addr in top_authors:
                    if to_addr not in author_map:
                        author_map[to_addr] = index
                        author_graph.add_vertex(name=to_addr, label=to_addr)
                        index += 1
                    if author_graph[node['From'], to_addr] == 0:
                        author_graph.add_edge(node['From'], to_addr, weight=1)
                    else:
                        author_graph[node['From'], to_addr] += 1
            if node['Cc'] is None:
                continue
            for to_addr in node['Cc']:
                if to_addr in top_authors:
                    if to_addr not in author_map:
                        author_map[to_addr] = index
                        author_graph.add_vertex(name=to_addr, label=to_addr)
                        index += 1
                    if author_graph[node['From'], to_addr] == 0:
                        author_graph.add_edge(node['From'], to_addr, weight=1)
                    else:
                        author_graph[node['From'], to_addr] += 1

    print("Nodes and Edges added to iGraph.")

    vertex_dendrogram = author_graph.community_edge_betweenness(clusters=8, directed=True, weights="weight")
    igraph.plot(vertex_dendrogram, foldername + "vd.pdf", vertex_label_size=3, bbox=(1200, 1200))
    print("Dendrogram saved as PDF.")

    vertex_clustering_obj = author_graph.community_infomap(edge_weights=author_graph.es["weight"])
    igraph.plot(vertex_clustering_obj, foldername + "vc.pdf", vertex_label_size=10, bbox=(1500, 1500), edge_color="gray")
    print("Vertex Clustering saved as PDF.")

    with open(foldername + "community_vertex_clustering.txt", 'w') as output_file:
        output_file.write(str(vertex_clustering_obj))
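

# ---------------------------------------------------------------------------
# Hedged sketch: the igraph documentation quoted inside vertex_clustering()
# describes indexing a Graph by a pair of vertex names, which is the pattern
# the loops above use to create or increment weighted edges. The standalone
# example below reproduces that pattern on throwaway vertex names ("alice",
# "bob"); it is illustrative only and is not called anywhere in this module.
def _demo_pair_indexing():
    """Show adjacency-matrix style indexing on a weighted igraph Graph."""
    graph = igraph.Graph()
    graph.add_vertex(name="alice")
    graph.add_vertex(name="bob")
    # Indexing by a pair of vertex names reads the corresponding cell of the
    # adjacency matrix; 0 means the edge does not exist yet.
    if graph["alice", "bob"] == 0:
        graph.add_edge("alice", "bob", weight=1)
    # Because the edge carries a "weight" attribute, the graph is weighted and
    # assigning through the same pair index updates that weight in place.
    graph["alice", "bob"] += 1
    print("Weight of the alice-bob edge:", graph["alice", "bob"])  # expected: 2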