DeveloperCAP/MLCAT

View on GitHub
lib/input/mbox/mbox_hdr.py

Summary

Maintainability
B
4 hrs
Test Coverage
import mailbox
import json
import datetime
from lib.util.read import *


def extract_mail_header(mbox_filename, json_filename='headers.json', thread_uid_filename='thread_uid_map.json', author_uid_filename='author_uid_map.json'):
    """
    From the .MBOX file, this function extracts the header information is extracted using two predefined classes
    available in the Python Standard Library: Mailbox and Message, for accessing and manipulating on-disk mailboxes
    and the messages they contain respectively. The headers are then saved to a JSON file. an unique Message-ID is
    provided to each message in the .MBOX file in the ascending order of their arrival times. The mapping between
    the UIDs and Message-IDs are stored in another JSON file.
    
    :param mbox_filename: Contains the absolue or relative address of the file to be opened.
    :param json_filename: The JSON file in which the extracted headers are stored.
    :param thread_uid_filename: Contains a mapping between threads and UIDs.
    :param author_uid_filename: Contains a mapping between authors and UIDs.
    """
    print("Reading messages from MBOX file...")
    mailbox_obj = mailbox.mbox(mbox_filename)
    author_uid_map = dict()
    msg_hdr_list = list()
    uid_msg_id_map = dict()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    for msg_obj in mailbox_obj:
        msg_data = dict()
        if msg_obj.get('Message-ID') is None:
            continue
        msg_data['Message-ID'] = None
        msg_data['From'] = str(msg_obj.get('From'))
        msg_data['To'] = str(msg_obj.get('To'))
        if msg_data['To'] == 'None' or msg_data['From'] == 'None':
            continue
        msg_data['Cc'] = str(msg_obj.get('Cc'))
        from_addr = email_re.search(msg_data['From'])
        msg_data['From_iter'] = from_addr.group(0) if from_addr is not None else msg_data['From']
        msg_data['To_iter'] = set(email_re.findall(msg_data['To']))
        msg_data['Cc_iter'] = set(email_re.findall(msg_data['Cc'])) if msg_data['Cc'] is not None else None
        msg_data['Time'] = get_utc_time(msg_obj.get('Date'))
        if msg_data['Time'] == 'Error':
            continue
        msg_data['In-Reply-To'] = str(msg_obj.get('In-Reply-To'))
        msg_data['References'] = str(msg_obj.get('References'))
        msg_hdr_list.append((msg_data, str(msg_obj.get('Message-ID'))[1:-1]))

    msg_count = 1
    author_count = 1
    msg_hdr_list.sort(key=lambda msg: datetime.datetime.strptime(msg[0]['Time'], "%a, %d %b %Y %H:%M:%S %z"))
    for msg_data, uid in msg_hdr_list:
        msg_data['Message-ID'] = msg_count
        uid_msg_id_map[uid] = msg_count
        if msg_data['Cc'] is None:
            addr_list = msg_data['To_iter']
        else:
            addr_list = msg_data['To_iter'] | msg_data['Cc_iter']
        addr_list.add(msg_data['From_iter'])
        for to_address in addr_list:
            if author_uid_map.get(to_address, None) is None:
                author_uid_map[to_address] = author_count
                author_count += 1
        msg_count += 1

    print("Writing mail headers to JSON file...")
    with open(json_filename, mode='w', encoding='utf-8') as json_file:
        for msg_data, uid in msg_hdr_list:
            msg_data.pop('To_iter')
            msg_data.pop('From_iter')
            msg_data.pop('Cc_iter')
            if not msg_data['In-Reply-To'] == 'None':
                msg_data['In-Reply-To'] = uid_msg_id_map.get(msg_data['In-Reply-To'][1:-1], 0)
                if msg_data['In-Reply-To'] > msg_data['Message-ID']:
                    msg_data['In-Reply-To'] = 0
            else:
                msg_data['In-Reply-To'] = 0
            if msg_data['References']:
                ref_list = list()
                for reference in msg_data['References'].split(','):
                    ref_list.append(uid_msg_id_map.get(reference.strip()[1:-1], 0))
                msg_data['References'] = str(ref_list)[1:-1]
            json.dump(msg_data, json_file, indent=1)
            json_file.write("\n")
        json_file.close()

    print("Number of messages processed:", msg_count-1)
    print("Writing threads UID map to file...")
    with open(thread_uid_filename, mode='w', encoding='utf-8') as map_file:
        json.dump(uid_msg_id_map, map_file, indent=1)
        map_file.close()

    print("Writing authors UID map to file...")
    with open(author_uid_filename, mode='w', encoding='utf-8') as map_file:
        json.dump(author_uid_map, map_file, indent=1)
        map_file.close()