DeveloperCAP/MLCAT

View on GitHub
lib/input/data_cleanup.py

Summary

Maintainability
B
6 hrs
Test Coverage
import json
from lib.util.read import lines_per_n
 
 
Function `remove_invalid_references` has a Cognitive Complexity of 40 (exceeds 5 allowed). Consider refactoring.
def remove_invalid_references(input_json_filename, output_json_filename, ref_toggle=False):
"""
This function is used to remove headers associated with invalid references.
:param input_json_filename: The json file containing all the references.
:param output_json_filename: The output json without invalid references.
:param ref_toggle: If true, gets the reference list from 'References' attribute.
"""
 
# The "unspecified_ref" list is used to keep track of all those mails that have '0' in their reference list.
# If any mail has any of the element in this list in its list of references, we can eliminate them as well
unspecified_ref = ['0']
 
print("Removing headers associated with invalid references...")
 
with open(input_json_filename, 'r') as fil:
with open(output_json_filename, mode='w', encoding='utf-8') as fin_file :
 
for chunk in lines_per_n(fil, 9):
# The "jfile" is used to store the json object read from the file.
jfile = json.loads(chunk)
 
"""
Mails that have references that are of type None indicate that they maybe the start of threads.
Anything else could be mail in a thread or something else.
"""
if jfile['References'] is not None:
# Checking if the references is an empty string
Avoid deeply nested control flow statements.
if not jfile['References'] == "":
# The references are stored as a comma separated string. We have to split it at the ',' to get a list.
if ref_toggle:
ref_list = jfile['References'].split(',')
else:
if jfile['In-Reply-To'] is not None:
ref_list = [str(jfile['In-Reply-To'])]
else:
ref_list = None
# A '0' in the list indicates that the mail contains references to some other mail which is not available to us
if '0' not in ref_list or ref_list is None:
data = {}
data['Message-ID'] = jfile['Message-ID']
data['From'] = jfile['From']
data['To'] = jfile['To']
data['Cc'] = jfile['Cc']
data['In-Reply-To'] = jfile['In-Reply-To']
data['References'] = jfile['References']
data['Time'] = jfile['Time']
contain_unspec_ref = False
 
# This is done to eliminate all those mails whose reference list contains mails that have '0' in their reference list
for ref in ref_list :
if ref in unspecified_ref:
contain_unspec_ref = True
if not contain_unspec_ref:
json.dump(data, fin_file, indent=1)
fin_file.write('\n')
else:
unspecified_ref.append(str(jfile['Message-ID']))
 
# Writing all those mails that have None as their References
else:
data = {}
data['Message-ID'] = jfile['Message-ID']
data['From'] = jfile['From']
data['To'] = jfile['To']
data['Cc'] = jfile['Cc']
data['In-Reply-To'] = jfile['In-Reply-To']
data['References'] = jfile['References']
data['Time'] = str(jfile['Time'])
json.dump(data, fin_file, indent=1)
fin_file.write('\n')
 
fin_file.close()
fil.close()