tactus2daap-en.py from e-mental-health/data-processing

tactus2daap-en.py
Summary

Maintainability

1 day
Test Coverage

Issues
#!/usr/bin/python3
"""
    tactus2daap-en.py: convert xml files from tactus to daap scores per mail csv
    usage: tactus2daap-en.py [-m] file1.xml [file2.xml ...] > file1.csv
    notes: 
    * option -m activates mail-internal analysis (default: per-mail analysis)
    * based on tactus2liwc-en.py
    20190204 erikt(at)xs4all.nl
"""

import csv
import nltk
import operator
import re
import sys
import xml.etree.ElementTree as ET
import getopt
import daap

COMMAND = sys.argv.pop(0)
USAGE = "usage: "+COMMAND+" [-m] file1 [file2 ...]"
INTAKEQUESTIONNAIRE = "./Intake/Questionnaire"
QUESTIONNAIRE = "./Treatment/TreatmentSteps/TreatmentStep/Questionnaire"
QUESTIONNAIRETITLES = { "Intake":True,"Lijst tussenmeting":True,"Lijst nameting":True,"Lijst 3 maanden":True,"Lijst half jaar":True }
ANSWERS = "./Content/question/answer"
MESSAGES = "./Messages/Message"
AGE = "leeftijd"
CLIENT = "CLIENT"
COUNSELOR = "COUNSELOR"
SENDER = "Sender"
RECIPIENT = "Recipients"
QUESTION = "question"
DATESENT = "DateSent"
DATE = "DATE"
BODY = "Body"
ATTACHMENT = "Attachment"
SUBJECT = "Subject"
SENDERID = 1
MAILDATEID = 3
MAILTITLEID = 4
MAILBODYID = 5
OUTPUTDIR = "/home/erikt/projects/e-mental-health/usb/output"
EMAILFILE = OUTPUTDIR+"/emails.csv"
EMAILHEADING = ["id","sender","receipient","date","subject","text"]
DAAPDIR = "/home/erikt/projects/e-mental-health/DAAP09.6/WRAD/"
DAAPFILE = "WRAD.Wt"
TEXTBOUNDARY = "%"
NBROFTOKENS = "NBROFTOKENS"
NBROFSENTS = "NBROFSENTS"
NBROFMATCHES = "NBROFMATCHES"
NUMBER = "number"
METADATAFEATURES = [DATE,NBROFTOKENS,NBROFSENTS,NBROFMATCHES,SENDER]
DAAP = "daap"
DAAPAVG = "daapavg"
DAAPPOS = "daappos"
DAAPNEG = "daapneg"
MAILID = "mailId"

numberId = -1
headerPrinted = False

def cleanupText(text):
    if text == None: return("")
    text = re.sub(r"\s+"," ",text)
    text = re.sub(r"^ ","",text)
    text = re.sub(r" $","",text)
    return(text)

def removeSpaces(text):
    text = re.sub(r"\s+","",text)
    return(text)

def makeId(fileName):
    thisId = re.sub(r".*/","",fileName)
    thisId = re.sub(r"\.xml.*$","",thisId)
    return(thisId)

def anonymizeCounselor(name):
    if name != CLIENT: return(COUNSELOR)
    else: return(name)

def getEmailData(root,thisId):
    clientMails = []
    counselorMails = []
    for message in root.findall(MESSAGES):
        body = ""
        date = ""
        recipient = ""
        sender = ""
        subject = ""
        for child in message:
            if child.tag == SENDER: 
                sender = anonymizeCounselor(cleanupText(child.text))
            elif child.tag == RECIPIENT: 
                recipient = anonymizeCounselor(cleanupText(child.text))
            elif child.tag == DATESENT: date = cleanupText(child.text)
            elif child.tag == SUBJECT: subject = cleanupText(child.text)
            elif child.tag == BODY: body = cleanupText(child.text)
        if sender == CLIENT: clientMails.append([thisId,sender,recipient,date,subject,body])
        else: counselorMails.append([thisId,sender,recipient,date,subject,body])
    clientMails = cleanupMails(clientMails,counselorMails)
    counselorMails = cleanupMails(counselorMails,clientMails)
    allMails = clientMails
    allMails.extend(counselorMails)
    return(sorted(allMails,key=lambda subList:subList[MAILDATEID]))

def sentenceSplit(text):
    tokens = text.split()
    sentence = []
    sentences = []
    for token in tokens:
        sentence.append(token)
        if not re.search(r"[a-zA-Z0-9'\"]",token): 
            sentences.append(" ".join(sentence))
            sentence = []
    if len(sentence) > 0: sentences.append(" ".join(sentence))
    return(sentences)

def cleanupMails(clientMails, counselorMails):
    clientSentenceDates = {}
    counselorSentenceDates = {}
    for i in range(0,len(clientMails)):
        date = clientMails[i][MAILDATEID]
        body = clientMails[i][MAILBODYID]
        sentences = sentenceSplit(body)
        for s in sentences:
            if (s in clientSentenceDates and date < clientSentenceDates[s]) or \
                not s in clientSentenceDates:
                clientSentenceDates[s] = date
    for i in range(0,len(counselorMails)):
        date = counselorMails[i][MAILDATEID]
        body = counselorMails[i][MAILBODYID]
        sentences = sentenceSplit(body)
        for s in sentences:
            if s in clientSentenceDates and date < clientSentenceDates[s]:
                counselorSentenceDates[s] = date
                del(clientSentenceDates[s])
            elif s in counselorSentenceDates and date < counselorSentenceDates[s]:
                counselorSentenceDates[s] = date
            elif not s in clientSentenceDates and not s in counselorSentenceDates:
                counselorSentenceDates[s] = date
    for i in range(0,len(clientMails)):
        date = clientMails[i][MAILDATEID]
        body = clientMails[i][MAILBODYID]
    return(clientMails)

def getQuestionnaires(root,thisId):
    qs = []
    for questionnaires in INTAKEQUESTIONNAIRE,QUESTIONNAIRE:
        for questionnaire in root.findall(questionnaires):
            title = cleanupText(questionnaire.findall("./Title")[0].text)
            if title in QUESTIONNAIRETITLES:
                q = {"title":title,"id":thisId}
                for answer in questionnaire.findall(ANSWERS):
                    try:
                        key = answer.attrib["ID"]
                        value = cleanupText(answer.findall("./answerText")[0].text)
                        q[key] = value
                    except: continue 
                qs.append(q)
    return(qs)

def tokenize(text):
    sentences = nltk.sent_tokenize(text)
    tokens = []
    for s in sentences:
        t = nltk.word_tokenize(s)
        tokens.extend(t)
    return(tokens,len(sentences))

def isNumber(string):
    return(string.lstrip("-").replace(".","1").isnumeric())

def readWords(inFile):
    words = {}
    for line in inFile:
        line = line.strip()
        word,weight = line.split()
        weight = re.sub("r-","0",weight)
        words[word] = float(weight)
    return(words)

def readDaapDict(inFileName):
    try: inFile = open(inFileName,"r")
    except Exception as e: 
        sys.exit(COMMAND+": cannot read DAAP dictionary "+inFileName)
    words = readWords(inFile)
    inFile.close()
    return(words)

def addFeatureToCounts(counts,feature):
    if feature in counts: counts[feature] += 1
    else: counts[feature] = 1

def text2daap(words,tokens):
    global numberId

    counts = { NBROFMATCHES:0 , DAAPAVG:0.0, DAAPPOS:0.0, DAAPNEG:0.0 }
    for token in tokens:
        if token in words:
            addFeatureToCounts(counts,NBROFMATCHES)
            counts[DAAPAVG] += words[token]
            if words[token] > 0: counts[DAAPPOS] += words[token]
            if words[token] < 0: counts[DAAPNEG] += words[token]
    if counts[NBROFMATCHES] > 0: 
        counts[DAAPAVG] /= counts[NBROFMATCHES]
        counts[DAAPPOS] /= counts[NBROFMATCHES]
        counts[DAAPNEG] /= counts[NBROFMATCHES]
    return(counts)

def readTextFromStdin():
    text = ""
    for line in sys.stdin: text += line
    return(text)

def printHeader(features):
    for i in range(0,len(METADATAFEATURES)):
        if i != 0: print(",",end="")
        print(METADATAFEATURES[i],end="")
    for value in features:
        print(","+value,end="")
    print()

def printResults(features,results):
    for i in range(0,len(METADATAFEATURES)):
        if i != 0: print(",",end="")
        if METADATAFEATURES[i] in results:
            print(removeSpaces(str(results[METADATAFEATURES[i]])),end="")
    for feature in features:
        print(",",end="")
        if feature in results: print(results[feature],end="")
        else: print(0,end="")
    print()

def emails2daapPerMail(emails,features,words):
    global headerPrinted

    if not headerPrinted:
        printHeader(features)
        headerPrinted = True
    for row in emails:
        text = row[MAILTITLEID]+" "+row[MAILBODYID]
        tokens,nbrOfSents = tokenize(text)
        results = text2daap(words,tokens)
        results[NBROFTOKENS] = len(tokens)
        results[NBROFSENTS] = nbrOfSents
        results[SENDER] = row[SENDERID]
        results[DATE] = row[MAILDATEID]
        printResults(features,results)

def emails2daapMailInternal(emails,features,words):
    global headerPrinted

    if not headerPrinted:
        print(MAILID+","+SENDER+","+DATE+","+DAAP)
        headerPrinted = True
    for mailId in range(0,len(emails)):
        text = emails[mailId][MAILTITLEID]+" "+emails[mailId][MAILBODYID]
        tokens,nbrOfSents = tokenize(text)
        averageWeights = daap.daap(" ".join(tokens))
        for weight in averageWeights:
            print(str(mailId)+","+emails[mailId][SENDERID]+","+emails[mailId][MAILDATEID]+","+str(weight))

def processOptions(argv):
    try:
        optionList, files = getopt.getopt(argv,"m",[])
        options = {}
        for option, arg in optionList:
            if option == "-m": options[option] = True
            else: sys.exit(USAGE)
        return(options,files)
    except Exception as e: sys.exit(USAGE+" "+str(e))

def main(argv):
    emails = []
    questionnaires = []
    options,files = processOptions(argv)
    words = readDaapDict(DAAPDIR+DAAPFILE)
    features = [DAAPAVG,DAAPPOS,DAAPNEG]
    for inFile in files:
        tree = ET.parse(inFile)
        root = tree.getroot()
        thisId = makeId(inFile)
        emails.extend(getEmailData(root,thisId))
        questionnaires.extend(getQuestionnaires(root,thisId))
    if len(emails) > 0: 
        if "-m" in options: emails2daapMailInternal(emails,features,words)
        else: emails2daapPerMail(emails,features,words)
    return(0)

if __name__ == "__main__":
    sys.exit(main(sys.argv))