e-mental-health/data-processing

View on GitHub
ovkPrepare.py

Summary

Maintainability
B
6 hrs
Test Coverage
#!/usr/bin/python3 -W all
"""
    ovkPrepare.py: prepare ovk text files for conversion to csv
    usage: ovkPrepare.py [-g] file1 [file2 ...]
    note: option -g: use greetings as mail boundaries instead of dates
    20180509 erikt(at)xs4all.nl
"""

import getopt
import re
import sys

# Program name; popping it leaves only options and file names in sys.argv.
COMMAND = sys.argv.pop(0)
# A token must contain at least one letter to count as a name.
CHARPATTERN = r"[a-zA-Z]"
DATEHEADING = "EDate: "
DATESEP = "-"
# Alphabetic dates such as "9 mei" or "9 mei 18"; the month must be lower case.
# Groups: 1 = day, 2 = month name, 3 = whitespace + year, 4 = year digits only.
DATEPATTERNALPHA = r"^\s*(\d\d?)\s+([a-z]+)\.?(\s+(\d+))?\s*$"
DATEPATTERNALPHADAY = 1
DATEPATTERNALPHAMONTH = 2
# Group 4 holds the bare digits; group 3 would also include the leading
# whitespace and produce dates like "9-5-20 18".
DATEPATTERNALPHAYEAR = 4
# Numeric dates such as "9-5-2018", optionally preceded by a weekday word.
# Groups: 2 = weekday, 3 = day, 4 = month, 5 = year.
DATEPATTERNNUM = r"^\s*(([A-Za-z]+)\s+)?(\d+)\s*"+DATESEP+r"\s*(\d+)\s*"+DATESEP+r"\s*(\d+)\b$"
DATEPATTERNNUMWEEKDAY = 2
DATEPATTERNNUMDAY = 3
DATEPATTERNNUMMONTH = 4
DATEPATTERNNUMYEAR = 5
# Year used for alphabetic dates that do not mention a year.
DEFAULTYEAR = "2011"
FROMHEADING = "EFrom: "
# Greeting lines: greeting word, optional title, then the addressed name.
# Groups: 1 = greeting word, 4 = addressed name.
GREETPATTERN = r"^\s*([A-Za-z]+)\s+((heer|meneer|mevrouw)\s+)?([A-Za-z0-9]+)\b"
GREETPATTERNGREET = 1
GREETPATTERNTARGET = 4
# All entries must be lower case: candidate words are lower-cased before the
# membership test, so a capitalised entry could never match.
GREETINGS = ["beste","dag","goedeavond","goedemorgen","hallo","hee","hi","hoi","lieve"]
# Month names and abbreviations mapped to month numbers (as strings).
MONTHS = { "jan":"1","feb":"2","mrt":"3","apr":"4","mei":"5","jun":"6",
           "jul":"7","aug":"8","sep":"9","okt":"10","nov":"11","dec":"12",
           "maart":"3","april":"4" }
# Last alphanumeric token on a line (candidate signature name).
NAMEPATTERN = r"([A-Za-z0-9]+)\s*$"
NAMEPATTERNTARGET = 1
# Lines starting with a capitalised word followed by a colon are reported
# by the sanity check.
SUSPECTEDPATTERN = r"^\s*[A-Z][a-z]+:"
UNKNOWN = "???"
USAGE = "usage: "+COMMAND+" [-g] file1 [file2 ...]"

def warn(message):
    """Print message on standard error, prefixed with the program name.

    Returns an empty tuple.
    """
    text = COMMAND + ": " + message
    print(text, file=sys.stderr)
    return ()

def error(message):
    """Report a fatal problem on standard error and terminate the program.

    Raises SystemExit with status 1; this function never returns.
    """
    warn(message)
    # A non-zero status lets calling shells and scripts detect the failure.
    sys.exit(1)

def getClientName(fileName):
    """Return the client id: the digits before the first dot of the file name.

    The full string is tried first. If that fails, the directory part is
    dropped so that a path such as "data/123.txt" also yields "123".
    Calls error() when no leading number is found.
    """
    import os

    idPattern = r"^(\d+)\."
    matchId = re.search(idPattern, fileName)
    if not matchId:
        # The file may have been given with a directory prefix.
        matchId = re.search(idPattern, os.path.basename(fileName))
    if matchId:
        return matchId.group(1)
    error("cannot extract client name from file name: "+fileName)

def containsChars(string):
    """Return the match for the first character matching CHARPATTERN, or None."""
    charMatch = re.compile(CHARPATTERN).search(string)
    return charMatch

def getCounselorName(lines):
    """Return the most frequent name candidate in lines, or "" if none.

    For each line the candidate is the trailing token matched by NAMEPATTERN
    (when it contains a letter); otherwise it is the name addressed by a
    recognised greeting. Ties go to the candidate seen first.
    """
    counts = {}
    for text in lines:
        nameHit = re.search(NAMEPATTERN, text)
        greetHit = re.search(GREETPATTERN, text)
        if nameHit and containsChars(nameHit.group(NAMEPATTERNTARGET)):
            name = nameHit.group(NAMEPATTERNTARGET)
        elif (greetHit and containsChars(greetHit.group(GREETPATTERNGREET))
              and greetHit.group(GREETPATTERNGREET).lower() in GREETINGS):
            name = greetHit.group(GREETPATTERNTARGET)
        else:
            continue
        if name == "":
            continue
        counts[name] = counts.get(name, 0) + 1
    if not counts:
        return ""
    # max() returns the first key with the highest count, matching a stable
    # descending sort followed by taking the first element.
    return max(counts, key=counts.get)

def readTextFile(fileName):
    """Return the lines of fileName as a list, line endings included.

    Calls error() when the file cannot be opened, read or decoded.
    """
    lines = []
    try:
        # The context manager closes the file even if reading fails midway.
        with open(fileName, "r") as inFile:
            lines = inFile.readlines()
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError covers decoding
        # errors (UnicodeError) and invalid path strings.
        error("cannot read file "+fileName)
    return lines

def printMailText(client,counselor,date,mailText,receiver):
    """Print one mail on standard output: sender, optional date, blank line, text.

    The sender is the party the mail is not addressed to: the counselor when
    receiver equals client, the client when receiver equals counselor, and
    UNKNOWN otherwise. Returns an empty tuple.
    """
    sender = counselor if receiver == client else (
        client if receiver == counselor else UNKNOWN)
    print(FROMHEADING + sender)
    if not date == "":
        print(DATEHEADING + date)
    print()
    print(mailText)
    return ()

def getDateNum(line):
    """Return the numeric date found in line, or "" when line is not one.

    The result is day, month and year joined by DATESEP, preceded by the
    weekday word and a space when the line has one.
    """
    found = re.search(DATEPATTERNNUM, line)
    if not found:
        return ""
    numericPart = DATESEP.join(found.group(DATEPATTERNNUMDAY,
                                           DATEPATTERNNUMMONTH,
                                           DATEPATTERNNUMYEAR))
    weekday = found.group(DATEPATTERNNUMWEEKDAY)
    if weekday:
        return weekday + " " + numericPart
    return numericPart

def getDateAlpha(line):
    """Return the date in line, written with a month name, in numeric form.

    The result is day, month number and year joined by DATESEP. It is ""
    when line does not match DATEPATTERNALPHA or the month word is not in
    MONTHS. A missing year becomes DEFAULTYEAR, a 4-digit year is kept
    as is, and any other year gets the century prefix "20".
    """
    matchDateAlpha = re.search(DATEPATTERNALPHA, line)
    if not matchDateAlpha:
        return ""
    # An unknown month word means the line is ordinary text, not a date.
    month = MONTHS.get(matchDateAlpha.group(DATEPATTERNALPHAMONTH))
    if month is None:
        return ""
    day = matchDateAlpha.group(DATEPATTERNALPHADAY)
    # The captured year may be absent (None) or carry surrounding whitespace.
    year = (matchDateAlpha.group(DATEPATTERNALPHAYEAR) or "").strip()
    if not year:
        year = DEFAULTYEAR
    elif len(year) != 4:
        # Short years such as "18" get the century; "2018" is already complete.
        year = "20" + year
    return DATESEP.join([day, month, year])

def processFile(client,counselor,lines,options):
    """Split lines into mails and print each one with printMailText().

    A line that parses as a date starts a new mail. With option "g" in
    options, a recognised greeting starts a new mail as well. The name after
    a greeting becomes the receiver of the current mail. A second greeting
    within one mail is reported with warn().

    A line that only looks like a date (e.g. a number followed by a word
    that is not a month) is kept as ordinary mail text and does not end the
    current mail. Returns an empty tuple.
    """
    date = ""
    mailText = ""
    nbrOfProcessed = 0
    receiver = ""
    for i, line in enumerate(lines):
        matchGreet = re.search(GREETPATTERN,line)
        if matchGreet and matchGreet.group(GREETPATTERNGREET).lower() in GREETINGS:
            if "g" in options and (receiver != "" or nbrOfProcessed > 0):
                # Greeting used as mail boundary: flush the current mail.
                printMailText(client,counselor,date,mailText,receiver)
                mailText = ""
                receiver = ""
                date = ""
                nbrOfProcessed += 1
            if receiver == "":
                receiver = matchGreet.group(GREETPATTERNTARGET)
            else:
                warn("duplicate greeting (missing date?) on line "+str(i+1)+": "+line)
        # Parse before flushing: only a successfully parsed date is a boundary.
        newDate = getDateNum(line)
        if newDate == "":
            newDate = getDateAlpha(line)
        if newDate != "":
            if receiver != "" or nbrOfProcessed > 0:
                printMailText(client,counselor,date,mailText,receiver)
                mailText = ""
                receiver = ""
                nbrOfProcessed += 1
            date = newDate
        else:
            mailText += line
    # Flush whatever text remains after the last boundary.
    if mailText != "":
        printMailText(client,counselor,date,mailText,receiver)
    return ()

def sanityCheck(lines):
    """Pass every line matching SUSPECTEDPATTERN to warn().

    Returns an empty tuple.
    """
    suspect = re.compile(SUSPECTEDPATTERN)
    for text in filter(suspect.search, lines):
        warn(text)
    return ()

def makeOptionDict(optionList):
    """Turn a list of (option, value) pairs into a dict of option flags.

    Each key is the option name with one leading dash removed; each value is
    True. The option values in the pairs are ignored.
    """
    return {re.sub("^-","",pair[0]): True for pair in optionList}

def main(argv):
    """Process every file named in argv and return exit status 0.

    argv holds the command-line options and file names. An unknown option
    makes the program print USAGE through error(). For each file the lines
    are read, the client and counselor names are determined, the mails are
    printed, and suspicious lines are reported.
    """
    try:
        optionList, files = getopt.getopt(argv,"g",[])
    except getopt.GetoptError:
        # Only option-parsing problems should produce the usage message.
        error(USAGE)
    optionDict = makeOptionDict(optionList)
    for inFileName in files:
        lines = readTextFile(inFileName)
        client = getClientName(inFileName)
        counselor = getCounselorName(lines)
        processFile(client,counselor,lines,optionDict)
        sanityCheck(lines)
    return 0

# Script entry point. The first element of sys.argv was already popped into
# COMMAND at module level, so main() receives the remaining arguments only.
if __name__ == "__main__":
    sys.exit(main(sys.argv))