data2tsne.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import csv\n",
"import sys\n",
"\n",
"ASDIR = \"../usb/ovk/data/eriktks/AS/text\"\n",
"ESDIR = \"../usb/ovk/data/eriktks/ES/text\"\n",
"REVERSEDFILENAME = \"reversed.txt\"\n",
"MAILFILENAME = \"mails.csv\"\n",
"SEPARATOR = \",\"\n",
"CLIENT = \"client-id\"\n",
"COUNSELOR = \"counselor\"\n",
"DATE = \"date\"\n",
"SENDER = \"sender\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def safeOpenFile(fileName,mode):\n",
" try: fileHandle = open(fileName,mode)\n",
" except Exception as e: sys.exit(\"error: cannot open file \"+fileName+\": \"+str(e))\n",
" return(fileHandle)\n",
"\n",
"def readFirstMails(inDir):\n",
" mails = {}\n",
" inFileName = inDir+\"/\"+MAILFILENAME\n",
" inFile = safeOpenFile(inFileName,\"r\")\n",
" csvReader = csv.DictReader(inFile,delimiter=SEPARATOR)\n",
" for row in csvReader:\n",
" try:\n",
" client = row[CLIENT]\n",
" date = row[DATE]\n",
" sender = row[SENDER]\n",
" if sender == \"CLIENT\" and \\\n",
" (not client in mails or \n",
" (mails[client][DATE] > date and date != \"\")):\n",
" mails[client] = dict(row)\n",
" except Exception as e: sys.exit(\"error: \"+str(e)) \n",
" inFile.close()\n",
" return(mails)\n",
"\n",
"asFirstMails = readFirstMails(ASDIR)\n",
"esFirstMails = readFirstMails(ESDIR)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def textToTokenDict(text):\n",
" tokenDict = {}\n",
" for token in text.split(): \n",
" tokenDict[token] = True\n",
" return(tokenDict)\n",
" \n",
"def makeFeatureDict(mailData):\n",
" tokens = {}\n",
" for client in mailData.keys():\n",
" mailTokenDict = textToTokenDict(mailData[client][\"text\"])\n",
" for token in mailTokenDict.keys():\n",
" if token in tokens: tokens[token] += 1\n",
" else: tokens[token] = 1\n",
" featureList = list(tokens.keys())\n",
" return(featureList)\n",
"\n",
"featureList = makeFeatureDict({**asFirstMails,**esFirstMails})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"OUTFILENAME=\"data2tsne.txt\"\n",
" \n",
"def writeFeatureValueLine(mailTokenDict,featureList,outFile):\n",
" for i in range(0,len(featureList)):\n",
" if featureList[i] in mailTokenDict: print(1.0,end=\" \",file=outFile)\n",
" else: print(0.0,end=\" \",file=outFile)\n",
" print(\"\",file=outFile)\n",
"\n",
"def writeFeatureValues(mailData,featureList,outFileName):\n",
" outFile = safeOpenFile(outFileName,\"w\")\n",
" for client in mailData.keys():\n",
" mailTokenDict = textToTokenDict(mailData[client][\"text\"])\n",
" writeFeatureValueLine(mailTokenDict,featureList,outFile)\n",
" outFile.close()\n",
"\n",
"selectedData = {**asFirstMails,**esFirstMails}\n",
"writeFeatureValues(selectedData,featureList,OUTFILENAME)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"OVKMETAFILE = \"../usb/ovk/data/eriktks/spss/opve.csv\"\n",
"IDFIELDNAME = \"onderzoeksnummer1\"\n",
"EXITFIELDNAME = \"Redenstoppen\"\n",
"SEPARATOR = \",\"\n",
"\n",
"def readMetaData():\n",
" exitData = {}\n",
" cesdDiff = {}\n",
" mhcDiff = {}\n",
" inFile = safeOpenFile(OVKMETAFILE,\"r\")\n",
" csvReader = csv.DictReader(inFile,delimiter=SEPARATOR)\n",
" for row in csvReader: \n",
" exitData[row[IDFIELDNAME]] = row[EXITFIELDNAME].strip()\n",
" if row[\"CESD_TOT_t0\"] != \"NA\" and row[\"CESD_TOT_t1\"] != \"NA\": \n",
" cesdDiff[row[IDFIELDNAME]] = int(row[\"CESD_TOT_t1\"])-int(row[\"CESD_TOT_t0\"])\n",
" if row[\"MHCtot_t0\"] != \"NA\" and row[\"MHCtot_t1\"] != \"NA\": \n",
" mhcDiff[row[IDFIELDNAME]] = float(row[\"MHCtot_t1\"])-float(row[\"MHCtot_t0\"])\n",
" inFile.close()\n",
" return(exitData,cesdDiff,mhcDiff)\n",
"\n",
"exitData,cesdDiff,mhcDiff = readMetaData()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"LABELFILE = \"data2tsne.txt.labels\"\n",
"\n",
"def test(client): return(testMhc(client))\n",
"def testCesd(client): return(client in cesdDiff and cesdDiff [client] > 0)\n",
"def testExit(client): return(client in exitData and exitData[client] != \"\")\n",
"def testMhc(client): return(client in mhcDiff and mhcDiff[client] < 0)\n",
"\n",
"def writeLabels(mailData,outFileName):\n",
" outFile = safeOpenFile(outFileName,\"w\")\n",
" for client in mailData.keys():\n",
" if test(client): print(\"1.0\",file=outFile)\n",
" else: print(\"0.0\",file=outFile)\n",
" outFile.close()\n",
"\n",
"writeLabels(selectedData,LABELFILE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}