e-mental-health/data-processing

View on GitHub
data2tsne.ipynb

Summary

Maintainability
Test Coverage
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "import sys\n",
    "\n",
    "ASDIR = \"../usb/ovk/data/eriktks/AS/text\"\n",
    "ESDIR = \"../usb/ovk/data/eriktks/ES/text\"\n",
    "REVERSEDFILENAME = \"reversed.txt\"\n",
    "MAILFILENAME = \"mails.csv\"\n",
    "SEPARATOR = \",\"\n",
    "CLIENT = \"client-id\"\n",
    "COUNSELOR = \"counselor\"\n",
    "DATE = \"date\"\n",
    "SENDER = \"sender\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def safeOpenFile(fileName,mode):\n",
    "    try: fileHandle = open(fileName,mode)\n",
    "    except Exception as e: sys.exit(\"error: cannot open file \"+fileName+\": \"+str(e))\n",
    "    return(fileHandle)\n",
    "\n",
    "def readFirstMails(inDir):\n",
    "    mails = {}\n",
    "    inFileName = inDir+\"/\"+MAILFILENAME\n",
    "    inFile = safeOpenFile(inFileName,\"r\")\n",
    "    csvReader = csv.DictReader(inFile,delimiter=SEPARATOR)\n",
    "    for row in csvReader:\n",
    "        try:\n",
    "            client = row[CLIENT]\n",
    "            date = row[DATE]\n",
    "            sender = row[SENDER]\n",
    "            if sender == \"CLIENT\" and \\\n",
    "               (not client in mails or \n",
    "                (mails[client][DATE] > date and date != \"\")):\n",
    "                mails[client] = dict(row)\n",
    "        except Exception as e: sys.exit(\"error: \"+str(e))      \n",
    "    inFile.close()\n",
    "    return(mails)\n",
    "\n",
    "asFirstMails = readFirstMails(ASDIR)\n",
    "esFirstMails = readFirstMails(ESDIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def textToTokenDict(text):\n",
    "    tokenDict = {}\n",
    "    for token in text.split(): \n",
    "        tokenDict[token] = True\n",
    "    return(tokenDict)\n",
    "    \n",
    "def makeFeatureDict(mailData):\n",
    "    tokens = {}\n",
    "    for client in mailData.keys():\n",
    "        mailTokenDict = textToTokenDict(mailData[client][\"text\"])\n",
    "        for token in mailTokenDict.keys():\n",
    "            if token in tokens: tokens[token] += 1\n",
    "            else: tokens[token] = 1\n",
    "    featureList = list(tokens.keys())\n",
    "    return(featureList)\n",
    "\n",
    "featureList = makeFeatureDict({**asFirstMails,**esFirstMails})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "OUTFILENAME=\"data2tsne.txt\"\n",
    "        \n",
    "def writeFeatureValueLine(mailTokenDict,featureList,outFile):\n",
    "    for i in range(0,len(featureList)):\n",
    "        if featureList[i] in mailTokenDict: print(1.0,end=\" \",file=outFile)\n",
    "        else: print(0.0,end=\" \",file=outFile)\n",
    "    print(\"\",file=outFile)\n",
    "\n",
    "def writeFeatureValues(mailData,featureList,outFileName):\n",
    "    outFile = safeOpenFile(outFileName,\"w\")\n",
    "    for client in mailData.keys():\n",
    "        mailTokenDict = textToTokenDict(mailData[client][\"text\"])\n",
    "        writeFeatureValueLine(mailTokenDict,featureList,outFile)\n",
    "    outFile.close()\n",
    "\n",
    "selectedData = {**asFirstMails,**esFirstMails}\n",
    "writeFeatureValues(selectedData,featureList,OUTFILENAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "OVKMETAFILE = \"../usb/ovk/data/eriktks/spss/opve.csv\"\n",
    "IDFIELDNAME = \"onderzoeksnummer1\"\n",
    "EXITFIELDNAME = \"Redenstoppen\"\n",
    "SEPARATOR = \",\"\n",
    "\n",
    "def readMetaData():\n",
    "    exitData = {}\n",
    "    cesdDiff = {}\n",
    "    mhcDiff = {}\n",
    "    inFile = safeOpenFile(OVKMETAFILE,\"r\")\n",
    "    csvReader = csv.DictReader(inFile,delimiter=SEPARATOR)\n",
    "    for row in csvReader: \n",
    "        exitData[row[IDFIELDNAME]] = row[EXITFIELDNAME].strip()\n",
    "        if row[\"CESD_TOT_t0\"] != \"NA\" and row[\"CESD_TOT_t1\"] != \"NA\": \n",
    "            cesdDiff[row[IDFIELDNAME]] = int(row[\"CESD_TOT_t1\"])-int(row[\"CESD_TOT_t0\"])\n",
    "        if row[\"MHCtot_t0\"] != \"NA\" and row[\"MHCtot_t1\"] != \"NA\": \n",
    "            mhcDiff[row[IDFIELDNAME]] = float(row[\"MHCtot_t1\"])-float(row[\"MHCtot_t0\"])\n",
    "    inFile.close()\n",
    "    return(exitData,cesdDiff,mhcDiff)\n",
    "\n",
    "exitData,cesdDiff,mhcDiff = readMetaData()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "LABELFILE = \"data2tsne.txt.labels\"\n",
    "\n",
    "def test(client): return(testMhc(client))\n",
    "def testCesd(client): return(client in cesdDiff and cesdDiff [client] > 0)\n",
    "def testExit(client): return(client in exitData and exitData[client] != \"\")\n",
    "def testMhc(client): return(client in mhcDiff and mhcDiff[client] < 0)\n",
    "\n",
    "def writeLabels(mailData,outFileName):\n",
    "    outFile = safeOpenFile(outFileName,\"w\")\n",
    "    for client in mailData.keys():\n",
    "        if test(client): print(\"1.0\",file=outFile)\n",
    "        else: print(\"0.0\",file=outFile)\n",
    "    outFile.close()\n",
    "\n",
    "writeLabels(selectedData,LABELFILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}