e-mental-health/data-processing

View on GitHub
data-analysis.ipynb

Summary

Maintainability
Test Coverage
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "import sys\n",
    "\n",
    "ASFILE = \"../usb/ovk/data/eriktks/AS/text/AS-mails.csv\"\n",
    "ESFILE = \"../usb/ovk/data/eriktks/ES/text/ES-mails.csv\"\n",
    "SEPARATOR = \",\"\n",
    "CLIENT = \"client-id\"\n",
    "COUNSELOR = \"counselor\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "OVKMETAFILE = \"../usb/ovk/data/eriktks/spss/opve.csv\"\n",
    "IDFIELDNAME = \"onderzoeksnummer1\"\n",
    "EXITFIELDNAME = \"Redenstoppen\"\n",
    "\n",
    "def readMetaData():\n",
    "    exitData = {}\n",
    "    cesdDiff = {}\n",
    "    mhcDiff = {}\n",
    "    stopReasons = {}\n",
    "    try: inFile = open(OVKMETAFILE,\"r\")\n",
    "    except Exception as e: sys.exit(\"cannot read file \"+OVKMETAFILE+\": \"+str(e))\n",
    "    csvReader = csv.DictReader(inFile,delimiter=SEPARATOR)\n",
    "    for row in csvReader: \n",
    "        exitData[row[IDFIELDNAME]] = row[EXITFIELDNAME].strip()\n",
    "        if exitData[row[IDFIELDNAME]]:\n",
    "            stopReason = exitData[row[IDFIELDNAME]].lower()\n",
    "            if not stopReason in stopReasons: stopReasons[stopReason] = 0\n",
    "            stopReasons[stopReason] += 1\n",
    "        if row[\"CESD_TOT_t0\"] != \"NA\" and row[\"CESD_TOT_t1\"] != \"NA\": \n",
    "            cesdDiff[row[IDFIELDNAME]] = int(row[\"CESD_TOT_t1\"])-int(row[\"CESD_TOT_t0\"])\n",
    "        if row[\"MHCtot_t0\"] != \"NA\" and row[\"MHCtot_t1\"] != \"NA\": \n",
    "            mhcDiff[row[IDFIELDNAME]] = float(row[\"MHCtot_t1\"])-float(row[\"MHCtot_t0\"])\n",
    "    inFile.close()\n",
    "    return(exitData,cesdDiff,mhcDiff,stopReasons)\n",
    "\n",
    "exitData,cesdDiff,mhcDiff,stopReasons = readMetaData()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reasons for stopping: different expectations (6), personal problems (4), external factors (3), other (1), unknown (12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def testFunction(clientId):\n",
    "    return(combi3(clientId))\n",
    "\n",
    "def betterMental(clientId):\n",
    "    if clientId == \"stopped\": return(\"decreased\")\n",
    "    elif clientId == \"name\": return(\"MHC\")\n",
    "    elif not clientId in mhcDiff: return(None)\n",
    "    else: return(mhcDiff[clientId] > 0)\n",
    "\n",
    "def lessDepressed(clientId):\n",
    "    if clientId == \"stopped\": return(\"increased\")\n",
    "    elif clientId == \"name\": return(\"CESD\")\n",
    "    elif not clientId in cesdDiff: return(None)\n",
    "    else: return(cesdDiff[clientId] < 0)\n",
    "    \n",
    "def finished(clientId):\n",
    "    if clientId == \"stopped\": return(\"stopped\")\n",
    "    elif clientId == \"name\": return(\"Treatment progress\")\n",
    "    elif not clientId in exitData: return(None)\n",
    "    else: return(exitData[clientId] == \"\")\n",
    "\n",
    "def combi3(clientId):\n",
    "    if clientId == \"stopped\": return(\"failed one test\")\n",
    "    elif clientId == \"name\": return(\"combi3\")\n",
    "    bm = betterMental(clientId)\n",
    "    ld = lessDepressed(clientId)\n",
    "    fi = finished(clientId)\n",
    "    if bm == None or ld == None or fi == None: return(None)\n",
    "    else: return(bm and ld and fi)\n",
    "\n",
    "def readData():\n",
    "    clientMails = {\"unknown\":{},\"finished\":{},\"stopped\":{}}\n",
    "    counselorMails = {\"unknown\":{},\"finished\":{},\"stopped\":{}}\n",
    "    counselorClients = {\"unknown\":{},\"finished\":{},\"stopped\":{}}\n",
    "    for inFileName in [ASFILE,ESFILE]:\n",
    "        try: inFile = open(inFileName,\"r\")\n",
    "        except Exception as e: sys.exit(str(e)+\" Cannot read file \"+inFileName+\": \"+str(e))\n",
    "        csvReader = csv.DictReader(inFile,delimiter=SEPARATOR)\n",
    "        for row in csvReader:\n",
    "            try:\n",
    "                client = row[CLIENT]\n",
    "                counselor = row[COUNSELOR]\n",
    "                if testFunction(client) == None: treatmentStatus = \"unknown\"\n",
    "                elif testFunction(client): treatmentStatus = \"finished\"\n",
    "                else: treatmentStatus = \"stopped\"\n",
    "                if not client in clientMails[treatmentStatus]: \n",
    "                    clientMails[treatmentStatus][client] = 0\n",
    "                clientMails[treatmentStatus][client] += 1\n",
    "                if not counselor in counselorMails[treatmentStatus]: \n",
    "                    counselorMails[treatmentStatus][counselor] = 0\n",
    "                counselorMails[treatmentStatus][counselor] += 1\n",
    "                if not counselor in counselorClients[treatmentStatus]: \n",
    "                    counselorClients[treatmentStatus][counselor] = {}\n",
    "                if not client in counselorClients[treatmentStatus][counselor]: \n",
    "                    counselorClients[treatmentStatus][counselor][client] = True\n",
    "            except Exception as e: sys.exit(str(e)+\" Unexpected row in file \"+inFileName+\": \"+str(row))\n",
    "        inFile.close()\n",
    "    return(clientMails,counselorMails,counselorClients)\n",
    "\n",
    "def fillEmptySpots(myDict,filler):\n",
    "    for ts1 in myDict:\n",
    "        for key in myDict[ts1]:\n",
    "            for ts2 in myDict:\n",
    "                if ts2 != ts1 and not key in myDict[ts2]:\n",
    "                    myDict[ts2][key] = filler\n",
    "    return(myDict)\n",
    "\n",
    "clientMails,counselorMails,counselorClients = readData()\n",
    "counselorMails = fillEmptySpots(counselorMails,0)\n",
    "counselorClients = fillEmptySpots(counselorClients,{})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "COLORS = { \"unknown\":\"grey\",\"finished\":\"blue\",\"stopped\":\"red\"}\n",
    "\n",
    "def computeAverage(myDict):\n",
    "    myValues = {}\n",
    "    for key1 in myDict:\n",
    "        for key2 in myDict[key1]:\n",
    "            if not key2 in myValues: myValues[key2] = myDict[key1][key2]\n",
    "            else: myValues[key2] += myDict[key1][key2]\n",
    "    return(np.average(list(myValues.values())))\n",
    "\n",
    "def computeMinMax(myDict):\n",
    "    myMin = None\n",
    "    myMax = None\n",
    "    for key in myDict:\n",
    "        if len(myDict[key]) > 0:\n",
    "            minList = min(myDict[key].keys())\n",
    "            maxList = max(myDict[key].keys())\n",
    "            if myMin == None or myMin > minList: myMin = minList\n",
    "            if myMax == None or myMax < maxList: myMax = maxList\n",
    "    return(myMin,myMax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(15,8))\n",
    "plt.subplot(2,1,1)\n",
    "nbrOfClients = np.sum([len(clientMails[x]) for x in clientMails])\n",
    "average = computeAverage(clientMails)\n",
    "plt.title(\"OVK data: \"+testFunction(\"name\")+\". Number of mails per client: \"+str(nbrOfClients)+\" clients; \"+\n",
    "          str(len(clientMails[\"stopped\"]))+\" \"+testFunction(\"stopped\")+\"; \"+str(len(clientMails[\"unknown\"]))+\n",
    "          \" unknown; average number of mails per client: \"+str(int(average)))\n",
    "for ts in clientMails:\n",
    "    plt.bar([int(x) for x in clientMails[ts].keys()],[x for x in clientMails[ts].values()],color=COLORS[ts])\n",
    "myMin,myMax = computeMinMax(clientMails)\n",
    "plt.axis([7995,8288,0,40])\n",
    "plt.plot((int(myMin),int(myMax)),(average,average),\"y\")\n",
    "\n",
    "plt.subplot(2,2,3)\n",
    "average = computeAverage(counselorMails)\n",
    "plt.title(\"Mails per counselor (average: \"+str(int(average))+\")\")\n",
    "countsUnknown = [x for x in counselorMails[\"unknown\"].values()]\n",
    "countsStopped = [x for x in counselorMails[\"stopped\"].values()]\n",
    "countsFinished = [x for x in counselorMails[\"finished\"].values()]\n",
    "countsUnknownStopped = np.add(countsUnknown,countsStopped)\n",
    "plt.bar([x for x in range(1,len(countsUnknown)+1)],countsUnknown,color=COLORS[\"unknown\"])\n",
    "plt.bar([x for x in range(1,len(countsStopped)+1)],countsStopped,color=COLORS[\"stopped\"],bottom=countsUnknown)\n",
    "plt.bar([x for x in range(1,len(countsFinished)+1)],countsFinished,color=COLORS[\"finished\"],\n",
    "        bottom=countsUnknownStopped)\n",
    "plt.axis([0.5,9.5,0,400])\n",
    "#plt.bar([i for i in range(0,len(counselorMails[\"finished\"]))],[x for x in counselorMails[\"finished\"].values()])\n",
    "plt.plot((0.5,[len(counselorMails[x]) for x in counselorMails][0]+0.5),(average,average),\"y\")\n",
    "\n",
    "plt.subplot(2,2,4)\n",
    "countsUnknown = [len(x) for x in counselorClients[\"unknown\"].values()]\n",
    "countsStopped = [len(x) for x in counselorClients[\"stopped\"].values()]\n",
    "countsFinished = [len(x) for x in counselorClients[\"finished\"].values()]\n",
    "countsUnknownStopped = np.add(countsUnknown,countsStopped)\n",
    "average = np.average(np.add(countsUnknown,np.add(countsStopped,countsFinished)))\n",
    "plt.title(\"Clients per counselor (average: \"+str(int(average))+\")\")\n",
    "plt.bar([x for x in range(1,len(countsUnknown)+1)],countsUnknown,color=COLORS[\"unknown\"])\n",
    "plt.bar([x for x in range(1,len(countsStopped)+1)],countsStopped,color=COLORS[\"stopped\"],bottom=countsUnknown)\n",
    "plt.bar([x for x in range(1,len(countsFinished)+1)],countsFinished,color=COLORS[\"finished\"],\n",
    "        bottom=countsUnknownStopped)\n",
    "plt.plot((0.5,[len(counselorClients[x]) for x in counselorClients][0]+0.5),(average,average),\"y\")\n",
    "plt.axis([0.5,9.5,0,23])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Note: for two clients that stopped the treatment according to the metadata (8068, 8077), there are no mails in the collection. The mails of the client that was banned from the treatment (8236), were not used. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bmld = {}\n",
    "ldfi = {}\n",
    "fibm = {}\n",
    "for ts in counselorClients:\n",
    "    for cs in counselorClients[ts]:\n",
    "        for cl in counselorClients[ts][cs]:\n",
    "            bm = betterMental(cl)\n",
    "            ld = lessDepressed(cl)\n",
    "            fi = finished(cl)\n",
    "            if not bm in bmld: bmld[bm] = {}\n",
    "            if not ld in bmld[bm]: bmld[bm][ld] = 0\n",
    "            bmld[bm][ld] += 1\n",
    "            if not ld in ldfi: ldfi[ld] = {}\n",
    "            if not fi in ldfi[ld]: ldfi[ld][fi] = 0\n",
    "            ldfi[ld][fi] += 1\n",
    "            if not fi in fibm: fibm[fi] = {}\n",
    "            if not bm in fibm[fi]: fibm[fi][bm] = 0\n",
    "            fibm[fi][bm] += 1\n",
    "print(\"bm-ld\")\n",
    "for bm in bmld.keys():\n",
    "    for ld in bmld[bm]:\n",
    "        print(bm,ld,bmld[bm][ld],end=\"#\")\n",
    "    print(\"\")\n",
    "print(\"ld-fi\")\n",
    "for ld in ldfi.keys():\n",
    "    for fi in ldfi[ld]:\n",
    "        print(ld,fi,ldfi[ld][fi],end=\"#\")\n",
    "    print(\"\")\n",
    "print(\"fi-bm\")\n",
    "for fi in fibm.keys():\n",
    "    for bm in fibm[fi]:\n",
    "        print(fi,bm,fibm[fi][bm],end=\"#\")\n",
    "    print(\"\")\n",
    "print(\"\")\n",
    "print(\"BM=True | LD=True: \",bmld[True][True]/(bmld[None][True]+bmld[True][True]+bmld[False][True]))\n",
    "print(\"BM=False | LD=False: \",bmld[False][False]/(0+bmld[True][False]+bmld[False][False]))\n",
    "print(\"LD=True | BM=True: \",bmld[True][True]/(0+bmld[True][True]+bmld[True][False]))\n",
    "print(\"LD=False | BM=False: \",bmld[False][False]/(0+bmld[False][True]+bmld[False][False]))\n",
    "\n",
    "print(\"LD=True | FI=True: \",ldfi[True][True]/(ldfi[None][True]+ldfi[True][True]+ldfi[False][True]))\n",
    "print(\"LD=False | FI=False: \",ldfi[False][False]/(ldfi[None][False]+ldfi[True][False]+ldfi[False][False]))\n",
    "print(\"FI=True | LD=True: \",ldfi[True][True]/(0+ldfi[True][True]+ldfi[True][False]))\n",
    "print(\"FI=False | LD=False: \",ldfi[False][False]/(0+ldfi[False][True]+ldfi[False][False]))\n",
    "\n",
    "print(\"FI=True | BM=True: \",fibm[True][True]/(0+fibm[True][True]+fibm[False][True]))\n",
    "print(\"FI=False | BM=False: \",fibm[False][False]/(0+fibm[True][False]+fibm[False][False]))\n",
    "print(\"BM=True | FI=True: \",fibm[True][True]/(fibm[True][None]+fibm[True][True]+fibm[True][False]))\n",
    "print(\"BM=False | FI=False: \",fibm[False][False]/(fibm[False][None]+fibm[False][True]+fibm[False][False]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}