ovk-liwc-correlation.ipynb from e-mental-health/data-processing

ovk-liwc-correlation.ipynb
Summary

Maintainability

Test Coverage

Issues
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# OVK-LIWC correlation\n",
    "\n",
    "Test the correlation of the columns in the file ovk-liwc-wide.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "import re\n",
    "import sys\n",
    "\n",
    "INFILE = \"ovk-liwc-wide.csv\"\n",
    "\n",
    "def readFile(inFileName):\n",
    "    try:\n",
    "        table = []\n",
    "        inFile = open(inFileName,\"r\")\n",
    "        csvreader = csv.DictReader(inFile,delimiter=\",\")\n",
    "        for row in csvreader: table.append(row)\n",
    "        inFile.close()\n",
    "        return(table)\n",
    "    except Exception as e:\n",
    "        sys.exit(\"error processing file \"+inFileName+\": \"+str(e))\n",
    "\n",
    "# Parse the CSV once; the later cells all read this module-level `table`.\n",
    "table = readFile(INFILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib notebook\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def plot(x,y,title=\"\"):\n",
    "    \"\"\"Scatter-plot y against x, save the figure as a PNG and display it.\n",
    "\n",
    "    Args:\n",
    "        x: sequence of x coordinates.\n",
    "        y: sequence of y coordinates, same length as x.\n",
    "        title: text shown above the plot.\n",
    "\n",
    "    Note: every call writes to the same file, ovk-liwc-correlation.png, so\n",
    "    only the most recent figure is kept on disk.\n",
    "    \"\"\"\n",
    "    fig = plt.figure()\n",
    "    plt.scatter(x,y)\n",
    "    plt.title(title)\n",
    "    # Save through the figure handle before showing: with a blocking backend\n",
    "    # the figure is closed when show() returns and a later savefig() would\n",
    "    # write a new, empty image.\n",
    "    fig.savefig(\"ovk-liwc-correlation.png\")\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First, determine the correlation between the change in the CESD score and the change in the MHC score (T1 minus T0). The expectation is that this would be negative, since CESD measures depression (the higher, the more depression signals) and MHC measures mental health (the higher, the better). The expectation proves to be correct: the correlation coefficient is -0.34, and the negative correlation is identifiable from the plot."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Column names are the measure prefix followed by the T0 or T1 suffix.\n",
    "CESD = \"cesd\"\n",
    "MHC = \"mhc\"\n",
    "CESDT0,CESDT1 = CESD+\"T0\",CESD+\"T1\"\n",
    "MHCT0,MHCT1 = MHC+\"T0\",MHC+\"T1\"\n",
    "\n",
    "# Per-client change (T1 minus T0). Clients whose T1 value of either measure\n",
    "# is \"NA\" are left out of both lists, so the two lists stay aligned.\n",
    "cesd = [float(row[CESDT1])-float(row[CESDT0])\n",
    "        for row in table\n",
    "        if not (row[CESDT1] == \"NA\" or row[MHCT1] == \"NA\")]\n",
    "mhc = [float(row[MHCT1])-float(row[MHCT0])\n",
    "       for row in table\n",
    "       if not (row[CESDT1] == \"NA\" or row[MHCT1] == \"NA\")]\n",
    "\n",
    "np.corrcoef(cesd,mhc)[0][1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter plot of the per-client changes; the title reports how many clients are included.\n",
    "plot(cesd,mhc,\"change in CESD (x) vs MHC (y); \"+str(len(cesd))+\" clients\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we determine the correlation of the change in the CESD and MHC scores with each of the other features. We discard the features from T1 because we are interested in determining characteristics of patients from T0 data. We find that three features are weakly correlated with both CESD change and MHC change: avgWordLenCliT0, sexualT0 and assentT0. The absolute correlation coefficients of these features are about 0.2."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MINCOR = 0.15\n",
    "\n",
    "def numeric(table,key):\n",
    "    for row in table:\n",
    "        if row[key] != \"NA\":\n",
    "            try: float(row[key])\n",
    "            except: return(False)\n",
    "    return(True)\n",
    "\n",
    "def _columnAsNumbers(rows,key):\n",
    "    \"\"\"Return column `key` as one float per row (None where missing), or None if unusable.\n",
    "\n",
    "    Numeric columns are converted with float(). A non-numeric column with\n",
    "    exactly two distinct non-missing values is encoded as 0.0/1.0 in sorted\n",
    "    order, so the sign of its correlation is the same on every run. Any other\n",
    "    column yields None. The rows themselves are never modified.\n",
    "    \"\"\"\n",
    "    def missing(value): return(value is None or value == \"NA\")\n",
    "    if numeric(rows,key):\n",
    "        return([None if missing(row[key]) else float(row[key]) for row in rows])\n",
    "    categories = sorted(set(row[key] for row in rows if not missing(row[key])))\n",
    "    if len(categories) != 2: return(None)\n",
    "    return([None if missing(row[key]) else float(categories.index(row[key])) for row in rows])\n",
    "\n",
    "def computeCorrelations(targetName):\n",
    "    \"\"\"Print the features that correlate with the T0-to-T1 change of a measure.\n",
    "\n",
    "    For every usable column of the global `table`, the correlation coefficient\n",
    "    with <targetName>T1 - <targetName>T0 is computed over the rows where the\n",
    "    column and both target values are present. Columns whose name ends in T1\n",
    "    and correlations with an absolute value below MINCOR are not printed; the\n",
    "    rest is printed in descending order of correlation.\n",
    "\n",
    "    Args:\n",
    "        targetName: column prefix of the measure, e.g. \"cesd\" or \"mhc\".\n",
    "    \"\"\"\n",
    "    t0Name,t1Name = targetName+\"T0\",targetName+\"T1\"\n",
    "    correlations = {}\n",
    "    for key in table[0]:\n",
    "        values = _columnAsNumbers(table,key)\n",
    "        if values is None: continue\n",
    "        target,y = [],[]\n",
    "        for row,value in zip(table,values):\n",
    "            # skip rows that miss the feature or either end of the target change\n",
    "            if value is None or row[t0Name] == \"NA\" or row[t1Name] == \"NA\": continue\n",
    "            target.append(float(row[t1Name])-float(row[t0Name]))\n",
    "            y.append(value)\n",
    "        # a constant column has no defined correlation\n",
    "        if len(set(y)) > 1: correlations[key] = np.corrcoef(target,y)[0][1]\n",
    "    for key in sorted(correlations,key=correlations.__getitem__,reverse=True):\n",
    "        if abs(correlations[key]) >= MINCOR and not re.search(\"T1$\",key):\n",
    "            print(correlations[key],key)\n",
    "            \n",
    "# Report the feature ranking for both outcome measures, separated by a marker line.\n",
    "computeCorrelations(\"cesd\")\n",
    "print(\"\\n***\\n\")\n",
    "computeCorrelations(\"mhc\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plotFeature(table,measureName,featureName,title=\"\",sb=1):\n",
    "    \"\"\"Scatter-plot the T0-to-T1 change of a measure against one feature.\n",
    "\n",
    "    Args:\n",
    "        table: list of row dicts.\n",
    "        measureName: column prefix of the measure; <measureName>T1 minus\n",
    "            <measureName>T0 goes on the x axis.\n",
    "        featureName: column plotted on the y axis.\n",
    "        title: plot title; defaults to \"<measureName> vs <featureName>\".\n",
    "            The rounded correlation is always appended in parentheses.\n",
    "        sb: unused; kept so existing calls remain valid.\n",
    "    \"\"\"\n",
    "    rows = [x for x in table if x[measureName+\"T1\"] != \"NA\" and x[featureName] != \"NA\"]\n",
    "    measure = [float(x[measureName+\"T1\"])-float(x[measureName+\"T0\"]) for x in rows]\n",
    "    feature = [float(x[featureName]) for x in rows]\n",
    "    corr = np.corrcoef(measure,feature)[0][1]\n",
    "    plot(measure,feature,(title or measureName+\" vs \"+featureName)+\" (\"+str(round(corr,2))+\")\")\n",
    "\n",
    "# The measure plotted here is MHC, so the titles say MHC as well.\n",
    "plotFeature(table,\"mhc\",\"assentT0\",\"MHC vs assentT0\",1)\n",
    "plotFeature(table,\"mhc\",\"sexualT0\",\"MHC vs sexualT0\",2)\n",
    "plotFeature(table,\"mhc\",\"avgWordLenCliT0\",\"MHC vs avgWordLenCliT0\",3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we check the relation between counselors and the therapy result. For this purpose, we need to convert the counselor ids to numbers. There are several ways to do this: each ordering of the counselors gives a different numbering. We do not know which ordering works best, so we check all possible orderings and select the one with the strongest correlation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import permutations\n",
    "\n",
    "COUNSELOR = \"counselor\"\n",
    "NA = \"NA\"\n",
    "\n",
    "counselorList = [x[COUNSELOR] for x in table]\n",
    "counselors = sorted(set(counselorList))\n",
    "# Only clients with both T1 scores take part; filter once instead of in every iteration.\n",
    "completeRows = [x for x in table if x[CESDT1] != NA and x[MHCT1] != NA]\n",
    "cesd = [float(x[CESDT1])-float(x[CESDT0]) for x in completeRows]\n",
    "mhc = [float(x[MHCT1])-float(x[MHCT0]) for x in completeRows]\n",
    "target = mhc\n",
    "counselorPerRow = [x[COUNSELOR] for x in completeRows]\n",
    "\n",
    "# Try every assignment of the numbers 0..n-1 to the counselors and keep the\n",
    "# one whose correlation with the target is strongest in absolute value.\n",
    "# NOTE(review): this takes n! iterations, and picking the best of all\n",
    "# orderings is likely to overstate the strength of the relation.\n",
    "bestShuffle = {counselor:i for i,counselor in enumerate(counselors)}\n",
    "bestCorr = 0\n",
    "for shuffle in permutations(range(len(counselors))):\n",
    "    shuffled = dict(zip(counselors,shuffle))\n",
    "    counselorsShuffled = [shuffled[counselor] for counselor in counselorPerRow]\n",
    "    corr = np.corrcoef(target,counselorsShuffled)[0][1]\n",
    "    # Compare magnitudes on both sides: bestCorr keeps its sign, so testing\n",
    "    # against the raw value would let any ordering replace a negative best.\n",
    "    if abs(corr) > abs(bestCorr):\n",
    "        bestCorr = corr\n",
    "        bestShuffle = shuffled\n",
    "print(bestCorr,bestShuffle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-apply the best counselor numbering to the rows with both T1 scores and plot it.\n",
    "counselorsShuffled = [bestShuffle[row[COUNSELOR]]\n",
    "                      for row in table\n",
    "                      if not (row[CESDT1] == NA or row[MHCT1] == NA)]\n",
    "corr = np.corrcoef(target,counselorsShuffled)[0][1]\n",
    "plot(target,counselorsShuffled)\n",
    "corr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Even the strongest correlated features are only weakly correlated (0.2). The small data size could be a cause of this. We could explore three directions for improving this:\n",
    "\n",
    "1. Increase the size of the current dataset\n",
    "2. Combine features\n",
    "3. Look for other features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We compute the correlation factor of the two variables delta CESD and delta MHC with the set of features COUNSELOR, ASSENTT0, SEXUALT0 and AVGWORDLENCLIT0. For this purpose we use [multiple correlation](https://en.wikipedia.org/wiki/Multiple_correlation#Computation). The results are:\n",
    "\n",
    "1. CESD: COU:11%, ASS:-26%, SEX:-25%, AVG:-20%, COMBI:43%\n",
    "2. MHC: COU:29%, ASS:22%, SEX:20%, AVG:18%, COMBI:44%"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import pandas as pd\n",
    "\n",
    "COUNSELORSHUFFLED = COUNSELOR+\"shuffled\"\n",
    "ASSENTT0 = \"assentT0\"\n",
    "SEXUALT0 = \"sexualT0\"\n",
    "AVGWORDLENCLIT0 = \"avgWordLenCliT0\"\n",
    "\n",
    "def makeSubsetNonNA(table,columns):\n",
    "    \"\"\"Keep the rows that have a value for every requested column.\n",
    "\n",
    "    Args:\n",
    "        table: list of row dicts.\n",
    "        columns: names of the columns to keep.\n",
    "\n",
    "    Returns:\n",
    "        A list of new dicts limited to `columns`. A row is dropped when any\n",
    "        of the columns is absent, None or equal to NA. Values are converted\n",
    "        to float, except the COUNSELOR column, which is copied unchanged.\n",
    "    \"\"\"\n",
    "    def isComplete(row):\n",
    "        return(all(name in row and row[name] not in (None,NA) for name in columns))\n",
    "\n",
    "    subset = []\n",
    "    for row in filter(isComplete,table):\n",
    "        newRow = {name:float(row[name]) for name in columns if name != COUNSELOR}\n",
    "        if COUNSELOR in columns: newRow[COUNSELOR] = row[COUNSELOR]\n",
    "        subset.append(newRow)\n",
    "    return(subset)\n",
    "\n",
    "# Rows in which all eight columns are present; derived columns are added to them below.\n",
    "subsetNonNADict = makeSubsetNonNA(\n",
    "    table,\n",
    "    [CESDT0,CESDT1,MHCT0,MHCT1,ASSENTT0,SEXUALT0,AVGWORDLENCLIT0,COUNSELOR])\n",
    "measures = []\n",
    "features = []\n",
    "for row in subsetNonNADict:\n",
    "    row[CESD] = row[CESDT1]-row[CESDT0]\n",
    "    row[MHC] = row[MHCT1]-row[MHCT0]\n",
    "    row[COUNSELORSHUFFLED] = bestShuffle[row[COUNSELOR]]\n",
    "    measures.append([row[CESD],row[MHC]])\n",
    "    features.append([row[name] for name in (COUNSELORSHUFFLED,ASSENTT0,SEXUALT0,AVGWORDLENCLIT0)])\n",
    "\n",
    "# https://en.wikipedia.org/wiki/Multiple_correlation#Computation\n",
    "# r = sqrt(c' * inv(Rxx) * c): Rxx holds the correlations among the features,\n",
    "# c the correlation of each feature with the selected measure.\n",
    "Rxx = pd.DataFrame(np.array(features)).corr()\n",
    "TARGETCESD = 0\n",
    "TARGETMHC = 1\n",
    "# Only the MHC column of `measures` is evaluated here.\n",
    "selectedMeasure = [pair[TARGETMHC] for pair in measures]\n",
    "c = []\n",
    "for f in range(len(features[0])):\n",
    "    featureColumn = [featureRow[f] for featureRow in features]\n",
    "    c.append(np.corrcoef(selectedMeasure,featureColumn)[0][1])\n",
    "c = np.array(c)\n",
    "r = math.sqrt(np.transpose(c).dot(np.linalg.inv(Rxx).dot(c)))\n",
    "(r,c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}