tactus-fasttext.ipynb from e-mental-health/data-processing

tactus-fasttext.ipynb
Summary

Maintainability

Test Coverage

Issues
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fasttext experiments with tactus data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First import required modules: general Python modules and modules from Orange3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import fasttext\n",
    "import gzip\n",
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "import sys\n",
    "\n",
    "sys.path.append(\"/home/erikt/projects/e-mental-health/enron/orange-hackathon/orangehackathon/libs\")\n",
    "import tactusloaderLIB\n",
    "import OWEmailSorterLIB\n",
    "import markduplicatesLIB\n",
    "import removemarkedtextLIB\n",
    "import LIWCLIB"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, read the mails from the data set via Orange 3 and store them in the data structure allMails. This takes several minutes so we do not want to do this often."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ANODIRECTORY = \"/home/erikt/projects/e-mental-health/usb/releases/20191217/\"\n",
    "ANOSTRING = \"-an\"\n",
    "GZEXTENSION = \".gz\"\n",
    "XMLEXTENSION = \".xml\"\n",
    "\n",
    "os.chdir(ANODIRECTORY)\n",
    "\n",
    "def shortenFileName(fileName):\n",
    "    fileName = re.sub(GZEXTENSION,\"\",fileName)\n",
    "    fileName = re.sub(XMLEXTENSION,\"\",fileName)\n",
    "    fileName = re.sub(ANOSTRING,\"\",fileName)   \n",
    "    return(fileName)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "LASTFILENBR = 1987\n",
    "\n",
    "def getAllTactusMails():\n",
    "    allMails = {}\n",
    "    missingFiles = []\n",
    "    for patientId in list(range(1,LASTFILENBR+1)):\n",
    "        if patientId % 100 == 0: print(patientId,end=\" \")\n",
    "        fileName = tactusloaderLIB.makeFileName(str(patientId))+GZEXTENSION\n",
    "        if os.path.isfile(ANODIRECTORY+fileName):\n",
    "            mails = tactusloaderLIB.processFile(ANODIRECTORY,fileName)\n",
    "            if len(mails[0]) > 0:\n",
    "                sortedMails = OWEmailSorterLIB.filterEmails(mails[0],filter_asc=True)\n",
    "                markedMails = markduplicatesLIB.processCorpus(sortedMails)\n",
    "                strippedMails = removemarkedtextLIB.processCorpus(markedMails)\n",
    "                allMails[shortenFileName(fileName)] = strippedMails\n",
    "        else: missingFiles.append(fileName)\n",
    "    if len(missingFiles) > 0: print(\"\\nmissing files:\",missingFiles)\n",
    "    return(allMails)\n",
    "\n",
    "allMails = getAllTactusMails()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We tried to store the allMails data structure so that it did not need to be computed every time this notebook is started. But there is not enough space available on our encrypted usb stick (required: 100+Mb). Furthermore, Orange3's from_file routine (from orangecontrib.text.corpus.Corpus) does not work, which makes reading back the data complicated. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "STORAGEDIR = \"/home/erikt/projects/e-mental-health/usb/tactus/\"\n",
    "CSVEXTENSION = \".csv\"\n",
    "\n",
    "for clientId in allMails.keys():\n",
    "    fileName = STORAGEDIR+clientId+CSVEXTENSION\n",
    "    allMails[clientId].save(fileName)\n",
    "    break\n",
    "\n",
    "for fileName in os.listdir(STORAGEDIR):\n",
    "    if re.search(CSVEXTENSION,fileName):\n",
    "        clientId = re.sub(CSVEXTENSION,\"\",fileName)\n",
    "        df = pd.read_csv(STORAGEDIR+clientId+CSVEXTENSION)\n",
    "        variables = df.loc[2:,[\"date\",\"from\",\"to\"]].to_numpy()\n",
    "        variablesDomain = df.loc[0,\"date\":\"from\"]\n",
    "        metas = df.loc[2:,\"file\":].to_numpy()\n",
    "        metasDomain = df.loc[0,\"file\":]\n",
    "        break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We also need labels for the data. We will use the dropout labels provided by a student's project"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SELECTEDFILE = \"/home/erikt/projects/e-mental-health/usb/releases/20200305/selected.csv.gz\"\n",
    "DROPOUT = \"dropout\"\n",
    "FILE = \"file\"\n",
    "NBROFCLIENTS = 791\n",
    "\n",
    "dropouts = {}\n",
    "inFile = gzip.open(SELECTEDFILE,\"rt\",encoding=\"utf-8\")\n",
    "csvreader = csv.DictReader(inFile)\n",
    "for row in csvreader:\n",
    "    row[FILE] = re.sub(\"(-an)?.xml(.gz)?$\",\"\",row[FILE])\n",
    "    if row[DROPOUT] == \"1\" or row[DROPOUT] == \"2\": \n",
    "        dropouts[row[FILE]] = row[DROPOUT]\n",
    "inFile.close()\n",
    "\n",
    "len(dropouts) == NBROFCLIENTS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fasttext operates on files so we should store our data in a file to enable fasttext to access it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "CLIENT = \"CLIENT\"\n",
    "COUNSELOR = \"COUNSELOR\"\n",
    "FROMFIELD = \"from\"\n",
    "SPACE = \" \"\n",
    "LABELPREFIX = \"__label__\"\n",
    "OUTFILENAME = \"fasttext.txt\"\n",
    "NBROFCLIENTMAILS = 4\n",
    "\n",
    "def getFieldId(corpus,fieldName):\n",
    "    fieldId = -1\n",
    "    for i in range(0,len(corpus.domain.metas)):\n",
    "        if str(corpus.domain.metas[i]) == fieldName:\n",
    "            fieldId = i\n",
    "    return(fieldId)\n",
    "\n",
    "def getLastCounselorMailId(allMails,clientId):\n",
    "    for i in range(-1,-len(allMails[clientId])-1,-1):\n",
    "        if allMails[clientId][i][FROMFIELD] == COUNSELOR: return(i)\n",
    "    sys.exit(\"getLastCounselortMailId: client \"+clientId+\" did not receive any emails!\")\n",
    "\n",
    "def selectLastCounselorMail(allMails):\n",
    "    selectedData = []\n",
    "    selectedLabels = []\n",
    "    if len(allMails) > 0:\n",
    "        firstClient = list(allMails.keys())[0]\n",
    "        subjectId = getFieldId(allMails[firstClient],\"subject\")\n",
    "        textId = getFieldId(allMails[firstClient],\"text\")\n",
    "        for clientId in allMails:\n",
    "            if clientId in dropouts:\n",
    "                lastCounselorMailId = getLastCounselorMailId(allMails,clientId)\n",
    "                subject = allMails[clientId][lastCounselorMailId].metas[subjectId]\n",
    "                mailText = allMails[clientId][lastCounselorMailId].metas[textId]\n",
    "                selectedLabels.append(dropouts[clientId])\n",
    "                selectedData.append(subject+\" \"+mailText)\n",
    "    return(selectedData,selectedLabels)\n",
    "\n",
    "def getNextClientMailId(allMails,clientId,startClientMailId):\n",
    "    for i in range(startClientMailId+1,len(allMails[clientId])):\n",
    "        if allMails[clientId][i][FROMFIELD] == CLIENT: return(i)\n",
    "    return(-1)\n",
    "\n",
    "def selectFirstClientMails(allMails,nbrOfMails):\n",
    "    selectedData = []\n",
    "    selectedLabels = []\n",
    "    if len(allMails) > 0:\n",
    "        firstClient = list(allMails.keys())[0]\n",
    "        subjectId = getFieldId(allMails[firstClient],\"subject\")\n",
    "        textId = getFieldId(allMails[firstClient],\"text\")\n",
    "        for clientId in allMails:\n",
    "            if clientId in dropouts:\n",
    "                startClientMailId = -1\n",
    "                data = \"\"\n",
    "                clientMailId = 0\n",
    "                clientMailCounter = 0\n",
    "                while clientMailId >= 0 and clientMailCounter < nbrOfMails:\n",
    "                    clientMailId = getNextClientMailId(allMails,clientId,startClientMailId)\n",
    "                    if clientMailId >= 0:\n",
    "                        subject = allMails[clientId][clientMailId].metas[subjectId]\n",
    "                        mailText = allMails[clientId][clientMailId].metas[textId]\n",
    "                        data += subject+SPACE+mailText+SPACE\n",
    "                        clientMailCounter += 1\n",
    "                        startClientMailId = clientMailId\n",
    "                if len(data) > 0:\n",
    "                    selectedLabels.append(dropouts[clientId])\n",
    "                    selectedData.append(data)\n",
    "    return(selectedData,selectedLabels)\n",
    "\n",
    "def storeTactusDataInFastTextFile(X,y,outFileName):\n",
    "    if len(X) != len(y): \n",
    "        sys.exit(\"storeTactusDataInFastTextFile(): incompatable lengths of X and y\")\n",
    "    outFile = open(outFileName,\"w\")\n",
    "    for i in range(0,len(X)):\n",
    "        print(LABELPREFIX+str(y[i]),X[i],file=outFile)\n",
    "    outFile.close()\n",
    "    return()\n",
    "\n",
    "def shuffleXy(X,y):\n",
    "    if len(X) != len(y): sys.exit(\"shuffleXy(): incompatable lengths of X and y\")\n",
    "    shuffledX = []\n",
    "    shuffledY = []\n",
    "    while len(X) > 0:\n",
    "        r = random.randint(0,len(X)-1)\n",
    "        shuffledX.append(X[r])\n",
    "        shuffledY.append(y[r])\n",
    "        X[r] = X[0]\n",
    "        y[r] = y[0]\n",
    "        X.pop(0)\n",
    "        y.pop(0)\n",
    "    return(shuffledX,shuffledY)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can start a fasttext experiment using this data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "NBROFFOLDS = 5\n",
    "\n",
    "selectedData,selectedLabels = selectFirstClientMails(allMails,NBROFCLIENTMAILS)\n",
    "X,y = shuffleXy(selectedData,selectedLabels)\n",
    "foldBoundaries = [round((f)*len(X)/NBROFFOLDS) for f in range(0,NBROFFOLDS+1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DIM = 300\n",
    "EPOCHSTART = 30\n",
    "EPOCHEND = 31\n",
    "EPOCHSTEP = 5\n",
    "TRAINFILE = \"fasttext-train.txt\"\n",
    "TESTFILE = \"fasttext-test.txt\"\n",
    "WIKIFILENAME = \"wiki.nl.vec\"\n",
    "CCFILENAME = \"cc.nl.300.vec\"\n",
    "WIKIDIR = \"/home/erikt/projects/newsgac/fasttext-runs/\"\n",
    "\n",
    "def fasttextPredict(X,y,foldBoundaries,epochStart,epochEnd,epochStep):\n",
    "    predictions = {str(epoch):[] for epoch in range(epochStart,epochEnd,epochStep)}\n",
    "    for f in range(0,len(foldBoundaries)-1):\n",
    "        startTest = foldBoundaries[f]\n",
    "        endTest = foldBoundaries[f+1]\n",
    "        storeTactusDataInFastTextFile(X[startTest:endTest],y[startTest:endTest],TESTFILE)\n",
    "        storeTactusDataInFastTextFile(X[:startTest]+X[endTest:],y[:startTest]+y[endTest:],TRAINFILE)\n",
    "        for epoch in range(epochStart,epochEnd,epochStep):\n",
    "            model = fasttext.train_supervised(ANODIRECTORY+TRAINFILE,dim=DIM,epoch=epoch) \n",
    "                                            # pretrainedVectors=WIKIDIR+WIKIFILENAME)\n",
    "            testFile = open(TESTFILE,\"r\")\n",
    "            for line in testFile:\n",
    "                tokens = line.strip().split()\n",
    "                if re.search(\"^\"+LABELPREFIX,tokens[0]): tokens.pop(0)\n",
    "                line = \" \".join(tokens)\n",
    "                predictions[str(epoch)].append(re.sub(LABELPREFIX,\"\",model.predict(line)[0][0]))\n",
    "            testFile.close()\n",
    "    return(predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = fasttextPredict(X,y,foldBoundaries,EPOCHSTART,EPOCHEND,EPOCHSTEP)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "uniqueCounts = np.unique(y,return_counts=True)\n",
    "totals = {list(uniqueCounts[0])[i]:list(uniqueCounts[1])[i] for i in range(0,len(uniqueCounts[0]))}\n",
    "totals['0'] = len(y)\n",
    "\n",
    "def evaluate(predictions,y):\n",
    "    for epoch in predictions:\n",
    "        counts = {'0':0,'1':0,'2':0}\n",
    "        for i in range(0,len(predictions[epoch])):\n",
    "            if predictions[epoch][i] == y[i]: \n",
    "                counts['0'] += 1\n",
    "                counts[y[i]] += 1\n",
    "        print(epoch,end = \" # \")\n",
    "        for key in counts: print(key,\":\",round(counts[key]/totals[key],3),end=\"; \",sep=\"\")\n",
    "        print()\n",
    "    pd.DataFrame([[counts['1'],totals['1']-counts['1']],[totals['2']-counts['2'],counts['2']]],columns=['1','2'],index=['1','2'])\n",
    "\n",
    "evaluate(predictions,y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When trying to predict dropout based on the final counselor mail, using a Wikipedia dictionary did not improve accuracy (83.7% vs 85.2%), at least not when training 30 epochs."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model explanation\n",
    "\n",
    "Find out which tokens contribute to which classes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "storeTactusDataInFastTextFile(X,y,TRAINFILE)\n",
    "model = fasttext.train_supervised(ANODIRECTORY+TRAINFILE,dim=DIM,epoch=EPOCHSTART)\n",
    "\n",
    "seen = {}\n",
    "predictions = {}\n",
    "for mailText in X:\n",
    "    for token in mailText.split():\n",
    "        if not token in seen:\n",
    "            seen[token] = True\n",
    "            labels,scores = model.predict(token)\n",
    "            for i in range(0,len(labels)):\n",
    "                label = list(labels)[i]\n",
    "                score = list(scores)[i]\n",
    "                if not label in predictions: predictions[label] = {}\n",
    "                predictions[label][token] = score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(predictions[\"__label__1\"]),len(predictions[\"__label__2\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted1 = {k: v for k, v in sorted(predictions[\"__label__1\"].items(), key=lambda item: item[1], reverse=True)}\n",
    "{ key:sorted1[key] for key in list(sorted1.keys())[0:20] }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted2 = {k: v for k, v in sorted(predictions[\"__label__2\"].items(), key=lambda item: item[1], reverse=True)}\n",
    "{ key:sorted2[key] for key in list(sorted2.keys())[0:20] }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are many more tokens that trigger the largest class (1/dropout) rather than the smallest (2/finisher; about 3200 vs about 80 for the final counselor mail). The tokens with the highest scores according to fastText do not seem very interesting."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Notes\n",
    "\n",
    "1. about the data: some clients did not even do do the first assignment: answer questions from the counselor. There probably is not enough material to properly analyze them. This assumption should be checked and, if found to be correct, a further selection should be made.\n",
    "2. it could be good idea to not select the first four client mails but only one: the, presumably extensive, introductory email in which the clients describe themselves"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CUTOFF = 1000\n",
    "\n",
    "counter = {'1':0,'2':0}\n",
    "for clientId in allMails:\n",
    "    if clientId in dropouts:\n",
    "        maxLength = 0\n",
    "        maxLengthPos = -1\n",
    "        clientMailCounter = 0\n",
    "        for m in range(0,len(allMails[clientId])):\n",
    "            if allMails[clientId][m][FROMFIELD] == CLIENT:\n",
    "                clientMailCounter += 1\n",
    "                nbrOfWords = len(str(allMails[clientId][m][\"text\"]).split())\n",
    "                if nbrOfWords > maxLength and clientMailCounter <= 4: \n",
    "                    maxLength = nbrOfWords\n",
    "                    maxLengthPos = clientMailCounter\n",
    "        if maxLength >= CUTOFF: \n",
    "            counter[dropouts[clientId]] += 1\n",
    "            # print(clientId,dropouts[clientId],maxLength,maxLengthPos)\n",
    "counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def countWordsInMails(clientId):\n",
    "    nbrOfWords = []\n",
    "    for m in range(0,len(allMails[clientId])):\n",
    "        if allMails[clientId][m][FROMFIELD] == CLIENT:\n",
    "            nbrOfWords.append(len(str(allMails[clientId][m][\"text\"]).split()))\n",
    "    pd.DataFrame(nbrOfWords).plot.bar()\n",
    "\n",
    "countWordsInMails(\"AdB0023\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def selectByLongestMails(allMails,nbrOfMails):\n",
    "    selectedData = []\n",
    "    selectedLabels = []\n",
    "    if len(allMails) > 0:\n",
    "        firstClient = list(allMails.keys())[0]\n",
    "        subjectId = getFieldId(allMails[firstClient],\"subject\")\n",
    "        textId = getFieldId(allMails[firstClient],\"text\")\n",
    "        for clientId in allMails:\n",
    "            if clientId in dropouts:\n",
    "                startClientMailId = -1\n",
    "                data = \"\"\n",
    "                clientMailId = 0\n",
    "                clientMailCounter = 0\n",
    "                while clientMailId >= 0 and clientMailCounter < nbrOfMails:\n",
    "                    clientMailId = getNextClientMailId(allMails,clientId,startClientMailId)\n",
    "                    if clientMailId >= 0:\n",
    "                        mailText = allMails[clientId][clientMailId].metas[textId]\n",
    "                        if len(mailText.split()) >= CUTOFF: data += mailText+SPACE\n",
    "                        clientMailCounter += 1\n",
    "                        startClientMailId = clientMailId\n",
    "                if len(data) > 0:\n",
    "                    selectedLabels.append(dropouts[clientId])\n",
    "                    selectedData.append(data)\n",
    "    return(selectedData,selectedLabels)\n",
    "\n",
    "selectedData,selectedLabels = selectByLongestMails(allMails,NBROFCLIENTMAILS)\n",
    "X,y = shuffleXy(selectedData,selectedLabels)\n",
    "foldBoundaries = [round((f)*len(X)/NBROFFOLDS) for f in range(0,NBROFFOLDS+1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = fasttextPredict(X,y,foldBoundaries,5,50,5)\n",
    "evaluate(predictions,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python-orange",
   "language": "python",
   "name": "python-orange"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}