e-mental-health/data-processing

View on GitHub
xml2csv.ipynb

Summary

Maintainability
Test Coverage
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert Tactus xml files to csv files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "import sys\n",
    "import xml.etree.ElementTree as ET\n",
    "import warnings\n",
    "\n",
    "from nltk import word_tokenize\n",
    "from IPython.display import clear_output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "CLIENT = \"client\"\n",
    "COUNSELOR = \"counselor\"\n",
    "DATADIR = \"../usb/tmp/20190917/\"\n",
    "FILEPATTERN = \"AdB.*xml.gz\"\n",
    "OUTDIR = \"../usb/releases/20201018/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "TACTUSMAIL = \"alcoholdebaas@tactus.nl\"\n",
    "EMAIL = \"Email\"\n",
    "\n",
    "def dictToString(data):\n",
    "    \"\"\"Serialize a dict of strings as space-terminated key:value pairs, ordered by key.\n",
    "\n",
    "    Every pair, including the last one, is followed by one space; an empty\n",
    "    dict gives the empty string.\n",
    "    \"\"\"\n",
    "    pairs = [name+\":\"+data[name]+\" \" for name in sorted(data)]\n",
    "    return \"\".join(pairs)\n",
    "\n",
    "def inExcludedCounselors(counselorString):\n",
    "    \"\"\"Tell whether a serialized counselor is excluded from the counselor statistics.\n",
    "\n",
    "    Excluded are strings without an e-mail field and strings matching TACTUSMAIL.\n",
    "    Only the truth value of the result is meaningful: it is True, a match\n",
    "    object or None.\n",
    "    \"\"\"\n",
    "    emailField = re.search(EMAIL+\":\",counselorString,flags=re.IGNORECASE)\n",
    "    if not emailField:\n",
    "        return True\n",
    "    return re.search(TACTUSMAIL,counselorString,flags=re.IGNORECASE)\n",
    "\n",
    "def fileNameToId(fileName):\n",
    "    \"\"\"Return the leading part of the file name, up to its first dot or hyphen.\"\"\"\n",
    "    stem,_,_ = fileName.partition(\".\")\n",
    "    clientId,_,_ = stem.partition(\"-\")\n",
    "    return clientId\n",
    "\n",
    "def readGzippedXmlFile(inFileName):\n",
    "    \"\"\"Read a gzip-compressed XML file and return the root element of its tree.\n",
    "\n",
    "    The file is opened in a with block, so the handle is closed as well when\n",
    "    reading or decompressing fails.\n",
    "    \"\"\"\n",
    "    with gzip.open(inFileName) as inFile:\n",
    "        inFileContent = inFile.read()\n",
    "    return ET.fromstring(inFileContent)\n",
    "\n",
    "def cleanupText(text):\n",
    "    \"\"\"Trim the text, then turn every newline into the two characters backslash and n, surrounded by spaces.\"\"\"\n",
    "    trimmed = text.strip()\n",
    "    return re.sub(\"\\n\",r\" \\\\n \",trimmed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def showValueFrequencies(answerDataDf):\n",
    "    \"\"\"Count how many clients share each number of rows in the data frame.\n",
    "\n",
    "    Returns a dict mapping a per-client row count to the number of clients\n",
    "    with that count, ordered from the most to the least frequent count.\n",
    "    \"\"\"\n",
    "    clients = sorted(set(answerDataDf[CLIENT]))\n",
    "    rowsPerClient = answerDataDf.groupby([CLIENT]).groups\n",
    "    rowCounts = [len(rowsPerClient[client]) for client in clients]\n",
    "    frequencies = {}\n",
    "    for rowCount in rowCounts:\n",
    "        frequencies[rowCount] = frequencies.get(rowCount,0)+1\n",
    "    ranked = sorted(frequencies,key=lambda rowCount:frequencies[rowCount],reverse=True)\n",
    "    return {rowCount:frequencies[rowCount] for rowCount in ranked}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "def squeal(text=None):\n",
    "    \"\"\"Clear the output of the current cell and print the text, if one is given.\"\"\"\n",
    "    clear_output(wait=True)\n",
    "    if text is None:\n",
    "        return\n",
    "    print(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process tag AssignedCounselor (counselors.csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "COUNSELORQUERY = \"./AssignedCounselor\"\n",
    "OUTFILENAME = \"counselors.csv.gz\"\n",
    "\n",
    "def getCounselorData():\n",
    "    \"\"\"Collect the assigned counselor of every client file in DATADIR.\n",
    "\n",
    "    The children of the AssignedCounselor element of each file matching\n",
    "    FILEPATTERN are serialized with dictToString and every distinct\n",
    "    serialization gets a numeric id; the empty serialization has id 0.\n",
    "\n",
    "    Returns the tuple (clientList,counselorDict): clientList holds one\n",
    "    (client id, counselor id) pair per file and counselorDict maps each\n",
    "    serialized counselor to its id.\n",
    "    \"\"\"\n",
    "    clientList = []\n",
    "    counselorDict = {\"\":0}\n",
    "    for inFileName in sorted(os.listdir(DATADIR)):\n",
    "        if not re.search(FILEPATTERN,inFileName):\n",
    "            continue\n",
    "        root = readGzippedXmlFile(DATADIR+inFileName)\n",
    "        # start empty for every file: a file without AssignedCounselor element\n",
    "        # must not inherit the counselor of the previously processed file\n",
    "        counselorData = {}\n",
    "        for counselor in root.findall(COUNSELORQUERY):\n",
    "            # if the element occurs more than once, the last occurrence wins\n",
    "            counselorData = {}\n",
    "            for field in counselor:\n",
    "                # fields without text (or without a string tag) are skipped\n",
    "                if isinstance(field.tag,str) and field.text is not None:\n",
    "                    counselorData[field.tag.strip()] = field.text.strip()\n",
    "        counselorString = dictToString(counselorData)\n",
    "        if counselorString not in counselorDict:\n",
    "            counselorDict[counselorString] = len(counselorDict)\n",
    "        clientList.append((fileNameToId(inFileName),counselorDict[counselorString]))\n",
    "    return(clientList,counselorDict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "PLOTFILENAME = \"AssignedCounselor.png\"\n",
    "\n",
    "def clientDictToCounselorDf(clientDict):\n",
    "    \"\"\"Turn a dict from client id to counselor id into a one-column data frame indexed by client.\"\"\"\n",
    "    oneRowDf = pd.DataFrame(clientDict,index=[0])\n",
    "    counselorDf = oneRowDf.T\n",
    "    counselorDf = counselorDf.rename(columns={0:COUNSELOR})\n",
    "    counselorDf.index.name = CLIENT\n",
    "    return counselorDf\n",
    "\n",
    "def saveCounselorDf(counselorDf,outFileName=OUTFILENAME):\n",
    "    \"\"\"Write the counselor data frame as csv to the file OUTDIR+outFileName.\"\"\"\n",
    "    outPath = OUTDIR+outFileName\n",
    "    counselorDf.to_csv(outPath)\n",
    "    \n",
    "def visualizeCounselorDf(counselorDf,counselorDict):\n",
    "    \"\"\"Plot the number of clients per counselor as a bar chart, save it to PLOTFILENAME and show it.\n",
    "\n",
    "    Counselors for which inExcludedCounselors holds are left out. The title\n",
    "    reports the average and the standard deviation of the plotted counts.\n",
    "    \"\"\"\n",
    "    clientsPerCounselor = counselorDf.groupby(COUNSELOR).groups\n",
    "    idToCounselor = {counselorId:counselorString for counselorString,counselorId in counselorDict.items()}\n",
    "    counselorIds = []\n",
    "    for counselorId in sorted(clientsPerCounselor.keys()):\n",
    "        if not inExcludedCounselors(idToCounselor[counselorId]):\n",
    "            counselorIds.append(counselorId)\n",
    "    y = [len(clientsPerCounselor[counselorId]) for counselorId in counselorIds]\n",
    "    plt.figure(figsize=(10,5))\n",
    "    matplotlib.rc(\"font\",**{\"size\":12})\n",
    "    plt.bar(counselorIds,y)\n",
    "    plt.xlabel(\"counselor id\")\n",
    "    plt.title(f\"Number of clients per counselor (average: {round(np.average(y),1)}; standard deviation: {round(np.std(y))})\")\n",
    "    plt.savefig(PLOTFILENAME)\n",
    "    plt.show()\n",
    "    \n",
    "def showExcludedCounselors(counselorDf,counselorDict):\n",
    "    \"\"\"Print id, number of clients and data of every counselor excluded from the statistics.\n",
    "\n",
    "    Excluded counselors without any client in counselorDf are reported with\n",
    "    frequency 0 instead of raising a KeyError.\n",
    "    \"\"\"\n",
    "    counselorGroups = counselorDf.groupby(COUNSELOR).groups\n",
    "    for counselorString,counselorId in counselorDict.items():\n",
    "        if inExcludedCounselors(counselorString):\n",
    "            # ids that never occur in the data frame have no group\n",
    "            frequency = len(counselorGroups.get(counselorId,()))\n",
    "            print(f\"id: {counselorId}; frequency: {frequency}; data: {counselorString}\")\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of client-counselor pairs: total: 1983; one per client: 1983\n"
     ]
    }
   ],
   "source": [
    "# one (client id, counselor id) pair per file; the dict keeps one pair per client\n",
    "clientList,counselorDict = getCounselorData()\n",
    "clientDict = dict(clientList)\n",
    "counselorDf = clientDictToCounselorDf(clientDict)\n",
    "saveCounselorDf(counselorDf,outFileName=OUTFILENAME)\n",
    "print(f\"number of client-counselor pairs: total: {len(clientList)}; one per client: {len(clientDict.keys())}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlIAAAFUCAYAAADvbtLzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAgAElEQVR4nO3deZwmVX3v8c8XRtlHBUYUF0ZBUFBBwbiiKGrcN0wkIIZEJUpMXG5yJQkaIiJ4zVVvgmiIRA1LEBNBr6g3QUWjaHSMQoIggg7KpoOyw4jguX+c01BT9PJ0dQ/dT8/n/Xo9r+7n1KlTp+rUU8+vTp2qJ6UUJEmSNHsbLXQFJEmSxpWBlCRJ0kAGUpIkSQMZSEmSJA1kICVJkjSQgZQkSdJABlKzlGRlkpLkKQtdl64ky5OcnuS6Vr+VA8rYp837wPZ+Ua6r5keSg5Pctp6X8bIk5yXxWKN5k+SjSc5aj+WvTnL4LOdZL8fLJGcn+fA8lLPO8X1DkmSjJN9L8sL1Uf5YHdzah6ck+V+99Ae29H0WqGqLweuBJwJPAe4P/GQeyvxJK+s/5qGsO9heG4Yky4C/Bv6ylPLrha7PYpRk7yT/kuSyJLck+UGSI5Js0smzWZLPJPlxkrVJfprkjCS7jlD+45Oc0+a7MsnRSTae53V4ZRIfSLj+vAx4y2xmSHJbkoN7yedQj+dXzFO9ZlOfeyd5f5Lzk9yU5Kq23z98mnkObt8TMwbMSbZK8vdJft7K/1ySHSemt+PPEcBfr4+TurEKpJq1wB8n2WGhKzLfktxjDrM/DDi/lPJfpZSrSim3z7U+pZTbW1m/mmtZ4yTVXNpigzHDdnopsCnw6bupOlNaxG36ZOAS4ABgV+CtwKHA+zt5CvBvwG8DuwDPB5YBX0iy6VQFJ3lQm+/7wJ7Uk60/AI6a97UYQ0nuudB1GEUp5RellOvnoZxb2/F8IU5q7g88BHg78FjqPrw58MUk9+lnbicJ7wK+MmL5JwL7Ai+ndiYE+Lckm3XynAFsAzxv4DpMrZQyNi/go8AXqD0kJ3fSH0g92OzT3q9s75/Sm/9i4IjO+wL8EfBx4Cbgx60h7gWcDNwA/BDYrzPPRNmvbHW5peXZv7es7Vp917RyvgY8tTN9n1bO84GvUgPE10+x3vcAjgEuB24Fvgcc0Jm+upU18Tp7mm24I/DPwC+Am4HzgBf06vTAqbbjLNbrWdQPwc2tvs/tbffua3WnHf8FuLptjx8CfzrNukws64XAN9s8/w08o5dvp1butcA1wL8Cj+pMPxi4DXg68J22jZ87xTKXAX9J/fL7ZWuTv+1Mvz9walvWLcDZwF6T1PmBvXJvAw7ubfffBj7TtuEPJ6Z35nkNcEFb71+07f3AzvQ927re2Nrrk8AO/fXulfk84Ntt3X4GHAds0fsMnkX93KwGfg1sNsW2OgM4vpf2kFaPK9p6/RdwUGf6a4HrgE17872V+vncaC5tOtPy27ybAce3elzTtsHRwMW9fPsD323bfzXw3u62msNx7i3Az2fIs3vbR3afJs+7gMsmtllL+0PqsW5W9ZxqX+vsz93XR9s8z6Lu/79o2/LLwG/0yi3UwPFE6vHkMuDPenm25s5j9E+BdwIfA87q5Bl1WX8MnNLyfLyzLc+h7vM/oH7uVgOHz7BNfpv6nbK2zf8i7nq8nHI/BZa3ffCAXrnbt333me392cCHR11X7vp9UKY69gBPaG15S6vfKcB9O9OPaOv4YuDC1gZnAw+bh/18m1afF/bSN6cex/enHW9mKGfnVs6zO2n3ae3ZP2Z+DDh1rnW/Sx3mu8D1+eLOg/je1AP4Xi19LoHUVcDvth3+uLZDfY56IN4J+Nu282zTK/sK4EDqGeI7gduBx7Q8m1GDh38B9mrl/EVr2Ef0duoLqYHAQ+h9uXbq+R7g58BvtZ3mz9v679umr6AeaL4C3A/Yeopy
7kc9EJ1Fjdp3bB+Q5032Qetvx1mu17nAc6g9ZR8Brgfu0/I8puV5WavTipb+6Va3Pdqynw78zjT7w8SyfgC8AHgEcEJrr/u3PNu1Nv4g8KjWXn/btufEcg9u2/ObbZkPnZg2yTI/Rg0wDmrb7wnAm9u0UIP877bt+6jWLtcA2062jTvlThZI/ZB6sN6J+qV4G7Bzy7Nne/8qYIe2rNd02m5XagD1V8DD2/RPABfRghR6gRTw6Fbm+9o8z6UGLyf2PoPXA6dTv4AeBWw8xba6BnhNL+1RwBvavDtSA7LbgKe36feifgZf0ZvvfOBdc23TmZbf5v0b6ufkRa3so6lfWBd38hzc1u+gVvZTqSclJ/byFGDlLI9z7wB+PM30rdr6XgZsPk2+LwP/0Evbkbt+2Rc6x8VJyplyXwPuSQ3OCvWzfD/gXm2+l3JnL9puwIepX/7b9Jb9U2oAvWOnrH07eU6nHruf0co5qe2D3UBq1GX9vLX/jtRj02bUk6HPtn3iicC3qAHOlIEU9Rh2e9s3dqEey37EusfLUfbTU4DP9cr+n9RhFRMnDWezbiA17bpS9/PbgDdOtMkUx/f7te14SqvfU6j78Fc6yzqCejz9fNsPdqeeaP17r87T7kNTbMOHtvn6J74fAU7oHG9mCqR+j3qitHEv/d+7262lHQr8dDb1HGld5rvA9fnqbtT24Tq7/T+XQOr9nfcrWlq3h+E+Le0FvbKP7JV9Du0gSj2AXgYs6+X54sTyOjv1QTOs8+bUQOXQXvrpwBcn2zbTlHUk9YM96dnoJB+0dbbjLNfrZZ3p27W035ysvTr5zp3Nh7GzrFd30pYBl060D/VA8I3efKH2KL2ps14F2HuG5e3U8r18iun7tum7dtI2Aa4E3j7ZNu7kmyyQektn+sbUM/Y/aO9fSv1yXz7NZ+XUXtom1C+Il3TWuxtInQh8szfPi6kByQ6dcq8FtpxhW927rcOkPXu9vJ8C/r7z/lTgzM77vVpZu8x3m/aXD2xB/by9upfnG6wbSK0GXtfL89S2zIkThpdST5QeMIt9+hHUL7c3TDLt3dTguFBPaHaaoayLaMFnJ22LNv9vddIunGx5nekz7WuvpPV6zFCfjajB54GdtAL8TS/fBcDRvc/cszrT70kNfqY83k2zrBN6+V7Ttul9OmmPbHmnC6ROAr7WS3sD6x4vR9lPn0P97N+vk+e/Jta/vT+bXkAwwrrecTzppO3Dusf3I6nH83t28kz0dD61sw630TmxBF5BPSZs2kmbdh+apM4bU4Ozb7Juj+mrWvtv0d5/dLp2bnn+HLhikvRP0DmOtLSJXsM59xx3X+M4RmrCW4EnJ3nRHMs5d+KfUsoa6lnGeZ20a6jR7n1783299/5r1LMDgMdRo/1rk9w48aL2pD2sN983Z6jfTtQDR/9a8Zc7yxvVnsA5pZSbZjnfhNms13cn/iml/JS6Xbebofz3A3+e5D+SvDvJU0es1x1tUUq5jbpNu22xZ6++N1CDlX6dvzXDch7b/v7rFNN3o16S+V6nPr+k9lLNtq1g3W14O7UnbGIb/hu1x+pHSU5NckiSbTvzPg54aW+9f04ds9Rf7279J9vPQu3hmnBBKeXGGeo+MTZhbTcxyeZJjmmDTn/R6vU8ak/HhI8Bz04y8Zl7FTXA+35n3Qa16QjLn/i8faNXztc7Zaxo+d/bq8PnOmVQSjm9lPLwUsrlU2+mder2MOq+dWop5dhJsryH2hPydGrbn55kq1HKnk6r42TLmzDTvjapJA9JcmKSi5NcTw0Q78W6bQ2d/by5gjv384n97pxOfW/lru066rL6x9tdqfvzNZ3y/5saOE5n126dmq/23o+yn/4b9XN9QFuPx1IDuX+casGzWNeZ7EYN9G6dSCilnEtd9+7x6or23XjHe+ox4b6d+Wbah7r135i6fjtTT7h/3dJ3ofaG7z+H76iZTByPNps21ywtm8/C7k6llIuS/B31LO25vckTg+nS
S59ssOlkA6n7aYXZDczfiBpVv3SSaTf33q+vHWZ9mM163TpJnmm3YSnlI0k+Tz1LezrwuSSnl1JeOaSynWV+gXq22Nc9WN5eSlk7SZ75dpd9sx1YJts2/W14x35YSrkxyV7UwcrPBF4H/K8k+5ZSvt3ynUgdW9f38zmtwWj77NWtvlv30t9D7eV6C3UQ9E3A/6Z+EUz41zb/AUk+QB0rcURn+lzadJTl0+o+lYm2eiPwpUmmXzbNvJNK8kjql+qnqIPC76KUcjV1u/wgyTnUdjwQ+NAUxV5JPfHp2q4zbSQj7GtT+Uyr7x9SL1XdSg02+oO8p9zPZ2HUZd2dx9sZ99NSyu1JTqaeLLy3/f1WKeWCacoddV3ny2TtAwNuVmsD/P+J2vP1tFJK97PyROrx4tvJHYfHjdp8t7X8X5uk2CuBbZNsXNa9yWo7aq9s19bUk/pfzLbu0xnnHimo4z+2Bw7ppU9Ez9tPJLSz2wfM47Kf0Hv/JGp3O8Aq6vXf60spF/des7319GLqpYZ+78zTqAPyZuPbwJOSbDHL+SbM13pNfDDvcht2KeXKUspHSimvAl4NHJhk+Qzl3dEWqbfc/wbrtsVuwGWT1HnNJGVN5z/b32dPMf18YJt0bktPvY398dzZVj9rf7fvzLcHdw36Z1TqXZVfKaW8ndrbeCXtzJa63o8GLplkva+ZosjzmXw/K23abOr2K+o693vinkq9UeS0dvb7Q+qZ6TrrRb3Z4yDqSdK9qJf7JsylTWda/sXU/fOJvfnu2MdaD+tPqJca+8u/eLYBeZLHUXv+TqPecDJdELfOrNQexql8DXhW73bv51BPer4zmzrOsK/dCnecEND+34baa3NMKeX/tV7atdy1Z38mE5/jJ3XKvie1t2c+lvU94BFJ7t0pbzfuGlhPNt+TemlP7r0fdT/9GLB7kscAv8P0vVGjruutTHJ87TkfeEL37sUku1PXfbbfLTNKsjl1HOyu1EuH/Uf0nEEdq7VH5/Vpao/+Hky9z36N2knyjM6y7k097vZ7CR8FfKfM852LYx1ItZ3xGOBNvfRbqBv3fybZPcme1J3zl/O4+FcnOSDJzkneQT3wvrdNO5k68PDMJM9OfVDb45P8WZKXzGYhpZSbqYNfj0zyW215f049q37XLOt8HLXNP5Xkya2L+AVJ+j16U5mv9bqaOi7h2UnuN3H7a5JjkzwvyY7tYPYy6hfWDTOUd1ib7xHUgZ0r2roCHEs9oHwq9Zk9K5M8JclRSfoHwmmVUi6mboPjUp+ds2OSxyV5Y8vyReqlg1Pa9p3oot+01QvqF/WlwBFJHp768L73MX0PyF0keXGSNyfZM8mDgZcAD+LOL553UcfbnJTkN1pbPz3J/0ny0CmKfQ/w2CTva3V7DnVw7MmllB/Ppn7NZ6mBWNf3gRe3Ou1KvTtu+7vMWbfbY6knS58ppXTPIOfSptMuv11S+Dvgne2zsXOSo6jbsttGf0F9DMtfJHlkkl2SvKT1kgOQ5KVJLkwy5Qlc6uXrL1B7oo4Gtmufift18uyT5PXtWPbgJE+m3nn7a+odiBP5vpDk6E7xH6R+Kf59kt1Sh0EcSR0DOnLPzAj72o/a3xclWZFkS+qYnTXAa9s2fCK1J+KWUZcLd3zmPg18oO2/u1IHV3cvac5lWadQjy8nte37BOAfRpj3fcAT2z63c5KXAv+jl2ek/bRdSvxOW+69W92nMuq6/gh4epLtM/Vl2GOpdw5+tO3DT6H2Yv97KeXfZ1j/dbT9fLKet4npWwH/jzpA/hXAryf287RHFJRSri2l/Hf3RR2PeVN7f3Mr6w1JLpwou5RyEfXz88EkT0uyB7VdL6fe7NO1D3DmbNZtJGUeB1yt7xeTDDyjfkn9mN7gZepZ5pepXbk/oH4pTzbY/JW98iYbpLeWdvcRdw4EPog6CHAtdaft38K6DfVANvHIgsupA8Qn7uzbh0kGHU+x3tM+/mCqbTNFWTu3elxHPTM9lxHv
2pvLevW3K7UL+0ctfXVL+wC1K/YW6mWLM4HdplmXiWW9iDtv2f8enYGpLd8O1ABoTctzKXWw6EPa9IPpPQZghrY4kjrY+FbqZZzuDQv9xx98mc7jD1qex7f63tK2/95MPth8ypslqD0rX2zrtJa6jx/Wy/8o6gHmmrasi6mBw9ZTrTfrPv5gTWvruzz+YMRt9VDqZfIHddIeRD2g3kTt1fgr6p2WZ08y/3fadnjxJNMGtekoy+fOxx9c39rxOOr4vf/qlfUS6tipm1ve79JuKujUoTDNXXtte5bJXr395WzqZ6K7rrv2ylpNe/RAJ+0J1LE8a6k3mhzNXe9uKkx/194o+9r7qb2thTsff/A06v69lhrA7sdox+CzuutBPeac1tpsTVuH/uMPBi2rpT+mteMvqQPB92e0xx/sz52PQfkP6slt/3g57X7ayffGNu/pkyznbNa9a2+UdX0OdRjGrRP7EjM//uBapnj8Qa8+T6G3X4+wD00se7LXwdPM91Hu+p1/BJ3PR0vbCvh77nysz+fp3YzBJMej+XqlLUAaO6lPRv8S9YMx63EpWv+SnADcUEp504yZF7EkXwSuKaXst9B1mU+td/Ji6t2Nk40/kZaEJMcBKaVMOgZxLsZ2sLmksfBnwO8n2aiMyc/EJHkU9bLi16kDeA+i3vww6iXwcfIC4B8NorSUpY4TvIza0zz/5dsjpXFlj5TWhza27cPUcVEbUZ+Rc1Qp5YwFrZikRclASpIkaaCxvmtPkiRpIRlISZIkDbQgg8233XbbsnLlyoVYtCRJ0qx8+9vfvrqUsmKyaQsSSK1cuZJVq1YtxKIlSZJmJcmlU03z0p4kSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQAZSkiRJAxlISZIkDWQgJUmSNJCBlCRJ0kAGUpIkSQMZSEmSJA20IL+1p/Gx8rAzZ8yz+pjn3w01kSRp8bFHSpIkaSADKUmSpIEMpCRJkgYykJIkSRrIQEqSJGkgAylJkqSBDKQkSZIGMpCSJEkayEBKkiRpIAMpSZKkgQykJEmSBjKQkiRJGshASpIkaaCRAqkkK5N8Nsk1Sa5KcmySZW3aHkm+neTm9neP9VtlSZKkxWHUHqnjgJ8B9wf2AJ4GHJrknsCngJOA+wAfAz7V0iVJkpa0UQOphwCnlVLWllKuAj4P7AbsAywD3l9K+WUp5W+AAM9YH5WVJElaTEYNpN4P7J9k8yQPAJ7LncHUeaWU0sl7XkuXJEla0kYNpL5CDY6uBy4DVgFnAFsC1/XyXgds1S8gySFJViVZtWbNmuE1liRJWiRmDKSSbETtffoksAWwLXU81LuBG4HlvVmWAzf0yymlHF9K2auUsteKFSvmWm9JkqQFN0qP1NbAg4Fj2zionwMfAZ4HnA88Okk6+R/d0iVJkpa0ZTNlKKVcneRHwOuT/DX1ct7vUsdCnQ3cDvxxkg8Br22zfXH9VHfDsPKwM2fMs/qY598NNZEkSdMZdYzUy4DnAGuAi4FfAW8updwKvAR4FXAt8PvAS1q6JEnSkjZjjxRAKeW71EcdTDbtO8Ce81gnSZKksTBSIKW583KdJElLj7+1J0mSNJCBlCRJ0kAGUpIkSQMZSEmSJA1kICVJkjSQgZQkSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQAZSkiRJAxlISZIkDWQgJUmSNJCBlCRJ0kAGUpIkSQMZSEmSJA1kICVJkjSQgZQkSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQAZSkiRJAxlISZIkDWQgJUmSNJCBlCRJ0kAGUpIkSQMZSEmSJA1kICVJkjSQgZQkSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQAZSkiRJAxlISZIkDWQgJUmSNJCBlCRJ0kAGUpIkSQMZSEmSJA1kICVJkjSQgZQkSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQAZSkiRJAxlISZIkDWQg
JUmSNNDIgVSS/ZNckOSmJJck2bul75vkwiQ3J/lSkh3WX3UlSZIWj2WjZEryLODdwCuAbwL3b+nbAp8EXgP8X+BI4OPAE9ZHZSWta+VhZ86YZ/Uxz78baiJJG6aRAingr4B3lFK+0d5fDpDkEOD8Uson2vsjgKuTPLyUcuF8V1aSJGkxmfHSXpKNgb2AFUkuTnJZkmOTbAbsBpw7kbeUchNwSUuXJEla0kYZI7UdcA/g5cDewB7AY4DDgS2B63r5rwO26heS5JAkq5KsWrNmzZwqLUmStBiMEkjd0v7+bSnlylLK1cB7gecBNwLLe/mXAzf0CymlHF9K2auUsteKFSvmUmdJkqRFYcZAqpRyDXAZULrJ7e/5wO4TiUm2AHZs6ZIkSUvaqIPNPwL8UZLPA78C3gx8BjgdeE+S/YAzgbcD543jQPNR7n4C74CSJEl3GvU5UkcC3wIuAi4AvgMcVUpZA+wHHAVcAzwe2H891FOSJGnRGalHqpTyK+DQ9upPOwt4+DzXS5IkadHzJ2IkSZIGMpCSJEkayEBKkiRpIAMpSZKkgQykJEmSBjKQkiRJGshASpIkaSADKUmSpIEMpCRJkgYa9bf2JGnBjPJbmP4OpqSFYI+UJEnSQAZSkiRJAxlISZIkDWQgJUmSNJCBlCRJ0kAGUpIkSQMZSEmSJA1kICVJkjSQgZQkSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQAZSkiRJAxlISZIkDWQgJUmSNJCBlCRJ0kAGUpIkSQMZSEmSJA1kICVJkjSQgZQkSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQAZSkiRJAy1b6ApIkoZZediZM+ZZfczz74aaSBsue6QkSZIGMpCSJEkayEBKkiRpIAMpSZKkgQykJEmSBjKQkiRJGshASpIkaSADKUmSpIF8IOcGZpQH+IEP8ZMkaRT2SEmSJA1kICVJkjSQl/a0JPkbZJKku4M9UpIkSQMZSEmSJA00q0AqycOSrE1yUiftgCSXJrkpyRlJtp7/akqSJC0+s+2R+gDwrYk3SXYD/g44CNgOuBk4bt5qJ0mStIiNPNg8yf7AtcA5wE4t+UDg/5ZSvtLyvA24IMlWpZQb5ruykiRJi8lIPVJJlgPvAN7Sm7QbcO7Em1LKJcCtwM6TlHFIklVJVq1Zs2Z4jSVJkhaJUS/tHQmcUEq5rJe+JXBdL+06YKt+AaWU40spe5VS9lqxYsXsaypJkrTIzHhpL8kewDOBx0wy+UZgeS9tOeBlPUmStOSNMkZqH2Al8OMkUHuhNk6yK/B5YPeJjEkeCmwCXDTfFZU2FD5MVJLGxyiB1PHAqZ33f0INrF4P3Bf4epK9gf+kjqP6pAPNJUnShmDGQKqUcjP1sQYAJLkRWFtKWQOsSfI64GRgG+As4PfWU10lSZIWlVn/1l4p5Yje+1OAU+arQpIkSePCn4iRJEkayEBKkiRpoFlf2pOmMsrdZuAdZ5KkpcMeKUmSpIEMpCRJkgYykJIkSRrIQEqSJGkgAylJkqSBvGtPC8bflJMkjTt7pCRJkgYykJIkSRrIS3tLgJfIJElaGPZISZIkDWQgJUmSNJCBlCRJ0kAGUpIkSQMZSEmSJA1kICVJkjSQgZQkSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQAZSkiRJAy3pHy32x3wlSdL6ZI+UJEnSQAZSkiRJAxlISZIkDWQgJUmSNJCBlCRJ0kAGUpIkSQMZSEmSJA1kICVJkjSQgZQkSdJABlKSJEkDGUhJkiQNtKR/a09aDPzNR0lauuyRkiRJGshASpIkaSAv7UnSLHipVlKXPVKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQAZSkiRJAxlISZIkDWQgJUmSNJCBlCRJ0kA+kFPChyxKkoaxR0qSJGmgGQOpJJskOSHJpUluSPLdJM/tTN83yYVJbk7ypSQ7rN8qS5IkLQ6j9EgtA34CPA24F3A4
cFqSlUm2BT4JvA3YGlgFfHw91VWSJGlRmXGMVCnlJuCITtJnkvwI2BPYBji/lPIJgCRHAFcneXgp5cL5r64kSdLiMesxUkm2A3YGzgd2A86dmNaCrktauiRJ0pI2q0AqyT2Ak4GPtR6nLYHretmuA7aaZN5DkqxKsmrNmjVD6ytJkrRojBxIJdkIOBG4FXhDS74RWN7Luhy4oT9/KeX4UspepZS9VqxYMbC6kiRJi8dIgVSSACcA2wH7lVJ+1SadD+zeybcFsGNLlyRJWtJG7ZH6IPAI4IWllFs66acDj0yyX5JNgbcD5znQXJIkbQhGeY7UDsAfAHsAVyW5sb0OLKWsAfYDjgKuAR4P7L8+KyxJkrRYjPL4g0uBTDP9LODh81kpSZKkceBPxEiSJA1kICVJkjSQgZQkSdJAM46RkiTdPVYeduaMeVYf8/y7oSaSRmWPlCRJ0kAGUpIkSQN5aU9jw8sei5vto/VhlP0K3Le0cOyRkiRJGshASpIkaSADKUmSpIEMpCRJkgYykJIkSRrIu/akDYh31knS/LJHSpIkaSADKUmSpIEMpCRJkgYykJIkSRrIQEqSJGkg79qTpA2Ad2xK64c9UpIkSQMZSEmSJA1kICVJkjSQgZQkSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQD6QU9KSM5uHT/qgSklzYY+UJEnSQAZSkiRJA3lpT5qlUS4FgZeDJGlDYI+UJEnSQAZSkiRJA3lpT5K0wfKuTc2VPVKSJEkDGUhJkiQN5KU9SXc773zcsHk5TUuJPVKSJEkDGUhJkiQN5KU9SdJdePnt7uXl7vFlj5QkSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUJEnSQN61J0laMrzbUHc3e6QkSZIGMpCSJEkayEt7krSe+JDFpWW2lw3X52XG2ZQ92/1woet9d9VlvtgjJUmSNNC8BFJJtk5yepKbklya5ID5KFeSJGkxm69Lex8AbgW2A/YAzkxybinl/HkqX5K0SHkJc8M2Dpff1qc590gl2QLYD3hbKeXGUspXgU8DB821bEmSpMVsPi7t7QzcVkq5qJN2LrDbPJQtSZK0aKWUMrcCkr2BT5RS7tdJey1wYClln07aIcAh7e0uwPYqxvwAAAdxSURBVPfntOA7bQtcPU9laeHZnkuHbbm02J5Li+05OzuUUlZMNmE+xkjdCCzvpS0HbugmlFKOB46fh+WtI8mqUspe812uFobtuXTYlkuL7bm02J7zZz4u7V0ELEvysE7a7oADzSVJ0pI250CqlHIT8EngHUm2SPJk4MXAiXMtW5IkaTGbrwdyHgpsBvwM+Cfg9Xfjow/m/XKhFpTtuXTYlkuL7bm02J7zZM6DzSVJkjZU/kSMJEnSQAZSkiRJA41tIOXv+42vJG9IsirJL5N8tDdt3yQXJrk5yZeS7LBA1dSIkmyS5IT2ObwhyXeTPLcz3TYdM0lOSnJlkuuTXJTkNZ1ptucYSvKwJGuTnNRJO6B9bm9KckaSrReyjuNqbAMp1v19vwOBDybxaerj4QrgncA/dBOTbEu9A/RtwNbAKuDjd3vtNFvLgJ8ATwPuBRwOnJZkpW06to4GVpZSlgMvAt6ZZE/bc6x9APjWxJv2ffl31J9z2w64GThuYao23sZysHn7fb9rgEdO/DRNkhOBy0sphy1o5TSyJO8EHlhKObi9PwQ4uJTypPZ+C+qTdx9TSrlwwSqqWUtyHvBXwDbYpmMtyS7A2cAbgXtje46dJPsDLwO+B+xUSnllkndRg+UDWp4dgQuAbUopN0xdmvrGtUfK3/dbmnajtiNwxzPKLsF2HStJtqN+Rs/HNh1bSY5LcjNwIXAl8Flsz7GTZDnwDuAtvUn9tryEepVn57uvdkvDuAZSWwLX99KuA7ZagLpo/mxJbccu23WMJLkHcDLwsdZDYZuOqVLKodR22pt6Oe+X2J7j6EjghFLKZb1023KejGsgNdLv+2ns2K5jLMlG
1F80uBV4Q0u2TcdYKeX2UspXgQcCr8f2HCtJ9gCeCbxvksm25TyZjx8tXgh3/L5fKeUHLc3f9xt/5wO/O/Gmjb/YEdt10UsS4ATqoNXnlVJ+1SbZpkvDMu5sN9tzfOwDrAR+XD+ibAlsnGRX4PPU700AkjwU2IT6/apZGMseKX/fb7wlWZZkU2Bj6od60yTLgNOBRybZr01/O3Ceg1jHwgeBRwAvLKXc0km3TcdMkvsm2T/Jlkk2TvKbwO8AX8D2HDfHUwPdPdrrQ8CZwG9SL8G/MMneLSB+B/BJB5rP3lgGUs1C/r6f5uZw4BbgMOCV7f/DSylrgP2Ao6h3ZT4e2H+hKqnRtOcI/QH1QH1Vkhvb60DbdCwV6mW8y6ht9tfAm0opn7Y9x0sp5eZSylUTL+rlvLWllDXt+/J11IDqZ9SxUYcuYHXH1lg+/kCSJGkxGOceKUmSpAVlICVJkjSQgZQkSdJABlKSJEkDGUhJkiQNZCAlSZI0kIGUpA1CkpVJSnv463yU96Ekb5tmekmy03wsS9LiNa4/ESNJC6qU8rqFroOkhWePlCTNYL56sSQtPQZSkuYkyYOSfDLJmiQ/T3JsS98oyeFJLk3ysyT/mORebdo+SS7rlbM6yTPb/0ckOa3Nc0OS85Ps1cn71iSXt2nfT7JvZ5mHJbmk1eW0JFtPUe/tk3w6yS+SXJzktZ1pRyT55yQnJbkeOHiS+T+a5J2d93+a5MokVyT5/blsU0njw0BK0mBJNgY+A1xK/ZX5BwCntskHt9fTgYdSf3n+2FkU/6JW1r2BT0/Mm2QX4A3A40opW1F/gHV1m+ePgJcATwO2p/4e3AemKP9U6u/JbQ+8HHhXkmd0pr8Y+Oe2/JOnq2iS5wB/AjwLeBjwzNFXU9I4M5CSNBe/QQ1E/rSUclMpZW0p5att2oHAe0spPyyl3Aj8GbD/LC6TfbWU8tlSyu3AicDuLf12YBNg1yT3KKWsLqVc0qa9DviLUsplpZRfAkcAL+8vM8mDgCcDb211/i7wYeBVnWxfL6WcUUr5dSnllhnq+tvAR0op/11KuaktV9IGwEBK0lw8CLi0lHLbJNO2p/ZUTbiUeoPLdiOWfVXn/5uBTZMsK6VcDLyJGqz8LMmpSbZv+XYATk9ybZJrgQuogVd/mdsDvyil3NCr3wM6738yYj0nyuvmv3SqjJKWFgMpSXPxE+DBU/QyXUENbCY8GLgN+ClwE7D5xIR2iXDFqAstpZxSSnlKK78A7+7U57mllHt3XpuWUi6fpG5bJ9mqV79uvjJqfYArqUFltyxJGwADKUlz8U1qEHFMki2SbJrkyW3aPwFvTvKQJFsC7wI+3nqvLqL2MD0/yT2Aw6mX62aUZJckz0iyCbAWuAX4dZv8IeCoJDu0vCuSvLhfRinlJ8A5wNGtzo8GXg2cNGgrwGnAwUl2TbI58JcDy5E0ZgykJA3Wxi+9ENgJ+DF18PYr2uR/oI5t+grwI2rQ80dtvuuAQ6njki6n9lCtcxffNDYBjgGupl7+uy91/BXA/6EOTP/XJDcA3wAeP0U5v0MdIH8FcDrwl6WUs0aswzpKKZ8D3g98Ebi4/ZW0AUgps+m9liRJ0gR7pCRJkgYykJIkSRrIQEqSJGkgAylJkqSBDKQkSZIGMpCSJEkayEBKkiRpIAMpSZKkgQykJEmSBvr/uvmsI2qa/jwAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 720x360 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "visualizeCounselorDf(counselorDf,counselorDict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "showExcludedCounselors(counselorDf,counselorDict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* id: 0; frequency: 763; data: \n",
    "* id: 3; frequency: 254; data: Geen Hulpverlener \n",
    "* id: 19; frequency: 1; data: Aangehouden cliënten \n",
    "* id: 45; frequency: 1; data: Niet Gestart "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process tag Intake (clients.csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "TITLE = \"title\"\n",
    "TITLECAPS = \"Title\"\n",
    "QUESTIONNUMBER = \"questionNumber\"\n",
    "ANSWER = \"answer\"\n",
    "ANSWERID = \"answerId\"\n",
    "ANSWERTEXT = \"answerText\"\n",
    "ANSWERTITLE = \"answerTitle\"\n",
    "ID = \"ID\"\n",
    "INTAKE = \"Intake\"\n",
    "QUERYINTAKE = \"./Intake\"\n",
    "QUERYQUESTION = \".//question\"\n",
    "QUERYTITLE = \"./\"+TITLE\n",
    "QUERYQUESTIONNUMBER = \"./\"+QUESTIONNUMBER\n",
    "QUERYANSWER = \"./\"+ANSWER\n",
    "QUERYANSWERTEXT = \"./\"+ANSWERTEXT\n",
    "OUTFILENAME = \"clients.csv.gz\"\n",
    "PLOTFILENAME = \"Intake.png\"\n",
    "TITLESTRING = \"Number of questions per client in Intake forms\"\n",
    "TREATMENTTITLE = \"treatmentTitle\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getAnswerDataList(querySection=QUERYINTAKE):\n",
    "    \"\"\"Extract the answers to all questions of one form section from the client files in DATADIR.\n",
    "\n",
    "    Returns a list with one dict per answer element. Each dict holds the\n",
    "    client id, the treatment title and the answer id, plus the question title,\n",
    "    question number, answer text and answer title when these are present.\n",
    "    An answer element without ID attribute raises a KeyError.\n",
    "    \"\"\"\n",
    "    def firstText(element,query):\n",
    "        # stripped text of the first element matching query;\n",
    "        # None if nothing matches or the match has no text\n",
    "        match = element.find(query)\n",
    "        if match is None or match.text is None:\n",
    "            return None\n",
    "        return match.text.strip()\n",
    "\n",
    "    questionFields = ((TITLE,QUERYTITLE),(QUESTIONNUMBER,QUERYQUESTIONNUMBER))\n",
    "    answerFields = ((ANSWERTEXT,QUERYANSWERTEXT),(ANSWERTITLE,QUERYTITLE))\n",
    "    answerDataList = []\n",
    "    for inFileName in sorted(os.listdir(DATADIR)):\n",
    "        if not re.search(FILEPATTERN,inFileName):\n",
    "            continue\n",
    "        clientId = fileNameToId(inFileName)\n",
    "        root = readGzippedXmlFile(DATADIR+inFileName)\n",
    "        for section in root.findall(querySection):\n",
    "            for question in section.findall(QUERYQUESTION):\n",
    "                # NOTE(review): the treatment title is INTAKE for every querySection - confirm this is intended\n",
    "                questionDict = {CLIENT:clientId,TREATMENTTITLE:INTAKE}\n",
    "                for key,query in questionFields:\n",
    "                    text = firstText(question,query)\n",
    "                    if text is not None:\n",
    "                        questionDict[key] = text\n",
    "                for answer in question.findall(QUERYANSWER):\n",
    "                    answerDict = dict(questionDict)\n",
    "                    answerDict[ANSWERID] = answer.attrib[ID]\n",
    "                    for key,query in answerFields:\n",
    "                        text = firstText(answer,query)\n",
    "                        if text is not None:\n",
    "                            answerDict[key] = text\n",
    "                    answerDataList.append(answerDict)\n",
    "    return(answerDataList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATAFREQUENCYTHRESHOLD = 5\n",
    "REMOVED = \"REMOVED\"\n",
    "\n",
    "def makeAnswerId(answer):\n",
    "    \"\"\"Join question number and answer id of an answer record with a hyphen; a missing part is left empty.\"\"\"\n",
    "    questionNumber = answer.get(QUESTIONNUMBER,\"\")\n",
    "    answerId = answer.get(ANSWERID,\"\")\n",
    "    return questionNumber+\"-\"+answerId\n",
    "\n",
    "def countAnswerTexts(answerDataList):\n",
    "    \"\"\"Count per answer key how often each whitespace-separated token occurs in the answer texts.\n",
    "\n",
    "    Returns a dict mapping each makeAnswerId key to a dict of token\n",
    "    frequencies. Answers without text add their key but no tokens.\n",
    "    \"\"\"\n",
    "    answerTextFreqs = {}\n",
    "    for answer in answerDataList:\n",
    "        tokenFreqs = answerTextFreqs.setdefault(makeAnswerId(answer),{})\n",
    "        # a missing answer text is expected and handled as empty text\n",
    "        for token in answer.get(ANSWERTEXT,\"\").split():\n",
    "            tokenFreqs[token] = tokenFreqs.get(token,0)+1\n",
    "    return(answerTextFreqs)\n",
    "\n",
    "def removeRareDataValues(answerDataList,answerFreqs):\n",
    "    \"\"\"Replace every answer text that contains a rare token by REMOVED.\n",
    "\n",
    "    A token is rare when its frequency in answerFreqs, looked up under the\n",
    "    makeAnswerId key of the answer, is below DATAFREQUENCYTHRESHOLD. Tokens or\n",
    "    keys missing from answerFreqs count as frequency 0, so such texts are\n",
    "    removed instead of raising a KeyError. The records are changed in place\n",
    "    and the same list is returned.\n",
    "    \"\"\"\n",
    "    for answer in answerDataList:\n",
    "        answerText = answer.get(ANSWERTEXT,\"\")\n",
    "        if answerText == \"\":\n",
    "            continue\n",
    "        tokenFreqs = answerFreqs.get(makeAnswerId(answer),{})\n",
    "        # any() stops at the first rare token\n",
    "        if any(tokenFreqs.get(token,0) < DATAFREQUENCYTHRESHOLD for token in answerText.split()):\n",
    "            answer[ANSWERTEXT] = REMOVED\n",
    "    return(answerDataList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def answerDataListToDf(answerDataList):\n",
    "    \"\"\"Build a data frame from the list of answer records.\"\"\"\n",
    "    answerDataDf = pd.DataFrame(answerDataList)\n",
    "    return answerDataDf\n",
    "\n",
    "def saveAnswerDataDf(answerDataDf,outFileName=OUTFILENAME):\n",
    "    \"\"\"Write the answer data frame as csv to the file OUTDIR+outFileName.\"\"\"\n",
    "    outPath = OUTDIR+outFileName\n",
    "    answerDataDf.to_csv(outPath,index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def visualizeAnswerDataDf(answerDataDf,titleString=TITLESTRING,plotFileName=PLOTFILENAME):\n",
    "    \"\"\"Plot the number of rows per client as a bar chart, save it to plotFileName and show it.\"\"\"\n",
    "    clients = sorted(set(answerDataDf[CLIENT]))\n",
    "    rowsPerClient = answerDataDf.groupby([CLIENT]).groups\n",
    "    rowCounts = [len(rowsPerClient[client]) for client in clients]\n",
    "    plt.figure(figsize=(16,5))\n",
    "    matplotlib.rc(\"font\",**{\"size\":20})\n",
    "    plt.bar(clients,rowCounts)\n",
    "    plt.title(titleString)\n",
    "    plt.xlabel(\"client id\")\n",
    "    # the client ids are too many to be readable, so the x ticks are hidden\n",
    "    plt.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
    "    plt.savefig(plotFileName)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "GESLACHT = \"geslacht\"\n",
    "GESLACHT0 = \"geslacht0\"\n",
    "GESLACHTT0 = \"geslachtt0\"\n",
    "NONQUESTIONS = \"^(goTo[0-9]|ltgeslacht1|doel)$\"\n",
    "EXCEPTIONANSWERID = \"mdoel\"\n",
    "ANSWERIDORIGINAL = \"answerId_original\"\n",
    "ANSWERIDNORMALIZED = \"answerId_normalized\"\n",
    "\n",
    "def normalize_answer_id(answer_id, first_answer_id):\n",
    "    \"\"\"Map a form-specific answer id to its normalized form.\n",
    "\n",
    "    first_answer_id selects the form variant: for GESLACHT the id is kept, for\n",
    "    GESLACHT0 a final 0 is dropped (a final 0h becomes 0h-ignore) and for\n",
    "    GESLACHTT0 a final t0 is dropped. Ids matching NONQUESTIONS are mapped to\n",
    "    the empty string. sys.exit is called when the variant is unknown, or when\n",
    "    an id of a non-GESLACHT variant other than EXCEPTIONANSWERID is left\n",
    "    unchanged.\n",
    "    \"\"\"\n",
    "    if first_answer_id == GESLACHT:\n",
    "        new_answer_id = answer_id\n",
    "    elif first_answer_id == GESLACHT0:\n",
    "        if re.search(\"0h$\",answer_id):\n",
    "            new_answer_id = re.sub(\"0h$\",\"0h-ignore\",answer_id)\n",
    "        else:\n",
    "            new_answer_id = re.sub(\"0$\",\"\",answer_id)\n",
    "    elif first_answer_id == GESLACHTT0:\n",
    "        new_answer_id = re.sub(\"t0$\",\"\",answer_id)\n",
    "    else:\n",
    "        sys.exit(f\"unknown first answer id: {first_answer_id}!\")\n",
    "    # the shared pattern is used here instead of a second copy of its text\n",
    "    if re.search(NONQUESTIONS,answer_id):\n",
    "        return(\"\")\n",
    "    # answer_id cannot match NONQUESTIONS past this point, so that test is not repeated\n",
    "    unchanged = first_answer_id != GESLACHT and new_answer_id == answer_id\n",
    "    if unchanged and answer_id != EXCEPTIONANSWERID:\n",
    "        sys.exit(f\"first answer id {first_answer_id} did not change {answer_id}!\")\n",
    "    return(new_answer_id)\n",
    "\n",
    "def normalize_all_answer_ids(df):\n",
    "    \"\"\"Add normalized answer ids to the data frame, which is changed in place and returned.\n",
    "\n",
    "    The first answer id of every run of consecutive rows of one client selects\n",
    "    the normalization used for that run. The original ids move to the column\n",
    "    ANSWERIDORIGINAL and the normalized ids are stored in ANSWERIDNORMALIZED.\n",
    "    \"\"\"\n",
    "    normalized_ids = []\n",
    "    previous_client_id = \"\"\n",
    "    reference_answer_id = \"\"\n",
    "    for client_id,answer_id in zip(df[CLIENT],df[ANSWERID]):\n",
    "        if client_id != previous_client_id:\n",
    "            # a new client starts: its first answer id identifies the form variant\n",
    "            previous_client_id = client_id\n",
    "            reference_answer_id = answer_id\n",
    "        normalized_ids.append(normalize_answer_id(answer_id,reference_answer_id))\n",
    "    df.rename(columns={ANSWERID:ANSWERIDORIGINAL},inplace=True)\n",
    "    df[ANSWERIDNORMALIZED] = normalized_ids\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_text_from_alcohol_intake(answer_data_list):\n",
    "    \"\"\"Reduce the answer texts of the alcohol intake answers to their value.\n",
    "\n",
    "    For answers whose id starts with dag plus a digit, or with week, the text\n",
    "    up to the last colon of the first line (and the spaces after it) and a\n",
    "    leading plus-minus sign are removed. The rows are updated in place and a\n",
    "    list with a copy of every row is returned. Rows without answer id or\n",
    "    without answer text are copied unchanged instead of raising a KeyError.\n",
    "    \"\"\"\n",
    "    new_answer_data_list = []\n",
    "    for row in answer_data_list:\n",
    "        answer_id = row.get(\"answerId\",\"\")\n",
    "        if \"answerText\" in row and re.search(\"^(dag[0-9]|week)\",answer_id):\n",
    "            answer_text = re.sub(\"^.*: *\",\"\",row[\"answerText\"])\n",
    "            row[\"answerText\"] = re.sub(\"^± *\",\"\",answer_text)\n",
    "        new_answer_data_list.append(dict(row))\n",
    "    return(new_answer_data_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "answerDataList = getAnswerDataList()\n",
    "# the cleaned list has to replace the extracted one before the tokens are counted\n",
    "answerDataList = remove_text_from_alcohol_intake(answerDataList)\n",
    "answerTextFreqs = countAnswerTexts(answerDataList)\n",
    "answerDataList = removeRareDataValues(answerDataList,answerTextFreqs)\n",
    "answerDataDf = answerDataListToDf(answerDataList)\n",
    "answerDataDfNormalized = normalize_all_answer_ids(answerDataDf)\n",
    "saveAnswerDataDf(answerDataDfNormalized,outFileName=OUTFILENAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA7gAAAFOCAYAAACsbbF2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAgAElEQVR4nO3debgkZXnw/+8NwyIYFnEQl+gICuZ1jQ4KuDDIG7eooEAki4JGcENFlp8EJE5UjInEDVQElDESXzCoKIJoIgyguDAQJcQo66goy8AsLMOw3r8/6mlO01T3qe5zzpxzar6f6+qruqueraqequ67a4vMRJIkSZKk2W696W6AJEmSJEmTwQBXkiRJktQKBriSJEmSpFYwwJUkSZIktYIBriRJkiSpFQxwJUmSJEmtYIAraVJFxMKIyIhYNN1tmQ4RsWNEnBURt0TEA2VZLJzuds02ZbllRMyb7rZockTEon7bQxvXd0TsX+Zp8XS3ZTYY1D+mU0T8ZUT8OCJu7+qnC6a7XZL6M8CV1rKuL/GMiEvHSXvquhwszjYR8VRgMfBqYEvgFuAm4I5pbNaMEhELyp8ge053W6R+ZnM/jYilUxEoloB9YUQ8ZzLLncki4q+BrwI7ARtR7c9vAu6ZznZJGswAV5pez42I1093IzRpDgQ2AS4CtsrMrTNzm8w8dprbNZMsAD4IjBc4/Lq87p3qBmlGmGnrewHN+ukgq6jm6beT0aAZYH+qZbLOBLjAwWX4SWCTsj/fJjMvns5GSRpsznQ3QBIfiogzM/OB6W6IJuzpZfi1zFw5rS2Z5TLzadPdBq09bVzfmflN4JvT3Q5NSGef/qXMvG9aWyKpMY/gStPnAmA11RfoX01zWzQ5HlGGnpIsSbOf+3RpFjLAlabPjcDx5f3CiBjqjIrxbsoSEfM6aWqmLS7T9o+IzSLinyPimoi4KyKujYgPRcTGXel3j4jvlRsn3RkRF0bEixu0cb2IeF9E/KLkuzUivh0Rz2+Q740R8R8RsSwi7omIP0TE6RHxgj55Hry5Vcl/UET8LCJWlvGNT6sr+f82Ii6IiOURsSYirouIEyPiKTXpl5blvKCMOqVr/SxtWm8p63Glnt+Xeq+NiE9ExBb9blozaF13pVkwXnsi4hkR8aUyr2vKsvtRRLw9Ijbok2friPh4RFxR1vGaiPhdRFxc+tGTuttIdYojwH5dy+hhfblB/94uIr5Qls+aiFhR+uVbI2L9Pnm6+/0jSp/5den3N0fEaVFdR91v+ewREedExE0RcW/pG7+OiP8XEW/ol69PWb39ddZtJ6W8nSLiX8s2sCaqfcRlEfGPEbHDEOWMt74fGRFHRsQlEbGq1HVVRHwmIv64T56h1/ew/XSceep7k6kYu052QUQ8Kqpt/LqIuDuqbf+kiHhsk3qG0T0PEfHEUs/1pd7rIuLYiNisbj6AXcuoU3qWx9Ke9C+JiE9HxE9Lf7ynLO9zI2LvEdu9XkR8rtS3IiJ2rkkz9P6rT111+9PruuZ3UU/6jSLikDK/q0r/+nVZp9v0qeMhfSMi/jqq75tby/g9y/ju7TUi4l0R8V8RcUdE3BARX46IJ3SV+9Qy7vqyDK6IiAMGzOuTI+LzEXFlaffqiPhN2Xb+LiIe3XS5STNKZvry5WstvoBFQAKnAVtRXaeVwAE1aU8t0xbVTMvymtennnmdNDXTFpdp7wN+Vd7fQXXjjE653y5p3wk8ANzf1dYE7gZeWFP2wjL9y8A3yvt7gZVdee8D3tCn3X8E/EdX2gd66r0fOGices/sqmdFef+chutnE+B7XfXd09P2u4A9evJcQvWHRWf5rSqfbwQuGaJv/Alwc1ddd1Ad5U/gKuCQ8n5x03XdlWZBSbO0z/SDyrLt1H17WX6dz+dTXYPWnedJwB961uvyss46495e0v5xWR53dC3HG3tef9ykf1PdxOuurjQreWjf
/Q9g0wH9/j3AZeX9mq5lnMCtwHY1eY/pSpPAbT1tuHHI/UB3f52N20kA/9SzTFaV5dL5vKgnz6IyfuEw+zOq7WJpV5p7u/pRlj5Xty8aen0zZD8dZxntT832WqZ15udvut7fWdrXadt1wJbD9Kuesgct5z3KvHf68r1d0y4BNujK8wb6798eso8DHtnTJ27r6ZcJfKFPu2v7B9XldP9Wpt0EPHsy9l8Dll+nD9zYlX9Z17hPd6Wd29W3Ov2rextYDuw0qG8An2Fsm11ehnv2bK+LqH4zJNV3b3f/v7a0YyfGtuOVPHQ/fHhNG57b09Z7uvJ3Xq8Ytv/58jUTXtPeAF++1rVX15f4aeVz5wvst8BGPWmnOsBdSRXgvqiM3xB4K2M/do4uX3ofBbYoaZ4EXFym/6ym7IVdZd9HFUQ/okzbDvh+mb6a+kDim2X6pcDLgI3L+C2Bo0p77qfnB21XvbeXHxnvoPygAbYGNmu4fk5g7IfK2zrrBNie6kdS54fo9gOW6/4j9IsNgP8p+a8BXlLGrwe8hirw7QQ/i5uu6640C+gT4FLdSKfzY/Rw4NFd/eHlwJXU/DAFvsRY8P1iYL0yfiPgGcCHKT/UatbTw/p0k/5d+lDnx91iYIeuOg9kLEA4ecD6WUEVPLwcWL8s4xcDvyvTv1azfDs/nj/aWT5l2lxgL+CLQ67vznKYrdvJ4V3r6LPAk7qmPZZq2zmqJ88ihgxwgc3Lukrga8CzgPXLtG0ZC3xupOyjJrq+h+mn4yyj/anZXsu0pV1t+y9g5zJ+DvBaxgKNfx6h3k7Zg5bzCuAHwDO6tp+3MLb9vHPA9tN3/0b1B+G/U+1THtU1fgvgXaXfJbBPTd6H9Q9gY+DbjH1H1u13R9p/NVyW433PfpexQHafrr45H7i8q28+uidfp2/cThWI/j1j37GbAVvX7CduB/66zFeUPnxDmX5CWe9nAdt2lfN5xv6o2aqnDeeVaT8B/rRnHc6nurHWzqP2f1++pvM17Q3w5Wtde/HwAHczxv5Jf29P2qkOcO8FnlIz/Ytd5X+pZvqTGPt3+Ik90xZ25T2qJu/GjB01Prln2v8t438FbN5nvo4oab4zoN4DR1w38xgLZN5WM30T4Ooy/V8HLNf9R6j7jYz9O79DzfQXd83f4qbruivNAmoCXKof/EvLtJf3ybsdVVB/L/DYrvG/LPlqjzL2Kauznh7Wp5v0766+eTU1R2Sogtws/fMpPdM662d1n36/F2N/bmzYNf4vyvj/HaVfjbMcZuN28ujSHxL46BD5FjF8gPuRMv6rA8rtBBmHTcb6HqafjjO/+9dtr2VaZ5u7kZ7Ao0w/tEy/doR6O2UPWs5X0POHapl+XJl+Xs20zvLcfwLLpLOfO3+8/kF1lsL5ZdyV9HzXlDQj778atrfv9ywP3Sc/rG7gMVSBbwIf6tM3Bm5DPdvrfgOWZ1LdsXtOz/T1qP6ATOBNPdM6ZzK8YNT16cvXTH15Da40zTLzNuCfy8e/i4hN12L1/56ZV9eM/8+u9//YOzEzf0MVYEB1pK7OauBTNXnXAP9SPu4VEdE1eb8yPCkzV/Up99/KcLeov9byVqoji6N4HdUPghuBk3snZuZqxtbV6/vUP6rOtWnfyMxf19R9EXDhJNbXsYDqD4srMvN7dQky8xqqf/nnMHadMVRHTKA6YjflSl/Zq3z8ZFkfvU4Gfk91hKPf9X5n9On3nSNFGwHd11p35nPziNhk6IYPNhu3k72p/uxZQXWUfip15vVfBqT5ahn+WZ/pw67vtenEzLy1ZvyZZfjkKfpO+ERm3j2g3n779Yk6qwx3GrT/jIitqI4wL6A6EvrizKx73NICRt9/TVRn/7Kkru7MvInqyCpUf5LVuR/4RIO6rge+UjO++7v62Oy503NWT2c4v3zsXadrdf8trU0GuNLMcBzVtUWPobpebG357z7jby7DNYwFsr1uKsMt+0xfkpl39pl2QRluATy5a/wuZfiB
iLix7kV1fRhUP7C36lPvqI9zeG4ZXpSZ9/dJc14Zbgo0vonOEHVfMCDNoGmj6izzp/Zb5mW5d9J139DnnDL8p4j4bETsFhGPYOpsS3XKKoz9aHuI8oNucfn43Lo0jPWh3rz3Mtb3u/v1T6mOxDwW+HFEHBgRT+7NP6LZuJ3sVIbnZ+ZdI5YxrqhuHtW5gc45A+b10yVN7c2mGH59r021baP6k6Zji2mod+TlERFzorpJ37nlRkh3d920aUVJtvGAOh5H1fd3pApMF5Rgsc5E9l8T1dm/1O6Lis73xfZ9/qi4OjNvaVDXL7P+UYI3d72/ok/eft/Vnf33v0bEx6K6YVzjm3FJM5nPwZVmgMxcHREfpfqhdnhEfG7AkZnJdEOf8Z3g7qbMzHHS9PtC/H2f8b3T5lLdJAPG/klu+oOu7mjasoZ568wtw0Ftv74m/WTolPWHAWkGtWtUnWW+EdUfLOPpXub/BDyP6prBd5bXfRFxCdU1oifl5D4PuHt5N1lH/dbP7QPyrinDB/t1Zq6IiDdSXTLwLOALAOWH8/epTuMf9c+H2biddPpJ3RG1ydR9ZGnrBun7HV0fan2vZbVty8w1XQftp6Jt/ZZJZ3mM9PswIh5JdZO+XbpG30XV3zoBWqf/bArUBXedu/6uoLrJ0aDvwonsvyZqmO+LYOzU/m5Nt8Pa7+rMvL+rn4z3fd7bjw6n+pN2F+D95bUmIn5MdR31oqn8A0uaSh7BlWaOL1Dd9GRLquuv1kWdfdLrMjMavJbWlNHvyOswNh4/SWt0lvm3Gi7zhZ2MmXl3Zu4B7Ex16vZPqE757Hy+MiKePUXtXqvrKDPPoTqKeiDVzY7+AGwDvAlYHBEnrsXmzJTtZKp1/0bZssF8zpuuhupBR1MFTLdQnV7+mMzcJDO3zsxtgMd3pY26AqiOLN5O9V34uYgY9Ft15P3XJJrIvmjatsNyavyLqE7t/wzVzc42BHYDPgdc0f0IImk2McCVZohyPVTneraDGzx/rvPF2O/LdfM+49eWxzWc1v0PdudUqidOfnMa6bRlUP3dX/gTOQrWr+6my63bg6eaRtfzi3v06w8TXuaZ+ZPMfH9m7kz1o/QvqY7uzaXmWuYJ6F7eTdbRZK4fMnNVZp6UmW/IzMcDTwdOKpMPiIg/H6HY2biddOp/0lqqB6ZvXjWcfcrw3Zn5r5l5c8/0JkdZLwFeRXW086+AL/Zcg95tOreFYb4vkvqj1dMqK/+Zme/NzOdSHWV+G9UlGdtS3UlZmnUMcKWZ5RSqR8T8EdVdUAfpnPrZ7x/WHSerUSOaP+CGPLuW4Uqqx3d0/LgMXzllrRrssjJ8wYC2v7QM76S6a+Vk1/2SAWl27TO++zTgYftDZ5k/KyIe3ydNY5l5Z2aeRnWkE+B5PdeedU5T7PeDdZBrGZvX3eoSlKM9C8rHy+rSTJbM/GVmHkh15Br6r59BZuN20pnfBVN5zXVmXsdYALO253Ui/bStmiyTzv7nv/pM/79NKsrMH1I9Hu0uqjsOn9AnyJ3U/deQOvuXXQcE4J3viysHXGs/Y2Tmisw8ETiyjBplnyZNOwNcaQYpN31ZWD6+k8F3N+zcIGqP3gkRsRFw8KQ2bnibAu/tHVnadkj5eEbPNb6LyvDlEfGKQYVHxFTcFOYbVD/itmIsQOuucxOq65agutvxZJ5e9u9l+PqIeGpN3bvQJ/jNzDuoHpUB9f1hK6rnG9f5AdWp8esDHx/UwN5lHhEbDkjeuXYrqE576+jcuXPoG+eUvvKN8vG9fQLDt1KdBpmMLdMJGWc+YWxeNxqh+Nm4nZxBNc9bUj2/cyotKsPDBgUwUZnMmzGN3E9brMky6Vwv+8zeCeX63KOaVpaZ51M94/Zuqv3xp2uSjbz/mgRnlOHTqd/vPgZ4e/n4tUmue0IiYr2IGHSd9UT2adK0M8CVZp6vUj1f9BGM/ftbp/OFeUBEvLn8ICYink51DdOgUx/XhlXAhyPivZ2jPBGx
LfAt4E+obmbyse4MmXkuVQATwDcj4vCIePBGQRHxqIjYMyK+TbNHKwwlq8cfda6l/Fi5W25nuW4PnE31OJHVVM/nnEynU633jajuGPuiUu965dTXbzD2A7NOpz98ICJe2/nxEhE7UT1KojZIK3eSPYgqIPzLiDgzIp7TmR4RG0TE/Ij4Zx56FBGqa7Q+GhE7doLAEmg8n+rO4ACXZOaKrjz/U4YvqgvkG/go1dHzxwFnR8QOpd6NIuIAqmvJAL5YHg8yGd4REd+LiL+KiAf/dIqILSLiSMaOGNc+pmQcs3E7uQX4h/LxiIg4PiIePE0zIh4bEYdExGQEvx+jOnL/aODiiPiL7qPGEfHEiDiQ6mjanpNQX8dE+2kbdZbJ6yOi3yUP/1GGn4iIB49sRsSOVMFo3R29+8rM71M9Guwe4N0RcWzP9InsvyYkq0e3nVs+fiki9o7y6KOIeB7VDei2pDoLoS44n06bAVdHxFER8cyudq8XEbsDx5R0o+zTpOmXM+BhvL58rUsvxh5mf9qANHsx9vD2pLqbYW+aDRi7qU9SPcR+VXl/K9U/ykk58NWTd3GZtn+f+heU6UsHtLG2DMYeTP9lqh/hSfXjZEVXW+8D9u1T7qZUd+DtpH2g5L2tZ5mc0qfehy2rIdfPJlQ/TLJP29cAewyzTIao+/9QPfahU9ftVMF0AldRHdFLYHFN3i2pTm/vbucd5f1vgL8ZtE6BN1MdKenkX1360X3dy70nz8qedXprWV6dccuAZ9X026u71u3NVEeflwJP6ErXKWNeTVs7py520qzoqfc/gU1HWT+lHUn1aJLOuIN7+t4dPX0igS8Mua47/XW2bidBdX1ed10rGdsHPawOxvZ9C2vKG7S+n0L150/3crmFsW2j89pvMtb3MP10nGW0P/2319p6my6TcertlD3Uci7T53XS1Ex7GmP7iHup7h68FPhhV5ptqbb7Tj13MbYfWg28rF8bxukfe5Y6EzimZvrQ+6+Gy3K85TWX6nTs7vnt3gaXAzsP0zeG3V4btPFhZVAdhe/edu6pWV7XNO3rvnzNtJdHcKWZ6RuMc/1gVv9c/xnVaVlLqX6E3Un1I+F5wC+mtIXjS6objhwC/C/VEcQVwHeAXbK6TvPhmaprOF8HvJpqOfyBKujs/OD8GtWPmXdPSaMzV1Nd7/dW4CKqH0qbUAWJJwPPzMxvTVHdvwSeU+q5gWqeb6QKJHak+rHUL+8KqruXnki1zNaj+sFyHNXzGq/vl7fkP4XqkRGfojpScz/Vv/y3UgUKH+Thz/3dA/hH4EelzkdS/VC6nOrI29Mz8/Keeu4Fdge+QvUDeUuqmxU9iYaPJsnMs6hOgTyJqu9vQrWefkh1KuPLc3Kvd/sq1aNLTqfqy/dSzesNwLeB12bm20Yse7ZuJ5mZ76M6bf50qnX5CKog4zKqI+3H9C9hqLquBv6U6rKN86mWz+ZUP8Yvp+rzf071GKdJMRn9tG0y81dU3znnUv2RsQ3V8nhCV5prgedTrYubqU4dXgn8G7BjVkdkR6n7TKqb190PHBkRH+yZPsr+a8IycxnVXeMPA5ZQ7Rs2pPpD8lNU+8Af9y9h2txGte/4FPAzqj8l/ojqN8QlVKeSPyczB35vSDNVZOZ0t0GS1EBE7E91I7ILMnPB9LZGExERC6l+dH85M/ef3tZIktQeHsGVJEmSJLWCAa4kSZIkqRUMcCVJkiRJrWCAK0mSJElqBW8yJUmSJElqhVbe6v7Rj350zps3b7qbIUmSJEmaApdeeuktmTm3d3wrA9x58+axZMmS6W6GJEmSJGkKRMRv6sZ7Da4kSZIkqRUMcCVJkiRJrWCAK0mSJElqBQNcSZIkSVIrGOBKkiRJklrBAFeSJEmS1AoGuJIkSZKkVjDAlSRJkiS1ggGuJEmSJKkVDHAlSZIkSa1ggCtJkiRJagUD3Gk074izh07T/bku/7wjzn7w1Z2mN1+/8f3KqEs/qD3defrNV7+y
6saPN6331aSNdZ+btL1JumGWb7929NZRtz761Tte/n7ljbd8mpTXdFq/cgcZb1027QOD2to7brz2DkrT265B8zHettt0G+jXpvG2xybjeuscND918zeozYPmrUmbm+5zhtkOx1uv/cptkn9QmvE0WZfjbY/DLvvxtoUm28CgvOPtM4bdl/VrS93nQXUOmpdBfWrYfcSg+sfrI033U4PKGrQee/OPty7G2wcM2/+a9LtBZQ+a30F1DVqfg9rQb3kMSt9vOQwqa7z56ze/TeppUsd4+Zr0vyb7j7rP/fpbk8/92j7R7XGYft9kextvHzRsP6oru0lZs5UBriRJkiSpFQxwJUmSJEmt0CjAjYitIuKtEfHNiLg6Iu6KiFUR8cOI+NuIWK8n/byIyAGv0wbUtV9E/Cwi7ih1LI6IV090RiVJkiRJ7TanYbp9gM8DNwDnA78FHgO8HjgZeGVE7JOZ2ZPvF8CZNeVdUVdJRBwLHApcD5wEbAjsC5wVEe/OzOMbtleSJEmStI5pGuBeCbwWODszH+iMjIgjgZ8Be1EFu1/vyffzzFzYpIKI2IUquL0G2DEzV5TxHwcuBY6NiO9k5tKGbZYkSZIkrUManaKcmedl5lndwW0ZfyNwQvm4YIJteXsZHtMJbksdS4HPAhsBb55gHZIkSZKklpqMm0zdW4b31Ux7XES8LSKOLMNnDSjnpWV4bs207/akkSRJkiTpIZqeolwrIuYAbyof6wLTPyuv7jyLgf0y87dd4zYFHg/ckZk31JRzVRluP5H2SpIkSZLaa6JHcD8GPAM4JzO/1zV+NfBh4HnAluW1K9UNqhYAPyhBbcfmZbiqTz2d8Vv0a0hEHBgRSyJiybJly4adD0mSJEnSLDdygBsR76G6KdSvgDd2T8vMmzPz7zPzssxcWV4XAi8Dfgo8BXjrBNr9MJl5YmbOz8z5c+fOncyiJUmSJEmzwEgBbkQcBHwa+CWwW2Yub5IvM++jeqwQwEu6JnWO0G5Ovc74lUM2VZIkSZK0jhg6wI2Ig4HjqJ5lu1u5k/IwOucPP3iKcmbeCfweeGREPLYmz1PL8Moh65IkSZIkrSOGCnAj4v3AJ4GfUwW3N49Q505leG3P+PPK8BU1eV7Zk0aSJEmSpIdoHOBGxNFUN5W6FNg9M28ZkPa5EfGwsiNid+B95eOpPZM7z9M9KiK27MozD3gXcDdwStP2SpIkSZLWLY0eExQR+wEfAu4HLgLeExG9yZZm5qLy/hPAUyPiYuD6Mu5ZjD3H9ujMvLg7c2ZeHBGfAA4BLo+IM4ANgTcAjwLenZlLm8+aJEmSJGld0vQ5uE8uw/WBg/ukuQBYVN5/BXgdsCPV6cUbADcBXwOOz8yL6grIzEMj4r+pjtgeCDwAXAZ8PDO/07CtkiRJkqR1UKMANzMXAgubFpqZXwS+OEqDylHgRaPklSRJkiStu0Z+Dq4kSZIkSTOJAa4kSZIkqRUMcCVJkiRJrWCAK0mSJElqBQNcSZIkSVIrGOBKDc074uy1mm+2auP8tnGeZrI2Lu95R5w9qfPVxmW0Llmb629QXVPZDvvocGbj8pqNbda6wQBXkiRJktQKBriSJEmSpFYwwJUkSZIktYIBriRJkiSpFQxwJUmSJEmtYIArSZIkSWoFA1xJkiRJUisY4EqSJEmSWsEAV5IkSZLUCga4kiRJkqRWMMCVJEmSJLWCAa4kSZIkqRUMcCVJkiRJrWCAK0mSJElqBQNcSZIkSVIrGOBKkiRJklrBAFeSJEmS1AoGuJIkSZKkVjDAlSRJkiS1ggGuJEmSJKkVDHAlSZIkSa1ggCtJkiRJagUDXEmSJElSKxjgSpIkSZJawQBXkiRJktQKBriSJEmSpFYwwJUkSZIktYIBriRJkiSpFQxwJUmSJEmt0CjAjYitIuKtEfHNiLg6Iu6KiFUR8cOI+NuIqC0nInaJiHMiYnnJc3lEHBwR6w+o69URsbiUf0dE/DQi9ht1BiVJkiRJ
64Y5DdPtA3weuAE4H/gt8Bjg9cDJwCsjYp/MzE6GiNgD+DqwBjgdWA68Bvgk8MJS5kNExEHAccCtwKnAPcDewKKIeGZmHjbCPEqSJEmS1gFNA9wrgdcCZ2fmA52REXEk8DNgL6pg9+tl/GbAScD9wILMXFLGHw2cB+wdEftm5mldZc0DjqUKhOdn5tIy/kPAJcChEfH1zPzxqDMrSZIkSWqvRqcoZ+Z5mXlWd3Bbxt8InFA+LuiatDcwFzitE9yW9GuAD5SP7+ip5i3ARsDxneC25FkBfLR8fHuT9kqSJEmS1j2TcZOpe8vwvq5xLy3Dc2vSXwisBnaJiI0a5vluTxpJkiRJkh5iQgFuRMwB3lQ+dgemO5Thlb15MvM+4Dqq06O3bZjnBuBO4AkRsclE2ixJkiRJaqeJHsH9GPAM4JzM/F7X+M3LcFWffJ3xW4yQZ/O6iRFxYEQsiYgly5YtG9xqSZIkSVLrjBzgRsR7gEOBXwFvnLQWjSgzT8zM+Zk5f+7cudPdHEmSJEnSWjZSgFse5/Np4JfAbpm5vCfJwKOtXeNXjpCn3xFeSZIkSdI6bOgANyIOpnpW7RVUwe2NNcl+XYbb1+SfAzyZ6qZU1zbM81hgU+D6zFw9bJslSZIkSe03VIAbEe8HPgn8nCq4vblP0vPK8BU1014CbAJcnJl3N8zzyp40kiRJkiQ9ROMANyKOprqp1KXA7pl5y4DkZwC3APtGxPyuMjYGPlI+fr4nzynA3cBBETGvK8+WwJHl4wlIkiRJklRjTpNEEbEf8CHgfuAi4D0R0ZtsaWYuAsjM2yLiAKpAd3FEnAYsB15L9TigM4DTuzNn5nURcTjwGWBJRJwO3APsDTwB+JfM/PEoMylJkiRJar9GAS7VNbMA6wMH90lzAbCo8yEzz4yIXYGjgL2AjYGrgUOAz2Rm9haQmcdFxFLgMKrn665HdSOrD2Tmlxu2VZIkSZK0DmoU4GbmQmDhsIVn5o+AVw2Z5yzgrGHrkiRJkiSt20Z+Dq4kSZIkSTOJAa4kSZIkqRUMcCVJkiRJrWCAK0mSJElqBQNcSZIkSVIrGOBKkiRJklrBAFeSJEmS1AoGuJIkSZKkVjDAlSRJkiS1ggGuJEmSJKkVDHAlSa8EmOEAABMuSURBVJIkSa1ggCtJkiRJagUDXEmSJElSKxjgSpIkSZJawQBXkiRJktQKBriSJEmSpFYwwJUkSZIktYIBriRJkiSpFQxwJUmSJEmtYIArSZIkSWoFA1xJkiRJUisY4EqSJEmSWsEAV5IkSZLUCga4kiRJkqRWMMCVJEmSJLWCAa4kSZIkqRUMcCVJkiRJrWCAK0mSJElqBQNcSZIkSVIrGOBKkiRJklrBAFeSJEmS1AoGuJIkSZKkVjDAlSRJkiS1ggGuJEmSJKkVDHAlSZIkSa3QOMCNiL0j4riIuCgibouIjIhT+6SdV6b3e502oJ79IuJnEXFHRKyKiMUR8epRZk6SJEmStO6YM0TaDwDPBu4Argee1iDPL4Aza8ZfUZc4Io4FDi3lnwRsCOwLnBUR787M44doryRJkiRpHTJMgPs+qsDzamBX4PwGeX6emQubFB4Ru1AFt9cAO2bmijL+48ClwLER8Z3MXDpEmyVJkiRJ64jGpyhn5vmZeVVm5hS15e1leEwnuC31LgU+C2wEvHmK6pYkSZIkzXJTfZOpx0XE2yLiyDJ81oC0Ly3Dc2umfbcnjSRJkiRJDzHMKcqj+LPyelBELAb2y8zfdo3bFHg8cEdm3lBTzlVluP0UtVOSJEmSNMtN1RHc1cCHgecBW5ZX57rdBcAPSlDbsXkZrupTXmf8Fv0qjIgDI2JJRCxZtmzZBJouSZIkSZqNpiTAzcybM/PvM/OyzFxZXhcCLwN+CjwFeOsk13liZs7PzPlz586dzKIlSZIkSbPAVF+D+xCZeR9wcvn4kq5JnSO0m1OvM37lVLRLkiRJkjT7rdUAt+icP/zgKcqZeSfwe+CR
EfHYmjxPLcMrp7htkiRJkqRZajoC3J3K8Nqe8eeV4Stq8ryyJ40kSZIkSQ8xJQFuRDw3Ih5WdkTsDryvfDy1Z/IJZXhURGzZlWce8C7gbuCUSW+sJEmSJKkVGj8mKCL2BPYsH7cpw50jYlF5f0tmHlbefwJ4akRcDFxfxj2LsefYHp2ZF3eXn5kXR8QngEOAyyPiDGBD4A3Ao4B3Z+bSpu2VJEmSJK1bhnkO7nOA/XrGbVteAL8BOgHuV4DXATtSnV68AXAT8DXg+My8qK6CzDw0Iv6b6ojtgcADwGXAxzPzO0O0VZIkSZK0jmkc4GbmQmBhw7RfBL44SoMycxGwaJS8kiRJkqR113TcZEqSJEmSpEk3zCnKmiTzjjibpR/784d8rtMvTW/6Tnl14+vq7n3f255hyhjvc/f7ThubzHuTcvu1uS5NXZvGa0f3cq0rp1/a8coc5vMgvWkHzc+gdvd73zvPg5Z3d19q0t7uMpv0l1H6wnjrebxlMqhPDGrHoP4yXlv76W1Tv+XSr8669xPp2/3mZ9h+0K+cfu0eps3D6rc/HGWbbFLGMN8D420H/ZZLk7aPsk8ab/12t3W8bWTYuobZl42yPU7WNtu0/U2Xb++ya/qd1a/c8b736+pqup8aNG3YfcMw0+vS987HeOt7InV11zlMeeNt171p6r4Hht1H1+XvLbtpHePN06h9vC5dv2nj1THZ5Yy6jfQzzDqsa1uTZQfNfsd1vx/vd/ZM5hFcSZIkSVIrGOBKkiRJklrBAFeSJEmS1AoGuJIkSZKkVjDAlSRJkiS1ggGuJEmSJKkVDHAlSZIkSa1ggCtJkiRJagUDXEmSJElSKxjgSpIkSZJawQBXkiRJktQKBriSJEmSpFYwwJUkSZIktYIBriRJkiSpFQxwJUmSJEmtYIArSZIkSWoFA1xJkiRJUisY4EqSJEmSWsEAV5IkSZLUCga4kiRJkqRWMMCVJEmSJLWCAa4kSZIkqRUMcCVJkiRJrWCAK0mSJElqBQNcSZIkSVIrGOBKkiRJklrBAFeSJEmS1AoGuJIkSZKkVjDAlSRJkiS1ggGuJEmSJKkVDHAlSZIkSa1ggCtJkiRJagUDXEmSJElSKzQKcCNi74g4LiIuiojbIiIj4tRx8uwSEedExPKIuCsiLo+IgyNi/QF5Xh0RiyNiVUTcERE/jYj9hp0pSZIkSdK6Z07DdB8Ang3cAVwPPG1Q4ojYA/g6sAY4HVgOvAb4JPBCYJ+aPAcBxwG3AqcC9wB7A4si4pmZeVjDtkqSJEmS1kFNT1F+H7A9sBnwjkEJI2Iz4CTgfmBBZv5tZh4OPAf4MbB3ROzbk2cecCxVIDw/M9+Vme8DngVcAxwaETs3nSlJkiRJ0rqnUYCbmedn5lWZmQ2S7w3MBU7LzCVdZayhOhIMDw+S3wJsBByfmUu78qwAPlo+vr1JWyVJkiRJ66apuMnUS8vw3JppFwKrgV0iYqOGeb7bk0aSJEmSpIeZigB3hzK8sndCZt4HXEd17e+2DfPcANwJPCEiNpncpkqSJEmS2mIqAtzNy3BVn+md8VuMkGfzPtOJiAMjYklELFm2bFmjhkqSJEmS2qM1z8HNzBMzc35mzp87d+50N0eSJEmStJZNRYA73tHWzviVI+Tpd4RXkiRJkrSOm4oA99dluH3vhIiYAzwZuA+4tmGexwKbAtdn5urJbaokSZIkqS2mIsA9rwxfUTPtJcAmwMWZeXfDPK/sSSNJkiRJ0sNMRYB7BnALsG9EzO+MjIiNgY+Uj5/vyXMKcDdwUETM68qzJXBk+XjCFLRVkiRJktQSc5okiog9gT3Lx23KcOeIWFTe35KZhwFk5m0RcQBVoLs4Ik4DlgOvpXoc0BnA6d3lZ+Z1EXE48BlgSUScDtwD7A08AfiXzPzxaLMoSZIkSVoXNApwgecA+/WM25axZ9n+BjisMyEzz4yIXYGjgL2AjYGrgUOAz2Rm9laQmcdFxNJSzpuoji7/
EvhAZn656QxJkiRJktZNjQLczFwILBym4Mz8EfCqIfOcBZw1TB5JkiRJkqBFz8GVJEmSJK3bDHAlSZIkSa1ggCtJkiRJagUDXEmSJElSKxjgSpIkSZJawQBXkiRJktQKBriSJEmSpFYwwJUkSZIktYIBriRJkiSpFQxwJUmSJEmtYIArSZIkSWoFA1xJkiRJUisY4EqSJEmSWsEAV5IkSZLUCga4kiRJkqRWMMCVJEmSJLWCAa4kSZIkqRUMcCVJkiRJrWCAK0mSJElqBQNcSZIkSVIrGOBKkiRJklrBAFeSJEmS1AoGuJIkSZKkVjDAlSRJkiS1ggGuJEmSJKkVDHAlSZIkSa1ggCtJkiRJagUDXEmSJElSKxjgSpIkSZJawQBXkiRJktQKBriSJEmSpFYwwJUkSZIktYIBriRJkiSpFQxwJUmSJEmtYIArSZIkSWqFKQ1wI2JpRGSf14198uwSEedExPKIuCsiLo+IgyNi/alsqyRJkiRpdpuzFupYBXyqZvwdvSMiYg/g68Aa4HRgOfAa4JPAC4F9pq6ZkiRJkqTZbG0EuCszc+F4iSJiM+Ak4H5gQWYuKeOPBs4D9o6IfTPztKlsrCRJkiRpdppJ1+DuDcwFTusEtwCZuQb4QPn4julomCRJkiRp5lsbR3A3ioi/AZ4I3AlcDlyYmff3pHtpGZ5bU8aFwGpgl4jYKDPvnrLWSpIkSZJmpbUR4G4DfKVn3HUR8ebMvKBr3A5leGVvAZl5X0RcBzwd2Bb43ylpqSRJkiRp1prqU5RPAXanCnI3BZ4JfAGYB3w3Ip7dlXbzMlzVp6zO+C3qJkbEgRGxJCKWLFu2bKLtliRJkiTNMlMa4GbmP2TmeZl5U2auzswrMvPtwCeARwALJ7GuEzNzfmbOnzt37mQVK0mSJEmaJabrJlMnlOFLusZ1jtBuTr3O+JVT0iJJkiRJ0qw2XQFu5xziTbvG/boMt+9NHBFzgCcD9wHXTm3TJEmSJEmz0XQFuDuVYXewel4ZvqIm/UuATYCLvYOyJEmSJKnOlAW4EfEnEbFpzfh5wPHl46ldk84AbgH2jYj5Xek3Bj5SPn5+ShorSZIkSZr1pvIxQW8ADo2IC4HfALcD2wF/DmwMnAMc20mcmbdFxAFUge7iiDgNWA68luoRQmcAp09heyVJkiRJs9hUBrjnUwWmfwq8kOp625XAD6mei/uVzMzuDJl5ZkTsChwF7EUVCF8NHAJ8pje9JEmSJEkdUxbgZuYFwAUj5PsR8KrJb5EkSZIkqc2m6yZTkiRJkiRNKgNcSZIkSVIrGOBKkiRJklrBAFeSJEmS1AoGuJIkSZKkVjDAlSRJkiS1ggGuJEmSJKkVDHAlSZIkSa1ggCtJkiRJagUDXEmSJElSKxjgSpIkSZJawQBXkiRJktQKBriSJEmSpFYwwJUkSZIktYIBrqRG5h1x9oyob223Yyq1aV5ms971MNXrxfWuptZGX5mKOpqWOR3bWhu3vzbOkzQRBriSJEmSpFYwwJUkSZIktYIBriRJkiSpFQxwJUmSJEmtYIArSZIkSWoFA1xJkiRJUisY4EqSJEmSWsEAV5IkSZLUCga4kiRJkqRWMMCVJEmSJLWCAa4kSZIkqRUMcCVJkiRJrWCAK0mSJElqBQNcSZIkSVIrGOBKkiRJklrBAFeSJEmS1AoGuJIkSZKkVjDAlSRJkiS1ggGuJEmSJKkVZlyAGxFPiIgvRcQfIuLuiFgaEZ+KiC2nu22SJEmSpJlrznQ3oFtEbAdcDGwNfAv4FfB84L3AKyLihZl56zQ2UZIkSZI0Q820I7ifowpu35OZe2bmEZn5UuCTwA7AMdPaOkmSJEnSjDVjAtxy9PZlwFLgsz2TPwjcCbwxIjZdy02TJEmSJM0CMybABXYrw+9n5gPdEzLzduBHwCbATmu7YZIkSZKkmW8mBbg7lOGVfaZfVYbbr4W2SJIkSZJmmZkU4G5ehqv6TO+M32IttEWSJEmS
NMtEZk53GwCIiBOBA4ADMvPkmunHAEcCR2bmP9ZMPxA4sHzcAfj1FDZXkiRJkjR9npSZc3tHzqTHBHWO0G7eZ3pn/Mq6iZl5InDiZDdKkiRJkjQ7zKRTlDtHXPtdY/vUMux3ja4kSZIkaR02k05R3g64muoxQdt130k5Iv4IuAEIYOvMvHNaGilJkiRJmrFmzBHczLwG+D4wD3hXz+R/ADYFvmJwK0ma6SJiXkRkRCzqGb+ojJ83LQ0bQUQsjoih/g0v87h4ipokSVJfM+kaXIB3AhcDn4mI3YH/BV5A9YzcK4GjprFtkiTNWiVIvSAzF0x3WyRJmioz5gguPHgUdz6wiCqwPRTYDvg0sFNm3jp9rZMkacL+DvgT4PfT3ZAhvImqzZIkzXgz7Qgumfk74M3T3Q5JkiZbZt5AdU+JWSMzfzvdbZAkqakZdQRXkqSZLiKeHxGnR8TvI+LuiLghIr4fEX/RIG/fa3Aj4gURcUZE3BgR90TE7yLiCxHxuJq0i0s5cyLiyIi4qrTldxHxTxGxYVfa/buuod215Ou8FjZoc+01uBGxYUQcHRHXlLqvi4iPRMRG45UpSdJUmXFHcCVJmqki4gDg88D9wLeBq4CtqS6veSfwtRHLfQvVs9zvLuX+jurxeG8FXhMRO/U5kvpV4MXAd4HbgFcB/19pU+dsqJ9T3azxg8BvqC4D6lg8YnuDal73AK4Bjgc2BN4CPHOUMiVJmgwGuJIkNRAR/wf4HFUg+eLM/J+e6U8YsdztgROoHpO3a2b+vmva7lRPGPg08Lqa7NsBT8/M5SX9UcAvgDdFxN9l5o2Z+XPg5xHxQWBpZi4cpZ09/pIquP0JsFtmrin1fxC4ZBLKlyRpJJ6iLElSM++g+mP4w73BLUBmXj+BcjcA3tsd3JYyf0B1RPc15Znwvd7fCW5L+juBf6P6fp8/Ynua6BwdPrIT3Jb6lwMfnsJ6JUkayCO4kiQ1s1MZfneSy925DHeNiB1rpm8NrA9sD1zaM21JTfrfleGWk9O8Ws8FHgB+WDNt8RTWK0nSQAa4kiQ1s0UZTvYjfrYqw8PHSffI3hGZubIm3X1luP5EGjWOzYHlmXlvzbQbp7BeSZIGMsCVJKmZTjD5eOBXk1juqjLcPDNvm8Ryp9Iq4FERsUFNkLvNdDRIkiTwGlxJkpr6SRm+corKffEkl9vrASbvqO5lVL8hXlQzbcEk1SFJ0tAMcCVJaubzVKf/Hl3uqPwQo95FmeoRO/cCnyx3VO4td8OImIzg91bgjyehHIBTyvCYiNi4MzIiHgV8YJLqkCRpaJ6iLElSA5n5y4h4J9Ujff4rIr5F9RzcrYAdqR4ftNsI5f6qPAf3S8D/RMS5wJVUd1Z+ItWR3WXA0yY4Cz8A9o2Is6iOwN4LXJiZF45Q1v8D3gC8FriiLIsNgL2pHhO03QTbKknSSAxwJUlqKDNPiogrgMOoTsXdE7gFuBw4eQLlnhoRvwAOpQqSXwbcCfwBOAM4fWItB+C9QAK7A6+iOovrH4ChA9zMzIjYBzgC2B84CLiB6sjuh4A1/XNLkjR1IjOnuw2SJEmSJE2Y1+BKkiRJklrBAFeSJEmS1AoGuJIkSZKkVjDAlSRJkiS1ggGuJEmSJKkVDHAlSZIkSa1ggCtJkiRJagUDXEmSJElSKxjgSpIkSZJawQBXkiRJktQK/z9gAWzXBTh1vgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 1152x360 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "visualizeAnswerDataDf(answerDataDf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{259: 1014, 140: 881, 27: 46, 236: 42}"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "showValueFrequencies(answerDataDf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process tag Treatment (treatments.csv)\n",
    "\n",
    "Reuses the helper functions and query constants defined for the Intake section, so run that section first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings for the Treatment section: XPath queries, output file name, plot title,\n",
    "# column name for the treatment step title and plot file name.\n",
    "# NOTE: OUTFILENAME and PLOTFILENAME are assigned again in the Diary section and\n",
    "# QUERYTREATMENTSTEP is assigned again in a later Treatment cell, so re-run this cell\n",
    "# before re-running the cells that export or plot the treatment data.\n",
    "QUERYTREATMENT = \"./Treatment\"\n",
    "QUERYTREATMENTSTEP = \".//TreatmentStep\"\n",
    "OUTFILENAME = \"treatments.csv.gz\"\n",
    "TITLESTRING = \"Number of questions per client in treatment steps\"\n",
    "TREATMENTTITLE = \"treatmentTitle\"\n",
    "PLOTFILENAME = \"Treatment.png\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _firstChildText(element,query):\n",
    "    \"\"\"Return the stripped text of the first element matching query, or None.\n",
    "\n",
    "    None is returned when query matches nothing below element or when the\n",
    "    first match has no text.\n",
    "    \"\"\"\n",
    "    matches = element.findall(query)\n",
    "    if len(matches) == 0 or matches[0].text is None:\n",
    "        return(None)\n",
    "    return(matches[0].text.strip())\n",
    "\n",
    "def getAnswerDataListTreatments(querySection=QUERYINTAKE):\n",
    "    \"\"\"Collect one dictionary per answer found in the treatment steps of all client files.\n",
    "\n",
    "    Every file in DATADIR whose name matches FILEPATTERN is parsed. For each\n",
    "    section matching querySection, each treatment step (QUERYTREATMENTSTEP) and\n",
    "    each question (QUERYQUESTION) in it, one dictionary per answer (QUERYANSWER)\n",
    "    is produced with the keys CLIENT, TREATMENTTITLE and ANSWERID, plus TITLE,\n",
    "    QUESTIONNUMBER, ANSWERTEXT and ANSWERTITLE when the matching element is\n",
    "    present and has text.\n",
    "\n",
    "    The query constants are read when the function is called, not when it is\n",
    "    defined, so reassigning them changes the result of later calls.\n",
    "\n",
    "    Parameters:\n",
    "        querySection: XPath query selecting the sections to scan. The default\n",
    "            is QUERYINTAKE (kept for backward compatibility); pass\n",
    "            QUERYTREATMENT to read the Treatment sections.\n",
    "\n",
    "    Returns:\n",
    "        A list of dictionaries, one per answer, in file name order.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if an answer element has no ID attribute.\n",
    "    \"\"\"\n",
    "    answerDataList = []\n",
    "    for inFileName in sorted(os.listdir(DATADIR)):\n",
    "        if not re.search(FILEPATTERN,inFileName):\n",
    "            continue\n",
    "        root = readGzippedXmlFile(DATADIR+inFileName)\n",
    "        clientId = fileNameToId(inFileName)\n",
    "        for section in root.findall(querySection):\n",
    "            for treatmentStep in section.findall(QUERYTREATMENTSTEP):\n",
    "                # a step without a usable title is stored with an empty title\n",
    "                treatmentTitle = _firstChildText(treatmentStep,\"./\"+TITLECAPS) or \"\"\n",
    "                for question in treatmentStep.findall(QUERYQUESTION):\n",
    "                    answerDict = {CLIENT:clientId,TREATMENTTITLE:treatmentTitle}\n",
    "                    # optional question fields: left out when the element or its text is missing;\n",
    "                    # any other error (e.g. an undefined constant) is no longer silently swallowed\n",
    "                    for key,query in ((TITLE,QUERYTITLE),(QUESTIONNUMBER,QUERYQUESTIONNUMBER)):\n",
    "                        text = _firstChildText(question,query)\n",
    "                        if text is not None:\n",
    "                            answerDict[key] = text\n",
    "                    for answer in question.findall(QUERYANSWER):\n",
    "                        answerDictCopy = dict(answerDict)\n",
    "                        answerDictCopy[ANSWERID] = answer.attrib[ID]\n",
    "                        # optional answer fields, same rule as for the question fields\n",
    "                        for key,query in ((ANSWERTEXT,QUERYANSWERTEXT),(ANSWERTITLE,QUERYTITLE)):\n",
    "                            text = _firstChildText(answer,query)\n",
    "                            if text is not None:\n",
    "                                answerDictCopy[key] = text\n",
    "                        answerDataList.append(answerDictCopy)\n",
    "    return(answerDataList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "answerDataList = getAnswerDataListTreatments(querySection=QUERYTREATMENT)\n",
    "answerTextFreqs = countAnswerTexts(answerDataList)\n",
    "answerDataList = removeRareDataValues(answerDataList,answerTextFreqs)\n",
    "answerDataDf = answerDataListToDf(answerDataList)\n",
    "saveAnswerDataDf(answerDataDf,outFileName=OUTFILENAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1152x360 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "visualizeAnswerDataDf(answerDataDf,titleString=TITLESTRING,plotFileName=PLOTFILENAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{8: 174,\n",
       " 369: 141,\n",
       " 9: 76,\n",
       " 304: 56,\n",
       " 337: 55,\n",
       " 108: 45,\n",
       " 142: 41,\n",
       " 210: 39,\n",
       " 224: 27,\n",
       " 179: 26,\n",
       " 189: 26,\n",
       " 151: 20,\n",
       " 199: 19,\n",
       " 682: 9,\n",
       " 645: 8,\n",
       " 467: 7,\n",
       " 742: 7,\n",
       " 217: 6,\n",
       " 422: 5,\n",
       " 552: 5,\n",
       " 182: 4,\n",
       " 223: 3,\n",
       " 429: 3,\n",
       " 631: 3,\n",
       " 434: 3,\n",
       " 37: 3,\n",
       " 135: 2,\n",
       " 214: 2,\n",
       " 297: 2,\n",
       " 428: 2,\n",
       " 689: 2,\n",
       " 394: 2,\n",
       " 488: 2,\n",
       " 192: 2,\n",
       " 228: 2,\n",
       " 172: 2,\n",
       " 654: 2,\n",
       " 530: 2,\n",
       " 203: 2,\n",
       " 387: 1,\n",
       " 279: 1,\n",
       " 556: 1,\n",
       " 290: 1,\n",
       " 663: 1,\n",
       " 559: 1,\n",
       " 88: 1,\n",
       " 340: 1,\n",
       " 254: 1,\n",
       " 165: 1,\n",
       " 516: 1,\n",
       " 427: 1,\n",
       " 213: 1,\n",
       " 550: 1,\n",
       " 29: 1,\n",
       " 335: 1,\n",
       " 233: 1,\n",
       " 154: 1,\n",
       " 35: 1,\n",
       " 205: 1,\n",
       " 349: 1,\n",
       " 268: 1,\n",
       " 384: 1,\n",
       " 272: 1,\n",
       " 139: 1,\n",
       " 144: 1}"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "showValueFrequencies(answerDataDf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'answer': 179958,\n",
       " 'answerText': 179958,\n",
       " 'title': 108423,\n",
       " 'question': 65265,\n",
       " 'questionNumber': 62285,\n",
       " 'Title': 12180,\n",
       " 'TreatmentStep': 6821,\n",
       " 'SubmissionDate': 6821,\n",
       " 'ApprovalDate': 6821,\n",
       " 'ApprovingCounselor': 6821,\n",
       " 'FirstName': 6154,\n",
       " 'LastName': 6154,\n",
       " 'Questionnaire': 5375,\n",
       " 'Type': 5375,\n",
       " 'Content': 5375,\n",
       " 'Status': 1983,\n",
       " 'StartDate': 1983,\n",
       " 'EndDate': 1983,\n",
       " 'TreatmentSteps': 1983}"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count how often each XML tag occurs below the Treatment sections of all client files.\n",
    "inFileNames = sorted(os.listdir(DATADIR))\n",
    "tags = {}\n",
    "for inFileName in inFileNames:\n",
    "    if not re.search(FILEPATTERN,inFileName):\n",
    "        continue\n",
    "    root = readGzippedXmlFile(DATADIR+inFileName)\n",
    "    for section in root.findall(QUERYTREATMENT):\n",
    "        for tag in section.findall(\".//*\"):\n",
    "            tags[tag.tag] = tags.get(tag.tag,0)+1\n",
    "# show the tags ordered by descending frequency\n",
    "dict(sorted(tags.items(),key=lambda item:item[1],reverse=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "179958"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(answerDataList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sys is already imported in the first code cell of the notebook\n",
    "import sys\n",
    "\n",
    "# NOTE(review): these assignments replace the value of QUERYTREATMENTSTEP set in the\n",
    "# Treatment settings cell and the values of QUERYTITLE and QUERYANSWER (presumably set in\n",
    "# the Intake section - confirm). getAnswerDataListTreatments reads all three at call time,\n",
    "# so re-running the Treatment export after this cell may give different results;\n",
    "# re-run the earlier settings cells first.\n",
    "QUERYTREATMENTSTEP = \"./TreatmentSteps/TreatmentStep\"\n",
    "QUERYTITLE = \"./Title\"\n",
    "QUERYANSWER = \".//\"+ANSWER"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Most common treatment step order:\n",
    "\n",
    "1. Voordelen, nadelen\n",
    "2. Alcoholschrift bijhouden\n",
    "3. Situaties analyseren\n",
    "4. Meten en weten\n",
    "5. Doel stellen\n",
    "6. Gewoontes doorbreken\n",
    "7. Anders denken\n",
    "8. Anders doen\n",
    "9. Beslissingen\n",
    "10. Actieplan\n",
    "11. Afsluiting\n",
    "12. Na 3 maanden\n",
    "13. Na half jaar\n",
    "14. Na 9 maanden"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Voordelen, nadelen': 922,\n",
       " 'Alcoholschrift bijhouden': 749,\n",
       " 'Doel stellen': 689,\n",
       " 'Situaties analyseren': 627,\n",
       " 'Meten en weten': 574,\n",
       " 'Gewoontes doorbreken': 497,\n",
       " 'Anders denken': 472,\n",
       " 'Anders doen': 442,\n",
       " 'Afsluiting': 419,\n",
       " 'Beslissingen': 418,\n",
       " 'Actieplan': 396,\n",
       " 'Na 3 maanden': 326,\n",
       " 'Na half jaar': 264,\n",
       " 'Na 9 maanden': 9,\n",
       " 'Intake': 1}"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count how often each treatment step title occurs over all client files.\n",
    "# The per-file answer counters of the earlier version were only used by a commented-out\n",
    "# debugging suffix and cost a full answer search per section and per step; they are gone.\n",
    "inFileNames = sorted(os.listdir(DATADIR))\n",
    "titles = {}\n",
    "for inFileName in inFileNames:\n",
    "    if not re.search(FILEPATTERN,inFileName):\n",
    "        continue\n",
    "    root = readGzippedXmlFile(DATADIR+inFileName)\n",
    "    for section in root.findall(QUERYTREATMENT):\n",
    "        for treatmentStep in section.findall(QUERYTREATMENTSTEP):\n",
    "            for title in treatmentStep.findall(QUERYTITLE):\n",
    "                titleText = cleanupText(title.text)\n",
    "                titles[titleText] = titles.get(titleText,0)+1\n",
    "# show the titles ordered by descending frequency\n",
    "dict(sorted(titles.items(),key=lambda item:item[1],reverse=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process tag Diary (diaries.csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings for the Diary section.\n",
    "# default threshold of anonymize: tokens seen fewer than this many times in a field are replaced\n",
    "MINIMALDUPLICATEVALUES = 5\n",
    "# NOTE: replaces the OUTFILENAME value set in the Treatment section\n",
    "OUTFILENAME = \"diaries.csv.gz\"\n",
    "# XPath queries used by readDiaries: the diary section and the entries inside it\n",
    "QUERYDIARY = \"./Diary\"\n",
    "QUERYDIARYENTRY = \"./DiaryEntries/DiaryEntry\"\n",
    "# marker that anonymize puts in place of a rare token\n",
    "REMOVED = \"REMOVED\"\n",
    "# separator getAllTextFields puts between parent and child tag names in a key\n",
    "SEP = \"_\"\n",
    "# field value that getAllTextFields leaves out\n",
    "UNKNOWN = \"-\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getAllTextFields(tag,prefix=\"\"):\n",
    "    \"\"\"Collect the non-empty texts of all descendants of tag in one flat dictionary.\n",
    "\n",
    "    The key of a direct child is its tag name; for deeper elements the tag\n",
    "    names on the path below tag are joined with SEP. Texts are passed through\n",
    "    cleanupText; texts that are then empty or equal to UNKNOWN are left out.\n",
    "    A later element overwrites an earlier one that produced the same key.\n",
    "\n",
    "    Parameters:\n",
    "        tag: XML element whose descendants are read.\n",
    "        prefix: key prefix for the children of tag; used by the recursion.\n",
    "\n",
    "    Returns:\n",
    "        A dictionary mapping keys to cleaned texts.\n",
    "    \"\"\"\n",
    "    textDict = {}\n",
    "    for child in tag.findall(\"./*\"):\n",
    "        key = child.tag if prefix == \"\" else prefix+SEP+child.tag\n",
    "        if child.text is not None:\n",
    "            childText = cleanupText(child.text)\n",
    "            if childText not in (\"\",UNKNOWN):\n",
    "                textDict[key] = childText\n",
    "        # descend with the key of the child as prefix so nested keys carry their path\n",
    "        textDict.update(getAllTextFields(child,prefix=key))\n",
    "    return(textDict)\n",
    "\n",
    "def readDiaries():\n",
    "    \"\"\"Read the diary entries of all client files into a list of flat dictionaries.\n",
    "\n",
    "    Every file in DATADIR whose name matches FILEPATTERN is parsed; each entry\n",
    "    matching QUERYDIARYENTRY below a QUERYDIARY section yields one dictionary\n",
    "    holding the client id (key CLIENT) and the fields returned by getAllTextFields.\n",
    "\n",
    "    Returns:\n",
    "        A list with one dictionary per diary entry, in file name order.\n",
    "    \"\"\"\n",
    "    dataList = []\n",
    "    for inFileName in sorted(os.listdir(DATADIR)):\n",
    "        if not re.search(FILEPATTERN,inFileName):\n",
    "            continue\n",
    "        root = readGzippedXmlFile(DATADIR+inFileName)\n",
    "        for section in root.findall(QUERYDIARY):\n",
    "            for diaryEntry in section.findall(QUERYDIARYENTRY):\n",
    "                # client id first, then the entry fields (same key order as before)\n",
    "                dataList.append({CLIENT:fileNameToId(inFileName),**getAllTextFields(diaryEntry)})\n",
    "    return(dataList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def countValues(listOfDict):\n",
    "    \"\"\"Count per field how often each token occurs over all dictionaries.\n",
    "\n",
    "    Parameters:\n",
    "        listOfDict: list of dictionaries mapping field names to text values.\n",
    "\n",
    "    Returns:\n",
    "        A dictionary mapping each field name to a dictionary of token\n",
    "        frequencies; the tokens are produced by word_tokenize.\n",
    "    \"\"\"\n",
    "    countsDict = {}\n",
    "    for dataDict in listOfDict:\n",
    "        for dictKey,text in dataDict.items():\n",
    "            tokenCounts = countsDict.setdefault(dictKey,{})\n",
    "            for token in word_tokenize(text):\n",
    "                tokenCounts[token] = tokenCounts.get(token,0)+1\n",
    "    return(countsDict)\n",
    "\n",
    "def anonymize(listOfDict,countsDict,minimalDuplicateValues=MINIMALDUPLICATEVALUES):\n",
    "    \"\"\"Replace rare tokens in all fields except CLIENT by the REMOVED marker.\n",
    "\n",
    "    A token is rare when countsDict records fewer than minimalDuplicateValues\n",
    "    occurrences of it for its field. Processed values are rebuilt by joining\n",
    "    their tokens with single spaces. The CLIENT field is skipped entirely, so\n",
    "    the client id is neither tokenized nor rewritten.\n",
    "\n",
    "    The dictionaries in listOfDict are modified in place.\n",
    "\n",
    "    Parameters:\n",
    "        listOfDict: list of dictionaries mapping field names to text values.\n",
    "        countsDict: per-field token frequencies as returned by countValues.\n",
    "        minimalDuplicateValues: tokens with a lower frequency are replaced.\n",
    "\n",
    "    Returns:\n",
    "        listOfDict itself, after modification.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if countsDict lacks a field or token found in listOfDict.\n",
    "    \"\"\"\n",
    "    for dataDict in listOfDict:\n",
    "        # only existing keys are reassigned, so iterating over the keys stays valid\n",
    "        for dictKey in dataDict.keys():\n",
    "            if dictKey == CLIENT:\n",
    "                continue\n",
    "            tokens = word_tokenize(dataDict[dictKey])\n",
    "            dataDict[dictKey] = \" \".join(\n",
    "                REMOVED if countsDict[dictKey][token] < minimalDuplicateValues else token\n",
    "                for token in tokens)\n",
    "    return(listOfDict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of entries in list: 122330\n"
     ]
    }
   ],
   "source": [
    "dataList = readDiaries()\n",
    "countsDict = countValues(dataList)\n",
    "dataListAnonymized = anonymize(dataList,countsDict)\n",
    "saveAnswerDataDf(answerDataListToDf(dataListAnonymized),outFileName=OUTFILENAME)\n",
    "print(f\"Number of entries in list: {len(dataListAnonymized)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 2160x720 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "PLOTFILENAME = \"Diary.png\"\n",
    "\n",
    "def showNumberOfDiaryEntriesPerClient(dataList=None,plotFileName=None):\n",
    "    \"\"\"Plot the number of diary entries per client as a bar chart, save it and show it.\n",
    "\n",
    "    Parameters:\n",
    "        dataList: list of diary entry dictionaries with a CLIENT key; when None\n",
    "            the notebook variable dataListAnonymized is used.\n",
    "        plotFileName: name of the image file to write; when None the notebook\n",
    "            constant PLOTFILENAME is used.\n",
    "\n",
    "    Side effects:\n",
    "        Sets the matplotlib font size to 24 for all later plots, writes the\n",
    "        image file and displays the figure.\n",
    "    \"\"\"\n",
    "    # resolve the defaults at call time so a call without arguments behaves as before\n",
    "    if dataList is None:\n",
    "        dataList = dataListAnonymized\n",
    "    if plotFileName is None:\n",
    "        plotFileName = PLOTFILENAME\n",
    "    groups = pd.DataFrame(dataList).groupby(CLIENT).groups\n",
    "    # materialize the keys: a plain list is a safer input for plt.bar than a dict view\n",
    "    x = list(groups.keys())\n",
    "    y = [len(groups[client]) for client in x]\n",
    "\n",
    "    matplotlib.rc(\"font\",size=24)\n",
    "    plt.figure(figsize=(30,10))\n",
    "    plt.bar(x,y)\n",
    "    plt.title(\"diary entries per client\")\n",
    "    plt.xlabel(\"client id\")\n",
    "    # one bar per client: the individual client ids are not shown on the x axis\n",
    "    plt.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
    "    plt.savefig(plotFileName)\n",
    "    plt.show()\n",
    "    \n",
    "showNumberOfDiaryEntriesPerClient()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1245 398 854 391\n"
     ]
    }
   ],
   "source": [
    "# Sum the frequencies over all keys, over keys below 10, below 100, and from 100 upwards.\n",
    "valueFrequencies = showValueFrequencies(pd.DataFrame(dataListAnonymized))\n",
    "print(sum(count for size,count in valueFrequencies.items()),\n",
    "      sum(count for size,count in valueFrequencies.items() if size < 10),\n",
    "      sum(count for size,count in valueFrequencies.items() if size < 100),\n",
    "      sum(count for size,count in valueFrequencies.items() if size >= 100))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* 1983-1245 = 738 empty diaries\n",
    "* 398 diaries with 1 to 9 entries\n",
    "* 854-398 = 456 diaries with 10 to 99 entries\n",
    "* 391 diaries with 100 entries or more (max 1961)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'DiaryEntry': 122330,\n",
       " 'Date': 122330,\n",
       " 'Time': 122330,\n",
       " 'Urge': 122330,\n",
       " 'StandardUnits': 122330,\n",
       " 'Quantity': 82118,\n",
       " 'MeasurementUnitName': 72465,\n",
       " 'Snapshot': 18090,\n",
       " 'Situation': 18090,\n",
       " 'Location': 18090,\n",
       " 'Companion': 18090,\n",
       " 'Activity': 18090,\n",
       " 'Occasion': 18090,\n",
       " 'Thoughts': 18090,\n",
       " 'Feeling': 18090,\n",
       " 'Type': 18090,\n",
       " 'Emotion': 18090,\n",
       " 'Description': 18090,\n",
       " 'Behavior': 18090,\n",
       " 'BehaviorDetails': 18090,\n",
       " 'DayTarget': 9653,\n",
       " 'dayOfWeek': 9653,\n",
       " 'DiaryEntries': 1983,\n",
       " 'Targets': 1983,\n",
       " 'DateLastOpenedByClient': 1983,\n",
       " 'Target': 1379,\n",
       " 'StartDate': 1379,\n",
       " 'WeekTargetQuantity': 1379}"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "QUERYDIARY = \"./Diary\"\n",
    "\n",
    "def showDiaryTextFieldFrequencies():\n",
    "    \"\"\"Count how often each XML tag occurs below the QUERYDIARY sections.\n",
    "\n",
    "    Reads every file in DATADIR whose name matches FILEPATTERN and returns a\n",
    "    dict mapping tag name to its number of occurrences.\n",
    "    \"\"\"\n",
    "    tagCounts = {}\n",
    "    dataFileNames = [name for name in sorted(os.listdir(DATADIR)) if re.search(FILEPATTERN,name)]\n",
    "    for dataFileName in dataFileNames:\n",
    "        root = readGzippedXmlFile(DATADIR+dataFileName)\n",
    "        for section in root.findall(QUERYDIARY):\n",
    "            # \".//*\" selects every descendant element of the section\n",
    "            for element in section.findall(\".//*\"):\n",
    "                tagCounts[element.tag] = tagCounts.get(element.tag, 0) + 1\n",
    "    return tagCounts\n",
    "\n",
    "tags = showDiaryTextFieldFrequencies()\n",
    "# show the tags ordered from most to least frequent\n",
    "dict(sorted(tags.items(), key=lambda item: item[1], reverse=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process tag Messages (emails.csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# directory with the data files read by the message code, and the output file name\n",
    "DATADIRANONYMIZED = \"../usb/releases/20200320/\"\n",
    "OUTFILENAME = \"emails.csv.gz\"\n",
    "\n",
    "# queries passed to findall to select the message sections and their entries\n",
    "QUERYMESSAGES = \"./Messages\"\n",
    "QUERYMESSAGESENTRY = \"./Message\"\n",
    "\n",
    "# field names of a message\n",
    "BODY = \"Body\"\n",
    "DATESENT = \"DateSent\"\n",
    "\n",
    "UNKNOWN = \"-\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "def readMessages():\n",
    "    \"\"\"Collect the messages from all matching files in DATADIRANONYMIZED.\n",
    "\n",
    "    Returns a list with one dict per message: the client id derived from the\n",
    "    file name plus the fields returned by getAllTextFields. Files are visited\n",
    "    in sorted name order; the messages of each file are ordered by DATESENT.\n",
    "    \"\"\"\n",
    "    allMessages = []\n",
    "    for fileName in sorted(os.listdir(DATADIRANONYMIZED)):\n",
    "        if not re.search(FILEPATTERN,fileName):\n",
    "            continue\n",
    "        root = readGzippedXmlFile(DATADIRANONYMIZED+fileName)\n",
    "        clientId = fileNameToId(fileName)\n",
    "        fileMessages = []\n",
    "        for section in root.findall(QUERYMESSAGES):\n",
    "            for message in section.findall(QUERYMESSAGESENTRY):\n",
    "                record = {CLIENT:clientId}\n",
    "                record.update(getAllTextFields(message))\n",
    "                fileMessages.append(record)\n",
    "        allMessages.extend(sorted(fileMessages,key=lambda m:m[DATESENT]))\n",
    "    return allMessages"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`getAllTextFields` can be found in the Diary code while `saveAnswerDataDf` is in the Intake code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of entries in list: 45469\n"
     ]
    }
   ],
   "source": [
    "dataListAnonymized = readMessages()\n",
    "numberOfEntries = len(dataListAnonymized)\n",
    "print(f\"Number of entries in list: {numberOfEntries}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAABs0AAAJlCAYAAACR2byAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAABJ1UlEQVR4nO3debg8Z10n7M+XhISsEEiQoMIPGDUmyGKiBhAkbDqsAi84wMyIziTqixsiElzGuKGMuKC4THgHQQVmggoKQUE2RQE1LFHCIiMEHAJJCJDkRxYCed4/uo5pOt19us/pPqe7676v61zV1VVP1VNrd59PPVXVWgsAAAAAAAD02S32uwIAAAAAAACw34RmAAAAAAAA9J7QDAAAAAAAgN4TmgEAAAAAANB7QjMAAAAAAAB6T2gGAAAAAABA7wnNAAAAYIeq6sVV1aqqTRh+ztbwqjqwx9UDAADmIDQDAAAAAACg94RmAAAAQJKkqp461DLugftdHwAA2EtCMwAAAFiS1to5rbXq/i7e7/oAAACTCc0AAAAAAADoPaEZAAAAAAAAvSc0AwCANTTuuUNV9ZSqelNVXVZVn6uqf6qqZ1bVESNlH1xVr6qqf62q66rqI1X1a1V12xnnfeeq+qWquqCqrqiqz1fVJ6rqtVX1nVV16DblT6mqF1TVP1bVVVV1Q1VdWlXvrao/qaqnVdWXL6HsV1XV07t6fqxb9mu79fDKqvqOqjpkxnVwz6p6SVf2+qq6pKpeU1WP7IbP/Fyobpl+o9ten+mm93+r6o+r6turqrYpf5+qelFVfaCqDg5tj3+sqpdV1XfPum3HTPvFW8vR9R9bVf+tqi6sqiur6uqqeldVnT26n02Z5o73nzH1Oaqqfqyq/q6qPtUN+/WdLGs3vftU1e9W1UVV9dlu//hwd1w9s6rusoNpnjO0LxzYZtwd7wtV9ZZuHhd3/UdX1bOr6t3dsXKw224/VVVHjyn/wG69/t7Q228eqvvW34vnXQcAALAuqrW233UAAADmVFVPzU3/3H5oku9N8vgJo78lySOSXJvkvyf50QnjfSjJN7fWLpsy32ckeU6Sw6ZU751JHtNa+/iY8v9vkt9Isl049VOttZ9fYNlvT/LKbcolyV8neWxr7dOTRqiqpyX59SSTwp3fSfL3uWn7nNFae8uY6dwig+3x9Ey/oPF1Sb6jtXblmGn8YpKzp5Td8p9aa384w3ij039xku/seu+S5C+T/LsJo38wyUNba/86ZXq73X+G63PXDNbNV42M9vzW2g9Pmf64eh2d5H8meeI2o17YWrvXpDq11m4WalXVOUl+uuu9y7jnmi1oX3hLkm9J8tEkD07y57n5utnyj0ke2Fr7zFD5ByZ585R5b3lJa+2pM4wHAABrZ+oVoAAAwFr4uSSnJ3lFkj9I8vEkd07y7CTfkOSBSZ6Z5OoMArM3JDk3yb8kuX2SH07yrRn8g/1XkvyncTMZ+ef/h5P8VpL3J7k0yYkZhHbfmeTUJH9RVae31j43VP7uuSn0uiLJ7yZ5a5LLk9wyyZ26+j56zLx3XLZzaJKDSV6bQTDwwSSfTXLbDMKXM7vyD0jyh0kePmEdPCbJC7re67s6vbab9skZrMvvS3LvCfUY9qLcFABdmME2+VC3fHfOYDs8NoNt80dV9a2ttRuH6vLvc1Ng9q9JfjuDwOlTSW6VQch1nySPmaEuszgvyd2SvCTJy7v53C3J9ye5f5KvSfL6qrp3a+260cK73X/GeGUG2+6FSf4kyWVJviLbh6qj9TosgzDqvt1bH8tgXf5dkisz2Ee+PoN962YttBZkV/vCiCOTnJ/ky5M8L8lfJPlMBtvqWRms33tkcKx/91C5f0jydRnsL1uh83d37w/7TAAAYENpaQYAAGtopKVZkpzdWnvuyDhHJ7kog0DpqiSHJ3lha+0HRsY7NMnbMgiNbkhyx9bap0bGuV8GIVUl+c0kP9Ja+8KY
ej02yR934/1ka+0Xhob9bJKf6nrv1Vq7cMry3Xa4tdduynbv3SHJNa21q6aUG57HA1prbx0ZflgGYc+XJ/l8kge31v5mZJxbJnl1BuHGlpu1NKuqJyV5Wdf7rCS/3Mb8OKuqH8qgVVuS/MfW2kuHhv1+BmHKNUm+qrV2yYTlOiTJMa21z45d8ClGWnYlyX9trf3PkXEqg6Dxyd1b57TWfmZknF3vP2Pq05J8e2vtz+ZdrpFp/kKSH+96X5vkCa21ayaMe6fW2scm1WknLc0WsS90w9+SQUuzZHC8P2D0OKmqIzMIVk/KYB++Y2vtipFxnpptWkkCAMCm8kwzAABYf/8wGpglSWvtYJIXd73HZtAq6xljxvtCBi23kkGrrfuMmcdPZBBkvD/J08cFHt20XplBq58kOWtk8B267memhV7ddEZvj7ibsmmtfXJaYNb5mQxaTyXjb3X5mAwCsyT5zdHArJvPDUn+SwaBxDRb4dxfttb++7iQpJve85Nc0PVOWp8fnBSYddP44k4CszH+YjQw66bfMmhdtxW+fN+Y55ItYv8Z9QcLCMyOTbIVIl+S5EmTArOufh+bNGwXFrEvjPpv446Tbtl+s+s9LDe1rgMAACI0AwCATfDyKcOG/3H+J621SWHOe4Ze33V4QNdi7aFd7ytaa1/cpj5v6bp3qqqvGHp/6xlVx1XV47aZxqjdlL2Zqjqsqr6iqr62qu7e3f7xa5P8326UcbdXfMjQ65dMmnb3LK7XT5n3Sd28kunbbstfdd3Tu1ZjW7bWySlVdfoM09mtmwVmW7pA8ryu98syuP1fkoXuP6P+YJvpzOKMJMd0r8+dIVhdqAXuC6NeOmXY3w+9vuvEsQAAoIeEZgAAsP4+OGXYZ3cw3rEjw74+Nz0P+b9VVZv2l5tasiSDZ1VteWkGzwFLkj+uqrdU1TOq6vSqOnxK3XZbNsmgVVFV/WRVvSfJ5zJ4Dtj7kvzT0N+9utGPHzOJr+u612Zw28tpRp8DNewbh16/aIb1udU68LAMnq+15fcyuEXhYUn+pqrOr6qnVdW9x7T0WoS/m2P4PYZeL2r/GfWemWs+2alDr9+8gOnNa1H7wrDLR2+vOmK4JebosQ4AAL0mNAMAgPU38XZySW7cwXijLVhuP3eNbnLk1ovW2v9J8tgkn+ze+pYkz0vy9iRXVtWbqup7quqI0YnspmySdC3J3p/k55LcMzeFONvWe8hWSPGp1tqNY4YPu3TKsEWtz79O8t1Jrsxgmz08yQuSvCvJZ7oQ7SkLDNAu22b4J4de327o9UKWd4zP7GK6W04Yej3xFpdLtIx1M+04T6Yf6wAA0GvLuPoQAADYLMO/G34qyavmKPuR4Z7W2p9X1V0zCMC+Lck3J7lLksMzuFXeGUl+vKoe01p7zyLKVtUtk/xRkjt2b/1Bkv+VQWuxy5JcvxWCVdVfJ7l/Bs/f2o1p5YfX53dnequ0UV8S7LTWXlxVr0ryhAxugXi/DJbz6AxCtIcneVZVPaq19tE55jPO2GdtDZm0zAvbf76kMtvf5nFe2y3fMixsXwAAAHZPaAYAAGzn8qHXt2qtvXc3E2utXZvkZd1fqurEDAKfMzMIwu6U5FVV9dWjz2DbYdkzknxN9/qXWmvPnlK946YM27qt3fFVdYttWpudMGXY8Po8ZAHr87NJXtj9parukuRbk3xfBrdJ/LoMQsL77GY+GTyr7F+nDB9uNXXF0OuF7j8LNly3L0/yL/s4/13vCwAAwO64PSMAALCdd+emW7p926In3lr7RGvt95M8IMn53dt3zgwhz4xlv27o9csnTauqjkly0pTZ/VPXPSLJydtU7RumDHvn0OtlrM+PtNZ+t6vDe7q3T6+qO+9y0t+4zfBvGnr9j0Ovl7r/7NIFQ6/P2If5L3Vf2KH9aHEHAAArQWgGAABM1Vr7dJK/6npPraql/HO/tdaSvH7orWmtteYpO3yHjWnPyDor0+/G8Yah1985aaSu9dvD
pkznwiQf7l5/e1WdMmXcHeta2r156K2Z1+cE3z1pQBc4PrHrvTRDodle7T879OYkV3Wvz6qqY/d4/nuyL8zpuqHXh+9bLQAAYB8IzQAAgFmck5taoLykqr5+2shV9dVV9R0j7z2uqm43pUzlS8OmDw8N23HZJP889Hps8FNV90/yc5Om3/nT3PQcqR+oqpu1hKuqQ5P8f5kSNnQB3zld7yFJXtk9q22iqjq1qh4+8t6TquroKWUOT/LArvfGJLt9ptnDq+qpY+ZTSX4ryfHdW7/TWvvCyGjnZJf7zzK01q5O8htd7x2TvLyqJgarVfWVC57/QvaFBfv40OuvXuJ8AABg5XimGQAAsK3W2l9X1U8l+fkMnl31jqp6aQa3RLy4G+32Se6Z5BFJ7pvBc8f+99BkfjDJy6rqLzNotXVRBs++OjLJXZN8V266Rd7bWmvvWlDZv0jyiSQnJjmzqm6b5PczCAdOSPLoJP81yWeTfCw3Pf9sdB18vqr+3ySvyiAUe1NV/UaS1yY5mMEtG384ydcneUeS07eKjpnWH1TVA7r5flWSf6yq38ugtdzHM/itdockp3b1u1eSX+jmteUXk5xbVa9N8pYkH0jymSTHZnCbye9Jcu9u3P/VWht+ftZO/H2SF3X1/l9JPpXkbkm+P4PbY6arw3PHLO8i9p9l+dkkD+rm+fAk76uq385gG16VwXPu7pXkMUlunZvW6UIsaF9YpHcn+VySo5L8WFVdluR9SW7ohl/ZWvvEkuYNAAD7SmgGAADMpLX2C1V1eZJfzeAf6k/t/ia5csx7hyd5ZPc3yQVJ/p9FlW2tXVtVT07y6iRHJ3l89zfs0iSPzSCIGhuaddP606r6/iTPT3KrJD/W/Q377QyeVbUVml2X8c7KIKT7qQzW5/d3f5OMW59HZ3BbxCeOGbblz5N875Ths3pikr/MIKD8rjHD/znJt7bWxi7vgvafhWut3VBV35rkJUkel8Ez8W4W/HUuXFI1FrEvLERr7XNV9csZtID7igwC0mEvyfTtBgAAa0toBgAAzKy1dm5V/XEGrWIelkHrqttmcPu/KzIITt6W5NWttb8bKf6EDFr0PCiD1lh3yKClV8sgtHpXklck+d+ttRsXWDattbdU1b2SPCvJQ5N8eQataT6aQZj2m621ywZ3Gtx2HfxWVf1Nkmdk0Lrt9kk+3dXhd1trr66qHxkqMjbg6G7N93Ndq6KzumX76gxaNt2Q5LIMWm69NcmfttbeOzKJb07y4K7c1yX5sm6d3JDBbST/IclLW2vnb7tQM2itfbSqTsugNd3jk9wlg1v+fyiDFmHPb61du800drP/LE1r7WCSx3ctvp6a5P4ZtEysDFopXpxBq7hXLGn+u90XFl2fn6mqf85gXdwzg210y2XOEwAAVkENvpsDAACwKFX1ogxaY92Q5JjW2vX7XKUdqaoXJ/nOJGmtbZ8oAgAArLFb7HcFAAAANklVHZ3k27ved65rYAYAANA3M4dmVXWnqvrhqnp1VX2sqq6vqqur6sKq+qWqOnFCuQNV1Wb4O22b+T+hqt5UVVdU1TVV9f6q+vmqOmbehQYAANipqvrqKcNumeRFGdxWL91rAAAA1sBMt2esqq/M4F7/w7fjuCqDBxQf0vV/JsnjW2tvHil7IMlHut5Lp8zmW1trYx+qXFXnJjmz6/1CBg/SPrrr/3CS+7fWLtl2QQAAAHapqj6d5D1J/jTJhUk+m+SYDJ61dlYGz+lKkguS3Le1dsPe13Ix3J4RAADok0NnHG8rGDs/yYuTvLG19pmqOiyDB0//VgYPgX5VVX1Na+2T4ybSWrvDvBWsqu/LIDC7MYOHdv9ma+36qrpvkpcluWuS8zJ4CDYAAMCy3SLJGd3fJP+Q5NHrHJgBAAD0zawtzW6d5MCUlmAnJXl3klslOae19jNDww6ka2k275WJVXV4ko8luX2SX2ut/cjI8HsneWcGLeAe3Vp79XbTPP7449uBAwfmqQYA
AMC/ueqqq3LllVfm4MGDueGGG/KFL3whSXLooYfmqKOOynHHHZfjjjsuVevfMOviiy/OFVdckSQ59dRT97k2AAAAu/fOd77zU621E8YNm6mlWWvtygxuOzJp+Aeq6h1JHphkkb+kHpJBYNaS/MqY+b67qt6Q5KFJnpJk29DswIEDueCCCxZYRQAAAAAAANZBVX100rBbLHA+V3TdQ6aONZ+t2528t7X28QnjvK7rPmiB8wUAAAAAAKBHFhKaVdWhSe7X9b53ynhvr6qrquraqvpIVf1hVU17FtnWA7QvmjLO+7ruCVV1/Oy1BgAAAAAAgIFFtTR7WpI7JLkxye9PGe/0bpwkOZDBLRXfWlW/XuNv+H9i171kyjSHh504cSwAAAAAAACYYNehWVXdI8lzut4XtNZGW4Vdl+S3kzwgyTGttdskOTKDZ59tPYPsh5I8e8zkj+q6106pwjVDr4+eUMezquqCqrrg8ssvnzIpAAAAAAAA+mhXoVlVnZjkVRmEYO9M8qzRcVprn2ytPa219tbW2sHuvdZae1dr7dFJXtGN+uNVdZvRWWxNZjf1bK2d21o7rbV22gknnLCbSQEAAAAAALCBdhyaVdVtk7w+yV2SfCjJI1pr1+1gUltB21FJHjwy7GDXPXJK+eFhByeOBQAAAAAAABPsKDSrqlsneV2Suyf5WJKHtNYu3cm0WmsfSbJ1z8S7jgzeel7ZHadMYnjYJ3ZSBwAAAAAAAPpt7tCsqo5K8tokpyX5ZAaB2cd2WY9Jt2F8X9c9ZUrZk7vu5a21T+2yHgAAAAAAAPTQXKFZVR2R5NVJ7pvkigwCsw/tpgJVdZckx3e9F48MfnPXPaWqJrU2e1jXfeNu6gEAAAAAAEB/zRyaVdVhSf4kyRlJPpvkYa21i2YoV9uM8pyue22SN40Me2OSy7p6/siYad8zyUO63pduVxcAAAAAAAAYZ6bQrKoOSfKyJN+W5Ook/7619q4Z5/GWqnp2Vd29m05q4N5V9cok/6Eb77mttU8PF2ytXZ/knK736VX1jKo6vJvGfZK8sluGv22tvWbG+gAAAAAAAMCXqNZGHyM2ZqSqByT5q673uiRXThn9X1tr3zBU9uIkd+56b0hyVZIjkxwxVOYFSX6wTahMVZ2b5MyhaVyf5Oiu/8NJ7t9au2TbBUly2mmntQsuuGCWUQEAAAAAANggVfXO1tpp44YdOuM0hluk3ar7m+S6kf5nJnlokm9Mcockt03y+SQfTPK3Sc5trf3dtJm31s6qqjck+d4k98ogcPtAkj/OoIXa1TMuBwAAAAAAANzMTKFZa+0tSbZ7Ntmksq9I8oqdlB2ZznlJztvtdAAAAAAAAGDUTM80AwAAAAAAgE0mNAMAAAAAAKD3hGYAAAAAAAD0ntAMAAAAAACA3hOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAAAAAEDvCc0AAAAAAADoPaEZAAAAAAAAvSc0AwAAAAAAoPeEZgAAQw6cff5+VwEAAACAfSA0AwAAAAAAoPeEZgAAAAAAAPSe0AwAAAAAAIDeE5oBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzQAAAAAAAOg9oRkAAAAAAAC9JzQDAAAAAACg94RmAAAAAAAA9J7QDAAAAAAAgN4TmgEAAAAAANB7QjMAAAAAAAB6T2gGAAAAAABA7wnNAAAAAAAA6D2hGQAAAAAAAL0nNAMAAAAAAKD3hGYAAAAAAAD0ntAMAAAAAACA3hOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAAAAAEDvCc0AAAAAAADoPaEZAAAAAAAAvSc0AwAAAAAAoPeEZgAAAAAAAPSe0AwAAAAAAIDeE5oBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzQAAAAAAAOg9oRkAAAAAAAC9JzQDAAAAAACg94RmAAAAAAAA9J7QDAAAAAAAgN4TmgEAAAAAANB7QjMAAAAAAAB6T2gGAAAAAABA
7wnNAAAAAAAA6D2hGQAAAAAAAL0nNAMAAAAAAKD3hGYAAAAAAAD0ntAMAAAAAACA3hOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAAAAAEDvCc0AAAAAAADoPaEZAAAAAAAAvSc0AwAAAAAAoPeEZgAAAAAAAPSe0AwAAAAAAIDeE5oBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzQAAAAAAAOg9oRkAAAAAAAC9JzQDAAAAAACg94RmAAAAAAAA9J7QDAAAAAAAgN4TmgEAAAAAANB7QjMAAAAAAAB6T2gGAAAAAABA7wnNAAAAAAAA6D2hGQAAAAAAAL0nNAMAAAAAAKD3hGYAAAAAAAD0ntAMAAAAAACA3hOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAAAAAEDvCc0AAAAAAADoPaEZAAAAAAAAvSc0AwAAAAAAoPeEZgAAAAAAAPSe0AwAAAAAAIDeE5oBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO/NHJpV1Z2q6oer6tVV9bGqur6qrq6qC6vql6rqxG3KH1ZVP1ZV76mqg1X12ap6e1WdVVU1w/yfUFVvqqorquqaqnp/Vf18VR0z6zIAAAAAAADAOIfOMlJVfWWSi5MMh1tXJTkqyT26v7Oq6vGttTePKX9skjclObV765okRyQ5vft7VFU9trX2hQnzPzfJmV3vF5Jcl+SkJD+R5ElVdf/W2iWzLAsAAAAAAACMmrWl2SFd9/wkT0hy29barZMcmeThST6S5Lgkr6qqO4wp/8IMArNPJ3lUkqO7sk/NIAB7ZJKfGTfjqvq+DAKzG5M8M8nRrbVjktwvyUeT3DXJeTMuBwAAAAAAANzMrKHZZ5Lcu7X2yNbaH7XWPpMkrbXPt9b+PIPg7Lokxyb5nuGCVXXvJE/ser+rtfaaNvDF1tpLkpzdDXt6Vd1+pOzhSc7pep/fWntea+36bt5vS/LYJC3J/arqUbMvNgAAAAAAANxkptCstXZla+3CKcM/kOQdXe+pI4Of3HU/2Fr7szHFz01yZQa3a3zcyLCHJLl9BsHYr4yZ77uTvKHrfcq0ZQAAAAAAAIBJZm1pNosruu4hI++f0XVfP65Qa+3aJG/teh80oex7W2sfnzDf100oCwAAAAAAADNZSGhWVYdm8IyxJHnv0PuV5KSu96Ipk3hf1z155P2t/lnKnlBVx29fWwAAAAAAAPhSi2pp9rQkd0hyY5LfH3r/2CRHda8vmVJ+a9iJI++fODJ8Wtlx5ZMkVXVWVV1QVRdcfvnlUyYFAAAAAABAH+06NKuqeyR5Ttf7gtbacKuwo4ZeXztlMtd03aNH3t8qP0vZceWTJK21c1trp7XWTjvhhBOmTAoAAAAAAIA+2lVoVlUnJnlVkiOTvDPJs0ZHGXrddjKLXZQFAAAAAACAmew4NKuq2yZ5fZK7JPlQkke01q4bGe3g0Osjp0xua9jBkfcPjgyfVnZceQAAAAAAANjWjkKzqrp1ktcluXuSjyV5SGvt0jGjXpXkc93rO06Z5NawT4y8f8nI8Gllx5UHAAAAAACAbc0dmlXVUUlem+S0JJ/MIDD72LhxW2styfu73lOmTPbkrvu+kfe3+mcpe3lr7VNTxgMAAAAAAICx5grNquqIJK9Oct8kV2QQmH1om2Jv7roPnTDNWyW5f9f7xgllT6mqSa3NHjahLAAAAAAAAMxk5tCsqg5L8idJzkjy2SQPa61dNEPRl3fdk6rqkWOGn5nk1kmuTfLKkWFvTHJZV88fGVOneyZ5SNf70hnqAgAAAAAAADczU2hWVYckeVmSb0tydZJ/31p71yxlW2vvTnJe1/viqnr41jSr6j8neW437Ndaa5eNlL0+yTld79Or6hlVdXhX/j4ZhGy3SPK3rbXXzFIfAAAAAAAAGHXojOPdL8nju9e3TPKqqpo07r+21r5h5L0zk9wtyalJzq+qa5IckuTwbvhrkvz0uIm11n6nqu7dTeN5
SX6xqq5PcnQ3yoeTPHHG5QAAAAAAAICbmTU0G26Rdqvub5LrRt9orV1VVfdN8vQkT0ry75Jcn+TdSX4vyQtba23SBFtrZ1XVG5J8b5J7JTkiyQeS/HGS57bWrp5xOQAAAAAAAOBmZgrNWmtvSTKxadmM0/h8BrdifO52404of15uus0jAAAAAAAALMxMzzQDAAAAAACATSY0AwAAAAAAoPeEZgAAAAAAAPSe0AwAAAAAAIDeE5oBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzQAAAAAAAOg9oRkAAAAAAAC9JzQDAAAAAACg94RmAAAAAAAA9J7QDAAAAAAAgN4TmgEAAAAAANB7QjMAAAAAAAB6T2gGAAAAAABA7wnNAAAAAAAA6D2hGQAAAAAAAL0nNAMAAAAAAKD3hGYAAAAAAAD0ntAMAAAAAACA3hOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAAAAAEDvCc0AAAAAAADoPaEZAAAAAAAAvSc0AwAAAAAAoPeEZgAAAAAAAPSe0AwAAAAAAIDeE5oBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzQAAAAAAAOg9oRkAAAAAAAC9JzQDAAAAAACg94RmAAAAAAAA9J7QDAAAAAAAgN4TmgEAAAAAANB7QjMAAAAAAAB6T2gGAAAAAABA7wnNAAAAAAAA6D2hGQAAAAAAAL0nNAMAAAAAAKD3hGYAAAAAAAD0ntAMAAAAAACA3hOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAAAAAEDvCc0AAAAAAADoPaEZAAAAAAAAvSc0AwAAAAAAoPeEZgAAAAAAAPSe0AwAAAAAAIDeE5oBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzQAAgH9z4Ozz97sKAAAAsC+EZgAAAAAAAPSe0AwAAAAAAIDeE5oBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzQAAAAAAAOg9oRkAAAAAAAC9JzQDAAAAAACg94RmAAAAAAAA9J7QDAAAAAAAgN4TmgEAADlw9vn7XQUAAADYV0IzAAAAAAAAek9oBgAAAAAAQO8JzQAAAAAAAOg9oRkAAAAAAAC9JzQDAAAAAACg94RmAAAAAAAA9J7QDAAAAAAAgN4TmgEAAAAAANB7QjMAAAAAAAB6T2gGAAAAAABA7wnNAAAAAAAA6D2hGQAAAAAAAL0nNAMAAAAAAKD3hGYAAAAAAAD0ntAMAAAAAACA3hOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAAAAAEDvCc0AAAAAAADoPaEZAAAAAAAAvSc0AwAAAAAAoPeEZgAAAAAAAPTezKFZVR1TVY+uqp+rqj+vqk9VVev+TppS7sDQeNP+Tttm/k+oqjdV1RVVdU1Vvb+qfr6qjplngQEAAAAAAGDUoXOM++Akr9zl/C6dMuyGSQOq6twkZ3a9X0hyXZKTkvxEkidV1f1ba5fssm4AAAAAAAD01DyhWZJcluSCJP+Q5ONJzp2ncGvtDnPOL1X1fRkEZjcmeVaS32ytXV9V903ysiR3TXJekm+ed9oAAAAAAACQzBeavbq19qqtnqo6sPDajKiqw5Oc0/U+v7X2vK1hrbW3VdVjk7wzyf2q6lGttVcvu04AAAAAAABsnpmfadZa++IyKzLBQ5LcPklL8iujA1tr707yhq73KXtYLwAAAAAAADbIzKHZPjmj6763tfbxCeO8rus+aA/qAwAAAAAAwAba09Csqt5eVVdV1bVV9ZGq+sOqmvYsspO77kVTxnlf1z2hqo5fTE0BAAAAAADok71uaXZ6khu71wcyuKXiW6vq16uqxox/Yte9ZMo0h4edOG6Eqjqrqi6oqgsuv/zyOasMAAAAAADAptuL0Oy6JL+d5AFJjmmt3SbJkUlOTfLqbpwfSvLsMWWP6rrXTpn+NUOvjx43Qmvt3Nbaaa2100444YQ5qg4AAAAAAEAfLD00a619srX2
tNbaW1trB7v3WmvtXa21Ryd5RTfqj1fVbUaKb7U+a8uuJwAAm+PA2efvdxVYcfYRAAAARu317RnHeVbXPSrJg0eGHey6R04pPzzs4MSxAAAAAAAAYIJ9D81aax9JsvWgsbuODN56Xtkdp0xieNgnFlUvAAAAAAAA+mPfQ7POpNswvq/rnjKl7Mld9/LW2qcWWisAAAAAAAB6Yd9Ds6q6S5Lju96LRwa/ueueUlWTWps9rOu+ccFVAwAAAAAAoCeWHppVVW0zynO67rVJ3jQy7I1JLsugnj8yZtr3TPKQrvelu6gmAAAAAAAAPTZXaFZVx2/9JTluaNBthodV1fB031JVz66qu1fVId10qqruXVWvTPIfuvGe21r79PD8WmvXJzmn6316VT2jqg7vpnGfJK/sluFvW2uvmWdZAAAAAAAAYMuhc45/+YT33z7Sf5fcdKvFO2fQmuw5SW6oqquSHJnkiKHxX5DkZ8dNuLX2O1V17yRnJnlekl+squuTHN2N8uEkT5xvMQAAAAAAAOAme/FMs2cmeWGSC5N8OsmxSW5M8sEkL0pyemvtB1prbdIEWmtnJfmODJ5xdjCDsO8DSX4hyb1aa5csdQkAAAAAAADYaHO1NGutbfd8snFlXpHkFfOWGzOd85Kct9vpAAAAAAAAwKi9aGkGAAAAAAAAK01oBgAAAAAAQO8JzQAAAAAAAOg9oRkAAHviwNnn73cVAAAAACYSmgEAAAAAANB7QjMAAAAAAAB6T2gGAAAAAABA7wnNAAAAAAAA6D2hGQAAAAAAAL0nNAMAAAAAAKD3hGYAAAAAAAD0ntAMAAAAAACA3hOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAAAAAEDvCc0AAAAAAADoPaEZAAAAAAAAvSc0AwAAAAAAoPeEZgCslANnn7/fVQAAAAAAekhoBgAAAAAAQO8JzQAAAAAAAOg9oRkAAAAAAAC9JzQDAAAAAACg94RmAADA2jlw9vn7XQUAAAA2jNAMAAAAAACA3hOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAS+a5OwAAAACw+oRmAAAAAAAA9J7QDAAAAAAAgN4TmgEAAAAAANB7QjMAAAAAAAB6T2gGAAAAAABA7wnNAAAAAAAA6D2hGQAAAAAAAL0nNAMAAAAAAKD3hGYAAEkOnH3+flcBAAAAgH0kNAMAAAAAAKD3hGYAAAAAAAD0ntAMAAAAAACA3hOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAAAAAEDvCc0AAAAAAADoPaEZAHM7cPb5+10FAAAAAICFEpoBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzQAAAAAAAOg9oRkAAAAAAAC9JzQDAAAAAACg94RmACzcgbPP3+8q0HP2QQAAAADmJTQDAAAAAACg94RmABNoqQIAAAAA0B9CMwAAAAAAAHpPaAYAALAEWq0DAACsF6EZAAAAAAAAvSc0AwAAABZOa0sAANaN0AwAAAAAAIDeE5oBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzQCAmzlw9vn7XQUAAAAA2FNCMwAAAAAAAHpPaAYAAMDG0FoaAADYKaEZAAAAAAAAvSc0AwAAAAAAoPeEZgAAAAAAAPSe0AyAjeEZJgAAAADATgnNAAAAAAAA6D2hGQAAAGxDi3YA1o3PLoD5Cc0AAAAAAADoPaEZAAAAAAAAvSc0AwAAAAAAoPeEZgAAAAAAAPSe0AwA1oAHOAMAAADAcgnNAAAAAAAA6D2hGQAAAAD/xl0OAIC+EpoBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzWAXlnGfd/eOBwAAAACAvSc0AwAAAAAAoPeEZkDvaM0HAAAA0G/+PwSMIzQDAAAAAACg94RmAABAL7iaGAAAgGmEZgAAAAAAAPSe0AwAoOe0vgEAAAAQmgEAAAAAAMDsoVlVHVNVj66qn6uqP6+qT1VV6/5OmqH8YVX1
Y1X1nqo6WFWfraq3V9VZVVUzlH9CVb2pqq6oqmuq6v1V9fNVdcysywAAAAAAAADjzNPS7MFJ/jTJTyb5tiS3m7VgVR2b5G1JnpvknkkqyRFJTk/yP5L8WVUdOqX8uUnOS3JGkmOTfDHJSUl+Isl7quqOcywHAAAAAAA75Bbvy2X9wv6Z9/aMlyV5bZKfSXLWHOVemOTUJJ9O8qgkRyc5MslTk1yX5JHdNG+mqr4vyZlJbkzyzCRHt9aOSXK/JB9NctcMAjUAAAAAAADYkXlCs1e31r6stfaI1to5Sf5ylkJVde8kT+x6v6u19po28MXW2kuSnN0Ne3pV3X6k7OFJzul6n99ae15r7fokaa29Lcljk7Qk96uqR82xLAAAAAAAa2udWiOtU12Bfps5NGutfXGH83hy1/1ga+3Pxgw/N8mVGdyu8XEjwx6S5PYZBGO/MqZO707yhq73KTusHwAAAAAAAD037+0Zd+KMrvv6cQNba9cmeWvX+6AJZd/bWvv4hOm/bkJZAAD2iCtHAQAAgHW31NCsqirJSV3vRVNGfV/XPXnk/a3+WcqeUFXHz1dDAAAAAAAAWH5Ls2OTHNW9vmTKeFvDThx5/8SR4dPKjiufJKmqs6rqgqq64PLLL58yKWCdjLZq0MoBAABgNfh9xqrZpH1yk5YFmM7xvveWHZodNfT62injXdN1j55Qfpay48onSVpr57bWTmutnXbCCSdMmRQAAAAAAAB9tOzQrIZet12U30lZgKVxlQcAAAAAwGZZdmh2cOj1kVPG2xp2cOT9gyPDp5UdVx4AAAAAAAC2tezQ7Kokn+te33HKeFvDPjHy/iUjw6eVHVceAAAAAAAAtrXU0Ky11pK8v+s9ZcqoJ3fd9428v9U/S9nLW2ufmq+GAAAAAAAAsPyWZkny5q770HEDq+pWSe7f9b5xQtlTqmpSa7OHTSgLAAAAAAAAM9mL0OzlXfekqnrkmOFnJrl1kmuTvHJk2BuTXJZBPX9ktGBV3TPJQ7rely6ktgA9c+Ds8/e7CgAAS+X7DsDyOMcCsEnmCs2q6vitvyTHDQ26zfCwqvq36bbW3p3kvK73xVX18G5ah1TVf07y3G7Yr7XWLhueX2vt+iTndL1Pr6pnVNXhXfn7ZBCy3SLJ37bWXjPPsgAAAAAAAMCWeVuaXT70966h998+MuxOI+XOTPLOJLdLcn5VfS7J55K8JMkRSV6T5KfHzbC19jtJXtjV9XlJrq6qq5O8Lcldknw4yRPnXA4AAABgxWnBwiT2DQBgGfbi9oxprV2V5L5Jzk5yYZKW5Pok70jyPUke3Vr7wpTyZyX5jgyecXYwyaFJPpDkF5Lcq7V2yVIXAAAAAAAAgI02V2jWWqsZ/y4eU/bzrbXnttbu1Vo7urV269bafVpr57bW2gzzPq+19qDW2m1ba0e01r62tfaTrbWr51kGAAAAAFhFWtDtH+segGSPWpoBAAAAAADAKhOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAAAAsAI8W4152WdWm+2zfoRmAAAAAAAA9J7QDACAleNqPAAAAGCvCc0AAAAAAADoPaEZAAAAAAAAvSc0AwDm5tZ5AADAbvhNAcAqEpoBAAAAAADQe0IzAAD2hauL2ST2ZwAAgPUnNAMAAAAAAKD3hGZsvD5c9duHZWTz2G8BAADYa36LAjCN0AwAAAAAAIDeE5oBAAAAM9FCAzaX4xsAhGYAAAAAAAAgNGO9uOoJVofjEcC5kPVkv4XxHBsD1gMA0GdCMwAAAAAAAHpPaAYAAGwcLSUA4Ev5bFxNtsvqsC2ARGgGAAAAAAAAQjMAAAAAAAAQmgEAAAAAANB7QjMAAAAAAAB6T2gGAADsmAemA7BJfK4BQL8JzQAAAAAAAOg9oRnAErg6EWDzOdezqezbAABAXwnNAAAAAAAA6D2hGczBVbcAAAAA+B8RwGYSmgEAAAAAANB7QjMANp4rAAEAAACA7QjNAAAAAAAA6D2h
GQCwdrQeBPaDcw8A7D+fxwAsk9AMAAAAAACA3hOaAQAAAMCctHiC/eHYA5ZJaAYAAAAAAEDvCc0AAAAAAADoPaEZ9NSimrJrEg8AsH98FwPYGedPAGAcoRkAAAAAAAC9JzQDvoSr7Wa36HVl3QMAAKwvv+kAYP0JzQAAAAAAAOg9oRm74ioq2J7jBObnuAH6zDkQYLM5zwPA6hKaAQAAAAAA0HtCMwAAYOW5Kh8AFsNnKsBsnC/7SWgGAAAAAABA7wnNAABgjbjacXbWFQAAAPMQmgEAAAAAANB7QjMAgDWkBQ3AZnFeh51x7AAAiyQ0AwAAAAAAoPeEZqwkV4oBrBfnbWZlX+kH2xkA2ES+49B3jgH6QGgGAAAAAABA7wnNAAAAAAAA6D2hGRtB02DWgf0UABbP5+vyWLe7Zx0Ce8k5B8ab5dhw/ABbhGYAAAAAAAD0ntAM6BVXDgHQBz7vAAAA9pbfYZtBaAYAAAAAAEDvCc0AgJXnai0AAGCR/MZgndl/YXmEZgAAAAAAAPSe0AwAgJXk6snlm3cdL2KbrPp2XfX6AQDAuvDdmnUkNAMAAAAAAKD3hGbsG1caACyG8ykA9Ne47wG+GwCwn3wOAetMaAYAAAAAAEDvCc0AYMO5yg+AWfi82Hy28XqwnSazbpiVfYVlsn/BZhOaAQAAAAAA0HtCMwAAAAAAAHpPaAYAAACwodxGDABgdkIzAAAAAAAAek9oBhtuN1cVuiIRgD7zObj3rHMAABbB90r6wr6+eEIzAAAAAAAAek9oRm9I3QFYtr5/1vR9+VeF7QAAAAA7IzQDAAAAAACg94RmAADQ0UoLYGBdzofrUk+wrwLAehCaAQAAAAAA0HtCM1aeq7EAYDqflQDLs4nn2E1cJgAAWAShGQAAAAAAAL0nNAMAWGF9bA3Qx2UGWATnTzbZKuzfq1AH2E+OAfpq0fu+Y2m1Cc0AAAAAAADoPaEZAAAAAAAAvSc0AwAA2HBuAQPAfvEZxDLYr4BlEZoBAAAAAADQe0IzgBXjaimAyZwjV5dtA6wy56idsd5g9c1ynDqWAWYnNAMAAAAAAKD3hGYA7Jt1vdptXeu9Cqy7zWJ7Ls6Bs8+3PgEAesb3v+WzjoF5Cc0AAAAAAADoPaEZwD5z1RN7Zdn7mn0Z2A3nENh7jjsA+FI+GwGhGQAAAAAAAL0nNAMAmIMrDwEAANhrfouuFttjcwnNAAAAAAAA6D2hGQAAbKhVufpxVeoBAAAA0+xJaFZVT62qts3fwSnlD6uqH6uq91TVwar6bFW9varOqqrai2UAAAAAAABgc+11S7Mbklw65e9mqurYJG9L8twk90xSSY5IcnqS/5Hkz6rq0KXXHKBHtAhYPOt0+axj2J7jBPbeIo+7ZR7D63x+WOe6A0Df+Nxm1e11aPa21todJvzdbUKZFyY5NcmnkzwqydFJjkzy1CTXJXlkkp/Zg7oDAAAAAACwoVb6mWZVde8kT+x6v6u19po28MXW2kuSnN0Ne3pV3X5/agkAAAAAAMC6W+nQLMmTu+4HW2t/Nmb4uUmuzOB2jY/bs1oBAMAGcYsUVp19FABWk89oYNOsemh2Rtd9/biBrbVrk7y1633QntQIAAAAAACAjbPXodkpVXVRVV1bVVdX1Xur6teq6i6jI1ZVJTmp671oyjTf13VPXnRlAegvV8sBAAAAQL/sdWh2fJKvTXJNklslOSXJDye5qKqePDLusUmO6l5fMmWaW8NOXFw1AQAAAAAA6JO9Cs0uSfLTSe6e5FattdslOTrJIzJoKXZEkt+vqgcMlTlq6PW1U6Z9Tdc9etIIVXVWVV1QVRdcfvnlO6k/LJ1WLawL+yoALJ7P181m+7Joq7pPrWq9AGA/+FxcT3sSmrXWXt9a+9nW2kWttc93713fWnttkvsm+T9JDknyS0PFangSu5z/ua2101prp51wwgm7mRQAAAAAAAAbaK9vz3gzrbUrkzyn6z29qrZSrYNDox05
ZRJbww5OGQdgrbgSBQAAgFn4/bgebCeA9bDvoVnn77puJTnQvb4qyee613ecUnZr2CcWXy0AAAAAAAD6YFVCs5vdirG11pK8v3vvlCllT+6671tCvQAAgJ5yRfjiWJcArAOfV7Bcq3KMrUo9WE2rEpp949Drjw69fnPXfei4QlV1qyT373rfuIR6AQAAAAAA0ANLD82qqrYZfmySs7vev2+tXT40+OVd96SqeuSY4mcmuXWSa5O8crd1ZT4S+c1ie8JqcmwCAAAAwN7Yi5Zmd66qd1TVf6mqO229WVWHVdW3JfnbJF+d5MYkzx4u2Fp7d5Lzut4XV9XDu7KHVNV/TvLcbtivtdYuW/aCAAAAAAAAsJkO3aP5fFP3l6q6Lsnnkhyb5Jbd8GuSfG9r7U1jyp6Z5G5JTk1yflVdk+SQJId3w1+T5KeXV3UAAAAAAAA23V60NLs0yQ9m0GLsgxkEZLfuuhdk0Frs5NbaH4wr3Fq7Ksl9M7iF44VJWpLrk7wjyfckeXRr7QtLXgYAADaIW5/CanOMwt5wrAEAfKmltzRrrV2b5De7v51O4/MZhGvP3W5cAAAAAAAAmNdetDQDoMdcvbq3rG/WlX0XANhPvovA/nIMAqtCaAYAAAAAAEDvCc2AXduUq4E2ZTn2i/UHAAAAAMvn/3DLIzQDAAAAAACg94Rm0AOuPAAAAFgvfsfBfBwzACyC0AwAAAAAAIDeE5oBLImr3IBNtqxznHPn/rL+95b1DbAanI/Za/Y5Vp19lD4TmgEAAAAAANB7QjNWmqsa4CaOB/aD/Y51Zv+Fm3NcAADrZJ2+u6xTXYHJhGYAAAAAAAD0ntAMAAAAAACA3hOaAQCsALfy2DvWNevM/gsAwCx8b4SdEZoBAAAAAADQe0IzgBXmqiAAAABgi/8TACyX0AwAAAAAAIDeE5oB0GvDV+mt8xV761x3Nt+m75+bvnx9Z/vC3nG8AbDXfPast3XafutU174TmgEAAAAAANB7QjOWSoIOAAAsgt8WAIvjnAoA4wnNAAAAAAAA6D2hGfSQK8pYdfZRGHAsQP847gEAZuN7U/9s4jbfxGVad0IzAAAAAAAAek9oxlqSwFsHLIf9CuBLOS8CLMe6n1/Xvf4AAIwnNAMAAAAAAKD3hGZM5eo5AABgWfzeAAD4Ur4fwf4SmgEAAAAAANB7QjMAAAAAAAB6T2gG0FOa+7MK7IcAAAAkfh8Cq0FoBgAAAAAAQO8JzWCDuCIHAOg734eASdbp/LBOdQW255iG/eP4Y15CMwAAAAAAAHpPaMaekuyvnkVvE9t4PtYXzM9xA7AanI8BWCSfK5vPNl4dtgVMJjQDAAAAAACg94RmwEpwhQsAAAAA68L/smAzCc0AAAAAAADoPaEZsHCutAEAAIDF8TsboF+c9/eP0AwAAAAAAIDeE5oBAOwRV4oBANxcX78jbfpyb/ryLZr1xTj2C9h7QjMAAAAAAAB6T2gGAAAAAABA7wnNAICV4vYTsNocowDAfvAdBIC9IDQDAAAAAACg94RmAMC/cfUmAHvB5w2wV5xvAIB5CM0AAAAAAADoPaEZwAZw9SR7wX4GbDLnOAD2gs8bAFhtQjMAAAAAAAB6T2gGwESuggSAfvCZD1/KMQEAs1v056bP4dlYT8shNAMAAAAAAKD3hGbMTYK9fNYx9Nu6nAPWpZ7b2ZTlWCbrCGD9OZfDanOMro5lbwvbGmC1Cc0AAAAAAADoPaEZMBdXRLFT9h1g0ZxXYHkcX7D6HKdMY/8AWB7n2M0mNAMAAAAAAKD3hGYAAAAAAAD0ntAMekTTYVg9q3Bc7lcdVmHZAQBG+Y6yfmwzWE19OTb7spzQF0IzAAAAAAAAek9oBkASV0YBA84FzGrcvjL63rrtT+tWX4B14hzbb8vY/vYpYFmcX/pNaAYAAAAAAEDvCc1gB1xtAKyDTTxXbeIysTfsOwDMwucFm8Y+DQDzEZoBAAAAAADQe0IzWCGuAANg1fhs
Alg+59r1ZxvC3nLMwd5xvNE3QjMAAAAAAAB6T2gGAPSCq+OYlX2FvbZO+9w61RUARq3b59he13fd1s889mrZNnkdQl8IzQAAAAAAAOg9oRkbw5UcAACwWXzHZ172GWBT7eX5bZPPpfMs2yavh90aXjfWE5tGaAYAAAAAAEDvCc0g631FxDrXfRH6vvysF/srwOZwTgdgP6zS589O6rJK9d8rWuRsllXahrPWZZXqDOtAaAYAAAAAAEDvCc0AAAAAAADoPaEZe2ZTmwyvW33pL/vqarE9dm9V16F6Mclut8G08rYvwHjOj+vJdoPdcxyxCPYj+khoBgAAAAAAQO8JzVg6VySwn+x/m8X23H+2AQDs/efhpn3+btryAOvPeQlW26oeo8uq16oub18IzQAAAAAAAOg9oRkwlSsbYPE29RmPu9GnZQWYxLmQcewX6882hPXk2GVd2Xdhd4RmAAAAAAAA9J7QjF4ZvtLCVReTbeq6mWW59mvZN3Wds3z2nb1hPcPeWudjbp3rzvpZ9P5m/wX6ZifnvT6dK/u0rABbhGYAAAAAAAD0ntCMpVmHZ/a4YmZn9nq9rXILMea3iG01zzRGx7WvsGrsk5tt2va17XfPOmQTbbdfr+t+rzXHTcYt1yr+xgKYZDe/yTfNpi8f9JHQDAAAAAAAgN4TmjGzSVdOrNsVFbut77ot75Z1rTf9scqtU9ft+Fl2fddtfezGIpe1T+tt0VZt3e1FfVZtmcE+ufpWdRutar12YxOXaTcOnH2+dbIL1h2sB8cq89rLluX2z8USmgEAAAAAANB7QjMAAAAAAAB6T2jGntBEdDMNb9dN38Zby7fT5Vzm+tn0db8bq7i9Vllfl3tdLHr7bNL2XvdlWcf6r1qdV60+q2CdPwNX/XvTKqyjTdan9TvrbQ3XbZ2sW31XiXPU8ri99vrYOjfuZn2u+rZY1Ufg7Pf857Wo+q7bcrN8QjMAAAAAAAB6T2gGrJVZr8Rc16tEVuHKwnVdd4y3CvvUqtnrdTLv/FZhfW/afjNrXZbVgnqV1kWiPqs2//20X8u+6i3Jljm9RdrrVhOrvC6WbR0/y0fNsi1Xsd57bSfrwHpbrlVYv6tQh1W3l+todF62z+pbhW20CnXYrU1YhnUhNAMAAAAAAKD3hGbMZCdXaO8FCft6W6X79/f9SqVpy9u3dbFIq77uVr1+i9KX5dwEWsre3LTWAPt5RS8Dvvtuhv1qXbuqz9dZ1WVhfe3H/rEpv+9W4TfzJre+X4bRVp2r2lJ13Hz6sH12apO/K4yb5ybejQDmITQDAAAAAACg94RmrIRNuJJg3JUYW/2bsHx7bZWfp7MK23Nd962dXlm1Css5us63q9M6Pa9l2Xb6/K9FtHJe1av4d1NutzZlvvv9PJb9nn9frNL3gXXatqvQ6mAvt91+fiazOc89m3W/Xmar6FU65203rXX5rbWo1jTrvG8n61//7ez374FlWNd6z2Lcb+pZP8vX7c4Uq/x7dNlW4fso7JTQDAAAAAAAgN4TmrGRdnvV/zJbtoxOe12vclvXeu/Ebuq9n/cJ36tWHcsyy5Vm69bCpy/W4TlLi2glsd25fJlXvO7kqsxZ6rSO56dVPp538hmwDsfPflvFc/9Oro7eaSvcSe+tw1X2+93qb1Et17Yrt+jvjjsdb5H12Ivz2bzLNK2Oy97Wy5z2on5n7fS8sx+fQ4s+NyyiNfhuf4tMGj7Pvrkq5+4tq37uXca0FzX/ZU17kf+XmeVcsd0+vd/77KKPm0X9ZtvN78bdzHfe+aziutvN+Dv9brxf+/Skz7D9Pq76am1Cs6q6Q1U9v6r+paquq6pLq+rVVfXg/a4bAAAAAAAA620tQrOqukeS9yb5wSR3TXJ9kuOTPDLJX1bV2ftYPUbs5iqfWZL+ea4em+fKtXnrNO/VUju5SmK3V6bNM69ZyixiGba7CnLalUGj4+2mHpPqMu39adPfSbmdzGPe90bX6aKv9pn1SrNl
Xl24m+242yvlRvetWff5SdOZ9xjc6ysLdzK9ea5y3o+rNHd6FeW885112+72qubRaezXeX3a+Lutw07LLGJau5nvvOttJ9Pd7baYNP5ur9Kcdh7Y7nvBvPNd1JW1i9wvdrrf7GQd7eRzbRGfl8PvjftcnPXcNs9+Om3/mlSfWeo663xH6z1tvFnGn6fu8xq3rKPDdzLNnZRd1Pe23Xy+LvozZJ59dhHznrY9d7ptdzrebr6/znpuW+SxMK0+08adtcyiPttnKb+TY3DW7wnD3WnLPuk8vJPz6bi6zLqfzzPtecyzb+/083UZ9dluu00qs5t5z2I35/9l2Ov5zvMdbZZ1Nc8+N+2cOu+5ZLf7+rTPkZ1+t5g0vUnzm1an7eow6/fKWcvu5ty+X8dOX6x8aFZVRyT5syS3S/LuJHdvrd06yXFJfiVJJfnFqnrY/tUSAAAAAACAdbbyoVmS70ly5yQHkzyqtXZRkrTWrmqt/WiSV3Xj/eL+VK+fFnF14nbT3C+LuEJot1dWzTq9ea5U2a1FXumx2+kt0rKu+FvEfJd5JdZeX/W27PKj09jJcbzbfXxRFn1F8KTx9+MY3Ok8d3rl2k7mt9v1uig72Q/mvaJuN/biatRFTXPWqzZ3u0zL+vxdxLRX5TN3N9btc2va59AmbI9hs37vXda8lzXNvVqmTdsfkvm/h00af9nrZtZz626uoJ+3/LzT3kur9lm03+tjHvtZ153+7tzr73qr+Bt3r6e/F3y2zU5Ln9nM87tzN58dq/D/jHnmv8z67fey98U6hGZP6bova619fMzwX+66X19VJ+1RnQAAAAAAANggKx2aVdUxSU7tel83YbR3JLmye/2gpVcKAAAAAACAjbPSoVmSr83gmWVJctG4EVprNyb5YNd78l5UCgBg07ntAwCw6vr8faXPyw6wU86dzGLVQ7MTh15fMmW8rWEnThkHAAAAAAAAxqrW2n7XYaKqenKSl3a9t2ytfWHCeC9N8uQkr2+tfeuY4WclOavr/Zrc1DINAAAAAACA/rhza+2EcQMO3euazKm2H2V7rbVzk5y7iGkBAAAAAACweVb99owHh14fMWW8I8eMDwAAAAAAADNZ9dBs+Dlmd5wy3tawTyyxLgAAAAAAAGyoVQ/NPpBk66Frp4wboapukcFzypLkfXtRKQAAAAAAADbLSodmrbWrk1zQ9T50wmjflOTW3es3Lr1SAAAAO1RV51RVq6oXjxl2cTfsgXtesSWpqgd2y3TxDstv3DoBAABW10qHZp2Xdd2nVNWJY4b/aNd9Z2vtg3tUJwAAgI1WVffqQr6n7nddAAAA9sI6hGb/I8lHkxyT5DVVdXKSVNUxVfXfkzyuG+/H96l+AAAAi/AvST6Y5Jr9rkjnXkl+OslTdzGNazJYpn9ZQH0AAACW6tD9rsB2WmvXVtVjMrj14tcnuaiqrkpydAahX0vy46211+9jNQEAAHaltfbg/a7DorXW/j7JSftdDwAAgFmsQ0uztNYuTHL3JL+R5MNJDk9yRZLzkzy0tfZL+1g9AAAAAAAA1txahGZJ0lr7ZGvth1prd2ut3aq1dvvW2iNba2/c77oBAAD9VVVfW1W/W1X/XFWfq6rPVtU/VdVvVNWpc0zn4qpqVfXACcMPq6rvr6q3VtWnq+r6qvpoVb2oqr52QpkXd9M8p6oOqaofrqoLq+qabhqvqarTxpRrSX6v6/2WbhrDf2PrOGY6D+zGv3jKOE+pqndU1cGuTm+qqkfMMn0AAIBFWvnbMwIAAKyqqvqBJL+W5JDurc8lOSyDO2XcPck9kjxwAfM5McmfJ7ln99aN3bzulOS7kjypqp7SWvuTCZM4NMlrknxbkhuSXJ/kuCSPSPLgqnpQa+3tQ+NfmuSIJMd24396ZHqf3+0yJUlVvSDJ07reG7t5PTDJGVX1Q4uYBwAAwKzWpqUZAADAKqmqJ2RwC/lDkvxRkpNba0cnOSrJHZP8xyTvXMB8bpnkTzMI
zP46yQOSHNFaOzbJHZL8SpJbJfmDqrrbhMk8Lck3JvmOJEe31o7ppvferuzzh0durd0hyVZo9bbW2h1G/t62gOV6Sm4KzJ6X5HatteOSnJjk97v3TtjtfAAAAGYlNAMAAJhTF2T9atf78tbaE1pr70+SNvCJ1tpLW2vPWMDsvjPJNyT5hyQPa629tbX2+W5el7bWfjTJ7yQ5MsnTJ0zjNkke01o7b6jsPyZ5ajf8G6rqzguo60yqqpKc0/W+pLX2zNbaZ7t6XdrV668yWCYAAIA9ITQDAACY34OTfEWSLyZ55pLn9Z1d97daa9dPGOdlXfehE4a/tbX2N6NvttbemeT/dr2n7LyKc7tXkn/Xvf7F0YGttZbkOXtYHwAAAM80AwAA2IHTu+6FrbWPL2smVXVoBrdVTJJfrarnThh165lqXzlh+D9Mmc3HMwgAj5u/hjv29V33stbaByeM87YkX4jfrQAAwB7x4wMAAGB+X9Z1P7bk+dw2yWFDr7dzxIT3r55S5rque8tZK7UAW88qmxg4ttaur6pPZfDcNgAAgKVze0YAAID51R7NZ/g32z1ba7Xd3x7Va69s2vIAAAArTGgGAAAwv0923TsveT5XZPDctCQ5ecnz2kuXd907Thqhqg5Lcru9qQ4AAIDQDAAAYCfe0XXvUVVfvqyZtNZuSHJB1/u4Zc1nghu77jJae72r635ZVX31hHHuG48UAAAA9pDQDAAAYH5vzOB5XIck+eUlz+vFXffxVXXGtBGr6rgFzveqrnubBU5zy3uS/J/u9bNGB1ZVJTl7CfMFAACYSGgGAAAwp64F2DO63idV1XlVddLW8Ko6sarOrKrfWMDs/mcGLdtukeQ1VfVDVXXboXndvqqeVFVvSfJDC5jflou67slV9U0LnG5aay3JOV3vd1fVc6vqNklSVV+W5EVJHpTkmkXOFwAAYBqhGQAAwA601v53BsHZjUmekOT9VXV1VV2T5JIk5ya5xwLmc0OSxyT52yRHJvn1JJ+qqk9X1dVJLk3ysiTfkqTtdn5D8/1Qkr/O4BaJ76iqK6rq4u7v9AVM/6VJfqvr/bF0y5TkE0memuRHc9OzzwAAAJZOaAYAALBDrbVfTXLvJL+X5OIkt0xyXZJ/TPL8JE9f0HwuyyAUe0qS1ya5LMnRGTxv7AMZtEZ7eJLnLGJ+Qx6X5LeTfKSb3527v1stYuKtte9P8h+T/F2S6zNYnr9K8sjW2iJa6QEAAMysBnfFAAAAAAAAgP7S0gwAAAAAAIDeE5oBAAAAAADQe0IzAAAAAAAAek9oBgAAAAAAQO8JzQAAAAAAAOg9oRkAAAAAAAC9JzQDAAAAAACg94RmAAAAAAAA9J7QDAAAAAAAgN4TmgEAAAAAANB7/z8DHWc9S1DpmAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 2160x720 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "PLOTFILENAME = \"Messages.png\"\n",
    "\n",
    "# group the message rows per client; the bar heights are the group sizes\n",
    "groups = pd.DataFrame(dataListAnonymized).groupby(CLIENT).groups\n",
    "x = groups.keys()\n",
    "y = [len(rows) for rows in groups.values()]\n",
    "\n",
    "font = {\"size\":24}\n",
    "matplotlib.rc(\"font\",**font)\n",
    "fig, ax = plt.subplots(figsize=(30,10))\n",
    "ax.bar(list(x),y)\n",
    "ax.set_title(\"messages per client\")\n",
    "ax.set_xlabel(\"client id\")\n",
    "# one bar per client: hide the x ticks and their labels\n",
    "ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
    "fig.savefig(PLOTFILENAME)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "messagesDf = pd.DataFrame(dataListAnonymized)\n",
    "valueFrequencies = showValueFrequencies(messagesDf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1982 789 1595 387 240\n"
     ]
    }
   ],
   "source": [
    "# sums of valueFrequencies over all keys, keys < 5, keys < 50 and keys >= 50, then the largest key\n",
    "messageCounts = list(valueFrequencies.keys())\n",
    "print(sum(valueFrequencies[k] for k in messageCounts),\n",
    "      sum(valueFrequencies[k] for k in messageCounts if k < 5),\n",
    "      sum(valueFrequencies[k] for k in messageCounts if k < 50),\n",
    "      sum(valueFrequencies[k] for k in messageCounts if k >= 50),\n",
    "      max(messageCounts))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* 1983-1982 = 1 client with no mails\n",
    "* 789 clients with 1 to 4 mails\n",
    "* 1595-789 = 806 clients with 5 to 49 mails\n",
    "* 387 clients with 50 mails or more (max 240)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Message': 45469,\n",
       " 'Sender': 45469,\n",
       " 'Recipients': 45469,\n",
       " 'DateSent': 45469,\n",
       " 'Subject': 45469,\n",
       " 'Body': 45469,\n",
       " 'IsReplied': 45469,\n",
       " 'Location': 45469,\n",
       " 'TreatmentStep': 7017,\n",
       " 'Attachments': 1894}"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "QUERYMESSAGES = \"./Messages\"\n",
    "\n",
    "def showMessageTextFieldFrequencies():\n",
    "    \"\"\"Count the XML tags found below the Messages sections of the data files.\n",
    "\n",
    "    Reads every file in DATADIRANONYMIZED whose name matches FILEPATTERN and\n",
    "    returns a dict mapping each tag name to its number of occurrences.\n",
    "    \"\"\"\n",
    "    tagCounts = {}\n",
    "    for fileName in sorted(os.listdir(DATADIRANONYMIZED)):\n",
    "        if not re.search(FILEPATTERN, fileName):\n",
    "            continue\n",
    "        root = readGzippedXmlFile(DATADIRANONYMIZED+fileName)\n",
    "        for section in root.findall(QUERYMESSAGES):\n",
    "            for element in section.findall(\".//*\"):\n",
    "                tagCounts[element.tag] = tagCounts.get(element.tag, 0) + 1\n",
    "    return(tagCounts)\n",
    "\n",
    "# Show the tag counts ordered from most to least frequent.\n",
    "tags = showMessageTextFieldFrequencies()\n",
    "dict(sorted(tags.items(), key=lambda tagCount: tagCount[1], reverse=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Order the messages by client and, within a client, by their DateSent value.\n",
    "dataListAnonymized = sorted(dataListAnonymized,\n",
    "                            key=lambda message: (message[\"client\"], message[\"DateSent\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of consecutive words that make up one phrase.\n",
    "N = 20\n",
    "\n",
    "\n",
    "def makePhrase(wordList, index):\n",
    "    \"\"\"Return the words wordList[index:index+N] joined by single spaces.\"\"\"\n",
    "    phraseWords = wordList[index:index+N]\n",
    "    return(\" \".join(phraseWords))\n",
    "\n",
    "\n",
    "def addPhraseToRefs(phraseRefs, phrase, msgId):\n",
    "    \"\"\"Store msgId under phrase in phraseRefs, replacing any earlier entry.\"\"\"\n",
    "    phraseRefs[phrase] = msgId\n",
    "\n",
    "    \n",
    "def countPhrases(phraseRefs, message, msgId):\n",
    "    \"\"\"Find the word spans of message that repeat phrases of earlier messages.\n",
    "\n",
    "    Every window of N consecutive words is looked up in phraseRefs. Unseen\n",
    "    phrases are registered with msgId; phrases registered by a message with a\n",
    "    smaller id mark their window as duplicate text. Phrases first seen in this\n",
    "    same message are neither registered again nor marked.\n",
    "\n",
    "    Returns a list of (start, end) word index pairs, end exclusive, in\n",
    "    ascending order. Overlapping windows are merged, so the spans never\n",
    "    overlap. phraseRefs is updated in place.\n",
    "    \"\"\"\n",
    "    words = message.split()\n",
    "    duplicates = []\n",
    "    for i in range(0, len(words)-N+1):\n",
    "        phrase = makePhrase(words, i)\n",
    "        if phrase not in phraseRefs:\n",
    "            addPhraseToRefs(phraseRefs, phrase, msgId)\n",
    "        elif phraseRefs[phrase] < msgId:\n",
    "            if duplicates and i < duplicates[-1][1]:\n",
    "                # the window overlaps the previous span: extend that span to the\n",
    "                # end of this window instead of opening an overlapping one\n",
    "                duplicates[-1] = (duplicates[-1][0], i+N)\n",
    "            else:\n",
    "                duplicates.append((i, i+N))\n",
    "    return(duplicates)\n",
    "\n",
    "\n",
    "def prepareText(text):\n",
    "    \"\"\"Normalize a message text for phrase comparison.\n",
    "\n",
    "    Line tags and runs of two or more '>' are replaced by a space, after which\n",
    "    the tokens produced by word_tokenize are joined with single spaces.\n",
    "    \"\"\"\n",
    "    for pattern in (\"</*line>\", \">>+\"):\n",
    "        text = re.sub(pattern, \" \", text)\n",
    "    tokens = word_tokenize(text)\n",
    "    return(\" \".join(tokens))\n",
    "\n",
    "\n",
    "def removeDuplicates(text, duplicates):\n",
    "    \"\"\"Return text without the words covered by the given spans.\n",
    "\n",
    "    text is split on whitespace; duplicates holds (start, end) word index\n",
    "    pairs with end exclusive, interpreted like the slice words[start:end].\n",
    "    Spans may overlap: each covered word is removed exactly once. Deleting\n",
    "    the slices one after another would shift the indices of the remaining\n",
    "    spans and also remove words that no span covers.\n",
    "    \"\"\"\n",
    "    words = text.split()\n",
    "    covered = set()\n",
    "    for duplicateStart, duplicateEnd in duplicates:\n",
    "        # slice.indices() clamps the pair to the word list exactly as slicing does\n",
    "        covered.update(range(*slice(duplicateStart, duplicateEnd).indices(len(words))))\n",
    "    return(\" \".join(word for position, word in enumerate(words) if position not in covered))\n",
    "\n",
    "\n",
    "def processCorpus(messagesIn, client):\n",
    "    \"\"\"Return copies of the messages with text repeated from earlier messages removed.\n",
    "\n",
    "    messagesIn is a list of message dicts; the messages are processed in list\n",
    "    order and each BODY is compared against the phrases of the messages before\n",
    "    it. A message whose BODY is missing or is not a string gets an empty BODY.\n",
    "    client is not used; it is kept so existing calls keep working.\n",
    "    \"\"\"\n",
    "    phraseRefs = {}\n",
    "    messagesOut = []\n",
    "    for msgId, messageIn in enumerate(messagesIn):\n",
    "        try:\n",
    "            textIn = prepareText(messageIn[BODY])\n",
    "        except (KeyError, TypeError):\n",
    "            # only a missing or non-text body counts as empty; any other failure\n",
    "            # (for example in the tokenizer) must not silently blank the text\n",
    "            textIn = \"\"\n",
    "        duplicates = countPhrases(phraseRefs, textIn, msgId)\n",
    "        messageOut = dict(messageIn)\n",
    "        messageOut[BODY] = removeDuplicates(textIn, duplicates)\n",
    "        messagesOut.append(messageOut)\n",
    "    return(messagesOut)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AdB1987\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "45469"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Group the messages per client in one pass; the messages of a client keep their relative order.\n",
    "messagesByClient = {}\n",
    "for message in dataListAnonymized:\n",
    "    messagesByClient.setdefault(message[\"client\"], []).append(message)\n",
    "clients = sorted(messagesByClient)\n",
    "\n",
    "# Remove repeated text per client, visiting the clients in sorted order.\n",
    "dataListAnonymizedWithoutDuplicateText = []\n",
    "for client in clients:\n",
    "    squeal(client)\n",
    "    messagesFromClient = messagesByClient[client]\n",
    "    dataListAnonymizedWithoutDuplicateText.extend(processCorpus(messagesFromClient, client))\n",
    "\n",
    "len(dataListAnonymizedWithoutDuplicateText)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the messages without duplicate text with answerDataListToDf and pass the\n",
    "# result to saveAnswerDataDf, with OUTFILENAME as the outFileName argument.\n",
    "saveAnswerDataDf(answerDataListToDf(dataListAnonymizedWithoutDuplicateText), outFileName=OUTFILENAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}