e-mental-health/data-processing

View on GitHub
frontierspaper.ipynb

Summary

Maintainability
Test Coverage
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6359b8c0-9758-486b-ac4f-0e35935c1a39",
   "metadata": {},
   "source": [
    "# Calculations for the Frontiers 2021 paper\n",
    "\n",
    "(computed post-acceptance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8736adb0-2ace-4805-aaba-7d640dd8f7d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import pandas as pd\n",
    "import re "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1d9a076a-022b-4713-bf2b-2c27800f3b9f",
   "metadata": {},
   "source": [
    "## 1. Recompute Table 1 percentages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c1b91646-84aa-48e8-ba83-a53f10662645",
   "metadata": {},
   "outputs": [],
   "source": [
    "table_dict = {         \"males\": { 1: 209, 2: 133 },\n",
    "                     \"females\": { 1: 215, 2: 213 },\n",
    "                       \"dutch\": { 1:  22, 2:  27 },\n",
    "                 \"no answer 1\": { 1: 402, 2: 319 },\n",
    "                     \"primary\": { 1:   5, 2:   4 },\n",
    "                       \"lower\": { 1:   0, 2:   0 },\n",
    "                      \"school\": { 1:  56, 2:  33 },\n",
    "                \"intermediate\": { 1: 103, 2:  68 },\n",
    "                      \"higher\": { 1: 137, 2: 124 },\n",
    "                  \"university\": { 1:  40, 2:  58 },\n",
    "                 \"no answer 2\": { 1:  11, 2:  19 },\n",
    "                         \"yes\": { 1:  25, 2:  27 },\n",
    "                          \"no\": { 1: 308, 2: 240 },\n",
    "                 \"no answer 3\": { 1:  91, 2:  79 },\n",
    "                     \"i think\": { 1: 334, 2: 264 },\n",
    "                      \"i want\": { 1:  14, 2:  10 },\n",
    "                   \"something\": { 1:  36, 2:  31 },\n",
    "                \"others think\": { 1:  14, 2:   6 },\n",
    "               \"other reasons\": { 1:  24, 2:  29 },\n",
    "                 \"no answer 4\": { 1:   2, 2:   6 },\n",
    "                       \"never\": { 1: 164, 2: 187 },\n",
    "                \"now and then\": { 1:  36, 2:  24 },\n",
    "                       \"daily\": { 1: 224, 2: 135 },\n",
    "             }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "34d1479a-fc27-4c0c-b2c0-cc2ef310dbc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def pretty_print(results):\n",
    "    results_list = []\n",
    "    for key in results:\n",
    "        if results[key][1] + results[key][2] == 0:\n",
    "            print(f\"pretty_print: skipping unused key value: {key}\")\n",
    "            continue\n",
    "        percentage = round(100 * results[key][1] / (results[key][1] + results[key][2]), 1)\n",
    "        results_list.append({\"key\": key, \n",
    "                             \"dropout:N\": results[key][1],\n",
    "                             \"dropout:%\": percentage,\n",
    "                             \"completer:N\": results[key][2],\n",
    "                             \"completer:%\": 100 - percentage,\n",
    "                            })\n",
    "    return pd.DataFrame(results_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "32b1fbee-aa53-4e62-9e7d-1255d635df19",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pretty_print: skipping unused key value: lower\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>dropout:N</th>\n",
       "      <th>dropout:%</th>\n",
       "      <th>completer:N</th>\n",
       "      <th>completer:%</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>males</td>\n",
       "      <td>209</td>\n",
       "      <td>61.1</td>\n",
       "      <td>133</td>\n",
       "      <td>38.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>females</td>\n",
       "      <td>215</td>\n",
       "      <td>50.2</td>\n",
       "      <td>213</td>\n",
       "      <td>49.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>dutch</td>\n",
       "      <td>22</td>\n",
       "      <td>44.9</td>\n",
       "      <td>27</td>\n",
       "      <td>55.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>no answer 1</td>\n",
       "      <td>402</td>\n",
       "      <td>55.8</td>\n",
       "      <td>319</td>\n",
       "      <td>44.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>primary</td>\n",
       "      <td>5</td>\n",
       "      <td>55.6</td>\n",
       "      <td>4</td>\n",
       "      <td>44.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>school</td>\n",
       "      <td>56</td>\n",
       "      <td>62.9</td>\n",
       "      <td>33</td>\n",
       "      <td>37.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>intermediate</td>\n",
       "      <td>103</td>\n",
       "      <td>60.2</td>\n",
       "      <td>68</td>\n",
       "      <td>39.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>higher</td>\n",
       "      <td>137</td>\n",
       "      <td>52.5</td>\n",
       "      <td>124</td>\n",
       "      <td>47.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>university</td>\n",
       "      <td>40</td>\n",
       "      <td>40.8</td>\n",
       "      <td>58</td>\n",
       "      <td>59.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>no answer 2</td>\n",
       "      <td>11</td>\n",
       "      <td>36.7</td>\n",
       "      <td>19</td>\n",
       "      <td>63.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>yes</td>\n",
       "      <td>25</td>\n",
       "      <td>48.1</td>\n",
       "      <td>27</td>\n",
       "      <td>51.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>no</td>\n",
       "      <td>308</td>\n",
       "      <td>56.2</td>\n",
       "      <td>240</td>\n",
       "      <td>43.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>no answer 3</td>\n",
       "      <td>91</td>\n",
       "      <td>53.5</td>\n",
       "      <td>79</td>\n",
       "      <td>46.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13</td>\n",
       "      <td>i think</td>\n",
       "      <td>334</td>\n",
       "      <td>55.9</td>\n",
       "      <td>264</td>\n",
       "      <td>44.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14</td>\n",
       "      <td>i want</td>\n",
       "      <td>14</td>\n",
       "      <td>58.3</td>\n",
       "      <td>10</td>\n",
       "      <td>41.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15</td>\n",
       "      <td>something</td>\n",
       "      <td>36</td>\n",
       "      <td>53.7</td>\n",
       "      <td>31</td>\n",
       "      <td>46.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>16</td>\n",
       "      <td>others think</td>\n",
       "      <td>14</td>\n",
       "      <td>70.0</td>\n",
       "      <td>6</td>\n",
       "      <td>30.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>17</td>\n",
       "      <td>other reasons</td>\n",
       "      <td>24</td>\n",
       "      <td>45.3</td>\n",
       "      <td>29</td>\n",
       "      <td>54.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>18</td>\n",
       "      <td>no answer 4</td>\n",
       "      <td>2</td>\n",
       "      <td>25.0</td>\n",
       "      <td>6</td>\n",
       "      <td>75.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>19</td>\n",
       "      <td>never</td>\n",
       "      <td>164</td>\n",
       "      <td>46.7</td>\n",
       "      <td>187</td>\n",
       "      <td>53.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>20</td>\n",
       "      <td>now and then</td>\n",
       "      <td>36</td>\n",
       "      <td>60.0</td>\n",
       "      <td>24</td>\n",
       "      <td>40.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>21</td>\n",
       "      <td>daily</td>\n",
       "      <td>224</td>\n",
       "      <td>62.4</td>\n",
       "      <td>135</td>\n",
       "      <td>37.6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              key  dropout:N  dropout:%  completer:N  completer:%\n",
       "0           males        209       61.1          133         38.9\n",
       "1         females        215       50.2          213         49.8\n",
       "2           dutch         22       44.9           27         55.1\n",
       "3     no answer 1        402       55.8          319         44.2\n",
       "4         primary          5       55.6            4         44.4\n",
       "5          school         56       62.9           33         37.1\n",
       "6    intermediate        103       60.2           68         39.8\n",
       "7          higher        137       52.5          124         47.5\n",
       "8      university         40       40.8           58         59.2\n",
       "9     no answer 2         11       36.7           19         63.3\n",
       "10            yes         25       48.1           27         51.9\n",
       "11             no        308       56.2          240         43.8\n",
       "12    no answer 3         91       53.5           79         46.5\n",
       "13        i think        334       55.9          264         44.1\n",
       "14         i want         14       58.3           10         41.7\n",
       "15      something         36       53.7           31         46.3\n",
       "16   others think         14       70.0            6         30.0\n",
       "17  other reasons         24       45.3           29         54.7\n",
       "18    no answer 4          2       25.0            6         75.0\n",
       "19          never        164       46.7          187         53.3\n",
       "20   now and then         36       60.0           24         40.0\n",
       "21          daily        224       62.4          135         37.6"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pretty_print(table_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a9651871-6a09-4467-bd85-b75eeb36fd7f",
   "metadata": {},
   "source": [
    "## 2. Read data\n",
    "\n",
    "There are 791 records in the file. 21 need to be removed because of missing information. The fuction `count_missing` identifies these."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c17d7a0d-13b6-4497-b199-e4bdf84dfe52",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATAFILE = \"/home/erikt/projects/e-mental-health/usb/releases/20200320/liwc+intake123+dropout.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9f66923c-9570-4576-9815-b4105dcba786",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(DATAFILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6a8d7d83-7958-4c73-a329-b542b958bc10",
   "metadata": {},
   "outputs": [],
   "source": [
    "def count_missing(data):\n",
    "    missings = []\n",
    "    for idx, row in data.iterrows():\n",
    "        missing = 0\n",
    "        if pd.isna(row[\"1-geslacht\"]) and pd.isna(row[\"1-geslacht0\"]) and pd.isna(row[\"1-geslachtt0\"]):\n",
    "            missing += 1\n",
    "        if pd.isna(row[\"3-national0\"]) or row[\"3-national0\"] == \"EMPTY\":\n",
    "            missing += 1\n",
    "        if pd.isna(row[\"4-opleidng\"]) and (pd.isna(row[\"8-opleidng0\"]) or row[\"8-opleidng0\"] == \"EMPTY\") and pd.isna(row[\"4-opleidngt0\"]):\n",
    "            missing += 1\n",
    "        if pd.isna(row[\"19-behdrink\"]) and pd.isna(row[\"25-behdrinkt0\"]):\n",
    "            missing += 1\n",
    "        if pd.isna(row[\"7-aanleid\"]) and pd.isna(row[\"12-aanleid0\"]) and pd.isna(row[\"7-aanleidt0\"]):\n",
    "            missing += 1\n",
    "        if pd.isna(row[\"13-roken\"]) and pd.isna(row[\"13-rokent0\"]):\n",
    "            missing += 1\n",
    "        missings.append(missing)\n",
    "    return missings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9fd0091e-f34e-485d-aebd-19412bcf1c30",
   "metadata": {},
   "outputs": [],
   "source": [
    "def count_values(missings):\n",
    "    return pd.DataFrame([{\"count\": x} for x in missings])[\"count\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "221cf056-7b41-4845-97cc-3168d35d1c3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "missings = count_missing(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a3394fee-ca1e-4eb5-ac60-db8b5d7b89ec",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    721\n",
       "2     49\n",
       "4     21\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count_values(missings)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e63cc0a2-494b-4525-ad54-0f92a027c45a",
   "metadata": {},
   "source": [
    "## 3. Count gender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4f101e34-3b2c-4f5f-9249-22db6df41935",
   "metadata": {},
   "outputs": [],
   "source": [
    "def count_gender(data, missings):\n",
    "    genders = { \"man\": {1: 0, 2: 0}, \"vrouw\": {1: 0, 2: 0} }\n",
    "    for idx, row in data.iterrows():\n",
    "        if missings[idx] < 3:\n",
    "            gender = math.nan\n",
    "            if not pd.isna(row[\"1-geslacht\"]):\n",
    "                gender = row[\"1-geslacht\"]\n",
    "            if not pd.isna(row[\"1-geslacht0\"]):\n",
    "                if not pd.isna(gender):\n",
    "                    print(\"cannot happen: duplicate gender value\")\n",
    "                gender = row[\"1-geslacht0\"]\n",
    "            if not pd.isna(row[\"1-geslachtt0\"]):\n",
    "                if not pd.isna(gender):\n",
    "                    print(\"cannot happen: duplicate gender value\")\n",
    "                gender = row[\"1-geslachtt0\"]\n",
    "            if pd.isna(gender):\n",
    "                print(\"cannot happen: missing gender value\")\n",
    "            genders[gender][row[\"dropout\"]] += 1\n",
    "    return genders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "defe790b-fa4f-47f9-85ab-d2fda6bd7395",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>dropout:N</th>\n",
       "      <th>dropout:%</th>\n",
       "      <th>completer:N</th>\n",
       "      <th>completer:%</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>man</td>\n",
       "      <td>209</td>\n",
       "      <td>61.1</td>\n",
       "      <td>133</td>\n",
       "      <td>38.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>vrouw</td>\n",
       "      <td>215</td>\n",
       "      <td>50.2</td>\n",
       "      <td>213</td>\n",
       "      <td>49.8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     key  dropout:N  dropout:%  completer:N  completer:%\n",
       "0    man        209       61.1          133         38.9\n",
       "1  vrouw        215       50.2          213         49.8"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pretty_print(count_gender(data, missings))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5e6956a-c269-4383-a5d0-1261054651b9",
   "metadata": {},
   "source": [
    "## 4. Count nationalities\n",
    "\n",
    "Value NaN stands for \"Not a Number\" and signals missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "4b10190c-91a7-4b80-adf9-a3cf5d1a8227",
   "metadata": {},
   "outputs": [],
   "source": [
    "def count_nationalities(data, missings):\n",
    "    nationalities = { \"nederlands\": {1: 0, 2: 0}, \"EMPTY\": {1: 0, 2: 0}, math.nan: {1: 0, 2: 0}}\n",
    "    for idx, row in data.iterrows():\n",
    "        if missings[idx] < 3:\n",
    "            nationality = math.nan\n",
    "            if not pd.isna(row[\"3-national0\"]):\n",
    "                nationality = row[\"3-national0\"]\n",
    "            nationalities[nationality][row[\"dropout\"]] += 1\n",
    "    return nationalities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "95674d65-5ae6-4c5d-8002-948b3e3f4b63",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pretty_print: skipping unused key value: EMPTY\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>dropout:N</th>\n",
       "      <th>dropout:%</th>\n",
       "      <th>completer:N</th>\n",
       "      <th>completer:%</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>nederlands</td>\n",
       "      <td>22</td>\n",
       "      <td>44.9</td>\n",
       "      <td>27</td>\n",
       "      <td>55.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>402</td>\n",
       "      <td>55.8</td>\n",
       "      <td>319</td>\n",
       "      <td>44.2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          key  dropout:N  dropout:%  completer:N  completer:%\n",
       "0  nederlands         22       44.9           27         55.1\n",
       "1         NaN        402       55.8          319         44.2"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pretty_print(count_nationalities(data, missings))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23590783-9bf7-43e0-9a2f-2adccbb6cfc5",
   "metadata": {},
   "source": [
    "## 5. Count educations\n",
    "\n",
    "Key value \"MULTIWORD\" represents a phrase removed by the anonymization process. It is unknown why some key fields have been filled with numbers (in this case: 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "9223bae4-0071-4e99-8a48-afd92d30f84e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def count_educations(data, missings):\n",
    "    educations = { \"hbo\": {1: 0, 2: 0}, \"mbo\": {1: 0, 2: 0}, \"wo\": {1: 0, 2: 0}, \"lbo/mavo\": {1: 0, 2: 0}, \"havo/vwo\": {1: 0, 2: 0}, \n",
    "                  \"MULTIWORD\": {1: 0, 2: 0}, \"lbo/mavo/vmbo\": {1: 0, 2: 0}, \"basisschool\": {1: 0, 2: 0}, \"EMPTY\": {1: 0, 2: 0}, \"3\": {1: 0, 2: 0}, }\n",
    "    for idx, row in data.iterrows():\n",
    "        if missings[idx] < 3:\n",
    "            education = math.nan\n",
    "            if not pd.isna(row[\"4-opleidng\"]):\n",
    "                education = row[\"4-opleidng\"]\n",
    "            if not pd.isna(row[\"8-opleidng0\"]):\n",
    "                if not pd.isna(education):\n",
    "                    print(f\"cannot happen: duplicate education value: {education} and {row['8-opleidng0']}\")\n",
    "                education = row[\"8-opleidng0\"]\n",
    "            if not pd.isna(row[\"4-opleidngt0\"]):\n",
    "                if not pd.isna(education):\n",
    "                    print(f\"cannot happen: duplicate education value: {education} and {row['4-opleidngt0']}\")\n",
    "                education = row[\"4-opleidngt0\"]\n",
    "            if pd.isna(education):\n",
    "                print(\"cannot happen: missing education value\")\n",
    "            educations[education][row[\"dropout\"]] += 1\n",
    "    return educations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "cdd21daf-994e-4df6-882d-fcf89025e4e7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pretty_print: skipping unused key value: EMPTY\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>dropout:N</th>\n",
       "      <th>dropout:%</th>\n",
       "      <th>completer:N</th>\n",
       "      <th>completer:%</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>hbo</td>\n",
       "      <td>137</td>\n",
       "      <td>52.5</td>\n",
       "      <td>124</td>\n",
       "      <td>47.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>mbo</td>\n",
       "      <td>103</td>\n",
       "      <td>60.2</td>\n",
       "      <td>68</td>\n",
       "      <td>39.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>wo</td>\n",
       "      <td>40</td>\n",
       "      <td>40.8</td>\n",
       "      <td>58</td>\n",
       "      <td>59.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>lbo/mavo</td>\n",
       "      <td>65</td>\n",
       "      <td>65.0</td>\n",
       "      <td>35</td>\n",
       "      <td>35.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>havo/vwo</td>\n",
       "      <td>56</td>\n",
       "      <td>62.9</td>\n",
       "      <td>33</td>\n",
       "      <td>37.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>MULTIWORD</td>\n",
       "      <td>14</td>\n",
       "      <td>42.4</td>\n",
       "      <td>19</td>\n",
       "      <td>57.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>lbo/mavo/vmbo</td>\n",
       "      <td>4</td>\n",
       "      <td>50.0</td>\n",
       "      <td>4</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>basisschool</td>\n",
       "      <td>5</td>\n",
       "      <td>55.6</td>\n",
       "      <td>4</td>\n",
       "      <td>44.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>100.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             key  dropout:N  dropout:%  completer:N  completer:%\n",
       "0            hbo        137       52.5          124         47.5\n",
       "1            mbo        103       60.2           68         39.8\n",
       "2             wo         40       40.8           58         59.2\n",
       "3       lbo/mavo         65       65.0           35         35.0\n",
       "4       havo/vwo         56       62.9           33         37.1\n",
       "5      MULTIWORD         14       42.4           19         57.6\n",
       "6  lbo/mavo/vmbo          4       50.0            4         50.0\n",
       "7    basisschool          5       55.6            4         44.4\n",
       "8              3          0        0.0            1        100.0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pretty_print(count_educations(data, missings))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76a44ed9-40b7-46b0-b1a1-b103e910b477",
   "metadata": {},
   "source": [
    "## 6. Count previous treatments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "47277603-0ab4-4efc-a126-ecc88f815789",
   "metadata": {},
   "outputs": [],
   "source": [
    "def count_treatments(data, missings):\n",
    "    treatments = { \"nee\": {1: 0, 2: 0}, \"ja\": {1: 0, 2: 0}, \"EMPTY\": {1: 0, 2: 0}, math.nan: {1: 0, 2: 0},\n",
    "                   \"ja ik heb deze internetbehandeling al eens gevolgd.\": {1: 0, 2: 0}, \"MULTIWORD\": {1: 0, 2: 0}, }\n",
    "    for idx, row in data.iterrows():\n",
    "        if missings[idx] < 3:\n",
    "            treatment = math.nan\n",
    "            if not pd.isna(row[\"19-behdrink\"]):\n",
    "                treatment = row[\"19-behdrink\"]\n",
    "            if not pd.isna(row[\"25-behdrinkt0\"]):\n",
    "                if not pd.isna(treatment):\n",
    "                    print(f\"cannot happen: duplicate treatment value: {reason} and {row['25-behdrinkt0']}\")\n",
    "                treatment = row[\"25-behdrinkt0\"]\n",
    "            if not pd.isna(row[\"26-behversl0\"]):\n",
    "                if not pd.isna(treatment):\n",
    "                    print(f\"cannot happen: duplicate treatment value: {reason} and {row['25-behversl0']}\")\n",
    "                treatment = row[\"26-behversl0\"]\n",
    "\n",
    "            treatments[treatment][row[\"dropout\"]] += 1\n",
    "    return treatments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "0e1698b8-0593-4507-b55d-376e77a2e909",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pretty_print: skipping unused key value: nan\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>dropout:N</th>\n",
       "      <th>dropout:%</th>\n",
       "      <th>completer:N</th>\n",
       "      <th>completer:%</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>nee</td>\n",
       "      <td>308</td>\n",
       "      <td>56.2</td>\n",
       "      <td>240</td>\n",
       "      <td>43.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>ja</td>\n",
       "      <td>12</td>\n",
       "      <td>50.0</td>\n",
       "      <td>12</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>EMPTY</td>\n",
       "      <td>91</td>\n",
       "      <td>53.5</td>\n",
       "      <td>79</td>\n",
       "      <td>46.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>ja ik heb deze internetbehandeling al eens gev...</td>\n",
       "      <td>13</td>\n",
       "      <td>50.0</td>\n",
       "      <td>13</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>MULTIWORD</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>100.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 key  dropout:N  dropout:%  \\\n",
       "0                                                nee        308       56.2   \n",
       "1                                                 ja         12       50.0   \n",
       "2                                              EMPTY         91       53.5   \n",
       "3  ja ik heb deze internetbehandeling al eens gev...         13       50.0   \n",
       "4                                          MULTIWORD          0        0.0   \n",
       "\n",
       "   completer:N  completer:%  \n",
       "0          240         43.8  \n",
       "1           12         50.0  \n",
       "2           79         46.5  \n",
       "3           13         50.0  \n",
       "4            2        100.0  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pretty_print(count_treatments(data, missings))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "96890e53-53ff-4f29-86d8-d7d920487e03",
   "metadata": {},
   "source": [
    "## 7. Count reasons for treatment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "75ff3960-0de9-49a6-90ec-ffb55e9b326b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def count_reasons(data, missings):\n",
    "    reasons = { \"ik vind zelf dat ik teveel drink\": {1: 0, 2: 0}, \n",
    "                \"anderen vinden dat ik teveel drink\": {1: 0, 2: 0},\n",
    "                \"ik wil advies over mijn alcoholgebruik\": {1: 0, 2: 0},\n",
    "                \"er is iets vervelends gebeurd en daarom wil ik iets aan mijn drinken doen\": {1: 0, 2: 0},\n",
    "                \"1\": {1: 0, 2: 0}, \n",
    "                \"2\": {1: 0, 2: 0}, \n",
    "                \"3\": {1: 0, 2: 0}, \n",
    "                \"4\": {1: 0, 2: 0}, \n",
    "                \"MULTIWORD\": {1: 0, 2: 0}, \n",
    "                \"EMPTY\": {1: 0, 2: 0}, }\n",
    "    for idx, row in data.iterrows():\n",
    "        if missings[idx] < 3:\n",
    "            reason = math.nan\n",
    "            if not pd.isna(row[\"7-aanleid\"]):\n",
    "                reason = row[\"7-aanleid\"]\n",
    "            if not pd.isna(row[\"12-aanleid0\"]):\n",
    "                if not pd.isna(reason):\n",
    "                    print(f\"cannot happen: duplicate education value: {reason} and {row['12-aanleid0']}\")\n",
    "                reason = row[\"12-aanleid0\"]\n",
    "            if not pd.isna(row[\"7-aanleidt0\"]):\n",
    "                if not pd.isna(reason):\n",
    "                    print(f\"cannot happen: duplicate education value: {reason} and {row['7-aanleidt0']}\")\n",
    "                reason = row[\"7-aanleidt0\"]\n",
    "            if pd.isna(reason):\n",
    "                print(\"cannot happen: missing reason value\")\n",
    "            reasons[reason][row[\"dropout\"]] += 1\n",
    "    return reasons"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "6943b797-27b4-436b-93d0-7d785217b5ed",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>dropout:N</th>\n",
       "      <th>dropout:%</th>\n",
       "      <th>completer:N</th>\n",
       "      <th>completer:%</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>ik vind zelf dat ik teveel drink</td>\n",
       "      <td>334</td>\n",
       "      <td>55.9</td>\n",
       "      <td>264</td>\n",
       "      <td>44.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>anderen vinden dat ik teveel drink</td>\n",
       "      <td>14</td>\n",
       "      <td>70.0</td>\n",
       "      <td>6</td>\n",
       "      <td>30.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>ik wil advies over mijn alcoholgebruik</td>\n",
       "      <td>14</td>\n",
       "      <td>58.3</td>\n",
       "      <td>10</td>\n",
       "      <td>41.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>er is iets vervelends gebeurd en daarom wil ik...</td>\n",
       "      <td>36</td>\n",
       "      <td>53.7</td>\n",
       "      <td>31</td>\n",
       "      <td>46.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>40.0</td>\n",
       "      <td>3</td>\n",
       "      <td>60.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>50.0</td>\n",
       "      <td>1</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>MULTIWORD</td>\n",
       "      <td>19</td>\n",
       "      <td>43.2</td>\n",
       "      <td>25</td>\n",
       "      <td>56.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>EMPTY</td>\n",
       "      <td>2</td>\n",
       "      <td>25.0</td>\n",
       "      <td>6</td>\n",
       "      <td>75.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 key  dropout:N  dropout:%  \\\n",
       "0                   ik vind zelf dat ik teveel drink        334       55.9   \n",
       "1                 anderen vinden dat ik teveel drink         14       70.0   \n",
       "2             ik wil advies over mijn alcoholgebruik         14       58.3   \n",
       "3  er is iets vervelends gebeurd en daarom wil ik...         36       53.7   \n",
       "4                                                  1          1      100.0   \n",
       "5                                                  2          2       40.0   \n",
       "6                                                  3          1      100.0   \n",
       "7                                                  4          1       50.0   \n",
       "8                                          MULTIWORD         19       43.2   \n",
       "9                                              EMPTY          2       25.0   \n",
       "\n",
       "   completer:N  completer:%  \n",
       "0          264         44.1  \n",
       "1            6         30.0  \n",
       "2           10         41.7  \n",
       "3           31         46.3  \n",
       "4            0          0.0  \n",
       "5            3         60.0  \n",
       "6            0          0.0  \n",
       "7            1         50.0  \n",
       "8           25         56.8  \n",
       "9            6         75.0  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pretty_print(count_reasons(data, missings))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ac8f93bf-05dc-443b-a3cb-276b43f4386c",
   "metadata": {},
   "source": [
    "## 8. Count smoking behaviour"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "2cbe5227-4b5d-4fd5-95a4-86613ba1065d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def count_smokers(data, missings):\n",
    "    smokers = { \"ja af en toe\": {1: 0, 2: 0}, \"ja dagelijks\": {1: 0, 2: 0}, \"nee helemaal niet\": {1: 0, 2: 0}, \n",
    "                math.nan: {1: 0, 2: 0}, \"MULTIWORD\": {1: 0, 2: 0}, \"ja\": {1: 0, 2: 0}, \"nee\": {1: 0, 2: 0}, }\n",
    "    for idx, row in data.iterrows():\n",
    "        if missings[idx] < 3:\n",
    "            smoker = math.nan\n",
    "            if not pd.isna(row[\"13-roken\"]):\n",
    "                smoker = row[\"13-roken\"]\n",
    "            if not pd.isna(row[\"19-tabak0\"]):\n",
    "                if not pd.isna(smoker):\n",
    "                    print(\"cannot happen: duplicate smoker value\")\n",
    "                smoker = row[\"19-tabak0\"]\n",
    "            if not pd.isna(row[\"13-rokent0\"]):\n",
    "                if not pd.isna(smoker):\n",
    "                    print(\"cannot happen: duplicate smoker value\")\n",
    "                smoker = row[\"13-rokent0\"]\n",
    "            smokers[smoker][row[\"dropout\"]] += 1\n",
    "    return smokers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "5a120e95-af90-4aa5-8ed3-53f9458f20ca",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pretty_print: skipping unused key value: nan\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>dropout:N</th>\n",
       "      <th>dropout:%</th>\n",
       "      <th>completer:N</th>\n",
       "      <th>completer:%</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>ja af en toe</td>\n",
       "      <td>31</td>\n",
       "      <td>60.8</td>\n",
       "      <td>20</td>\n",
       "      <td>39.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>ja dagelijks</td>\n",
       "      <td>210</td>\n",
       "      <td>62.3</td>\n",
       "      <td>127</td>\n",
       "      <td>37.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>nee helemaal niet</td>\n",
       "      <td>159</td>\n",
       "      <td>48.3</td>\n",
       "      <td>170</td>\n",
       "      <td>51.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>MULTIWORD</td>\n",
       "      <td>2</td>\n",
       "      <td>50.0</td>\n",
       "      <td>2</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>ja</td>\n",
       "      <td>21</td>\n",
       "      <td>46.7</td>\n",
       "      <td>24</td>\n",
       "      <td>53.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>nee</td>\n",
       "      <td>1</td>\n",
       "      <td>25.0</td>\n",
       "      <td>3</td>\n",
       "      <td>75.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 key  dropout:N  dropout:%  completer:N  completer:%\n",
       "0       ja af en toe         31       60.8           20         39.2\n",
       "1       ja dagelijks        210       62.3          127         37.7\n",
       "2  nee helemaal niet        159       48.3          170         51.7\n",
       "3          MULTIWORD          2       50.0            2         50.0\n",
       "4                 ja         21       46.7           24         53.3\n",
       "5                nee          1       25.0            3         75.0"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pretty_print(count_smokers(data, missings))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64bd44dc-6390-4432-aa30-a81813ea3cc3",
   "metadata": {},
   "source": [
    "## 9. Inspecting metadata field names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "9d7b4320-e7c5-4422-a02f-7dad9b25b1dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_columns(data, column_name):\n",
    "    for column in data.columns:\n",
    "        if re.search(column_name, column, flags=re.IGNORECASE):\n",
    "            print(column)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "6a740654-aa76-42d9-af1e-cc48a8296f45",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19-tabak0\n",
      "19-tabakjr0\n",
      "20-tabak300\n",
      "20-tabakdg0\n",
      "20-tabakhv0\n",
      "21-tabak0h\n"
     ]
    }
   ],
   "source": [
    "find_columns(data, \"tab\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "abd9ab66-5bd9-4452-807b-676fead32a84",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ja       45\n",
       "EMPTY    21\n",
       "nee       4\n",
       "Name: 19-tabak0, dtype: int64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[\"19-tabak0\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "0b2b6ac0-6ca4-4b5a-b27e-0dc58c70ff05",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "file\n",
      "nbr of mails\n",
      "Number of matches\n",
      "number count\n",
      "1 function\n",
      "2 pronoun\n",
      "3 ppron\n",
      "4 i\n",
      "5 we\n",
      "6 you\n",
      "7 shehe\n",
      "8 they\n",
      "9 ipron\n",
      "10 article\n",
      "11 prep\n",
      "12 auxverb\n",
      "13 adverb\n",
      "14 conj\n",
      "15 negate\n",
      "20 verb\n",
      "21 adj\n",
      "22 compare\n",
      "23 interrog\n",
      "24 number\n",
      "25 quant\n",
      "30 affect\n",
      "31 posemo\n",
      "32 negemo\n",
      "33 anx\n",
      "34 anger\n",
      "35 sad\n",
      "40 social\n",
      "41 family\n",
      "42 friend\n",
      "43 female\n",
      "44 male\n",
      "50 cogproc\n",
      "51 insight\n",
      "52 cause\n",
      "53 discrep\n",
      "54 tentat\n",
      "55 certain\n",
      "56 differ\n",
      "60 percept\n",
      "61 see\n",
      "62 hear\n",
      "63 feel\n",
      "70 bio\n",
      "71 body\n",
      "72 health\n",
      "73 sexual\n",
      "74 ingest\n",
      "80 drives\n",
      "81 affiliation\n",
      "82 achieve\n",
      "83 power\n",
      "84 reward\n",
      "85 risk\n",
      "90 focuspast\n",
      "91 focuspresent\n",
      "92 focusfuture\n",
      "100 relativ\n",
      "101 motion\n",
      "102 space\n",
      "103 time\n",
      "110 work\n",
      "111 leisure\n",
      "112 home\n",
      "113 money\n",
      "114 relig\n",
      "115 death\n",
      "120 informal\n",
      "121 swear\n",
      "122 netspeak\n",
      "123 assent\n",
      "124 nonflu\n",
      "125 filler\n",
      "counselor\n",
      "dropout\n",
      "0-title\n",
      "1-geslacht\n",
      "1-geslacht0\n",
      "1-geslachtt0\n",
      "10-dag1\n",
      "10-dag1t0\n",
      "10-dag2\n",
      "10-dag2t0\n",
      "10-dag3\n",
      "10-dag3t0\n",
      "10-dag4\n",
      "10-dag4t0\n",
      "10-dag5\n",
      "10-dag5t0\n",
      "10-dag6\n",
      "10-dag6t0\n",
      "10-dag7\n",
      "10-dag7t0\n",
      "10-dagb0\n",
      "10-typisch\n",
      "10-typischt0\n",
      "10-week\n",
      "10-weekt0\n",
      "11-dagritme0\n",
      "11-terugbl\n",
      "11-terugblt0\n",
      "12-aanleid0\n",
      "12-medi0\n",
      "12-medi0t0\n",
      "12-medi1\n",
      "12-medi1t0\n",
      "12-medi2\n",
      "12-medi2t0\n",
      "12-medi3\n",
      "12-medi3t0\n",
      "12-medi4\n",
      "12-medi4t0\n",
      "12-medi5\n",
      "12-medi5t0\n",
      "12-medicijn\n",
      "12-medicijnt0\n",
      "12-medidos0\n",
      "12-medidos0t0\n",
      "12-medidos1\n",
      "12-medidos1t0\n",
      "12-medidos2\n",
      "12-medidos2t0\n",
      "12-medidos3\n",
      "12-medidos3t0\n",
      "12-medidos4\n",
      "12-medidos4t0\n",
      "12-medidos5\n",
      "12-medidos5t0\n",
      "12-mediln0\n",
      "12-mediln0t0\n",
      "12-mediln1\n",
      "12-mediln1t0\n",
      "12-mediln2\n",
      "12-mediln2t0\n",
      "12-mediln3\n",
      "12-mediln3t0\n",
      "12-mediln4\n",
      "12-mediln4t0\n",
      "12-mediln5\n",
      "12-mediln5t0\n",
      "12-medivr0\n",
      "12-medivr0t0\n",
      "12-medivr1\n",
      "12-medivr1t0\n",
      "12-medivr2\n",
      "12-medivr2t0\n",
      "12-medivr3\n",
      "12-medivr3t0\n",
      "12-medivr4\n",
      "12-medivr4t0\n",
      "12-medivr5\n",
      "12-medivr5t0\n",
      "12-verhaal0\n",
      "13-hoeinfo0\n",
      "13-rodag\n",
      "13-rodagt0\n",
      "13-roken\n",
      "13-rokent0\n",
      "13-romaand\n",
      "13-romaandt0\n",
      "13-rovroe\n",
      "13-rovroet0\n",
      "14-drugs\n",
      "14-drugs0\n",
      "14-drugs0t0\n",
      "14-drugs1\n",
      "14-drugs1t0\n",
      "14-drugs2\n",
      "14-drugs2t0\n",
      "14-drugs3\n",
      "14-drugs3t0\n",
      "14-drugs4\n",
      "14-drugs4t0\n",
      "14-drugs5\n",
      "14-drugs5t0\n",
      "14-drugsln0\n",
      "14-drugsln0t0\n",
      "14-drugsln1\n",
      "14-drugsln1t0\n",
      "14-drugsln2\n",
      "14-drugsln2t0\n",
      "14-drugsln3\n",
      "14-drugsln3t0\n",
      "14-drugsln4\n",
      "14-drugsln4t0\n",
      "14-drugsln5\n",
      "14-drugsln5t0\n",
      "14-drugst0\n",
      "14-drugsvk0\n",
      "14-drugsvk0t0\n",
      "14-drugsvk1\n",
      "14-drugsvk2\n",
      "14-drugsvk3\n",
      "14-drugsvk4\n",
      "14-drugsvk5\n",
      "15-dag10\n",
      "15-dag20\n",
      "15-dag30\n",
      "15-dag40\n",
      "15-dag50\n",
      "15-dag60\n",
      "15-dag70\n",
      "15-gokken\n",
      "15-gokkent0\n",
      "15-problgok\n",
      "15-problgokt0\n",
      "15-week0\n",
      "16-eetdrang\n",
      "16-eetdrangt0\n",
      "16-gebleven0\n",
      "16-gewgebdg0\n",
      "16-gewgebhv0\n",
      "16-hoggebdg0\n",
      "16-hoggebhv0\n",
      "17-depres\n",
      "17-diarree\n",
      "17-epilep\n",
      "17-geheug\n",
      "17-ghqa1t0\n",
      "17-ghqa2t0\n",
      "17-ghqa3t0\n",
      "17-ghqa4t0\n",
      "17-ghqa5t0\n",
      "17-ghqa6t0\n",
      "17-ghqa7t0\n",
      "17-ghqb1t0\n",
      "17-ghqb2t0\n",
      "17-ghqb3t0\n",
      "17-ghqb4t0\n",
      "17-ghqb5t0\n",
      "17-ghqb6t0\n",
      "17-hartklp\n",
      "17-hoofdp\n",
      "17-hyper\n",
      "17-maag\n",
      "17-moe\n",
      "17-probleem0\n",
      "17-seksuel\n",
      "17-transpi\n",
      "17-trillen\n",
      "18-maaltd\n",
      "18-verlang10\n",
      "18-verlang20\n",
      "18-verlang30\n",
      "18-verlang40\n",
      "18-verlang50\n",
      "18-verlangtot0\n",
      "19-behdrink\n",
      "19-canna0\n",
      "19-cannajr0\n",
      "19-coca0\n",
      "19-cocajr0\n",
      "19-deprest0\n",
      "19-diarreet0\n",
      "19-drugs0\n",
      "19-drugsjr0\n",
      "19-epilept0\n",
      "19-geheugt0\n",
      "19-ghb0\n",
      "19-ghbjr0\n",
      "19-goTo3\n",
      "19-gok0\n",
      "19-gokjr0\n",
      "19-hartklpt0\n",
      "19-hoofdpt0\n",
      "19-hypert0\n",
      "19-map10t0\n",
      "19-map1t0\n",
      "19-map2t0\n",
      "19-map3t0\n",
      "19-map4t0\n",
      "19-map5t0\n",
      "19-map6t0\n",
      "19-map7t0\n",
      "19-map8t0\n",
      "19-map9t0\n",
      "19-maptotaalt0\n",
      "19-opiat0\n",
      "19-opiatjr0\n",
      "19-seksuelt0\n",
      "19-slaap0\n",
      "19-slaapjr0\n",
      "19-speed0\n",
      "19-speedjr0\n",
      "19-tabak0\n",
      "19-tabakjr0\n",
      "19-transpit0\n",
      "19-xtc0\n",
      "19-xtcjr0\n",
      "2-leeftijd\n",
      "2-leeftijd0\n",
      "2-leeftijdt0\n",
      "2-medi0\n",
      "2-medi00\n",
      "2-medi10\n",
      "2-medi20\n",
      "2-medi30\n",
      "2-medi40\n",
      "2-medi50\n",
      "2-medicijnr0\n",
      "2-medidos00\n",
      "2-medidos10\n",
      "2-medidos20\n",
      "2-medidos30\n",
      "2-medidos40\n",
      "2-medidos50\n",
      "2-mediln00\n",
      "2-mediln10\n",
      "2-mediln20\n",
      "2-mediln30\n",
      "2-mediln40\n",
      "2-mediln50\n",
      "2-medivr00\n",
      "2-medivr10\n",
      "2-medivr20\n",
      "2-medivr30\n",
      "2-medivr40\n",
      "2-medivr50\n",
      "20-canna300\n",
      "20-cannadg0\n",
      "20-cannahv0\n",
      "20-coca300\n",
      "20-cocadg0\n",
      "20-cocahv0\n",
      "20-drugs300\n",
      "20-drugsdg0\n",
      "20-drugshv0\n",
      "20-eq1t0\n",
      "20-eq2t0\n",
      "20-eq3t0\n",
      "20-eq4t0\n",
      "20-eq5t0\n",
      "20-eq6t0\n",
      "20-ghb300\n",
      "20-ghbdg0\n",
      "20-ghbhv0\n",
      "20-gok300\n",
      "20-gokdg0\n",
      "20-gokhv0\n",
      "20-insult\n",
      "20-opiat300\n",
      "20-opiatdg0\n",
      "20-opiathv0\n",
      "20-slaap300\n",
      "20-slaapdg0\n",
      "20-slaaphv0\n",
      "20-speed300\n",
      "20-speeddg0\n",
      "20-speedhv0\n",
      "20-tabak300\n",
      "20-tabakdg0\n",
      "20-tabakhv0\n",
      "20-xtc300\n",
      "20-xtcdg0\n",
      "20-xtchv0\n",
      "21-canna0h\n",
      "21-coca0h\n",
      "21-delirium\n",
      "21-drugs0h\n",
      "21-eetdrang0\n",
      "21-eqtht0\n",
      "21-ghb0h\n",
      "21-gok0h\n",
      "21-opiat0h\n",
      "21-slaap0h\n",
      "21-speed0h\n",
      "21-tabak0h\n",
      "21-xtc0h\n",
      "22-dass10t0\n",
      "22-dass11t0\n",
      "22-dass12t0\n",
      "22-dass13t0\n",
      "22-dass1t0\n",
      "22-dass2t0\n",
      "22-dass3t0\n",
      "22-dass4t0\n",
      "22-dass5t0\n",
      "22-dass6t0\n",
      "22-dass7t0\n",
      "22-dass8t0\n",
      "22-dass9t0\n",
      "22-medi0\n",
      "22-medi00\n",
      "22-medi10\n",
      "22-medi20\n",
      "22-medi30\n",
      "22-medi40\n",
      "22-medi50\n",
      "22-medicijnr0\n",
      "22-medidos00\n",
      "22-medidos10\n",
      "22-medidos20\n",
      "22-medidos30\n",
      "22-medidos40\n",
      "22-medidos50\n",
      "22-mediln00\n",
      "22-mediln10\n",
      "22-mediln20\n",
      "22-mediln30\n",
      "22-mediln40\n",
      "22-mediln50\n",
      "22-medivr00\n",
      "22-medivr10\n",
      "22-medivr20\n",
      "22-medivr30\n",
      "22-medivr40\n",
      "22-medivr50\n",
      "22-ontwen\n",
      "23-diarree0\n",
      "23-doofgev0\n",
      "23-eetlust0\n",
      "23-epilep0\n",
      "23-geheug0\n",
      "23-goTo2\n",
      "23-hartklp0\n",
      "23-hoofdp0\n",
      "23-hyper0\n",
      "23-klachttot0\n",
      "23-kortad0\n",
      "23-ltgeslacht1\n",
      "23-maag0\n",
      "23-maphss0\n",
      "23-missel0\n",
      "23-moe0\n",
      "23-pijnbor0\n",
      "23-pijngewr0\n",
      "23-psych\n",
      "23-psytijd0\n",
      "23-psytijd1\n",
      "23-psytijd2\n",
      "23-psytijd3\n",
      "23-psytijd4\n",
      "23-psytijd5\n",
      "23-psyvoor0\n",
      "23-psyvoor1\n",
      "23-psyvoor2\n",
      "23-psyvoor3\n",
      "23-psyvoor4\n",
      "23-psyvoor5\n",
      "23-psywaar0\n",
      "23-psywaar1\n",
      "23-psywaar2\n",
      "23-psywaar3\n",
      "23-psywaar4\n",
      "23-psywaar5\n",
      "23-seksuel0\n",
      "23-spierp0\n",
      "23-transpi0\n",
      "23-trillen0\n",
      "24-maaltdt0\n",
      "24-mindpres\n",
      "24-telaat\n",
      "24-ziekmeld\n",
      "24-zwanger0\n",
      "25-behdrinkt0\n",
      "25-maaltd0\n",
      "25-rcq1\n",
      "25-rcq10\n",
      "25-rcq11\n",
      "25-rcq12\n",
      "25-rcq2\n",
      "25-rcq3\n",
      "25-rcq4\n",
      "25-rcq5\n",
      "25-rcq6\n",
      "25-rcq7\n",
      "25-rcq8\n",
      "25-rcq9\n",
      "25-rcqact\n",
      "25-rcqcon\n",
      "25-rcqpre\n",
      "26-behamb0\n",
      "26-behkli0\n",
      "26-behtoel0\n",
      "26-behversl0\n",
      "26-bereik\n",
      "26-insultt0\n",
      "27-deliriumt0\n",
      "27-insult0\n",
      "27-toelich\n",
      "28-delirium0\n",
      "28-ontwent0\n",
      "29-ontwen0\n",
      "29-psycht0\n",
      "29-psytijd0t0\n",
      "29-psytijd1t0\n",
      "29-psytijd2t0\n",
      "29-psytijd3t0\n",
      "29-psytijd4t0\n",
      "29-psytijd5t0\n",
      "29-psyvoor0t0\n",
      "29-psyvoor1t0\n",
      "29-psyvoor2t0\n",
      "29-psyvoor3t0\n",
      "29-psyvoor4t0\n",
      "29-psyvoor5t0\n",
      "29-psywaar0t0\n",
      "29-psywaar1t0\n",
      "29-psywaar2t0\n",
      "29-psywaar3t0\n",
      "29-psywaar4t0\n",
      "29-psywaar5t0\n",
      "3-national0\n",
      "3-woonsit\n",
      "3-woonsitt0\n",
      "30-mindprest0\n",
      "30-psych0\n",
      "30-psychr0\n",
      "30-psytijd00\n",
      "30-psytijd10\n",
      "30-psytijd20\n",
      "30-psytijd30\n",
      "30-psytijd40\n",
      "30-psytijd50\n",
      "30-psyvoor00\n",
      "30-psyvoor10\n",
      "30-psyvoor20\n",
      "30-psyvoor30\n",
      "30-psyvoor40\n",
      "30-psyvoor50\n",
      "30-psywaar00\n",
      "30-psywaar10\n",
      "30-psywaar20\n",
      "30-psywaar30\n",
      "30-psywaar40\n",
      "30-psywaar50\n",
      "30-telaatt0\n",
      "30-ziekmeldt0\n",
      "31-angst0\n",
      "31-dass10\n",
      "31-dass100\n",
      "31-dass110\n",
      "31-dass120\n",
      "31-dass130\n",
      "31-dass140\n",
      "31-dass150\n",
      "31-dass160\n",
      "31-dass170\n",
      "31-dass180\n",
      "31-dass190\n",
      "31-dass20\n",
      "31-dass200\n",
      "31-dass210\n",
      "31-dass30\n",
      "31-dass40\n",
      "31-dass50\n",
      "31-dass60\n",
      "31-dass70\n",
      "31-dass80\n",
      "31-dass90\n",
      "31-dasstot0\n",
      "31-depres0\n",
      "31-rcq10t0\n",
      "31-rcq11t0\n",
      "31-rcq12t0\n",
      "31-rcq1t0\n",
      "31-rcq2t0\n",
      "31-rcq3t0\n",
      "31-rcq4t0\n",
      "31-rcq5t0\n",
      "31-rcq6t0\n",
      "31-rcq7t0\n",
      "31-rcq8t0\n",
      "31-rcq9t0\n",
      "31-rcqactt0\n",
      "31-rcqcont0\n",
      "31-rcqpret0\n",
      "31-stress0\n",
      "32-bereikt0\n",
      "32-suicide10\n",
      "32-suicide20\n",
      "33-halluci0\n",
      "33-toelicht0\n",
      "34-mdoel\n",
      "34-wanen0\n",
      "35-justitie0\n",
      "35-mhfddoelt0\n",
      "36-eq10\n",
      "36-eq20\n",
      "36-eq30\n",
      "36-eq40\n",
      "36-eq50\n",
      "36-eq60\n",
      "36-maanvult0\n",
      "36-mdenkent0\n",
      "36-mdoodt0\n",
      "36-mfamt0\n",
      "36-mgezndht0\n",
      "36-mlastt0\n",
      "36-mminlvnt0\n",
      "36-mpolitiet0\n",
      "36-mprblmt0\n",
      "36-mvinwrkt0\n",
      "37-dh1t0\n",
      "37-dh2t0\n",
      "37-dh3t0\n",
      "37-dh4t0\n",
      "37-dh5t0\n",
      "37-eqth0\n",
      "37-re1t0\n",
      "37-re2t0\n",
      "37-tr1t0\n",
      "37-tr2t0\n",
      "37-tr3t0\n",
      "37-tr4t0\n",
      "37-tr5t0\n",
      "37-tr6t0\n",
      "37-tr7t0\n",
      "37-tr8t0\n",
      "38-basaal0\n",
      "38-beperking0\n",
      "38-mateicn10\n",
      "38-mateicn100\n",
      "38-mateicn110\n",
      "38-mateicn120\n",
      "38-mateicn130\n",
      "38-mateicn140\n",
      "38-mateicn150\n",
      "38-mateicn160\n",
      "38-mateicn170\n",
      "38-mateicn180\n",
      "38-mateicn190\n",
      "38-mateicn20\n",
      "38-mateicn30\n",
      "38-mateicn40\n",
      "38-mateicn50\n",
      "38-mateicn60\n",
      "38-mateicn70\n",
      "38-mateicn80\n",
      "38-mateicn90\n",
      "38-mopmerkt0\n",
      "38-relation0\n",
      "39-doel\n",
      "4-cultherk0\n",
      "4-opleidng\n",
      "4-opleidngt0\n",
      "40-hfddoel0\n",
      "41-benniet0\n",
      "41-mailadres0\n",
      "41-smsnr0\n",
      "41-telnr0\n",
      "42-vragen10\n",
      "42-vragen20\n",
      "42-vragen30\n",
      "42-vragen40\n",
      "43-toelich0\n",
      "5-burstaat0\n",
      "5-dagb\n",
      "5-dagbt0\n",
      "6-dagritme\n",
      "6-dagritmet0\n",
      "6-kind0\n",
      "6-kindn0\n",
      "6-kindthuis0\n",
      "7-aanleid\n",
      "7-aanleidt0\n",
      "7-verhaal\n",
      "7-verhaalt0\n",
      "7-woonsit0\n",
      "8-hoeinfo\n",
      "8-hoeinfot0\n",
      "8-opleidng0\n",
      "9-inkomst0\n"
     ]
    }
   ],
   "source": [
    "for column in data.columns:\n",
    "    print(column)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07ac00bd-6537-490e-b874-6a9d3416688c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python37",
   "language": "python",
   "name": "python37"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}