data/notebooks/Lab/9_contextual_IDHI_dataset.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 22,
"id": "d5022bb7-643c-4346-a4d2-86587bdcb9d1",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"\n",
"import geopandas as gpd\n",
"import pandas as pd\n",
"import numpy as np\n",
"import requests\n",
"from psycopg2.pool import ThreadedConnectionPool\n",
"from shapely.geometry import shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "703af319-5a48-42f8-9a84-06329fcd73e7",
"metadata": {},
"outputs": [],
"source": [
"env_path = \".env\"\n",
"with open(env_path) as f:\n",
" env = {}\n",
" for line in f.readlines():\n",
" if line.startswith(\"#\"):\n",
" continue\n",
" env_key, _val = line.split(\"=\", 1)\n",
" env_value = _val.split(\"\\n\")[0]\n",
" env[env_key] = env_value\n",
"\n",
"postgres_thread_pool = ThreadedConnectionPool(\n",
" 1,\n",
" 50,\n",
" host=env[\"API_POSTGRES_HOST\"],\n",
" port=env[\"API_POSTGRES_PORT\"],\n",
" user=env[\"API_POSTGRES_USERNAME\"],\n",
" password=env[\"API_POSTGRES_PASSWORD\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2e3a67c7-1a09-4cb0-a198-972091e02293",
"metadata": {},
"outputs": [],
"source": [
"CSV_URL = \"https://hdr.undp.org/sites/default/files/data/2020/IHDI_HDR2020_040722.csv\"\n",
"YEAR = 2019"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b19b8c3b-a425-4900-8c24-ab0d490fbe3d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>iso3</th>\n",
" <th>country</th>\n",
" <th>hdicode</th>\n",
" <th>region</th>\n",
" <th>hdi_2010</th>\n",
" <th>hdi_2011</th>\n",
" <th>hdi_2012</th>\n",
" <th>hdi_2013</th>\n",
" <th>hdi_2014</th>\n",
" <th>hdi_2015</th>\n",
" <th>...</th>\n",
" <th>ineq_inc_2010</th>\n",
" <th>ineq_inc_2011</th>\n",
" <th>ineq_inc_2012</th>\n",
" <th>ineq_inc_2013</th>\n",
" <th>ineq_inc_2014</th>\n",
" <th>ineq_inc_2015</th>\n",
" <th>ineq_inc_2016</th>\n",
" <th>ineq_inc_2017</th>\n",
" <th>ineq_inc_2018</th>\n",
" <th>ineq_inc_2019</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AFG</td>\n",
" <td>Afghanistan</td>\n",
" <td>Low</td>\n",
" <td>SA</td>\n",
" <td>0.472</td>\n",
" <td>0.477</td>\n",
" <td>0.489</td>\n",
" <td>0.496</td>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>...</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>10.8</td>\n",
" <td>10.8</td>\n",
" <td>10.8</td>\n",
" <td>10.8</td>\n",
" <td>10.8</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AGO</td>\n",
" <td>Angola</td>\n",
" <td>Medium</td>\n",
" <td>SSA</td>\n",
" <td>0.517</td>\n",
" <td>0.533</td>\n",
" <td>0.544</td>\n",
" <td>0.555</td>\n",
" <td>0.565</td>\n",
" <td>0.572</td>\n",
" <td>...</td>\n",
" <td>36.4</td>\n",
" <td>50</td>\n",
" <td>50</td>\n",
" <td>50</td>\n",
" <td>28.9</td>\n",
" <td>28.9</td>\n",
" <td>28.9</td>\n",
" <td>28.9</td>\n",
" <td>28.9</td>\n",
" <td>28.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ALB</td>\n",
" <td>Albania</td>\n",
" <td>High</td>\n",
" <td>ECA</td>\n",
" <td>0.745</td>\n",
" <td>0.764</td>\n",
" <td>0.775</td>\n",
" <td>0.782</td>\n",
" <td>0.787</td>\n",
" <td>0.788</td>\n",
" <td>...</td>\n",
" <td>14.4</td>\n",
" <td>18.3</td>\n",
" <td>18.3</td>\n",
" <td>18.3</td>\n",
" <td>18.3</td>\n",
" <td>18.3</td>\n",
" <td>18.3</td>\n",
" <td>12.2</td>\n",
" <td>13.2</td>\n",
" <td>13.17898</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AND</td>\n",
" <td>Andorra</td>\n",
" <td>Very High</td>\n",
" <td></td>\n",
" <td>0.837</td>\n",
" <td>0.836</td>\n",
" <td>0.858</td>\n",
" <td>0.856</td>\n",
" <td>0.863</td>\n",
" <td>0.862</td>\n",
" <td>...</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ARE</td>\n",
" <td>United Arab Emirates</td>\n",
" <td>Very High</td>\n",
" <td>AS</td>\n",
" <td>0.82</td>\n",
" <td>0.826</td>\n",
" <td>0.832</td>\n",
" <td>0.838</td>\n",
" <td>0.847</td>\n",
" <td>0.859</td>\n",
" <td>...</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 74 columns</p>\n",
"</div>"
],
"text/plain": [
" iso3 country hdicode region hdi_2010 hdi_2011 hdi_2012 \\\n",
"0 AFG Afghanistan Low SA 0.472 0.477 0.489 \n",
"1 AGO Angola Medium SSA 0.517 0.533 0.544 \n",
"2 ALB Albania High ECA 0.745 0.764 0.775 \n",
"3 AND Andorra Very High 0.837 0.836 0.858 \n",
"4 ARE United Arab Emirates Very High AS 0.82 0.826 0.832 \n",
"\n",
" hdi_2013 hdi_2014 hdi_2015 ... ineq_inc_2010 ineq_inc_2011 ineq_inc_2012 \\\n",
"0 0.496 0.5 0.5 ... \n",
"1 0.555 0.565 0.572 ... 36.4 50 50 \n",
"2 0.782 0.787 0.788 ... 14.4 18.3 18.3 \n",
"3 0.856 0.863 0.862 ... \n",
"4 0.838 0.847 0.859 ... \n",
"\n",
" ineq_inc_2013 ineq_inc_2014 ineq_inc_2015 ineq_inc_2016 ineq_inc_2017 \\\n",
"0 10.8 10.8 10.8 10.8 10.8 \n",
"1 50 28.9 28.9 28.9 28.9 \n",
"2 18.3 18.3 18.3 18.3 12.2 \n",
"3 \n",
"4 \n",
"\n",
" ineq_inc_2018 ineq_inc_2019 \n",
"0 \n",
"1 28.9 28.9 \n",
"2 13.2 13.17898 \n",
"3 \n",
"4 \n",
"\n",
"[5 rows x 74 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with requests.Session() as s:\n",
" download = s.get(CSV_URL)\n",
" decoded_content = download.content.decode(\"utf-8\")\n",
"\n",
"csv_rows = list(csv.reader(decoded_content.splitlines(), delimiter=\",\"))\n",
"df = pd.DataFrame(csv_rows[1:], columns=csv_rows[0])\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "09bbd410-1be7-44d6-bdea-98da75784a7d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 206 entries, 0 to 205\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 iso3 206 non-null object \n",
" 1 country 206 non-null object \n",
" 2 hdicode 206 non-null object \n",
" 3 region 206 non-null object \n",
" 4 hdi_2019 200 non-null float64\n",
"dtypes: float64(1), object(4)\n",
"memory usage: 8.2+ KB\n"
]
}
],
"source": [
"cols = f\"iso3 country hdicode region hdi_{YEAR}\".split()\n",
"df = df.loc[:, cols] # filter columns\n",
"df[\"hdi_2019\"] = pd.to_numeric(df[\"hdi_2019\"])\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "d0212cfd-701b-4814-a363-ace5023f9005",
"metadata": {},
"outputs": [],
"source": [
"df = df.dropna(subset=[\"hdi_2019\"])"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "35829c57-c84a-4251-b676-7b9c9930c843",
"metadata": {},
"outputs": [],
"source": [
"df = df[df.iso3 != \"\"]"
]
},
{
"cell_type": "markdown",
"id": "a49fbd3f-7fa9-405d-862e-773f5a9a783c",
"metadata": {},
"source": [
"## Values for the meatadata.json\n",
"\n",
"### minimum value"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "297d8e53-2829-480b-afd2-84082524f7c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.394"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"hdi_2019\"].min()"
]
},
{
"cell_type": "markdown",
"id": "1ab6e39e-d1ee-4c80-9618-eeaee3c0a43e",
"metadata": {},
"source": [
"\n",
"### `min` and `max` for every hdi code (low -> very high) to set as bins for the legend"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "5ceb3f74-82ff-44fe-a7cc-3a84edd42098",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"hdicode\n",
"High 0.703 - 0.796\n",
"Low 0.394 - 0.546\n",
"Medium 0.554 - 0.697\n",
"Very High 0.804 - 0.957\n",
"Name: hdi_2019, dtype: object"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"hdicode\")[\"hdi_2019\"].apply(lambda x: f\"{x.min()} - {x.max()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d680140b-8aea-471f-b3af-0f8a16fc83b8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}