data/notebooks/Lab/9_contextual_IDHI_dataset.ipynb from Vizzuality/landgriffon

data/notebooks/Lab/9_contextual_IDHI_dataset.ipynb
Summary

Maintainability

Test Coverage

Issues
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d5022bb7-643c-4346-a4d2-86587bdcb9d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "import geopandas as gpd\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests\n",
    "from psycopg2.pool import ThreadedConnectionPool\n",
    "from shapely.geometry import shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "703af319-5a48-42f8-9a84-06329fcd73e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "env_path = \".env\"\n",
    "with open(env_path) as f:\n",
    "    env = {}\n",
    "    for line in f.readlines():\n",
    "        if line.startswith(\"#\"):\n",
    "            continue\n",
    "        env_key, _val = line.split(\"=\", 1)\n",
    "        env_value = _val.split(\"\\n\")[0]\n",
    "        env[env_key] = env_value\n",
    "\n",
    "postgres_thread_pool = ThreadedConnectionPool(\n",
    "    1,\n",
    "    50,\n",
    "    host=env[\"API_POSTGRES_HOST\"],\n",
    "    port=env[\"API_POSTGRES_PORT\"],\n",
    "    user=env[\"API_POSTGRES_USERNAME\"],\n",
    "    password=env[\"API_POSTGRES_PASSWORD\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2e3a67c7-1a09-4cb0-a198-972091e02293",
   "metadata": {},
   "outputs": [],
   "source": [
    "CSV_URL = \"https://hdr.undp.org/sites/default/files/data/2020/IHDI_HDR2020_040722.csv\"\n",
    "YEAR = 2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b19b8c3b-a425-4900-8c24-ab0d490fbe3d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>iso3</th>\n",
       "      <th>country</th>\n",
       "      <th>hdicode</th>\n",
       "      <th>region</th>\n",
       "      <th>hdi_2010</th>\n",
       "      <th>hdi_2011</th>\n",
       "      <th>hdi_2012</th>\n",
       "      <th>hdi_2013</th>\n",
       "      <th>hdi_2014</th>\n",
       "      <th>hdi_2015</th>\n",
       "      <th>...</th>\n",
       "      <th>ineq_inc_2010</th>\n",
       "      <th>ineq_inc_2011</th>\n",
       "      <th>ineq_inc_2012</th>\n",
       "      <th>ineq_inc_2013</th>\n",
       "      <th>ineq_inc_2014</th>\n",
       "      <th>ineq_inc_2015</th>\n",
       "      <th>ineq_inc_2016</th>\n",
       "      <th>ineq_inc_2017</th>\n",
       "      <th>ineq_inc_2018</th>\n",
       "      <th>ineq_inc_2019</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AFG</td>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>Low</td>\n",
       "      <td>SA</td>\n",
       "      <td>0.472</td>\n",
       "      <td>0.477</td>\n",
       "      <td>0.489</td>\n",
       "      <td>0.496</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>...</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>10.8</td>\n",
       "      <td>10.8</td>\n",
       "      <td>10.8</td>\n",
       "      <td>10.8</td>\n",
       "      <td>10.8</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AGO</td>\n",
       "      <td>Angola</td>\n",
       "      <td>Medium</td>\n",
       "      <td>SSA</td>\n",
       "      <td>0.517</td>\n",
       "      <td>0.533</td>\n",
       "      <td>0.544</td>\n",
       "      <td>0.555</td>\n",
       "      <td>0.565</td>\n",
       "      <td>0.572</td>\n",
       "      <td>...</td>\n",
       "      <td>36.4</td>\n",
       "      <td>50</td>\n",
       "      <td>50</td>\n",
       "      <td>50</td>\n",
       "      <td>28.9</td>\n",
       "      <td>28.9</td>\n",
       "      <td>28.9</td>\n",
       "      <td>28.9</td>\n",
       "      <td>28.9</td>\n",
       "      <td>28.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ALB</td>\n",
       "      <td>Albania</td>\n",
       "      <td>High</td>\n",
       "      <td>ECA</td>\n",
       "      <td>0.745</td>\n",
       "      <td>0.764</td>\n",
       "      <td>0.775</td>\n",
       "      <td>0.782</td>\n",
       "      <td>0.787</td>\n",
       "      <td>0.788</td>\n",
       "      <td>...</td>\n",
       "      <td>14.4</td>\n",
       "      <td>18.3</td>\n",
       "      <td>18.3</td>\n",
       "      <td>18.3</td>\n",
       "      <td>18.3</td>\n",
       "      <td>18.3</td>\n",
       "      <td>18.3</td>\n",
       "      <td>12.2</td>\n",
       "      <td>13.2</td>\n",
       "      <td>13.17898</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>AND</td>\n",
       "      <td>Andorra</td>\n",
       "      <td>Very High</td>\n",
       "      <td></td>\n",
       "      <td>0.837</td>\n",
       "      <td>0.836</td>\n",
       "      <td>0.858</td>\n",
       "      <td>0.856</td>\n",
       "      <td>0.863</td>\n",
       "      <td>0.862</td>\n",
       "      <td>...</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ARE</td>\n",
       "      <td>United Arab Emirates</td>\n",
       "      <td>Very High</td>\n",
       "      <td>AS</td>\n",
       "      <td>0.82</td>\n",
       "      <td>0.826</td>\n",
       "      <td>0.832</td>\n",
       "      <td>0.838</td>\n",
       "      <td>0.847</td>\n",
       "      <td>0.859</td>\n",
       "      <td>...</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 74 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  iso3               country    hdicode region hdi_2010 hdi_2011 hdi_2012  \\\n",
       "0  AFG           Afghanistan        Low     SA    0.472    0.477    0.489   \n",
       "1  AGO                Angola     Medium    SSA    0.517    0.533    0.544   \n",
       "2  ALB               Albania       High    ECA    0.745    0.764    0.775   \n",
       "3  AND               Andorra  Very High           0.837    0.836    0.858   \n",
       "4  ARE  United Arab Emirates  Very High     AS     0.82    0.826    0.832   \n",
       "\n",
       "  hdi_2013 hdi_2014 hdi_2015  ... ineq_inc_2010 ineq_inc_2011 ineq_inc_2012  \\\n",
       "0    0.496      0.5      0.5  ...                                             \n",
       "1    0.555    0.565    0.572  ...          36.4            50            50   \n",
       "2    0.782    0.787    0.788  ...          14.4          18.3          18.3   \n",
       "3    0.856    0.863    0.862  ...                                             \n",
       "4    0.838    0.847    0.859  ...                                             \n",
       "\n",
       "  ineq_inc_2013 ineq_inc_2014 ineq_inc_2015 ineq_inc_2016 ineq_inc_2017  \\\n",
       "0          10.8          10.8          10.8          10.8          10.8   \n",
       "1            50          28.9          28.9          28.9          28.9   \n",
       "2          18.3          18.3          18.3          18.3          12.2   \n",
       "3                                                                         \n",
       "4                                                                         \n",
       "\n",
       "  ineq_inc_2018 ineq_inc_2019  \n",
       "0                              \n",
       "1          28.9          28.9  \n",
       "2          13.2      13.17898  \n",
       "3                              \n",
       "4                              \n",
       "\n",
       "[5 rows x 74 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with requests.Session() as s:\n",
    "    download = s.get(CSV_URL)\n",
    "    decoded_content = download.content.decode(\"utf-8\")\n",
    "\n",
    "csv_rows = list(csv.reader(decoded_content.splitlines(), delimiter=\",\"))\n",
    "df = pd.DataFrame(csv_rows[1:], columns=csv_rows[0])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "09bbd410-1be7-44d6-bdea-98da75784a7d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 206 entries, 0 to 205\n",
      "Data columns (total 5 columns):\n",
      " #   Column    Non-Null Count  Dtype  \n",
      "---  ------    --------------  -----  \n",
      " 0   iso3      206 non-null    object \n",
      " 1   country   206 non-null    object \n",
      " 2   hdicode   206 non-null    object \n",
      " 3   region    206 non-null    object \n",
      " 4   hdi_2019  200 non-null    float64\n",
      "dtypes: float64(1), object(4)\n",
      "memory usage: 8.2+ KB\n"
     ]
    }
   ],
   "source": [
    "cols = f\"iso3 country hdicode region hdi_{YEAR}\".split()\n",
    "df = df.loc[:, cols]  # filter columns\n",
    "df[\"hdi_2019\"] = pd.to_numeric(df[\"hdi_2019\"])\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "d0212cfd-701b-4814-a363-ace5023f9005",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.dropna(subset=[\"hdi_2019\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "35829c57-c84a-4251-b676-7b9c9930c843",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[df.iso3 != \"\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a49fbd3f-7fa9-405d-862e-773f5a9a783c",
   "metadata": {},
   "source": [
    "## Values for the meatadata.json\n",
    "\n",
    "### minimum value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "297d8e53-2829-480b-afd2-84082524f7c0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.394"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"hdi_2019\"].min()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ab6e39e-d1ee-4c80-9618-eeaee3c0a43e",
   "metadata": {},
   "source": [
    "\n",
    "### `min` and `max` for every hdi code (low -> very high) to set as bins for the legend"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "5ceb3f74-82ff-44fe-a7cc-3a84edd42098",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "hdicode\n",
       "High         0.703 - 0.796\n",
       "Low          0.394 - 0.546\n",
       "Medium       0.554 - 0.697\n",
       "Very High    0.804 - 0.957\n",
       "Name: hdi_2019, dtype: object"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby(\"hdicode\")[\"hdi_2019\"].apply(lambda x: f\"{x.min()} - {x.max()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d680140b-8aea-471f-b3af-0f8a16fc83b8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}