okfn-brasil/perfil-politico

View on GitHub
jupyter/limpa-candidatos-2018.ipynb

Summary

Maintainability
Test Coverage
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import json\n",
    "import os\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '../data/consulta_cand_2018'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = glob.glob(os.path.join(path,'*.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels_final = [\n",
    "    'ano_eleicao',\n",
    "    'num_turno',\n",
    "    'descricao_eleicao',\n",
    "    'sigla_uf',\n",
    "    'sigla_ue',\n",
    "    'descricao_ue',\n",
    "    'codigo_cargo',\n",
    "    'descricao_cargo',\n",
    "    'nome_candidato',\n",
    "    'sequencial_candidato',\n",
    "    'numero_candidato',\n",
    "    'cpf_candidato',\n",
    "    'nome_urna_candidato',\n",
    "    'cod_situacao_candidatura',\n",
    "    'des_situacao_candidatura',\n",
    "    'numero_partido',\n",
    "    'sigla_partido',\n",
    "    'nome_partido',        \n",
    "    'codigo_legenda',\n",
    "    'sigla_legenda',\n",
    "    'composicao_legenda',\n",
    "    'nome_legenda',\n",
    "    'codigo_ocupacao',\n",
    "    'descricao_ocupacao',\n",
    "    'data_nascimento',\n",
    "    'num_titulo_eleitoral_candidato',\n",
    "    'idade_data_eleicao',\n",
    "    'codigo_sexo',\n",
    "    'descricao_sexo',\n",
    "    'cod_grau_instrucao',\n",
    "    'descricao_grau_instrucao',\n",
    "    'codigo_estado_civil',\n",
    "    'descricao_estado_civil',\n",
    "    'codigo_cor_raca',\n",
    "    'descricao_cor_raca',\n",
    "    'codigo_nacionalidade',\n",
    "    'descricao_nacionalidade',\n",
    "    'sigla_uf_nascimento',\n",
    "    'codigo_municipio_nascimento',\n",
    "    'nome_municipio_nascimento',\n",
    "    'despesa_max_campanha',\n",
    "    'cod_sit_tot_turno',\n",
    "    'desc_sit_tot_turno',\n",
    "    'nm_email',\n",
    "    'datahora_geracao',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>DT_GERACAO</th>\n",
       "      <th>HH_GERACAO</th>\n",
       "      <th>ANO_ELEICAO</th>\n",
       "      <th>CD_TIPO_ELEICAO</th>\n",
       "      <th>NM_TIPO_ELEICAO</th>\n",
       "      <th>NR_TURNO</th>\n",
       "      <th>CD_ELEICAO</th>\n",
       "      <th>DS_ELEICAO</th>\n",
       "      <th>DT_ELEICAO</th>\n",
       "      <th>TP_ABRANGENCIA</th>\n",
       "      <th>...</th>\n",
       "      <th>DS_COR_RACA</th>\n",
       "      <th>CD_OCUPACAO</th>\n",
       "      <th>DS_OCUPACAO</th>\n",
       "      <th>NR_DESPESA_MAX_CAMPANHA</th>\n",
       "      <th>CD_SIT_TOT_TURNO</th>\n",
       "      <th>DS_SIT_TOT_TURNO</th>\n",
       "      <th>ST_REELEICAO</th>\n",
       "      <th>ST_DECLARAR_BENS</th>\n",
       "      <th>NR_PROTOCOLO_CANDIDATURA</th>\n",
       "      <th>NR_PROCESSO</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>20/08/2018</td>\n",
       "      <td>15:14:58</td>\n",
       "      <td>2018</td>\n",
       "      <td>2</td>\n",
       "      <td>ELEIÇÃO ORDINÁRIA</td>\n",
       "      <td>1</td>\n",
       "      <td>297</td>\n",
       "      <td>Eleições Gerais Estaduais 2018</td>\n",
       "      <td>07/10/2018</td>\n",
       "      <td>ESTADUAL</td>\n",
       "      <td>...</td>\n",
       "      <td>PARDA</td>\n",
       "      <td>125</td>\n",
       "      <td>ADMINISTRADOR</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>#NULO#</td>\n",
       "      <td>N</td>\n",
       "      <td>S</td>\n",
       "      <td>-1</td>\n",
       "      <td>6009339520186140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20/08/2018</td>\n",
       "      <td>15:14:58</td>\n",
       "      <td>2018</td>\n",
       "      <td>2</td>\n",
       "      <td>ELEIÇÃO ORDINÁRIA</td>\n",
       "      <td>1</td>\n",
       "      <td>297</td>\n",
       "      <td>Eleições Gerais Estaduais 2018</td>\n",
       "      <td>07/10/2018</td>\n",
       "      <td>ESTADUAL</td>\n",
       "      <td>...</td>\n",
       "      <td>BRANCA</td>\n",
       "      <td>131</td>\n",
       "      <td>ADVOGADO</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>#NULO#</td>\n",
       "      <td>N</td>\n",
       "      <td>S</td>\n",
       "      <td>-1</td>\n",
       "      <td>6004896220186140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20/08/2018</td>\n",
       "      <td>15:14:58</td>\n",
       "      <td>2018</td>\n",
       "      <td>2</td>\n",
       "      <td>ELEIÇÃO ORDINÁRIA</td>\n",
       "      <td>1</td>\n",
       "      <td>297</td>\n",
       "      <td>Eleições Gerais Estaduais 2018</td>\n",
       "      <td>07/10/2018</td>\n",
       "      <td>ESTADUAL</td>\n",
       "      <td>...</td>\n",
       "      <td>PARDA</td>\n",
       "      <td>125</td>\n",
       "      <td>ADMINISTRADOR</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>#NULO#</td>\n",
       "      <td>N</td>\n",
       "      <td>N</td>\n",
       "      <td>-1</td>\n",
       "      <td>6003735620186140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20/08/2018</td>\n",
       "      <td>15:14:58</td>\n",
       "      <td>2018</td>\n",
       "      <td>2</td>\n",
       "      <td>ELEIÇÃO ORDINÁRIA</td>\n",
       "      <td>1</td>\n",
       "      <td>297</td>\n",
       "      <td>Eleições Gerais Estaduais 2018</td>\n",
       "      <td>07/10/2018</td>\n",
       "      <td>ESTADUAL</td>\n",
       "      <td>...</td>\n",
       "      <td>PARDA</td>\n",
       "      <td>278</td>\n",
       "      <td>VEREADOR</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>#NULO#</td>\n",
       "      <td>N</td>\n",
       "      <td>S</td>\n",
       "      <td>-1</td>\n",
       "      <td>6008698520186140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20/08/2018</td>\n",
       "      <td>15:14:58</td>\n",
       "      <td>2018</td>\n",
       "      <td>2</td>\n",
       "      <td>ELEIÇÃO ORDINÁRIA</td>\n",
       "      <td>1</td>\n",
       "      <td>297</td>\n",
       "      <td>Eleições Gerais Estaduais 2018</td>\n",
       "      <td>07/10/2018</td>\n",
       "      <td>ESTADUAL</td>\n",
       "      <td>...</td>\n",
       "      <td>PRETA</td>\n",
       "      <td>931</td>\n",
       "      <td>ESTUDANTE, BOLSISTA, ESTAGIÁRIO E ASSEMELHADOS</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>#NULO#</td>\n",
       "      <td>N</td>\n",
       "      <td>N</td>\n",
       "      <td>-1</td>\n",
       "      <td>6010716220186140000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 58 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   DT_GERACAO HH_GERACAO  ANO_ELEICAO  CD_TIPO_ELEICAO    NM_TIPO_ELEICAO  \\\n",
       "0  20/08/2018   15:14:58         2018                2  ELEIÇÃO ORDINÁRIA   \n",
       "1  20/08/2018   15:14:58         2018                2  ELEIÇÃO ORDINÁRIA   \n",
       "2  20/08/2018   15:14:58         2018                2  ELEIÇÃO ORDINÁRIA   \n",
       "3  20/08/2018   15:14:58         2018                2  ELEIÇÃO ORDINÁRIA   \n",
       "4  20/08/2018   15:14:58         2018                2  ELEIÇÃO ORDINÁRIA   \n",
       "\n",
       "   NR_TURNO  CD_ELEICAO                      DS_ELEICAO  DT_ELEICAO  \\\n",
       "0         1         297  Eleições Gerais Estaduais 2018  07/10/2018   \n",
       "1         1         297  Eleições Gerais Estaduais 2018  07/10/2018   \n",
       "2         1         297  Eleições Gerais Estaduais 2018  07/10/2018   \n",
       "3         1         297  Eleições Gerais Estaduais 2018  07/10/2018   \n",
       "4         1         297  Eleições Gerais Estaduais 2018  07/10/2018   \n",
       "\n",
       "  TP_ABRANGENCIA         ...          DS_COR_RACA CD_OCUPACAO  \\\n",
       "0       ESTADUAL         ...                PARDA         125   \n",
       "1       ESTADUAL         ...               BRANCA         131   \n",
       "2       ESTADUAL         ...                PARDA         125   \n",
       "3       ESTADUAL         ...                PARDA         278   \n",
       "4       ESTADUAL         ...                PRETA         931   \n",
       "\n",
       "                                      DS_OCUPACAO  NR_DESPESA_MAX_CAMPANHA  \\\n",
       "0                                   ADMINISTRADOR                       -1   \n",
       "1                                        ADVOGADO                       -1   \n",
       "2                                   ADMINISTRADOR                       -1   \n",
       "3                                        VEREADOR                       -1   \n",
       "4  ESTUDANTE, BOLSISTA, ESTAGIÁRIO E ASSEMELHADOS                       -1   \n",
       "\n",
       "  CD_SIT_TOT_TURNO  DS_SIT_TOT_TURNO  ST_REELEICAO ST_DECLARAR_BENS  \\\n",
       "0               -1            #NULO#             N                S   \n",
       "1               -1            #NULO#             N                S   \n",
       "2               -1            #NULO#             N                N   \n",
       "3               -1            #NULO#             N                S   \n",
       "4               -1            #NULO#             N                N   \n",
       "\n",
       "  NR_PROTOCOLO_CANDIDATURA          NR_PROCESSO  \n",
       "0                       -1  6009339520186140000  \n",
       "1                       -1  6004896220186140000  \n",
       "2                       -1  6003735620186140000  \n",
       "3                       -1  6008698520186140000  \n",
       "4                       -1  6010716220186140000  \n",
       "\n",
       "[5 rows x 58 columns]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfs_candidatos = [] \n",
    "for file in files:\n",
    "    df = pd.read_csv(file, sep=';', encoding='latin-1')\n",
    "    df.rename(columns={\n",
    "        'ANO_ELEICAO': 'ano_eleicao',\n",
    "        'NR_TURNO': 'num_turno',\n",
    "        'DS_ELEICAO': 'descricao_eleicao',\n",
    "        'SG_UF': 'sigla_uf',\n",
    "        'SG_UE': 'sigla_ue',\n",
    "        'NM_UE': 'descricao_ue',\n",
    "        'CD_CARGO': 'codigo_cargo',\n",
    "        'DS_CARGO': 'descricao_cargo',\n",
    "        'NM_CANDIDATO': 'nome_candidato',\n",
    "        'SQ_CANDIDATO': 'sequencial_candidato',\n",
    "        'NR_CANDIDATO': 'numero_candidato',\n",
    "        'NR_CPF_CANDIDATO': 'cpf_candidato',\n",
    "        'NM_URNA_CANDIDATO': 'nome_urna_candidato',\n",
    "        'CD_SITUACAO_CANDIDATURA': 'cod_situacao_candidatura',\n",
    "        'DS_SITUACAO_CANDIDATURA': 'des_situacao_candidatura',\n",
    "        'NR_PARTIDO': 'numero_partido',\n",
    "        'SG_PARTIDO': 'sigla_partido',\n",
    "        'NM_PARTIDO': 'nome_partido',        \n",
    "        'TP_AGREMIACAO': 'codigo_legenda',\n",
    "        'SQ_COLIGACAO': 'sigla_legenda',\n",
    "        'DS_COMPOSICAO_COLIGACAO': 'composicao_legenda',\n",
    "        'NM_COLIGACAO': 'nome_legenda',\n",
    "        'CD_OCUPACAO': 'codigo_ocupacao',\n",
    "        'DS_OCUPACAO': 'descricao_ocupacao',\n",
    "        'DT_NASCIMENTO': 'data_nascimento',\n",
    "        'NR_TITULO_ELEITORAL_CANDIDATO': 'num_titulo_eleitoral_candidato',\n",
    "        'NR_IDADE_DATA_POSSE': 'idade_data_eleicao',\n",
    "        'CD_GENERO': 'codigo_sexo',\n",
    "        'DS_GENERO': 'descricao_sexo',\n",
    "        'CD_GRAU_INSTRUCAO': 'cod_grau_instrucao',\n",
    "        'DS_GRAU_INSTRUCAO': 'descricao_grau_instrucao',\n",
    "        'CD_ESTADO_CIVIL': 'codigo_estado_civil',\n",
    "        'DS_ESTADO_CIVIL': 'descricao_estado_civil',\n",
    "        'CD_COR_RACA': 'codigo_cor_raca',\n",
    "        'DS_COR_RACA': 'descricao_cor_raca',\n",
    "        'CD_NACIONALIDADE': 'codigo_nacionalidade',\n",
    "        'DS_NACIONALIDADE': 'descricao_nacionalidade',\n",
    "        'SG_UF_NASCIMENTO': 'sigla_uf_nascimento',\n",
    "        'CD_MUNICIPIO_NASCIMENTO': 'codigo_municipio_nascimento',\n",
    "        'NM_MUNICIPIO_NASCIMENTO': 'nome_municipio_nascimento',\n",
    "        'NR_DESPESA_MAX_CAMPANHA': 'despesa_max_campanha',\n",
    "        'CD_SIT_TOT_TURNO': 'cod_sit_tot_turno',\n",
    "        'DS_SIT_TOT_TURNO': 'desc_sit_tot_turno',\n",
    "        'NM_EMAIL': 'nm_email',\n",
    "    }, inplace=True)\n",
    "    df['datahora_geracao'] = df.DT_GERACAO + ' ' + df.HH_GERACAO\n",
    "    df = df[labels_final]\n",
    "    df.desc_sit_tot_turno = df.desc_sit_tot_turno.str.replace('#NULO#', '')\n",
    "\n",
    "    dfs_candidatos.append(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_candidatos_2018 = pd.concat(dfs_candidatos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(27887, 45)\n"
     ]
    }
   ],
   "source": [
    "print(df_candidatos_2018.shape)\n",
    "df_candidatos_2018.to_csv('../data/candidatos-2018-limpo.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}