jupyter/Deputies' tweets.ipynb from okfn-brasil/perfil-politico

jupyter/Deputies' tweets.ipynb
Summary

Maintainability

Test Coverage

Issues
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "from collections import Counter\n",
    "\n",
    "import pandas as pd\n",
    "from nltk import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "from polyglot.text import Text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nbr_retweet</th>\n",
       "      <th>user_id</th>\n",
       "      <th>url</th>\n",
       "      <th>text</th>\n",
       "      <th>usernameTweet</th>\n",
       "      <th>datetime</th>\n",
       "      <th>is_reply</th>\n",
       "      <th>is_retweet</th>\n",
       "      <th>ID</th>\n",
       "      <th>nbr_reply</th>\n",
       "      <th>nbr_favorite</th>\n",
       "      <th>medias</th>\n",
       "      <th>has_media</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>67061352</td>\n",
       "      <td>/anaperugini/status/248235576548012032</td>\n",
       "      <td>Dep. @anaperugini  no programa Notícias em Deb...</td>\n",
       "      <td>anaperugini</td>\n",
       "      <td>2012-09-18 22:42:28</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>248235576548012032</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>116</td>\n",
       "      <td>22864100</td>\n",
       "      <td>/DeputadoFederal/status/960864983881043968</td>\n",
       "      <td>E agora, Moro ? pic.twitter.com/5t4sLmm6gc</td>\n",
       "      <td>DeputadoFederal</td>\n",
       "      <td>2018-02-06 11:17:18</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>960864983881043968</td>\n",
       "      <td>6</td>\n",
       "      <td>155</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>63020349</td>\n",
       "      <td>/deputadoariosto/status/512154450879074304</td>\n",
       "      <td>Não existe crise em 2014 para empresa maior em...</td>\n",
       "      <td>deputadoariosto</td>\n",
       "      <td>2014-09-17 05:21:51</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>512154450879074304</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>22</td>\n",
       "      <td>35805725</td>\n",
       "      <td>/marcofeliciano/status/522439346008621058</td>\n",
       "      <td>\"Tu, Senhor, guardarás em perfeita paz aquele ...</td>\n",
       "      <td>marcofeliciano</td>\n",
       "      <td>2014-10-15 14:30:21</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>522439346008621058</td>\n",
       "      <td>1</td>\n",
       "      <td>51</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>92033111</td>\n",
       "      <td>/depbulhoes/status/713733683065331714</td>\n",
       "      <td>@Marciabasto Eu que agradeço por sua compreens...</td>\n",
       "      <td>depbulhoes</td>\n",
       "      <td>2016-03-26 11:25:45</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>713733683065331714</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   nbr_retweet   user_id                                         url  \\\n",
       "0            0  67061352      /anaperugini/status/248235576548012032   \n",
       "1          116  22864100  /DeputadoFederal/status/960864983881043968   \n",
       "2            1  63020349  /deputadoariosto/status/512154450879074304   \n",
       "3           22  35805725   /marcofeliciano/status/522439346008621058   \n",
       "4            0  92033111       /depbulhoes/status/713733683065331714   \n",
       "\n",
       "                                                text    usernameTweet  \\\n",
       "0  Dep. @anaperugini  no programa Notícias em Deb...      anaperugini   \n",
       "1         E agora, Moro ? pic.twitter.com/5t4sLmm6gc  DeputadoFederal   \n",
       "2  Não existe crise em 2014 para empresa maior em...  deputadoariosto   \n",
       "3  \"Tu, Senhor, guardarás em perfeita paz aquele ...   marcofeliciano   \n",
       "4  @Marciabasto Eu que agradeço por sua compreens...       depbulhoes   \n",
       "\n",
       "              datetime  is_reply  is_retweet                  ID  nbr_reply  \\\n",
       "0  2012-09-18 22:42:28     False       False  248235576548012032          0   \n",
       "1  2018-02-06 11:17:18     False       False  960864983881043968          6   \n",
       "2  2014-09-17 05:21:51     False       False  512154450879074304          0   \n",
       "3  2014-10-15 14:30:21     False       False  522439346008621058          1   \n",
       "4  2016-03-26 11:25:45     False       False  713733683065331714          0   \n",
       "\n",
       "   nbr_favorite medias has_media  \n",
       "0             0    NaN       NaN  \n",
       "1           155    NaN       NaN  \n",
       "2             0    NaN       NaN  \n",
       "3            51    NaN       NaN  \n",
       "4             0    NaN       NaN  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Input location kept in one place so it is easy to repoint.\n",
    "TWEETS_PATH = '/mnt/data/tweets.csv.xz'\n",
    "\n",
    "df = pd.read_csv(TWEETS_PATH)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3018484, 13)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Entidades"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the stopword corpus once: stopwords.words() re-reads the corpus and returns\n",
    "# a list on every call, which is far too slow when invoked once per token.\n",
    "PORTUGUESE_STOPWORDS = frozenset(stopwords.words('portuguese'))\n",
    "\n",
    "\n",
    "def valid_token(token):\n",
    "    \"\"\"Return True when `token` is purely alphabetic and not a Portuguese stopword.\n",
    "\n",
    "    Surrounding whitespace is stripped before checking. The stopword comparison\n",
    "    is case-sensitive.\n",
    "    \"\"\"\n",
    "    token = token.strip()\n",
    "    return token.isalpha() and token not in PORTUGUESE_STOPWORDS\n",
    "\n",
    "def pre_process(rows):\n",
    "    \"\"\"Merge the `text` column of `rows` into a single string of valid tokens.\n",
    "\n",
    "    Missing values in `text` are skipped, because joining a NaN with strings\n",
    "    raises TypeError. The merged text is tokenized with `word_tokenize`, tokens\n",
    "    rejected by `valid_token` are dropped, and the rest are joined with spaces.\n",
    "    \"\"\"\n",
    "    contents = ' '.join(rows.text.dropna().astype(str))\n",
    "    tokens = (token for token in word_tokenize(contents) if valid_token(token))\n",
    "    return ' '.join(tokens)\n",
    "\n",
    "def get_entities(rows, **kwargs):\n",
    "    \"\"\"Count the most frequent named entities found in the tweets of `rows`.\n",
    "\n",
    "    Keyword arguments:\n",
    "        top: number of entities to report (default 10).\n",
    "        min_rows: minimum number of rows needed before entities are extracted\n",
    "            (default 100); smaller inputs return only empty slots.\n",
    "\n",
    "    Returns a pd.Series holding `top_entity_<n>` and `top_entity_<n>_count` for\n",
    "    n in 1..top. Slots that could not be filled are None.\n",
    "    \"\"\"\n",
    "    top = kwargs.get('top', 10)\n",
    "    min_rows = kwargs.get('min_rows', 100)\n",
    "\n",
    "    # Pre-fill every slot so each result has the same index, whatever is found.\n",
    "    data = {}\n",
    "    for rank in range(1, top + 1):\n",
    "        data[f'top_entity_{rank}'] = None\n",
    "        data[f'top_entity_{rank}_count'] = None\n",
    "\n",
    "    if len(rows) < min_rows:\n",
    "        return pd.Series(data)\n",
    "\n",
    "    parsed = Text(pre_process(rows), hint_language_code='pt')\n",
    "    # Each entity is a sequence of words; join them so they can be counted as one key.\n",
    "    counter = Counter(' '.join(entity) for entity in parsed.entities)\n",
    "\n",
    "    for rank, (entity, entity_count) in enumerate(counter.most_common(top), 1):\n",
    "        data[f'top_entity_{rank}'] = entity\n",
    "        data[f'top_entity_{rank}_count'] = entity_count\n",
    "\n",
    "    return pd.Series(data)\n",
    "\n",
    "# One row of top entities per account, ordered by the count of its top entity.\n",
    "# Rows containing any empty slot are discarded by dropna().\n",
    "grouped = (\n",
    "    df.groupby('usernameTweet')\n",
    "    .apply(get_entities)\n",
    "    .dropna()\n",
    "    .sort_values('top_entity_1_count', ascending=False)\n",
    "    .reset_index()\n",
    ")\n",
    "grouped.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same extraction over every tweet at once, wrapped into a one-row frame.\n",
    "overall_entities = get_entities(df)\n",
    "total = pd.DataFrame([overall_entities])\n",
    "total.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output locations for the per-account and the overall entity tables.\n",
    "GROUPED_PATH = '/mnt/data/tweets_grouped_by_congressperson.csv'\n",
    "TOTAL_PATH = '/mnt/data/all_tweets_by_congresspeople.csv'\n",
    "\n",
    "grouped.to_csv(GROUPED_PATH)\n",
    "total.to_csv(TOTAL_PATH)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}