jupyter/Deputies' tweets.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import string\n",
"from collections import Counter\n",
"\n",
"import pandas as pd\n",
"from nltk import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from polyglot.text import Text"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nbr_retweet</th>\n",
" <th>user_id</th>\n",
" <th>url</th>\n",
" <th>text</th>\n",
" <th>usernameTweet</th>\n",
" <th>datetime</th>\n",
" <th>is_reply</th>\n",
" <th>is_retweet</th>\n",
" <th>ID</th>\n",
" <th>nbr_reply</th>\n",
" <th>nbr_favorite</th>\n",
" <th>medias</th>\n",
" <th>has_media</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>67061352</td>\n",
" <td>/anaperugini/status/248235576548012032</td>\n",
" <td>Dep. @anaperugini no programa Notícias em Deb...</td>\n",
" <td>anaperugini</td>\n",
" <td>2012-09-18 22:42:28</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>248235576548012032</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>116</td>\n",
" <td>22864100</td>\n",
" <td>/DeputadoFederal/status/960864983881043968</td>\n",
" <td>E agora, Moro ? pic.twitter.com/5t4sLmm6gc</td>\n",
" <td>DeputadoFederal</td>\n",
" <td>2018-02-06 11:17:18</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>960864983881043968</td>\n",
" <td>6</td>\n",
" <td>155</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>63020349</td>\n",
" <td>/deputadoariosto/status/512154450879074304</td>\n",
" <td>Não existe crise em 2014 para empresa maior em...</td>\n",
" <td>deputadoariosto</td>\n",
" <td>2014-09-17 05:21:51</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>512154450879074304</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>22</td>\n",
" <td>35805725</td>\n",
" <td>/marcofeliciano/status/522439346008621058</td>\n",
" <td>\"Tu, Senhor, guardarás em perfeita paz aquele ...</td>\n",
" <td>marcofeliciano</td>\n",
" <td>2014-10-15 14:30:21</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>522439346008621058</td>\n",
" <td>1</td>\n",
" <td>51</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>92033111</td>\n",
" <td>/depbulhoes/status/713733683065331714</td>\n",
" <td>@Marciabasto Eu que agradeço por sua compreens...</td>\n",
" <td>depbulhoes</td>\n",
" <td>2016-03-26 11:25:45</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>713733683065331714</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nbr_retweet user_id url \\\n",
"0 0 67061352 /anaperugini/status/248235576548012032 \n",
"1 116 22864100 /DeputadoFederal/status/960864983881043968 \n",
"2 1 63020349 /deputadoariosto/status/512154450879074304 \n",
"3 22 35805725 /marcofeliciano/status/522439346008621058 \n",
"4 0 92033111 /depbulhoes/status/713733683065331714 \n",
"\n",
" text usernameTweet \\\n",
"0 Dep. @anaperugini no programa Notícias em Deb... anaperugini \n",
"1 E agora, Moro ? pic.twitter.com/5t4sLmm6gc DeputadoFederal \n",
"2 Não existe crise em 2014 para empresa maior em... deputadoariosto \n",
"3 \"Tu, Senhor, guardarás em perfeita paz aquele ... marcofeliciano \n",
"4 @Marciabasto Eu que agradeço por sua compreens... depbulhoes \n",
"\n",
" datetime is_reply is_retweet ID nbr_reply \\\n",
"0 2012-09-18 22:42:28 False False 248235576548012032 0 \n",
"1 2018-02-06 11:17:18 False False 960864983881043968 6 \n",
"2 2014-09-17 05:21:51 False False 512154450879074304 0 \n",
"3 2014-10-15 14:30:21 False False 522439346008621058 1 \n",
"4 2016-03-26 11:25:45 False False 713733683065331714 0 \n",
"\n",
" nbr_favorite medias has_media \n",
"0 0 NaN NaN \n",
"1 155 NaN NaN \n",
"2 0 NaN NaN \n",
"3 51 NaN NaN \n",
"4 0 NaN NaN "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('/mnt/data/tweets.csv.xz')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3018484, 13)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Entidades"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def valid_token(token):\n",
" token = token.strip()\n",
" \n",
" if token in stopwords.words('portuguese'):\n",
" return False\n",
" \n",
" return token.isalpha()\n",
"\n",
"def pre_process(rows):\n",
" contents = ' '.join(rows.text)\n",
" tokens = (token for token in word_tokenize(contents) if valid_token(token))\n",
" return ' '.join(tokens)\n",
"\n",
"def get_entities(rows, **kwargs):\n",
" top = kwargs.get('top', 10)\n",
" \n",
" data = {}\n",
" for count in range(1, top + 1):\n",
" data[f'top_entity_{count}'] = None\n",
" data[f'top_entity_{count}_count'] = None\n",
"\n",
" if len(rows) < 100:\n",
" return pd.Series(data)\n",
"\n",
" text = Text(pre_process(rows), hint_language_code='pt')\n",
" entities = (' '.join(entity) for entity in text.entities)\n",
" counter = Counter(entities)\n",
"\n",
" \n",
" for count, obj in enumerate(counter.most_common(top), 1):\n",
" text, entity_count = obj\n",
" data[f'top_entity_{count}'] = text\n",
" data[f'top_entity_{count}_count'] = entity_count\n",
" \n",
" return pd.Series(data)\n",
"\n",
"grouped = df.groupby('usernameTweet') \\\n",
" .apply(get_entities) \\\n",
" .dropna() \\\n",
" .sort_values('top_entity_1_count', ascending=False) \\\n",
" .reset_index()\n",
"grouped.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"total = pd.DataFrame([get_entities(df)])\n",
"total.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"grouped.to_csv('/mnt/data/tweets_grouped_by_congressperson.csv')\n",
"total.to_csv('/mnt/data/all_tweets_by_congresspeople.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}