projects/scratchpads/sosweet-correlations.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Basic exploration of correlations between node- and word- vecs in SoSweet"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"\n",
"# utilities and plotting\n",
"%matplotlib inline\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sb\n",
"# node2vec stuff\n",
"from gensim.models import Word2Vec\n",
"\n",
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:gensim.utils:loading Word2Vec object from data/sosweet-network/undir_weighted_mention_network_thresh_5.emb\n",
"INFO:gensim.utils:loading wv recursively from data/sosweet-network/undir_weighted_mention_network_thresh_5.emb.wv.* with mmap=None\n",
"INFO:gensim.utils:setting ignored attribute syn0norm to None\n",
"INFO:gensim.utils:setting ignored attribute cum_table to None\n",
"INFO:gensim.utils:loaded data/sosweet-network/undir_weighted_mention_network_thresh_5.emb\n",
"INFO:gensim.utils:loading Word2Vec object from data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v\n",
"INFO:gensim.utils:loading wv recursively from data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v.wv.* with mmap=None\n",
"INFO:gensim.utils:loading syn0 from data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v.wv.syn0.npy with mmap=None\n",
"INFO:gensim.utils:setting ignored attribute syn0norm to None\n",
"INFO:gensim.utils:loading syn1neg from data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v.syn1neg.npy with mmap=None\n",
"INFO:gensim.utils:setting ignored attribute cum_table to None\n",
"INFO:gensim.utils:loaded data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v\n"
]
}
],
"source": [
"network_emb = Word2Vec.load('data/sosweet-network/undir_weighted_mention_network_thresh_5.emb')\n",
"lang_emb = Word2Vec.load('data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"178359"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(network_emb.wv.vocab)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"492519"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(lang_emb.wv.vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TODODOC:\n",
"* users.txt: `cat data/sosweet-network/undir_weighted_mention_network_thresh_5.txt | scripts/pipe-unique_users > data/sosweet-network/undir_weigted_mention_network_thresh_5.users.txt`\n",
"* filtered tweets: `scripts/iter-gz \"scripts/pipe-filter_users data/sosweet-network/undir_weighted_mention_network_thresh_5.users.txt\" data/sosweet-text/undir_weighted_mention_network_thresh_5 data/sosweet-text/*.gz`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Jacobo's parsing to get the words out of the tweets (which we need to reproduce here):\n",
"\n",
"```bash\n",
"for f in /datastore/complexnet/twitter/data/2017*.tgz\n",
"do\n",
"\techo $f\n",
"\tzcat $f|sed \"s/^[^{]*//g\"| ./jq -r '.|[.actor.id,.body,.actor.followersCount,.actor.friendsCount]|@tsv'| sed '/RT /d;s/@[^ ]*/ /g;s/http[^ ]*/ /g'|sort -k1,1| uniq >ids_and_body.txt\n",
"\tjoin -11 -21 $net_ids ids_and_body.txt -t $'\\t'>> $net_txt\n",
"done\n",
"\n",
"cat $net_txt| sort -k1,1 -b| tr \"[:upper:]\" \"[:lower:]\"|sed 's/[?!;,*+<>-]/ /g'|sed \"s/'/ /g\"|sed 's/\"//g' > temp\n",
"sed -i \"y/âāáǎàêēéěèîīíǐìôōóǒòûūúǔùǖǘǚǜÂĀÁǍÀÊĒÉĚÈÎĪÍǏÌÔŌÓǑÒÛŪÚǓÙǕǗǙǛ/aaaaaeeeeeiiiiiooooouuuuuuuuuaaaaaeeeeeiiiiiooooouuuuuuuuu/\" temp\n",
"rm ids_and_body.txt\n",
"mv temp $net_txt\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"implement using https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string#518232"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import gzip\n",
"from collections import defaultdict\n",
"import re\n",
"import nltk"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url_regex = re.compile('http[^ ]')\n",
"\n",
"def words(text):\n",
" # Remove urls (which contain : and / characters)\n",
" re.sub(url_regex, '', text)\n",
" # TODO: transliterate accents\n",
" # TODO: split on ?!;:,.<>'\"`=+-*%$/\\|()\n",
" # TODO: remove stuff starting with @, and 'RT'"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-37-4690041e3e69>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m '2017-10-pipe-user_timestamp_body-csv-pipe-filter_users.gz', 'rt') as tweets_file:\n\u001b[1;32m 4\u001b[0m \u001b[0mreader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcsv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDictReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweets_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfieldnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'user_id'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'timestamp'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'body'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mtweet\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mreader\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mwords\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtweet\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'body'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' '\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# TODO: properly filter body\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0muser_words\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'user_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/base36/lib/python3.6/csv.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0;31m# Used only for its side effect.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfieldnames\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 112\u001b[0;31m \u001b[0mrow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 113\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mline_num\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mline_num\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"user_words = defaultdict(set)\n",
"with gzip.open('data/sosweet-text/undir_weighted_mention_network_thresh_5/'\n",
" '2017-10-pipe-user_timestamp_body-csv-pipe-filter_users.gz', 'rt') as tweets_file:\n",
" reader = csv.DictReader(tweets_file, fieldnames=['user_id', 'timestamp', 'body'])\n",
" for i, tweet in enumerate(reader):\n",
" user_words[int(tweet['user_id'])].update(words(tweet['body']))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"import unicodedata"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"word '😂' not in vocabulary\"",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-33-3be55b46b3af>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mlang_emb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'body'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/anaconda3/envs/base36/lib/python3.6/site-packages/gensim/models/keyedvectors.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m 595\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 596\u001b[0m \u001b[0;31m# allow calls like trained_model['office'], as a shorthand for trained_model[['office']]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 597\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 598\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mword\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/base36/lib/python3.6/site-packages/gensim/models/keyedvectors.py\u001b[0m in \u001b[0;36mword_vec\u001b[0;34m(self, word, use_norm)\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msyn0\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 283\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 284\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"word '%s' not in vocabulary\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 285\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmost_similar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpositive\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnegative\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtopn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrestrict_vocab\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: \"word '😂' not in vocabulary\""
]
}
],
"source": [
"lang_emb.wv[tweet['body'][-1]]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Lo'"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"unicodedata.category('界')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}