{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Basic exploration of correlations between node- and word- vecs in SoSweet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "# utilities and plotting\n",
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sb\n",
    "# node2vec stuff\n",
    "from gensim.models import Word2Vec\n",
    "\n",
    "logging.basicConfig(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:gensim.utils:loading Word2Vec object from data/sosweet-network/undir_weighted_mention_network_thresh_5.emb\n",
      "INFO:gensim.utils:loading wv recursively from data/sosweet-network/undir_weighted_mention_network_thresh_5.emb.wv.* with mmap=None\n",
      "INFO:gensim.utils:setting ignored attribute syn0norm to None\n",
      "INFO:gensim.utils:setting ignored attribute cum_table to None\n",
      "INFO:gensim.utils:loaded data/sosweet-network/undir_weighted_mention_network_thresh_5.emb\n",
      "INFO:gensim.utils:loading Word2Vec object from data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v\n",
      "INFO:gensim.utils:loading wv recursively from data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v.wv.* with mmap=None\n",
      "INFO:gensim.utils:loading syn0 from data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v.wv.syn0.npy with mmap=None\n",
      "INFO:gensim.utils:setting ignored attribute syn0norm to None\n",
      "INFO:gensim.utils:loading syn1neg from data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v.syn1neg.npy with mmap=None\n",
      "INFO:gensim.utils:setting ignored attribute cum_table to None\n",
      "INFO:gensim.utils:loaded data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v\n"
     ]
    }
   ],
   "source": [
    "network_emb = Word2Vec.load('data/sosweet-network/undir_weighted_mention_network_thresh_5.emb')\n",
    "lang_emb = Word2Vec.load('data/sosweet-w2v/dim_50/lowe_dim_sosweet2vec.w2v')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "178359"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(network_emb.wv.vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "492519"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(lang_emb.wv.vocab)"
   ]
  },
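  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Both vocabularies are keyed by strings: user IDs for the network embedding, tokens for the language embedding. A quick sanity check (illustrative only; it assumes node2vec stored the node IDs as strings, which is the usual gensim convention):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grab an arbitrary key from each vocabulary and look up its vector's shape\n",
    "some_node = next(iter(network_emb.wv.vocab))\n",
    "some_word = next(iter(lang_emb.wv.vocab))\n",
    "network_emb.wv[some_node].shape, lang_emb.wv[some_word].shape"
   ]
  },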
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TODODOC:\n",
    "* users.txt: `cat data/sosweet-network/undir_weighted_mention_network_thresh_5.txt | scripts/pipe-unique_users > data/sosweet-network/undir_weigted_mention_network_thresh_5.users.txt`\n",
    "* filtered tweets: `scripts/iter-gz \"scripts/pipe-filter_users data/sosweet-network/undir_weighted_mention_network_thresh_5.users.txt\" data/sosweet-text/undir_weighted_mention_network_thresh_5 data/sosweet-text/*.gz`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Jacobo's parsing to get the words out of the tweets (which we need to reproduce here):\n",
    "\n",
    "```bash\n",
    "for f in /datastore/complexnet/twitter/data/2017*.tgz\n",
    "do\n",
    "\techo $f\n",
    "\tzcat $f|sed \"s/^[^{]*//g\"| ./jq -r '.|[.actor.id,.body,.actor.followersCount,.actor.friendsCount]|@tsv'| sed '/RT /d;s/@[^ ]*/ /g;s/http[^ ]*/ /g'|sort -k1,1| uniq >ids_and_body.txt\n",
    "\tjoin -11 -21 $net_ids ids_and_body.txt -t $'\\t'>> $net_txt\n",
    "done\n",
    "\n",
    "cat $net_txt| sort -k1,1 -b| tr \"[:upper:]\" \"[:lower:]\"|sed 's/[?!;,*+<>-]/ /g'|sed \"s/'/ /g\"|sed 's/\"//g' > temp\n",
    "sed -i \"y/âāáǎàêēéěèîīíǐìôōóǒòûūúǔùǖǘǚǜÂĀÁǍÀÊĒÉĚÈÎĪÍǏÌÔŌÓǑÒÛŪÚǓÙǕǗǙǛ/aaaaaeeeeeiiiiiooooouuuuuuuuuaaaaaeeeeeiiiiiooooouuuuuuuuu/\" temp\n",
    "rm ids_and_body.txt\n",
    "mv temp $net_txt\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "implement using https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string#518232"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import gzip\n",
    "from collections import defaultdict\n",
    "import re\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "url_regex = re.compile('http[^ ]')\n",
    "\n",
    "def words(text):\n",
    "    # Remove urls (which contain : and / characters)\n",
    "    re.sub(url_regex, '', text)\n",
    "    # TODO: transliterate accents\n",
    "    # TODO: split on ?!;:,.<>'\"`=+-*%$/\\|()\n",
    "    # TODO: remove stuff starting with @, and 'RT'"
   ]
  },
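  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A sketch of what the TODOs above could look like, mirroring Jacobo's bash pipeline (drop retweets, strip urls and @-mentions, lowercase, split on punctuation, transliterate accents via NFKD decomposition as in the Stack Overflow answer). `words_full`, `strip_accents`, `punct_chars` and `mention_regex` are illustrative names, not part of the original pipeline:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import unicodedata\n",
    "\n",
    "# Character set taken from the TODO above / the bash sed commands\n",
    "punct_chars = \"?!;:,.<>'\\\"`=+-*%$/\\\\|()\"\n",
    "punct_regex = re.compile('[' + re.escape(punct_chars) + ']')\n",
    "mention_regex = re.compile('@[^ ]*')\n",
    "\n",
    "def strip_accents(text):\n",
    "    # NFKD-decompose, then drop combining marks (category 'Mn')\n",
    "    return ''.join(c for c in unicodedata.normalize('NFKD', text)\n",
    "                   if unicodedata.category(c) != 'Mn')\n",
    "\n",
    "def words_full(text):\n",
    "    # Drop retweets entirely, as the bash pipeline does with sed '/RT /d'\n",
    "    if 'RT ' in text:\n",
    "        return []\n",
    "    text = url_regex.sub(' ', text)      # urls (url_regex defined above)\n",
    "    text = mention_regex.sub(' ', text)  # @-mentions\n",
    "    text = punct_regex.sub(' ', text)    # punctuation\n",
    "    return strip_accents(text.lower()).split()\n",
    "\n",
    "words_full('Voilà une journée très chargée http://t.co/abc !')"
   ]
  },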
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-37-4690041e3e69>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m                '2017-10-pipe-user_timestamp_body-csv-pipe-filter_users.gz', 'rt') as tweets_file:\n\u001b[1;32m      4\u001b[0m     \u001b[0mreader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcsv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDictReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweets_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfieldnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'user_id'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'timestamp'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'body'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0;32mfor\u001b[0m \u001b[0mtweet\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mreader\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m         \u001b[0mwords\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtweet\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'body'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' '\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# TODO: properly filter body\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m         \u001b[0muser_words\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'user_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/base36/lib/python3.6/csv.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    110\u001b[0m             \u001b[0;31m# Used only for its side effect.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    111\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfieldnames\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 112\u001b[0;31m         \u001b[0mrow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    113\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mline_num\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mline_num\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "user_words = defaultdict(set)\n",
    "with gzip.open('data/sosweet-text/undir_weighted_mention_network_thresh_5/'\n",
    "               '2017-10-pipe-user_timestamp_body-csv-pipe-filter_users.gz', 'rt') as tweets_file:\n",
    "    reader = csv.DictReader(tweets_file, fieldnames=['user_id', 'timestamp', 'body'])\n",
    "    for i, tweet in enumerate(reader):\n",
    "        user_words[int(tweet['user_id'])].update(words(tweet['body']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import unicodedata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"word '😂' not in vocabulary\"",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-33-3be55b46b3af>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mlang_emb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtweet\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'body'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/anaconda3/envs/base36/lib/python3.6/site-packages/gensim/models/keyedvectors.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m    595\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    596\u001b[0m             \u001b[0;31m# allow calls like trained_model['office'], as a shorthand for trained_model[['office']]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 597\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    598\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    599\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mword\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/base36/lib/python3.6/site-packages/gensim/models/keyedvectors.py\u001b[0m in \u001b[0;36mword_vec\u001b[0;34m(self, word, use_norm)\u001b[0m\n\u001b[1;32m    282\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msyn0\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    283\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 284\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"word '%s' not in vocabulary\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    285\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    286\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mmost_similar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpositive\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnegative\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtopn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrestrict_vocab\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: \"word '😂' not in vocabulary\""
     ]
    }
   ],
   "source": [
    "lang_emb.wv[tweet['body'][-1]]"
   ]
  },
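  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So out-of-vocabulary tokens (here an emoji) need guarding before lookup. A minimal sketch of a safe batch lookup that simply skips unknown tokens (`safe_word_vecs` is an illustrative helper, not gensim API):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def safe_word_vecs(emb, tokens):\n",
    "    # Keep only the tokens the embedding actually knows about\n",
    "    known = [t for t in tokens if t in emb.wv.vocab]\n",
    "    return emb.wv[known] if known else None\n",
    "\n",
    "# '😂' is skipped; whatever remains (if anything) is looked up\n",
    "safe_word_vecs(lang_emb, ['bonjour', '😂'])"
   ]
  },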
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Lo'"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unicodedata.category('界')"
   ]
  },
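  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "'Lo' is 'Letter, other'; emoji like '😂' fall under 'So' ('Symbol, other'). Unicode categories could therefore pre-filter tokens before vocabulary lookup. A sketch, with a category whitelist that is an assumption rather than something the pipeline specifies:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_wordlike(token):\n",
    "    # Keep tokens made only of letters (L*), digits (N*) or marks (M*);\n",
    "    # this drops pure symbol/emoji tokens such as '😂'\n",
    "    return all(unicodedata.category(c)[0] in ('L', 'N', 'M') for c in token)\n",
    "\n",
    "[is_wordlike(t) for t in ['journée', '界', '😂', '2017']]"
   ]
  },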
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}