demos/connector/neo4j/load-cora-into-neo4j.ipynb from stellargraph/stellargraph

demos/connector/neo4j/load-cora-into-neo4j.ipynb
Summary

Maintainability

Test Coverage

Issues
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {},
   "source": [
    "# Loading Cora dataset into Neo4j database\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1",
   "metadata": {
    "nbsphinx": "hidden",
    "tags": [
     "CloudRunner"
    ]
   },
   "source": [
    "<table><tr><td>Run the latest release of this notebook:</td><td><a href=\"https://mybinder.org/v2/gh/stellargraph/stellargraph/master?urlpath=lab/tree/demos/connector/neo4j/load-cora-into-neo4j.ipynb\" alt=\"Open In Binder\" target=\"_parent\"><img src=\"https://mybinder.org/badge_logo.svg\"/></a></td><td><a href=\"https://colab.research.google.com/github/stellargraph/stellargraph/blob/master/demos/connector/neo4j/load-cora-into-neo4j.ipynb\" alt=\"Open In Colab\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\"/></a></td></tr></table>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2",
   "metadata": {},
   "source": [
    "This notebook demonstrates how to load Cora dataset into Neo4j graph database.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3",
   "metadata": {
    "nbsphinx": "hidden",
    "tags": [
     "CloudRunner"
    ]
   },
   "outputs": [],
   "source": [
    "# install StellarGraph if running on Google Colab\n",
    "import sys\n",
    "if 'google.colab' in sys.modules:\n",
    "  %pip install -q stellargraph[demos]==1.3.0b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4",
   "metadata": {
    "nbsphinx": "hidden",
    "tags": [
     "VersionCheck"
    ]
   },
   "outputs": [],
   "source": [
    "# verify that we're using the correct version of StellarGraph for this notebook\n",
    "import stellargraph as sg\n",
    "\n",
    "try:\n",
    "    sg.utils.validate_notebook_version(\"1.3.0b\")\n",
    "except AttributeError:\n",
    "    raise ValueError(\n",
    "        f\"This notebook requires StellarGraph version 1.3.0b, but a different version {sg.__version__} is installed.  Please see <https://github.com/stellargraph/stellargraph/issues/1172>.\"\n",
    "    ) from None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "from stellargraph import datasets\n",
    "from IPython.display import display, HTML"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6",
   "metadata": {},
   "source": [
    "## Load Cora dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7",
   "metadata": {
    "tags": [
     "DataLoadingLinks"
    ]
   },
   "source": [
    "(See [the \"Loading from Pandas\" demo](../../basics/loading-pandas.ipynb) for details on how data can be loaded.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8",
   "metadata": {
    "tags": [
     "DataLoading"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words."
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset = datasets.Cora()\n",
    "display(HTML(dataset.description))\n",
    "dataset.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9",
   "metadata": {},
   "outputs": [],
   "source": [
    "edge_list = pd.read_csv(\n",
    "    os.path.join(dataset.data_directory, \"cora.cites\"),\n",
    "    sep=\"\\t\",\n",
    "    header=None,\n",
    "    names=[\"target\", \"source\"],\n",
    ")\n",
    "edge_list[\"label\"] = \"cites\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "10",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>target</th>\n",
       "      <th>source</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>35</td>\n",
       "      <td>1033</td>\n",
       "      <td>cites</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>35</td>\n",
       "      <td>103482</td>\n",
       "      <td>cites</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>35</td>\n",
       "      <td>103515</td>\n",
       "      <td>cites</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35</td>\n",
       "      <td>1050679</td>\n",
       "      <td>cites</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35</td>\n",
       "      <td>1103960</td>\n",
       "      <td>cites</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   target   source  label\n",
       "0      35     1033  cites\n",
       "1      35   103482  cites\n",
       "2      35   103515  cites\n",
       "3      35  1050679  cites\n",
       "4      35  1103960  cites"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(edge_list.head(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "11",
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = [\"w_{}\".format(ii) for ii in range(1433)]\n",
    "column_names = feature_names + [\"subject\"]\n",
    "node_list = pd.read_csv(\n",
    "    os.path.join(dataset.data_directory, \"cora.content\"),\n",
    "    sep=\"\\t\",\n",
    "    header=None,\n",
    "    names=column_names,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12",
   "metadata": {},
   "source": [
    "## Preprocess data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "13",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>subject</th>\n",
       "      <th>features</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>31336</th>\n",
       "      <td>Neural_Networks</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>31336</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1061127</th>\n",
       "      <td>Rule_Learning</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...</td>\n",
       "      <td>1061127</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1106406</th>\n",
       "      <td>Reinforcement_Learning</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>1106406</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13195</th>\n",
       "      <td>Reinforcement_Learning</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>13195</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37879</th>\n",
       "      <td>Probabilistic_Methods</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>37879</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        subject  \\\n",
       "31336           Neural_Networks   \n",
       "1061127           Rule_Learning   \n",
       "1106406  Reinforcement_Learning   \n",
       "13195    Reinforcement_Learning   \n",
       "37879     Probabilistic_Methods   \n",
       "\n",
       "                                                  features       id  \n",
       "31336    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    31336  \n",
       "1061127  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...  1061127  \n",
       "1106406  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  1106406  \n",
       "13195    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    13195  \n",
       "37879    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    37879  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# gather all features into lists under 'features' column.\n",
    "node_list[\"features\"] = node_list[feature_names].values.tolist()\n",
    "\n",
    "node_list = node_list.drop(columns=feature_names)\n",
    "node_list[\"id\"] = node_list.index\n",
    "node_list.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14",
   "metadata": {},
   "source": [
    "## Ingest data into Neo4j database\n",
    "\n",
    "We define the graph schema as below:\n",
    "\n",
    "- Each vertex represents a paper\n",
    "    + subject (String): the class where each subject belongs to. There are seven classes in total. \n",
    "    + features (List[int]): 1D-vector represents the presence of each words in the dictionary.\n",
    "    + ID (int): id of each paper. (**Note**: this ID attribute is different from the Neo4j id, i.e., the id of each node or relationship which Neo4j automatically assigns with). \n",
    "    \n",
    "- Each *directed* edge represents a citation. Each edge points to the paper being cited.\n",
    "\n",
    "As the Cora dataset is small, we could use Cypher queries and execute the transactions via a Python-supported driver.\n",
    "\n",
    "For bigger dataset, this loading job might take very long, so it is more convenient to use ```neo4j-admin import ``` tool, [tutorial here](https://neo4j.com/docs/operations-manual/current/tutorial/import-tool/)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "16",
   "metadata": {},
   "outputs": [],
   "source": [
    "import py2neo\n",
    "\n",
    "default_host = os.environ.get(\"STELLARGRAPH_NEO4J_HOST\")\n",
    "\n",
    "# Create the Neo4j Graph database object; port, user, password parameters can be add to specify location and authentication\n",
    "graph = py2neo.Graph(host=default_host)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17",
   "metadata": {},
   "source": [
    "Delete the existing edges and relationships in the current database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "empty_db_query = \"\"\"\n",
    "    MATCH(n) DETACH\n",
    "    DELETE(n)\n",
    "    \"\"\"\n",
    "\n",
    "tx = graph.begin(autocommit=True)\n",
    "tx.evaluate(empty_db_query)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19",
   "metadata": {},
   "source": [
    "Delete any existing constraints or indexes in the current database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
    "constraints = graph.run(\"CALL db.constraints\").data()\n",
    "for constraint in constraints:\n",
    "    graph.run(f\"DROP CONSTRAINT {constraint['name']}\")\n",
    "\n",
    "indexes = graph.run(\"CALL db.indexes\").data()\n",
    "for index in indexes:\n",
    "    graph.run(f\"DROP INDEX {index['name']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21",
   "metadata": {},
   "source": [
    "Load all nodes to the graph database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "22",
   "metadata": {},
   "outputs": [],
   "source": [
    "loading_node_query = \"\"\"\n",
    "    UNWIND $node_list as node\n",
    "    CREATE( e: paper {\n",
    "        ID: toInteger(node.id),\n",
    "        subject: node.subject,\n",
    "        features: node.features\n",
    "    })\n",
    "    \"\"\"\n",
    "\n",
    "# For efficient loading, we will load batch of nodes into Neo4j.\n",
    "batch_len = 500\n",
    "\n",
    "for batch_start in range(0, len(node_list), batch_len):\n",
    "    batch_end = batch_start + batch_len\n",
    "    # turn node dataframe into a list of records\n",
    "    records = node_list.iloc[batch_start:batch_end].to_dict(\"records\")\n",
    "    tx = graph.begin(autocommit=True)\n",
    "    tx.evaluate(loading_node_query, parameters={\"node_list\": records})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23",
   "metadata": {},
   "source": [
    "Load all edges to the graph database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "24",
   "metadata": {},
   "outputs": [],
   "source": [
    "loading_edge_query = \"\"\"\n",
    "    UNWIND $edge_list as edge\n",
    "    \n",
    "    MATCH(source: paper {ID: toInteger(edge.source)})\n",
    "    MATCH(target: paper {ID: toInteger(edge.target)})\n",
    "    \n",
    "    MERGE (source)-[r:cites]->(target)\n",
    "    \"\"\"\n",
    "\n",
    "batch_len = 500\n",
    "\n",
    "for batch_start in range(0, len(edge_list), batch_len):\n",
    "    batch_end = batch_start + batch_len\n",
    "    # turn edge dataframe into a list of records\n",
    "    records = edge_list.iloc[batch_start:batch_end].to_dict(\"records\")\n",
    "    tx = graph.begin(autocommit=True)\n",
    "    tx.evaluate(loading_edge_query, parameters={\"edge_list\": records})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25",
   "metadata": {},
   "source": [
    "Ensure node IDs are unique. Creating this constraint also automatically creates an index which will improve performance of querying nodes by ID."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "26",
   "metadata": {},
   "outputs": [],
   "source": [
    "node_id_constraint = \"\"\"\n",
    "    CREATE CONSTRAINT\n",
    "    ON (n:paper)\n",
    "    ASSERT n.ID IS UNIQUE\n",
    "    \"\"\"\n",
    "\n",
    "tx = graph.begin(autocommit=True)\n",
    "tx.evaluate(node_id_constraint)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27",
   "metadata": {
    "nbsphinx": "hidden",
    "tags": [
     "CloudRunner"
    ]
   },
   "source": [
    "<table><tr><td>Run the latest release of this notebook:</td><td><a href=\"https://mybinder.org/v2/gh/stellargraph/stellargraph/master?urlpath=lab/tree/demos/connector/neo4j/load-cora-into-neo4j.ipynb\" alt=\"Open In Binder\" target=\"_parent\"><img src=\"https://mybinder.org/badge_logo.svg\"/></a></td><td><a href=\"https://colab.research.google.com/github/stellargraph/stellargraph/blob/master/demos/connector/neo4j/load-cora-into-neo4j.ipynb\" alt=\"Open In Colab\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\"/></a></td></tr></table>"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}