demos/connector/neo4j/load-cora-into-neo4j.ipynb
{
"cells": [
{
"cell_type": "markdown",
"id": "0",
"metadata": {},
"source": [
"# Loading Cora dataset into Neo4j database\n"
]
},
{
"cell_type": "markdown",
"id": "1",
"metadata": {
"nbsphinx": "hidden",
"tags": [
"CloudRunner"
]
},
"source": [
"<table><tr><td>Run the latest release of this notebook:</td><td><a href=\"https://mybinder.org/v2/gh/stellargraph/stellargraph/master?urlpath=lab/tree/demos/connector/neo4j/load-cora-into-neo4j.ipynb\" alt=\"Open In Binder\" target=\"_parent\"><img src=\"https://mybinder.org/badge_logo.svg\"/></a></td><td><a href=\"https://colab.research.google.com/github/stellargraph/stellargraph/blob/master/demos/connector/neo4j/load-cora-into-neo4j.ipynb\" alt=\"Open In Colab\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\"/></a></td></tr></table>"
]
},
{
"cell_type": "markdown",
"id": "2",
"metadata": {},
"source": [
"This notebook demonstrates how to load Cora dataset into Neo4j graph database.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3",
"metadata": {
"nbsphinx": "hidden",
"tags": [
"CloudRunner"
]
},
"outputs": [],
"source": [
"# install StellarGraph if running on Google Colab\n",
"import sys\n",
"if 'google.colab' in sys.modules:\n",
" %pip install -q stellargraph[demos]==1.3.0b"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4",
"metadata": {
"nbsphinx": "hidden",
"tags": [
"VersionCheck"
]
},
"outputs": [],
"source": [
"# verify that we're using the correct version of StellarGraph for this notebook\n",
"import stellargraph as sg\n",
"\n",
"try:\n",
" sg.utils.validate_notebook_version(\"1.3.0b\")\n",
"except AttributeError:\n",
" raise ValueError(\n",
" f\"This notebook requires StellarGraph version 1.3.0b, but a different version {sg.__version__} is installed. Please see <https://github.com/stellargraph/stellargraph/issues/1172>.\"\n",
" ) from None"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"from stellargraph import datasets\n",
"from IPython.display import display, HTML"
]
},
{
"cell_type": "markdown",
"id": "6",
"metadata": {},
"source": [
"## Load Cora dataset"
]
},
{
"cell_type": "markdown",
"id": "7",
"metadata": {
"tags": [
"DataLoadingLinks"
]
},
"source": [
"(See [the \"Loading from Pandas\" demo](../../basics/loading-pandas.ipynb) for details on how data can be loaded.)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8",
"metadata": {
"tags": [
"DataLoading"
]
},
"outputs": [
{
"data": {
"text/html": [
"The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words."
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset = datasets.Cora()\n",
"display(HTML(dataset.description))\n",
"dataset.download()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "9",
"metadata": {},
"outputs": [],
"source": [
"edge_list = pd.read_csv(\n",
" os.path.join(dataset.data_directory, \"cora.cites\"),\n",
" sep=\"\\t\",\n",
" header=None,\n",
" names=[\"target\", \"source\"],\n",
")\n",
"edge_list[\"label\"] = \"cites\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "10",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>target</th>\n",
" <th>source</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>35</td>\n",
" <td>1033</td>\n",
" <td>cites</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>35</td>\n",
" <td>103482</td>\n",
" <td>cites</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>35</td>\n",
" <td>103515</td>\n",
" <td>cites</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>35</td>\n",
" <td>1050679</td>\n",
" <td>cites</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>35</td>\n",
" <td>1103960</td>\n",
" <td>cites</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" target source label\n",
"0 35 1033 cites\n",
"1 35 103482 cites\n",
"2 35 103515 cites\n",
"3 35 1050679 cites\n",
"4 35 1103960 cites"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(edge_list.head(5))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "11",
"metadata": {},
"outputs": [],
"source": [
"feature_names = [\"w_{}\".format(ii) for ii in range(1433)]\n",
"column_names = feature_names + [\"subject\"]\n",
"node_list = pd.read_csv(\n",
" os.path.join(dataset.data_directory, \"cora.content\"),\n",
" sep=\"\\t\",\n",
" header=None,\n",
" names=column_names,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "12",
"metadata": {},
"source": [
"## Preprocess data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "13",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject</th>\n",
" <th>features</th>\n",
" <th>id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>31336</th>\n",
" <td>Neural_Networks</td>\n",
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
" <td>31336</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1061127</th>\n",
" <td>Rule_Learning</td>\n",
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...</td>\n",
" <td>1061127</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1106406</th>\n",
" <td>Reinforcement_Learning</td>\n",
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
" <td>1106406</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13195</th>\n",
" <td>Reinforcement_Learning</td>\n",
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
" <td>13195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37879</th>\n",
" <td>Probabilistic_Methods</td>\n",
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
" <td>37879</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" subject \\\n",
"31336 Neural_Networks \n",
"1061127 Rule_Learning \n",
"1106406 Reinforcement_Learning \n",
"13195 Reinforcement_Learning \n",
"37879 Probabilistic_Methods \n",
"\n",
" features id \n",
"31336 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 31336 \n",
"1061127 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ... 1061127 \n",
"1106406 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 1106406 \n",
"13195 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 13195 \n",
"37879 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 37879 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# gather all features into lists under 'features' column.\n",
"node_list[\"features\"] = node_list[feature_names].values.tolist()\n",
"\n",
"node_list = node_list.drop(columns=feature_names)\n",
"node_list[\"id\"] = node_list.index\n",
"node_list.head(5)"
]
},
{
"cell_type": "markdown",
"id": "14",
"metadata": {},
"source": [
"## Ingest data into Neo4j database\n",
"\n",
"We define the graph schema as below:\n",
"\n",
"- Each vertex represents a paper\n",
" + subject (String): the class where each subject belongs to. There are seven classes in total. \n",
" + features (List[int]): 1D-vector represents the presence of each words in the dictionary.\n",
" + ID (int): id of each paper. (**Note**: this ID attribute is different from the Neo4j id, i.e., the id of each node or relationship which Neo4j automatically assigns with). \n",
" \n",
"- Each *directed* edge represents a citation. Each edge points to the paper being cited.\n",
"\n",
"As the Cora dataset is small, we could use Cypher queries and execute the transactions via a Python-supported driver.\n",
"\n",
"For bigger dataset, this loading job might take very long, so it is more convenient to use ```neo4j-admin import ``` tool, [tutorial here](https://neo4j.com/docs/operations-manual/current/tutorial/import-tool/)."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "15",
"metadata": {},
"outputs": [],
"source": [
"import time"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "16",
"metadata": {},
"outputs": [],
"source": [
"import py2neo\n",
"\n",
"default_host = os.environ.get(\"STELLARGRAPH_NEO4J_HOST\")\n",
"\n",
"# Create the Neo4j Graph database object; port, user, password parameters can be add to specify location and authentication\n",
"graph = py2neo.Graph(host=default_host)"
]
},
{
"cell_type": "markdown",
"id": "17",
"metadata": {},
"source": [
"Delete the existing edges and relationships in the current database."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "18",
"metadata": {},
"outputs": [],
"source": [
"empty_db_query = \"\"\"\n",
" MATCH(n) DETACH\n",
" DELETE(n)\n",
" \"\"\"\n",
"\n",
"tx = graph.begin(autocommit=True)\n",
"tx.evaluate(empty_db_query)"
]
},
{
"cell_type": "markdown",
"id": "19",
"metadata": {},
"source": [
"Delete any existing constraints or indexes in the current database."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "20",
"metadata": {},
"outputs": [],
"source": [
"constraints = graph.run(\"CALL db.constraints\").data()\n",
"for constraint in constraints:\n",
" graph.run(f\"DROP CONSTRAINT {constraint['name']}\")\n",
"\n",
"indexes = graph.run(\"CALL db.indexes\").data()\n",
"for index in indexes:\n",
" graph.run(f\"DROP INDEX {index['name']}\")"
]
},
{
"cell_type": "markdown",
"id": "21",
"metadata": {},
"source": [
"Load all nodes to the graph database."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "22",
"metadata": {},
"outputs": [],
"source": [
"loading_node_query = \"\"\"\n",
" UNWIND $node_list as node\n",
" CREATE( e: paper {\n",
" ID: toInteger(node.id),\n",
" subject: node.subject,\n",
" features: node.features\n",
" })\n",
" \"\"\"\n",
"\n",
"# For efficient loading, we will load batch of nodes into Neo4j.\n",
"batch_len = 500\n",
"\n",
"for batch_start in range(0, len(node_list), batch_len):\n",
" batch_end = batch_start + batch_len\n",
" # turn node dataframe into a list of records\n",
" records = node_list.iloc[batch_start:batch_end].to_dict(\"records\")\n",
" tx = graph.begin(autocommit=True)\n",
" tx.evaluate(loading_node_query, parameters={\"node_list\": records})"
]
},
{
"cell_type": "markdown",
"id": "23",
"metadata": {},
"source": [
"Load all edges to the graph database."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "24",
"metadata": {},
"outputs": [],
"source": [
"loading_edge_query = \"\"\"\n",
" UNWIND $edge_list as edge\n",
" \n",
" MATCH(source: paper {ID: toInteger(edge.source)})\n",
" MATCH(target: paper {ID: toInteger(edge.target)})\n",
" \n",
" MERGE (source)-[r:cites]->(target)\n",
" \"\"\"\n",
"\n",
"batch_len = 500\n",
"\n",
"for batch_start in range(0, len(edge_list), batch_len):\n",
" batch_end = batch_start + batch_len\n",
" # turn edge dataframe into a list of records\n",
" records = edge_list.iloc[batch_start:batch_end].to_dict(\"records\")\n",
" tx = graph.begin(autocommit=True)\n",
" tx.evaluate(loading_edge_query, parameters={\"edge_list\": records})"
]
},
{
"cell_type": "markdown",
"id": "25",
"metadata": {},
"source": [
"Ensure node IDs are unique. Creating this constraint also automatically creates an index which will improve performance of querying nodes by ID."
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "26",
"metadata": {},
"outputs": [],
"source": [
"node_id_constraint = \"\"\"\n",
" CREATE CONSTRAINT\n",
" ON (n:paper)\n",
" ASSERT n.ID IS UNIQUE\n",
" \"\"\"\n",
"\n",
"tx = graph.begin(autocommit=True)\n",
"tx.evaluate(node_id_constraint)"
]
},
{
"cell_type": "markdown",
"id": "27",
"metadata": {
"nbsphinx": "hidden",
"tags": [
"CloudRunner"
]
},
"source": [
"<table><tr><td>Run the latest release of this notebook:</td><td><a href=\"https://mybinder.org/v2/gh/stellargraph/stellargraph/master?urlpath=lab/tree/demos/connector/neo4j/load-cora-into-neo4j.ipynb\" alt=\"Open In Binder\" target=\"_parent\"><img src=\"https://mybinder.org/badge_logo.svg\"/></a></td><td><a href=\"https://colab.research.google.com/github/stellargraph/stellargraph/blob/master/demos/connector/neo4j/load-cora-into-neo4j.ipynb\" alt=\"Open In Colab\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\"/></a></td></tr></table>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}