notebooks/doc-001-quickstart.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Quickstart"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from parquetranger import TableRepo\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(\n",
" {\n",
" \"A\": [1, 2, 3, 4, 5, 6],\n",
" \"B\": [\"x\", \"y\", \"z\", \"x1\", \"x2\", \"x3\"],\n",
" \"C\": [1, 2, 1, 1, 1, 2],\n",
" \"C2\": [\"a\", \"a\", \"b\", \"a\", \"c\", \"c\"],\n",
" },\n",
" index=[\"a1\", \"a2\", \"a3\", \"a4\", \"a5\", \"a6\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>C2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a1</th>\n",
" <td>1</td>\n",
" <td>x</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a2</th>\n",
" <td>2</td>\n",
" <td>y</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a3</th>\n",
" <td>3</td>\n",
" <td>z</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a4</th>\n",
" <td>4</td>\n",
" <td>x1</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a5</th>\n",
" <td>5</td>\n",
" <td>x2</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a6</th>\n",
" <td>6</td>\n",
" <td>x3</td>\n",
" <td>2</td>\n",
" <td>c</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C C2\n",
"a1 1 x 1 a\n",
"a2 2 y 2 a\n",
"a3 3 z 1 b\n",
"a4 4 x1 1 a\n",
"a5 5 x2 1 c\n",
"a6 6 x3 2 c"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"trepo = TableRepo(\"some_tmp_path\", group_cols=\"C2\") # this creates the directory"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"trepo.extend(df)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>C2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a1</th>\n",
" <td>1</td>\n",
" <td>x</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a2</th>\n",
" <td>2</td>\n",
" <td>y</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a4</th>\n",
" <td>4</td>\n",
" <td>x1</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a3</th>\n",
" <td>3</td>\n",
" <td>z</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a5</th>\n",
" <td>5</td>\n",
" <td>x2</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a6</th>\n",
" <td>6</td>\n",
" <td>x3</td>\n",
" <td>2</td>\n",
" <td>c</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C C2\n",
"a1 1 x 1 a\n",
"a2 2 y 2 a\n",
"a4 4 x1 1 a\n",
"a3 3 z 1 b\n",
"a5 5 x2 1 c\n",
"a6 6 x3 2 c"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trepo.get_full_df()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df2 = pd.DataFrame(\n",
" {\n",
" \"A\": [21, 22, 23],\n",
" \"B\": [\"X\", \"Y\", \"Z\"],\n",
" \"C\": [10,20,1],\n",
" \"C2\": [\"a\", \"b\", \"a\"],\n",
" },\n",
" index=[\"a1\", \"a4\", \"a7\"]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"trepo.replace_records(df2) # replaces based on index"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>C2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a2</th>\n",
" <td>2</td>\n",
" <td>y</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a1</th>\n",
" <td>21</td>\n",
" <td>X</td>\n",
" <td>10</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a7</th>\n",
" <td>23</td>\n",
" <td>Z</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a3</th>\n",
" <td>3</td>\n",
" <td>z</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a4</th>\n",
" <td>22</td>\n",
" <td>Y</td>\n",
" <td>20</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a5</th>\n",
" <td>5</td>\n",
" <td>x2</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a6</th>\n",
" <td>6</td>\n",
" <td>x3</td>\n",
" <td>2</td>\n",
" <td>c</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C C2\n",
"a2 2 y 2 a\n",
"a1 21 X 10 a\n",
"a7 23 Z 1 a\n",
"a3 3 z 1 b\n",
"a4 22 Y 20 b\n",
"a5 5 x2 1 c\n",
"a6 6 x3 2 c"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trepo.get_full_df()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"trepo.replace_groups(df2)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>C2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a1</th>\n",
" <td>21</td>\n",
" <td>X</td>\n",
" <td>10</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a7</th>\n",
" <td>23</td>\n",
" <td>Z</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a4</th>\n",
" <td>22</td>\n",
" <td>Y</td>\n",
" <td>20</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a5</th>\n",
" <td>5</td>\n",
" <td>x2</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a6</th>\n",
" <td>6</td>\n",
" <td>x3</td>\n",
" <td>2</td>\n",
" <td>c</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C C2\n",
"a1 21 X 10 a\n",
"a7 23 Z 1 a\n",
"a4 22 Y 20 b\n",
"a5 5 x2 1 c\n",
"a6 6 x3 2 c"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trepo.get_full_df() # replaced the whole groups where C2==a and C2==b with the records that were present in df2"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"trepo.replace_all(df2) # erases everything and puts df2 in. all traces of df are lost"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>C2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a1</th>\n",
" <td>21</td>\n",
" <td>X</td>\n",
" <td>10</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a7</th>\n",
" <td>23</td>\n",
" <td>Z</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a4</th>\n",
" <td>22</td>\n",
" <td>Y</td>\n",
" <td>20</td>\n",
" <td>b</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C C2\n",
"a1 21 X 10 a\n",
"a7 23 Z 1 a\n",
"a4 22 Y 20 b"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trepo.get_full_df()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"trepo.replace_records(df, by_groups=True) # replaces records based on index, but only looks for indices within groups, so this way duplicate a4 index is possible\n",
"# as they are in different groups, with different values in C2"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>C2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a7</th>\n",
" <td>23</td>\n",
" <td>Z</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a1</th>\n",
" <td>1</td>\n",
" <td>x</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a2</th>\n",
" <td>2</td>\n",
" <td>y</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a4</th>\n",
" <td>4</td>\n",
" <td>x1</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a4</th>\n",
" <td>22</td>\n",
" <td>Y</td>\n",
" <td>20</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a3</th>\n",
" <td>3</td>\n",
" <td>z</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a5</th>\n",
" <td>5</td>\n",
" <td>x2</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a6</th>\n",
" <td>6</td>\n",
" <td>x3</td>\n",
" <td>2</td>\n",
" <td>c</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C C2\n",
"a7 23 Z 1 a\n",
"a1 1 x 1 a\n",
"a2 2 y 2 a\n",
"a4 4 x1 1 a\n",
"a4 22 Y 20 b\n",
"a3 3 z 1 b\n",
"a5 5 x2 1 c\n",
"a6 6 x3 2 c"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trepo.get_full_df()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"trepo.purge() # deletes everything"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.4 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"vscode": {
"interpreter": {
"hash": "179b4a2717221c5b6c10b9961221f2c8ce7c1e8b0ad1e6e0b8db36b57c5c13d6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}