dogshow/projects/cc-dog-raw/{{cookiecutter.project}}/notebooks/create-dogshow-test-data.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dbaada81",
"metadata": {},
"outputs": [],
"source": [
"import datetime as dt\n",
"import random\n",
"from functools import partial\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from datazimmer import get_raw_data_path\n",
"from faker import Faker\n",
"\n",
"fake = Faker()\n",
"Faker.seed(42069)\n",
"\n",
"rng = random.Random(42069)\n",
"\n",
"people_n = 20\n",
"dog_n = 35\n",
"rel_n = 30\n",
"spotted_n = 100\n",
"comp_n = 90\n",
"photo_n = 50\n",
"comp2_n = 30\n",
"dog2_n = 10\n",
"dotm_limit = 40\n",
"\n",
"\n",
"def _prefixed(key, prefix):\n",
" return \"__\".join(filter(None, [prefix, key]))\n",
"\n",
"\n",
"def get_nested_address(prefix=\"\"):\n",
" _p = partial(_prefixed, prefix=prefix)\n",
" return {\n",
" _p(\"city\"): fake.city(),\n",
" _p(\"zip\"): fake.zipcode(),\n",
" _p(\"street_address\"): \" \".join(fake.street_address().split()[:3]),\n",
" _p(\"building__floor\"): int(rng.lognormvariate(2, 1)) + 1,\n",
" _p(\"building__door\"): rng.randint(1, 101),\n",
" }\n",
"\n",
"\n",
"people_df = (\n",
" pd.DataFrame(\n",
" [\n",
" {\n",
" \"cid\": f\"p-{i+1}\",\n",
" \"name\": fake.name(),\n",
" \"dob\": fake.date_between(dt.date(1953, 1, 1), dt.date(2003, 1, 1))\n",
" if rng.random() < 0.9\n",
" else None,\n",
" }\n",
" for i in range(people_n)\n",
" ]\n",
" + [\n",
" {\n",
" \"cid\": f\"p-{people_n + 1}\",\n",
" \"name\": \"Adam Groff\",\n",
" \"dob\": fake.date_between(dt.date(2003, 1, 1), dt.date(2004, 1, 1)),\n",
" }\n",
" ]\n",
" )\n",
" .set_index(\"cid\")\n",
" .assign(date_of_birth=lambda df: pd.to_datetime(df[\"dob\"]))\n",
" .drop(\"dob\", axis=1)\n",
")\n",
"\n",
"\n",
"dog_df = (\n",
" pd.DataFrame(\n",
" [\n",
" {\n",
" \"cid\": f\"d-{i+1}\",\n",
" \"name\": fake.first_name_nonbinary(),\n",
" \"dob\": fake.date_between(dt.date(2010, 1, 1), dt.date(2020, 1, 1)),\n",
" \"waist\": rng.uniform(8, 44) if rng.random() < 0.85 else None,\n",
" \"sex\": rng.choice([\"male\", \"female\"]),\n",
" }\n",
" for i in range(dog_n)\n",
" ]\n",
" + [\n",
" {\n",
" \"cid\": f\"d-{dog_n + 1}\",\n",
" \"name\": \"Madam\",\n",
" \"dob\": fake.date_between(dt.date(2015, 1, 1), dt.date(2016, 1, 1)),\n",
" \"waist\": 12,\n",
" \"sex\": \"female\",\n",
" }\n",
" ]\n",
" )\n",
" .set_index(\"cid\")\n",
" .assign(date_of_birth=lambda df: pd.to_datetime(df[\"dob\"]))\n",
" .drop(\"dob\", axis=1)\n",
")\n",
"\n",
"rel_df = (\n",
" pd.DataFrame(\n",
" [\n",
" {\n",
" \"owner_id\": rng.choice(people_df.index),\n",
" \"dog_id\": rng.choice(dog_df.index),\n",
" \"since_birth\": rng.random() > 0.2,\n",
" }\n",
" for _ in range(rel_n)\n",
" ]\n",
" + [\n",
" {\n",
" \"owner_id\": f\"p-{people_n + 1}\",\n",
" \"dog_id\": f\"d-{dog_n + 1}\",\n",
" \"since_birth\": True,\n",
" }\n",
" ]\n",
" )\n",
" .drop_duplicates(subset=[\"owner_id\", \"dog_id\"])\n",
" .set_index([\"owner_id\", \"dog_id\"])\n",
")\n",
"\n",
"\n",
"places = [\"winner\", \"runner_up\", \"special_mention\"]\n",
"\n",
"comp_df = (\n",
" pd.DataFrame(\n",
" [\n",
" {\n",
" \"competition_id\": f\"c-{i+1}\",\n",
" \"prize_pool\": rng.randint(1, 30) * 500,\n",
" **{\n",
" f\"{place}__{prefix}__{ind_id}\": val\n",
" for place in places\n",
" for prefix, ind_id, val in zip(\n",
" [\"owner\", \"pet\"],\n",
" [people_df.index.name, dog_df.index.name],\n",
" random.choice(rel_df.index),\n",
" )\n",
" },\n",
" }\n",
" for i in range(comp_n)\n",
" ]\n",
" )\n",
" .set_index(\"competition_id\")\n",
" .pipe(\n",
" lambda df: pd.concat(\n",
" [\n",
" df,\n",
" pd.DataFrame(\n",
" np.sort(np.random.rand(comp_n, len(places)), axis=1),\n",
" columns=[f\"{place}__prize\" for place in places[::-1]],\n",
" index=df.index,\n",
" )\n",
" .pipe(\n",
" lambda _df: _df\n",
" / _df.sum(axis=1).to_frame().values\n",
" * np.random.rand(comp_n, 1)\n",
" * df[[\"prize_pool\"]].values\n",
" )\n",
" .astype(int),\n",
" ],\n",
" axis=1,\n",
" )\n",
" )\n",
")\n",
"\n",
"spotted_df = pd.DataFrame(\n",
" [\n",
" {\n",
" \"dog_1__cid\": rng.choice(dog_df.index),\n",
" \"dog_2__cid\": rng.choice(dog_df.index),\n",
" **get_nested_address(\"place\"),\n",
" }\n",
" for _ in range(spotted_n)\n",
" ]\n",
").set_index(\n",
" \"dog_1__cid\"\n",
") # only set so that to_csv does not add new col\n",
"\n",
"sizes_df = pd.DataFrame(\n",
" {\n",
" \"dogsize_name\": [\"XS\", \"SM\", \"MED\", \"LG\", \"XL\"],\n",
" \"waist_limit__min\": [8, 16, 18, 24, 27],\n",
" \"waist_limit__max\": [22, 27, 34, 39, 50],\n",
" \"weight_limit__min\": [10, 20, 40, 75, 90],\n",
" \"weight_limit__max\": [20, 45, 80, 100, 140],\n",
" }\n",
").set_index(\"dogsize_name\")\n",
"\n",
"dog2_df = (\n",
" # maybe add coreferences from ds1\n",
" pd.DataFrame(\n",
" [\n",
" {\n",
" \"cid\": f\"d-{i+1}\",\n",
" \"name\": fake.first_name_female(),\n",
" \"sex\": rng.choice([\"male\", \"female\"]),\n",
" \"date_of_birth\": fake.date_between(\n",
" dt.date(2008, 1, 1), dt.date(2021, 1, 1)\n",
" ),\n",
" \"size__dogsize_name\": rng.choice(sizes_df.index),\n",
" \"color\": fake.color_name() if rng.random() < 0.8 else None,\n",
" }\n",
" for i in range(dog2_n)\n",
" ]\n",
" ).set_index(\"cid\")\n",
")\n",
"\n",
"race_df = (\n",
" pd.DataFrame(\n",
" [\n",
" {\n",
" \"competition_id\": f\"cx-{i+1}\",\n",
" \"held_date\": fake.date_time_between(\n",
" dt.date(2019, 1, 1), dt.date(2021, 8, 1)\n",
" ),\n",
" \"fastest_time\": rng.lognormvariate(6, 1),\n",
" \"champion__cid\": rng.choice(dog2_df.index),\n",
" }\n",
" for i in range(comp2_n)\n",
" ]\n",
" )\n",
" .set_index(\"competition_id\")\n",
" .assign(held_date=lambda df: df[\"held_date\"].dt.round(\"1h\"))\n",
")\n",
"\n",
"dotm_ind = [\"dog_type__pure\", \"dog_type__neutered\", \"year\", \"month\"]\n",
"dog_of_the_month_df = (\n",
" pd.DataFrame(\n",
" [\n",
" {\n",
" \"winner__cid\": rng.choice(dog2_df.index),\n",
" \"dog_type__pure\": rng.random() > 0.2,\n",
" \"dog_type__neutered\": rng.random() > 0.6,\n",
" \"year\": rng.randint(2002, 2021),\n",
" \"month\": rng.randint(1, 12),\n",
" }\n",
" for _ in range(dotm_limit)\n",
" ]\n",
" )\n",
" .drop_duplicates(subset=dotm_ind)\n",
" .set_index(dotm_ind)\n",
" .sort_index()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "51ad5982",
"metadata": {},
"outputs": [],
"source": [
"for k, v in [*globals().items()]:\n",
" if k.endswith(\"_df\"):\n",
" fname = k.split(\"_df\")[0] + \".csv\"\n",
" v.to_csv(get_raw_data_path(fname))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]"
},
"vscode": {
"interpreter": {
"hash": "c34365d60e34b30dc48102674d627ffeb37891f3ae5b0d543a727d8b1b3f5762"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}