sscu-budapest/datazimmer

View on GitHub
dogshow/projects/cc-dog-raw/{{cookiecutter.project}}/notebooks/create-dogshow-test-data.ipynb

Summary

Maintainability
Test Coverage
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "dbaada81",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime as dt\n",
    "import random\n",
    "from functools import partial\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datazimmer import get_raw_data_path\n",
    "from faker import Faker\n",
    "\n",
    "fake = Faker()\n",
    "Faker.seed(42069)\n",
    "\n",
    "rng = random.Random(42069)\n",
    "\n",
    "people_n = 20\n",
    "dog_n = 35\n",
    "rel_n = 30\n",
    "spotted_n = 100\n",
    "comp_n = 90\n",
    "photo_n = 50\n",
    "comp2_n = 30\n",
    "dog2_n = 10\n",
    "dotm_limit = 40\n",
    "\n",
    "\n",
    "def _prefixed(key, prefix):\n",
    "    return \"__\".join(filter(None, [prefix, key]))\n",
    "\n",
    "\n",
    "def get_nested_address(prefix=\"\"):\n",
    "    _p = partial(_prefixed, prefix=prefix)\n",
    "    return {\n",
    "        _p(\"city\"): fake.city(),\n",
    "        _p(\"zip\"): fake.zipcode(),\n",
    "        _p(\"street_address\"): \" \".join(fake.street_address().split()[:3]),\n",
    "        _p(\"building__floor\"): int(rng.lognormvariate(2, 1)) + 1,\n",
    "        _p(\"building__door\"): rng.randint(1, 101),\n",
    "    }\n",
    "\n",
    "\n",
    "people_df = (\n",
    "    pd.DataFrame(\n",
    "        [\n",
    "            {\n",
    "                \"cid\": f\"p-{i+1}\",\n",
    "                \"name\": fake.name(),\n",
    "                \"dob\": fake.date_between(dt.date(1953, 1, 1), dt.date(2003, 1, 1))\n",
    "                if rng.random() < 0.9\n",
    "                else None,\n",
    "            }\n",
    "            for i in range(people_n)\n",
    "        ]\n",
    "        + [\n",
    "            {\n",
    "                \"cid\": f\"p-{people_n + 1}\",\n",
    "                \"name\": \"Adam Groff\",\n",
    "                \"dob\": fake.date_between(dt.date(2003, 1, 1), dt.date(2004, 1, 1)),\n",
    "            }\n",
    "        ]\n",
    "    )\n",
    "    .set_index(\"cid\")\n",
    "    .assign(date_of_birth=lambda df: pd.to_datetime(df[\"dob\"]))\n",
    "    .drop(\"dob\", axis=1)\n",
    ")\n",
    "\n",
    "\n",
    "dog_df = (\n",
    "    pd.DataFrame(\n",
    "        [\n",
    "            {\n",
    "                \"cid\": f\"d-{i+1}\",\n",
    "                \"name\": fake.first_name_nonbinary(),\n",
    "                \"dob\": fake.date_between(dt.date(2010, 1, 1), dt.date(2020, 1, 1)),\n",
    "                \"waist\": rng.uniform(8, 44) if rng.random() < 0.85 else None,\n",
    "                \"sex\": rng.choice([\"male\", \"female\"]),\n",
    "            }\n",
    "            for i in range(dog_n)\n",
    "        ]\n",
    "        + [\n",
    "            {\n",
    "                \"cid\": f\"d-{dog_n + 1}\",\n",
    "                \"name\": \"Madam\",\n",
    "                \"dob\": fake.date_between(dt.date(2015, 1, 1), dt.date(2016, 1, 1)),\n",
    "                \"waist\": 12,\n",
    "                \"sex\": \"female\",\n",
    "            }\n",
    "        ]\n",
    "    )\n",
    "    .set_index(\"cid\")\n",
    "    .assign(date_of_birth=lambda df: pd.to_datetime(df[\"dob\"]))\n",
    "    .drop(\"dob\", axis=1)\n",
    ")\n",
    "\n",
    "rel_df = (\n",
    "    pd.DataFrame(\n",
    "        [\n",
    "            {\n",
    "                \"owner_id\": rng.choice(people_df.index),\n",
    "                \"dog_id\": rng.choice(dog_df.index),\n",
    "                \"since_birth\": rng.random() > 0.2,\n",
    "            }\n",
    "            for _ in range(rel_n)\n",
    "        ]\n",
    "        + [\n",
    "            {\n",
    "                \"owner_id\": f\"p-{people_n + 1}\",\n",
    "                \"dog_id\": f\"d-{dog_n + 1}\",\n",
    "                \"since_birth\": True,\n",
    "            }\n",
    "        ]\n",
    "    )\n",
    "    .drop_duplicates(subset=[\"owner_id\", \"dog_id\"])\n",
    "    .set_index([\"owner_id\", \"dog_id\"])\n",
    ")\n",
    "\n",
    "\n",
    "places = [\"winner\", \"runner_up\", \"special_mention\"]\n",
    "\n",
    "comp_df = (\n",
    "    pd.DataFrame(\n",
    "        [\n",
    "            {\n",
    "                \"competition_id\": f\"c-{i+1}\",\n",
    "                \"prize_pool\": rng.randint(1, 30) * 500,\n",
    "                **{\n",
    "                    f\"{place}__{prefix}__{ind_id}\": val\n",
    "                    for place in places\n",
    "                    for prefix, ind_id, val in zip(\n",
    "                        [\"owner\", \"pet\"],\n",
    "                        [people_df.index.name, dog_df.index.name],\n",
    "                        random.choice(rel_df.index),\n",
    "                    )\n",
    "                },\n",
    "            }\n",
    "            for i in range(comp_n)\n",
    "        ]\n",
    "    )\n",
    "    .set_index(\"competition_id\")\n",
    "    .pipe(\n",
    "        lambda df: pd.concat(\n",
    "            [\n",
    "                df,\n",
    "                pd.DataFrame(\n",
    "                    np.sort(np.random.rand(comp_n, len(places)), axis=1),\n",
    "                    columns=[f\"{place}__prize\" for place in places[::-1]],\n",
    "                    index=df.index,\n",
    "                )\n",
    "                .pipe(\n",
    "                    lambda _df: _df\n",
    "                    / _df.sum(axis=1).to_frame().values\n",
    "                    * np.random.rand(comp_n, 1)\n",
    "                    * df[[\"prize_pool\"]].values\n",
    "                )\n",
    "                .astype(int),\n",
    "            ],\n",
    "            axis=1,\n",
    "        )\n",
    "    )\n",
    ")\n",
    "\n",
    "spotted_df = pd.DataFrame(\n",
    "    [\n",
    "        {\n",
    "            \"dog_1__cid\": rng.choice(dog_df.index),\n",
    "            \"dog_2__cid\": rng.choice(dog_df.index),\n",
    "            **get_nested_address(\"place\"),\n",
    "        }\n",
    "        for _ in range(spotted_n)\n",
    "    ]\n",
    ").set_index(\n",
    "    \"dog_1__cid\"\n",
    ")  # only set so that to_csv does not add new col\n",
    "\n",
    "sizes_df = pd.DataFrame(\n",
    "    {\n",
    "        \"dogsize_name\": [\"XS\", \"SM\", \"MED\", \"LG\", \"XL\"],\n",
    "        \"waist_limit__min\": [8, 16, 18, 24, 27],\n",
    "        \"waist_limit__max\": [22, 27, 34, 39, 50],\n",
    "        \"weight_limit__min\": [10, 20, 40, 75, 90],\n",
    "        \"weight_limit__max\": [20, 45, 80, 100, 140],\n",
    "    }\n",
    ").set_index(\"dogsize_name\")\n",
    "\n",
    "dog2_df = (\n",
    "    # maybe add coreferences from ds1\n",
    "    pd.DataFrame(\n",
    "        [\n",
    "            {\n",
    "                \"cid\": f\"d-{i+1}\",\n",
    "                \"name\": fake.first_name_female(),\n",
    "                \"sex\": rng.choice([\"male\", \"female\"]),\n",
    "                \"date_of_birth\": fake.date_between(\n",
    "                    dt.date(2008, 1, 1), dt.date(2021, 1, 1)\n",
    "                ),\n",
    "                \"size__dogsize_name\": rng.choice(sizes_df.index),\n",
    "                \"color\": fake.color_name() if rng.random() < 0.8 else None,\n",
    "            }\n",
    "            for i in range(dog2_n)\n",
    "        ]\n",
    "    ).set_index(\"cid\")\n",
    ")\n",
    "\n",
    "race_df = (\n",
    "    pd.DataFrame(\n",
    "        [\n",
    "            {\n",
    "                \"competition_id\": f\"cx-{i+1}\",\n",
    "                \"held_date\": fake.date_time_between(\n",
    "                    dt.date(2019, 1, 1), dt.date(2021, 8, 1)\n",
    "                ),\n",
    "                \"fastest_time\": rng.lognormvariate(6, 1),\n",
    "                \"champion__cid\": rng.choice(dog2_df.index),\n",
    "            }\n",
    "            for i in range(comp2_n)\n",
    "        ]\n",
    "    )\n",
    "    .set_index(\"competition_id\")\n",
    "    .assign(held_date=lambda df: df[\"held_date\"].dt.round(\"1h\"))\n",
    ")\n",
    "\n",
    "dotm_ind = [\"dog_type__pure\", \"dog_type__neutered\", \"year\", \"month\"]\n",
    "dog_of_the_month_df = (\n",
    "    pd.DataFrame(\n",
    "        [\n",
    "            {\n",
    "                \"winner__cid\": rng.choice(dog2_df.index),\n",
    "                \"dog_type__pure\": rng.random() > 0.2,\n",
    "                \"dog_type__neutered\": rng.random() > 0.6,\n",
    "                \"year\": rng.randint(2002, 2021),\n",
    "                \"month\": rng.randint(1, 12),\n",
    "            }\n",
    "            for _ in range(dotm_limit)\n",
    "        ]\n",
    "    )\n",
    "    .drop_duplicates(subset=dotm_ind)\n",
    "    .set_index(dotm_ind)\n",
    "    .sort_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51ad5982",
   "metadata": {},
   "outputs": [],
   "source": [
    "for k, v in [*globals().items()]:\n",
    "    if k.endswith(\"_df\"):\n",
    "        fname = k.split(\"_df\")[0] + \".csv\"\n",
    "        v.to_csv(get_raw_data_path(fname))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]"
  },
  "vscode": {
   "interpreter": {
    "hash": "c34365d60e34b30dc48102674d627ffeb37891f3ae5b0d543a727d8b1b3f5762"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}