Test Coverage
    # Parameter Optimization with Optuna
    "In this example we will train a RandomForest model and optimize its parameters using [Optuna](https://optuna.readthedocs.io/en/stable/).\n",
    This example is adapted from the Optuna [Basic Concept example](https://optuna.readthedocs.io/en/stable/#basic-concepts).
      "Initialized empty Git repository in /tmp/tmpp4i3ht48/.git/\n",
      "Initialized DVC repository.\n",
      "You can now commit the changes to git.\n",
      "|                                                                     |\n",
      "|        DVC has enabled anonymous aggregate usage analytics.         |\n",
      "|     Read the analytics documentation (and how to opt-out) here:     |\n",
      "|             <https://dvc.org/doc/user-guide/analytics>              |\n",
      "|                                                                     |\n",
      "What's next?\n",
      "- Check out the documentation: <https://dvc.org/doc>\n",
      "- Get help and share ideas: <https://dvc.org/chat>\n",
      "- Star us on GitHub: <https://github.com/iterative/dvc>\n"
    "# Setup temporary directory and initialize git and dvc\n",
    "from zntrack import config\n",
    "config.nb_name = \"parameter_optimization.ipynb\"\n",
    "from zntrack.utils import cwd_temp_dir\n",
    "temp_dir = cwd_temp_dir()\n",
    "!git init\n",
    "!dvc init"
    "## Workflow\n",
    "Our Workflow consists of multiple steps:\n",
    "- Download the dataset\n",
    "- Split into train / test data\n",
    "- Train a RandomForest model on the train data\n",
    "- Evaluate the model on the test data\n",
    "We want to optimize using two differen Models: RandomForest and LinearSVR with their respective hyperparameters.\n",
    "We want to optimize them and use the `Evaluate` Node to compute a RMSE that Optuna optimizes.\n",
    "We will use DVC [Experiments](https://dvc.org/doc/start/experiments) to track each run.\n",
    "In combination with Optuna, this allows us not only to optimize the parameters but also easily store and access the trained models afterwards.\n"
    "import optuna, sklearn, zntrack\n",
    "import sklearn.datasets\n",
    "import sklearn.ensemble\n",
    "import sklearn.model_selection\n",
    "import sklearn.metrics\n",
    "class HousingDataSet(zntrack.Node):\n",
    "    \"\"\"Download and prepare the California housing dataset.\"\"\"\n",
    "    data = zntrack.dvc.outs(\"scikit_learn_data\")\n",
    "    def run(self) -> None:\n",
    "        _ = sklearn.datasets.fetch_california_housing(\n",
    "            data_home=self.data, return_X_y=True\n",
    "        )\n",
    "    @property\n",
    "    def labels(self) -> dict:\n",
    "        _, labels = sklearn.datasets.fetch_california_housing(\n",
    "            data_home=self.data, return_X_y=True\n",
    "        )\n",
    "        return labels\n",
    "    @property\n",
    "    def features(self) -> dict:\n",
    "        features, _ = sklearn.datasets.fetch_california_housing(\n",
    "            data_home=self.data, return_X_y=True\n",
    "        )\n",
    "        return features\n",
    "class TrainTestSplit(zntrack.Node):\n",
    "    \"\"\"Split the dataset into train and test sets.\"\"\"\n",
    "    labels = zntrack.zn.deps()\n",
    "    features = zntrack.zn.deps()\n",
    "    seed = zntrack.zn.params(1234)\n",
    "    train_features = zntrack.zn.outs()\n",
    "    test_features = zntrack.zn.outs()\n",
    "    train_labels = zntrack.zn.outs()\n",
    "    test_labels = zntrack.zn.outs()\n",
    "    def run(self) -> None:\n",
    "        self.train_features, self.test_features, self.train_labels, self.test_labels = (\n",
    "            sklearn.model_selection.train_test_split(\n",
    "                self.features, self.labels, test_size=0.2, random_state=self.seed\n",
    "            )\n",
    "        )\n",
    "class RandomForest(zntrack.Node):\n",
    "    \"\"\"Train a random forest model.\"\"\"\n",
    "    train_features = zntrack.zn.deps()\n",
    "    train_labels = zntrack.zn.deps()\n",
    "    seed = zntrack.zn.params(1234)\n",
    "    max_depth = zntrack.zn.params()\n",
    "    model = zntrack.zn.outs()\n",
    "    def run(self) -> None:\n",
    "        self.model = sklearn.ensemble.RandomForestRegressor(\n",
    "            random_state=self.seed, max_depth=self.max_depth\n",
    "        )\n",
    "        self.model.fit(self.train_features, self.train_labels)\n",
    "class LinearSVR(zntrack.Node):\n",
    "    \"\"\"Train a SVR model.\"\"\"\n",
    "    train_features = zntrack.zn.deps()\n",
    "    train_labels = zntrack.zn.deps()\n",
    "    C = zntrack.zn.params()\n",
    "    model = zntrack.zn.outs()\n",
    "    def run(self) -> None:\n",
    "        self.model = sklearn.svm.LinearSVR(C=self.C)\n",
    "        self.model.fit(self.train_features, self.train_labels)\n",
    "class Evaluate(zntrack.Node):\n",
    "    \"\"\"Evaluate the model on a test set.\"\"\"\n",
    "    model = zntrack.zn.deps()\n",
    "    test_features = zntrack.zn.deps()\n",
    "    test_labels = zntrack.zn.deps()\n",
    "    score = zntrack.zn.metrics()\n",
    "    def run(self) -> None:\n",
    "        prediction = self.model.predict(self.test_features)\n",
    "        self.score = sklearn.metrics.mean_squared_error(self.test_labels, prediction)"
    "We use the `zntrack.Project` to create our workflow as usual.\n",
    "To use DVC Experiments, we need to create an initial commit.\n",
    "Therefore, we run the project directly and make an initial git commit afterwards."
      "Running DVC command: 'stage add --name HousingDataSet --force ...'\n",
      "Jupyter support is an experimental feature! Please save your notebook before running this command!\n",
      "Submit issues to https://github.com/zincware/ZnTrack.\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name TrainTestSplit --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name model --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name Evaluate --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'repro'\n"
   "source": [
    "with zntrack.Project() as project:\n",
    "    data = HousingDataSet()\n",
    "    split = TrainTestSplit(labels=data.labels, features=data.features)\n",
    "    model = RandomForest(\n",
    "        train_features=split.train_features,\n",
    "        train_labels=split.train_labels,\n",
    "        max_depth=2,\n",
    "        name=\"model\",\n",
    "    )\n",
    "    evaluate = Evaluate(\n",
    "        model=model.model,\n",
    "        test_features=split.test_features,\n",
    "        test_labels=split.test_labels,\n",
    "    )\n",
   evaluate.state
      "[main (root-commit) 6b7996b] initial commit\n",
      " 24 files changed, 1580 insertions(+)\n",
      " create mode 100644 .dvc/.gitignore\n",
      " create mode 100644 .dvc/config\n",
      " create mode 100644 .dvcignore\n",
      " create mode 100644 .gitignore\n",
      " create mode 100644 dvc.lock\n",
      " create mode 100644 dvc.yaml\n",
      " create mode 100644 nodes/Evaluate/node-meta.json\n",
      " create mode 100644 nodes/Evaluate/score.json\n",
      " create mode 100644 nodes/HousingDataSet/node-meta.json\n",
      " create mode 100644 nodes/TrainTestSplit/.gitignore\n",
      " create mode 100644 nodes/TrainTestSplit/node-meta.json\n",
      " create mode 100644 nodes/model/.gitignore\n",
      " create mode 100644 nodes/model/node-meta.json\n",
      " create mode 100644 parameter_optimization.ipynb\n",
      " create mode 100644 params.yaml\n",
      " create mode 100644 src/Evaluate.py\n",
      " create mode 100644 src/HousingDataSet.py\n",
      " create mode 100644 src/RandomForest.py\n",
      " create mode 100644 src/TrainTestSplit.py\n",
      " create mode 100644 src/__pycache__/Evaluate.cpython-310.pyc\n",
      " create mode 100644 src/__pycache__/HousingDataSet.cpython-310.pyc\n",
      " create mode 100644 src/__pycache__/RandomForest.cpython-310.pyc\n",
      " create mode 100644 src/__pycache__/TrainTestSplit.cpython-310.pyc\n",
      " create mode 100644 zntrack.json\n"
   "source": [
    "!git add .\n",
    "!git commit -m \"initial commit\""
    "## Optimize\n",
    "For Optuna we need to define an objective we want to optimize.\n",
    "We use the `project.create_experiment` API from ZnTrack to change the model parameter and return the score from the `Evaluate` stage as final metric to optimize.\n",
    "To later identify the experiments, we name them according to the `trial.number` from optuna."
      [I 2023-07-26 15:58:10,744] A new study created in memory with name: no-name-85a8203d-fed8-45ba-99ba-3adcda3a06c0
      "Running DVC command: 'stage add --name HousingDataSet --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name TrainTestSplit --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name model --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name Evaluate --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'exp apply exp-0'\n",
      [I 2023-07-26 15:58:22,034] Trial 0 finished with value: 0.8952389211454506 and parameters: {'classifier': 'SVR', 'svr_c': 1.9996547699912692e-05}. Best is trial 0 with value: 0.8952389211454506.
      "Running DVC command: 'stage add --name HousingDataSet --force ...'\n"
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name TrainTestSplit --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name model --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name Evaluate --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'exp apply exp-1'\n",
      [I 2023-07-26 15:58:42,694] Trial 1 finished with value: 0.2627596918267919 and parameters: {'classifier': 'RandomForest', 'max_depth': 27}. Best is trial 0 with value: 0.8952389211454506.
      "Running DVC command: 'stage add --name HousingDataSet --force ...'\n"
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name TrainTestSplit --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name model --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'stage add --name Evaluate --force ...'\n",
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running DVC command: 'exp apply exp-2'\n",
      [I 2023-07-26 15:58:54,242] Trial 2 finished with value: 1.5650217116478067 and parameters: {'classifier': 'SVR', 'svr_c': 171896877.50579312}. Best is trial 2 with value: 1.5650217116478067.
     "name": "stdout",
     "output_type": "stream",
     "text": [
   "source": [
    "def objective(trial):\n",
    "    with project.create_experiment(queue=False, name=f\"exp-{trial.number}\") as exp:\n",
    "        regressor_name = trial.suggest_categorical(\"classifier\", [\"SVR\", \"RandomForest\"])\n",
    "        # we need to replace the existing model on the graph with a new model.\n",
    "        project.remove(\"model\")\n",
    "        if regressor_name == \"SVR\":\n",
    "            svr_c = trial.suggest_float(\"svr_c\", 1e-10, 1e10, log=True)\n",
    "            model = LinearSVR(\n",
    "                train_features=split.train_features,\n",
    "                train_labels=split.train_labels,\n",
    "                C=svr_c,\n",
    "                name=\"model\",\n",
    "            )\n",
    "        else:\n",
    "            max_depth = trial.suggest_int(\"max_depth\", 2, 32)\n",
    "            model = RandomForest(\n",
    "                train_features=split.train_features,\n",
    "                train_labels=split.train_labels,\n",
    "                max_depth=max_depth,\n",
    "                name=\"model\",\n",
    "            )\n",
    "        # need to let the evaluate node know which model to evaluate\n",
    "        evaluate.model = model.model\n",
    "    return exp[evaluate].score\n",
    "study = optuna.create_study(\n",
    "    direction=\"maximize\", sampler=optuna.samplers.TPESampler(seed=314)\n",
    "study.optimize(objective, n_trials=3)"
    "## Evaluate\n",
    "We can now investigate the best parameters via `study.best_params`.\n",
    "Additionally, because we used DVC experiments we can directly access the experiment with the best parameters, by the name we used."
     "data": {
      "text/plain": [
       "{'classifier': 'SVR', 'svr_c': 171896877.50579312}"
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
   "source": [
   "cell_type": "code",
     "data": {
      "text/plain": [
       "dict_keys(['exp-2', 'exp-1', 'exp-0'])"
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
   "source": [
    "We can either load the Node via the experiment or by it's name using `zntrack.from_rev()`.\n",
    "The node should not be loaded via `model.load()` because the `model` instance could be `RandomForest` and the best model would be `LinearSVR` or *vice versa*."
   "source": [
    "exp = project.experiments[f\"exp-{study.best_trial.number}\"]\n",
    "best_model = exp[\"model\"]"
     best_model
      "text/plain": [
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
   "source": [
      Best score: 1.565 compared to initial score: 0.750
   "source": [
    "# we load split data into memory to compute the score.\n",
    "best_score = evaluate.from_rev(rev=f\"exp-{study.best_trial.number}\").score\n",
    "initial_score = evaluate.from_rev(rev=\"HEAD\").score\n",
    "print(f\"Best score: {best_score:.3f} compared to initial score: {initial_score:.3f}\")"
