From 5ce6e31930528dafaa988ae889c8a96a4fbfe9d1 Mon Sep 17 00:00:00 2001
From: AlexKolosov <alexckolosov@gmail.com>
Date: Thu, 26 Aug 2021 00:03:22 +0900
Subject: [PATCH 1/9] Step 1: Create structure

---
 .gitignore                                    |   5 +
 README.md                                     |   1 +
 data/.gitignore                               |   3 +
 data/processed/.gitignore                     |   4 +
 data/raw/.gitignore                           |   2 +
 models/.gitignore                             |   1 +
 .../step-0-prototype.ipynb                    |   0
 notebooks/step-1-organize-ml-project.ipynb    | 531 ++++++++++++++++++
 reports/.gitignore                            |   1 +
 9 files changed, 548 insertions(+)
 create mode 100644 data/.gitignore
 create mode 100644 data/processed/.gitignore
 create mode 100644 data/raw/.gitignore
 create mode 100644 models/.gitignore
 rename step-0-prototype.ipynb => notebooks/step-0-prototype.ipynb (100%)
 create mode 100644 notebooks/step-1-organize-ml-project.ipynb
 create mode 100644 reports/.gitignore

diff --git a/.gitignore b/.gitignore
index cd56dce1..186220be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,11 @@
 ## OS configs
 .DS_Store
 
+# Project
+data/*
+models/*
+reports/*
+
 # Python
 __pycache__
 .ipynb_checkpoints
diff --git a/README.md b/README.md
index 83039e3a..5d3aa41d 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@
 ```bash
 git clone https://gitlab.com/iterative.ai/cse/tutorials/course-ds-base.git
 cd course-ds-base
+git checkout step-1
 ```
 
 
diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 00000000..b6e069c5
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1,3 @@
+*
+!*/
+!.gitignore
\ No newline at end of file
diff --git a/data/processed/.gitignore b/data/processed/.gitignore
new file mode 100644
index 00000000..6bd59f84
--- /dev/null
+++ b/data/processed/.gitignore
@@ -0,0 +1,4 @@
+!.gitignore
+!*.dvc
+/train_iris.csv
+/test_iris.csv
\ No newline at end of file
diff --git a/data/raw/.gitignore b/data/raw/.gitignore
new file mode 100644
index 00000000..3fc404be
--- /dev/null
+++ b/data/raw/.gitignore
@@ -0,0 +1,2 @@
+!.gitignore
+!*.dvc
\ No newline at end of file
diff --git a/models/.gitignore b/models/.gitignore
new file mode 100644
index 00000000..b722e9e1
--- /dev/null
+++ b/models/.gitignore
@@ -0,0 +1 @@
+!.gitignore
\ No newline at end of file
diff --git a/step-0-prototype.ipynb b/notebooks/step-0-prototype.ipynb
similarity index 100%
rename from step-0-prototype.ipynb
rename to notebooks/step-0-prototype.ipynb
diff --git a/notebooks/step-1-organize-ml-project.ipynb b/notebooks/step-1-organize-ml-project.ipynb
new file mode 100644
index 00000000..16d63795
--- /dev/null
+++ b/notebooks/step-1-organize-ml-project.ipynb
@@ -0,0 +1,531 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.460557Z",
+     "start_time": "2019-06-16T21:17:29.395297Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import itertools\n",
+    "import joblib\n",
+    "import json\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.metrics import confusion_matrix, f1_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Base\n",
+    "random_state = 42\n",
+    "\n",
+    "# Data\n",
+    "dataset_csv = '../data/raw/iris.csv'\n",
+    "features_path = '../data/processed/featured_iris.csv'\n",
+    "\n",
+    "test_size = 0.2\n",
+    "\n",
+    "trainset_path = '../data/processed/train_iris.csv'\n",
+    "testset_path = '../data/processed/test_iris.csv'\n",
+    "\n",
+    "\n",
+    "# Train\n",
+    "clf_params = {\n",
+    "    'C': 0.001,\n",
+    "    'solver': 'lbfgs',\n",
+    "    'multi_class': 'multinomial',\n",
+    "    'max_iter': 100\n",
+    "}\n",
+    "model_path= '../models/model.joblib'\n",
+    "\n",
+    "# Reports\n",
+    "metrics_file = '../reports/metrics.json'\n",
+    "confusion_matrix_image = '../reports/confusion_matrix.png'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.485189Z",
+     "start_time": "2019-06-16T21:17:31.473720Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get data \n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.datasets import load_iris\n",
+    "\n",
+    "data = load_iris(as_frame=True)\n",
+    "dataset = data.frame\n",
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print labels for target values \n",
+    "\n",
+    "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:32.328046Z",
+     "start_time": "2019-06-16T21:17:32.323611Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# feature names\n",
+    "\n",
+    "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n",
+    "\n",
+    "feature_names = dataset.columns.tolist()[:4]\n",
+    "feature_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save raw data\n",
+    "dataset.to_csv(dataset_csv, index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Features engineering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.150708Z",
+     "start_time": "2019-06-16T21:21:02.144518Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n",
+    "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n",
+    "\n",
+    "dataset = dataset[[\n",
+    "    'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n",
+    "#     'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n",
+    "    'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n",
+    "    'target'\n",
+    "]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.987144Z",
+     "start_time": "2019-06-16T21:21:02.976092Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save features\n",
+    "dataset.to_csv(features_path, index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Split dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:07.438133Z",
+     "start_time": "2019-06-16T21:21:07.431649Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "train_dataset, test_dataset = train_test_split(dataset, test_size=test_size, random_state=random_state)\n",
+    "train_dataset.shape, test_dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save train and test sets\n",
+    "train_dataset.to_csv(trainset_path)\n",
+    "test_dataset.to_csv(testset_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:10.932148Z",
+     "start_time": "2019-06-16T21:21:10.927844Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_train = train_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.427365Z",
+     "start_time": "2019-06-16T21:21:55.416431Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Create an instance of Logistic Regression Classifier CV and fit the data\n",
+    "\n",
+    "logreg = LogisticRegression(**clf_params, random_state=random_state)\n",
+    "logreg.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joblib.dump(logreg, model_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.875303Z",
+     "start_time": "2019-06-16T21:21:55.864724Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def plot_confusion_matrix(cm,\n",
+    "                          target_names,\n",
+    "                          title='Confusion matrix',\n",
+    "                          cmap=None,\n",
+    "                          normalize=True):\n",
+    "    \"\"\"\n",
+    "    given a sklearn confusion matrix (cm), make a nice plot\n",
+    "\n",
+    "    Arguments\n",
+    "    ---------\n",
+    "    cm:           confusion matrix from sklearn.metrics.confusion_matrix\n",
+    "\n",
+    "    target_names: given classification classes such as [0, 1, 2]\n",
+    "                  the class names, for example: ['high', 'medium', 'low']\n",
+    "\n",
+    "    title:        the text to display at the top of the matrix\n",
+    "\n",
+    "    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm\n",
+    "                  see http://matplotlib.org/examples/color/colormaps_reference.html\n",
+    "                  plt.get_cmap('jet') or plt.cm.Blues\n",
+    "\n",
+    "    normalize:    If False, plot the raw numbers\n",
+    "                  If True, plot the proportions\n",
+    "\n",
+    "    Usage\n",
+    "    -----\n",
+    "    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by\n",
+    "                                                              # sklearn.metrics.confusion_matrix\n",
+    "                          normalize    = True,                # show proportions\n",
+    "                          target_names = y_labels_vals,       # list of names of the classes\n",
+    "                          title        = best_estimator_name) # title of graph\n",
+    "\n",
+    "    Citation\n",
+    "    ---------\n",
+    "    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n",
+    "\n",
+    "    \"\"\"\n",
+    "\n",
+    "    accuracy = np.trace(cm) / float(np.sum(cm))\n",
+    "    misclass = 1 - accuracy\n",
+    "\n",
+    "    if cmap is None:\n",
+    "        cmap = plt.get_cmap('Blues')\n",
+    "\n",
+    "    plt.figure(figsize=(8, 6))\n",
+    "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
+    "    plt.title(title)\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    if target_names is not None:\n",
+    "        tick_marks = np.arange(len(target_names))\n",
+    "        plt.xticks(tick_marks, target_names, rotation=45)\n",
+    "        plt.yticks(tick_marks, target_names)\n",
+    "\n",
+    "    if normalize:\n",
+    "        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
+    "\n",
+    "    thresh = cm.max() / 1.5 if normalize else cm.max() / 2\n",
+    "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
+    "        if normalize:\n",
+    "            plt.text(j, i, \"{:0.4f}\".format(cm[i, j]),\n",
+    "                     horizontalalignment=\"center\",\n",
+    "                     color=\"white\" if cm[i, j] > thresh else \"black\")\n",
+    "        else:\n",
+    "            plt.text(j, i, \"{:,}\".format(cm[i, j]),\n",
+    "                     horizontalalignment=\"center\",\n",
+    "                     color=\"white\" if cm[i, j] > thresh else \"black\")\n",
+    "\n",
+    "    plt.tight_layout()\n",
+    "    plt.ylabel('True label')\n",
+    "    plt.xlabel('Predicted label\\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))\n",
+    "    \n",
+    "    return plt.gcf()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.090756Z",
+     "start_time": "2019-06-16T21:21:56.086966Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_test = test_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.270245Z",
+     "start_time": "2019-06-16T21:21:56.265054Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "prediction = logreg.predict(X_test)\n",
+    "cm = confusion_matrix(y_test, prediction)  # signature is (y_true, y_pred): rows = true labels, as the plot's axes assume\n",
+    "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.493617Z",
+     "start_time": "2019-06-16T21:21:56.489929Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# f1 score value\n",
+    "f1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save metrics\n",
+    "metrics = {\n",
+    "    'f1': f1\n",
+    "}\n",
+    "\n",
+    "with open(metrics_file, 'w') as mf:\n",
+    "    json.dump(\n",
+    "        obj=metrics,\n",
+    "        fp=mf,\n",
+    "        indent=4\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.966279Z",
+     "start_time": "2019-06-16T21:21:56.726149Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save confusion matrix image\n",
+    "cm_plot.savefig(confusion_matrix_image)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/reports/.gitignore b/reports/.gitignore
new file mode 100644
index 00000000..b722e9e1
--- /dev/null
+++ b/reports/.gitignore
@@ -0,0 +1 @@
+!.gitignore
\ No newline at end of file

From 1046b4b66185c5cd4b2b9a4aaf0746e4faf2b47b Mon Sep 17 00:00:00 2001
From: AlexKolosov <alexckolosov@gmail.com>
Date: Thu, 26 Aug 2021 13:40:59 +0900
Subject: [PATCH 2/9] Add configuration file

---
 notebooks/step-2-create-config-file.ipynb | 530 ++++++++++++++++++++++
 params.yaml                               |  22 +
 2 files changed, 552 insertions(+)
 create mode 100644 notebooks/step-2-create-config-file.ipynb
 create mode 100644 params.yaml

diff --git a/notebooks/step-2-create-config-file.ipynb b/notebooks/step-2-create-config-file.ipynb
new file mode 100644
index 00000000..a95d4434
--- /dev/null
+++ b/notebooks/step-2-create-config-file.ipynb
@@ -0,0 +1,530 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.460557Z",
+     "start_time": "2019-06-16T21:17:29.395297Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import itertools\n",
+    "import joblib\n",
+    "import json\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.metrics import confusion_matrix, f1_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import yaml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Go to project root folder\n",
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read config\n",
+    "import pprint\n",
+    "\n",
+    "with open('params.yaml') as conf_file:\n",
+    "    config = yaml.safe_load(conf_file)\n",
+    "\n",
+    "pprint.pprint(config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.485189Z",
+     "start_time": "2019-06-16T21:17:31.473720Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get data \n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.datasets import load_iris\n",
+    "\n",
+    "data = load_iris(as_frame=True)\n",
+    "dataset = data.frame\n",
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print labels for target values \n",
+    "\n",
+    "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:32.328046Z",
+     "start_time": "2019-06-16T21:17:32.323611Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# feature names\n",
+    "\n",
+    "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n",
+    "\n",
+    "feature_names = dataset.columns.tolist()[:4]\n",
+    "feature_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save raw data\n",
+    "dataset.to_csv(config['data']['dataset_csv'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Features engineering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.150708Z",
+     "start_time": "2019-06-16T21:21:02.144518Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n",
+    "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n",
+    "\n",
+    "dataset = dataset[[\n",
+    "    'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n",
+    "#     'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n",
+    "    'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n",
+    "    'target'\n",
+    "]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.987144Z",
+     "start_time": "2019-06-16T21:21:02.976092Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save features\n",
+    "dataset.to_csv(config['data']['features_path'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Split dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:07.438133Z",
+     "start_time": "2019-06-16T21:21:07.431649Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "train_dataset, test_dataset = train_test_split(\n",
+    "    dataset, test_size=config['data']['test_size'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "train_dataset.shape, test_dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save train and test sets\n",
+    "train_dataset.to_csv(config['data']['trainset_path'])\n",
+    "test_dataset.to_csv(config['data']['testset_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:10.932148Z",
+     "start_time": "2019-06-16T21:21:10.927844Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_train = train_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.427365Z",
+     "start_time": "2019-06-16T21:21:55.416431Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Create an instance of Logistic Regression Classifier CV and fit the data\n",
+    "\n",
+    "logreg = LogisticRegression(\n",
+    "    **config['train']['clf_params'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "logreg.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joblib.dump(logreg, config['train']['model_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.875303Z",
+     "start_time": "2019-06-16T21:21:55.864724Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def plot_confusion_matrix(cm,\n",
+    "                          target_names,\n",
+    "                          title='Confusion matrix',\n",
+    "                          cmap=None,\n",
+    "                          normalize=True):\n",
+    "    \"\"\"\n",
+    "    given a sklearn confusion matrix (cm), make a nice plot\n",
+    "\n",
+    "    Arguments\n",
+    "    ---------\n",
+    "    cm:           confusion matrix from sklearn.metrics.confusion_matrix\n",
+    "\n",
+    "    target_names: given classification classes such as [0, 1, 2]\n",
+    "                  the class names, for example: ['high', 'medium', 'low']\n",
+    "\n",
+    "    title:        the text to display at the top of the matrix\n",
+    "\n",
+    "    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm\n",
+    "                  see http://matplotlib.org/examples/color/colormaps_reference.html\n",
+    "                  plt.get_cmap('jet') or plt.cm.Blues\n",
+    "\n",
+    "    normalize:    If False, plot the raw numbers\n",
+    "                  If True, plot the proportions\n",
+    "\n",
+    "    Usage\n",
+    "    -----\n",
+    "    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by\n",
+    "                                                              # sklearn.metrics.confusion_matrix\n",
+    "                          normalize    = True,                # show proportions\n",
+    "                          target_names = y_labels_vals,       # list of names of the classes\n",
+    "                          title        = best_estimator_name) # title of graph\n",
+    "\n",
+    "    Citation\n",
+    "    ---------\n",
+    "    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n",
+    "\n",
+    "    \"\"\"\n",
+    "\n",
+    "    accuracy = np.trace(cm) / float(np.sum(cm))\n",
+    "    misclass = 1 - accuracy\n",
+    "\n",
+    "    if cmap is None:\n",
+    "        cmap = plt.get_cmap('Blues')\n",
+    "\n",
+    "    plt.figure(figsize=(8, 6))\n",
+    "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
+    "    plt.title(title)\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    if target_names is not None:\n",
+    "        tick_marks = np.arange(len(target_names))\n",
+    "        plt.xticks(tick_marks, target_names, rotation=45)\n",
+    "        plt.yticks(tick_marks, target_names)\n",
+    "\n",
+    "    if normalize:\n",
+    "        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
+    "\n",
+    "    thresh = cm.max() / 1.5 if normalize else cm.max() / 2\n",
+    "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
+    "        if normalize:\n",
+    "            plt.text(j, i, \"{:0.4f}\".format(cm[i, j]),\n",
+    "                     horizontalalignment=\"center\",\n",
+    "                     color=\"white\" if cm[i, j] > thresh else \"black\")\n",
+    "        else:\n",
+    "            plt.text(j, i, \"{:,}\".format(cm[i, j]),\n",
+    "                     horizontalalignment=\"center\",\n",
+    "                     color=\"white\" if cm[i, j] > thresh else \"black\")\n",
+    "\n",
+    "    plt.tight_layout()\n",
+    "    plt.ylabel('True label')\n",
+    "    plt.xlabel('Predicted label\\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))\n",
+    "    \n",
+    "    return plt.gcf()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.090756Z",
+     "start_time": "2019-06-16T21:21:56.086966Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_test = test_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.270245Z",
+     "start_time": "2019-06-16T21:21:56.265054Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "prediction = logreg.predict(X_test)\n",
+    "cm = confusion_matrix(y_test, prediction)  # signature is (y_true, y_pred): rows = true labels, as the plot's axes assume\n",
+    "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.493617Z",
+     "start_time": "2019-06-16T21:21:56.489929Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# f1 score value\n",
+    "f1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save metrics\n",
+    "metrics = {\n",
+    "    'f1': f1\n",
+    "}\n",
+    "\n",
+    "with open(config['reports']['metrics_file'], 'w') as mf:\n",
+    "    json.dump(\n",
+    "        obj=metrics,\n",
+    "        fp=mf,\n",
+    "        indent=4\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.966279Z",
+     "start_time": "2019-06-16T21:21:56.726149Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save confusion matrix image\n",
+    "cm_plot.savefig(config['reports']['confusion_matrix_image'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/params.yaml b/params.yaml
new file mode 100644
index 00000000..a48a0538
--- /dev/null
+++ b/params.yaml
@@ -0,0 +1,22 @@
+base:
+  random_state: 42
+
+data:
+  dataset_csv: 'data/raw/iris.csv'
+  features_path: 'data/processed/featured_iris.csv'
+  test_size: 0.2
+  trainset_path: 'data/processed/train_iris.csv'
+  testset_path: 'data/processed/test_iris.csv'
+
+
+train:
+  clf_params:
+    'C': 0.001
+    'solver': 'lbfgs'
+    'multi_class': 'multinomial'
+    'max_iter': 100
+  model_path: 'models/model.joblib'
+
+reports:
+  metrics_file: 'reports/metrics.json'
+  confusion_matrix_image: 'reports/confusion_matrix.png'
\ No newline at end of file

From 920f7b46a30537132c980fe88af167fd9f24b0ca Mon Sep 17 00:00:00 2001
From: AlexKolosov <alexckolosov@gmail.com>
Date: Thu, 26 Aug 2021 16:11:07 +0900
Subject: [PATCH 3/9] Distribute configs through notebook

---
 notebooks/step-1-organize-ml-project.ipynb | 58 +++++++---------------
 1 file changed, 18 insertions(+), 40 deletions(-)

diff --git a/notebooks/step-1-organize-ml-project.ipynb b/notebooks/step-1-organize-ml-project.ipynb
index 16d63795..3a115fea 100644
--- a/notebooks/step-1-organize-ml-project.ipynb
+++ b/notebooks/step-1-organize-ml-project.ipynb
@@ -22,46 +22,6 @@
     "from sklearn.model_selection import train_test_split"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Config"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Base\n",
-    "random_state = 42\n",
-    "\n",
-    "# Data\n",
-    "dataset_csv = '../data/raw/iris.csv'\n",
-    "features_path = '../data/processed/featured_iris.csv'\n",
-    "\n",
-    "test_size = 0.2\n",
-    "\n",
-    "trainset_path = '../data/processed/train_iris.csv'\n",
-    "testset_path = '../data/processed/test_iris.csv'\n",
-    "\n",
-    "\n",
-    "# Train\n",
-    "clf_params = {\n",
-    "    'C': 0.001,\n",
-    "    'solver': 'lbfgs',\n",
-    "    'multi_class': 'multinomial',\n",
-    "    'max_iter': 100\n",
-    "}\n",
-    "model_path= '../models/model.joblib'\n",
-    "\n",
-    "# Reports\n",
-    "metrics_file = '../reports/metrics.json'\n",
-    "confusion_matrix_image = '../reports/confusion_matrix.png'"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -127,6 +87,7 @@
    "outputs": [],
    "source": [
     "# Save raw data\n",
+    "dataset_csv = '../data/raw/iris.csv'\n",
     "dataset.to_csv(dataset_csv, index=False)"
    ]
   },
@@ -180,6 +141,7 @@
    "outputs": [],
    "source": [
     "# Save features\n",
+    "features_path = '../data/processed/featured_iris.csv'\n",
     "dataset.to_csv(features_path, index=False)"
    ]
   },
@@ -201,6 +163,9 @@
    },
    "outputs": [],
    "source": [
+    "random_state = 42\n",
+    "test_size = 0.2\n",
+    "\n",
     "train_dataset, test_dataset = train_test_split(dataset, test_size=test_size, random_state=random_state)\n",
     "train_dataset.shape, test_dataset.shape"
    ]
@@ -212,6 +177,9 @@
    "outputs": [],
    "source": [
     "# Save train and test sets\n",
+    "trainset_path = '../data/processed/train_iris.csv'\n",
+    "testset_path = '../data/processed/test_iris.csv'\n",
+    "\n",
     "train_dataset.to_csv(trainset_path)\n",
     "test_dataset.to_csv(testset_path)"
    ]
@@ -252,6 +220,12 @@
    "outputs": [],
    "source": [
     "# Create an instance of Logistic Regression Classifier CV and fit the data\n",
+    "clf_params = {\n",
+    "    'C': 0.001,\n",
+    "    'solver': 'lbfgs',\n",
+    "    'multi_class': 'multinomial',\n",
+    "    'max_iter': 100\n",
+    "}\n",
     "\n",
     "logreg = LogisticRegression(**clf_params, random_state=random_state)\n",
     "logreg.fit(X_train, y_train)"
@@ -263,6 +237,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "model_path= '../models/model.joblib'\n",
     "joblib.dump(logreg, model_path)"
    ]
   },
@@ -414,6 +389,8 @@
    "outputs": [],
    "source": [
     "# Save metrics\n",
+    "metrics_file = '../reports/metrics.json'\n",
+    "\n",
     "metrics = {\n",
     "    'f1': f1\n",
     "}\n",
@@ -447,6 +424,7 @@
    "outputs": [],
    "source": [
     "# Save confusion matrix image\n",
+    "confusion_matrix_image = '../reports/confusion_matrix.png'\n",
     "cm_plot.savefig(confusion_matrix_image)"
    ]
   },

From e85c826b6fb95eb54d1860af4a688b0d7bfd5ee0 Mon Sep 17 00:00:00 2001
From: AlexKolosov <alexckolosov@gmail.com>
Date: Thu, 26 Aug 2021 16:51:48 +0900
Subject: [PATCH 4/9] Update docs: Fix branch name to checkout

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5d3aa41d..6083ff24 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 ```bash
 git clone https://gitlab.com/iterative.ai/cse/tutorials/course-ds-base.git
 cd course-ds-base
-git checkout step-1
+git checkout step-2
 ```
 
 

From ebe019100b0112c8101ca75ecf7aa74b7c1005d2 Mon Sep 17 00:00:00 2001
From: Mikhail <mnrozhkov@gmail.com>
Date: Wed, 3 Nov 2021 14:43:29 +0300
Subject: [PATCH 5/9] Update README

---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 6083ff24..2d58776a 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,8 @@
 ### 1. Fork / Clone this repository
 
 ```bash
-git clone https://gitlab.com/iterative.ai/cse/tutorials/course-ds-base.git
+git clone https://github.com/iterative/course-ds-base.git
 cd course-ds-base
-git checkout step-2
 ```
 
 

From 4a25bd3f5ecf2e433d007f1a2689682fde071c38 Mon Sep 17 00:00:00 2001
From: Mikhail <mnrozhkov@gmail.com>
Date: Wed, 3 Nov 2021 14:57:12 +0300
Subject: [PATCH 6/9] Update step-3 solution

---
 README.md                                 |   1 +
 notebooks/step-2-create-config-file.ipynb |   8 +-
 notebooks/step-3-reusable-code.ipynb      | 435 ++++++++++++++++++++++
 src/report/visualize.py                   |  80 ++++
 4 files changed, 521 insertions(+), 3 deletions(-)
 create mode 100644 notebooks/step-3-reusable-code.ipynb
 create mode 100644 src/report/visualize.py

diff --git a/README.md b/README.md
index 2d58776a..6fd7557f 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ cd course-ds-base
 Create virtual environment named `dvc-venv` (you may use other name)
 ```bash
 python3 -m venv dvc-venv
+echo "export PYTHONPATH=$PWD" >> dvc-venv/bin/activate
 source dvc-venv/bin/activate
 ```
 Install python libraries
diff --git a/notebooks/step-2-create-config-file.ipynb b/notebooks/step-2-create-config-file.ipynb
index a95d4434..1169b196 100644
--- a/notebooks/step-2-create-config-file.ipynb
+++ b/notebooks/step-2-create-config-file.ipynb
@@ -465,9 +465,11 @@
   }
  ],
  "metadata": {
+  "interpreter": {
+   "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9"
+  },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
+   "display_name": "Python 3.9.4 64-bit ('dvc-venv': venv)",
    "name": "python3"
   },
   "language_info": {
@@ -480,7 +482,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.9.4"
   },
   "toc": {
    "base_numbering": 1,
diff --git a/notebooks/step-3-reusable-code.ipynb b/notebooks/step-3-reusable-code.ipynb
new file mode 100644
index 00000000..0f026c8c
--- /dev/null
+++ b/notebooks/step-3-reusable-code.ipynb
@@ -0,0 +1,435 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.460557Z",
+     "start_time": "2019-06-16T21:17:29.395297Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import itertools\n",
+    "import joblib\n",
+    "import json\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.metrics import confusion_matrix, f1_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import yaml\n",
+    "\n",
+    "# import plot_confusion_matrix()\n",
+    "from src.report.visualize import plot_confusion_matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Go to project root folder\n",
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read config\n",
+    "import pprint\n",
+    "\n",
+    "with open('params.yaml') as conf_file:\n",
+    "    config = yaml.safe_load(conf_file)\n",
+    "\n",
+    "pprint.pprint(config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.485189Z",
+     "start_time": "2019-06-16T21:17:31.473720Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get data \n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.datasets import load_iris\n",
+    "\n",
+    "data = load_iris(as_frame=True)\n",
+    "dataset = data.frame\n",
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print labels for target values \n",
+    "\n",
+    "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:32.328046Z",
+     "start_time": "2019-06-16T21:17:32.323611Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# feature names\n",
+    "\n",
+    "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n",
+    "\n",
+    "feature_names = dataset.columns.tolist()[:4]\n",
+    "feature_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save raw data\n",
+    "dataset.to_csv(config['data']['dataset_csv'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Features engineering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.150708Z",
+     "start_time": "2019-06-16T21:21:02.144518Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n",
+    "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n",
+    "\n",
+    "dataset = dataset[[\n",
+    "    'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n",
+    "#     'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n",
+    "    'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n",
+    "    'target'\n",
+    "]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.987144Z",
+     "start_time": "2019-06-16T21:21:02.976092Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save features\n",
+    "dataset.to_csv(config['data']['features_path'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Split dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:07.438133Z",
+     "start_time": "2019-06-16T21:21:07.431649Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "train_dataset, test_dataset = train_test_split(\n",
+    "    dataset, test_size=config['data']['test_size'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "train_dataset.shape, test_dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save train and test sets\n",
+    "train_dataset.to_csv(config['data']['trainset_path'])\n",
+    "test_dataset.to_csv(config['data']['testset_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:10.932148Z",
+     "start_time": "2019-06-16T21:21:10.927844Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_train = train_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.427365Z",
+     "start_time": "2019-06-16T21:21:55.416431Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Create an instance of the Logistic Regression classifier and fit it to the training data\n",
+    "\n",
+    "logreg = LogisticRegression(\n",
+    "    **config['train']['clf_params'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "logreg.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joblib.dump(logreg, config['train']['model_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.090756Z",
+     "start_time": "2019-06-16T21:21:56.086966Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_test = test_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.270245Z",
+     "start_time": "2019-06-16T21:21:56.265054Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "prediction = logreg.predict(X_test)\n",
+    "cm = confusion_matrix(y_true=y_test, y_pred=prediction)  # rows: true labels, columns: predictions\n",
+    "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.493617Z",
+     "start_time": "2019-06-16T21:21:56.489929Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# f1 score value\n",
+    "f1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save metrics\n",
+    "metrics = {\n",
+    "    'f1': f1\n",
+    "}\n",
+    "\n",
+    "with open(config['reports']['metrics_file'], 'w') as mf:\n",
+    "    json.dump(\n",
+    "        obj=metrics,\n",
+    "        fp=mf,\n",
+    "        indent=4\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.966279Z",
+     "start_time": "2019-06-16T21:21:56.726149Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save confusion matrix image\n",
+    "cm_plot.savefig(config['reports']['confusion_matrix_image'])"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.4 64-bit ('dvc-venv': venv)",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.4"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/src/report/visualize.py b/src/report/visualize.py
new file mode 100644
index 00000000..7656d4b3
--- /dev/null
+++ b/src/report/visualize.py
@@ -0,0 +1,80 @@
+import itertools
+import matplotlib.colors
+import matplotlib.pyplot as plt
+import numpy as np
+from typing import List, Text
+
+
+def plot_confusion_matrix(cm: np.ndarray,
+                          target_names: List[Text],
+                          title: Text = 'Confusion matrix',
+                          cmap: matplotlib.colors.Colormap = None,
+                          normalize: bool = True) -> plt.Figure:
+    """
+    Plot a sklearn confusion matrix (cm) on a new 8x6 figure and return that figure.
+
+    Arguments
+    ---------
+    cm:           confusion matrix from sklearn.metrics.confusion_matrix
+
+    target_names: given classification classes such as [0, 1, 2]
+                  the class names, for example: ['high', 'medium', 'low'];
+                  pass None to keep the default numeric ticks
+
+    title:        the text to display at the top of the matrix
+
+    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
+                  see http://matplotlib.org/examples/color/colormaps_reference.html
+                  plt.get_cmap('jet') or plt.cm.Blues (default when None: 'Blues')
+
+    normalize:    If False, plot the raw numbers
+                  If True, plot the proportions (each row divided by its row sum)
+
+    Usage
+    -----
+    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
+                                                              # sklearn.metrics.confusion_matrix
+                          normalize    = True,                # show proportions
+                          target_names = y_labels_vals,       # list of names of the classes
+                          title        = best_estimator_name) # title of graph
+
+    Citation
+    --------
+    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
+    """
+    cm = np.asarray(cm)  # also accept nested lists
+    # Accuracy is computed from the raw counts, before any normalization.
+    accuracy = np.trace(cm) / float(np.sum(cm))
+    misclass = 1 - accuracy
+
+    if cmap is None:
+        cmap = plt.get_cmap('Blues')
+
+    if normalize:
+        # Normalize before drawing so colors, colorbar and cell text all show
+        # proportions; rows without samples stay 0 instead of becoming NaN.
+        row_sums = cm.sum(axis=1, keepdims=True)
+        cm = np.divide(cm, row_sums, out=np.zeros(cm.shape), where=row_sums != 0)
+
+    fig = plt.figure(figsize=(8, 6))
+    plt.imshow(cm, interpolation='nearest', cmap=cmap)
+    plt.title(title)
+    plt.colorbar()
+
+    if target_names is not None:
+        tick_marks = np.arange(len(target_names))
+        plt.xticks(tick_marks, target_names, rotation=45)
+        plt.yticks(tick_marks, target_names)
+
+    cell_fmt = "{:0.4f}" if normalize else "{:,}"
+    # Values above the threshold get white text (dark cells in the default colormap).
+    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
+    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+        plt.text(j, i, cell_fmt.format(cm[i, j]), horizontalalignment="center",
+                 color="white" if cm[i, j] > thresh else "black")
+
+    plt.ylabel('True label')
+    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
+    # Lay out last so the axis labels set above are not clipped when saving.
+    plt.tight_layout()
+    return fig

From b12c5dbcfd7c4e0b3dd2b755854bdbd634334e93 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Redzy=C5=84ski?= <pawelredzynski@gmail.com>
Date: Wed, 17 Nov 2021 17:10:04 +0100
Subject: [PATCH 7/9] dvc: update version to 2.8.3

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index d5b4910e..d04337a4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-dvc==2.6.4
+dvc==2.8.3
 joblib==1.0.1
 jupyter==1.0.0
 jupyter_contrib_nbextensions==0.5.1

From 511f1e31bb7afb8e8a8e17d8810b23dfe2c944b7 Mon Sep 17 00:00:00 2001
From: Jenifer De Figueiredo <jeny.defigueiredo@gmail.com>
Date: Wed, 29 Mar 2023 12:59:04 -0700
Subject: [PATCH 8/9] catching up

---
 notebooks/step-3-reusable-code.ipynb     |   3 +-
 notebooks/step-4-build-ml-pipeline.ipynb | 459 +++++++++++++++++++++++
 src/stages/data_load.py                  |   0
 3 files changed, 461 insertions(+), 1 deletion(-)
 create mode 100644 notebooks/step-4-build-ml-pipeline.ipynb
 create mode 100644 src/stages/data_load.py

diff --git a/notebooks/step-3-reusable-code.ipynb b/notebooks/step-3-reusable-code.ipynb
index 0f026c8c..c083cdb5 100644
--- a/notebooks/step-3-reusable-code.ipynb
+++ b/notebooks/step-3-reusable-code.ipynb
@@ -372,7 +372,8 @@
    "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9"
   },
   "kernelspec": {
-   "display_name": "Python 3.9.4 64-bit ('dvc-venv': venv)",
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
    "name": "python3"
   },
   "language_info": {
diff --git a/notebooks/step-4-build-ml-pipeline.ipynb b/notebooks/step-4-build-ml-pipeline.ipynb
new file mode 100644
index 00000000..51955f51
--- /dev/null
+++ b/notebooks/step-4-build-ml-pipeline.ipynb
@@ -0,0 +1,459 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.460557Z",
+     "start_time": "2019-06-16T21:17:29.395297Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    },
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'src'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[2], line 16\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01myaml\u001b[39;00m\n\u001b[1;32m     15\u001b[0m \u001b[38;5;66;03m# import plot_confusion_matrix()\u001b[39;00m\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msrc\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreport\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mvisualize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m plot_confusion_matrix\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'src'"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import itertools\n",
+    "import joblib\n",
+    "import json\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.metrics import confusion_matrix, f1_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import yaml\n",
+    "\n",
+    "# import plot_confusion_matrix()\n",
+    "from src.report.visualize import plot_confusion_matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Go to project root folder\n",
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read config\n",
+    "import pprint\n",
+    "\n",
+    "with open('params.yaml') as conf_file:\n",
+    "    config = yaml.safe_load(conf_file)\n",
+    "\n",
+    "pprint.pprint(config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:31.485189Z",
+     "start_time": "2019-06-16T21:17:31.473720Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get data \n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.datasets import load_iris\n",
+    "\n",
+    "data = load_iris(as_frame=True)\n",
+    "dataset = data.frame\n",
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print labels for target values \n",
+    "\n",
+    "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:17:32.328046Z",
+     "start_time": "2019-06-16T21:17:32.323611Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# feature names\n",
+    "\n",
+    "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n",
+    "\n",
+    "feature_names = dataset.columns.tolist()[:4]\n",
+    "feature_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save raw data\n",
+    "dataset.to_csv(config['data']['dataset_csv'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Features engineering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.150708Z",
+     "start_time": "2019-06-16T21:21:02.144518Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n",
+    "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n",
+    "\n",
+    "dataset = dataset[[\n",
+    "    'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n",
+    "#     'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n",
+    "    'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n",
+    "    'target'\n",
+    "]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:02.987144Z",
+     "start_time": "2019-06-16T21:21:02.976092Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save features\n",
+    "dataset.to_csv(config['data']['features_path'], index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Split dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:07.438133Z",
+     "start_time": "2019-06-16T21:21:07.431649Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "train_dataset, test_dataset = train_test_split(\n",
+    "    dataset, test_size=config['data']['test_size'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "train_dataset.shape, test_dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save train and test sets\n",
+    "train_dataset.to_csv(config['data']['trainset_path'])\n",
+    "test_dataset.to_csv(config['data']['testset_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:10.932148Z",
+     "start_time": "2019-06-16T21:21:10.927844Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_train = train_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:55.427365Z",
+     "start_time": "2019-06-16T21:21:55.416431Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Create an instance of Logistic Regression Classifier CV and fit the data\n",
+    "\n",
+    "logreg = LogisticRegression(\n",
+    "    **config['train']['clf_params'],\n",
+    "    random_state=config['base']['random_state']\n",
+    ")\n",
+    "logreg.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joblib.dump(logreg, config['train']['model_path'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.090756Z",
+     "start_time": "2019-06-16T21:21:56.086966Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Get X and Y\n",
+    "\n",
+    "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n",
+    "X_test = test_dataset.drop('target', axis=1).values.astype('float32')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.270245Z",
+     "start_time": "2019-06-16T21:21:56.265054Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "prediction = logreg.predict(X_test)\n",
+    "cm = confusion_matrix(prediction, y_test)\n",
+    "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.493617Z",
+     "start_time": "2019-06-16T21:21:56.489929Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# f1 score value\n",
+    "f1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save metrics\n",
+    "metrics = {\n",
+    "    'f1': f1\n",
+    "}\n",
+    "\n",
+    "with open(config['reports']['metrics_file'], 'w') as mf:\n",
+    "    json.dump(\n",
+    "        obj=metrics,\n",
+    "        fp=mf,\n",
+    "        indent=4\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-16T21:21:56.966279Z",
+     "start_time": "2019-06-16T21:21:56.726149Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save confusion matrix image\n",
+    "cm_plot.savefig(config['reports']['confusion_matrix_image'])"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.2"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/src/stages/data_load.py b/src/stages/data_load.py
new file mode 100644
index 00000000..e69de29b

From 95c7c572c414552da9ddd6eae5a586bf54abcb91 Mon Sep 17 00:00:00 2001
From: Jenifer De Figueiredo <jeny.defigueiredo@gmail.com>
Date: Wed, 5 Apr 2023 09:38:46 -0700
Subject: [PATCH 9/9] changes

---
 notebooks/step-4-build-ml-pipeline.ipynb | 129 ++++++++++-------------
 src/stages/data_load.py                  |  27 +++++
 2 files changed, 83 insertions(+), 73 deletions(-)

diff --git a/notebooks/step-4-build-ml-pipeline.ipynb b/notebooks/step-4-build-ml-pipeline.ipynb
index 51955f51..5f6246f5 100644
--- a/notebooks/step-4-build-ml-pipeline.ipynb
+++ b/notebooks/step-4-build-ml-pipeline.ipynb
@@ -2,34 +2,14 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2019-06-16T21:17:31.460557Z",
      "start_time": "2019-06-16T21:17:29.395297Z"
     }
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
-     ]
-    },
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'src'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[2], line 16\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01myaml\u001b[39;00m\n\u001b[1;32m     15\u001b[0m \u001b[38;5;66;03m# import plot_confusion_matrix()\u001b[39;00m\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msrc\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreport\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mvisualize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m plot_confusion_matrix\n",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'src'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%load_ext autoreload\n",
     "%autoreload 2\n",
@@ -51,9 +31,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/jenif/course-ds-base\n"
+     ]
+    }
+   ],
    "source": [
     "# Go to project root folder\n",
     "%cd .."
@@ -68,9 +56,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'base': {'random_state': 42},\n",
+      " 'data': {'dataset_csv': 'data/raw/iris.csv',\n",
+      "          'features_path': 'data/processed/featured_iris.csv',\n",
+      "          'test_size': 0.2,\n",
+      "          'testset_path': 'data/processed/test_iris.csv',\n",
+      "          'trainset_path': 'data/processed/train_iris.csv'},\n",
+      " 'reports': {'confusion_matrix_image': 'reports/confusion_matrix.png',\n",
+      "             'metrics_file': 'reports/metrics.json'},\n",
+      " 'train': {'clf_params': {'C': 0.001,\n",
+      "                          'max_iter': 100,\n",
+      "                          'multi_class': 'multinomial',\n",
+      "                          'solver': 'lbfgs'},\n",
+      "           'model_path': 'models/model.joblib'}}\n"
+     ]
+    }
+   ],
    "source": [
     "# Read config\n",
     "import pprint\n",
@@ -90,63 +98,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2019-06-16T21:17:31.485189Z",
-     "start_time": "2019-06-16T21:17:31.473720Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "# Get data \n",
-    "\n",
-    "import pandas as pd\n",
-    "from sklearn.datasets import load_iris\n",
-    "\n",
-    "data = load_iris(as_frame=True)\n",
-    "dataset = data.frame\n",
-    "dataset.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "# print labels for target values \n",
-    "\n",
-    "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2019-06-16T21:17:32.328046Z",
-     "start_time": "2019-06-16T21:17:32.323611Z"
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data Load complete\n"
+     ]
     }
-   },
-   "outputs": [],
+   ],
    "source": [
-    "# feature names\n",
-    "\n",
-    "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n",
+    "from src.stages.data_load import data_load\n",
     "\n",
-    "feature_names = dataset.columns.tolist()[:4]\n",
-    "feature_names"
+    "data_load(config_path = \"params.yaml\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data Load complete\r\n"
+     ]
+    }
+   ],
    "source": [
-    "# Save raw data\n",
-    "dataset.to_csv(config['data']['dataset_csv'], index=False)"
+    "!python src/stages/data_load.py --config=params.yaml"
    ]
   },
   {
diff --git a/src/stages/data_load.py b/src/stages/data_load.py
index e69de29b..36e8efb1 100644
--- a/src/stages/data_load.py
+++ b/src/stages/data_load.py
@@ -0,0 +1,27 @@
+import argparse
+import pandas as pd
+from sklearn.datasets import load_iris
+from typing import Text
+import yaml
+
+
+def data_load(config_path: Text) -> None:
+    """Save the iris dataset to the CSV path given by ``data.dataset_csv`` in the YAML config.
+
+    A trailing ' (cm)' is dropped from column names and spaces become underscores.
+    """
+    with open(config_path) as conf_file:
+        config = yaml.safe_load(conf_file)
+    dataset = load_iris(as_frame=True).frame
+    unit = ' (cm)'  # removed as a literal suffix; str.strip(unit) would strip a character set
+    dataset.columns = [(c[:-len(unit)] if c.endswith(unit) else c).replace(' ', '_') for c in dataset.columns]
+    dataset.to_csv(config['data']['dataset_csv'], index=False)
+    print("Data Load complete")
+
+if __name__ == '__main__':
+    # Command-line entry point: --config is mandatory and is handed to data_load unchanged.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--config', dest='config', required=True)
+    cli_args = parser.parse_args()
+
+    data_load(config_path=cli_args.config)
\ No newline at end of file