From 5ce6e31930528dafaa988ae889c8a96a4fbfe9d1 Mon Sep 17 00:00:00 2001 From: AlexKolosov Date: Thu, 26 Aug 2021 00:03:22 +0900 Subject: [PATCH 1/9] Step 1: Create structure --- .gitignore | 5 + README.md | 1 + data/.gitignore | 3 + data/processed/.gitignore | 4 + data/raw/.gitignore | 2 + models/.gitignore | 1 + .../step-0-prototype.ipynb | 0 notebooks/step-1-organize-ml-project.ipynb | 531 ++++++++++++++++++ reports/.gitignore | 1 + 9 files changed, 548 insertions(+) create mode 100644 data/.gitignore create mode 100644 data/processed/.gitignore create mode 100644 data/raw/.gitignore create mode 100644 models/.gitignore rename step-0-prototype.ipynb => notebooks/step-0-prototype.ipynb (100%) create mode 100644 notebooks/step-1-organize-ml-project.ipynb create mode 100644 reports/.gitignore diff --git a/.gitignore b/.gitignore index cd56dce1..186220be 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,11 @@ ## OS configs .DS_Store +# Project +data/* +models/* +reports/* + # Python __pycache__ .ipynb_checkpoints diff --git a/README.md b/README.md index 83039e3a..5d3aa41d 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ ```bash git clone https://gitlab.com/iterative.ai/cse/tutorials/course-ds-base.git cd course-ds-base +git checkout step-1 ``` diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 00000000..b6e069c5 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,3 @@ +* +!*/ +!.gitignore \ No newline at end of file diff --git a/data/processed/.gitignore b/data/processed/.gitignore new file mode 100644 index 00000000..6bd59f84 --- /dev/null +++ b/data/processed/.gitignore @@ -0,0 +1,4 @@ +!.gitignore +!*.dvc +/train_iris.csv +/test_iris.csv \ No newline at end of file diff --git a/data/raw/.gitignore b/data/raw/.gitignore new file mode 100644 index 00000000..3fc404be --- /dev/null +++ b/data/raw/.gitignore @@ -0,0 +1,2 @@ +!.gitignore +!*.dvc \ No newline at end of file diff --git a/models/.gitignore b/models/.gitignore new file mode 100644 index 00000000..b722e9e1 --- /dev/null +++ b/models/.gitignore @@ -0,0 +1 @@ +!.gitignore \ No newline at end of file diff --git a/step-0-prototype.ipynb b/notebooks/step-0-prototype.ipynb similarity index 100% rename from step-0-prototype.ipynb rename to notebooks/step-0-prototype.ipynb diff --git a/notebooks/step-1-organize-ml-project.ipynb b/notebooks/step-1-organize-ml-project.ipynb new file mode 100644 index 00000000..16d63795 --- /dev/null +++ b/notebooks/step-1-organize-ml-project.ipynb @@ -0,0 +1,531 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:31.460557Z", + "start_time": "2019-06-16T21:17:29.395297Z" + } + }, + "outputs": [], + "source": [ + "import itertools\n", + "import joblib\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.metrics import confusion_matrix, f1_score\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Base\n", + "random_state = 42\n", + "\n", + "# Data\n", + "dataset_csv = '../data/raw/iris.csv'\n", + "features_path = '../data/processed/featured_iris.csv'\n", + "\n", + "test_size = 0.2\n", + "\n", + "trainset_path = '../data/processed/train_iris.csv'\n", + "testset_path = '../data/processed/test_iris.csv'\n", + "\n", + "\n", + "# Train\n", + "clf_params = {\n", + " 'C': 0.001,\n", + " 'solver': 'lbfgs',\n", + " 'multi_class': 'multinomial',\n", + " 'max_iter': 100\n", + "}\n", + "model_path= '../models/model.joblib'\n", + "\n", + "# Reports\n", + "metrics_file = '../reports/metrics.json'\n", + "confusion_matrix_image = '../reports/confusion_matrix.png'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:31.485189Z", + "start_time": "2019-06-16T21:17:31.473720Z" + } + }, + "outputs": [], + "source": [ + "# Get data \n", + "\n", + "import pandas as pd\n", + "from sklearn.datasets import load_iris\n", + "\n", + "data = load_iris(as_frame=True)\n", + "dataset = data.frame\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print labels for target values \n", + "\n", + "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:32.328046Z", + "start_time": "2019-06-16T21:17:32.323611Z" + } + }, + "outputs": [], + "source": [ + "# feature names\n", + "\n", + "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n", + "\n", + "feature_names = dataset.columns.tolist()[:4]\n", + "feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save raw data\n", + "dataset.to_csv(dataset_csv, index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Features engineering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:02.150708Z", + "start_time": "2019-06-16T21:21:02.144518Z" + } + }, + "outputs": [], + "source": [ + "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n", + "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n", + "\n", + "dataset = dataset[[\n", + " 'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n", + "# 'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n", + " 'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n", + " 'target'\n", + "]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:02.987144Z", + "start_time": "2019-06-16T21:21:02.976092Z" + } + }, + "outputs": [], + "source": [ + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save features\n", + "dataset.to_csv(features_path, index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Split dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:07.438133Z", + "start_time": "2019-06-16T21:21:07.431649Z" + } + }, + "outputs": [], + "source": [ + "train_dataset, test_dataset = train_test_split(dataset, test_size=test_size, random_state=random_state)\n", + "train_dataset.shape, test_dataset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save train and test sets\n", + "train_dataset.to_csv(trainset_path)\n", + "test_dataset.to_csv(testset_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:10.932148Z", + "start_time": "2019-06-16T21:21:10.927844Z" + } + }, + "outputs": [], + "source": [ + "# Get X and Y\n", + "\n", + "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n", + "X_train = train_dataset.drop('target', axis=1).values.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:55.427365Z", + "start_time": "2019-06-16T21:21:55.416431Z" + } + }, + "outputs": [], + "source": [ + "# Create an instance of Logistic Regression Classifier CV and fit the data\n", + "\n", + "logreg = LogisticRegression(**clf_params, random_state=random_state)\n", + "logreg.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "joblib.dump(logreg, model_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:55.875303Z", + "start_time": "2019-06-16T21:21:55.864724Z" + } + }, + "outputs": [], + "source": [ + "def plot_confusion_matrix(cm,\n", + " target_names,\n", + " title='Confusion matrix',\n", + " cmap=None,\n", + " normalize=True):\n", + " \"\"\"\n", + " given a sklearn confusion matrix (cm), make a nice plot\n", + "\n", + " Arguments\n", + " ---------\n", + " cm: confusion matrix from sklearn.metrics.confusion_matrix\n", + "\n", + " target_names: given classification classes such as [0, 1, 2]\n", + " the class names, for example: ['high', 'medium', 'low']\n", + "\n", + " title: the text to display at the top of the matrix\n", + "\n", + " cmap: the gradient of the values displayed from matplotlib.pyplot.cm\n", + " see http://matplotlib.org/examples/color/colormaps_reference.html\n", + " plt.get_cmap('jet') or plt.cm.Blues\n", + "\n", + " normalize: If False, plot the raw numbers\n", + " If True, plot the proportions\n", + "\n", + " Usage\n", + " -----\n", + " plot_confusion_matrix(cm = cm, # confusion matrix created by\n", + " # sklearn.metrics.confusion_matrix\n", + " normalize = True, # show proportions\n", + " target_names = y_labels_vals, # list of names of the classes\n", + " title = best_estimator_name) # title of graph\n", + "\n", + " Citiation\n", + " ---------\n", + " http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n", + "\n", + " \"\"\"\n", + "\n", + " accuracy = np.trace(cm) / float(np.sum(cm))\n", + " misclass = 1 - accuracy\n", + "\n", + " if cmap is None:\n", + " cmap = plt.get_cmap('Blues')\n", + "\n", + " plt.figure(figsize=(8, 6))\n", + " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", + " plt.title(title)\n", + " plt.colorbar()\n", + "\n", + " if target_names is not None:\n", + " tick_marks = np.arange(len(target_names))\n", + " plt.xticks(tick_marks, target_names, rotation=45)\n", + " plt.yticks(tick_marks, target_names)\n", + "\n", + " if normalize:\n", + " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", + "\n", + " thresh = cm.max() / 1.5 if normalize else cm.max() / 2\n", + " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n", + " if normalize:\n", + " plt.text(j, i, \"{:0.4f}\".format(cm[i, j]),\n", + " horizontalalignment=\"center\",\n", + " color=\"white\" if cm[i, j] > thresh else \"black\")\n", + " else:\n", + " plt.text(j, i, \"{:,}\".format(cm[i, j]),\n", + " horizontalalignment=\"center\",\n", + " color=\"white\" if cm[i, j] > thresh else \"black\")\n", + "\n", + " plt.tight_layout()\n", + " plt.ylabel('True label')\n", + " plt.xlabel('Predicted label\\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))\n", + " \n", + " return plt.gcf()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.090756Z", + "start_time": "2019-06-16T21:21:56.086966Z" + } + }, + "outputs": [], + "source": [ + "# Get X and Y\n", + "\n", + "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n", + "X_test = test_dataset.drop('target', axis=1).values.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.270245Z", + "start_time": "2019-06-16T21:21:56.265054Z" + } + }, + "outputs": [], + "source": [ + "prediction = logreg.predict(X_test)\n", + "cm = confusion_matrix(prediction, y_test)\n", + "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.493617Z", + "start_time": "2019-06-16T21:21:56.489929Z" + } + }, + "outputs": [], + "source": [ + "# f1 score value\n", + "f1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save metrics\n", + "metrics = {\n", + " 'f1': f1\n", + "}\n", + "\n", + "with open(metrics_file, 'w') as mf:\n", + " json.dump(\n", + " obj=metrics,\n", + " fp=mf,\n", + " indent=4\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.966279Z", + "start_time": "2019-06-16T21:21:56.726149Z" + } + }, + "outputs": [], + "source": [ + "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save confusion matrix image\n", + "cm_plot.savefig(confusion_matrix_image)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/reports/.gitignore b/reports/.gitignore new file mode 100644 index 00000000..b722e9e1 --- /dev/null +++ b/reports/.gitignore @@ -0,0 +1 @@ +!.gitignore \ No newline at end of file From 1046b4b66185c5cd4b2b9a4aaf0746e4faf2b47b Mon Sep 17 00:00:00 2001 From: AlexKolosov Date: Thu, 26 Aug 2021 13:40:59 +0900 Subject: [PATCH 2/9] Add configuration file --- notebooks/step-2-create-config-file.ipynb | 530 ++++++++++++++++++++++ params.yaml | 22 + 2 files changed, 552 insertions(+) create mode 100644 notebooks/step-2-create-config-file.ipynb create mode 100644 params.yaml diff --git a/notebooks/step-2-create-config-file.ipynb b/notebooks/step-2-create-config-file.ipynb new file mode 100644 index 00000000..a95d4434 --- /dev/null +++ b/notebooks/step-2-create-config-file.ipynb @@ -0,0 +1,530 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:31.460557Z", + "start_time": "2019-06-16T21:17:29.395297Z" + } + }, + "outputs": [], + "source": [ + "import itertools\n", + "import joblib\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.metrics import confusion_matrix, f1_score\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "import yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Go to project root folder\n", + "%cd .." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read config\n", + "import pprint\n", + "\n", + "with open('params.yaml') as conf_file:\n", + " config = yaml.safe_load(conf_file)\n", + "\n", + "pprint.pprint(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:31.485189Z", + "start_time": "2019-06-16T21:17:31.473720Z" + } + }, + "outputs": [], + "source": [ + "# Get data \n", + "\n", + "import pandas as pd\n", + "from sklearn.datasets import load_iris\n", + "\n", + "data = load_iris(as_frame=True)\n", + "dataset = data.frame\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print labels for target values \n", + "\n", + "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:32.328046Z", + "start_time": "2019-06-16T21:17:32.323611Z" + } + }, + "outputs": [], + "source": [ + "# feature names\n", + "\n", + "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n", + "\n", + "feature_names = dataset.columns.tolist()[:4]\n", + "feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save raw data\n", + "dataset.to_csv(config['data']['dataset_csv'], index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Features engineering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:02.150708Z", + "start_time": "2019-06-16T21:21:02.144518Z" + } + }, + "outputs": [], + "source": [ + "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n", + "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n", + "\n", + "dataset = dataset[[\n", + " 'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n", + "# 'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n", + " 'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n", + " 'target'\n", + "]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:02.987144Z", + "start_time": "2019-06-16T21:21:02.976092Z" + } + }, + "outputs": [], + "source": [ + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save features\n", + "dataset.to_csv(config['data']['features_path'], index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Split dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:07.438133Z", + "start_time": "2019-06-16T21:21:07.431649Z" + } + }, + "outputs": [], + "source": [ + "train_dataset, test_dataset = train_test_split(\n", + " dataset, test_size=config['data']['test_size'],\n", + " random_state=config['base']['random_state']\n", + ")\n", + "train_dataset.shape, test_dataset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save train and test sets\n", + "train_dataset.to_csv(config['data']['trainset_path'])\n", + "test_dataset.to_csv(config['data']['testset_path'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:10.932148Z", + "start_time": "2019-06-16T21:21:10.927844Z" + } + }, + "outputs": [], + "source": [ + "# Get X and Y\n", + "\n", + "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n", + "X_train = train_dataset.drop('target', axis=1).values.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:55.427365Z", + "start_time": "2019-06-16T21:21:55.416431Z" + } + }, + "outputs": [], + "source": [ + "# Create an instance of Logistic Regression Classifier CV and fit the data\n", + "\n", + "logreg = LogisticRegression(\n", + " **config['train']['clf_params'],\n", + " random_state=config['base']['random_state']\n", + ")\n", + "logreg.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "joblib.dump(logreg, config['train']['model_path'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:55.875303Z", + "start_time": "2019-06-16T21:21:55.864724Z" + } + }, + "outputs": [], + "source": [ + "def plot_confusion_matrix(cm,\n", + " target_names,\n", + " title='Confusion matrix',\n", + " cmap=None,\n", + " normalize=True):\n", + " \"\"\"\n", + " given a sklearn confusion matrix (cm), make a nice plot\n", + "\n", + " Arguments\n", + " ---------\n", + " cm: confusion matrix from sklearn.metrics.confusion_matrix\n", + "\n", + " target_names: given classification classes such as [0, 1, 2]\n", + " the class names, for example: ['high', 'medium', 'low']\n", + "\n", + " title: the text to display at the top of the matrix\n", + "\n", + " cmap: the gradient of the values displayed from matplotlib.pyplot.cm\n", + " see http://matplotlib.org/examples/color/colormaps_reference.html\n", + " plt.get_cmap('jet') or plt.cm.Blues\n", + "\n", + " normalize: If False, plot the raw numbers\n", + " If True, plot the proportions\n", + "\n", + " Usage\n", + " -----\n", + " plot_confusion_matrix(cm = cm, # confusion matrix created by\n", + " # sklearn.metrics.confusion_matrix\n", + " normalize = True, # show proportions\n", + " target_names = y_labels_vals, # list of names of the classes\n", + " title = best_estimator_name) # title of graph\n", + "\n", + " Citiation\n", + " ---------\n", + " http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n", + "\n", + " \"\"\"\n", + "\n", + " accuracy = np.trace(cm) / float(np.sum(cm))\n", + " misclass = 1 - accuracy\n", + "\n", + " if cmap is None:\n", + " cmap = plt.get_cmap('Blues')\n", + "\n", + " plt.figure(figsize=(8, 6))\n", + " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", + " plt.title(title)\n", + " plt.colorbar()\n", + "\n", + " if target_names is not None:\n", + " tick_marks = np.arange(len(target_names))\n", + " plt.xticks(tick_marks, target_names, rotation=45)\n", + " plt.yticks(tick_marks, target_names)\n", + "\n", + " if normalize:\n", + " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", + "\n", + " thresh = cm.max() / 1.5 if normalize else cm.max() / 2\n", + " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n", + " if normalize:\n", + " plt.text(j, i, \"{:0.4f}\".format(cm[i, j]),\n", + " horizontalalignment=\"center\",\n", + " color=\"white\" if cm[i, j] > thresh else \"black\")\n", + " else:\n", + " plt.text(j, i, \"{:,}\".format(cm[i, j]),\n", + " horizontalalignment=\"center\",\n", + " color=\"white\" if cm[i, j] > thresh else \"black\")\n", + "\n", + " plt.tight_layout()\n", + " plt.ylabel('True label')\n", + " plt.xlabel('Predicted label\\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))\n", + " \n", + " return plt.gcf()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.090756Z", + "start_time": "2019-06-16T21:21:56.086966Z" + } + }, + "outputs": [], + "source": [ + "# Get X and Y\n", + "\n", + "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n", + "X_test = test_dataset.drop('target', axis=1).values.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.270245Z", + "start_time": "2019-06-16T21:21:56.265054Z" + } + }, + "outputs": [], + "source": [ + "prediction = logreg.predict(X_test)\n", + "cm = confusion_matrix(prediction, y_test)\n", + "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.493617Z", + "start_time": "2019-06-16T21:21:56.489929Z" + } + }, + "outputs": [], + "source": [ + "# f1 score value\n", + "f1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save metrics\n", + "metrics = {\n", + " 'f1': f1\n", + "}\n", + "\n", + "with open(config['reports']['metrics_file'], 'w') as mf:\n", + " json.dump(\n", + " obj=metrics,\n", + " fp=mf,\n", + " indent=4\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.966279Z", + "start_time": "2019-06-16T21:21:56.726149Z" + } + }, + "outputs": [], + "source": [ + "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save confusion matrix image\n", + "cm_plot.savefig(config['reports']['confusion_matrix_image'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/params.yaml b/params.yaml new file mode 100644 index 00000000..a48a0538 --- /dev/null +++ b/params.yaml @@ -0,0 +1,22 @@ +base: + random_state: 42 + +data: + dataset_csv: 'data/raw/iris.csv' + features_path: 'data/processed/featured_iris.csv' + test_size: 0.2 + trainset_path: 'data/processed/train_iris.csv' + testset_path: 'data/processed/test_iris.csv' + + +train: + clf_params: + 'C': 0.001 + 'solver': 'lbfgs' + 'multi_class': 'multinomial' + 'max_iter': 100 + model_path: 'models/model.joblib' + +reports: + metrics_file: 'reports/metrics.json' + confusion_matrix_image: 'reports/confusion_matrix.png' \ No newline at end of file From 920f7b46a30537132c980fe88af167fd9f24b0ca Mon Sep 17 00:00:00 2001 From: AlexKolosov Date: Thu, 26 Aug 2021 16:11:07 +0900 Subject: [PATCH 3/9] Distribute configs through notebook --- notebooks/step-1-organize-ml-project.ipynb | 58 +++++++--------------- 1 file changed, 18 insertions(+), 40 deletions(-) diff --git a/notebooks/step-1-organize-ml-project.ipynb b/notebooks/step-1-organize-ml-project.ipynb index 16d63795..3a115fea 100644 --- a/notebooks/step-1-organize-ml-project.ipynb +++ b/notebooks/step-1-organize-ml-project.ipynb @@ -22,46 +22,6 @@ "from sklearn.model_selection import train_test_split" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Base\n", - "random_state = 42\n", - "\n", - "# Data\n", - "dataset_csv = '../data/raw/iris.csv'\n", - "features_path = '../data/processed/featured_iris.csv'\n", - "\n", - "test_size = 0.2\n", - "\n", - "trainset_path = '../data/processed/train_iris.csv'\n", - "testset_path = '../data/processed/test_iris.csv'\n", - "\n", - "\n", - "# Train\n", - "clf_params = {\n", - " 'C': 0.001,\n", - " 'solver': 'lbfgs',\n", - " 'multi_class': 'multinomial',\n", - " 'max_iter': 100\n", - "}\n", - "model_path= '../models/model.joblib'\n", - "\n", - "# Reports\n", - "metrics_file = '../reports/metrics.json'\n", - "confusion_matrix_image = '../reports/confusion_matrix.png'" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -127,6 +87,7 @@ "outputs": [], "source": [ "# Save raw data\n", + "dataset_csv = '../data/raw/iris.csv'\n", "dataset.to_csv(dataset_csv, index=False)" ] }, @@ -180,6 +141,7 @@ "outputs": [], "source": [ "# Save features\n", + "features_path = '../data/processed/featured_iris.csv'\n", "dataset.to_csv(features_path, index=False)" ] }, @@ -201,6 +163,9 @@ }, "outputs": [], "source": [ + "random_state = 42\n", + "test_size = 0.2\n", + "\n", "train_dataset, test_dataset = train_test_split(dataset, test_size=test_size, random_state=random_state)\n", "train_dataset.shape, test_dataset.shape" ] @@ -212,6 +177,9 @@ "outputs": [], "source": [ "# Save train and test sets\n", + "trainset_path = '../data/processed/train_iris.csv'\n", + "testset_path = '../data/processed/test_iris.csv'\n", + "\n", "train_dataset.to_csv(trainset_path)\n", "test_dataset.to_csv(testset_path)" ] @@ -252,6 +220,12 @@ "outputs": [], "source": [ "# Create an instance of Logistic Regression Classifier CV and fit the data\n", + "clf_params = {\n", + " 'C': 0.001,\n", + " 'solver': 'lbfgs',\n", + " 'multi_class': 'multinomial',\n", + " 'max_iter': 100\n", + "}\n", "\n", "logreg = LogisticRegression(**clf_params, random_state=random_state)\n", "logreg.fit(X_train, y_train)" @@ -263,6 +237,7 @@ "metadata": {}, "outputs": [], "source": [ + "model_path= '../models/model.joblib'\n", "joblib.dump(logreg, model_path)" ] }, @@ -414,6 +389,8 @@ "outputs": [], "source": [ "# Save metrics\n", + "metrics_file = '../reports/metrics.json'\n", + "\n", "metrics = {\n", " 'f1': f1\n", "}\n", @@ -447,6 +424,7 @@ "outputs": [], "source": [ "# Save confusion matrix image\n", + "confusion_matrix_image = '../reports/confusion_matrix.png'\n", "cm_plot.savefig(confusion_matrix_image)" ] }, From e85c826b6fb95eb54d1860af4a688b0d7bfd5ee0 Mon Sep 17 00:00:00 2001 From: AlexKolosov Date: Thu, 26 Aug 2021 16:51:48 +0900 Subject: [PATCH 4/9] Update docs: Fix branch name to checkout --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5d3aa41d..6083ff24 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ ```bash git clone https://gitlab.com/iterative.ai/cse/tutorials/course-ds-base.git cd course-ds-base -git checkout step-1 +git checkout step-2 ``` From ebe019100b0112c8101ca75ecf7aa74b7c1005d2 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 3 Nov 2021 14:43:29 +0300 Subject: [PATCH 5/9] Update README --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 6083ff24..2d58776a 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,8 @@ ### 1. Fork / Clone this repository ```bash -git clone https://gitlab.com/iterative.ai/cse/tutorials/course-ds-base.git +git clone https://github.com/iterative/course-ds-base.git cd course-ds-base -git checkout step-2 ``` From 4a25bd3f5ecf2e433d007f1a2689682fde071c38 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 3 Nov 2021 14:57:12 +0300 Subject: [PATCH 6/9] Update step-3 solution --- README.md | 1 + notebooks/step-2-create-config-file.ipynb | 8 +- notebooks/step-3-reusable-code.ipynb | 435 ++++++++++++++++++++++ src/report/visualize.py | 80 ++++ 4 files changed, 521 insertions(+), 3 deletions(-) create mode 100644 notebooks/step-3-reusable-code.ipynb create mode 100644 src/report/visualize.py diff --git a/README.md b/README.md index 2d58776a..6fd7557f 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ cd course-ds-base Create virtual environment named `dvc-venv` (you may use other name) ```bash python3 -m venv dvc-venv +echo "export PYTHONPATH=$PWD" >> dvc-venv/bin/activate source dvc-venv/bin/activate ``` Install python libraries diff --git a/notebooks/step-2-create-config-file.ipynb b/notebooks/step-2-create-config-file.ipynb index a95d4434..1169b196 100644 --- a/notebooks/step-2-create-config-file.ipynb +++ b/notebooks/step-2-create-config-file.ipynb @@ -465,9 +465,11 @@ } ], "metadata": { + "interpreter": { + "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9" + }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", + "display_name": "Python 3.9.4 64-bit ('dvc-venv': venv)", "name": "python3" }, "language_info": { @@ -480,7 +482,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.9.4" }, "toc": { "base_numbering": 1, diff --git a/notebooks/step-3-reusable-code.ipynb b/notebooks/step-3-reusable-code.ipynb new file mode 100644 index 00000000..0f026c8c --- /dev/null +++ b/notebooks/step-3-reusable-code.ipynb @@ -0,0 +1,435 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:31.460557Z", + "start_time": "2019-06-16T21:17:29.395297Z" + } + }, + "outputs": [], + "source": [ + "import itertools\n", + "import joblib\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.metrics import confusion_matrix, f1_score\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "import yaml\n", + "\n", + "# import plot_confusion_matrix()\n", + "from src.report.visualize import plot_confusion_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Go to project root folder\n", + "%cd .." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read config\n", + "import pprint\n", + "\n", + "with open('params.yaml') as conf_file:\n", + " config = yaml.safe_load(conf_file)\n", + "\n", + "pprint.pprint(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:31.485189Z", + "start_time": "2019-06-16T21:17:31.473720Z" + } + }, + "outputs": [], + "source": [ + "# Get data \n", + "\n", + "import pandas as pd\n", + "from sklearn.datasets import load_iris\n", + "\n", + "data = load_iris(as_frame=True)\n", + "dataset = data.frame\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print labels for target values \n", + "\n", + "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:32.328046Z", + "start_time": "2019-06-16T21:17:32.323611Z" + } + }, + "outputs": [], + "source": [ + "# feature names\n", + "\n", + "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n", + "\n", + "feature_names = dataset.columns.tolist()[:4]\n", + "feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save raw data\n", + "dataset.to_csv(config['data']['dataset_csv'], index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Features engineering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:02.150708Z", + "start_time": "2019-06-16T21:21:02.144518Z" + } + }, + "outputs": [], + "source": [ + "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n", + "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n", + "\n", + "dataset = dataset[[\n", + " 'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n", + "# 'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n", + " 'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n", + " 'target'\n", + "]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:02.987144Z", + "start_time": "2019-06-16T21:21:02.976092Z" + } + }, + "outputs": [], + "source": [ + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save features\n", + "dataset.to_csv(config['data']['features_path'], index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Split dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:07.438133Z", + "start_time": "2019-06-16T21:21:07.431649Z" + } + }, + "outputs": [], + "source": [ + "train_dataset, test_dataset = train_test_split(\n", + " dataset, test_size=config['data']['test_size'],\n", + " random_state=config['base']['random_state']\n", + ")\n", + "train_dataset.shape, test_dataset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save train and test sets\n", + "train_dataset.to_csv(config['data']['trainset_path'])\n", + "test_dataset.to_csv(config['data']['testset_path'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:10.932148Z", + "start_time": "2019-06-16T21:21:10.927844Z" + } + }, + "outputs": [], + "source": [ + "# Get X and Y\n", + "\n", + "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n", + "X_train = train_dataset.drop('target', axis=1).values.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:55.427365Z", + "start_time": "2019-06-16T21:21:55.416431Z" + } + }, + "outputs": [], + "source": [ + "# Create an instance of Logistic Regression Classifier CV and fit the data\n", + "\n", + "logreg = LogisticRegression(\n", + " **config['train']['clf_params'],\n", + " random_state=config['base']['random_state']\n", + ")\n", + "logreg.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "joblib.dump(logreg, config['train']['model_path'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.090756Z", + "start_time": "2019-06-16T21:21:56.086966Z" + } + }, + "outputs": [], + "source": [ + "# Get X and Y\n", + "\n", + "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n", + "X_test = test_dataset.drop('target', axis=1).values.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.270245Z", + "start_time": "2019-06-16T21:21:56.265054Z" + } + }, + "outputs": [], + "source": [ + "prediction = logreg.predict(X_test)\n", + "cm = confusion_matrix(prediction, y_test)\n", + "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.493617Z", + "start_time": "2019-06-16T21:21:56.489929Z" + } + }, + "outputs": [], + "source": [ + "# f1 score value\n", + "f1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save metrics\n", + "metrics = {\n", + " 'f1': f1\n", + "}\n", + "\n", + "with open(config['reports']['metrics_file'], 'w') as mf:\n", + " json.dump(\n", + " obj=metrics,\n", + " fp=mf,\n", + " indent=4\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.966279Z", + "start_time": "2019-06-16T21:21:56.726149Z" + } + }, + "outputs": [], + "source": [ + "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save confusion matrix image\n", + "cm_plot.savefig(config['reports']['confusion_matrix_image'])" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9" + }, + "kernelspec": { + "display_name": "Python 3.9.4 64-bit ('dvc-venv': venv)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/report/visualize.py b/src/report/visualize.py new file mode 100644 index 00000000..7656d4b3 --- /dev/null +++ b/src/report/visualize.py @@ -0,0 +1,80 @@ +import itertools +import matplotlib.colors +import matplotlib.pyplot as plt +import numpy as np +from typing import List, Text + + +def plot_confusion_matrix(cm: np.array, + target_names: List[Text], + title: Text = 'Confusion matrix', + cmap: matplotlib.colors.LinearSegmentedColormap = None, + normalize: bool = True): + """ + given a sklearn confusion matrix (cm), make a nice plot + + Arguments + --------- + cm: confusion matrix from sklearn.metrics.confusion_matrix + + target_names: given classification classes such as [0, 1, 2] + the class names, for example: ['high', 'medium', 'low'] + + title: the text to display at the top of the matrix + + cmap: the gradient of the values displayed from matplotlib.pyplot.cm + see http://matplotlib.org/examples/color/colormaps_reference.html + plt.get_cmap('jet') or plt.cm.Blues + + normalize: If False, plot the raw numbers + If True, plot the proportions + + Usage + ----- + plot_confusion_matrix(cm = cm, # confusion matrix created by + # sklearn.metrics.confusion_matrix + normalize = True, # show proportions + target_names = y_labels_vals, # list of names of the classes + title = best_estimator_name) # title of graph + + Citiation + --------- + http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html + + """ + + accuracy = np.trace(cm) / float(np.sum(cm)) + misclass = 1 - accuracy + + if cmap is None: + cmap = plt.get_cmap('Blues') + + plt.figure(figsize=(8, 6)) + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + + if target_names is not None: + tick_marks = np.arange(len(target_names)) + plt.xticks(tick_marks, target_names, rotation=45) + plt.yticks(tick_marks, target_names) + + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + + thresh = cm.max() / 1.5 if normalize else cm.max() / 2 + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + if normalize: + plt.text(j, i, "{:0.4f}".format(cm[i, j]), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + else: + plt.text(j, i, "{:,}".format(cm[i, j]), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + + plt.tight_layout() + plt.ylabel('True label') + plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass)) + + return plt.gcf() From b12c5dbcfd7c4e0b3dd2b755854bdbd634334e93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Redzy=C5=84ski?= Date: Wed, 17 Nov 2021 17:10:04 +0100 Subject: [PATCH 7/9] dvc: update version to 2.8.3 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d5b4910e..d04337a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -dvc==2.6.4 +dvc==2.8.3 joblib==1.0.1 jupyter==1.0.0 jupyter_contrib_nbextensions==0.5.1 From 511f1e31bb7afb8e8a8e17d8810b23dfe2c944b7 Mon Sep 17 00:00:00 2001 From: Jenifer De Figueiredo Date: Wed, 29 Mar 2023 12:59:04 -0700 Subject: [PATCH 8/9] catching up --- notebooks/step-3-reusable-code.ipynb | 3 +- notebooks/step-4-build-ml-pipeline.ipynb | 459 +++++++++++++++++++++++ src/stages/data_load.py | 0 3 files changed, 461 insertions(+), 1 deletion(-) create mode 100644 notebooks/step-4-build-ml-pipeline.ipynb create mode 100644 src/stages/data_load.py diff --git a/notebooks/step-3-reusable-code.ipynb b/notebooks/step-3-reusable-code.ipynb index 0f026c8c..c083cdb5 100644 --- a/notebooks/step-3-reusable-code.ipynb +++ b/notebooks/step-3-reusable-code.ipynb @@ -372,7 +372,8 @@ "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9" }, "kernelspec": { - "display_name": "Python 3.9.4 64-bit ('dvc-venv': venv)", + "display_name": "Python 3 (ipykernel)", + "language": "python", "name": "python3" }, "language_info": { diff --git a/notebooks/step-4-build-ml-pipeline.ipynb b/notebooks/step-4-build-ml-pipeline.ipynb new file mode 100644 index 00000000..51955f51 --- /dev/null +++ b/notebooks/step-4-build-ml-pipeline.ipynb @@ -0,0 +1,459 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:31.460557Z", + "start_time": "2019-06-16T21:17:29.395297Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + }, + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'src'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 16\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01myaml\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# import plot_confusion_matrix()\u001b[39;00m\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msrc\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreport\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mvisualize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m plot_confusion_matrix\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'src'" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import itertools\n", + "import joblib\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.metrics import confusion_matrix, f1_score\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "import yaml\n", + "\n", + "# import plot_confusion_matrix()\n", + "from src.report.visualize import plot_confusion_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Go to project root folder\n", + "%cd .." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read config\n", + "import pprint\n", + "\n", + "with open('params.yaml') as conf_file:\n", + " config = yaml.safe_load(conf_file)\n", + "\n", + "pprint.pprint(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:31.485189Z", + "start_time": "2019-06-16T21:17:31.473720Z" + } + }, + "outputs": [], + "source": [ + "# Get data \n", + "\n", + "import pandas as pd\n", + "from sklearn.datasets import load_iris\n", + "\n", + "data = load_iris(as_frame=True)\n", + "dataset = data.frame\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print labels for target values \n", + "\n", + "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:32.328046Z", + "start_time": "2019-06-16T21:17:32.323611Z" + } + }, + "outputs": [], + "source": [ + "# feature names\n", + "\n", + "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n", + "\n", + "feature_names = dataset.columns.tolist()[:4]\n", + "feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save raw data\n", + "dataset.to_csv(config['data']['dataset_csv'], index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Features engineering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:02.150708Z", + "start_time": "2019-06-16T21:21:02.144518Z" + } + }, + "outputs": [], + "source": [ + "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n", + "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n", + "\n", + "dataset = dataset[[\n", + " 'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n", + "# 'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n", + " 'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n", + " 'target'\n", + "]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:02.987144Z", + "start_time": "2019-06-16T21:21:02.976092Z" + } + }, + "outputs": [], + "source": [ + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save features\n", + "dataset.to_csv(config['data']['features_path'], index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Split dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:07.438133Z", + "start_time": "2019-06-16T21:21:07.431649Z" + } + }, + "outputs": [], + "source": [ + "train_dataset, test_dataset = train_test_split(\n", + " dataset, test_size=config['data']['test_size'],\n", + " random_state=config['base']['random_state']\n", + ")\n", + "train_dataset.shape, test_dataset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save train and test sets\n", + "train_dataset.to_csv(config['data']['trainset_path'])\n", + "test_dataset.to_csv(config['data']['testset_path'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:10.932148Z", + "start_time": "2019-06-16T21:21:10.927844Z" + } + }, + "outputs": [], + "source": [ + "# Get X and Y\n", + "\n", + "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n", + "X_train = train_dataset.drop('target', axis=1).values.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:55.427365Z", + "start_time": "2019-06-16T21:21:55.416431Z" + } + }, + "outputs": [], + "source": [ + "# Create an instance of Logistic Regression Classifier CV and fit the data\n", + "\n", + "logreg = LogisticRegression(\n", + " **config['train']['clf_params'],\n", + " random_state=config['base']['random_state']\n", + ")\n", + "logreg.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "joblib.dump(logreg, config['train']['model_path'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.090756Z", + "start_time": "2019-06-16T21:21:56.086966Z" + } + }, + "outputs": [], + "source": [ + "# Get X and Y\n", + "\n", + "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n", + "X_test = test_dataset.drop('target', axis=1).values.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.270245Z", + "start_time": "2019-06-16T21:21:56.265054Z" + } + }, + "outputs": [], + "source": [ + "prediction = logreg.predict(X_test)\n", + "cm = confusion_matrix(prediction, y_test)\n", + "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.493617Z", + "start_time": "2019-06-16T21:21:56.489929Z" + } + }, + "outputs": [], + "source": [ + "# f1 score value\n", + "f1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save metrics\n", + "metrics = {\n", + " 'f1': f1\n", + "}\n", + "\n", + "with open(config['reports']['metrics_file'], 'w') as mf:\n", + " json.dump(\n", + " obj=metrics,\n", + " fp=mf,\n", + " indent=4\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.966279Z", + "start_time": "2019-06-16T21:21:56.726149Z" + } + }, + "outputs": [], + "source": [ + "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save confusion matrix image\n", + "cm_plot.savefig(config['reports']['confusion_matrix_image'])" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "75b1920f9cbcc979e2f7542df3a177962ac1d8ba339eed6df458570447bf37f9" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/stages/data_load.py b/src/stages/data_load.py new file mode 100644 index 00000000..e69de29b From 95c7c572c414552da9ddd6eae5a586bf54abcb91 Mon Sep 17 00:00:00 2001 From: Jenifer De Figueiredo Date: Wed, 5 Apr 2023 09:38:46 -0700 Subject: [PATCH 9/9] changes --- notebooks/step-4-build-ml-pipeline.ipynb | 129 ++++++++++------------- src/stages/data_load.py | 27 +++++ 2 files changed, 83 insertions(+), 73 deletions(-) diff --git a/notebooks/step-4-build-ml-pipeline.ipynb b/notebooks/step-4-build-ml-pipeline.ipynb index 51955f51..5f6246f5 100644 --- a/notebooks/step-4-build-ml-pipeline.ipynb +++ b/notebooks/step-4-build-ml-pipeline.ipynb @@ -2,34 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-06-16T21:17:31.460557Z", "start_time": "2019-06-16T21:17:29.395297Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - }, - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'src'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 16\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01myaml\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# import plot_confusion_matrix()\u001b[39;00m\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msrc\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreport\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mvisualize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m plot_confusion_matrix\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'src'" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", @@ -51,9 +31,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/jenif/course-ds-base\n" + ] + } + ], "source": [ "# Go to project root folder\n", "%cd .." @@ -68,9 +56,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'base': {'random_state': 42},\n", + " 'data': {'dataset_csv': 'data/raw/iris.csv',\n", + " 'features_path': 'data/processed/featured_iris.csv',\n", + " 'test_size': 0.2,\n", + " 'testset_path': 'data/processed/test_iris.csv',\n", + " 'trainset_path': 'data/processed/train_iris.csv'},\n", + " 'reports': {'confusion_matrix_image': 'reports/confusion_matrix.png',\n", + " 'metrics_file': 'reports/metrics.json'},\n", + " 'train': {'clf_params': {'C': 0.001,\n", + " 'max_iter': 100,\n", + " 'multi_class': 'multinomial',\n", + " 'solver': 'lbfgs'},\n", + " 'model_path': 'models/model.joblib'}}\n" + ] + } + ], "source": [ "# Read config\n", "import pprint\n", @@ -90,63 +98,38 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2019-06-16T21:17:31.485189Z", - "start_time": "2019-06-16T21:17:31.473720Z" - } - }, - "outputs": [], - "source": [ - "# Get data \n", - "\n", - "import pandas as pd\n", - "from sklearn.datasets import load_iris\n", - "\n", - "data = load_iris(as_frame=True)\n", - "dataset = data.frame\n", - "dataset.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], - "source": [ - "# print labels for target values \n", - "\n", - "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2019-06-16T21:17:32.328046Z", - "start_time": "2019-06-16T21:17:32.323611Z" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data Load complete\n" + ] } - }, - "outputs": [], + ], "source": [ - "# feature names\n", - "\n", - "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n", + "from src.stages.data_load import data_load\n", "\n", - "feature_names = dataset.columns.tolist()[:4]\n", - "feature_names" + "data_load(config_path = \"params.yaml\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data Load complete\r\n" + ] + } + ], "source": [ - "# Save raw data\n", - "dataset.to_csv(config['data']['dataset_csv'], index=False)" + "!python src/stages/data_load.py --config=params.yaml" ] }, { diff --git a/src/stages/data_load.py b/src/stages/data_load.py index e69de29b..36e8efb1 100644 --- a/src/stages/data_load.py +++ b/src/stages/data_load.py @@ -0,0 +1,27 @@ +import argparse +import pandas as pd +from sklearn.datasets import load_iris +from typing import Text +import yaml + + +def data_load(config_path: Text) -> None: + + with open(config_path) as conf_file: + config = yaml.safe_load(conf_file) + + data = load_iris(as_frame=True) + dataset = data.frame + + dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()] + dataset.to_csv(config['data']['dataset_csv'], index=False) + + print("Data Load complete") + +if __name__ == '__main__': + + args_parser = argparse.ArgumentParser() + args_parser.add_argument('--config', dest='config', required=True) + args = args_parser.parse_args() + + data_load(config_path=args.config) \ No newline at end of file