diff --git a/.gitignore b/.gitignore index cd56dce1..186220be 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,11 @@ ## OS configs .DS_Store +# Project +data/* +models/* +reports/* + # Python __pycache__ .ipynb_checkpoints diff --git a/README.md b/README.md index 83039e3a..b7bd11e6 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ### 1. Fork / Clone this repository ```bash -git clone https://gitlab.com/iterative.ai/cse/tutorials/course-ds-base.git +git clone https://github.com/iterative/course-ds-base.git cd course-ds-base ``` @@ -15,6 +15,7 @@ cd course-ds-base Create virtual environment named `dvc-venv` (you may use other name) ```bash python3 -m venv dvc-venv +echo "export PYTHONPATH=$PWD" >> dvc-venv/bin/activate source dvc-venv/bin/activate ``` Install python libraries @@ -30,9 +31,13 @@ Add Virtual Environment to Jupyter Notebook python -m ipykernel install --user --name=dvc-venv ``` -Configure ToC for jupyter notebook (optional) +Install nbextensions and configure ToC for jupyter notebook (optional) ```bash + + +pip install jupyter_contrib_nbextensions + jupyter contrib nbextension install --user jupyter nbextension enable toc2/main ``` diff --git a/file.txt b/file.txt new file mode 100644 index 00000000..e69de29b diff --git a/lineapy-trial-prototype.ipynb b/lineapy-trial-prototype.ipynb new file mode 100644 index 00000000..a1b86c0b --- /dev/null +++ b/lineapy-trial-prototype.ipynb @@ -0,0 +1,1228 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: lineapy in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (0.2.3)\n", + "Requirement already satisfied: jinja2 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (2.11.2)\n", + "Requirement already satisfied: pandas in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (1.3.2)\n", + "Requirement already 
satisfied: pydantic in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (1.8.2)\n", + "Requirement already satisfied: networkx in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (2.5)\n", + "Requirement already satisfied: SQLAlchemy<2.0.0,>=1.4 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (1.4.47)\n", + "Requirement already satisfied: requests in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (2.25.1)\n", + "Requirement already satisfied: alembic==1.8.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (1.8.0)\n", + "Requirement already satisfied: IPython>=7.0.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (7.19.0)\n", + "Requirement already satisfied: isort in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (5.9.3)\n", + "Requirement already satisfied: rich in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (12.4.4)\n", + "Requirement already satisfied: click>=8.0.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (8.1.3)\n", + "Requirement already satisfied: pyyaml in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (5.3.1)\n", + "Requirement already satisfied: fsspec in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (2022.7.1)\n", + "Requirement already satisfied: nbconvert<7.0.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (6.0.7)\n", + "Requirement already satisfied: nbformat in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (5.0.8)\n", + "Requirement already satisfied: cloudpickle in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (1.6.0)\n", + "Requirement already satisfied: asttokens in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (2.2.1)\n", + "Requirement already satisfied: black in 
/Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (21.7b0)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from lineapy) (4.3.0)\n", + "Requirement already satisfied: importlib-metadata in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from alembic==1.8.0->lineapy) (2.0.0)\n", + "Requirement already satisfied: Mako in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from alembic==1.8.0->lineapy) (1.2.4)\n", + "Requirement already satisfied: importlib-resources in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from alembic==1.8.0->lineapy) (5.7.1)\n", + "Requirement already satisfied: appnope in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from IPython>=7.0.0->lineapy) (0.1.0)\n", + "Requirement already satisfied: jedi>=0.10 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from IPython>=7.0.0->lineapy) (0.17.1)\n", + "Requirement already satisfied: pygments in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from IPython>=7.0.0->lineapy) (2.7.2)\n", + "Requirement already satisfied: pickleshare in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from IPython>=7.0.0->lineapy) (0.7.5)\n", + "Requirement already satisfied: traitlets>=4.2 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from IPython>=7.0.0->lineapy) (5.0.5)\n", + "Requirement already satisfied: pexpect>4.3 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from IPython>=7.0.0->lineapy) (4.8.0)\n", + "Requirement already satisfied: decorator in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from IPython>=7.0.0->lineapy) (4.4.2)\n", + "Requirement already satisfied: backcall in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from IPython>=7.0.0->lineapy) (0.2.0)\n", + "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from 
IPython>=7.0.0->lineapy) (3.0.8)\n", + "Requirement already satisfied: setuptools>=18.5 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from IPython>=7.0.0->lineapy) (50.3.1.post20201107)\n", + "Requirement already satisfied: bleach in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbconvert<7.0.0->lineapy) (3.2.1)\n", + "Requirement already satisfied: testpath in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbconvert<7.0.0->lineapy) (0.4.4)\n", + "Requirement already satisfied: jupyter-core in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbconvert<7.0.0->lineapy) (4.6.3)\n", + "Requirement already satisfied: pandocfilters>=1.4.1 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbconvert<7.0.0->lineapy) (1.4.3)\n", + "Requirement already satisfied: defusedxml in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbconvert<7.0.0->lineapy) (0.6.0)\n", + "Requirement already satisfied: mistune<2,>=0.8.1 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbconvert<7.0.0->lineapy) (0.8.4)\n", + "Requirement already satisfied: jupyterlab-pygments in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbconvert<7.0.0->lineapy) (0.1.2)\n", + "Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbconvert<7.0.0->lineapy) (0.5.1)\n", + "Requirement already satisfied: entrypoints>=0.2.2 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbconvert<7.0.0->lineapy) (0.3)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from jinja2->lineapy) (1.1.1)\n", + "Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbformat->lineapy) (3.2.0)\n", + "Requirement already satisfied: ipython-genutils in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbformat->lineapy) 
(0.2.0)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from SQLAlchemy<2.0.0,>=1.4->lineapy) (2.0.2)\n", + "Requirement already satisfied: six in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from asttokens->lineapy) (1.15.0)\n", + "Requirement already satisfied: tomli<2.0.0,>=0.2.6 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from black->lineapy) (1.2.3)\n", + "Requirement already satisfied: regex>=2020.1.8 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from black->lineapy) (2020.10.15)\n", + "Requirement already satisfied: mypy-extensions>=0.4.3 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from black->lineapy) (0.4.3)\n", + "Requirement already satisfied: appdirs in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from black->lineapy) (1.4.4)\n", + "Requirement already satisfied: pathspec<1,>=0.8.1 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from black->lineapy) (0.9.0)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from pandas->lineapy) (2.8.1)\n", + "Requirement already satisfied: pytz>=2017.3 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from pandas->lineapy) (2022.1)\n", + "Requirement already satisfied: numpy>=1.17.3 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from pandas->lineapy) (1.18.5)\n", + "Requirement already satisfied: idna<3,>=2.5 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from requests->lineapy) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from requests->lineapy) (2020.6.20)\n", + "Requirement already satisfied: chardet<5,>=3.0.2 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from requests->lineapy) (3.0.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in 
/Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from requests->lineapy) (1.25.11)\n", + "Requirement already satisfied: commonmark<0.10.0,>=0.9.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from rich->lineapy) (0.9.1)\n", + "Requirement already satisfied: parso<0.8.0,>=0.7.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from jedi>=0.10->IPython>=7.0.0->lineapy) (0.7.0)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: attrs>=17.4.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat->lineapy) (20.3.0)\n", + "Requirement already satisfied: pyrsistent>=0.14.0 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat->lineapy) (0.17.3)\n", + "Requirement already satisfied: jupyter-client>=6.1.5 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert<7.0.0->lineapy) (6.1.7)\n", + "Requirement already satisfied: async-generator in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert<7.0.0->lineapy) (1.10)\n", + "Requirement already satisfied: nest-asyncio in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert<7.0.0->lineapy) (1.5.1)\n", + "Requirement already satisfied: ptyprocess>=0.5 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from pexpect>4.3->IPython>=7.0.0->lineapy) (0.6.0)\n", + "Requirement already satisfied: wcwidth in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->IPython>=7.0.0->lineapy) (0.2.5)\n", + "Requirement already satisfied: webencodings in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from bleach->nbconvert<7.0.0->lineapy) (0.5.1)\n", + "Requirement already satisfied: packaging in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from 
bleach->nbconvert<7.0.0->lineapy) (20.4)\n", + "Requirement already satisfied: zipp>=0.5 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from importlib-metadata->alembic==1.8.0->lineapy) (3.4.0)\n", + "Requirement already satisfied: tornado>=4.1 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from jupyter-client>=6.1.5->nbclient<0.6.0,>=0.5.0->nbconvert<7.0.0->lineapy) (6.1)\n", + "Requirement already satisfied: pyzmq>=13 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from jupyter-client>=6.1.5->nbclient<0.6.0,>=0.5.0->nbconvert<7.0.0->lineapy) (19.0.2)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from packaging->bleach->nbconvert<7.0.0->lineapy) (2.4.7)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "! 
pip install lineapy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandas==1.3.2 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (1.3.2)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from pandas==1.3.2) (2.8.1)\n", + "Requirement already satisfied: numpy>=1.17.3 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from pandas==1.3.2) (1.18.5)\n", + "Requirement already satisfied: pytz>=2017.3 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from pandas==1.3.2) (2022.1)\n", + "Requirement already satisfied: six>=1.5 in /Users/jenif/opt/anaconda3/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas==1.3.2) (1.15.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "! 
python -m pip install pandas==1.3.2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext lineapy" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "lineapy_config(home_dir=PosixPath('/Users/jenif/.lineapy'), database_url='sqlite:////Users/jenif/.lineapy/db.sqlite', artifact_storage_dir=PosixPath('/Users/jenif/.lineapy/linea_pickles'), customized_annotation_folder=PosixPath('/Users/jenif/.lineapy/custom-annotations'), do_not_track=False, logging_level='INFO', logging_file=PosixPath('/Users/jenif/.lineapy/lineapy.log'), storage_options=None, mlflow_registry_uri=None, mlflow_tracking_uri=None, default_ml_models_storage_backend=None)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lineapy.options" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:31.460557Z", + "start_time": "2019-06-16T21:17:29.395297Z" + } + }, + "outputs": [], + "source": [ + "import lineapy\n", + "import joblib\n", + "import json\n", + "import itertools\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.metrics import confusion_matrix, f1_score\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:31.485189Z", + "start_time": "2019-06-16T21:17:31.473720Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)target
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
\n", + "
" + ], + "text/plain": [ + " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", + "0 5.1 3.5 1.4 0.2 \n", + "1 4.9 3.0 1.4 0.2 \n", + "2 4.7 3.2 1.3 0.2 \n", + "3 4.6 3.1 1.5 0.2 \n", + "4 5.0 3.6 1.4 0.2 \n", + "\n", + " target \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get data \n", + "\n", + "import pandas as pd\n", + "from sklearn.datasets import load_iris\n", + "\n", + "data = load_iris(as_frame=True)\n", + "dataset = data.frame\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0: setosa\n", + "1: versicolor\n", + "2: virginica\n" + ] + }, + { + "data": { + "text/plain": [ + "[None, None, None]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# print labels for target values \n", + "\n", + "[print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:17:32.328046Z", + "start_time": "2019-06-16T21:17:32.323611Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['sepal_length', 'sepal_width', 'petal_length', 'petal_width']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# feature names\n", + "\n", + "dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]\n", + "\n", + "feature_names = dataset.columns.tolist()[:4]\n", + "feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "#save raw data as artifact\n", + "dataset_csv = './data/raw/iris.csv'\n", + 
"dataset.to_csv(dataset_csv, index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_lengthsepal_widthpetal_lengthpetal_widthtarget
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
..................
1456.73.05.22.32
1466.32.55.01.92
1476.53.05.22.02
1486.23.45.42.32
1495.93.05.11.82
\n", + "

150 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " sepal_length sepal_width petal_length petal_width target\n", + "0 5.1 3.5 1.4 0.2 0\n", + "1 4.9 3.0 1.4 0.2 0\n", + "2 4.7 3.2 1.3 0.2 0\n", + "3 4.6 3.1 1.5 0.2 0\n", + "4 5.0 3.6 1.4 0.2 0\n", + ".. ... ... ... ... ...\n", + "145 6.7 3.0 5.2 2.3 2\n", + "146 6.3 2.5 5.0 1.9 2\n", + "147 6.5 3.0 5.2 2.0 2\n", + "148 6.2 3.4 5.4 2.3 2\n", + "149 5.9 3.0 5.1 1.8 2\n", + "\n", + "[150 rows x 5 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.3.2\n" + ] + } + ], + "source": [ + "print(pd.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LineaArtifact(name='iris-raw', _version=4)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#save raw data as artifact to lineapy\n", + "lineapy.save(dataset, \"iris-raw\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Features engineering" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:02.150708Z", + "start_time": "2019-06-16T21:21:02.144518Z" + } + }, + "outputs": [], + "source": [ + "dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'] / dataset['sepal_width']\n", + "dataset['petal_length_to_petal_width'] = dataset['petal_length'] / dataset['petal_width']\n", + "\n", + "dataset = dataset[[\n", + " 'sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n", + "# 'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',\n", + " 'sepal_length_to_sepal_width', 'petal_length_to_petal_width',\n", + " 'target'\n", + "]]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:02.987144Z", + "start_time": "2019-06-16T21:21:02.976092Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_lengthsepal_widthpetal_lengthpetal_widthsepal_length_to_sepal_widthpetal_length_to_petal_widthtarget
05.13.51.40.21.4571437.00
14.93.01.40.21.6333337.00
24.73.21.30.21.4687506.50
34.63.11.50.21.4838717.50
45.03.61.40.21.3888897.00
\n", + "
" + ], + "text/plain": [ + " sepal_length sepal_width petal_length petal_width \\\n", + "0 5.1 3.5 1.4 0.2 \n", + "1 4.9 3.0 1.4 0.2 \n", + "2 4.7 3.2 1.3 0.2 \n", + "3 4.6 3.1 1.5 0.2 \n", + "4 5.0 3.6 1.4 0.2 \n", + "\n", + " sepal_length_to_sepal_width petal_length_to_petal_width target \n", + "0 1.457143 7.0 0 \n", + "1 1.633333 7.0 0 \n", + "2 1.468750 6.5 0 \n", + "3 1.483871 7.5 0 \n", + "4 1.388889 7.0 0 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Save features\n", + "features_path = './data/processed/featured_iris.csv'\n", + "dataset.to_csv(features_path, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LineaArtifact(name='iris-preprocessed', _version=4)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#save features to lineapy\n", + "lineapy.save(dataset, \"iris-preprocessed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Split dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:06.361378Z", + "start_time": "2019-06-16T21:21:06.358647Z" + } + }, + "outputs": [], + "source": [ + "test_size=0.2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split train/test" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:07.438133Z", + "start_time": "2019-06-16T21:21:07.431649Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((120, 7), (30, 7))" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_dataset, 
test_dataset = train_test_split(dataset, test_size=test_size, random_state=42)\n", + "train_dataset.shape, test_dataset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Save train and test sets\n", + "trainset_path = './data/processed/train_iris.csv'\n", + "testset_path = './data/processed/test_iris.csv'\n", + "\n", + "train_dataset.to_csv(trainset_path)\n", + "test_dataset.to_csv(testset_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LineaArtifact(name='test-dataset', _version=4)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#save train and test sets to lineapy\n", + "lineapy.save(train_dataset, \"train-dataset\")\n", + "lineapy.save(test_dataset, \"test-dataset\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:10.932148Z", + "start_time": "2019-06-16T21:21:10.927844Z" + } + }, + "outputs": [], + "source": [ + "# Get X and Y\n", + "\n", + "y_train = train_dataset.loc[:, 'target'].values.astype('int32')\n", + "X_train = train_dataset.drop('target', axis=1).values.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:55.427365Z", + "start_time": "2019-06-16T21:21:55.416431Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=0.001, multi_class='multinomial')" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create an instance of Logistic Regression Classifier CV and fit the data\n", + "\n", + "logreg = LogisticRegression(C=0.001, solver='lbfgs', multi_class='multinomial', max_iter=100)\n", + 
"logreg.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['./models/model.joblib']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_path= './models/model.joblib'\n", + "joblib.dump(logreg, model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LineaArtifact(name='logreg-model', _version=3)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#save model to lineapy\n", + "lineapy.save(model_path, \"logreg-model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:55.875303Z", + "start_time": "2019-06-16T21:21:55.864724Z" + } + }, + "outputs": [], + "source": [ + "def plot_confusion_matrix(cm,\n", + " target_names,\n", + " title='Confusion matrix',\n", + " cmap=None,\n", + " normalize=True):\n", + " \"\"\"\n", + " given a sklearn confusion matrix (cm), make a nice plot\n", + "\n", + " Arguments\n", + " ---------\n", + " cm: confusion matrix from sklearn.metrics.confusion_matrix\n", + "\n", + " target_names: given classification classes such as [0, 1, 2]\n", + " the class names, for example: ['high', 'medium', 'low']\n", + "\n", + " title: the text to display at the top of the matrix\n", + "\n", + " cmap: the gradient of the values displayed from matplotlib.pyplot.cm\n", + " see http://matplotlib.org/examples/color/colormaps_reference.html\n", + " plt.get_cmap('jet') or plt.cm.Blues\n", + "\n", + " normalize: If False, plot the raw numbers\n", + " If True, plot the proportions\n", + "\n", + " Usage\n", + " -----\n", + " plot_confusion_matrix(cm = cm, # confusion matrix created by\n", + " 
# sklearn.metrics.confusion_matrix\n", + " normalize = True, # show proportions\n", + " target_names = y_labels_vals, # list of names of the classes\n", + " title = best_estimator_name) # title of graph\n", + "\n", + " Citation\n", + " ---------\n", + " http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n", + "\n", + " \"\"\"\n", + "\n", + " accuracy = np.trace(cm) / float(np.sum(cm))\n", + " misclass = 1 - accuracy\n", + "\n", + " if cmap is None:\n", + " cmap = plt.get_cmap('Blues')\n", + "\n", + " plt.figure(figsize=(8, 6))\n", + " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", + " plt.title(title)\n", + " plt.colorbar()\n", + "\n", + " if target_names is not None:\n", + " tick_marks = np.arange(len(target_names))\n", + " plt.xticks(tick_marks, target_names, rotation=45)\n", + " plt.yticks(tick_marks, target_names)\n", + "\n", + " if normalize:\n", + " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", + "\n", + " thresh = cm.max() / 1.5 if normalize else cm.max() / 2\n", + " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n", + " if normalize:\n", + " plt.text(j, i, \"{:0.4f}\".format(cm[i, j]),\n", + " horizontalalignment=\"center\",\n", + " color=\"white\" if cm[i, j] > thresh else \"black\")\n", + " else:\n", + " plt.text(j, i, \"{:,}\".format(cm[i, j]),\n", + " horizontalalignment=\"center\",\n", + " color=\"white\" if cm[i, j] > thresh else \"black\")\n", + "\n", + " plt.tight_layout()\n", + " plt.ylabel('True label')\n", + " plt.xlabel('Predicted label\\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))\n", + " \n", + " return plt.gcf()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.090756Z", + "start_time": "2019-06-16T21:21:56.086966Z" + } + }, + "outputs": [], + "source": [ + "# Get X and Y\n", + "\n", + "y_test = test_dataset.loc[:, 'target'].values.astype('int32')\n", + 
"X_test = test_dataset.drop('target', axis=1).values.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.270245Z", + "start_time": "2019-06-16T21:21:56.265054Z" + } + }, + "outputs": [], + "source": [ + "prediction = logreg.predict(X_test)\n", + "cm = confusion_matrix(prediction, y_test)\n", + "f1 = f1_score(y_true = y_test, y_pred = prediction, average='macro')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-16T21:21:56.493617Z", + "start_time": "2019-06-16T21:21:56.489929Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9305555555555555" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# f1 score value\n", + "f1" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# Save metrics\n", + "metrics_file = './reports/metrics.json'\n", + "\n", + "metrics = {\n", + " 'f1': f1\n", + "}\n", + "\n", + "with open(metrics_file, 'w') as mf:\n", + " json.dump(\n", + " obj=metrics,\n", + " fp=mf,\n", + " indent=4\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cm_plot = plot_confusion_matrix(cm, data.target_names, normalize=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# Save confusion matrix image\n", + "confusion_matrix_image = './reports/confusion_matrix.png'\n", + "cm_plot.savefig(confusion_matrix_image)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LineaArtifact(name='plot-confusion-matrix', _version=1)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#save confusion matrix to lineapy\n", + "lineapy.save(plot_confusion_matrix, \"plot-confusion-matrix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#commenting for change\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": 
"rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.txt b/requirements.txt index d5b4910e..95cfe9b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ -dvc==2.6.4 -joblib==1.0.1 -jupyter==1.0.0 -jupyter_contrib_nbextensions==0.5.1 -matplotlib==3.4.3 -numpy==1.21.2 -pandas==1.3.2 -pytest==6.2.4 -python-box==5.4.1 -pyyaml==5.4.1 -scikit-learn==0.24.2 -scipy==1.7.1 -tqdm==4.62.2 \ No newline at end of file +dvc>=2.8.3,<3 +joblib>=1.0.1,<2 +jupyter>=1.0.0,<2 +jupyter_contrib_nbextensions>=0.5.1,<1 +matplotlib>=3.4.3,<4 +numpy>=1.21.2,<2 +pandas>=1.3.2,<2 +pytest>=6.2.4,<7 +python-box>=5.4.1,<6 +pyyaml>=5.4.1,<6 +scikit-learn>=0.24.2,<2 +scipy>=1.7.1,<2 +tqdm>=4.62.2,<5 diff --git a/step-0-prototype.ipynb b/step-0-prototype.ipynb index 3f7fee6c..608a10f2 100644 --- a/step-0-prototype.ipynb +++ b/step-0-prototype.ipynb @@ -383,7 +383,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.9.2" }, "toc": { "base_numbering": 1,