From 7fefd1cb7125e84bbc2def7a48a502ab899136c1 Mon Sep 17 00:00:00 2001 From: Srishti Chauhan <104318680+Srishtichauhan5359@users.noreply.github.com> Date: Thu, 20 Oct 2022 00:14:43 +0530 Subject: [PATCH 1/2] Naive-Bayes --- ML Algorithms/NaiveBayes.ipynb | 1692 ++++++++++++++++++++++++++++++++ 1 file changed, 1692 insertions(+) create mode 100644 ML Algorithms/NaiveBayes.ipynb diff --git a/ML Algorithms/NaiveBayes.ipynb b/ML Algorithms/NaiveBayes.ipynb new file mode 100644 index 00000000..f587f9ec --- /dev/null +++ b/ML Algorithms/NaiveBayes.ipynb @@ -0,0 +1,1692 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "ytmXJBL3qPjo" + }, + "source": [ + "# **Naive Bayes Classification - Implementation**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "WiC3dFzRe444" + }, + "outputs": [], + "source": [ + "# importing the libraries\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6zUPxkGqjlV8" + }, + "source": [ + "**Once we have imported all the required modules, the next step is to import the data set and split the data sets into inputs and outputs.** " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "BuVm9YfffLck" + }, + "outputs": [], + "source": [ + "# importing the dataset\n", + "dataset = pd.read_csv('NaiveBayes.csv')\n", + "\n", + "# split the data into inputs and outputs\n", + "X = dataset.iloc[:, [0,1]].values\n", + "y = dataset.iloc[:, 2].values" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6GNGuoJETVWF", + "outputId": "733a383d-e965-46b7-e01a-aec46df47f91" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "800\n" + ] + } + ], + "source": [ + "from numpy.ma.core import count\n", + "print(count(X))" + ] 
+ }, + { + "cell_type": "markdown", + "metadata": { + "id": "BgOg3hk0jvSc" + }, + "source": [ + "**The next step is to divide the input and output values into the training and testing part so that once the training of the model is complete, we can evaluate its performance using testing data.**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "4R9_T3WnfOn9" + }, + "outputs": [], + "source": [ + "# training and testing data\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# assign test data size 25%\n", + "X_train, X_test, y_train, y_test =train_test_split(X,y,test_size= 0.25, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KZe_MTZKSqbC", + "outputId": "e3fd1194-7d45-4334-e757-710575144373" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200\n" + ] + } + ], + "source": [ + "from numpy.ma.core import count\n", + "print(count(X_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uhIM9EN9junr" + }, + "source": [ + "**We set test_size=0.25, which means 25% of the whole data set will be assigned to the testing part and the remaining 75% will be used for the model’s training.**\n", + "**The next step is to scale our dataset to be ready to be used for the training.**" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "40RtvU4FfSbM" + }, + "outputs": [], + "source": [ + "# importing standard scaler\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "# scaling the input data: fit the scaler on the training set only, then reuse its mean/std for the test set\n", + "sc_X = StandardScaler() \n", + "X_train = sc_X.fit_transform(X_train)\n", + "X_test = sc_X.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DYmlUO8WSVp7", + "outputId":
"2bea9c3a-788c-4dd4-c918-361b0661e352" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0.58164944 -0.88670699]\n", + " [-0.60673761 1.46173768]\n", + " [-0.01254409 -0.5677824 ]\n", + " [-0.60673761 1.89663484]\n", + " [ 1.37390747 -1.40858358]\n", + " [ 1.47293972 0.99784738]\n", + " [ 0.08648817 -0.79972756]\n", + " [-0.01254409 -0.24885782]\n", + " [-0.21060859 -0.5677824 ]\n", + " [-0.21060859 -0.19087153]\n", + " [-0.30964085 -1.29261101]\n", + " [-0.30964085 -0.5677824 ]\n", + " [ 0.38358493 0.09905991]\n", + " [ 0.8787462 -0.59677555]\n", + " [ 2.06713324 -1.17663843]\n", + " [ 1.07681071 -0.13288524]\n", + " [ 0.68068169 1.78066227]\n", + " [-0.70576986 0.56295021]\n", + " [ 0.77971394 0.35999821]\n", + " [ 0.8787462 -0.53878926]\n", + " [-1.20093113 -1.58254245]\n", + " [ 2.1661655 0.93986109]\n", + " [-0.01254409 1.22979253]\n", + " [ 0.18552042 1.08482681]\n", + " [ 0.38358493 -0.48080297]\n", + " [-0.30964085 -0.30684411]\n", + " [ 0.97777845 -0.8287207 ]\n", + " [ 0.97777845 1.8676417 ]\n", + " [-0.01254409 1.25878567]\n", + " [-0.90383437 2.27354572]\n", + " [-1.20093113 -1.58254245]\n", + " [ 2.1661655 -0.79972756]\n", + " [-1.39899564 -1.46656987]\n", + " [ 0.38358493 2.30253886]\n", + " [ 0.77971394 0.76590222]\n", + " [-1.00286662 -0.30684411]\n", + " [ 0.08648817 0.76590222]\n", + " [-1.00286662 0.56295021]\n", + " [ 0.28455268 0.07006676]\n", + " [ 0.68068169 -1.26361786]\n", + " [-0.50770535 -0.01691267]\n", + " [-1.79512465 0.35999821]\n", + " [-0.70576986 0.12805305]\n", + " [ 0.38358493 0.30201192]\n", + " [-0.30964085 0.07006676]\n", + " [-0.50770535 2.30253886]\n", + " [ 0.18552042 0.04107362]\n", + " [ 1.27487521 2.21555943]\n", + " [ 0.77971394 0.27301877]\n", + " [-0.30964085 0.1570462 ]\n", + " [-0.01254409 -0.53878926]\n", + " [-0.21060859 0.1570462 ]\n", + " [-0.11157634 0.24402563]\n", + " [-0.01254409 -0.24885782]\n", + " [ 2.1661655 1.11381995]\n", + " [-1.79512465 0.35999821]\n", + " [ 
1.86906873 0.12805305]\n", + " [ 0.38358493 -0.13288524]\n", + " [-1.20093113 0.30201192]\n", + " [ 0.77971394 1.37475825]\n", + " [-0.30964085 -0.24885782]\n", + " [-1.6960924 -0.04590581]\n", + " [-1.00286662 -0.74174127]\n", + " [ 0.28455268 0.50496393]\n", + " [-0.11157634 -1.06066585]\n", + " [-1.10189888 0.59194336]\n", + " [ 0.08648817 -0.79972756]\n", + " [-1.00286662 1.54871711]\n", + " [-0.70576986 1.40375139]\n", + " [-1.29996338 0.50496393]\n", + " [-0.30964085 0.04107362]\n", + " [-0.11157634 0.01208048]\n", + " [-0.30964085 -0.88670699]\n", + " [ 0.8787462 -1.3505973 ]\n", + " [-0.30964085 2.24455257]\n", + " [ 0.97777845 1.98361427]\n", + " [-1.20093113 0.47597078]\n", + " [-1.29996338 0.27301877]\n", + " [ 1.37390747 1.98361427]\n", + " [ 1.27487521 -1.3505973 ]\n", + " [-0.30964085 -0.27785096]\n", + " [-0.50770535 1.25878567]\n", + " [-0.80480212 1.08482681]\n", + " [ 0.97777845 -1.06066585]\n", + " [ 0.28455268 0.30201192]\n", + " [ 0.97777845 0.76590222]\n", + " [-0.70576986 -1.49556302]\n", + " [-0.70576986 0.04107362]\n", + " [ 0.48261718 1.72267598]\n", + " [ 2.06713324 0.18603934]\n", + " [-1.99318916 -0.74174127]\n", + " [-0.21060859 1.40375139]\n", + " [ 0.38358493 0.59194336]\n", + " [ 0.8787462 -1.14764529]\n", + " [-1.20093113 -0.77073441]\n", + " [ 0.18552042 0.24402563]\n", + " [ 0.77971394 -0.30684411]\n", + " [ 2.06713324 -0.79972756]\n", + " [ 0.77971394 0.12805305]\n", + " [-0.30964085 0.6209365 ]\n", + " [-1.00286662 -0.30684411]\n", + " [ 0.18552042 -0.3648304 ]\n", + " [ 2.06713324 2.12857999]\n", + " [ 1.86906873 -1.26361786]\n", + " [ 1.37390747 -0.91570013]\n", + " [ 0.8787462 1.25878567]\n", + " [ 1.47293972 2.12857999]\n", + " [-0.30964085 -1.23462472]\n", + " [ 1.96810099 0.91086794]\n", + " [ 0.68068169 -0.71274813]\n", + " [-1.49802789 0.35999821]\n", + " [ 0.77971394 -1.3505973 ]\n", + " [ 0.38358493 -0.13288524]\n", + " [-1.00286662 0.41798449]\n", + " [-0.01254409 -0.30684411]\n", + " [-1.20093113 0.41798449]\n", + " 
[-0.90383437 -1.20563157]\n", + " [-0.11157634 0.04107362]\n", + " [-1.59706014 -0.42281668]\n", + " [ 0.97777845 -1.00267957]\n", + " [ 1.07681071 -1.20563157]\n", + " [-0.01254409 -0.13288524]\n", + " [-1.10189888 -1.52455616]\n", + " [ 0.77971394 -1.20563157]\n", + " [ 0.97777845 2.07059371]\n", + " [-1.20093113 -1.52455616]\n", + " [-0.30964085 0.79489537]\n", + " [ 0.08648817 -0.30684411]\n", + " [-1.39899564 -1.23462472]\n", + " [-0.60673761 -1.49556302]\n", + " [ 0.77971394 0.53395707]\n", + " [-0.30964085 -0.33583725]\n", + " [ 1.77003648 -0.27785096]\n", + " [ 0.8787462 -1.03167271]\n", + " [ 0.18552042 0.07006676]\n", + " [-0.60673761 0.8818748 ]\n", + " [-1.89415691 -1.40858358]\n", + " [-1.29996338 0.59194336]\n", + " [-0.30964085 0.53395707]\n", + " [-1.00286662 -1.089659 ]\n", + " [ 1.17584296 -1.43757673]\n", + " [ 0.18552042 -0.30684411]\n", + " [ 1.17584296 -0.74174127]\n", + " [-0.30964085 0.07006676]\n", + " [ 0.18552042 2.09958685]\n", + " [ 0.77971394 -1.089659 ]\n", + " [ 0.08648817 0.04107362]\n", + " [-1.79512465 0.12805305]\n", + " [-0.90383437 0.1570462 ]\n", + " [-0.70576986 0.18603934]\n", + " [ 0.8787462 -1.29261101]\n", + " [ 0.18552042 -0.24885782]\n", + " [-0.4086731 1.22979253]\n", + " [-0.01254409 0.30201192]\n", + " [ 0.38358493 0.1570462 ]\n", + " [ 0.8787462 -0.65476184]\n", + " [ 0.08648817 0.1570462 ]\n", + " [-1.89415691 -1.29261101]\n", + " [-0.11157634 0.30201192]\n", + " [-0.21060859 -0.27785096]\n", + " [ 0.28455268 -0.50979612]\n", + " [-0.21060859 1.6067034 ]\n", + " [ 0.97777845 -1.17663843]\n", + " [-0.21060859 1.63569655]\n", + " [ 1.27487521 1.8676417 ]\n", + " [-1.10189888 -0.3648304 ]\n", + " [-0.01254409 0.04107362]\n", + " [ 0.08648817 -0.24885782]\n", + " [-1.59706014 -1.23462472]\n", + " [-0.50770535 -0.27785096]\n", + " [ 0.97777845 0.12805305]\n", + " [ 1.96810099 -1.3505973 ]\n", + " [ 1.47293972 0.07006676]\n", + " [-0.60673761 1.37475825]\n", + " [ 1.57197197 0.01208048]\n", + " [-0.80480212 
0.30201192]\n", + " [ 1.96810099 0.73690908]\n", + " [-1.20093113 -0.50979612]\n", + " [ 0.68068169 0.27301877]\n", + " [-1.39899564 -0.42281668]\n", + " [ 0.18552042 0.1570462 ]\n", + " [-0.50770535 -1.20563157]\n", + " [ 0.58164944 2.01260742]\n", + " [-1.59706014 -1.49556302]\n", + " [-0.50770535 -0.53878926]\n", + " [ 0.48261718 1.83864855]\n", + " [-1.39899564 -1.089659 ]\n", + " [ 0.77971394 -1.37959044]\n", + " [-0.30964085 -0.42281668]\n", + " [ 1.57197197 0.99784738]\n", + " [ 0.97777845 1.43274454]\n", + " [-0.30964085 -0.48080297]\n", + " [-0.11157634 2.15757314]\n", + " [-1.49802789 -0.1038921 ]\n", + " [-0.11157634 1.95462113]\n", + " [-0.70576986 -0.33583725]\n", + " [-0.50770535 -0.8287207 ]\n", + " [ 0.68068169 -1.37959044]\n", + " [-0.80480212 -1.58254245]\n", + " [-1.89415691 -1.46656987]\n", + " [ 1.07681071 0.12805305]\n", + " [ 0.08648817 1.51972397]\n", + " [-0.30964085 0.09905991]\n", + " [ 0.08648817 0.04107362]\n", + " [-1.39899564 -1.3505973 ]\n", + " [ 0.28455268 0.07006676]\n", + " [-0.90383437 0.38899135]\n", + " [ 1.57197197 -1.26361786]\n", + " [-0.30964085 -0.74174127]\n", + " [-0.11157634 0.1570462 ]\n", + " [-0.90383437 -0.65476184]\n", + " [-0.70576986 -0.04590581]\n", + " [ 0.38358493 -0.45180983]\n", + " [-0.80480212 1.89663484]\n", + " [ 1.37390747 1.28777882]\n", + " [ 1.17584296 -0.97368642]\n", + " [ 1.77003648 1.83864855]\n", + " [-0.90383437 -0.24885782]\n", + " [-0.80480212 0.56295021]\n", + " [-1.20093113 -1.5535493 ]\n", + " [-0.50770535 -1.11865214]\n", + " [ 0.28455268 0.07006676]\n", + " [-0.21060859 -1.06066585]\n", + " [ 1.67100423 1.6067034 ]\n", + " [ 0.97777845 1.78066227]\n", + " [ 0.28455268 0.04107362]\n", + " [-0.80480212 -0.21986468]\n", + " [-0.11157634 0.07006676]\n", + " [ 0.28455268 -0.19087153]\n", + " [ 1.96810099 -0.65476184]\n", + " [-0.80480212 1.3457651 ]\n", + " [-1.79512465 -0.59677555]\n", + " [-0.11157634 0.12805305]\n", + " [ 0.28455268 -0.30684411]\n", + " [ 1.07681071 0.56295021]\n", + " 
[-1.00286662 0.27301877]\n", + " [ 1.47293972 0.35999821]\n", + " [ 0.18552042 -0.3648304 ]\n", + " [ 2.1661655 -1.03167271]\n", + " [-0.30964085 1.11381995]\n", + " [-1.6960924 0.07006676]\n", + " [-0.01254409 0.04107362]\n", + " [ 0.08648817 1.05583366]\n", + " [-0.11157634 -0.3648304 ]\n", + " [-1.20093113 0.07006676]\n", + " [-0.30964085 -1.3505973 ]\n", + " [ 1.57197197 1.11381995]\n", + " [-0.80480212 -1.52455616]\n", + " [ 0.08648817 1.8676417 ]\n", + " [-0.90383437 -0.77073441]\n", + " [-0.50770535 -0.77073441]\n", + " [-0.30964085 -0.91570013]\n", + " [ 0.28455268 -0.71274813]\n", + " [ 0.28455268 0.07006676]\n", + " [ 0.08648817 1.8676417 ]\n", + " [-1.10189888 1.95462113]\n", + " [-1.6960924 -1.5535493 ]\n", + " [-1.20093113 -1.089659 ]\n", + " [-0.70576986 -0.1038921 ]\n", + " [ 0.08648817 0.09905991]\n", + " [ 0.28455268 0.27301877]\n", + " [ 0.8787462 -0.5677824 ]\n", + " [ 0.28455268 -1.14764529]\n", + " [-0.11157634 0.67892279]\n", + " [ 2.1661655 -0.68375498]\n", + " [-1.29996338 -1.37959044]\n", + " [-1.00286662 -0.94469328]\n", + " [-0.01254409 -0.42281668]\n", + " [-0.21060859 -0.45180983]\n", + " [-1.79512465 -0.97368642]\n", + " [ 1.77003648 0.99784738]\n", + " [ 0.18552042 -0.3648304 ]\n", + " [ 0.38358493 1.11381995]\n", + " [-1.79512465 -1.3505973 ]\n", + " [ 0.18552042 -0.13288524]\n", + " [ 0.8787462 -1.43757673]\n", + " [-1.99318916 0.47597078]\n", + " [-0.30964085 0.27301877]\n", + " [ 1.86906873 -1.06066585]\n", + " [-0.4086731 0.07006676]\n", + " [ 1.07681071 -0.88670699]\n", + " [-1.10189888 -1.11865214]\n", + " [-1.89415691 0.01208048]\n", + " [ 0.08648817 0.27301877]\n", + " [-1.20093113 0.33100506]\n", + " [-1.29996338 0.30201192]\n", + " [-1.00286662 0.44697764]\n", + " [ 1.67100423 -0.88670699]\n", + " [ 1.17584296 0.53395707]\n", + " [ 1.07681071 0.53395707]\n", + " [ 1.37390747 2.331532 ]\n", + " [-0.30964085 -0.13288524]\n", + " [ 0.38358493 -0.45180983]\n", + " [-0.4086731 -0.77073441]\n", + " [-0.11157634 -0.50979612]\n", + 
" [ 0.97777845 -1.14764529]\n", + " [-0.90383437 -0.77073441]\n", + " [-0.21060859 -0.50979612]\n", + " [-1.10189888 -0.45180983]\n", + " [-1.20093113 1.40375139]]\n" + ] + } + ], + "source": [ + "print (X_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lx9iPWfHkV9j" + }, + "source": [ + "**Note: scaling (or standardization) of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data.**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w2lSZnqCkZ63" + }, + "source": [ + "# Training the model using Bernolli Naive Bayes classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "FP8t3In8fV9W" + }, + "outputs": [], + "source": [ + "# importing classifier\n", + "from sklearn.naive_bayes import BernoulliNB\n", + "\n", + "# initializaing the NB\n", + "classifer = BernoulliNB()\n", + "\n", + "# training the model\n", + "classifer.fit(X_train, y_train)\n", + "\n", + "# testing the model\n", + "y_pred = classifer.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "K9hmZu_xoB9A", + "outputId": "175d79cb-3615-4534-8b79-72bc6855480a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,\n", + " 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,\n", + " 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int64)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ARx119-GkZKE" + }, + "source": [ + "**Now let 
us check the accuracy of the predicted values using the Bernoulli Naive Bayes classifier.**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tp-0cr1SfYwt", + "outputId": "a2c49250-a97b-4ed5-89a4-187b45492947" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.8\n" + ] + } + ], + "source": [ + "# importing accuracy score\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "# printing the accuracy of the model (true labels first, then predictions)\n", + "print(accuracy_score(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Nu4AMjUWkpUE" + }, + "source": [ + "# We got an accuracy of 80% when we trained our model using Bernoulli Naive Bayes classifier." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UqyqZCoxkyL-" + }, + "source": [ + "# Training model using Gaussian Naive Bayes Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "V-oTMzy3fdgs" + }, + "outputs": [], + "source": [ + "# import Gaussian Naive Bayes classifier\n", + "from sklearn.naive_bayes import GaussianNB\n", + "\n", + "# create a Gaussian Classifier\n", + "classifer1 = GaussianNB()\n", + "\n", + "# training the model\n", + "classifer1.fit(X_train, y_train)\n", + "\n", + "# testing the model\n", + "y_pred1 = classifer1.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sNXmuBvOor3d", + "outputId": "36255849-c7e7-44d7-a496-72f8f5a1128e" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,\n", + " 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n", + " 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,\n", + " 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,\n", + " 1, 0,
0, 1, 0, 0, 0, 0, 0, 1, 1, 1])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred1" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HSA2ET12fg5d", + "outputId": "eccabdab-6a0a-4f84-9f18-4c0381e8bc68" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.91\n" + ] + } + ], + "source": [ + "# importing accuracy score\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "# printing the accuracy of the model\n", + "print(accuracy_score(y_test,y_pred1))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3HncFnjnk5b4" + }, + "source": [ + "# **This time we got an accuracy of 91% when we trained the model on the same dataset.**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZUc0mZdRlJHk" + }, + "source": [ + "**Confusion Matrix for Binary classification:**\n", + "A confusion matrix is also known as an error matrix. It is a table layout that allows visualization of the performance of a classification algorithm. Each row of the matrix represents the instances in an actual class, while each column represents the instances in a predicted class, or vice versa.\n", + "![confusion.PNG]()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oc7U9xftl_3_" + }, + "source": [ + "Evaluation of Bernoulli Naive Bayes classifier\n", + "\n", + "Let’s evaluate our Bernoulli Naive Bayes model using a confusion matrix that will visually help us see the number of correctly and incorrectly classified samples. First of all, we’ll visualize our model’s results. The predicted values are stored in a variable named y_pred."
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + }, + "id": "mH3cN09Mfknn", + "outputId": "434c2539-4acd-4f82-b3dd-c63d85f0f69b" + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVoAAAD4CAYAAACt8i4nAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAARS0lEQVR4nO3debScdX3H8fc3C5KwRyBeCAiVCAeQRRDhgBXhVOJSweqhuBwjJxJblbJYAaMFEXuKG4tUW0IWg5UEpCxR3DCCFpRAEEUgKjGampAQKTtYSe58+8ed4jXLnbnc+d1n8uT94vzOnXlm5jffP3I+/vw+v+eZyEwkSeWMqLoASao7g1aSCjNoJakwg1aSCjNoJamwUaW/YM0jS93WoPWM2eU1VZegLrT2uRUx1DkGkzmjd/yLIX9fO1zRSlJhxVe0kjSsGr1VV7Aeg1ZSvfSurbqC9Ri0kmols1F1CeuxRyupXhqN9kcLEbF9RFwbEb+IiMURcUREjIuImyPiwebfHVrNY9BKqpdstD9auxT4dmbuAxwILAbOARZk5kRgQfP5gAxaSfXS6G1/DCAitgP+EpgJkJnPZebjwPHAnObb5gAntCrJoJVUL4NY0UbE1IhY1G9M7TfTnsDvgdkRcU9EzIiIrYDxmbmy+Z5VwPhWJXkyTFKt5CB2HWTmdGD6Rl4eBbwSODUzF0bEpazTJsjMjIiWF0i4opVUL507GbYcWJ6ZC5vPr6UveB+OiB6A5t/VrSYyaCXVS4dOhmXmKuB3EbF389CxwAPAfGBy89hk4MZWJdk6kFQvnb0y7FTgqxGxBbAUOJm+Beo1ETEFWAac2GoSg1ZSvXTwgoXM/Clw6AZeOnYw8xi0kurFS3AlqbA2rvgabgatpFrJ9O5dklRWF95UxqCVVC+2DiSpMFe0klRY75qqK1iPQSupXmwdSFJhtg4kqTBXtJJUmEErSWWlJ8MkqTB7tJJUmK0DSSrMFa0kFeaKVpIKc0UrSYWt9cbfklSWK1pJKswerSQV5opWkgpzRStJhbmilaTC3HUgSYVlVl3BegxaSfVij1aSCjNoJakwT4ZJUmG9vVVXsB6DVlK92DqQpMI6GLQR8VvgKaAXWJuZh0bEOOBqYA/gt8CJmfnYQPOM6FhFktQNstH+aM/rMvOgzDy0+fwcYEFmTgQWNJ8PyKCVVCvZyLbHC3Q8MKf5eA5wQqsPGLSS6qXRaHtExNSIWNRvTF1ntgS+GxF393ttfGaubD5eBYxvVZI9Wkn1MohdB5k5HZg+wFuOyswVEbEzcHNE/GKdz2dEtFwaG7SS6qWDJ8Myc0Xz7+qIuB44DHg4Inoyc2VE9ACrW81j60BSvQyidTCQiNgqIrb5/8fA64H7gPnA5ObbJgM3tirJFW1BTz71NOddeAlLli6DCC6Ydgbfu/V2fnD7QkaNHsVuu/bwqWlnsu02W1ddqiqy3XbbMv3yz7HffnuTmZxyyoe5Y+HdVZe1aevcTWXGA9dHBPRl5VWZ+e2IuAu4JiKmAMuAE1tNFFn4TjdrHlnafbfSGSbTLvgcrzxwf97+lkmsWbOGP/zvH/n5A7/k1YccxKhRI7noSzMBOPMDUyqudPiN2eU1VZfQFWbNvITbblvIrNlzGT16NGPHjuGJJ56suqzKrH1uRQx1jmcvOqXtzBl75hVD/r522Doo5Kmnn+Hun93
H2/76OABGjx7NtttszZGvPoRRo0YCcMB++/Dw6keqLFMV2nbbbXjNUa9m1uy5AKxZs2azDtmOaWT7Y5i0bB1ExD707RvbtXloBTA/MxeXLGxTt+KhVeyw/XZ8/J8v4pdLlrLv3hM55/S/Y+yYLZ9/z/U3fZdJx762wipVpT333J1HHvkfZs64mAMO2Jef/ORezjjzXJ599g9Vl7Zp68J7HQy4oo2Is4F5QAB3NkcAcyNio1dD9N+bNuPKuZ2sd5OxtreXxb9awt++9U1c++UvMmbMlsz8yjXPv375nLmMHDmSN7/+dRVWqSqNGjmSgw9+BZdffiWvOuw4nnnmWc4+60NVl7XJy0aj7TFcWq1opwD7Zeaa/gcj4iLgfuDCDX2o/960zbVH+5Kdd2T8TjtywH77APD6o49ixn/0Be0NN93MD2+/kxlf+BeajXZthpavWMny5Su58657ALjuups46yMG7ZANY0ugXa16tA1glw0c72m+po3Y8cXjeMnOO/GbZcsBuOPun/KyPXbntjsWMeuqr3HZp89jzJZbtphFdfbww79n+fKHePnLXwbAMcccxeLFv6q4qhro/L0OhqzVivZ0YEFEPAj8rnlsd2AvwP/pbWHaGX/P2ed/hjVr17DbLj1cMO0MTnrfaTy3Zg2nnP4xoO+E2HlnnVpxparKaWf8E1fOuYwtthjNb37z30x535lVl7Tp68IVbcvtXRExgr6rIfqfDLsrM9vqOG+urQMNzO1d2pBObO965tyT2s6crT45b1h6dy13HWRmA7hjGGqRpKHzp2wkqbAubB0YtJJqZTi3bbXLoJVUL65oJakwg1aSCuvCS3ANWkm1MoTfAivGoJVULwatJBXmrgNJKswVrSQVZtBKUlnZa+tAkspyRStJZbm9S5JKM2glqbDua9EatJLqJdd2X9IatJLqpfty1qCVVC+eDJOk0lzRSlJZrmglqTRXtJJUVq6tuoL1jai6AEnqpGy0P9oRESMj4p6I+Ebz+Z4RsTAilkTE1RGxRas5DFpJ9dIYxGjPacDifs8/DVycmXsBjwFTWk1g0EqqlU6uaCNiAvAmYEbzeQDHANc23zIHOKHVPAatpFoZTNBGxNSIWNRvTF1nukuAs/jT+vfFwOOZz3eClwO7tqrJk2GSaiV7o/33Zk4Hpm/otYh4M7A6M++OiKOHUpNBK6lW2j3J1YYjgbdExBuBLYFtgUuB7SNiVHNVOwFY0WoiWweSaiUb0fYYcJ7Mj2bmhMzcAzgJ+H5mvgu4BXh7822TgRtb1WTQSqqVTm/v2oCzgTMjYgl9PduZrT5g60BSrWS236Ntf868Fbi1+XgpcNhgPm/QSqqVDvZoO8aglVQrjUHsOhguBq2kWml1kqsKBq2kWjFoJamw7L7b0Rq0kurFFa0kFVZie9dQGbSSaqXXXQeSVJYrWkkqzB6tJBXmrgNJKswVrSQV1tvovpsSGrSSasXWgSQV1nDXgSSV5fYuSSpss2wdHHnAyaW/Qpugt/QcUnUJqilbB5JUmLsOJKmwLuwcGLSS6sXWgSQV5q4DSSqsC38E16CVVC+JK1pJKmqtrQNJKssVrSQVZo9WkgpzRStJhXXjirb7rlWTpCHoJdoeA4mILSPizoj4WUTcHxHnN4/vGRELI2JJRFwdEVu0qsmglVQrjWh/tPBH4JjMPBA4CJgUEYcDnwYuzsy9gMeAKa0mMmgl1UqDaHsMJPs83Xw6ujkSOAa4tnl8DnBCq5oMWkm1koMYrUTEyIj4KbAauBn4NfB4Zq5tvmU5sGureQxaSbXSGMSIiKkRsajfmNp/rszszcyDgAnAYcA+L6Qmdx1IqpVGtL+9KzOnA9PbeN/jEXELcASwfUSMaq5qJwArWn3eFa2kWukdxBhIROwUEds3H48B/gpYDNwCvL35tsnAja1qckUrqVba2E3Qrh5gTkSMpG9Rek1mfiMiHgDmRcSngHuAma0mMmgl1Uqr3QTtysx7gYM3cHw
pff3athm0kmrFn7KRpMI62DroGINWUq10470ODFpJtdLrilaSynJFK0mFGbSSVFgX/mSYQSupXlzRSlJhrS6trYJBK6lW3EcrSYXZOpCkwgxaSSrMex1IUmH2aCWpMHcdSFJhjS5sHhi0kmrFk2GSVFj3rWcNWkk144pWkgpbG923pjVoJdVK98WsQSupZmwdSFJhbu+SpMK6L2YNWkk1Y+tAkgrr7cI1rUErqVZc0UpSYemKVpLK6sYV7YiqC6irj190Nt++9wbmfn/2eq+98/0ncudDP2C7cdtVUJmq9MHP/gOz776SS7572fPH9th3Ty68/rN8/puX8Jmvf569DpxYYYWbvgbZ9hguBm0hN139LU5710fWO77zLjtx+GtfxcrlqyqoSlW75WsLuGDyJ/7s2Hs++l6uvnQuH37j6cy76Cre89H3VlJbXeQgxkAiYreIuCUiHoiI+yPitObxcRFxc0Q82Py7Q6uaDNpC7ll4L08+9tR6x8/4xIe47FP/Tmb39ZFU3gN33s9Tjz/9Z8cyk7FbjwVg7DZb8ejqR6sorTbWkm2PllPBhzNzX+Bw4IMRsS9wDrAgMycCC5rPB2SPdhj95XFH8vtVj/DgA7+uuhR1kVmfnMG5V57P5I+dTIwYwbS/OavqkjZpnToZlpkrgZXNx09FxGJgV+B44Ojm2+YAtwJnDzTXC17RRsTJA7w2NSIWRcSi1c+ufKFfUSsvGvMi3nvqu7n8s7OqLkVdZtK738DsC2Yw9YgpzP7kDD7wmVOrLmmT1hjE6J9VzTF1Q3NGxB7AwcBCYHwzhAFWAeNb1TSU1sH5G3shM6dn5qGZeejOY3uG8BX1MeGlu7LL7j189XszuWHhPHbu2YmvfOcKXrzTuKpLU8WOftsx3PGtHwPwo5tuZ+KBL6+4ok1bDua/flnVHNPXnS8itgb+Ezg9M5/8s+/q6wG2XEIP2DqIiHs39hJtpLj+5Ne/WMqkA054/vkNC+cx+Q3v54lHn6iwKnWDx1Y/yn6H78/9d9zHK448gJW/fajqkjZpndzeFRGj6QvZr2bmdc3DD0dET2aujIgeYHWreVr1aMcDxwGPrfv9wI8GWfNm5YIvncshRxzE9uO24+uLvsYVn5/N/LnfrLosVeyML/wj+x+xP9vssC1X3DGLeRfP5Utn/ytTPnEKI0eO5Lk/Pse/nfPFqsvcpPV26ERzRAQwE1icmRf1e2k+MBm4sPn3xpZzDXT2OyJmArMz87YNvHZVZr6z1RcctstrPb2u9UwY7R5ire+6ZfNjqHO886VvbTtzrlp2/Ua/LyKOAv4L+Dl/WihPo69Pew2wO7AMODEzB9wqMuCKNjOnDPBay5CVpOHWwV0Ht9H3/9435NjBzOX2Lkm10o2X4Bq0kmrFX1iQpMK8e5ckFdapXQedZNBKqhVbB5JUmCfDJKkwe7SSVJitA0kqrBvv9WzQSqoVf25ckgqzdSBJhdk6kKTCXNFKUmFu75KkwrwEV5IKs3UgSYUZtJJUmLsOJKkwV7SSVJi7DiSpsN7svhslGrSSasUerSQVZo9WkgqzRytJhTVsHUhSWa5oJakwdx1IUmG2DiSpMFsHklRYN65oR1RdgCR1Ug7iv1YiYlZErI6I+/odGxcRN0fEg82/O7Sax6CVVCu92dv2aMOXgUnrHDsHWJCZE4EFzecDMmgl1Upmtj3amOuHwKPrHD4emNN8PAc4odU8Bq2kWmmQbY+ImBoRi/qNqW18xfjMXNl8vAoY3+oDngyTVCuDualMZk4Hpg/huzIiWn6hQSupVoZh18HDEdGTmSsjogdY3eoDtg4k1Uondx1sxHxgcvPxZODGVh9wRSupVjp5CW5EzAWOBnaMiOXAecCFwDURMQVYBpzYah6DVlKtdPLG35n5jo28dOxg5jFoJdVKN14ZZtBKqhV/ykaSCvOnbCSpMFe0klSYN/6WpMI8GSZJhdk6kKTC/IUFSSrMFa0kFdaNPdroxvSvq4i
Y2rwtm/Q8/13Un3fvGl7t3FRYmx//XdScQStJhRm0klSYQTu87MNpQ/x3UXOeDJOkwlzRSlJhBq0kFWbQDpOImBQRv4yIJRFxTtX1qHoRMSsiVkfEfVXXorIM2mEQESOBLwJvAPYF3hER+1ZblbrAl4FJVReh8gza4XEYsCQzl2bmc8A84PiKa1LFMvOHwKNV16HyDNrhsSvwu37PlzePSdoMGLSSVJhBOzxWALv1ez6heUzSZsCgHR53ARMjYs+I2AI4CZhfcU2SholBOwwycy3wIeA7wGLgmsy8v9qqVLWImAv8GNg7IpZHxJSqa1IZXoIrSYW5opWkwgxaSSrMoJWkwgxaSSrMoJWkwgxaSSrMoJWkwv4PDhTpPIIxNkcAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# importing the required modules\n", + "import seaborn as sns\n", + "from sklearn.metrics import confusion_matrix\n", + "\n", + "# passing actual and predicted values\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "\n", + "# true write data values in each cell of the matrix\n", + "sns.heatmap(cm, annot=True)\n", + "plt.savefig('confusion.png')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5q5dpLcLmGsF" + }, + "source": [ + "The confusion matrix helps us know which class has been mispredicted.\n", + "\n", + "We can also print the classification report, which will help us further evaluate our model’s performance." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VNColYjhfn-7", + "outputId": "0d1ac13d-4aca-434b-ed84-dd4a5b7e6995" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.82 0.91 0.86 68\n", + " 1 0.75 0.56 0.64 32\n", + "\n", + " accuracy 0.80 100\n", + " macro avg 0.78 0.74 0.75 100\n", + "weighted avg 0.79 0.80 0.79 100\n", + "\n" + ] + } + ], + "source": [ + "# importing classification report\n", + "from sklearn.metrics import classification_report\n", + "\n", + "# printing the report\n", + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c8vK73wVmM4G" + }, + "source": [ + "**Note: the Gaussian naive Bayes classifier performed very well on this dataset, as shown in the confusion matrix. 
Let us now print out the classification report as well,**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + }, + "id": "Bt9ZhqUOftcv", + "outputId": "b7bd1237-6f98-489e-d623-606ff5d29cb8" + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVoAAAD4CAYAAACt8i4nAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAQvklEQVR4nO3de7CcdX3H8fc3CZhwE0LCMSS2gEQodUbsRMChtJRLuSiClfGC2ujEnrEWRexMiWIvKDjotCptte1BkDgohIKYFKrARJBB5CqgYMCEgJILCVcJiJJz9ts/zlM8huTsHrK/82yevF/MM7v77O7vfP/IfPjO9/ntbmQmkqRyJtRdgCQ1nUErSYUZtJJUmEErSYUZtJJU2KTSf2DD4yvc1qCXmLLnYXWXoB40+MKq2NI1xpI5203bZ4v/XifsaCWpsOIdrSSNq9ZQ3RW8hEErqVmGBuuu4CUMWkmNktmqu4SXMGglNUvLoJWksuxoJakwL4ZJUmF2tJJUVrrrQJIK82KYJBXWg6MDP4IrqVlaQ50fbUTErhFxeUTcHxFLI+JNETE1Iq6LiGXV7W7t1jFoJTVLtjo/2jsP+G5m7g+8HlgKzAeWZOZsYEn1eFSODiQ1S5cuhkXEK4E/Ad4PkJkvAC9ExInA4dXLFgA3AGeMtpYdraRmabU6PiKiPyLuGHH0j1hpb+Ax4GsRcVdEfDUidgT6MnNN9ZpHgb52JdnRSmqUzM4/sJCZA8DAZp6eBPwR8JHMvDUizmOjMUFmZkS0/f5bO1pJzdK9Ge1KYGVm3lo9vpzh4F0bETMAqtt17RYyaCU1yxhGB6PJzEeBRyJiv+rUkcBPgcXA3OrcXGBRu5IcHUhqlu7uo/0I8I2I2B5YAXyA4Qb1soiYB/wceEe7RQxaSc0ytKFrS2Xm3cCcTTx15FjWMWglNYsfwZWkwnrwI7gGraRmsaOVpMIMWkkqK7t4MaxbDFpJzeKMVpIKc3QgSYXZ0UpSYXa0klSYHa0kFTbor+BKUll2tJJUmDNaSSrMjlaSCrOjlaTC7GglqTB3HUhSYdn2R2nHnUErqVmc0UpSYQatJBXmxTBJKmxoqO4KXsKgldQsjg4kqTCDVpIKc0YrSWVly320klSWowNJKqyLuw4i4mFgPTAEDGbmnIiYCiwE9gIeBt6RmU+Nts6ErlUkSb2g1er86MyfZeaBmTmnejwfWJKZs4El1eNRGbSSmqX7QbuxE4EF1f0FwEnt3mDQFvTM+mc5/cyzOeHdf8UJp/Rz971LX3zuokuu4HWHHsdTT/+yxgpVtwkTJnD7bdew6MoF7V+szmR2fEREf0TcMeLo33g14NqIuHPEc32Zuaa6/yjQ164kZ7QFnful/+TQg+fwxXM+xYYNG3j+178BYM3ax7j5th8xo2+PmitU3T76kQ9y//3L2GXnnesupTnG0Klm5gAwMMpL/jgzV0XEHsB1EXH/Ru/PiGi7zcGOtpD1zz7Hnffcy9tPOAaA7bbbjl123gmAz//rf/HxD88jos4KVbeZM2dw/HFHcuGFl9RdSrO0svOjjcxcVd2uA64EDgLWRsQMgOp2Xbt12na0EbE/wzOJmdWpVcDizFy6+Xdp1epH2W3XV/Kpc77AA8tXcMB+s5
n/sQ9xyx13scf0aew/e5+6S1TNvvAvZzH/E2ezc/U/YHVJl3YdRMSOwITMXF/d/3Pg08BiYC5wbnW7qN1ao3a0EXEGcCkQwG3VEcAlEbHZK20j5x5f/fq2+X/rwaEhlv5sOe9825u5/KIvM2XKZL5ywcWc//WFnPrB99Vdnmr25uOPYt26x/nRXT+pu5TGyVar46ONPuCmiLiH4ey7OjO/y3DAHh0Ry4Cjqsejihzl28gj4mfAH2bmho3Obw/cV21vGNWGx1f03sc0xsHjTzzJKf2nc+0Vwxc57rz7Xr5y4cUse/BhJk9+BQBrH3uc6dN259Lzv8S03afWWe64m7LnYXWXUKtzzp7Pe045mcHBQSZPfgW77LIzV377f5n7/o/WXVqtBl9YtcUDtefO+cuOM2fHM78+LgO8djPaFrDnJs7PqJ7TZkzbfSqv2mM6D/18JQC33Hk3f/Dafbnx6ku59ooFXHvFAvqmT+O/L/y3bS5kBWd+6lz22mcO+772EN7z3g9z/fU/2OZDtmuy1fkxTtrNaD8GLKla5Eeqc78H7AucWrKwJvjk6X/NGWd9ng2DG3j1njP4zCdPr7skqfl68LsORh0dAETEBIavtI28GHZ7ZnY0cd5WRwca3bY+OtCmdWV08A/v6nx08OlLx2V00HbXQWa2gFvGoRZJ2nJ+TaIkFdaDowODVlKjdLBta9wZtJKaxY5WkgozaCWpMH9uXJLK8jfDJKk0g1aSCnPXgSQVZkcrSYUZtJJUVg45OpCksuxoJakst3dJUmkGrSQV1nsjWoNWUrPkYO8lrUErqVl6L2cNWknN4sUwSSrNjlaSyrKjlaTS7GglqawcrLuClzJoJTVKD/7aOBPqLkCSuqo1hqMDETExIu6KiKuqx3tHxK0RsTwiFkbE9u3WMGglNUq2Oj86dBqwdMTjzwFfzMx9gaeAee0WMGglNUo3gzYiZgFvBr5aPQ7gCODy6iULgJParWPQSmqUHIqOj4joj4g7Rhz9Gy33JeDv+O2gYXfg6cwXL7mtBGa2q8mLYZIaZSwXwzJzABjY1HMR8RZgXWbeGRGHb0lNBq2kRslWdGupQ4G3RsTxwGRgF+A8YNeImFR1tbOAVe0WcnQgqVG6NaPNzE9k5qzM3At4F/C9zHwPcD1wcvWyucCidjUZtJIaJTM6Pl6mM4CPR8Ryhme2F7R7g6MDSY1S4gMLmXkDcEN1fwVw0Fjeb9BKapTWUNdmtF1j0EpqlC5eDOsag1ZSoxi0klRY9t7X0Rq0kprFjlaSCtuCbVvFGLSSGmXIXQeSVJYdrSQV5oxWkgpz14EkFWZHK0mFDbV677uyDFpJjeLoQJIKa7nrQJLKcnuXJBW2TY4Odp51eOk/oa3Qf+zxZ3WXoIZydCBJhbnrQJIK68HJgUErqVkcHUhSYe46kKTCCvwI7hYzaCU1SmJHK0lFDTo6kKSy7GglqTBntJJUWC92tL33EQpJ2gKtMRyjiYjJEXFbRNwTEfdFxFnV+b0j4taIWB4RCyNi+3Y1GbSSGmWI6Pho4zfAEZn5euBA4NiIOAT4HPDFzNwXeAqY124hg1ZSo7Si82M0OezZ6uF21ZHAEcDl1fkFwEntajJoJTVKi+j4aCciJkbE3cA64DrgQeDpzBysXrISmNluHYNWUqPkGI6I6I+IO0Yc/b+zVuZQZh4IzAIOAvZ/OTW560BSo4xle1dmDgADHbzu6Yi4HngTsGtETKq62lnAqnbvt6OV1CitiI6P0UTE9IjYtbo/BTgaWApcD5xcvWwusKhdTXa0khplqHtLzQAWRMREhpvSyzLzqoj4KXBpRJwN3AVc0G4hg1ZSo7TbTdCpzPwx8IZNnF/B8Ly2YwatpEbpZDfBeDNoJTWKP2UjSYV1a3TQTQatpEbx27skqbAhO1pJKsuOVpIKM2glqbAe/Mkwg1ZSs9jRSlJhXfwIbtcYtJIaxX20klSYowNJKsyglaTC/K4DSSrMGa0kFeauA0
kqrNWDwwODVlKjeDFMkgrrvX7WoJXUMHa0klTYYPReT2vQSmqU3otZg1ZSwzg6kKTC3N4lSYX1XswatJIaxtGBJBU21IM9rUErqVF6saOdUHcBktRNOYb/RhMRr46I6yPipxFxX0ScVp2fGhHXRcSy6na3djUZtJIapTWGo41B4G8z8wDgEOBvIuIAYD6wJDNnA0uqx6NydDBOHnjgB6xf/xxDQ0MMDg5x6KFvqbskjbMdZ0zl8PM+xJRpr4RMln7zeu674BqO+Mqp7PqaGQBsv8sOvPDMr/jWMWfWXO3Wq1vbuzJzDbCmur8+IpYCM4ETgcOrly0AbgDOGG0tg3YcHXPMO3niiafqLkM1aQ21uOXT3+SJex9mux0n87bvfIZVN/6E73343198zcF/fwovrP9VjVVu/cYSsxHRD/SPODWQmQObeN1ewBuAW4G+KoQBHgX62v0dg1YaJ8+ve5rn1z0NwIbnfs1Ty1az46um8vSy1S++Zp8TDubqd362rhIbYXAMUVuF6kuCdaSI2Am4AvhYZj4T8dufcMjMjGj/5QrOaMdJZnLVVRdz881XM2/eKXWXo5rtNGsa0173+6y768EXz73q4P14/rFf8sxDa2usbOvXrYthABGxHcMh+43M/FZ1em1EzKienwGsa7fOy+5oI+IDmfm1zTz3Yjs+adJuTJy408v9M41xxBFvZ/XqtUyfvjtXX/0NHnhgOTfddFvdZakGk3Z4BUcNnMYP/+liNjz7/IvnX3Pim3hw0Q9rrKwZurW9K4Zb1wuApZn5hRFPLQbmAudWt4varbUlHe1Zm3siMwcyc05mzjFkh61ePdylPPbYEyxefA1z5hxYc0WqQ0yayNEDp/HglTfz8Hfu+O35iRPY67g3suJ/bq2xumboYkd7KPA+4IiIuLs6jmc4YI+OiGXAUdXjUY3a0UbEjzf3FB0MgDVshx2mMGHCBJ599jl22GEKRx55GJ/97Hl1l6Ua/Ok/f5Cnlq/mJ+d/53fOzzzsdfzywdU8t+bJmiprjm51tJl5E8NZtylHjmWtdqODPuAYYONL5QHcPJY/tC3r65vOwoXD8/ZJkyaxcOG3ue6679dclcZb3xtfy+yTD+OJpb/gL645B4DbP3cZj3zvHl7z1kN48NuODbphKLe+j+BeBeyUmXdv/ERE3FCkogZ66KFfcNBBx9Zdhmq29vafcf6s927yue9/fNQL3xqDre5rEjNz3ijPeelcUs/pZDfBeHMfraRG6cUvlTFoJTXKVjc6kKStjaMDSSpsa9x1IElbFUcHklSYF8MkqTBntJJUmKMDSSosvRgmSWX5c+OSVJijA0kqzNGBJBVmRytJhbm9S5IK8yO4klSYowNJKsyglaTC3HUgSYXZ0UpSYe46kKTChrL3vijRoJXUKM5oJakwZ7SSVFgvzmgn1F2AJHVTK7Pjo52IuDAi1kXEvSPOTY2I6yJiWXW7W7t1DFpJjZJj+K8DFwHHbnRuPrAkM2cDS6rHozJoJTXKULY6PtrJzBuBJzc6fSKwoLq/ADip3TrOaCU1Sicjgf8XEf1A/4hTA5k50OZtfZm5prr/KNDX7u8YtJIaZSwXw6pQbReso70/I6LtHzRoJTXKWDral2ltRMzIzDURMQNY1+4NzmglNUqXL4ZtymJgbnV/LrCo3RvsaCU1ylAOdW2tiLgEOByYFhErgX8EzgUui4h5wM+Bd7Rbx6CV1Cjd/AhuZr57M08dOZZ1DFpJjeJHcCWpML9URpIKG4ddB2Nm0EpqlF78UhmDVlKj+MXfklSYM1pJKswZrSQVZkcrSYW5j1aSCrOjlaTC3HUgSYV5MUySCnN0IEmF+ckwSSrMjlaSCuvFGW30Yvo3VUT0d/ALm9rG+O+i+fzNsPHV3/4l2gb576LhDFpJKsyglaTCDNrx5RxOm+K/i4bzYpgkFWZHK0mFGbSSVJhBO04i4tiIeCAilkfE/LrrUf0i4sKIWBcR99Zdi8oyaMdBRE
wEvgwcBxwAvDsiDqi3KvWAi4Bj6y5C5Rm04+MgYHlmrsjMF4BLgRNrrkk1y8wbgSfrrkPlGbTjYybwyIjHK6tzkrYBBq0kFWbQjo9VwKtHPJ5VnZO0DTBox8ftwOyI2DsitgfeBSyuuSZJ48SgHQeZOQicClwDLAUuy8z76q1KdYuIS4AfAvtFxMqImFd3TSrDj+BKUmF2tJJUmEErSYUZtJJUmEErSYUZtJJUmEErSYUZtJJU2P8B7rxs76CWsusAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# importing the required modules\n", + "import seaborn as sns\n", + "from sklearn.metrics import confusion_matrix\n", + "\n", + "# passing actual and predicted values\n", + "cm = confusion_matrix(y_test, y_pred1)\n", + "\n", + "# true write data values in each cell of the matrix\n", + "sns.heatmap(cm,annot=True)\n", + "plt.savefig('confusion.png')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aYXctkgDfwUk", + "outputId": "475cf0c1-3d1a-425f-bb61-c68a5472e241" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.94 0.93 68\n", + " 1 0.87 0.84 0.86 32\n", + "\n", + " accuracy 0.91 100\n", + " macro avg 0.90 0.89 0.90 100\n", + "weighted avg 0.91 0.91 0.91 100\n", + "\n" + ] + } + ], + "source": [ + "# importing classification report\n", + "from sklearn.metrics import classification_report\n", + "\n", + "# printing the report\n", + "print(classification_report(y_test, y_pred1))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P6wVkNPTmSrd" + }, + "source": [ + "# Features Encoding\n", + "\n", + "**In real life, the data does not always consist of numeric values. For example, playing or not playing are not numeric values. In such scenarios, we need to convert the non-numeric data to numeric values before feeding data to our model. 
For example, we have the following dataset about whether players will play sport or not, depending on the weather.**" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "_kF2H9Q0gNqz" + }, + "outputs": [], + "source": [ + "# assigning features and label variables\n", + "weather = ['Sunny','Sunny','Overcast','Rainy','Rainy','Rainy','Overcast','Sunny','Sunny', 'Rainy','Sunny','Overcast','Overcast','Rainy','Rainy']\n", + "\n", + "# output class\n", + "play = ['No','No','Yes','No','Yes','No','Yes','No','Yes','Yes','Yes','Yes','Yes','No','Yes']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "uAuoT9z5gRAt" + }, + "outputs": [], + "source": [ + "# Import LabelEncoder\n", + "from sklearn import preprocessing\n", + "\n", + "# creating LabelEncoder\n", + "labelCode = preprocessing.LabelEncoder()\n", + "\n", + "# Converting string labels into numbers.\n", + "weather_encoded=labelCode.fit_transform(weather)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ollTUMu6mbFt" + }, + "source": [ + "***Note: both the input and the output are non-numeric values. Before feeding this data to our model, we have to encode the non-numeric values into numeric ones. For example, Overcast = 0, Rainy = 1, Sunny = 2. This is called label encoding.***" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hioaVahZgTzj", + "outputId": "edd46685-d470-4bf0-d770-5e8cd7becf46" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2 2 0 1 1 1 0 2 2 1 2 0 0 1 1]\n" + ] + } + ], + "source": [ + "print(weather_encoded)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LuwNOD_6miJt" + }, + "source": [ + "The LabelEncoder will convert the string values to numeric values. For example, if we print the encoded weather, it will contain only numeric values instead of strings."
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "r8UkJa2QgdWN" + }, + "outputs": [], + "source": [ + "# import LabelEncoder\n", + "from sklearn import preprocessing\n", + "\n", + "# creating LabelEncoder\n", + "labelCode = preprocessing.LabelEncoder()\n", + "\n", + "# converting string labels into numbers.\n", + "label=labelCode.fit_transform(play)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SupK1QVUsV5y", + "outputId": "55f01cdd-d324-471f-ba33-86716cd0d99b" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "label" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CvCfxpdVnbdx" + }, + "source": [ + "**Generating model**\n", + "\n", + "We have already seen that our input values are in a single-dimensional array. By default, the model training takes values in multi-dimensional arrays. If we feed the data without further changes, we will get the following error." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 426 + }, + "id": "EXIhGukIgfyt", + "outputId": "0e54c35d-3bff-4421-9210-c0f8c1b3ab9e" + }, + "outputs": [ + { + "ename": "ValueError", + "evalue": "ignored", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;31m# train the model using the training sets\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweather_encoded\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/naive_bayes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 245\u001b[0m return self._partial_fit(\n\u001b[0;32m--> 246\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_refit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0msample_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msample_weight\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 247\u001b[0m )\n\u001b[1;32m 248\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/naive_bayes.py\u001b[0m in \u001b[0;36m_partial_fit\u001b[0;34m(self, X, y, classes, _refit, sample_weight)\u001b[0m\n\u001b[1;32m 400\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 401\u001b[0m \u001b[0mfirst_call\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_check_partial_fit_first_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclasses\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 402\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfirst_call\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 403\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 404\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_check_sample_weight\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/base.py\u001b[0m in \u001b[0;36m_validate_data\u001b[0;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[1;32m 579\u001b[0m \u001b[0my\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 580\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 581\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 582\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 583\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[0mensure_min_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mensure_min_samples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 975\u001b[0m \u001b[0mensure_min_features\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mensure_min_features\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 976\u001b[0;31m \u001b[0mestimator\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 977\u001b[0m )\n\u001b[1;32m 978\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m 771\u001b[0m \u001b[0;34m\"Reshape your data either using array.reshape(-1, 1) if \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 772\u001b[0m \u001b[0;34m\"your data has a single feature or array.reshape(1, -1) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 773\u001b[0;31m \u001b[0;34m\"if it contains a single sample.\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 774\u001b[0m )\n\u001b[1;32m 775\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Expected 2D array, got 1D array instead:\narray=[2 2 0 1 1 1 0 2 2 1 2 0 0 1 1].\nReshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample." 
+ ] + } + ], + "source": [ + "# import Gaussian Naive Bayes model\n", + "from sklearn.naive_bayes import GaussianNB\n", + "\n", + "# create a Gaussian Classifier\n", + "model = GaussianNB()\n", + "\n", + "# train the model using the training sets\n", + "\n", + "model.fit(weather_encoded, label)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cYFFNSAAEPZL", + "outputId": "37046ed5-fa6f-4a10-9099-cd485f87d61d" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([2, 2, 0, 1, 1, 1, 0, 2, 2, 1, 2, 0, 0, 1, 1])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weather_encoded" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Al_ML5RrnjxG" + }, + "source": [ + "So, we need to convert our data to the 2D array before feeding it to our model. Here we will use NumPy array and reshape() method to create a 2D array." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4hkxPdnmgqmV" + }, + "outputs": [], + "source": [ + "# importing numpy module\n", + "import numpy as np\n", + "\n", + "# converting 1D array to 2D\n", + "weather_2d = np.reshape(weather_encoded, (-1, 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ROYkyf75EGSL", + "outputId": "f8c55973-ac70-44ea-e022-c72e76d337ae" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[2],\n", + " [2],\n", + " [0],\n", + " [1],\n", + " [1],\n", + " [1],\n", + " [0],\n", + " [2],\n", + " [2],\n", + " [1],\n", + " [2],\n", + " [0],\n", + " [0],\n", + " [1]])" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weather_2d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oDNszamvhH_V", + "outputId": "86689611-f9c9-42c1-9475-19a568039f78" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "GaussianNB()" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# import Gaussian Naive Bayes model\n", + "from sklearn.naive_bayes import GaussianNB\n", + "\n", + "# create a Gaussian Classifier\n", + "model = GaussianNB()\n", + "\n", + "# train the model using the training sets\n", + "model.fit(weather_2d, label)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IslWW4zhnqW9" + }, + "source": [ + "We had used the Gaussian Naive Bayes classifier to train our model. Let us predict the output by providing a testing input." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZHm6dbh6hNFS", + "outputId": "45dba2ea-0a29-448a-f81c-43cdcba35afe" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1]\n" + ] + } + ], + "source": [ + "# predicting with the model\n", + "predicted= model.predict([[1]]) # 1: Rainy (0: Overcast, 2: Sunny)\n", + "\n", + "# printing predicted value\n", + "print(predicted)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QarFwQTAnt2V" + }, + "source": [ + "The output value 1 indicates that players will play when the weather is Rainy (encoded as 1)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JOWEHG_3n59c" + }, + "source": [ + "# Naive Bayes Classification with Multiple Labels" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Uf01TpNJn-NF" + }, + "source": [ + "For the training, we will use the built-in data set from the sklearn module named load_wine. This dataset results from a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars.\n", + "\n", + "The dataset consists of 13 features (alcohol, malic_acid, ash, alcalinity_of_ash, magnesium, total_phenols, flavanoids, nonflavanoid_phenols, proanthocyanins, color_intensity, hue, od280/od315_of_diluted_wines, proline) and type of wine cultivar. This data has three types of wine: class_0, class_1, and class_2. We can build a model to classify the type of wine using Naive Bayes Classification."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LRoCA-OSoDvG" + }, + "source": [ + "# Loading and Exploring dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_53JwySShW67" + }, + "outputs": [], + "source": [ + "# import scikit-learn dataset library\n", + "from sklearn import datasets\n", + "\n", + "# load dataset\n", + "dataset = datasets.load_wine()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zX5zWs6voNp3" + }, + "source": [ + "Next, we can print the names of the input features and the target classes to confirm that we loaded the desired dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2IFEbfPZhZ1w", + "outputId": "3bffd787-1459-4a55-bb45-7cf540b78c07" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inputs: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']\n", + "Outputs: ['class_0' 'class_1' 'class_2']\n" + ] + } + ], + "source": [ + "# print the names of the 13 features\n", + "print (\"Inputs: \", dataset.feature_names)\n", + "\n", + "# print the label type of wine\n", + "print (\"Outputs: \", dataset.target_names)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N8LphfJSoRNV" + }, + "source": [ + "We check the type of data (numeric/non-numeric) by printing three rows from the dataset."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XDcx91huhfwx", + "outputId": "e2b85589-501d-4e3c-8b81-f99b2a5af06e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00\n", + " 2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]\n", + " [1.320e+01 1.780e+00 2.140e+00 1.120e+01 1.000e+02 2.650e+00 2.760e+00\n", + " 2.600e-01 1.280e+00 4.380e+00 1.050e+00 3.400e+00 1.050e+03]\n", + " [1.316e+01 2.360e+00 2.670e+00 1.860e+01 1.010e+02 2.800e+00 3.240e+00\n", + " 3.000e-01 2.810e+00 5.680e+00 1.030e+00 3.170e+00 1.185e+03]]\n" + ] + } + ], + "source": [ + "# print the wine data features\n", + "print(dataset.data[0:3])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hUYWPPUToUmG" + }, + "source": [ + "We can also check the output values to verify that it is a multi-class classification dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GHr8PqBShjVS", + "outputId": "aa84a472-1890-4ffe-837d-ec810d5912c1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n", + " 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]\n" + ] + } + ], + "source": [ + "# print the wine labels \n", + "print(dataset.target)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "baJqnPG7oXCW" + }, + "source": [ + "# Training the model using multiclass labels\n", + "\n", + "Before feeding the dataset to our model, let us split the dataset into training and testing parts to evaluate our model by providing the testing dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "owDZfBk-hofQ" + }, + "outputs": [], + "source": [ + "# import train_test_split function\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# input and outputs\n", + "inputs = dataset.data\n", + "outputs = dataset.target\n", + "\n", + "# split dataset into training set and test set\n", + "X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.3, random_state=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9-jVi-5khrhD" + }, + "outputs": [], + "source": [ + "# import Gaussian Naive Bayes model\n", + "from sklearn.naive_bayes import GaussianNB\n", + "\n", + "# create a Gaussian Classifier\n", + "classifer = GaussianNB()\n", + "\n", + "# train the model using the training sets\n", + "classifer.fit(X_train, y_train)\n", + "\n", + "# predict the response for test dataset\n", + "y_pred = classifer.predict(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bHUfAB8Nocn_" + }, + "source": [ + "Note: we have used the Gaussian Naive Bayes classification method for the training.\n", + "\n", + "Let us now check the accuracy of our model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HjM-3pk_hvrN", + "outputId": "19bcc8f0-191e-49c3-de49-89cdb3359118" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9814814814814815\n" + ] + } + ], + "source": [ + "# import scikit-learn metrics module for accuracy calculation\n", + "from sklearn import metrics\n", + "\n", + "# printing accuracy\n", + "print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q0wz-x-EogI8" + }, + "source": [ + "**We got 98% accurate results, which is pretty high accuracy.**" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "iNgGKKVZomII" + }, + "source": [ + "**Evaluation of Naive Bayes Classifier for Multiclassification**\n", + "\n", + "Confusion Matrix is not only used to evaluate binary classification. It can also be useful in evaluating multiclass classification problems as well. The number of columns/rows will equal the number of output classes.\n", + "\n", + "Let’s evaluate our model, which was trained by multi-labeled data using a confusion matrix." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + }, + "id": "MK-u8Y7NhzSe", + "outputId": "346cfc61-87c3-470f-8fae-0637ee87fc19" + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVoAAAD4CAYAAACt8i4nAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAASPklEQVR4nO3deZhU5ZXH8d8pRB+DmkhAoJtWUBi3cRRFoo/RgIzgGBW3wRgX4ji2YzDCTNzGaEwek+gscc0ydiIBF1SiGNyG0RAjYFxA5XGgURFB6W4WFR13oarO/EHRlkJ3VXfXW7f6re+H53267r1V9x6v7eF47ntvmbsLABBOKukAACB2JFoACIxECwCBkWgBIDASLQAEtk3oA2x86zWmNQS2fc3hSYcAlER6Q7N1dR8dyTk9++ze5eMVg4oWAAILXtECQFllM0lHsAUSLYC4ZNJJR7AFEi2AqLhnkw5hCyRaAHHJkmgBICwqWgAIjIthABAYFS0AhOXMOgCAwLgYBgCB0ToAgMC4GAYAgVHRAkBgXAwDgMC4GAYAYbnTowWAsOjRAkBgtA4AIDAqWgAILLMx6Qi2QKIFEBdaBwAQGK0DAAiMihYAAiPRAkBYXoEXw1JJBwAAJeXZ4kc7zKzOzB43s0YzW2Jmk3Lre5vZY2a2LPdz50IhkWgBxCWbLX60Ly3p++6+j6RDJE00s30kXSZpjrsPlTQnt9wuEi2AuJSoonX31e7+fO71+5KWSqqVNE7StNzbpkk6oVBI9GgBxKUDF8PMrF5Sfd6qBndv2Mr7BkkaJukZSf3cfXVu0xpJ/Qodh0QLIC4dmEebS6pbJNZ8ZraDpPskTXb398ws//NuZl7oOCRaAHFJl+7B32bWU5uS7J3uPjO3eq2ZDXD31WY2QNK6Qvup6h7t6rVv6uwLLtXxp9dr3Onn6fYZf5Ak3dxwm04863ydPGGizp18uda9+XbCkcZl7JiRWrJ4rl5qnK9LLp6YdDhRqupzXLpZBybpVklL3f26vE0PSJqQez1B0qxCIZl7waq3Sza+9VrYA3TBm2+t15tvr9c+ew7Rhx9+pPHnXKibrrlS/Xbpox169ZIk3fH7WVq+4g1ddcn3Eo62bdvXHJ50CEVLpVJaumSejj7mNDU1rdbTTz2iM878rpYuXZZ0aNHozuc4vaHZCr+rfR8/8J9F55ztj7+ozeOZ2dclzZ
P0v5I2Z+XLtalPO0PSrpJelzTe3de3d5yqbh307dNbffv0liT16vUl7b5bnda++bb2GLxb63s+/vgTWZf/1WOzEQcP0/LlK7VixRuSpBkzZun448Z2iyTQXVT9OS7Rsw7cfb6ktv7rH92RfVV1os3XvHqtli5brr/Zd09J0o23TNUDs+dox169NOXmaxOOLh41tf21qqmldbmpebVGHDwswYjiU/XnuAJvwS3YozWzvczsUjO7KTcuNbO9yxFcuXz00cf65x/8RJdeeF5ry2DSed/RnPtv1zfHjNL0+x5MOEIARStRj7aU2k20ZnappLu1qXx+NjdM0l1m1ubdEGZWb2YLzWzhb2+7q5TxltzGdFqTf/ATfXPMKB018rAtth87ZpT++OcnE4gsTi3Na1Q3sKZ1eWDtALW0rEkwovhU/TlOp4sfZVKodXCOpH3d/XNPaTCz6yQtkbTV/6fOn5tWyRfD3F0/vOYG7b5bnSZ866TW9a+vatZudbWSpD/Ne0qDdxuYVIjRWbBwkYYMGaxBg+rU3LxG48eP05lnVdlV8cCq/hwHvsDfGYUSbVZSjTZdWcs3QJ9dheu2XnhxiR6cPUdD9xikkyds+kWcdN4EzXzoUa18o0mWMtX030U/vLhyZxx0N5lMRpMmX6FHHp6uHqmUpk67R42NryQdVlSq/hxXYI+23eldZna0pF9IWiZpVW71rpKGSLrA3WcXOkAlV7Sx6E7Tu4D2lGR6151XFj+96/SryzKnqN2K1t1nm9lfSRqhTQ9TkKRmSQvcPRM6OADosO74VTbunpX0dBliAYCuy1ReDcg8WgBxqcAeLYkWQFxItAAQWHfs0QJAd+LZypvoRKIFEBdaBwAQGLMOACAwKloACIxECwCBdcOHygBA90JFCwCBMb0LAAJj1gEAhOW0DgAgMFoHABAYzzoAgMCoaAEgsDQXwwAgLFoHABAYrQMACIvpXQAQGhUtAARGogWAwLgFFwDC4jvDACA0Ei0ABMasAwAIjIoWAAIj0QJAWJ6pwtZBv8FjQx+i6r1340lJhxC9nSbNTDoEFKsCK9pU0gEAQCl51osehZjZFDNbZ2aL89b9yMyazWxRbhxTaD8kWgBxyXrxo7Cpko7eyvrr3f2A3Hik0E7o0QKISwlbtO4+18wGdXU/VLQAouLpbNHDzOrNbGHeqC/yMBeY2Yu51sLOhd5MogUQl2zxw90b3H143mgo4gi/lrSHpAMkrZb080IfoHUAICqhn3Xg7ms3vzaz30h6qNBnqGgBxKUDFW1nmNmAvMUTJS1u672bUdECiEopK1ozu0vSSEl9zKxJ0lWSRprZAZJc0kpJ5xXaD4kWQFxKO+vgtK2svrWj+yHRAoiKp5OOYEskWgBRqcBvGyfRAogMiRYAwqKiBYDASLQAEJhnLOkQtkCiBRAVKloACMyzVLQAEBQVLQAE5k5FCwBBUdECQGBZZh0AQFhcDAOAwEi0ABCYh/2ChU4h0QKIChUtAATG9C4ACCzDrAMACIuKFgACo0cLAIEx6wAAAqOiBYDAMtlU0iFsofIiStDNv7pGL7/2tJ585uGkQ4nKjx5drCNveVyn3P5k67qX172ns+5+Rqfe8ZS+Pf1pLV7zfwlGGJ+xY0ZqyeK5eqlxvi65eGLS4ZSVe/GjXEi0eabfOVN/f+I/JB1GdI7bp0a/PPGgz627Yf4y1X9td91zxqE6/9A9dMO8VxKKLj6pVEo33fhTHXvcGdpv/1E69dQTtPfeQ5MOq2yybkWPciHR5nnqyQV65x0qq1I7aGBvfXm7np9bZ5I+3JCRJH3waVp9d9gugcjiNOLgYVq+fKVWrHhDGzdu1IwZs3T8cWOTDqts3K3oUS70aJGIi0buqYn3P6/r572srEtTTx2RdEjRqKntr1VNLa3LTc2rNeLgYQlGVF6VOOug0xWtmZ3dzrZ6M1toZgs/3UiFiC39/sUmff+IPTX7H7+hi76xp3782JKkQ0IkYmsd/L
itDe7e4O7D3X34dj2/3IVDIFYPNbZo9JBdJElHDe2nJWv5C7lUWprXqG5gTevywNoBamlZk2BE5ZXJpooe5dJu68DMXmxrk6R+pQ8H1aJvr+30XNM7Gl7XW8+uWq9dv/KlpEOKxoKFizRkyGANGlSn5uY1Gj9+nM48q3pmHlRg56Bgj7afpLGS3vnCepP0lyARJeg3U67XYYeP0Fe/urMWvzRP1/7sRt1x271Jh9XtXfbIi3quab3e/WSjxv72Cf3TIXvoyr/dR//xxEtKZ13b9UjpitH7Jh1mNDKZjCZNvkKPPDxdPVIpTZ12jxobq2dWRzlbAsUyb6dzbGa3Svqdu8/fyrbp7v7tQgfovePQSvwLJipN/3500iFEb6dJM5MOoSqkNzR3OUs+2f+UonPOYWvuLUtWbreidfdz2tlWMMkCQLlV4JfgMr0LQFxcldc6INECiEq6Anu0JFoAUaGiBYDA6NECQGCVWNHyUBkAUcl2YBRiZlPMbJ2ZLc5b19vMHjOzZbmfOxfaD4kWQFQysqJHEaZK+uJE9cskzXH3oZLm5JbbRaIFEJWsFT8Kcfe5ktZ/YfU4SdNyr6dJOqHQfki0AKKSlRU98p80mBv1RRyin7uvzr1eoyKe+8LFMABR6cg9/+7eIKmh08dydzMreEgqWgBRKeXFsDasNbMBkpT7ua7QB0i0AKKSNSt6dNIDkibkXk+QNKvQB2gdAIhKpoT7MrO7JI2U1MfMmiRdJelaSTPM7BxJr0saX2g/JFoAUSlmNkGx3P20NjaN7sh+SLQAopKtwDvDSLQAolKJ3zRAogUQlVK2DkqFRAsgKjy9CwACy1DRAkBYVLQAEBiJFgACq8CvDCPRAogLFS0ABFbKW3BLhUQLICrMowWAwGgdAEBgJFoACIxnHQBAYPRoASCwqpx18N6nH4U+RNXbadLMpEOI3gu1ByYdAoqUrcDmARUtgKhwMQwAAqu8epZECyAyVLQAEFjaKq+mJdECiErlpVkSLYDI0DoAgMCY3gUAgVVemiXRAogMrQMACCxTgTUtiRZAVKhoASAwp6IFgLCoaAEgMKZ3AUBglZdmSbQAIpOuwFRLogUQFS6GAUBgXAwDgMCoaAEgMCpaAAgs41S0ABBUKefRmtlKSe9r07eYp919eGf2Q6IFEJUAPdpR7v5WV3ZAogUQlUrs0aaSDgAASikrL3oUwSU9ambPmVl9Z2OiogUQlY60DnLJMz+BNrh7Q97y19292cx2kfSYmb3k7nM7GhOJFkBUOjLrIJdUG9rZ3pz7uc7M7pc0QlKHEy2tAwBRKVXrwMx6mdmOm19LGiNpcWdioqIFEJUSXgzrJ+l+M5M25crp7j67Mzsi0QKISqmmd7n7a5L2L8W+SLQAolKJD/6mR5tn7JiRWrJ4rl5qnK9LLp6YdDjR4jyXXu2/Xai9F9yuobN/0bqu/7+eraF//LWG/PdN2vW/Lldqx14JRlg+7l70KBcSbU4qldJNN/5Uxx53hvbbf5ROPfUE7b330KTDig7nOYx37pujFd/50efWfTB/kZaNnahX/+5CbVjRrF2+e0oywZVZRl70KBcSbc6Ig4dp+fKVWrHiDW3cuFEzZszS8ceNTTqs6HCew/jo2SXKvPv+59Z9MO8FKbPp0tBHL7ysnv37JBFa2ZX4hoWSINHm1NT216qmltblpubVqqnpn2BEceI8J2Pn8Ufp/SeeSzqMsuiWrQMz28vMRpvZDl9Yf3S4sACUSt+J4+XpjN79w5+TDqUsul1Fa2YXSpol6XuSFpvZuLzNP2vnc/VmttDMFmazH5Ym0sBamteobmBN6/LA2gFqaVmTYERx4jyX11dOHq2djjxYqyb/POlQysY78KdcClW050o6yN1PkDRS0pVmNim3zdr6kLs3uPtwdx+eSnWPK50LFi7SkCGDNWhQnXr27Knx48fpwYceTTqs6HCey2eHIw5U3/NO0spzr5Z/8mnS4ZRNxr3oUS6F5tGm3P
0DSXL3lWY2UtK9Zrab2km03VEmk9GkyVfokYenq0cqpanT7lFj4ytJhxUdznMYdTdepF6H7Kdtdt5Je/3ld1p7w3T1Pf8U2bY9Nfj2qyVtuiDWcsWvEo40vEqcR2vtNYTN7E+S/sXdF+Wt20bSFEmnu3uPQgfYZtvayvunBjrohdoDkw6hKuy34sEuF3CH1o4qOuc81fx4WQrGQhXtWZLS+SvcPS3pLDO7JVhUANBJ5ZxNUKx2E627N7Wz7cnShwMAXVOJrQOedQAgKuWcTVAsEi2AqGS88r41jEQLICrdrkcLAN0NPVoACIweLQAElqV1AABhUdECQGDMOgCAwGgdAEBgtA4AIDAqWgAIjIoWAALLeCbpELZAogUQFW7BBYDAuAUXAAKjogWAwJh1AACBMesAAALjFlwACIweLQAERo8WAAKjogWAwJhHCwCBUdECQGDMOgCAwLgYBgCBVWLrIJV0AABQSt6BP4WY2dFm9rKZvWpml3U2JipaAFEpVUVrZj0k/VLSUZKaJC0wswfcvbGj+yLRAohKCXu0IyS96u6vSZKZ3S1pnKTKS7TpDc0W+hilZmb17t6QdBwx4xyHV63nuCM5x8zqJdXnrWrIO2e1klblbWuS9LXOxESPduvqC78FXcQ5Do9zXIC7N7j78LwR5C8mEi0AbF2zpLq85YG5dR1GogWArVsgaaiZDTazbSV9S9IDndkRF8O2rur6WgngHIfHOe4Cd0+b2QWS/kdSD0lT3H1JZ/ZllTi5FwBiQusAAAIj0QJAYCTaPKW63Q5tM7MpZrbOzBYnHUuszKzOzB43s0YzW2Jmk5KOqdrRo83J3W73ivJut5N0Wmdut0PbzOwISR9Ius3d/zrpeGJkZgMkDXD3581sR0nPSTqB3+XkUNF+pvV2O3ffIGnz7XYoIXefK2l90nHEzN1Xu/vzudfvS1qqTXc5ISEk2s9s7XY7fjnRrZnZIEnDJD2TbCTVjUQLRMrMdpB0n6TJ7v5e0vFUMxLtZ0p2ux2QNDPrqU1J9k53n5l0PNWORPuZkt1uByTJzEzSrZKWuvt1SccDEm0rd09L2ny73VJJMzp7ux3aZmZ3SXpK0p5m1mRm5yQdU4QOk3SmpCPNbFFuHJN0UNWM6V0AEBgVLQAERqIFgMBItAAQGIkWAAIj0QJAYCRaAAiMRAsAgf0/GVSjgf5sPHMAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# importing the required modules\n", + "import seaborn as sns\n", + "from sklearn.metrics import confusion_matrix\n", + "\n", + "# passing actual and predicted values\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "\n", + "# true Write data values in each cell of the matrix\n", + "sns.heatmap(cm, annot=True)\n", + "plt.savefig('confusion.png')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rs3l2Gp_h2c0", + "outputId": "02d3aed2-7c83-4094-ab99-45ea9513dc0e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.96 1.00 0.98 23\n", + " 1 1.00 0.95 0.97 19\n", + " 2 1.00 1.00 1.00 12\n", + "\n", + " accuracy 0.98 54\n", + " macro avg 0.99 0.98 0.98 54\n", + "weighted avg 0.98 0.98 0.98 54\n", + "\n" + ] + } + ], + "source": [ + "# Importing classification report\n", + "from sklearn.metrics import classification_report\n", + "\n", + "# printing the report\n", + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pnCYm-1fqNTf" + }, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oVNqnU9Ayl7O" + }, + "outputs": [], + "source": [ + "# importing the dataset\n", + "#Import scikit-learn dataset library\n", + "from sklearn import datasets\n", + "#Load dataset\n", + "cancer = datasets.load_breast_cancer()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f33UdovIyxW5", + "outputId": "93f9ad67-5683-4da6-ca33-b810f447a39b" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'DESCR': '.. 
_breast_cancer_dataset:\\n\\nBreast cancer wisconsin (diagnostic) dataset\\n--------------------------------------------\\n\\n**Data Set Characteristics:**\\n\\n :Number of Instances: 569\\n\\n :Number of Attributes: 30 numeric, predictive attributes and the class\\n\\n :Attribute Information:\\n - radius (mean of distances from center to points on the perimeter)\\n - texture (standard deviation of gray-scale values)\\n - perimeter\\n - area\\n - smoothness (local variation in radius lengths)\\n - compactness (perimeter^2 / area - 1.0)\\n - concavity (severity of concave portions of the contour)\\n - concave points (number of concave portions of the contour)\\n - symmetry\\n - fractal dimension (\"coastline approximation\" - 1)\\n\\n The mean, standard error, and \"worst\" or largest (mean of the three\\n worst/largest values) of these features were computed for each image,\\n resulting in 30 features. For instance, field 0 is Mean Radius, field\\n 10 is Radius SE, field 20 is Worst Radius.\\n\\n - class:\\n - WDBC-Malignant\\n - WDBC-Benign\\n\\n :Summary Statistics:\\n\\n ===================================== ====== ======\\n Min Max\\n ===================================== ====== ======\\n radius (mean): 6.981 28.11\\n texture (mean): 9.71 39.28\\n perimeter (mean): 43.79 188.5\\n area (mean): 143.5 2501.0\\n smoothness (mean): 0.053 0.163\\n compactness (mean): 0.019 0.345\\n concavity (mean): 0.0 0.427\\n concave points (mean): 0.0 0.201\\n symmetry (mean): 0.106 0.304\\n fractal dimension (mean): 0.05 0.097\\n radius (standard error): 0.112 2.873\\n texture (standard error): 0.36 4.885\\n perimeter (standard error): 0.757 21.98\\n area (standard error): 6.802 542.2\\n smoothness (standard error): 0.002 0.031\\n compactness (standard error): 0.002 0.135\\n concavity (standard error): 0.0 0.396\\n concave points (standard error): 0.0 0.053\\n symmetry (standard error): 0.008 0.079\\n fractal dimension (standard error): 0.001 0.03\\n radius (worst): 7.93 
36.04\\n texture (worst): 12.02 49.54\\n perimeter (worst): 50.41 251.2\\n area (worst): 185.2 4254.0\\n smoothness (worst): 0.071 0.223\\n compactness (worst): 0.027 1.058\\n concavity (worst): 0.0 1.252\\n concave points (worst): 0.0 0.291\\n symmetry (worst): 0.156 0.664\\n fractal dimension (worst): 0.055 0.208\\n ===================================== ====== ======\\n\\n :Missing Attribute Values: None\\n\\n :Class Distribution: 212 - Malignant, 357 - Benign\\n\\n :Creator: Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian\\n\\n :Donor: Nick Street\\n\\n :Date: November, 1995\\n\\nThis is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.\\nhttps://goo.gl/U2Uwz2\\n\\nFeatures are computed from a digitized image of a fine needle\\naspirate (FNA) of a breast mass. They describe\\ncharacteristics of the cell nuclei present in the image.\\n\\nSeparating plane described above was obtained using\\nMultisurface Method-Tree (MSM-T) [K. P. Bennett, \"Decision Tree\\nConstruction Via Linear Programming.\" Proceedings of the 4th\\nMidwest Artificial Intelligence and Cognitive Science Society,\\npp. 97-101, 1992], a classification method which uses linear\\nprogramming to construct a decision tree. Relevant features\\nwere selected using an exhaustive search in the space of 1-4\\nfeatures and 1-3 separating planes.\\n\\nThe actual linear program used to obtain the separating plane\\nin the 3-dimensional space is that described in:\\n[K. P. Bennett and O. L. Mangasarian: \"Robust Linear\\nProgramming Discrimination of Two Linearly Inseparable Sets\",\\nOptimization Methods and Software 1, 1992, 23-34].\\n\\nThis database is also available through the UW CS ftp server:\\n\\nftp ftp.cs.wisc.edu\\ncd math-prog/cpo-dataset/machine-learn/WDBC/\\n\\n.. topic:: References\\n\\n - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction \\n for breast tumor diagnosis. 
IS&T/SPIE 1993 International Symposium on \\n Electronic Imaging: Science and Technology, volume 1905, pages 861-870,\\n San Jose, CA, 1993.\\n - O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and \\n prognosis via linear programming. Operations Research, 43(4), pages 570-577, \\n July-August 1995.\\n - W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques\\n to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) \\n 163-171.',\n", + " 'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,\n", + " 1.189e-01],\n", + " [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,\n", + " 8.902e-02],\n", + " [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,\n", + " 8.758e-02],\n", + " ...,\n", + " [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,\n", + " 7.820e-02],\n", + " [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,\n", + " 1.240e-01],\n", + " [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,\n", + " 7.039e-02]]),\n", + " 'data_module': 'sklearn.datasets.data',\n", + " 'feature_names': array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',\n", + " 'mean smoothness', 'mean compactness', 'mean concavity',\n", + " 'mean concave points', 'mean symmetry', 'mean fractal dimension',\n", + " 'radius error', 'texture error', 'perimeter error', 'area error',\n", + " 'smoothness error', 'compactness error', 'concavity error',\n", + " 'concave points error', 'symmetry error',\n", + " 'fractal dimension error', 'worst radius', 'worst texture',\n", + " 'worst perimeter', 'worst area', 'worst smoothness',\n", + " 'worst compactness', 'worst concavity', 'worst concave points',\n", + " 'worst symmetry', 'worst fractal dimension'], dtype=' Date: Thu, 20 Oct 2022 00:18:10 +0530 Subject: [PATCH 2/2] Random_forest_Rprog --- ML Algorithms/Random_Forest/RF_model | 52 ++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create 
mode 100644 ML Algorithms/Random_Forest/RF_model

diff --git a/ML Algorithms/Random_Forest/RF_model b/ML Algorithms/Random_Forest/RF_model
new file mode 100644
index 00000000..1bfc0ec9
--- /dev/null
+++ b/ML Algorithms/Random_Forest/RF_model
@@ -0,0 +1,105 @@
+# Random forest classifier for the `admit` outcome in data_1.csv.
+#
+# Fits a baseline forest, plots variable importance, tunes mtry with tuneRF(),
+# and reports ROC curves and confusion matrices for a 70/30 train/test split.
+
+# library() stops with an error when a package is missing; require() only
+# returns FALSE and lets the script fail later with a less obvious message.
+library(caret)
+library(randomForest)
+library(ROCR)
+library(e1071)
+
+# ---- Load data ----
+data <- read.csv(file = "data_1.csv", header = TRUE)
+str(data)
+data$admit <- as.factor(data$admit)
+str(data)
+
+# ---- Train/test split ----
+set.seed(123)
+ind <- sample(seq_len(nrow(data)), round(0.70 * nrow(data)))
+train <- data[ind, ]
+# Rows not sampled for training form the internal test set used below.
+test <- data[-ind, ]
+str(train)
+
+# ---- Baseline model ----
+var_names <- names(data)
+var_names
+# admit ~ <every other column>
+formula <- reformulate(setdiff(var_names, "admit"), response = "admit")
+rf_train_model <- randomForest(
+  formula,
+  data = train, ntree = 500, mtry = 2, nodesize = 5, importance = TRUE
+)
+print(rf_train_model)
+
+# ---- Variable importance ----
+impVar <- round(randomForest::importance(rf_train_model), 2)
+# Select the column by name: its position depends on the number of classes.
+impVar[order(impVar[, "MeanDecreaseAccuracy"], decreasing = TRUE), ]
+jpeg("var_importance.jpg")
+# Cap n.var so the plot never asks for more variables than the model has.
+varImpPlot(
+  rf_train_model,
+  sort = TRUE, main = "Variable Importance", n.var = min(4, nrow(impVar))
+)
+# Close the device so the file is written and later plots do not overwrite it.
+dev.off()
+rf_train_model
+
+# ---- Tune mtry ----
+# NOTE(review): assumes columns 2:4 of `train` are the predictors - confirm
+# against data_1.csv. plot = TRUE draws on the current graphics device.
+tRF <- tuneRF(
+  x = train[, 2:4], y = train$admit,
+  mtryStart = 1, ntreeTry = 500, stepFactor = 0.5, improve = 0.001,
+  trace = TRUE, plot = TRUE, doBest = TRUE, nodesize = 5, importance = TRUE
+)
+
+# ---- Training-set performance ----
+train$predict.class <- predict(tRF, train, type = "class", na.action = na.omit)
+train$predict.score <- predict(tRF, train, type = "prob")
+head(train)
+class(train$predict.score)
+
+jpeg("RF_training_AUC.jpg")
+pred <- prediction(train$predict.score[, 2], train$admit)
+perf <- performance(pred, "tpr", "fpr")
+plot(
+  perf,
+  colorize = TRUE, lwd = 1, main = "RF ROC Curve for training",
+  print.cutoffs.at = seq(0, 1, by = 0.05), text.adj = c(-0.5, 0.5),
+  text.cex = 0.5
+)
+dev.off()
+
+# Kolmogorov-Smirnov statistic: largest gap between TPR and FPR.
+KS <- max(perf@y.values[[1]] - perf@x.values[[1]])
+auc <- performance(pred, "auc")
+auc <- as.numeric(auc@y.values)
+
+conf_matrix_rf_train <- confusionMatrix(
+  data = train$predict.class, reference = train$admit
+)
+conf_matrix_rf_train
+
+# ---- Test-set performance ----
+test$predict.class <- predict(tRF, test, type = "class")
+test$predict.score <- predict(tRF, test, type = "prob")
+conf_matrix_rf_test <- confusionMatrix(
+  data = test$predict.class, reference = test$admit
+)
+conf_matrix_rf_test
+
+jpeg("RF_internal_test_AUC.jpg")
+pred1 <- prediction(test$predict.score[, 2], test$admit)
+perf1 <- performance(pred1, "tpr", "fpr")
+plot(
+  perf1,
+  colorize = TRUE, lwd = 1, main = "RF ROC Curve for internal test",
+  print.cutoffs.at = seq(0, 1, by = 0.05), text.adj = c(-0.5, 0.5),
+  text.cex = 0.5
+)
+dev.off()
\ No newline at end of file