diff --git a/mal_extrac.ipynb b/mal_extrac.ipynb new file mode 100644 index 0000000..838a56a --- /dev/null +++ b/mal_extrac.ipynb @@ -0,0 +1,3321 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 548, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3d5CYN8JSQgp" + }, + "outputs": [], + "source": [ + "import os\n", + "import cv2\n", + "import copy\n", + "import csv\n", + "import random\n", + "import pickle\n", + "import numpy as np\n", + "import pandas as pd\n", + "import itertools\n", + "from scipy.stats import randint\n", + "from itertools import cycle\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.pyplot import figure" + ] + }, + { + "cell_type": "code", + "execution_count": 549, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "iiKgZIzNSS1w" + }, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "from sklearn.manifold import TSNE\n", + "from sklearn.decomposition import TruncatedSVD\n", + "from sklearn import metrics\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn import preprocessing\n", + "from scipy.sparse import csr_matrix\n", + "from scipy import stats\n", + "from my_ml_lib import DataManipulationTools, MetricTools, PlotTools\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.model_selection import RandomizedSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 550, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "V2Q6UcbgSWrc" + }, + "outputs": [], + "source": [ + "from sklearn import svm\n", + "from sklearn.neural_network import MLPClassifier\n", + "from xgboost import XGBClassifier\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier\n", + "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA\n", + "from 
sklearn.decomposition import PCA\n", + "from skimage.feature import hog, local_binary_pattern" + ] + }, + { + "cell_type": "code", + "execution_count": 551, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Sm2SnXjoS6m4" + }, + "outputs": [], + "source": [ + "def write_csv(file, a1, a2, a3, a4, a5, a6, a7, name):\n", + " with open(file, mode='w') as csv_file:\n", + " csv_writer = csv.writer(csv_file, delimiter=',', quotechar='\"', quoting=csv.QUOTE_MINIMAL)\n", + " csv_writer.writerow(name)\n", + " for i in range(20):\n", + " if a2[i] == None:\n", + " a2[i] = 'None'\n", + " if a4[i] == None:\n", + " a4[i] = 'None'\n", + " csv_writer.writerow([a1[i], a2[i], a3[i], a4[i], a5[i], a6[i], a7[i]])" + ] + }, + { + "cell_type": "code", + "execution_count": 733, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Fn456X2YTDka" + }, + "outputs": [], + "source": [ + "def read_images(path='content/sample_data/cell_images'):\n", + " images = []\n", + " labels = []\n", + " num1 = 32\n", + " num2 = 32\n", + " file_name='Parasitized'\n", + " for file_name in os.listdir(path):\n", + " file_path = path + '/' + file_name\n", + " for img_name in os.listdir(file_path):\n", + " if not img_name.startswith('.'):\n", + " if img_name.endswith('.png'):\n", + " img = cv2.imread(file_path + '/' + img_name)\n", + " new_img = cv2.resize(img, (num2, num1))\n", + " images.append(new_img)\n", + " if file_name == 'Parasitized':\n", + " label = 0\n", + " else:\n", + " label = 1\n", + " labels.append(label)\n", + " \n", + " return np.array(images), np.array(labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "xVCjuxcXW5Yy" + }, + "source": [ + "# New Section" + ] + }, + { + "cell_type": "code", + "execution_count": 692, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LWIiqIVnuglw" + }, + "outputs": [], + "source": [ + "def save_feature(feature, name):\n", + " # saving all our feature vectors in pickled file\n", + " 
with open('content/sample_data/cache/' + name + '.pkl', 'wb') as fp:\n", + " pickle.dump(csr_matrix(feature), fp)\n", + " \n", + " print(f'Feature saved with name cache/{name}.pkl')\n", + "\n", + "def load_feature(feature_name):\n", + " return pickle.load(open(feature_name, 'rb')).A" + ] + }, + { + "cell_type": "code", + "execution_count": 693, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TlA-zG1dupbk" + }, + "outputs": [], + "source": [ + "def save_model(model):\n", + " filename = input('Enter model file name:')\n", + " pickle.dump(model, open('models/'+filename + '.pkl', 'wb'))\n", + " print(f'Successfully saved model in models/{filename}.pkl')\n", + "\n", + "def load_model(model_name):\n", + " return pickle.load(open(model_name, 'rb'))" + ] + }, + { + "cell_type": "code", + "execution_count": 555, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "xybS4ANyuzgJ" + }, + "outputs": [], + "source": [ + "def get_flattened(images, color=cv2.COLOR_RGB2GRAY, name='flattened', save=False):\n", + " \"\"\"\n", + " color: cv2 color-conversion code applied to every image (default cv2.COLOR_RGB2GRAY); if None is passed, the images are used as they are, without conversion.\n", + " \"\"\"\n", + " color_images = []\n", + " if color is not None:\n", + " for img in images:\n", + " color_images.append(cv2.cvtColor(img, color))\n", + " else:\n", + " color_images = images\n", + " \n", + " count = len(color_images)\n", + " \n", + " result = np.array(color_images).reshape(count, -1)\n", + " \n", + " if save:\n", + " save_feature(result, name)\n", + " \n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 556, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "mMKeYh-eu0cC" + }, + "outputs": [], + "source": [ + "def get_color_hist(images, name='color_hist', save=False):\n", + " histograms = []\n", + " for img in images:\n", + " histograms.append(cv2.calcHist([img], [0, 1, 2],None, [8, 8, 8], [0, 256, 0, 256, 0, 256]).flatten())\n", + " \n", + " result = np.array(histograms)\n", + " \n", + " if 
save:\n", + " save_feature(result, name)\n", + " \n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 557, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "S5ZDXsPAu391" + }, + "outputs": [], + "source": [ + "def get_hog(images, name='hog', save=False):\n", + " result = np.array([hog(img, block_norm='L2') for img in images])\n", + " \n", + " if save:\n", + " save_feature(result, name)\n", + " \n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 559, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4MLzrEs6vDxz" + }, + "outputs": [], + "source": [ + "def combine_features(features, horizontal=True):\n", + " \"\"\"\n", + " Array of features [f1, f2, f3] where each fi is a feature set \n", + " eg. f1=rgb_flat, f2=SIFT, etc.\n", + " \"\"\"\n", + " if horizontal:\n", + " return np.hstack(features)\n", + " else:\n", + " return np.vstack(features)" + ] + }, + { + "cell_type": "code", + "execution_count": 560, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "RYOB5B1MY8Xa" + }, + "outputs": [], + "source": [ + "def norm_features_minmax(train, test):\n", + " min_max_scaler = preprocessing.MinMaxScaler()\n", + " norm_train = min_max_scaler.fit_transform(train)\n", + " norm_test = min_max_scaler.transform(test)\n", + " \n", + " return norm_train, norm_test" + ] + }, + { + "cell_type": "code", + "execution_count": 561, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "a9uuhPj1vHAA" + }, + "outputs": [], + "source": [ + "def norm_features_zscore(train, test):\n", + " min_max_scaler = preprocessing.StandardScaler()\n", + " norm_train = min_max_scaler.fit_transform(train)\n", + " norm_test = min_max_scaler.transform(test)\n", + " \n", + " return norm_train, norm_test" + ] + }, + { + "cell_type": "code", + "execution_count": 562, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "YmxOnsvYK6Wx" + }, + "outputs": [], + "source": [ + "def train_model(train_x, 
train_y, model_name, validation=None):\n", + " \n", + " model = None\n", + " if model_name == 'SVM':\n", + " model = svm.SVC(gamma='scale', probability=True)\n", + " elif model_name == 'XGB':\n", + " model = XGBClassifier(n_estimators=200, max_depth=5, n_jobs=2)\n", + "# model = XGBClassifier()\n", + " elif model_name == 'MLP':\n", + " model = MLPClassifier(hidden_layer_sizes=(100,100,100), max_iter=800, alpha=0.0001,\n", + " solver='sgd', verbose=10, tol=0.000000001)\n", + " elif model_name == 'ADA':\n", + " model = AdaBoostClassifier(n_estimators=50)\n", + " elif model_name == 'BAG':\n", + " model = BaggingClassifier(n_jobs=2, n_estimators=50)\n", + " elif model_name == 'RF':\n", + " model = RandomForestClassifier(n_estimators=200, max_depth=10)\n", + " elif model_name == 'KNN':\n", + " model = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)\n", + " else:\n", + " model = GaussianNB()\n", + " \n", + " model.fit(train_x, train_y)\n", + " \n", + " if validation is not None:\n", + " y_hat = model.predict(validation[0])\n", + " acc = metrics.accuracy_score(validation[1], y_hat)\n", + " print(f\"Validation Accuracy in '{model_name}' = {acc}\")\n", + " cm = metrics.confusion_matrix(validation[1], y_hat)\n", + " print(cm)\n", + " recall = cm[0][0] / (cm[0][0] + cm[0][1])\n", + " precision = cm[0][0] / (cm[0][0] + cm[1][0])\n", + " f1 = 2*(precision*recall)/(precision+recall)\n", + " print(f\"Recall in '{model_name}' = {recall}\")\n", + " print(f\"Precision in '{model_name}' = {precision}\")\n", + " print(f\"F1 Score in '{model_name}' = {f1}\")\n", + " \n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 563, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "St6cuqR0vZXn" + }, + "outputs": [], + "source": [ + "def plot_roc(fpr, tpr, class_name, area):\n", + " figure(num=None, figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')\n", + 
" \n", + " plt.plot(fpr,tpr)\n", + "\n", + " plt.xlabel('False Positive Rate')\n", + " plt.ylabel('True Positive Rate')\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 564, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "yPq7Z077vcx0" + }, + "outputs": [], + "source": [ + "def plot_combine_roc(test_y, prob):\n", + " n_classes = 6\n", + " name_arr = ['RF', 'BAG', 'NB', 'SVM', 'KNN', 'XGB']\n", + " fpr = dict()\n", + " tpr = dict()\n", + " roc_auc = dict()\n", + " for i in range(n_classes):\n", + " fpr[i], tpr[i], _ = metrics.roc_curve(test_y, prob[:, i])\n", + " roc_auc[i] = metrics.roc_auc_score(test_y, prob[:, i])\n", + "\n", + " figure(num=None, figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')\n", + " colors = cycle(['darkorange', 'cornflowerblue', 'red', 'purple', 'pink', 'violet', 'green'])\n", + " for i, color in zip(range(n_classes), colors):\n", + " plt.plot(fpr[i], tpr[i], color=color,\n", + " label='ROC curve for ' + name_arr[i] + ' class (area = {1:0.2f})'\n", + " ''.format(i, roc_auc[i]))\n", + "\n", + " plt.xlabel('False Positive Rate')\n", + " plt.ylabel('True Positive Rate')\n", + " plt.legend()\n", + "# plt.savefig('output/roc_lda_on_pca_uninfected.png')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 565, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "cVC7wznFLG5i" + }, + "outputs": [], + "source": [ + "def confusion_mat(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues, figsize=(7,7), path=None, filename=None):\n", + " \"\"\"\n", + " cm: confusion matrix to be plotted.\n", + " classes: array of labels or class names.\n", + " title: title of the confusion matrix.\n", + " cmap: color of the plot matrix.\n", + " figsize: tuple (width, height) representing the size of the plot.\n", + " path: destination where the plot image will be saved.\n", + " filename: name to save the file with on the specified path. 
(if None, title is used)\n", + " \n", + " # Source: https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n", + " \"\"\"\n", + " cm = cm.astype(np.int64)\n", + " plt.figure(figsize=figsize)\n", + " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", + " plt.title(title)\n", + " plt.colorbar()\n", + " tick_marks = np.arange(len(classes))\n", + " plt.xticks(tick_marks, classes, rotation=45)\n", + " plt.yticks(tick_marks, classes)\n", + "\n", + " fmt = 'd'\n", + " thresh = cm.max() / 2.\n", + " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n", + " plt.text(j, i, format(cm[i, j], fmt),\n", + " horizontalalignment=\"center\",\n", + " color=\"white\" if cm[i, j] > thresh else \"black\")\n", + " plt.grid(False)\n", + " plt.ylabel('True label')\n", + " plt.xlabel('Predicted label')\n", + " plt.tight_layout()\n", + " \n", + " if path:\n", + " if filename is None:\n", + " plt.savefig(path + title + '.png')\n", + " else:\n", + " plt.savefig(path + filename + '.png')\n", + " plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 566, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "negSIe_DLJBq" + }, + "outputs": [], + "source": [ + "def pca_projection(mat, name_arr):\n", + " plt.figure(figsize=(15,20))\n", + " for i, img in enumerate(mat, start=1):\n", + " plt.subplot(4, 2, i)\n", + " y = np.var(mat[i-1], axis=0)\n", + " x = list(range(1, len(y)+1))\n", + " plt.plot(x, y, '--o')\n", + " plt.ylabel('Variance')\n", + " plt.xlabel('Data Projected on Eigen Vector Number')\n", + " plt.xticks(x)\n", + " plt.title(name_arr[i-1])\n", + " plt.savefig('content/sample_data/output/pca/subplot.png')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 567, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ebM1ar6zLMMi" + }, + "outputs": [], + "source": [ + "def draw_key_points(image, kp):\n", + " img = cv2.drawKeypoints(image, kp, None, 
flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)\n", + " return img" + ] + }, + { + "cell_type": "code", + "execution_count": 568, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "B58Lu9H0v87I" + }, + "outputs": [], + "source": [ + "full_data_x, full_data_y = read_images('content/sample_data/cell_images')" + ] + }, + { + "cell_type": "code", + "execution_count": 569, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "WNGx8d9rwCpS", + "outputId": "d42bf387-7f69-4001-8336-14f19983c7ea" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((27558, 32, 32, 3), (27558,))" + ] + }, + "execution_count": 569, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "full_data_x.shape, full_data_y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 570, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "3BRPkwsowO3w", + "outputId": "561d0435-a94c-44b2-af1f-010e16aecda0" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(32, 32, 3)" + ] + }, + "execution_count": 570, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "full_data_x[2].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 571, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "pS4kRrjW-6q9" + }, + "outputs": [], + "source": [ + "data_x, test_x, data_y, test_y = train_test_split(full_data_x, full_data_y, test_size=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 572, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "Exk8wQSQ-_or", + "outputId": "8c60cb46-e5c4-4128-f406-39defa2f7848" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((22046, 32, 32, 3), (5512, 32, 32, 3), (22046,), (5512,))" + ] + }, + "execution_count": 572, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "data_x.shape, test_x.shape, data_y.shape, test_y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 573, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "9wbi-cGo_Dm1" + }, + "outputs": [], + "source": [ + "train_imgs, val_imgs, train_y, val_y = train_test_split(data_x, data_y, test_size=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 574, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "rg3kkdj-_H2i", + "outputId": "29d8eb58-e7d0-4806-8fce-7bcb8cf1e02c" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 32, 32, 3), (4410, 32, 32, 3), (17636,), (4410,))" + ] + }, + "execution_count": 574, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_imgs.shape, val_imgs.shape, train_y.shape, val_y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 575, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "8tIiCMYU_Lgi" + }, + "outputs": [], + "source": [ + "np.save('content/sample_data/data/train_imgs.npy', train_imgs)\n", + "np.save('content/sample_data/data/train_y.npy', train_y)\n", + "np.save('content/sample_data/data/val_imgs.npy', val_imgs)\n", + "np.save('content/sample_data/data/val_y.npy', val_y)\n", + "\n", + "np.save('content/sample_data/data/data_x.npy', data_x)\n", + "np.save('content/sample_data/data/data_y.npy', data_y)\n", + "np.save('content/sample_data/data/test_x.npy', test_x)\n", + "np.save('content/sample_data/data/test_y.npy', test_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 576, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ucs2-k9-_vwb" + }, + "outputs": [], + "source": [ + "train_imgs = np.load('content/sample_data/data/train_imgs.npy')\n", + "train_y = np.load('content/sample_data/data/train_y.npy')\n", + "val_imgs = np.load('content/sample_data/data/val_imgs.npy')\n", + "val_y = 
np.load('content/sample_data/data/val_y.npy')\n", + "\n", + "data_x = np.load('content/sample_data/data/data_x.npy')\n", + "data_y = np.load('content/sample_data/data/data_y.npy')\n", + "test_x = np.load('content/sample_data/data/test_x.npy')\n", + "test_y = np.load('content/sample_data/data/test_y.npy')" + ] + }, + { + "cell_type": "code", + "execution_count": 577, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "npXcpWe8J7-l", + "outputId": "b44bb311-be20-44db-8a40-34a98af69cf3" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((22046, 32, 32, 3), (5512, 32, 32, 3), (22046,), (5512,))" + ] + }, + "execution_count": 577, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_x.shape, test_x.shape, data_y.shape, test_y.shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 578, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "jf7OBGoXJ_64", + "outputId": "d4ce4c4a-737a-4843-df7c-16d21144febf" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 32, 32, 3), (4410, 32, 32, 3), (17636,), (4410,))" + ] + }, + "execution_count": 578, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_imgs.shape, val_imgs.shape, train_y.shape, val_y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 579, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "skdWvOXaKG0G" + }, + "outputs": [], + "source": [ + "infected_img = train_imgs[train_y == 0][8]\n", + "uninfected_img = train_imgs[train_y == 1][4]\n", + "classes = []\n", + "classes.append(infected_img)\n", + "classes.append(uninfected_img)\n", + "class_label= []\n", + "class_label.append('infected image')\n", + "class_label.append('uninfected image')" + ] + }, + { + "cell_type": "code", + "execution_count": 580, + "metadata": { + "colab": {}, + "colab_type": "code", + 
"id": "v5FY65VlZzbP" + }, + "outputs": [], + "source": [ + "vis_img = np.zeros((2, 32, 32))\n", + "for i in range(2):\n", + " temp1, vis_img[i] = hog(classes[i], block_norm='L2', visualize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 581, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "KzXZBogkKLZR" + }, + "outputs": [], + "source": [ + "data_vis = []\n", + "for i in range(len(classes)):\n", + " data_vis.append(classes[i])\n", + "for i in range(len(classes)):\n", + " data_vis.append(vis_img[i-1])\n", + "\n", + "\n", + "img_name_arr = []\n", + "for i in range(len(classes)):\n", + " img_name_arr.append(class_label[i])\n", + "for i in range(len(classes)):\n", + " img_name_arr.append('HOG')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 582, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 253.0 + }, + "colab_type": "code", + "id": "l-6ORhFULxcE", + "outputId": "6c961e8f-22e0-4960-9ba6-f99af01e5fe0" + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(4,10))\n", + "for i, img in enumerate(data_vis, start=1):\n", + " plt.subplot(5, 2, i)\n", + " plt.title(img_name_arr[i-1])\n", + " frame1 = plt.gca()\n", + " for tick in frame1.axes.get_xticklines():\n", + " tick.set_visible(False)\n", + " for tick in frame1.axes.get_yticklines():\n", + " tick.set_visible(False)\n", + " for xlabel_i in frame1.axes.get_xticklabels():\n", + " xlabel_i.set_visible(False)\n", + " for xlabel_i in frame1.axes.get_yticklabels():\n", + " xlabel_i.set_visible(False)\n", + " plt.imshow(data_vis[i-1], cmap='gray')\n", + "#plt.savefig('/content/sample_data/output/feature_visualization.png')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 694, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 50.0 + }, + "colab_type": "code", + "id": "H0t-9SOraRwu", + "outputId": "13c05477-f310-4eb1-fc0b-fd57ff4824ec" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature saved with name cache/hog_train.pkl\n", + "Feature saved with name cache/hog_val.pkl\n" + ] + } + ], + "source": [ + "hog_train = get_hog(train_imgs, name='hog_train', save=True)\n", + "hog_val = get_hog(val_imgs, name='hog_val', save=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 690, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "mvM9qW-taWn9" + }, + "outputs": [], + "source": [ + "hog_train = load_feature('content/sample_data/cache/hog_train.pkl')\n", + "hog_val = load_feature('content/sample_data/cache/hog_val.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 691, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "MRP_VIKjam0N", + "outputId": "18bfaa3b-7a53-4971-ce88-42ffa71be184" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 324), (4410, 324))" + 
] + }, + "execution_count": 691, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hog_train.shape, hog_val.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 586, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 50.0 + }, + "colab_type": "code", + "id": "mlyaifDnanrm", + "outputId": "5832dd55-633c-4d21-9079-d6e26d577af5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature saved with name cache/flat_rgb_train.pkl\n", + "Feature saved with name cache/flat_rgb_val.pkl\n" + ] + } + ], + "source": [ + "flat_rgb_train = get_flattened(train_imgs, None, name='flat_rgb_train', save=True)\n", + "flat_rgb_val = get_flattened(val_imgs, None, name='flat_rgb_val', save=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 587, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Oa55L9F6bTs0" + }, + "outputs": [], + "source": [ + "flat_rgb_train = load_feature('content/sample_data/cache/flat_rgb_train.pkl')\n", + "flat_rgb_val = load_feature('content/sample_data/cache/flat_rgb_val.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 588, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "d_LqzqSAeQdo", + "outputId": "a3783836-ab89-4c58-9a5f-81d873955144" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 3072), (4410, 3072))" + ] + }, + "execution_count": 588, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "flat_rgb_train.shape, flat_rgb_val.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 589, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 50.0 + }, + "colab_type": "code", + "id": "unURj3KceUW2", + "outputId": "f909deb4-76fd-4c7d-8a97-4f83bb5d8ba7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature saved with name 
cache/flat_gray_train.pkl\n", + "Feature saved with name cache/flat_gray_val.pkl\n" + ] + } + ], + "source": [ + "flat_gray_train = get_flattened(train_imgs, name='flat_gray_train', save=True)\n", + "flat_gray_val = get_flattened(val_imgs, name='flat_gray_val', save=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 590, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "k5o75zg6eYLk" + }, + "outputs": [], + "source": [ + "flat_gray_train = load_feature('content/sample_data/cache/flat_gray_train.pkl')\n", + "flat_gray_val = load_feature('content/sample_data/cache/flat_gray_val.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 591, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "mEo_uxdIecyk", + "outputId": "56073289-d130-4aab-ae16-44e53acf18ea" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 1024), (4410, 1024))" + ] + }, + "execution_count": 591, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "flat_gray_train.shape, flat_gray_val.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 592, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 50.0 + }, + "colab_type": "code", + "id": "AmmV8MR0hPuT", + "outputId": "c4abc8e0-4e9c-4146-d0f1-049528fb49f5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature saved with name cache/hist_train.pkl\n", + "Feature saved with name cache/hist_val.pkl\n" + ] + } + ], + "source": [ + "hist_train = get_color_hist(train_imgs, name='hist_train', save=True)\n", + "hist_val = get_color_hist(val_imgs, name='hist_val', save=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 593, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "e6oKRJm2hTgS" + }, + "outputs": [], + "source": [ + "hist_train = load_feature('content/sample_data/cache/hist_train.pkl')\n", + "hist_val = 
load_feature('content/sample_data/cache/hist_val.pkl')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 594, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "d8l4LLBhhXfV", + "outputId": "f92b7caa-6819-46eb-d0bc-f200aa581a11" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 512), (4410, 512))" + ] + }, + "execution_count": 594, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hist_train.shape, hist_val.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 595, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LZTFDZy0iE27" + }, + "outputs": [], + "source": [ + "norm_hog_train, norm_hog_val = norm_features_zscore(hog_train, hog_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 596, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4PBbVoWlhaPV" + }, + "outputs": [], + "source": [ + "pca = PCA(n_components=10)\n", + "pca_hog_train = pca.fit_transform(norm_hog_train)\n", + "pca_hog_val = pca.transform(norm_hog_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 597, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "w4dh3TxNh5Q7" + }, + "outputs": [], + "source": [ + "np.save('content/sample_data/cache/pca_hog_train.npy', pca_hog_train)\n", + "np.save('content/sample_data/cache/pca_hog_val.npy', pca_hog_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 598, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "zcmVvgK1iKm7" + }, + "outputs": [], + "source": [ + "pca_hog_train = np.load('content/sample_data/cache/pca_hog_train.npy')\n", + "pca_hog_val = np.load('content/sample_data/cache/pca_hog_val.npy')" + ] + }, + { + "cell_type": "code", + "execution_count": 599, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "-dZCDYtDijyb", + "outputId": 
"1012bd0b-ee99-4cf6-8f34-098a871d407c" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 10), (4410, 10))" + ] + }, + "execution_count": 599, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pca_hog_train.shape, pca_hog_val.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 600, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "llO_csHfIHd4" + }, + "outputs": [], + "source": [ + "norm_flat_rgb_train, norm_flat_rgb_val = norm_features_zscore(flat_rgb_train, flat_rgb_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 601, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DqDV3cA_IrSh" + }, + "outputs": [], + "source": [ + "pca = PCA(n_components=10)\n", + "pca_flat_rgb_train = pca.fit_transform(norm_flat_rgb_train)\n", + "pca_flat_rgb_val = pca.transform(norm_flat_rgb_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 602, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "skkLTv9TIwiq" + }, + "outputs": [], + "source": [ + "np.save('content/sample_data/cache/pca_flat_rgb_train.npy', pca_flat_rgb_train)\n", + "np.save('content/sample_data/cache/pca_flat_rgb_val.npy', pca_flat_rgb_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 603, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4BCY4ayQJEZL" + }, + "outputs": [], + "source": [ + "pca_flat_rgb_train = np.load('content/sample_data/cache/pca_flat_rgb_train.npy')\n", + "pca_flat_rgb_val = np.load('content/sample_data/cache/pca_flat_rgb_val.npy')" + ] + }, + { + "cell_type": "code", + "execution_count": 604, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "aH0Z1EIxJWjg", + "outputId": "784832e6-6f17-476b-fa3d-4fa0c77d4bb2" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 10), (4410, 10))" + ] + }, + "execution_count": 604, + "metadata": {}, + "output_type": "execute_result" + } 
+ ], + "source": [ + "pca_flat_rgb_train.shape, pca_flat_rgb_val.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 605, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "JkYD3WpMJbKb" + }, + "outputs": [], + "source": [ + "norm_flat_gray_train, norm_flat_gray_val = norm_features_zscore(flat_gray_train, flat_gray_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 606, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TmCHv2s1JgC7" + }, + "outputs": [], + "source": [ + "pca = PCA(n_components=10)\n", + "pca_flat_gray_train = pca.fit_transform(norm_flat_gray_train)\n", + "pca_flat_gray_val = pca.transform(norm_flat_gray_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 607, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "dFgOO5LDJlAd" + }, + "outputs": [], + "source": [ + "np.save('content/sample_data/cache/pca_flat_gray_train.npy', pca_flat_gray_train)\n", + "np.save('content/sample_data/cache/pca_flat_gray_val.npy', pca_flat_gray_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 608, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "RQj8nZrEJt15" + }, + "outputs": [], + "source": [ + "pca_flat_gray_train = np.load('content/sample_data/cache/pca_flat_gray_train.npy')\n", + "pca_flat_gray_val = np.load('content/sample_data/cache/pca_flat_gray_val.npy')" + ] + }, + { + "cell_type": "code", + "execution_count": 609, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "2wSvDuaSJ2Ls", + "outputId": "c5395119-944d-4170-c6af-efbf51937778" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 10), (4410, 10))" + ] + }, + "execution_count": 609, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pca_flat_gray_train.shape, pca_flat_gray_val.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 610, + "metadata": { + "colab": {}, + "colab_type": 
"code", + "id": "f_qwShplKABz" + }, + "outputs": [], + "source": [ + "norm_hist_train, norm_hist_val = norm_features_zscore(hist_train, hist_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 611, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "bdLo8Iy5KC6c" + }, + "outputs": [], + "source": [ + "pca = PCA(n_components=10)\n", + "pca_hist_train = pca.fit_transform(norm_hist_train)\n", + "pca_hist_val = pca.transform(norm_hist_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 612, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "l1hzcAatKIHf" + }, + "outputs": [], + "source": [ + "np.save('content/sample_data/cache/pca_hist_train.npy', pca_hist_train)\n", + "np.save('content/sample_data/cache/pca_hist_val.npy', pca_hist_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 613, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "lZ55AXQqKR39" + }, + "outputs": [], + "source": [ + "pca_hist_train = np.load('content/sample_data/cache/pca_hist_train.npy')\n", + "pca_hist_val = np.load('content/sample_data/cache/pca_hist_val.npy')" + ] + }, + { + "cell_type": "code", + "execution_count": 614, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "fl1I7P5uKbFF", + "outputId": "55143c28-2f6a-4d0e-e5f5-5d4a23ccb9b2" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 10), (4410, 10))" + ] + }, + "execution_count": 614, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pca_hist_train.shape, pca_hist_val.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 615, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "wKOJE0CHizFU", + "outputId": "edfef704-7984-4bdf-f381-c7cf1f0c1b9a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 615, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "pca_array = []\n", + "pca_array.append(pca_hog_train)\n", + "\n", + "pca_array.append(pca_hist_train)\n", + "pca_array.append(pca_flat_gray_train)\n", + "pca_array.append(pca_flat_rgb_train)\n", + "\n", + "\n", + "len(pca_array)" + ] + }, + { + "cell_type": "code", + "execution_count": 616, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 598.0 + }, + "colab_type": "code", + "id": "FppOqCCPi7PF", + "outputId": "c9c3aff3-d2c3-4cd4-8733-7234043fdd94" + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "pca_projection(pca_array, ['HOG', 'Color Histogram', 'Flatten GRAY', 'Flatten RGB'])" + ] + }, + { + "cell_type": "code", + "execution_count": 617, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oVIShD5UMK9u" + }, + "outputs": [], + "source": [ + "features_train = None\n", + "features_val = None\n", + "evs = []\n", + "for ft, fv in zip([hog_train, hist_train, flat_rgb_train], \n", + " [hog_val, hist_val, flat_rgb_val]): \n", + "# scaler = preprocessing.StandardScaler()\n", + "# ft = scaler.fit_transform(ft)\n", + "# fv = scaler.transform(fv)\n", + "# ft = preprocessing.normalize(ft)\n", + "# fv = preprocessing.normalize(fv)\n", + " if features_train is None:\n", + " features_train = ft\n", + " features_val = fv\n", + " else:\n", + " features_train = combine_features([features_train, ft])\n", + " features_val = combine_features([features_val, fv])" + ] + }, + { + "cell_type": "code", + "execution_count": 618, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "HE1f2Kq2M8nu", + "outputId": "18cb5887-c5df-4b38-b934-c7e9620ca18d" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 3908), (4410, 3908))" + ] + }, + "execution_count": 618, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features_train.shape, features_val.shape\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 619, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "HuvxeFpWNPMd" + }, + "outputs": [], + "source": [ + "features_train = None\n", + "features_val = None\n", + "evs = []\n", + "for ft, fv in zip([pca_hog_train, pca_flat_rgb_train, pca_hist_train,pca_flat_gray_train], \n", + " [pca_hog_val, pca_flat_rgb_val, pca_hist_val,pca_flat_gray_val]): \n", + "# scaler = preprocessing.StandardScaler()\n", + "# ft = 
scaler.fit_transform(ft)\n", + "# fv = scaler.transform(fv)\n", + "# ft = preprocessing.normalize(ft)\n", + "# fv = preprocessing.normalize(fv)\n", + " if features_train is None:\n", + " features_train = ft\n", + " features_val = fv\n", + " else:\n", + " features_train = combine_features([features_train, ft])\n", + " features_val = combine_features([features_val, fv])" + ] + }, + { + "cell_type": "code", + "execution_count": 620, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "iNIrw_nvPMKI", + "outputId": "cb8e0887-5e61-4630-f889-a4dbbf62ac97" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 40), (4410, 40))" + ] + }, + "execution_count": 620, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features_train.shape, features_val.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 621, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ws1R97uEPc3d" + }, + "outputs": [], + "source": [ + "features_t = pca_hist_train\n", + "features_v = pca_hist_val" + ] + }, + { + "cell_type": "code", + "execution_count": 622, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34.0 + }, + "colab_type": "code", + "id": "g8tzaqkAPopI", + "outputId": "5563464f-9b87-4eb2-cde9-199cbed794fd" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 10), (4410, 10))" + ] + }, + "execution_count": 622, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features_t.shape, features_v.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 623, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 118.0 + }, + "colab_type": "code", + "id": "2KPV1EDSPx9O", + "outputId": "0a065599-273f-49b8-a1f6-97d0b3ea9abe" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'RF' = 0.8809523809523809\n", + "[[1946 
276]\n", + " [ 249 1939]]\n", + "Recall in 'RF' = 0.8757875787578758\n", + "Precision in 'RF' = 0.8865603644646924\n", + "F1 Score in 'RF' = 0.8811410459587956\n" + ] + } + ], + "source": [ + "model1 = train_model(features_train, train_y, model_name='RF', validation=(features_val, val_y))" + ] + }, + { + "cell_type": "code", + "execution_count": 624, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "zkL-9rbgQWss" + }, + "outputs": [], + "source": [ + "prob1 = model1.predict_proba(features_val)\n", + "prob10 = prob1[:,0].reshape(prob1.shape[0], 1)\n", + "prob11 = prob1[:,1].reshape(prob1.shape[0], 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 625, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'BAG' = 0.8841269841269841\n", + "[[1977 245]\n", + " [ 266 1922]]\n", + "Recall in 'BAG' = 0.8897389738973898\n", + "Precision in 'BAG' = 0.8814088274632189\n", + "F1 Score in 'BAG' = 0.8855543113101905\n" + ] + } + ], + "source": [ + "model2 = train_model(features_train, train_y, model_name='BAG', validation=(features_val, val_y))" + ] + }, + { + "cell_type": "code", + "execution_count": 626, + "metadata": {}, + "outputs": [], + "source": [ + "prob2 = model2.predict_proba(features_val)\n", + "prob20 = prob2[:,0].reshape(prob2.shape[0], 1)\n", + "prob21 = prob2[:,1].reshape(prob2.shape[0], 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 627, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'NB' = 0.6598639455782312\n", + "[[ 971 1251]\n", + " [ 249 1939]]\n", + "Recall in 'NB' = 0.436993699369937\n", + "Precision in 'NB' = 0.7959016393442623\n", + "F1 Score in 'NB' = 0.5642068564787914\n" + ] + } + ], + "source": [ + "model4 = train_model(features_train, train_y, model_name='NB', validation=(features_val, val_y))" + ] + }, + { + "cell_type": "code", + "execution_count": 628, + "metadata": 
{}, + "outputs": [], + "source": [ + "prob4 = model4.predict_proba(features_val)\n", + "prob40 = prob4[:,0].reshape(prob4.shape[0], 1)\n", + "prob41 = prob4[:,1].reshape(prob4.shape[0], 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 629, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'SVM' = 0.8555555555555555\n", + "[[1848 374]\n", + " [ 263 1925]]\n", + "Recall in 'SVM' = 0.8316831683168316\n", + "Precision in 'SVM' = 0.8754144954997631\n", + "F1 Score in 'SVM' = 0.852988691437803\n" + ] + } + ], + "source": [ + "model5 = train_model(features_train, train_y, model_name='SVM', validation=(features_val, val_y))" + ] + }, + { + "cell_type": "code", + "execution_count": 630, + "metadata": {}, + "outputs": [], + "source": [ + "prob5 = model5.predict_proba(features_val)\n", + "prob50 = prob5[:,0].reshape(prob5.shape[0], 1)\n", + "prob51 = prob5[:,1].reshape(prob5.shape[0], 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 631, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'XGB' = 0.9090702947845805\n", + "[[1993 229]\n", + " [ 172 2016]]\n", + "Recall in 'XGB' = 0.896939693969397\n", + "Precision in 'XGB' = 0.920554272517321\n", + "F1 Score in 'XGB' = 0.9085935719170276\n" + ] + } + ], + "source": [ + "model6 = train_model(features_train, train_y, model_name='XGB', validation=(features_val, val_y))" + ] + }, + { + "cell_type": "code", + "execution_count": 632, + "metadata": {}, + "outputs": [], + "source": [ + "prob6 = model6.predict_proba(features_val)\n", + "prob60 = prob6[:,0].reshape(prob6.shape[0], 1)\n", + "prob61 = prob6[:,1].reshape(prob6.shape[0], 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 633, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'KNN' = 0.7401360544217687\n", + "[[1452 770]\n", + " [ 
376 1812]]\n", + "Recall in 'KNN' = 0.6534653465346535\n", + "Precision in 'KNN' = 0.7943107221006565\n", + "F1 Score in 'KNN' = 0.7170370370370369\n" + ] + } + ], + "source": [ + "model7 = train_model(np.array(features_train), train_y, model_name='KNN', validation=(features_val, val_y))" + ] + }, + { + "cell_type": "code", + "execution_count": 634, + "metadata": {}, + "outputs": [], + "source": [ + "prob7 = model7.predict_proba(features_val)\n", + "prob70 = prob7[:,0].reshape(prob7.shape[0], 1)\n", + "prob71 = prob7[:,1].reshape(prob7.shape[0], 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 635, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "prob = np.concatenate((prob10,prob20, prob40, prob50, prob60 ,prob70), axis=1)\n", + "plot_combine_roc(val_y, prob)" + ] + }, + { + "cell_type": "code", + "execution_count": 636, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "prob = np.concatenate((prob11,prob21, prob41, prob51,prob61, prob71), axis=1)\n", + "plot_combine_roc(val_y, prob)" + ] + }, + { + "cell_type": "code", + "execution_count": 696, + "metadata": {}, + "outputs": [], + "source": [ + "hog_comp_train = combine_features([hog_train, hog_val], horizontal=False)\n", + "flat_rgb_comp_train = combine_features([flat_rgb_train, flat_rgb_val], horizontal=False)\n", + "flat_gray_comp_train = combine_features([flat_gray_train, flat_gray_val], horizontal=False)\n", + "hist_comp_train = combine_features([hist_train, hist_val], horizontal=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 638, + "metadata": {}, + "outputs": [], + "source": [ + "data_y = np.hstack([train_y, val_y])" + ] + }, + { + "cell_type": "code", + "execution_count": 639, + "metadata": {}, + "outputs": [], + "source": [ + "pcas_array = []\n", + "\n", + "pca = PCA(n_components=10)\n", + "pca_hog_comp_train = pca.fit_transform(hog_comp_train)\n", + "pcas_array.append(pca)\n", + "\n", + "pca = PCA(n_components=20)\n", + "pca_flat_rgb_comp_train = pca.fit_transform(flat_rgb_comp_train)\n", + "pcas_array.append(pca)\n", + "\n", + "pca = PCA(n_components=20)\n", + "pca_flat_gray_comp_train = pca.fit_transform(flat_gray_comp_train)\n", + "pcas_array.append(pca)\n", + "\n", + "pca = PCA(n_components=10)\n", + "pca_hist_comp_train = pca.fit_transform(hist_comp_train)\n", + "pcas_array.append(pca)" + ] + }, + { + "cell_type": "code", + "execution_count": 640, + "metadata": {}, + "outputs": [], + "source": [ + "reduced_train_features=[pca_hog_comp_train, \n", + " pca_flat_rgb_comp_train,\n", + " pca_flat_gray_comp_train,\n", + " pca_hist_comp_train]" + ] + }, + { + "cell_type": "code", + "execution_count": 641, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature saved with name cache/hog_test.pkl\n", + "Feature saved with 
name cache/flat_rgb_test.pkl\n", + "Feature saved with name cache/flat_gray_test.pkl\n", + "Feature saved with name cache/hist_test.pkl\n" + ] + } + ], + "source": [ + "hog_test = get_hog(test_x, name='hog_test', save=True)\n", + "flat_rgb_test = get_flattened(test_x, None, name='flat_rgb_test', save=True)\n", + "flat_gray_test = get_flattened(test_x, name='flat_gray_test', save=True)\n", + "hist_test = get_color_hist(test_x, name='hist_test', save=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 642, + "metadata": {}, + "outputs": [], + "source": [ + "test_features= [hog_test,\n", + " \n", + " flat_rgb_test,\n", + " flat_gray_test,\n", + " hist_test]" + ] + }, + { + "cell_type": "code", + "execution_count": 643, + "metadata": {}, + "outputs": [], + "source": [ + "features_comp_train = None\n", + "features_test = None\n", + "comp_pcas = []\n", + "\n", + "for i, (f_train, f_test) in enumerate(zip(reduced_train_features, \n", + " test_features)):\n", + " if i == 4:\n", + " continue\n", + " f_test = pcas_array[i].transform(f_test)\n", + " \n", + " if features_comp_train is None:\n", + " features_comp_train = f_train\n", + " features_test = f_test\n", + " else:\n", + " features_comp_train = combine_features([features_comp_train, f_train])\n", + " features_test = combine_features([features_test, f_test])" + ] + }, + { + "cell_type": "code", + "execution_count": 644, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((22046, 60), (5512, 60))" + ] + }, + "execution_count": 644, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features_comp_train.shape, features_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 645, + "metadata": {}, + "outputs": [], + "source": [ + "comp_model = train_model(features_comp_train, data_y, model_name='XGB')" + ] + }, + { + "cell_type": "code", + "execution_count": 646, + "metadata": {}, + "outputs": [], + "source": [ + "y_hat = 
comp_model.predict(features_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 647, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7948113207547169\n", + "[[2235 539]\n", + " [ 592 2146]]\n", + "0.7905907322249734 0.8056957462148522 0.7980717728976968\n" + ] + } + ], + "source": [ + "acc = metrics.accuracy_score(test_y, y_hat)\n", + "print(acc)\n", + "cm = metrics.confusion_matrix(test_y, y_hat)\n", + "print(cm)\n", + "precision = cm[0][0] / (cm[0][0] + cm[1][0])\n", + "recall = cm[0][0] / (cm[0][0] + cm[0][1])\n", + "f1 = 2*(recall * precision) / (recall + precision)\n", + "print(precision, recall, f1)" + ] + }, + { + "cell_type": "code", + "execution_count": 648, + "metadata": {}, + "outputs": [], + "source": [ + "comp_model = train_model(features_comp_train, data_y, model_name='KNN')" + ] + }, + { + "cell_type": "code", + "execution_count": 649, + "metadata": {}, + "outputs": [], + "source": [ + "y_hat = comp_model.predict(features_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 650, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6469521044992743\n", + "[[1726 1048]\n", + " [ 898 1840]]\n", + "0.6577743902439024 0.6222062004325883 0.6394961096702482\n" + ] + } + ], + "source": [ + "acc = metrics.accuracy_score(test_y, y_hat)\n", + "print(acc)\n", + "cm = metrics.confusion_matrix(test_y, y_hat)\n", + "print(cm)\n", + "precision = cm[0][0] / (cm[0][0] + cm[1][0])\n", + "recall = cm[0][0] / (cm[0][0] + cm[0][1])\n", + "f1 = 2*(recall * precision) / (recall + precision)\n", + "print(precision, recall, f1)" + ] + }, + { + "cell_type": "code", + "execution_count": 651, + "metadata": {}, + "outputs": [], + "source": [ + "comp_model = train_model(features_comp_train, data_y, model_name='RF')" + ] + }, + { + "cell_type": "code", + "execution_count": 652, + "metadata": {}, + "outputs": [], + "source": [ + "y_hat = 
comp_model.predict(features_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 653, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7567126269956459\n", + "[[2142 632]\n", + " [ 709 2029]]\n", + "0.7513153279551035 0.772170151405912 0.7616\n" + ] + } + ], + "source": [ + "acc = metrics.accuracy_score(test_y, y_hat)\n", + "print(acc)\n", + "cm = metrics.confusion_matrix(test_y, y_hat)\n", + "print(cm)\n", + "precision = cm[0][0] / (cm[0][0] + cm[1][0])\n", + "recall = cm[0][0] / (cm[0][0] + cm[0][1])\n", + "f1 = 2*(recall * precision) / (recall + precision)\n", + "print(precision, recall, f1)" + ] + }, + { + "cell_type": "code", + "execution_count": 654, + "metadata": {}, + "outputs": [], + "source": [ + "comp_model = train_model(features_comp_train, data_y, model_name='BAG')" + ] + }, + { + "cell_type": "code", + "execution_count": 655, + "metadata": {}, + "outputs": [], + "source": [ + "y_hat = comp_model.predict(features_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 656, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7748548621190131\n", + "[[2207 567]\n", + " [ 674 2064]]\n", + "0.7660534536619229 0.7956020187454939 0.7805481874447391\n" + ] + } + ], + "source": [ + "acc = metrics.accuracy_score(test_y, y_hat)\n", + "print(acc)\n", + "cm = metrics.confusion_matrix(test_y, y_hat)\n", + "print(cm)\n", + "precision = cm[0][0] / (cm[0][0] + cm[1][0])\n", + "recall = cm[0][0] / (cm[0][0] + cm[0][1])\n", + "f1 = 2*(recall * precision) / (recall + precision)\n", + "print(precision, recall, f1)" + ] + }, + { + "cell_type": "code", + "execution_count": 657, + "metadata": {}, + "outputs": [], + "source": [ + "comp_model = train_model(features_comp_train, data_y, model_name='SVM')" + ] + }, + { + "cell_type": "code", + "execution_count": 658, + "metadata": {}, + "outputs": [], + "source": [ + "y_hat = comp_model.predict(features_test)" + 
] + }, + { + "cell_type": "code", + "execution_count": 659, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7213352685050798\n", + "[[2013 761]\n", + " [ 775 1963]]\n", + "0.7220229555236729 0.7256669069935112 0.7238403451995685\n" + ] + } + ], + "source": [ + "acc = metrics.accuracy_score(test_y, y_hat)\n", + "print(acc)\n", + "cm = metrics.confusion_matrix(test_y, y_hat)\n", + "print(cm)\n", + "precision = cm[0][0] / (cm[0][0] + cm[1][0])\n", + "recall = cm[0][0] / (cm[0][0] + cm[0][1])\n", + "f1 = 2*(recall * precision) / (recall + precision)\n", + "print(precision, recall, f1)" + ] + }, + { + "cell_type": "code", + "execution_count": 660, + "metadata": {}, + "outputs": [], + "source": [ + "comp_model = train_model(features_comp_train, data_y, model_name='NB')" + ] + }, + { + "cell_type": "code", + "execution_count": 661, + "metadata": {}, + "outputs": [], + "source": [ + "y_hat = comp_model.predict(features_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 662, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6716255442670537\n", + "[[1884 890]\n", + " [ 920 1818]]\n", + "0.6718972895863052 0.6791636625811103 0.67551093581929\n" + ] + } + ], + "source": [ + "acc = metrics.accuracy_score(test_y, y_hat)\n", + "print(acc)\n", + "cm = metrics.confusion_matrix(test_y, y_hat)\n", + "print(cm)\n", + "precision = cm[0][0] / (cm[0][0] + cm[1][0])\n", + "recall = cm[0][0] / (cm[0][0] + cm[0][1])\n", + "f1 = 2*(recall * precision) / (recall + precision)\n", + "print(precision, recall, f1)" + ] + }, + { + "cell_type": "code", + "execution_count": 735, + "metadata": {}, + "outputs": [], + "source": [ + "clf = RandomForestClassifier(n_estimators=200)\n", + "\n", + "param_dist = {\"max_depth\": [3, None],\n", + " \"class_weight\" : [\"balanced\", \"balanced_subsample\", None],\n", + " \"min_samples_split\": randint(2, 11),\n", + " \"bootstrap\": [True, 
False],\n", + " \"criterion\": [\"gini\", \"entropy\"]}\n", + "\n", + "random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=20, cv=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 664, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomizedSearchCV(cv=5, error_score=nan,\n", + " estimator=RandomForestClassifier(bootstrap=True,\n", + " ccp_alpha=0.0,\n", + " class_weight=None,\n", + " criterion='gini',\n", + " max_depth=None,\n", + " max_features='auto',\n", + " max_leaf_nodes=None,\n", + " max_samples=None,\n", + " min_impurity_decrease=0.0,\n", + " min_impurity_split=None,\n", + " min_samples_leaf=1,\n", + " min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0,\n", + " n_estimators=200,\n", + " n_jobs...\n", + " iid='deprecated', n_iter=20, n_jobs=None,\n", + " param_distributions={'bootstrap': [True, False],\n", + " 'class_weight': ['balanced',\n", + " 'balanced_subsample',\n", + " None],\n", + " 'criterion': ['gini', 'entropy'],\n", + " 'max_depth': [3, None],\n", + " 'min_samples_split': },\n", + " pre_dispatch='2*n_jobs', random_state=None, refit=True,\n", + " return_train_score=False, scoring=None, verbose=0)" + ] + }, + "execution_count": 664, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "random_search.fit(features_train, train_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 665, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_bootstrap', 'param_class_weight', 'param_criterion', 'param_max_depth', 'param_min_samples_split', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])\n" + ] + } + ], + "source": [ + "print(random_search.cv_results_.keys())" + ] + }, + { + "cell_type": "code", + 
"execution_count": 666, + "metadata": {}, + "outputs": [], + "source": [ + "a1 = random_search.cv_results_['param_bootstrap']\n", + "a2 = random_search.cv_results_['param_class_weight']\n", + "a3 = random_search.cv_results_['param_criterion']\n", + "a4 = random_search.cv_results_['param_max_depth']\n", + "a5 = random_search.cv_results_['param_min_samples_split']\n", + "a6 = random_search.cv_results_['std_test_score']\n", + "\n", + "a7 = random_search.cv_results_['mean_test_score']" + ] + }, + { + "cell_type": "code", + "execution_count": 667, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2}\n" + ] + } + ], + "source": [ + "print(random_search.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": 668, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['bootstrap', 'class_weight', 'criterion', 'max_depth', 'min_samples_split', 'mean_train_score', 'mean_test_score']\n" + ] + } + ], + "source": [ + "arr = []\n", + "for key in random_search.best_params_.keys():\n", + " arr.append(key)\n", + " \n", + "arr.append('mean_train_score')\n", + "arr.append('mean_test_score')\n", + "print(arr)" + ] + }, + { + "cell_type": "code", + "execution_count": 669, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8950113378684807" + ] + }, + "execution_count": 669, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y1 = random_search.predict(features_val)\n", + "metrics.accuracy_score(y1, val_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 673, + "metadata": {}, + "outputs": [], + "source": [ + "y_hat1 = model1.predict(features_val)\n", + "y_hat2 = model2.predict(features_val)\n", + "\n", + "y_hat4 = model4.predict(features_val)\n", + "y_hat5 = model5.predict(features_val)\n", + "y_hat6 = 
model6.predict(features_val)\n", + "y_hat7 = model7.predict(features_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 676, + "metadata": {}, + "outputs": [], + "source": [ + "cm1 = MetricTools.confusion_matrix(val_y, y_hat1, 2)\n", + "cm2 = MetricTools.confusion_matrix(val_y, y_hat2, 2)\n", + "\n", + "cm4 = MetricTools.confusion_matrix(val_y, y_hat4, 2)\n", + "cm5 = MetricTools.confusion_matrix(val_y, y_hat5, 2)\n", + "cm6 = MetricTools.confusion_matrix(val_y, y_hat6, 2)\n", + "cm7 = MetricTools.confusion_matrix(val_y, y_hat7, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 677, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1946. 276.]\n", + " [ 249. 1939.]]\n", + "[[1977. 245.]\n", + " [ 266. 1922.]]\n", + "[[ 971. 1251.]\n", + " [ 249. 1939.]]\n", + "[[1848. 374.]\n", + " [ 263. 1925.]]\n", + "[[1993. 229.]\n", + " [ 172. 2016.]]\n", + "[[1452. 770.]\n", + " [ 376. 1812.]]\n" + ] + } + ], + "source": [ + "print(cm1)\n", + "print(cm2)\n", + "\n", + "print(cm4)\n", + "print(cm5)\n", + "print(cm6)\n", + "print(cm7)" + ] + }, + { + "cell_type": "code", + "execution_count": 678, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "PlotTools.confusion_matrix(cm1, list(range(2)))" + ] + }, + { + "cell_type": "code", + "execution_count": 679, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "PlotTools.confusion_matrix(cm2, list(range(2)))" + ] + }, + { + "cell_type": "code", + "execution_count": 680, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "PlotTools.confusion_matrix(cm4, list(range(2)))" + ] + }, + { + "cell_type": "code", + "execution_count": 681, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "PlotTools.confusion_matrix(cm5, list(range(2)))" + ] + }, + { + "cell_type": "code", + "execution_count": 682, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "PlotTools.confusion_matrix(cm7, list(range(2)))" + ] + }, + { + "cell_type": "code", + "execution_count": 697, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "PlotTools.confusion_matrix(cm6, list(range(2))) #xgb " + ] + }, + { + "cell_type": "code", + "execution_count": 701, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 20), (4410, 20))" + ] + }, + "execution_count": 701, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features_train1 = None\n", + "features_val1 = None\n", + "evs = []\n", + "for ft, fv in zip([pca_hist_train, pca_hog_train], \n", + " [pca_hist_val, pca_hog_val]): \n", + "# scaler = preprocessing.StandardScaler()\n", + "# ft = scaler.fit_transform(ft)\n", + "# fv = scaler.transform(fv)\n", + "# ft = preprocessing.normalize(ft)\n", + "# fv = preprocessing.normalize(fv)\n", + " if features_train1 is None:\n", + " features_train1 = ft\n", + " features_val1 = fv\n", + " else:\n", + " features_train1 = combine_features([features_train1, ft])\n", + " features_val1 = combine_features([features_val1, fv])\n", + "features_train1.shape, features_val1.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 703, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((17636, 30), (4410, 30))" + ] + }, + "execution_count": 703, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features_train2 = None\n", + "features_val2 = None\n", + "evs = []\n", + "for ft, fv in zip([pca_hist_train, pca_flat_gray_train, pca_flat_rgb_train], \n", + " [pca_hist_val, pca_flat_gray_val, pca_flat_rgb_val]): \n", + "# scaler = preprocessing.StandardScaler()\n", + "# ft = scaler.fit_transform(ft)\n", + "# fv = scaler.transform(fv)\n", + "# ft = preprocessing.normalize(ft)\n", + "# fv = preprocessing.normalize(fv)\n", + " if features_train2 is None:\n", + " features_train2 = ft\n", + " features_val2 = fv\n", + " else:\n", + " features_train2 = combine_features([features_train2, ft])\n", + " features_val2 = 
combine_features([features_val2, fv])\n", + "features_train2.shape, features_val2.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 702, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'SVM' = 0.8802721088435375\n", + "[[1927 295]\n", + " [ 233 1955]]\n", + "Recall in 'SVM' = 0.8672367236723673\n", + "Precision in 'SVM' = 0.8921296296296296\n", + "F1 Score in 'SVM' = 0.8795070743952532\n" + ] + } + ], + "source": [ + "\n", + "model1 = train_model(features_train1, train_y, model_name='SVM', validation=(features_val1, val_y))\n", + "prob1 = model1.predict_proba(features_val1)" + ] + }, + { + "cell_type": "code", + "execution_count": 704, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'SVM' = 0.7875283446712018\n", + "[[1689 533]\n", + " [ 404 1784]]\n", + "Recall in 'SVM' = 0.7601260126012601\n", + "Precision in 'SVM' = 0.8069756330625896\n", + "F1 Score in 'SVM' = 0.7828505214368482\n" + ] + } + ], + "source": [ + "model2 = train_model(features_train2, train_y, model_name='SVM', validation=(features_val2, val_y))\n", + "prob2 = model2.predict_proba(features_val2)" + ] + }, + { + "cell_type": "code", + "execution_count": 705, + "metadata": {}, + "outputs": [], + "source": [ + "prob1 = model1.predict_proba(features_val1)\n", + "prob2 = model2.predict_proba(features_val2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 706, + "metadata": {}, + "outputs": [], + "source": [ + "prob = (prob1 + prob2 ) / 2\n", + "y_hat = np.argmax(prob, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 707, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8768707482993198" + ] + }, + "execution_count": 707, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.accuracy_score(val_y, y_hat)" + ] + }, + { + "cell_type": "code", + "execution_count": 
708, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'KNN' = 0.8412698412698413\n", + "[[1809 413]\n", + " [ 287 1901]]\n", + "Recall in 'KNN' = 0.8141314131413141\n", + "Precision in 'KNN' = 0.8630725190839694\n", + "F1 Score in 'KNN' = 0.8378879110699397\n" + ] + } + ], + "source": [ + "model1 = train_model(features_train1, train_y, model_name='KNN', validation=(features_val1, val_y))\n", + "prob1 = model1.predict_proba(features_val1)" + ] + }, + { + "cell_type": "code", + "execution_count": 709, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'KNN' = 0.6727891156462585\n", + "[[1425 797]\n", + " [ 646 1542]]\n", + "Recall in 'KNN' = 0.6413141314131413\n", + "Precision in 'KNN' = 0.6880733944954128\n", + "F1 Score in 'KNN' = 0.6638714185883997\n" + ] + } + ], + "source": [ + "model2 = train_model(features_train2, train_y, model_name='KNN', validation=(features_val2, val_y))\n", + "prob2 = model2.predict_proba(features_val2)" + ] + }, + { + "cell_type": "code", + "execution_count": 710, + "metadata": {}, + "outputs": [], + "source": [ + "prob1 = model1.predict_proba(features_val1)\n", + "prob2 = model2.predict_proba(features_val2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 711, + "metadata": {}, + "outputs": [], + "source": [ + "prob = (prob1 + prob2 ) / 2\n", + "y_hat = np.argmax(prob, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 712, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8204081632653061" + ] + }, + "execution_count": 712, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.accuracy_score(val_y, y_hat)" + ] + }, + { + "cell_type": "code", + "execution_count": 713, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'RF' = 0.8802721088435375\n", 
+ "[[1954 268]\n", + " [ 260 1928]]\n", + "Recall in 'RF' = 0.8793879387938794\n", + "Precision in 'RF' = 0.8825654923215899\n", + "F1 Score in 'RF' = 0.8809738503155997\n" + ] + } + ], + "source": [ + "model1 = train_model(features_train1, train_y, model_name='RF', validation=(features_val1, val_y))\n", + "prob1 = model1.predict_proba(features_val1)" + ] + }, + { + "cell_type": "code", + "execution_count": 714, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'RF' = 0.8519274376417234\n", + "[[1806 416]\n", + " [ 237 1951]]\n", + "Recall in 'RF' = 0.8127812781278128\n", + "Precision in 'RF' = 0.8839941262848752\n", + "F1 Score in 'RF' = 0.8468933177022273\n" + ] + } + ], + "source": [ + "model2 = train_model(features_train2, train_y, model_name='RF', validation=(features_val2, val_y))\n", + "prob2 = model2.predict_proba(features_val2)" + ] + }, + { + "cell_type": "code", + "execution_count": 715, + "metadata": {}, + "outputs": [], + "source": [ + "prob1 = model1.predict_proba(features_val1)\n", + "prob2 = model2.predict_proba(features_val2)" + ] + }, + { + "cell_type": "code", + "execution_count": 716, + "metadata": {}, + "outputs": [], + "source": [ + "prob = (prob1 + prob2 ) / 2\n", + "y_hat = np.argmax(prob, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 717, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.881859410430839" + ] + }, + "execution_count": 717, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.accuracy_score(val_y, y_hat)" + ] + }, + { + "cell_type": "code", + "execution_count": 718, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'NB' = 0.654421768707483\n", + "[[ 912 1310]\n", + " [ 214 1974]]\n", + "Recall in 'NB' = 0.41044104410441046\n", + "Precision in 'NB' = 0.8099467140319716\n", + "F1 Score in 'NB' = 
0.5448028673835126\n" + ] + } + ], + "source": [ + "model1 = train_model(features_train1, train_y, model_name='NB', validation=(features_val1, val_y))\n", + "prob1 = model1.predict_proba(features_val1)" + ] + }, + { + "cell_type": "code", + "execution_count": 719, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'NB' = 0.5678004535147392\n", + "[[ 502 1720]\n", + " [ 186 2002]]\n", + "Recall in 'NB' = 0.22592259225922592\n", + "Precision in 'NB' = 0.7296511627906976\n", + "F1 Score in 'NB' = 0.3450171821305842\n" + ] + } + ], + "source": [ + "model2 = train_model(features_train2, train_y, model_name='NB', validation=(features_val2, val_y))\n", + "prob2 = model2.predict_proba(features_val2)" + ] + }, + { + "cell_type": "code", + "execution_count": 720, + "metadata": {}, + "outputs": [], + "source": [ + "prob1 = model1.predict_proba(features_val1)\n", + "prob2 = model2.predict_proba(features_val2)" + ] + }, + { + "cell_type": "code", + "execution_count": 721, + "metadata": {}, + "outputs": [], + "source": [ + "prob = (prob1 + prob2 ) / 2\n", + "y_hat = np.argmax(prob, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 722, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6219954648526077" + ] + }, + "execution_count": 722, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.accuracy_score(val_y, y_hat)" + ] + }, + { + "cell_type": "code", + "execution_count": 723, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'BAG' = 0.8834467120181406\n", + "[[1974 248]\n", + " [ 266 1922]]\n", + "Recall in 'BAG' = 0.8883888388838884\n", + "Precision in 'BAG' = 0.88125\n", + "F1 Score in 'BAG' = 0.8848050201703272\n" + ] + } + ], + "source": [ + "model1 = train_model(features_train1, train_y, model_name='BAG', validation=(features_val1, val_y))\n", + "prob1 = 
model1.predict_proba(features_val1)" + ] + }, + { + "cell_type": "code", + "execution_count": 724, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'BAG' = 0.8680272108843538\n", + "[[1874 348]\n", + " [ 234 1954]]\n", + "Recall in 'BAG' = 0.8433843384338434\n", + "Precision in 'BAG' = 0.8889943074003795\n", + "F1 Score in 'BAG' = 0.8655889145496536\n" + ] + } + ], + "source": [ + "model2 = train_model(features_train2, train_y, model_name='BAG', validation=(features_val2, val_y))\n", + "prob2 = model2.predict_proba(features_val2)" + ] + }, + { + "cell_type": "code", + "execution_count": 725, + "metadata": {}, + "outputs": [], + "source": [ + "prob1 = model1.predict_proba(features_val1)\n", + "prob2 = model2.predict_proba(features_val2)" + ] + }, + { + "cell_type": "code", + "execution_count": 726, + "metadata": {}, + "outputs": [], + "source": [ + "prob = (prob1 + prob2 ) / 2\n", + "y_hat = np.argmax(prob, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 727, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8947845804988662" + ] + }, + "execution_count": 727, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.accuracy_score(val_y, y_hat)" + ] + }, + { + "cell_type": "code", + "execution_count": 728, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Accuracy in 'XGB' = 0.9002267573696145\n", + "[[1985 237]\n", + " [ 203 1985]]\n", + "Recall in 'XGB' = 0.8933393339333934\n", + "Precision in 'XGB' = 0.9072212065813529\n", + "F1 Score in 'XGB' = 0.9002267573696145\n" + ] + } + ], + "source": [ + "model1 = train_model(features_train1, train_y, model_name='XGB', validation=(features_val1, val_y))\n", + "prob1 = model1.predict_proba(features_val1)" + ] + }, + { + "cell_type": "code", + "execution_count": 741, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Validation Accuracy in 'XGB' = 0.8886621315192744\n", + "[[1915 307]\n", + " [ 184 2004]]\n", + "Recall in 'XGB' = 0.8618361836183618\n", + "Precision in 'XGB' = 0.912339209147213\n", + "F1 Score in 'XGB' = 0.8863688960888684\n" + ] + } + ], + "source": [ + "model2 = train_model(features_train2, train_y, model_name='XGB', validation=(features_val2, val_y))\n", + "prob2 = model2.predict_proba(features_val2)" + ] + }, + { + "cell_type": "code", + "execution_count": 738, + "metadata": {}, + "outputs": [], + "source": [ + "prob1 = model1.predict_proba(features_val1)\n", + "prob2 = model2.predict_proba(features_val2)" + ] + }, + { + "cell_type": "code", + "execution_count": 739, + "metadata": {}, + "outputs": [], + "source": [ + "prob = (prob1 + prob2 ) / 2\n", + "y_hat = np.argmax(prob, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 740, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9079365079365079" + ] + }, + "execution_count": 740, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics.accuracy_score(val_y, y_hat)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "TPU", + "colab": { + "name": "mal-feat-extrac.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}