From 3fc2fa85c2373f70aadb2f4442834624180c631d Mon Sep 17 00:00:00 2001 From: Arnab Ghosh <43007068+ArnabG99@users.noreply.github.com> Date: Mon, 28 Jan 2019 19:49:49 +0530 Subject: [PATCH 1/3] Created using Colaboratory --- MLCC_NeuralNetwork(Single_Layer).ipynb | 496 +++++++++++++++++++++++++ 1 file changed, 496 insertions(+) create mode 100644 MLCC_NeuralNetwork(Single_Layer).ipynb diff --git a/MLCC_NeuralNetwork(Single_Layer).ipynb b/MLCC_NeuralNetwork(Single_Layer).ipynb new file mode 100644 index 0000000..8ee22d7 --- /dev/null +++ b/MLCC_NeuralNetwork(Single_Layer).ipynb @@ -0,0 +1,496 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "MLCC NeuralNetwork(Single Layer).ipynb", + "version": "0.3.2", + "provenance": [], + "collapsed_sections": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "metadata": { + "id": "RdfxQ2a3qrD1", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn import datasets\n", + "from pandas import DataFrame as df\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "wmOBi6uvqtzZ", + "colab_type": "code", + "outputId": "9045f72a-6eb7-4403-8624-88ae3fccc532", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + } + }, + "cell_type": "code", + "source": [ + "iris = datasets.load_iris()\n", + "x = iris.data\n", + "y = iris.target\n", + "x = np.insert(x,x.shape[1],y,axis=1)\n", + "data = pd.DataFrame(x)\n", + "\n", + "data = data.reindex(np.random.permutation(data.index))\n", + "data.head()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234
776.73.05.01.71.0
705.93.24.81.81.0
155.74.41.50.40.0
405.03.51.30.30.0
1077.32.96.31.82.0
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4\n", + "77 6.7 3.0 5.0 1.7 1.0\n", + "70 5.9 3.2 4.8 1.8 1.0\n", + "15 5.7 4.4 1.5 0.4 0.0\n", + "40 5.0 3.5 1.3 0.3 0.0\n", + "107 7.3 2.9 6.3 1.8 2.0" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 54 + } + ] + }, + { + "metadata": { + "id": "9rj7RDGJssVD", + "colab_type": "code", + "outputId": "c788248b-a8e8-4129-f306-6e6f428d91c1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 2584 + } + }, + "cell_type": "code", + "source": [ + "features = data.iloc[:,:4]\n", + "target = data.iloc[:,4]\n", + "\n", + "target = target.values.reshape((target.shape[0],1))\n", + "target_class = np.zeros(shape=(target.shape[0],3))\n", + "\n", + "for i in range(target.shape[0]):\n", + " if target[i] == 0:\n", + " target_class[i][0] = 1\n", + " elif target[i] == 1:\n", + " target_class[i][1] = 1\n", + " else:\n", + " target_class[i][2] = 1\n", + " \n", + "print(target_class.shape)\n", + "print(target_class)\n", + "\n" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(150, 3)\n", + "[[0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 
1.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [1. 0. 0.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 0. 1.]\n", + " [0. 1. 0.]\n", + " [0. 1. 0.]\n", + " [0. 1. 
0.]]\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "yBv3zOuYsArP", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def sigmoid(x):\n", + " return 1/(1+np.exp(-x))\n", + "\n", + "def der_sigmoid(x):\n", + " return (sigmoid(x)*(1-sigmoid(x)))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "9WbZu7ihvSSp", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "input_nodes = features.shape[1]\n", + "output_nodes = 3;\n", + "weight_matrix = np.random.uniform(size=(input_nodes,output_nodes))\n", + "\n", + "epoch = 2000 #increase it to increase accuracy\n", + "lr = 0.015 #learning rate decrease it to increase accuracy " + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "NJzxuP_X0IIn", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "for i in range(epoch):\n", + " output_node_input = features.dot(weight_matrix)\n", + " output = sigmoid(output_node_input)\n", + " \n", + " error = target_class - output\n", + " drv = der_sigmoid(output_node_input)\n", + " delta_weight = error*drv\n", + " \n", + " weight_matrix = weight_matrix + lr*features.T.dot(delta_weight)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "6yfIk5ky1EJt", + "colab_type": "code", + "outputId": "32cb0d02-4dd7-41db-92fe-e1fa8957ba6b", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 136 + } + }, + "cell_type": "code", + "source": [ + "final_output = np.array(output)\n", + "#print(final_output) #test here\n", + "output_class = np.zeros(shape=final_output.shape[0])\n", + "\n", + "for i in range(final_output.shape[0]):\n", + " output_class[i] = np.argmax(final_output[i])\n", + "\n", + "print(output_class) #test here\n", + "class_diff = output_class - target[:,0]" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[1. 2. 0. 0. 1. 0. 0. 1. 1. 0. 2. 2. 1. 0. 
1. 2. 0. 0. 0. 1. 0. 0. 0. 1.\n", + " 1. 1. 2. 1. 0. 2. 1. 2. 2. 0. 1. 0. 2. 2. 0. 1. 0. 0. 0. 0. 0. 0. 1. 2.\n", + " 2. 1. 0. 0. 2. 2. 0. 1. 2. 2. 0. 2. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 2. 1.\n", + " 1. 2. 2. 2. 2. 1. 1. 2. 1. 2. 2. 2. 0. 1. 1. 2. 1. 2. 2. 0. 1. 2. 1. 2.\n", + " 0. 0. 0. 1. 2. 2. 0. 1. 0. 0. 2. 2. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1.\n", + " 1. 0. 0. 2. 1. 1. 1. 1. 2. 0. 2. 1. 2. 1. 2. 2. 0. 2. 1. 0. 1. 0. 1. 2.\n", + " 2. 1. 2. 1. 2. 1.]\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "q9xP2z-dEqQC", + "colab_type": "code", + "outputId": "397bad9d-7060-4797-b494-e9584b5df4f1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 323 + } + }, + "cell_type": "code", + "source": [ + "print('Actual Class')\n", + "print(target[:,0])\n", + "print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>')\n", + "print('Predicted Class')\n", + "print(output_class)\n", + "\n", + "wrong_prediction = np.count_nonzero(class_diff)\n", + "N = len(class_diff)\n", + "#print(class_diff)\n", + "simple_accuracy = 100 * (N-wrong_prediction)/N\n", + "print('Accuracy : ', (simple_accuracy), '%')" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Actual Class\n", + "[1. 1. 0. 0. 2. 0. 0. 2. 1. 0. 2. 2. 1. 0. 1. 2. 0. 0. 0. 1. 0. 0. 0. 1.\n", + " 1. 1. 2. 1. 0. 2. 2. 2. 2. 0. 1. 0. 2. 2. 0. 2. 0. 0. 0. 0. 0. 0. 1. 2.\n", + " 2. 1. 0. 0. 2. 2. 0. 2. 2. 2. 0. 2. 0. 0. 1. 2. 0. 0. 0. 0. 1. 1. 2. 1.\n", + " 2. 2. 2. 2. 2. 1. 1. 2. 1. 2. 2. 2. 0. 1. 1. 2. 1. 2. 2. 0. 1. 1. 1. 2.\n", + " 0. 0. 0. 1. 2. 2. 0. 1. 0. 0. 2. 2. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 2. 1.\n", + " 1. 0. 0. 2. 1. 1. 1. 1. 2. 0. 2. 1. 2. 1. 2. 1. 0. 2. 1. 0. 1. 0. 1. 2.\n", + " 2. 1. 2. 1. 1. 1.]\n", + ">>>>>>>>>>>>>>>>>>>>>>>>>>>>\n", + "Predicted Class\n", + "[1. 2. 0. 0. 1. 0. 0. 1. 1. 0. 2. 2. 1. 0. 1. 2. 0. 0. 0. 1. 0. 0. 0. 1.\n", + " 1. 1. 2. 1. 0. 2. 1. 2. 2. 0. 1. 0. 2. 2. 0. 1. 0. 0. 0. 0. 0. 0. 1. 2.\n", + " 2. 1. 0. 0. 2. 2. 0. 1. 2. 2. 0. 2. 0. 0. 1. 1. 
0. 0. 0. 0. 1. 1. 2. 1.\n", + " 1. 2. 2. 2. 2. 1. 1. 2. 1. 2. 2. 2. 0. 1. 1. 2. 1. 2. 2. 0. 1. 2. 1. 2.\n", + " 0. 0. 0. 1. 2. 2. 0. 1. 0. 0. 2. 2. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1.\n", + " 1. 0. 0. 2. 1. 1. 1. 1. 2. 0. 2. 1. 2. 1. 2. 2. 0. 2. 1. 0. 1. 0. 1. 2.\n", + " 2. 1. 2. 1. 2. 1.]\n", + "Accuracy : 92.0 %\n" + ], + "name": "stdout" + } + ] + } + ] +} \ No newline at end of file From 1fac85faf9238c1614d8339e0a9102fad0ed4ea3 Mon Sep 17 00:00:00 2001 From: Arnab Ghosh <43007068+ArnabG99@users.noreply.github.com> Date: Wed, 30 Jan 2019 18:23:02 +0530 Subject: [PATCH 2/3] Created using Colaboratory --- intro_to_pandas.ipynb | 660 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 660 insertions(+) create mode 100644 intro_to_pandas.ipynb diff --git a/intro_to_pandas.ipynb b/intro_to_pandas.ipynb new file mode 100644 index 0000000..aa51c27 --- /dev/null +++ b/intro_to_pandas.ipynb @@ -0,0 +1,660 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "intro_to_pandas.ipynb", + "version": "0.3.2", + "provenance": [], + "collapsed_sections": [ + "JndnmDMp66FL", + "YHIWvc9Ms-Ll", + "TJffr5_Jwqvd" + ], + "include_colab_link": true + }, + "kernelspec": { + "name": "python2", + "display_name": "Python 2" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "metadata": { + "colab_type": "text", + "id": "JndnmDMp66FL" + }, + "cell_type": "markdown", + "source": [ + "#### Copyright 2017 Google LLC." 
+ ] + }, + { + "metadata": { + "colab_type": "code", + "id": "hMqWDc_m6rUC", + "cellView": "both", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "rHLcriKWLRe4" + }, + "cell_type": "markdown", + "source": [ + "# Intro to pandas" + ] + }, + { + "metadata": { + "colab_type": "text", + "id": "QvJBqX8_Bctk" + }, + "cell_type": "markdown", + "source": [ + "**Learning Objectives:**\n", + " * Gain an introduction to the `DataFrame` and `Series` data structures of the *pandas* library\n", + " * Access and manipulate data within a `DataFrame` and `Series`\n", + " * Import CSV data into a *pandas* `DataFrame`\n", + " * Reindex a `DataFrame` to shuffle data" + ] + }, + { + "metadata": { + "colab_type": "text", + "id": "TIFJ83ZTBctl" + }, + "cell_type": "markdown", + "source": [ + "[*pandas*](http://pandas.pydata.org/) is a column-oriented data analysis API. It's a great tool for handling and analyzing input data, and many ML frameworks support *pandas* data structures as inputs.\n", + "Although a comprehensive introduction to the *pandas* API would span many pages, the core concepts are fairly straightforward, and we'll present them below. 
For a more complete reference, the [*pandas* docs site](http://pandas.pydata.org/pandas-docs/stable/index.html) contains extensive documentation and many tutorials." + ] + }, + { + "metadata": { + "colab_type": "text", + "id": "s_JOISVgmn9v" + }, + "cell_type": "markdown", + "source": [ + "## Basic Concepts\n", + "\n", + "The following line imports the *pandas* API and prints the API version:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "aSRYu62xUi3g", + "colab": {} + }, + "cell_type": "code", + "source": [ + "from __future__ import print_function\n", + "\n", + "import pandas as pd\n", + "pd.__version__" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "daQreKXIUslr" + }, + "cell_type": "markdown", + "source": [ + "The primary data structures in *pandas* are implemented as two classes:\n", + "\n", + " * **`DataFrame`**, which you can imagine as a relational data table, with rows and named columns.\n", + " * **`Series`**, which is a single column. A `DataFrame` contains one or more `Series` and a name for each `Series`.\n", + "\n", + "The data frame is a commonly used abstraction for data manipulation. Similar implementations exist in [Spark](https://spark.apache.org/) and [R](https://www.r-project.org/about.html)." + ] + }, + { + "metadata": { + "colab_type": "text", + "id": "fjnAk1xcU0yc" + }, + "cell_type": "markdown", + "source": [ + "One way to create a `Series` is to construct a `Series` object. For example:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "DFZ42Uq7UFDj", + "colab": {} + }, + "cell_type": "code", + "source": [ + "pd.Series(['San Francisco', 'San Jose', 'Sacramento'])" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "U5ouUp1cU6pC" + }, + "cell_type": "markdown", + "source": [ + "`DataFrame` objects can be created by passing a `dict` mapping `string` column names to their respective `Series`. 
If the `Series` don't match in length, missing values are filled with special [NA/NaN](http://pandas.pydata.org/pandas-docs/stable/missing_data.html) values. Example:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "avgr6GfiUh8t", + "colab": {} + }, + "cell_type": "code", + "source": [ + "city_names = pd.Series(['San Francisco', 'San Jose', 'Sacramento'])\n", + "population = pd.Series([852469, 1015785, 485199])\n", + "\n", + "pd.DataFrame({ 'City name': city_names, 'Population': population })" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "oa5wfZT7VHJl" + }, + "cell_type": "markdown", + "source": [ + "But most of the time, you load an entire file into a `DataFrame`. The following example loads a file with California housing data. Run the following cell to load the data and create feature definitions:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "av6RYOraVG1V", + "colab": {} + }, + "cell_type": "code", + "source": [ + "california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n", + "california_housing_dataframe.describe()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "WrkBjfz5kEQu" + }, + "cell_type": "markdown", + "source": [ + "The example above used `DataFrame.describe` to show interesting statistics about a `DataFrame`. Another useful function is `DataFrame.head`, which displays the first few records of a `DataFrame`:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "s3ND3bgOkB5k", + "colab": {} + }, + "cell_type": "code", + "source": [ + "california_housing_dataframe.head()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "w9-Es5Y6laGd" + }, + "cell_type": "markdown", + "source": [ + "Another powerful feature of *pandas* is graphing. 
For example, `DataFrame.hist` lets you quickly study the distribution of values in a column:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "nqndFVXVlbPN", + "colab": {} + }, + "cell_type": "code", + "source": [ + "california_housing_dataframe.hist('housing_median_age')" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "XtYZ7114n3b-" + }, + "cell_type": "markdown", + "source": [ + "## Accessing Data\n", + "\n", + "You can access `DataFrame` data using familiar Python dict/list operations:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "_TFm7-looBFF", + "colab": {} + }, + "cell_type": "code", + "source": [ + "cities = pd.DataFrame({ 'City name': city_names, 'Population': population })\n", + "print(type(cities['City name']))\n", + "cities['City name']" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "V5L6xacLoxyv", + "colab": {} + }, + "cell_type": "code", + "source": [ + "print(type(cities['City name'][1]))\n", + "cities['City name'][1]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "gcYX1tBPugZl", + "colab": {} + }, + "cell_type": "code", + "source": [ + "print(type(cities[0:2]))\n", + "cities[0:2]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "65g1ZdGVjXsQ" + }, + "cell_type": "markdown", + "source": [ + "In addition, *pandas* provides an extremely rich API for advanced [indexing and selection](http://pandas.pydata.org/pandas-docs/stable/indexing.html) that is too extensive to be covered here." + ] + }, + { + "metadata": { + "colab_type": "text", + "id": "RM1iaD-ka3Y1" + }, + "cell_type": "markdown", + "source": [ + "## Manipulating Data\n", + "\n", + "You may apply Python's basic arithmetic operations to `Series`. 
For example:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "XWmyCFJ5bOv-", + "colab": {} + }, + "cell_type": "code", + "source": [ + "population / 1000." + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "TQzIVnbnmWGM" + }, + "cell_type": "markdown", + "source": [ + "[NumPy](http://www.numpy.org/) is a popular toolkit for scientific computing. *pandas* `Series` can be used as arguments to most NumPy functions:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "ko6pLK6JmkYP", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import numpy as np\n", + "\n", + "np.log(population)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "xmxFuQmurr6d" + }, + "cell_type": "markdown", + "source": [ + "For more complex single-column transformations, you can use `Series.apply`. Like the Python [map function](https://docs.python.org/2/library/functions.html#map), \n", + "`Series.apply` accepts as an argument a [lambda function](https://docs.python.org/2/tutorial/controlflow.html#lambda-expressions), which is applied to each value.\n", + "\n", + "The example below creates a new `Series` that indicates whether `population` is over one million:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "Fc1DvPAbstjI", + "colab": {} + }, + "cell_type": "code", + "source": [ + "population.apply(lambda val: val > 1000000)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "ZeYYLoV9b9fB" + }, + "cell_type": "markdown", + "source": [ + "\n", + "Modifying `DataFrames` is also straightforward. 
For example, the following code adds two `Series` to an existing `DataFrame`:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "0gCEX99Hb8LR", + "colab": {} + }, + "cell_type": "code", + "source": [ + "cities['Area square miles'] = pd.Series([46.87, 176.53, 97.92])\n", + "cities['Population density'] = cities['Population'] / cities['Area square miles']\n", + "cities" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "6qh63m-ayb-c" + }, + "cell_type": "markdown", + "source": [ + "## Exercise #1\n", + "\n", + "Modify the `cities` table by adding a new boolean column that is True if and only if *both* of the following are True:\n", + "\n", + " * The city is named after a saint.\n", + " * The city has an area greater than 50 square miles.\n", + "\n", + "**Note:** Boolean `Series` are combined using the bitwise, rather than the traditional boolean, operators. For example, when performing *logical and*, use `&` instead of `and`.\n", + "\n", + "**Hint:** \"San\" in Spanish means \"saint.\"" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "zCOn8ftSyddH", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Your code here" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "YHIWvc9Ms-Ll" + }, + "cell_type": "markdown", + "source": [ + "### Solution\n", + "\n", + "Click below for a solution." 
+ ] + }, + { + "metadata": { + "colab_type": "code", + "id": "T5OlrqtdtCIb", + "colab": {} + }, + "cell_type": "code", + "source": [ + "cities['Is wide and has saint name'] = (cities['Area square miles'] > 50) & cities['City name'].apply(lambda name: name.startswith('San'))\n", + "cities" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "f-xAOJeMiXFB" + }, + "cell_type": "markdown", + "source": [ + "## Indexes\n", + "Both `Series` and `DataFrame` objects also define an `index` property that assigns an identifier value to each `Series` item or `DataFrame` row. \n", + "\n", + "By default, at construction, *pandas* assigns index values that reflect the ordering of the source data. Once created, the index values are stable; that is, they do not change when data is reordered." + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "2684gsWNinq9", + "colab": {} + }, + "cell_type": "code", + "source": [ + "city_names.index" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "code", + "id": "F_qPe2TBjfWd", + "colab": {} + }, + "cell_type": "code", + "source": [ + "cities.index" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "hp2oWY9Slo_h" + }, + "cell_type": "markdown", + "source": [ + "Call `DataFrame.reindex` to manually reorder the rows. For example, the following has the same effect as sorting by city name:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "sN0zUzSAj-U1", + "colab": {} + }, + "cell_type": "code", + "source": [ + "cities.reindex([2, 0, 1])" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "-GQFz8NZuS06" + }, + "cell_type": "markdown", + "source": [ + "Reindexing is a great way to shuffle (randomize) a `DataFrame`. 
In the example below, we take the index, which is array-like, and pass it to NumPy's `random.permutation` function, which returns a shuffled copy of its values rather than shuffling them in place. Calling `reindex` with this shuffled array causes the `DataFrame` rows to be shuffled in the same way.\n", + "Try running the following cell multiple times!" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "mF8GC0k8uYhz", + "colab": {} + }, + "cell_type": "code", + "source": [ + "cities.reindex(np.random.permutation(cities.index))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "fSso35fQmGKb" + }, + "cell_type": "markdown", + "source": [ + "For more information, see the [Index documentation](http://pandas.pydata.org/pandas-docs/stable/indexing.html#index-objects)." + ] + }, + { + "metadata": { + "colab_type": "text", + "id": "8UngIdVhz8C0" + }, + "cell_type": "markdown", + "source": [ + "## Exercise #2\n", + "\n", + "The `reindex` method allows index values that are not in the original `DataFrame`'s index values. Try it and see what happens if you use such values! Why do you think this is allowed?" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "PN55GrDX0jzO", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Your code here" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "TJffr5_Jwqvd" + }, + "cell_type": "markdown", + "source": [ + "### Solution\n", + "\n", + "Click below for the solution." 
+ ] + }, + { + "metadata": { + "colab_type": "text", + "id": "8oSvi2QWwuDH" + }, + "cell_type": "markdown", + "source": [ + "If your `reindex` input array includes values not in the original `DataFrame` index values, `reindex` will add new rows for these \"missing\" indices and populate all corresponding columns with `NaN` values:" + ] + }, + { + "metadata": { + "colab_type": "code", + "id": "yBdkucKCwy4x", + "colab": {} + }, + "cell_type": "code", + "source": [ + "cities.reindex([0, 4, 5, 2])" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "colab_type": "text", + "id": "2l82PhPbwz7g" + }, + "cell_type": "markdown", + "source": [ + "This behavior is desirable because indexes are often strings pulled from the actual data (see the [*pandas* reindex\n", + "documentation](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html) for an example\n", + "in which the index values are browser names).\n", + "\n", + "In this case, allowing \"missing\" indices makes it easy to reindex using an external list, as you don't have to worry about\n", + "sanitizing the input." 
+ ] + } + ] +} \ No newline at end of file From a7693a493afff154343bbbfc426d55e2a9ad1775 Mon Sep 17 00:00:00 2001 From: Arnab Ghosh <43007068+ArnabG99@users.noreply.github.com> Date: Wed, 30 Jan 2019 18:23:37 +0530 Subject: [PATCH 3/3] Delete intro_to_pandas.ipynb --- intro_to_pandas.ipynb | 660 ------------------------------------------ 1 file changed, 660 deletions(-) delete mode 100644 intro_to_pandas.ipynb diff --git a/intro_to_pandas.ipynb b/intro_to_pandas.ipynb deleted file mode 100644 index aa51c27..0000000 --- a/intro_to_pandas.ipynb +++ /dev/null @@ -1,660 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "intro_to_pandas.ipynb", - "version": "0.3.2", - "provenance": [], - "collapsed_sections": [ - "JndnmDMp66FL", - "YHIWvc9Ms-Ll", - "TJffr5_Jwqvd" - ], - "include_colab_link": true - }, - "kernelspec": { - "name": "python2", - "display_name": "Python 2" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "JndnmDMp66FL" - }, - "cell_type": "markdown", - "source": [ - "#### Copyright 2017 Google LLC." - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "hMqWDc_m6rUC", - "cellView": "both", - "colab": {} - }, - "cell_type": "code", - "source": [ - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "rHLcriKWLRe4" - }, - "cell_type": "markdown", - "source": [ - "# Intro to pandas" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "QvJBqX8_Bctk" - }, - "cell_type": "markdown", - "source": [ - "**Learning Objectives:**\n", - " * Gain an introduction to the `DataFrame` and `Series` data structures of the *pandas* library\n", - " * Access and manipulate data within a `DataFrame` and `Series`\n", - " * Import CSV data into a *pandas* `DataFrame`\n", - " * Reindex a `DataFrame` to shuffle data" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "TIFJ83ZTBctl" - }, - "cell_type": "markdown", - "source": [ - "[*pandas*](http://pandas.pydata.org/) is a column-oriented data analysis API. It's a great tool for handling and analyzing input data, and many ML frameworks support *pandas* data structures as inputs.\n", - "Although a comprehensive introduction to the *pandas* API would span many pages, the core concepts are fairly straightforward, and we'll present them below. For a more complete reference, the [*pandas* docs site](http://pandas.pydata.org/pandas-docs/stable/index.html) contains extensive documentation and many tutorials." 
- ] - }, - { - "metadata": { - "colab_type": "text", - "id": "s_JOISVgmn9v" - }, - "cell_type": "markdown", - "source": [ - "## Basic Concepts\n", - "\n", - "The following line imports the *pandas* API and prints the API version:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "aSRYu62xUi3g", - "colab": {} - }, - "cell_type": "code", - "source": [ - "from __future__ import print_function\n", - "\n", - "import pandas as pd\n", - "pd.__version__" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "daQreKXIUslr" - }, - "cell_type": "markdown", - "source": [ - "The primary data structures in *pandas* are implemented as two classes:\n", - "\n", - " * **`DataFrame`**, which you can imagine as a relational data table, with rows and named columns.\n", - " * **`Series`**, which is a single column. A `DataFrame` contains one or more `Series` and a name for each `Series`.\n", - "\n", - "The data frame is a commonly used abstraction for data manipulation. Similar implementations exist in [Spark](https://spark.apache.org/) and [R](https://www.r-project.org/about.html)." - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "fjnAk1xcU0yc" - }, - "cell_type": "markdown", - "source": [ - "One way to create a `Series` is to construct a `Series` object. For example:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "DFZ42Uq7UFDj", - "colab": {} - }, - "cell_type": "code", - "source": [ - "pd.Series(['San Francisco', 'San Jose', 'Sacramento'])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "U5ouUp1cU6pC" - }, - "cell_type": "markdown", - "source": [ - "`DataFrame` objects can be created by passing a `dict` mapping `string` column names to their respective `Series`. If the `Series` don't match in length, missing values are filled with special [NA/NaN](http://pandas.pydata.org/pandas-docs/stable/missing_data.html) values. 
Example:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "avgr6GfiUh8t", - "colab": {} - }, - "cell_type": "code", - "source": [ - "city_names = pd.Series(['San Francisco', 'San Jose', 'Sacramento'])\n", - "population = pd.Series([852469, 1015785, 485199])\n", - "\n", - "pd.DataFrame({ 'City name': city_names, 'Population': population })" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "oa5wfZT7VHJl" - }, - "cell_type": "markdown", - "source": [ - "But most of the time, you load an entire file into a `DataFrame`. The following example loads a file with California housing data. Run the following cell to load the data and create feature definitions:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "av6RYOraVG1V", - "colab": {} - }, - "cell_type": "code", - "source": [ - "california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n", - "california_housing_dataframe.describe()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "WrkBjfz5kEQu" - }, - "cell_type": "markdown", - "source": [ - "The example above used `DataFrame.describe` to show interesting statistics about a `DataFrame`. Another useful function is `DataFrame.head`, which displays the first few records of a `DataFrame`:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "s3ND3bgOkB5k", - "colab": {} - }, - "cell_type": "code", - "source": [ - "california_housing_dataframe.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "w9-Es5Y6laGd" - }, - "cell_type": "markdown", - "source": [ - "Another powerful feature of *pandas* is graphing. 
For example, `DataFrame.hist` lets you quickly study the distribution of values in a column:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "nqndFVXVlbPN", - "colab": {} - }, - "cell_type": "code", - "source": [ - "california_housing_dataframe.hist('housing_median_age')" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "XtYZ7114n3b-" - }, - "cell_type": "markdown", - "source": [ - "## Accessing Data\n", - "\n", - "You can access `DataFrame` data using familiar Python dict/list operations:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "_TFm7-looBFF", - "colab": {} - }, - "cell_type": "code", - "source": [ - "cities = pd.DataFrame({ 'City name': city_names, 'Population': population })\n", - "print(type(cities['City name']))\n", - "cities['City name']" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "code", - "id": "V5L6xacLoxyv", - "colab": {} - }, - "cell_type": "code", - "source": [ - "print(type(cities['City name'][1]))\n", - "cities['City name'][1]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "code", - "id": "gcYX1tBPugZl", - "colab": {} - }, - "cell_type": "code", - "source": [ - "print(type(cities[0:2]))\n", - "cities[0:2]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "65g1ZdGVjXsQ" - }, - "cell_type": "markdown", - "source": [ - "In addition, *pandas* provides an extremely rich API for advanced [indexing and selection](http://pandas.pydata.org/pandas-docs/stable/indexing.html) that is too extensive to be covered here." - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "RM1iaD-ka3Y1" - }, - "cell_type": "markdown", - "source": [ - "## Manipulating Data\n", - "\n", - "You may apply Python's basic arithmetic operations to `Series`. 
For example:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "XWmyCFJ5bOv-", - "colab": {} - }, - "cell_type": "code", - "source": [ - "population / 1000." - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "TQzIVnbnmWGM" - }, - "cell_type": "markdown", - "source": [ - "[NumPy](http://www.numpy.org/) is a popular toolkit for scientific computing. *pandas* `Series` can be used as arguments to most NumPy functions:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "ko6pLK6JmkYP", - "colab": {} - }, - "cell_type": "code", - "source": [ - "import numpy as np\n", - "\n", - "np.log(population)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "xmxFuQmurr6d" - }, - "cell_type": "markdown", - "source": [ - "For more complex single-column transformations, you can use `Series.apply`. Like the Python [map function](https://docs.python.org/2/library/functions.html#map), \n", - "`Series.apply` accepts as an argument a [lambda function](https://docs.python.org/2/tutorial/controlflow.html#lambda-expressions), which is applied to each value.\n", - "\n", - "The example below creates a new `Series` that indicates whether `population` is over one million:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "Fc1DvPAbstjI", - "colab": {} - }, - "cell_type": "code", - "source": [ - "population.apply(lambda val: val > 1000000)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "ZeYYLoV9b9fB" - }, - "cell_type": "markdown", - "source": [ - "\n", - "Modifying `DataFrames` is also straightforward. 
For example, the following code adds two `Series` to an existing `DataFrame`:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "0gCEX99Hb8LR", - "colab": {} - }, - "cell_type": "code", - "source": [ - "cities['Area square miles'] = pd.Series([46.87, 176.53, 97.92])\n", - "cities['Population density'] = cities['Population'] / cities['Area square miles']\n", - "cities" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "6qh63m-ayb-c" - }, - "cell_type": "markdown", - "source": [ - "## Exercise #1\n", - "\n", - "Modify the `cities` table by adding a new boolean column that is True if and only if *both* of the following are True:\n", - "\n", - " * The city is named after a saint.\n", - " * The city has an area greater than 50 square miles.\n", - "\n", - "**Note:** Boolean `Series` are combined using the bitwise, rather than the traditional boolean, operators. For example, when performing *logical and*, use `&` instead of `and`.\n", - "\n", - "**Hint:** \"San\" in Spanish means \"saint.\"" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "zCOn8ftSyddH", - "colab": {} - }, - "cell_type": "code", - "source": [ - "# Your code here" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "YHIWvc9Ms-Ll" - }, - "cell_type": "markdown", - "source": [ - "### Solution\n", - "\n", - "Click below for a solution." 
- ] - }, - { - "metadata": { - "colab_type": "code", - "id": "T5OlrqtdtCIb", - "colab": {} - }, - "cell_type": "code", - "source": [ - "cities['Is wide and has saint name'] = (cities['Area square miles'] > 50) & cities['City name'].apply(lambda name: name.startswith('San'))\n", - "cities" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "f-xAOJeMiXFB" - }, - "cell_type": "markdown", - "source": [ - "## Indexes\n", - "Both `Series` and `DataFrame` objects also define an `index` property that assigns an identifier value to each `Series` item or `DataFrame` row. \n", - "\n", - "By default, at construction, *pandas* assigns index values that reflect the ordering of the source data. Once created, the index values are stable; that is, they do not change when data is reordered." - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "2684gsWNinq9", - "colab": {} - }, - "cell_type": "code", - "source": [ - "city_names.index" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "code", - "id": "F_qPe2TBjfWd", - "colab": {} - }, - "cell_type": "code", - "source": [ - "cities.index" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "hp2oWY9Slo_h" - }, - "cell_type": "markdown", - "source": [ - "Call `DataFrame.reindex` to manually reorder the rows. For example, the following has the same effect as sorting by city name:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "sN0zUzSAj-U1", - "colab": {} - }, - "cell_type": "code", - "source": [ - "cities.reindex([2, 0, 1])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "-GQFz8NZuS06" - }, - "cell_type": "markdown", - "source": [ - "Reindexing is a great way to shuffle (randomize) a `DataFrame`. 
In the example below, we take the index, which is array-like, and pass it to NumPy's `random.permutation` function, which returns a randomly shuffled copy of its values (the original index is left unchanged). Calling `reindex` with this shuffled array causes the `DataFrame` rows to be shuffled in the same way.\n", - "Try running the following cell multiple times!" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "mF8GC0k8uYhz", - "colab": {} - }, - "cell_type": "code", - "source": [ - "cities.reindex(np.random.permutation(cities.index))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "fSso35fQmGKb" - }, - "cell_type": "markdown", - "source": [ - "For more information, see the [Index documentation](http://pandas.pydata.org/pandas-docs/stable/indexing.html#index-objects)." - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "8UngIdVhz8C0" - }, - "cell_type": "markdown", - "source": [ - "## Exercise #2\n", - "\n", - "The `reindex` method allows index values that are not in the original `DataFrame`'s index values. Try it and see what happens if you use such values! Why do you think this is allowed?" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "PN55GrDX0jzO", - "colab": {} - }, - "cell_type": "code", - "source": [ - "# Your code here" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "TJffr5_Jwqvd" - }, - "cell_type": "markdown", - "source": [ - "### Solution\n", - "\n", - "Click below for the solution." 
- ] - }, - { - "metadata": { - "colab_type": "text", - "id": "8oSvi2QWwuDH" - }, - "cell_type": "markdown", - "source": [ - "If your `reindex` input array includes values not in the original `DataFrame` index values, `reindex` will add new rows for these \"missing\" indices and populate all corresponding columns with `NaN` values:" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "yBdkucKCwy4x", - "colab": {} - }, - "cell_type": "code", - "source": [ - "cities.reindex([0, 4, 5, 2])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "2l82PhPbwz7g" - }, - "cell_type": "markdown", - "source": [ - "This behavior is desirable because indexes are often strings pulled from the actual data (see the [*pandas* reindex\n", - "documentation](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html) for an example\n", - "in which the index values are browser names).\n", - "\n", - "In this case, allowing \"missing\" indices makes it easy to reindex using an external list, as you don't have to worry about\n", - "sanitizing the input." - ] - } - ] -} \ No newline at end of file