From 3fc2fa85c2373f70aadb2f4442834624180c631d Mon Sep 17 00:00:00 2001
From: Arnab Ghosh <43007068+ArnabG99@users.noreply.github.com>
Date: Mon, 28 Jan 2019 19:49:49 +0530
Subject: [PATCH 1/3] Created using Colaboratory
---
MLCC_NeuralNetwork(Single_Layer).ipynb | 496 +++++++++++++++++++++++++
1 file changed, 496 insertions(+)
create mode 100644 MLCC_NeuralNetwork(Single_Layer).ipynb
diff --git a/MLCC_NeuralNetwork(Single_Layer).ipynb b/MLCC_NeuralNetwork(Single_Layer).ipynb
new file mode 100644
index 0000000..8ee22d7
--- /dev/null
+++ b/MLCC_NeuralNetwork(Single_Layer).ipynb
@@ -0,0 +1,496 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "MLCC NeuralNetwork(Single Layer).ipynb",
+ "version": "0.3.2",
+ "provenance": [],
+ "collapsed_sections": [],
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "metadata": {
+ "id": "RdfxQ2a3qrD1",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn import datasets\n",
+ "from pandas import DataFrame as df\n"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "wmOBi6uvqtzZ",
+ "colab_type": "code",
+ "outputId": "9045f72a-6eb7-4403-8624-88ae3fccc532",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 204
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "iris = datasets.load_iris()\n",
+ "x = iris.data\n",
+ "y = iris.target\n",
+ "x = np.insert(x,x.shape[1],y,axis=1)\n",
+ "data = pd.DataFrame(x)\n",
+ "\n",
+ "data = data.reindex(np.random.permutation(data.index))\n",
+ "data.head()"
+ ],
+ "execution_count": 0,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 77 | \n",
+ " 6.7 | \n",
+ " 3.0 | \n",
+ " 5.0 | \n",
+ " 1.7 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 70 | \n",
+ " 5.9 | \n",
+ " 3.2 | \n",
+ " 4.8 | \n",
+ " 1.8 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 5.7 | \n",
+ " 4.4 | \n",
+ " 1.5 | \n",
+ " 0.4 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 40 | \n",
+ " 5.0 | \n",
+ " 3.5 | \n",
+ " 1.3 | \n",
+ " 0.3 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 107 | \n",
+ " 7.3 | \n",
+ " 2.9 | \n",
+ " 6.3 | \n",
+ " 1.8 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4\n",
+ "77 6.7 3.0 5.0 1.7 1.0\n",
+ "70 5.9 3.2 4.8 1.8 1.0\n",
+ "15 5.7 4.4 1.5 0.4 0.0\n",
+ "40 5.0 3.5 1.3 0.3 0.0\n",
+ "107 7.3 2.9 6.3 1.8 2.0"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 54
+ }
+ ]
+ },
+ {
+ "metadata": {
+ "id": "9rj7RDGJssVD",
+ "colab_type": "code",
+ "outputId": "c788248b-a8e8-4129-f306-6e6f428d91c1",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 2584
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "features = data.iloc[:,:4]\n",
+ "target = data.iloc[:,4]\n",
+ "\n",
+ "target = target.values.reshape((target.shape[0],1))\n",
+ "target_class = np.zeros(shape=(target.shape[0],3))\n",
+ "\n",
+ "for i in range(target.shape[0]):\n",
+ " if target[i] == 0:\n",
+ " target_class[i][0] = 1\n",
+ " elif target[i] == 1:\n",
+ " target_class[i][1] = 1\n",
+ " else:\n",
+ " target_class[i][2] = 1\n",
+ " \n",
+ "print(target_class.shape)\n",
+ "print(target_class)\n",
+ "\n"
+ ],
+ "execution_count": 0,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "(150, 3)\n",
+ "[[0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 1. 0.]]\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "metadata": {
+ "id": "yBv3zOuYsArP",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "def sigmoid(x):\n",
+ " return 1/(1+np.exp(-x))\n",
+ "\n",
+ "def der_sigmoid(x):\n",
+ " return (sigmoid(x)*(1-sigmoid(x)))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "9WbZu7ihvSSp",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "input_nodes = features.shape[1]\n",
+ "output_nodes = 3;\n",
+ "weight_matrix = np.random.uniform(size=(input_nodes,output_nodes))\n",
+ "\n",
+ "epoch = 2000 # number of training epochs; increasing it may improve accuracy\n",
+ "lr = 0.015 # learning rate; decreasing it may improve accuracy"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "NJzxuP_X0IIn",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "for i in range(epoch):\n",
+ " output_node_input = features.dot(weight_matrix)\n",
+ " output = sigmoid(output_node_input)\n",
+ " \n",
+ " error = target_class - output\n",
+ " drv = der_sigmoid(output_node_input)\n",
+ " delta_weight = error*drv\n",
+ " \n",
+ " weight_matrix = weight_matrix + lr*features.T.dot(delta_weight)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "6yfIk5ky1EJt",
+ "colab_type": "code",
+ "outputId": "32cb0d02-4dd7-41db-92fe-e1fa8957ba6b",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 136
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "final_output = np.array(output)\n",
+ "#print(final_output) #test here\n",
+ "output_class = np.zeros(shape=final_output.shape[0])\n",
+ "\n",
+ "for i in range(final_output.shape[0]):\n",
+ " output_class[i] = np.argmax(final_output[i])\n",
+ "\n",
+ "print(output_class) #test here\n",
+ "class_diff = output_class - target[:,0]"
+ ],
+ "execution_count": 0,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "[1. 2. 0. 0. 1. 0. 0. 1. 1. 0. 2. 2. 1. 0. 1. 2. 0. 0. 0. 1. 0. 0. 0. 1.\n",
+ " 1. 1. 2. 1. 0. 2. 1. 2. 2. 0. 1. 0. 2. 2. 0. 1. 0. 0. 0. 0. 0. 0. 1. 2.\n",
+ " 2. 1. 0. 0. 2. 2. 0. 1. 2. 2. 0. 2. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 2. 1.\n",
+ " 1. 2. 2. 2. 2. 1. 1. 2. 1. 2. 2. 2. 0. 1. 1. 2. 1. 2. 2. 0. 1. 2. 1. 2.\n",
+ " 0. 0. 0. 1. 2. 2. 0. 1. 0. 0. 2. 2. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1.\n",
+ " 1. 0. 0. 2. 1. 1. 1. 1. 2. 0. 2. 1. 2. 1. 2. 2. 0. 2. 1. 0. 1. 0. 1. 2.\n",
+ " 2. 1. 2. 1. 2. 1.]\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "metadata": {
+ "id": "q9xP2z-dEqQC",
+ "colab_type": "code",
+ "outputId": "397bad9d-7060-4797-b494-e9584b5df4f1",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 323
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "print('Actual Class')\n",
+ "print(target[:,0])\n",
+ "print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>')\n",
+ "print('Predicted Class')\n",
+ "print(output_class)\n",
+ "\n",
+ "wrong_prediction = np.count_nonzero(class_diff)\n",
+ "N = len(class_diff)\n",
+ "#print(class_diff)\n",
+ "simple_accuracy = 100 * (N-wrong_prediction)/N\n",
+ "print('Accuracy : ', (simple_accuracy), '%')"
+ ],
+ "execution_count": 0,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "Actual Class\n",
+ "[1. 1. 0. 0. 2. 0. 0. 2. 1. 0. 2. 2. 1. 0. 1. 2. 0. 0. 0. 1. 0. 0. 0. 1.\n",
+ " 1. 1. 2. 1. 0. 2. 2. 2. 2. 0. 1. 0. 2. 2. 0. 2. 0. 0. 0. 0. 0. 0. 1. 2.\n",
+ " 2. 1. 0. 0. 2. 2. 0. 2. 2. 2. 0. 2. 0. 0. 1. 2. 0. 0. 0. 0. 1. 1. 2. 1.\n",
+ " 2. 2. 2. 2. 2. 1. 1. 2. 1. 2. 2. 2. 0. 1. 1. 2. 1. 2. 2. 0. 1. 1. 1. 2.\n",
+ " 0. 0. 0. 1. 2. 2. 0. 1. 0. 0. 2. 2. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 2. 1.\n",
+ " 1. 0. 0. 2. 1. 1. 1. 1. 2. 0. 2. 1. 2. 1. 2. 1. 0. 2. 1. 0. 1. 0. 1. 2.\n",
+ " 2. 1. 2. 1. 1. 1.]\n",
+ ">>>>>>>>>>>>>>>>>>>>>>>>>>>>\n",
+ "Predicted Class\n",
+ "[1. 2. 0. 0. 1. 0. 0. 1. 1. 0. 2. 2. 1. 0. 1. 2. 0. 0. 0. 1. 0. 0. 0. 1.\n",
+ " 1. 1. 2. 1. 0. 2. 1. 2. 2. 0. 1. 0. 2. 2. 0. 1. 0. 0. 0. 0. 0. 0. 1. 2.\n",
+ " 2. 1. 0. 0. 2. 2. 0. 1. 2. 2. 0. 2. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 2. 1.\n",
+ " 1. 2. 2. 2. 2. 1. 1. 2. 1. 2. 2. 2. 0. 1. 1. 2. 1. 2. 2. 0. 1. 2. 1. 2.\n",
+ " 0. 0. 0. 1. 2. 2. 0. 1. 0. 0. 2. 2. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1.\n",
+ " 1. 0. 0. 2. 1. 1. 1. 1. 2. 0. 2. 1. 2. 1. 2. 2. 0. 2. 1. 0. 1. 0. 1. 2.\n",
+ " 2. 1. 2. 1. 2. 1.]\n",
+ "Accuracy : 92.0 %\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
From 1fac85faf9238c1614d8339e0a9102fad0ed4ea3 Mon Sep 17 00:00:00 2001
From: Arnab Ghosh <43007068+ArnabG99@users.noreply.github.com>
Date: Wed, 30 Jan 2019 18:23:02 +0530
Subject: [PATCH 2/3] Created using Colaboratory
---
intro_to_pandas.ipynb | 660 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 660 insertions(+)
create mode 100644 intro_to_pandas.ipynb
diff --git a/intro_to_pandas.ipynb b/intro_to_pandas.ipynb
new file mode 100644
index 0000000..aa51c27
--- /dev/null
+++ b/intro_to_pandas.ipynb
@@ -0,0 +1,660 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "intro_to_pandas.ipynb",
+ "version": "0.3.2",
+ "provenance": [],
+ "collapsed_sections": [
+ "JndnmDMp66FL",
+ "YHIWvc9Ms-Ll",
+ "TJffr5_Jwqvd"
+ ],
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python2",
+ "display_name": "Python 2"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "JndnmDMp66FL"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "#### Copyright 2017 Google LLC."
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "hMqWDc_m6rUC",
+ "cellView": "both",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "rHLcriKWLRe4"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "# Intro to pandas"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "QvJBqX8_Bctk"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "**Learning Objectives:**\n",
+ " * Gain an introduction to the `DataFrame` and `Series` data structures of the *pandas* library\n",
+ " * Access and manipulate data within a `DataFrame` and `Series`\n",
+ " * Import CSV data into a *pandas* `DataFrame`\n",
+ " * Reindex a `DataFrame` to shuffle data"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "TIFJ83ZTBctl"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "[*pandas*](http://pandas.pydata.org/) is a column-oriented data analysis API. It's a great tool for handling and analyzing input data, and many ML frameworks support *pandas* data structures as inputs.\n",
+ "Although a comprehensive introduction to the *pandas* API would span many pages, the core concepts are fairly straightforward, and we'll present them below. For a more complete reference, the [*pandas* docs site](http://pandas.pydata.org/pandas-docs/stable/index.html) contains extensive documentation and many tutorials."
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "s_JOISVgmn9v"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Basic Concepts\n",
+ "\n",
+ "The following line imports the *pandas* API and prints the API version:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "aSRYu62xUi3g",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "from __future__ import print_function\n",
+ "\n",
+ "import pandas as pd\n",
+ "pd.__version__"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "daQreKXIUslr"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "The primary data structures in *pandas* are implemented as two classes:\n",
+ "\n",
+ " * **`DataFrame`**, which you can imagine as a relational data table, with rows and named columns.\n",
+ " * **`Series`**, which is a single column. A `DataFrame` contains one or more `Series` and a name for each `Series`.\n",
+ "\n",
+ "The data frame is a commonly used abstraction for data manipulation. Similar implementations exist in [Spark](https://spark.apache.org/) and [R](https://www.r-project.org/about.html)."
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "fjnAk1xcU0yc"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "One way to create a `Series` is to construct a `Series` object. For example:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "DFZ42Uq7UFDj",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "pd.Series(['San Francisco', 'San Jose', 'Sacramento'])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "U5ouUp1cU6pC"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "`DataFrame` objects can be created by passing a `dict` mapping `string` column names to their respective `Series`. If the `Series` don't match in length, missing values are filled with special [NA/NaN](http://pandas.pydata.org/pandas-docs/stable/missing_data.html) values. Example:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "avgr6GfiUh8t",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "city_names = pd.Series(['San Francisco', 'San Jose', 'Sacramento'])\n",
+ "population = pd.Series([852469, 1015785, 485199])\n",
+ "\n",
+ "pd.DataFrame({ 'City name': city_names, 'Population': population })"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "oa5wfZT7VHJl"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "But most of the time, you load an entire file into a `DataFrame`. The following example loads a file with California housing data. Run the following cell to load the data and create feature definitions:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "av6RYOraVG1V",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n",
+ "california_housing_dataframe.describe()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "WrkBjfz5kEQu"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "The example above used `DataFrame.describe` to show interesting statistics about a `DataFrame`. Another useful function is `DataFrame.head`, which displays the first few records of a `DataFrame`:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "s3ND3bgOkB5k",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "california_housing_dataframe.head()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "w9-Es5Y6laGd"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "Another powerful feature of *pandas* is graphing. For example, `DataFrame.hist` lets you quickly study the distribution of values in a column:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "nqndFVXVlbPN",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "california_housing_dataframe.hist('housing_median_age')"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "XtYZ7114n3b-"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Accessing Data\n",
+ "\n",
+ "You can access `DataFrame` data using familiar Python dict/list operations:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "_TFm7-looBFF",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "cities = pd.DataFrame({ 'City name': city_names, 'Population': population })\n",
+ "print(type(cities['City name']))\n",
+ "cities['City name']"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "V5L6xacLoxyv",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "print(type(cities['City name'][1]))\n",
+ "cities['City name'][1]"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "gcYX1tBPugZl",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "print(type(cities[0:2]))\n",
+ "cities[0:2]"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "65g1ZdGVjXsQ"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "In addition, *pandas* provides an extremely rich API for advanced [indexing and selection](http://pandas.pydata.org/pandas-docs/stable/indexing.html) that is too extensive to be covered here."
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "RM1iaD-ka3Y1"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Manipulating Data\n",
+ "\n",
+ "You may apply Python's basic arithmetic operations to `Series`. For example:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "XWmyCFJ5bOv-",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "population / 1000."
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "TQzIVnbnmWGM"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "[NumPy](http://www.numpy.org/) is a popular toolkit for scientific computing. *pandas* `Series` can be used as arguments to most NumPy functions:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "ko6pLK6JmkYP",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "np.log(population)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "xmxFuQmurr6d"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "For more complex single-column transformations, you can use `Series.apply`. Like the Python [map function](https://docs.python.org/2/library/functions.html#map), \n",
+ "`Series.apply` accepts as an argument a [lambda function](https://docs.python.org/2/tutorial/controlflow.html#lambda-expressions), which is applied to each value.\n",
+ "\n",
+ "The example below creates a new `Series` that indicates whether `population` is over one million:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "Fc1DvPAbstjI",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "population.apply(lambda val: val > 1000000)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "ZeYYLoV9b9fB"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "Modifying `DataFrames` is also straightforward. For example, the following code adds two `Series` to an existing `DataFrame`:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "0gCEX99Hb8LR",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "cities['Area square miles'] = pd.Series([46.87, 176.53, 97.92])\n",
+ "cities['Population density'] = cities['Population'] / cities['Area square miles']\n",
+ "cities"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "6qh63m-ayb-c"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Exercise #1\n",
+ "\n",
+ "Modify the `cities` table by adding a new boolean column that is True if and only if *both* of the following are True:\n",
+ "\n",
+ " * The city is named after a saint.\n",
+ " * The city has an area greater than 50 square miles.\n",
+ "\n",
+ "**Note:** Boolean `Series` are combined using the bitwise, rather than the traditional boolean, operators. For example, when performing *logical and*, use `&` instead of `and`.\n",
+ "\n",
+ "**Hint:** \"San\" in Spanish means \"saint.\""
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "zCOn8ftSyddH",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "# Your code here"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "YHIWvc9Ms-Ll"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for a solution."
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "T5OlrqtdtCIb",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "cities['Is wide and has saint name'] = (cities['Area square miles'] > 50) & cities['City name'].apply(lambda name: name.startswith('San'))\n",
+ "cities"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "f-xAOJeMiXFB"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Indexes\n",
+ "Both `Series` and `DataFrame` objects also define an `index` property that assigns an identifier value to each `Series` item or `DataFrame` row. \n",
+ "\n",
+ "By default, at construction, *pandas* assigns index values that reflect the ordering of the source data. Once created, the index values are stable; that is, they do not change when data is reordered."
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "2684gsWNinq9",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "city_names.index"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "F_qPe2TBjfWd",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "cities.index"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "hp2oWY9Slo_h"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "Call `DataFrame.reindex` to manually reorder the rows. For example, the following has the same effect as sorting by city name:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "sN0zUzSAj-U1",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "cities.reindex([2, 0, 1])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "-GQFz8NZuS06"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "Reindexing is a great way to shuffle (randomize) a `DataFrame`. In the example below, we take the index, which is array-like, and pass it to NumPy's `random.permutation` function, which shuffles its values in place. Calling `reindex` with this shuffled array causes the `DataFrame` rows to be shuffled in the same way.\n",
+ "Try running the following cell multiple times!"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "mF8GC0k8uYhz",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "cities.reindex(np.random.permutation(cities.index))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "fSso35fQmGKb"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "For more information, see the [Index documentation](http://pandas.pydata.org/pandas-docs/stable/indexing.html#index-objects)."
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "8UngIdVhz8C0"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Exercise #2\n",
+ "\n",
+ "The `reindex` method allows index values that are not in the original `DataFrame`'s index values. Try it and see what happens if you use such values! Why do you think this is allowed?"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "PN55GrDX0jzO",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "# Your code here"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "TJffr5_Jwqvd"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for the solution."
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "8oSvi2QWwuDH"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "If your `reindex` input array includes values not in the original `DataFrame` index values, `reindex` will add new rows for these \"missing\" indices and populate all corresponding columns with `NaN` values:"
+ ]
+ },
+ {
+ "metadata": {
+ "colab_type": "code",
+ "id": "yBdkucKCwy4x",
+ "colab": {}
+ },
+ "cell_type": "code",
+ "source": [
+ "cities.reindex([0, 4, 5, 2])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "colab_type": "text",
+ "id": "2l82PhPbwz7g"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "This behavior is desirable because indexes are often strings pulled from the actual data (see the [*pandas* reindex\n",
+ "documentation](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html) for an example\n",
+ "in which the index values are browser names).\n",
+ "\n",
+ "In this case, allowing \"missing\" indices makes it easy to reindex using an external list, as you don't have to worry about\n",
+ "sanitizing the input."
+ ]
+ }
+ ]
+}
\ No newline at end of file
From a7693a493afff154343bbbfc426d55e2a9ad1775 Mon Sep 17 00:00:00 2001
From: Arnab Ghosh <43007068+ArnabG99@users.noreply.github.com>
Date: Wed, 30 Jan 2019 18:23:37 +0530
Subject: [PATCH 3/3] Delete intro_to_pandas.ipynb
---
intro_to_pandas.ipynb | 660 ------------------------------------------
1 file changed, 660 deletions(-)
delete mode 100644 intro_to_pandas.ipynb
diff --git a/intro_to_pandas.ipynb b/intro_to_pandas.ipynb
deleted file mode 100644
index aa51c27..0000000
--- a/intro_to_pandas.ipynb
+++ /dev/null
@@ -1,660 +0,0 @@
-{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "name": "intro_to_pandas.ipynb",
- "version": "0.3.2",
- "provenance": [],
- "collapsed_sections": [
- "JndnmDMp66FL",
- "YHIWvc9Ms-Ll",
- "TJffr5_Jwqvd"
- ],
- "include_colab_link": true
- },
- "kernelspec": {
- "name": "python2",
- "display_name": "Python 2"
- }
- },
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "view-in-github",
- "colab_type": "text"
- },
- "source": [
- "
"
- ]
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "JndnmDMp66FL"
- },
- "cell_type": "markdown",
- "source": [
- "#### Copyright 2017 Google LLC."
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "hMqWDc_m6rUC",
- "cellView": "both",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
- "# you may not use this file except in compliance with the License.\n",
- "# You may obtain a copy of the License at\n",
- "#\n",
- "# https://www.apache.org/licenses/LICENSE-2.0\n",
- "#\n",
- "# Unless required by applicable law or agreed to in writing, software\n",
- "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
- "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
- "# See the License for the specific language governing permissions and\n",
- "# limitations under the License."
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "rHLcriKWLRe4"
- },
- "cell_type": "markdown",
- "source": [
- "# Intro to pandas"
- ]
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "QvJBqX8_Bctk"
- },
- "cell_type": "markdown",
- "source": [
- "**Learning Objectives:**\n",
- " * Gain an introduction to the `DataFrame` and `Series` data structures of the *pandas* library\n",
- " * Access and manipulate data within a `DataFrame` and `Series`\n",
- " * Import CSV data into a *pandas* `DataFrame`\n",
- " * Reindex a `DataFrame` to shuffle data"
- ]
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "TIFJ83ZTBctl"
- },
- "cell_type": "markdown",
- "source": [
- "[*pandas*](http://pandas.pydata.org/) is a column-oriented data analysis API. It's a great tool for handling and analyzing input data, and many ML frameworks support *pandas* data structures as inputs.\n",
- "Although a comprehensive introduction to the *pandas* API would span many pages, the core concepts are fairly straightforward, and we'll present them below. For a more complete reference, the [*pandas* docs site](http://pandas.pydata.org/pandas-docs/stable/index.html) contains extensive documentation and many tutorials."
- ]
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "s_JOISVgmn9v"
- },
- "cell_type": "markdown",
- "source": [
- "## Basic Concepts\n",
- "\n",
- "The following line imports the *pandas* API and prints the API version:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "aSRYu62xUi3g",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "from __future__ import print_function\n",
- "\n",
- "import pandas as pd\n",
- "pd.__version__"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "daQreKXIUslr"
- },
- "cell_type": "markdown",
- "source": [
- "The primary data structures in *pandas* are implemented as two classes:\n",
- "\n",
- " * **`DataFrame`**, which you can imagine as a relational data table, with rows and named columns.\n",
- " * **`Series`**, which is a single column. A `DataFrame` contains one or more `Series` and a name for each `Series`.\n",
- "\n",
- "The data frame is a commonly used abstraction for data manipulation. Similar implementations exist in [Spark](https://spark.apache.org/) and [R](https://www.r-project.org/about.html)."
- ]
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "fjnAk1xcU0yc"
- },
- "cell_type": "markdown",
- "source": [
- "One way to create a `Series` is to construct a `Series` object. For example:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "DFZ42Uq7UFDj",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "pd.Series(['San Francisco', 'San Jose', 'Sacramento'])"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "U5ouUp1cU6pC"
- },
- "cell_type": "markdown",
- "source": [
- "`DataFrame` objects can be created by passing a `dict` mapping `string` column names to their respective `Series`. If the `Series` don't match in length, missing values are filled with special [NA/NaN](http://pandas.pydata.org/pandas-docs/stable/missing_data.html) values. Example:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "avgr6GfiUh8t",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "city_names = pd.Series(['San Francisco', 'San Jose', 'Sacramento'])\n",
- "population = pd.Series([852469, 1015785, 485199])\n",
- "\n",
- "pd.DataFrame({ 'City name': city_names, 'Population': population })"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "oa5wfZT7VHJl"
- },
- "cell_type": "markdown",
- "source": [
- "But most of the time, you load an entire file into a `DataFrame`. The following example loads a file with California housing data. Run the following cell to load the data and create feature definitions:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "av6RYOraVG1V",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n",
- "california_housing_dataframe.describe()"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "WrkBjfz5kEQu"
- },
- "cell_type": "markdown",
- "source": [
- "The example above used `DataFrame.describe` to show interesting statistics about a `DataFrame`. Another useful function is `DataFrame.head`, which displays the first few records of a `DataFrame`:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "s3ND3bgOkB5k",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "california_housing_dataframe.head()"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "w9-Es5Y6laGd"
- },
- "cell_type": "markdown",
- "source": [
- "Another powerful feature of *pandas* is graphing. For example, `DataFrame.hist` lets you quickly study the distribution of values in a column:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "nqndFVXVlbPN",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "california_housing_dataframe.hist('housing_median_age')"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "XtYZ7114n3b-"
- },
- "cell_type": "markdown",
- "source": [
- "## Accessing Data\n",
- "\n",
- "You can access `DataFrame` data using familiar Python dict/list operations:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "_TFm7-looBFF",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "cities = pd.DataFrame({ 'City name': city_names, 'Population': population })\n",
- "print(type(cities['City name']))\n",
- "cities['City name']"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "V5L6xacLoxyv",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "print(type(cities['City name'][1]))\n",
- "cities['City name'][1]"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "gcYX1tBPugZl",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "print(type(cities[0:2]))\n",
- "cities[0:2]"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "65g1ZdGVjXsQ"
- },
- "cell_type": "markdown",
- "source": [
- "In addition, *pandas* provides an extremely rich API for advanced [indexing and selection](http://pandas.pydata.org/pandas-docs/stable/indexing.html) that is too extensive to be covered here."
- ]
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "RM1iaD-ka3Y1"
- },
- "cell_type": "markdown",
- "source": [
- "## Manipulating Data\n",
- "\n",
- "You may apply Python's basic arithmetic operations to `Series`. For example:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "XWmyCFJ5bOv-",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "population / 1000."
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "TQzIVnbnmWGM"
- },
- "cell_type": "markdown",
- "source": [
- "[NumPy](http://www.numpy.org/) is a popular toolkit for scientific computing. *pandas* `Series` can be used as arguments to most NumPy functions:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "ko6pLK6JmkYP",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "import numpy as np\n",
- "\n",
- "np.log(population)"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "xmxFuQmurr6d"
- },
- "cell_type": "markdown",
- "source": [
- "For more complex single-column transformations, you can use `Series.apply`. Like the Python [map function](https://docs.python.org/2/library/functions.html#map), \n",
- "`Series.apply` accepts as an argument a [lambda function](https://docs.python.org/2/tutorial/controlflow.html#lambda-expressions), which is applied to each value.\n",
- "\n",
- "The example below creates a new `Series` that indicates whether `population` is over one million:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "Fc1DvPAbstjI",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "population.apply(lambda val: val > 1000000)"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "ZeYYLoV9b9fB"
- },
- "cell_type": "markdown",
- "source": [
- "\n",
- "Modifying `DataFrames` is also straightforward. For example, the following code adds two `Series` to an existing `DataFrame`:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "0gCEX99Hb8LR",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "cities['Area square miles'] = pd.Series([46.87, 176.53, 97.92])\n",
- "cities['Population density'] = cities['Population'] / cities['Area square miles']\n",
- "cities"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "6qh63m-ayb-c"
- },
- "cell_type": "markdown",
- "source": [
- "## Exercise #1\n",
- "\n",
- "Modify the `cities` table by adding a new boolean column that is True if and only if *both* of the following are True:\n",
- "\n",
- " * The city is named after a saint.\n",
- " * The city has an area greater than 50 square miles.\n",
- "\n",
- "**Note:** Boolean `Series` are combined using the bitwise, rather than the traditional boolean, operators. For example, when performing *logical and*, use `&` instead of `and`.\n",
- "\n",
- "**Hint:** \"San\" in Spanish means \"saint.\""
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "zCOn8ftSyddH",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "# Your code here"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "YHIWvc9Ms-Ll"
- },
- "cell_type": "markdown",
- "source": [
- "### Solution\n",
- "\n",
- "Click below for a solution."
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "T5OlrqtdtCIb",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "cities['Is wide and has saint name'] = (cities['Area square miles'] > 50) & cities['City name'].apply(lambda name: name.startswith('San'))\n",
- "cities"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "f-xAOJeMiXFB"
- },
- "cell_type": "markdown",
- "source": [
- "## Indexes\n",
- "Both `Series` and `DataFrame` objects also define an `index` property that assigns an identifier value to each `Series` item or `DataFrame` row. \n",
- "\n",
- "By default, at construction, *pandas* assigns index values that reflect the ordering of the source data. Once created, the index values are stable; that is, they do not change when data is reordered."
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "2684gsWNinq9",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "city_names.index"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "F_qPe2TBjfWd",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "cities.index"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "hp2oWY9Slo_h"
- },
- "cell_type": "markdown",
- "source": [
- "Call `DataFrame.reindex` to manually reorder the rows. For example, the following has the same effect as sorting by city name:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "sN0zUzSAj-U1",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "cities.reindex([2, 0, 1])"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "-GQFz8NZuS06"
- },
- "cell_type": "markdown",
- "source": [
- "Reindexing is a great way to shuffle (randomize) a `DataFrame`. In the example below, we take the index, which is array-like, and pass it to NumPy's `random.permutation` function, which shuffles its values in place. Calling `reindex` with this shuffled array causes the `DataFrame` rows to be shuffled in the same way.\n",
- "Try running the following cell multiple times!"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "mF8GC0k8uYhz",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "cities.reindex(np.random.permutation(cities.index))"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "fSso35fQmGKb"
- },
- "cell_type": "markdown",
- "source": [
- "For more information, see the [Index documentation](http://pandas.pydata.org/pandas-docs/stable/indexing.html#index-objects)."
- ]
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "8UngIdVhz8C0"
- },
- "cell_type": "markdown",
- "source": [
- "## Exercise #2\n",
- "\n",
- "The `reindex` method allows index values that are not in the original `DataFrame`'s index values. Try it and see what happens if you use such values! Why do you think this is allowed?"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "PN55GrDX0jzO",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "# Your code here"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "TJffr5_Jwqvd"
- },
- "cell_type": "markdown",
- "source": [
- "### Solution\n",
- "\n",
- "Click below for the solution."
- ]
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "8oSvi2QWwuDH"
- },
- "cell_type": "markdown",
- "source": [
- "If your `reindex` input array includes values not in the original `DataFrame` index values, `reindex` will add new rows for these \"missing\" indices and populate all corresponding columns with `NaN` values:"
- ]
- },
- {
- "metadata": {
- "colab_type": "code",
- "id": "yBdkucKCwy4x",
- "colab": {}
- },
- "cell_type": "code",
- "source": [
- "cities.reindex([0, 4, 5, 2])"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "metadata": {
- "colab_type": "text",
- "id": "2l82PhPbwz7g"
- },
- "cell_type": "markdown",
- "source": [
- "This behavior is desirable because indexes are often strings pulled from the actual data (see the [*pandas* reindex\n",
- "documentation](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html) for an example\n",
- "in which the index values are browser names).\n",
- "\n",
- "In this case, allowing \"missing\" indices makes it easy to reindex using an external list, as you don't have to worry about\n",
- "sanitizing the input."
- ]
- }
- ]
-}
\ No newline at end of file