diff --git a/Done_Challenge__Day2.ipynb b/Done_Challenge__Day2.ipynb new file mode 100644 index 0000000..5c0e597 --- /dev/null +++ b/Done_Challenge__Day2.ipynb @@ -0,0 +1,628 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Done_Challenge_ Day2.ipynb", + "provenance": [], + "collapsed_sections": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#Done Project: Data Minining Project for X company" + ], + "metadata": { + "id": "zroHHWfG7V2M" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zDwep1K8Erxl" + }, + "source": [ + "**Project:** Data Minining Project for X company" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "JzIu-UWIDXHw" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d7-ii3uyI8KY" + }, + "source": [ + "The CRISP-DM Framework\n", + "\n", + "\n", + "The CRISP-DM methodology provides a structured approach to planning a data mining project. 
It is a robust and well-proven methodology.\n", + "* Business understanding (BU): Determine Business Objectives, Assess Situation, Determine Data Mining Goals, Produce Project Plan\n", + "\n", + "* Data understanding (DU): Collect Initial Data, Describe Data, Explore Data, Verify Data Quality\n", + "\n", + "* Data preparation (DP): Select Data, Clean Data, Construct Data, Integrate Data\n", + "\n", + "* Modeling (M): Select modeling technique, Generate Test Design, Build Model, Assess Model\n", + "* Evaluation (E): Evaluate Results, Review Process, Determine Next Steps\n", + "* Deployment (D): Plan Deployment, Plan Monitoring and Maintenance, Produce Final Report, Review Project\n", + "\n", + "\n", + "References:\n", + "\n", + "[What is the CRISP-DM methodology?](https://www.sv-europe.com/crisp-dm-methodology/)\n", + "\n", + "[Introduction to CRISP DM Framework for Data Science and Machine Learning](https://www.linkedin.com/pulse/chapter-1-introduction-crisp-dm-framework-data-science-anshul-roy/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5lo7Ml7tMQOf" + }, + "source": [ + "**Data Set**\n", + "### The data is for company X which is trying to control attrition. \n", + "### There are two sets of data: \"Existing employees\" and \"Employees who have left\". 
The following attributes are available for every employee.\n", + "\n", + "\n", + "* Satisfaction Level\n", + "\n", + "* Last evaluation\n", + "\n", + "* Number of projects\n", + "\n", + "* Average monthly hours\n", + "\n", + "* Time spent at the company\n", + "* Whether they have had a work accident\n", + "\n", + "\n", + "* Whether they have had a promotion in the last 5 years\n", + "\n", + "\n", + "* Departments (column sales)\n", + "\n", + "\n", + "* Salary\n", + "\n", + "\n", + "* Whether the employee has left\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sjSj2A2sSph_" + }, + "source": [ + "**Your Role**\n", + " \n", + "\n", + "* As data science team member X company asked you to answer this two questions.\n", + "* What type of employees is leaving? \n", + "\n", + "* Determine which employees are prone to leave next.\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ajdEVA7LiBUp" + }, + "source": [ + "Business Understanding\n", + "\n", + "---\n", + "\n", + "This step mostly focuses on understanding the Business in all the different aspects. It follows the below different steps.\n", + "\n", + "\n", + "\n", + "\n", + "* Identify the goal and frame the business problem.\n", + "* Prepare Analytical Goal i.e. 
what type of performance metric and loss function to use\n", + "* Gather information on resource, constraints, assumptions, risks etc\n", + "* Gather information on resource, constraints, assumptions, risks etc\n", + "* Prepare Work Flow Chart" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J4MwiCYzj2_u" + }, + "source": [ + "### Write the main objectives of this project in your words?\n", + "minimum of 100 characters" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "STyLda45j1Mf" + }, + "source": [ + "main_objectives ='''This project aims to allow me to understand about Data mining methodologies such as CRISP in general, \n", + "particularly business and data understanding. we have two classes, namely: \"Existing employees\" and \"Employees who have left\"\n", + "It could be identified based on the value of each given questions as the model will train from it. \n", + "'''" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "CuOlxLxKMOLI" + }, + "source": [ + "assert len(main_objectives) > 100 \n", + "### BEGIN HIDDEN TESTS\n", + "assert len(main_objectives) > 80 \n", + "### END HIDDEN TESTS" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NyXeNxlCkbaw" + }, + "source": [ + "### Outline the different data analysis steps you will follow to carry out the project" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "rC-tl8sUksQq" + }, + "source": [ + "dm_outline = '''According to Will Hillier, we have 7 data analysis steps [1] (https://careerfoundry.com/en/blog/data-analytics/the-data-analysis-process-step-by-step/#step-four-analyzing-the-data) \n", + "1. Defining the question: we have already given two questions 'What type of employees is leaving?' and 'which employees are prone to leave next?'\n", + "2. Collecting the data: we need a massive amount of data in order to train the model well. 
ML requires large amount of data.\n", + "3. Cleaning the data: uncleaned data leads to wrong prediction, hence cleaning data is mandatory.\n", + "4. Analyzing the data: This one is the main step. after we cleaned the data, analyzing or using it for training is the next level with such as predictive analysis\n", + "5. Sharing your results: We have found something (insights) in analysis level, the next is sharing it to the x organization\n", + "6. Embracing failure: failure is the sign of working something harder to work, hence accepting failure and hone your ability to spot and rectify errors is the main thing. \n", + "7. Summary: the final step, is to summarize what we have done.\n", + "'''" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-K1mWuDoksTk" + }, + "source": [ + "assert len(dm_outline) > 100 \n", + "### BEGIN HIDDEN TESTS\n", + "assert len(dm_outline) > 70 \n", + "### END HIDDEN TESTS" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pmUDFG1wkzUy" + }, + "source": [ + "I will use the Accuracy metric to measure the performance of this data analysis model\n", + "# accuracy = **$\\frac{correct-predictions}{all-predictions}$**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KCNulojKk_BP" + }, + "source": [ + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vLS2YHoRk_EK" + }, + "source": [ + "Why do you choose these metrics? minimum of 100 characters" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "LSynT14KlPSJ" + }, + "source": [ + "why_metrics = '''we are developing a model to predict whether the employees of the x organization will leave or not based on the data collected data from it. \n", + "Hence, we want to build a more accurate model that can be able to outcomes result in better decisions. There might be a cost of errors, but optimizing model accuracy mitigates that cost. 
\n", + "There are many optimization algorithms to handle such losses of a model. The benefits of improving model accuracy help avoid considerable time, money, and undue stress.\n", + "'''" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "yr-Mk0E8lPVJ" + }, + "source": [ + "assert len(why_metrics) > 100 \n", + "### BEGIN HIDDEN TESTS\n", + "assert len(why_metrics) > 80 \n", + "### END HIDDEN TESTS" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aAo19Ip6lUtm" + }, + "source": [ + "### How would you know if your data analysis work is a success or not?\n", + "minimum of 100 characters" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "HESsiXW5llX-" + }, + "source": [ + "how_success = '''After we have analyzed the data (or experiment the model) we'll demonstrate the result to the organization. \n", + "What is next is taking their response and feedback by applying a usability testing and quality testing measurements. \n", + "'''" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "FdUoiMIOlmXq", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "cf546265-bd20-46dd-e5fe-273e53eef495" + }, + "source": [ + "assert len(how_success) > 100 \n", + "### BEGIN HIDDEN TESTS\n", + "print(len(how_success))\n", + "assert len(how_success) > 80 \n", + "### END HIDDEN TESTS" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "227\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DQE6dqo6l1TZ" + }, + "source": [ + "## What kind of challenges do you expect in your analysis?\n", + "List at least 3 challenges" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WrAhBQhQl8Lh" + }, + "source": [ + "challenge_text = '''The most challenge will be related to the data collection process. 
however, we could also face other challenges. \n", + "Here are some challenges what I expect during data analysis.\n", + "1. Collecting meaningful data: Identifying and collecting which data is vital for the organization/business is one \n", + "2. Selecting the right tool: Since the nature of data may vary as per the area we are going to work, selecting the right tool for the collected data may also a challenge.\n", + "3. Consolidate data from multiple sources: data can be collected form different sources; hence structure of these data will be different, putting these data together and using it is another challenge \n", + "'''\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "EedHa-Pll8X7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1b8df4ac-173a-42aa-d0f7-df15a5844714" + }, + "source": [ + "assert len(challenge_text) > 100 \n", + "### BEGIN HIDDEN TESTS\n", + "print(len(challenge_text))\n", + "assert len(how_success) > 80 \n", + "### END HIDDEN TESTS" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "663\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZcJ8M6uWDeSE" + }, + "source": [ + "

Using the processed Twitter data from yesterday's challenge

.\n", + "\n", + "\n", + "- Form a new data frame (named `cleanTweet`), containing columns $\\textbf{clean-text}$ and $\\textbf{polarity}$.\n", + "\n", + "- Write a function `text_category` that takes a value `p` and returns, depending on the value of p, a string `'positive'`, `'negative'` or `'neutral'`.\n", + "\n", + "- Apply this function (`text_category`) on the $\\textbf{polarity}$ column of `cleanTweet` in 1 above to form a new column called $\\textbf{score}$ in `cleanTweet`.\n", + "\n", + "- Visualize The $\\textbf{score}$ column using piechart and barchart\n", + "\n", + "
Now I want to build a classification model on the cleaned tweets, following the steps below:
\n", + "\n", + "* Remove rows from `cleanTweet` where $\\textbf{polarity}$ $= 0$ (i.e where $\\textbf{score}$ = Neutral) and reset the frame index.\n", + "* Construct a column $\\textbf{scoremap}$ Use the mapping {'positive':1, 'negative':0} on the $\\textbf{score}$ column\n", + "* Create feature and target variables `(X,y)` from $\\textbf{clean-text}$ and $\\textbf{scoremap}$ columns respectively.\n", + "* Use `train_test_split` function to construct `(X_train, y_train)` and `(X_test, y_test)` from `(X,y)`\n", + "\n", + "* Build an `SGDClassifier` model from the vectorize train text data. Use `CountVectorizer()` with a $\\textit{trigram}$ parameter.\n", + "\n", + "* Evaluate your model on the test data.\n" + ] + }, + { + "cell_type": "code", + "source": [ + "#install tweepy if not installed\n", + "#!pip uninstall tweepy\n", + "#!pip install git+https://github.com/tweepy/tweepy.git" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "H9QudDFcB1S2", + "outputId": "39983ce2-28e1-4d39-c569-1b197c976c5e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Found existing installation: tweepy 4.10.0\n", + "Uninstalling tweepy-4.10.0:\n", + " Would remove:\n", + " /usr/local/lib/python3.7/dist-packages/tweepy-4.10.0.dist-info/*\n", + " /usr/local/lib/python3.7/dist-packages/tweepy/*\n", + "Proceed (y/n)? 
y\n", + " Successfully uninstalled tweepy-4.10.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting git+https://github.com/tweepy/tweepy.git\n", + " Cloning https://github.com/tweepy/tweepy.git to /tmp/pip-req-build-ilv676b3\n", + " Running command git clone -q https://github.com/tweepy/tweepy.git /tmp/pip-req-build-ilv676b3\n", + "Requirement already satisfied: oauthlib<4,>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from tweepy==4.10.0) (3.2.0)\n", + "Requirement already satisfied: requests<3,>=2.27.0 in /usr/local/lib/python3.7/dist-packages (from tweepy==4.10.0) (2.28.1)\n", + "Requirement already satisfied: requests-oauthlib<2,>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from tweepy==4.10.0) (1.3.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (2022.6.15)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (1.24.3)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (2.10)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (2.1.0)\n", + "Building wheels for collected packages: tweepy\n", + " Building wheel for tweepy (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for tweepy: filename=tweepy-4.10.0-py3-none-any.whl size=94559 sha256=960c825fe40e9aa906c08568d99e2a97c0c100c3f794f8b777a495e3cbe3c83f\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-8f1vqvt6/wheels/b4/a5/5a/5074abdb9f4bd5bd0e22631a63fc41ae2fa71ad83780ea18d1\n", + "Successfully built tweepy\n", + "Installing collected packages: tweepy\n", + "Successfully installed tweepy-4.10.0\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "85WxmGNGDcBY" + }, + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%reload_ext autoreload" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Imports\n", + "import tweepy\n", + "import json\n", + "import time" + ], + "metadata": { + "id": "d-5-tkS4_oLE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Config file variables\n", + "consumer_key = ''\n", + "consumer_secret = ''\n", + "access_token = ''\n", + "access_token_secret = ''\n" + ], + "metadata": { + "id": "iSz4D89j_UE3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Authenticate into Tweepy\n", + "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", + "auth.set_access_token(access_token, access_token_secret)\n", + "api = tweepy.API(auth, wait_on_rate_limit=True)\n" + ], + "metadata": { + "id": "yuKFY_G__UHz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "keywords = \"#blockchain OR #cryptocurreny OR #financialmarket OR #bitcoin OR #ethereum\"\n", + "# keywords_with_geocode = \"#blockchain OR #cryptocurreny OR #financialmarket OR #bitcoin OR #ethereum geocode:6.611,20.934,240km\"\n", + "limit = 50" + ], + "metadata": { + "id": "40YuqSGS_UKb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# save tweets to json file\n", + "c = 
tweepy.Cursor(\n", + " api.search_tweets,\n", + " q=keywords,\n", + " tweet_mode=\"extended\",\n", + " include_entities=True,\n", + ").items(limit)" + ], + "metadata": { + "id": "9qFjY8hN_UMz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "while True:\n", + " try:\n", + " tweet = c.next()\n", + " with open(\"../data/\" + \"web3.json\", \"a\", encoding=\"utf-8\") as f:\n", + " # for tweet in tweets:\n", + " data = tweet._json\n", + " f.write(json.dumps(data))\n", + " f.write(\"\\n\")\n", + " except tweepy.TooManyRequests:\n", + " print(\"Limit Reached. Sleeping for 15 minutes\")\n", + " time.sleep(60 * 15)\n", + " continue\n", + " except StopIteration:\n", + " break" + ], + "metadata": { + "id": "Iul5TTYw_UPn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "working on" + ], + "metadata": { + "id": "upr7mlvDFG58" + } + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "mJ64ezpVlxAT" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/clean_tweets_dataframe.py b/clean_tweets_dataframe.py index a6f6114..6c25099 100644 --- a/clean_tweets_dataframe.py +++ b/clean_tweets_dataframe.py @@ -1,13 +1,10 @@ - import pandas as pd import re - class Clean_Tweets: """ The PEP8 Standard AMAZING!!! 
"""
-
    def __init__(self):
        print('Automation in Action...!!!')
@@ -92,4 +89,4 @@ def clean_text(original_text: str) -> str:
        cleaned_text = " ".join(cleaned_text)
        cleaned_text = re.sub(r'http.*', "", cleaned_text)
-    return cleaned_text
+    return cleaned_text
\ No newline at end of file
diff --git a/extract_dataframe.py b/extract_dataframe.py
index dd08b9b..519abb3 100644
--- a/extract_dataframe.py
+++ b/extract_dataframe.py
@@ -229,4 +229,6 @@ def get_tweet_df(self, save=False) -> pd.DataFrame:
        'original_author', 'screen_count', 'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']
    _, tweet_list = read_json("./data/covid19.json")
    tweet = TweetDfExtractor(tweet_list)
-    tweet_df = tweet.get_tweet_df(save=True)
\ No newline at end of file
+    tweet_df = tweet.get_tweet_df(save=True)
+
+    # use all defined functions to generate a dataframe with the specified columns above