diff --git a/notebooks/README.md b/notebooks/README.md
deleted file mode 100644
index 87e239b..0000000
--- a/notebooks/README.md
+++ /dev/null
@@ -1,38 +0,0 @@
-
-### Some of the functions available in the notebooks and codes in this repository
-
-#### Slack Data Parsing Functions
-`slack_parser`: Parses Slack data to extract relevant information such as message type, content, sender details, thread information, etc. Combines data from multiple JSON files and returns a DataFrame.
-
-`parse_slack_reaction`: Retrieves reaction-related information from Slack data, including reaction name, count, users, associated message, and user ID. Returns a DataFrame.
-
-`convert_2_timestamp`: Converts Unix time to a readable timestamp for specified columns in the DataFrame.
-
-#### User Interaction and Community Analysis Functions
-`get_tagged_users`: Extracts all user mentions (@) from messages.
-
-`get_community_participation`: Analyzes community participation by counting the number of replies for each user.
-
-`map_userid_2_realname`: Maps Slack IDs to real names using user profiles. Optionally, plots a bar graph of message counts for each user.
-
-`get_top_20_user`: Plots the top 20 message senders in a specified channel.
-
-`draw_avg_reply_count`: Plots the average number of reply counts per sender in a channel.
-
-`draw_avg_reply_users_count`: Plots the average number of reply user counts per sender in a channel.
-
-`draw_wordcloud`: Generates and displays a word cloud visualization for message content.
-
-`draw_user_reaction`: Plots users with the most reactions in a channel.
-
-#### Data Analysis and Visualization
-`get_top_20_user(dfall_week, channel='All learning')`: Visualizes the top 20 message senders.
-
-`draw_avg_reply_count(dfall_week, channel='All Learning')`: Visualizes the average reply count per sender.
-
-`draw_avg_reply_users_count(dfall_week, channel='All learning')`: Visualizes the average reply user count per sender.
-
-`draw_wordcloud(dfall_week['msg_content'], week='All Learning Week')`: Displays a word cloud for message content.
-
-`draw_user_reaction`: Plots users with the most reactions.
-
diff --git a/notebooks/mongod.ipynb b/notebooks/mongod.ipynb
new file mode 100644
index 0000000..3d550e3
--- /dev/null
+++ b/notebooks/mongod.ipynb
@@ -0,0 +1,587 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "51f00fd715d6d40c",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "[Documentation](https://pymongo.readthedocs.io/en/stable/tutorial.html)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e643732fe0e2efd8",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-27T09:22:10.063608059Z",
+ "start_time": "2023-11-27T09:22:09.736035918Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "from pymongo import MongoClient\n",
+ "\n",
+ "\n",
+ "class DB:\n",
+ " def __init__(self):\n",
+ " self.client = MongoClient(\"mongodb://localhost:27017/\")\n",
+ " self.db = self.client[\"10Academy\"]\n",
+ "\n",
+ " def list_collections(self):\n",
+ " # Returns collections inside the database\n",
+ " return self.db.list_collection_names()\n",
+ "\n",
+ " def check_if_collection_exist(self, collection_name: str):\n",
+ " if not self.list_collections().__contains__(collection_name):\n",
+ " raise Exception(f\"Collection, {collection_name} not found.\")\n",
+ "\n",
+ " def insert_to_collection(self, collection_name, data):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " collection = self.db[collection_name]\n",
+ " return collection.insert_one(data)\n",
+ "\n",
+ " def insert_many_to_collection(self, collection_name, data):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " result = self.db[collection_name].insert_many(data)\n",
+ " return result.inserted_ids\n",
+ "\n",
+ " def find_all(self, collection_name):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " return self.db[collection_name].find()\n",
+ "\n",
+ " def find(self, collection_name, key, value):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " return self.db[collection_name].find({key: value})\n",
+ " \n",
+ " def find_by_id(self, collection_name, _id):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+    "        return self.db[collection_name].find_one({\"_id\": _id})\n",
+ "\n",
+ " def find_one(self, collection_name, key, value):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " return self.db[collection_name].find_one({key: value})\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c767a1c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class DBWithSchema:\n",
+ " def __init__(self) -> None:\n",
+ " self.client = MongoClient(\"mongodb://localhost:27017/\")\n",
+ " self.db = self.client[\"10Academy1\"]\n",
+ "\n",
+ " self.employee_validator = {\n",
+ " \"$jsonSchema\": {\n",
+ " \"bsonType\": \"object\",\n",
+ " \"required\": [\"name\", \"age\", \"company\"],\n",
+ " \"properties\": {\n",
+ " \"name\": {\n",
+ " \"bsonType\": \"string\",\n",
+ " \"description\": \"must be a string and is required\"\n",
+ " },\n",
+ " \"age\": {\n",
+ " \"bsonType\": \"number\",\n",
+    "                        \"description\": \"must be a number and is required\"\n",
+ " },\n",
+ " \"company\": {\n",
+ " \"bsonType\": \"objectId\",\n",
+ " \"description\": \"must be an objectId and is required\"\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ " self.company_validator = {\n",
+ " \"$jsonSchema\": {\n",
+ " \"bsonType\": \"object\",\n",
+ " \"required\": [\"name\", \"country\", \"city\"],\n",
+ " \"properties\": {\n",
+ " \"name\": {\n",
+ " \"bsonType\": \"string\",\n",
+ " \"description\": \"must be a string and is required\"\n",
+ " },\n",
+ " \"city\": {\n",
+ " \"bsonType\": \"string\",\n",
+    "                        \"description\": \"must be a string and is required\"\n",
+ " },\n",
+ " \"country\": {\n",
+ " \"bsonType\": \"string\",\n",
+    "                        \"description\": \"must be a string and is required\"\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ " try:\n",
+ " self.db.create_collection(\"employee\")\n",
+ " self.db.create_collection(\"company\")\n",
+ " except Exception as e:\n",
+ " print(e)\n",
+ "\n",
+ " self.db.command(\"collMod\", \"employee\", validator=self.employee_validator)\n",
+ " self.db.command(\"collMod\", \"company\", validator=self.company_validator)\n",
+ "\n",
+ " def list_collections(self):\n",
+ " return self.db.list_collection_names()\n",
+ " \n",
+ " def get_validation(self, collection_name: str) -> dict:\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " return self.db.get_collection(collection_name).options()\n",
+ " \n",
+ " def check_if_collection_exist(self, collection_name: str):\n",
+ " if not self.list_collections().__contains__(collection_name):\n",
+ " raise Exception(f\"Collection, {collection_name} not found.\")\n",
+ "\n",
+ " def insert_to_collection(self, collection_name, data):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " collection = self.db[collection_name]\n",
+ " return collection.insert_one(data)\n",
+ "\n",
+ " def insert_many_to_collection(self, collection_name, data):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " result = self.db[collection_name].insert_many(data)\n",
+ " return result.inserted_ids\n",
+ "\n",
+ " def find_all(self, collection_name):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " return self.db[collection_name].find()\n",
+ "\n",
+ " def find(self, collection_name, key, value):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " return self.db[collection_name].find({key: value})\n",
+ " \n",
+ " def find_by_id(self, collection_name, _id):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+    "        return self.db[collection_name].find_one({\"_id\": _id})\n",
+ "\n",
+ " def find_one(self, collection_name, key, value):\n",
+ " self.check_if_collection_exist(collection_name)\n",
+ " return self.db[collection_name].find_one({key: value})\n",
+ " \n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "42edf846f8ff36b6",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "#### Initialize DB Class"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "initial_id",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-27T09:22:10.102209282Z",
+ "start_time": "2023-11-27T09:22:09.762921575Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "db = DB()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "67a77c0aa2951381",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "#### Get list of collections"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1943d67b90ef5345",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-27T09:22:10.139918225Z",
+ "start_time": "2023-11-27T09:22:09.771118477Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "db.list_collections()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eff826746c1013d4",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "#### Insert record to collection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "feda1ca67e1f08bf",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-27T09:22:10.174968485Z",
+ "start_time": "2023-11-27T09:22:09.840811255Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "data = {\n",
+ " \"name\": \"John\"\n",
+ "}\n",
+ "inserted_record = db.insert_to_collection('users', data)\n",
+ "print(inserted_record.inserted_id)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a758e8de3c6b4947",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+    "#### Insert more than one record to collection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e1613dd98803f815",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-27T09:22:10.210845641Z",
+ "start_time": "2023-11-27T09:22:09.841239357Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "data = [{\n",
+ " \"name\": \"Tom\"\n",
+ "},\n",
+ " {\n",
+ " \"name\": \"Jane\",\n",
+ " \"age\": 33\n",
+ "}\n",
+ "]\n",
+ "inserted_records = db.insert_many_to_collection('users', data)\n",
+ "print(inserted_records)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e902fb63fe9eca24",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "#### Find All records from collection "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cddcaee42da38732",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-27T09:22:10.211491539Z",
+ "start_time": "2023-11-27T09:22:09.863917702Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "import pprint\n",
+ "\n",
+ "records = db.find_all('users')\n",
+ "for record in records:\n",
+ " pprint.pprint(record)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f7f5cb0be2c22957",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "#### Filter record from collection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "af345ea0ccb0bbfd",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-27T09:22:10.215331714Z",
+ "start_time": "2023-11-27T09:22:09.908992355Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "results = db.find('users', 'name', 'Jane')\n",
+ "for result in results:\n",
+ " pprint.pprint(result)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "974526cb5e1ea0c4",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-27T09:22:10.235289519Z",
+ "start_time": "2023-11-27T09:22:09.909276711Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "results = db.find('users', 'name', 'Rix')\n",
+ "for result in results:\n",
+ " pprint.pprint(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "43a0d42c731246ea",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "#### Find one from collection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6e65a86719e4dd5d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-27T09:23:09.020008428Z",
+ "start_time": "2023-11-27T09:23:08.994515406Z"
+ },
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "result = db.find_one('users', 'name', 'Jane')\n",
+ "pprint.pprint(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "37a9ad04",
+ "metadata": {},
+ "source": [
+    "#### Initialize DBWithSchema Class"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a44faa0420149106",
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "db_with_schema = DBWithSchema()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "076a3b3c",
+ "metadata": {},
+ "source": [
+ "#### View validators"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4f5c537",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pprint\n",
+ "company_validator = db_with_schema.get_validation('company')\n",
+ "pprint.pprint(company_validator)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc3e1f07",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "employee_validator = db_with_schema.get_validation('employee')\n",
+ "pprint.pprint(employee_validator)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "59dc048c",
+ "metadata": {},
+ "source": [
+ "#### Create Company"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3240bb1a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "company_data = [\n",
+ " {\n",
+ " \"name\": \"Mercedes-Benz\",\n",
+ " \"city\": \"Stuttgart\",\n",
+ " \"country\": \"Germany\"\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"Chevrolet\",\n",
+ " \"city\": \"Detroit\",\n",
+ " \"country\": \"United States\"\n",
+ " }\n",
+ "]\n",
+ "inserted_records = db_with_schema.insert_many_to_collection('company', company_data)\n",
+ "print(inserted_records)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22bedf96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "employee_data = [\n",
+ " {\n",
+ " \"name\": \"Jane\",\n",
+ " \"age\": 24,\n",
+ " \"company\": inserted_records[0]\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"John\",\n",
+ " \"age\": 26,\n",
+ " \"company\": inserted_records[1]\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"Amy\",\n",
+ " \"age\": 23,\n",
+ " \"company\": inserted_records[1]\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"Jack\",\n",
+ " \"age\": 22,\n",
+ " \"company\": inserted_records[0]\n",
+ " },\n",
+ "]\n",
+ "inserted_employees = db_with_schema.insert_many_to_collection('employee', employee_data)\n",
+ "print(inserted_employees)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4d71c56b",
+ "metadata": {},
+ "source": [
+ "#### Select all Employees"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "76099a40",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pprint\n",
+ "\n",
+ "employees = db_with_schema.find_all('employee')\n",
+ "for employee in employees:\n",
+ " pprint.pprint(employee)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "85c56722",
+ "metadata": {},
+ "source": [
+ "#### Look UP"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cf62050e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lookup = [\n",
+ " {\n",
+ " \"$lookup\": {\n",
+ " \"from\": \"employee\",\n",
+ " \"localField\": \"_id\",\n",
+ " \"foreignField\": \"company\",\n",
+ " \"as\": \"employees\"\n",
+ " }\n",
+ " }\n",
+ "]\n",
+ "employees_with_company = db_with_schema.db.company.aggregate(lookup)\n",
+ "employees_list = list(employees_with_company)\n",
+ "for employee in employees_list:\n",
+ " pprint.pprint(employee)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5ee3446",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/parse_slack_data.ipynb b/notebooks/parse_slack_data.ipynb
deleted file mode 100644
index e3774f8..0000000
--- a/notebooks/parse_slack_data.ipynb
+++ /dev/null
@@ -1,452 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%reload_ext autoreload\n",
- "%autoreload 2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os, sys\n",
- "import re\n",
- "import json\n",
- "import glob\n",
- "import datetime\n",
- "from collections import Counter\n",
- "\n",
- "import pandas as pd\n",
- "from matplotlib import pyplot as plt\n",
- "import seaborn as sns\n",
- "\n",
- "from nltk.corpus import stopwords\n",
- "from wordcloud import WordCloud"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Add parent directory to path to import modules from src\n",
- "rpath = os.path.abspath('..')\n",
- "if rpath not in sys.path:\n",
- " sys.path.insert(0, rpath)\n",
- "\n",
- "from src.loader import SlackDataLoader\n",
- "import src.utils as utils"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Columns we can get from a slack message
\n",
- "\n",
- "message_type, message_content, sender_id, time_sent, message_distribution, time_thread_start, reply_count, reply_user_count, time_thread_end, reply_users"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "From a single slack message, we can get
\n",
- "\n",
- "1. The message
\n",
- "2. Type (message, file, link, etc)
\n",
- "3. The sender_id (assigned by slack)
\n",
- "4. The time the message was sent
\n",
- "5. The team (i don't know what that is now)
\n",
- "6. The type of the message (broadcast message, inhouse, just messgae)
\n",
- "7. The thread the message generated (from here we can go):
\n",
- " 7.1 Text/content of the message
\n",
- " 7.2 The thread time of the message
\n",
- " 7.3 The thread count (reply count)
\n",
- " 7.4 The number of user that reply the message (count of users that participated in the thread)
\n",
- " 7.5 The time the last thread message was sent
\n",
- " 7.6 The users that participated in the thread (their ids are stored as well)
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# combine all json file in all-weeks8-9\n",
- "def slack_parser(path_channel):\n",
- " \"\"\" parse slack data to extract useful informations from the json file\n",
- " step of execution\n",
- " 1. Import the required modules\n",
- " 2. read all json file from the provided path\n",
- " 3. combine all json files in the provided path\n",
- " 4. extract all required informations from the slack data\n",
- " 5. convert to dataframe and merge all\n",
- " 6. reset the index and return dataframe\n",
- " \"\"\"\n",
- "\n",
- " # specify path to get json files\n",
- " combined = []\n",
- " for json_file in glob.glob(f\"{path_channel}*.json\"):\n",
- " with open(json_file, 'r', encoding=\"utf8\") as slack_data:\n",
- " combined.append(slack_data)\n",
- "\n",
- " # loop through all json files and extract required informations\n",
- " dflist = []\n",
- " for slack_data in combined:\n",
- "\n",
- " msg_type, msg_content, sender_id, time_msg, msg_dist, time_thread_st, reply_users, \\\n",
- " reply_count, reply_users_count, tm_thread_end = [],[],[],[],[],[],[],[],[],[]\n",
- "\n",
- " for row in slack_data:\n",
- " if 'bot_id' in row.keys():\n",
- " continue\n",
- " else:\n",
- " msg_type.append(row['type'])\n",
- " msg_content.append(row['text'])\n",
- " if 'user_profile' in row.keys(): sender_id.append(row['user_profile']['real_name'])\n",
- " else: sender_id.append('Not provided')\n",
- " time_msg.append(row['ts'])\n",
- " if 'blocks' in row.keys() and len(row['blocks'][0]['elements'][0]['elements']) != 0 :\n",
- " msg_dist.append(row['blocks'][0]['elements'][0]['elements'][0]['type'])\n",
- " else: msg_dist.append('reshared')\n",
- " if 'thread_ts' in row.keys():\n",
- " time_thread_st.append(row['thread_ts'])\n",
- " else:\n",
- " time_thread_st.append(0)\n",
- " if 'reply_users' in row.keys(): reply_users.append(\",\".join(row['reply_users'])) \n",
- " else: reply_users.append(0)\n",
- " if 'reply_count' in row.keys():\n",
- " reply_count.append(row['reply_count'])\n",
- " reply_users_count.append(row['reply_users_count'])\n",
- " tm_thread_end.append(row['latest_reply'])\n",
- " else:\n",
- " reply_count.append(0)\n",
- " reply_users_count.append(0)\n",
- " tm_thread_end.append(0)\n",
- " data = zip(msg_type, msg_content, sender_id, time_msg, msg_dist, time_thread_st,\n",
- " reply_count, reply_users_count, reply_users, tm_thread_end)\n",
- " columns = ['msg_type', 'msg_content', 'sender_name', 'msg_sent_time', 'msg_dist_type',\n",
- " 'time_thread_start', 'reply_count', 'reply_users_count', 'reply_users', 'tm_thread_end']\n",
- "\n",
- " df = pd.DataFrame(data=data, columns=columns)\n",
- " df = df[df['sender_name'] != 'Not provided']\n",
- " dflist.append(df)\n",
- "\n",
- " dfall = pd.concat(dflist, ignore_index=True)\n",
- " dfall['channel'] = path_channel.split('/')[-1].split('.')[0] \n",
- " dfall = dfall.reset_index(drop=True)\n",
- " \n",
- " return dfall\n",
- "\n",
- "\n",
- "def parse_slack_reaction(path, channel):\n",
- " \"\"\"get reactions\"\"\"\n",
- " dfall_reaction = pd.DataFrame()\n",
- " combined = []\n",
- " for json_file in glob.glob(f\"{path}*.json\"):\n",
- " with open(json_file, 'r') as slack_data:\n",
- " combined.append(slack_data)\n",
- "\n",
- " reaction_name, reaction_count, reaction_users, msg, user_id = [], [], [], [], []\n",
- "\n",
- " for k in combined:\n",
- " slack_data = json.load(open(k.name, 'r', encoding=\"utf-8\"))\n",
- " \n",
- " for i_count, i in enumerate(slack_data):\n",
- " if 'reactions' in i.keys():\n",
- " for j in range(len(i['reactions'])):\n",
- " msg.append(i['text'])\n",
- " user_id.append(i['user'])\n",
- " reaction_name.append(i['reactions'][j]['name'])\n",
- " reaction_count.append(i['reactions'][j]['count'])\n",
- " reaction_users.append(\",\".join(i['reactions'][j]['users']))\n",
- " \n",
- " data_reaction = zip(reaction_name, reaction_count, reaction_users, msg, user_id)\n",
- " columns_reaction = ['reaction_name', 'reaction_count', 'reaction_users_count', 'message', 'user_id']\n",
- " df_reaction = pd.DataFrame(data=data_reaction, columns=columns_reaction)\n",
- " df_reaction['channel'] = channel\n",
- " return df_reaction\n",
- "\n",
- "def get_community_participation(path):\n",
- " \"\"\" specify path to get json files\"\"\"\n",
- " combined = []\n",
- " comm_dict = {}\n",
- " for json_file in glob.glob(f\"{path}*.json\"):\n",
- " with open(json_file, 'r') as slack_data:\n",
- " combined.append(slack_data)\n",
- " # print(f\"Total json files is {len(combined)}\")\n",
- " for i in combined:\n",
- " a = json.load(open(i.name, 'r', encoding='utf-8'))\n",
- "\n",
- " for msg in a:\n",
- " if 'replies' in msg.keys():\n",
- " for i in msg['replies']:\n",
- " comm_dict[i['user']] = comm_dict.get(i['user'], 0)+1\n",
- " return comm_dict"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def convert_2_timestamp(column, data):\n",
- " \"\"\"convert from unix time to readable timestamp\n",
- " args: column: columns that needs to be converted to timestamp\n",
- " data: data that has the specified column\n",
- " \"\"\"\n",
- " if column in data.columns.values:\n",
- " timestamp_ = []\n",
- " for time_unix in data[column]:\n",
- " if time_unix == 0:\n",
- " timestamp_.append(0)\n",
- " else:\n",
- " a = datetime.datetime.fromtimestamp(float(time_unix))\n",
- " timestamp_.append(a.strftime('%Y-%m-%d %H:%M:%S'))\n",
- " return timestamp_\n",
- " else: \n",
- " print(f\"{column} not in data\")\n",
- "\n",
- "def get_tagged_users(df):\n",
- " \"\"\"get all @ in the messages\"\"\"\n",
- "\n",
- " return df['msg_content'].map(lambda x: re.findall(r'@U\\w+', x))\n",
- "\n",
- "\n",
- " \n",
- "def map_userid_2_realname(user_profile: dict, comm_dict: dict, plot=False):\n",
- " \"\"\"\n",
- " map slack_id to realnames\n",
- " user_profile: a dictionary that contains users info such as real_names\n",
- " comm_dict: a dictionary that contains slack_id and total_message sent by that slack_id\n",
- " \"\"\"\n",
- " user_dict = {} # to store the id\n",
- " real_name = [] # to store the real name\n",
- " ac_comm_dict = {} # to store the mapping\n",
- " count = 0\n",
- " # collect all the real names\n",
- " for i in range(len(user_profile['profile'])):\n",
- " real_name.append(dict(user_profile['profile'])[i]['real_name'])\n",
- "\n",
- " # loop the slack ids\n",
- " for i in user_profile['id']:\n",
- " user_dict[i] = real_name[count]\n",
- " count += 1\n",
- "\n",
- " # to store mapping\n",
- " for i in comm_dict:\n",
- " if i in user_dict:\n",
- " ac_comm_dict[user_dict[i]] = comm_dict[i]\n",
- "\n",
- " ac_comm_dict = pd.DataFrame(data= zip(ac_comm_dict.keys(), ac_comm_dict.values()),\n",
- " columns=['LearnerName', '# of Msg sent in Threads']).sort_values(by='# of Msg sent in Threads', ascending=False)\n",
- " \n",
- " if plot:\n",
- " ac_comm_dict.plot.bar(figsize=(15, 7.5), x='LearnerName', y='# of Msg sent in Threads')\n",
- " plt.title('Student based on Message sent in thread', size=20)\n",
- " \n",
- " return ac_comm_dict"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_top_20_user(data, channel='Random'):\n",
- " \"\"\"get user with the highest number of message sent to any channel\"\"\"\n",
- "\n",
- " data['sender_name'].value_counts()[:20].plot.bar(figsize=(15, 7.5))\n",
- " plt.title(f'Top 20 Message Senders in #{channel} channels', size=15, fontweight='bold')\n",
- " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=14);\n",
- " plt.xticks(size=12); plt.yticks(size=12);\n",
- " plt.show()\n",
- "\n",
- " data['sender_name'].value_counts()[-10:].plot.bar(figsize=(15, 7.5))\n",
- " plt.title(f'Bottom 10 Message Senders in #{channel} channels', size=15, fontweight='bold')\n",
- " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=14);\n",
- " plt.xticks(size=12); plt.yticks(size=12);\n",
- " plt.show()\n",
- "\n",
- "def draw_avg_reply_count(data, channel='Random'):\n",
- " \"\"\"who commands many reply?\"\"\"\n",
- "\n",
- " data.groupby('sender_name')['reply_count'].mean().sort_values(ascending=False)[:20]\\\n",
- " .plot(kind='bar', figsize=(15,7.5));\n",
- " plt.title(f'Average Number of reply count per Sender in #{channel}', size=20, fontweight='bold')\n",
- " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=18);\n",
- " plt.xticks(size=14); plt.yticks(size=14);\n",
- " plt.show()\n",
- "\n",
- "def draw_avg_reply_users_count(data, channel='Random'):\n",
- " \"\"\"who commands many user reply?\"\"\"\n",
- "\n",
- " data.groupby('sender_name')['reply_users_count'].mean().sort_values(ascending=False)[:20].plot(kind='bar',\n",
- " figsize=(15,7.5));\n",
- " plt.title(f'Average Number of reply user count per Sender in #{channel}', size=20, fontweight='bold')\n",
- " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=18);\n",
- " plt.xticks(size=14); plt.yticks(size=14);\n",
- " plt.show()\n",
- "\n",
- "def draw_wordcloud(msg_content, week): \n",
- " # word cloud visualization\n",
- " allWords = ' '.join([twts for twts in msg_content])\n",
- " wordCloud = WordCloud(background_color='#975429', width=500, height=300, random_state=21, max_words=500, mode='RGBA',\n",
- " max_font_size=140, stopwords=stopwords.words('english')).generate(allWords)\n",
- " plt.figure(figsize=(15, 7.5))\n",
- " plt.imshow(wordCloud, interpolation=\"bilinear\")\n",
- " plt.axis('off')\n",
- " plt.tight_layout()\n",
- " plt.title(f'WordCloud for {week}', size=30)\n",
- " plt.show()\n",
- "\n",
- "def draw_user_reaction(data, channel='General'):\n",
- " data.groupby('sender_name')[['reply_count', 'reply_users_count']].sum()\\\n",
- " .sort_values(by='reply_count',ascending=False)[:10].plot(kind='bar', figsize=(15, 7.5))\n",
- " plt.title(f'User with the most reaction in #{channel}', size=25);\n",
- " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=18);\n",
- " plt.xticks(size=14); plt.yticks(size=14);\n",
- " plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Insight Extraction\n",
- "\n",
- "Below are some useful questions to answer. Feel free to explore to answer other interesting questions that may be of help to get insight about student's behaviour, need, and future performance "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# which user has the highest number of reply counts?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Visualize reply counts per user per channel"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# what is the time range of the day that most messages are sent?\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "# what kind of messages are replied faster than others?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Relationship between # of messages and # of reactions"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Classify messages into different categories such as questions, answers, comments, etc."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Which users got the most reactions?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Model topics mentioned in the channel"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# What are the topics that got the most reactions?"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Harder questions to look into"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Based on messages, reactions, references shared, and other relevant data such as classification of questions into techical question, comment, answer, aorder stu the python, statistics, and sql skill level of a user?"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/schema.sql b/notebooks/schema.sql
new file mode 100644
index 0000000..927d4bc
--- /dev/null
+++ b/notebooks/schema.sql
@@ -0,0 +1,16 @@
+-- Create a table called 'users'
+CREATE TABLE users (
+ user_id SERIAL PRIMARY KEY,
+ username VARCHAR(50) NOT NULL,
+ email VARCHAR(100) NOT NULL,
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Create a table called 'posts'
+CREATE TABLE posts (
+ post_id SERIAL PRIMARY KEY,
+ user_id INT REFERENCES users(user_id),
+ title VARCHAR(255) NOT NULL,
+ content TEXT,
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
diff --git a/tests/test_slack_data_loader.py b/tests/test_slack_data_loader.py
new file mode 100644
index 0000000..f601f12
--- /dev/null
+++ b/tests/test_slack_data_loader.py
@@ -0,0 +1,33 @@
+# File: tests/test_slack_data_loader.py
+
+import pytest
+from src.loader import SlackDataLoader
+import pandas as pd
+
+@pytest.fixture
+def slack_data_loader():
+ # Set up SlackDataLoader instance if needed
+ return SlackDataLoader()
+
+def test_load_data_columns(slack_data_loader):
+ # Arrange
+ expected_columns = ['column1', 'column2', 'column3'] # Replace with the expected column names
+
+ # Act
+ data_frame = slack_data_loader.load_data() # Replace with the actual method you want to test
+
+ # Assert
+ assert isinstance(data_frame, pd.DataFrame)
+ assert all(column in data_frame.columns for column in expected_columns), "Columns do not match the expected columns"
+
+def test_process_data_columns(slack_data_loader):
+ # Arrange
+ expected_columns = ['processed_column1', 'processed_column2'] # Replace with the expected processed column names
+
+ # Act
+ data_frame = slack_data_loader.load_data() # Assuming process_data depends on load_data
+ processed_data_frame = slack_data_loader.process_data(data_frame) # Replace with the actual method you want to test
+
+ # Assert
+ assert isinstance(processed_data_frame, pd.DataFrame)
+ assert all(column in processed_data_frame.columns for column in expected_columns), "Processed columns do not match the expected columns"