diff --git a/notebooks/README.md b/notebooks/README.md deleted file mode 100644 index 87e239b..0000000 --- a/notebooks/README.md +++ /dev/null @@ -1,38 +0,0 @@ - -### Some of the functions available in the notebooks and codes in this repository - -#### Slack Data Parsing Functions -`slack_parser`: Parses Slack data to extract relevant information such as message type, content, sender details, thread information, etc. Combines data from multiple JSON files and returns a DataFrame. - -`parse_slack_reaction`: Retrieves reaction-related information from Slack data, including reaction name, count, users, associated message, and user ID. Returns a DataFrame. - -`convert_2_timestamp`: Converts Unix time to a readable timestamp for specified columns in the DataFrame. - -#### User Interaction and Community Analysis Functions -`get_tagged_users`: Extracts all user mentions (@) from messages. - -`get_community_participation`: Analyzes community participation by counting the number of replies for each user. - -`map_userid_2_realname`: Maps Slack IDs to real names using user profiles. Optionally, plots a bar graph of message counts for each user. - -`get_top_20_user`: Plots the top 20 message senders in a specified channel. - -`draw_avg_reply_count`: Plots the average number of reply counts per sender in a channel. - -`draw_avg_reply_users_count`: Plots the average number of reply user counts per sender in a channel. - -`draw_wordcloud`: Generates and displays a word cloud visualization for message content. - -`draw_user_reaction`: Plots users with the most reactions in a channel. - -#### Data Analysis and Visualization -`get_top_20_user(dfall_week, channel='All learning')`: Visualizes the top 20 message senders. - -`draw_avg_reply_count(dfall_week, channel='All Learning')`: Visualizes the average reply count per sender. - -`draw_avg_reply_users_count(dfall_week, channel='All learning')`: Visualizes the average reply user count per sender. - -`draw_wordcloud(dfall_week['msg_content'], week='All Learning Week')`: Displays a word cloud for message content. - -`draw_user_reaction`: Plots users with the most reactions. - diff --git a/notebooks/Topic_Modelling.ipynb b/notebooks/Topic_Modelling.ipynb new file mode 100644 index 0000000..fd66371 --- /dev/null +++ b/notebooks/Topic_Modelling.ipynb @@ -0,0 +1,368 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import gensim\n", + "from gensim.models import CoherenceModel\n", + "from gensim import corpora\n", + "import pandas as pd\n", + "from pprint import pprint\n", + "import string\n", + "import os\n", + "import re\n", + "import sys\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import PorterStemmer, WordNetLemmatizer\n", + "import pyLDAvis.gensim_models as gensimvis\n", + "import pyLDAvis\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install pyLDAvis" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Add parent directory to path to import modules from src\n", + "rpath = os.path.abspath('C:/Users/moyka/OneDrive/Documents/GitHub/week0_starter_network_analysis')\n", + "if rpath not in sys.path:\n", + " sys.path.insert(0, rpath)\n", + "\n", + "from src.loader import SlackDataLoader\n", + "import src.utils as utils" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data_loader = SlackDataLoader(r\"C:/Users/moyka/Desktop/Dashboard/anonymized/\")\n", + "\n", + "def get_channel_messages(channel):\n", + " channel_messages = utils.get_messages_on_channel(f\"C:/Users/moyka/Desktop/Dashboard/anonymized/{channel}\") \n", + " # Create an empty DataFrame\n", + " df = pd.DataFrame(channel_messages)\n", + " return df\n", + "\n", + "def get_all_channels_message():\n", + " dfs = [] # List to store individual DataFrames\n", + "\n", + " for channel in data_loader.channels:\n", + " dfs.append(get_channel_messages(channel[\"name\"]))\n", + "\n", + " # Concatenate all DataFrames into a single DataFrame\n", + " result_df = pd.concat(dfs, ignore_index=True)\n", + "\n", + " return result_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def preprocess_text(text):\n", + " # Extract and remove URLs\n", + " urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\\\(\\\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)\n", + " for url in urls:\n", + " text = text.replace(url, '')\n", + "\n", + " text = re.sub(r'<@.*?>', '', text)\n", + "\n", + " # Convert to lowercase\n", + " text = text.lower()\n", + "\n", + " # Remove punctuation\n", + " text = ''.join([char for char in text if char not in string.punctuation])\n", + "\n", + " # Remove numbers\n", + " text = re.sub(r'\\d+', '', text)\n", + "\n", + " # Tokenize\n", + " tokens = word_tokenize(text)\n", + "\n", + " # Remove stop words\n", + " stop_words = set(stopwords.words('english'))\n", + " tokens = [word for word in tokens if word not in stop_words]\n", + "\n", + " # Perform stemming\n", + " stemmer = PorterStemmer()\n", + " tokens = [stemmer.stem(word) for word in tokens]\n", + "\n", + " # Perform lemmatization\n", + " lemmatizer = WordNetLemmatizer()\n", + " tokens = [lemmatizer.lemmatize(word) for word in tokens]\n", + "\n", + " # Join the tokens back into a string\n", + " text = ' '.join(tokens)\n", + "\n", + " return text\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def prepare_data(df):\n", + " df['cleaned_text'] = df['text'].apply(preprocess_text)\n", + " sentence_list = [tweet for tweet in df['cleaned_text']]\n", + " word_list = [sent.split() for sent in sentence_list]\n", + "\n", + " #Create dictionary which contains Id and word\n", + " word_to_id = corpora.Dictionary(word_list) #generate unique tokens\n", + " corpus = [word_to_id.doc2bow(tweet) for tweet in word_list]\n", + " \n", + " return df, word_list, word_to_id, corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(corpus, word_to_id):\n", + " # Build LDA model\n", + " lda_model = gensim.models.ldamodel.LdaModel(corpus,\n", + " id2word=word_to_id,\n", + " num_topics=5,\n", + " random_state=100,\n", + " update_every=1,\n", + " chunksize=100,\n", + " passes=10,\n", + " alpha='auto',\n", + " per_word_topics=True) \n", + " return lda_model" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def show_topics(lda_model):\n", + " pprint(lda_model.show_topics(formatted=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def model_analysis(lda_model, corpus, word_list, word_to_id):\n", + " print('\\nPerplexity: ', lda_model.log_perplexity(corpus))\n", + " doc_lda = lda_model[corpus]\n", + "\n", + "\n", + " # Compute Coherence Score\n", + " coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=word_to_id, coherence='c_v')\n", + " coherence_lda = coherence_model_lda.get_coherence()\n", + " print('\\n Lda model Coherence Score/Accuracy on Tweets: ', coherence_lda)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def get_top_topics(df):\n", + " df, word_list, word_to_id, corpus = prepare_data(df)\n", + " lda_model = build_model(corpus, word_to_id)\n", + "\n", + " # Show the top 10 topics\n", + " show_topics(lda_model)\n", + " \n", + "\n", + " # Visualize the top 10 topics\n", + " pyLDAvis.enable_notebook()\n", + " LDAvis_prepared = gensimvis.prepare(lda_model, corpus, word_to_id)\n", + " return LDAvis_prepared" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Top 10 topics of the different channels" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'src.utils' has no attribute 'get_messages_on_channel'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[11], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m get_channel_messages(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall-week1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", + "Cell \u001b[1;32mIn[3], line 4\u001b[0m, in \u001b[0;36mget_channel_messages\u001b[1;34m(channel)\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_channel_messages\u001b[39m(channel):\n\u001b[1;32m----> 4\u001b[0m channel_messages \u001b[38;5;241m=\u001b[39m utils\u001b[38;5;241m.\u001b[39mget_messages_on_channel(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC:/Users/moyka/Desktop/Dashboard/anonymized/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mchannel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# Create an empty DataFrame\u001b[39;00m\n\u001b[0;32m 6\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(channel_messages)\n", + "\u001b[1;31mAttributeError\u001b[0m: module 'src.utils' has no attribute 'get_messages_on_channel'" + ] + } + ], + "source": [ + "df = get_channel_messages(\"all-week1\")\n", + "get_top_topics(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] C:\\Users\\moyka\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nltk\n", + "nltk.download('wordnet')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'src.utils' has no attribute 'get_messages_on_channel'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[13], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m get_channel_messages(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall-community-building\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", + "Cell \u001b[1;32mIn[3], line 4\u001b[0m, in \u001b[0;36mget_channel_messages\u001b[1;34m(channel)\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_channel_messages\u001b[39m(channel):\n\u001b[1;32m----> 4\u001b[0m channel_messages \u001b[38;5;241m=\u001b[39m utils\u001b[38;5;241m.\u001b[39mget_messages_on_channel(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC:/Users/moyka/Desktop/Dashboard/anonymized/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mchannel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# Create an empty DataFrame\u001b[39;00m\n\u001b[0;32m 6\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(channel_messages)\n", + "\u001b[1;31mAttributeError\u001b[0m: module 'src.utils' has no attribute 'get_messages_on_channel'" + ] + } + ], + "source": [ + "df = get_channel_messages(\"all-community-building\")\n", + "get_top_topics(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Top 10 topics of all channels" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'src.utils' has no attribute 'get_messages_on_channel'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[14], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m get_all_channels_message()\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", + "Cell \u001b[1;32mIn[3], line 13\u001b[0m, in \u001b[0;36mget_all_channels_message\u001b[1;34m()\u001b[0m\n\u001b[0;32m 10\u001b[0m dfs \u001b[38;5;241m=\u001b[39m [] \u001b[38;5;66;03m# List to store individual DataFrames\u001b[39;00m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m channel \u001b[38;5;129;01min\u001b[39;00m data_loader\u001b[38;5;241m.\u001b[39mchannels:\n\u001b[1;32m---> 13\u001b[0m dfs\u001b[38;5;241m.\u001b[39mappend(get_channel_messages(channel[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m]))\n\u001b[0;32m 15\u001b[0m \u001b[38;5;66;03m# Concatenate all DataFrames into a single DataFrame\u001b[39;00m\n\u001b[0;32m 16\u001b[0m result_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat(dfs, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "Cell \u001b[1;32mIn[3], line 4\u001b[0m, in \u001b[0;36mget_channel_messages\u001b[1;34m(channel)\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_channel_messages\u001b[39m(channel):\n\u001b[1;32m----> 4\u001b[0m channel_messages \u001b[38;5;241m=\u001b[39m utils\u001b[38;5;241m.\u001b[39mget_messages_on_channel(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC:/Users/moyka/Desktop/Dashboard/anonymized/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mchannel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# Create an empty DataFrame\u001b[39;00m\n\u001b[0;32m 6\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(channel_messages)\n", + "\u001b[1;31mAttributeError\u001b[0m: module 'src.utils' has no attribute 'get_messages_on_channel'" + ] + } + ], + "source": [ + "df = get_all_channels_message()\n", + "get_top_topics(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/parse_slack_data.ipynb b/notebooks/parse_slack_data.ipynb deleted file mode 100644 index e3774f8..0000000 --- a/notebooks/parse_slack_data.ipynb +++ /dev/null @@ -1,452 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%reload_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os, sys\n", - "import re\n", - "import json\n", - "import glob\n", - "import datetime\n", - "from collections import Counter\n", - "\n", - "import pandas as pd\n", - "from matplotlib import pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from nltk.corpus import stopwords\n", - "from wordcloud import WordCloud" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Add parent directory to path to import modules from src\n", - "rpath = os.path.abspath('..')\n", - "if rpath not in sys.path:\n", - " sys.path.insert(0, rpath)\n", - "\n", - "from src.loader import SlackDataLoader\n", - "import src.utils as utils" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Columns we can get from a slack message
\n", - "\n", - "message_type, message_content, sender_id, time_sent, message_distribution, time_thread_start, reply_count, reply_user_count, time_thread_end, reply_users" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From a single slack message, we can get
\n", - "\n", - "1. The message
\n", - "2. Type (message, file, link, etc)
\n", - "3. The sender_id (assigned by slack)
\n", - "4. The time the message was sent
\n", - "5. The team (i don't know what that is now)
\n", - "6. The type of the message (broadcast message, inhouse, just messgae)
\n", - "7. The thread the message generated (from here we can go):
\n", - " 7.1 Text/content of the message
\n", - " 7.2 The thread time of the message
\n", - " 7.3 The thread count (reply count)
\n", - " 7.4 The number of user that reply the message (count of users that participated in the thread)
\n", - " 7.5 The time the last thread message was sent
\n", - " 7.6 The users that participated in the thread (their ids are stored as well)
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# combine all json file in all-weeks8-9\n", - "def slack_parser(path_channel):\n", - " \"\"\" parse slack data to extract useful informations from the json file\n", - " step of execution\n", - " 1. Import the required modules\n", - " 2. read all json file from the provided path\n", - " 3. combine all json files in the provided path\n", - " 4. extract all required informations from the slack data\n", - " 5. convert to dataframe and merge all\n", - " 6. reset the index and return dataframe\n", - " \"\"\"\n", - "\n", - " # specify path to get json files\n", - " combined = []\n", - " for json_file in glob.glob(f\"{path_channel}*.json\"):\n", - " with open(json_file, 'r', encoding=\"utf8\") as slack_data:\n", - " combined.append(slack_data)\n", - "\n", - " # loop through all json files and extract required informations\n", - " dflist = []\n", - " for slack_data in combined:\n", - "\n", - " msg_type, msg_content, sender_id, time_msg, msg_dist, time_thread_st, reply_users, \\\n", - " reply_count, reply_users_count, tm_thread_end = [],[],[],[],[],[],[],[],[],[]\n", - "\n", - " for row in slack_data:\n", - " if 'bot_id' in row.keys():\n", - " continue\n", - " else:\n", - " msg_type.append(row['type'])\n", - " msg_content.append(row['text'])\n", - " if 'user_profile' in row.keys(): sender_id.append(row['user_profile']['real_name'])\n", - " else: sender_id.append('Not provided')\n", - " time_msg.append(row['ts'])\n", - " if 'blocks' in row.keys() and len(row['blocks'][0]['elements'][0]['elements']) != 0 :\n", - " msg_dist.append(row['blocks'][0]['elements'][0]['elements'][0]['type'])\n", - " else: msg_dist.append('reshared')\n", - " if 'thread_ts' in row.keys():\n", - " time_thread_st.append(row['thread_ts'])\n", - " else:\n", - " time_thread_st.append(0)\n", - " if 'reply_users' in row.keys(): reply_users.append(\",\".join(row['reply_users'])) \n", - " else: reply_users.append(0)\n", - " if 'reply_count' in row.keys():\n", - " reply_count.append(row['reply_count'])\n", - " reply_users_count.append(row['reply_users_count'])\n", - " tm_thread_end.append(row['latest_reply'])\n", - " else:\n", - " reply_count.append(0)\n", - " reply_users_count.append(0)\n", - " tm_thread_end.append(0)\n", - " data = zip(msg_type, msg_content, sender_id, time_msg, msg_dist, time_thread_st,\n", - " reply_count, reply_users_count, reply_users, tm_thread_end)\n", - " columns = ['msg_type', 'msg_content', 'sender_name', 'msg_sent_time', 'msg_dist_type',\n", - " 'time_thread_start', 'reply_count', 'reply_users_count', 'reply_users', 'tm_thread_end']\n", - "\n", - " df = pd.DataFrame(data=data, columns=columns)\n", - " df = df[df['sender_name'] != 'Not provided']\n", - " dflist.append(df)\n", - "\n", - " dfall = pd.concat(dflist, ignore_index=True)\n", - " dfall['channel'] = path_channel.split('/')[-1].split('.')[0] \n", - " dfall = dfall.reset_index(drop=True)\n", - " \n", - " return dfall\n", - "\n", - "\n", - "def parse_slack_reaction(path, channel):\n", - " \"\"\"get reactions\"\"\"\n", - " dfall_reaction = pd.DataFrame()\n", - " combined = []\n", - " for json_file in glob.glob(f\"{path}*.json\"):\n", - " with open(json_file, 'r') as slack_data:\n", - " combined.append(slack_data)\n", - "\n", - " reaction_name, reaction_count, reaction_users, msg, user_id = [], [], [], [], []\n", - "\n", - " for k in combined:\n", - " slack_data = json.load(open(k.name, 'r', encoding=\"utf-8\"))\n", - " \n", - " for i_count, i in enumerate(slack_data):\n", - " if 'reactions' in i.keys():\n", - " for j in range(len(i['reactions'])):\n", - " msg.append(i['text'])\n", - " user_id.append(i['user'])\n", - " reaction_name.append(i['reactions'][j]['name'])\n", - " reaction_count.append(i['reactions'][j]['count'])\n", - " reaction_users.append(\",\".join(i['reactions'][j]['users']))\n", - " \n", - " data_reaction = zip(reaction_name, reaction_count, reaction_users, msg, user_id)\n", - " columns_reaction = ['reaction_name', 'reaction_count', 'reaction_users_count', 'message', 'user_id']\n", - " df_reaction = pd.DataFrame(data=data_reaction, columns=columns_reaction)\n", - " df_reaction['channel'] = channel\n", - " return df_reaction\n", - "\n", - "def get_community_participation(path):\n", - " \"\"\" specify path to get json files\"\"\"\n", - " combined = []\n", - " comm_dict = {}\n", - " for json_file in glob.glob(f\"{path}*.json\"):\n", - " with open(json_file, 'r') as slack_data:\n", - " combined.append(slack_data)\n", - " # print(f\"Total json files is {len(combined)}\")\n", - " for i in combined:\n", - " a = json.load(open(i.name, 'r', encoding='utf-8'))\n", - "\n", - " for msg in a:\n", - " if 'replies' in msg.keys():\n", - " for i in msg['replies']:\n", - " comm_dict[i['user']] = comm_dict.get(i['user'], 0)+1\n", - " return comm_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def convert_2_timestamp(column, data):\n", - " \"\"\"convert from unix time to readable timestamp\n", - " args: column: columns that needs to be converted to timestamp\n", - " data: data that has the specified column\n", - " \"\"\"\n", - " if column in data.columns.values:\n", - " timestamp_ = []\n", - " for time_unix in data[column]:\n", - " if time_unix == 0:\n", - " timestamp_.append(0)\n", - " else:\n", - " a = datetime.datetime.fromtimestamp(float(time_unix))\n", - " timestamp_.append(a.strftime('%Y-%m-%d %H:%M:%S'))\n", - " return timestamp_\n", - " else: \n", - " print(f\"{column} not in data\")\n", - "\n", - "def get_tagged_users(df):\n", - " \"\"\"get all @ in the messages\"\"\"\n", - "\n", - " return df['msg_content'].map(lambda x: re.findall(r'@U\\w+', x))\n", - "\n", - "\n", - " \n", - "def map_userid_2_realname(user_profile: dict, comm_dict: dict, plot=False):\n", - " \"\"\"\n", - " map slack_id to realnames\n", - " user_profile: a dictionary that contains users info such as real_names\n", - " comm_dict: a dictionary that contains slack_id and total_message sent by that slack_id\n", - " \"\"\"\n", - " user_dict = {} # to store the id\n", - " real_name = [] # to store the real name\n", - " ac_comm_dict = {} # to store the mapping\n", - " count = 0\n", - " # collect all the real names\n", - " for i in range(len(user_profile['profile'])):\n", - " real_name.append(dict(user_profile['profile'])[i]['real_name'])\n", - "\n", - " # loop the slack ids\n", - " for i in user_profile['id']:\n", - " user_dict[i] = real_name[count]\n", - " count += 1\n", - "\n", - " # to store mapping\n", - " for i in comm_dict:\n", - " if i in user_dict:\n", - " ac_comm_dict[user_dict[i]] = comm_dict[i]\n", - "\n", - " ac_comm_dict = pd.DataFrame(data= zip(ac_comm_dict.keys(), ac_comm_dict.values()),\n", - " columns=['LearnerName', '# of Msg sent in Threads']).sort_values(by='# of Msg sent in Threads', ascending=False)\n", - " \n", - " if plot:\n", - " ac_comm_dict.plot.bar(figsize=(15, 7.5), x='LearnerName', y='# of Msg sent in Threads')\n", - " plt.title('Student based on Message sent in thread', size=20)\n", - " \n", - " return ac_comm_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_top_20_user(data, channel='Random'):\n", - " \"\"\"get user with the highest number of message sent to any channel\"\"\"\n", - "\n", - " data['sender_name'].value_counts()[:20].plot.bar(figsize=(15, 7.5))\n", - " plt.title(f'Top 20 Message Senders in #{channel} channels', size=15, fontweight='bold')\n", - " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=14);\n", - " plt.xticks(size=12); plt.yticks(size=12);\n", - " plt.show()\n", - "\n", - " data['sender_name'].value_counts()[-10:].plot.bar(figsize=(15, 7.5))\n", - " plt.title(f'Bottom 10 Message Senders in #{channel} channels', size=15, fontweight='bold')\n", - " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=14);\n", - " plt.xticks(size=12); plt.yticks(size=12);\n", - " plt.show()\n", - "\n", - "def draw_avg_reply_count(data, channel='Random'):\n", - " \"\"\"who commands many reply?\"\"\"\n", - "\n", - " data.groupby('sender_name')['reply_count'].mean().sort_values(ascending=False)[:20]\\\n", - " .plot(kind='bar', figsize=(15,7.5));\n", - " plt.title(f'Average Number of reply count per Sender in #{channel}', size=20, fontweight='bold')\n", - " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=18);\n", - " plt.xticks(size=14); plt.yticks(size=14);\n", - " plt.show()\n", - "\n", - "def draw_avg_reply_users_count(data, channel='Random'):\n", - " \"\"\"who commands many user reply?\"\"\"\n", - "\n", - " data.groupby('sender_name')['reply_users_count'].mean().sort_values(ascending=False)[:20].plot(kind='bar',\n", - " figsize=(15,7.5));\n", - " plt.title(f'Average Number of reply user count per Sender in #{channel}', size=20, fontweight='bold')\n", - " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=18);\n", - " plt.xticks(size=14); plt.yticks(size=14);\n", - " plt.show()\n", - "\n", - "def draw_wordcloud(msg_content, week): \n", - " # word cloud visualization\n", - " allWords = ' '.join([twts for twts in msg_content])\n", - " wordCloud = WordCloud(background_color='#975429', width=500, height=300, random_state=21, max_words=500, mode='RGBA',\n", - " max_font_size=140, stopwords=stopwords.words('english')).generate(allWords)\n", - " plt.figure(figsize=(15, 7.5))\n", - " plt.imshow(wordCloud, interpolation=\"bilinear\")\n", - " plt.axis('off')\n", - " plt.tight_layout()\n", - " plt.title(f'WordCloud for {week}', size=30)\n", - " plt.show()\n", - "\n", - "def draw_user_reaction(data, channel='General'):\n", - " data.groupby('sender_name')[['reply_count', 'reply_users_count']].sum()\\\n", - " .sort_values(by='reply_count',ascending=False)[:10].plot(kind='bar', figsize=(15, 7.5))\n", - " plt.title(f'User with the most reaction in #{channel}', size=25);\n", - " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=18);\n", - " plt.xticks(size=14); plt.yticks(size=14);\n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Insight Extraction\n", - "\n", - "Below are some useful questions to answer. Feel free to explore to answer other interesting questions that may be of help to get insight about student's behaviour, need, and future performance " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# which user has the highest number of reply counts?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize reply counts per user per channel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# what is the time range of the day that most messages are sent?\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# what kind of messages are replied faster than others?" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# Relationship between # of messages and # of reactions" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# Classify messages into different categories such as questions, answers, comments, etc." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "# Which users got the most reactions?" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "# Model topics mentioned in the channel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# What are the topics that got the most reactions?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Harder questions to look into" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Based on messages, reactions, references shared, and other relevant data such as classification of questions into techical question, comment, answer, aorder stu the python, statistics, and sql skill level of a user?" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/src/loader.py b/src/loader.py index c75b68d..5faa2f7 100644 --- a/src/loader.py +++ b/src/loader.py @@ -34,7 +34,7 @@ def __init__(self, path): ''' self.path = path self.channels = self.get_channels() - self.users = self.get_ussers() + self.users = self.get_users() def get_users(self): diff --git a/src/utils.py b/src/utils.py index 45dda22..7f98cac 100644 --- a/src/utils.py +++ b/src/utils.py @@ -180,3 +180,25 @@ def convert_2_timestamp(column, data): timestamp_.append(a.strftime('%Y-%m-%d %H:%M:%S')) return timestamp_ else: print(f"{column} not in data") +def get_messages_on_channel(channel_path): + + json_files = [ + f"{channel_path}/{pos_json}" + for pos_json in os.listdir(channel_path) + if pos_json.endswith('.json') + ] + combined = [] + + for json_file in json_files: + with open(json_file, 'r', encoding="utf8") as slack_data: + json_content = json.load(slack_data) + combined.extend(json_content) + + messages = [] + + for msg in combined: + msg_list, _ = process_message(msg) + messages.append(msg_list) + + + return messages \ No newline at end of file