From 5fb73bb6260ca144831db1903ab97784a5dec839 Mon Sep 17 00:00:00 2001 From: moykad Date: Fri, 1 Dec 2023 11:10:40 +0300 Subject: [PATCH 1/4] Topic Modeling --- notebooks/Topic_Modelling.ipynb | 800 +++++++++++++++++++++++++++++++ notebooks/parse_slack_data.ipynb | 452 ----------------- week0_starter_network_analysis | 1 + 3 files changed, 801 insertions(+), 452 deletions(-) create mode 100644 notebooks/Topic_Modelling.ipynb delete mode 100644 notebooks/parse_slack_data.ipynb create mode 160000 week0_starter_network_analysis diff --git a/notebooks/Topic_Modelling.ipynb b/notebooks/Topic_Modelling.ipynb new file mode 100644 index 0000000..6ee4f98 --- /dev/null +++ b/notebooks/Topic_Modelling.ipynb @@ -0,0 +1,800 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import gensim\n", + "from gensim.models import CoherenceModel\n", + "from gensim import corpora\n", + "import pandas as pd\n", + "from pprint import pprint\n", + "import string\n", + "import os\n", + "import re\n", + "import sys\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import PorterStemmer, WordNetLemmatizer\n", + "import pyLDAvis.gensim_models as gensimvis\n", + "import pyLDAvis\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: pyLDAvis in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (3.4.1)\n", + "Requirement already satisfied: numpy>=1.24.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (1.24.3)\n", + "Requirement already satisfied: scipy in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (1.11.1)\n", + "Requirement already satisfied: pandas>=2.0.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (2.0.3)\n", + "Requirement already satisfied: joblib>=1.2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (1.2.0)\n", + "Requirement already satisfied: jinja2 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (3.1.2)\n", + "Requirement already satisfied: numexpr in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (2.8.4)\n", + "Requirement already satisfied: funcy in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from pyLDAvis) (2.0)\n", + "Requirement already satisfied: scikit-learn>=1.0.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (1.3.0)\n", + "Requirement already satisfied: gensim in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (4.3.0)\n", + "Requirement already satisfied: setuptools in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (68.0.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=2.0.0->pyLDAvis) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=2.0.0->pyLDAvis) (2023.3.post1)\n", + "Requirement already satisfied: tzdata>=2022.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=2.0.0->pyLDAvis) (2023.3)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=1.0.0->pyLDAvis) (2.2.0)\n", + "Requirement already satisfied: smart-open>=1.8.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from gensim->pyLDAvis) (5.2.1)\n", + "Requirement already satisfied: FuzzyTM>=0.4.0 in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from gensim->pyLDAvis) (2.0.5)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from jinja2->pyLDAvis) (2.1.1)\n", + "Requirement already satisfied: pyfume in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from FuzzyTM>=0.4.0->gensim->pyLDAvis) (0.2.25)\n", + "Requirement already satisfied: six>=1.5 in c:\\programdata\\anaconda3\\lib\\site-packages (from python-dateutil>=2.8.2->pandas>=2.0.0->pyLDAvis) (1.16.0)\n", + "Requirement already satisfied: simpful in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis) (2.11.1)\n", + "Requirement already satisfied: fst-pso in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis) (1.8.1)\n", + "Requirement already satisfied: miniful in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis) (0.0.6)\n" + ] + } + ], + "source": [ + "!pip install pyLDAvis" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Add parent directory to path to import modules from src\n", + "rpath = os.path.abspath('C:/Users/moyka/OneDrive/Documents/GitHub/week0_starter_network_analysis')\n", + "if rpath not in sys.path:\n", + " sys.path.insert(0, rpath)\n", + "\n", + "from src.loader import SlackDataLoader\n", + "import src.utils as utils" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "data_loader = SlackDataLoader(r\"C:/Users/moyka/OneDrive/Documents/GitHub/anonymized/\")\n", + "\n", + "def get_channel_messages(channel):\n", + " channel_messages = utils.get_messages_on_channel(f\"C:/Users/moyka/OneDrive/Documents/GitHub//{channel}\") \n", + " # Create an empty DataFrame\n", + " df = pd.DataFrame(channel_messages)\n", + " return df\n", + "\n", + "def get_all_channels_message():\n", + " dfs = [] # List to store individual DataFrames\n", + "\n", + " for channel in data_loader.channels:\n", + " dfs.append(get_channel_messages(channel[\"name\"]))\n", + "\n", + " # Concatenate all DataFrames into a single DataFrame\n", + " result_df = pd.concat(dfs, ignore_index=True)\n", + "\n", + " return result_df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess_text(text):\n", + " # Extract and remove URLs\n", + " urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\\\(\\\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)\n", + " for url in urls:\n", + " text = text.replace(url, '')\n", + "\n", + " text = re.sub(r'<@.*?>', '', text)\n", + "\n", + " # Convert to lowercase\n", + " text = text.lower()\n", + "\n", + " # Remove punctuation\n", + " text = ''.join([char for char in text if char not in string.punctuation])\n", + "\n", + " # Remove numbers\n", + " text = re.sub(r'\\d+', '', text)\n", + "\n", + " # Tokenize\n", + " tokens = word_tokenize(text)\n", + "\n", + " # Remove stop words\n", + " stop_words = set(stopwords.words('english'))\n", + " tokens = [word for word in tokens if word not in stop_words]\n", + "\n", + " # Perform stemming\n", + " stemmer = PorterStemmer()\n", + " tokens = [stemmer.stem(word) for word in tokens]\n", + "\n", + " # Perform lemmatization\n", + " lemmatizer = WordNetLemmatizer()\n", + " tokens = [lemmatizer.lemmatize(word) for word in tokens]\n", + "\n", + " # Join the tokens back into a string\n", + " text = ' '.join(tokens)\n", + "\n", + " return text\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def prepare_data(df):\n", + " df['cleaned_text'] = df['text'].apply(preprocess_text)\n", + " sentence_list = [tweet for tweet in df['cleaned_text']]\n", + " word_list = [sent.split() for sent in sentence_list]\n", + "\n", + " #Create dictionary which contains Id and word\n", + " word_to_id = corpora.Dictionary(word_list) #generate unique tokens\n", + " corpus = [word_to_id.doc2bow(tweet) for tweet in word_list]\n", + " \n", + " return df, word_list, word_to_id, corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(corpus, word_to_id):\n", + " # Build LDA model\n", + " lda_model = gensim.models.ldamodel.LdaModel(corpus,\n", + " id2word=word_to_id,\n", + " num_topics=5,\n", + " random_state=100,\n", + " update_every=1,\n", + " chunksize=100,\n", + " passes=10,\n", + " alpha='auto',\n", + " per_word_topics=True) \n", + " return lda_model" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def show_topics(lda_model):\n", + " pprint(lda_model.show_topics(formatted=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def model_analysis(lda_model, corpus, word_list, word_to_id):\n", + " print('\\nPerplexity: ', lda_model.log_perplexity(corpus))\n", + " doc_lda = lda_model[corpus]\n", + "\n", + "\n", + " # Compute Coherence Score\n", + " coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=word_to_id, coherence='c_v')\n", + " coherence_lda = coherence_model_lda.get_coherence()\n", + " print('\\n Lda model Coherence Score/Accuracy on Tweets: ', coherence_lda)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def get_top_topics(df):\n", + " df, word_list, word_to_id, corpus = prepare_data(df)\n", + " lda_model = build_model(corpus, word_to_id)\n", + "\n", + " # Show the top 10 topics\n", + " show_topics(lda_model)\n", + " \n", + "\n", + " # Visualize the top 10 topics\n", + " pyLDAvis.enable_notebook()\n", + " LDAvis_prepared = gensimvis.prepare(lda_model, corpus, word_to_id)\n", + " return LDAvis_prepared" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Top 10 topics of the different channels" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0,\n", + " [('help', 0.039370384),\n", + " ('time', 0.03086105),\n", + " ('could', 0.026818035),\n", + " ('submiss', 0.02321841),\n", + " ('dont', 0.016984716),\n", + " ('got', 0.015203388),\n", + " ('explain', 0.015147981),\n", + " ('anyon', 0.014908787),\n", + " ('ye', 0.014444858),\n", + " ('see', 0.014271538)]),\n", + " (1,\n", + " [('use', 0.03954693),\n", + " ('featur', 0.027709492),\n", + " ('make', 0.02753079),\n", + " ('one', 0.022003634),\n", + " ('task', 0.016114524),\n", + " ('streamlit', 0.015737709),\n", + " ('work', 0.014302439),\n", + " ('tri', 0.014013731),\n", + " ('sure', 0.013291272),\n", + " ('im', 0.011552603)]),\n", + " (2,\n", + " [('data', 0.050371937),\n", + " ('think', 0.03176376),\n", + " ('use', 0.022391073),\n", + " ('engag', 0.019544115),\n", + " ('score', 0.01950423),\n", + " ('cluster', 0.017591735),\n", + " ('normal', 0.01662211),\n", + " ('valu', 0.01587388),\n", + " ('experi', 0.015768496),\n", + " ('get', 0.015297936)]),\n", + " (3,\n", + " [('link', 0.029798074),\n", + " ('column', 0.025568051),\n", + " ('submit', 0.025158001),\n", + " ('task', 0.023643794),\n", + " ('use', 0.022553697),\n", + " ('go', 0.019353328),\n", + " ('user', 0.019226132),\n", + " ('repo', 0.018446513),\n", + " ('dashboard', 0.017137216),\n", + " ('need', 0.015926724)]),\n", + " (4,\n", + " [('thank', 0.09694823),\n", + " ('like', 0.0417825),\n", + " ('report', 0.034107205),\n", + " ('channel', 0.021120002),\n", + " ('would', 0.018848905),\n", + " ('okay', 0.01763415),\n", + " ('ask', 0.01697686),\n", + " ('much', 0.014890067),\n", + " ('join', 0.0138974115),\n", + " ('handset', 0.0118703805)])]\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "PreparedData(topic_coordinates= x y topics cluster Freq\n", + "topic \n", + "3 0.222826 0.162338 1 1 28.690014\n", + "2 0.091699 -0.250624 2 1 24.262345\n", + "1 -0.007488 -0.012498 3 1 19.354587\n", + "0 -0.131076 0.099128 4 1 15.543622\n", + "4 -0.175960 0.001657 5 1 12.149431, topic_info= Term Freq Total Category logprob loglift\n", + "24 thank 47.000000 47.000000 Default 30.0000 30.0000\n", + "4 data 49.000000 49.000000 Default 29.0000 29.0000\n", + "39 help 33.000000 33.000000 Default 28.0000 28.0000\n", + "55 like 20.000000 20.000000 Default 27.0000 27.0000\n", + "86 think 31.000000 31.000000 Default 26.0000 26.0000\n", + ".. ... ... ... ... ... ...\n", + "649 ’ 4.981852 9.575574 Topic5 -4.5703 1.4545\n", + "335 realli 3.838983 6.969174 Topic5 -4.8309 1.5116\n", + "538 df 3.444510 6.668235 Topic5 -4.9393 1.4473\n", + "94 tri 3.817392 23.164060 Topic5 -4.8365 0.3049\n", + "945 mentor 2.762573 4.804383 Topic5 -5.1599 1.5545\n", + "\n", + "[218 rows x 6 columns], token_table= Topic Freq Term\n", + "term \n", + "572 4 0.798847 actual\n", + "132 5 0.895951 alright\n", + "2 1 0.938798 also\n", + "760 4 0.789122 alumnus\n", + "322 1 0.931152 analysi\n", + "... ... ... ...\n", + "648 5 0.798406 ‘\n", + "649 2 0.208865 ’\n", + "649 4 0.208865 ’\n", + "649 5 0.522162 ’\n", + "203 2 0.959415 •\n", + "\n", + "[218 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[4, 3, 2, 1, 5])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = get_channel_messages(\"all-week1\")\n", + "get_top_topics(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] C:\\Users\\Eshetu\\AppData\\Roaming\\nltk_data...\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nltk\n", + "nltk.download('wordnet')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0,\n", + " [('rollingonthefloorlaugh', 0.088871546),\n", + " ('parent', 0.053244308),\n", + " ('good', 0.04075855),\n", + " ('make', 0.02399027),\n", + " ('morn', 0.018489508),\n", + " ('would', 0.017024687),\n", + " ('man', 0.01674058),\n", + " ('realli', 0.014869037),\n", + " ('love', 0.012299013),\n", + " ('life', 0.010928868)]),\n", + " (1,\n", + " [('pray', 0.080668695),\n", + " ('u', 0.07174005),\n", + " ('like', 0.065636344),\n", + " ('also', 0.049389277),\n", + " ('someon', 0.041917514),\n", + " ('one', 0.030192202),\n", + " ('prayer', 0.028453592),\n", + " ('know', 0.0248715),\n", + " ('think', 0.017885309),\n", + " ('bzw', 0.0164379)]),\n", + " (2,\n", + " [('hello', 0.046209835),\n", + " ('dont', 0.033363096),\n", + " ('want', 0.025301958),\n", + " ('peopl', 0.025089722),\n", + " ('last', 0.022465738),\n", + " ('miss', 0.021834275),\n", + " ('use', 0.020633506),\n", + " ('today', 0.018617414),\n", + " ('everyon', 0.013510507),\n", + " ('start', 0.0125754215)]),\n", + " (3,\n", + " [('need', 0.074772716),\n", + " ('job', 0.06344194),\n", + " ('grin', 0.017321585),\n", + " ('go', 0.017111976),\n", + " ('see', 0.017013228),\n", + " ('cant', 0.012809223),\n", + " ('graduat', 0.012070753),\n", + " ('done', 0.011215276),\n", + " ('sure', 0.011110627),\n", + " ('outsourc', 0.009723263)]),\n", + " (4,\n", + " [('told', 0.0705787),\n", + " ('get', 0.052049227),\n", + " ('immedi', 0.026630463),\n", + " ('•', 0.025180869),\n", + " ('cb', 0.02216999),\n", + " ('thank', 0.021754235),\n", + " ('time', 0.018611282),\n", + " ('guy', 0.01785792),\n", + " ('academi', 0.016819155),\n", + " ('plea', 0.014971133)])]\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "PreparedData(topic_coordinates= x y topics cluster Freq\n", + "topic \n", + "4 -0.316967 -0.090289 1 1 27.181033\n", + "1 0.170308 -0.266332 2 1 19.650094\n", + "0 0.066181 0.098589 3 1 18.739438\n", + "2 0.034563 0.143814 4 1 18.571699\n", + "3 0.045915 0.114218 5 1 15.857736, topic_info= Term Freq Total Category logprob \\\n", + "77 rollingonthefloorlaugh 588.000000 588.000000 Default 30.0000 \n", + "759 pray 561.000000 561.000000 Default 29.0000 \n", + "1506 told 680.000000 680.000000 Default 28.0000 \n", + "371 u 498.000000 498.000000 Default 27.0000 \n", + "648 need 419.000000 419.000000 Default 26.0000 \n", + "... ... ... ... ... ... \n", + "2579 kg 41.549688 43.765998 Topic5 -4.9040 \n", + "3869 sell 29.744465 30.623335 Topic5 -5.2382 \n", + "5639 outsourc 54.470326 278.651722 Topic5 -4.6332 \n", + "5637 bzw 42.705441 269.061348 Topic5 -4.8766 \n", + "5638 palmsuptogeth 42.705436 269.061318 Topic5 -4.8766 \n", + "\n", + " loglift \n", + "77 30.0000 \n", + "759 29.0000 \n", + "1506 28.0000 \n", + "371 27.0000 \n", + "648 26.0000 \n", + "... ... \n", + "2579 1.7895 \n", + "3869 1.8124 \n", + "5639 0.2092 \n", + "5637 0.0009 \n", + "5638 0.0009 \n", + "\n", + "[203 rows x 6 columns], token_table= Topic Freq Term\n", + "term \n", + "305 1 0.999103 academi\n", + "189 4 0.992490 activ\n", + "3983 5 0.975318 alien\n", + "174 2 0.977639 alreadi\n", + "628 2 0.998343 also\n", + "... ... ... ...\n", + "440 2 0.986758 ye\n", + "429 5 0.987888 yeah\n", + "810 3 0.980327 your\n", + "906 1 0.991522 ’\n", + "39 1 0.998276 •\n", + "\n", + "[196 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[5, 2, 1, 3, 4])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = get_channel_messages(\"all-community-building\")\n", + "get_top_topics(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Top 10 topics of all channels" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0,\n", + " [('work', 0.07804269),\n", + " ('instal', 0.039492182),\n", + " ('get', 0.03234122),\n", + " ('guy', 0.026092794),\n", + " ('face', 0.02030699),\n", + " ('return', 0.019842522),\n", + " ('ok', 0.015252527),\n", + " ('okay', 0.014948669),\n", + " ('pip', 0.01389798),\n", + " ('read', 0.013267612)]),\n", + " (1,\n", + " [('file', 0.0549693),\n", + " ('line', 0.034151033),\n", + " ('meet', 0.02210031),\n", + " ('extract', 0.020621968),\n", + " ('one', 0.020602355),\n", + " ('good', 0.017431682),\n", + " ('task', 0.017173653),\n", + " ('imag', 0.015683454),\n", + " ('logo', 0.01498308),\n", + " ('need', 0.014000982)]),\n", + " (2,\n", + " [('channel', 0.18221219),\n", + " ('join', 0.18198967),\n", + " ('connect', 0.018376695),\n", + " ('plea', 0.018241962),\n", + " ('time', 0.01755661),\n", + " ('start', 0.016417718),\n", + " ('•', 0.014001578),\n", + " ('today', 0.012244016),\n", + " ('code', 0.012109588),\n", + " ('link', 0.011688723)]),\n", + " (3,\n", + " [('thank', 0.041847672),\n", + " ('think', 0.03968491),\n", + " ('data', 0.03801513),\n", + " ('sure', 0.021638291),\n", + " ('creat', 0.019395646),\n", + " ('right', 0.019231077),\n", + " ('present', 0.01881592),\n", + " ('dont', 0.018710904),\n", + " ('model', 0.016673988),\n", + " ('project', 0.016466739)]),\n", + " (4,\n", + " [('use', 0.065344),\n", + " ('tri', 0.05039213),\n", + " ('error', 0.028861312),\n", + " ('ye', 0.028690428),\n", + " ('instanc', 0.02610646),\n", + " ('let', 0.021604491),\n", + " ('u', 0.019047005),\n", + " ('group', 0.017978994),\n", + " ('im', 0.01785427),\n", + " ('make', 0.017489491)])]\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "PreparedData(topic_coordinates= x y topics cluster Freq\n", + "topic \n", + "2 0.371924 -0.020110 1 1 25.993380\n", + "4 -0.090644 0.267852 2 1 20.377566\n", + "1 -0.119150 -0.263536 3 1 19.558450\n", + "3 -0.070161 0.083032 4 1 18.391670\n", + "0 -0.091969 -0.067238 5 1 15.678934, topic_info= Term Freq Total Category logprob loglift\n", + "0 channel 4785.000000 4785.000000 Default 30.0000 30.0000\n", + "1 join 4780.000000 4780.000000 Default 29.0000 29.0000\n", + "203 work 1237.000000 1237.000000 Default 28.0000 28.0000\n", + "24 use 1346.000000 1346.000000 Default 27.0000 27.0000\n", + "5632 file 1087.000000 1087.000000 Default 26.0000 26.0000\n", + "... ... ... ... ... ... ...\n", + "5079 fix 93.573706 94.353067 Topic5 -5.1316 1.8446\n", + "2138 frame 184.716212 186.261848 Topic5 -4.4515 1.8445\n", + "1393 recent 98.995630 100.062156 Topic5 -5.0753 1.8421\n", + "7526 conda 125.739371 131.182746 Topic5 -4.8361 1.8105\n", + "8245 sudo 123.883955 154.399777 Topic5 -4.8510 1.6327\n", + "\n", + "[189 rows x 6 columns], token_table= Topic Freq Term\n", + "term \n", + "351 4 0.993177 abl\n", + "377 2 0.997747 access\n", + "2050 3 0.996749 ad\n", + "3254 4 0.991437 airflow\n", + "174 5 0.997243 alreadi\n", + "... ... ... ...\n", + "429 4 0.994975 yeah\n", + "1131 2 0.994165 yesterday\n", + "810 5 0.995969 your\n", + "906 5 0.994548 ’\n", + "39 1 0.998721 •\n", + "\n", + "[181 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[3, 5, 2, 4, 1])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = get_all_channels_message()\n", + "get_top_topics(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/parse_slack_data.ipynb b/notebooks/parse_slack_data.ipynb deleted file mode 100644 index e3774f8..0000000 --- a/notebooks/parse_slack_data.ipynb +++ /dev/null @@ -1,452 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%reload_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os, sys\n", - "import re\n", - "import json\n", - "import glob\n", - "import datetime\n", - "from collections import Counter\n", - "\n", - "import pandas as pd\n", - "from matplotlib import pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from nltk.corpus import stopwords\n", - "from wordcloud import WordCloud" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Add parent directory to path to import modules from src\n", - "rpath = os.path.abspath('..')\n", - "if rpath not in sys.path:\n", - " sys.path.insert(0, rpath)\n", - "\n", - "from src.loader import SlackDataLoader\n", - "import src.utils as utils" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Columns we can get from a slack message
\n", - "\n", - "message_type, message_content, sender_id, time_sent, message_distribution, time_thread_start, reply_count, reply_user_count, time_thread_end, reply_users" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From a single slack message, we can get
\n", - "\n", - "1. The message
\n", - "2. Type (message, file, link, etc)
\n", - "3. The sender_id (assigned by slack)
\n", - "4. The time the message was sent
\n", - "5. The team (i don't know what that is now)
\n", - "6. The type of the message (broadcast message, inhouse, just messgae)
\n", - "7. The thread the message generated (from here we can go):
\n", - " 7.1 Text/content of the message
\n", - " 7.2 The thread time of the message
\n", - " 7.3 The thread count (reply count)
\n", - " 7.4 The number of user that reply the message (count of users that participated in the thread)
\n", - " 7.5 The time the last thread message was sent
\n", - " 7.6 The users that participated in the thread (their ids are stored as well)
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# combine all json file in all-weeks8-9\n", - "def slack_parser(path_channel):\n", - " \"\"\" parse slack data to extract useful informations from the json file\n", - " step of execution\n", - " 1. Import the required modules\n", - " 2. read all json file from the provided path\n", - " 3. combine all json files in the provided path\n", - " 4. extract all required informations from the slack data\n", - " 5. convert to dataframe and merge all\n", - " 6. reset the index and return dataframe\n", - " \"\"\"\n", - "\n", - " # specify path to get json files\n", - " combined = []\n", - " for json_file in glob.glob(f\"{path_channel}*.json\"):\n", - " with open(json_file, 'r', encoding=\"utf8\") as slack_data:\n", - " combined.append(slack_data)\n", - "\n", - " # loop through all json files and extract required informations\n", - " dflist = []\n", - " for slack_data in combined:\n", - "\n", - " msg_type, msg_content, sender_id, time_msg, msg_dist, time_thread_st, reply_users, \\\n", - " reply_count, reply_users_count, tm_thread_end = [],[],[],[],[],[],[],[],[],[]\n", - "\n", - " for row in slack_data:\n", - " if 'bot_id' in row.keys():\n", - " continue\n", - " else:\n", - " msg_type.append(row['type'])\n", - " msg_content.append(row['text'])\n", - " if 'user_profile' in row.keys(): sender_id.append(row['user_profile']['real_name'])\n", - " else: sender_id.append('Not provided')\n", - " time_msg.append(row['ts'])\n", - " if 'blocks' in row.keys() and len(row['blocks'][0]['elements'][0]['elements']) != 0 :\n", - " msg_dist.append(row['blocks'][0]['elements'][0]['elements'][0]['type'])\n", - " else: msg_dist.append('reshared')\n", - " if 'thread_ts' in row.keys():\n", - " time_thread_st.append(row['thread_ts'])\n", - " else:\n", - " time_thread_st.append(0)\n", - " if 'reply_users' in row.keys(): reply_users.append(\",\".join(row['reply_users'])) \n", - " else: reply_users.append(0)\n", - " if 'reply_count' in row.keys():\n", - " reply_count.append(row['reply_count'])\n", - " reply_users_count.append(row['reply_users_count'])\n", - " tm_thread_end.append(row['latest_reply'])\n", - " else:\n", - " reply_count.append(0)\n", - " reply_users_count.append(0)\n", - " tm_thread_end.append(0)\n", - " data = zip(msg_type, msg_content, sender_id, time_msg, msg_dist, time_thread_st,\n", - " reply_count, reply_users_count, reply_users, tm_thread_end)\n", - " columns = ['msg_type', 'msg_content', 'sender_name', 'msg_sent_time', 'msg_dist_type',\n", - " 'time_thread_start', 'reply_count', 'reply_users_count', 'reply_users', 'tm_thread_end']\n", - "\n", - " df = pd.DataFrame(data=data, columns=columns)\n", - " df = df[df['sender_name'] != 'Not provided']\n", - " dflist.append(df)\n", - "\n", - " dfall = pd.concat(dflist, ignore_index=True)\n", - " dfall['channel'] = path_channel.split('/')[-1].split('.')[0] \n", - " dfall = dfall.reset_index(drop=True)\n", - " \n", - " return dfall\n", - "\n", - "\n", - "def parse_slack_reaction(path, channel):\n", - " \"\"\"get reactions\"\"\"\n", - " dfall_reaction = pd.DataFrame()\n", - " combined = []\n", - " for json_file in glob.glob(f\"{path}*.json\"):\n", - " with open(json_file, 'r') as slack_data:\n", - " combined.append(slack_data)\n", - "\n", - " reaction_name, reaction_count, reaction_users, msg, user_id = [], [], [], [], []\n", - "\n", - " for k in combined:\n", - " slack_data = json.load(open(k.name, 'r', encoding=\"utf-8\"))\n", - " \n", - " for i_count, i in enumerate(slack_data):\n", - " if 'reactions' in i.keys():\n", - " for j in range(len(i['reactions'])):\n", - " msg.append(i['text'])\n", - " user_id.append(i['user'])\n", - " reaction_name.append(i['reactions'][j]['name'])\n", - " reaction_count.append(i['reactions'][j]['count'])\n", - " reaction_users.append(\",\".join(i['reactions'][j]['users']))\n", - " \n", - " data_reaction = zip(reaction_name, reaction_count, reaction_users, msg, user_id)\n", - " columns_reaction = ['reaction_name', 'reaction_count', 'reaction_users_count', 'message', 'user_id']\n", - " df_reaction = pd.DataFrame(data=data_reaction, columns=columns_reaction)\n", - " df_reaction['channel'] = channel\n", - " return df_reaction\n", - "\n", - "def get_community_participation(path):\n", - " \"\"\" specify path to get json files\"\"\"\n", - " combined = []\n", - " comm_dict = {}\n", - " for json_file in glob.glob(f\"{path}*.json\"):\n", - " with open(json_file, 'r') as slack_data:\n", - " combined.append(slack_data)\n", - " # print(f\"Total json files is {len(combined)}\")\n", - " for i in combined:\n", - " a = json.load(open(i.name, 'r', encoding='utf-8'))\n", - "\n", - " for msg in a:\n", - " if 'replies' in msg.keys():\n", - " for i in msg['replies']:\n", - " comm_dict[i['user']] = comm_dict.get(i['user'], 0)+1\n", - " return comm_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def convert_2_timestamp(column, data):\n", - " \"\"\"convert from unix time to readable timestamp\n", - " args: column: columns that needs to be converted to timestamp\n", - " data: data that has the specified column\n", - " \"\"\"\n", - " if column in data.columns.values:\n", - " timestamp_ = []\n", - " for time_unix in data[column]:\n", - " if time_unix == 0:\n", - " timestamp_.append(0)\n", - " else:\n", - " a = datetime.datetime.fromtimestamp(float(time_unix))\n", - " timestamp_.append(a.strftime('%Y-%m-%d %H:%M:%S'))\n", - " return timestamp_\n", - " else: \n", - " print(f\"{column} not in data\")\n", - "\n", - "def get_tagged_users(df):\n", - " \"\"\"get all @ in the messages\"\"\"\n", - "\n", - " return df['msg_content'].map(lambda x: re.findall(r'@U\\w+', x))\n", - "\n", - "\n", - " \n", - "def map_userid_2_realname(user_profile: dict, comm_dict: dict, plot=False):\n", - " \"\"\"\n", - " map slack_id to realnames\n", - " user_profile: a dictionary that contains users info such as real_names\n", - " comm_dict: a dictionary that contains slack_id and total_message sent by that slack_id\n", - " \"\"\"\n", - " user_dict = {} # to store the id\n", - " real_name = [] # to store the real name\n", - " ac_comm_dict = {} # to store the mapping\n", - " count = 0\n", - " # collect all the real names\n", - " for i in range(len(user_profile['profile'])):\n", - " real_name.append(dict(user_profile['profile'])[i]['real_name'])\n", - "\n", - " # loop the slack ids\n", - " for i in user_profile['id']:\n", - " user_dict[i] = real_name[count]\n", - " count += 1\n", - "\n", - " # to store mapping\n", - " for i in comm_dict:\n", - " if i in user_dict:\n", - " ac_comm_dict[user_dict[i]] = comm_dict[i]\n", - "\n", - " ac_comm_dict = pd.DataFrame(data= zip(ac_comm_dict.keys(), ac_comm_dict.values()),\n", - " columns=['LearnerName', '# of Msg sent in Threads']).sort_values(by='# of Msg sent in Threads', ascending=False)\n", - " \n", - " if plot:\n", - " ac_comm_dict.plot.bar(figsize=(15, 7.5), x='LearnerName', y='# of Msg sent in Threads')\n", - " plt.title('Student based on Message sent in thread', size=20)\n", - " \n", - " return ac_comm_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_top_20_user(data, channel='Random'):\n", - " \"\"\"get user with the highest number of message sent to any channel\"\"\"\n", - "\n", - " data['sender_name'].value_counts()[:20].plot.bar(figsize=(15, 7.5))\n", - " plt.title(f'Top 20 Message Senders in #{channel} channels', size=15, fontweight='bold')\n", - " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=14);\n", - " plt.xticks(size=12); plt.yticks(size=12);\n", - " plt.show()\n", - "\n", - " data['sender_name'].value_counts()[-10:].plot.bar(figsize=(15, 7.5))\n", - " plt.title(f'Bottom 10 Message Senders in #{channel} channels', size=15, fontweight='bold')\n", - " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=14);\n", - " plt.xticks(size=12); plt.yticks(size=12);\n", - " plt.show()\n", - "\n", - "def draw_avg_reply_count(data, channel='Random'):\n", - " \"\"\"who commands many reply?\"\"\"\n", - "\n", - " data.groupby('sender_name')['reply_count'].mean().sort_values(ascending=False)[:20]\\\n", - " .plot(kind='bar', figsize=(15,7.5));\n", - " plt.title(f'Average Number of reply count per Sender in #{channel}', size=20, fontweight='bold')\n", - " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=18);\n", - " plt.xticks(size=14); plt.yticks(size=14);\n", - " plt.show()\n", - "\n", - "def draw_avg_reply_users_count(data, channel='Random'):\n", - " \"\"\"who commands many user reply?\"\"\"\n", - "\n", - " data.groupby('sender_name')['reply_users_count'].mean().sort_values(ascending=False)[:20].plot(kind='bar',\n", - " figsize=(15,7.5));\n", - " plt.title(f'Average Number of reply user count per Sender in #{channel}', size=20, fontweight='bold')\n", - " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=18);\n", - " plt.xticks(size=14); plt.yticks(size=14);\n", - " plt.show()\n", - "\n", - "def draw_wordcloud(msg_content, week): \n", - " # word cloud visualization\n", - " allWords = ' '.join([twts for twts in msg_content])\n", - " wordCloud = WordCloud(background_color='#975429', width=500, height=300, random_state=21, max_words=500, mode='RGBA',\n", - " max_font_size=140, stopwords=stopwords.words('english')).generate(allWords)\n", - " plt.figure(figsize=(15, 7.5))\n", - " plt.imshow(wordCloud, interpolation=\"bilinear\")\n", - " plt.axis('off')\n", - " plt.tight_layout()\n", - " plt.title(f'WordCloud for {week}', size=30)\n", - " plt.show()\n", - "\n", - "def draw_user_reaction(data, channel='General'):\n", - " data.groupby('sender_name')[['reply_count', 'reply_users_count']].sum()\\\n", - " .sort_values(by='reply_count',ascending=False)[:10].plot(kind='bar', figsize=(15, 7.5))\n", - " plt.title(f'User with the most reaction in #{channel}', size=25);\n", - " plt.xlabel(\"Sender Name\", size=18); plt.ylabel(\"Frequency\", size=18);\n", - " plt.xticks(size=14); plt.yticks(size=14);\n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Insight Extraction\n", - "\n", - "Below are some useful questions to answer. Feel free to explore to answer other interesting questions that may be of help to get insight about student's behaviour, need, and future performance " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# which user has the highest number of reply counts?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize reply counts per user per channel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# what is the time range of the day that most messages are sent?\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# what kind of messages are replied faster than others?" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# Relationship between # of messages and # of reactions" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# Classify messages into different categories such as questions, answers, comments, etc." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "# Which users got the most reactions?" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "# Model topics mentioned in the channel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# What are the topics that got the most reactions?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Harder questions to look into" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Based on messages, reactions, references shared, and other relevant data such as classification of questions into techical question, comment, answer, aorder stu the python, statistics, and sql skill level of a user?" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/week0_starter_network_analysis b/week0_starter_network_analysis new file mode 160000 index 0000000..b6d012a --- /dev/null +++ b/week0_starter_network_analysis @@ -0,0 +1 @@ +Subproject commit b6d012a6f49fea4acd239a5ea7f09c8fe183e4c8 From 976df1b61e4f9adaeac379ac8e2d272c5e9229c2 Mon Sep 17 00:00:00 2001 From: moykad Date: Fri, 1 Dec 2023 11:21:47 +0300 Subject: [PATCH 2/4] Topic Modeling-1 --- notebooks/Topic_Modelling.ipynb | 349 ++++---------------------------- 1 file changed, 39 insertions(+), 310 deletions(-) diff --git a/notebooks/Topic_Modelling.ipynb b/notebooks/Topic_Modelling.ipynb index 6ee4f98..08be075 100644 --- a/notebooks/Topic_Modelling.ipynb +++ b/notebooks/Topic_Modelling.ipynb @@ -83,9 +83,23 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'C:/Users/moyka/OneDrive/Documents/GitHub/anonymized/channels.json'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mc:\\Users\\moyka\\Desktop\\Task-2\\notebooks\\Topic_Modelling.ipynb Cell 4\u001b[0m line \u001b[0;36m1\n\u001b[1;32m----> 1\u001b[0m data_loader \u001b[39m=\u001b[39m SlackDataLoader(\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mC:/Users/moyka/OneDrive/Documents/GitHub/anonymized/\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 3\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_channel_messages\u001b[39m(channel):\n\u001b[0;32m 4\u001b[0m channel_messages \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mget_messages_on_channel(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mC:/Users/moyka/OneDrive/Documents/GitHub//\u001b[39m\u001b[39m{\u001b[39;00mchannel\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \n", + "File \u001b[1;32m~\\OneDrive\\Documents\\GitHub\\week0_starter_network_analysis\\src\\loader.py:36\u001b[0m, in \u001b[0;36mSlackDataLoader.__init__\u001b[1;34m(self, path)\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[39m\u001b[39m\u001b[39m'''\u001b[39;00m\n\u001b[0;32m 33\u001b[0m \u001b[39mpath: path to the slack exported data folder\u001b[39;00m\n\u001b[0;32m 34\u001b[0m \u001b[39m'''\u001b[39;00m\n\u001b[0;32m 35\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpath \u001b[39m=\u001b[39m path\n\u001b[1;32m---> 36\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mchannels \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mget_channels()\n\u001b[0;32m 37\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39musers \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mget_ussers()\n", + "File \u001b[1;32m~\\OneDrive\\Documents\\GitHub\\week0_starter_network_analysis\\src\\loader.py:53\u001b[0m, in \u001b[0;36mSlackDataLoader.get_channels\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 49\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_channels\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m 50\u001b[0m \u001b[39m \u001b[39m\u001b[39m'''\u001b[39;00m\n\u001b[0;32m 51\u001b[0m \u001b[39m write a function to get all the channels from the json file\u001b[39;00m\n\u001b[0;32m 52\u001b[0m \u001b[39m '''\u001b[39;00m\n\u001b[1;32m---> 53\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mjoin(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpath, \u001b[39m'\u001b[39m\u001b[39mchannels.json\u001b[39m\u001b[39m'\u001b[39m), \u001b[39m'\u001b[39m\u001b[39mr\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[0;32m 54\u001b[0m channels \u001b[39m=\u001b[39m json\u001b[39m.\u001b[39mload(f)\n\u001b[0;32m 56\u001b[0m \u001b[39mreturn\u001b[39;00m channels\n", + "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'C:/Users/moyka/OneDrive/Documents/GitHub/anonymized/channels.json'" + ] + } + ], "source": [ "data_loader = SlackDataLoader(r\"C:/Users/moyka/OneDrive/Documents/GitHub/anonymized/\")\n", "\n", @@ -109,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -153,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -192,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -203,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -220,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -247,155 +261,19 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[(0,\n", - " [('help', 0.039370384),\n", - " ('time', 0.03086105),\n", - " ('could', 0.026818035),\n", - " ('submiss', 0.02321841),\n", - " ('dont', 0.016984716),\n", - " ('got', 0.015203388),\n", - " ('explain', 0.015147981),\n", - " ('anyon', 0.014908787),\n", - " ('ye', 0.014444858),\n", - " ('see', 0.014271538)]),\n", - " (1,\n", - " [('use', 0.03954693),\n", - " ('featur', 0.027709492),\n", - " ('make', 0.02753079),\n", - " ('one', 0.022003634),\n", - " ('task', 0.016114524),\n", - " ('streamlit', 0.015737709),\n", - " ('work', 0.014302439),\n", - " ('tri', 0.014013731),\n", - " ('sure', 0.013291272),\n", - " ('im', 0.011552603)]),\n", - " (2,\n", - " [('data', 0.050371937),\n", - " ('think', 0.03176376),\n", - " ('use', 0.022391073),\n", - " ('engag', 0.019544115),\n", - " ('score', 0.01950423),\n", - " ('cluster', 0.017591735),\n", - " ('normal', 0.01662211),\n", - " ('valu', 0.01587388),\n", - " ('experi', 0.015768496),\n", - " ('get', 0.015297936)]),\n", - " (3,\n", - " [('link', 0.029798074),\n", - " ('column', 0.025568051),\n", - " ('submit', 0.025158001),\n", - " ('task', 0.023643794),\n", - " ('use', 0.022553697),\n", - " ('go', 0.019353328),\n", - " ('user', 0.019226132),\n", - " ('repo', 0.018446513),\n", - " ('dashboard', 0.017137216),\n", - " ('need', 0.015926724)]),\n", - " (4,\n", - " [('thank', 0.09694823),\n", - " ('like', 0.0417825),\n", - " ('report', 0.034107205),\n", - " ('channel', 0.021120002),\n", - " ('would', 0.018848905),\n", - " ('okay', 0.01763415),\n", - " ('ask', 0.01697686),\n", - " ('much', 0.014890067),\n", - " ('join', 0.0138974115),\n", - " ('handset', 0.0118703805)])]\n" + "ename": "NameError", + "evalue": "name 'get_channel_messages' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mc:\\Users\\moyka\\Desktop\\Task-2\\notebooks\\Topic_Modelling.ipynb Cell 12\u001b[0m line \u001b[0;36m1\n\u001b[1;32m----> 1\u001b[0m df \u001b[39m=\u001b[39m get_channel_messages(\u001b[39m\"\u001b[39m\u001b[39mall-week1\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", + "\u001b[1;31mNameError\u001b[0m: name 'get_channel_messages' is not defined" ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "PreparedData(topic_coordinates= x y topics cluster Freq\n", - "topic \n", - "3 0.222826 0.162338 1 1 28.690014\n", - "2 0.091699 -0.250624 2 1 24.262345\n", - "1 -0.007488 -0.012498 3 1 19.354587\n", - "0 -0.131076 0.099128 4 1 15.543622\n", - "4 -0.175960 0.001657 5 1 12.149431, topic_info= Term Freq Total Category logprob loglift\n", - "24 thank 47.000000 47.000000 Default 30.0000 30.0000\n", - "4 data 49.000000 49.000000 Default 29.0000 29.0000\n", - "39 help 33.000000 33.000000 Default 28.0000 28.0000\n", - "55 like 20.000000 20.000000 Default 27.0000 27.0000\n", - "86 think 31.000000 31.000000 Default 26.0000 26.0000\n", - ".. ... ... ... ... ... ...\n", - "649 ’ 4.981852 9.575574 Topic5 -4.5703 1.4545\n", - "335 realli 3.838983 6.969174 Topic5 -4.8309 1.5116\n", - "538 df 3.444510 6.668235 Topic5 -4.9393 1.4473\n", - "94 tri 3.817392 23.164060 Topic5 -4.8365 0.3049\n", - "945 mentor 2.762573 4.804383 Topic5 -5.1599 1.5545\n", - "\n", - "[218 rows x 6 columns], token_table= Topic Freq Term\n", - "term \n", - "572 4 0.798847 actual\n", - "132 5 0.895951 alright\n", - "2 1 0.938798 also\n", - "760 4 0.789122 alumnus\n", - "322 1 0.931152 analysi\n", - "... ... ... ...\n", - "648 5 0.798406 ‘\n", - "649 2 0.208865 ’\n", - "649 4 0.208865 ’\n", - "649 5 0.522162 ’\n", - "203 2 0.959415 •\n", - "\n", - "[218 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[4, 3, 2, 1, 5])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ @@ -434,168 +312,19 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[(0,\n", - " [('rollingonthefloorlaugh', 0.088871546),\n", - " ('parent', 0.053244308),\n", - " ('good', 0.04075855),\n", - " ('make', 0.02399027),\n", - " ('morn', 0.018489508),\n", - " ('would', 0.017024687),\n", - " ('man', 0.01674058),\n", - " ('realli', 0.014869037),\n", - " ('love', 0.012299013),\n", - " ('life', 0.010928868)]),\n", - " (1,\n", - " [('pray', 0.080668695),\n", - " ('u', 0.07174005),\n", - " ('like', 0.065636344),\n", - " ('also', 0.049389277),\n", - " ('someon', 0.041917514),\n", - " ('one', 0.030192202),\n", - " ('prayer', 0.028453592),\n", - " ('know', 0.0248715),\n", - " ('think', 0.017885309),\n", - " ('bzw', 0.0164379)]),\n", - " (2,\n", - " [('hello', 0.046209835),\n", - " ('dont', 0.033363096),\n", - " ('want', 0.025301958),\n", - " ('peopl', 0.025089722),\n", - " ('last', 0.022465738),\n", - " ('miss', 0.021834275),\n", - " ('use', 0.020633506),\n", - " ('today', 0.018617414),\n", - " ('everyon', 0.013510507),\n", - " ('start', 0.0125754215)]),\n", - " (3,\n", - " [('need', 0.074772716),\n", - " ('job', 0.06344194),\n", - " ('grin', 0.017321585),\n", - " ('go', 0.017111976),\n", - " ('see', 0.017013228),\n", - " ('cant', 0.012809223),\n", - " ('graduat', 0.012070753),\n", - " ('done', 0.011215276),\n", - " ('sure', 0.011110627),\n", - " ('outsourc', 0.009723263)]),\n", - " (4,\n", - " [('told', 0.0705787),\n", - " ('get', 0.052049227),\n", - " ('immedi', 0.026630463),\n", - " ('•', 0.025180869),\n", - " ('cb', 0.02216999),\n", - " ('thank', 0.021754235),\n", - " ('time', 0.018611282),\n", - " ('guy', 0.01785792),\n", - " ('academi', 0.016819155),\n", - " ('plea', 0.014971133)])]\n" + "ename": "NameError", + "evalue": "name 'get_channel_messages' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mc:\\Users\\moyka\\Desktop\\Task-2\\notebooks\\Topic_Modelling.ipynb Cell 14\u001b[0m line \u001b[0;36m1\n\u001b[1;32m----> 1\u001b[0m df \u001b[39m=\u001b[39m get_channel_messages(\u001b[39m\"\u001b[39m\u001b[39mall-community-building\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", + "\u001b[1;31mNameError\u001b[0m: name 'get_channel_messages' is not defined" ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "PreparedData(topic_coordinates= x y topics cluster Freq\n", - "topic \n", - "4 -0.316967 -0.090289 1 1 27.181033\n", - "1 0.170308 -0.266332 2 1 19.650094\n", - "0 0.066181 0.098589 3 1 18.739438\n", - "2 0.034563 0.143814 4 1 18.571699\n", - "3 0.045915 0.114218 5 1 15.857736, topic_info= Term Freq Total Category logprob \\\n", - "77 rollingonthefloorlaugh 588.000000 588.000000 Default 30.0000 \n", - "759 pray 561.000000 561.000000 Default 29.0000 \n", - "1506 told 680.000000 680.000000 Default 28.0000 \n", - "371 u 498.000000 498.000000 Default 27.0000 \n", - "648 need 419.000000 419.000000 Default 26.0000 \n", - "... ... ... ... ... ... \n", - "2579 kg 41.549688 43.765998 Topic5 -4.9040 \n", - "3869 sell 29.744465 30.623335 Topic5 -5.2382 \n", - "5639 outsourc 54.470326 278.651722 Topic5 -4.6332 \n", - "5637 bzw 42.705441 269.061348 Topic5 -4.8766 \n", - "5638 palmsuptogeth 42.705436 269.061318 Topic5 -4.8766 \n", - "\n", - " loglift \n", - "77 30.0000 \n", - "759 29.0000 \n", - "1506 28.0000 \n", - "371 27.0000 \n", - "648 26.0000 \n", - "... ... \n", - "2579 1.7895 \n", - "3869 1.8124 \n", - "5639 0.2092 \n", - "5637 0.0009 \n", - "5638 0.0009 \n", - "\n", - "[203 rows x 6 columns], token_table= Topic Freq Term\n", - "term \n", - "305 1 0.999103 academi\n", - "189 4 0.992490 activ\n", - "3983 5 0.975318 alien\n", - "174 2 0.977639 alreadi\n", - "628 2 0.998343 also\n", - "... ... ... ...\n", - "440 2 0.986758 ye\n", - "429 5 0.987888 yeah\n", - "810 3 0.980327 your\n", - "906 1 0.991522 ’\n", - "39 1 0.998276 •\n", - "\n", - "[196 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[5, 2, 1, 3, 4])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ From 9119d824a311929c7b9bd9f4155a2fa9fbc3971d Mon Sep 17 00:00:00 2001 From: moykad Date: Fri, 1 Dec 2023 23:40:34 +0300 Subject: [PATCH 3/4] Data modeling --- notebooks/README.md | 38 ----- notebooks/Topic_Modelling.ipynb | 262 +++++--------------------------- src/loader.py | 2 +- week0_starter_network_analysis | 1 - 4 files changed, 40 insertions(+), 263 deletions(-) delete mode 100644 notebooks/README.md delete mode 160000 week0_starter_network_analysis diff --git a/notebooks/README.md b/notebooks/README.md deleted file mode 100644 index 87e239b..0000000 --- a/notebooks/README.md +++ /dev/null @@ -1,38 +0,0 @@ - -### Some of the functions available in the notebooks and codes in this repository - -#### Slack Data Parsing Functions -`slack_parser`: Parses Slack data to extract relevant information such as message type, content, sender details, thread information, etc. Combines data from multiple JSON files and returns a DataFrame. - -`parse_slack_reaction`: Retrieves reaction-related information from Slack data, including reaction name, count, users, associated message, and user ID. Returns a DataFrame. - -`convert_2_timestamp`: Converts Unix time to a readable timestamp for specified columns in the DataFrame. - -#### User Interaction and Community Analysis Functions -`get_tagged_users`: Extracts all user mentions (@) from messages. - -`get_community_participation`: Analyzes community participation by counting the number of replies for each user. - -`map_userid_2_realname`: Maps Slack IDs to real names using user profiles. Optionally, plots a bar graph of message counts for each user. - -`get_top_20_user`: Plots the top 20 message senders in a specified channel. - -`draw_avg_reply_count`: Plots the average number of reply counts per sender in a channel. - -`draw_avg_reply_users_count`: Plots the average number of reply user counts per sender in a channel. - -`draw_wordcloud`: Generates and displays a word cloud visualization for message content. - -`draw_user_reaction`: Plots users with the most reactions in a channel. - -#### Data Analysis and Visualization -`get_top_20_user(dfall_week, channel='All learning')`: Visualizes the top 20 message senders. - -`draw_avg_reply_count(dfall_week, channel='All Learning')`: Visualizes the average reply count per sender. - -`draw_avg_reply_users_count(dfall_week, channel='All learning')`: Visualizes the average reply user count per sender. - -`draw_wordcloud(dfall_week['msg_content'], week='All Learning Week')`: Displays a word cloud for message content. - -`draw_user_reaction`: Plots users with the most reactions. - diff --git a/notebooks/Topic_Modelling.ipynb b/notebooks/Topic_Modelling.ipynb index 08be075..413cb9f 100644 --- a/notebooks/Topic_Modelling.ipynb +++ b/notebooks/Topic_Modelling.ipynb @@ -2,8 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, - "metadata": {}, + "execution_count": 1, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import warnings\n", @@ -28,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -68,8 +70,10 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, + "execution_count": 3, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add parent directory to path to import modules from src\n", @@ -83,28 +87,16 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'C:/Users/moyka/OneDrive/Documents/GitHub/anonymized/channels.json'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32mc:\\Users\\moyka\\Desktop\\Task-2\\notebooks\\Topic_Modelling.ipynb Cell 4\u001b[0m line \u001b[0;36m1\n\u001b[1;32m----> 1\u001b[0m data_loader \u001b[39m=\u001b[39m SlackDataLoader(\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mC:/Users/moyka/OneDrive/Documents/GitHub/anonymized/\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 3\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_channel_messages\u001b[39m(channel):\n\u001b[0;32m 4\u001b[0m channel_messages \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mget_messages_on_channel(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mC:/Users/moyka/OneDrive/Documents/GitHub//\u001b[39m\u001b[39m{\u001b[39;00mchannel\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \n", - "File \u001b[1;32m~\\OneDrive\\Documents\\GitHub\\week0_starter_network_analysis\\src\\loader.py:36\u001b[0m, in \u001b[0;36mSlackDataLoader.__init__\u001b[1;34m(self, path)\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[39m\u001b[39m\u001b[39m'''\u001b[39;00m\n\u001b[0;32m 33\u001b[0m \u001b[39mpath: path to the slack exported data folder\u001b[39;00m\n\u001b[0;32m 34\u001b[0m \u001b[39m'''\u001b[39;00m\n\u001b[0;32m 35\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpath \u001b[39m=\u001b[39m path\n\u001b[1;32m---> 36\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mchannels \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mget_channels()\n\u001b[0;32m 37\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39musers \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mget_ussers()\n", - "File \u001b[1;32m~\\OneDrive\\Documents\\GitHub\\week0_starter_network_analysis\\src\\loader.py:53\u001b[0m, in \u001b[0;36mSlackDataLoader.get_channels\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 49\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_channels\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m 50\u001b[0m \u001b[39m \u001b[39m\u001b[39m'''\u001b[39;00m\n\u001b[0;32m 51\u001b[0m \u001b[39m write a function to get all the channels from the json file\u001b[39;00m\n\u001b[0;32m 52\u001b[0m \u001b[39m '''\u001b[39;00m\n\u001b[1;32m---> 53\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mjoin(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpath, \u001b[39m'\u001b[39m\u001b[39mchannels.json\u001b[39m\u001b[39m'\u001b[39m), \u001b[39m'\u001b[39m\u001b[39mr\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[0;32m 54\u001b[0m channels \u001b[39m=\u001b[39m json\u001b[39m.\u001b[39mload(f)\n\u001b[0;32m 56\u001b[0m \u001b[39mreturn\u001b[39;00m channels\n", - "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'C:/Users/moyka/OneDrive/Documents/GitHub/anonymized/channels.json'" - ] - } - ], + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "data_loader = SlackDataLoader(r\"C:/Users/moyka/OneDrive/Documents/GitHub/anonymized/\")\n", + "data_loader = SlackDataLoader(r\"C:/Users/moyka/Desktop/Dashboard/anonymized/\")\n", "\n", "def get_channel_messages(channel):\n", - " channel_messages = utils.get_messages_on_channel(f\"C:/Users/moyka/OneDrive/Documents/GitHub//{channel}\") \n", + " channel_messages = utils.get_messages_on_channel(f\"C:/Users/moyka/Desktop/Dashboard/anonymized/{channel}\") \n", " # Create an empty DataFrame\n", " df = pd.DataFrame(channel_messages)\n", " return df\n", @@ -123,8 +115,10 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, + "execution_count": 5, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def preprocess_text(text):\n", @@ -167,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -186,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -206,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -217,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -234,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -261,18 +255,19 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'get_channel_messages' is not defined", + "ename": "AttributeError", + "evalue": "module 'src.utils' has no attribute 'get_messages_on_channel'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32mc:\\Users\\moyka\\Desktop\\Task-2\\notebooks\\Topic_Modelling.ipynb Cell 12\u001b[0m line \u001b[0;36m1\n\u001b[1;32m----> 1\u001b[0m df \u001b[39m=\u001b[39m get_channel_messages(\u001b[39m\"\u001b[39m\u001b[39mall-week1\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", - "\u001b[1;31mNameError\u001b[0m: name 'get_channel_messages' is not defined" + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[11], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m get_channel_messages(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall-week1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", + "Cell \u001b[1;32mIn[4], line 4\u001b[0m, in \u001b[0;36mget_channel_messages\u001b[1;34m(channel)\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_channel_messages\u001b[39m(channel):\n\u001b[1;32m----> 4\u001b[0m channel_messages \u001b[38;5;241m=\u001b[39m utils\u001b[38;5;241m.\u001b[39mget_messages_on_channel(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC:/Users/moyka/Desktop/Dashboard/anonymized/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mchannel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# Create an empty DataFrame\u001b[39;00m\n\u001b[0;32m 6\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(channel_messages)\n", + "\u001b[1;31mAttributeError\u001b[0m: module 'src.utils' has no attribute 'get_messages_on_channel'" ] } ], @@ -283,28 +278,9 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package wordnet to\n", - "[nltk_data] C:\\Users\\Eshetu\\AppData\\Roaming\\nltk_data...\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import nltk\n", "nltk.download('wordnet')" @@ -312,21 +288,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'get_channel_messages' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32mc:\\Users\\moyka\\Desktop\\Task-2\\notebooks\\Topic_Modelling.ipynb Cell 14\u001b[0m line \u001b[0;36m1\n\u001b[1;32m----> 1\u001b[0m df \u001b[39m=\u001b[39m get_channel_messages(\u001b[39m\"\u001b[39m\u001b[39mall-community-building\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", - "\u001b[1;31mNameError\u001b[0m: name 'get_channel_messages' is not defined" - ] - } - ], + "outputs": [], "source": [ "df = get_channel_messages(\"all-community-building\")\n", "get_top_topics(df)" @@ -341,157 +305,9 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[(0,\n", - " [('work', 0.07804269),\n", - " ('instal', 0.039492182),\n", - " ('get', 0.03234122),\n", - " ('guy', 0.026092794),\n", - " ('face', 0.02030699),\n", - " ('return', 0.019842522),\n", - " ('ok', 0.015252527),\n", - " ('okay', 0.014948669),\n", - " ('pip', 0.01389798),\n", - " ('read', 0.013267612)]),\n", - " (1,\n", - " [('file', 0.0549693),\n", - " ('line', 0.034151033),\n", - " ('meet', 0.02210031),\n", - " ('extract', 0.020621968),\n", - " ('one', 0.020602355),\n", - " ('good', 0.017431682),\n", - " ('task', 0.017173653),\n", - " ('imag', 0.015683454),\n", - " ('logo', 0.01498308),\n", - " ('need', 0.014000982)]),\n", - " (2,\n", - " [('channel', 0.18221219),\n", - " ('join', 0.18198967),\n", - " ('connect', 0.018376695),\n", - " ('plea', 0.018241962),\n", - " ('time', 0.01755661),\n", - " ('start', 0.016417718),\n", - " ('•', 0.014001578),\n", - " ('today', 0.012244016),\n", - " ('code', 0.012109588),\n", - " ('link', 0.011688723)]),\n", - " (3,\n", - " [('thank', 0.041847672),\n", - " ('think', 0.03968491),\n", - " ('data', 0.03801513),\n", - " ('sure', 0.021638291),\n", - " ('creat', 0.019395646),\n", - " ('right', 0.019231077),\n", - " ('present', 0.01881592),\n", - " ('dont', 0.018710904),\n", - " ('model', 0.016673988),\n", - " ('project', 0.016466739)]),\n", - " (4,\n", - " [('use', 0.065344),\n", - " ('tri', 0.05039213),\n", - " ('error', 0.028861312),\n", - " ('ye', 0.028690428),\n", - " ('instanc', 0.02610646),\n", - " ('let', 0.021604491),\n", - " ('u', 0.019047005),\n", - " ('group', 0.017978994),\n", - " ('im', 0.01785427),\n", - " ('make', 0.017489491)])]\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "PreparedData(topic_coordinates= x y topics cluster Freq\n", - "topic \n", - "2 0.371924 -0.020110 1 1 25.993380\n", - "4 -0.090644 0.267852 2 1 20.377566\n", - "1 -0.119150 -0.263536 3 1 19.558450\n", - "3 -0.070161 0.083032 4 1 18.391670\n", - "0 -0.091969 -0.067238 5 1 15.678934, topic_info= Term Freq Total Category logprob loglift\n", - "0 channel 4785.000000 4785.000000 Default 30.0000 30.0000\n", - "1 join 4780.000000 4780.000000 Default 29.0000 29.0000\n", - "203 work 1237.000000 1237.000000 Default 28.0000 28.0000\n", - "24 use 1346.000000 1346.000000 Default 27.0000 27.0000\n", - "5632 file 1087.000000 1087.000000 Default 26.0000 26.0000\n", - "... ... ... ... ... ... ...\n", - "5079 fix 93.573706 94.353067 Topic5 -5.1316 1.8446\n", - "2138 frame 184.716212 186.261848 Topic5 -4.4515 1.8445\n", - "1393 recent 98.995630 100.062156 Topic5 -5.0753 1.8421\n", - "7526 conda 125.739371 131.182746 Topic5 -4.8361 1.8105\n", - "8245 sudo 123.883955 154.399777 Topic5 -4.8510 1.6327\n", - "\n", - "[189 rows x 6 columns], token_table= Topic Freq Term\n", - "term \n", - "351 4 0.993177 abl\n", - "377 2 0.997747 access\n", - "2050 3 0.996749 ad\n", - "3254 4 0.991437 airflow\n", - "174 5 0.997243 alreadi\n", - "... ... ... ...\n", - "429 4 0.994975 yeah\n", - "1131 2 0.994165 yesterday\n", - "810 5 0.995969 your\n", - "906 5 0.994548 ’\n", - "39 1 0.998721 •\n", - "\n", - "[181 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[3, 5, 2, 4, 1])" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df = get_all_channels_message()\n", "get_top_topics(df)" @@ -525,5 +341,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/loader.py b/src/loader.py index c75b68d..5faa2f7 100644 --- a/src/loader.py +++ b/src/loader.py @@ -34,7 +34,7 @@ def __init__(self, path): ''' self.path = path self.channels = self.get_channels() - self.users = self.get_ussers() + self.users = self.get_users() def get_users(self): diff --git a/week0_starter_network_analysis b/week0_starter_network_analysis deleted file mode 160000 index b6d012a..0000000 --- a/week0_starter_network_analysis +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b6d012a6f49fea4acd239a5ea7f09c8fe183e4c8 From 199f5dc0b8eaa35fa15b4e0a3140350a4452c4ad Mon Sep 17 00:00:00 2001 From: moykad Date: Sat, 2 Dec 2023 00:11:46 +0300 Subject: [PATCH 4/4] Data modeling-1 --- notebooks/Topic_Modelling.ipynb | 119 +++++++++++++++++++------------- src/utils.py | 22 ++++++ 2 files changed, 93 insertions(+), 48 deletions(-) diff --git a/notebooks/Topic_Modelling.ipynb b/notebooks/Topic_Modelling.ipynb index 413cb9f..fd66371 100644 --- a/notebooks/Topic_Modelling.ipynb +++ b/notebooks/Topic_Modelling.ipynb @@ -30,47 +30,16 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Defaulting to user installation because normal site-packages is not writeable\n", - "Requirement already satisfied: pyLDAvis in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (3.4.1)\n", - "Requirement already satisfied: numpy>=1.24.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (1.24.3)\n", - "Requirement already satisfied: scipy in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (1.11.1)\n", - "Requirement already satisfied: pandas>=2.0.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (2.0.3)\n", - "Requirement already satisfied: joblib>=1.2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (1.2.0)\n", - "Requirement already satisfied: jinja2 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (3.1.2)\n", - "Requirement already satisfied: numexpr in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (2.8.4)\n", - "Requirement already satisfied: funcy in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from pyLDAvis) (2.0)\n", - "Requirement already satisfied: scikit-learn>=1.0.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (1.3.0)\n", - "Requirement already satisfied: gensim in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (4.3.0)\n", - "Requirement already satisfied: setuptools in c:\\programdata\\anaconda3\\lib\\site-packages (from pyLDAvis) (68.0.0)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=2.0.0->pyLDAvis) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=2.0.0->pyLDAvis) (2023.3.post1)\n", - "Requirement already satisfied: tzdata>=2022.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=2.0.0->pyLDAvis) (2023.3)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from scikit-learn>=1.0.0->pyLDAvis) (2.2.0)\n", - "Requirement already satisfied: smart-open>=1.8.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from gensim->pyLDAvis) (5.2.1)\n", - "Requirement already satisfied: FuzzyTM>=0.4.0 in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from gensim->pyLDAvis) (2.0.5)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from jinja2->pyLDAvis) (2.1.1)\n", - "Requirement already satisfied: pyfume in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from FuzzyTM>=0.4.0->gensim->pyLDAvis) (0.2.25)\n", - "Requirement already satisfied: six>=1.5 in c:\\programdata\\anaconda3\\lib\\site-packages (from python-dateutil>=2.8.2->pandas>=2.0.0->pyLDAvis) (1.16.0)\n", - "Requirement already satisfied: simpful in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis) (2.11.1)\n", - "Requirement already satisfied: fst-pso in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis) (1.8.1)\n", - "Requirement already satisfied: miniful in c:\\users\\moyka\\appdata\\roaming\\python\\python311\\site-packages (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis) (0.0.6)\n" - ] - } - ], + "outputs": [], "source": [ "!pip install pyLDAvis" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "tags": [] }, @@ -87,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "tags": [] }, @@ -115,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "tags": [] }, @@ -161,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -200,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -211,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -228,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -266,7 +235,7 @@ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[11], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m get_channel_messages(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall-week1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", - "Cell \u001b[1;32mIn[4], line 4\u001b[0m, in \u001b[0;36mget_channel_messages\u001b[1;34m(channel)\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_channel_messages\u001b[39m(channel):\n\u001b[1;32m----> 4\u001b[0m channel_messages \u001b[38;5;241m=\u001b[39m utils\u001b[38;5;241m.\u001b[39mget_messages_on_channel(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC:/Users/moyka/Desktop/Dashboard/anonymized/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mchannel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# Create an empty DataFrame\u001b[39;00m\n\u001b[0;32m 6\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(channel_messages)\n", + "Cell \u001b[1;32mIn[3], line 4\u001b[0m, in \u001b[0;36mget_channel_messages\u001b[1;34m(channel)\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_channel_messages\u001b[39m(channel):\n\u001b[1;32m----> 4\u001b[0m channel_messages \u001b[38;5;241m=\u001b[39m utils\u001b[38;5;241m.\u001b[39mget_messages_on_channel(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC:/Users/moyka/Desktop/Dashboard/anonymized/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mchannel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# Create an empty DataFrame\u001b[39;00m\n\u001b[0;32m 6\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(channel_messages)\n", "\u001b[1;31mAttributeError\u001b[0m: module 'src.utils' has no attribute 'get_messages_on_channel'" ] } @@ -278,9 +247,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] C:\\Users\\moyka\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import nltk\n", "nltk.download('wordnet')" @@ -288,9 +277,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'src.utils' has no attribute 'get_messages_on_channel'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[13], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m get_channel_messages(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall-community-building\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", + "Cell \u001b[1;32mIn[3], line 4\u001b[0m, in \u001b[0;36mget_channel_messages\u001b[1;34m(channel)\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_channel_messages\u001b[39m(channel):\n\u001b[1;32m----> 4\u001b[0m channel_messages \u001b[38;5;241m=\u001b[39m utils\u001b[38;5;241m.\u001b[39mget_messages_on_channel(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC:/Users/moyka/Desktop/Dashboard/anonymized/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mchannel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# Create an empty DataFrame\u001b[39;00m\n\u001b[0;32m 6\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(channel_messages)\n", + "\u001b[1;31mAttributeError\u001b[0m: module 'src.utils' has no attribute 'get_messages_on_channel'" + ] + } + ], "source": [ "df = get_channel_messages(\"all-community-building\")\n", "get_top_topics(df)" @@ -305,14 +307,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'src.utils' has no attribute 'get_messages_on_channel'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[14], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m get_all_channels_message()\n\u001b[0;32m 2\u001b[0m get_top_topics(df)\n", + "Cell \u001b[1;32mIn[3], line 13\u001b[0m, in \u001b[0;36mget_all_channels_message\u001b[1;34m()\u001b[0m\n\u001b[0;32m 10\u001b[0m dfs \u001b[38;5;241m=\u001b[39m [] \u001b[38;5;66;03m# List to store individual DataFrames\u001b[39;00m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m channel \u001b[38;5;129;01min\u001b[39;00m data_loader\u001b[38;5;241m.\u001b[39mchannels:\n\u001b[1;32m---> 13\u001b[0m dfs\u001b[38;5;241m.\u001b[39mappend(get_channel_messages(channel[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m]))\n\u001b[0;32m 15\u001b[0m \u001b[38;5;66;03m# Concatenate all DataFrames into a single DataFrame\u001b[39;00m\n\u001b[0;32m 16\u001b[0m result_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat(dfs, ignore_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "Cell \u001b[1;32mIn[3], line 4\u001b[0m, in \u001b[0;36mget_channel_messages\u001b[1;34m(channel)\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_channel_messages\u001b[39m(channel):\n\u001b[1;32m----> 4\u001b[0m channel_messages \u001b[38;5;241m=\u001b[39m utils\u001b[38;5;241m.\u001b[39mget_messages_on_channel(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mC:/Users/moyka/Desktop/Dashboard/anonymized/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mchannel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# Create an empty DataFrame\u001b[39;00m\n\u001b[0;32m 6\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(channel_messages)\n", + "\u001b[1;31mAttributeError\u001b[0m: module 'src.utils' has no attribute 'get_messages_on_channel'" + ] + } + ], "source": [ "df = get_all_channels_message()\n", "get_top_topics(df)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, diff --git a/src/utils.py b/src/utils.py index 45dda22..7f98cac 100644 --- a/src/utils.py +++ b/src/utils.py @@ -180,3 +180,25 @@ def convert_2_timestamp(column, data): timestamp_.append(a.strftime('%Y-%m-%d %H:%M:%S')) return timestamp_ else: print(f"{column} not in data") +def get_messages_on_channel(channel_path): + + json_files = [ + f"{channel_path}/{pos_json}" + for pos_json in os.listdir(channel_path) + if pos_json.endswith('.json') + ] + combined = [] + + for json_file in json_files: + with open(json_file, 'r', encoding="utf8") as slack_data: + json_content = json.load(slack_data) + combined.extend(json_content) + + messages = [] + + for msg in combined: + msg_list, _ = process_message(msg) + messages.append(msg_list) + + + return messages \ No newline at end of file