diff --git a/.gitignore b/.gitignore index 8cd10b4..152d003 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,7 @@ conf/**/*credentials* # ignore everything in the following folders data/** logs/** - +anonymized/** # except their sub-folders !data/**/ !logs/**/ diff --git a/Text.txt.txt b/Text.txt.txt new file mode 100644 index 0000000..150fce7 --- /dev/null +++ b/Text.txt.txt @@ -0,0 +1,10 @@ +My first text +<<<<<<< HEAD +Second Text +======= +Before submission +30 minutes before submission +>>>>>>> 8ab44a9c7bb8cdbc61971c3fa6ef7584546bbe2d +First day training +no one +go \ No newline at end of file diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..f80e15c --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,92 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1d795814-731b-4f8b-a4ec-82668183d083", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#Loading necessary packages\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import gensim\n", + "from gensim.models import CoherenceModel\n", + "from gensim import corpora\n", + "import pandas as pd\n", + "from pprint import pprint\n", + "import string\n", + "import os\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2f935b8-9bf5-4715-9fcd-1c5907ea8848", + "metadata": {}, + "outputs": [], + "source": [ + "#data loader class\n", + "class DataLoader:\n", + " def __init__(self,dir_name,file_name):\n", + " self.dir_name=dir_name\n", + " self.file_name = file_name\n", + "\n", + "\n", + " def read_csv(self):\n", + " os.chdir(self.dir_name)\n", + " tweets_df=pd.read_csv(self.file_name)\n", + " return tweets_df\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d5d2b3c-d207-4181-9843-260120eb29ea", + "metadata": {}, + "outputs": [], + "source": [ + "#object creation\n", + "DataLoader_obj= DataLoader('drive/MyDrive','cleaned_fintech_data.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a81c803-f209-49cc-88bc-70c57b2abd5e", + "metadata": {}, + "outputs": [], + "source": [ + "tweets_df=DataLoader_obj.read_csv()\n", + "tweets_df.dropna()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/parse_slack_data.ipynb b/notebooks/parse_slack_data.ipynb index e3774f8..1caab9e 100644 --- a/notebooks/parse_slack_data.ipynb +++ b/notebooks/parse_slack_data.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "%reload_ext autoreload\n", @@ -13,7 +15,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import os, sys\n", @@ -28,17 +32,36 @@ "import seaborn as sns\n", "\n", "from nltk.corpus import stopwords\n", - "from wordcloud import WordCloud" + "from wordcloud import WordCloud #installing wordcloud library\n", + "\n", + "import nltk\n", + "nltk.download('punkt')\n", + "from nltk.corpus import stopwords\n", + "from wordcloud import WordCloud\n", + "nltk.download('stopwords')" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install wordcloud" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add parent directory to path to import modules from src\n", - "rpath = os.path.abspath('..')\n", + "rpath = os.path.abspath(r'C:\\Users\\moyka\\OneDrive\\Documents\\GitHub\\week0_starter_network_analysis')\n", "if rpath not in sys.path:\n", " sys.path.insert(0, rpath)\n", "\n", @@ -98,7 +121,8 @@ " combined = []\n", " for json_file in glob.glob(f\"{path_channel}*.json\"):\n", " with open(json_file, 'r', encoding=\"utf8\") as slack_data:\n", - " combined.append(slack_data)\n", + " file_content = json.load(slack_data)\n", + " combined.append(file_content)\n", "\n", " # loop through all json files and extract required informations\n", " dflist = []\n", @@ -195,6 +219,43 @@ " return comm_dict" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = slack_parser(r'C:/Users/moyka/OneDrive/Documents/GitHub/allweeks89/')\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "PSR = parse_slack_reaction(r'C:/Users/moyka/OneDrive/Documents/GitHub/allweeks89/', 'allweeks89')\n", + "PSR[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "result_data = parse_slack_reaction(r'C:/Users/moyka/OneDrive/Documents/GitHub/allweeks89/', 'allweeks89')\n", + "df = pd.DataFrame(result_data)\n", + "df.to_csv(r'C:\\Users\\moyka\\Desktop\\New\\file2.csv')" + ] + }, { "cell_type": "code", "execution_count": null, @@ -259,6 +320,30 @@ " return ac_comm_dict" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "converted_time = convert_2_timestamp('msg_sent_time', df)\n", + "converted_time[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "get_tagged_use = get_tagged_users(df)\n", + "get_tagged_use[:5]" + ] + }, { "cell_type": "code", "execution_count": null, @@ -321,6 +406,19 @@ " plt.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pfill = slack_parser(r'C:/Users/moyka/OneDrive/Documents/GitHub/allweeks89/')\n", + "pfill.info()\n", + "\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -336,7 +434,10 @@ "metadata": {}, "outputs": [], "source": [ - "# which user has the highest number of reply counts?" + "# which user has the highest number of reply counts?\n", + "#draw_avg_reply_users_count(pfill, channel='YourChannelName')\n", + "user_with_highest_replies = pfill.groupby('sender_name')['reply_count'].sum().idxmax()\n", + "print(f\"The user with the highest number of reply counts is: {user_with_highest_replies}\")" ] }, { @@ -345,7 +446,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Visualize reply counts per user per channel" + "# Visualize reply counts per user per channel\n", + "draw_avg_reply_users_count(pfill,'allweeks89')" ] }, { @@ -354,52 +456,117 @@ "metadata": {}, "outputs": [], "source": [ - "# what is the time range of the day that most messages are sent?\n" + "# what is the time range of the day that most messages are sent?\n", + "pfill['msg_sent_timestamp'] = utils.convert_2_timestamp('msg_sent_time', pfill)\n", + "pfill['hour'] = pd.to_datetime(pfill['msg_sent_timestamp']).dt.hour\n", + "\n", + "# Plot the histogram\n", + "sns.set(style=\"whitegrid\")\n", + "plt.figure(figsize=(12, 8))\n", + "sns.histplot(pfill['hour'], bins=24, kde=False)\n", + "plt.yscale('log')\n", + "plt.title('Message Counts by Hour of the Day')\n", + "plt.xlabel('Hour of the Day')\n", + "plt.ylabel('Message Count')\n", + "\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# what kind of messages are replied faster than others?" + "# what kind of messages are replied faster than others?\n", + "# Convert msg_sent_time and tm_thread_end to timestamps\n", + "pfill['msg_sent_timestamp'] = utils.convert_2_timestamp('msg_sent_time', pfill)\n", + "pfill['tm_thread_end_timestamp'] = utils.convert_2_timestamp('tm_thread_end', pfill)\n", + "\n", + "# Calculate the time it takes to receive a reply\n", + "pfill['reply_time'] = (pd.to_datetime(pfill['tm_thread_end_timestamp']) - pd.to_datetime(pfill['msg_sent_timestamp'])).dt.total_seconds()\n", + "\n", + "# Plotting\n", + "sns.set(style=\"whitegrid\")\n", + "plt.figure(figsize=(8, 6))\n", + "sns.boxplot(x='msg_dist_type', y='reply_time', data=pfill)\n", + "plt.yscale('log')\n", + "plt.title('Time to Receive Reply for Different Message Types')\n", + "plt.xlabel('Message Distribution Type')\n", + "plt.ylabel('Reply Time (seconds)')\n", + "plt.xticks(rotation=45, ha='right')\n", + "\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Relationship between # of messages and # of reactions" + "# Relationship between # of messages and # of reactions\n", + "grouped_data = pfill.groupby(['sender_name', 'channel'])['reply_count'].sum().reset_index()\n", + "\n", + "# Plotting\n", + "plt.figure(figsize=(12, 8))\n", + "sns.barplot(x='sender_name', y='reply_count', hue='channel', data=grouped_data)\n", + "plt.title('Reply Counts per User per Channel')\n", + "plt.xlabel('User')\n", + "plt.ylabel('Total Reply Counts')\n", + "plt.xticks(rotation=45, ha='right')\n", + "plt.legend(title='Channel', loc='upper right')\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Classify messages into different categories such as questions, answers, comments, etc." + "# Classify messages into different categories such as questions, answers, comments, etc.\n", + "# Get tagged users in the messages\n", + "pfill['tagged_users'] = get_tagged_users(pfill)\n", + "\n", + "# Create new columns for message categories\n", + "pfill['category'] = 'Other' # Default category\n", + "\n", + "# Identify and classify messages based on certain keywords or patterns\n", + "pfill.loc[pfill['msg_content'].str.contains('(?:^|\\W)\\\\?+'), 'category'] = 'Question'\n", + "pfill.loc[pfill['msg_content'].str.contains('(?:^|\\W)answer(?:$|\\W)', case=False), 'category'] = 'Answer'\n", + "pfill.loc[pfill['msg_content'].str.contains('(?:^|\\W)comment(?:$|\\W)', case=False), 'category'] = 'Comment'\n", + "\n", + "# Display the distribution of message categories\n", + "category_counts = pfill['category'].value_counts()\n", + "print(category_counts)\n", + "\n", + "# Plot the distribution\n", + "category_counts.plot(kind='bar', figsize=(10, 6))\n", + "plt.title('Distribution of Message Categories')\n", + "plt.xlabel('Message Category')\n", + "plt.ylabel('Count')\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Which users got the most reactions?" + "# Which users got the most reactions?\n", + "draw_user_reaction(pfill, channel='Random')" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Model topics mentioned in the channel" + "# Model topics mentioned in the channel\n", + "draw_wordcloud(pfill['msg_content'], week='Random')" ] }, { @@ -408,7 +575,17 @@ "metadata": {}, "outputs": [], "source": [ - "# What are the topics that got the most reactions?" + "# What are the topics that got the most reactions?\n", + "\n", + "\n", + "# Parse Slack reactions for the 'allweeks89' channel\n", + "reaction_data = parse_slack_reaction(r'C:/Users/moyka/OneDrive/Documents/GitHub/allweeks89/', 'allweeks89')\n", + "\n", + "# Get the topics with the most reactions\n", + "top_reacted_topics = reaction_data.groupby('message')['reaction_count'].sum().sort_values(ascending=False)\n", + "\n", + "# Display the top topics with the most reactions\n", + "print(top_reacted_topics.head(10))" ] }, { @@ -430,7 +607,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -444,7 +621,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.11.5" } }, "nbformat": 4,