From 13b7940b1c0003ffb8b37e52b5ddec343540dfc4 Mon Sep 17 00:00:00 2001 From: haotianzhang Date: Tue, 1 Apr 2025 01:56:14 +0800 Subject: [PATCH 1/5] Added missing sentiment classifier, fixes #1 --- ...Development_Sentiment_Classification.ipynb | 2174 +++++++++++++++++ 1 file changed, 2174 insertions(+) create mode 100644 Software_Development_Sentiment_Classification.ipynb diff --git a/Software_Development_Sentiment_Classification.ipynb b/Software_Development_Sentiment_Classification.ipynb new file mode 100644 index 0000000..ff12ae7 --- /dev/null +++ b/Software_Development_Sentiment_Classification.ipynb @@ -0,0 +1,2174 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oe8X-6s9btXo" + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import re\n", + "import string\n", + "import random\n", + "import warnings\n", + "import argparse\n", + "import numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "import time\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset\n", + "from io import StringIO\n", + "from unicodedata import category\n", + "from bs4 import BeautifulSoup\n", + "from markdown import markdown\n", + "from google.colab import drive\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report\n", + "from transformers import (\n", + " BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n", + " XLNetTokenizer, XLNetForSequenceClassification,\n", + " RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n", + " AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n", + " get_scheduler, AdamW\n", + ")\n", + "\n", + "\n", + "drive.mount('/content/drive')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RAXtSnSK4LPr" + }, + "source": [ + "# Tokenized" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9gOgONc04PiO" + }, + "outputs": [], + "source": [ + "# Regular expression for GitHub username mentions\n", + "USERNAME_REGEX = r\"(\\s|^)@(\\S*\\s?)\"\n", + "\n", + "# Generate Unicode punctuation set\n", + "punctuation = {chr(i) for i in range(sys.maxunicode + 1) if category(chr(i)).startswith((\"P\", \"S\"))}\n", + "\n", + "# Dictionary to count token replacements\n", + "counters = {}\n", + "\n", + "def remove_punctuation(text):\n", + " \"\"\"Remove all punctuation characters from the given text.\"\"\"\n", + " return \"\".join(char for char in text if char not in punctuation)\n", + "\n", + "def clean_text(text):\n", + " \"\"\"Remove quoted text and large code blocks from GitHub issues or comments.\"\"\"\n", + " # Remove quoted text from emails/notifications\n", + " text = re.sub(r\"^(On[\\s\\S]*?notifications@github\\.com\\s*?wrote:\\s*?)?(^(\\>).*\\s)*\", '', text, flags=re.MULTILINE)\n", + "\n", + " # Remove code blocks enclosed in triple backticks\n", + " text = re.sub(r\"```[a-z]*\\n[\\s\\S]*?\\n```\", \"\", text)\n", + "\n", + " return text\n", + "\n", + "def replace_token(regex, token_name, text):\n", + " \"\"\"\n", + " Replace matched patterns in the text with the specified token.\n", + "\n", + " Args:\n", + " regex (str): The regular expression pattern to match.\n", + " token_name (str): The replacement token
name.\n", + " text (str): The input text.\n", + "\n", + " Returns:\n", + " tuple: (processed_text, number_of_replacements)\n", + " \"\"\"\n", + " replaced_text, replacements = re.subn(regex, f\" {token_name} \", text, flags=re.MULTILINE)\n", + " counters[token_name] = counters.get(token_name, 0) + replacements\n", + " return replaced_text, replacements\n", + "\n", + "def tokenize_text(text):\n", + " \"\"\"\n", + " Tokenizes a given text by replacing specific elements such as emails, mentions, URLs, etc.\n", + "\n", + " Args:\n", + " text (str): The input text.\n", + "\n", + " Returns:\n", + " tuple: (processed_text, total_replacements)\n", + " \"\"\"\n", + " total_replacements = 0\n", + "\n", + " text, replacements = replace_token(r\"\\S+@\\S*\\s?\", \"MEMAIL\", text)\n", + " total_replacements += replacements\n", + "\n", + " text, replacements = replace_token(USERNAME_REGEX, \"MMENTION\", text)\n", + " total_replacements += replacements\n", + "\n", + " text, replacements = replace_token(r\"`([^`]*)`\", \"MICODE\", text)\n", + " total_replacements += replacements\n", + "\n", + " text, replacements = replace_token(r\"\\b\\d+\\.\\d+(\\.\\d+)*\\b\", \"MVERSIONNUMBER\", text)\n", + " total_replacements += replacements\n", + "\n", + " text, replacements = replace_token(r\"(\\s|^)#\\d+\", \"MISSUEMENTION\", text)\n", + " total_replacements += replacements\n", + "\n", + " text, replacements = replace_token(\n", + " r\"([a-zA-Z0-9]+):\\/\\/([\\w_-]+(?:\\.[\\w_-]+)*)[\\w.,@?^=%&:\\/~+#-]*[\\w@?^=%&\\/~+#-]\",\n", + " \"MURL\",\n", + " text,\n", + " )\n", + " total_replacements += replacements\n", + "\n", + " return text, total_replacements\n", + "\n", + "def remove_markdown_content(text):\n", + " \"\"\"\n", + " Converts Markdown content to plain text by removing all Markdown formatting.\n", + "\n", + " Args:\n", + " text (str): The input Markdown text.\n", + "\n", + " Returns:\n", + " str: Cleaned text without Markdown formatting.\n", + " \"\"\"\n", + " html = markdown(text)\n", + " return \"\".join(BeautifulSoup(html, \"lxml\").findAll(text=True))\n", + "\n", + "def transform_text(row):\n", + " \"\"\"\n", + " Transforms a row by cleaning and tokenizing its text content.\n", + "\n", + " Args:\n", + " row (dict): A dictionary containing a 'Text' key.\n", + "\n", + " Returns:\n", + " tuple: (processed text, number of replacements)\n", + " \"\"\"\n", + " text = row.get(\"Text\", \"\")\n", + "\n", + " if not isinstance(text, str):\n", + " warnings.warn(f\"Converting non-string type to string: {type(text)}\")\n", + " text = str(text)\n", + "\n", + " text, replaced_count = tokenize_text(text)\n", + " text = text.replace(\"\\n\", \"\")\n", + " return text, replaced_count" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uCPyaXMt4ao_" + }, + "source": [ + "##Usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8RhSXEyO4Tnu", + "outputId": "7988bf5a-6c84-46af-f1fa-273b6b31baeb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv\n", + " Text Polarity\n", + "0 In this situation, when I click on the greyed ... 0\n", + "1 After that a download progress status with pro... 0\n", + "2 Change the double quotationCODE_FRAGMENT to to... 0\n", + "3 E.g. I get an array of CODE_FRAGMENT objects. 0\n", + "4 Then I tried my own implementation with CODE_F... 
0\n", + "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv: 450\n", + "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset_tokenized.csv\n", + "\n", + "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv\n", + " Text Polarity\n", + "0 Guess there is a typo here: `translate`.\" 0\n", + "1 @arturoc: If you multiply-include `gst.h`, wou... 0\n", + "2 Thank you Vlad. Your contribution to Mangos an... 1\n", + "3 @opdenkamp Hi Lars, I'm afraid that you forgot... 2\n", + "4 Ok, so let's be paranoid.\" 2\n", + "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv: 2136\n", + "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset_tokenized.csv\n", + "\n", + "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv\n", + " Text Platform Polarity\n", + "0 [error] testWorkflowTimeoutWhenWorkflowComplet... 0 2\n", + "1 Thanks Hunter. Sure. 0 1\n", + "2 Approved by I will merge it soon. 0 1\n", + "3 It was reported and verified that the current ... 0 2\n", + "4 Thanks for the comments. I am figuring out to ... 0 1\n", + "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv: 3227\n", + "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset_tokenized.csv\n", + "\n" + ] + } + ], + "source": [ + "\n", + "\n", + "# Define input dataset paths\n", + "input_paths = [\n", + " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv\",\n", + " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv\",\n", + " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv\"\n", + "]\n", + "\n", + "# Define the text transformation function (ensure transform_text is correctly implemented)\n", + "def transform_text(row):\n", + " # Modify this function according to your needs\n", + " # Example: return the original text and a dummy replacement count\n", + " return row[\"Text\"], 1\n", + "\n", + "# Loop through each dataset and process it\n", + "for input_path in input_paths:\n", + " # Generate output file name\n", + " output_filename = os.path.splitext(os.path.basename(input_path))[0] + \"_tokenized.csv\"\n", + " output_path = os.path.join(os.path.dirname(input_path), output_filename)\n", + "\n", + " # Load dataset\n", + " df = pd.read_csv(input_path)\n", + " print(f\"Processing dataset: {input_path}\")\n", + " print(df.head()) # Print first few rows for verification\n", + "\n", + " # Apply text transformation\n", + " df[[\"Text\", \"replaced_token\"]] = df.apply(transform_text, axis=1, result_type=\"expand\")\n", + "\n", + " # Calculate total replacements from `replaced_token` column\n", + " total_replacements = df[\"replaced_token\"].sum()\n", + "\n", + " # Save processed dataset\n", + " df.to_csv(output_path, header=True, index=False)\n", + "\n", + " print(f\"Tokenized dataset saved to: {output_path}\\n\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o3Cm2Gn_CbM5" + }, + "source": [ + "# Dataset Overview: `crossplatform_sf_dataset.csv`\n", + "\n", + "This dataset is designed for **Software Development Sentiment Classification**, 
containing user comments or discussions from different platforms with sentiment labels.\n", + "\n", + "## **Column Descriptions**\n", + "- **`Text`**: The user comment or discussion content. \n", + "- **`Polarity`**: Sentiment label indicating the emotional tendency of the text: \n", + " - `2`: Negative sentiment \n", + " - `0`: Neutral sentiment \n", + " - `1`: Positive sentiment \n", + "- **`Platform`**: The source platform of the data, indicating where the comment or discussion originated: \n", + " - `0`: **GitHub** (Discussions related to open-source projects, Issues, Pull Requests) \n", + " - `1`: **Jira** (Bug reports, task comments in software development management tools) \n", + " - `2`: **Mailbox** (Developer communication through emails) \n", + "\n", + "## **Dataset Distribution**\n", + "The dataset consists of data from **GitHub, Jira, and Mailbox**, with different sentiment (`Polarity`) distributions across platforms. It can be used to train and evaluate sentiment classification models to analyze developer emotions on different platforms. \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QaHSklrRClHU", + "outputId": "b3b3c0d8-b59e-41ab-ba80-5569d1d3650e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================================================\n", + "📊 Dataset Information: cf-dataset.csv\n", + "==================================================\n", + "Total Samples: 3227\n", + "\n", + "📌 Polarity Distribution:\n", + "+---+----------+-------+\n", + "| | Polarity | Count |\n", + "+---+----------+-------+\n", + "| 0 | 0 | 1125 |\n", + "| 1 | 1 | 1042 |\n", + "| 2 | 2 | 1060 |\n", + "+---+----------+-------+\n", + "\n", + "\n", + "📌 Platform Distribution:\n", + "+---+----------+-------+\n", + "| | Platform | Count |\n", + "+---+----------+-------+\n", + "| 0 | 0 | 1079 |\n", + "| 1 | 1 | 1054 |\n", + "| 2 | 2 | 1094 |\n", + "+---+----------+-------+\n", + "\n", + "\n", + "📌 Platform-wise Polarity Distribution:\n", + "+----------+-----+-----+-----+\n", + "| Platform | 0 | 1 | 2 |\n", + "+----------+-----+-----+-----+\n", + "| 0 | 392 | 361 | 326 |\n", + "| 1 | 367 | 316 | 371 |\n", + "| 2 | 366 | 365 | 363 |\n", + "+----------+-----+-----+-----+\n", + "==================================================\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from tabulate import tabulate\n", + "\n", + "# Load dataset\n", + "input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv'\n", + "df = pd.read_csv(input_path)\n", + "\n", + "# Compute dataset statistics\n", + "total_samples = len(df)\n", + "polarity_counts = df[\"Polarity\"].value_counts().sort_index()\n", + "platform_counts = df[\"Platform\"].value_counts().sort_index()\n", + "\n", + "# Compute Polarity distribution within each Platform\n", + "platform_polarity_counts = df.groupby([\"Platform\", \"Polarity\"]).size().unstack().fillna(0)\n", + "\n", + "# Print results with formatting\n", + "print(\"=\" * 50)\n", + "print(f\"📊 Dataset Information: cf-dataset.csv\")\n", + "print(\"=\" * 50)\n", + "print(f\"Total Samples: {total_samples}\\n\")\n", + "\n", + "# Polarity distribution\n", + "print(\"📌 Polarity Distribution:\")\n", + "print(tabulate(polarity_counts.reset_index(), headers=[\"Polarity\", \"Count\"], tablefmt=\"pretty\"))\n", + "print(\"\\n\")\n", + "\n", + "# Platform distribution\n", + "print(\"📌 Platform 
Distribution:\")\n", + "print(tabulate(platform_counts.reset_index(), headers=[\"Platform\", \"Count\"], tablefmt=\"pretty\"))\n", + "print(\"\\n\")\n", + "\n", + "# Platform-wise Polarity distribution\n", + "print(\"📌 Platform-wise Polarity Distribution:\")\n", + "print(tabulate(platform_polarity_counts, headers=\"keys\", tablefmt=\"pretty\"))\n", + "print(\"=\" * 50)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9kxjNt7QqLlh" + }, + "source": [ + "# Sentiment Classification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "0AiOM3jWKL4P", + "outputId": "bbb7a3d9-e9d7-4378-b8f8-ab68ed4670ec" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'NVIDIA L4'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def seed_torch(seed):\n", + " random.seed(seed)\n", + " np.random.seed(seed)\n", + " torch.manual_seed(seed)\n", + " torch.cuda.manual_seed(seed)\n", + " torch.backends.cudnn.deterministic=True\n", + "\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "n_gpu = torch.cuda.device_count()\n", + "torch.cuda.get_device_name(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gQGXycAgbchc" + }, + "outputs": [], + "source": [ + "# Train\n", + "MAX_LEN = 256\n", + "BATCH_SIZE = 16\n", + "LEARNING_RATE = 2e-5\n", + "EPOCHS = 4\n", + "WEIGHT_DECAY = 0.01\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "\n", + "MODELS = [\n", + " (BertForSequenceClassification, BertTokenizer, 'bert-base-cased'),\n", + " (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased'),\n", + " (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'),\n", + " (AlbertForSequenceClassification, AlbertTokenizer, 'albert-base-v1')\n", + "]\n", + "MODEL_NAMES = ['bert', 'xlnet', 'roberta', 'albert']\n", + "\n", + "def train_model(train_df, model_save_path, model_select=0):\n", + " seed_torch(42)\n", + "\n", + " cur_model = MODELS[model_select]\n", + " m_name = MODEL_NAMES[model_select]\n", + "\n", + "\n", + " train_df['Polarity'] = train_df['Polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0})\n", + " tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)\n", + "\n", + " sentences = train_df.Text.values\n", + " labels = train_df.Polarity.values\n", + "\n", + " input_ids = []\n", + " attention_masks = []\n", + "\n", + " for sent in sentences:\n", + " encoded_dict = tokenizer.encode_plus(\n", + " str(sent),\n", + " add_special_tokens=True,\n", + " max_length=MAX_LEN,\n", + " padding='max_length',\n", + " return_attention_mask=True,\n", + " return_tensors='pt',\n", + " truncation=True\n", + " )\n", + " input_ids.append(encoded_dict['input_ids'])\n", + " attention_masks.append(encoded_dict['attention_mask'])\n", + "\n", + " input_ids = torch.cat(input_ids, dim=0)\n", + " attention_masks = torch.cat(attention_masks, dim=0)\n", + " labels = torch.tensor(labels)\n", + "\n", + " print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}')\n", + "\n", + "\n", + " train_inputs, val_inputs, train_labels, val_labels = train_test_split(\n", + " input_ids, labels, test_size=0.1, random_state=42)\n", + " train_masks, val_masks, _, _ = train_test_split(\n", + " 
attention_masks, labels, test_size=0.1, random_state=42)\n", + "\n", + "\n", + " train_data = TensorDataset(train_inputs, train_masks, train_labels)\n", + " train_sampler = RandomSampler(train_data)\n", + " train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)\n", + "\n", + " val_data = TensorDataset(val_inputs, val_masks, val_labels)\n", + " val_sampler = SequentialSampler(val_data)\n", + " val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)\n", + "\n", + "\n", + " model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)\n", + " model.to(device)\n", + "\n", + "\n", + " optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n", + "\n", + "\n", + " num_training_steps = EPOCHS * len(train_dataloader)\n", + " lr_scheduler = get_scheduler(\n", + " name=\"linear\", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps\n", + " )\n", + "\n", + "\n", + " print(\"Starting training...\")\n", + " best_f1 = 0\n", + " for epoch in range(EPOCHS):\n", + " model.train()\n", + " total_loss = 0\n", + " predictions, true_labels = [], []\n", + "\n", + " for batch in train_dataloader:\n", + " b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]\n", + " optimizer.zero_grad()\n", + " outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)\n", + " loss, logits = outputs[:2]\n", + " loss.backward()\n", + " optimizer.step()\n", + " lr_scheduler.step()\n", + "\n", + " total_loss += loss.item()\n", + " predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())\n", + " true_labels.extend(b_labels.cpu().numpy())\n", + "\n", + " train_acc = accuracy_score(true_labels, predictions)\n", + " print(f\"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}\")\n", + "\n", + "\n", + " model.eval()\n", + " val_predictions, val_labels = [], []\n", + " with torch.no_grad():\n", + " for batch in val_dataloader:\n", + " b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]\n", + " outputs = model(b_input_ids, attention_mask=b_input_mask)\n", + " logits = outputs[0]\n", + " val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())\n", + " val_labels.extend(b_labels.cpu().numpy())\n", + "\n", + " val_acc = accuracy_score(val_labels, val_predictions)\n", + " val_f1 = f1_score(val_labels, val_predictions, average='weighted')\n", + " print(f\"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}\")\n", + "\n", + "\n", + " if val_f1 > best_f1:\n", + " best_f1 = val_f1\n", + " torch.save(model.state_dict(), model_save_path)\n", + " print(f\"Best model saved at {model_save_path}\")\n", + "\n", + "\n", + " print(\"Final Model Performance on Validation Set:\")\n", + " print(classification_report(val_labels, val_predictions, digits=4))\n", + " return model_save_path\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Sf7jGxHYhNIb" + }, + "outputs": [], + "source": [ + "\n", + "def test_model(test_df, model_saved_path, model_select=0):\n", + "\n", + " MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),\n", + " (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),\n", + " (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'),\n", + " (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1')\n", + " ]\n", + " MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", + " seed_torch(42)\n", + "\n", + " 
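One maintenance note on `train_model`: `AdamW` here comes from `transformers`, and the FutureWarning in the training logs further down flags it as deprecated. A sketch of the drop-in replacement the warning suggests, keeping the same hyperparameters (assuming `model`, `LEARNING_RATE`, and `WEIGHT_DECAY` as defined above):

```python
import torch

# torch.optim.AdamW takes weight_decay directly, so this is a one-line swap
# for the deprecated transformers.AdamW inside train_model.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE,
                              weight_decay=WEIGHT_DECAY)
```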
cur_model=MODELS[model_select]\n", + " m_name=MODEL_NAMES[model_select]\n", + "\n", + " tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)\n", + "\n", + " begin=time.time()\n", + "\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + " 'positive':1,\n", + " 'negative':2,\n", + " 'neutral':0})\n", + "\n", + "\n", + " sentences = test_df.Text.values\n", + " labels = test_df.Polarity.values\n", + "\n", + " input_ids = []\n", + " attention_masks = []\n", + "\n", + " for sent in sentences:\n", + " encoded_dict = tokenizer.encode_plus(\n", + " str(sent),\n", + " add_special_tokens = True,\n", + " max_length = MAX_LEN,\n", + " pad_to_max_length = True,\n", + " return_attention_mask = True,\n", + " return_tensors = 'pt',\n", + " )\n", + "\n", + " input_ids.append(encoded_dict['input_ids'])\n", + " attention_masks.append(encoded_dict['attention_mask'])\n", + "\n", + " prediction_inputs = torch.cat(input_ids,dim=0)\n", + " prediction_masks = torch.cat(attention_masks,dim=0)\n", + " prediction_labels = torch.tensor(labels)\n", + "\n", + " prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)\n", + " prediction_sampler = SequentialSampler(prediction_data)\n", + " prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)\n", + "\n", + " model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + " model.cuda()\n", + " model.eval()\n", + "\n", + " predictions,true_labels=[],[]\n", + "\n", + " for batch in prediction_dataloader:\n", + " batch = tuple(t.to(device) for t in batch)\n", + " b_input_ids, b_input_mask, b_labels = batch\n", + "\n", + " with torch.no_grad():\n", + " outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)\n", + " logits = outputs[0]\n", + "\n", + " logits = logits.detach().cpu().numpy()\n", + " label_ids = b_labels.to('cpu').numpy()\n", + "\n", + " predictions.append(logits)\n", + " true_labels.append(label_ids)\n", + "\n", + " end=time.time()\n", + " print('Prediction used {:.2f} seconds'.format(end - begin))\n", + "\n", + " flat_predictions = [item for sublist in predictions for item in sublist]\n", + " flat_predictions = np.argmax(flat_predictions, axis=1).flatten()\n", + " flat_true_labels = [item for sublist in true_labels for item in sublist]\n", + "\n", + " print(\"Accuracy of {} is: {}\".format(m_name, accuracy_score(flat_true_labels,flat_predictions)))\n", + "\n", + " print(classification_report(flat_true_labels,flat_predictions))\n", + "\n", + "\n", + " df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity'])\n", + "\n", + " df_combined = pd.concat([test_df, df_prediction], axis=1)\n", + "\n", + " counts = df_combined['prediction_Polarity'].value_counts()\n", + " print(counts)\n", + "\n", + " return df_combined" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dw2VuMkVAfec" + }, + "source": [ + "## Train" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IyDoiOFoMP3W" + }, + "source": [ + "### Dataset Preparation and Splitting\n", + "\n", + "In this section, we prepare the datasets for training and testing.\n", + "\n", + "- **`crossplatform_sf_dataset_tokenized.csv`**: This is the main dataset used in this study.\n", + "- **`so-dataset_tokenized.csv`**: This dataset originates from the research paper *Sentiment Polarity Detection for Software Development*.\n", + "- **`gh-dataset_tokenized.csv`**: This dataset is 
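`test_model` scores an entire DataFrame; for spot-checking a single comment against a saved checkpoint, a minimal helper along the same lines can be useful. This is a sketch, not part of the original notebook; `predict_one` is a hypothetical name, and it assumes `MODELS`, `MAX_LEN`, and `device` from the cells above:

```python
def predict_one(text, model_saved_path, model_select=0):
    """Classify one string with a saved checkpoint; returns 0/1/2."""
    model_cls, tok_cls, pretrained = MODELS[model_select]
    tokenizer = tok_cls.from_pretrained(pretrained, do_lower_case=True)
    enc = tokenizer.encode_plus(text, add_special_tokens=True,
                                max_length=MAX_LEN, padding='max_length',
                                truncation=True, return_tensors='pt')
    model = model_cls.from_pretrained(pretrained, num_labels=3)
    model.load_state_dict(torch.load(model_saved_path, map_location=device))
    model.to(device)
    model.eval()
    with torch.no_grad():
        logits = model(enc['input_ids'].to(device),
                       attention_mask=enc['attention_mask'].to(device))[0]
    return int(torch.argmax(logits, dim=1))

# e.g. predict_one("Thanks, this fix works great!",
#                  "/content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model")
```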
derived from the research paper *GitHub Golden Rule* (*Can We Use SE-specific Sentiment Analysis Tools in a Cross-Platform Setting?*).\n", + "\n", + "### Steps:\n", + "\n", + "1. **Load Datasets** \n", + " We read the three datasets into Pandas DataFrames.\n", + "\n", + "2. **Split into Training and Testing Sets** \n", + " - The **GitHub dataset (`df_gh`)** and **Stack Overflow dataset (`df_so`)** are each split into 70% training and 30% testing subsets. \n", + " - Similarly, the **cross-platform dataset (`df_crossplatform`)** is divided into a 70% training set and a 30% testing set. \n", + " - The splitting is performed using `train_test_split` with a `random_state` of 42 for reproducibility.\n", + "\n", + "3. **Save Processed Data** \n", + " - The training and testing subsets are saved as CSV files for further use.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fo57JVQT9GIq" + }, + "outputs": [], + "source": [ + "\n", + "# Read datasets\n", + "input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset_tokenized.csv'\n", + "so_input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset_tokenized.csv'\n", + "gh_input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset_tokenized.csv'\n", + "\n", + "# Load datasets into Pandas DataFrames\n", + "df_crossplatform = pd.read_csv(input_path)\n", + "df_so = pd.read_csv(so_input_path)\n", + "df_gh = pd.read_csv(gh_input_path)\n", + "\n", + "# Split `df_crossplatform` into training (70%) and testing (30%) sets\n", + "train_df, test_df = train_test_split(df_crossplatform, test_size=0.3, random_state=42)\n", + "\n", + "# Split GitHub and Stack Overflow datasets into training and testing sets (70% train, 30% test)\n", + "train_gh, test_gh = train_test_split(df_gh, test_size=0.3, random_state=42)\n", + "train_so, test_so = train_test_split(df_so, test_size=0.3, random_state=42)\n", + "\n", + "# Save all datasets to CSV files for further use\n", + "\n", + "train_df.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_df.csv', index=False)\n", + "test_df.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_df.csv', index=False)\n", + "\n", + "train_gh.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_gh.csv', index=False)\n", + "train_so.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_so.csv', index=False)\n", + "test_gh.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_gh.csv', index=False)\n", + "test_so.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_so.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RogFCndjOGet" + }, + "source": [ + "### Train for Cross-Platform Dataset\n", + "We combine three training datasets (`train_df(ours)`, `train_gh`, and `train_so`) into a final dataset for training four different models.\n", + "\n", + "## Training Models \n", + "The following models are trained: \n", + "- **BERT** \n", + "- **XLNet** \n", + "- **RoBERTa** \n", + "- **ALBERT** \n", + "\n", + "## Training Parameters \n", + "- **MAX_LEN**: `256` \n", + "- **BATCH_SIZE**: `16` \n", + "- **LEARNING_RATE**: `2e-5` \n", + "- **EPOCHS**: `4` \n", + "\n", + "Each model is trained using the merged dataset and saved for further evaluation.\n" + ] + }, + { + "cell_type": 
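A caveat on the splits above: `train_test_split` is called without `stratify`, so the label (and platform) proportions in the 70/30 subsets can drift from the full dataset's. If balanced splits are wanted, a sketch of the stratified variant with the same seed:

```python
from sklearn.model_selection import train_test_split

# Keep the Polarity proportions identical in the train and test subsets.
train_df, test_df = train_test_split(
    df_crossplatform, test_size=0.3, random_state=42,
    stratify=df_crossplatform["Polarity"])
```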
"code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "slhvZg0Ow3ae", + "outputId": "a950593a-2b5c-49a1-823b-40845fbb77cb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training bert model...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n", + "Starting training...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1: Train Loss: 0.5891, Accuracy: 0.7577\n", + "Validation Accuracy: 0.8673, F1 Score: 0.8675\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", + "Epoch 2: Train Loss: 0.2148, Accuracy: 0.9312\n", + "Validation Accuracy: 0.8771, F1 Score: 0.8772\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", + "Epoch 3: Train Loss: 0.1029, Accuracy: 0.9683\n", + "Validation Accuracy: 0.8845, F1 Score: 0.8841\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", + "Epoch 4: Train Loss: 0.0526, Accuracy: 0.9869\n", + "Validation Accuracy: 0.9017, F1 Score: 0.9015\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.9294 0.8587 0.8927 184\n", + " 1 0.8739 0.9369 0.9043 111\n", + " 2 0.8898 0.9375 0.9130 112\n", + "\n", + " accuracy 0.9017 407\n", + " macro avg 0.8977 0.9110 0.9033 407\n", + "weighted avg 0.9034 0.9017 0.9015 407\n", + "\n", + "Training xlnet model...\n", + "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting training...\n", + "Epoch 1: Train Loss: 0.6808, Accuracy: 0.6927\n", + "Validation Accuracy: 0.8059, F1 Score: 0.8043\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", + "Epoch 2: Train Loss: 0.3285, Accuracy: 0.8839\n", + "Validation Accuracy: 0.8280, F1 Score: 0.8275\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", + "Epoch 3: Train Loss: 0.1917, Accuracy: 0.9355\n", + "Validation Accuracy: 0.8550, F1 Score: 0.8547\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", + "Epoch 4: Train Loss: 0.1199, Accuracy: 0.9604\n", + "Validation Accuracy: 0.8575, F1 Score: 0.8573\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.9241 0.7935 0.8538 184\n", + " 1 0.8306 0.9279 0.8766 111\n", + " 2 0.8000 0.8929 0.8439 112\n", + "\n", + " accuracy 0.8575 407\n", + " macro avg 0.8516 0.8714 0.8581 407\n", + "weighted avg 0.8644 0.8575 0.8573 407\n", + "\n", + "Training Roberta model...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n", + "Starting training...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1: Train Loss: 0.6244, Accuracy: 0.7228\n", + "Validation Accuracy: 0.8698, F1 Score: 0.8697\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", + "Epoch 2: Train Loss: 0.2585, Accuracy: 0.9123\n", + "Validation Accuracy: 0.8747, F1 Score: 0.8745\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", + "Epoch 3: Train Loss: 0.1612, Accuracy: 0.9506\n", + "Validation Accuracy: 0.8968, F1 Score: 0.8968\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", + "Epoch 4: Train Loss: 0.0974, Accuracy: 0.9727\n", + "Validation Accuracy: 0.8968, F1 Score: 0.8969\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.9302 0.8696 0.8989 184\n", + " 1 0.8803 0.9279 0.9035 111\n", + " 2 0.8644 0.9107 0.8870 112\n", + "\n", + " accuracy 0.8968 407\n", + " macro avg 0.8917 0.9027 0.8964 407\n", + "weighted avg 0.8985 0.8968 0.8969 407\n", + "\n", + "Training albert model...\n", + "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting training...\n", + "Epoch 1: Train Loss: 0.6810, Accuracy: 0.7110\n", + "Validation Accuracy: 0.8305, F1 Score: 0.8332\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/albert_model\n", + "Epoch 2: Train Loss: 0.3244, Accuracy: 0.8913\n", + "Validation Accuracy: 0.8550, F1 Score: 0.8550\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/albert_model\n", + "Epoch 3: Train Loss: 0.2071, Accuracy: 0.9306\n", + "Validation Accuracy: 0.8550, F1 Score: 0.8548\n", + "Epoch 4: Train Loss: 0.1405, Accuracy: 0.9566\n", + "Validation Accuracy: 0.8526, F1 Score: 0.8524\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.8715 0.8478 0.8595 184\n", + " 1 0.8547 0.9009 0.8772 111\n", + " 2 0.8198 0.8125 0.8161 112\n", + "\n", + " accuracy 0.8526 407\n", + " macro avg 0.8487 0.8537 0.8509 407\n", + "weighted avg 0.8527 0.8526 0.8524 407\n", + "\n" + ] + } + ], + "source": [ + "# Combine `train_df`, `train_gh`, and `train_so` into the final training dataset\n", + "train_df_final = pd.concat([train_df, train_gh, train_so], axis=0, ignore_index=True)\n", + "train_df_final.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_df_final.csv', index=False)\n", + "\n", + "# Define the list of model names to be trained\n", + "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", + "\n", + "# Train each model and save the trained model files\n", + "for i, model_name in enumerate(MODEL_NAMES):\n", + " model_save_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{model_name}_model\"\n", + " print(f\"Training {model_name} model...\")\n", + " train_model(train_df_final, model_save_path, model_select=i)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "krgDqVN9x_my" + }, + "source": [ + "### Train for existing dataset on Bert\n", + "In this section, we train the **BERT** model using the existing **GitHub** and **Stack Overflow** datasets.\n", + "\n", + "## Training Data \n", + "The training dataset consists of: \n", + "- **GitHub Data** (`train_gh`) \n", + "- **Stack Overflow Data** (`train_so`) \n", + "\n", + "## Training Parameters \n", + "- **MAX_LEN**: `256` \n", + "- **BATCH_SIZE**: `16` \n", + "- **LEARNING_RATE**: `2e-5` \n", + "- **EPOCHS**: `4` \n", + "\n", + "The trained **BERT** model will be saved for further evaluation." 
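The SO training subset is small (a few hundred rows) and heavily skewed toward neutral, so a classifier can minimize loss by predicting the majority class for everything; the SO validation report below shows exactly that collapse. A mitigation the notebook does not use, sketched here under the assumption that `train_labels`, `device`, and the training loop from `train_model` are in scope, is an inverse-frequency class-weighted loss:

```python
import numpy as np
import torch

# Inverse-frequency weights over the three labels (0/1/2).
counts = np.maximum(np.bincount(train_labels.numpy(), minlength=3), 1)
weights = torch.tensor(len(train_labels) / (3 * counts),
                       dtype=torch.float).to(device)
loss_fn = torch.nn.CrossEntropyLoss(weight=weights)

# Inside the training loop, compute the loss from raw logits instead of
# taking the model's built-in (unweighted) loss:
#   logits = model(b_input_ids, attention_mask=b_input_mask).logits
#   loss = loss_fn(logits, b_labels)
```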
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sxHJGW6Lbgnt", + "outputId": "8d60c4d7-c1b2-4dee-dc01-a7069146a577" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training bert model on SO dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training data shape: torch.Size([315, 256]), torch.Size([315, 256]), torch.Size([315])\n", + "Starting training...\n", + "Epoch 1: Train Loss: 0.7896, Accuracy: 0.7032\n", + "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/SO_bert_model\n", + "Epoch 2: Train Loss: 0.5530, Accuracy: 0.8163\n", + "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", + "Epoch 3: Train Loss: 0.4831, Accuracy: 0.8163\n", + "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", + "Epoch 4: Train Loss: 0.4383, Accuracy: 0.8163\n", + "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.8438 1.0000 0.9153 27\n", + " 1 0.0000 0.0000 0.0000 4\n", + " 2 0.0000 0.0000 0.0000 1\n", + "\n", + " accuracy 0.8438 32\n", + " macro avg 0.2812 0.3333 0.3051 32\n", + "weighted avg 0.7119 0.8438 0.7722 32\n", + "\n", + "Training bert model on GH dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training data shape: torch.Size([1495, 256]), torch.Size([1495, 256]), torch.Size([1495])\n", + "Starting training...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1: Train Loss: 0.8448, Accuracy: 0.6141\n", + "Validation Accuracy: 0.7933, F1 Score: 0.7983\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", + "Epoch 2: Train Loss: 0.3207, Accuracy: 0.8900\n", + "Validation Accuracy: 0.8733, F1 Score: 0.8741\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", + "Epoch 3: Train Loss: 0.1529, Accuracy: 0.9532\n", + "Validation Accuracy: 0.8867, F1 Score: 0.8851\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", + "Epoch 4: Train Loss: 0.0991, Accuracy: 0.9762\n", + "Validation Accuracy: 0.8800, F1 Score: 0.8783\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.8868 0.8246 0.8545 57\n", + " 1 0.8846 1.0000 0.9388 46\n", + " 2 0.8667 0.8298 0.8478 47\n", + "\n", + " accuracy 0.8800 150\n", + " macro avg 0.8794 0.8848 0.8804 150\n", + "weighted avg 0.8798 0.8800 0.8783 150\n", + "\n" + ] + } + ], + "source": [ + "# Train the model for github-golden-rule and stackoverflow on Bert\n", + "\n", + "MODEL_NAMES = ['bert']\n", + "for dataset_name, train_df in [('SO', train_so), ('GH', train_gh)]:\n", + " for i, model_name in enumerate(MODEL_NAMES):\n", + " model_save_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{dataset_name}_{model_name}_model\"\n", + " print(f\"Training {model_name} model on {dataset_name} dataset...\")\n", + " train_model(train_df, model_save_path, model_select=i)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ugVi3T4wAkSQ" + }, + "source": [ + "## Test" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XMgVVJ9U9XBa" + }, + "source": [ + "### Test df_crossplatform on 4 models and 3 platforms (Table 3.2)\n", + "In this section, we evaluate the four trained **cross-platform sentiment classification models** on our **cross-platform sentiment dataset**.\n", + "\n", + "## Evaluation Metrics \n", + "We will assess: \n", + "1. **Overall model performance** across all platforms. \n", + "2. **Platform-specific performance** for each model on: \n", + " - **GitHub** \n", + " - **Jira** \n", + " - **Mailbox** \n", + "\n", + "## Results \n", + "The evaluation will print: \n", + "- **Overall accuracy** of each model. 
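The evaluation log that follows was produced by a driver cell whose source is not included in this excerpt. A plausible reconstruction, assuming `test_model`, the held-out `test_df`, the saved model paths from the training step, and the platform codes documented earlier (0 = GitHub, 1 = Jira, 2 = Mailbox):

```python
PLATFORM_NAMES = {0: 'GitHub', 1: 'Jira', 2: 'Mailbox'}
MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']
BASE = '/content/drive/MyDrive/Software_Development_Sentiment_Classification'

for i, m_name in enumerate(MODEL_NAMES):
    model_path = f"{BASE}/{m_name}_model"
    print(f"Evaluating {m_name} model...for overall platform")
    test_model(test_df, model_path, model_select=i)
    for code, platform in PLATFORM_NAMES.items():
        print(f"Evaluating {m_name} model...for {platform} platform")
        test_model(test_df[test_df['Platform'] == code], model_path, model_select=i)
```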
\n", + "- **Performance breakdown per platform** for each model. \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RSJI327nEKAz", + "outputId": "264447bc-b509-4e8a-e5ce-49cbed8a44a4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating bert model...for overall platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 9.38 seconds\n", + "Accuracy of bert is: 0.9422084623323014\n", + " precision recall f1-score support\n", + "\n", + " 0 0.95 0.91 0.93 329\n", + " 1 0.94 0.97 0.95 318\n", + " 2 0.94 0.95 0.94 322\n", + "\n", + " accuracy 0.94 969\n", + " macro avg 0.94 0.94 0.94 969\n", + "weighted avg 0.94 0.94 0.94 969\n", + "\n", + "prediction_Polarity\n", + "2 327\n", + "1 326\n", + "0 316\n", + "Name: count, dtype: int64\n", + "Evaluating bert model...for GitHub platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 3.93 seconds\n", + "Accuracy of bert is: 0.956140350877193\n", + " precision recall f1-score support\n", + "\n", + " 0 0.96 0.94 0.95 128\n", + " 1 0.97 0.96 0.96 122\n", + " 2 0.94 0.98 0.96 92\n", + "\n", + " accuracy 0.96 342\n", + " macro avg 0.95 0.96 0.96 342\n", + "weighted avg 0.96 0.96 0.96 342\n", + "\n", + "prediction_Polarity\n", + "0.0 125\n", + "1.0 121\n", + "2.0 96\n", + "Name: count, dtype: int64\n", + "Evaluating bert model...for Jira platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 4.05 seconds\n", + "Accuracy of bert is: 0.926829268292683\n", + " precision recall f1-score support\n", + "\n", + " 0 0.94 0.88 0.91 93\n", + " 1 0.92 0.94 0.93 86\n", + " 2 0.92 0.95 0.94 108\n", + "\n", + " accuracy 0.93 287\n", + " macro avg 0.93 0.93 0.93 287\n", + "weighted avg 0.93 0.93 0.93 287\n", + "\n", + "prediction_Polarity\n", + "2.0 112\n", + "1.0 88\n", + "0.0 87\n", + "Name: count, dtype: int64\n", + "Evaluating bert model...for Mailbox platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 5.19 seconds\n", + "Accuracy of bert is: 0.9411764705882353\n", + " precision recall f1-score support\n", + "\n", + " 0 0.94 0.91 0.92 108\n", + " 1 0.93 0.99 0.96 110\n", + " 2 0.95 0.93 0.94 122\n", + "\n", + " accuracy 0.94 340\n", + " macro avg 0.94 0.94 0.94 340\n", + "weighted avg 0.94 0.94 0.94 340\n", + "\n", + "prediction_Polarity\n", + "2.0 119\n", + "1.0 117\n", + "0.0 104\n", + "Name: count, dtype: int64\n", + "Evaluating xlnet model...for overall platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 14.16 seconds\n", + "Accuracy of xlnet is: 0.8823529411764706\n", + " precision recall f1-score support\n", + "\n", + " 0 0.89 0.82 0.85 329\n", + " 1 0.88 0.95 0.91 318\n", + " 2 0.88 0.88 0.88 322\n", + "\n", + " accuracy 0.88 969\n", + " macro avg 0.88 0.88 0.88 969\n", + "weighted avg 0.88 0.88 0.88 969\n", + "\n", + "prediction_Polarity\n", + "1 343\n", + "2 320\n", + "0 306\n", + "Name: count, dtype: int64\n", + "Evaluating xlnet model...for GitHub platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 7.19 seconds\n", + "Accuracy of xlnet is: 0.9005847953216374\n", + " precision recall f1-score support\n", + "\n", + " 0 0.90 0.85 0.88 128\n", + " 1 0.90 0.96 0.93 122\n", + " 2 0.90 0.89 0.90 92\n", + "\n", + " accuracy 0.90 342\n", + " macro avg 0.90 0.90 0.90 342\n", + "weighted avg 0.90 0.90 0.90 342\n", + "\n", + "prediction_Polarity\n", + "1.0 130\n", + "0.0 121\n", + "2.0 91\n", + "Name: count, dtype: int64\n", + "Evaluating xlnet model...for Jira platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 5.72 seconds\n", + "Accuracy of xlnet is: 0.9024390243902439\n", + " precision recall f1-score support\n", + "\n", + " 0 0.91 0.81 0.86 93\n", + " 1 0.91 0.95 0.93 86\n", + " 2 0.89 0.94 0.91 108\n", + "\n", + " accuracy 0.90 287\n", + " macro avg 0.90 0.90 0.90 287\n", + "weighted avg 0.90 0.90 0.90 287\n", + "\n", + "prediction_Polarity\n", + "2.0 115\n", + "1.0 90\n", + "0.0 82\n", + "Name: count, dtype: int64\n", + "Evaluating xlnet model...for Mailbox platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 6.06 seconds\n", + "Accuracy of xlnet is: 0.8470588235294118\n", + " precision recall f1-score support\n", + "\n", + " 0 0.84 0.81 0.82 108\n", + " 1 0.83 0.93 0.88 110\n", + " 2 0.87 0.81 0.84 122\n", + "\n", + " accuracy 0.85 340\n", + " macro avg 0.85 0.85 0.85 340\n", + "weighted avg 0.85 0.85 0.85 340\n", + "\n", + "prediction_Polarity\n", + "1.0 123\n", + "2.0 114\n", + "0.0 103\n", + "Name: count, dtype: int64\n", + "Evaluating Roberta model...for overall platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 9.61 seconds\n", + "Accuracy of Roberta is: 0.9060887512899897\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.83 0.87 329\n", + " 1 0.91 0.96 0.93 318\n", + " 2 0.88 0.94 0.91 322\n", + "\n", + " accuracy 0.91 969\n", + " macro avg 0.91 0.91 0.91 969\n", + "weighted avg 0.91 0.91 0.91 969\n", + "\n", + "prediction_Polarity\n", + "2 342\n", + "1 333\n", + "0 294\n", + "Name: count, dtype: int64\n", + "Evaluating Roberta model...for GitHub platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 4.66 seconds\n", + "Accuracy of Roberta is: 0.9298245614035088\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.88 0.90 128\n", + " 1 0.95 0.98 0.96 122\n", + " 2 0.91 0.93 0.92 92\n", + "\n", + " accuracy 0.93 342\n", + " macro avg 0.93 0.93 0.93 342\n", + "weighted avg 0.93 0.93 0.93 342\n", + "\n", + "prediction_Polarity\n", + "1.0 125\n", + "0.0 122\n", + "2.0 95\n", + "Name: count, dtype: int64\n", + "Evaluating Roberta model...for Jira platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 4.47 seconds\n", + "Accuracy of Roberta is: 0.9163763066202091\n", + " precision recall f1-score support\n", + "\n", + " 0 0.95 0.81 0.87 93\n", + " 1 0.91 0.95 0.93 86\n", + " 2 0.90 0.98 0.94 108\n", + "\n", + " accuracy 0.92 287\n", + " macro avg 0.92 0.91 0.91 287\n", + "weighted avg 0.92 0.92 0.91 287\n", + "\n", + "prediction_Polarity\n", + "2.0 118\n", + "1.0 90\n", + "0.0 79\n", + "Name: count, dtype: int64\n", + "Evaluating Roberta model...for Mailbox platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 5.03 seconds\n", + "Accuracy of Roberta is: 0.8735294117647059\n", + " precision recall f1-score support\n", + "\n", + " 0 0.90 0.78 0.84 108\n", + " 1 0.87 0.94 0.90 110\n", + " 2 0.85 0.90 0.88 122\n", + "\n", + " accuracy 0.87 340\n", + " macro avg 0.88 0.87 0.87 340\n", + "weighted avg 0.88 0.87 0.87 340\n", + "\n", + "prediction_Polarity\n", + "2.0 129\n", + "1.0 118\n", + "0.0 93\n", + "Name: count, dtype: int64\n", + "Evaluating albert model...for overall platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 8.62 seconds\n", + "Accuracy of albert is: 0.8844169246646026\n", + " precision recall f1-score support\n", + "\n", + " 0 0.90 0.79 0.84 329\n", + " 1 0.91 0.95 0.93 318\n", + " 2 0.85 0.92 0.88 322\n", + "\n", + " accuracy 0.88 969\n", + " macro avg 0.89 0.89 0.88 969\n", + "weighted avg 0.89 0.88 0.88 969\n", + "\n", + "prediction_Polarity\n", + "2 350\n", + "1 331\n", + "0 288\n", + "Name: count, dtype: int64\n", + "Evaluating albert model...for GitHub platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 3.99 seconds\n", + "Accuracy of albert is: 0.9269005847953217\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.88 0.90 128\n", + " 1 0.97 0.96 0.96 122\n", + " 2 0.87 0.96 0.91 92\n", + "\n", + " accuracy 0.93 342\n", + " macro avg 0.92 0.93 0.93 342\n", + "weighted avg 0.93 0.93 0.93 342\n", + "\n", + "prediction_Polarity\n", + "1.0 121\n", + "0.0 120\n", + "2.0 101\n", + "Name: count, dtype: int64\n", + "Evaluating albert model...for Jira platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 3.30 seconds\n", + "Accuracy of albert is: 0.89198606271777\n", + " precision recall f1-score support\n", + "\n", + " 0 0.95 0.75 0.84 93\n", + " 1 0.91 0.94 0.93 86\n", + " 2 0.85 0.97 0.91 108\n", + "\n", + " accuracy 0.89 287\n", + " macro avg 0.90 0.89 0.89 287\n", + "weighted avg 0.90 0.89 0.89 287\n", + "\n", + "prediction_Polarity\n", + "2.0 124\n", + "1.0 89\n", + "0.0 74\n", + "Name: count, dtype: int64\n", + "Evaluating albert model...for Mailbox platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " model.load_state_dict(torch.load(model_saved_path))\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Prediction used 4.12 seconds\n",
+ "Accuracy of albert is: 0.8352941176470589\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.83 0.72 0.77 108\n",
+ " 1 0.85 0.94 0.89 110\n",
+ " 2 0.82 0.84 0.83 122\n",
+ "\n",
+ " accuracy 0.84 340\n",
+ " macro avg 0.84 0.83 0.83 340\n",
+ "weighted avg 0.83 0.84 0.83 340\n",
+ "\n",
+ "prediction_Polarity\n",
+ "2.0 125\n",
+ "1.0 121\n",
+ "0.0 94\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "# Load test dataset\n",
+ "test_df = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_df.csv')\n",
+ "\n",
+ "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n",
+ "model_results = {}\n",
+ "\n",
+ "# Define platform mapping\n",
+ "platforms = {0: \"GitHub\", 1: \"Jira\", 2: \"Mailbox\"}\n",
+ "\n", 
+ "# Evaluate each model\n",
+ "for i, model_name in enumerate(MODEL_NAMES):\n",
+ " model_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{model_name}_model\"\n",
+ " print(f\"Evaluating {model_name} model...for overall platform\")\n",
+ "\n",
+ " # Get overall accuracy\n",
+ " overall_accuracy = test_model(test_df, model_path, model_select=i)\n",
+ "\n",
+ " # Evaluate accuracy per platform\n",
+ " platform_accuracies = {}\n",
+ " for platform_id, platform_name in platforms.items():\n",
+ " test_df_platform = test_df[test_df[\"Platform\"] == platform_id]\n",
+ " if not test_df_platform.empty:\n",
+ " print(f\"Evaluating {model_name} model...for {platform_name} platform\")\n",
+ " accuracy = test_model(test_df_platform, model_path, model_select=i)\n",
+ " platform_accuracies[platform_name] = accuracy\n",
+ " else:\n",
+ " platform_accuracies[platform_name] = \"No data\"\n",
+ "\n",
+ " # Record this model's overall and per-platform accuracies\n",
+ " model_results[model_name] = {\"Overall\": overall_accuracy, **platform_accuracies}\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2rTxz8aIRkYQ"
+ },
+ "source": [
+ "### Generalization Performance of the Model (Table 3.3)\n",
+ "In this section, we evaluate the **Bert-CP** model's **generalization performance** on two existing datasets: \n",
+ "- **GitHub Golden Rule Dataset** \n",
+ "- **Stack Overflow Dataset** \n",
+ "\n",
+ "We also compare **BERT** models trained separately on the **GitHub Golden Rule** and **Stack Overflow** datasets, with a focus on **cross-platform performance**. This comparison tests whether the cross-platform model generalizes better than the dataset-specific ones.\n",
+ "\n",
+ "#### Evaluation Process \n",
+ "- **Bert-CP Model Evaluation**: We test the **Bert-CP** model on the **GitHub Golden Rule** and **Stack Overflow** datasets.\n",
+ "- **Cross-Platform Comparison**: Using the **BERT** architecture, we compare a model trained on the **GitHub Golden Rule** dataset with one trained on the **Stack Overflow** dataset, each evaluated on the other platform's test set.\n",
+ "\n",
+ "#### Goals \n",
+ "- To assess how well the **Bert-CP** model **generalizes** across datasets.\n",
+ "- To determine whether the cross-platform model outperforms dataset-specific models."
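The loop above now keeps each model's scores in `model_results`, but the notebook never tabulates them. Here is a minimal sketch of how the per-platform accuracies could be summarized into a single table. The numbers are transcribed from the evaluation logs above, rounded to four decimals (bert's overall accuracy is reported earlier in the notebook, so only the fully logged models are shown), and only `pandas`, already imported in the first cell, is assumed:

```python
import pandas as pd

# Accuracies transcribed from the evaluation logs above (rounded to 4 dp).
model_results = {
    "xlnet":   {"Overall": 0.8824, "GitHub": 0.9006, "Jira": 0.9024, "Mailbox": 0.8471},
    "Roberta": {"Overall": 0.9061, "GitHub": 0.9298, "Jira": 0.9164, "Mailbox": 0.8735},
    "albert":  {"Overall": 0.8844, "GitHub": 0.9269, "Jira": 0.8920, "Mailbox": 0.8353},
}

# One row per model, one column per evaluation split.
summary = pd.DataFrame(model_results).T
print(summary.to_string())
```

As a side note, the stderr logs above repeatedly warn that `torch.load` defaults to `weights_only=False`; since only fine-tuned state dicts are loaded here, passing `weights_only=True` (as the warning itself recommends) should silence the warning and is the safer default, assuming the checkpoints contain plain tensor state dicts.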
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ekatnWlSAnyd", + "outputId": "7ff88fb0-109c-43b0-97ae-36a792730275" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating bert_model on GitHub test dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 7.34 seconds\n", + "Accuracy of bert is: 0.8829953198127926\n", + " precision recall f1-score support\n", + "\n", + " 0 0.91 0.86 0.88 267\n", + " 1 0.89 0.92 0.91 170\n", + " 2 0.85 0.88 0.86 204\n", + "\n", + " accuracy 0.88 641\n", + " macro avg 0.88 0.89 0.88 641\n", + "weighted avg 0.88 0.88 0.88 641\n", + "\n", + "prediction_Polarity\n", + "0 252\n", + "2 213\n", + "1 176\n", + "Name: count, dtype: int64\n", + "Evaluating bert_model on Stack Overflow test dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. 
If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 4.10 seconds\n", + "Accuracy of bert is: 0.8814814814814815\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.93 0.93 110\n", + " 1 0.44 0.36 0.40 11\n", + " 2 0.81 0.93 0.87 14\n", + "\n", + " accuracy 0.88 135\n", + " macro avg 0.73 0.74 0.73 135\n", + "weighted avg 0.88 0.88 0.88 135\n", + "\n", + "prediction_Polarity\n", + "0 110\n", + "2 16\n", + "1 9\n", + "Name: count, dtype: int64\n", + "Evaluating GH_bert_model on Stack Overflow test dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 
512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 2.21 seconds\n", + "Accuracy of bert is: 0.8148148148148148\n", + " precision recall f1-score support\n", + "\n", + " 0 0.85 0.95 0.89 110\n", + " 1 0.50 0.27 0.35 11\n", + " 2 0.50 0.21 0.30 14\n", + "\n", + " accuracy 0.81 135\n", + " macro avg 0.62 0.48 0.52 135\n", + "weighted avg 0.78 0.81 0.79 135\n", + "\n", + "prediction_Polarity\n", + "0 123\n", + "2 6\n", + "1 6\n", + "Name: count, dtype: int64\n", + "Evaluating SO_bert_model on GitHub test dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). 
In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 6.62 seconds\n", + "Accuracy of bert is: 0.4165366614664587\n", + " precision recall f1-score support\n", + "\n", + " 0 0.42 1.00 0.59 267\n", + " 1 0.00 0.00 0.00 170\n", + " 2 0.00 0.00 0.00 204\n", + "\n", + " accuracy 0.42 641\n", + " macro avg 0.14 0.33 0.20 641\n", + "weighted avg 0.17 0.42 0.24 641\n", + "\n", + "prediction_Polarity\n", + "0 641\n", + "Name: count, dtype: int64\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n" + ] + } + ], + "source": [ + "\n", + "\n", + "# Load test datasets\n", + "test_gh = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_gh.csv')\n", + "test_so = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_so.csv')\n", + "\n", + "# Define model paths\n", + "bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\"\n", + "gh_bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\"\n", + "so_bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/SO_bert_model\"\n", + "\n", + "# Store results\n", + "model_results = {}\n", + "\n", + "# 1. Validate bert_model on test_gh and test_so\n", + "print(\"Evaluating bert_model on GitHub test dataset...\")\n", + "bert_on_gh = test_model(test_gh, bert_model_path, model_select=0)\n", + "\n", + "print(\"Evaluating bert_model on Stack Overflow test dataset...\")\n", + "bert_on_so = test_model(test_so, bert_model_path, model_select=0)\n", + "\n", + "model_results[\"bert_model\"] = {\n", + " \"test_gh Accuracy\": bert_on_gh,\n", + " \"test_so Accuracy\": bert_on_so\n", + "}\n", + "\n", + "# 2. 
Validate GH_bert_model on test_so\n", + "print(\"Evaluating GH_bert_model on Stack Overflow test dataset...\")\n", + "gh_bert_on_so = test_model(test_so, gh_bert_model_path, model_select=0)\n", + "\n", + "model_results[\"GH_bert_model\"] = {\n", + " \"test_so Accuracy\": gh_bert_on_so\n", + "}\n", + "\n", + "# 3. Validate SO_bert_model on test_gh\n", + "print(\"Evaluating SO_bert_model on GitHub test dataset...\")\n", + "so_bert_on_gh = test_model(test_gh, so_bert_model_path, model_select=0)\n", + "\n", + "model_results[\"SO_bert_model\"] = {\n", + " \"test_gh Accuracy\": so_bert_on_gh\n", + "}\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "L4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 606a14c27082bdefe7c9b9704c3eca85e10ab44f Mon Sep 17 00:00:00 2001 From: Connor Narowetz Date: Sat, 3 May 2025 13:34:28 -1000 Subject: [PATCH 2/5] Restructured Notebook, added API folder - Added API folder - Train and Test notebook for each part of the process, to break it up --- ...Development_Sentiment_Classification.ipynb | 2174 ----------------- api/filter.py | 64 + api/test.py | 121 + api/tokenizer.py | 99 + api/train.py | 165 ++ ...Development_Sentiment_Classification.ipynb | 239 ++ notebooks/Test.ipynb | 261 ++ notebooks/Train.ipynb | 156 ++ 8 files changed, 1105 insertions(+), 2174 deletions(-) delete mode 100644 Software_Development_Sentiment_Classification.ipynb create mode 100644 api/filter.py create mode 100644 api/test.py create mode 100644 api/tokenizer.py create mode 100644 api/train.py create mode 100644 notebooks/Software_Development_Sentiment_Classification.ipynb create mode 100644 notebooks/Test.ipynb create mode 100644 notebooks/Train.ipynb diff --git a/Software_Development_Sentiment_Classification.ipynb b/Software_Development_Sentiment_Classification.ipynb deleted file mode 100644 index ff12ae7..0000000 --- a/Software_Development_Sentiment_Classification.ipynb +++ /dev/null @@ -1,2174 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oe8X-6s9btXo" - }, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "import re\n", - "import string\n", - "import random\n", - "import warnings\n", - "import argparse\n", - "import numpy as np\n", - "import pandas as pd\n", - "import torch\n", - "import time\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", - "from io import StringIO\n", - "from unicodedata import category\n", - "from bs4 import BeautifulSoup\n", - "from markdown import markdown\n", - "from google.colab import drive\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report\n", - "from torch.utils.data import DataLoader, RandomSampler, Dataset\n", - "from transformers import (\n", - " BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n", - " XLNetTokenizer, XLNetForSequenceClassification,\n", - " RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n", - " AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n", - " get_scheduler, AdamW\n", - ")\n", - "\n", - "\n", - "drive.mount('/content/drive')\n" - ] - }, - { - "cell_type": "markdown", 
- "metadata": { - "id": "RAXtSnSK4LPr" - }, - "source": [ - "# Tokenlized" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9gOgONc04PiO" - }, - "outputs": [], - "source": [ - "# Regular expression for GitHub username mentions\n", - "USERNAME_REGEX = r\"(\\s|^)@(\\S*\\s?)\"\n", - "\n", - "# Generate Unicode punctuation set\n", - "punctuation = {chr(i) for i in range(sys.maxunicode + 1) if category(chr(i)).startswith((\"P\", \"S\"))}\n", - "\n", - "# Dictionary to count token replacements\n", - "counters = {}\n", - "\n", - "def remove_punctuation(text):\n", - " \"\"\"Remove all punctuation characters from the given text.\"\"\"\n", - " return \"\".join(char for char in text if char not in punctuation)\n", - "\n", - "def clean_text(text):\n", - " \"\"\"Remove quoted text and large code blocks from GitHub issues or comments.\"\"\"\n", - " # Remove quoted text from emails/notifications\n", - " text = re.sub(r\"^(On[\\s\\S]*?notifications@github\\.com\\s*?wrote:\\s*?)?(^(\\>).*\\s)*\", '', text, flags=re.MULTILINE)\n", - "\n", - " # Remove code blocks enclosed in triple backticks\n", - " text = re.sub(r\"```[a-z]*\\n[\\s\\S]*?\\n```\", \"\", text)\n", - "\n", - " return text\n", - "\n", - "def replace_token(regex, token_name, text):\n", - " \"\"\"\n", - " Replace matched patterns in the text with the specified token.\n", - "\n", - " Args:\n", - " regex (str): The regular expression pattern to match.\n", - " token_name (str): The replacement token name.\n", - " text (str): The input text.\n", - "\n", - " Returns:\n", - " tuple: (processed_text, number_of_replacements)\n", - " \"\"\"\n", - " replaced_text, replacements = re.subn(regex, f\" {token_name} \", text, flags=re.MULTILINE)\n", - " counters[token_name] = counters.get(token_name, 0) + replacements\n", - " return replaced_text, replacements\n", - "\n", - "def tokenize_text(text):\n", - " \"\"\"\n", - " Tokenizes a given text by replacing specific elements such as emails, mentions, URLs, etc.\n", - "\n", - " Args:\n", - " text (str): The input text.\n", - "\n", - " Returns:\n", - " tuple: (processed_text, total_replacements)\n", - " \"\"\"\n", - " total_replacements = 0\n", - "\n", - " text, replacements = replace_token(r\"\\S+@\\S*\\s?\", \"MEMAIL\", text)\n", - " total_replacements += replacements\n", - "\n", - " text, replacements = replace_token(USERNAME_REGEX, \"MMENTION\", text)\n", - " total_replacements += replacements\n", - "\n", - " text, replacements = replace_token(r\"`([^`]*)`\", \"MICODE\", text)\n", - " total_replacements += replacements\n", - "\n", - " text, replacements = replace_token(r\"\\b\\d+\\.\\d+(\\.\\d+)*\\b\", \"MVERSIONNUMBER\", text)\n", - " total_replacements += replacements\n", - "\n", - " text, replacements = replace_token(r\"(\\s|^)#\\d+\", \"MISSUEMENTION\", text)\n", - " total_replacements += replacements\n", - "\n", - " text, replacements = replace_token(\n", - " r\"([a-zA-Z0-9]+):\\/\\/([\\w_-]+(?:\\.[\\w_-]+)*)[\\w.,@?^=%&:\\/~+#-]*[\\w@?^=%&\\/~+#-]\",\n", - " \"MURL\",\n", - " text,\n", - " )\n", - " total_replacements += replacements\n", - "\n", - " return text, total_replacements\n", - "\n", - "def remove_markdown_content(text):\n", - " \"\"\"\n", - " Converts Markdown content to plain text by removing all Markdown formatting.\n", - "\n", - " Args:\n", - " text (str): The input Markdown text.\n", - "\n", - " Returns:\n", - " str: Cleaned text without Markdown formatting.\n", - " \"\"\"\n", - " html = markdown(text)\n", - " return \"\".join(BeautifulSoup(html, 
\"lxml\").findAll(text=True))\n", - "\n", - "def transform_text(row):\n", - " \"\"\"\n", - " Transforms a row by cleaning and tokenizing its text content.\n", - "\n", - " Args:\n", - " row (dict): A dictionary containing a 'Text' key.\n", - "\n", - " Returns:\n", - " tuple: (processed text, number of replacements)\n", - " \"\"\"\n", - " text = row.get(\"Text\", \"\")\n", - "\n", - " if not isinstance(text, str):\n", - " warnings.warn(f\"Converting non-string type to string: {type(text)}\")\n", - " text = str(text)\n", - "\n", - " text, replaced_count = tokenize_text(text)\n", - " text = text.replace(\"\\n\", \"\")\n", - " return text, replaced_count" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uCPyaXMt4ao_" - }, - "source": [ - "##Usage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8RhSXEyO4Tnu", - "outputId": "7988bf5a-6c84-46af-f1fa-273b6b31baeb" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv\n", - " Text Polarity\n", - "0 In this situation, when I click on the greyed ... 0\n", - "1 After that a download progress status with pro... 0\n", - "2 Change the double quotationCODE_FRAGMENT to to... 0\n", - "3 E.g. I get an array of CODE_FRAGMENT objects. 0\n", - "4 Then I tried my own implementation with CODE_F... 0\n", - "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv: 450\n", - "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset_tokenized.csv\n", - "\n", - "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv\n", - " Text Polarity\n", - "0 Guess there is a typo here: `translate`.\" 0\n", - "1 @arturoc: If you multiply-include `gst.h`, wou... 0\n", - "2 Thank you Vlad. Your contribution to Mangos an... 1\n", - "3 @opdenkamp Hi Lars, I'm afraid that you forgot... 2\n", - "4 Ok, so let's be paranoid.\" 2\n", - "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv: 2136\n", - "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset_tokenized.csv\n", - "\n", - "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv\n", - " Text Platform Polarity\n", - "0 [error] testWorkflowTimeoutWhenWorkflowComplet... 0 2\n", - "1 Thanks Hunter. Sure. 0 1\n", - "2 Approved by I will merge it soon. 0 1\n", - "3 It was reported and verified that the current ... 0 2\n", - "4 Thanks for the comments. I am figuring out to ... 
- "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv: 3227\n",
- "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset_tokenized.csv\n",
- "\n"
- ]
- }
- ],
- "source": [
- "\n",
- "\n",
- "# Define input dataset paths\n",
- "input_paths = [\n",
- " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv\",\n",
- " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv\",\n",
- " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv\"\n",
- "]\n",
- "\n",
- "# Use the transform_text function defined in the tokenization cell above;\n",
- "# redefining a dummy version here would silently skip the real token replacement.\n",
- "\n",
- "# Loop through each dataset and process it\n",
- "for input_path in input_paths:\n",
- " # Generate output file name\n",
- " output_filename = os.path.splitext(os.path.basename(input_path))[0] + \"_tokenized.csv\"\n",
- " output_path = os.path.join(os.path.dirname(input_path), output_filename)\n",
- "\n",
- " # Load dataset\n",
- " df = pd.read_csv(input_path)\n",
- " print(f\"Processing dataset: {input_path}\")\n",
- " print(df.head()) # Print first few rows for verification\n",
- "\n",
- " # Apply text transformation\n",
- " df[[\"Text\", \"replaced_token\"]] = df.apply(transform_text, axis=1, result_type=\"expand\")\n",
- "\n",
- " # Report total replacements from the `replaced_token` column\n",
- " total_replacements = df[\"replaced_token\"].sum()\n",
- " print(f\"Total replacements for {input_path}: {total_replacements}\")\n",
- "\n",
- " # Save processed dataset\n",
- " df.to_csv(output_path, header=True, index=False)\n",
- "\n",
- " print(f\"Tokenized dataset saved to: {output_path}\\n\")\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "o3Cm2Gn_CbM5"
- },
- "source": [
- "# Dataset Overview: `crossplatform_sf_dataset.csv`\n",
- "\n",
- "This dataset is designed for **Software Development Sentiment Classification**, containing user comments or discussions from different platforms with sentiment labels.\n",
- "\n",
- "## **Column Descriptions**\n",
- "- **`Text`**: The user comment or discussion content. \n",
- "- **`Polarity`**: Sentiment label indicating the emotional tendency of the text: \n",
- " - `2`: Negative sentiment \n",
- " - `0`: Neutral sentiment \n",
- " - `1`: Positive sentiment \n",
- "- **`Platform`**: The source platform of the data, indicating where the comment or discussion originated: \n",
- " - `0`: **GitHub** (Discussions related to open-source projects, Issues, Pull Requests) \n",
- " - `1`: **Jira** (Bug reports, task comments in software development management tools) \n",
- " - `2`: **Mailbox** (Developer communication through emails) \n",
- "\n",
- "## **Dataset Distribution**\n",
- "The dataset consists of data from **GitHub, Jira, and Mailbox**, with different sentiment (`Polarity`) distributions across platforms. It can be used to train and evaluate sentiment classification models to analyze developer emotions on different platforms. \n"
- ]
- },
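To keep the numeric codes straight when reading results, a small sketch that maps `Polarity` and `Platform` back to names; the mapping dicts restate the description above, and the two rows are illustrative stand-ins for the real CSV:

```python
import pandas as pd

POLARITY_NAMES = {0: "neutral", 1: "positive", 2: "negative"}
PLATFORM_NAMES = {0: "GitHub", 1: "Jira", 2: "Mailbox"}

# Illustrative rows; the real data lives in crossplatform_sf_dataset.csv
df = pd.DataFrame({
    "Text": ["Thanks Hunter. Sure.", "Ok, so let's be paranoid."],
    "Platform": [0, 0],
    "Polarity": [1, 2],
})

df["Polarity_name"] = df["Polarity"].map(POLARITY_NAMES)
df["Platform_name"] = df["Platform"].map(PLATFORM_NAMES)
print(df[["Text", "Platform_name", "Polarity_name"]])
```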
\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QaHSklrRClHU", - "outputId": "b3b3c0d8-b59e-41ab-ba80-5569d1d3650e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "==================================================\n", - "📊 Dataset Information: cf-dataset.csv\n", - "==================================================\n", - "Total Samples: 3227\n", - "\n", - "📌 Polarity Distribution:\n", - "+---+----------+-------+\n", - "| | Polarity | Count |\n", - "+---+----------+-------+\n", - "| 0 | 0 | 1125 |\n", - "| 1 | 1 | 1042 |\n", - "| 2 | 2 | 1060 |\n", - "+---+----------+-------+\n", - "\n", - "\n", - "📌 Platform Distribution:\n", - "+---+----------+-------+\n", - "| | Platform | Count |\n", - "+---+----------+-------+\n", - "| 0 | 0 | 1079 |\n", - "| 1 | 1 | 1054 |\n", - "| 2 | 2 | 1094 |\n", - "+---+----------+-------+\n", - "\n", - "\n", - "📌 Platform-wise Polarity Distribution:\n", - "+----------+-----+-----+-----+\n", - "| Platform | 0 | 1 | 2 |\n", - "+----------+-----+-----+-----+\n", - "| 0 | 392 | 361 | 326 |\n", - "| 1 | 367 | 316 | 371 |\n", - "| 2 | 366 | 365 | 363 |\n", - "+----------+-----+-----+-----+\n", - "==================================================\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from tabulate import tabulate\n", - "\n", - "# Load dataset\n", - "input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv'\n", - "df = pd.read_csv(input_path)\n", - "\n", - "# Compute dataset statistics\n", - "total_samples = len(df)\n", - "polarity_counts = df[\"Polarity\"].value_counts().sort_index()\n", - "platform_counts = df[\"Platform\"].value_counts().sort_index()\n", - "\n", - "# Compute Polarity distribution within each Platform\n", - "platform_polarity_counts = df.groupby([\"Platform\", \"Polarity\"]).size().unstack().fillna(0)\n", - "\n", - "# Print results with formatting\n", - "print(\"=\" * 50)\n", - "print(f\"📊 Dataset Information: cf-dataset.csv\")\n", - "print(\"=\" * 50)\n", - "print(f\"Total Samples: {total_samples}\\n\")\n", - "\n", - "# Polarity distribution\n", - "print(\"📌 Polarity Distribution:\")\n", - "print(tabulate(polarity_counts.reset_index(), headers=[\"Polarity\", \"Count\"], tablefmt=\"pretty\"))\n", - "print(\"\\n\")\n", - "\n", - "# Platform distribution\n", - "print(\"📌 Platform Distribution:\")\n", - "print(tabulate(platform_counts.reset_index(), headers=[\"Platform\", \"Count\"], tablefmt=\"pretty\"))\n", - "print(\"\\n\")\n", - "\n", - "# Platform-wise Polarity distribution\n", - "print(\"📌 Platform-wise Polarity Distribution:\")\n", - "print(tabulate(platform_polarity_counts, headers=\"keys\", tablefmt=\"pretty\"))\n", - "print(\"=\" * 50)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9kxjNt7QqLlh" - }, - "source": [ - "# Sentiment Classification" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "0AiOM3jWKL4P", - "outputId": "bbb7a3d9-e9d7-4378-b8f8-ab68ed4670ec" - }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'NVIDIA L4'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def seed_torch(seed):\n", - " random.seed(seed)\n", - " np.random.seed(seed)\n", - 
" torch.manual_seed(seed)\n", - " torch.cuda.manual_seed(seed)\n", - " torch.backends.cudnn.deterministic=True\n", - "\n", - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "n_gpu = torch.cuda.device_count()\n", - "torch.cuda.get_device_name(0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gQGXycAgbchc" - }, - "outputs": [], - "source": [ - "# Train\n", - "MAX_LEN = 256\n", - "BATCH_SIZE = 16\n", - "LEARNING_RATE = 2e-5\n", - "EPOCHS = 4\n", - "WEIGHT_DECAY = 0.01\n", - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "\n", - "\n", - "MODELS = [\n", - " (BertForSequenceClassification, BertTokenizer, 'bert-base-cased'),\n", - " (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased'),\n", - " (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'),\n", - " (AlbertForSequenceClassification, AlbertTokenizer, 'albert-base-v1')\n", - "]\n", - "MODEL_NAMES = ['bert', 'xlnet', 'roberta', 'albert']\n", - "\n", - "def train_model(train_df, model_save_path, model_select=0):\n", - " seed_torch(42)\n", - "\n", - " cur_model = MODELS[model_select]\n", - " m_name = MODEL_NAMES[model_select]\n", - "\n", - "\n", - " train_df['Polarity'] = train_df['Polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0})\n", - " tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)\n", - "\n", - " sentences = train_df.Text.values\n", - " labels = train_df.Polarity.values\n", - "\n", - " input_ids = []\n", - " attention_masks = []\n", - "\n", - " for sent in sentences:\n", - " encoded_dict = tokenizer.encode_plus(\n", - " str(sent),\n", - " add_special_tokens=True,\n", - " max_length=MAX_LEN,\n", - " padding='max_length',\n", - " return_attention_mask=True,\n", - " return_tensors='pt',\n", - " truncation=True\n", - " )\n", - " input_ids.append(encoded_dict['input_ids'])\n", - " attention_masks.append(encoded_dict['attention_mask'])\n", - "\n", - " input_ids = torch.cat(input_ids, dim=0)\n", - " attention_masks = torch.cat(attention_masks, dim=0)\n", - " labels = torch.tensor(labels)\n", - "\n", - " print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}')\n", - "\n", - "\n", - " train_inputs, val_inputs, train_labels, val_labels = train_test_split(\n", - " input_ids, labels, test_size=0.1, random_state=42)\n", - " train_masks, val_masks, _, _ = train_test_split(\n", - " attention_masks, labels, test_size=0.1, random_state=42)\n", - "\n", - "\n", - " train_data = TensorDataset(train_inputs, train_masks, train_labels)\n", - " train_sampler = RandomSampler(train_data)\n", - " train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)\n", - "\n", - " val_data = TensorDataset(val_inputs, val_masks, val_labels)\n", - " val_sampler = SequentialSampler(val_data)\n", - " val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)\n", - "\n", - "\n", - " model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)\n", - " model.to(device)\n", - "\n", - "\n", - " optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n", - "\n", - "\n", - " num_training_steps = EPOCHS * len(train_dataloader)\n", - " lr_scheduler = get_scheduler(\n", - " name=\"linear\", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps\n", - " )\n", - "\n", - "\n", - " print(\"Starting training...\")\n", - " best_f1 = 0\n", - " for epoch in range(EPOCHS):\n", - " 
model.train()\n", - " total_loss = 0\n", - " predictions, true_labels = [], []\n", - "\n", - " for batch in train_dataloader:\n", - " b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]\n", - " optimizer.zero_grad()\n", - " outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)\n", - " loss, logits = outputs[:2]\n", - " loss.backward()\n", - " optimizer.step()\n", - " lr_scheduler.step()\n", - "\n", - " total_loss += loss.item()\n", - " predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())\n", - " true_labels.extend(b_labels.cpu().numpy())\n", - "\n", - " train_acc = accuracy_score(true_labels, predictions)\n", - " print(f\"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}\")\n", - "\n", - "\n", - " model.eval()\n", - " val_predictions, val_labels = [], []\n", - " with torch.no_grad():\n", - " for batch in val_dataloader:\n", - " b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]\n", - " outputs = model(b_input_ids, attention_mask=b_input_mask)\n", - " logits = outputs[0]\n", - " val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())\n", - " val_labels.extend(b_labels.cpu().numpy())\n", - "\n", - " val_acc = accuracy_score(val_labels, val_predictions)\n", - " val_f1 = f1_score(val_labels, val_predictions, average='weighted')\n", - " print(f\"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}\")\n", - "\n", - "\n", - " if val_f1 > best_f1:\n", - " best_f1 = val_f1\n", - " torch.save(model.state_dict(), model_save_path)\n", - " print(f\"Best model saved at {model_save_path}\")\n", - "\n", - "\n", - " print(\"Final Model Performance on Validation Set:\")\n", - " print(classification_report(val_labels, val_predictions, digits=4))\n", - " return model_save_path\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Sf7jGxHYhNIb" - }, - "outputs": [], - "source": [ - "\n", - "def test_model(test_df, model_saved_path, model_select=0):\n", - "\n", - " MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),\n", - " (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),\n", - " (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'),\n", - " (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1')\n", - " ]\n", - " MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", - " seed_torch(42)\n", - "\n", - " cur_model=MODELS[model_select]\n", - " m_name=MODEL_NAMES[model_select]\n", - "\n", - " tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)\n", - "\n", - " begin=time.time()\n", - "\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - " 'positive':1,\n", - " 'negative':2,\n", - " 'neutral':0})\n", - "\n", - "\n", - " sentences = test_df.Text.values\n", - " labels = test_df.Polarity.values\n", - "\n", - " input_ids = []\n", - " attention_masks = []\n", - "\n", - " for sent in sentences:\n", - " encoded_dict = tokenizer.encode_plus(\n", - " str(sent),\n", - " add_special_tokens = True,\n", - " max_length = MAX_LEN,\n", - " pad_to_max_length = True,\n", - " return_attention_mask = True,\n", - " return_tensors = 'pt',\n", - " )\n", - "\n", - " input_ids.append(encoded_dict['input_ids'])\n", - " attention_masks.append(encoded_dict['attention_mask'])\n", - "\n", - " prediction_inputs = torch.cat(input_ids,dim=0)\n", - " prediction_masks = torch.cat(attention_masks,dim=0)\n", - " prediction_labels = torch.tensor(labels)\n", - "\n", - " prediction_data = 
TensorDataset(prediction_inputs, prediction_masks, prediction_labels)\n",
- " prediction_sampler = SequentialSampler(prediction_data)\n",
- " prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)\n",
- "\n",
- " model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)\n",
- " model.load_state_dict(torch.load(model_saved_path))\n",
- " model.to(device)\n",
- " model.eval()\n",
- "\n",
- " predictions,true_labels=[],[]\n",
- "\n",
- " for batch in prediction_dataloader:\n",
- " batch = tuple(t.to(device) for t in batch)\n",
- " b_input_ids, b_input_mask, b_labels = batch\n",
- "\n",
- " with torch.no_grad():\n",
- " outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)\n",
- " logits = outputs[0]\n",
- "\n",
- " logits = logits.detach().cpu().numpy()\n",
- " label_ids = b_labels.to('cpu').numpy()\n",
- "\n",
- " predictions.append(logits)\n",
- " true_labels.append(label_ids)\n",
- "\n",
- " end=time.time()\n",
- " print('Prediction used {:.2f} seconds'.format(end - begin))\n",
- "\n",
- " flat_predictions = [item for sublist in predictions for item in sublist]\n",
- " flat_predictions = np.argmax(flat_predictions, axis=1).flatten()\n",
- " flat_true_labels = [item for sublist in true_labels for item in sublist]\n",
- "\n",
- " print(\"Accuracy of {} is: {}\".format(m_name, accuracy_score(flat_true_labels,flat_predictions)))\n",
- "\n",
- " print(classification_report(flat_true_labels,flat_predictions))\n",
- "\n",
- "\n",
- " df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity'])\n",
- "\n",
- " # Reset the shuffled split index so predictions align row-for-row with test_df;\n",
- " # otherwise pd.concat aligns on the old index and produces NaN/float predictions\n",
- " df_combined = pd.concat([test_df.reset_index(drop=True), df_prediction], axis=1)\n",
- "\n",
- " counts = df_combined['prediction_Polarity'].value_counts()\n",
- " print(counts)\n",
- "\n",
- " return df_combined"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "dw2VuMkVAfec"
- },
- "source": [
- "## Train"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "IyDoiOFoMP3W"
- },
- "source": [
- "### Dataset Preparation and Splitting\n",
- "\n",
- "In this section, we prepare the datasets for training and testing.\n",
- "\n",
- "- **`crossplatform_sf_dataset_tokenized.csv`**: This is the main dataset used in this study.\n",
- "- **`so-dataset_tokenized.csv`**: This dataset originates from the research paper *Sentiment Polarity Detection for Software Development*.\n",
- "- **`gh-dataset_tokenized.csv`**: This dataset is derived from the research paper *GitHub Golden Rule* (*Can We Use SE-specific Sentiment Analysis Tools in a Cross-Platform Setting?*).\n",
- "\n",
- "### Steps:\n",
- "\n",
- "1. **Load Datasets** \n",
- " We read the three datasets into Pandas DataFrames.\n",
- "\n",
- "2. **Split into Training and Testing Sets** \n",
- " - The **GitHub dataset (`df_gh`)** and **Stack Overflow dataset (`df_so`)** are each split into 70% training and 30% testing subsets. \n",
- " - Similarly, the **cross-platform dataset (`df_crossplatform`)** is divided into a 70% training set and a 30% testing set. \n",
- " - The splitting is performed using `train_test_split` with a `random_state` of 42 for reproducibility (an unstratified split; see the sketch following this cell). \n",
- "\n",
- "3. **Save Processed Data** \n",
- " - The training and testing subsets are saved as CSV files for further use.\n",
- "\n"
- ]
- },
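The splits described above are random but unstratified, so class proportions can drift in small subsets; the SO validation support of 27/4/1 further down is what that drift looks like. A variant sketch, not what this notebook runs, that preserves the `Polarity` distribution via `stratify`:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Illustrative frame; in the notebook this would be df_so, df_gh, or df_crossplatform
df = pd.DataFrame({
    "Text": [f"comment {i}" for i in range(10)],
    "Polarity": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2],
})

# stratify keeps the train/test label ratios close to the full dataset's
train_part, test_part = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df["Polarity"]
)
print(train_part["Polarity"].value_counts().to_dict())
print(test_part["Polarity"].value_counts().to_dict())
```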
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "fo57JVQT9GIq"
- },
- "outputs": [],
- "source": [
- "\n",
- "# Read datasets\n",
- "input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset_tokenized.csv'\n",
- "so_input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset_tokenized.csv'\n",
- "gh_input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset_tokenized.csv'\n",
- "\n",
- "# Load datasets into Pandas DataFrames\n",
- "df_crossplatform = pd.read_csv(input_path)\n",
- "df_so = pd.read_csv(so_input_path)\n",
- "df_gh = pd.read_csv(gh_input_path)\n",
- "\n",
- "# Split `df_crossplatform` into training (70%) and testing (30%) sets\n",
- "train_df, test_df = train_test_split(df_crossplatform, test_size=0.3, random_state=42)\n",
- "\n",
- "# Split GitHub and Stack Overflow datasets into training and testing sets (70% train, 30% test)\n",
- "train_gh, test_gh = train_test_split(df_gh, test_size=0.3, random_state=42)\n",
- "train_so, test_so = train_test_split(df_so, test_size=0.3, random_state=42)\n",
- "\n",
- "# Save all datasets to CSV files for further use\n",
- "\n",
- "train_df.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_df.csv', index=False)\n",
- "test_df.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_df.csv', index=False)\n",
- "\n",
- "train_gh.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_gh.csv', index=False)\n",
- "train_so.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_so.csv', index=False)\n",
- "test_gh.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_gh.csv', index=False)\n",
- "test_so.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_so.csv', index=False)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "RogFCndjOGet"
- },
- "source": [
- "### Train on the Cross-Platform Dataset\n",
- "We combine three training datasets (`train_df` (ours), `train_gh`, and `train_so`) into a final dataset for training four different models.\n",
- "\n",
- "## Training Models \n",
- "The following models are trained: \n",
- "- **BERT** \n",
- "- **XLNet** \n",
- "- **RoBERTa** \n",
- "- **ALBERT** \n",
- "\n",
- "## Training Parameters \n",
- "- **MAX_LEN**: `256` \n",
- "- **BATCH_SIZE**: `16` \n",
- "- **LEARNING_RATE**: `2e-5` \n",
- "- **EPOCHS**: `4` \n",
- "\n",
- "Each model is trained using the merged dataset and saved for further evaluation.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "slhvZg0Ow3ae",
- "outputId": "a950593a-2b5c-49a1-823b-40845fbb77cb"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Training bert model...\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": 
"stream", - "text": [ - "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n", - "Starting training...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1: Train Loss: 0.5891, Accuracy: 0.7577\n", - "Validation Accuracy: 0.8673, F1 Score: 0.8675\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", - "Epoch 2: Train Loss: 0.2148, Accuracy: 0.9312\n", - "Validation Accuracy: 0.8771, F1 Score: 0.8772\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", - "Epoch 3: Train Loss: 0.1029, Accuracy: 0.9683\n", - "Validation Accuracy: 0.8845, F1 Score: 0.8841\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", - "Epoch 4: Train Loss: 0.0526, Accuracy: 0.9869\n", - "Validation Accuracy: 0.9017, F1 Score: 0.9015\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", - "Final Model Performance on Validation Set:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.9294 0.8587 0.8927 184\n", - " 1 0.8739 0.9369 0.9043 111\n", - " 2 0.8898 0.9375 0.9130 112\n", - "\n", - " accuracy 0.9017 407\n", - " macro avg 0.8977 0.9110 0.9033 407\n", - "weighted avg 0.9034 0.9017 0.9015 407\n", - "\n", - "Training xlnet model...\n", - "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting training...\n", - "Epoch 1: Train Loss: 0.6808, Accuracy: 0.6927\n", - "Validation Accuracy: 0.8059, F1 Score: 0.8043\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", - "Epoch 2: Train Loss: 0.3285, Accuracy: 0.8839\n", - "Validation Accuracy: 0.8280, F1 Score: 0.8275\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", - "Epoch 3: Train Loss: 0.1917, Accuracy: 0.9355\n", - "Validation Accuracy: 0.8550, F1 Score: 0.8547\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", - "Epoch 4: Train Loss: 0.1199, Accuracy: 0.9604\n", - "Validation Accuracy: 0.8575, F1 Score: 0.8573\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", - "Final Model Performance on Validation Set:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.9241 0.7935 0.8538 184\n", - " 1 0.8306 0.9279 0.8766 111\n", - " 2 0.8000 0.8929 0.8439 112\n", - "\n", - " accuracy 0.8575 407\n", - " macro avg 0.8516 0.8714 0.8581 407\n", - "weighted avg 0.8644 0.8575 0.8573 407\n", - "\n", - "Training Roberta model...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n", - "Starting training...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1: Train Loss: 0.6244, Accuracy: 0.7228\n", - "Validation Accuracy: 0.8698, F1 Score: 0.8697\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", - "Epoch 2: Train Loss: 0.2585, Accuracy: 0.9123\n", - "Validation Accuracy: 0.8747, F1 Score: 0.8745\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", - "Epoch 3: Train Loss: 0.1612, Accuracy: 0.9506\n", - "Validation Accuracy: 0.8968, F1 Score: 0.8968\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", - "Epoch 4: Train Loss: 0.0974, Accuracy: 0.9727\n", - "Validation Accuracy: 0.8968, F1 Score: 0.8969\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", - "Final Model Performance on Validation Set:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.9302 0.8696 0.8989 184\n", - " 1 0.8803 0.9279 0.9035 111\n", - " 2 0.8644 0.9107 0.8870 112\n", - "\n", - " accuracy 0.8968 407\n", - " macro avg 0.8917 0.9027 0.8964 407\n", - "weighted avg 0.8985 0.8968 0.8969 407\n", - "\n", - "Training albert model...\n", - "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
- " warnings.warn(\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Starting training...\n",
- "Epoch 1: Train Loss: 0.6810, Accuracy: 0.7110\n",
- "Validation Accuracy: 0.8305, F1 Score: 0.8332\n",
- "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/albert_model\n",
- "Epoch 2: Train Loss: 0.3244, Accuracy: 0.8913\n",
- "Validation Accuracy: 0.8550, F1 Score: 0.8550\n",
- "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/albert_model\n",
- "Epoch 3: Train Loss: 0.2071, Accuracy: 0.9306\n",
- "Validation Accuracy: 0.8550, F1 Score: 0.8548\n",
- "Epoch 4: Train Loss: 0.1405, Accuracy: 0.9566\n",
- "Validation Accuracy: 0.8526, F1 Score: 0.8524\n",
- "Final Model Performance on Validation Set:\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.8715 0.8478 0.8595 184\n",
- " 1 0.8547 0.9009 0.8772 111\n",
- " 2 0.8198 0.8125 0.8161 112\n",
- "\n",
- " accuracy 0.8526 407\n",
- " macro avg 0.8487 0.8537 0.8509 407\n",
- "weighted avg 0.8527 0.8526 0.8524 407\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Combine `train_df`, `train_gh`, and `train_so` into the final training dataset\n",
- "train_df_final = pd.concat([train_df, train_gh, train_so], axis=0, ignore_index=True)\n",
- "train_df_final.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_df_final.csv', index=False)\n",
- "\n",
- "# Define the list of model names to be trained\n",
- "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n",
- "\n",
- "# Train each model and save the trained model files\n",
- "for i, model_name in enumerate(MODEL_NAMES):\n",
- " model_save_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{model_name}_model\"\n",
- " print(f\"Training {model_name} model...\")\n",
- " train_model(train_df_final, model_save_path, model_select=i)\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "krgDqVN9x_my"
- },
- "source": [
- "### Train BERT on the Existing Datasets\n",
- "In this section, we train the **BERT** model using the existing **GitHub** and **Stack Overflow** datasets.\n",
- "\n",
- "## Training Data \n",
- "The training dataset consists of: \n",
- "- **GitHub Data** (`train_gh`) \n",
- "- **Stack Overflow Data** (`train_so`) \n",
- "\n",
- "## Training Parameters \n",
- "- **MAX_LEN**: `256` \n",
- "- **BATCH_SIZE**: `16` \n",
- "- **LEARNING_RATE**: `2e-5` \n",
- "- **EPOCHS**: `4` \n",
- "\n",
- "The trained **BERT** model will be saved for further evaluation."
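Label balance matters here: the SO validation report below shows support of 27/4/1 and an F1 frozen at 0.7722 across all four epochs, the signature of a model that only predicts the majority (neutral) class. A quick inspection sketch, assuming the split CSVs written by the dataset-preparation cell earlier:

```python
import pandas as pd

BASE = "/content/drive/MyDrive/Software_Development_Sentiment_Classification"

# These files were saved by the splitting cell above
for name in ("train_so", "train_gh"):
    df = pd.read_csv(f"{BASE}/{name}.csv")
    # Per-class counts expose skew toward one label before any training happens
    print(name, df["Polarity"].value_counts().sort_index().to_dict())
```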
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "sxHJGW6Lbgnt", - "outputId": "8d60c4d7-c1b2-4dee-dc01-a7069146a577" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training bert model on SO dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training data shape: torch.Size([315, 256]), torch.Size([315, 256]), torch.Size([315])\n", - "Starting training...\n", - "Epoch 1: Train Loss: 0.7896, Accuracy: 0.7032\n", - "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/SO_bert_model\n", - "Epoch 2: Train Loss: 0.5530, Accuracy: 0.8163\n", - "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", - "Epoch 3: Train Loss: 0.4831, Accuracy: 0.8163\n", - "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", - "Epoch 4: Train Loss: 0.4383, Accuracy: 0.8163\n", - "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", - "Final Model Performance on Validation Set:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.8438 1.0000 0.9153 27\n", - " 1 0.0000 0.0000 0.0000 4\n", - " 2 0.0000 0.0000 0.0000 1\n", - "\n", - " accuracy 0.8438 32\n", - " macro avg 0.2812 0.3333 0.3051 32\n", - "weighted avg 0.7119 0.8438 0.7722 32\n", - "\n", - "Training bert model on GH dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training data shape: torch.Size([1495, 256]), torch.Size([1495, 256]), torch.Size([1495])\n", - "Starting training...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1: Train Loss: 0.8448, Accuracy: 0.6141\n", - "Validation Accuracy: 0.7933, F1 Score: 0.7983\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", - "Epoch 2: Train Loss: 0.3207, Accuracy: 0.8900\n", - "Validation Accuracy: 0.8733, F1 Score: 0.8741\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", - "Epoch 3: Train Loss: 0.1529, Accuracy: 0.9532\n", - "Validation Accuracy: 0.8867, F1 Score: 0.8851\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", - "Epoch 4: Train Loss: 0.0991, Accuracy: 0.9762\n", - "Validation Accuracy: 0.8800, F1 Score: 0.8783\n", - "Final Model Performance on Validation Set:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.8868 0.8246 0.8545 57\n", - " 1 0.8846 1.0000 0.9388 46\n", - " 2 0.8667 0.8298 0.8478 47\n", - "\n", - " accuracy 0.8800 150\n", - " macro avg 0.8794 0.8848 0.8804 150\n", - "weighted avg 0.8798 0.8800 0.8783 150\n", - "\n" - ] - } - ], - "source": [ - "# Train the model for github-golden-rule and stackoverflow on Bert\n", - "\n", - "MODEL_NAMES = ['bert']\n", - "for dataset_name, train_df in [('SO', train_so), ('GH', train_gh)]:\n", - " for i, model_name in enumerate(MODEL_NAMES):\n", - " model_save_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{dataset_name}_{model_name}_model\"\n", - " print(f\"Training {model_name} model on {dataset_name} dataset...\")\n", - " train_model(train_df, model_save_path, model_select=i)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ugVi3T4wAkSQ" - }, - "source": [ - "## Test" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XMgVVJ9U9XBa" - }, - "source": [ - "### Test df_crossplatform on 4 models and 3 platforms (Table 3.2)\n", - "In this section, we evaluate the four trained **cross-platform sentiment classification models** on our **cross-platform sentiment dataset**.\n", - "\n", - "## Evaluation Metrics \n", - "We will assess: \n", - "1. **Overall model performance** across all platforms. \n", - "2. **Platform-specific performance** for each model on: \n", - " - **GitHub** \n", - " - **Jira** \n", - " - **Mailbox** \n", - "\n", - "## Results \n", - "The evaluation will print: \n", - "- **Overall accuracy** of each model. 
\n", - "- **Performance breakdown per platform** for each model. \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "RSJI327nEKAz", - "outputId": "264447bc-b509-4e8a-e5ce-49cbed8a44a4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluating bert model...for overall platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 9.38 seconds\n", - "Accuracy of bert is: 0.9422084623323014\n", - " precision recall f1-score support\n", - "\n", - " 0 0.95 0.91 0.93 329\n", - " 1 0.94 0.97 0.95 318\n", - " 2 0.94 0.95 0.94 322\n", - "\n", - " accuracy 0.94 969\n", - " macro avg 0.94 0.94 0.94 969\n", - "weighted avg 0.94 0.94 0.94 969\n", - "\n", - "prediction_Polarity\n", - "2 327\n", - "1 326\n", - "0 316\n", - "Name: count, dtype: int64\n", - "Evaluating bert model...for GitHub platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 3.93 seconds\n", - "Accuracy of bert is: 0.956140350877193\n", - " precision recall f1-score support\n", - "\n", - " 0 0.96 0.94 0.95 128\n", - " 1 0.97 0.96 0.96 122\n", - " 2 0.94 0.98 0.96 92\n", - "\n", - " accuracy 0.96 342\n", - " macro avg 0.95 0.96 0.96 342\n", - "weighted avg 0.96 0.96 0.96 342\n", - "\n", - "prediction_Polarity\n", - "0.0 125\n", - "1.0 121\n", - "2.0 96\n", - "Name: count, dtype: int64\n", - "Evaluating bert model...for Jira platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 4.05 seconds\n", - "Accuracy of bert is: 0.926829268292683\n", - " precision recall f1-score support\n", - "\n", - " 0 0.94 0.88 0.91 93\n", - " 1 0.92 0.94 0.93 86\n", - " 2 0.92 0.95 0.94 108\n", - "\n", - " accuracy 0.93 287\n", - " macro avg 0.93 0.93 0.93 287\n", - "weighted avg 0.93 0.93 0.93 287\n", - "\n", - "prediction_Polarity\n", - "2.0 112\n", - "1.0 88\n", - "0.0 87\n", - "Name: count, dtype: int64\n", - "Evaluating bert model...for Mailbox platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 5.19 seconds\n", - "Accuracy of bert is: 0.9411764705882353\n", - " precision recall f1-score support\n", - "\n", - " 0 0.94 0.91 0.92 108\n", - " 1 0.93 0.99 0.96 110\n", - " 2 0.95 0.93 0.94 122\n", - "\n", - " accuracy 0.94 340\n", - " macro avg 0.94 0.94 0.94 340\n", - "weighted avg 0.94 0.94 0.94 340\n", - "\n", - "prediction_Polarity\n", - "2.0 119\n", - "1.0 117\n", - "0.0 104\n", - "Name: count, dtype: int64\n", - "Evaluating xlnet model...for overall platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 14.16 seconds\n", - "Accuracy of xlnet is: 0.8823529411764706\n", - " precision recall f1-score support\n", - "\n", - " 0 0.89 0.82 0.85 329\n", - " 1 0.88 0.95 0.91 318\n", - " 2 0.88 0.88 0.88 322\n", - "\n", - " accuracy 0.88 969\n", - " macro avg 0.88 0.88 0.88 969\n", - "weighted avg 0.88 0.88 0.88 969\n", - "\n", - "prediction_Polarity\n", - "1 343\n", - "2 320\n", - "0 306\n", - "Name: count, dtype: int64\n", - "Evaluating xlnet model...for GitHub platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 7.19 seconds\n", - "Accuracy of xlnet is: 0.9005847953216374\n", - " precision recall f1-score support\n", - "\n", - " 0 0.90 0.85 0.88 128\n", - " 1 0.90 0.96 0.93 122\n", - " 2 0.90 0.89 0.90 92\n", - "\n", - " accuracy 0.90 342\n", - " macro avg 0.90 0.90 0.90 342\n", - "weighted avg 0.90 0.90 0.90 342\n", - "\n", - "prediction_Polarity\n", - "1.0 130\n", - "0.0 121\n", - "2.0 91\n", - "Name: count, dtype: int64\n", - "Evaluating xlnet model...for Jira platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 5.72 seconds\n", - "Accuracy of xlnet is: 0.9024390243902439\n", - " precision recall f1-score support\n", - "\n", - " 0 0.91 0.81 0.86 93\n", - " 1 0.91 0.95 0.93 86\n", - " 2 0.89 0.94 0.91 108\n", - "\n", - " accuracy 0.90 287\n", - " macro avg 0.90 0.90 0.90 287\n", - "weighted avg 0.90 0.90 0.90 287\n", - "\n", - "prediction_Polarity\n", - "2.0 115\n", - "1.0 90\n", - "0.0 82\n", - "Name: count, dtype: int64\n", - "Evaluating xlnet model...for Mailbox platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 6.06 seconds\n", - "Accuracy of xlnet is: 0.8470588235294118\n", - " precision recall f1-score support\n", - "\n", - " 0 0.84 0.81 0.82 108\n", - " 1 0.83 0.93 0.88 110\n", - " 2 0.87 0.81 0.84 122\n", - "\n", - " accuracy 0.85 340\n", - " macro avg 0.85 0.85 0.85 340\n", - "weighted avg 0.85 0.85 0.85 340\n", - "\n", - "prediction_Polarity\n", - "1.0 123\n", - "2.0 114\n", - "0.0 103\n", - "Name: count, dtype: int64\n", - "Evaluating Roberta model...for overall platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 9.61 seconds\n", - "Accuracy of Roberta is: 0.9060887512899897\n", - " precision recall f1-score support\n", - "\n", - " 0 0.93 0.83 0.87 329\n", - " 1 0.91 0.96 0.93 318\n", - " 2 0.88 0.94 0.91 322\n", - "\n", - " accuracy 0.91 969\n", - " macro avg 0.91 0.91 0.91 969\n", - "weighted avg 0.91 0.91 0.91 969\n", - "\n", - "prediction_Polarity\n", - "2 342\n", - "1 333\n", - "0 294\n", - "Name: count, dtype: int64\n", - "Evaluating Roberta model...for GitHub platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 4.66 seconds\n", - "Accuracy of Roberta is: 0.9298245614035088\n", - " precision recall f1-score support\n", - "\n", - " 0 0.93 0.88 0.90 128\n", - " 1 0.95 0.98 0.96 122\n", - " 2 0.91 0.93 0.92 92\n", - "\n", - " accuracy 0.93 342\n", - " macro avg 0.93 0.93 0.93 342\n", - "weighted avg 0.93 0.93 0.93 342\n", - "\n", - "prediction_Polarity\n", - "1.0 125\n", - "0.0 122\n", - "2.0 95\n", - "Name: count, dtype: int64\n", - "Evaluating Roberta model...for Jira platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 4.47 seconds\n", - "Accuracy of Roberta is: 0.9163763066202091\n", - " precision recall f1-score support\n", - "\n", - " 0 0.95 0.81 0.87 93\n", - " 1 0.91 0.95 0.93 86\n", - " 2 0.90 0.98 0.94 108\n", - "\n", - " accuracy 0.92 287\n", - " macro avg 0.92 0.91 0.91 287\n", - "weighted avg 0.92 0.92 0.91 287\n", - "\n", - "prediction_Polarity\n", - "2.0 118\n", - "1.0 90\n", - "0.0 79\n", - "Name: count, dtype: int64\n", - "Evaluating Roberta model...for Mailbox platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 5.03 seconds\n", - "Accuracy of Roberta is: 0.8735294117647059\n", - " precision recall f1-score support\n", - "\n", - " 0 0.90 0.78 0.84 108\n", - " 1 0.87 0.94 0.90 110\n", - " 2 0.85 0.90 0.88 122\n", - "\n", - " accuracy 0.87 340\n", - " macro avg 0.88 0.87 0.87 340\n", - "weighted avg 0.88 0.87 0.87 340\n", - "\n", - "prediction_Polarity\n", - "2.0 129\n", - "1.0 118\n", - "0.0 93\n", - "Name: count, dtype: int64\n", - "Evaluating albert model...for overall platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 8.62 seconds\n", - "Accuracy of albert is: 0.8844169246646026\n", - " precision recall f1-score support\n", - "\n", - " 0 0.90 0.79 0.84 329\n", - " 1 0.91 0.95 0.93 318\n", - " 2 0.85 0.92 0.88 322\n", - "\n", - " accuracy 0.88 969\n", - " macro avg 0.89 0.89 0.88 969\n", - "weighted avg 0.89 0.88 0.88 969\n", - "\n", - "prediction_Polarity\n", - "2 350\n", - "1 331\n", - "0 288\n", - "Name: count, dtype: int64\n", - "Evaluating albert model...for GitHub platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 3.99 seconds\n", - "Accuracy of albert is: 0.9269005847953217\n", - " precision recall f1-score support\n", - "\n", - " 0 0.93 0.88 0.90 128\n", - " 1 0.97 0.96 0.96 122\n", - " 2 0.87 0.96 0.91 92\n", - "\n", - " accuracy 0.93 342\n", - " macro avg 0.92 0.93 0.93 342\n", - "weighted avg 0.93 0.93 0.93 342\n", - "\n", - "prediction_Polarity\n", - "1.0 121\n", - "0.0 120\n", - "2.0 101\n", - "Name: count, dtype: int64\n", - "Evaluating albert model...for Jira platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 3.30 seconds\n", - "Accuracy of albert is: 0.89198606271777\n", - " precision recall f1-score support\n", - "\n", - " 0 0.95 0.75 0.84 93\n", - " 1 0.91 0.94 0.93 86\n", - " 2 0.85 0.97 0.91 108\n", - "\n", - " accuracy 0.89 287\n", - " macro avg 0.90 0.89 0.89 287\n", - "weighted avg 0.90 0.89 0.89 287\n", - "\n", - "prediction_Polarity\n", - "2.0 124\n", - "1.0 89\n", - "0.0 74\n", - "Name: count, dtype: int64\n", - "Evaluating albert model...for Mailbox platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 4.12 seconds\n", - "Accuracy of albert is: 0.8352941176470589\n", - " precision recall f1-score support\n", - "\n", - " 0 0.83 0.72 0.77 108\n", - " 1 0.85 0.94 0.89 110\n", - " 2 0.82 0.84 0.83 122\n", - "\n", - " accuracy 0.84 340\n", - " macro avg 0.84 0.83 0.83 340\n", - "weighted avg 0.83 0.84 0.83 340\n", - "\n", - "prediction_Polarity\n", - "2.0 125\n", - "1.0 121\n", - "0.0 94\n", - "Name: count, dtype: int64\n" - ] - } - ], - "source": [ - "\n", - "# Load test dataset\n", - "test_df = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_df.csv')\n", - "\n", - "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", - "model_results = {}\n", - "\n", - "# Define platform mapping\n", - "platforms = {0: \"GitHub\", 1: \"Jira\", 2: \"Mailbox\"}\n", - "\n", - "# Evaluate each model\n", - "for i, model_name in enumerate(MODEL_NAMES):\n", - " model_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{model_name}_model\"\n", - " print(f\"Evaluating {model_name} model...for overall platform\")\n", - "\n", - " # Get overall accuracy\n", - " overall_accuracy = test_model(test_df, model_path, model_select=i)\n", - "\n", - " # Evaluate accuracy per platform\n", - " platform_accuracies = {}\n", - " for platform_id, platform_name in platforms.items():\n", - " test_df_platform = test_df[test_df[\"Platform\"] == platform_id]\n", - " if not test_df_platform.empty:\n", - " print(f\"Evaluating {model_name} model...for {platform_name} platform\")\n", - " accuracy = test_model(test_df_platform, model_path, model_select=i)\n", - " platform_accuracies[platform_name] = accuracy\n", - " else:\n", - " platform_accuracies[platform_name] = \"No data\"\n", - "\n", - " # Record overall and per-platform accuracy for this model\n", - " model_results[model_name] = {\"Overall\": overall_accuracy, **platform_accuracies}\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2rTxz8aIRkYQ" - }, - "source": [ - "### Generalization Performance of the Model (Table 3.3)\n", - "In this section, we evaluate the generalization performance of the **Bert-CP** model on two existing datasets:\n", - "- **GitHub Golden Rule Dataset**\n", - "- **Stack Overflow Dataset**\n", - "\n", - "We also compare BERT models trained only on the GitHub Golden Rule dataset or only on the Stack Overflow dataset, focusing on how each performs on the other platform. This comparison tests whether the cross-platform model generalizes better than dataset-specific models.\n", - "\n", - "#### Evaluation Process\n", - "- **Bert-CP model evaluation**: test the Bert-CP model on the GitHub Golden Rule and Stack Overflow test sets.\n", - "- **Cross-platform comparison**: evaluate the GitHub-only and Stack Overflow-only BERT models on each other's test sets.\n", - "\n", - "#### Goals\n", - "- Assess how well the Bert-CP model generalizes across datasets.\n", - "- Determine whether the cross-platform model outperforms dataset-specific models.\n",
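- "\n", - "Once the evaluation cells in this notebook have populated `model_results`, the accuracies can be tabulated into Table 3.3. A minimal sketch (it assumes the `model_results` dictionary of per-dataset accuracies built above; `pandas` is already imported as `pd`):\n", - "\n", - "```python\n", - "# Rows are models, columns are datasets/platforms; missing entries appear as NaN.\n", - "summary = pd.DataFrame(model_results).T\n", - "print(summary.round(3))\n", - "```"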
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ekatnWlSAnyd", - "outputId": "7ff88fb0-109c-43b0-97ae-36a792730275" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluating bert_model on GitHub test dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 7.34 seconds\n", - "Accuracy of bert is: 0.8829953198127926\n", - " precision recall f1-score support\n", - "\n", - " 0 0.91 0.86 0.88 267\n", - " 1 0.89 0.92 0.91 170\n", - " 2 0.85 0.88 0.86 204\n", - "\n", - " accuracy 0.88 641\n", - " macro avg 0.88 0.89 0.88 641\n", - "weighted avg 0.88 0.88 0.88 641\n", - "\n", - "prediction_Polarity\n", - "0 252\n", - "2 213\n", - "1 176\n", - "Name: count, dtype: int64\n", - "Evaluating bert_model on Stack Overflow test dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. 
If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 4.10 seconds\n", - "Accuracy of bert is: 0.8814814814814815\n", - " precision recall f1-score support\n", - "\n", - " 0 0.93 0.93 0.93 110\n", - " 1 0.44 0.36 0.40 11\n", - " 2 0.81 0.93 0.87 14\n", - "\n", - " accuracy 0.88 135\n", - " macro avg 0.73 0.74 0.73 135\n", - "weighted avg 0.88 0.88 0.88 135\n", - "\n", - "prediction_Polarity\n", - "0 110\n", - "2 16\n", - "1 9\n", - "Name: count, dtype: int64\n", - "Evaluating GH_bert_model on Stack Overflow test dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 
512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 2.21 seconds\n", - "Accuracy of bert is: 0.8148148148148148\n", - " precision recall f1-score support\n", - "\n", - " 0 0.85 0.95 0.89 110\n", - " 1 0.50 0.27 0.35 11\n", - " 2 0.50 0.21 0.30 14\n", - "\n", - " accuracy 0.81 135\n", - " macro avg 0.62 0.48 0.52 135\n", - "weighted avg 0.78 0.81 0.79 135\n", - "\n", - "prediction_Polarity\n", - "0 123\n", - "2 6\n", - "1 6\n", - "Name: count, dtype: int64\n", - "Evaluating SO_bert_model on GitHub test dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). 
In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 6.62 seconds\n", - "Accuracy of bert is: 0.4165366614664587\n", - " precision recall f1-score support\n", - "\n", - " 0 0.42 1.00 0.59 267\n", - " 1 0.00 0.00 0.00 170\n", - " 2 0.00 0.00 0.00 204\n", - "\n", - " accuracy 0.42 641\n", - " macro avg 0.14 0.33 0.20 641\n", - "weighted avg 0.17 0.42 0.24 641\n", - "\n", - "prediction_Polarity\n", - "0 641\n", - "Name: count, dtype: int64\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n" - ] - } - ], - "source": [ - "\n", - "\n", - "# Load test datasets\n", - "test_gh = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_gh.csv')\n", - "test_so = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_so.csv')\n", - "\n", - "# Define model paths\n", - "bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\"\n", - "gh_bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\"\n", - "so_bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/SO_bert_model\"\n", - "\n", - "# Store results\n", - "model_results = {}\n", - "\n", - "# 1. Validate bert_model on test_gh and test_so\n", - "print(\"Evaluating bert_model on GitHub test dataset...\")\n", - "bert_on_gh = test_model(test_gh, bert_model_path, model_select=0)\n", - "\n", - "print(\"Evaluating bert_model on Stack Overflow test dataset...\")\n", - "bert_on_so = test_model(test_so, bert_model_path, model_select=0)\n", - "\n", - "model_results[\"bert_model\"] = {\n", - " \"test_gh Accuracy\": bert_on_gh,\n", - " \"test_so Accuracy\": bert_on_so\n", - "}\n", - "\n", - "# 2. 
Validate GH_bert_model on test_so\n", - "print(\"Evaluating GH_bert_model on Stack Overflow test dataset...\")\n", - "gh_bert_on_so = test_model(test_so, gh_bert_model_path, model_select=0)\n", - "\n", - "model_results[\"GH_bert_model\"] = {\n", - " \"test_so Accuracy\": gh_bert_on_so\n", - "}\n", - "\n", - "# 3. Validate SO_bert_model on test_gh\n", - "print(\"Evaluating SO_bert_model on GitHub test dataset...\")\n", - "so_bert_on_gh = test_model(test_gh, so_bert_model_path, model_select=0)\n", - "\n", - "model_results[\"SO_bert_model\"] = {\n", - " \"test_gh Accuracy\": so_bert_on_gh\n", - "}\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "L4", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/api/filter.py b/api/filter.py new file mode 100644 index 0000000..7ed57f6 --- /dev/null +++ b/api/filter.py @@ -0,0 +1,64 @@ +import sys +import re +from unicodedata import category +from bs4 import BeautifulSoup +from markdown import markdown + + +USERNAME_REGEX = r"(\s|^)@(\S*\s?)" + +# Generate Unicode punctuation set +punctuation = {chr(i) for i in range(sys.maxunicode + 1) if category(chr(i)).startswith(("P", "S"))} + +# Dictionary to count token replacements +counters = {} + +def remove_punctuation(text): + """ + Remove all punctuation characters from the given text. + + Args: + text (str): The input text. + + Returns: + str: The text without any punctuation. + """ + return "".join(char for char in text if char not in punctuation) + +def clean_text(text): + """ + Remove quoted text and large code blocks from GitHub issues or comments. + + This function performs the following clean-up: + - Removes quoted email/notification text from GitHub. + - Removes code blocks enclosed in triple backticks. + + Args: + text (str): The input text (typically from a GitHub issue or comment). + + Returns: + str: The cleaned text without quoted text or code blocks. + """ + # Remove quoted text from emails/notifications + text = re.sub(r"^(On[\s\S]*?notifications@github\.com\s*?wrote:\s*?)?(^(\>).*\s)*", '', text, flags=re.MULTILINE) + + # Remove code blocks enclosed in triple backticks + text = re.sub(r"```[a-z]*\n[\s\S]*?\n```", "", text) + + return text + +def remove_markdown_content(text): + """ + Converts Markdown content to plain text by removing all Markdown formatting. + + This function processes the input Markdown text and converts it to plain text + by removing all Markdown syntax. + + Args: + text (str): The input Markdown text. + + Returns: + str: Cleaned text without Markdown formatting. 
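+ + Example (illustrative; exact whitespace in the output depends on the HTML parser, here lxml): + >>> remove_markdown_content("# Title with **bold** text") + 'Title with bold text'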
+ """ + html = markdown(text) + return "".join(BeautifulSoup(html, "lxml").findAll(text=True)) \ No newline at end of file diff --git a/api/test.py b/api/test.py new file mode 100644 index 0000000..a2bfe29 --- /dev/null +++ b/api/test.py @@ -0,0 +1,121 @@ +import os +import re +import string +import random +import warnings +import argparse +import numpy as np +import pandas as pd +import torch +import time +import seaborn as sns +import matplotlib.pyplot as plt +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from io import StringIO +from unicodedata import category +from bs4 import BeautifulSoup +from markdown import markdown +from sklearn.model_selection import train_test_split +from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report +from torch.utils.data import DataLoader, RandomSampler, Dataset +from transformers import ( + BertTokenizer, BertForSequenceClassification, BertForMaskedLM, + XLNetTokenizer, XLNetForSequenceClassification, + RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM, + AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM, + get_scheduler +) +from torch.optim import AdamW +from api.train import * + +def test_model(test_df, model_saved_path, model_select=0): + + MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), + (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), + (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), + (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') + ] + MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] + seed_torch(42) + + cur_model=MODELS[model_select] + m_name=MODEL_NAMES[model_select] + + tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) + + begin=time.time() + + test_df['Polarity']=test_df['Polarity'].replace({ + 'positive':1, + 'negative':2, + 'neutral':0}) + + + sentences = test_df.Text.values + labels = test_df.Polarity.values + + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens = True, + max_length = MAX_LEN, + pad_to_max_length = True, + return_attention_mask = True, + return_tensors = 'pt', + ) + + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + prediction_inputs = torch.cat(input_ids,dim=0) + prediction_masks = torch.cat(attention_masks,dim=0) + prediction_labels = torch.tensor(labels) + + prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) + prediction_sampler = SequentialSampler(prediction_data) + prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) + + model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) + model.load_state_dict(torch.load(model_saved_path)) +# model.cuda() + model.eval() + + predictions,true_labels=[],[] + + for batch in prediction_dataloader: + batch = tuple(t.to(device) for t in batch) + b_input_ids, b_input_mask, b_labels = batch + + with torch.no_grad(): + outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) + logits = outputs[0] + + logits = logits.detach().cpu().numpy() + label_ids = b_labels.to('cpu').numpy() + + predictions.append(logits) + true_labels.append(label_ids) + + end=time.time() + print('Prediction used {:.2f} seconds'.format(end - begin)) + + flat_predictions = [item for sublist in predictions for item 
+    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
+    flat_true_labels = [item for sublist in true_labels for item in sublist]
+
+    print("Accuracy of {} is: {}".format(m_name, accuracy_score(flat_true_labels, flat_predictions)))
+
+    print(classification_report(flat_true_labels, flat_predictions))
+
+    df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity'])
+
+    # Reset the index so platform-filtered subsets still align with the predictions
+    df_combined = pd.concat([test_df.reset_index(drop=True), df_prediction], axis=1)
+
+    counts = df_combined['prediction_Polarity'].value_counts()
+    print(counts)
+
+    return df_combined
\ No newline at end of file
diff --git a/api/tokenizer.py b/api/tokenizer.py
new file mode 100644
index 0000000..c546182
--- /dev/null
+++ b/api/tokenizer.py
@@ -0,0 +1,102 @@
+import re
+import warnings
+
+# Regular expression for GitHub username mentions
+USERNAME_REGEX = r"(\s|^)@(\S*\s?)"
+
+# Dictionary to count token replacements
+counters = {}
+
+
+def replace_token(regex, token_name, text):
+    """
+    Replace matched patterns in the text with the specified token.
+
+    This function uses regular expressions to find occurrences of the pattern
+    and replaces them with a token name. The number of replacements made is counted.
+
+    Args:
+        regex (str): The regular expression pattern to match.
+        token_name (str): The replacement token name.
+        text (str): The input text.
+
+    Returns:
+        tuple: A tuple containing:
+            - str: The text with the tokens replacing the matches.
+            - int: The number of replacements made.
+    """
+    replaced_text, replacements = re.subn(regex, f" {token_name} ", text, flags=re.MULTILINE)
+    counters[token_name] = counters.get(token_name, 0) + replacements
+    return replaced_text, replacements
+
+def tokenize_text(text):
+    """
+    Tokenizes a given text by replacing specific elements such as emails, mentions, URLs, etc.
+
+    This function processes the input text and replaces various elements, such as:
+    - Email addresses (replaced with 'MEMAIL').
+    - GitHub mentions (replaced with 'MMENTION').
+    - Code blocks (replaced with 'MICODE').
+    - Version numbers (replaced with 'MVERSIONNUMBER').
+    - Issue mentions (replaced with 'MISSUEMENTION').
+    - URLs (replaced with 'MURL').
+
+    Args:
+        text (str): The input text.
+
+    Returns:
+        tuple: A tuple containing:
+            - str: The tokenized text.
+            - int: The total number of replacements made.
+    """
+    total_replacements = 0
+
+    text, replacements = replace_token(r"\S+@\S*\s?", "MEMAIL", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(USERNAME_REGEX, "MMENTION", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(r"`([^`]*)`", "MICODE", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(r"\b\d+\.\d+(\.\d+)*\b", "MVERSIONNUMBER", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(r"(\s|^)#\d+", "MISSUEMENTION", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(
+        r"([a-zA-Z0-9]+):\/\/([\w_-]+(?:\.[\w_-]+)*)[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]",
+        "MURL",
+        text,
+    )
+    total_replacements += replacements
+
+    return text, total_replacements
+
+def transform_text(row):
+    """
+    Transforms a row by cleaning and tokenizing its text content.
+
+    This function extracts the "Text" key from the input dictionary and processes
+    it using the `tokenize_text` function. The text is also cleaned by removing
+    newline characters.
+
+    Args:
+        row (dict): A dictionary containing a 'Text' key.
+
+    Returns:
+        tuple: A tuple containing:
+            - str: The processed text after cleaning and tokenization.
+            - int: The number of replacements made.
+ """ + text = row.get("Text", "") + + if not isinstance(text, str): + warnings.warn(f"Converting non-string type to string: {type(text)}") + text = str(text) + + text, replaced_count = tokenize_text(text) + text = text.replace("\n", "") + return text, replaced_count \ No newline at end of file diff --git a/api/train.py b/api/train.py new file mode 100644 index 0000000..30f5514 --- /dev/null +++ b/api/train.py @@ -0,0 +1,165 @@ +import os +import re +import string +import random +import warnings +import argparse +import numpy as np +import pandas as pd +import torch +import time +import seaborn as sns +import matplotlib.pyplot as plt +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from io import StringIO +from unicodedata import category +from bs4 import BeautifulSoup +from markdown import markdown +from sklearn.model_selection import train_test_split +from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report +from torch.utils.data import DataLoader, RandomSampler, Dataset +from transformers import ( + BertTokenizer, BertForSequenceClassification, BertForMaskedLM, + XLNetTokenizer, XLNetForSequenceClassification, + RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM, + AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM, + get_scheduler +) +from torch.optim import AdamW + +MAX_LEN = 256 +BATCH_SIZE = 16 +LEARNING_RATE = 2e-5 +EPOCHS = 4 +WEIGHT_DECAY = 0.01 +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +MODELS = [ + (BertForSequenceClassification, BertTokenizer, 'bert-base-cased'), + (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased'), + (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'), + (AlbertForSequenceClassification, AlbertTokenizer, 'albert-base-v1') +] +MODEL_NAMES = ['bert', 'xlnet', 'roberta', 'albert'] + + +def train_model(train_df, model_save_path, model_select=0): + seed_torch(42) + + cur_model = MODELS[model_select] + m_name = MODEL_NAMES[model_select] + + + train_df['Polarity'] = train_df['Polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0}) + tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) + + sentences = train_df.Text.values + labels = train_df.Polarity.values + + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens=True, + max_length=MAX_LEN, + padding='max_length', + return_attention_mask=True, + return_tensors='pt', + truncation=True + ) + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + input_ids = torch.cat(input_ids, dim=0) + attention_masks = torch.cat(attention_masks, dim=0) + labels = torch.tensor(labels) + + print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}') + + + train_inputs, val_inputs, train_labels, val_labels = train_test_split( + input_ids, labels, test_size=0.1, random_state=42) + train_masks, val_masks, _, _ = train_test_split( + attention_masks, labels, test_size=0.1, random_state=42) + + + train_data = TensorDataset(train_inputs, train_masks, train_labels) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) + + val_data = TensorDataset(val_inputs, val_masks, val_labels) + val_sampler = SequentialSampler(val_data) + val_dataloader = DataLoader(val_data, 
sampler=val_sampler, batch_size=BATCH_SIZE)
+
+
+    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
+    model.to(device)
+
+
+    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
+
+
+    num_training_steps = EPOCHS * len(train_dataloader)
+    lr_scheduler = get_scheduler(
+        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+    )
+
+
+    print("Starting training...")
+    best_f1 = 0
+    for epoch in range(EPOCHS):
+        model.train()
+        total_loss = 0
+        predictions, true_labels = [], []
+
+        for batch in train_dataloader:
+            b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
+            optimizer.zero_grad()
+            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+            loss, logits = outputs[:2]
+            loss.backward()
+            optimizer.step()
+            lr_scheduler.step()
+
+            total_loss += loss.item()
+            predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
+            true_labels.extend(b_labels.cpu().numpy())
+
+        train_acc = accuracy_score(true_labels, predictions)
+        print(f"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}")
+
+
+        model.eval()
+        val_predictions, val_labels = [], []
+        with torch.no_grad():
+            for batch in val_dataloader:
+                b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
+                outputs = model(b_input_ids, attention_mask=b_input_mask)
+                logits = outputs[0]
+                val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
+                val_labels.extend(b_labels.cpu().numpy())
+
+        val_acc = accuracy_score(val_labels, val_predictions)
+        val_f1 = f1_score(val_labels, val_predictions, average='weighted')
+        print(f"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}")
+
+
+        if val_f1 > best_f1:
+            best_f1 = val_f1
+            torch.save(model.state_dict(), model_save_path)
+            print(f"Best model saved at {model_save_path}")
+
+
+    print("Final Model Performance on Validation Set:")
+    print(classification_report(val_labels, val_predictions, digits=4))
+    return model_save_path
+
+def seed_torch(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
\ No newline at end of file
diff --git a/notebooks/Software_Development_Sentiment_Classification.ipynb b/notebooks/Software_Development_Sentiment_Classification.ipynb
new file mode 100644
index 0000000..dbe68a8
--- /dev/null
+++ b/notebooks/Software_Development_Sentiment_Classification.ipynb
@@ -0,0 +1,239 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Sentiment Analysis\n",
+    "\n",
+    "In this notebook, we test how well four different **machine learning models** can classify the **sentiment** (positive, negative, or neutral) expressed in messages on various platforms used by software developers. These platforms include:\n",
+    "\n",
+    "- **GitHub** (where developers collaborate on code)\n",
+    "- **Jira** (used for tracking issues and tasks)\n",
+    "- **Mailbox** (email-based developer communication)\n",
+    "\n",
+    "The models we’re testing are:\n",
+    "- **BERT**\n",
+    "- **XLNet**\n",
+    "- **RoBERTa**\n",
+    "- **ALBERT**\n",
+    "\n",
+    "Each model was trained on a mix of data from all three platforms. We then test how well each model performs on new, unseen data from the same platforms. We report:\n",
+    "\n",
+    "1. **Overall Accuracy**: How well each model performs across all platforms.\n",
+    "2. **Platform-Specific Accuracy**: How well each model performs on **GitHub**, **Jira**, and **Mailbox** separately.\n",
+    "\n",
+    "The results help us understand which models work best across different communication tools and give insights into how sentiment analysis can be applied to real-world developer conversations.\n",
+    "\n",
+    "Note: this notebook gets the data ready; the `Train.ipynb` notebook trains the models, and `Test.ipynb` tests the models against the datasets.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "id": "oe8X-6s9btXo"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "sys.path.append(os.path.abspath(\"..\"))\n",
+    "from tabulate import tabulate\n",
+    "\n",
+    "from api.filter import *\n",
+    "from api.tokenizer import *\n",
+    "from api.train import *\n",
+    "from api.test import *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RAXtSnSK4LPr"
+   },
+   "source": [
+    "# Tokenized\n",
+    "\n",
+    "This section processes the raw sentiment analysis datasets (`so-dataset.csv`, `gh-dataset.csv`, and `crossplatform_sf_dataset.csv`) by applying a custom text transformation function. The goal is to standardize and clean the text data before training. You can adapt the `transform_text` function to your specific needs.\n",
+    "\n",
+    "There are additional functions provided in [filter.py](../api/filter.py) and [tokenizer.py](../api/tokenizer.py) that can be used for specific use cases. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "current_directory = os.getcwd()\n",
+    "root = os.path.abspath(os.path.join(current_directory, \"..\", \"..\"))\n",
+    "\n",
+    "# Define input dataset paths\n",
+    "input_paths = [\n",
+    "    f\"{root}/so-dataset.csv\",\n",
+    "    f\"{root}/gh-dataset.csv\",\n",
+    "    f\"{root}/crossplatform_sf_dataset.csv\"\n",
+    "]\n",
+    "\n",
+    "# transform_text is imported from api.tokenizer; uncomment the override below\n",
+    "# if you need a custom transformation instead:\n",
+    "# def transform_text(row):\n",
+    "#     # Example override: return the original text and a dummy replacement count\n",
+    "#     return row[\"Text\"], 1\n",
+    "\n",
+    "# Loop through each dataset and process it\n",
+    "for input_path in input_paths:\n",
+    "    # Generate output file name\n",
+    "    output_filename = os.path.splitext(os.path.basename(input_path))[0] + \"_tokenized.csv\"\n",
+    "    output_path = os.path.join(os.path.dirname(input_path), output_filename)\n",
+    "\n",
+    "    # Load dataset\n",
+    "    df = pd.read_csv(input_path)\n",
+    "    print(f\"Processing dataset: {input_path}\")\n",
+    "    print(df.head())  # Print first few rows for verification\n",
+    "\n",
+    "    # Apply text transformation\n",
+    "    df[[\"Text\", \"replaced_token\"]] = df.apply(transform_text, axis=1, result_type=\"expand\")\n",
+    "\n",
+    "    # Report total replacements from the `replaced_token` column\n",
+    "    total_replacements = df[\"replaced_token\"].sum()\n",
+    "    print(f\"Total replacements: {total_replacements}\")\n",
+    "\n",
+    "    # Save processed dataset\n",
+    "    df.to_csv(output_path, header=True, index=False)\n",
+    "\n",
+    "    print(f\"Tokenized dataset saved to: {output_path}\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that the data is cleaned and tokenized, let's look at the datasets we have.
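\n",
+    "\n",
+    "As a quick sanity check of the cleaning, you can call `tokenize_text` (imported above from `api.tokenizer`) on a single string. A minimal sketch, using a made-up sample comment; the exact spacing of the output depends on the regexes:\n",
+    "\n",
+    "```python\n",
+    "sample = \"Thanks @octocat, see issue #42 at https://example.com (version 1.2.3).\"\n",
+    "cleaned, n_replaced = tokenize_text(sample)\n",
+    "print(cleaned)     # expect MMENTION, MISSUEMENTION, MURL and MVERSIONNUMBER tokens\n",
+    "print(n_replaced)  # expect 4 replacements\n",
+    "```\n",
+    "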
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o3Cm2Gn_CbM5" + }, + "source": [ + "# Dataset Overview: \n", + "\n", + "`so-dataset.csv` : Contains Stack Overflow comment data.\n", + "\n", + "\n", + " `gh-dataset.csv` : Contains GitHub Stack overflow comment data. \n", + "\n", + "\n", + "\n", + "\n", + "`crossplatform_sf_dataset.csv`\n", + "\n", + "This dataset is designed for **Software Development Sentiment Classification**, containing user comments or discussions from different platforms with sentiment labels.\n", + "\n", + "## **Column Descriptions**\n", + "- **`Text`**: The user comment or discussion content. \n", + "- **`Polarity`**: Sentiment label indicating the emotional tendency of the text: \n", + " - `2`: Negative sentiment \n", + " - `0`: Neutral sentiment \n", + " - `1`: Positive sentiment \n", + "- **`Platform`**: The source platform of the data, indicating where the comment or discussion originated: \n", + " - `0`: **GitHub** (Discussions related to open-source projects, Issues, Pull Requests) \n", + " - `1`: **Jira** (Bug reports, task comments in software development management tools) \n", + " - `2`: **Mailbox** (Developer communication through emails) \n", + "\n", + "## **Dataset Distribution**\n", + "The dataset consists of data from **GitHub, Jira, and Mailbox**, with different sentiment (`Polarity`) distributions across platforms. It can be used to train and evaluate sentiment classification models to analyze developer emotions on different platforms. \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QaHSklrRClHU", + "outputId": "b3b3c0d8-b59e-41ab-ba80-5569d1d3650e" + }, + "outputs": [], + "source": [ + "\n", + "# Load dataset\n", + "input_path = f\"{root}/crossplatform_sf_dataset.csv\"\n", + "df = pd.read_csv(input_path)\n", + "\n", + "# Compute dataset statistics\n", + "total_samples = len(df)\n", + "polarity_counts = df[\"Polarity\"].value_counts().sort_index()\n", + "platform_counts = df[\"Platform\"].value_counts().sort_index()\n", + "\n", + "# Compute Polarity distribution within each Platform\n", + "platform_polarity_counts = df.groupby([\"Platform\", \"Polarity\"]).size().unstack().fillna(0)\n", + "\n", + "# Print results with formatting\n", + "print(\"=\" * 50)\n", + "print(f\"📊 Dataset Information: cf-dataset.csv\")\n", + "print(\"=\" * 50)\n", + "print(f\"Total Samples: {total_samples}\\n\")\n", + "\n", + "# Polarity distribution\n", + "print(\"📌 Polarity Distribution:\")\n", + "print(tabulate(polarity_counts.reset_index(), headers=[\"Polarity\", \"Count\"], tablefmt=\"pretty\"))\n", + "print(\"\\n\")\n", + "\n", + "# Platform distribution\n", + "print(\"📌 Platform Distribution:\")\n", + "print(tabulate(platform_counts.reset_index(), headers=[\"Platform\", \"Count\"], tablefmt=\"pretty\"))\n", + "print(\"\\n\")\n", + "\n", + "# Platform-wise Polarity distribution\n", + "print(\"📌 Platform-wise Polarity Distribution:\")\n", + "print(tabulate(platform_polarity_counts, headers=\"keys\", tablefmt=\"pretty\"))\n", + "print(\"=\" * 50)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have an understanding of the three datasets we can move over to the [Train.ipynb](./Train.ipynb) Notebook to start training the models based on the datasets we just prepared and reviewed." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the models are trained they can be tested with the [Test.ipynb](./Test.ipynb) Notebook. " + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "L4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb new file mode 100644 index 0000000..75b01bb --- /dev/null +++ b/notebooks/Test.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test\n", + "\n", + "In this section, we evaluate the four trained **cross-platform sentiment classification models** on our **cross-platform sentiment dataset**.\n", + "\n", + "## Evaluation Metrics \n", + "We will assess: \n", + "1. **Overall model performance** across all platforms. \n", + "2. **Platform-specific performance** for each model on: \n", + " - **GitHub** \n", + " - **Jira** \n", + " - **Mailbox** \n", + "\n", + "## Results \n", + "The evaluation will print: \n", + "- **Overall accuracy** of each model. \n", + "- **Performance breakdown per platform** for each model. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import string\n", + "import random\n", + "import warnings\n", + "import argparse\n", + "import numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "import time\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", + "from io import StringIO\n", + "from unicodedata import category\n", + "from bs4 import BeautifulSoup\n", + "from markdown import markdown\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report\n", + "from torch.utils.data import DataLoader, RandomSampler, Dataset\n", + "from transformers import (\n", + " BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n", + " XLNetTokenizer, XLNetForSequenceClassification,\n", + " RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n", + " AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n", + " get_scheduler\n", + ")\n", + "\n", + "# Changed because AdamW Depreciated\n", + "from torch.optim import AdamW\n", + "from api.preprocessing import *\n", + "from api.train import *\n", + "from api.test import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "n_gpu = torch.cuda.device_count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluates four pretrained models \n", + "current_directory = os.getcwd()\n", + "# Load test dataset\n", + "test_df = pd.read_csv(f'{current_directory}/test_df.csv')\n", + "\n", + "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", + "model_results = {}\n", + 
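"# NOTE: test_model (api/test.py) prints accuracy and a classification report,\n",
+    "# and returns a DataFrame of per-row predictions rather than a scalar accuracy.\n",
+   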
"\n", + "# Define platform mapping\n", + "platforms = {0: \"GitHub\", 1: \"Jira\", 2: \"Mailbox\"}\n", + "\n", + "# Evaluate each model\n", + "for i, model_name in enumerate(MODEL_NAMES):\n", + " model_path = f\"{current_directory}/{model_name}_model\"\n", + " print(f\"Evaluating {model_name} model...for overall platform\")\n", + "\n", + " # Get overall accuracy\n", + " overall_accuracy = test_model(test_df, model_path, model_select=i)\n", + "\n", + " # Evaluate accuracy per platform\n", + " platform_accuracies = {}\n", + " for platform_id, platform_name in platforms.items():\n", + " test_df_platform = test_df[test_df[\"Platform\"] == platform_id]\n", + " if not test_df_platform.empty:\n", + " print(f\"Evaluating {model_name} model...for {platform_name} platform\")\n", + " accuracy = test_model(test_df_platform, model_path, model_select=i)\n", + " platform_accuracies[platform_name] = accuracy\n", + " else:\n", + " platform_accuracies[platform_name] = \"No data\"\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generalization Performance of the Model (Table 3.3)\n", + "In this section, we evaluate the **Bert-CP** model's **generalization performance** on the existing datasets: \n", + "- **GitHub Golden Rule Dataset** \n", + "- **Stack Overflow Dataset** \n", + "\n", + "We will also compare the performance of the **BERT model** trained on **GitHub Golden Rule** and **Stack Overflow** datasets, with a focus on **cross-platform performance**. This comparison aims to validate the **superiority** of our model.\n", + "\n", + "## Evaluation Process \n", + "- **Bert-CP Model Evaluation**: We test the **Bert-CP** model on the **GitHub Golden Rule** and **Stack Overflow** datasets.\n", + "- **Cross-Platform Comparison**: We compare the performance of models trained on **GitHub Golden Rule** and **Stack Overflow** datasets across multiple platforms using the **BERT model**.\n", + "\n", + "## Goals \n", + "- To assess the **generalization** of the **Bert-CP** model across different datasets.\n", + "- To highlight the **superiority** of our cross-platform model over dataset-specific models." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating bert_model on GitHub test dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:2700: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 
+   "source": [
+    "# Evaluate how bert trained on different datasets\n",
+    "\n",
+    "# Load test datasets\n",
+    "test_gh = pd.read_csv(f'{current_directory}/test_gh.csv')\n",
+    "test_so = pd.read_csv(f'{current_directory}/test_so.csv')\n",
+    "\n",
+    "# Define model paths\n",
+    "bert_model_path = f\"{current_directory}/bert_model\"\n",
+    "gh_bert_model_path = f\"{current_directory}/GH_bert_model\"\n",
+    "so_bert_model_path = f\"{current_directory}/SO_bert_model\"\n",
+    "\n",
+    "# Store results\n",
+    "model_results = {}\n",
+    "\n",
+    "# 1. Validate bert_model on test_gh and test_so\n",
+    "print(\"Evaluating bert_model on GitHub test dataset...\")\n",
+    "bert_on_gh = test_model(test_gh, bert_model_path, model_select=0)\n",
+    "\n",
+    "print(\"Evaluating bert_model on Stack Overflow test dataset...\")\n",
+    "bert_on_so = test_model(test_so, bert_model_path, model_select=0)\n",
+    "\n",
+    "model_results[\"bert_model\"] = {\n",
+    "    \"test_gh Accuracy\": bert_on_gh,\n",
+    "    \"test_so Accuracy\": bert_on_so\n",
+    "}\n",
+    "\n",
+    "# 2. Validate GH_bert_model on test_so\n",
+    "print(\"Evaluating GH_bert_model on Stack Overflow test dataset...\")\n",
+    "gh_bert_on_so = test_model(test_so, gh_bert_model_path, model_select=0)\n",
+    "\n",
+    "model_results[\"GH_bert_model\"] = {\n",
+    "    \"test_so Accuracy\": gh_bert_on_so\n",
+    "}\n",
+    "\n",
+    "# 3. Validate SO_bert_model on test_gh\n",
+    "print(\"Evaluating SO_bert_model on GitHub test dataset...\")\n",
+    "so_bert_on_gh = test_model(test_gh, so_bert_model_path, model_select=0)\n",
+    "\n",
+    "model_results[\"SO_bert_model\"] = {\n",
+    "    \"test_gh Accuracy\": so_bert_on_gh\n",
+    "}\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/Train.ipynb b/notebooks/Train.ipynb
new file mode 100644
index 0000000..5d6b73b
--- /dev/null
+++ b/notebooks/Train.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train\n",
+    "\n",
+    "We will train the following models: \n",
+    "- **BERT** \n",
+    "- **XLNet** \n",
+    "- **RoBERTa** \n",
+    "- **ALBERT** \n",
+    "\n",
+    "## Training Parameters \n",
+    "- **MAX_LEN**: `256` \n",
+    "- **BATCH_SIZE**: `16` \n",
+    "- **LEARNING_RATE**: `2e-5` \n",
+    "- **EPOCHS**: `4` \n",
+    "\n",
+    "Each model is trained using the merged dataset and saved for further evaluation."
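,
+    "\n",
+    "These hyperparameters are module-level constants in `api/train.py`, and `train_model` reads them at call time. A minimal sketch for experimenting with other values from a notebook (it assumes you are happy to mutate the module's globals before calling `train_model`):\n",
+    "\n",
+    "```python\n",
+    "import api.train as trainer\n",
+    "\n",
+    "# Overrides take effect for all subsequent train_model calls\n",
+    "trainer.EPOCHS = 2\n",
+    "trainer.LEARNING_RATE = 3e-5\n",
+    "```"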
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "sys.path.append(os.path.abspath(\"..\"))\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from transformers import (\n",
+    "    BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n",
+    "    XLNetTokenizer, XLNetForSequenceClassification,\n",
+    "    RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n",
+    "    AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n",
+    "    get_scheduler\n",
+    ")\n",
+    "\n",
+    "from api.train import *\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First we load the tokenized datasets and split them into training and testing sets (70/30 split).\n",
+    "\n",
+    "### Datasets:\n",
+    "- **`crossplatform_sf_dataset_tokenized.csv`**: This is the main dataset used in this study.\n",
+    "- **`so-dataset_tokenized.csv`**: This dataset originates from the research paper *Sentiment Polarity Detection for Software Development*.\n",
+    "- **`gh-dataset_tokenized.csv`**: This dataset is derived from the *GitHub Golden Rule* dataset (from the paper *Can We Use SE-specific Sentiment Analysis Tools in a Cross-Platform Setting?*).\n",
+    "\n",
+    "### Output:\n",
+    "- Training: `train_df.csv`, `train_gh.csv`, `train_so.csv`\n",
+    "- Testing: `test_df.csv`, `test_gh.csv`, `test_so.csv`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "current_directory = os.getcwd()\n",
+    "root = os.path.abspath(os.path.join(current_directory, \"..\", \"..\"))\n",
+    "\n",
+    "# Load the tokenized CSVs and create 70/30 train/test splits\n",
+    "input_path = f'{root}/crossplatform_sf_dataset_tokenized.csv'\n",
+    "so_input_path = f'{root}/so-dataset_tokenized.csv'\n",
+    "gh_input_path = f'{root}/gh-dataset_tokenized.csv'\n",
+    "\n",
+    "# Load datasets into Pandas DataFrames\n",
+    "df_crossplatform = pd.read_csv(input_path)\n",
+    "df_so = pd.read_csv(so_input_path)\n",
+    "df_gh = pd.read_csv(gh_input_path)\n",
+    "\n",
+    "# Split `df_crossplatform` into training (70%) and testing (30%) sets\n",
+    "train_df, test_df = train_test_split(df_crossplatform, test_size=0.3, random_state=42)\n",
+    "\n",
+    "# Split GitHub and Stack Overflow datasets into training and testing sets (70% train, 30% test)\n",
+    "train_gh, test_gh = train_test_split(df_gh, test_size=0.3, random_state=42)\n",
+    "train_so, test_so = train_test_split(df_so, test_size=0.3, random_state=42)\n",
+    "\n",
+    "# Save all datasets to CSV files for further use\n",
+    "train_df.to_csv(f'{root}/train_df.csv', index=False)\n",
+    "test_df.to_csv(f'{root}/test_df.csv', index=False)\n",
+    "train_gh.to_csv(f'{root}/train_gh.csv', index=False)\n",
+    "train_so.to_csv(f'{root}/train_so.csv', index=False)\n",
+    "test_gh.to_csv(f'{root}/test_gh.csv', index=False)\n",
+    "test_so.to_csv(f'{root}/test_so.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "These splits are saved for consistent model training and evaluation. `train_df.csv`, `train_gh.csv`, and `train_so.csv` are then merged into a single dataset and saved for model training."
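,
+    "\n",
+    "If class balance matters for your comparison, a stratified variant of the split is a one-line change (a sketch; `stratify` is a standard `train_test_split` parameter):\n",
+    "\n",
+    "```python\n",
+    "# Keeps the Polarity proportions roughly equal across train and test\n",
+    "train_df, test_df = train_test_split(\n",
+    "    df_crossplatform, test_size=0.3, random_state=42,\n",
+    "    stratify=df_crossplatform[\"Polarity\"])\n",
+    "```"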
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Combine `train_df`, `train_gh`, and `train_so` into the final training dataset\n",
+    "train_df_final = pd.concat([train_df, train_gh, train_so], axis=0, ignore_index=True)\n",
+    "train_df_final.to_csv(f'{root}/train_df_final.csv', index=False)\n",
+    "\n",
+    "# Define the list of model names to be trained\n",
+    "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n",
+    "\n",
+    "# Train each model and save the trained model files\n",
+    "for i, model_name in enumerate(MODEL_NAMES):\n",
+    "    model_save_path = f\"{root}/{model_name}_model\"\n",
+    "    print(f\"Training {model_name} model...\")\n",
+    "    train_model(train_df_final, model_save_path, model_select=i)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The trained models are saved to the root directory, where they can be inspected in the file browser. At this point the trained models can be evaluated on the test datasets; do this in the [Test.ipynb](./Test.ipynb) notebook."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
From c573a33f0b38a65bea237d240eede5f6f03f48e9 Mon Sep 17 00:00:00 2001
From: Connor Narowetz
Date: Mon, 5 May 2025 19:48:21 -1000
Subject: [PATCH 3/5] Changed Imports

- Cleaned up imports
---
 notebooks/Test.ipynb | 137 ++++++++++++++-----------------------------
 1 file changed, 32 insertions(+), 105 deletions(-)

diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
index 75b01bb..5115613 100644
--- a/notebooks/Test.ipynb
+++ b/notebooks/Test.ipynb
@@ -19,7 +19,7 @@
     "## Results \n",
     "The evaluation will print: \n",
     "- **Overall accuracy** of each model. \n",
-    "- **Performance breakdown per platform** for each model. 
\n" ] }, { @@ -29,48 +29,31 @@ "outputs": [], "source": [ "import os\n", - "import re\n", - "import string\n", - "import random\n", - "import warnings\n", - "import argparse\n", - "import numpy as np\n", - "import pandas as pd\n", - "import torch\n", - "import time\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", - "from io import StringIO\n", - "from unicodedata import category\n", - "from bs4 import BeautifulSoup\n", - "from markdown import markdown\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report\n", - "from torch.utils.data import DataLoader, RandomSampler, Dataset\n", - "from transformers import (\n", - " BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n", - " XLNetTokenizer, XLNetForSequenceClassification,\n", - " RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n", - " AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n", - " get_scheduler\n", - ")\n", + "import sys\n", + "sys.path.append(os.path.abspath(\"..\"))\n", "\n", - "# Changed because AdamW Depreciated\n", - "from torch.optim import AdamW\n", - "from api.preprocessing import *\n", - "from api.train import *\n", "from api.test import *" ] }, { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "n_gpu = torch.cuda.device_count()" + "### Test df_crossplatform on 4 models and 3 platforms (Table 3.2)\n", + "In this section, we evaluate the four trained **cross-platform sentiment classification models** on our **cross-platform sentiment dataset**.\n", + "\n", + "## Evaluation Metrics \n", + "We will assess: \n", + "1. **Overall model performance** across all platforms. \n", + "2. **Platform-specific performance** for each model on: \n", + " - **GitHub** \n", + " - **Jira** \n", + " - **Mailbox** \n", + "\n", + "## Results \n", + "The evaluation will print: \n", + "- **Overall accuracy** of each model. \n", + "- **Performance breakdown per platform** for each model. 
" ] }, { @@ -79,10 +62,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Evaluates four pretrained models \n", "current_directory = os.getcwd()\n", + "root = os.path.abspath(os.path.join(current_directory, \"..\", \"..\"))\n", + "\n", "# Load test dataset\n", - "test_df = pd.read_csv(f'{current_directory}/test_df.csv')\n", + "test_df = pd.read_csv(f'{root}/test_df.csv')\n", "\n", "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", "model_results = {}\n", @@ -92,7 +76,7 @@ "\n", "# Evaluate each model\n", "for i, model_name in enumerate(MODEL_NAMES):\n", - " model_path = f\"{current_directory}/{model_name}_model\"\n", + " model_path = f\"{root}/{model_name}_model\"\n", " print(f\"Evaluating {model_name} model...for overall platform\")\n", "\n", " # Get overall accuracy\n", @@ -107,10 +91,7 @@ " accuracy = test_model(test_df_platform, model_path, model_select=i)\n", " platform_accuracies[platform_name] = accuracy\n", " else:\n", - " platform_accuracies[platform_name] = \"No data\"\n", - "\n", - "\n", - "\n" + " platform_accuracies[platform_name] = \"No data\"" ] }, { @@ -135,74 +116,20 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluating bert_model on GitHub test dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:2700: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 17\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# 1. 
Validate bert_model on test_gh and test_so\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEvaluating bert_model on GitHub test dataset...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 17\u001b[0m bert_on_gh \u001b[38;5;241m=\u001b[39m test_model(test_gh, bert_model_path, model_select\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEvaluating bert_model on Stack Overflow test dataset...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 20\u001b[0m bert_on_so \u001b[38;5;241m=\u001b[39m test_model(test_so, bert_model_path, model_select\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n", - "File \u001b[0;32m~/Desktop/a2/sentiment_classifier/api/test.py:93\u001b[0m, in \u001b[0;36mtest_model\u001b[0;34m(test_df, model_saved_path, model_select)\u001b[0m\n\u001b[1;32m 90\u001b[0m b_input_ids, b_input_mask, b_labels \u001b[38;5;241m=\u001b[39m batch\n\u001b[1;32m 92\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 93\u001b[0m outputs \u001b[38;5;241m=\u001b[39m model(b_input_ids, token_type_ids\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, attention_mask\u001b[38;5;241m=\u001b[39mb_input_mask)\n\u001b[1;32m 94\u001b[0m logits \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 96\u001b[0m logits \u001b[38;5;241m=\u001b[39m logits\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mnumpy()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m 
_global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py:1675\u001b[0m, in \u001b[0;36mBertForSequenceClassification.forward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1667\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1668\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):\u001b[39;00m\n\u001b[1;32m 1669\u001b[0m \u001b[38;5;124;03m Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\u001b[39;00m\n\u001b[1;32m 1670\u001b[0m \u001b[38;5;124;03m config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\u001b[39;00m\n\u001b[1;32m 1671\u001b[0m \u001b[38;5;124;03m `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\u001b[39;00m\n\u001b[1;32m 1672\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1673\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[0;32m-> 1675\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbert(\n\u001b[1;32m 1676\u001b[0m input_ids,\n\u001b[1;32m 1677\u001b[0m attention_mask\u001b[38;5;241m=\u001b[39mattention_mask,\n\u001b[1;32m 1678\u001b[0m token_type_ids\u001b[38;5;241m=\u001b[39mtoken_type_ids,\n\u001b[1;32m 1679\u001b[0m position_ids\u001b[38;5;241m=\u001b[39mposition_ids,\n\u001b[1;32m 1680\u001b[0m head_mask\u001b[38;5;241m=\u001b[39mhead_mask,\n\u001b[1;32m 1681\u001b[0m inputs_embeds\u001b[38;5;241m=\u001b[39minputs_embeds,\n\u001b[1;32m 1682\u001b[0m output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m 1683\u001b[0m output_hidden_states\u001b[38;5;241m=\u001b[39moutput_hidden_states,\n\u001b[1;32m 1684\u001b[0m return_dict\u001b[38;5;241m=\u001b[39mreturn_dict,\n\u001b[1;32m 1685\u001b[0m )\n\u001b[1;32m 1687\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 1689\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(pooled_output)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# 
type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py:1144\u001b[0m, in \u001b[0;36mBertModel.forward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1137\u001b[0m \u001b[38;5;66;03m# Prepare head mask if needed\u001b[39;00m\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;66;03m# 1.0 in head_mask indicate we keep the head\u001b[39;00m\n\u001b[1;32m 1139\u001b[0m \u001b[38;5;66;03m# attention_probs has shape bsz x n_heads x N x N\u001b[39;00m\n\u001b[1;32m 1140\u001b[0m \u001b[38;5;66;03m# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]\u001b[39;00m\n\u001b[1;32m 1141\u001b[0m \u001b[38;5;66;03m# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]\u001b[39;00m\n\u001b[1;32m 1142\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[0;32m-> 1144\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder(\n\u001b[1;32m 1145\u001b[0m embedding_output,\n\u001b[1;32m 1146\u001b[0m attention_mask\u001b[38;5;241m=\u001b[39mextended_attention_mask,\n\u001b[1;32m 1147\u001b[0m head_mask\u001b[38;5;241m=\u001b[39mhead_mask,\n\u001b[1;32m 
1148\u001b[0m encoder_hidden_states\u001b[38;5;241m=\u001b[39mencoder_hidden_states,\n\u001b[1;32m 1149\u001b[0m encoder_attention_mask\u001b[38;5;241m=\u001b[39mencoder_extended_attention_mask,\n\u001b[1;32m 1150\u001b[0m past_key_values\u001b[38;5;241m=\u001b[39mpast_key_values,\n\u001b[1;32m 1151\u001b[0m use_cache\u001b[38;5;241m=\u001b[39muse_cache,\n\u001b[1;32m 1152\u001b[0m output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m 1153\u001b[0m output_hidden_states\u001b[38;5;241m=\u001b[39moutput_hidden_states,\n\u001b[1;32m 1154\u001b[0m return_dict\u001b[38;5;241m=\u001b[39mreturn_dict,\n\u001b[1;32m 1155\u001b[0m )\n\u001b[1;32m 1156\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1157\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler(sequence_output) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py:695\u001b[0m, in \u001b[0;36mBertEncoder.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 684\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m 685\u001b[0m layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m 686\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 692\u001b[0m output_attentions,\n\u001b[1;32m 693\u001b[0m )\n\u001b[1;32m 694\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 695\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m layer_module(\n\u001b[1;32m 696\u001b[0m hidden_states,\n\u001b[1;32m 697\u001b[0m attention_mask,\n\u001b[1;32m 698\u001b[0m layer_head_mask,\n\u001b[1;32m 699\u001b[0m encoder_hidden_states,\n\u001b[1;32m 700\u001b[0m encoder_attention_mask,\n\u001b[1;32m 701\u001b[0m past_key_value,\n\u001b[1;32m 702\u001b[0m output_attentions,\n\u001b[1;32m 703\u001b[0m )\n\u001b[1;32m 705\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 706\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_cache:\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py:585\u001b[0m, in \u001b[0;36mBertLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 574\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 575\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 582\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[1;32m 583\u001b[0m \u001b[38;5;66;03m# decoder uni-directional self-attention cached key/values tuple is at positions 1,2\u001b[39;00m\n\u001b[1;32m 584\u001b[0m self_attn_past_key_value \u001b[38;5;241m=\u001b[39m past_key_value[:\u001b[38;5;241m2\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m past_key_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 585\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattention(\n\u001b[1;32m 586\u001b[0m hidden_states,\n\u001b[1;32m 587\u001b[0m attention_mask,\n\u001b[1;32m 588\u001b[0m head_mask,\n\u001b[1;32m 589\u001b[0m output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m 590\u001b[0m past_key_value\u001b[38;5;241m=\u001b[39mself_attn_past_key_value,\n\u001b[1;32m 591\u001b[0m )\n\u001b[1;32m 592\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 594\u001b[0m \u001b[38;5;66;03m# if decoder, the last output is tuple of self-attn cache\u001b[39;00m\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py:524\u001b[0m, in \u001b[0;36mBertAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 507\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 513\u001b[0m output_attentions: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 514\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[1;32m 515\u001b[0m self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mself(\n\u001b[1;32m 516\u001b[0m hidden_states,\n\u001b[1;32m 517\u001b[0m attention_mask,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 522\u001b[0m output_attentions,\n\u001b[1;32m 523\u001b[0m )\n\u001b[0;32m--> 524\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[1;32m 525\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n\u001b[1;32m 526\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File 
\u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], + "outputs": [], "source": [ "# Evaluate how bert trained on different datasets\n", "\n", "# Load test datasets\n", - "test_gh = pd.read_csv(f'{current_directory}/test_gh.csv')\n", - "test_so = pd.read_csv(f'{current_directory}/test_so.csv')\n", + "test_gh = pd.read_csv(f'{root}/test_gh.csv')\n", + "test_so = pd.read_csv(f'{root}/test_so.csv')\n", "\n", "# Define model paths\n", - "bert_model_path = f\"{current_directory}/bert_model\"\n", - "gh_bert_model_path = f\"{current_directory}/GH_bert_model\"\n", - "so_bert_model_path = f\"{current_directory}/SO_bert_model\"\n", + "bert_model_path = f\"{root}/bert_model\"\n", + "gh_bert_model_path = f\"{root}/GH_bert_model\"\n", + "so_bert_model_path = f\"{root}/SO_bert_model\"\n", "\n", "# Store results\n", "model_results = {}\n", @@ -233,7 +160,7 @@ "\n", "model_results[\"SO_bert_model\"] = {\n", " \"test_gh Accuracy\": so_bert_on_gh\n", - "}\n" + "}" ] } ], From 57c0d38f7673ee6f24563be32d48301b42410da1 Mon Sep 17 00:00:00 2001 From: Connor Narowetz Date: Thu, 8 May 2025 21:23:19 -1000 Subject: [PATCH 4/5] Added .env combined test.py and train.py - train.py and test.py now exist in model.py - env added for required packages - Minor typo changes Signed-off-by: Connor Narowetz --- api/{train.py => model.py} | 116 ++++++++++++++--- api/test.py | 121 ------------------ env.yml | 21 +++ notebooks/Test.ipynb | 6 +- notebooks/Train.ipynb | 16 +-- ...cation.ipynb => tokenize_statistics.ipynb} | 11 +- 6 files changed, 133 insertions(+), 158 deletions(-) rename api/{train.py => model.py} (59%) delete mode 100644 api/test.py create mode 100644 env.yml rename notebooks/{Software_Development_Sentiment_Classification.ipynb => tokenize_statistics.ipynb} (98%) diff --git a/api/train.py b/api/model.py similarity index 59% rename from api/train.py rename to api/model.py index 30f5514..b2a11f1 100644 --- a/api/train.py +++ b/api/model.py @@ -1,23 
+1,16 @@
-import os
-import re
-import string
 import random
-import warnings
-import argparse
 import numpy as np
 import pandas as pd
 import torch
 import time
-import seaborn as sns
 import matplotlib.pyplot as plt
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from io import StringIO
 from unicodedata import category
-from bs4 import BeautifulSoup
 from markdown import markdown
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report
-from torch.utils.data import DataLoader, RandomSampler, Dataset
+from torch.utils.data import DataLoader, RandomSampler
 from transformers import (
     BertTokenizer, BertForSequenceClassification, BertForMaskedLM,
     XLNetTokenizer, XLNetForSequenceClassification,
@@ -34,15 +27,14 @@
 WEIGHT_DECAY = 0.01
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-MODELS = [
-    (BertForSequenceClassification, BertTokenizer, 'bert-base-cased'),
-    (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased'),
-    (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'),
-    (AlbertForSequenceClassification, AlbertTokenizer, 'albert-base-v1')
-]
 MODEL_NAMES = ['bert', 'xlnet', 'roberta', 'albert']
 
+MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),
+    (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),
+    (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'),
+    (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1')
+    ]
+
 def train_model(train_df, model_save_path, model_select=0):
 
     seed_torch(42)
@@ -162,4 +154,96 @@ def seed_torch(seed):
     np.random.seed(seed)
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
-    torch.backends.cudnn.deterministic=True
\ No newline at end of file
+    torch.backends.cudnn.deterministic=True
+
+def test_model(test_df, model_saved_path, model_select=0):
+
+    MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),
+        (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),
+        (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'),
+        (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1')
+        ]
+    MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']
+    seed_torch(42)
+
+    cur_model=MODELS[model_select]
+    m_name=MODEL_NAMES[model_select]
+
+    tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)
+
+    begin=time.time()
+
+    test_df['Polarity']=test_df['Polarity'].replace({
+        'positive':1,
+        'negative':2,
+        'neutral':0})
+
+
+    sentences = test_df.Text.values
+    labels = test_df.Polarity.values
+
+    input_ids = []
+    attention_masks = []
+
+    for sent in sentences:
+        encoded_dict = tokenizer.encode_plus(
+            str(sent),
+            add_special_tokens = True,
+            max_length = MAX_LEN,
+            padding = 'max_length', truncation = True,  # pad_to_max_length is deprecated
+            return_attention_mask = True,
+            return_tensors = 'pt',
+        )
+
+        input_ids.append(encoded_dict['input_ids'])
+        attention_masks.append(encoded_dict['attention_mask'])
+
+    prediction_inputs = torch.cat(input_ids,dim=0)
+    prediction_masks = torch.cat(attention_masks,dim=0)
+    prediction_labels = torch.tensor(labels)
+
+    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
+    prediction_sampler = SequentialSampler(prediction_data)
+    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)
+
+    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
+ 
model.load_state_dict(torch.load(model_saved_path)) +# model.cuda() + model.eval() + + predictions,true_labels=[],[] + + for batch in prediction_dataloader: + batch = tuple(t.to(device) for t in batch) + b_input_ids, b_input_mask, b_labels = batch + + with torch.no_grad(): + outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) + logits = outputs[0] + + logits = logits.detach().cpu().numpy() + label_ids = b_labels.to('cpu').numpy() + + predictions.append(logits) + true_labels.append(label_ids) + + end=time.time() + print('Prediction used {:.2f} seconds'.format(end - begin)) + + flat_predictions = [item for sublist in predictions for item in sublist] + flat_predictions = np.argmax(flat_predictions, axis=1).flatten() + flat_true_labels = [item for sublist in true_labels for item in sublist] + + print("Accuracy of {} is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) + + print(classification_report(flat_true_labels,flat_predictions)) + + + df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity']) + + df_combined = pd.concat([test_df, df_prediction], axis=1) + + counts = df_combined['prediction_Polarity'].value_counts() + print(counts) + + return df_combined \ No newline at end of file diff --git a/api/test.py b/api/test.py deleted file mode 100644 index a2bfe29..0000000 --- a/api/test.py +++ /dev/null @@ -1,121 +0,0 @@ -import os -import re -import string -import random -import warnings -import argparse -import numpy as np -import pandas as pd -import torch -import time -import seaborn as sns -import matplotlib.pyplot as plt -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler -from io import StringIO -from unicodedata import category -from bs4 import BeautifulSoup -from markdown import markdown -from sklearn.model_selection import train_test_split -from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report -from torch.utils.data import DataLoader, RandomSampler, Dataset -from transformers import ( - BertTokenizer, BertForSequenceClassification, BertForMaskedLM, - XLNetTokenizer, XLNetForSequenceClassification, - RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM, - AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM, - get_scheduler -) -from torch.optim import AdamW -from api.train import * - -def test_model(test_df, model_saved_path, model_select=0): - - MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), - (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), - (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), - (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') - ] - MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] - seed_torch(42) - - cur_model=MODELS[model_select] - m_name=MODEL_NAMES[model_select] - - tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) - - begin=time.time() - - test_df['Polarity']=test_df['Polarity'].replace({ - 'positive':1, - 'negative':2, - 'neutral':0}) - - - sentences = test_df.Text.values - labels = test_df.Polarity.values - - input_ids = [] - attention_masks = [] - - for sent in sentences: - encoded_dict = tokenizer.encode_plus( - str(sent), - add_special_tokens = True, - max_length = MAX_LEN, - pad_to_max_length = True, - return_attention_mask = True, - return_tensors = 'pt', - ) - - input_ids.append(encoded_dict['input_ids']) - 
attention_masks.append(encoded_dict['attention_mask'])
-
-    prediction_inputs = torch.cat(input_ids,dim=0)
-    prediction_masks = torch.cat(attention_masks,dim=0)
-    prediction_labels = torch.tensor(labels)
-
-    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
-    prediction_sampler = SequentialSampler(prediction_data)
-    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)
-
-    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
-    model.load_state_dict(torch.load(model_saved_path))
-# model.cuda()
-    model.eval()
-
-    predictions,true_labels=[],[]
-
-    for batch in prediction_dataloader:
-        batch = tuple(t.to(device) for t in batch)
-        b_input_ids, b_input_mask, b_labels = batch
-
-        with torch.no_grad():
-            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
-            logits = outputs[0]
-
-        logits = logits.detach().cpu().numpy()
-        label_ids = b_labels.to('cpu').numpy()
-
-        predictions.append(logits)
-        true_labels.append(label_ids)
-
-    end=time.time()
-    print('Prediction used {:.2f} seconds'.format(end - begin))
-
-    flat_predictions = [item for sublist in predictions for item in sublist]
-    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
-    flat_true_labels = [item for sublist in true_labels for item in sublist]
-
-    print("Accuracy of {} is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions)))
-
-    print(classification_report(flat_true_labels,flat_predictions))
-
-
-    df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity'])
-
-    df_combined = pd.concat([test_df, df_prediction], axis=1)
-
-    counts = df_combined['prediction_Polarity'].value_counts()
-    print(counts)
-
-    return df_combined
\ No newline at end of file
diff --git a/env.yml b/env.yml
new file mode 100644
index 0000000..97b3bcf
--- /dev/null
+++ b/env.yml
@@ -0,0 +1,21 @@
+name: sentiment_classifier
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.13.3
+  - ipykernel
+  - pip
+  - pip:
+      - bs4
+      - torch
+      - scikit-learn
+      - seaborn
+      - tabulate
+      - markdown
+      - numpy
+      - pandas
+      - lxml  # parser backend used by BeautifulSoup in the tokenizer
+      - transformers
+      - ipywidgets
+prefix: /opt/anaconda3/envs/sentiment_classifier
diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
index 5115613..13e77a4 100644
--- a/notebooks/Test.ipynb
+++ b/notebooks/Test.ipynb
@@ -32,7 +32,7 @@
     "import sys\n",
     "sys.path.append(os.path.abspath(\"..\"))\n",
     "\n",
-    "from api.test import *"
+    "from api.model import *"
    ]
   },
   {
@@ -166,7 +166,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "sentiment_classifier",
    "language": "python",
    "name": "python3"
   },
@@ -180,7 +180,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.13.3"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/Train.ipynb b/notebooks/Train.ipynb
index 5d6b73b..c66c041 100644
--- a/notebooks/Train.ipynb
+++ b/notebooks/Train.ipynb
@@ -30,17 +30,9 @@
     "import os\n",
     "import sys\n",
     "sys.path.append(os.path.abspath(\"..\"))\n",
-    "import numpy as np\n",
     "import pandas as pd\n",
-    "from transformers import (\n",
-    "    BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n",
-    "    XLNetTokenizer, XLNetForSequenceClassification,\n",
-    "    RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n",
-    "    AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n",
-    "    get_scheduler\n",
-    ")\n",
     "\n",
-    "from api.train import *\n"
+    "from 
api.model import *\n"
    ]
   },
   {
@@ -61,7 +53,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -134,7 +126,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "sentiment_classifier",
    "language": "python",
    "name": "python3"
   },
@@ -148,7 +140,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.13.3"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/Software_Development_Sentiment_Classification.ipynb b/notebooks/tokenize_statistics.ipynb
similarity index 98%
rename from notebooks/Software_Development_Sentiment_Classification.ipynb
rename to notebooks/tokenize_statistics.ipynb
index dbe68a8..9264df8 100644
--- a/notebooks/Software_Development_Sentiment_Classification.ipynb
+++ b/notebooks/tokenize_statistics.ipynb
@@ -30,7 +30,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 1,
    "metadata": {
     "id": "oe8X-6s9btXo"
    },
@@ -43,8 +43,7 @@
     "\n",
     "from api.filter import *\n",
     "from api.tokenizer import *\n",
-    "from api.train import *\n",
-    "from api.test import *"
+    "from api.model import *"
    ]
   },
   {
@@ -53,7 +52,7 @@
    "id": "RAXtSnSK4LPr"
   },
   "source": [
-    "# Tokenlized\n",
+    "# Tokenized\n",
     "\n",
     "This section processes the raw sentiment analysis datasets (`so-dataset.csv`, `gh-dataset.csv`, and `crossplatform_sf_dataset.csv`) by applying a custom text transformation function. The goal is to standardize and clean the text data before training. You can adapt the transform_text function to your specific needs.\n",
@@ -217,7 +216,7 @@
    "toc_visible": true
   },
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "sentiment_classifier",
    "language": "python",
    "name": "python3"
   },
@@ -231,7 +230,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.13.3"
   }
  },
  "nbformat": 4,
From 012084d222c74b2f621c0e543a0040dcca6e5572 Mon Sep 17 00:00:00 2001
From: Connor Narowetz
Date: Thu, 8 May 2025 21:39:09 -1000
Subject: [PATCH 5/5] Added __init__.py to create pdoc and added model.py docs

- __init__.py added
- docs for model.py added

Signed-off-by: Connor Narowetz
---
 api/__init__.py |  0
 api/model.py    | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100644 api/__init__.py

diff --git a/api/__init__.py b/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/model.py b/api/model.py
index b2a11f1..7302b17 100644
--- a/api/model.py
+++ b/api/model.py
@@ -37,6 +37,21 @@ def train_model(train_df, model_save_path, model_select=0):

+    """
+    Train a sentiment classification model on the provided dataset.
+
+    Args:
+        train_df (pd.DataFrame): Training data with 'Text' and 'Polarity' columns.
+        model_save_path (str): Path where the fine-tuned model is saved.
+        model_select (int, optional): Index into MODELS (0=BERT, 1=XLNet, 2=RoBERTa, 3=ALBERT).
+
+    Returns:
+        str: The path where the best model was saved.
+
+    Notes:
+        - Converts sentiment labels to numeric form (positive=1, negative=2, neutral=0).
+        - Saves the fine-tuned model to model_save_path.
+    """
     seed_torch(42)
 
     cur_model = MODELS[model_select]
@@ -150,6 +165,15 @@ def train_model(train_df, model_save_path, model_select=0):
     return model_save_path
 
 def seed_torch(seed):
+    """
+    Set random seeds for reproducibility in PyTorch and related libraries.
+
+    Args:
+        seed (int): Number to use for all random generators.
+
+    Example:
+        seed_torch(42)
+    """
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
@@ -157,6 +181,17 @@ def seed_torch(seed):
     torch.backends.cudnn.deterministic=True
 
 def test_model(test_df, model_saved_path, model_select=0):
+    """
+    Test a pre-trained sentiment classification model and report its performance.
+
+    Args:
+        test_df (pd.DataFrame): Test data with 'Text' and 'Polarity' columns.
+        model_saved_path (str): Path to the saved model weights.
+        model_select (int, optional): Index into MODELS (0=BERT, 1=XLNet, 2=RoBERTa, 3=ALBERT).
+
+    Returns:
+        pd.DataFrame: The original test data with the model's predictions appended.
+    """
 
     MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),
         (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),