From 13b7940b1c0003ffb8b37e52b5ddec343540dfc4 Mon Sep 17 00:00:00 2001 From: haotianzhang Date: Tue, 1 Apr 2025 01:56:14 +0800 Subject: [PATCH 1/5] Added missing sentiment classifier, fixes #1 --- ...Development_Sentiment_Classification.ipynb | 2174 +++++++++++++++++ 1 file changed, 2174 insertions(+) create mode 100644 Software_Development_Sentiment_Classification.ipynb diff --git a/Software_Development_Sentiment_Classification.ipynb b/Software_Development_Sentiment_Classification.ipynb new file mode 100644 index 0000000..ff12ae7 --- /dev/null +++ b/Software_Development_Sentiment_Classification.ipynb @@ -0,0 +1,2174 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oe8X-6s9btXo" + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import re\n", + "import string\n", + "import random\n", + "import warnings\n", + "import argparse\n", + "import numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "import time\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset\n", + "from io import StringIO\n", + "from unicodedata import category\n", + "from bs4 import BeautifulSoup\n", + "from markdown import markdown\n", + "from google.colab import drive\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report\n", + "from transformers import (\n", + " BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n", + " XLNetTokenizer, XLNetForSequenceClassification,\n", + " RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n", + " AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n", + " get_scheduler, AdamW\n", + ")\n", + "\n", + "\n", + "drive.mount('/content/drive')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RAXtSnSK4LPr" + }, + "source": [ + "# Tokenized" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9gOgONc04PiO" + }, + "outputs": [], + "source": [ + "# Regular expression for GitHub username mentions\n", + "USERNAME_REGEX = r\"(\\s|^)@(\\S*\\s?)\"\n", + "\n", + "# Generate Unicode punctuation set\n", + "punctuation = {chr(i) for i in range(sys.maxunicode + 1) if category(chr(i)).startswith((\"P\", \"S\"))}\n", + "\n", + "# Dictionary to count token replacements\n", + "counters = {}\n", + "\n", + "def remove_punctuation(text):\n", + " \"\"\"Remove all punctuation characters from the given text.\"\"\"\n", + " return \"\".join(char for char in text if char not in punctuation)\n", + "\n", + "def clean_text(text):\n", + " \"\"\"Remove quoted text and large code blocks from GitHub issues or comments.\"\"\"\n", + " # Remove quoted text from emails/notifications\n", + " text = re.sub(r\"^(On[\\s\\S]*?notifications@github\\.com\\s*?wrote:\\s*?)?(^(\\>).*\\s)*\", '', text, flags=re.MULTILINE)\n", + "\n", + " # Remove code blocks enclosed in triple backticks\n", + " text = re.sub(r\"```[a-z]*\\n[\\s\\S]*?\\n```\", \"\", text)\n", + "\n", + " return text\n", + "\n", + "def replace_token(regex, token_name, text):\n", + " \"\"\"\n", + " Replace matched patterns in the text with the specified token.\n", + "\n", + " Args:\n", + " regex (str): The regular expression pattern to match.\n", + " token_name (str): The replacement token
name.\n", + " text (str): The input text.\n", + "\n", + " Returns:\n", + " tuple: (processed_text, number_of_replacements)\n", + " \"\"\"\n", + " replaced_text, replacements = re.subn(regex, f\" {token_name} \", text, flags=re.MULTILINE)\n", + " counters[token_name] = counters.get(token_name, 0) + replacements\n", + " return replaced_text, replacements\n", + "\n", + "def tokenize_text(text):\n", + " \"\"\"\n", + " Tokenizes a given text by replacing specific elements such as emails, mentions, URLs, etc.\n", + "\n", + " Args:\n", + " text (str): The input text.\n", + "\n", + " Returns:\n", + " tuple: (processed_text, total_replacements)\n", + " \"\"\"\n", + " total_replacements = 0\n", + "\n", + " text, replacements = replace_token(r\"\\S+@\\S*\\s?\", \"MEMAIL\", text)\n", + " total_replacements += replacements\n", + "\n", + " text, replacements = replace_token(USERNAME_REGEX, \"MMENTION\", text)\n", + " total_replacements += replacements\n", + "\n", + " text, replacements = replace_token(r\"`([^`]*)`\", \"MICODE\", text)\n", + " total_replacements += replacements\n", + "\n", + " text, replacements = replace_token(r\"\\b\\d+\\.\\d+(\\.\\d+)*\\b\", \"MVERSIONNUMBER\", text)\n", + " total_replacements += replacements\n", + "\n", + " text, replacements = replace_token(r\"(\\s|^)#\\d+\", \"MISSUEMENTION\", text)\n", + " total_replacements += replacements\n", + "\n", + " text, replacements = replace_token(\n", + " r\"([a-zA-Z0-9]+):\\/\\/([\\w_-]+(?:\\.[\\w_-]+)*)[\\w.,@?^=%&:\\/~+#-]*[\\w@?^=%&\\/~+#-]\",\n", + " \"MURL\",\n", + " text,\n", + " )\n", + " total_replacements += replacements\n", + "\n", + " return text, total_replacements\n", + "\n", + "def remove_markdown_content(text):\n", + " \"\"\"\n", + " Converts Markdown content to plain text by removing all Markdown formatting.\n", + "\n", + " Args:\n", + " text (str): The input Markdown text.\n", + "\n", + " Returns:\n", + " str: Cleaned text without Markdown formatting.\n", + " \"\"\"\n", + " html = markdown(text)\n", + " return \"\".join(BeautifulSoup(html, \"lxml\").findAll(text=True))\n", + "\n", + "def transform_text(row):\n", + " \"\"\"\n", + " Transforms a row by cleaning and tokenizing its text content.\n", + "\n", + " Args:\n", + " row (dict): A dictionary containing a 'Text' key.\n", + "\n", + " Returns:\n", + " tuple: (processed text, number of replacements)\n", + " \"\"\"\n", + " text = row.get(\"Text\", \"\")\n", + "\n", + " if not isinstance(text, str):\n", + " warnings.warn(f\"Converting non-string type to string: {type(text)}\")\n", + " text = str(text)\n", + "\n", + " text, replaced_count = tokenize_text(text)\n", + " text = text.replace(\"\\n\", \"\")\n", + " return text, replaced_count" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uCPyaXMt4ao_" + }, + "source": [ + "##Usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8RhSXEyO4Tnu", + "outputId": "7988bf5a-6c84-46af-f1fa-273b6b31baeb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv\n", + " Text Polarity\n", + "0 In this situation, when I click on the greyed ... 0\n", + "1 After that a download progress status with pro... 0\n", + "2 Change the double quotationCODE_FRAGMENT to to... 0\n", + "3 E.g. I get an array of CODE_FRAGMENT objects. 0\n", + "4 Then I tried my own implementation with CODE_F... 
0\n", + "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv: 450\n", + "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset_tokenized.csv\n", + "\n", + "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv\n", + " Text Polarity\n", + "0 Guess there is a typo here: `translate`.\" 0\n", + "1 @arturoc: If you multiply-include `gst.h`, wou... 0\n", + "2 Thank you Vlad. Your contribution to Mangos an... 1\n", + "3 @opdenkamp Hi Lars, I'm afraid that you forgot... 2\n", + "4 Ok, so let's be paranoid.\" 2\n", + "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv: 2136\n", + "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset_tokenized.csv\n", + "\n", + "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv\n", + " Text Platform Polarity\n", + "0 [error] testWorkflowTimeoutWhenWorkflowComplet... 0 2\n", + "1 Thanks Hunter. Sure. 0 1\n", + "2 Approved by I will merge it soon. 0 1\n", + "3 It was reported and verified that the current ... 0 2\n", + "4 Thanks for the comments. I am figuring out to ... 0 1\n", + "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv: 3227\n", + "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset_tokenized.csv\n", + "\n" + ] + } + ], + "source": [ + "\n", + "\n", + "# Define input dataset paths\n", + "input_paths = [\n", + " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv\",\n", + " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv\",\n", + " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv\"\n", + "]\n", + "\n", + "# Define the text transformation function (ensure transform_text is correctly implemented)\n", + "def transform_text(row):\n", + " # Modify this function according to your needs\n", + " # Example: return the original text and a dummy replacement count\n", + " return row[\"Text\"], 1\n", + "\n", + "# Loop through each dataset and process it\n", + "for input_path in input_paths:\n", + " # Generate output file name\n", + " output_filename = os.path.splitext(os.path.basename(input_path))[0] + \"_tokenized.csv\"\n", + " output_path = os.path.join(os.path.dirname(input_path), output_filename)\n", + "\n", + " # Load dataset\n", + " df = pd.read_csv(input_path)\n", + " print(f\"Processing dataset: {input_path}\")\n", + " print(df.head()) # Print first few rows for verification\n", + "\n", + " # Apply text transformation\n", + " df[[\"Text\", \"replaced_token\"]] = df.apply(transform_text, axis=1, result_type=\"expand\")\n", + "\n", + " # Calculate total replacements from `replaced_token` column\n", + " total_replacements = df[\"replaced_token\"].sum()\n", + "\n", + " # Save processed dataset\n", + " df.to_csv(output_path, header=True, index=False)\n", + "\n", + " print(f\"Tokenized dataset saved to: {output_path}\\n\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o3Cm2Gn_CbM5" + }, + "source": [ + "# Dataset Overview: `crossplatform_sf_dataset.csv`\n", + "\n", + "This dataset is designed for **Software Development Sentiment Classification**, 
containing user comments or discussions from different platforms with sentiment labels.\n", + "\n", + "## **Column Descriptions**\n", + "- **`Text`**: The user comment or discussion content. \n", + "- **`Polarity`**: Sentiment label indicating the emotional tendency of the text: \n", + " - `2`: Negative sentiment \n", + " - `0`: Neutral sentiment \n", + " - `1`: Positive sentiment \n", + "- **`Platform`**: The source platform of the data, indicating where the comment or discussion originated: \n", + " - `0`: **GitHub** (Discussions related to open-source projects, Issues, Pull Requests) \n", + " - `1`: **Jira** (Bug reports, task comments in software development management tools) \n", + " - `2`: **Mailbox** (Developer communication through emails) \n", + "\n", + "## **Dataset Distribution**\n", + "The dataset consists of data from **GitHub, Jira, and Mailbox**, with different sentiment (`Polarity`) distributions across platforms. It can be used to train and evaluate sentiment classification models to analyze developer emotions on different platforms. \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QaHSklrRClHU", + "outputId": "b3b3c0d8-b59e-41ab-ba80-5569d1d3650e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================================================\n", + "📊 Dataset Information: cf-dataset.csv\n", + "==================================================\n", + "Total Samples: 3227\n", + "\n", + "📌 Polarity Distribution:\n", + "+---+----------+-------+\n", + "| | Polarity | Count |\n", + "+---+----------+-------+\n", + "| 0 | 0 | 1125 |\n", + "| 1 | 1 | 1042 |\n", + "| 2 | 2 | 1060 |\n", + "+---+----------+-------+\n", + "\n", + "\n", + "📌 Platform Distribution:\n", + "+---+----------+-------+\n", + "| | Platform | Count |\n", + "+---+----------+-------+\n", + "| 0 | 0 | 1079 |\n", + "| 1 | 1 | 1054 |\n", + "| 2 | 2 | 1094 |\n", + "+---+----------+-------+\n", + "\n", + "\n", + "📌 Platform-wise Polarity Distribution:\n", + "+----------+-----+-----+-----+\n", + "| Platform | 0 | 1 | 2 |\n", + "+----------+-----+-----+-----+\n", + "| 0 | 392 | 361 | 326 |\n", + "| 1 | 367 | 316 | 371 |\n", + "| 2 | 366 | 365 | 363 |\n", + "+----------+-----+-----+-----+\n", + "==================================================\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from tabulate import tabulate\n", + "\n", + "# Load dataset\n", + "input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv'\n", + "df = pd.read_csv(input_path)\n", + "\n", + "# Compute dataset statistics\n", + "total_samples = len(df)\n", + "polarity_counts = df[\"Polarity\"].value_counts().sort_index()\n", + "platform_counts = df[\"Platform\"].value_counts().sort_index()\n", + "\n", + "# Compute Polarity distribution within each Platform\n", + "platform_polarity_counts = df.groupby([\"Platform\", \"Polarity\"]).size().unstack().fillna(0)\n", + "\n", + "# Print results with formatting\n", + "print(\"=\" * 50)\n", + "print(f\"📊 Dataset Information: cf-dataset.csv\")\n", + "print(\"=\" * 50)\n", + "print(f\"Total Samples: {total_samples}\\n\")\n", + "\n", + "# Polarity distribution\n", + "print(\"📌 Polarity Distribution:\")\n", + "print(tabulate(polarity_counts.reset_index(), headers=[\"Polarity\", \"Count\"], tablefmt=\"pretty\"))\n", + "print(\"\\n\")\n", + "\n", + "# Platform distribution\n", + "print(\"📌 Platform 
Distribution:\")\n", + "print(tabulate(platform_counts.reset_index(), headers=[\"Platform\", \"Count\"], tablefmt=\"pretty\"))\n", + "print(\"\\n\")\n", + "\n", + "# Platform-wise Polarity distribution\n", + "print(\"📌 Platform-wise Polarity Distribution:\")\n", + "print(tabulate(platform_polarity_counts, headers=\"keys\", tablefmt=\"pretty\"))\n", + "print(\"=\" * 50)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9kxjNt7QqLlh" + }, + "source": [ + "# Sentiment Classification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "0AiOM3jWKL4P", + "outputId": "bbb7a3d9-e9d7-4378-b8f8-ab68ed4670ec" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'NVIDIA L4'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def seed_torch(seed):\n", + " random.seed(seed)\n", + " np.random.seed(seed)\n", + " torch.manual_seed(seed)\n", + " torch.cuda.manual_seed(seed)\n", + " torch.backends.cudnn.deterministic=True\n", + "\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "n_gpu = torch.cuda.device_count()\n", + "torch.cuda.get_device_name(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gQGXycAgbchc" + }, + "outputs": [], + "source": [ + "# Train\n", + "MAX_LEN = 256\n", + "BATCH_SIZE = 16\n", + "LEARNING_RATE = 2e-5\n", + "EPOCHS = 4\n", + "WEIGHT_DECAY = 0.01\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "\n", + "MODELS = [\n", + " (BertForSequenceClassification, BertTokenizer, 'bert-base-cased'),\n", + " (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased'),\n", + " (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'),\n", + " (AlbertForSequenceClassification, AlbertTokenizer, 'albert-base-v1')\n", + "]\n", + "MODEL_NAMES = ['bert', 'xlnet', 'roberta', 'albert']\n", + "\n", + "def train_model(train_df, model_save_path, model_select=0):\n", + " seed_torch(42)\n", + "\n", + " cur_model = MODELS[model_select]\n", + " m_name = MODEL_NAMES[model_select]\n", + "\n", + "\n", + " train_df['Polarity'] = train_df['Polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0})\n", + " tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)\n", + "\n", + " sentences = train_df.Text.values\n", + " labels = train_df.Polarity.values\n", + "\n", + " input_ids = []\n", + " attention_masks = []\n", + "\n", + " for sent in sentences:\n", + " encoded_dict = tokenizer.encode_plus(\n", + " str(sent),\n", + " add_special_tokens=True,\n", + " max_length=MAX_LEN,\n", + " padding='max_length',\n", + " return_attention_mask=True,\n", + " return_tensors='pt',\n", + " truncation=True\n", + " )\n", + " input_ids.append(encoded_dict['input_ids'])\n", + " attention_masks.append(encoded_dict['attention_mask'])\n", + "\n", + " input_ids = torch.cat(input_ids, dim=0)\n", + " attention_masks = torch.cat(attention_masks, dim=0)\n", + " labels = torch.tensor(labels)\n", + "\n", + " print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}')\n", + "\n", + "\n", + " train_inputs, val_inputs, train_labels, val_labels = train_test_split(\n", + " input_ids, labels, test_size=0.1, random_state=42)\n", + " train_masks, val_masks, _, _ = train_test_split(\n", + " 
attention_masks, labels, test_size=0.1, random_state=42)\n", + "\n", + "\n", + " train_data = TensorDataset(train_inputs, train_masks, train_labels)\n", + " train_sampler = RandomSampler(train_data)\n", + " train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)\n", + "\n", + " val_data = TensorDataset(val_inputs, val_masks, val_labels)\n", + " val_sampler = SequentialSampler(val_data)\n", + " val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)\n", + "\n", + "\n", + " model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)\n", + " model.to(device)\n", + "\n", + "\n", + " optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n", + "\n", + "\n", + " num_training_steps = EPOCHS * len(train_dataloader)\n", + " lr_scheduler = get_scheduler(\n", + " name=\"linear\", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps\n", + " )\n", + "\n", + "\n", + " print(\"Starting training...\")\n", + " best_f1 = 0\n", + " for epoch in range(EPOCHS):\n", + " model.train()\n", + " total_loss = 0\n", + " predictions, true_labels = [], []\n", + "\n", + " for batch in train_dataloader:\n", + " b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]\n", + " optimizer.zero_grad()\n", + " outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)\n", + " loss, logits = outputs[:2]\n", + " loss.backward()\n", + " optimizer.step()\n", + " lr_scheduler.step()\n", + "\n", + " total_loss += loss.item()\n", + " predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())\n", + " true_labels.extend(b_labels.cpu().numpy())\n", + "\n", + " train_acc = accuracy_score(true_labels, predictions)\n", + " print(f\"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}\")\n", + "\n", + "\n", + " model.eval()\n", + " val_predictions, val_labels = [], []\n", + " with torch.no_grad():\n", + " for batch in val_dataloader:\n", + " b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]\n", + " outputs = model(b_input_ids, attention_mask=b_input_mask)\n", + " logits = outputs[0]\n", + " val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())\n", + " val_labels.extend(b_labels.cpu().numpy())\n", + "\n", + " val_acc = accuracy_score(val_labels, val_predictions)\n", + " val_f1 = f1_score(val_labels, val_predictions, average='weighted')\n", + " print(f\"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}\")\n", + "\n", + "\n", + " if val_f1 > best_f1:\n", + " best_f1 = val_f1\n", + " torch.save(model.state_dict(), model_save_path)\n", + " print(f\"Best model saved at {model_save_path}\")\n", + "\n", + "\n", + " print(\"Final Model Performance on Validation Set:\")\n", + " print(classification_report(val_labels, val_predictions, digits=4))\n", + " return model_save_path\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Sf7jGxHYhNIb" + }, + "outputs": [], + "source": [ + "\n", + "def test_model(test_df, model_saved_path, model_select=0):\n", + "\n", + " MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),\n", + " (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),\n", + " (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'),\n", + " (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1')\n", + " ]\n", + " MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", + " seed_torch(42)\n", + "\n", + " 
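One maintenance note on `train_model`: `AdamW` here comes from `transformers`, and the FutureWarning in the training logs further down flags it as deprecated. A sketch of the drop-in replacement the warning suggests, keeping the same hyperparameters (assuming `model`, `LEARNING_RATE`, and `WEIGHT_DECAY` as defined above):

```python
import torch

# torch.optim.AdamW takes weight_decay directly, so this is a one-line swap
# for the deprecated transformers.AdamW inside train_model.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE,
                              weight_decay=WEIGHT_DECAY)
```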
cur_model=MODELS[model_select]\n", + " m_name=MODEL_NAMES[model_select]\n", + "\n", + " tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)\n", + "\n", + " begin=time.time()\n", + "\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + " 'positive':1,\n", + " 'negative':2,\n", + " 'neutral':0})\n", + "\n", + "\n", + " sentences = test_df.Text.values\n", + " labels = test_df.Polarity.values\n", + "\n", + " input_ids = []\n", + " attention_masks = []\n", + "\n", + " for sent in sentences:\n", + " encoded_dict = tokenizer.encode_plus(\n", + " str(sent),\n", + " add_special_tokens = True,\n", + " max_length = MAX_LEN,\n", + " pad_to_max_length = True,\n", + " return_attention_mask = True,\n", + " return_tensors = 'pt',\n", + " )\n", + "\n", + " input_ids.append(encoded_dict['input_ids'])\n", + " attention_masks.append(encoded_dict['attention_mask'])\n", + "\n", + " prediction_inputs = torch.cat(input_ids,dim=0)\n", + " prediction_masks = torch.cat(attention_masks,dim=0)\n", + " prediction_labels = torch.tensor(labels)\n", + "\n", + " prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)\n", + " prediction_sampler = SequentialSampler(prediction_data)\n", + " prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)\n", + "\n", + " model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + " model.cuda()\n", + " model.eval()\n", + "\n", + " predictions,true_labels=[],[]\n", + "\n", + " for batch in prediction_dataloader:\n", + " batch = tuple(t.to(device) for t in batch)\n", + " b_input_ids, b_input_mask, b_labels = batch\n", + "\n", + " with torch.no_grad():\n", + " outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)\n", + " logits = outputs[0]\n", + "\n", + " logits = logits.detach().cpu().numpy()\n", + " label_ids = b_labels.to('cpu').numpy()\n", + "\n", + " predictions.append(logits)\n", + " true_labels.append(label_ids)\n", + "\n", + " end=time.time()\n", + " print('Prediction used {:.2f} seconds'.format(end - begin))\n", + "\n", + " flat_predictions = [item for sublist in predictions for item in sublist]\n", + " flat_predictions = np.argmax(flat_predictions, axis=1).flatten()\n", + " flat_true_labels = [item for sublist in true_labels for item in sublist]\n", + "\n", + " print(\"Accuracy of {} is: {}\".format(m_name, accuracy_score(flat_true_labels,flat_predictions)))\n", + "\n", + " print(classification_report(flat_true_labels,flat_predictions))\n", + "\n", + "\n", + " df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity'])\n", + "\n", + " df_combined = pd.concat([test_df, df_prediction], axis=1)\n", + "\n", + " counts = df_combined['prediction_Polarity'].value_counts()\n", + " print(counts)\n", + "\n", + " return df_combined" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dw2VuMkVAfec" + }, + "source": [ + "## Train" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IyDoiOFoMP3W" + }, + "source": [ + "### Dataset Preparation and Splitting\n", + "\n", + "In this section, we prepare the datasets for training and testing.\n", + "\n", + "- **`crossplatform_sf_dataset_tokenized.csv`**: This is the main dataset used in this study.\n", + "- **`so-dataset_tokenized.csv`**: This dataset originates from the research paper *Sentiment Polarity Detection for Software Development*.\n", + "- **`gh-dataset_tokenized.csv`**: This dataset is 
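`test_model` scores an entire DataFrame; for spot-checking a single comment against a saved checkpoint, a minimal helper along the same lines can be useful. This is a sketch, not part of the original notebook; `predict_one` is a hypothetical name, and it assumes `MODELS`, `MAX_LEN`, and `device` from the cells above:

```python
def predict_one(text, model_saved_path, model_select=0):
    """Classify one string with a saved checkpoint; returns 0/1/2."""
    model_cls, tok_cls, pretrained = MODELS[model_select]
    tokenizer = tok_cls.from_pretrained(pretrained, do_lower_case=True)
    enc = tokenizer.encode_plus(text, add_special_tokens=True,
                                max_length=MAX_LEN, padding='max_length',
                                truncation=True, return_tensors='pt')
    model = model_cls.from_pretrained(pretrained, num_labels=3)
    model.load_state_dict(torch.load(model_saved_path, map_location=device))
    model.to(device)
    model.eval()
    with torch.no_grad():
        logits = model(enc['input_ids'].to(device),
                       attention_mask=enc['attention_mask'].to(device))[0]
    return int(torch.argmax(logits, dim=1))

# e.g. predict_one("Thanks, this fix works great!",
#                  "/content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model")
```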
derived from the research paper *GitHub Golden Rule* (*Can We Use SE-specific Sentiment Analysis Tools in a Cross-Platform Setting?*).\n", + "\n", + "### Steps:\n", + "\n", + "1. **Load Datasets** \n", + " We read the three datasets into Pandas DataFrames.\n", + "\n", + "2. **Split into Training and Testing Sets** \n", + " - The **GitHub dataset (`df_gh`)** and **Stack Overflow dataset (`df_so`)** are each split into 70% training and 30% testing subsets. \n", + " - Similarly, the **cross-platform dataset (`df_crossplatform`)** is divided into a 70% training set and a 30% testing set. \n", + " - The splitting is performed using `train_test_split` with a `random_state` of 42 for reproducibility.\n", + "\n", + "3. **Save Processed Data** \n", + " - The training and testing subsets are saved as CSV files for further use.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fo57JVQT9GIq" + }, + "outputs": [], + "source": [ + "\n", + "# Read datasets\n", + "input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset_tokenized.csv'\n", + "so_input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset_tokenized.csv'\n", + "gh_input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset_tokenized.csv'\n", + "\n", + "# Load datasets into Pandas DataFrames\n", + "df_crossplatform = pd.read_csv(input_path)\n", + "df_so = pd.read_csv(so_input_path)\n", + "df_gh = pd.read_csv(gh_input_path)\n", + "\n", + "# Split `df_crossplatform` into training (70%) and testing (30%) sets\n", + "train_df, test_df = train_test_split(df_crossplatform, test_size=0.3, random_state=42)\n", + "\n", + "# Split GitHub and Stack Overflow datasets into training and testing sets (70% train, 30% test)\n", + "train_gh, test_gh = train_test_split(df_gh, test_size=0.3, random_state=42)\n", + "train_so, test_so = train_test_split(df_so, test_size=0.3, random_state=42)\n", + "\n", + "# Save all datasets to CSV files for further use\n", + "\n", + "train_df.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_df.csv', index=False)\n", + "test_df.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_df.csv', index=False)\n", + "\n", + "train_gh.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_gh.csv', index=False)\n", + "train_so.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_so.csv', index=False)\n", + "test_gh.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_gh.csv', index=False)\n", + "test_so.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_so.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RogFCndjOGet" + }, + "source": [ + "### Train for Cross-Platform Dataset\n", + "We combine three training datasets (`train_df(ours)`, `train_gh`, and `train_so`) into a final dataset for training four different models.\n", + "\n", + "## Training Models \n", + "The following models are trained: \n", + "- **BERT** \n", + "- **XLNet** \n", + "- **RoBERTa** \n", + "- **ALBERT** \n", + "\n", + "## Training Parameters \n", + "- **MAX_LEN**: `256` \n", + "- **BATCH_SIZE**: `16` \n", + "- **LEARNING_RATE**: `2e-5` \n", + "- **EPOCHS**: `4` \n", + "\n", + "Each model is trained using the merged dataset and saved for further evaluation.\n" + ] + }, + { + "cell_type": 
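A caveat on the splits above: `train_test_split` is called without `stratify`, so the label (and platform) proportions in the 70/30 subsets can drift from the full dataset's. If balanced splits are wanted, a sketch of the stratified variant with the same seed:

```python
from sklearn.model_selection import train_test_split

# Keep the Polarity proportions identical in the train and test subsets.
train_df, test_df = train_test_split(
    df_crossplatform, test_size=0.3, random_state=42,
    stratify=df_crossplatform["Polarity"])
```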
"code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "slhvZg0Ow3ae", + "outputId": "a950593a-2b5c-49a1-823b-40845fbb77cb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training bert model...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n", + "Starting training...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1: Train Loss: 0.5891, Accuracy: 0.7577\n", + "Validation Accuracy: 0.8673, F1 Score: 0.8675\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", + "Epoch 2: Train Loss: 0.2148, Accuracy: 0.9312\n", + "Validation Accuracy: 0.8771, F1 Score: 0.8772\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", + "Epoch 3: Train Loss: 0.1029, Accuracy: 0.9683\n", + "Validation Accuracy: 0.8845, F1 Score: 0.8841\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", + "Epoch 4: Train Loss: 0.0526, Accuracy: 0.9869\n", + "Validation Accuracy: 0.9017, F1 Score: 0.9015\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.9294 0.8587 0.8927 184\n", + " 1 0.8739 0.9369 0.9043 111\n", + " 2 0.8898 0.9375 0.9130 112\n", + "\n", + " accuracy 0.9017 407\n", + " macro avg 0.8977 0.9110 0.9033 407\n", + "weighted avg 0.9034 0.9017 0.9015 407\n", + "\n", + "Training xlnet model...\n", + "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting training...\n", + "Epoch 1: Train Loss: 0.6808, Accuracy: 0.6927\n", + "Validation Accuracy: 0.8059, F1 Score: 0.8043\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", + "Epoch 2: Train Loss: 0.3285, Accuracy: 0.8839\n", + "Validation Accuracy: 0.8280, F1 Score: 0.8275\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", + "Epoch 3: Train Loss: 0.1917, Accuracy: 0.9355\n", + "Validation Accuracy: 0.8550, F1 Score: 0.8547\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", + "Epoch 4: Train Loss: 0.1199, Accuracy: 0.9604\n", + "Validation Accuracy: 0.8575, F1 Score: 0.8573\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.9241 0.7935 0.8538 184\n", + " 1 0.8306 0.9279 0.8766 111\n", + " 2 0.8000 0.8929 0.8439 112\n", + "\n", + " accuracy 0.8575 407\n", + " macro avg 0.8516 0.8714 0.8581 407\n", + "weighted avg 0.8644 0.8575 0.8573 407\n", + "\n", + "Training Roberta model...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n", + "Starting training...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1: Train Loss: 0.6244, Accuracy: 0.7228\n", + "Validation Accuracy: 0.8698, F1 Score: 0.8697\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", + "Epoch 2: Train Loss: 0.2585, Accuracy: 0.9123\n", + "Validation Accuracy: 0.8747, F1 Score: 0.8745\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", + "Epoch 3: Train Loss: 0.1612, Accuracy: 0.9506\n", + "Validation Accuracy: 0.8968, F1 Score: 0.8968\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", + "Epoch 4: Train Loss: 0.0974, Accuracy: 0.9727\n", + "Validation Accuracy: 0.8968, F1 Score: 0.8969\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.9302 0.8696 0.8989 184\n", + " 1 0.8803 0.9279 0.9035 111\n", + " 2 0.8644 0.9107 0.8870 112\n", + "\n", + " accuracy 0.8968 407\n", + " macro avg 0.8917 0.9027 0.8964 407\n", + "weighted avg 0.8985 0.8968 0.8969 407\n", + "\n", + "Training albert model...\n", + "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting training...\n", + "Epoch 1: Train Loss: 0.6810, Accuracy: 0.7110\n", + "Validation Accuracy: 0.8305, F1 Score: 0.8332\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/albert_model\n", + "Epoch 2: Train Loss: 0.3244, Accuracy: 0.8913\n", + "Validation Accuracy: 0.8550, F1 Score: 0.8550\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/albert_model\n", + "Epoch 3: Train Loss: 0.2071, Accuracy: 0.9306\n", + "Validation Accuracy: 0.8550, F1 Score: 0.8548\n", + "Epoch 4: Train Loss: 0.1405, Accuracy: 0.9566\n", + "Validation Accuracy: 0.8526, F1 Score: 0.8524\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.8715 0.8478 0.8595 184\n", + " 1 0.8547 0.9009 0.8772 111\n", + " 2 0.8198 0.8125 0.8161 112\n", + "\n", + " accuracy 0.8526 407\n", + " macro avg 0.8487 0.8537 0.8509 407\n", + "weighted avg 0.8527 0.8526 0.8524 407\n", + "\n" + ] + } + ], + "source": [ + "# Combine `train_df`, `train_gh`, and `train_so` into the final training dataset\n", + "train_df_final = pd.concat([train_df, train_gh, train_so], axis=0, ignore_index=True)\n", + "train_df_final.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_df_final.csv', index=False)\n", + "\n", + "# Define the list of model names to be trained\n", + "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", + "\n", + "# Train each model and save the trained model files\n", + "for i, model_name in enumerate(MODEL_NAMES):\n", + " model_save_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{model_name}_model\"\n", + " print(f\"Training {model_name} model...\")\n", + " train_model(train_df_final, model_save_path, model_select=i)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "krgDqVN9x_my" + }, + "source": [ + "### Train for existing dataset on Bert\n", + "In this section, we train the **BERT** model using the existing **GitHub** and **Stack Overflow** datasets.\n", + "\n", + "## Training Data \n", + "The training dataset consists of: \n", + "- **GitHub Data** (`train_gh`) \n", + "- **Stack Overflow Data** (`train_so`) \n", + "\n", + "## Training Parameters \n", + "- **MAX_LEN**: `256` \n", + "- **BATCH_SIZE**: `16` \n", + "- **LEARNING_RATE**: `2e-5` \n", + "- **EPOCHS**: `4` \n", + "\n", + "The trained **BERT** model will be saved for further evaluation." 
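The SO training subset is small (a few hundred rows) and heavily skewed toward neutral, so a classifier can minimize loss by predicting the majority class for everything; the SO validation report below shows exactly that collapse. A mitigation the notebook does not use, sketched here under the assumption that `train_labels`, `device`, and the training loop from `train_model` are in scope, is an inverse-frequency class-weighted loss:

```python
import numpy as np
import torch

# Inverse-frequency weights over the three labels (0/1/2).
counts = np.maximum(np.bincount(train_labels.numpy(), minlength=3), 1)
weights = torch.tensor(len(train_labels) / (3 * counts),
                       dtype=torch.float).to(device)
loss_fn = torch.nn.CrossEntropyLoss(weight=weights)

# Inside the training loop, compute the loss from raw logits instead of
# taking the model's built-in (unweighted) loss:
#   logits = model(b_input_ids, attention_mask=b_input_mask).logits
#   loss = loss_fn(logits, b_labels)
```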
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sxHJGW6Lbgnt", + "outputId": "8d60c4d7-c1b2-4dee-dc01-a7069146a577" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training bert model on SO dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training data shape: torch.Size([315, 256]), torch.Size([315, 256]), torch.Size([315])\n", + "Starting training...\n", + "Epoch 1: Train Loss: 0.7896, Accuracy: 0.7032\n", + "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/SO_bert_model\n", + "Epoch 2: Train Loss: 0.5530, Accuracy: 0.8163\n", + "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", + "Epoch 3: Train Loss: 0.4831, Accuracy: 0.8163\n", + "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", + "Epoch 4: Train Loss: 0.4383, Accuracy: 0.8163\n", + "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.8438 1.0000 0.9153 27\n", + " 1 0.0000 0.0000 0.0000 4\n", + " 2 0.0000 0.0000 0.0000 1\n", + "\n", + " accuracy 0.8438 32\n", + " macro avg 0.2812 0.3333 0.3051 32\n", + "weighted avg 0.7119 0.8438 0.7722 32\n", + "\n", + "Training bert model on GH dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training data shape: torch.Size([1495, 256]), torch.Size([1495, 256]), torch.Size([1495])\n", + "Starting training...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1: Train Loss: 0.8448, Accuracy: 0.6141\n", + "Validation Accuracy: 0.7933, F1 Score: 0.7983\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", + "Epoch 2: Train Loss: 0.3207, Accuracy: 0.8900\n", + "Validation Accuracy: 0.8733, F1 Score: 0.8741\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", + "Epoch 3: Train Loss: 0.1529, Accuracy: 0.9532\n", + "Validation Accuracy: 0.8867, F1 Score: 0.8851\n", + "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", + "Epoch 4: Train Loss: 0.0991, Accuracy: 0.9762\n", + "Validation Accuracy: 0.8800, F1 Score: 0.8783\n", + "Final Model Performance on Validation Set:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.8868 0.8246 0.8545 57\n", + " 1 0.8846 1.0000 0.9388 46\n", + " 2 0.8667 0.8298 0.8478 47\n", + "\n", + " accuracy 0.8800 150\n", + " macro avg 0.8794 0.8848 0.8804 150\n", + "weighted avg 0.8798 0.8800 0.8783 150\n", + "\n" + ] + } + ], + "source": [ + "# Train the model for github-golden-rule and stackoverflow on Bert\n", + "\n", + "MODEL_NAMES = ['bert']\n", + "for dataset_name, train_df in [('SO', train_so), ('GH', train_gh)]:\n", + " for i, model_name in enumerate(MODEL_NAMES):\n", + " model_save_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{dataset_name}_{model_name}_model\"\n", + " print(f\"Training {model_name} model on {dataset_name} dataset...\")\n", + " train_model(train_df, model_save_path, model_select=i)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ugVi3T4wAkSQ" + }, + "source": [ + "## Test" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XMgVVJ9U9XBa" + }, + "source": [ + "### Test df_crossplatform on 4 models and 3 platforms (Table 3.2)\n", + "In this section, we evaluate the four trained **cross-platform sentiment classification models** on our **cross-platform sentiment dataset**.\n", + "\n", + "## Evaluation Metrics \n", + "We will assess: \n", + "1. **Overall model performance** across all platforms. \n", + "2. **Platform-specific performance** for each model on: \n", + " - **GitHub** \n", + " - **Jira** \n", + " - **Mailbox** \n", + "\n", + "## Results \n", + "The evaluation will print: \n", + "- **Overall accuracy** of each model. 
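The evaluation log that follows was produced by a driver cell whose source is not included in this excerpt. A plausible reconstruction, assuming `test_model`, the held-out `test_df`, the saved model paths from the training step, and the platform codes documented earlier (0 = GitHub, 1 = Jira, 2 = Mailbox):

```python
PLATFORM_NAMES = {0: 'GitHub', 1: 'Jira', 2: 'Mailbox'}
MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']
BASE = '/content/drive/MyDrive/Software_Development_Sentiment_Classification'

for i, m_name in enumerate(MODEL_NAMES):
    model_path = f"{BASE}/{m_name}_model"
    print(f"Evaluating {m_name} model...for overall platform")
    test_model(test_df, model_path, model_select=i)
    for code, platform in PLATFORM_NAMES.items():
        print(f"Evaluating {m_name} model...for {platform} platform")
        test_model(test_df[test_df['Platform'] == code], model_path, model_select=i)
```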
\n", + "- **Performance breakdown per platform** for each model. \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RSJI327nEKAz", + "outputId": "264447bc-b509-4e8a-e5ce-49cbed8a44a4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating bert model...for overall platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 9.38 seconds\n", + "Accuracy of bert is: 0.9422084623323014\n", + " precision recall f1-score support\n", + "\n", + " 0 0.95 0.91 0.93 329\n", + " 1 0.94 0.97 0.95 318\n", + " 2 0.94 0.95 0.94 322\n", + "\n", + " accuracy 0.94 969\n", + " macro avg 0.94 0.94 0.94 969\n", + "weighted avg 0.94 0.94 0.94 969\n", + "\n", + "prediction_Polarity\n", + "2 327\n", + "1 326\n", + "0 316\n", + "Name: count, dtype: int64\n", + "Evaluating bert model...for GitHub platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 3.93 seconds\n", + "Accuracy of bert is: 0.956140350877193\n", + " precision recall f1-score support\n", + "\n", + " 0 0.96 0.94 0.95 128\n", + " 1 0.97 0.96 0.96 122\n", + " 2 0.94 0.98 0.96 92\n", + "\n", + " accuracy 0.96 342\n", + " macro avg 0.95 0.96 0.96 342\n", + "weighted avg 0.96 0.96 0.96 342\n", + "\n", + "prediction_Polarity\n", + "0.0 125\n", + "1.0 121\n", + "2.0 96\n", + "Name: count, dtype: int64\n", + "Evaluating bert model...for Jira platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 4.05 seconds\n", + "Accuracy of bert is: 0.926829268292683\n", + " precision recall f1-score support\n", + "\n", + " 0 0.94 0.88 0.91 93\n", + " 1 0.92 0.94 0.93 86\n", + " 2 0.92 0.95 0.94 108\n", + "\n", + " accuracy 0.93 287\n", + " macro avg 0.93 0.93 0.93 287\n", + "weighted avg 0.93 0.93 0.93 287\n", + "\n", + "prediction_Polarity\n", + "2.0 112\n", + "1.0 88\n", + "0.0 87\n", + "Name: count, dtype: int64\n", + "Evaluating bert model...for Mailbox platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 5.19 seconds\n", + "Accuracy of bert is: 0.9411764705882353\n", + " precision recall f1-score support\n", + "\n", + " 0 0.94 0.91 0.92 108\n", + " 1 0.93 0.99 0.96 110\n", + " 2 0.95 0.93 0.94 122\n", + "\n", + " accuracy 0.94 340\n", + " macro avg 0.94 0.94 0.94 340\n", + "weighted avg 0.94 0.94 0.94 340\n", + "\n", + "prediction_Polarity\n", + "2.0 119\n", + "1.0 117\n", + "0.0 104\n", + "Name: count, dtype: int64\n", + "Evaluating xlnet model...for overall platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 14.16 seconds\n", + "Accuracy of xlnet is: 0.8823529411764706\n", + " precision recall f1-score support\n", + "\n", + " 0 0.89 0.82 0.85 329\n", + " 1 0.88 0.95 0.91 318\n", + " 2 0.88 0.88 0.88 322\n", + "\n", + " accuracy 0.88 969\n", + " macro avg 0.88 0.88 0.88 969\n", + "weighted avg 0.88 0.88 0.88 969\n", + "\n", + "prediction_Polarity\n", + "1 343\n", + "2 320\n", + "0 306\n", + "Name: count, dtype: int64\n", + "Evaluating xlnet model...for GitHub platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 7.19 seconds\n", + "Accuracy of xlnet is: 0.9005847953216374\n", + " precision recall f1-score support\n", + "\n", + " 0 0.90 0.85 0.88 128\n", + " 1 0.90 0.96 0.93 122\n", + " 2 0.90 0.89 0.90 92\n", + "\n", + " accuracy 0.90 342\n", + " macro avg 0.90 0.90 0.90 342\n", + "weighted avg 0.90 0.90 0.90 342\n", + "\n", + "prediction_Polarity\n", + "1.0 130\n", + "0.0 121\n", + "2.0 91\n", + "Name: count, dtype: int64\n", + "Evaluating xlnet model...for Jira platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 5.72 seconds\n", + "Accuracy of xlnet is: 0.9024390243902439\n", + " precision recall f1-score support\n", + "\n", + " 0 0.91 0.81 0.86 93\n", + " 1 0.91 0.95 0.93 86\n", + " 2 0.89 0.94 0.91 108\n", + "\n", + " accuracy 0.90 287\n", + " macro avg 0.90 0.90 0.90 287\n", + "weighted avg 0.90 0.90 0.90 287\n", + "\n", + "prediction_Polarity\n", + "2.0 115\n", + "1.0 90\n", + "0.0 82\n", + "Name: count, dtype: int64\n", + "Evaluating xlnet model...for Mailbox platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 6.06 seconds\n", + "Accuracy of xlnet is: 0.8470588235294118\n", + " precision recall f1-score support\n", + "\n", + " 0 0.84 0.81 0.82 108\n", + " 1 0.83 0.93 0.88 110\n", + " 2 0.87 0.81 0.84 122\n", + "\n", + " accuracy 0.85 340\n", + " macro avg 0.85 0.85 0.85 340\n", + "weighted avg 0.85 0.85 0.85 340\n", + "\n", + "prediction_Polarity\n", + "1.0 123\n", + "2.0 114\n", + "0.0 103\n", + "Name: count, dtype: int64\n", + "Evaluating Roberta model...for overall platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 9.61 seconds\n", + "Accuracy of Roberta is: 0.9060887512899897\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.83 0.87 329\n", + " 1 0.91 0.96 0.93 318\n", + " 2 0.88 0.94 0.91 322\n", + "\n", + " accuracy 0.91 969\n", + " macro avg 0.91 0.91 0.91 969\n", + "weighted avg 0.91 0.91 0.91 969\n", + "\n", + "prediction_Polarity\n", + "2 342\n", + "1 333\n", + "0 294\n", + "Name: count, dtype: int64\n", + "Evaluating Roberta model...for GitHub platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 4.66 seconds\n", + "Accuracy of Roberta is: 0.9298245614035088\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.88 0.90 128\n", + " 1 0.95 0.98 0.96 122\n", + " 2 0.91 0.93 0.92 92\n", + "\n", + " accuracy 0.93 342\n", + " macro avg 0.93 0.93 0.93 342\n", + "weighted avg 0.93 0.93 0.93 342\n", + "\n", + "prediction_Polarity\n", + "1.0 125\n", + "0.0 122\n", + "2.0 95\n", + "Name: count, dtype: int64\n", + "Evaluating Roberta model...for Jira platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 4.47 seconds\n", + "Accuracy of Roberta is: 0.9163763066202091\n", + " precision recall f1-score support\n", + "\n", + " 0 0.95 0.81 0.87 93\n", + " 1 0.91 0.95 0.93 86\n", + " 2 0.90 0.98 0.94 108\n", + "\n", + " accuracy 0.92 287\n", + " macro avg 0.92 0.91 0.91 287\n", + "weighted avg 0.92 0.92 0.91 287\n", + "\n", + "prediction_Polarity\n", + "2.0 118\n", + "1.0 90\n", + "0.0 79\n", + "Name: count, dtype: int64\n", + "Evaluating Roberta model...for Mailbox platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 5.03 seconds\n", + "Accuracy of Roberta is: 0.8735294117647059\n", + " precision recall f1-score support\n", + "\n", + " 0 0.90 0.78 0.84 108\n", + " 1 0.87 0.94 0.90 110\n", + " 2 0.85 0.90 0.88 122\n", + "\n", + " accuracy 0.87 340\n", + " macro avg 0.88 0.87 0.87 340\n", + "weighted avg 0.88 0.87 0.87 340\n", + "\n", + "prediction_Polarity\n", + "2.0 129\n", + "1.0 118\n", + "0.0 93\n", + "Name: count, dtype: int64\n", + "Evaluating albert model...for overall platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 8.62 seconds\n", + "Accuracy of albert is: 0.8844169246646026\n", + " precision recall f1-score support\n", + "\n", + " 0 0.90 0.79 0.84 329\n", + " 1 0.91 0.95 0.93 318\n", + " 2 0.85 0.92 0.88 322\n", + "\n", + " accuracy 0.88 969\n", + " macro avg 0.89 0.89 0.88 969\n", + "weighted avg 0.89 0.88 0.88 969\n", + "\n", + "prediction_Polarity\n", + "2 350\n", + "1 331\n", + "0 288\n", + "Name: count, dtype: int64\n", + "Evaluating albert model...for GitHub platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 3.99 seconds\n", + "Accuracy of albert is: 0.9269005847953217\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.88 0.90 128\n", + " 1 0.97 0.96 0.96 122\n", + " 2 0.87 0.96 0.91 92\n", + "\n", + " accuracy 0.93 342\n", + " macro avg 0.92 0.93 0.93 342\n", + "weighted avg 0.93 0.93 0.93 342\n", + "\n", + "prediction_Polarity\n", + "1.0 121\n", + "0.0 120\n", + "2.0 101\n", + "Name: count, dtype: int64\n", + "Evaluating albert model...for Jira platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 3.30 seconds\n", + "Accuracy of albert is: 0.89198606271777\n", + " precision recall f1-score support\n", + "\n", + " 0 0.95 0.75 0.84 93\n", + " 1 0.91 0.94 0.93 86\n", + " 2 0.85 0.97 0.91 108\n", + "\n", + " accuracy 0.89 287\n", + " macro avg 0.90 0.89 0.89 287\n", + "weighted avg 0.90 0.89 0.89 287\n", + "\n", + "prediction_Polarity\n", + "2.0 124\n", + "1.0 89\n", + "0.0 74\n", + "Name: count, dtype: int64\n", + "Evaluating albert model...for Mailbox platform\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " test_df['Polarity']=test_df['Polarity'].replace({\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n",
+ " model.load_state_dict(torch.load(model_saved_path))\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Prediction used 4.12 seconds\n",
+ "Accuracy of albert is: 0.8352941176470589\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.83 0.72 0.77 108\n",
+ " 1 0.85 0.94 0.89 110\n",
+ " 2 0.82 0.84 0.83 122\n",
+ "\n",
+ " accuracy 0.84 340\n",
+ " macro avg 0.84 0.83 0.83 340\n",
+ "weighted avg 0.83 0.84 0.83 340\n",
+ "\n",
+ "prediction_Polarity\n",
+ "2.0 125\n",
+ "1.0 121\n",
+ "0.0 94\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "# Load test dataset\n",
+ "test_df = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_df.csv')\n",
+ "\n",
+ "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n",
+ "model_results = {}\n",
+ "\n",
+ "# Define platform mapping\n",
+ "platforms = {0: \"GitHub\", 1: \"Jira\", 2: \"Mailbox\"}\n",
+ "\n", 
+ "# Evaluate each model\n",
+ "for i, model_name in enumerate(MODEL_NAMES):\n",
+ " model_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{model_name}_model\"\n",
+ " print(f\"Evaluating {model_name} model...for overall platform\")\n",
+ "\n",
+ " # Get overall accuracy\n",
+ " overall_accuracy = test_model(test_df, model_path, model_select=i)\n",
+ "\n",
+ " # Evaluate accuracy per platform\n",
+ " platform_accuracies = {}\n",
+ " for platform_id, platform_name in platforms.items():\n",
+ " test_df_platform = test_df[test_df[\"Platform\"] == platform_id]\n",
+ " if not test_df_platform.empty:\n",
+ " print(f\"Evaluating {model_name} model...for {platform_name} platform\")\n",
+ " accuracy = test_model(test_df_platform, model_path, model_select=i)\n",
+ " platform_accuracies[platform_name] = accuracy\n",
+ " else:\n",
+ " platform_accuracies[platform_name] = \"No data\"\n",
+ "\n",
+ " # Record this model's overall and per-platform accuracies\n",
+ " model_results[model_name] = {\"Overall\": overall_accuracy, **platform_accuracies}\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2rTxz8aIRkYQ"
+ },
+ "source": [
+ "### Generalization Performance of the Model (Table 3.3)\n",
+ "In this section, we evaluate the **Bert-CP** model's **generalization performance** on two existing datasets: \n",
+ "- **GitHub Golden Rule Dataset** \n",
+ "- **Stack Overflow Dataset** \n",
+ "\n",
+ "We also compare **BERT** models trained separately on the **GitHub Golden Rule** and **Stack Overflow** datasets, with a focus on **cross-platform performance**. This comparison tests whether the cross-platform model generalizes better than the dataset-specific ones.\n",
+ "\n",
+ "#### Evaluation Process \n",
+ "- **Bert-CP Model Evaluation**: We test the **Bert-CP** model on the **GitHub Golden Rule** and **Stack Overflow** datasets.\n",
+ "- **Cross-Platform Comparison**: Using the **BERT** architecture, we compare a model trained on the **GitHub Golden Rule** dataset with one trained on the **Stack Overflow** dataset, each evaluated on the other platform's test set.\n",
+ "\n",
+ "#### Goals \n",
+ "- To assess how well the **Bert-CP** model **generalizes** across datasets.\n",
+ "- To determine whether the cross-platform model outperforms dataset-specific models."
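The loop above now keeps each model's scores in `model_results`, but the notebook never tabulates them. Here is a minimal sketch of how the per-platform accuracies could be summarized into a single table. The numbers are transcribed from the evaluation logs above, rounded to four decimals (bert's overall accuracy is reported earlier in the notebook, so only the fully logged models are shown), and only `pandas`, already imported in the first cell, is assumed:

```python
import pandas as pd

# Accuracies transcribed from the evaluation logs above (rounded to 4 dp).
model_results = {
    "xlnet":   {"Overall": 0.8824, "GitHub": 0.9006, "Jira": 0.9024, "Mailbox": 0.8471},
    "Roberta": {"Overall": 0.9061, "GitHub": 0.9298, "Jira": 0.9164, "Mailbox": 0.8735},
    "albert":  {"Overall": 0.8844, "GitHub": 0.9269, "Jira": 0.8920, "Mailbox": 0.8353},
}

# One row per model, one column per evaluation split.
summary = pd.DataFrame(model_results).T
print(summary.to_string())
```

As a side note, the stderr logs above repeatedly warn that `torch.load` defaults to `weights_only=False`; since only fine-tuned state dicts are loaded here, passing `weights_only=True` (as the warning itself recommends) should silence the warning and is the safer default, assuming the checkpoints contain plain tensor state dicts.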
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ekatnWlSAnyd", + "outputId": "7ff88fb0-109c-43b0-97ae-36a792730275" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating bert_model on GitHub test dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 7.34 seconds\n", + "Accuracy of bert is: 0.8829953198127926\n", + " precision recall f1-score support\n", + "\n", + " 0 0.91 0.86 0.88 267\n", + " 1 0.89 0.92 0.91 170\n", + " 2 0.85 0.88 0.86 204\n", + "\n", + " accuracy 0.88 641\n", + " macro avg 0.88 0.89 0.88 641\n", + "weighted avg 0.88 0.88 0.88 641\n", + "\n", + "prediction_Polarity\n", + "0 252\n", + "2 213\n", + "1 176\n", + "Name: count, dtype: int64\n", + "Evaluating bert_model on Stack Overflow test dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. 
If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 4.10 seconds\n", + "Accuracy of bert is: 0.8814814814814815\n", + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.93 0.93 110\n", + " 1 0.44 0.36 0.40 11\n", + " 2 0.81 0.93 0.87 14\n", + "\n", + " accuracy 0.88 135\n", + " macro avg 0.73 0.74 0.73 135\n", + "weighted avg 0.88 0.88 0.88 135\n", + "\n", + "prediction_Polarity\n", + "0 110\n", + "2 16\n", + "1 9\n", + "Name: count, dtype: int64\n", + "Evaluating GH_bert_model on Stack Overflow test dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 
512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 2.21 seconds\n", + "Accuracy of bert is: 0.8148148148148148\n", + " precision recall f1-score support\n", + "\n", + " 0 0.85 0.95 0.89 110\n", + " 1 0.50 0.27 0.35 11\n", + " 2 0.50 0.21 0.30 14\n", + "\n", + " accuracy 0.81 135\n", + " macro avg 0.62 0.48 0.52 135\n", + "weighted avg 0.78 0.81 0.79 135\n", + "\n", + "prediction_Polarity\n", + "0 123\n", + "2 6\n", + "1 6\n", + "Name: count, dtype: int64\n", + "Evaluating SO_bert_model on GitHub test dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). 
In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(model_saved_path))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction used 6.62 seconds\n", + "Accuracy of bert is: 0.4165366614664587\n", + " precision recall f1-score support\n", + "\n", + " 0 0.42 1.00 0.59 267\n", + " 1 0.00 0.00 0.00 170\n", + " 2 0.00 0.00 0.00 204\n", + "\n", + " accuracy 0.42 641\n", + " macro avg 0.14 0.33 0.20 641\n", + "weighted avg 0.17 0.42 0.24 641\n", + "\n", + "prediction_Polarity\n", + "0 641\n", + "Name: count, dtype: int64\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n" + ] + } + ], + "source": [ + "\n", + "\n", + "# Load test datasets\n", + "test_gh = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_gh.csv')\n", + "test_so = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_so.csv')\n", + "\n", + "# Define model paths\n", + "bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\"\n", + "gh_bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\"\n", + "so_bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/SO_bert_model\"\n", + "\n", + "# Store results\n", + "model_results = {}\n", + "\n", + "# 1. Validate bert_model on test_gh and test_so\n", + "print(\"Evaluating bert_model on GitHub test dataset...\")\n", + "bert_on_gh = test_model(test_gh, bert_model_path, model_select=0)\n", + "\n", + "print(\"Evaluating bert_model on Stack Overflow test dataset...\")\n", + "bert_on_so = test_model(test_so, bert_model_path, model_select=0)\n", + "\n", + "model_results[\"bert_model\"] = {\n", + " \"test_gh Accuracy\": bert_on_gh,\n", + " \"test_so Accuracy\": bert_on_so\n", + "}\n", + "\n", + "# 2. 
Validate GH_bert_model on test_so\n", + "print(\"Evaluating GH_bert_model on Stack Overflow test dataset...\")\n", + "gh_bert_on_so = test_model(test_so, gh_bert_model_path, model_select=0)\n", + "\n", + "model_results[\"GH_bert_model\"] = {\n", + " \"test_so Accuracy\": gh_bert_on_so\n", + "}\n", + "\n", + "# 3. Validate SO_bert_model on test_gh\n", + "print(\"Evaluating SO_bert_model on GitHub test dataset...\")\n", + "so_bert_on_gh = test_model(test_gh, so_bert_model_path, model_select=0)\n", + "\n", + "model_results[\"SO_bert_model\"] = {\n", + " \"test_gh Accuracy\": so_bert_on_gh\n", + "}\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "L4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 606a14c27082bdefe7c9b9704c3eca85e10ab44f Mon Sep 17 00:00:00 2001 From: Connor Narowetz Date: Sat, 3 May 2025 13:34:28 -1000 Subject: [PATCH 2/5] Restructured Notebook, added API folder - Added API folder - Train and Test notebook for each part of the process, to break it up --- ...Development_Sentiment_Classification.ipynb | 2174 ----------------- api/filter.py | 64 + api/test.py | 121 + api/tokenizer.py | 99 + api/train.py | 165 ++ ...Development_Sentiment_Classification.ipynb | 239 ++ notebooks/Test.ipynb | 261 ++ notebooks/Train.ipynb | 156 ++ 8 files changed, 1105 insertions(+), 2174 deletions(-) delete mode 100644 Software_Development_Sentiment_Classification.ipynb create mode 100644 api/filter.py create mode 100644 api/test.py create mode 100644 api/tokenizer.py create mode 100644 api/train.py create mode 100644 notebooks/Software_Development_Sentiment_Classification.ipynb create mode 100644 notebooks/Test.ipynb create mode 100644 notebooks/Train.ipynb diff --git a/Software_Development_Sentiment_Classification.ipynb b/Software_Development_Sentiment_Classification.ipynb deleted file mode 100644 index ff12ae7..0000000 --- a/Software_Development_Sentiment_Classification.ipynb +++ /dev/null @@ -1,2174 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oe8X-6s9btXo" - }, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "import re\n", - "import string\n", - "import random\n", - "import warnings\n", - "import argparse\n", - "import numpy as np\n", - "import pandas as pd\n", - "import torch\n", - "import time\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", - "from io import StringIO\n", - "from unicodedata import category\n", - "from bs4 import BeautifulSoup\n", - "from markdown import markdown\n", - "from google.colab import drive\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report\n", - "from torch.utils.data import DataLoader, RandomSampler, Dataset\n", - "from transformers import (\n", - " BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n", - " XLNetTokenizer, XLNetForSequenceClassification,\n", - " RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n", - " AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n", - " get_scheduler, AdamW\n", - ")\n", - "\n", - "\n", - "drive.mount('/content/drive')\n" - ] - }, - { - "cell_type": "markdown", 
- "metadata": { - "id": "RAXtSnSK4LPr" - }, - "source": [ - "# Tokenlized" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9gOgONc04PiO" - }, - "outputs": [], - "source": [ - "# Regular expression for GitHub username mentions\n", - "USERNAME_REGEX = r\"(\\s|^)@(\\S*\\s?)\"\n", - "\n", - "# Generate Unicode punctuation set\n", - "punctuation = {chr(i) for i in range(sys.maxunicode + 1) if category(chr(i)).startswith((\"P\", \"S\"))}\n", - "\n", - "# Dictionary to count token replacements\n", - "counters = {}\n", - "\n", - "def remove_punctuation(text):\n", - " \"\"\"Remove all punctuation characters from the given text.\"\"\"\n", - " return \"\".join(char for char in text if char not in punctuation)\n", - "\n", - "def clean_text(text):\n", - " \"\"\"Remove quoted text and large code blocks from GitHub issues or comments.\"\"\"\n", - " # Remove quoted text from emails/notifications\n", - " text = re.sub(r\"^(On[\\s\\S]*?notifications@github\\.com\\s*?wrote:\\s*?)?(^(\\>).*\\s)*\", '', text, flags=re.MULTILINE)\n", - "\n", - " # Remove code blocks enclosed in triple backticks\n", - " text = re.sub(r\"```[a-z]*\\n[\\s\\S]*?\\n```\", \"\", text)\n", - "\n", - " return text\n", - "\n", - "def replace_token(regex, token_name, text):\n", - " \"\"\"\n", - " Replace matched patterns in the text with the specified token.\n", - "\n", - " Args:\n", - " regex (str): The regular expression pattern to match.\n", - " token_name (str): The replacement token name.\n", - " text (str): The input text.\n", - "\n", - " Returns:\n", - " tuple: (processed_text, number_of_replacements)\n", - " \"\"\"\n", - " replaced_text, replacements = re.subn(regex, f\" {token_name} \", text, flags=re.MULTILINE)\n", - " counters[token_name] = counters.get(token_name, 0) + replacements\n", - " return replaced_text, replacements\n", - "\n", - "def tokenize_text(text):\n", - " \"\"\"\n", - " Tokenizes a given text by replacing specific elements such as emails, mentions, URLs, etc.\n", - "\n", - " Args:\n", - " text (str): The input text.\n", - "\n", - " Returns:\n", - " tuple: (processed_text, total_replacements)\n", - " \"\"\"\n", - " total_replacements = 0\n", - "\n", - " text, replacements = replace_token(r\"\\S+@\\S*\\s?\", \"MEMAIL\", text)\n", - " total_replacements += replacements\n", - "\n", - " text, replacements = replace_token(USERNAME_REGEX, \"MMENTION\", text)\n", - " total_replacements += replacements\n", - "\n", - " text, replacements = replace_token(r\"`([^`]*)`\", \"MICODE\", text)\n", - " total_replacements += replacements\n", - "\n", - " text, replacements = replace_token(r\"\\b\\d+\\.\\d+(\\.\\d+)*\\b\", \"MVERSIONNUMBER\", text)\n", - " total_replacements += replacements\n", - "\n", - " text, replacements = replace_token(r\"(\\s|^)#\\d+\", \"MISSUEMENTION\", text)\n", - " total_replacements += replacements\n", - "\n", - " text, replacements = replace_token(\n", - " r\"([a-zA-Z0-9]+):\\/\\/([\\w_-]+(?:\\.[\\w_-]+)*)[\\w.,@?^=%&:\\/~+#-]*[\\w@?^=%&\\/~+#-]\",\n", - " \"MURL\",\n", - " text,\n", - " )\n", - " total_replacements += replacements\n", - "\n", - " return text, total_replacements\n", - "\n", - "def remove_markdown_content(text):\n", - " \"\"\"\n", - " Converts Markdown content to plain text by removing all Markdown formatting.\n", - "\n", - " Args:\n", - " text (str): The input Markdown text.\n", - "\n", - " Returns:\n", - " str: Cleaned text without Markdown formatting.\n", - " \"\"\"\n", - " html = markdown(text)\n", - " return \"\".join(BeautifulSoup(html, 
\"lxml\").findAll(text=True))\n", - "\n", - "def transform_text(row):\n", - " \"\"\"\n", - " Transforms a row by cleaning and tokenizing its text content.\n", - "\n", - " Args:\n", - " row (dict): A dictionary containing a 'Text' key.\n", - "\n", - " Returns:\n", - " tuple: (processed text, number of replacements)\n", - " \"\"\"\n", - " text = row.get(\"Text\", \"\")\n", - "\n", - " if not isinstance(text, str):\n", - " warnings.warn(f\"Converting non-string type to string: {type(text)}\")\n", - " text = str(text)\n", - "\n", - " text, replaced_count = tokenize_text(text)\n", - " text = text.replace(\"\\n\", \"\")\n", - " return text, replaced_count" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uCPyaXMt4ao_" - }, - "source": [ - "##Usage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8RhSXEyO4Tnu", - "outputId": "7988bf5a-6c84-46af-f1fa-273b6b31baeb" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv\n", - " Text Polarity\n", - "0 In this situation, when I click on the greyed ... 0\n", - "1 After that a download progress status with pro... 0\n", - "2 Change the double quotationCODE_FRAGMENT to to... 0\n", - "3 E.g. I get an array of CODE_FRAGMENT objects. 0\n", - "4 Then I tried my own implementation with CODE_F... 0\n", - "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv: 450\n", - "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset_tokenized.csv\n", - "\n", - "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv\n", - " Text Polarity\n", - "0 Guess there is a typo here: `translate`.\" 0\n", - "1 @arturoc: If you multiply-include `gst.h`, wou... 0\n", - "2 Thank you Vlad. Your contribution to Mangos an... 1\n", - "3 @opdenkamp Hi Lars, I'm afraid that you forgot... 2\n", - "4 Ok, so let's be paranoid.\" 2\n", - "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv: 2136\n", - "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset_tokenized.csv\n", - "\n", - "Processing dataset: /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv\n", - " Text Platform Polarity\n", - "0 [error] testWorkflowTimeoutWhenWorkflowComplet... 0 2\n", - "1 Thanks Hunter. Sure. 0 1\n", - "2 Approved by I will merge it soon. 0 1\n", - "3 It was reported and verified that the current ... 0 2\n", - "4 Thanks for the comments. I am figuring out to ... 
- "Total replacements for /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv: 3227\n",
- "Tokenized dataset saved to: /content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset_tokenized.csv\n",
- "\n"
- ]
- }
- ],
- "source": [
- "\n",
- "\n",
- "# Define input dataset paths\n",
- "input_paths = [\n",
- " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset.csv\",\n",
- " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset.csv\",\n",
- " \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv\"\n",
- "]\n",
- "\n",
- "# Use the transform_text function defined in the tokenization cell above;\n",
- "# redefining a dummy version here would silently skip the real token replacement.\n",
- "\n",
- "# Loop through each dataset and process it\n",
- "for input_path in input_paths:\n",
- " # Generate output file name\n",
- " output_filename = os.path.splitext(os.path.basename(input_path))[0] + \"_tokenized.csv\"\n",
- " output_path = os.path.join(os.path.dirname(input_path), output_filename)\n",
- "\n",
- " # Load dataset\n",
- " df = pd.read_csv(input_path)\n",
- " print(f\"Processing dataset: {input_path}\")\n",
- " print(df.head()) # Print first few rows for verification\n",
- "\n",
- " # Apply text transformation\n",
- " df[[\"Text\", \"replaced_token\"]] = df.apply(transform_text, axis=1, result_type=\"expand\")\n",
- "\n",
- " # Report total replacements from the `replaced_token` column\n",
- " total_replacements = df[\"replaced_token\"].sum()\n",
- " print(f\"Total replacements for {input_path}: {total_replacements}\")\n",
- "\n",
- " # Save processed dataset\n",
- " df.to_csv(output_path, header=True, index=False)\n",
- "\n",
- " print(f\"Tokenized dataset saved to: {output_path}\\n\")\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "o3Cm2Gn_CbM5"
- },
- "source": [
- "# Dataset Overview: `crossplatform_sf_dataset.csv`\n",
- "\n",
- "This dataset is designed for **Software Development Sentiment Classification**, containing user comments or discussions from different platforms with sentiment labels.\n",
- "\n",
- "## **Column Descriptions**\n",
- "- **`Text`**: The user comment or discussion content. \n",
- "- **`Polarity`**: Sentiment label indicating the emotional tendency of the text: \n",
- " - `2`: Negative sentiment \n",
- " - `0`: Neutral sentiment \n",
- " - `1`: Positive sentiment \n",
- "- **`Platform`**: The source platform of the data, indicating where the comment or discussion originated: \n",
- " - `0`: **GitHub** (Discussions related to open-source projects, Issues, Pull Requests) \n",
- " - `1`: **Jira** (Bug reports, task comments in software development management tools) \n",
- " - `2`: **Mailbox** (Developer communication through emails) \n",
- "\n",
- "## **Dataset Distribution**\n",
- "The dataset consists of data from **GitHub, Jira, and Mailbox**, with different sentiment (`Polarity`) distributions across platforms. It can be used to train and evaluate sentiment classification models to analyze developer emotions on different platforms. \n"
- ]
- },
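To keep the numeric codes straight when reading results, a small sketch that maps `Polarity` and `Platform` back to names; the mapping dicts restate the description above, and the two rows are illustrative stand-ins for the real CSV:

```python
import pandas as pd

POLARITY_NAMES = {0: "neutral", 1: "positive", 2: "negative"}
PLATFORM_NAMES = {0: "GitHub", 1: "Jira", 2: "Mailbox"}

# Illustrative rows; the real data lives in crossplatform_sf_dataset.csv
df = pd.DataFrame({
    "Text": ["Thanks Hunter. Sure.", "Ok, so let's be paranoid."],
    "Platform": [0, 0],
    "Polarity": [1, 2],
})

df["Polarity_name"] = df["Polarity"].map(POLARITY_NAMES)
df["Platform_name"] = df["Platform"].map(PLATFORM_NAMES)
print(df[["Text", "Platform_name", "Polarity_name"]])
```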
\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QaHSklrRClHU", - "outputId": "b3b3c0d8-b59e-41ab-ba80-5569d1d3650e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "==================================================\n", - "📊 Dataset Information: cf-dataset.csv\n", - "==================================================\n", - "Total Samples: 3227\n", - "\n", - "📌 Polarity Distribution:\n", - "+---+----------+-------+\n", - "| | Polarity | Count |\n", - "+---+----------+-------+\n", - "| 0 | 0 | 1125 |\n", - "| 1 | 1 | 1042 |\n", - "| 2 | 2 | 1060 |\n", - "+---+----------+-------+\n", - "\n", - "\n", - "📌 Platform Distribution:\n", - "+---+----------+-------+\n", - "| | Platform | Count |\n", - "+---+----------+-------+\n", - "| 0 | 0 | 1079 |\n", - "| 1 | 1 | 1054 |\n", - "| 2 | 2 | 1094 |\n", - "+---+----------+-------+\n", - "\n", - "\n", - "📌 Platform-wise Polarity Distribution:\n", - "+----------+-----+-----+-----+\n", - "| Platform | 0 | 1 | 2 |\n", - "+----------+-----+-----+-----+\n", - "| 0 | 392 | 361 | 326 |\n", - "| 1 | 367 | 316 | 371 |\n", - "| 2 | 366 | 365 | 363 |\n", - "+----------+-----+-----+-----+\n", - "==================================================\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from tabulate import tabulate\n", - "\n", - "# Load dataset\n", - "input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset.csv'\n", - "df = pd.read_csv(input_path)\n", - "\n", - "# Compute dataset statistics\n", - "total_samples = len(df)\n", - "polarity_counts = df[\"Polarity\"].value_counts().sort_index()\n", - "platform_counts = df[\"Platform\"].value_counts().sort_index()\n", - "\n", - "# Compute Polarity distribution within each Platform\n", - "platform_polarity_counts = df.groupby([\"Platform\", \"Polarity\"]).size().unstack().fillna(0)\n", - "\n", - "# Print results with formatting\n", - "print(\"=\" * 50)\n", - "print(f\"📊 Dataset Information: cf-dataset.csv\")\n", - "print(\"=\" * 50)\n", - "print(f\"Total Samples: {total_samples}\\n\")\n", - "\n", - "# Polarity distribution\n", - "print(\"📌 Polarity Distribution:\")\n", - "print(tabulate(polarity_counts.reset_index(), headers=[\"Polarity\", \"Count\"], tablefmt=\"pretty\"))\n", - "print(\"\\n\")\n", - "\n", - "# Platform distribution\n", - "print(\"📌 Platform Distribution:\")\n", - "print(tabulate(platform_counts.reset_index(), headers=[\"Platform\", \"Count\"], tablefmt=\"pretty\"))\n", - "print(\"\\n\")\n", - "\n", - "# Platform-wise Polarity distribution\n", - "print(\"📌 Platform-wise Polarity Distribution:\")\n", - "print(tabulate(platform_polarity_counts, headers=\"keys\", tablefmt=\"pretty\"))\n", - "print(\"=\" * 50)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9kxjNt7QqLlh" - }, - "source": [ - "# Sentiment Classification" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "0AiOM3jWKL4P", - "outputId": "bbb7a3d9-e9d7-4378-b8f8-ab68ed4670ec" - }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'NVIDIA L4'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def seed_torch(seed):\n", - " random.seed(seed)\n", - " np.random.seed(seed)\n", - 
" torch.manual_seed(seed)\n", - " torch.cuda.manual_seed(seed)\n", - " torch.backends.cudnn.deterministic=True\n", - "\n", - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "n_gpu = torch.cuda.device_count()\n", - "torch.cuda.get_device_name(0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gQGXycAgbchc" - }, - "outputs": [], - "source": [ - "# Train\n", - "MAX_LEN = 256\n", - "BATCH_SIZE = 16\n", - "LEARNING_RATE = 2e-5\n", - "EPOCHS = 4\n", - "WEIGHT_DECAY = 0.01\n", - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "\n", - "\n", - "MODELS = [\n", - " (BertForSequenceClassification, BertTokenizer, 'bert-base-cased'),\n", - " (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased'),\n", - " (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'),\n", - " (AlbertForSequenceClassification, AlbertTokenizer, 'albert-base-v1')\n", - "]\n", - "MODEL_NAMES = ['bert', 'xlnet', 'roberta', 'albert']\n", - "\n", - "def train_model(train_df, model_save_path, model_select=0):\n", - " seed_torch(42)\n", - "\n", - " cur_model = MODELS[model_select]\n", - " m_name = MODEL_NAMES[model_select]\n", - "\n", - "\n", - " train_df['Polarity'] = train_df['Polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0})\n", - " tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)\n", - "\n", - " sentences = train_df.Text.values\n", - " labels = train_df.Polarity.values\n", - "\n", - " input_ids = []\n", - " attention_masks = []\n", - "\n", - " for sent in sentences:\n", - " encoded_dict = tokenizer.encode_plus(\n", - " str(sent),\n", - " add_special_tokens=True,\n", - " max_length=MAX_LEN,\n", - " padding='max_length',\n", - " return_attention_mask=True,\n", - " return_tensors='pt',\n", - " truncation=True\n", - " )\n", - " input_ids.append(encoded_dict['input_ids'])\n", - " attention_masks.append(encoded_dict['attention_mask'])\n", - "\n", - " input_ids = torch.cat(input_ids, dim=0)\n", - " attention_masks = torch.cat(attention_masks, dim=0)\n", - " labels = torch.tensor(labels)\n", - "\n", - " print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}')\n", - "\n", - "\n", - " train_inputs, val_inputs, train_labels, val_labels = train_test_split(\n", - " input_ids, labels, test_size=0.1, random_state=42)\n", - " train_masks, val_masks, _, _ = train_test_split(\n", - " attention_masks, labels, test_size=0.1, random_state=42)\n", - "\n", - "\n", - " train_data = TensorDataset(train_inputs, train_masks, train_labels)\n", - " train_sampler = RandomSampler(train_data)\n", - " train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)\n", - "\n", - " val_data = TensorDataset(val_inputs, val_masks, val_labels)\n", - " val_sampler = SequentialSampler(val_data)\n", - " val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)\n", - "\n", - "\n", - " model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)\n", - " model.to(device)\n", - "\n", - "\n", - " optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n", - "\n", - "\n", - " num_training_steps = EPOCHS * len(train_dataloader)\n", - " lr_scheduler = get_scheduler(\n", - " name=\"linear\", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps\n", - " )\n", - "\n", - "\n", - " print(\"Starting training...\")\n", - " best_f1 = 0\n", - " for epoch in range(EPOCHS):\n", - " 
model.train()\n", - " total_loss = 0\n", - " predictions, true_labels = [], []\n", - "\n", - " for batch in train_dataloader:\n", - " b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]\n", - " optimizer.zero_grad()\n", - " outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)\n", - " loss, logits = outputs[:2]\n", - " loss.backward()\n", - " optimizer.step()\n", - " lr_scheduler.step()\n", - "\n", - " total_loss += loss.item()\n", - " predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())\n", - " true_labels.extend(b_labels.cpu().numpy())\n", - "\n", - " train_acc = accuracy_score(true_labels, predictions)\n", - " print(f\"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}\")\n", - "\n", - "\n", - " model.eval()\n", - " val_predictions, val_labels = [], []\n", - " with torch.no_grad():\n", - " for batch in val_dataloader:\n", - " b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]\n", - " outputs = model(b_input_ids, attention_mask=b_input_mask)\n", - " logits = outputs[0]\n", - " val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())\n", - " val_labels.extend(b_labels.cpu().numpy())\n", - "\n", - " val_acc = accuracy_score(val_labels, val_predictions)\n", - " val_f1 = f1_score(val_labels, val_predictions, average='weighted')\n", - " print(f\"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}\")\n", - "\n", - "\n", - " if val_f1 > best_f1:\n", - " best_f1 = val_f1\n", - " torch.save(model.state_dict(), model_save_path)\n", - " print(f\"Best model saved at {model_save_path}\")\n", - "\n", - "\n", - " print(\"Final Model Performance on Validation Set:\")\n", - " print(classification_report(val_labels, val_predictions, digits=4))\n", - " return model_save_path\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Sf7jGxHYhNIb" - }, - "outputs": [], - "source": [ - "\n", - "def test_model(test_df, model_saved_path, model_select=0):\n", - "\n", - " MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),\n", - " (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),\n", - " (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'),\n", - " (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1')\n", - " ]\n", - " MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", - " seed_torch(42)\n", - "\n", - " cur_model=MODELS[model_select]\n", - " m_name=MODEL_NAMES[model_select]\n", - "\n", - " tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)\n", - "\n", - " begin=time.time()\n", - "\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - " 'positive':1,\n", - " 'negative':2,\n", - " 'neutral':0})\n", - "\n", - "\n", - " sentences = test_df.Text.values\n", - " labels = test_df.Polarity.values\n", - "\n", - " input_ids = []\n", - " attention_masks = []\n", - "\n", - " for sent in sentences:\n", - " encoded_dict = tokenizer.encode_plus(\n", - " str(sent),\n", - " add_special_tokens = True,\n", - " max_length = MAX_LEN,\n", - " pad_to_max_length = True,\n", - " return_attention_mask = True,\n", - " return_tensors = 'pt',\n", - " )\n", - "\n", - " input_ids.append(encoded_dict['input_ids'])\n", - " attention_masks.append(encoded_dict['attention_mask'])\n", - "\n", - " prediction_inputs = torch.cat(input_ids,dim=0)\n", - " prediction_masks = torch.cat(attention_masks,dim=0)\n", - " prediction_labels = torch.tensor(labels)\n", - "\n", - " prediction_data = 
TensorDataset(prediction_inputs, prediction_masks, prediction_labels)\n",
- " prediction_sampler = SequentialSampler(prediction_data)\n",
- " prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)\n",
- "\n",
- " model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)\n",
- " model.load_state_dict(torch.load(model_saved_path))\n",
- " model.to(device)\n",
- " model.eval()\n",
- "\n",
- " predictions,true_labels=[],[]\n",
- "\n",
- " for batch in prediction_dataloader:\n",
- " batch = tuple(t.to(device) for t in batch)\n",
- " b_input_ids, b_input_mask, b_labels = batch\n",
- "\n",
- " with torch.no_grad():\n",
- " outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)\n",
- " logits = outputs[0]\n",
- "\n",
- " logits = logits.detach().cpu().numpy()\n",
- " label_ids = b_labels.to('cpu').numpy()\n",
- "\n",
- " predictions.append(logits)\n",
- " true_labels.append(label_ids)\n",
- "\n",
- " end=time.time()\n",
- " print('Prediction used {:.2f} seconds'.format(end - begin))\n",
- "\n",
- " flat_predictions = [item for sublist in predictions for item in sublist]\n",
- " flat_predictions = np.argmax(flat_predictions, axis=1).flatten()\n",
- " flat_true_labels = [item for sublist in true_labels for item in sublist]\n",
- "\n",
- " print(\"Accuracy of {} is: {}\".format(m_name, accuracy_score(flat_true_labels,flat_predictions)))\n",
- "\n",
- " print(classification_report(flat_true_labels,flat_predictions))\n",
- "\n",
- "\n",
- " df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity'])\n",
- "\n",
- " # Reset the shuffled split index so predictions align row-for-row with test_df;\n",
- " # otherwise pd.concat aligns on the old index and produces NaN/float predictions\n",
- " df_combined = pd.concat([test_df.reset_index(drop=True), df_prediction], axis=1)\n",
- "\n",
- " counts = df_combined['prediction_Polarity'].value_counts()\n",
- " print(counts)\n",
- "\n",
- " return df_combined"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "dw2VuMkVAfec"
- },
- "source": [
- "## Train"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "IyDoiOFoMP3W"
- },
- "source": [
- "### Dataset Preparation and Splitting\n",
- "\n",
- "In this section, we prepare the datasets for training and testing.\n",
- "\n",
- "- **`crossplatform_sf_dataset_tokenized.csv`**: This is the main dataset used in this study.\n",
- "- **`so-dataset_tokenized.csv`**: This dataset originates from the research paper *Sentiment Polarity Detection for Software Development*.\n",
- "- **`gh-dataset_tokenized.csv`**: This dataset is derived from the research paper *GitHub Golden Rule* (*Can We Use SE-specific Sentiment Analysis Tools in a Cross-Platform Setting?*).\n",
- "\n",
- "### Steps:\n",
- "\n",
- "1. **Load Datasets** \n",
- " We read the three datasets into Pandas DataFrames.\n",
- "\n",
- "2. **Split into Training and Testing Sets** \n",
- " - The **GitHub dataset (`df_gh`)** and **Stack Overflow dataset (`df_so`)** are each split into 70% training and 30% testing subsets. \n",
- " - Similarly, the **cross-platform dataset (`df_crossplatform`)** is divided into a 70% training set and a 30% testing set. \n",
- " - The splitting is performed using `train_test_split` with a `random_state` of 42 for reproducibility (an unstratified split; see the sketch following this cell). \n",
- "\n",
- "3. **Save Processed Data** \n",
- " - The training and testing subsets are saved as CSV files for further use.\n",
- "\n"
- ]
- },
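The splits described above are random but unstratified, so class proportions can drift in small subsets; the SO validation support of 27/4/1 further down is what that drift looks like. A variant sketch, not what this notebook runs, that preserves the `Polarity` distribution via `stratify`:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Illustrative frame; in the notebook this would be df_so, df_gh, or df_crossplatform
df = pd.DataFrame({
    "Text": [f"comment {i}" for i in range(10)],
    "Polarity": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2],
})

# stratify keeps the train/test label ratios close to the full dataset's
train_part, test_part = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df["Polarity"]
)
print(train_part["Polarity"].value_counts().to_dict())
print(test_part["Polarity"].value_counts().to_dict())
```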
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "fo57JVQT9GIq"
- },
- "outputs": [],
- "source": [
- "\n",
- "# Read datasets\n",
- "input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/crossplatform_sf_dataset_tokenized.csv'\n",
- "so_input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/so-dataset_tokenized.csv'\n",
- "gh_input_path = '/content/drive/MyDrive/Software_Development_Sentiment_Classification/gh-dataset_tokenized.csv'\n",
- "\n",
- "# Load datasets into Pandas DataFrames\n",
- "df_crossplatform = pd.read_csv(input_path)\n",
- "df_so = pd.read_csv(so_input_path)\n",
- "df_gh = pd.read_csv(gh_input_path)\n",
- "\n",
- "# Split `df_crossplatform` into training (70%) and testing (30%) sets\n",
- "train_df, test_df = train_test_split(df_crossplatform, test_size=0.3, random_state=42)\n",
- "\n",
- "# Split GitHub and Stack Overflow datasets into training and testing sets (70% train, 30% test)\n",
- "train_gh, test_gh = train_test_split(df_gh, test_size=0.3, random_state=42)\n",
- "train_so, test_so = train_test_split(df_so, test_size=0.3, random_state=42)\n",
- "\n",
- "# Save all datasets to CSV files for further use\n",
- "\n",
- "train_df.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_df.csv', index=False)\n",
- "test_df.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_df.csv', index=False)\n",
- "\n",
- "train_gh.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_gh.csv', index=False)\n",
- "train_so.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_so.csv', index=False)\n",
- "test_gh.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_gh.csv', index=False)\n",
- "test_so.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_so.csv', index=False)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "RogFCndjOGet"
- },
- "source": [
- "### Train on the Cross-Platform Dataset\n",
- "We combine three training datasets (`train_df` (ours), `train_gh`, and `train_so`) into a final dataset for training four different models.\n",
- "\n",
- "## Training Models \n",
- "The following models are trained: \n",
- "- **BERT** \n",
- "- **XLNet** \n",
- "- **RoBERTa** \n",
- "- **ALBERT** \n",
- "\n",
- "## Training Parameters \n",
- "- **MAX_LEN**: `256` \n",
- "- **BATCH_SIZE**: `16` \n",
- "- **LEARNING_RATE**: `2e-5` \n",
- "- **EPOCHS**: `4` \n",
- "\n",
- "Each model is trained using the merged dataset and saved for further evaluation.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "slhvZg0Ow3ae",
- "outputId": "a950593a-2b5c-49a1-823b-40845fbb77cb"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Training bert model...\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": 
"stream", - "text": [ - "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n", - "Starting training...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1: Train Loss: 0.5891, Accuracy: 0.7577\n", - "Validation Accuracy: 0.8673, F1 Score: 0.8675\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", - "Epoch 2: Train Loss: 0.2148, Accuracy: 0.9312\n", - "Validation Accuracy: 0.8771, F1 Score: 0.8772\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", - "Epoch 3: Train Loss: 0.1029, Accuracy: 0.9683\n", - "Validation Accuracy: 0.8845, F1 Score: 0.8841\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", - "Epoch 4: Train Loss: 0.0526, Accuracy: 0.9869\n", - "Validation Accuracy: 0.9017, F1 Score: 0.9015\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\n", - "Final Model Performance on Validation Set:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.9294 0.8587 0.8927 184\n", - " 1 0.8739 0.9369 0.9043 111\n", - " 2 0.8898 0.9375 0.9130 112\n", - "\n", - " accuracy 0.9017 407\n", - " macro avg 0.8977 0.9110 0.9033 407\n", - "weighted avg 0.9034 0.9017 0.9015 407\n", - "\n", - "Training xlnet model...\n", - "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting training...\n", - "Epoch 1: Train Loss: 0.6808, Accuracy: 0.6927\n", - "Validation Accuracy: 0.8059, F1 Score: 0.8043\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", - "Epoch 2: Train Loss: 0.3285, Accuracy: 0.8839\n", - "Validation Accuracy: 0.8280, F1 Score: 0.8275\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", - "Epoch 3: Train Loss: 0.1917, Accuracy: 0.9355\n", - "Validation Accuracy: 0.8550, F1 Score: 0.8547\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", - "Epoch 4: Train Loss: 0.1199, Accuracy: 0.9604\n", - "Validation Accuracy: 0.8575, F1 Score: 0.8573\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/xlnet_model\n", - "Final Model Performance on Validation Set:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.9241 0.7935 0.8538 184\n", - " 1 0.8306 0.9279 0.8766 111\n", - " 2 0.8000 0.8929 0.8439 112\n", - "\n", - " accuracy 0.8575 407\n", - " macro avg 0.8516 0.8714 0.8581 407\n", - "weighted avg 0.8644 0.8575 0.8573 407\n", - "\n", - "Training Roberta model...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n", - "Starting training...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1: Train Loss: 0.6244, Accuracy: 0.7228\n", - "Validation Accuracy: 0.8698, F1 Score: 0.8697\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", - "Epoch 2: Train Loss: 0.2585, Accuracy: 0.9123\n", - "Validation Accuracy: 0.8747, F1 Score: 0.8745\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", - "Epoch 3: Train Loss: 0.1612, Accuracy: 0.9506\n", - "Validation Accuracy: 0.8968, F1 Score: 0.8968\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", - "Epoch 4: Train Loss: 0.0974, Accuracy: 0.9727\n", - "Validation Accuracy: 0.8968, F1 Score: 0.8969\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/Roberta_model\n", - "Final Model Performance on Validation Set:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.9302 0.8696 0.8989 184\n", - " 1 0.8803 0.9279 0.9035 111\n", - " 2 0.8644 0.9107 0.8870 112\n", - "\n", - " accuracy 0.8968 407\n", - " macro avg 0.8917 0.9027 0.8964 407\n", - "weighted avg 0.8985 0.8968 0.8969 407\n", - "\n", - "Training albert model...\n", - "Training data shape: torch.Size([4068, 256]), torch.Size([4068, 256]), torch.Size([4068])\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
- " warnings.warn(\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Starting training...\n",
- "Epoch 1: Train Loss: 0.6810, Accuracy: 0.7110\n",
- "Validation Accuracy: 0.8305, F1 Score: 0.8332\n",
- "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/albert_model\n",
- "Epoch 2: Train Loss: 0.3244, Accuracy: 0.8913\n",
- "Validation Accuracy: 0.8550, F1 Score: 0.8550\n",
- "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/albert_model\n",
- "Epoch 3: Train Loss: 0.2071, Accuracy: 0.9306\n",
- "Validation Accuracy: 0.8550, F1 Score: 0.8548\n",
- "Epoch 4: Train Loss: 0.1405, Accuracy: 0.9566\n",
- "Validation Accuracy: 0.8526, F1 Score: 0.8524\n",
- "Final Model Performance on Validation Set:\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.8715 0.8478 0.8595 184\n",
- " 1 0.8547 0.9009 0.8772 111\n",
- " 2 0.8198 0.8125 0.8161 112\n",
- "\n",
- " accuracy 0.8526 407\n",
- " macro avg 0.8487 0.8537 0.8509 407\n",
- "weighted avg 0.8527 0.8526 0.8524 407\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Combine `train_df`, `train_gh`, and `train_so` into the final training dataset\n",
- "train_df_final = pd.concat([train_df, train_gh, train_so], axis=0, ignore_index=True)\n",
- "train_df_final.to_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/train_df_final.csv', index=False)\n",
- "\n",
- "# Define the list of model names to be trained\n",
- "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n",
- "\n",
- "# Train each model and save the trained model files\n",
- "for i, model_name in enumerate(MODEL_NAMES):\n",
- " model_save_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{model_name}_model\"\n",
- " print(f\"Training {model_name} model...\")\n",
- " train_model(train_df_final, model_save_path, model_select=i)\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "krgDqVN9x_my"
- },
- "source": [
- "### Train BERT on the Existing Datasets\n",
- "In this section, we train the **BERT** model using the existing **GitHub** and **Stack Overflow** datasets.\n",
- "\n",
- "## Training Data \n",
- "The training dataset consists of: \n",
- "- **GitHub Data** (`train_gh`) \n",
- "- **Stack Overflow Data** (`train_so`) \n",
- "\n",
- "## Training Parameters \n",
- "- **MAX_LEN**: `256` \n",
- "- **BATCH_SIZE**: `16` \n",
- "- **LEARNING_RATE**: `2e-5` \n",
- "- **EPOCHS**: `4` \n",
- "\n",
- "The trained **BERT** model will be saved for further evaluation."
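Label balance matters here: the SO validation report below shows support of 27/4/1 and an F1 frozen at 0.7722 across all four epochs, the signature of a model that only predicts the majority (neutral) class. A quick inspection sketch, assuming the split CSVs written by the dataset-preparation cell earlier:

```python
import pandas as pd

BASE = "/content/drive/MyDrive/Software_Development_Sentiment_Classification"

# These files were saved by the splitting cell above
for name in ("train_so", "train_gh"):
    df = pd.read_csv(f"{BASE}/{name}.csv")
    # Per-class counts expose skew toward one label before any training happens
    print(name, df["Polarity"].value_counts().sort_index().to_dict())
```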
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "sxHJGW6Lbgnt", - "outputId": "8d60c4d7-c1b2-4dee-dc01-a7069146a577" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training bert model on SO dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training data shape: torch.Size([315, 256]), torch.Size([315, 256]), torch.Size([315])\n", - "Starting training...\n", - "Epoch 1: Train Loss: 0.7896, Accuracy: 0.7032\n", - "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/SO_bert_model\n", - "Epoch 2: Train Loss: 0.5530, Accuracy: 0.8163\n", - "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", - "Epoch 3: Train Loss: 0.4831, Accuracy: 0.8163\n", - "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", - "Epoch 4: Train Loss: 0.4383, Accuracy: 0.8163\n", - "Validation Accuracy: 0.8438, F1 Score: 0.7722\n", - "Final Model Performance on Validation Set:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.8438 1.0000 0.9153 27\n", - " 1 0.0000 0.0000 0.0000 4\n", - " 2 0.0000 0.0000 0.0000 1\n", - "\n", - " accuracy 0.8438 32\n", - " macro avg 0.2812 0.3333 0.3051 32\n", - "weighted avg 0.7119 0.8438 0.7722 32\n", - "\n", - "Training bert model on GH dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training data shape: torch.Size([1495, 256]), torch.Size([1495, 256]), torch.Size([1495])\n", - "Starting training...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1: Train Loss: 0.8448, Accuracy: 0.6141\n", - "Validation Accuracy: 0.7933, F1 Score: 0.7983\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", - "Epoch 2: Train Loss: 0.3207, Accuracy: 0.8900\n", - "Validation Accuracy: 0.8733, F1 Score: 0.8741\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", - "Epoch 3: Train Loss: 0.1529, Accuracy: 0.9532\n", - "Validation Accuracy: 0.8867, F1 Score: 0.8851\n", - "Best model saved at /content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\n", - "Epoch 4: Train Loss: 0.0991, Accuracy: 0.9762\n", - "Validation Accuracy: 0.8800, F1 Score: 0.8783\n", - "Final Model Performance on Validation Set:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.8868 0.8246 0.8545 57\n", - " 1 0.8846 1.0000 0.9388 46\n", - " 2 0.8667 0.8298 0.8478 47\n", - "\n", - " accuracy 0.8800 150\n", - " macro avg 0.8794 0.8848 0.8804 150\n", - "weighted avg 0.8798 0.8800 0.8783 150\n", - "\n" - ] - } - ], - "source": [ - "# Train the model for github-golden-rule and stackoverflow on Bert\n", - "\n", - "MODEL_NAMES = ['bert']\n", - "for dataset_name, train_df in [('SO', train_so), ('GH', train_gh)]:\n", - " for i, model_name in enumerate(MODEL_NAMES):\n", - " model_save_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{dataset_name}_{model_name}_model\"\n", - " print(f\"Training {model_name} model on {dataset_name} dataset...\")\n", - " train_model(train_df, model_save_path, model_select=i)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ugVi3T4wAkSQ" - }, - "source": [ - "## Test" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XMgVVJ9U9XBa" - }, - "source": [ - "### Test df_crossplatform on 4 models and 3 platforms (Table 3.2)\n", - "In this section, we evaluate the four trained **cross-platform sentiment classification models** on our **cross-platform sentiment dataset**.\n", - "\n", - "## Evaluation Metrics \n", - "We will assess: \n", - "1. **Overall model performance** across all platforms. \n", - "2. **Platform-specific performance** for each model on: \n", - " - **GitHub** \n", - " - **Jira** \n", - " - **Mailbox** \n", - "\n", - "## Results \n", - "The evaluation will print: \n", - "- **Overall accuracy** of each model. 
\n", - "- **Performance breakdown per platform** for each model. \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "RSJI327nEKAz", - "outputId": "264447bc-b509-4e8a-e5ce-49cbed8a44a4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluating bert model...for overall platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 9.38 seconds\n", - "Accuracy of bert is: 0.9422084623323014\n", - " precision recall f1-score support\n", - "\n", - " 0 0.95 0.91 0.93 329\n", - " 1 0.94 0.97 0.95 318\n", - " 2 0.94 0.95 0.94 322\n", - "\n", - " accuracy 0.94 969\n", - " macro avg 0.94 0.94 0.94 969\n", - "weighted avg 0.94 0.94 0.94 969\n", - "\n", - "prediction_Polarity\n", - "2 327\n", - "1 326\n", - "0 316\n", - "Name: count, dtype: int64\n", - "Evaluating bert model...for GitHub platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 3.93 seconds\n", - "Accuracy of bert is: 0.956140350877193\n", - " precision recall f1-score support\n", - "\n", - " 0 0.96 0.94 0.95 128\n", - " 1 0.97 0.96 0.96 122\n", - " 2 0.94 0.98 0.96 92\n", - "\n", - " accuracy 0.96 342\n", - " macro avg 0.95 0.96 0.96 342\n", - "weighted avg 0.96 0.96 0.96 342\n", - "\n", - "prediction_Polarity\n", - "0.0 125\n", - "1.0 121\n", - "2.0 96\n", - "Name: count, dtype: int64\n", - "Evaluating bert model...for Jira platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 4.05 seconds\n", - "Accuracy of bert is: 0.926829268292683\n", - " precision recall f1-score support\n", - "\n", - " 0 0.94 0.88 0.91 93\n", - " 1 0.92 0.94 0.93 86\n", - " 2 0.92 0.95 0.94 108\n", - "\n", - " accuracy 0.93 287\n", - " macro avg 0.93 0.93 0.93 287\n", - "weighted avg 0.93 0.93 0.93 287\n", - "\n", - "prediction_Polarity\n", - "2.0 112\n", - "1.0 88\n", - "0.0 87\n", - "Name: count, dtype: int64\n", - "Evaluating bert model...for Mailbox platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 5.19 seconds\n", - "Accuracy of bert is: 0.9411764705882353\n", - " precision recall f1-score support\n", - "\n", - " 0 0.94 0.91 0.92 108\n", - " 1 0.93 0.99 0.96 110\n", - " 2 0.95 0.93 0.94 122\n", - "\n", - " accuracy 0.94 340\n", - " macro avg 0.94 0.94 0.94 340\n", - "weighted avg 0.94 0.94 0.94 340\n", - "\n", - "prediction_Polarity\n", - "2.0 119\n", - "1.0 117\n", - "0.0 104\n", - "Name: count, dtype: int64\n", - "Evaluating xlnet model...for overall platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 14.16 seconds\n", - "Accuracy of xlnet is: 0.8823529411764706\n", - " precision recall f1-score support\n", - "\n", - " 0 0.89 0.82 0.85 329\n", - " 1 0.88 0.95 0.91 318\n", - " 2 0.88 0.88 0.88 322\n", - "\n", - " accuracy 0.88 969\n", - " macro avg 0.88 0.88 0.88 969\n", - "weighted avg 0.88 0.88 0.88 969\n", - "\n", - "prediction_Polarity\n", - "1 343\n", - "2 320\n", - "0 306\n", - "Name: count, dtype: int64\n", - "Evaluating xlnet model...for GitHub platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 7.19 seconds\n", - "Accuracy of xlnet is: 0.9005847953216374\n", - " precision recall f1-score support\n", - "\n", - " 0 0.90 0.85 0.88 128\n", - " 1 0.90 0.96 0.93 122\n", - " 2 0.90 0.89 0.90 92\n", - "\n", - " accuracy 0.90 342\n", - " macro avg 0.90 0.90 0.90 342\n", - "weighted avg 0.90 0.90 0.90 342\n", - "\n", - "prediction_Polarity\n", - "1.0 130\n", - "0.0 121\n", - "2.0 91\n", - "Name: count, dtype: int64\n", - "Evaluating xlnet model...for Jira platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 5.72 seconds\n", - "Accuracy of xlnet is: 0.9024390243902439\n", - " precision recall f1-score support\n", - "\n", - " 0 0.91 0.81 0.86 93\n", - " 1 0.91 0.95 0.93 86\n", - " 2 0.89 0.94 0.91 108\n", - "\n", - " accuracy 0.90 287\n", - " macro avg 0.90 0.90 0.90 287\n", - "weighted avg 0.90 0.90 0.90 287\n", - "\n", - "prediction_Polarity\n", - "2.0 115\n", - "1.0 90\n", - "0.0 82\n", - "Name: count, dtype: int64\n", - "Evaluating xlnet model...for Mailbox platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 6.06 seconds\n", - "Accuracy of xlnet is: 0.8470588235294118\n", - " precision recall f1-score support\n", - "\n", - " 0 0.84 0.81 0.82 108\n", - " 1 0.83 0.93 0.88 110\n", - " 2 0.87 0.81 0.84 122\n", - "\n", - " accuracy 0.85 340\n", - " macro avg 0.85 0.85 0.85 340\n", - "weighted avg 0.85 0.85 0.85 340\n", - "\n", - "prediction_Polarity\n", - "1.0 123\n", - "2.0 114\n", - "0.0 103\n", - "Name: count, dtype: int64\n", - "Evaluating Roberta model...for overall platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 9.61 seconds\n", - "Accuracy of Roberta is: 0.9060887512899897\n", - " precision recall f1-score support\n", - "\n", - " 0 0.93 0.83 0.87 329\n", - " 1 0.91 0.96 0.93 318\n", - " 2 0.88 0.94 0.91 322\n", - "\n", - " accuracy 0.91 969\n", - " macro avg 0.91 0.91 0.91 969\n", - "weighted avg 0.91 0.91 0.91 969\n", - "\n", - "prediction_Polarity\n", - "2 342\n", - "1 333\n", - "0 294\n", - "Name: count, dtype: int64\n", - "Evaluating Roberta model...for GitHub platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 4.66 seconds\n", - "Accuracy of Roberta is: 0.9298245614035088\n", - " precision recall f1-score support\n", - "\n", - " 0 0.93 0.88 0.90 128\n", - " 1 0.95 0.98 0.96 122\n", - " 2 0.91 0.93 0.92 92\n", - "\n", - " accuracy 0.93 342\n", - " macro avg 0.93 0.93 0.93 342\n", - "weighted avg 0.93 0.93 0.93 342\n", - "\n", - "prediction_Polarity\n", - "1.0 125\n", - "0.0 122\n", - "2.0 95\n", - "Name: count, dtype: int64\n", - "Evaluating Roberta model...for Jira platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 4.47 seconds\n", - "Accuracy of Roberta is: 0.9163763066202091\n", - " precision recall f1-score support\n", - "\n", - " 0 0.95 0.81 0.87 93\n", - " 1 0.91 0.95 0.93 86\n", - " 2 0.90 0.98 0.94 108\n", - "\n", - " accuracy 0.92 287\n", - " macro avg 0.92 0.91 0.91 287\n", - "weighted avg 0.92 0.92 0.91 287\n", - "\n", - "prediction_Polarity\n", - "2.0 118\n", - "1.0 90\n", - "0.0 79\n", - "Name: count, dtype: int64\n", - "Evaluating Roberta model...for Mailbox platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 5.03 seconds\n", - "Accuracy of Roberta is: 0.8735294117647059\n", - " precision recall f1-score support\n", - "\n", - " 0 0.90 0.78 0.84 108\n", - " 1 0.87 0.94 0.90 110\n", - " 2 0.85 0.90 0.88 122\n", - "\n", - " accuracy 0.87 340\n", - " macro avg 0.88 0.87 0.87 340\n", - "weighted avg 0.88 0.87 0.87 340\n", - "\n", - "prediction_Polarity\n", - "2.0 129\n", - "1.0 118\n", - "0.0 93\n", - "Name: count, dtype: int64\n", - "Evaluating albert model...for overall platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 8.62 seconds\n", - "Accuracy of albert is: 0.8844169246646026\n", - " precision recall f1-score support\n", - "\n", - " 0 0.90 0.79 0.84 329\n", - " 1 0.91 0.95 0.93 318\n", - " 2 0.85 0.92 0.88 322\n", - "\n", - " accuracy 0.88 969\n", - " macro avg 0.89 0.89 0.88 969\n", - "weighted avg 0.89 0.88 0.88 969\n", - "\n", - "prediction_Polarity\n", - "2 350\n", - "1 331\n", - "0 288\n", - "Name: count, dtype: int64\n", - "Evaluating albert model...for GitHub platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 3.99 seconds\n", - "Accuracy of albert is: 0.9269005847953217\n", - " precision recall f1-score support\n", - "\n", - " 0 0.93 0.88 0.90 128\n", - " 1 0.97 0.96 0.96 122\n", - " 2 0.87 0.96 0.91 92\n", - "\n", - " accuracy 0.93 342\n", - " macro avg 0.92 0.93 0.93 342\n", - "weighted avg 0.93 0.93 0.93 342\n", - "\n", - "prediction_Polarity\n", - "1.0 121\n", - "0.0 120\n", - "2.0 101\n", - "Name: count, dtype: int64\n", - "Evaluating albert model...for Jira platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 3.30 seconds\n", - "Accuracy of albert is: 0.89198606271777\n", - " precision recall f1-score support\n", - "\n", - " 0 0.95 0.75 0.84 93\n", - " 1 0.91 0.94 0.93 86\n", - " 2 0.85 0.97 0.91 108\n", - "\n", - " accuracy 0.89 287\n", - " macro avg 0.90 0.89 0.89 287\n", - "weighted avg 0.90 0.89 0.89 287\n", - "\n", - "prediction_Polarity\n", - "2.0 124\n", - "1.0 89\n", - "0.0 74\n", - "Name: count, dtype: int64\n", - "Evaluating albert model...for Mailbox platform\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " test_df['Polarity']=test_df['Polarity'].replace({\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 4.12 seconds\n", - "Accuracy of albert is: 0.8352941176470589\n", - " precision recall f1-score support\n", - "\n", - " 0 0.83 0.72 0.77 108\n", - " 1 0.85 0.94 0.89 110\n", - " 2 0.82 0.84 0.83 122\n", - "\n", - " accuracy 0.84 340\n", - " macro avg 0.84 0.83 0.83 340\n", - "weighted avg 0.83 0.84 0.83 340\n", - "\n", - "prediction_Polarity\n", - "2.0 125\n", - "1.0 121\n", - "0.0 94\n", - "Name: count, dtype: int64\n" - ] - } - ], - "source": [ - "\n", - "# Load test dataset\n", - "test_df = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_df.csv')\n", - "\n", - "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", - "model_results = {}\n", - "\n", - "# Define platform mapping\n", - "platforms = {0: \"GitHub\", 1: \"Jira\", 2: \"Mailbox\"}\n", - "\n", - "# Evaluate each model\n", - "for i, model_name in enumerate(MODEL_NAMES):\n", - " model_path = f\"/content/drive/MyDrive/Software_Development_Sentiment_Classification/{model_name}_model\"\n", - " print(f\"Evaluating {model_name} model...for overall platform\")\n", - "\n", - " # Get overall accuracy\n", - " overall_accuracy = test_model(test_df, model_path, model_select=i)\n", - "\n", - " # Evaluate accuracy per platform\n", - " platform_accuracies = {}\n", - " for platform_id, platform_name in platforms.items():\n", - " test_df_platform = test_df[test_df[\"Platform\"] == platform_id]\n", - " if not test_df_platform.empty:\n", - " print(f\"Evaluating {model_name} model...for {platform_name} platform\")\n", - " accuracy = test_model(test_df_platform, model_path, model_select=i)\n", - " platform_accuracies[platform_name] = accuracy\n", - " else:\n", - " platform_accuracies[platform_name] = \"No data\"\n", - "\n", - " # Record overall and per-platform accuracy for this model\n", - " model_results[model_name] = {\"Overall\": overall_accuracy, **platform_accuracies}\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2rTxz8aIRkYQ" - }, - "source": [ - "### Generalization Performance of the Model (Table 3.3)\n", - "In this section, we evaluate the generalization performance of the **Bert-CP** model on two existing datasets:\n", - "- **GitHub Golden Rule Dataset**\n", - "- **Stack Overflow Dataset**\n", - "\n", - "We also compare BERT models trained only on the GitHub Golden Rule dataset or only on the Stack Overflow dataset, focusing on how each performs on the other platform. This comparison tests whether the cross-platform model generalizes better than dataset-specific models.\n", - "\n", - "#### Evaluation Process\n", - "- **Bert-CP model evaluation**: test the Bert-CP model on the GitHub Golden Rule and Stack Overflow test sets.\n", - "- **Cross-platform comparison**: evaluate the GitHub-only and Stack Overflow-only BERT models on each other's test sets.\n", - "\n", - "#### Goals\n", - "- Assess how well the Bert-CP model generalizes across datasets.\n", - "- Determine whether the cross-platform model outperforms dataset-specific models.\n",
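- "\n", - "Once the evaluation cells in this notebook have populated `model_results`, the accuracies can be tabulated into Table 3.3. A minimal sketch (it assumes the `model_results` dictionary of per-dataset accuracies built above; `pandas` is already imported as `pd`):\n", - "\n", - "```python\n", - "# Rows are models, columns are datasets/platforms; missing entries appear as NaN.\n", - "summary = pd.DataFrame(model_results).T\n", - "print(summary.round(3))\n", - "```"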
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ekatnWlSAnyd", - "outputId": "7ff88fb0-109c-43b0-97ae-36a792730275" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluating bert_model on GitHub test dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 7.34 seconds\n", - "Accuracy of bert is: 0.8829953198127926\n", - " precision recall f1-score support\n", - "\n", - " 0 0.91 0.86 0.88 267\n", - " 1 0.89 0.92 0.91 170\n", - " 2 0.85 0.88 0.86 204\n", - "\n", - " accuracy 0.88 641\n", - " macro avg 0.88 0.89 0.88 641\n", - "weighted avg 0.88 0.88 0.88 641\n", - "\n", - "prediction_Polarity\n", - "0 252\n", - "2 213\n", - "1 176\n", - "Name: count, dtype: int64\n", - "Evaluating bert_model on Stack Overflow test dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. 
If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n", - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 4.10 seconds\n", - "Accuracy of bert is: 0.8814814814814815\n", - " precision recall f1-score support\n", - "\n", - " 0 0.93 0.93 0.93 110\n", - " 1 0.44 0.36 0.40 11\n", - " 2 0.81 0.93 0.87 14\n", - "\n", - " accuracy 0.88 135\n", - " macro avg 0.73 0.74 0.73 135\n", - "weighted avg 0.88 0.88 0.88 135\n", - "\n", - "prediction_Polarity\n", - "0 110\n", - "2 16\n", - "1 9\n", - "Name: count, dtype: int64\n", - "Evaluating GH_bert_model on Stack Overflow test dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 
512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 2.21 seconds\n", - "Accuracy of bert is: 0.8148148148148148\n", - " precision recall f1-score support\n", - "\n", - " 0 0.85 0.95 0.89 110\n", - " 1 0.50 0.27 0.35 11\n", - " 2 0.50 0.21 0.30 14\n", - "\n", - " accuracy 0.81 135\n", - " macro avg 0.62 0.48 0.52 135\n", - "weighted avg 0.78 0.81 0.79 135\n", - "\n", - "prediction_Polarity\n", - "0 123\n", - "2 6\n", - "1 6\n", - "Name: count, dtype: int64\n", - "Evaluating SO_bert_model on GitHub test dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2673: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - ":53: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). 
In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " model.load_state_dict(torch.load(model_saved_path))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction used 6.62 seconds\n", - "Accuracy of bert is: 0.4165366614664587\n", - " precision recall f1-score support\n", - "\n", - " 0 0.42 1.00 0.59 267\n", - " 1 0.00 0.00 0.00 170\n", - " 2 0.00 0.00 0.00 204\n", - "\n", - " accuracy 0.42 641\n", - " macro avg 0.14 0.33 0.20 641\n", - "weighted avg 0.17 0.42 0.24 641\n", - "\n", - "prediction_Polarity\n", - "0 641\n", - "Name: count, dtype: int64\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", - "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n" - ] - } - ], - "source": [ - "\n", - "\n", - "# Load test datasets\n", - "test_gh = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_gh.csv')\n", - "test_so = pd.read_csv('/content/drive/MyDrive/Software_Development_Sentiment_Classification/test_so.csv')\n", - "\n", - "# Define model paths\n", - "bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/bert_model\"\n", - "gh_bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/GH_bert_model\"\n", - "so_bert_model_path = \"/content/drive/MyDrive/Software_Development_Sentiment_Classification/SO_bert_model\"\n", - "\n", - "# Store results\n", - "model_results = {}\n", - "\n", - "# 1. Validate bert_model on test_gh and test_so\n", - "print(\"Evaluating bert_model on GitHub test dataset...\")\n", - "bert_on_gh = test_model(test_gh, bert_model_path, model_select=0)\n", - "\n", - "print(\"Evaluating bert_model on Stack Overflow test dataset...\")\n", - "bert_on_so = test_model(test_so, bert_model_path, model_select=0)\n", - "\n", - "model_results[\"bert_model\"] = {\n", - " \"test_gh Accuracy\": bert_on_gh,\n", - " \"test_so Accuracy\": bert_on_so\n", - "}\n", - "\n", - "# 2. 
Validate GH_bert_model on test_so\n", - "print(\"Evaluating GH_bert_model on Stack Overflow test dataset...\")\n", - "gh_bert_on_so = test_model(test_so, gh_bert_model_path, model_select=0)\n", - "\n", - "model_results[\"GH_bert_model\"] = {\n", - " \"test_so Accuracy\": gh_bert_on_so\n", - "}\n", - "\n", - "# 3. Validate SO_bert_model on test_gh\n", - "print(\"Evaluating SO_bert_model on GitHub test dataset...\")\n", - "so_bert_on_gh = test_model(test_gh, so_bert_model_path, model_select=0)\n", - "\n", - "model_results[\"SO_bert_model\"] = {\n", - " \"test_gh Accuracy\": so_bert_on_gh\n", - "}\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "L4", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/api/filter.py b/api/filter.py new file mode 100644 index 0000000..7ed57f6 --- /dev/null +++ b/api/filter.py @@ -0,0 +1,64 @@ +import sys +import re +from unicodedata import category +from bs4 import BeautifulSoup +from markdown import markdown + + +USERNAME_REGEX = r"(\s|^)@(\S*\s?)" + +# Generate Unicode punctuation set +punctuation = {chr(i) for i in range(sys.maxunicode + 1) if category(chr(i)).startswith(("P", "S"))} + +# Dictionary to count token replacements +counters = {} + +def remove_punctuation(text): + """ + Remove all punctuation characters from the given text. + + Args: + text (str): The input text. + + Returns: + str: The text without any punctuation. + """ + return "".join(char for char in text if char not in punctuation) + +def clean_text(text): + """ + Remove quoted text and large code blocks from GitHub issues or comments. + + This function performs the following clean-up: + - Removes quoted email/notification text from GitHub. + - Removes code blocks enclosed in triple backticks. + + Args: + text (str): The input text (typically from a GitHub issue or comment). + + Returns: + str: The cleaned text without quoted text or code blocks. + """ + # Remove quoted text from emails/notifications + text = re.sub(r"^(On[\s\S]*?notifications@github\.com\s*?wrote:\s*?)?(^(\>).*\s)*", '', text, flags=re.MULTILINE) + + # Remove code blocks enclosed in triple backticks + text = re.sub(r"```[a-z]*\n[\s\S]*?\n```", "", text) + + return text + +def remove_markdown_content(text): + """ + Converts Markdown content to plain text by removing all Markdown formatting. + + This function processes the input Markdown text and converts it to plain text + by removing all Markdown syntax. + + Args: + text (str): The input Markdown text. + + Returns: + str: Cleaned text without Markdown formatting. 
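+ + Example (illustrative; exact whitespace in the output depends on the HTML parser, here lxml): + >>> remove_markdown_content("# Title with **bold** text") + 'Title with bold text'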
+ """ + html = markdown(text) + return "".join(BeautifulSoup(html, "lxml").findAll(text=True)) \ No newline at end of file diff --git a/api/test.py b/api/test.py new file mode 100644 index 0000000..a2bfe29 --- /dev/null +++ b/api/test.py @@ -0,0 +1,121 @@ +import os +import re +import string +import random +import warnings +import argparse +import numpy as np +import pandas as pd +import torch +import time +import seaborn as sns +import matplotlib.pyplot as plt +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from io import StringIO +from unicodedata import category +from bs4 import BeautifulSoup +from markdown import markdown +from sklearn.model_selection import train_test_split +from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report +from torch.utils.data import DataLoader, RandomSampler, Dataset +from transformers import ( + BertTokenizer, BertForSequenceClassification, BertForMaskedLM, + XLNetTokenizer, XLNetForSequenceClassification, + RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM, + AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM, + get_scheduler +) +from torch.optim import AdamW +from api.train import * + +def test_model(test_df, model_saved_path, model_select=0): + + MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), + (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), + (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), + (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') + ] + MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] + seed_torch(42) + + cur_model=MODELS[model_select] + m_name=MODEL_NAMES[model_select] + + tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) + + begin=time.time() + + test_df['Polarity']=test_df['Polarity'].replace({ + 'positive':1, + 'negative':2, + 'neutral':0}) + + + sentences = test_df.Text.values + labels = test_df.Polarity.values + + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens = True, + max_length = MAX_LEN, + pad_to_max_length = True, + return_attention_mask = True, + return_tensors = 'pt', + ) + + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + prediction_inputs = torch.cat(input_ids,dim=0) + prediction_masks = torch.cat(attention_masks,dim=0) + prediction_labels = torch.tensor(labels) + + prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) + prediction_sampler = SequentialSampler(prediction_data) + prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) + + model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) + model.load_state_dict(torch.load(model_saved_path)) +# model.cuda() + model.eval() + + predictions,true_labels=[],[] + + for batch in prediction_dataloader: + batch = tuple(t.to(device) for t in batch) + b_input_ids, b_input_mask, b_labels = batch + + with torch.no_grad(): + outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) + logits = outputs[0] + + logits = logits.detach().cpu().numpy() + label_ids = b_labels.to('cpu').numpy() + + predictions.append(logits) + true_labels.append(label_ids) + + end=time.time() + print('Prediction used {:.2f} seconds'.format(end - begin)) + + flat_predictions = [item for sublist in predictions for item 
+    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
+    flat_true_labels = [item for sublist in true_labels for item in sublist]
+
+    print("Accuracy of {} is: {}".format(m_name, accuracy_score(flat_true_labels, flat_predictions)))
+
+    print(classification_report(flat_true_labels, flat_predictions))
+
+    df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity'])
+
+    # Reset the index so platform-filtered subsets still align with the predictions
+    df_combined = pd.concat([test_df.reset_index(drop=True), df_prediction], axis=1)
+
+    counts = df_combined['prediction_Polarity'].value_counts()
+    print(counts)
+
+    return df_combined
\ No newline at end of file
diff --git a/api/tokenizer.py b/api/tokenizer.py
new file mode 100644
index 0000000..c546182
--- /dev/null
+++ b/api/tokenizer.py
@@ -0,0 +1,102 @@
+import re
+import warnings
+
+# Regular expression for GitHub username mentions
+USERNAME_REGEX = r"(\s|^)@(\S*\s?)"
+
+# Dictionary to count token replacements
+counters = {}
+
+
+def replace_token(regex, token_name, text):
+    """
+    Replace matched patterns in the text with the specified token.
+
+    This function uses regular expressions to find occurrences of the pattern
+    and replaces them with a token name. The number of replacements made is counted.
+
+    Args:
+        regex (str): The regular expression pattern to match.
+        token_name (str): The replacement token name.
+        text (str): The input text.
+
+    Returns:
+        tuple: A tuple containing:
+            - str: The text with the tokens replacing the matches.
+            - int: The number of replacements made.
+    """
+    replaced_text, replacements = re.subn(regex, f" {token_name} ", text, flags=re.MULTILINE)
+    counters[token_name] = counters.get(token_name, 0) + replacements
+    return replaced_text, replacements
+
+def tokenize_text(text):
+    """
+    Tokenizes a given text by replacing specific elements such as emails, mentions, URLs, etc.
+
+    This function processes the input text and replaces various elements, such as:
+    - Email addresses (replaced with 'MEMAIL').
+    - GitHub mentions (replaced with 'MMENTION').
+    - Code blocks (replaced with 'MICODE').
+    - Version numbers (replaced with 'MVERSIONNUMBER').
+    - Issue mentions (replaced with 'MISSUEMENTION').
+    - URLs (replaced with 'MURL').
+
+    Args:
+        text (str): The input text.
+
+    Returns:
+        tuple: A tuple containing:
+            - str: The tokenized text.
+            - int: The total number of replacements made.
+    """
+    total_replacements = 0
+
+    text, replacements = replace_token(r"\S+@\S*\s?", "MEMAIL", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(USERNAME_REGEX, "MMENTION", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(r"`([^`]*)`", "MICODE", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(r"\b\d+\.\d+(\.\d+)*\b", "MVERSIONNUMBER", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(r"(\s|^)#\d+", "MISSUEMENTION", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(
+        r"([a-zA-Z0-9]+):\/\/([\w_-]+(?:\.[\w_-]+)*)[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]",
+        "MURL",
+        text,
+    )
+    total_replacements += replacements
+
+    return text, total_replacements
+
+def transform_text(row):
+    """
+    Transforms a row by cleaning and tokenizing its text content.
+
+    This function extracts the "Text" key from the input dictionary and processes
+    it using the `tokenize_text` function. The text is also cleaned by removing
+    newline characters.
+
+    Args:
+        row (dict): A dictionary containing a 'Text' key.
+
+    Returns:
+        tuple: A tuple containing:
+            - str: The processed text after cleaning and tokenization.
+            - int: The number of replacements made.
+ """ + text = row.get("Text", "") + + if not isinstance(text, str): + warnings.warn(f"Converting non-string type to string: {type(text)}") + text = str(text) + + text, replaced_count = tokenize_text(text) + text = text.replace("\n", "") + return text, replaced_count \ No newline at end of file diff --git a/api/train.py b/api/train.py new file mode 100644 index 0000000..30f5514 --- /dev/null +++ b/api/train.py @@ -0,0 +1,165 @@ +import os +import re +import string +import random +import warnings +import argparse +import numpy as np +import pandas as pd +import torch +import time +import seaborn as sns +import matplotlib.pyplot as plt +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from io import StringIO +from unicodedata import category +from bs4 import BeautifulSoup +from markdown import markdown +from sklearn.model_selection import train_test_split +from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report +from torch.utils.data import DataLoader, RandomSampler, Dataset +from transformers import ( + BertTokenizer, BertForSequenceClassification, BertForMaskedLM, + XLNetTokenizer, XLNetForSequenceClassification, + RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM, + AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM, + get_scheduler +) +from torch.optim import AdamW + +MAX_LEN = 256 +BATCH_SIZE = 16 +LEARNING_RATE = 2e-5 +EPOCHS = 4 +WEIGHT_DECAY = 0.01 +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +MODELS = [ + (BertForSequenceClassification, BertTokenizer, 'bert-base-cased'), + (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased'), + (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'), + (AlbertForSequenceClassification, AlbertTokenizer, 'albert-base-v1') +] +MODEL_NAMES = ['bert', 'xlnet', 'roberta', 'albert'] + + +def train_model(train_df, model_save_path, model_select=0): + seed_torch(42) + + cur_model = MODELS[model_select] + m_name = MODEL_NAMES[model_select] + + + train_df['Polarity'] = train_df['Polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0}) + tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) + + sentences = train_df.Text.values + labels = train_df.Polarity.values + + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens=True, + max_length=MAX_LEN, + padding='max_length', + return_attention_mask=True, + return_tensors='pt', + truncation=True + ) + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + input_ids = torch.cat(input_ids, dim=0) + attention_masks = torch.cat(attention_masks, dim=0) + labels = torch.tensor(labels) + + print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}') + + + train_inputs, val_inputs, train_labels, val_labels = train_test_split( + input_ids, labels, test_size=0.1, random_state=42) + train_masks, val_masks, _, _ = train_test_split( + attention_masks, labels, test_size=0.1, random_state=42) + + + train_data = TensorDataset(train_inputs, train_masks, train_labels) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) + + val_data = TensorDataset(val_inputs, val_masks, val_labels) + val_sampler = SequentialSampler(val_data) + val_dataloader = DataLoader(val_data, 
sampler=val_sampler, batch_size=BATCH_SIZE)
+
+
+    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
+    model.to(device)
+
+
+    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
+
+
+    num_training_steps = EPOCHS * len(train_dataloader)
+    lr_scheduler = get_scheduler(
+        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+    )
+
+
+    print("Starting training...")
+    best_f1 = 0
+    for epoch in range(EPOCHS):
+        model.train()
+        total_loss = 0
+        predictions, true_labels = [], []
+
+        for batch in train_dataloader:
+            b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
+            optimizer.zero_grad()
+            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+            loss, logits = outputs[:2]
+            loss.backward()
+            optimizer.step()
+            lr_scheduler.step()
+
+            total_loss += loss.item()
+            predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
+            true_labels.extend(b_labels.cpu().numpy())
+
+        train_acc = accuracy_score(true_labels, predictions)
+        print(f"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}")
+
+
+        model.eval()
+        val_predictions, val_labels = [], []
+        with torch.no_grad():
+            for batch in val_dataloader:
+                b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
+                outputs = model(b_input_ids, attention_mask=b_input_mask)
+                logits = outputs[0]
+                val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
+                val_labels.extend(b_labels.cpu().numpy())
+
+        val_acc = accuracy_score(val_labels, val_predictions)
+        val_f1 = f1_score(val_labels, val_predictions, average='weighted')
+        print(f"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}")
+
+
+        if val_f1 > best_f1:
+            best_f1 = val_f1
+            torch.save(model.state_dict(), model_save_path)
+            print(f"Best model saved at {model_save_path}")
+
+
+    print("Final Model Performance on Validation Set:")
+    print(classification_report(val_labels, val_predictions, digits=4))
+    return model_save_path
+
+def seed_torch(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
\ No newline at end of file
diff --git a/notebooks/Software_Development_Sentiment_Classification.ipynb b/notebooks/Software_Development_Sentiment_Classification.ipynb
new file mode 100644
index 0000000..dbe68a8
--- /dev/null
+++ b/notebooks/Software_Development_Sentiment_Classification.ipynb
@@ -0,0 +1,239 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Sentiment Analysis\n",
+    "\n",
+    "In this notebook, we test how well four different **machine learning models** can classify the **sentiment** (positive, negative, or neutral) expressed in messages on various platforms used by software developers. These platforms include:\n",
+    "\n",
+    "- **GitHub** (where developers collaborate on code)\n",
+    "- **Jira** (used for tracking issues and tasks)\n",
+    "- **Mailbox** (email-based developer communication)\n",
+    "\n",
+    "The models we’re testing are:\n",
+    "- **BERT**\n",
+    "- **XLNet**\n",
+    "- **RoBERTa**\n",
+    "- **ALBERT**\n",
+    "\n",
+    "Each model was trained on a mix of data from all three platforms. We then test how well each model performs on new, unseen data from the same platforms. We report:\n",
+    "\n",
+    "1. **Overall Accuracy**: How well each model performs across all platforms.\n",
+    "2. **Platform-Specific Accuracy**: How well each model performs on **GitHub**, **Jira**, and **Mailbox** separately.\n",
+    "\n",
+    "The results help us understand which models work best across different communication tools and give insights into how sentiment analysis can be applied to real-world developer conversations.\n",
+    "\n",
+    "Note: this notebook gets the data ready; the `Train.ipynb` notebook trains the models, and `Test.ipynb` tests the models against the datasets.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "id": "oe8X-6s9btXo"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "sys.path.append(os.path.abspath(\"..\"))\n",
+    "from tabulate import tabulate\n",
+    "\n",
+    "from api.filter import *\n",
+    "from api.tokenizer import *\n",
+    "from api.train import *\n",
+    "from api.test import *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RAXtSnSK4LPr"
+   },
+   "source": [
+    "# Tokenized\n",
+    "\n",
+    "This section processes the raw sentiment analysis datasets (`so-dataset.csv`, `gh-dataset.csv`, and `crossplatform_sf_dataset.csv`) by applying a custom text transformation function. The goal is to standardize and clean the text data before training. You can adapt the `transform_text` function to your specific needs.\n",
+    "\n",
+    "There are additional functions provided in [filter.py](../api/filter.py) and [tokenizer.py](../api/tokenizer.py) that can be used for specific use cases. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "current_directory = os.getcwd()\n",
+    "root = os.path.abspath(os.path.join(current_directory, \"..\", \"..\"))\n",
+    "\n",
+    "# Define input dataset paths\n",
+    "input_paths = [\n",
+    "    f\"{root}/so-dataset.csv\",\n",
+    "    f\"{root}/gh-dataset.csv\",\n",
+    "    f\"{root}/crossplatform_sf_dataset.csv\"\n",
+    "]\n",
+    "\n",
+    "# transform_text is imported from api.tokenizer; uncomment the override below\n",
+    "# if you need a custom transformation instead:\n",
+    "# def transform_text(row):\n",
+    "#     # Example override: return the original text and a dummy replacement count\n",
+    "#     return row[\"Text\"], 1\n",
+    "\n",
+    "# Loop through each dataset and process it\n",
+    "for input_path in input_paths:\n",
+    "    # Generate output file name\n",
+    "    output_filename = os.path.splitext(os.path.basename(input_path))[0] + \"_tokenized.csv\"\n",
+    "    output_path = os.path.join(os.path.dirname(input_path), output_filename)\n",
+    "\n",
+    "    # Load dataset\n",
+    "    df = pd.read_csv(input_path)\n",
+    "    print(f\"Processing dataset: {input_path}\")\n",
+    "    print(df.head())  # Print first few rows for verification\n",
+    "\n",
+    "    # Apply text transformation\n",
+    "    df[[\"Text\", \"replaced_token\"]] = df.apply(transform_text, axis=1, result_type=\"expand\")\n",
+    "\n",
+    "    # Report total replacements from the `replaced_token` column\n",
+    "    total_replacements = df[\"replaced_token\"].sum()\n",
+    "    print(f\"Total replacements: {total_replacements}\")\n",
+    "\n",
+    "    # Save processed dataset\n",
+    "    df.to_csv(output_path, header=True, index=False)\n",
+    "\n",
+    "    print(f\"Tokenized dataset saved to: {output_path}\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that the data is cleaned and tokenized, let's look at the datasets we have.
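\n",
+    "\n",
+    "As a quick sanity check of the cleaning, you can call `tokenize_text` (imported above from `api.tokenizer`) on a single string. A minimal sketch, using a made-up sample comment; the exact spacing of the output depends on the regexes:\n",
+    "\n",
+    "```python\n",
+    "sample = \"Thanks @octocat, see issue #42 at https://example.com (version 1.2.3).\"\n",
+    "cleaned, n_replaced = tokenize_text(sample)\n",
+    "print(cleaned)     # expect MMENTION, MISSUEMENTION, MURL and MVERSIONNUMBER tokens\n",
+    "print(n_replaced)  # expect 4 replacements\n",
+    "```\n",
+    "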
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o3Cm2Gn_CbM5" + }, + "source": [ + "# Dataset Overview: \n", + "\n", + "`so-dataset.csv` : Contains Stack Overflow comment data.\n", + "\n", + "\n", + " `gh-dataset.csv` : Contains GitHub Stack overflow comment data. \n", + "\n", + "\n", + "\n", + "\n", + "`crossplatform_sf_dataset.csv`\n", + "\n", + "This dataset is designed for **Software Development Sentiment Classification**, containing user comments or discussions from different platforms with sentiment labels.\n", + "\n", + "## **Column Descriptions**\n", + "- **`Text`**: The user comment or discussion content. \n", + "- **`Polarity`**: Sentiment label indicating the emotional tendency of the text: \n", + " - `2`: Negative sentiment \n", + " - `0`: Neutral sentiment \n", + " - `1`: Positive sentiment \n", + "- **`Platform`**: The source platform of the data, indicating where the comment or discussion originated: \n", + " - `0`: **GitHub** (Discussions related to open-source projects, Issues, Pull Requests) \n", + " - `1`: **Jira** (Bug reports, task comments in software development management tools) \n", + " - `2`: **Mailbox** (Developer communication through emails) \n", + "\n", + "## **Dataset Distribution**\n", + "The dataset consists of data from **GitHub, Jira, and Mailbox**, with different sentiment (`Polarity`) distributions across platforms. It can be used to train and evaluate sentiment classification models to analyze developer emotions on different platforms. \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QaHSklrRClHU", + "outputId": "b3b3c0d8-b59e-41ab-ba80-5569d1d3650e" + }, + "outputs": [], + "source": [ + "\n", + "# Load dataset\n", + "input_path = f\"{root}/crossplatform_sf_dataset.csv\"\n", + "df = pd.read_csv(input_path)\n", + "\n", + "# Compute dataset statistics\n", + "total_samples = len(df)\n", + "polarity_counts = df[\"Polarity\"].value_counts().sort_index()\n", + "platform_counts = df[\"Platform\"].value_counts().sort_index()\n", + "\n", + "# Compute Polarity distribution within each Platform\n", + "platform_polarity_counts = df.groupby([\"Platform\", \"Polarity\"]).size().unstack().fillna(0)\n", + "\n", + "# Print results with formatting\n", + "print(\"=\" * 50)\n", + "print(f\"📊 Dataset Information: cf-dataset.csv\")\n", + "print(\"=\" * 50)\n", + "print(f\"Total Samples: {total_samples}\\n\")\n", + "\n", + "# Polarity distribution\n", + "print(\"📌 Polarity Distribution:\")\n", + "print(tabulate(polarity_counts.reset_index(), headers=[\"Polarity\", \"Count\"], tablefmt=\"pretty\"))\n", + "print(\"\\n\")\n", + "\n", + "# Platform distribution\n", + "print(\"📌 Platform Distribution:\")\n", + "print(tabulate(platform_counts.reset_index(), headers=[\"Platform\", \"Count\"], tablefmt=\"pretty\"))\n", + "print(\"\\n\")\n", + "\n", + "# Platform-wise Polarity distribution\n", + "print(\"📌 Platform-wise Polarity Distribution:\")\n", + "print(tabulate(platform_polarity_counts, headers=\"keys\", tablefmt=\"pretty\"))\n", + "print(\"=\" * 50)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have an understanding of the three datasets we can move over to the [Train.ipynb](./Train.ipynb) Notebook to start training the models based on the datasets we just prepared and reviewed." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the models are trained they can be tested with the [Test.ipynb](./Test.ipynb) Notebook. " + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "L4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb new file mode 100644 index 0000000..75b01bb --- /dev/null +++ b/notebooks/Test.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test\n", + "\n", + "In this section, we evaluate the four trained **cross-platform sentiment classification models** on our **cross-platform sentiment dataset**.\n", + "\n", + "## Evaluation Metrics \n", + "We will assess: \n", + "1. **Overall model performance** across all platforms. \n", + "2. **Platform-specific performance** for each model on: \n", + " - **GitHub** \n", + " - **Jira** \n", + " - **Mailbox** \n", + "\n", + "## Results \n", + "The evaluation will print: \n", + "- **Overall accuracy** of each model. \n", + "- **Performance breakdown per platform** for each model. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import string\n", + "import random\n", + "import warnings\n", + "import argparse\n", + "import numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "import time\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", + "from io import StringIO\n", + "from unicodedata import category\n", + "from bs4 import BeautifulSoup\n", + "from markdown import markdown\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report\n", + "from torch.utils.data import DataLoader, RandomSampler, Dataset\n", + "from transformers import (\n", + " BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n", + " XLNetTokenizer, XLNetForSequenceClassification,\n", + " RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n", + " AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n", + " get_scheduler\n", + ")\n", + "\n", + "# Changed because AdamW Depreciated\n", + "from torch.optim import AdamW\n", + "from api.preprocessing import *\n", + "from api.train import *\n", + "from api.test import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "n_gpu = torch.cuda.device_count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluates four pretrained models \n", + "current_directory = os.getcwd()\n", + "# Load test dataset\n", + "test_df = pd.read_csv(f'{current_directory}/test_df.csv')\n", + "\n", + "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", + "model_results = {}\n", + 
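"# NOTE: test_model (api/test.py) prints accuracy and a classification report,\n",
+    "# and returns a DataFrame of per-row predictions rather than a scalar accuracy.\n",
+   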
"\n", + "# Define platform mapping\n", + "platforms = {0: \"GitHub\", 1: \"Jira\", 2: \"Mailbox\"}\n", + "\n", + "# Evaluate each model\n", + "for i, model_name in enumerate(MODEL_NAMES):\n", + " model_path = f\"{current_directory}/{model_name}_model\"\n", + " print(f\"Evaluating {model_name} model...for overall platform\")\n", + "\n", + " # Get overall accuracy\n", + " overall_accuracy = test_model(test_df, model_path, model_select=i)\n", + "\n", + " # Evaluate accuracy per platform\n", + " platform_accuracies = {}\n", + " for platform_id, platform_name in platforms.items():\n", + " test_df_platform = test_df[test_df[\"Platform\"] == platform_id]\n", + " if not test_df_platform.empty:\n", + " print(f\"Evaluating {model_name} model...for {platform_name} platform\")\n", + " accuracy = test_model(test_df_platform, model_path, model_select=i)\n", + " platform_accuracies[platform_name] = accuracy\n", + " else:\n", + " platform_accuracies[platform_name] = \"No data\"\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generalization Performance of the Model (Table 3.3)\n", + "In this section, we evaluate the **Bert-CP** model's **generalization performance** on the existing datasets: \n", + "- **GitHub Golden Rule Dataset** \n", + "- **Stack Overflow Dataset** \n", + "\n", + "We will also compare the performance of the **BERT model** trained on **GitHub Golden Rule** and **Stack Overflow** datasets, with a focus on **cross-platform performance**. This comparison aims to validate the **superiority** of our model.\n", + "\n", + "## Evaluation Process \n", + "- **Bert-CP Model Evaluation**: We test the **Bert-CP** model on the **GitHub Golden Rule** and **Stack Overflow** datasets.\n", + "- **Cross-Platform Comparison**: We compare the performance of models trained on **GitHub Golden Rule** and **Stack Overflow** datasets across multiple platforms using the **BERT model**.\n", + "\n", + "## Goals \n", + "- To assess the **generalization** of the **Bert-CP** model across different datasets.\n", + "- To highlight the **superiority** of our cross-platform model over dataset-specific models." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating bert_model on GitHub test dataset...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:2700: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 
+   "source": [
+    "# Evaluate how bert trained on different datasets\n",
+    "\n",
+    "# Load test datasets\n",
+    "test_gh = pd.read_csv(f'{current_directory}/test_gh.csv')\n",
+    "test_so = pd.read_csv(f'{current_directory}/test_so.csv')\n",
+    "\n",
+    "# Define model paths\n",
+    "bert_model_path = f\"{current_directory}/bert_model\"\n",
+    "gh_bert_model_path = f\"{current_directory}/GH_bert_model\"\n",
+    "so_bert_model_path = f\"{current_directory}/SO_bert_model\"\n",
+    "\n",
+    "# Store results\n",
+    "model_results = {}\n",
+    "\n",
+    "# 1. Validate bert_model on test_gh and test_so\n",
+    "print(\"Evaluating bert_model on GitHub test dataset...\")\n",
+    "bert_on_gh = test_model(test_gh, bert_model_path, model_select=0)\n",
+    "\n",
+    "print(\"Evaluating bert_model on Stack Overflow test dataset...\")\n",
+    "bert_on_so = test_model(test_so, bert_model_path, model_select=0)\n",
+    "\n",
+    "model_results[\"bert_model\"] = {\n",
+    "    \"test_gh Accuracy\": bert_on_gh,\n",
+    "    \"test_so Accuracy\": bert_on_so\n",
+    "}\n",
+    "\n",
+    "# 2. Validate GH_bert_model on test_so\n",
+    "print(\"Evaluating GH_bert_model on Stack Overflow test dataset...\")\n",
+    "gh_bert_on_so = test_model(test_so, gh_bert_model_path, model_select=0)\n",
+    "\n",
+    "model_results[\"GH_bert_model\"] = {\n",
+    "    \"test_so Accuracy\": gh_bert_on_so\n",
+    "}\n",
+    "\n",
+    "# 3. Validate SO_bert_model on test_gh\n",
+    "print(\"Evaluating SO_bert_model on GitHub test dataset...\")\n",
+    "so_bert_on_gh = test_model(test_gh, so_bert_model_path, model_select=0)\n",
+    "\n",
+    "model_results[\"SO_bert_model\"] = {\n",
+    "    \"test_gh Accuracy\": so_bert_on_gh\n",
+    "}\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/Train.ipynb b/notebooks/Train.ipynb
new file mode 100644
index 0000000..5d6b73b
--- /dev/null
+++ b/notebooks/Train.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train\n",
+    "\n",
+    "We will train the following models: \n",
+    "- **BERT** \n",
+    "- **XLNet** \n",
+    "- **RoBERTa** \n",
+    "- **ALBERT** \n",
+    "\n",
+    "## Training Parameters \n",
+    "- **MAX_LEN**: `256` \n",
+    "- **BATCH_SIZE**: `16` \n",
+    "- **LEARNING_RATE**: `2e-5` \n",
+    "- **EPOCHS**: `4` \n",
+    "\n",
+    "Each model is trained using the merged dataset and saved for further evaluation."
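,
+    "\n",
+    "These hyperparameters are module-level constants in `api/train.py`, and `train_model` reads them at call time. A minimal sketch for experimenting with other values from a notebook (it assumes you are happy to mutate the module's globals before calling `train_model`):\n",
+    "\n",
+    "```python\n",
+    "import api.train as trainer\n",
+    "\n",
+    "# Overrides take effect for all subsequent train_model calls\n",
+    "trainer.EPOCHS = 2\n",
+    "trainer.LEARNING_RATE = 3e-5\n",
+    "```"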
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "sys.path.append(os.path.abspath(\"..\"))\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from transformers import (\n",
+    "    BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n",
+    "    XLNetTokenizer, XLNetForSequenceClassification,\n",
+    "    RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n",
+    "    AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n",
+    "    get_scheduler\n",
+    ")\n",
+    "\n",
+    "from api.train import *\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First we load the tokenized datasets and split them into training and testing sets (70/30 split).\n",
+    "\n",
+    "### Datasets:\n",
+    "- **`crossplatform_sf_dataset_tokenized.csv`**: This is the main dataset used in this study.\n",
+    "- **`so-dataset_tokenized.csv`**: This dataset originates from the research paper *Sentiment Polarity Detection for Software Development*.\n",
+    "- **`gh-dataset_tokenized.csv`**: This dataset is derived from the *GitHub Golden Rule* dataset (from the paper *Can We Use SE-specific Sentiment Analysis Tools in a Cross-Platform Setting?*).\n",
+    "\n",
+    "### Output:\n",
+    "- Training: `train_df.csv`, `train_gh.csv`, `train_so.csv`\n",
+    "- Testing: `test_df.csv`, `test_gh.csv`, `test_so.csv`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "current_directory = os.getcwd()\n",
+    "root = os.path.abspath(os.path.join(current_directory, \"..\", \"..\"))\n",
+    "\n",
+    "# Load the tokenized CSVs and create 70/30 train/test splits\n",
+    "input_path = f'{root}/crossplatform_sf_dataset_tokenized.csv'\n",
+    "so_input_path = f'{root}/so-dataset_tokenized.csv'\n",
+    "gh_input_path = f'{root}/gh-dataset_tokenized.csv'\n",
+    "\n",
+    "# Load datasets into Pandas DataFrames\n",
+    "df_crossplatform = pd.read_csv(input_path)\n",
+    "df_so = pd.read_csv(so_input_path)\n",
+    "df_gh = pd.read_csv(gh_input_path)\n",
+    "\n",
+    "# Split `df_crossplatform` into training (70%) and testing (30%) sets\n",
+    "train_df, test_df = train_test_split(df_crossplatform, test_size=0.3, random_state=42)\n",
+    "\n",
+    "# Split GitHub and Stack Overflow datasets into training and testing sets (70% train, 30% test)\n",
+    "train_gh, test_gh = train_test_split(df_gh, test_size=0.3, random_state=42)\n",
+    "train_so, test_so = train_test_split(df_so, test_size=0.3, random_state=42)\n",
+    "\n",
+    "# Save all datasets to CSV files for further use\n",
+    "train_df.to_csv(f'{root}/train_df.csv', index=False)\n",
+    "test_df.to_csv(f'{root}/test_df.csv', index=False)\n",
+    "train_gh.to_csv(f'{root}/train_gh.csv', index=False)\n",
+    "train_so.to_csv(f'{root}/train_so.csv', index=False)\n",
+    "test_gh.to_csv(f'{root}/test_gh.csv', index=False)\n",
+    "test_so.to_csv(f'{root}/test_so.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "These splits are saved for consistent model training and evaluation. `train_df.csv`, `train_gh.csv`, and `train_so.csv` are then merged into a single dataset and saved for model training."
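,
+    "\n",
+    "If class balance matters for your comparison, a stratified variant of the split is a one-line change (a sketch; `stratify` is a standard `train_test_split` parameter):\n",
+    "\n",
+    "```python\n",
+    "# Keeps the Polarity proportions roughly equal across train and test\n",
+    "train_df, test_df = train_test_split(\n",
+    "    df_crossplatform, test_size=0.3, random_state=42,\n",
+    "    stratify=df_crossplatform[\"Polarity\"])\n",
+    "```"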
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Combine `train_df`, `train_gh`, and `train_so` into the final training dataset\n",
+    "train_df_final = pd.concat([train_df, train_gh, train_so], axis=0, ignore_index=True)\n",
+    "train_df_final.to_csv(f'{root}/train_df_final.csv', index=False)\n",
+    "\n",
+    "# Define the list of model names to be trained\n",
+    "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n",
+    "\n",
+    "# Train each model and save the trained model files\n",
+    "for i, model_name in enumerate(MODEL_NAMES):\n",
+    "    model_save_path = f\"{root}/{model_name}_model\"\n",
+    "    print(f\"Training {model_name} model...\")\n",
+    "    train_model(train_df_final, model_save_path, model_select=i)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The trained models are saved to the root directory, where they can be inspected in the file browser. At this point the trained models can be evaluated on the test datasets; do this in the [Test.ipynb](./Test.ipynb) notebook."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
From c573a33f0b38a65bea237d240eede5f6f03f48e9 Mon Sep 17 00:00:00 2001
From: Connor Narowetz
Date: Mon, 5 May 2025 19:48:21 -1000
Subject: [PATCH 3/5] Changed Imports

- Cleaned up imports
---
 notebooks/Test.ipynb | 137 ++++++++++++++-----------------------------
 1 file changed, 32 insertions(+), 105 deletions(-)

diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
index 75b01bb..5115613 100644
--- a/notebooks/Test.ipynb
+++ b/notebooks/Test.ipynb
@@ -19,7 +19,7 @@
     "## Results \n",
     "The evaluation will print: \n",
     "- **Overall accuracy** of each model. \n",
-    "- **Performance breakdown per platform** for each model. 
\n" ] }, { @@ -29,48 +29,31 @@ "outputs": [], "source": [ "import os\n", - "import re\n", - "import string\n", - "import random\n", - "import warnings\n", - "import argparse\n", - "import numpy as np\n", - "import pandas as pd\n", - "import torch\n", - "import time\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", - "from io import StringIO\n", - "from unicodedata import category\n", - "from bs4 import BeautifulSoup\n", - "from markdown import markdown\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report\n", - "from torch.utils.data import DataLoader, RandomSampler, Dataset\n", - "from transformers import (\n", - " BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n", - " XLNetTokenizer, XLNetForSequenceClassification,\n", - " RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n", - " AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n", - " get_scheduler\n", - ")\n", + "import sys\n", + "sys.path.append(os.path.abspath(\"..\"))\n", "\n", - "# Changed because AdamW Depreciated\n", - "from torch.optim import AdamW\n", - "from api.preprocessing import *\n", - "from api.train import *\n", "from api.test import *" ] }, { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "n_gpu = torch.cuda.device_count()" + "### Test df_crossplatform on 4 models and 3 platforms (Table 3.2)\n", + "In this section, we evaluate the four trained **cross-platform sentiment classification models** on our **cross-platform sentiment dataset**.\n", + "\n", + "## Evaluation Metrics \n", + "We will assess: \n", + "1. **Overall model performance** across all platforms. \n", + "2. **Platform-specific performance** for each model on: \n", + " - **GitHub** \n", + " - **Jira** \n", + " - **Mailbox** \n", + "\n", + "## Results \n", + "The evaluation will print: \n", + "- **Overall accuracy** of each model. \n", + "- **Performance breakdown per platform** for each model. 
" ] }, { @@ -79,10 +62,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Evaluates four pretrained models \n", "current_directory = os.getcwd()\n", + "root = os.path.abspath(os.path.join(current_directory, \"..\", \"..\"))\n", + "\n", "# Load test dataset\n", - "test_df = pd.read_csv(f'{current_directory}/test_df.csv')\n", + "test_df = pd.read_csv(f'{root}/test_df.csv')\n", "\n", "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", "model_results = {}\n", @@ -92,7 +76,7 @@ "\n", "# Evaluate each model\n", "for i, model_name in enumerate(MODEL_NAMES):\n", - " model_path = f\"{current_directory}/{model_name}_model\"\n", + " model_path = f\"{root}/{model_name}_model\"\n", " print(f\"Evaluating {model_name} model...for overall platform\")\n", "\n", " # Get overall accuracy\n", @@ -107,10 +91,7 @@ " accuracy = test_model(test_df_platform, model_path, model_select=i)\n", " platform_accuracies[platform_name] = accuracy\n", " else:\n", - " platform_accuracies[platform_name] = \"No data\"\n", - "\n", - "\n", - "\n" + " platform_accuracies[platform_name] = \"No data\"" ] }, { @@ -135,74 +116,20 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluating bert_model on GitHub test dataset...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:2700: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " warnings.warn(\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 17\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# 1. 
Validate bert_model on test_gh and test_so\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEvaluating bert_model on GitHub test dataset...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 17\u001b[0m bert_on_gh \u001b[38;5;241m=\u001b[39m test_model(test_gh, bert_model_path, model_select\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEvaluating bert_model on Stack Overflow test dataset...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 20\u001b[0m bert_on_so \u001b[38;5;241m=\u001b[39m test_model(test_so, bert_model_path, model_select\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n", - "File \u001b[0;32m~/Desktop/a2/sentiment_classifier/api/test.py:93\u001b[0m, in \u001b[0;36mtest_model\u001b[0;34m(test_df, model_saved_path, model_select)\u001b[0m\n\u001b[1;32m 90\u001b[0m b_input_ids, b_input_mask, b_labels \u001b[38;5;241m=\u001b[39m batch\n\u001b[1;32m 92\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 93\u001b[0m outputs \u001b[38;5;241m=\u001b[39m model(b_input_ids, token_type_ids\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, attention_mask\u001b[38;5;241m=\u001b[39mb_input_mask)\n\u001b[1;32m 94\u001b[0m logits \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 96\u001b[0m logits \u001b[38;5;241m=\u001b[39m logits\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mnumpy()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m 
_global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py:1675\u001b[0m, in \u001b[0;36mBertForSequenceClassification.forward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1667\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1668\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):\u001b[39;00m\n\u001b[1;32m 1669\u001b[0m \u001b[38;5;124;03m Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\u001b[39;00m\n\u001b[1;32m 1670\u001b[0m \u001b[38;5;124;03m config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\u001b[39;00m\n\u001b[1;32m 1671\u001b[0m \u001b[38;5;124;03m `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\u001b[39;00m\n\u001b[1;32m 1672\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1673\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[0;32m-> 1675\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbert(\n\u001b[1;32m 1676\u001b[0m input_ids,\n\u001b[1;32m 1677\u001b[0m attention_mask\u001b[38;5;241m=\u001b[39mattention_mask,\n\u001b[1;32m 1678\u001b[0m token_type_ids\u001b[38;5;241m=\u001b[39mtoken_type_ids,\n\u001b[1;32m 1679\u001b[0m position_ids\u001b[38;5;241m=\u001b[39mposition_ids,\n\u001b[1;32m 1680\u001b[0m head_mask\u001b[38;5;241m=\u001b[39mhead_mask,\n\u001b[1;32m 1681\u001b[0m inputs_embeds\u001b[38;5;241m=\u001b[39minputs_embeds,\n\u001b[1;32m 1682\u001b[0m output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m 1683\u001b[0m output_hidden_states\u001b[38;5;241m=\u001b[39moutput_hidden_states,\n\u001b[1;32m 1684\u001b[0m return_dict\u001b[38;5;241m=\u001b[39mreturn_dict,\n\u001b[1;32m 1685\u001b[0m )\n\u001b[1;32m 1687\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 1689\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(pooled_output)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# 
type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py:1144\u001b[0m, in \u001b[0;36mBertModel.forward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1137\u001b[0m \u001b[38;5;66;03m# Prepare head mask if needed\u001b[39;00m\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;66;03m# 1.0 in head_mask indicate we keep the head\u001b[39;00m\n\u001b[1;32m 1139\u001b[0m \u001b[38;5;66;03m# attention_probs has shape bsz x n_heads x N x N\u001b[39;00m\n\u001b[1;32m 1140\u001b[0m \u001b[38;5;66;03m# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]\u001b[39;00m\n\u001b[1;32m 1141\u001b[0m \u001b[38;5;66;03m# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]\u001b[39;00m\n\u001b[1;32m 1142\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[0;32m-> 1144\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder(\n\u001b[1;32m 1145\u001b[0m embedding_output,\n\u001b[1;32m 1146\u001b[0m attention_mask\u001b[38;5;241m=\u001b[39mextended_attention_mask,\n\u001b[1;32m 1147\u001b[0m head_mask\u001b[38;5;241m=\u001b[39mhead_mask,\n\u001b[1;32m 
1148\u001b[0m encoder_hidden_states\u001b[38;5;241m=\u001b[39mencoder_hidden_states,\n\u001b[1;32m 1149\u001b[0m encoder_attention_mask\u001b[38;5;241m=\u001b[39mencoder_extended_attention_mask,\n\u001b[1;32m 1150\u001b[0m past_key_values\u001b[38;5;241m=\u001b[39mpast_key_values,\n\u001b[1;32m 1151\u001b[0m use_cache\u001b[38;5;241m=\u001b[39muse_cache,\n\u001b[1;32m 1152\u001b[0m output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m 1153\u001b[0m output_hidden_states\u001b[38;5;241m=\u001b[39moutput_hidden_states,\n\u001b[1;32m 1154\u001b[0m return_dict\u001b[38;5;241m=\u001b[39mreturn_dict,\n\u001b[1;32m 1155\u001b[0m )\n\u001b[1;32m 1156\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1157\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler(sequence_output) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py:695\u001b[0m, in \u001b[0;36mBertEncoder.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 684\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m 685\u001b[0m layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m 686\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 692\u001b[0m output_attentions,\n\u001b[1;32m 693\u001b[0m )\n\u001b[1;32m 694\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 695\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m layer_module(\n\u001b[1;32m 696\u001b[0m hidden_states,\n\u001b[1;32m 697\u001b[0m attention_mask,\n\u001b[1;32m 698\u001b[0m layer_head_mask,\n\u001b[1;32m 699\u001b[0m encoder_hidden_states,\n\u001b[1;32m 700\u001b[0m encoder_attention_mask,\n\u001b[1;32m 701\u001b[0m past_key_value,\n\u001b[1;32m 702\u001b[0m output_attentions,\n\u001b[1;32m 703\u001b[0m )\n\u001b[1;32m 705\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 706\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_cache:\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py:585\u001b[0m, in \u001b[0;36mBertLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 574\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 575\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 582\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[1;32m 583\u001b[0m \u001b[38;5;66;03m# decoder uni-directional self-attention cached key/values tuple is at positions 1,2\u001b[39;00m\n\u001b[1;32m 584\u001b[0m self_attn_past_key_value \u001b[38;5;241m=\u001b[39m past_key_value[:\u001b[38;5;241m2\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m past_key_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 585\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattention(\n\u001b[1;32m 586\u001b[0m hidden_states,\n\u001b[1;32m 587\u001b[0m attention_mask,\n\u001b[1;32m 588\u001b[0m head_mask,\n\u001b[1;32m 589\u001b[0m output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m 590\u001b[0m past_key_value\u001b[38;5;241m=\u001b[39mself_attn_past_key_value,\n\u001b[1;32m 591\u001b[0m )\n\u001b[1;32m 592\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 594\u001b[0m \u001b[38;5;66;03m# if decoder, the last output is tuple of self-attn cache\u001b[39;00m\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py:524\u001b[0m, in \u001b[0;36mBertAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 507\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 513\u001b[0m output_attentions: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 514\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[1;32m 515\u001b[0m self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mself(\n\u001b[1;32m 516\u001b[0m hidden_states,\n\u001b[1;32m 517\u001b[0m attention_mask,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 522\u001b[0m output_attentions,\n\u001b[1;32m 523\u001b[0m )\n\u001b[0;32m--> 524\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[1;32m 525\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n\u001b[1;32m 526\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File 
\u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], + "outputs": [], "source": [ "# Evaluate how bert trained on different datasets\n", "\n", "# Load test datasets\n", - "test_gh = pd.read_csv(f'{current_directory}/test_gh.csv')\n", - "test_so = pd.read_csv(f'{current_directory}/test_so.csv')\n", + "test_gh = pd.read_csv(f'{root}/test_gh.csv')\n", + "test_so = pd.read_csv(f'{root}/test_so.csv')\n", "\n", "# Define model paths\n", - "bert_model_path = f\"{current_directory}/bert_model\"\n", - "gh_bert_model_path = f\"{current_directory}/GH_bert_model\"\n", - "so_bert_model_path = f\"{current_directory}/SO_bert_model\"\n", + "bert_model_path = f\"{root}/bert_model\"\n", + "gh_bert_model_path = f\"{root}/GH_bert_model\"\n", + "so_bert_model_path = f\"{root}/SO_bert_model\"\n", "\n", "# Store results\n", "model_results = {}\n", @@ -233,7 +160,7 @@ "\n", "model_results[\"SO_bert_model\"] = {\n", " \"test_gh Accuracy\": so_bert_on_gh\n", - "}\n" + "}" ] } ], From 57c0d38f7673ee6f24563be32d48301b42410da1 Mon Sep 17 00:00:00 2001 From: Connor Narowetz Date: Thu, 8 May 2025 21:23:19 -1000 Subject: [PATCH 4/5] Added .env combined test.py and train.py - train.py and test.py now exist in model.py - env added for required packages - Minor typo changes Signed-off-by: Connor Narowetz --- api/{train.py => model.py} | 116 ++++++++++++++--- api/test.py | 121 ------------------ env.yml | 21 +++ notebooks/Test.ipynb | 6 +- notebooks/Train.ipynb | 16 +-- ...cation.ipynb => tokenize_statistics.ipynb} | 11 +- 6 files changed, 133 insertions(+), 158 deletions(-) rename api/{train.py => model.py} (59%) delete mode 100644 api/test.py create mode 100644 env.yml rename notebooks/{Software_Development_Sentiment_Classification.ipynb => tokenize_statistics.ipynb} (98%) diff --git a/api/train.py b/api/model.py similarity index 59% rename from api/train.py rename to api/model.py index 30f5514..b2a11f1 100644 --- a/api/train.py +++ b/api/model.py @@ -1,23 
+1,16 @@
-import os
-import re
-import string
 import random
-import warnings
-import argparse
 import numpy as np
 import pandas as pd
 import torch
 import time
-import seaborn as sns
 import matplotlib.pyplot as plt
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from io import StringIO
 from unicodedata import category
-from bs4 import BeautifulSoup
 from markdown import markdown
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report
-from torch.utils.data import DataLoader, RandomSampler, Dataset
+from torch.utils.data import DataLoader, RandomSampler
 from transformers import (
     BertTokenizer, BertForSequenceClassification, BertForMaskedLM,
     XLNetTokenizer, XLNetForSequenceClassification,
@@ -34,15 +27,14 @@
 WEIGHT_DECAY = 0.01
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-MODELS = [
-    (BertForSequenceClassification, BertTokenizer, 'bert-base-cased'),
-    (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased'),
-    (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'),
-    (AlbertForSequenceClassification, AlbertTokenizer, 'albert-base-v1')
-]
 MODEL_NAMES = ['bert', 'xlnet', 'roberta', 'albert']
 
+MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),
+    (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),
+    (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'),
+    (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1')
+    ]
+
 def train_model(train_df, model_save_path, model_select=0):
 
     seed_torch(42)
@@ -162,4 +154,96 @@ def seed_torch(seed):
     np.random.seed(seed)
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
-    torch.backends.cudnn.deterministic=True
\ No newline at end of file
+    torch.backends.cudnn.deterministic=True
+
+def test_model(test_df, model_saved_path, model_select=0):
+
+    MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),
+        (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),
+        (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'),
+        (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1')
+        ]
+    MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']
+    seed_torch(42)
+
+    cur_model=MODELS[model_select]
+    m_name=MODEL_NAMES[model_select]
+
+    tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)
+
+    begin=time.time()
+
+    test_df['Polarity']=test_df['Polarity'].replace({
+        'positive':1,
+        'negative':2,
+        'neutral':0})
+
+
+    sentences = test_df.Text.values
+    labels = test_df.Polarity.values
+
+    input_ids = []
+    attention_masks = []
+
+    for sent in sentences:
+        encoded_dict = tokenizer.encode_plus(
+            str(sent),
+            add_special_tokens = True,
+            max_length = MAX_LEN,
+            padding = 'max_length', truncation = True,  # pad_to_max_length is deprecated
+            return_attention_mask = True,
+            return_tensors = 'pt',
+        )
+
+        input_ids.append(encoded_dict['input_ids'])
+        attention_masks.append(encoded_dict['attention_mask'])
+
+    prediction_inputs = torch.cat(input_ids,dim=0)
+    prediction_masks = torch.cat(attention_masks,dim=0)
+    prediction_labels = torch.tensor(labels)
+
+    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
+    prediction_sampler = SequentialSampler(prediction_data)
+    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)
+
+    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
+ 
model.load_state_dict(torch.load(model_saved_path)) +# model.cuda() + model.eval() + + predictions,true_labels=[],[] + + for batch in prediction_dataloader: + batch = tuple(t.to(device) for t in batch) + b_input_ids, b_input_mask, b_labels = batch + + with torch.no_grad(): + outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) + logits = outputs[0] + + logits = logits.detach().cpu().numpy() + label_ids = b_labels.to('cpu').numpy() + + predictions.append(logits) + true_labels.append(label_ids) + + end=time.time() + print('Prediction used {:.2f} seconds'.format(end - begin)) + + flat_predictions = [item for sublist in predictions for item in sublist] + flat_predictions = np.argmax(flat_predictions, axis=1).flatten() + flat_true_labels = [item for sublist in true_labels for item in sublist] + + print("Accuracy of {} is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) + + print(classification_report(flat_true_labels,flat_predictions)) + + + df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity']) + + df_combined = pd.concat([test_df, df_prediction], axis=1) + + counts = df_combined['prediction_Polarity'].value_counts() + print(counts) + + return df_combined \ No newline at end of file diff --git a/api/test.py b/api/test.py deleted file mode 100644 index a2bfe29..0000000 --- a/api/test.py +++ /dev/null @@ -1,121 +0,0 @@ -import os -import re -import string -import random -import warnings -import argparse -import numpy as np -import pandas as pd -import torch -import time -import seaborn as sns -import matplotlib.pyplot as plt -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler -from io import StringIO -from unicodedata import category -from bs4 import BeautifulSoup -from markdown import markdown -from sklearn.model_selection import train_test_split -from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report -from torch.utils.data import DataLoader, RandomSampler, Dataset -from transformers import ( - BertTokenizer, BertForSequenceClassification, BertForMaskedLM, - XLNetTokenizer, XLNetForSequenceClassification, - RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM, - AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM, - get_scheduler -) -from torch.optim import AdamW -from api.train import * - -def test_model(test_df, model_saved_path, model_select=0): - - MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), - (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), - (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), - (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') - ] - MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] - seed_torch(42) - - cur_model=MODELS[model_select] - m_name=MODEL_NAMES[model_select] - - tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) - - begin=time.time() - - test_df['Polarity']=test_df['Polarity'].replace({ - 'positive':1, - 'negative':2, - 'neutral':0}) - - - sentences = test_df.Text.values - labels = test_df.Polarity.values - - input_ids = [] - attention_masks = [] - - for sent in sentences: - encoded_dict = tokenizer.encode_plus( - str(sent), - add_special_tokens = True, - max_length = MAX_LEN, - pad_to_max_length = True, - return_attention_mask = True, - return_tensors = 'pt', - ) - - input_ids.append(encoded_dict['input_ids']) - 
attention_masks.append(encoded_dict['attention_mask'])
-
-    prediction_inputs = torch.cat(input_ids,dim=0)
-    prediction_masks = torch.cat(attention_masks,dim=0)
-    prediction_labels = torch.tensor(labels)
-
-    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
-    prediction_sampler = SequentialSampler(prediction_data)
-    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)
-
-    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
-    model.load_state_dict(torch.load(model_saved_path))
-# model.cuda()
-    model.eval()
-
-    predictions,true_labels=[],[]
-
-    for batch in prediction_dataloader:
-        batch = tuple(t.to(device) for t in batch)
-        b_input_ids, b_input_mask, b_labels = batch
-
-        with torch.no_grad():
-            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
-            logits = outputs[0]
-
-        logits = logits.detach().cpu().numpy()
-        label_ids = b_labels.to('cpu').numpy()
-
-        predictions.append(logits)
-        true_labels.append(label_ids)
-
-    end=time.time()
-    print('Prediction used {:.2f} seconds'.format(end - begin))
-
-    flat_predictions = [item for sublist in predictions for item in sublist]
-    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
-    flat_true_labels = [item for sublist in true_labels for item in sublist]
-
-    print("Accuracy of {} is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions)))
-
-    print(classification_report(flat_true_labels,flat_predictions))
-
-
-    df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity'])
-
-    df_combined = pd.concat([test_df, df_prediction], axis=1)
-
-    counts = df_combined['prediction_Polarity'].value_counts()
-    print(counts)
-
-    return df_combined
\ No newline at end of file
diff --git a/env.yml b/env.yml
new file mode 100644
index 0000000..97b3bcf
--- /dev/null
+++ b/env.yml
@@ -0,0 +1,21 @@
+name: sentiment_classifier
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.13.3
+  - ipykernel
+  - pip
+  - pip:
+      - bs4
+      - torch
+      - scikit-learn
+      - seaborn
+      - tabulate
+      - markdown
+      - numpy
+      - pandas
+      - lxml  # parser backend used by BeautifulSoup in the tokenizer
+      - transformers
+      - ipywidgets
+prefix: /opt/anaconda3/envs/sentiment_classifier
diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
index 5115613..13e77a4 100644
--- a/notebooks/Test.ipynb
+++ b/notebooks/Test.ipynb
@@ -32,7 +32,7 @@
     "import sys\n",
     "sys.path.append(os.path.abspath(\"..\"))\n",
     "\n",
-    "from api.test import *"
+    "from api.model import *"
    ]
   },
   {
@@ -166,7 +166,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "sentiment_classifier",
    "language": "python",
    "name": "python3"
   },
@@ -180,7 +180,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.13.3"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/Train.ipynb b/notebooks/Train.ipynb
index 5d6b73b..c66c041 100644
--- a/notebooks/Train.ipynb
+++ b/notebooks/Train.ipynb
@@ -30,17 +30,9 @@
     "import os\n",
     "import sys\n",
     "sys.path.append(os.path.abspath(\"..\"))\n",
-    "import numpy as np\n",
     "import pandas as pd\n",
-    "from transformers import (\n",
-    "    BertTokenizer, BertForSequenceClassification, BertForMaskedLM,\n",
-    "    XLNetTokenizer, XLNetForSequenceClassification,\n",
-    "    RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,\n",
-    "    AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,\n",
-    "    get_scheduler\n",
-    ")\n",
     "\n",
-    "from api.train import *\n"
+    "from 
api.model import *\n"
    ]
   },
   {
@@ -61,7 +53,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -134,7 +126,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "sentiment_classifier",
    "language": "python",
    "name": "python3"
   },
@@ -148,7 +140,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.13.3"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/Software_Development_Sentiment_Classification.ipynb b/notebooks/tokenize_statistics.ipynb
similarity index 98%
rename from notebooks/Software_Development_Sentiment_Classification.ipynb
rename to notebooks/tokenize_statistics.ipynb
index dbe68a8..9264df8 100644
--- a/notebooks/Software_Development_Sentiment_Classification.ipynb
+++ b/notebooks/tokenize_statistics.ipynb
@@ -30,7 +30,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 1,
    "metadata": {
     "id": "oe8X-6s9btXo"
    },
@@ -43,8 +43,7 @@
     "\n",
     "from api.filter import *\n",
     "from api.tokenizer import *\n",
-    "from api.train import *\n",
-    "from api.test import *"
+    "from api.model import *"
    ]
   },
   {
@@ -53,7 +52,7 @@
    "id": "RAXtSnSK4LPr"
   },
   "source": [
-    "# Tokenlized\n",
+    "# Tokenized\n",
     "\n",
     "This section processes the raw sentiment analysis datasets (`so-dataset.csv`, `gh-dataset.csv`, and `crossplatform_sf_dataset.csv`) by applying a custom text transformation function. The goal is to standardize and clean the text data before training. You can adapt the transform_text function to your specific needs.\n",
@@ -217,7 +216,7 @@
    "toc_visible": true
   },
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "sentiment_classifier",
    "language": "python",
    "name": "python3"
   },
@@ -231,7 +230,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.13.3"
   }
  },
  "nbformat": 4,
From 012084d222c74b2f621c0e543a0040dcca6e5572 Mon Sep 17 00:00:00 2001
From: Connor Narowetz
Date: Thu, 8 May 2025 21:39:09 -1000
Subject: [PATCH 5/5] Added __init__.py to create pdoc and added model.py docs

- __init__.py added
- docs for model.py added

Signed-off-by: Connor Narowetz
---
 api/__init__.py |  0
 api/model.py    | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100644 api/__init__.py

diff --git a/api/__init__.py b/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/model.py b/api/model.py
index b2a11f1..7302b17 100644
--- a/api/model.py
+++ b/api/model.py
@@ -37,6 +37,21 @@ def train_model(train_df, model_save_path, model_select=0):

+    """
+    Train a sentiment classification model on the provided dataset.
+
+    Args:
+        train_df (pd.DataFrame): Training data with 'Text' and 'Polarity' columns.
+        model_save_path (str): Path where the fine-tuned model is saved.
+        model_select (int, optional): Index into MODELS (0=BERT, 1=XLNet, 2=RoBERTa, 3=ALBERT).
+
+    Returns:
+        str: The path where the best model was saved.
+
+    Notes:
+        - Converts sentiment labels to numeric form (positive=1, negative=2, neutral=0).
+        - Saves the fine-tuned model to model_save_path.
+    """
     seed_torch(42)
 
     cur_model = MODELS[model_select]
@@ -150,6 +165,15 @@ def train_model(train_df, model_save_path, model_select=0):
     return model_save_path
 
 def seed_torch(seed):
+    """
+    Set random seeds for reproducibility in PyTorch and related libraries.
+
+    Args:
+        seed (int): Number to use for all random generators.
+
+    Example:
+        seed_torch(42)
+    """
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
@@ -157,6 +181,17 @@ def seed_torch(seed):
     torch.backends.cudnn.deterministic=True
 
 def test_model(test_df, model_saved_path, model_select=0):
+    """
+    Test a pre-trained sentiment classification model and report its performance.
+
+    Args:
+        test_df (pd.DataFrame): Test data with 'Text' and 'Polarity' columns.
+        model_saved_path (str): Path to the saved model weights.
+        model_select (int, optional): Index into MODELS (0=BERT, 1=XLNet, 2=RoBERTa, 3=ALBERT).
+
+    Returns:
+        pd.DataFrame: The original test data with the model's predictions appended.
+    """
 
     MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),
         (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),