From 18570ba29641537ec80f3b0e2300ae4c9c919bd3 Mon Sep 17 00:00:00 2001 From: Vee Date: Thu, 1 May 2025 18:05:39 -0400 Subject: [PATCH 1/2] reorganized files and implemented VADER and BilSTM, both of which need more work --- test_data.csv => data/test_data.csv | 0 train_data.csv => data/train_data.csv | 0 train_test.py => data/train_test.py | 0 .../twitter_sentiment_data.csv | 0 evaluate.py => models/BERT/evaluate.py | 4 +- model.py => models/BERT/model.py | 52 ---------------- .../BERT/run_training.py | 2 +- train.py => models/BERT/train.py | 2 +- models/BERT/utils.py | 53 ++++++++++++++++ models/BilSTM/dataset.py | 16 +++++ models/BilSTM/evaluate.py | 62 +++++++++++++++++++ models/BilSTM/main.py | 59 ++++++++++++++++++ models/BilSTM/model.py | 24 +++++++ models/BilSTM/train.py | 24 +++++++ models/BilSTM/utils.py | 20 ++++++ models/VADER/vader.py | 51 +++++++++++++++ 16 files changed, 313 insertions(+), 56 deletions(-) rename test_data.csv => data/test_data.csv (100%) rename train_data.csv => data/train_data.csv (100%) rename train_test.py => data/train_test.py (100%) rename twitter_sentiment_data.csv => data/twitter_sentiment_data.csv (100%) rename evaluate.py => models/BERT/evaluate.py (97%) rename model.py => models/BERT/model.py (51%) rename run_training.py => models/BERT/run_training.py (83%) rename train.py => models/BERT/train.py (96%) create mode 100644 models/BERT/utils.py create mode 100644 models/BilSTM/dataset.py create mode 100644 models/BilSTM/evaluate.py create mode 100644 models/BilSTM/main.py create mode 100644 models/BilSTM/model.py create mode 100644 models/BilSTM/train.py create mode 100644 models/BilSTM/utils.py create mode 100644 models/VADER/vader.py diff --git a/test_data.csv b/data/test_data.csv similarity index 100% rename from test_data.csv rename to data/test_data.csv diff --git a/train_data.csv b/data/train_data.csv similarity index 100% rename from train_data.csv rename to data/train_data.csv diff --git a/train_test.py 
b/data/train_test.py similarity index 100% rename from train_test.py rename to data/train_test.py diff --git a/twitter_sentiment_data.csv b/data/twitter_sentiment_data.csv similarity index 100% rename from twitter_sentiment_data.csv rename to data/twitter_sentiment_data.csv diff --git a/evaluate.py b/models/BERT/evaluate.py similarity index 97% rename from evaluate.py rename to models/BERT/evaluate.py index b2030ef..7c07f32 100644 --- a/evaluate.py +++ b/models/BERT/evaluate.py @@ -1,11 +1,11 @@ import pandas as pd -import torch +import torch import numpy as np from transformers import AutoTokenizer from sklearn.metrics import classification_report, confusion_matrix, accuracy_score import matplotlib.pyplot as plt import seaborn as sns -from model import ClimateModel +from NLP_climate_analysis.models.BERT.model import ClimateModel def load_model(model_path): """Load the saved model""" diff --git a/model.py b/models/BERT/model.py similarity index 51% rename from model.py rename to models/BERT/model.py index ba0ba00..abc207b 100644 --- a/model.py +++ b/models/BERT/model.py @@ -50,59 +50,7 @@ def __getitem__(self, idx): def __len__(self): return len(self.labels) -# Function to compute metrics for evaluation -def compute_metrics(eval_pred): - from sklearn.metrics import accuracy_score, precision_recall_fscore_support - - predictions, labels = eval_pred - predictions = predictions.argmax(axis=1) - - accuracy = accuracy_score(labels, predictions) - precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted') - - return { - 'accuracy': accuracy, - 'precision': precision, - 'recall': recall, - 'f1': f1 - } -# Training arguments for the Trainer -def get_training_args(output_dir='./results'): - from transformers import TrainingArguments - - return TrainingArguments( - output_dir=output_dir, - eval_strategy='epoch', - save_strategy='epoch', - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=32, - 
num_train_epochs=3, - weight_decay=0.01, - load_best_model_at_end=True, - metric_for_best_model='accuracy', - logging_dir='./logs', - logging_steps=10 - ) -# Prediction function to use after training -def predict_sentiment(model, tokenizer, texts): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model.to(device) - model.eval() - encodings = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt') - encodings = {key: val.to(device) for key, val in encodings.items()} - - with torch.no_grad(): - # Pass only input_ids and attention_mask for prediction - outputs = model(input_ids=encodings['input_ids'], attention_mask=encodings['attention_mask']) - # The model should return only logits when labels are not provided - logits = outputs - - predictions = torch.argmax(logits, dim=1).cpu().numpy() - # Ensure the sentiment map covers all possible prediction indices (0, 1, 2) - sentiment_map = {0: 'anti', 1: 'neutral', 2: 'pro'} - return [sentiment_map[pred] for pred in predictions] diff --git a/run_training.py b/models/BERT/run_training.py similarity index 83% rename from run_training.py rename to models/BERT/run_training.py index 4744b6c..8ee4a05 100644 --- a/run_training.py +++ b/models/BERT/run_training.py @@ -6,7 +6,7 @@ # Import the training script and run it sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from train import main +from NLP_climate_analysis.models.BERT.train import main # Run the main function if __name__ == "__main__": diff --git a/train.py b/models/BERT/train.py similarity index 96% rename from train.py rename to models/BERT/train.py index ef18b37..3399215 100644 --- a/train.py +++ b/models/BERT/train.py @@ -2,7 +2,7 @@ import torch from transformers import AutoTokenizer, Trainer from sklearn.metrics import classification_report -from model import ClimateModel, ClimateDataset, compute_metrics, get_training_args, predict_sentiment +from NLP_climate_analysis.models.BERT.model import 
ClimateModel, ClimateDataset, compute_metrics, get_training_args, predict_sentiment def main(): # Check for CUDA diff --git a/models/BERT/utils.py b/models/BERT/utils.py new file mode 100644 index 0000000..2fe869c --- /dev/null +++ b/models/BERT/utils.py @@ -0,0 +1,53 @@ +import torch +from sklearn.metrics import accuracy_score, precision_recall_fscore_support +from transformers import TrainingArguments + +# Function to compute metrics for evaluation +def compute_metrics(eval_pred): + predictions, labels = eval_pred + predictions = predictions.argmax(axis=1) + accuracy = accuracy_score(labels, predictions) + precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted') + return { + 'accuracy': accuracy, + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + +# Training arguments for the Trainer +def get_training_args(output_dir='./results'): + return TrainingArguments( + output_dir=output_dir, + evaluation_strategy='epoch', + save_strategy='epoch', + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=32, + num_train_epochs=3, + weight_decay=0.01, + load_best_model_at_end=True, + metric_for_best_model='accuracy', + logging_dir='./logs', + logging_steps=10 + ) + +# Prediction function to use after training +def predict_sentiment(model, tokenizer, texts): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model.to(device) + model.eval() + + encodings = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt') + encodings = {key: val.to(device) for key, val in encodings.items()} + + with torch.no_grad(): + # Pass only input_ids and attention_mask for prediction + outputs = model(input_ids=encodings['input_ids'], attention_mask=encodings['attention_mask']) + # The model should return only logits when labels are not provided + logits = outputs + + predictions = torch.argmax(logits, dim=1).cpu().numpy() + # Ensure the sentiment map covers all 
possible prediction indices (0, 1, 2) + sentiment_map = {0: 'anti', 1: 'neutral', 2: 'pro'} + return [sentiment_map[pred] for pred in predictions] \ No newline at end of file diff --git a/models/BilSTM/dataset.py b/models/BilSTM/dataset.py new file mode 100644 index 0000000..bbe2aa2 --- /dev/null +++ b/models/BilSTM/dataset.py @@ -0,0 +1,16 @@ +import torch +from torch.utils.data import Dataset + +class TweetDataset(Dataset): + """ + Custom PyTorch dataset for sentiment classification. + """ + def __init__(self, inputs, labels): + self.inputs = torch.tensor(inputs, dtype=torch.long) + self.labels = torch.tensor(labels, dtype=torch.long) + + def __len__(self): + return len(self.labels) + + def __getitem__(self, idx): + return self.inputs[idx], self.labels[idx] diff --git a/models/BilSTM/evaluate.py b/models/BilSTM/evaluate.py new file mode 100644 index 0000000..8915ad7 --- /dev/null +++ b/models/BilSTM/evaluate.py @@ -0,0 +1,62 @@ +import torch +from sklearn.metrics import classification_report, confusion_matrix, accuracy_score +import matplotlib.pyplot as plt +import seaborn as sns + +def predict(model, tokenizer, text_list, word_index, embedding_dim=100, max_len=100, device='cpu'): + """ + Predict sentiment for a list of input texts using the trained BiLSTM model. + Returns the predicted sentiment labels. 
+ """ + from keras.preprocessing.sequence import pad_sequences # type: ignore + from keras.preprocessing.text import Tokenizer # type: ignore + + model.eval() + model.to(device) + + # Use the same tokenizer that was fit on training data + sequences = tokenizer.texts_to_sequences(text_list) + padded = pad_sequences(sequences, maxlen=max_len) + inputs = torch.tensor(padded, dtype=torch.long).to(device) + + with torch.no_grad(): + outputs = model(inputs) + preds = torch.argmax(outputs, dim=1).cpu().numpy() + + label_map = {0: 'anti', 1: 'neutral', 2: 'pro'} + return [label_map[p] for p in preds] + +def evaluate_model(model, test_loader, device, label_names): + """ + Evaluate the model on the test set and print metrics. + """ + model.eval() + all_preds = [] + all_labels = [] + + with torch.no_grad(): + for inputs, labels in test_loader: + inputs = inputs.to(device) + outputs = model(inputs) + preds = torch.argmax(outputs, dim=1).cpu().numpy() + all_preds.extend(preds) + all_labels.extend(labels.numpy()) + + print("\nClassification Report:") + print(classification_report(all_labels, all_preds, target_names=label_names)) + print(f"Accuracy: {accuracy_score(all_labels, all_preds):.4f}") + + cm = confusion_matrix(all_labels, all_preds) + plot_confusion_matrix(cm, label_names) + + +def plot_confusion_matrix(cm, class_names): + plt.figure(figsize=(8, 6)) + sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', + xticklabels=class_names, yticklabels=class_names) + plt.xlabel("Predicted") + plt.ylabel("True") + plt.title("Confusion Matrix") + plt.tight_layout() + plt.savefig("bilstm_confusion_matrix.png") + plt.close() diff --git a/models/BilSTM/main.py b/models/BilSTM/main.py new file mode 100644 index 0000000..f0fca36 --- /dev/null +++ b/models/BilSTM/main.py @@ -0,0 +1,59 @@ +import pandas as pd +import torch +import numpy as np +from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore +from tensorflow.keras.preprocessing.sequence import pad_sequences # type: 
ignore +from torch.utils.data import DataLoader +import torch.nn as nn +import torch.optim as optim + +from BilSTM.dataset import TweetDataset +from BilSTM.model import BiLSTMModel +from BilSTM.train import train_model +from BilSTM.evaluate import evaluate_model +from BilSTM.utils import load_glove_embeddings + +def main(): + MAX_LEN = 100 + BATCH_SIZE = 64 + EMBEDDING_DIM = 100 + NUM_EPOCHS = 5 + GLOVE_PATH = "embeddings/glove.6B.100d.txt" + CLASS_NAMES = ['anti', 'neutral', 'pro'] + + # Load data + train_df = pd.read_csv("data/train_data.csv") + test_df = pd.read_csv("data/test_data.csv") + + tokenizer = Tokenizer() + tokenizer.fit_on_texts(train_df['message']) + word_index = tokenizer.word_index + + X_train = pad_sequences(tokenizer.texts_to_sequences(train_df['message']), maxlen=MAX_LEN) + X_test = pad_sequences(tokenizer.texts_to_sequences(test_df['message']), maxlen=MAX_LEN) + y_train = train_df['sentiment'].values + y_test = test_df['sentiment'].values + + # Load embeddings + embedding_matrix = load_glove_embeddings(GLOVE_PATH, word_index, EMBEDDING_DIM) + + # Model and training setup + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = BiLSTMModel(embedding_matrix).to(device) + + train_loader = DataLoader(TweetDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True) + test_loader = DataLoader(TweetDataset(X_test, y_test), batch_size=BATCH_SIZE) + + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(model.parameters(), lr=1e-3) + + # Train loop + for epoch in range(NUM_EPOCHS): + loss = train_model(model, train_loader, optimizer, device) + print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {loss:.4f}") + + # Evaluate + evaluate_model(model, test_loader, device, CLASS_NAMES) + +if __name__ == "__main__": + main() diff --git a/models/BilSTM/model.py b/models/BilSTM/model.py new file mode 100644 index 0000000..13f8366 --- /dev/null +++ b/models/BilSTM/model.py @@ -0,0 +1,24 @@ +import torch +import torch.nn as nn
+ +class BiLSTMModel(nn.Module): + """ + BiLSTM model using pre-trained embeddings. + """ + def __init__(self, embedding_matrix, hidden_dim=128, output_dim=3, dropout=0.5): + super(BiLSTMModel, self).__init__() + vocab_size, embedding_dim = embedding_matrix.shape + self.embedding = nn.Embedding(vocab_size, embedding_dim) + self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32)) + self.embedding.weight.requires_grad = False # Freeze embeddings + + self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True) + self.dropout = nn.Dropout(dropout) + self.fc = nn.Linear(hidden_dim * 2, output_dim) + + def forward(self, x): + embedded = self.embedding(x) + lstm_out, _ = self.lstm(embedded) + pooled = torch.mean(lstm_out, dim=1) + output = self.fc(self.dropout(pooled)) + return output diff --git a/models/BilSTM/train.py b/models/BilSTM/train.py new file mode 100644 index 0000000..c0e7baf --- /dev/null +++ b/models/BilSTM/train.py @@ -0,0 +1,24 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +def train_model(model, train_loader, optimizer, device): + """ + Train the BiLSTM model on the training set using CrossEntropyLoss. + """ + model.train() + total_loss = 0 + criterion = nn.CrossEntropyLoss() + + for inputs, labels in train_loader: + inputs, labels = inputs.to(device), labels.to(device) + + optimizer.zero_grad() + outputs = model(inputs) # Raw logits + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + total_loss += loss.item() + + return total_loss / len(train_loader) diff --git a/models/BilSTM/utils.py b/models/BilSTM/utils.py new file mode 100644 index 0000000..7fb2f22 --- /dev/null +++ b/models/BilSTM/utils.py @@ -0,0 +1,20 @@ +import numpy as np + +def load_glove_embeddings(glove_path, word_index, embedding_dim=100): + """ + Loads GloVe embeddings into an embedding matrix aligned with the tokenizer word index. 
+ """ + embeddings_index = {} + with open(glove_path, 'r', encoding='utf8') as f: + for line in f: + values = line.split() + word = values[0] + vector = np.asarray(values[1:], dtype='float32') + embeddings_index[word] = vector + + embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim)) + for word, i in word_index.items(): + vec = embeddings_index.get(word) + if vec is not None: + embedding_matrix[i] = vec + return embedding_matrix diff --git a/models/VADER/vader.py b/models/VADER/vader.py new file mode 100644 index 0000000..665333a --- /dev/null +++ b/models/VADER/vader.py @@ -0,0 +1,51 @@ +import os +import nltk +import pandas as pd +from nltk.sentiment import SentimentIntensityAnalyzer +from sklearn.metrics import classification_report + +# Setup +nltk.download('vader_lexicon') +script_dir = os.path.dirname(os.path.abspath(__file__)) +file_path = os.path.join(script_dir, 'test_data.csv') +df = pd.read_csv(file_path) + +# Initialize VADER +sia = SentimentIntensityAnalyzer() + +# Label mapping +label_map = {-1: 'anti', 0: 'neutral', 1: 'pro'} +df['true_sentiment'] = df['sentiment'].map(label_map) + +def get_vader_sentiment(text): + score = sia.polarity_scores(text)['compound'] + if score > 0: + return 'pro' + elif score < 0: + return 'anti' + else: + return 'neutral' + +# Apply sentiment classification +df['vader_sentiment'] = df['message'].astype(str).apply(get_vader_sentiment) + +# Ensure clean string types +df['true_sentiment'] = df['true_sentiment'].astype(str) +df['vader_sentiment'] = df['vader_sentiment'].astype(str) + +# Evaluation +print("Classification Report") +print(classification_report( + df['true_sentiment'], + df['vader_sentiment'], + labels=['anti', 'neutral', 'pro'], + target_names=['anti', 'neutral', 'pro'] +)) + +print(df['true_sentiment'].value_counts()) + + +# Print common misclassifications +print("\nTop Misclassified Examples:") +misclassified = df[df['true_sentiment'] != df['vader_sentiment']] +print(misclassified[['message', 
'true_sentiment', 'vader_sentiment']].sample(10, random_state=42)) From 7acbfa6b2315f6f6c9999c9f4f01f7c031f14a39 Mon Sep 17 00:00:00 2001 From: nedcut Date: Thu, 1 May 2025 22:18:35 -0400 Subject: [PATCH 2/2] fixing merge --- models/BERT/evaluate.py | 86 ------------------------------------- models/BERT/run_training.py | 13 ------ requirements.txt | 4 +- 3 files changed, 3 insertions(+), 100 deletions(-) delete mode 100644 models/BERT/evaluate.py delete mode 100644 models/BERT/run_training.py diff --git a/models/BERT/evaluate.py b/models/BERT/evaluate.py deleted file mode 100644 index 7c07f32..0000000 --- a/models/BERT/evaluate.py +++ /dev/null @@ -1,86 +0,0 @@ -import pandas as pd -import torch -import numpy as np -from transformers import AutoTokenizer -from sklearn.metrics import classification_report, confusion_matrix, accuracy_score -import matplotlib.pyplot as plt -import seaborn as sns -from NLP_climate_analysis.models.BERT.model import ClimateModel - -def load_model(model_path): - """Load the saved model""" - model = ClimateModel() - model.load_state_dict(torch.load(f"{model_path}/pytorch_model.bin", map_location=torch.device('cpu'))) - return model - -def predict(model, tokenizer, texts, device='cpu'): - """Run prediction on a list of texts""" - model.to(device) - model.eval() - - encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='pt') - encodings = {k: v.to(device) for k, v in encodings.items()} - - with torch.no_grad(): - outputs = model(**encodings) - - predictions = torch.argmax(outputs, dim=1).cpu().numpy() - return predictions - -def plot_confusion_matrix(cm, class_names): - """Plot confusion matrix""" - plt.figure(figsize=(10, 7)) - sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names) - plt.xlabel('Predicted') - plt.ylabel('True') - plt.title('Confusion Matrix') - plt.savefig('confusion_matrix.png') - plt.close() - -def main(): - model_path = './saved_model' - 
test_data_path = 'test_data.csv' - class_names = ['anti', 'neutral', 'pro'] - - # Load the model and tokenizer - print("Loading model and tokenizer...") - model = load_model(model_path) - tokenizer = AutoTokenizer.from_pretrained(model_path) - - # Load test data - print("Loading test data...") - test_df = pd.read_csv(test_data_path) - - # Make predictions - print("Making predictions...") - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - predictions = predict(model, tokenizer, test_df['message'].tolist(), device) - - # Evaluate predictions - print("Evaluating predictions...") - true_labels = test_df['sentiment'].values - accuracy = accuracy_score(true_labels, predictions) - print(f"Accuracy: {accuracy:.4f}") - - print("\nClassification Report:") - report = classification_report(true_labels, predictions, target_names=class_names) - print(report) - - # Create and save confusion matrix - cm = confusion_matrix(true_labels, predictions) - plot_confusion_matrix(cm, class_names) - print("Confusion matrix saved as 'confusion_matrix.png'") - - # Interactive prediction mode - print("\nEnter text to classify (or type 'exit' to quit):") - while True: - text = input("> ") - if text.lower() == 'exit': - break - - pred = predict(model, tokenizer, [text], device)[0] - sentiment = class_names[pred] - print(f"Predicted sentiment: {sentiment}") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/models/BERT/run_training.py b/models/BERT/run_training.py deleted file mode 100644 index 8ee4a05..0000000 --- a/models/BERT/run_training.py +++ /dev/null @@ -1,13 +0,0 @@ -import os -import sys - -# Bypass transformers version check -os.environ['TRANSFORMERS_SKIP_DEPENDENCY_CHECK'] = '1' - -# Import the training script and run it -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from NLP_climate_analysis.models.BERT.train import main - -# Run the main function -if __name__ == "__main__": - main() \ No newline at end of file 
diff --git a/requirements.txt b/requirements.txt index dabecae..337f9cf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,6 @@ torch>=1.7.0 transformers>=4.5.0 scikit-learn>=0.24.0 matplotlib>=3.3.0 -seaborn>=0.11.0 \ No newline at end of file +seaborn>=0.11.0 +accelerate>=0.26.0 +huggingface_hub[hf_xet] \ No newline at end of file